def index_canvas(uri, manifest_uri, db_engine):
    q = sqla_text('''
        INSERT INTO canvases(jsonld_id, manifest_jsonld_id)
        VALUES (:cid, :mid)
        ON CONFLICT (jsonld_id, manifest_jsonld_id)
        DO NOTHING''')
    db_engine.execute(q, cid=uri, mid=manifest_uri)
def index_curation(uri, db_engine):
    q = sqla_text('''
        INSERT INTO curations(jsonld_id)
        VALUES (:cid)
        ON CONFLICT (jsonld_id)
        DO NOTHING''')
    db_engine.execute(q, cid=uri)
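A minimal usage sketch for the two indexing helpers above. The imports, connection URI, and example IIIF URIs are placeholders for illustration, not taken from the original module.

# Hypothetical usage; the URI strings below are placeholders.
from sqlalchemy import create_engine
from sqlalchemy.sql import text as sqla_text

engine = create_engine('postgresql://user:password@localhost/curation_index')
index_curation('https://example.org/curation/json/1', engine)
index_canvas('https://example.org/iiif/book1/canvas/p1',
             'https://example.org/iiif/book1/manifest',
             engine)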
def execute_sql_file(path):
    # This helper is typically used within a multiprocessing pool; create a
    # new database engine for each job.
    db.engine.dispose()
    print('Running {}'.format(path))
    with open(path) as fp:
        cmd = '\n'.join(
            [line for line in fp.readlines() if not line.startswith('--')])
    db.engine.execute(sqla_text(cmd), **SQL_CONFIG)
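A hypothetical driver matching the comment above about running this helper from a multiprocessing pool. The file names and pool size are placeholders, and it assumes the module-level `db` and `SQL_CONFIG` objects are importable in the worker processes.

from multiprocessing import Pool

if __name__ == '__main__':
    sql_files = ['01_create_tables.sql', '02_build_indexes.sql']  # placeholders
    with Pool(processes=2) as pool:
        pool.map(execute_sql_file, sql_files)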
def ping_connection(self):
    """
    Keep the connection to MySQL alive.

    Ping the database before each query; if an OperationalError is
    raised, remove the session and reconnect.
    """
    try:
        self.session_db.execute(sqla_text("SELECT 1"))
    except OperationalError as exc:
        # OperationalError: (2006, 'MySQL server has gone away')
        LOGGER.warning("Database server went away: %a", exc)
        LOGGER.info("Reconnecting to the server")
        self.session_db.remove()
        try:
            self.session_db.execute(sqla_text("SELECT 1"))
        except SQLAlchemyError as exc:
            LOGGER.error("Could not reconnect to the MySQL database: %s",
                         make_exc_ascii_str(exc))
            sys.exit(1)
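As a side note, SQLAlchemy 1.2+ offers a built-in liveness check at connection checkout that covers the same "server has gone away" case; a sketch, assuming that option is acceptable in this setup (the URI below is a placeholder).

from sqlalchemy import create_engine

# Alternative sketch: pool_pre_ping tests each pooled connection before it is
# handed out, so a stale connection is transparently replaced instead of
# requiring a manual SELECT 1 before every query.
engine = create_engine('mysql://user:password@localhost/eventdb',  # placeholder URI
                       pool_pre_ping=True)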
def deintex_curation(cur_uri, db_engine):
    # get Curation DB ID
    s_cur = sqla_text('''
        SELECT id
        FROM curations
        WHERE jsonld_id=:cid''')
    cur_db_id = db_engine.execute(s_cur, cid=cur_uri).fetchone()
    if cur_db_id:
        cur_db_id = cur_db_id[0]
        # delete Curation elements ("cutouts")
        d_cel = sqla_text('''
            DELETE FROM curation_elements
            WHERE curation_id=:cdbid''')
        db_engine.execute(d_cel, cdbid=cur_db_id)
    # delete Curation
    d_cur = sqla_text('''
        DELETE FROM curations
        WHERE jsonld_id=:jid''')
    db_engine.execute(d_cur, jid=cur_uri)
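An alternative worth noting, under an assumption the snippet does not declare: if curation_elements.curation_id carried a foreign key with ON DELETE CASCADE, the two-step cleanup above could collapse into a single delete.

# Hypothetical variant; assumes curation_elements.curation_id REFERENCES
# curations(id) ON DELETE CASCADE, which is not guaranteed by the code above.
def deindex_curation_cascade(cur_uri, db_engine):
    db_engine.execute(
        sqla_text('DELETE FROM curations WHERE jsonld_id=:jid'),
        jid=cur_uri)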
def index_curation_element(cur_id, can_id, manifest_id, xywh, db_engine):
    s_cur = sqla_text('''
        SELECT id
        FROM curations
        WHERE jsonld_id=:cid''')
    cur_db_id = db_engine.execute(s_cur, cid=cur_id).fetchone()[0]
    s_can = sqla_text('''
        SELECT id
        FROM canvases
        WHERE jsonld_id=:cid AND manifest_jsonld_id=:mid''')
    can_db_id = db_engine.execute(s_can,
                                  cid=can_id,
                                  mid=manifest_id).fetchone()[0]
    x, y, w, h = [int(elem) for elem in xywh.split(',')]
    poly = ('ST_GeomFromText('
            '\'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))\')').format(
                x, y, x + w, y, x + w, y + h, x, y + h, x, y)
    # Because the IDs are plain integers and the PostGIS polygon won't go
    # through sqla_text, we use str.format here ¯\_(ツ)_/¯
    q = '''INSERT INTO curation_elements(canvas_id, curation_id, area)
           VALUES ({}, {}, {})'''.format(can_db_id, cur_db_id, poly)
    db_engine.execute(q)
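A hedged alternative sketch: with a psycopg2-style driver, the polygon can usually be passed as plain WKT through a bound string parameter, so only ST_GeomFromText stays inside the SQL text. This is an assumption about the driver, not a claim about why the original fell back to str.format; the function name and parameters below are hypothetical.

from sqlalchemy.sql import text as sqla_text

def index_curation_element_bound(cur_db_id, can_db_id, xywh, db_engine):
    # Build the closed-ring WKT for the rectangular region.
    x, y, w, h = [int(elem) for elem in xywh.split(',')]
    wkt = 'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))'.format(
        x, y, x + w, y, x + w, y + h, x, y + h, x, y)
    # The WKT travels as an ordinary bound string; only the PostGIS call
    # remains literal SQL.
    q = sqla_text('''
        INSERT INTO curation_elements(canvas_id, curation_id, area)
        VALUES (:can, :cur, ST_GeomFromText(:wkt))''')
    db_engine.execute(q, can=can_db_id, cur=cur_db_id, wkt=wkt)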
def _setup_db(self):
    wait_timeout = int(self.config.get("wait_timeout",
                                       self._DEFAULT_WAIT_TIMEOUT))
    wait_timeout = min(max(wait_timeout, self._MIN_WAIT_TIMEOUT),
                       self._MAX_WAIT_TIMEOUT)
    # (`pool_recycle` should be significantly less than `wait_timeout`)
    pool_recycle = wait_timeout // 2
    engine = create_engine(
        self.config["uri"],
        connect_args=dict(
            charset=self.config.get(
                "connect_charset",
                N6DataBackendAPI.EVENT_DB_LEGACY_CHARSET),
            use_unicode=True,
            binary_prefix=True,
            cursorclass=N6RecorderCursor),
        pool_recycle=pool_recycle,
        echo=str_to_bool(self.config.get("echo", "false")))
    self._install_session_variables_setter(
        engine,
        wait_timeout=wait_timeout,
        time_zone="'+00:00'")
    session_db = N6DataBackendAPI.configure_db_session(engine)
    session_db.execute(sqla_text("SELECT 1"))  # Let's crash early if db is misconfigured.
    return session_db
def execute_sql_file(path):
    print('Running {}'.format(path))
    with open(path) as fp:
        cmd = '\n'.join(
            [line for line in fp.readlines() if not line.startswith('--')])
    db.engine.execute(sqla_text(cmd))
def index():
    canvas_uri_raw = request.args.get('canvas')
    area_xywh = request.args.get('xywh')
    if not canvas_uri_raw:
        return abort(400)
    canvas_uri = urllib.parse.unquote(canvas_uri_raw)
    cfg = Cfg()
    db_engine = create_engine(cfg.cfg['db_uri'])

    q_can = sqla_text('''
        SELECT id, manifest_jsonld_id
        FROM canvases
        WHERE jsonld_id=:can_uri
        ''')
    can_db_tpls = db_engine.execute(q_can, can_uri=canvas_uri).fetchall()
    if not can_db_tpls:
        return abort(404)  # FIXME not there, respond accordingly
    else:
        if len(can_db_tpls) == 1:
            can_db_id = int(can_db_tpls[0]['id'])
            can_db_man_jsonld_id = can_db_tpls[0]['manifest_jsonld_id']
        else:
            print('multiple canvases w/ same ID (!!!)')  # FIXME problem

    area_query_insert = ''
    if area_xywh:
        x, y, w, h = [int(elem) for elem in area_xywh.split(',')]
        poly = ('ST_GeomFromText('
                '\'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))\')').format(
                    x, y, x + w, y, x + w, y + h, x, y + h, x, y)
        area_query_insert = 'ST_Within(area, {}) and '.format(poly)
    q_area = '''SELECT curations.jsonld_id as uri, areajson
                FROM curations
                JOIN (SELECT curation_id, ST_AsGeoJSON(area) as areajson
                      FROM curation_elements
                      WHERE {} canvas_id = {}) as cue
                ON curations.id = cue.curation_id;
             '''.format(area_query_insert, can_db_id)
    cur_uris = db_engine.execute(q_area).fetchall()

    backlinks_flat = []
    for row in cur_uris:
        uri = row['uri']
        area = json.loads(row['areajson'])
        backlinks_flat.append([uri, area])

    # backlinks_by_uri = {}
    # for bl in backlinks_flat:
    #     uri, area = bl
    #     if uri not in backlinks_by_uri:
    #         backlinks_by_uri[uri] = {'areas': []}
    #     backlinks_by_uri[uri]['areas'].append(area)

    backlinks_by_area = {}
    for bl in backlinks_flat:
        uri, area = bl
        coords = area['coordinates'][0]
        if not len(coords) == 5:
            print('unexpected polygon shape (!!!)')  # FIXME problem
        p1, p2, p3, p4, p5 = coords
        xywh = '{},{},{},{}'.format(p1[0], p1[1],
                                    p2[0] - p1[0],
                                    p3[1] - p1[1])
        if xywh not in backlinks_by_area:
            backlinks_by_area[xywh] = []
        backlinks_by_area[xywh].append(uri)

    display_curation = build_annotation_container_curation(
        canvas_uri,
        can_db_man_jsonld_id,
        backlinks_by_area,
        request.url,
        request.base_url)

    # ret = {
    #     'canvas': canvas_uri,
    #     'curations_backlinks': backlinks_by_area
    #     }

    return jsonify(display_curation)
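The xywh-to-polygon conversion appears both here and in index_curation_element. A hypothetical shared helper (its name and placement are assumptions, not part of the original code) could keep the WKT construction in one place:

# Hypothetical helper; mirrors the conversion used in the two call sites above.
def xywh_to_wkt(xywh):
    """Turn an 'x,y,w,h' IIIF region string into a closed-ring WKT polygon."""
    x, y, w, h = [int(elem) for elem in xywh.split(',')]
    return 'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))'.format(
        x, y, x + w, y, x + w, y + h, x, y + h, x, y)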
def crawl_single(as_url, db_engine):
    """ Crawl, given a URL to an Activity Stream.
    """
    last_activity_query = sqla_text('''
        SELECT last_activity
        FROM last_activity_times
        WHERE acticity_stream_url=:as_url
        ''')
    last_activity_db = db_engine.execute(last_activity_query,
                                         as_url=as_url).fetchone()
    if not last_activity_db:
        last_activity_time_db = datetime.datetime.fromtimestamp(0)
        log('First time crawling this Activity Stream.')
    else:
        last_activity_time_db = dateutil.parser.parse(last_activity_db[0])
        log('Last crawled activity at {}.'.format(last_activity_time_db))

    log('Retrieving Activity Stream ({})'.format(as_url))
    try:
        resp = requests.get(as_url)
    except requests.exceptions.RequestException as e:
        msg = 'Could not access Activity Stream. ({})'.format(e)
        log(msg)
        print(msg)
        return
    if resp.status_code != 200:
        msg = 'Could not access Activity Stream. (HTTP {})'.format(
            resp.status_code)
        log(msg)
        print(msg)
        return
    as_oc = resp.json()
    log('Start iterating over Activity Stream pages')
    as_ocp = get_referenced(as_oc, 'last')
    new_canvases = 0
    new_activity = False
    # NOTE: seen_activity_objs is used to prevent processing obsolete
    #       activities. Since we go through the Activity Stream backwards, we
    #       only process the most recent Activity per IIIF doc.
    #       (Not doing so might lead to, for example, trying to process a
    #       Create for a document whose Delete was processed just before.)
    seen_activity_objs = []
    last_activity_time_as = None
    # for all AS pages
    while True:
        # for all AS items
        log('going through AS page {}'.format(as_ocp['id']))
        for activity in as_ocp['orderedItems']:
            if activity['type'] in ['Create', 'Update', 'Delete']:
                # Reduce noise
                log('going through {} item {}'.format(activity['type'],
                                                      activity['id']))
            activity_end_time = dateutil.parser.parse(activity['endTime'])
            # if we haven't seen it yet and it's about a Curation
            if activity_end_time > last_activity_time_db and \
                    activity['object']['@type'] == 'cr:Curation' and \
                    activity['object'] not in seen_activity_objs:
                if last_activity_time_as is None:
                    # b/c we're going backwards (i.e. from new to old)
                    last_activity_time_as = activity_end_time
                new_activity = True
                if activity['type'] == 'Create':
                    log('Create')
                    new_canvases += process_curation_create(activity,
                                                            db_engine)
                elif activity['type'] == 'Update':
                    log('Update')
                    log(' ≈Delete')
                    process_curation_delete(activity, db_engine)
                    log(' +Create')
                    new_canvases += process_curation_create(activity,
                                                            db_engine)
                elif activity['type'] == 'Delete':
                    log('Delete')
                    process_curation_delete(activity, db_engine)
                seen_activity_objs.append(activity['object'])
            else:
                if activity['type'] in ['Create', 'Update', 'Delete']:
                    # Reduce noise
                    log('skipping')
        if not as_ocp.get('prev', False):
            break
        as_ocp = get_referenced(as_ocp, 'prev')

    if last_activity_time_as is None:
        last_activity_time_as = last_activity_time_db
    # persist crawl log
    if not last_activity_db:
        last_activity_update = sqla_text('''
            INSERT INTO last_activity_times(acticity_stream_url, last_activity)
            VALUES (:as_url, :new_time)''')
    else:
        last_activity_update = sqla_text('''
            UPDATE last_activity_times
            SET last_activity=:new_time
            WHERE acticity_stream_url=:as_url
            ''')
    last_activity_db = db_engine.execute(
        last_activity_update,
        new_time=last_activity_time_as.isoformat(),
        as_url=as_url)

    if new_activity:
        pass  # foo
    else:
        pass
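Since the curations table earlier in this code already relies on PostgreSQL's ON CONFLICT, the insert-or-update branch at the end of crawl_single could likely be expressed as a single upsert. This sketch assumes a UNIQUE constraint on acticity_stream_url, which the snippet does not show; the helper name is hypothetical.

def upsert_last_activity(as_url, last_activity_time, db_engine):
    # Sketch assuming PostgreSQL and a UNIQUE constraint on
    # acticity_stream_url (neither is guaranteed by the code above).
    q = sqla_text('''
        INSERT INTO last_activity_times(acticity_stream_url, last_activity)
        VALUES (:as_url, :new_time)
        ON CONFLICT (acticity_stream_url)
        DO UPDATE SET last_activity = :new_time''')
    db_engine.execute(q,
                      as_url=as_url,
                      new_time=last_activity_time.isoformat())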