Example #1
def index_canvas(uri, manifest_uri, db_engine):
    q = sqla_text('''
        INSERT INTO canvases(jsonld_id, manifest_jsonld_id)
        VALUES (:cid, :mid)
        ON CONFLICT (jsonld_id, manifest_jsonld_id)
        DO NOTHING''')
    db_engine.execute(q, cid=uri, mid=manifest_uri)
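A minimal usage sketch for the helper above, assuming SQLAlchemy 1.x (where engine.execute() accepts keyword bind parameters) and a PostgreSQL database that already has the canvases table; the connection URI and the IIIF URIs are made up:

from sqlalchemy import create_engine
from sqlalchemy.sql import text as sqla_text

db_engine = create_engine('postgresql://user:pass@localhost/iiifdb')  # hypothetical URI
index_canvas('http://example.org/iiif/canvas/p1',
             'http://example.org/iiif/manifest',
             db_engine)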
Example #2
def index_curation(uri, db_engine):
    q = sqla_text('''
        INSERT INTO curations(jsonld_id)
        VALUES (:cid)
        ON CONFLICT (jsonld_id)
        DO NOTHING''')
    db_engine.execute(q, cid=uri)
Example #3
def execute_sql_file(path):
    # This helper is typically used within a multiprocessing pool; create a new database
    # engine for each job.
    db.engine.dispose()
    print('Running {}'.format(path))
    with open(path) as fp:
        cmd = '\n'.join(
            [line for line in fp.readlines() if not line.startswith('--')])
        db.engine.execute(sqla_text(cmd), **SQL_CONFIG)
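A brief usage sketch of the multiprocessing pattern the comment above refers to; the SQL file names are hypothetical, and execute_sql_file plus the Flask-SQLAlchemy db object are assumed to be importable as in the example:

from multiprocessing import Pool

sql_files = ['create_tables.sql', 'create_indexes.sql']  # hypothetical paths

with Pool(processes=2) as pool:
    pool.map(execute_sql_file, sql_files)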
Example #4
File: recorder.py  Project: CERT-Polska/n6
 def ping_connection(self):
     """
     Required to maintain the connection to MySQL.
     Performs a ping before each query to the database. If an
     OperationalError occurs, removes the session and connects again.
     """
     try:
         self.session_db.execute(sqla_text("SELECT 1"))
     except OperationalError as exc:
         # OperationalError: (2006, 'MySQL server has gone away')
         LOGGER.warning("Database server went away: %a", exc)
         LOGGER.info("Reconnect to server")
         self.session_db.remove()
         try:
             self.session_db.execute(sqla_text("SELECT 1"))
         except SQLAlchemyError as exc:
             LOGGER.error("Could not reconnect to the MySQL database: %s",
                          make_exc_ascii_str(exc))
             sys.exit(1)
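A related design note: SQLAlchemy 1.2+ can perform an equivalent liveness check transparently with pool_pre_ping, which pings each pooled connection on checkout and replaces it if the server has gone away. A hedged sketch, with a made-up MySQL URI:

from sqlalchemy import create_engine

# pool_pre_ping tests every connection before it is handed out and
# transparently reconnects when the server has gone away.
engine = create_engine('mysql://user:pass@localhost/n6db', pool_pre_ping=True)  # hypothetical URI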
Example #5
def deintex_curation(cur_uri, db_engine):
    # get Curation DB ID
    s_cur = sqla_text('''
        SELECT id
        FROM curations
        WHERE jsonld_id=:cid''')
    cur_db_id = db_engine.execute(s_cur, cid=cur_uri).fetchone()
    if cur_db_id:
        cur_db_id = cur_db_id[0]
        # delete Curation elements ("cutouts")
        d_cel = sqla_text('''
            DELETE FROM curation_elements
            WHERE curation_id=:cdbid''')
        db_engine.execute(d_cel, cdbid=cur_db_id)
        # delete Curation
        d_cur = sqla_text('''
            DELETE FROM curations
            WHERE jsonld_id=:jid''')
        db_engine.execute(d_cur, jid=cur_uri)
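A hedged variant of the two DELETEs above: running them inside one transaction (engine.begin() in SQLAlchemy 1.x) would avoid a partially de-indexed Curation if the second statement fails:

with db_engine.begin() as conn:
    # both DELETEs commit together or not at all
    conn.execute(d_cel, cdbid=cur_db_id)
    conn.execute(d_cur, jid=cur_uri)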
Example #6
def execute_sql_file(path):
    # This helper is typically used within a multiprocessing pool; create a new database
    # engine for each job.
    db.engine.dispose()
    print('Running {}'.format(path))
    with open(path) as fp:
        cmd = '\n'.join([
            line for line in fp.readlines()
            if not line.startswith('--')
        ])
        db.engine.execute(sqla_text(cmd), **SQL_CONFIG)
Example #7
def index_curation_element(cur_id, can_id, manifest_id, xywh, db_engine):
    s_cur = sqla_text('''
        SELECT id
        FROM curations
        WHERE jsonld_id=:cid''')
    cur_db_id = db_engine.execute(s_cur, cid=cur_id).fetchone()[0]
    s_can = sqla_text('''
        SELECT id
        FROM canvases
        WHERE jsonld_id=:cid AND manifest_jsonld_id=:mid''')
    can_db_id = db_engine.execute(s_can, cid=can_id,
                                  mid=manifest_id).fetchone()[0]
    x, y, w, h = [int(elem) for elem in xywh.split(',')]
    poly = ('ST_GeomFromText('
            '\'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))\')').format(
                x, y, x + w, y, x + w, y + h, x, y + h, x, y)
    # because the IDs are only numbers and the PostGIS polygon won't go
    # through sqla_text we use format here ¯\_(ツ)_/¯
    q = '''INSERT INTO curation_elements(canvas_id, curation_id, area)
        VALUES ({}, {}, {})'''.format(can_db_id, cur_db_id, poly)
    db_engine.execute(q)
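A hedged alternative to the string formatting above: the polygon WKT can be passed as an ordinary bound string parameter into ST_GeomFromText(), so only the geometry text, not the SQL statement, is built in Python (same schema as the example, PostGIS assumed):

wkt = 'POLYGON(({0} {1}, {2} {1}, {2} {3}, {0} {3}, {0} {1}))'.format(
    x, y, x + w, y + h)
q = sqla_text('''
    INSERT INTO curation_elements(canvas_id, curation_id, area)
    VALUES (:can, :cur, ST_GeomFromText(:wkt))''')
db_engine.execute(q, can=can_db_id, cur=cur_db_id, wkt=wkt)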
Example #8
 def _setup_db(self):
     wait_timeout = int(self.config.get("wait_timeout", self._DEFAULT_WAIT_TIMEOUT))
     wait_timeout = min(max(wait_timeout, self._MIN_WAIT_TIMEOUT), self._MAX_WAIT_TIMEOUT)
     # (`pool_recycle` should be significantly less than `wait_timeout`)
     pool_recycle = wait_timeout // 2
     engine = create_engine(
         self.config["uri"],
         connect_args=dict(
             charset=self.config.get(
                 "connect_charset",
                 N6DataBackendAPI.EVENT_DB_LEGACY_CHARSET),
             use_unicode=True,
             binary_prefix=True,
             cursorclass=N6RecorderCursor),
         pool_recycle=pool_recycle,
         echo=str_to_bool(self.config.get("echo", "false")))
     self._install_session_variables_setter(
         engine,
         wait_timeout=wait_timeout,
         time_zone="'+00:00'")
     session_db = N6DataBackendAPI.configure_db_session(engine)
     session_db.execute(sqla_text("SELECT 1"))  # Let's crash early if db is misconfigured.
     return session_db
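The _install_session_variables_setter call above is not shown in this example; a hypothetical sketch of how such a setter could be wired up with SQLAlchemy's connect pool event (the actual n6 implementation may differ):

from sqlalchemy import event

def install_session_variables_setter(engine, **variables):
    # Standalone variant of the method used above (hypothetical).
    @event.listens_for(engine, 'connect')
    def set_session_variables(dbapi_connection, connection_record):
        # Runs once for every new DBAPI connection taken from the pool.
        cursor = dbapi_connection.cursor()
        try:
            for name, value in variables.items():
                cursor.execute('SET SESSION {} = {}'.format(name, value))
        finally:
            cursor.close()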
Example #9
def execute_sql_file(path):
    print('Running {}'.format(path))
    with open(path) as fp:
        cmd = '\n'.join(
            [line for line in fp.readlines() if not line.startswith('--')])
        db.engine.execute(sqla_text(cmd))
Example #10
def index():
    canvas_uri_raw = request.args.get('canvas')
    area_xywh = request.args.get('xywh')
    if not canvas_uri_raw:
        return abort(400)
    canvas_uri = urllib.parse.unquote(canvas_uri_raw)

    cfg = Cfg()
    db_engine = create_engine(cfg.cfg['db_uri'])

    q_can = sqla_text('''
        SELECT id, manifest_jsonld_id
        FROM canvases
        WHERE jsonld_id=:can_uri
        ''')
    can_db_tpls = db_engine.execute(
        q_can,
        can_uri=canvas_uri
        ).fetchall()
    if not can_db_tpls:
        return abort(404)  # FIXME not there, respond accordingly
    else:
        if len(can_db_tpls) == 1:
            can_db_id = int(can_db_tpls[0]['id'])
            can_db_man_jsonld_id = can_db_tpls[0]['manifest_jsonld_id']
        else:
            print('multiple canvases w/ same ID (!!!)')  # FIXME problem

    area_query_insert = ''
    if area_xywh:
        x, y, w, h = [int(elem) for elem in area_xywh.split(',')]
        poly = ('ST_GeomFromText('
            '\'POLYGON(({} {}, {} {}, {} {}, {} {}, {} {}))\')').format(
            x, y,
            x+w, y,
            x+w, y+h,
            x, y+h,
            x, y
            )
        area_query_insert = 'ST_Within(area, {}) and '.format(poly)
    q_area = '''SELECT curations.jsonld_id as uri, areajson
        FROM curations
        JOIN
            (SELECT curation_id, ST_AsGeoJSON(area) as areajson
            FROM curation_elements
            WHERE {} canvas_id = {}) as cue
        ON curations.id = cue.curation_id;
        '''.format(area_query_insert, can_db_id)
    cur_uris = db_engine.execute(q_area).fetchall()

    backlinks_flat = []
    for row in cur_uris:
        uri = row['uri']
        area = json.loads(row['areajson'])
        backlinks_flat.append([uri, area])
    # backlinks_by_uri = {}
    # for bl in backlinks_flat:
    #     uri, area = bl
    #     if uri not in backlinks_by_uri:
    #         backlinks_by_uri[uri] = {'areas':[]}
    #     backlinks_by_uri[uri]['areas'].append(area)
    backlinks_by_area = {}
    for bl in backlinks_flat:
        uri, area = bl
        coords = area['coordinates'][0]
        if len(coords) != 5:
            print('unexpected polygon shape (!!!)')  # FIXME problem
        p1, p2, p3, p4, p5 = coords
        xywh = '{},{},{},{}'.format(p1[0], p1[1], p2[0]-p1[0], p3[1]-p1[1])
        if xywh not in backlinks_by_area:
            backlinks_by_area[xywh] = []
        backlinks_by_area[xywh].append(uri)
    display_curation = build_annotation_container_curation(
        canvas_uri,
        can_db_man_jsonld_id,
        backlinks_by_area,
        request.url,
        request.base_url)

    # ret = {
    #     'canvas': canvas_uri,
    #     'curations_backlinks': backlinks_by_area
    #     }

    return jsonify(display_curation)
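A hypothetical client call against this Flask view, assuming the app serves it at the site root on localhost:5000; the canvas URI and region are made up:

import requests

resp = requests.get('http://localhost:5000/',
                    params={'canvas': 'http://example.org/iiif/canvas/p1',
                            'xywh': '10,20,300,400'})
print(resp.json())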
Example #11
def crawl_single(as_url, db_engine):
    """ Crawl, given a URL to an Activity Stream
    """

    last_activity_query = sqla_text('''
        SELECT last_activity
        FROM last_activity_times
        WHERE acticity_stream_url=:as_url
        ''')

    last_activity_db = db_engine.execute(last_activity_query,
                                         as_url=as_url).fetchone()

    if not last_activity_db:
        last_activity_time_db = datetime.datetime.fromtimestamp(0)
        log('First time crawling this Activity Stream.')
    else:
        last_activity_time_db = dateutil.parser.parse(last_activity_db[0])
        log('Last crawled activity at {}.'.format(last_activity_time_db))

    log('Retrieving Activity Stream ({})'.format(as_url))
    try:
        resp = requests.get(as_url)
    except requests.exceptions.RequestException as e:
        msg = 'Could not access Activity Stream. ({})'.format(e)
        log(msg)
        print(msg)
        return
    if resp.status_code != 200:
        msg = ('Could not access Activity Stream. (HTTP {})').format(
            resp.status_code)
        log(msg)
        print(msg)
        return

    as_oc = resp.json()
    log('Start iterating over Activity Stream pages')
    as_ocp = get_referenced(as_oc, 'last')

    new_canvases = 0
    new_activity = False
    # NOTE: seen_activity_objs is used to prevent processing obsolete
    #       activities. Since we go through the Activity Stream backwards, we
    #       only process the most recent Activity per IIIF doc.
    #       (Not doing so might lead to for example trying to process a Create
    #       for a document for which a Delete was processed just before.)
    seen_activity_objs = []
    last_activity_time_as = None
    # for all AS pages
    while True:
        # for all AC items
        log('going through AS page {}'.format(as_ocp['id']))
        for activity in as_ocp['orderedItems']:
            if activity['type'] in ['Create', 'Update', 'Delete']:
                # Reduce noise
                log('going through {} item {}'.format(activity['type'],
                                                      activity['id']))
            activity_end_time = dateutil.parser.parse(activity['endTime'])
            # if we haven't seen it yet and it's about a Curation
            if activity_end_time > last_activity_time_db and \
                    activity['object']['@type'] == 'cr:Curation' and \
                    activity['object'] not in seen_activity_objs:
                if last_activity_time_as is None:
                    # b/c we're going backwards (i.e. from new to old)
                    last_activity_time_as = activity_end_time
                new_activity = True
                if activity['type'] == 'Create':
                    log('Create')
                    new_canvases += process_curation_create(
                        activity, db_engine)
                elif activity['type'] == 'Update':
                    log('Update')
                    log(' ≈Delete')
                    process_curation_delete(activity, db_engine)
                    log(' +Create')
                    new_canvases += process_curation_create(
                        activity, db_engine)
                elif activity['type'] == 'Delete':
                    log('Delete')
                    process_curation_delete(activity, db_engine)
                seen_activity_objs.append(activity['object'])
            else:
                if activity['type'] in ['Create', 'Update', 'Delete']:
                    # Reduce noise
                    log('skipping')

        if not as_ocp.get('prev', False):
            break
        as_ocp = get_referenced(as_ocp, 'prev')

    if last_activity_time_as is None:
        last_activity_time_as = last_activity_time_db
    # persist crawl log
    if not last_activity_db:
        last_activity_update = sqla_text('''
            INSERT INTO last_activity_times(acticity_stream_url, last_activity)
            VALUES (:as_url, :new_time)''')
    else:
        last_activity_update = sqla_text('''
            UPDATE last_activity_times
            SET last_activity=:new_time
            WHERE acticity_stream_url=:as_url
            ''')

    last_activity_db = db_engine.execute(
        last_activity_update,
        new_time=last_activity_time_as.isoformat(),
        as_url=as_url)

    if new_activity:
        pass
        # foo
    else:
        pass
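A minimal usage sketch for the crawler above; the Activity Stream URL and database URI are hypothetical, and crawl_single expects an SQLAlchemy 1.x engine plus the last_activity_times table used in the queries:

from sqlalchemy import create_engine

db_engine = create_engine('postgresql://user:pass@localhost/iiifdb')  # hypothetical URI
crawl_single('http://example.org/activity-stream/collection.json', db_engine)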