Example #1
def continuous_migration():
    """Task to continuously migrate what is pushed up by Legacy."""
    from redis import StrictRedis
    redis_url = current_app.config.get('CACHE_REDIS_URL')
    r = StrictRedis.from_url(redis_url)

    try:
        while r.llen('legacy_records'):
            raw_record = r.lpop('legacy_records')
            # The record might be None if a parallel continuous_migration
            # task has already consumed the queue.
            if raw_record:
                raw_record = zlib.decompress(raw_record)
                record = marc_create_record(raw_record, keep_singletons=False)
                recid = int(record['001'][0])
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
                try:
                    with db.session.begin_nested():
                        errors, dummy = create_record(
                            record, force=True, validation=True
                        )
                        logger.info("Successfully migrated record {}".format(recid))
                        prod_record.successful = True
                        prod_record.valid = not errors
                        prod_record.errors = errors
                        db.session.merge(prod_record)
                except Exception as err:
                    logger.error("Error when migrating record {}".format(recid))
                    logger.exception(err)
                    prod_record.successful = False
                    db.session.merge(prod_record)
    finally:
        db.session.commit()
        db.session.close()
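A minimal sketch of the producer side that feeds this task, assuming Legacy pushes zlib-compressed MARCXML onto the same 'legacy_records' Redis list that the loop above drains; the key name and compression come from the snippet, the function itself is illustrative:

import zlib

from redis import StrictRedis


def push_legacy_record(marcxml, redis_url):
    """Hypothetical producer: compress a MARCXML blob and enqueue it."""
    r = StrictRedis.from_url(redis_url)
    # rpush pairs with the consumer's lpop, giving FIFO ordering.
    r.rpush('legacy_records', zlib.compress(marcxml))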
Example #2
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from invenio_indexer.api import RecordIndexer

    from ..pidstore.minters import inspire_recid_minter

    indexer = RecordIndexer()

    index_queue = []
    rec_uuid = None  # Stays None if the chunk is empty.
    for raw_record in chunk:
        record = marc_create_record(raw_record, keep_singletons=False)
        json_record = create_record(record)
        if '$schema' in json_record:
            json_record['$schema'] = url_for(
                'invenio_jsonschemas.get_schema',
                schema_path="records/{0}".format(json_record['$schema'])
            )
        rec_uuid = str(Record.create(json_record, id_=None).id)

        # Create persistent identifier.
        pid = inspire_recid_minter(rec_uuid, json_record)

        index_queue.append(pid.object_uuid)

        db.session.commit()

    # Request record indexing
    for i in index_queue:
        indexer.index_by_id(i)

    # Return the last record UUID so a follow-up task (e.g. file
    # migration) can be dispatched for it.
    return rec_uuid
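A sketch of how migrate_chunk might be driven, assuming the caller holds a list of raw MARCXML strings; the chunking helper and the chunk size are illustrative, not from the source:

def chunker(records, size):
    """Hypothetical helper: yield successive slices of `size` records."""
    for i in range(0, len(records), size):
        yield records[i:i + size]


# raw_records is assumed to be a list of MARCXML strings obtained elsewhere.
for chunk in chunker(raw_records, 200):
    migrate_chunk(chunk)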
Example #3
def continuous_migration():
    """Task to continuously migrate what is pushed up by Legacy."""
    indexer = RecordIndexer()
    redis_url = current_app.config.get('CACHE_REDIS_URL')
    r = StrictRedis.from_url(redis_url)

    try:
        while r.llen('legacy_records'):
            raw_record = r.lpop('legacy_records')
            # The record might be None if a parallel continuous_migration
            # task has already consumed the queue.
            if raw_record:
                # FIXME use migrate_and_insert_record(raw_record)
                raw_record = zlib.decompress(raw_record)
                record = marc_create_record(raw_record, keep_singletons=False)
                recid = int(record['001'][0])
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
                json_record = create_record(record)
                with db.session.begin_nested():
                    try:
                        record = record_upsert(json_record)
                    except ValidationError as e:
                        # Invalid record, will not get indexed
                        errors = "ValidationError: Record {0}: {1}".format(
                            recid, e
                        )
                        prod_record.valid = False
                        prod_record.errors = errors
                        db.session.merge(prod_record)
                        continue
                indexer.index_by_id(record.id)
    finally:
        db.session.commit()
        db.session.close()
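The FIXME above points at migrate_and_insert_record, shown in Example #5 below. A sketch of the loop body rewritten the way the comment suggests; this is an assumption about the intended refactor, not code from the source:

while r.llen('legacy_records'):
    raw_record = r.lpop('legacy_records')
    if raw_record:
        raw_record = zlib.decompress(raw_record)
        # migrate_and_insert_record parses the MARC, converts it, upserts
        # it and records any errors; it returns None on failure.
        record = migrate_and_insert_record(raw_record)
        if record is not None:
            indexer.index_by_id(record.id)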
Example #4
def create_record(data, force=False, dry_run=False):
    record = marc_create_record(data)
    recid = None
    if '001' in record:
        recid = int(record['001'][0])
    if not dry_run and recid:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = data
    try:
        if _collection_in_record(record, 'institution'):
            json = strip_empty_values(institutions.do(record))
        elif _collection_in_record(record, 'experiment'):
            json = strip_empty_values(experiments.do(record))
        elif _collection_in_record(record, 'journals'):
            json = strip_empty_values(journals.do(record))
        elif _collection_in_record(record, 'hepnames'):
            json = strip_empty_values(hepnames.do(record))
        elif _collection_in_record(record, 'job') or \
                _collection_in_record(record, 'jobhidden'):
            json = strip_empty_values(jobs.do(record))
        elif _collection_in_record(record, 'conferences'):
            json = strip_empty_values(conferences.do(record))
        else:
            json = strip_empty_values(hep.do(record))
        if dry_run:
            return recid, json

        if force and any(key in json for key in ('control_number', 'recid')):
            try:
                control_number = json['control_number']
            except KeyError:
                control_number = json['recid']
            control_number = int(control_number)
            # Check whether the record already exists.
            record = Record.get_record(control_number)
            if record is None:
                # Adds the record to the db session.
                rec = RecordModel(id=control_number)
                db.session.merge(rec)
                record = Record.create(json)
            else:
                record = Record(json, model=record.model)
                record.commit()
            if recid:
                prod_record.successful = True
                db.session.merge(prod_record)
            logger.info("Elaborated record {}".format(control_number))
            return control_number, dict(record)
    except Exception:
        if recid:
            prod_record.successful = False
            db.session.merge(prod_record)
            logger.exception("Error in elaborating record ID {}".format(recid))
        raise
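A usage sketch for create_record; the variable names are illustrative. With dry_run=True the function only converts and returns the (recid, json) pair via the early return above, while force=True also writes the record, reusing the legacy control number as its id:

# Conversion only: nothing is written to the database.
recid, json_record = create_record(raw_marcxml, dry_run=True)

# Convert and upsert in one call.
control_number, record_dict = create_record(raw_marcxml, force=True)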
Example #5
def migrate_and_insert_record(raw_record):
    """Convert a marc21 record to JSON and insert it into the DB."""
    try:
        record = marc_create_record(raw_record, keep_singletons=False)
    except Exception as e:
        logger.exception('Migrator MARC 21 read Error')
        return None

    recid = int(record['001'])
    prod_record = InspireProdRecords(recid=recid)
    prod_record.marcxml = raw_record
    error = None

    try:
        json_record = create_record(record)
    except Exception as e:
        logger.exception('Migrator DoJSON Error')
        error = e

    try:
        if not error:
            record = record_upsert(json_record)
    except ValidationError as e:
        # Aggregate logs by part of schema being validated.
        pattern = u'Migrator Validation Error: {} on {}: Value: %r, Record: %r'
        logger.error(pattern.format('.'.join(e.schema_path),
                                    e.validator_value),
                     e.instance, recid)
        error = e
    except Exception as e:
        # Receivers can always raise, and a single broken record should
        # not take down the entire chunk.
        logger.exception('Migrator Record Insert Error')
        error = e

    if error:
        # Invalid record, will not get indexed.
        error_str = u'{0}: Record {1}: {2}'.format(type(error), recid, error)
        prod_record.valid = False
        prod_record.errors = error_str
        db.session.merge(prod_record)
        return None
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
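Note that migrate_and_insert_record only merges the bookkeeping row and never commits; committing is left to the caller, as in the continuous_migration examples. A minimal caller sketch under that assumption:

try:
    record = migrate_and_insert_record(raw_record)
    if record is not None:
        indexer.index_by_id(record.id)
finally:
    # The function itself never commits, so the caller must.
    db.session.commit()
    db.session.close()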
Example #6
def migrate_chunk(chunk):
    index_queue = []
    try:
        for raw_record in chunk:
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            prod_record = InspireProdRecords(recid=recid)
            prod_record.marcxml = raw_record
            json_record = create_record(record)
            with db.session.begin_nested():
                try:
                    record = record_upsert(json_record)
                except ValidationError as e:
                    # Invalid record, will not get indexed
                    errors = "ValidationError: Record {0}: {1}".format(
                        recid, e
                    )
                    current_app.logger.warning(errors)
                    prod_record.valid = False
                    prod_record.errors = errors
                    db.session.merge(prod_record)
                    continue

                index_queue.append(create_index_op(record))

                prod_record.valid = True
                db.session.merge(prod_record)
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        current_search_client,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )
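create_index_op is not shown in these examples. By analogy with the bulk actions built inline in Example #7 below (the '_index', '_type' and '_id' keys), a plausible sketch producing an action dict for elasticsearch.helpers.bulk would be:

def create_index_op(record):
    """Hypothetical: build a bulk 'index' action from an upserted record."""
    op = dict(record)  # the record is dict-like; copy its JSON body
    op.update({
        '_index': 'records-hep',  # assumed index name
        '_type': 'record',
        '_id': str(record.id),
    })
    return op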
Example #7
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, json = create_record(
                        recid, record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if dry_run:
                        continue
                    prod_record.valid = not errors
                    prod_record.errors = errors
                    index = get_record_index(json) or \
                        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(recid, json=json, index=index)
                    json.update({'_index': index, '_type': 'record',
                                 '_id': recid, 'citation_count': 0})
                    records_to_index.append(json)
                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
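The disconnect/reconnect bracket around the signal handlers is easy to get wrong when the handler list changes. A small context manager expressing the same pattern, as an illustrative refactor rather than code from the source:

from contextlib import contextmanager


@contextmanager
def signals_disconnected(pairs):
    """Hypothetical: temporarily disconnect (signal, handler) pairs."""
    for signal, handler in pairs:
        signal.disconnect(handler)
    try:
        yield
    finally:
        for signal, handler in pairs:
            signal.connect(handler)

The manual bookkeeping above would then collapse to a single with signals_disconnected([(models_committed, record_modification), ...]): block wrapping the migration loop.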