Example #1
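Bulk-migrates one chunk of legacy records: it temporarily disconnects the per-record signal receivers, creates each record, collects Elasticsearch bulk actions, commits the database session once for the whole chunk, and reconnects the receivers afterwards. `create_record`, `logger`, `db`, and `es` are module-level names in the surrounding module and are not shown in the snippet.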
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    # Temporarily disconnect the per-record receivers so that the bulk
    # migration does not fire modification tracking or citation-count
    # updates for every record.
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for record in chunk:
            recid = json = None
            try:
                recid, json = create_record(record,
                                            force=True, dry_run=dry_run)
                index = get_record_index(json) or \
                    cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                before_record_index.send(recid, json=json, index=index)
                json.update({'_index': index, '_type': 'record',
                             '_id': recid, 'citation_count': 0})
                records_to_index.append(json)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if broken_output:
                    # Record the failing raw payload for later inspection;
                    # the context manager closes the file after each write.
                    with open(broken_output, "a") as broken_output_fd:
                        print(record, file=broken_output_fd)

        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Reconnect the receivers and release the session, even when the
        # chunk failed part-way through.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
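A minimal driver for this function slices an iterable of raw records into fixed-size chunks and migrates them one at a time. The sketch below is illustrative only: the `chunker` helper and the `raw_records` iterable are hypothetical, and in production the chunks would typically be dispatched to a task queue rather than processed in one loop.

def chunker(iterable, size):
    """Yield successive lists of at most `size` items (hypothetical helper)."""
    chunk = []
    for item in iterable:
        chunk.append(item)
        if len(chunk) == size:
            yield chunk
            chunk = []
    if chunk:
        yield chunk

# raw_records is assumed to be an iterable of legacy record payloads.
for chunk in chunker(raw_records, 200):
    migrate_chunk(chunk, broken_output="/tmp/broken_records.txt")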
Example #2
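Indexes a Holding Pen workflow object into Elasticsearch: objects without a registered workflow, or still in their initial version, are skipped; otherwise the receiver assembles a metadata record, lets the workflow contribute record and sort data, resolves the target index from the record's collections, fires `before_record_index`, and indexes the document under the Holding Pen prefix. `ObjectVersion` and `current_app` are module-level names in the surrounding module.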
def index_holdingpen_record(sender, **kwargs):
    """Index a Holding Pen record."""
    from invenio_ext.es import es
    from invenio_records.api import Record
    from invenio_records.signals import before_record_index
    from invenio_records.recordext.functions.get_record_collections import (
        get_record_collections,
    )
    from invenio_records.tasks.index import get_record_index

    from invenio_workflows.registry import workflows

    if not sender.workflow:
        # No workflow registered to object yet. Skip indexing
        return

    if sender.version == ObjectVersion.INITIAL:
        # Ignore initial versions
        return

    workflow = workflows.get(sender.workflow.name)
    if not workflow:
        current_app.logger.info(
            "Workflow {0} not found for sender: {1}".format(
                sender.workflow.name, sender.id
            )
        )
        return

    if not hasattr(sender, 'data'):
        sender.data = sender.get_data()
    if not hasattr(sender, 'extra_data'):
        sender.extra_data = sender.get_extra_data()

    record = Record({})
    record["version"] = ObjectVersion.name_from_version(sender.version)
    record["type"] = sender.data_type
    record["status"] = sender.status
    record["created"] = sender.created.isoformat()
    record["modified"] = sender.modified.isoformat()
    record["uri"] = sender.uri
    record["id_workflow"] = sender.id_workflow
    record["id_user"] = sender.id_user
    record["id_parent"] = sender.id_parent
    record["workflow"] = sender.workflow.name
    try:
        record.update(workflow.get_record(sender))
    except Exception as err:
        current_app.logger.exception(err)

    try:
        record.update(workflow.get_sort_data(sender))
    except Exception as err:
        current_app.logger.exception(err)

    # Add collection to get correct mapping
    record["_collections"] = get_record_collections(record)

    # Depends on "_collections" being filled correctly for record
    record_index = get_record_index(record) or \
        current_app.config["SEARCH_ELASTIC_DEFAULT_INDEX"]

    # Trigger any before_record_index receivers
    before_record_index.send(sender.id, json=record, index=record_index)

    if record_index:
        index = current_app.config['WORKFLOWS_HOLDING_PEN_ES_PREFIX'] + record_index
        es.index(
            index=index,
            doc_type=current_app.config["WORKFLOWS_HOLDING_PEN_DOC_TYPE"],
            body=dict(record),
            id=sender.id
        )
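Since the function has the Blinker receiver signature (the signalling object arrives as `sender`), wiring it up looks roughly like the sketch below. The signal name is an assumption for illustration; the snippet above does not show which signal the application actually connects the receiver to.

import blinker

# Hypothetical wiring: the signal name is an assumption, not taken from
# the snippet above.
holdingpen_object_saved = blinker.signal('holdingpen_object_saved')
holdingpen_object_saved.connect(index_holdingpen_record)

# After saving a workflow object `obj`, emitting the signal triggers indexing.
holdingpen_object_saved.send(obj)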
Example #3
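A variant of the `migrate_chunk` in Example #1: this revision parses each raw MARCXML string itself, tracks per-record migration status in an `InspireProdRecords` row, validates records on creation, and skips both status tracking and the final bulk indexing during a dry run. `marc_create_record`, `create_record`, `InspireProdRecords`, `logger`, `db`, and `es` are module-level names in the surrounding module; note that the `broken_output` parameter is accepted but unused in this revision.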
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert
    # As in Example #1: mute the per-record receivers for the duration of
    # the bulk load.
    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, json = create_record(
                        recid, record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if dry_run:
                        continue
                    prod_record.valid = not errors
                    prod_record.errors = errors
                    index = get_record_index(json) or \
                        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(recid, json=json, index=index)
                    json.update({'_index': index, '_type': 'record',
                                 '_id': recid, 'citation_count': 0})
                    records_to_index.append(json)
                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        # Restore the receivers and close the session regardless of outcome.
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
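Because a dry run parses and validates without persisting or indexing, a common pattern is to sweep the whole corpus in dry-run mode first and only then migrate for real. As in the sketch after Example #1, `chunker` and `raw_records` are hypothetical.

# Hypothetical driver: validate everything first, then migrate for real.
for chunk in chunker(raw_records, 200):
    migrate_chunk(chunk, dry_run=True)   # parse and validate only

for chunk in chunker(raw_records, 200):
    migrate_chunk(chunk, dry_run=False)  # persist, track status, and index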