def continuous_migration(): """Task to continuously migrate what is pushed up by Legacy.""" from redis import StrictRedis redis_url = current_app.config.get('CACHE_REDIS_URL') r = StrictRedis.from_url(redis_url) try: while r.llen('legacy_records'): raw_record = r.lpop('legacy_records') if raw_record: # The record might be None, in case a parallel # continuous_migration task has already consumed the queue. raw_record = zlib.decompress(raw_record) record = marc_create_record(raw_record, keep_singletons=False) recid = int(record['001'][0]) prod_record = InspireProdRecords(recid=recid) prod_record.marcxml = raw_record try: with db.session.begin_nested(): errors, dummy = create_record( record, force=True, validation=True ) logger.info("Successfully migrated record {}".format(recid)) prod_record.successful = True prod_record.valid = not errors prod_record.errors = errors db.session.merge(prod_record) except Exception as err: logger.error("Error when migrating record {}".format(recid)) logger.exception(err) prod_record.successful = False db.session.merge(prod_record) finally: db.session.commit() db.session.close()
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from invenio_indexer.api import RecordIndexer
    from ..pidstore.minters import inspire_recid_minter

    indexer = RecordIndexer()
    index_queue = []
    for raw_record in chunk:
        record = marc_create_record(raw_record, keep_singletons=False)
        json_record = create_record(record)
        if '$schema' in json_record:
            json_record['$schema'] = url_for(
                'invenio_jsonschemas.get_schema',
                schema_path="records/{0}".format(json_record['$schema'])
            )
        rec_uuid = str(Record.create(json_record, id_=None).id)
        # Create persistent identifier.
        pid = inspire_recid_minter(rec_uuid, json_record)
        index_queue.append(pid.object_uuid)
    db.session.commit()

    # Request record indexing
    for i in index_queue:
        indexer.index_by_id(i)

    # Send task to migrate files.
    return rec_uuid
def continuous_migration(): """Task to continuously migrate what is pushed up by Legacy.""" indexer = RecordIndexer() redis_url = current_app.config.get('CACHE_REDIS_URL') r = StrictRedis.from_url(redis_url) try: while r.llen('legacy_records'): raw_record = r.lpop('legacy_records') if raw_record: # FIXME use migrate_and_insert_record(raw_record) # The record might be None, in case a parallel # continuous_migration task has already consumed the queue. raw_record = zlib.decompress(raw_record) record = marc_create_record(raw_record, keep_singletons=False) recid = int(record['001'][0]) prod_record = InspireProdRecords(recid=recid) prod_record.marcxml = raw_record json_record = create_record(record) with db.session.begin_nested(): try: record = record_upsert(json_record) except ValidationError as e: # Invalid record, will not get indexed errors = "ValidationError: Record {0}: {1}".format( recid, e ) prod_record.valid = False prod_record.errors = errors db.session.merge(prod_record) continue indexer.index_by_id(record.id) finally: db.session.commit() db.session.close()
def create_record(data, force=False, dry_run=False):
    record = marc_create_record(data)
    recid = None
    if '001' in record:
        recid = int(record['001'][0])
    if not dry_run and recid:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = data
    try:
        if _collection_in_record(record, 'institution'):
            json = strip_empty_values(institutions.do(record))
        elif _collection_in_record(record, 'experiment'):
            json = strip_empty_values(experiments.do(record))
        elif _collection_in_record(record, 'journals'):
            json = strip_empty_values(journals.do(record))
        elif _collection_in_record(record, 'hepnames'):
            json = strip_empty_values(hepnames.do(record))
        elif _collection_in_record(record, 'job') or \
                _collection_in_record(record, 'jobhidden'):
            json = strip_empty_values(jobs.do(record))
        elif _collection_in_record(record, 'conferences'):
            json = strip_empty_values(conferences.do(record))
        else:
            json = strip_empty_values(hep.do(record))
        if dry_run:
            return recid, json

        if force and any(key in json for key in ('control_number', 'recid')):
            try:
                control_number = json['control_number']
            except KeyError:
                control_number = json['recid']
            control_number = int(control_number)
            # Searches if record already exists.
            record = Record.get_record(control_number)
            if record is None:
                # Adds the record to the db session.
                rec = RecordModel(id=control_number)
                db.session.merge(rec)
                record = Record.create(json)
            else:
                record = Record(json, model=record.model)
                record.commit()
        if recid:
            prod_record.successful = True
            db.session.merge(prod_record)
        logger.info("Elaborated record {}".format(control_number))
        return control_number, dict(record)
    except Exception:
        if recid:
            prod_record.successful = False
            db.session.merge(prod_record)
        logger.exception("Error in elaborating record ID {}".format(recid))
        raise
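
# Hedged usage sketch (not from the source) for this variant of create_record():
# with dry_run=True it converts the MARCXML and returns (recid, json) without
# writing anything to the database, which is convenient for previewing a
# conversion. `raw_marcxml` is a hypothetical MARCXML string used only for
# illustration.
#
#     recid, converted_json = create_record(raw_marcxml, dry_run=True)
#     logger.info("Dry run for %s produced keys: %s",
#                 recid, sorted(converted_json))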
def migrate_and_insert_record(raw_record):
    """Convert a marc21 record to JSON and insert it into the DB."""
    try:
        record = marc_create_record(raw_record, keep_singletons=False)
    except Exception:
        logger.exception('Migrator MARC 21 read Error')
        return None

    recid = int(record['001'])
    prod_record = InspireProdRecords(recid=recid)
    prod_record.marcxml = raw_record

    error = None
    try:
        json_record = create_record(record)
    except Exception as e:
        logger.exception('Migrator DoJSON Error')
        error = e

    try:
        if not error:
            record = record_upsert(json_record)
    except ValidationError as e:
        # Aggregate logs by part of schema being validated.
        pattern = u'Migrator Validation Error: {} on {}: Value: %r, Record: %r'
        logger.error(pattern.format('.'.join(e.schema_path), e.validator_value),
                     e.instance, recid)
        error = e
    except Exception as e:
        # Receivers can always cause exceptions and we could dump the entire
        # chunk because of a single broken record.
        logger.exception('Migrator Record Insert Error')
        error = e

    if error:
        # Invalid record, will not get indexed.
        error_str = u'{0}: Record {1}: {2}'.format(type(error), recid, error)
        prod_record.valid = False
        prod_record.errors = error_str
        db.session.merge(prod_record)
        return None
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
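
# Hedged sketch (not from the source): the FIXME in the earlier
# continuous_migration variant points at this helper. One way the Redis
# consumer loop could delegate to migrate_and_insert_record(), assuming the
# same module-level names (current_app, db, zlib, RecordIndexer) are
# available, might look like this:
def continuous_migration_via_helper():
    """Sketch: drain the legacy_records queue through migrate_and_insert_record."""
    from redis import StrictRedis

    indexer = RecordIndexer()
    redis_url = current_app.config.get('CACHE_REDIS_URL')
    r = StrictRedis.from_url(redis_url)
    try:
        while r.llen('legacy_records'):
            raw_record = r.lpop('legacy_records')
            if raw_record:
                # The record might be None, in case a parallel task has
                # already consumed the queue.
                record = migrate_and_insert_record(zlib.decompress(raw_record))
                if record:
                    indexer.index_by_id(record.id)
    finally:
        db.session.commit()
        db.session.close()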
def migrate_chunk(chunk):
    index_queue = []
    try:
        for raw_record in chunk:
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            prod_record = InspireProdRecords(recid=recid)
            prod_record.marcxml = raw_record
            json_record = create_record(record)
            with db.session.begin_nested():
                try:
                    record = record_upsert(json_record)
                except ValidationError as e:
                    # Invalid record, will not get indexed
                    errors = "ValidationError: Record {0}: {1}".format(
                        recid, e
                    )
                    current_app.logger.warning(errors)
                    prod_record.valid = False
                    prod_record.errors = errors
                    db.session.merge(prod_record)
                    continue
            index_queue.append(create_index_op(record))
            prod_record.valid = True
            db.session.merge(prod_record)
        db.session.commit()
    finally:
        db.session.close()

    req_timeout = current_app.config['INDEXER_BULK_REQUEST_TIMEOUT']
    es_bulk(
        current_search_client,
        index_queue,
        stats_only=True,
        request_timeout=req_timeout,
    )
def migrate_chunk(chunk, broken_output=None, dry_run=False):
    from flask_sqlalchemy import models_committed
    from invenio_records.receivers import record_modification
    from invenio_records.tasks.index import get_record_index
    from invenio.base.globals import cfg
    from elasticsearch.helpers import bulk as es_bulk
    from inspirehep.modules.citations.receivers import (
        catch_citations_insert,
        add_citation_count_on_insert_or_update,
        catch_citations_update
    )
    from invenio_records.signals import before_record_index, after_record_insert

    models_committed.disconnect(record_modification)
    after_record_insert.disconnect(catch_citations_insert)
    before_record_index.disconnect(add_citation_count_on_insert_or_update)
    before_record_index.disconnect(catch_citations_update)

    records_to_index = []
    try:
        for raw_record in chunk:
            json = None
            record = marc_create_record(raw_record, keep_singletons=False)
            recid = int(record['001'])
            if not dry_run:
                prod_record = InspireProdRecords(recid=recid)
                prod_record.marcxml = raw_record
            try:
                with db.session.begin_nested():
                    errors, recid, json = create_record(
                        recid, record, force=True,
                        dry_run=dry_run, validation=True
                    )
                    if dry_run:
                        continue
                    prod_record.valid = not errors
                    prod_record.errors = errors
                    index = get_record_index(json) or \
                        cfg['SEARCH_ELASTIC_DEFAULT_INDEX']
                    before_record_index.send(recid, json=json, index=index)
                    json.update({'_index': index, '_type': 'record',
                                 '_id': recid, 'citation_count': 0})
                    records_to_index.append(json)
                    prod_record.successful = True
                    db.session.merge(prod_record)
            except Exception as err:
                logger.error("ERROR with record {} and json {}".format(recid, json))
                logger.exception(err)
                if not dry_run:
                    prod_record.successful = False
                    db.session.merge(prod_record)
        logger.info("Committing chunk")
        db.session.commit()
        logger.info("Sending chunk to elasticsearch")
        if not dry_run:
            es_bulk(es, records_to_index, request_timeout=60)
    finally:
        models_committed.connect(record_modification)
        after_record_insert.connect(catch_citations_insert)
        before_record_index.connect(add_citation_count_on_insert_or_update)
        before_record_index.connect(catch_citations_update)
        db.session.close()
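
# Hedged sketch (not from the source): a hypothetical driver that splits an
# iterable of raw MARCXML records into fixed-size batches and hands each batch
# to migrate_chunk(). The helper name, the `raw_records` iterable, and the
# default chunk size are assumptions for illustration only.
def migrate_in_chunks(raw_records, chunk_size=200):
    """Sketch: dispatch raw MARCXML records to migrate_chunk() in batches."""
    chunk = []
    for raw_record in raw_records:
        chunk.append(raw_record)
        if len(chunk) >= chunk_size:
            migrate_chunk(chunk)
            chunk = []
    # Flush the last, possibly partial, batch.
    if chunk:
        migrate_chunk(chunk)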