def test_filteroverdo_wraps_exceptions():
    # synthetic data
    record = (
        '<record>'
        '  <datafield tag="269" ind1=" " ind2=" ">'
        '    <subfield code="c">Ceci n’est pas une dâte</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '  </datafield>'
        '</record>'
    )

    with pytest.raises(DoJsonError) as exc:
        marcxml2record(record)
    assert 'Error in rule "preprint_date" for field "269__"' in str(exc.value)

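# A hedged companion sketch (not part of the original suite): the happy path
# for the same converter. The assumption is that a minimal record carrying
# only a 980__a HEP collection field converts cleanly, so marcxml2record
# returns a dict instead of raising DoJsonError.
def test_marcxml2record_happy_path_sketch():
    record = (
        '<record>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '  </datafield>'
        '</record>'
    )

    result = marcxml2record(record)

    assert result  # a non-empty dict of converted fields, no exception
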
def merged_records(app):
    merged_snippet = (
        '<record>'
        '  <controlfield tag="001">111</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">merged</subfield>'
        '  </datafield>'
        '  <datafield tag="981" ind1=" " ind2=" ">'
        '    <subfield code="a">222</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '  </datafield>'
        '</record>'
    )

    deleted_snippet = (
        '<record>'
        '  <controlfield tag="001">222</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">deleted</subfield>'
        '  </datafield>'
        '  <datafield tag="970" ind1=" " ind2=" ">'
        '    <subfield code="d">111</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '    <subfield code="c">DELETED</subfield>'
        '  </datafield>'
        '</record>'
    )

    merged_record = marcxml2record(merged_snippet)
    merged_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    deleted_record = marcxml2record(deleted_snippet)
    deleted_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        merged_uuid = record_insert_or_replace(merged_record).id
        deleted_uuid = record_insert_or_replace(deleted_record).id
    db.session.commit()
    es.indices.refresh('records-hep')

    yield

    _delete_merged_records('lit', 111, 222, merged_uuid, deleted_uuid)

def merged_records(app):
    merged_snippet = (
        '<record>'
        '  <controlfield tag="001">111</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">merged</subfield>'
        '  </datafield>'
        '  <datafield tag="981" ind1=" " ind2=" ">'
        '    <subfield code="a">222</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '  </datafield>'
        '</record>'
    )

    deleted_snippet = (
        '<record>'
        '  <controlfield tag="001">222</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">deleted</subfield>'
        '  </datafield>'
        '  <datafield tag="970" ind1=" " ind2=" ">'
        '    <subfield code="d">111</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '    <subfield code="c">DELETED</subfield>'
        '  </datafield>'
        '</record>'
    )

    merged_record = marcxml2record(merged_snippet)
    merged_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    deleted_record = marcxml2record(deleted_snippet)
    deleted_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        merged_uuid = _create_record(merged_record).id
        deleted_uuid = _create_record(deleted_record).id
    db.session.commit()
    es.indices.refresh('records-hep')

    yield

    _delete_merged_records('lit', 111, 222, merged_uuid, deleted_uuid)

def update(recid):
    """View for INSPIRE author update form."""
    data = {}
    if recid:
        try:
            url = get_legacy_url_for_recid(recid) + '/export/xm'
            xml = requests.get(url)
            record_regex = re.compile(
                r"\<record\>.*\<\/record\>", re.MULTILINE + re.DOTALL)
            xml_content = record_regex.search(xml.content).group()
            data = marcxml2record(xml_content)
            convert_for_form(data)
        except requests.exceptions.RequestException:
            pass
        data["control_number"] = recid
    else:
        return redirect(url_for("inspirehep_authors.new"))

    form = AuthorUpdateForm(data=data, is_update=True)
    ctx = {
        "action": url_for('.submitupdate'),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
    }
    return render_template('authors/forms/update_form.html', form=form, **ctx)

def migrate_and_insert_record(raw_record, skip_files=False):
    """Migrate a record and insert it if valid, or log otherwise."""
    try:
        json_record = marcxml2record(raw_record)
        recid = json_record['control_number']
    except Exception as e:
        LOGGER.exception('Migrator DoJSON Error')
        recid = _get_recid(raw_record)
        _store_migrator_error(recid, raw_record, e)
        return None

    if '$schema' in json_record:
        ensure_valid_schema(json_record)

    try:
        record = record_insert_or_replace(json_record, skip_files=skip_files)
    except ValidationError as e:
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(pattern.format('.'.join(e.schema_path)), e.instance, recid)
        _store_migrator_error(recid, raw_record, e)
    except Exception as e:
        LOGGER.exception('Migrator Record Insert Error')
        _store_migrator_error(recid, raw_record, e)
    else:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = raw_record
        prod_record.valid = True
        db.session.merge(prod_record)
        return record

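# Minimal driver sketch for the function above, assuming `chunk` is an
# iterable of raw MARCXML strings (e.g. read from a legacy dump).
# migrate_and_insert_record does its own per-record error bookkeeping and
# returns None on failure, so the loop only collects successes and commits
# once per batch. The helper name is hypothetical.
def migrate_chunk_sketch(chunk, skip_files=False):
    migrated = []
    for raw_record in chunk:
        record = migrate_and_insert_record(raw_record, skip_files=skip_files)
        if record is not None:
            migrated.append(record)
    db.session.commit()
    return migrated
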
def core_record():
    """Provide record fixtures."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join('../fixtures', 'oai_arxiv_core_record.xml'),
    )

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    categories = {'core': [], 'non-core': []}
    for eprint in json_data.get('arxiv_eprints', []):
        categories['core'].extend(eprint.get('categories', []))

    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()

    assert categories
    return json_data, categories

def _author_list(obj, eng):
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename)
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        xml_files_list = [path for path in file_list if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        extracted_authors = []
        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                extracted_authors.extend(
                    marcxml2record(authors_xml).get('authors', []))

        if extracted_authors:
            for author in extracted_authors:
                author['full_name'] = decode_latex(author['full_name'])
            obj.data['authors'] = extracted_authors

def reporterrors(output):
    """Report in a friendly way all failed records and the corresponding motivation."""
    click.echo("Reporting broken records into {0}".format(output))
    errors = {}
    results = LegacyRecordsMirror.query.filter(LegacyRecordsMirror.valid == False)  # noqa: ignore=F712
    results_length = results.count()
    with click.progressbar(results.yield_per(100), length=results_length) as bar:
        for obj in bar:
            marc_record = create_record(obj.marcxml, keep_singletons=False)
            collection = get_collection(marc_record)
            if 'DELETED' in collection:
                continue
            recid = int(marc_record['001'])
            try:
                json_record = marcxml2record(obj.marcxml)
            except Exception:
                tb = u''.join(traceback.format_tb(sys.exc_info()[2]))
                errors.setdefault((collection, 'dojson', tb), []).append(recid)
                continue
            ensure_valid_schema(json_record)
            try:
                validate(json_record)
            except jsonschema.exceptions.ValidationError as err:
                exc = [
                    row for row in str(err).splitlines()
                    if row.startswith('Failed validating')
                ][0]
                details = u'\n'.join(
                    dropwhile(
                        lambda x: not x.startswith('On instance'),
                        str(err).splitlines()
                    )
                )
                errors.setdefault(
                    (collection, 'validation', exc), []
                ).append((recid, details))
                continue

    with open(output, "w") as out:
        csv_writer = csv.writer(out)
        for (collection, stage, error), elements in errors.iteritems():
            if stage == 'dojson':
                csv_writer.writerow((
                    collection,
                    stage,
                    error,
                    '\n'.join(
                        'http://inspirehep.net/record/{}'.format(recid)
                        for recid in elements
                    ),
                ))
            else:
                for recid, details in elements:
                    csv_writer.writerow((
                        collection,
                        stage,
                        error,
                        'http://inspirehep.net/record/{}'.format(recid),
                        details,
                    ))

    click.echo("Dumped errors into {}".format(output))

def _author_list(obj, eng):
    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename
        )
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        xml_files_list = [path for path in file_list if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        extracted_authors = []
        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                extracted_authors.extend(
                    marcxml2record(authors_xml).get('authors', []))

        if extracted_authors:
            for author in extracted_authors:
                author['full_name'] = decode_latex(author['full_name'])
            obj.data['authors'] = extracted_authors

def _parsed_items_from_marcxml(
    self,
    marcxml_records,
    base_url="",
    hostname="",
    url_schema=None,
    ftp_params=None,
    url="",
):
    app = Flask('hepcrawl')
    app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
    file_name = url.split('/')[-1]

    with app.app_context():
        parsed_items = []
        for xml_record in marcxml_records:
            try:
                record = marcxml2record(xml_record)
                parsed_item = ParsedItem(record=record, record_format='hep')
                parsed_item.ftp_params = ftp_params
                parsed_item.file_name = file_name
                files_to_download = [
                    self._get_full_uri(
                        current_url=document['url'],
                        base_url=base_url,
                        schema=url_schema,
                        hostname=hostname,
                    )
                    for document in parsed_item.record.get('documents', [])
                    if self._has_to_be_downloaded(document['url'])
                ]
                parsed_item.file_urls = files_to_download

                self.logger.info(
                    'Got the following attached documents to download: %s',
                    files_to_download,
                )
                self.logger.info('Got item: %s', parsed_item)

                parsed_items.append(parsed_item)
            except Exception as e:
                tb = ''.join(traceback.format_tb(sys.exc_info()[2]))
                error_parsed_item = ParsedItem.from_exception(
                    record_format='hep',
                    exception=repr(e),
                    traceback=tb,
                    source_data=xml_record,
                    file_name=file_name,
                )
                parsed_items.append(error_parsed_item)

        return parsed_items

def _parsed_items_from_marcxml(self, marcxml_records, base_url="", url=""):
    self.logger.info('Parsing records')
    app = Flask('hepcrawl')
    app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
    file_name = url.split('/')[-1].split("?")[0]

    with app.app_context():
        parsed_items = []
        for xml_record in marcxml_records:
            try:
                record = marcxml2record(xml_record)
                parsed_item = ParsedItem(record=record, record_format='hep')
                parsed_item.file_name = file_name

                new_documents = []
                files_to_download = []
                self.logger.info("Parsed document: %s", parsed_item.record)
                self.logger.info(
                    "Record has documents: %s",
                    "documents" in parsed_item.record,
                )
                for document in parsed_item.record.get('documents', []):
                    if self._is_local_path(document['url']):
                        document['url'] = self._get_full_uri(document['url'])
                        self.logger.info("Updating document %s", document)
                    else:
                        files_to_download.append(document['url'])
                    new_documents.append(document)

                if new_documents:
                    parsed_item.record['documents'] = new_documents
                parsed_item.file_urls = files_to_download

                self.logger.info(
                    'Got the following attached documents to download: %s',
                    files_to_download,
                )
                self.logger.info('Got item: %s', parsed_item)

                parsed_items.append(parsed_item)
            except Exception as e:
                tb = ''.join(traceback.format_tb(sys.exc_info()[2]))
                error_parsed_item = ParsedItem.from_exception(
                    record_format='hep',
                    exception=repr(e),
                    traceback=tb,
                    source_data=xml_record,
                    file_name=file_name,
                )
                parsed_items.append(error_parsed_item)

        return parsed_items

def _get_crawl_result(xml_record):
    # Note: ``self`` is not a parameter here; this reads as a nested helper
    # that closes over the enclosing spider method's ``self``.
    app = Flask('hepcrawl')
    app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
    with app.app_context():
        item = ParsedItem(record={}, record_format='hep')
        try:
            item.record = marcxml2record(xml_record)
        except Exception as e:
            item.exception = repr(e)
            item.traceback = traceback.format_tb(sys.exc_info()[2])
            item.source_data = xml_record
        return item

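# Usage sketch (hypothetical helper name): assuming `marcxml_records` is a
# list of <record> strings split out of a harvested response, every call
# yields a ParsedItem that carries either the converted record or the
# exception details, so failures can be filtered without aborting the crawl.
def collect_crawl_results_sketch(marcxml_records):
    results = [_get_crawl_result(xml_record) for xml_record in marcxml_records]
    return [item for item in results if not getattr(item, 'exception', None)]
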
def generate_record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join('../fixtures', 'oai_arxiv_record_with_plots.xml'),
    )

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()

    return json_data

def _author_list(obj, eng):
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        # obj.files raises KeyError for a missing file, so the lookup has to
        # be guarded rather than tested for truthiness.
        tarball = obj.files[filename]
    except KeyError:
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space:
        tarball_file = retrieve_uri(
            tarball.file.uri,
            outdir=scratch_space,
        )
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        xml_files_list = [path for path in file_list if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                authorlist_record = marcxml2record(authors_xml)
                obj.data.update(authorlist_record)
                break

def migrate_and_insert_record(raw_record, skip_files=False):
    """Convert a marc21 record to JSON and insert it into the DB."""
    try:
        json_record = marcxml2record(raw_record)
        if '$schema' in json_record:
            json_record['$schema'] = url_for(
                'invenio_jsonschemas.get_schema',
                schema_path='records/{0}'.format(json_record['$schema']),
            )
    except Exception:
        # Without a converted record there is no recid to attach an
        # InspireProdRecords row to, so bail out early.
        LOGGER.exception('Migrator DoJSON Error')
        return None

    recid = json_record['control_number']
    prod_record = InspireProdRecords(recid=recid)
    prod_record.marcxml = raw_record

    error = None
    record = None
    try:
        record = record_insert_or_replace(json_record, skip_files=skip_files)
    except ValidationError as e:
        # Aggregate logs by part of schema being validated.
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(pattern.format('.'.join(e.schema_path)), e.instance, recid)
        error = e
    except Exception as e:
        # Receivers can always cause exceptions and we could dump the entire
        # chunk because of a single broken record.
        LOGGER.exception('Migrator Record Insert Error')
        error = e

    if error:
        # Invalid record, will not get indexed.
        error_str = u'{0}: Record {1}: {2}'.format(type(error), recid, error)
        prod_record.valid = False
        prod_record.errors = error_str
        db.session.merge(prod_record)
        return None

    prod_record.valid = True
    db.session.merge(prod_record)
    return record

def get_record_from_legacy(record_id=None):
    data = {}
    try:
        url = get_legacy_url_for_recid(record_id) + '/export/xm'
        xml = requests.get(url)
        record_regex = re.compile(
            r"\<record\>.*\<\/record\>", re.MULTILINE + re.DOTALL)
        xml_content = record_regex.search(xml.content).group()
        data = marcxml2record(xml_content)
    except requests.exceptions.RequestException:
        current_app.logger.error(
            'Failed to get record {} from legacy.'.format(record_id),
        )
    except Exception:
        current_app.logger.error(
            'Error parsing the record {} from legacy.'.format(record_id),
        )
    return data

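# Hedged usage sketch: get_record_from_legacy above swallows its errors and
# returns {}, so callers can treat the result as optional. The helper name is
# hypothetical; AuthorUpdateForm is the form used by the update view above.
def prefill_update_form_sketch(recid):
    data = get_record_from_legacy(recid)
    if not data:
        current_app.logger.warning('No legacy data for recid %s.', recid)
    data['control_number'] = recid
    return AuthorUpdateForm(data=data, is_update=True)
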
def migrate_record_from_mirror(prod_record, skip_files=False):
    """Migrate a mirrored legacy record into an Inspire record.

    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.
        skip_files(bool): flag indicating whether the files in the record
            metadata should be copied over from legacy and attached to the
            record.

    Returns:
        dict: the migrated record metadata, which is also inserted into the
        database.
    """
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except Exception as exc:
        LOGGER.exception('Migrator DoJSON Error')
        prod_record.error = exc
        db.session.merge(prod_record)
        return None

    if '$schema' in json_record:
        ensure_valid_schema(json_record)

    try:
        with db.session.begin_nested():
            record = InspireRecord.create_or_update(
                json_record, skip_files=skip_files)
            record.commit()
    except ValidationError as exc:
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(
            pattern.format('.'.join(exc.schema_path)),
            exc.instance,
            prod_record.recid,
        )
        prod_record.error = exc
        db.session.merge(prod_record)
    except Exception as exc:
        LOGGER.exception('Migrator Record Insert Error')
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record

def parse_received_package(file_data, package_name):
    """Parse the received MARCXML data and apply the needed mappings."""
    # Delete the XML header if it exists. The dojson library will call
    # lxml.etree.parse on a decoded string, which results in
    # 'ValueError: Unicode strings with encoding declaration are not supported.'
    file_data = file_data.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    try:
        obj = marcxml2record(file_data)
    except Exception as e:
        logger.error(
            'MARCXML parsing failed for package %s: %s', package_name, e)
        raise InvalidUsage("MARCXML is not valid.")

    obj['$schema'] = url_for(
        'invenio_jsonschemas.get_schema', schema_path="hep.json")
    if 'self' in obj:
        del obj['self']

    _add_additional_info(obj)
    return obj

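# Minimal endpoint sketch (hypothetical blueprint and route; `blueprint`,
# `request`, and `jsonify` are assumed to be imported from Flask) showing the
# intended call pattern for parse_received_package: the InvalidUsage raised
# by the function above is left to propagate to the Flask error handler.
@blueprint.route('/packages/<package_name>', methods=['POST'])
def receive_package_sketch(package_name):
    obj = parse_received_package(request.get_data(as_text=True), package_name)
    return jsonify(obj), 201
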
def deleted_record(app):
    snippet = (
        '<record>'
        '  <controlfield tag="001">111</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">deleted</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '    <subfield code="c">DELETED</subfield>'
        '  </datafield>'
        '</record>'
    )

    record = marcxml2record(snippet)
    record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        _create_record(record)
    db.session.commit()

    yield

    _delete_record('lit', 111)

def already_harvested_on_legacy_record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join('../fixtures', 'oai_arxiv_record_already_on_legacy.xml'),
    )

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots, "oaiarXiv2marcxml.xsl")
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    categories = []
    for eprint in json_data.get('arxiv_eprints', []):
        categories.extend(eprint.get('categories', []))
    assert categories

    return json_data, categories

def migrate_record_from_mirror(prod_record, disable_orcid_push=True,
                               disable_citation_update=True):
    """Migrate a mirrored legacy record into an Inspire record.

    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.

    Returns:
        dict: the migrated record metadata, which is also inserted into the
        database.
    """
    logger = LOGGER.bind(recid=prod_record.recid)
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except Exception as exc:
        logger.exception("Error converting from marcxml")
        prod_record.error = exc
        db.session.merge(prod_record)
        return None

    if "$schema" in json_record:
        ensure_valid_schema(json_record)

    pid_type = PidStoreBase.get_pid_type_from_schema(json_record.get("$schema"))
    if pid_type in current_app.config.get("MIGRATION_PID_TYPE_BLACKLIST"):
        prod_record.error = Exception(
            f"Record {prod_record.recid} has blacklisted pid_type: {pid_type}"
        )
        db.session.merge(prod_record)
        return

    try:
        with db.session.begin_nested():
            cls = InspireRecord.get_class_for_record(json_record)
            for deleted_record in cls.get_linked_records_from_dict_field(
                json_record, "deleted_records"
            ):
                deleted_record.pidstore_handler(
                    deleted_record.id, deleted_record
                ).delete_external_pids()
            record = cls.create_or_update(
                json_record,
                disable_orcid_push=disable_orcid_push,
                disable_citation_update=disable_citation_update,
            )
    except ValidationError as exc:
        path = ".".join(exc.schema_path)
        logger.warning(
            "Migrator validator error",
            path=path,
            value=exc.instance,
            recid=prod_record.recid,
        )
        prod_record.error = exc
        db.session.merge(prod_record)
    except PIDValueError as exc:
        message = f"pid_type:'{exc.pid_type}', pid_value:'{exc.pid_value}'"
        logger.exception("PIDValueError while migrating from mirror", msg=message)
        exc.args = (message,)
        prod_record.error = exc
        db.session.merge(prod_record)
    except Exception as exc:
        logger.exception("Error while migrating record from mirror")
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record

def migrate_record_from_mirror(prod_record, disable_external_push=True,
                               disable_relations_update=True):
    """Migrate a mirrored legacy record into an Inspire record.

    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.

    Returns:
        dict: the migrated record metadata, which is also inserted into the
        database.
    """
    logger = LOGGER.bind(recid=prod_record.recid)
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except NotSupportedError as exc:
        logger.warning(str(exc), recid=prod_record.recid)
        prod_record.valid = True
        db.session.merge(prod_record)
        return
    except Exception as exc:
        logger.exception("Error converting from marcxml")
        prod_record.error = exc
        db.session.merge(prod_record)
        return

    if "$schema" in json_record:
        ensure_valid_schema(json_record)

    pid_type = PidStoreBase.get_pid_type_from_schema(json_record.get("$schema"))
    if pid_type in current_app.config.get("MIGRATION_PID_TYPE_BLACKLIST"):
        prod_record.error = Exception(
            f"Record {prod_record.recid} has blacklisted pid_type: {pid_type}"
        )
        db.session.merge(prod_record)
        return

    try:
        with db.session.begin_nested():
            cls = InspireRecord.get_class_for_record(json_record)
            original_urls = replace_afs_file_locations_with_local(json_record)
            record = cls.create_or_update(
                json_record,
                disable_external_push=disable_external_push,
                disable_relations_update=disable_relations_update,
            )
            cache_afs_file_locations(record)
    except ValidationError as exc:
        path = ".".join(exc.schema_path)
        logger.warning(
            "Migrator validator error",
            path=path,
            value=exc.instance,
            recid=prod_record.recid,
        )
        prod_record.error = exc
        db.session.merge(prod_record)
    except DownloadFileError as exc:
        removed_cached_files = remove_cached_afs_file_locations(original_urls)
        if not removed_cached_files:
            logger.exception("DownloadFileError while migrating from mirror")
            prod_record.error = exc
            db.session.merge(prod_record)
        else:
            # Stale cache entries likely caused the download failure: retry
            # once with the cache cleared.
            return migrate_record_from_mirror(
                prod_record=prod_record,
                disable_external_push=disable_external_push,
                disable_relations_update=disable_relations_update,
            )
    except PIDValueError as exc:
        message = f"pid_type:'{exc.pid_type}', pid_value:'{exc.pid_value}'"
        logger.exception("PIDValueError while migrating from mirror", msg=message)
        exc.args = (message,)
        prod_record.error = exc
        db.session.merge(prod_record)
    except ThreadsTimeoutError:
        raise
    except Exception as exc:
        logger.exception("Error while migrating record from mirror")
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record

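# Batch driver sketch, assuming the LegacyRecordsMirror rows were queried
# elsewhere. migrate_record_from_mirror above records success or failure on
# each mirror row itself, so a driver only needs to commit once per batch.
# The helper name is hypothetical.
def migrate_mirror_batch_sketch(mirror_records):
    for prod_record in mirror_records:
        migrate_record_from_mirror(prod_record)
    db.session.commit()
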