def test_create_records_from_mirror_recids_with_different_types_of_record(inspire_app):
    """Migrate a literature, a broken and an author mirror record in one call.

    The ids of the literature (666) and author (668) records must be part of
    the task results, while looking up recid 667 (whose 260 field holds
    "Definitely not a date") must raise ``PIDDoesNotExistError``.
    """
    literature_marcxml = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">On the validity of INSPIRE records</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    broken_marcxml = (
        b"<record>"
        b' <controlfield tag="001">667</controlfield>'
        b' <datafield tag="260" ind1=" " ind2=" ">'
        b' <subfield code="c">Definitely not a date</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    author_marcxml = (
        b"<record>"
        b' <controlfield tag="001">668</controlfield>'
        b' <datafield tag="100" ind1=" " ind2=" ">'
        b' <subfield code="a">Jessica Jones</subfield>'
        b' <subfield code="q">Jones Jessica</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEPNAMES</subfield>'
        b" </datafield>"
        b"</record>"
    )

    # Each record is parsed and added to the session in turn, in recid order.
    for marcxml in (literature_marcxml, broken_marcxml, author_marcxml):
        db.session.add(LegacyRecordsMirror.from_marcxml(marcxml))

    task_results = create_records_from_mirror_recids([666, 667, 668])

    # Both valid records must be reported back by the task.
    for recid, pid_type in ((666, "lit"), (668, "aut")):
        migrated = InspireRecord.get_record_by_pid_value(recid, pid_type)
        assert str(migrated.id) in task_results

    with pytest.raises(PIDDoesNotExistError):
        InspireRecord.get_record_by_pid_value(667, "lit")
def test_inspire_prod_records_from_marcxml_raises_for_invalid_recid():
    """``from_marcxml`` must raise ``ValueError`` when controlfield 001 is ``foo``."""
    marcxml_with_bad_recid = (
        " <record>"
        ' <controlfield tag="001">foo</controlfield>'
        ' <controlfield tag="005">20171011194718.0</controlfield>'
        ' <datafield tag="100" ind1=" " ind2=" ">'
        ' <subfield code="a">Chetyrkin, K.G.</subfield>'
        " </datafield>"
        " </record> "
    )

    with pytest.raises(ValueError):
        LegacyRecordsMirror.from_marcxml(marcxml_with_bad_recid)
def test_create_record_from_mirror_recids_retries_on_timeout_error(
    retry_mock, inspire_app, s3
):
    """The migration task must raise ``Retry`` when the file upload is too slow.

    ``FILES_UPLOAD_THREAD_TIMEOUT`` is patched to 1 while
    ``current_s3_instance.is_s3_url`` is replaced by a mock that sleeps for
    2 seconds, so handling the FFT attachment of record 666 outlasts the
    configured timeout.
    """

    def block_for_two_seconds(*args):
        sleep(2)

    marcxml_with_attachment = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">On the validity of INSPIRE records</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b' <datafield tag="FFT" ind1=" " ind2=" ">'
        b' <subfield code="a">/opt/cds-invenio/var/data/files/g97/1940001/content.pdf;2</subfield>'
        b' <subfield code="d"></subfield>'
        b' <subfield code="f">.pdf</subfield>'
        b' <subfield code="n">arXiv:1409.0794</subfield>'
        b' <subfield code="r"></subfield>'
        b' <subfield code="s">2015-01-12 03:41:58</subfield>'
        b' <subfield code="v">2</subfield>'
        b' <subfield code="z"></subfield>'
        b" </datafield>"
        b"</record>"
    )
    mirror_entry = LegacyRecordsMirror.from_marcxml(marcxml_with_attachment)
    db.session.add(mirror_entry)

    with patch.dict(current_app.config, {"FILES_UPLOAD_THREAD_TIMEOUT": 1}):
        with patch.object(current_s3_instance, "is_s3_url") as is_s3_url_mock:
            # Make the S3 check slower than the patched timeout value.
            is_s3_url_mock.side_effect = block_for_two_seconds
            with pytest.raises(Retry):
                create_records_from_mirror_recids([666])
def migrate_and_insert_record(
    raw_record, disable_external_push=False, disable_relations_update=False
):
    """Store a raw MARCXML record in the mirror and run its migration.

    Args:
        raw_record: MARCXML of a single record, parsed with
            ``LegacyRecordsMirror.from_marcxml``.
        disable_external_push: forwarded unchanged, as the second positional
            argument, to ``migrate_record_from_mirror``.
        disable_relations_update: forwarded unchanged, as the third positional
            argument, to ``migrate_record_from_mirror``.

    Returns:
        Whatever ``migrate_record_from_mirror`` returns for the mirror entry.
    """
    mirror_entry = LegacyRecordsMirror.from_marcxml(raw_record)
    db.session.merge(mirror_entry)
    migration_outcome = migrate_record_from_mirror(
        mirror_entry, disable_external_push, disable_relations_update
    )
    return migration_outcome
def test_migrating_deleted_record_registers_control_number_with_deleted_status(
    inspire_app, datadir
):
    """Migrating ``dummy_deleted.xml`` must leave PID 12345 in DELETED status."""
    marcxml = (datadir / "dummy_deleted.xml").read_text()
    mirror_entry = LegacyRecordsMirror.from_marcxml(marcxml)
    db.session.add(mirror_entry)

    create_records_from_mirror_recids([12345])

    # ``.one()`` also checks that exactly one PID carries this value.
    registered_pid = PersistentIdentifier.query.filter_by(pid_value="12345").one()
    assert registered_pid.status == PIDStatus.DELETED
def insert_into_mirror(raw_records):
    """Merge raw MARCXML records into the mirror table and commit.

    Args:
        raw_records: iterable of MARCXML payloads, one per record.

    Returns:
        list: the ``recid`` of every mirror entry that was merged, in input
        order.
    """
    recids = []
    for marcxml in raw_records:
        mirror_entry = LegacyRecordsMirror.from_marcxml(marcxml)
        db.session.merge(mirror_entry)
        if not mirror_entry:
            continue
        recids.append(mirror_entry.recid)

    # A single commit covers every record merged above.
    db.session.commit()
    return recids
def test_migrate_record_from_specified_date_only(inspire_app):
    """``migrate_from_mirror(date_from=...)`` must skip older mirror entries.

    Record 666 is stamped with 2010-01-01 and record 667 with 2020-01-01.
    With ``date_from="2015-01-01"`` only 667 may be migrated: 666 must keep
    ``valid is None`` and have no literature PID.
    """
    raw_record_1 = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">On the validity of INSPIRE records</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    raw_record_2 = (
        b"<record>"
        b' <controlfield tag="001">667</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">On the validity of INSPIRE records</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )

    # ``last_updated`` is set before each merge so that the state copied into
    # the session carries the intended date.
    prod_record_1 = LegacyRecordsMirror.from_marcxml(raw_record_1)
    prod_record_1.last_updated = datetime.datetime(2010, 1, 1)
    db.session.merge(prod_record_1)

    prod_record_2 = LegacyRecordsMirror.from_marcxml(raw_record_2)
    # The newer date belongs on record 2, which is the one expected to be
    # picked up by the ``date_from`` filter below.
    prod_record_2.last_updated = datetime.datetime(2020, 1, 1)
    db.session.merge(prod_record_2)

    migrate_from_mirror(date_from="2015-01-01")

    rec_1 = LegacyRecordsMirror.query.filter_by(recid=666).one()
    rec_2 = LegacyRecordsMirror.query.filter_by(recid=667).one()

    # Only record 2 should be valid as rec_1 migration should not run.
    assert rec_1.valid is None
    assert rec_2.valid is True

    with pytest.raises(PIDDoesNotExistError):
        LiteratureRecord.get_record_by_pid_value("666")
    assert LiteratureRecord.get_record_by_pid_value("667")
def test_migrating_deleted_record_registers_control_number_regression(inspire_app):
    """Regression check with the deleted experiment record 1775082.

    After migration the record must be retrievable through the ``exp`` PID
    type and its persistent identifier must be in DELETED status.
    """
    deleted_experiment_marcxml = (
        b"<record>\n"
        b' <controlfield tag="001">1775082</controlfield>\n'
        b' <controlfield tag="005">20200131230810.0</controlfield>\n'
        b' <datafield tag="856" ind1="4" ind2=" ">\n'
        b' <subfield code="u">https://gambit.hepforge.org/</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="909" ind1="C" ind2="O">\n'
        b' <subfield code="o">oai:inspirehep.net:1775082</subfield>\n'
        b' <subfield code="q">INSPIRE:Experiments</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="961" ind1=" " ind2=" ">\n'
        b' <subfield code="x">2020-01-13</subfield>\n'
        b' <subfield code="c">2020-01-31</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="980" ind1=" " ind2=" ">\n'
        b' <subfield code="a">CORE</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="980" ind1=" " ind2=" ">\n'
        b' <subfield code="a">EXPERIMENT</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="980" ind1=" " ind2=" ">\n'
        b' <subfield code="c">DELETED</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="710" ind1=" " ind2=" ">\n'
        b' <subfield code="g">GAMBIT</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="245" ind1=" " ind2=" ">\n'
        b' <subfield code="a">GAMBIT : Global And Modular BSM Inference Tool</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="372" ind1=" " ind2=" ">\n'
        b' <subfield code="9">INSPIRE</subfield>\n'
        b' <subfield code="a">9.2</subfield>\n'
        b" </datafield>\n"
        b' <datafield tag="520" ind1=" " ind2=" ">\n'
        b' <subfield code="a">GAMBIT is a global fitting code for generic Beyond the '
        b"Standard Model theories, designed to allow fast and easy definition of new "
        b"models, observables, likelihoods, scanners and backend physics codes."
        b"</subfield>\n"
        b" </datafield>\n"
        b' <datafield tag="119" ind1=" " ind2=" ">\n'
        b' <subfield code="a">GAMBIT</subfield>\n'
        b' <subfield code="c">GAMBIT</subfield>\n'
        b' <subfield code="d">GAMBIT</subfield>\n'
        b" </datafield>\n"
        b"</record>"
    )
    mirror_entry = LegacyRecordsMirror.from_marcxml(deleted_experiment_marcxml)
    db.session.add(mirror_entry)

    create_records_from_mirror_recids([1775082])

    registered_pid = PersistentIdentifier.query.filter_by(pid_value="1775082").one()
    assert InspireRecord.get_record_by_pid_value("1775082", "exp")
    assert registered_pid.status == PIDStatus.DELETED
def test_inspire_prod_records_from_marcxml():
    """``from_marcxml`` must keep the raw payload and parse the recid from it.

    A freshly built mirror entry is expected to have neither a validity flag
    nor an error set.
    """
    marcxml = (
        b" <record>"
        b' <controlfield tag="001">1591551</controlfield>'
        b' <controlfield tag="005">20171011194718.0</controlfield>'
        b' <datafield tag="100" ind1=" " ind2=" ">'
        b' <subfield code="a">Chetyrkin, K.G.</subfield>'
        b" </datafield>"
        b" </record> "
    )

    mirror_entry = LegacyRecordsMirror.from_marcxml(marcxml)

    assert mirror_entry.recid == 1591551
    assert mirror_entry.marcxml == marcxml
    assert mirror_entry.valid is None
    assert mirror_entry.error is None
def test_migrate_recids_from_mirror_all_only_with_literature_author_and_invalid(
    inspire_app, celery_app_with_context, celery_session_worker
):
    """Run ``migrate_from_mirror(also_migrate="all")`` over four mirror records.

    Record 666 carries a 999C5 reference to record 667, so 667 must end up
    with a citation count of 1. Records 666, 667 and the author 669 must be
    found in Elasticsearch under their control numbers, and the record 668
    (260 field "Definitely not a date") must have no literature PID. The
    checks are wrapped in ``retry_until_pass``.
    """
    citer_control_number = 666
    citing_control_number = 667
    invalid_control_number = 668
    author_control_number = 669

    citer_marcxml = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citer record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b' <datafield tag="999" ind1="C" ind2="5">'
        b' <subfield code="0">667</subfield>'
        b' <subfield code="h">Achasov, M.N.</subfield>'
        b' <subfield code="k">snd-2018</subfield>'
        b' <subfield code="m">(SND Collaboration)</subfield>'
        b' <subfield code="o">2</subfield>'
        b' <subfield code="s">Phys.Rev.,D97,012008</subfield>'
        b' <subfield code="x">'
        b" [2] M. N. Achasov (SND Collaboration), Phys. Rev. D 97, 012008 (2018)."
        b" </subfield>"
        b' <subfield code="y">2018</subfield>'
        b' <subfield code="z">0</subfield>'
        b' <subfield code="z">1</subfield>'
        b" </datafield>"
        b"</record>"
    )
    citing_marcxml = (
        b"<record>"
        b' <controlfield tag="001">667</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citing record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    invalid_marcxml = (
        b"<record>"
        b' <controlfield tag="001">668</controlfield>'
        b' <datafield tag="260" ind1=" " ind2=" ">'
        b' <subfield code="c">Definitely not a date</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    author_marcxml = (
        b"<record>"
        b' <controlfield tag="001">669</controlfield>'
        b' <datafield tag="100" ind1=" " ind2=" ">'
        b' <subfield code="a">Jessica Jones</subfield>'
        b' <subfield code="q">Jones Jessica</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEPNAMES</subfield>'
        b" </datafield>"
        b"</record>"
    )

    # Parse and stage the records one after another, then commit them together.
    for marcxml in (citer_marcxml, citing_marcxml, invalid_marcxml, author_marcxml):
        db.session.add(LegacyRecordsMirror.from_marcxml(marcxml))
    db.session.commit()

    migrate_from_mirror(also_migrate="all")

    def assert_migrator_task():
        record_citer = InspireRecord.get_record_by_pid_value(
            citer_control_number, "lit"
        )
        record_citing = InspireRecord.get_record_by_pid_value(
            citing_control_number, "lit"
        )
        record_author = InspireRecord.get_record_by_pid_value(
            author_control_number, "aut"
        )

        assert record_citing.citation_count == 1

        # Every migrated record must be indexed under its own control number.
        expectations = (
            (record_citer, citer_control_number),
            (record_citing, citing_control_number),
            (record_author, author_control_number),
        )
        for record, expected_control_number in expectations:
            es_data = InspireSearch.get_record_data_from_es(record)
            assert expected_control_number == es_data["control_number"]

        with pytest.raises(PIDDoesNotExistError):
            InspireRecord.get_record_by_pid_value(invalid_control_number, "lit")

    retry_until_pass(assert_migrator_task)
def test_migrate_recids_from_mirror_all_only_with_literature(
    app, celery_app_with_context, celery_session_worker
):
    """Run ``migrate_from_mirror(also_migrate="all")`` over two literature records.

    Record 666 carries a 999C5 reference to record 667, so 667 must end up
    with a citation count of 1, and both records must be found in
    Elasticsearch under their control numbers.
    """
    citer_control_number = 666
    citing_control_number = 667

    citer_marcxml = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citer record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b' <datafield tag="999" ind1="C" ind2="5">'
        b' <subfield code="0">667</subfield>'
        b' <subfield code="h">Achasov, M.N.</subfield>'
        b' <subfield code="k">snd-2018</subfield>'
        b' <subfield code="m">(SND Collaboration)</subfield>'
        b' <subfield code="o">2</subfield>'
        b' <subfield code="s">Phys.Rev.,D97,012008</subfield>'
        b' <subfield code="x">'
        b" [2] M. N. Achasov (SND Collaboration), Phys. Rev. D 97, 012008 (2018)."
        b" </subfield>"
        b' <subfield code="y">2018</subfield>'
        b' <subfield code="z">0</subfield>'
        b' <subfield code="z">1</subfield>'
        b" </datafield>"
        b"</record>"
    )
    citing_marcxml = (
        b"<record>"
        b' <controlfield tag="001">667</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citing record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )

    # Parse and stage the records one after another, then commit them together.
    for marcxml in (citer_marcxml, citing_marcxml):
        db.session.add(LegacyRecordsMirror.from_marcxml(marcxml))
    db.session.commit()

    migrate_from_mirror(also_migrate="all")

    # I don't like timeouts, it's the only way to wait for this chain
    time.sleep(5)

    record_citer = InspireRecord.get_record_by_pid_value(citer_control_number, "lit")
    record_citing = InspireRecord.get_record_by_pid_value(
        citing_control_number, "lit"
    )

    assert record_citing.citation_count == 1

    # Both records must be indexed under their own control numbers.
    expectations = (
        (record_citer, citer_control_number),
        (record_citing, citing_control_number),
    )
    for record, expected_control_number in expectations:
        es_data = InspireSearch.get_record_data_from_es(record)
        assert expected_control_number == es_data["control_number"]