def test_index_record(inspire_app, celery_app_with_context, celery_session_worker):
    """Index records of eight different types through ``index_records``.

    The ``index_after_commit`` receiver is disconnected while the test runs
    and reconnected in a ``finally`` clause, so a failing assertion or a task
    timeout cannot leave the signal disconnected for the tests that follow.
    """
    models_committed.disconnect(index_after_commit)
    try:
        records = [
            create_record_async("lit"),
            create_record_async("aut"),
            create_record_async("job"),
            create_record_async("jou"),
            create_record_async("exp"),
            create_record_async("con"),
            create_record_async("dat"),
            create_record_async("ins"),
        ]
        uuids = [record.id for record in records]

        task = index_records.delay(uuids)
        results = task.get(timeout=5)

        # The task result is compared against the string form of the ids.
        uuids = [str(uuid) for uuid in uuids]
        assert results == uuids

        for record in records:
            result = InspireSearch.get_record_data_from_es(record)
            assert record["control_number"] == result["control_number"]
    finally:
        models_committed.connect(index_after_commit)
def test_process_references_in_records_reindexes_conferences_when_pub_info_changes(
    inspire_app, celery_app_with_context, celery_session_worker
):
    """Check the conference's ``number_of_contributions`` in ES after the task.

    A conference and a conference paper pointing at it are created while
    ``index_after_commit`` is disconnected, then
    ``process_references_in_records`` is run for the paper. The signal is
    reconnected in a ``finally`` clause so a failure during setup does not
    leave it disconnected for later tests.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        conference_data = faker.record("con", with_control_number=True)
        conference_record = InspireRecord.create(conference_data)
        conference_control_number = conference_record["control_number"]
        conf_ref = f"http://localhost:8000/api/conferences/{conference_control_number}"

        data = faker.record("lit", with_control_number=True)
        data["publication_info"] = [{"conference_record": {"$ref": conf_ref}}]
        data["document_type"] = ["conference paper"]
        record = InspireRecord.create(data)
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        models_committed.connect(index_after_commit)

    uuids = [record.id]
    task = process_references_in_records.delay(uuids)
    # Only wait for completion; the task's return value is not inspected.
    task.get(timeout=5)

    conference_record_es = InspireSearch.get_record_data_from_es(conference_record)
    expected_number_of_contributions = 1
    assert (
        expected_number_of_contributions
        == conference_record_es["number_of_contributions"]
    )
def assert_recalculate_references_task():
    """Assert the ES author, job and literature records reference the merged institution.

    Checks the first author position, the first job institution, the first
    affiliation of the first literature author and the first thesis
    institution against ``merged_institution_record["self"]["$ref"]``.
    """
    es_author = InspireSearch.get_record_data_from_es(author)
    es_job = InspireSearch.get_record_data_from_es(job)
    es_literature = InspireSearch.get_record_data_from_es(literature)
    expected_ref = merged_institution_record["self"]["$ref"]

    first_position = es_author["positions"][0]
    assert first_position["record"]["$ref"] == expected_ref

    first_job_institution = es_job["institutions"][0]
    assert first_job_institution["record"]["$ref"] == expected_ref

    first_affiliation = es_literature["authors"][0]["affiliations"][0]
    assert first_affiliation["record"]["$ref"] == expected_ref

    first_thesis_institution = es_literature["thesis_info"]["institutions"][0]
    assert first_thesis_institution["record"]["$ref"] == expected_ref
def assert_disambiguation_on_update():
    """Assert the first author has a record ``$ref`` both in the database and in ES."""
    # The session is closed before the record is fetched again
    # (presumably so stale session state is not reused -- confirm).
    db.session.close()
    db_record = LiteratureRecord.get_record(literature_record_uuid)
    es_record = InspireSearch.get_record_data_from_es(db_record)

    db_first_author = db_record["authors"][0]
    assert db_first_author["record"]["$ref"]

    es_first_author = es_record["authors"][0]
    assert es_first_author["record"]["$ref"]
def test_process_references_in_records_reindexes_experiments_when_linked_experiments_change(
    app, clean_celery_session
):
    """Check the experiment's ``number_of_papers`` in ES after the task.

    An experiment and a literature record linking to it are created while
    ``index_after_commit`` is disconnected, then
    ``process_references_in_records`` is run for the literature record. The
    signal is reconnected in a ``finally`` clause so a failure during setup
    does not leave it disconnected for later tests.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        experiment_data = faker.record("exp", with_control_number=True)
        experiment = InspireRecord.create(experiment_data)
        db.session.commit()

        experiment_control_number = experiment["control_number"]
        exp_ref = f"http://localhost:8000/api/experiments/{experiment_control_number}"

        data = faker.record("lit", with_control_number=True)
        data["accelerator_experiments"] = [
            {"legacy_name": "LIGO", "record": {"$ref": exp_ref}}
        ]
        record = InspireRecord.create(data)
        db.session.commit()
    finally:
        # reconnect the signal before the task runs
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay([record.id])
    task.get(timeout=5)

    experiment_record_es = InspireSearch.get_record_data_from_es(experiment)
    expected_number_of_paper = 1
    assert expected_number_of_paper == experiment_record_es["number_of_papers"]
def assert_disambiguation_task():
    """Assert the first author in ES carries the ``J.M.Maldacena.1`` INSPIRE BAI."""
    expected_bai = {"schema": "INSPIRE BAI", "value": "J.M.Maldacena.1"}
    es_record = InspireSearch.get_record_data_from_es(literature_record)
    first_author_ids = es_record["authors"][0]["ids"]
    assert expected_bai in first_author_ids
def assert_disambiguation_task():
    """Assert the first author hit is "Michal Kowal" and the paper's first author points at it."""
    es_literature = InspireSearch.get_record_data_from_es(literature_record)
    # An empty query string is passed to the authors search.
    author_results = AuthorsSearch().query_from_iq("").execute()

    assert author_results.hits[0].name["value"] == "Michal Kowal"

    first_author_recid = es_literature["authors"][0]["recid"]
    assert first_author_recid == author_results.hits[0].control_number
def assert_references():
    """Assert the ES record has the expected ``facet_author_name`` and first-author ``$ref``."""
    current_search.flush_and_refresh("records-hep")
    es_record = InspireSearch.get_record_data_from_es(record)

    actual_facet_author_name = es_record["facet_author_name"]
    assert expected_facet_author_name == actual_facet_author_name

    actual_record_ref = es_record["authors"][0]["record"]["$ref"]
    assert expected_record_ref == actual_record_ref
def update_references_pointing_to_merged_record(
    refs_to_schema, merged_record_uri, new_record_uri
):
    """Update references to a merged record in every record that holds one.

    For each ``(index, path)`` pair the ``records-<index>`` search index is
    scanned with the query built by ``get_query_for_given_path``. Every hit
    is loaded from the database with ``SELECT ... FOR UPDATE``, each
    reference found under ``path`` (minus its trailing ``.$ref``) is handed
    to ``update_reference_if_reference_uri_matches``, and the record is
    saved. A single commit is issued once all pairs are processed.

    Search hits that have no matching database row are logged and skipped
    instead of failing with ``AttributeError`` on ``None``.

    Args:
        refs_to_schema: iterable of ``(index, path)`` pairs; ``path`` is
            sliced as if it ends with ``.$ref``.
        merged_record_uri: URI passed to the query builder and to
            ``update_reference_if_reference_uri_matches``.
        new_record_uri: URI passed to
            ``update_reference_if_reference_uri_matches`` together with
            ``merged_record_uri``.
    """
    ref_suffix_length = len(".$ref")

    for index, path in refs_to_schema:
        query = get_query_for_given_path(index, path, merged_record_uri)
        es_index_name = f"records-{index}"
        matched_records = InspireSearch(index=es_index_name).query(query).scan()

        for matched_record in matched_records:
            pid_type = current_app.config["SCHEMA_TO_PID_TYPES"][index]
            record_class = InspireRecord.get_subclasses()[pid_type]
            matched_inspire_record_data = (
                db.session.query(RecordMetadata)
                .with_for_update()
                .filter_by(id=matched_record.meta.id)
                .first()
            )
            if matched_inspire_record_data is None:
                # ``first()`` returns None when no row matches the hit's id.
                LOGGER.warning(
                    "Record found in search index is missing in database",
                    uuid=str(matched_record.meta.id),
                )
                continue

            matched_inspire_record = record_class(
                matched_inspire_record_data.json, model=matched_inspire_record_data
            )
            # Strip the trailing ".$ref" to get the objects holding the reference.
            referenced_records_in_path = flatten_list(
                get_value(matched_inspire_record, path[:-ref_suffix_length], [])
            )
            for referenced_record in referenced_records_in_path:
                update_reference_if_reference_uri_matches(
                    referenced_record, merged_record_uri, new_record_uri
                )

            matched_inspire_record.update(dict(matched_inspire_record))
            LOGGER.info(
                "Updated reference for record", uuid=str(matched_inspire_record.id)
            )

    db.session.commit()
def assert_migrator_task():
    """Assert both migrated records exist, one has a citation, and both are in ES."""
    citer = InspireRecord.get_record_by_pid_value(citer_control_number, "lit")
    citing = InspireRecord.get_record_by_pid_value(citing_control_number, "lit")
    assert citing.citation_count == 1

    citer_es = InspireSearch.get_record_data_from_es(citer)
    assert citer_control_number == citer_es["control_number"]

    citing_es = InspireSearch.get_record_data_from_es(citing)
    assert citing_control_number == citing_es["control_number"]
def assert_all_records_in_es():
    """Assert 11 literature records reference the journal and the journal itself is in ES."""
    journal_query = (
        f"publication_info.journal_record.$ref: {journal_record_reference}"
    )
    literature_hits = list(
        LiteratureSearch().query_from_iq(query_string=journal_query).scan()
    )
    journal_from_es = InspireSearch.get_record_data_from_es(journal)

    assert len(literature_hits) == 11
    assert journal_from_es
def assert_disambiguation_task():
    """Assert the first author was linked to a new author record.

    The previous version compared the whole ``record`` object with a URL
    string. Other checks in this file read ``record`` as a ``{"$ref": ...}``
    mapping, so that comparison could never be equal and the assertions
    always passed. The ``$ref`` value is compared instead, and it must be
    present.
    """
    literature_record_from_es = InspireSearch.get_record_data_from_es(
        literature_record
    )
    first_author = literature_record_from_es["authors"][0]
    author_ref = (first_author.get("record") or {}).get("$ref")

    # new author is created
    assert author_ref
    assert author_ref != "http://localhost:5000/api/authors/90676330"
    assert author_ref != "http://localhost:5000/api/authors/90676331"
def assert_disambiguation_on_record_update():
    """Assert the first author's ``ids`` and ``record`` in ES match those of ``lit_record``."""
    es_record = InspireSearch.get_record_data_from_es(literature_record_3)

    for field in ("ids", "record"):
        assert es_record["authors"][0][field] == lit_record["authors"][0][field]
def test_continuous_migration_with_invalid_control_number(
    app, cache, celery_app_with_context, celery_session_worker
):
    """Run ``continuous_migration`` on a queue whose second record has a bad control number.

    The migration is expected to raise ``ValueError``; the first record must
    still be retrievable and indexed, and one entry must remain in the
    ``legacy_records`` queue.
    """
    citer_control_number = 666
    citer_marcxml = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citer record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b' <datafield tag="999" ind1="C" ind2="5">'
        b' <subfield code="0">667</subfield>'
        b' <subfield code="h">Achasov, M.N.</subfield>'
        b' <subfield code="k">snd-2018</subfield>'
        b' <subfield code="m">(SND Collaboration)</subfield>'
        b' <subfield code="o">2</subfield>'
        b' <subfield code="s">Phys.Rev.,D97,012008</subfield>'
        b' <subfield code="x">'
        b" [2] M. N. Achasov (SND Collaboration), Phys. Rev. D 97, 012008 (2018)."
        b" </subfield>"
        b' <subfield code="y">2018</subfield>'
        b' <subfield code="z">0</subfield>'
        b' <subfield code="z">1</subfield>'
        b" </datafield>"
        b"</record>"
    )
    # The control field of this record is not a number.
    bad_control_number_marcxml = (
        b"<record>"
        b' <controlfield tag="001">this is not a control number</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citing record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )

    for marcxml in (citer_marcxml, bad_control_number_marcxml):
        cache.rpush("legacy_records", zlib.compress(marcxml))
    assert cache.llen("legacy_records") == 2

    with pytest.raises(ValueError):
        continuous_migration()

    citer = InspireRecord.get_record_by_pid_value(citer_control_number, "lit")
    citer_es = InspireSearch.get_record_data_from_es(citer)
    assert citer_control_number == citer_es["control_number"]

    # I don't like timeouts, it's the only way to wait for this chain
    time.sleep(5)

    assert cache.llen("legacy_records") == 1
def assert_disambiguation_task():
    """Assert the first two ES authors reference ``author_1`` and ``author_2`` respectively."""
    es_record = InspireSearch.get_record_data_from_es(literature_record)
    es_authors = es_record.get("authors")

    for position, expected_author in enumerate((author_1, author_2)):
        expected_control_number = str(expected_author["control_number"])
        assert expected_control_number in es_authors[position]["record"]["$ref"]
def test_search_factory_with_query(inspire_app):
    """Check ``inspire_search_factory`` output for a request carrying ``?q=foo``."""
    with current_app.test_request_context("?q=foo"):
        base_search = InspireSearch()
        query_string, built_search = inspire_search_factory(None, base_search)
        built_search_dict = built_search.to_dict()

        assert "foo" == query_string
        assert {
            "query": {"query_string": {"default_operator": "AND", "query": "foo"}},
            "track_total_hits": True,
        } == built_search_dict
def test_index_record_deletes_a_deleted_record(inspire_app, clean_celery_session):
    """Delete a literature record, run ``index_records`` on it and expect the ES lookup to fail."""
    created_record = create_record_async("lit")
    control_number = created_record["control_number"]

    record_to_delete = InspireRecord.get_record_by_pid_value(control_number, "lit")
    record_to_delete.delete()
    db.session.commit()

    record_uuid = record_to_delete.id
    results = index_records.delay([record_uuid]).get(timeout=5)
    assert results == [str(record_uuid)]

    # Fetching the deleted record from ES is expected to raise.
    with pytest.raises(TransportError):
        InspireSearch.get_record_data_from_es(record_to_delete)
def assert_first_disambiguation_no_match():
    """Assert the first author got an INSPIRE BAI whose ids differ from the two other records."""
    es_record = InspireSearch.get_record_data_from_es(literature_record_3)

    assert get_values_for_schema(es_record["authors"][0]["ids"], "INSPIRE BAI")

    for other_record in (literature_record, literature_record_2):
        assert es_record["authors"][0]["ids"] != other_record["authors"][0]["ids"]
def test_process_references_in_records(
    inspire_app, celery_app_with_context, celery_session_worker
):
    """Check ``citation_count`` of two cited records in ES after the task.

    Two cited and two citing literature records are created while
    ``index_after_commit`` is disconnected, then
    ``process_references_in_records`` is run for the citing records. The
    signal is reconnected in a ``finally`` clause so a failure during setup
    does not leave it disconnected for later tests.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        cited_record_1 = LiteratureRecord.create(faker.record("lit"))
        cited_record_2 = LiteratureRecord.create(faker.record("lit"))

        data_citing_record_1 = faker.record(
            "lit", literature_citations=[cited_record_1["control_number"]]
        )
        citing_record_1 = LiteratureRecord.create(data_citing_record_1)

        data_citing_record_2 = faker.record(
            "lit", literature_citations=[cited_record_2["control_number"]]
        )
        citing_record_2 = LiteratureRecord.create(data_citing_record_2)
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        models_committed.connect(index_after_commit)

    uuids = [citing_record_1.id, citing_record_2.id]
    task = process_references_in_records.delay(uuids)
    # Only wait for completion; the task's return value is not inspected.
    task.get(timeout=5)

    result_cited_record_1 = InspireSearch.get_record_data_from_es(cited_record_1)
    expected_result_cited_record_1_citation_count = 1
    assert (
        expected_result_cited_record_1_citation_count
        == result_cited_record_1["citation_count"]
    )

    result_cited_record_2 = InspireSearch.get_record_data_from_es(cited_record_2)
    expected_result_cited_record_2_citation_count = 1
    assert (
        expected_result_cited_record_2_citation_count
        == result_cited_record_2["citation_count"]
    )
def assert_all_records_in_es():
    """Assert both the literature and the seminar record return truthy data from ES."""
    es_records = [
        InspireSearch.get_record_data_from_es(db_record)
        for db_record in (literature, seminar)
    ]
    assert all(es_records)
def test_continuous_migration_with_an_invalid_record(
    app, cache, celery_app_with_context, celery_session_worker
):
    """Run ``continuous_migration`` on a queue holding two valid records and one invalid one.

    After the migration the two valid records must exist and be indexed, the
    invalid one must have no PID, the cited record must have one citation,
    and the ``legacy_records`` queue must be empty.
    """
    citer_control_number = 666
    cited_control_number = 667
    invalid_control_number = 668

    citer_marcxml = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citer record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b' <datafield tag="999" ind1="C" ind2="5">'
        b' <subfield code="0">667</subfield>'
        b' <subfield code="h">Achasov, M.N.</subfield>'
        b' <subfield code="k">snd-2018</subfield>'
        b' <subfield code="m">(SND Collaboration)</subfield>'
        b' <subfield code="o">2</subfield>'
        b' <subfield code="s">Phys.Rev.,D97,012008</subfield>'
        b' <subfield code="x">'
        b" [2] M. N. Achasov (SND Collaboration), Phys. Rev. D 97, 012008 (2018)."
        b" </subfield>"
        b' <subfield code="y">2018</subfield>'
        b' <subfield code="z">0</subfield>'
        b' <subfield code="z">1</subfield>'
        b" </datafield>"
        b"</record>"
    )
    cited_marcxml = (
        b"<record>"
        b' <controlfield tag="001">667</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citing record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    invalid_marcxml = (
        b"<record>"
        b' <controlfield tag="001">668</controlfield>'
        b' <datafield tag="260" ind1=" " ind2=" ">'
        b' <subfield code="c">Definitely not a date</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )

    # Queue order: citer, invalid, cited.
    for marcxml in (citer_marcxml, invalid_marcxml, cited_marcxml):
        cache.rpush("legacy_records", zlib.compress(marcxml))
    assert cache.llen("legacy_records") == 3

    continuous_migration()

    # I don't like timeouts, it's the only way to wait for this chain
    time.sleep(10)

    citer = InspireRecord.get_record_by_pid_value(citer_control_number, "lit")
    cited = InspireRecord.get_record_by_pid_value(cited_control_number, "lit")

    with pytest.raises(PIDDoesNotExistError):
        InspireRecord.get_record_by_pid_value(invalid_control_number, "lit")

    assert cited.citation_count == 1

    citer_es = InspireSearch.get_record_data_from_es(citer)
    assert citer_control_number == citer_es["control_number"]

    cited_es = InspireSearch.get_record_data_from_es(cited)
    es_cited_control_number = cited_es["control_number"]
    assert cited_control_number == es_cited_control_number

    with app.test_client() as client:
        response_json = client.get(
            f"/literature/{es_cited_control_number}/citations"
        ).json
        api_citation_count = response_json["metadata"]["citation_count"]

    assert 1 == api_citation_count
    assert cache.llen("legacy_records") == 0
def assert_authors_records_exist_in_es():
    """Assert ES returns truthy data for ``author_record``."""
    es_author = InspireSearch.get_record_data_from_es(author_record)
    assert es_author, "author record data from ES is empty"
def assert_lit_records_exist_in_es():
    """Assert ES returns truthy data for ``literature_record``."""
    es_literature = InspireSearch.get_record_data_from_es(literature_record)
    assert es_literature, "literature record data from ES is empty"
def assert_disambiguation_task():
    """Assert the first ES author of ``literature_record_3`` has the same ``record`` as in ``literature_data_2``."""
    es_record = InspireSearch.get_record_data_from_es(literature_record_3)

    expected_author_record = literature_data_2["authors"][0]["record"]
    actual_author_record = es_record["authors"][0]["record"]
    assert expected_author_record == actual_author_record
def assert_disambiguation_task():
    """Assert the first ES author has no (or an empty) ``record`` entry."""
    es_record = InspireSearch.get_record_data_from_es(literature_record)
    first_author = es_record["authors"][0]
    linked_record = first_author.get("record")
    assert not linked_record
def assert_recalculate_references_task():
    """Assert the first ``publication_info`` entry in ES references the merged conference."""
    es_literature = InspireSearch.get_record_data_from_es(literature)
    first_publication_info = es_literature["publication_info"][0]

    conference_ref = first_publication_info["conference_record"]["$ref"]
    assert conference_ref == merged_conference_record["self"]["$ref"]
def test_process_references_in_records_reindexes_institutions_when_linked_institutions_change(
    inspire_app, celery_app_with_context, celery_session_worker
):
    """Check the institution's ``number_of_papers`` in ES after the task.

    Three literature records link to the same institution through
    ``authors.affiliations``, ``thesis_info.institutions`` and
    ``record_affiliations`` respectively. They are created while
    ``index_after_commit`` is disconnected; the signal is reconnected in a
    ``finally`` clause so a failure during setup does not leave it
    disconnected for later tests.
    """
    # disconnect this signal so records don't get indexed
    models_committed.disconnect(index_after_commit)
    try:
        institution_data = faker.record("ins", with_control_number=True)
        institution = InspireRecord.create(institution_data)
        institution_control_number = institution["control_number"]
        inst_ref = (
            f"http://localhost:8000/api/institutions/{institution_control_number}"
        )

        data = faker.record("lit", with_control_number=True)
        data.update(
            {
                "authors": [
                    {
                        "full_name": "John Doe",
                        "affiliations": [
                            {"value": "Institution", "record": {"$ref": inst_ref}}
                        ],
                    }
                ]
            }
        )
        record_authors_aff = InspireRecord.create(data)
        db.session.commit()

        data = faker.record("lit", with_control_number=True)
        data.update({"thesis_info": {"institutions": [{"record": {"$ref": inst_ref}}]}})
        record_thesis_info = InspireRecord.create(data)
        db.session.commit()

        data = faker.record("lit", with_control_number=True)
        data.update(
            {
                "record_affiliations": [
                    {"record": {"$ref": inst_ref}, "value": "Institution"}
                ]
            }
        )
        record_affiliations = InspireRecord.create(data)
        db.session.commit()
    finally:
        # reconnect signal before we call process_references_in_records
        models_committed.connect(index_after_commit)

    task = process_references_in_records.delay(
        [record_authors_aff.id, record_thesis_info.id, record_affiliations.id]
    )
    task.get(timeout=5)

    institution_record_es = InspireSearch.get_record_data_from_es(institution)
    expected_number_of_paper = 3
    assert expected_number_of_paper == institution_record_es["number_of_papers"]
def assert_disambiguation_on_record_update():
    """Assert the first author's first INSPIRE BAI in ES is still ``old_bai``."""
    es_record = InspireSearch.get_record_data_from_es(literature_record)
    first_author_ids = es_record["authors"][0]["ids"]
    bai_values = get_values_for_schema(first_author_ids, "INSPIRE BAI")
    assert bai_values[0] == old_bai
def assert_disambiguation_task():
    """Assert the first ES author's ``$ref`` contains the control number from ``author_data``."""
    es_record = InspireSearch.get_record_data_from_es(literature_record)
    expected_control_number = str(author_data["control_number"])
    first_author_ref = es_record["authors"][0]["record"]["$ref"]
    assert expected_control_number in first_author_ref
def test_continuous_migration_with_different_type_of_records(
    inspire_app, celery_app_with_context, celery_session_worker, redis
):
    """Run ``continuous_migration`` on a queue mixing literature and author records.

    Two literature records and one author record are queued, followed by a
    raw ``b"END"`` entry. After the migration all three records must exist
    and be indexed, the cited record must have one citation, and the
    ``legacy_records`` queue must be empty.
    """
    citer_control_number = 666
    cited_control_number = 667
    author_control_number = 668

    citer_marcxml = (
        b"<record>"
        b' <controlfield tag="001">666</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citer record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b' <datafield tag="999" ind1="C" ind2="5">'
        b' <subfield code="0">667</subfield>'
        b' <subfield code="h">Achasov, M.N.</subfield>'
        b' <subfield code="k">snd-2018</subfield>'
        b' <subfield code="m">(SND Collaboration)</subfield>'
        b' <subfield code="o">2</subfield>'
        b' <subfield code="s">Phys.Rev.,D97,012008</subfield>'
        b' <subfield code="x">'
        b" [2] M. N. Achasov (SND Collaboration), Phys. Rev. D 97, 012008 (2018)."
        b" </subfield>"
        b' <subfield code="y">2018</subfield>'
        b' <subfield code="z">0</subfield>'
        b' <subfield code="z">1</subfield>'
        b" </datafield>"
        b"</record>"
    )
    cited_marcxml = (
        b"<record>"
        b' <controlfield tag="001">667</controlfield>'
        b' <datafield tag="245" ind1=" " ind2=" ">'
        b' <subfield code="a">This is a citing record</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEP</subfield>'
        b" </datafield>"
        b"</record>"
    )
    author_marcxml = (
        b"<record>"
        b' <controlfield tag="001">668</controlfield>'
        b' <datafield tag="100" ind1=" " ind2=" ">'
        b' <subfield code="a">Jessica Jones</subfield>'
        b' <subfield code="q">Jones Jessica</subfield>'
        b" </datafield>"
        b' <datafield tag="980" ind1=" " ind2=" ">'
        b' <subfield code="a">HEPNAMES</subfield>'
        b" </datafield>"
        b"</record>"
    )

    # Queue order: citer, author, cited, then the uncompressed END entry.
    for marcxml in (citer_marcxml, author_marcxml, cited_marcxml):
        redis.rpush("legacy_records", zlib.compress(marcxml))
    redis.rpush("legacy_records", b"END")
    assert redis.llen("legacy_records") == 4

    continuous_migration()

    # I don't like timeouts, it's the only way to wait for this chain
    time.sleep(5)

    citer = InspireRecord.get_record_by_pid_value(citer_control_number, "lit")
    cited = InspireRecord.get_record_by_pid_value(cited_control_number, "lit")
    author = InspireRecord.get_record_by_pid_value(author_control_number, "aut")

    assert cited.citation_count == 1

    citer_es = InspireSearch.get_record_data_from_es(citer)
    assert citer_control_number == citer_es["control_number"]

    cited_es = InspireSearch.get_record_data_from_es(cited)
    es_cited_control_number = cited_es["control_number"]
    assert cited_control_number == es_cited_control_number

    author_es = InspireSearch.get_record_data_from_es(author)
    assert author_control_number == author_es["control_number"]

    with inspire_app.test_client() as client:
        response_json = client.get(
            f"/api/literature/{es_cited_control_number}/citations"
        ).json
        api_citation_count = response_json["metadata"]["citation_count"]

    assert 1 == api_citation_count
    assert redis.llen("legacy_records") == 0