def test_get_search_with_source_with_LiteratureSearch_instance_with_defined_headers(
    inspire_app,
):
    """Check ``_source`` includes/excludes follow the config entry for the Accept header."""
    accept_header = {"Accept": "application/vnd+inspire.record.ui+json"}
    includes_by_content_type = {
        "application/vnd+inspire.record.ui+json": ["title", "description"]
    }
    excludes_by_content_type = {
        "application/vnd+inspire.record.ui+json": [
            "excludes_with_includes_looks_stupid"
        ],
        "application/bibtex": ["control_number"],
    }
    overrides = {
        "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE": includes_by_content_type,
        "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE": excludes_by_content_type,
    }

    with override_config(**overrides), current_app.test_request_context(
        headers=accept_header
    ):
        search = get_search_with_source(LiteratureSearch())

        source_section = search.to_dict()["_source"]

        # Only the entries registered for the requested mimetype are expected;
        # the "application/bibtex" excludes must not leak in.
        assert ["title", "description"] == source_section["includes"]
        assert ["excludes_with_includes_looks_stupid"] == source_section["excludes"]
def test_indexer_populates_referenced_authors_bais(inspire_app):
    """Check a citing record is indexed with the BAIs of the authors it references.

    Two cited records share one author; the citing record's ES document is
    expected to list the three distinct BAIs, while the cited records
    themselves carry no ``referenced_authors_bais`` key.
    """

    def author_with_bai(full_name, bai):
        # A fresh dict per call so the two records never share author objects.
        return {
            "full_name": full_name,
            "ids": [{"schema": "INSPIRE BAI", "value": bai}],
        }

    first_cited = create_record(
        "lit",
        data={
            "authors": [
                author_with_bai("Jean-Luc Picard", "Jean.L.Picard.1"),
                author_with_bai("John Doe", "J.Doe.1"),
            ]
        },
    )
    second_cited = create_record(
        "lit",
        data={
            "authors": [
                author_with_bai("Jean-Luc Picard", "Jean.L.Picard.1"),
                author_with_bai("Steven Johnson", "S.Johnson.1"),
            ]
        },
    )
    citing = create_record(
        "lit",
        literature_citations=[
            first_cited["control_number"],
            second_cited["control_number"],
        ],
    )

    first_cited_es = LiteratureSearch.get_record_data_from_es(first_cited)
    second_cited_es = LiteratureSearch.get_record_data_from_es(second_cited)
    citing_es = LiteratureSearch.get_record_data_from_es(citing)

    assert "referenced_authors_bais" not in first_cited_es
    assert "referenced_authors_bais" not in second_cited_es
    # Sorted before comparing: the order stored in ES is not relied upon.
    assert sorted(citing_es["referenced_authors_bais"]) == [
        "J.Doe.1",
        "Jean.L.Picard.1",
        "S.Johnson.1",
    ]
def assert_disambiguation_cli():
    """Assert authors of the target records have a ``record`` key and the control record does not.

    Reads ``record_uuids`` and ``record_that_shouldnt_be_disambiguated_uuid``
    from the enclosing scope, which is not visible in this chunk.
    """
    disambiguated_hits = LiteratureSearch().get_records(record_uuids).execute()
    for hit in disambiguated_hits:
        for author in hit.authors:
            assert "record" in author

    untouched_search = LiteratureSearch().get_records(
        [record_that_shouldnt_be_disambiguated_uuid]
    )
    untouched_hits = untouched_search.execute()
    first_author = untouched_hits[0]["authors"][0]
    assert "record" not in first_author
def test_get_search_with_source_with_fields_query_param(inspire_app):
    """Check ``?fields=`` yields the requested fields plus three extra ones in ``_source``."""
    with current_app.test_request_context("?fields=authors,ids"):
        search = get_search_with_source(LiteratureSearch())

        # The two requested fields come first, followed by the fields the
        # function is expected to append on its own.
        wanted_source = {
            "includes": ["authors", "ids", "control_number", "_updated", "_created"]
        }

        actual_source = search.to_dict()["_source"]
        assert wanted_source == actual_source
def test_get_search_with_source_with_LiteratureSearch_instance_without_config(
    inspire_app,
):
    """Check no ``_source`` section is produced when both source configs are ``None``."""
    disabled_source_config = dict.fromkeys(
        (
            "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE",
            "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE",
        )
    )

    with override_config(
        **disabled_source_config
    ), current_app.test_request_context():
        search = get_search_with_source(LiteratureSearch())

        serialized = search.to_dict()
        assert "_source" not in serialized
def test_get_search_with_source_with_LiteratureSearch_instance_without_config(
    base_app,
):
    """Check no ``_source`` section is produced when both source configs are ``None``.

    Same scenario as the ``inspire_app`` flavour of this test, but the
    configuration is patched directly on ``base_app.config``.
    """
    disabled_source_config = dict.fromkeys(
        (
            "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE",
            "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE",
        )
    )

    with patch.dict(
        base_app.config, disabled_source_config
    ), base_app.test_request_context():
        search = get_search_with_source(LiteratureSearch())

        serialized = search.to_dict()
        assert "_source" not in serialized
def test_gracefully_handle_records_updating_in_wrong_order(
    inspire_app, clean_celery_session
):
    """Index record versions out of order and check what ends up in ES.

    The citing record goes through three versions (with references, without
    references, with a new title). The test indexes the first version, then
    the latest, then the middle one, checking the cited record's
    ``citation_count`` and the citing record's ``titles`` after each step.
    """
    # We want to run indexing in weird order, so disable auto indexing
    models_committed.disconnect(index_after_commit)
    try:
        cited_record = LiteratureRecord.create(data=faker.record("lit"))
        record_data = faker.record(
            "lit", literature_citations=[cited_record.control_number]
        )
        record = LiteratureRecord.create(data=record_data)
        db.session.commit()

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)

        index_record(record.id, record.model.versions[-1].version_id)
        assert LiteratureSearch().get_source(cited_record.id)["citation_count"] == 1

        # Second version: drop the references.
        data = dict(record)
        del data["references"]

        record.update(data)
        db.session.commit()
        record = LiteratureRecord.get_record_by_pid_value(record.control_number)

        # Third version: change the title.
        data = dict(record)
        data["titles"][0] = {"title": "New Title"}
        record.update(data)
        db.session.commit()

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)

        # Index the newest version first...
        index_record(record.id, record.model.versions[-1].version_id)

        record = LiteratureRecord.get_record_by_pid_value(record.control_number)

        assert LiteratureSearch().get_source(cited_record.id)["citation_count"] == 1
        assert LiteratureSearch().get_source(record.id)["titles"] == [
            {"title": "New Title"}
        ]

        # ...and only afterwards the older one.
        index_record(record.id, record.model.versions[-2].version_id)

        assert LiteratureSearch().get_source(cited_record.id)["citation_count"] == 0
        assert LiteratureSearch().get_source(record.id)["titles"] == [
            {"title": "New Title"}
        ]
    finally:
        # Reconnect even when an assertion above fails, so the disabled
        # signal does not leak into other tests.
        models_committed.connect(index_after_commit)
def test_indexer_oai_set_CERN_arxiv_and_CDS(inspire_app):
    """Check the ``_oai`` block when a record has a CERN report number, an arXiv eprint and CDS export."""
    record = LiteratureRecord.create(
        faker.record(
            "lit",
            data={
                "report_numbers": [{"value": "CERN-2020-001"}],
                "arxiv_eprints": [{"value": "2009.01484"}],
                "_export_to": {"CDS": True},
            },
        )
    )
    record.index(delay=False)

    oai_from_es = LiteratureSearch.get_record_data_from_es(record)

    wanted_id = f"oai:inspirehep.net:{record['control_number']}"
    wanted_updated = record.updated.strftime(OAI_TIME_FORMAT)
    wanted_sets = [
        inspire_app.config["OAI_SET_CDS"],
        inspire_app.config["OAI_SET_CERN_ARXIV"],
    ]

    assert wanted_id == oai_from_es["_oai"]["id"]
    assert wanted_updated == oai_from_es["_oai"]["updated"]
    assert wanted_sets == oai_from_es["_oai"]["sets"]
def _get_all_not_disambiguated_records_search():
    """Build the search for Literature records having an author without ``record.$ref``.

    Returns:
        The search object (not executed), configured with
        ``track_total_hits=True``, an empty ``_source``, ``size=1000`` and a
        ``60m`` scroll.
    """
    # Nested clause: an author entry for which ``authors.record.$ref`` is absent.
    author_without_ref = {
        "nested": {
            "path": "authors",
            "query": {
                "bool": {
                    "must_not": {"exists": {"field": "authors.record.$ref"}}
                }
            },
        }
    }
    in_literature_collection = {"match": {"_collections": "Literature"}}
    body = {
        "query": {"bool": {"must": [author_without_ref, in_literature_collection]}}
    }

    search = LiteratureSearch().from_dict(body)
    return search.params(
        track_total_hits=True, _source={}, size=1000, scroll="60m"
    )
def assert_record_in_es():
    """Assert the first indexed document of ``rec`` has ``attachment`` but no ``text``.

    ``rec`` comes from the enclosing scope, which is not visible in this chunk.
    """
    current_search.flush_and_refresh("*")
    response = LiteratureSearch().get_record(str(rec.id)).execute()
    first_hit = response.hits.hits[0]
    first_document = first_hit._source["documents"][0]
    assert "attachment" in first_document
    assert "text" not in first_document  # pipeline should remove it
def assert_disambiguation_cli():
    """Assert every author of ``record`` in ES carries a ``record`` reference.

    The author named "Test Author Dismabiguated" must point exactly at
    ``disambiguated_author_ref``; any other author only needs the key to be
    present. ``record`` and ``disambiguated_author_ref`` come from the
    enclosing scope, which is not visible in this chunk.
    """
    es_data = LiteratureSearch.get_record_data_from_es(record)
    for author in es_data["authors"]:
        if author["full_name"] != "Test Author Dismabiguated":
            assert "record" in author
            continue
        assert author["record"]["$ref"] == disambiguated_author_ref
def test_literature_journal_title_search_is_case_insensitive(inspire_app):
    """Check ``j jhep`` and ``j JHEP`` both find the two JHEP records."""
    publication_with_journal_record = {
        "year": 2017,
        "artid": "020",
        "page_start": "020",
        "journal_title": "JHEP",
        "journal_record": {"$ref": "https://inspirebeta.net/api/journals/1213103"},
        "journal_volume": "10",
    }
    publication_without_journal_record = {
        "year": 2017,
        "artid": "021",
        "page_start": "021",
        "journal_title": "JHEP",
        "journal_volume": "10",
    }
    first = create_record(
        "lit", data={"publication_info": [publication_with_journal_record]}
    )
    second = create_record(
        "lit", data={"publication_info": [publication_without_journal_record]}
    )

    lowercase_response = LiteratureSearch().query_from_iq("j jhep").execute()
    uppercase_response = LiteratureSearch().query_from_iq("j JHEP").execute()

    assert lowercase_response
    assert uppercase_response

    lowercase_ids = [hit._id for hit in lowercase_response["hits"]["hits"]]
    uppercase_ids = [hit._id for hit in uppercase_response["hits"]["hits"]]

    for found_ids in (lowercase_ids, uppercase_ids):
        assert len(found_ids) == 2

    for found_ids in (lowercase_ids, uppercase_ids):
        assert str(first.id) in found_ids
        assert str(second.id) in found_ids
def assert_all_records_in_es():
    """Assert 11 literature records reference the journal and the journal itself is in ES.

    ``journal_record_reference`` and ``journal`` come from the enclosing
    scope, which is not visible in this chunk.
    """
    by_journal_ref = LiteratureSearch().query_from_iq(
        query_string=f"publication_info.journal_record.$ref: {journal_record_reference}"
    )
    # Materialise the scan so the hits can be counted.
    literature_hits = list(by_journal_ref.scan())
    journal_es_data = InspireSearch.get_record_data_from_es(journal)

    assert len(literature_hits) == 11
    assert journal_es_data
def assert_update_in_es():
    """Assert the first indexed document of ``record`` is ``new_doc.pdf`` with a changed attachment.

    ``record`` and ``record_first_attachment`` come from the enclosing scope,
    which is not visible in this chunk.
    """
    current_search.flush_and_refresh("*")
    response = LiteratureSearch().get_record(str(record.id)).execute()
    first_hit = response.hits.hits[0]
    assert "new_doc.pdf" == first_hit._source["documents"][0]["key"]
    indexed_attachment = first_hit._source["documents"][0]["attachment"]
    assert record_first_attachment != indexed_attachment
def test_get_search_with_source_with_LiteratureSearch_instance_with_not_defined_headers(
    inspire_app,
):
    """Check no ``_source`` section is produced for an Accept header absent from both configs."""
    overrides = {
        "LITERATURE_SOURCE_INCLUDES_BY_CONTENT_TYPE": {
            "application/vnd+inspire.record.ui+json": ["title", "description"]
        },
        "LITERATURE_SOURCE_EXCLUDES_BY_CONTENT_TYPE": {
            "application/bibtex": ["control_number"]
        },
    }
    # "application/json" is configured in neither mapping above.
    unconfigured_accept = {"Accept": "application/json"}

    with override_config(**overrides), current_app.test_request_context(
        headers=unconfigured_accept
    ):
        search = get_search_with_source(LiteratureSearch())

        serialized = search.to_dict()
        assert "_source" not in serialized
def test_get_search_with_source_with_fields_query_param_and_wrong_formats(
    inspire_app,
):
    """Check ``?fields=`` combined with bibtex/latex ``format`` raises ``FieldsParamForbidden``."""
    forbidden_query_strings = (
        "?fields=authors,ids&format=bibtex",
        "?fields=authors,ids&format=latex-eu",
        "?fields=authors,ids&format=latex-us",
    )
    for query_string in forbidden_query_strings:
        with current_app.test_request_context(query_string):
            search = LiteratureSearch()
            with pytest.raises(FieldsParamForbidden):
                get_search_with_source(search)
def test_literature_get_records_by_pids_returns_correct_record(inspire_app):
    """Check ``get_records_by_pids`` returns exactly the records whose PIDs were requested."""
    first = create_record("lit")
    first_control_number = first["control_number"]
    second = create_record("lit")
    second_control_number = second["control_number"]
    both_control_numbers = [first_control_number, second_control_number]

    # One PID requested -> one hit, and it is the first record.
    single = LiteratureSearch().get_records_by_pids([("lit", first_control_number)])
    assert len(single) == 1
    ui_display = json.loads(single[0]._ui_display)
    assert ui_display["control_number"] == first["control_number"]

    # Both PIDs requested -> both records come back.
    pair = LiteratureSearch().get_records_by_pids(
        [("lit", first_control_number), ("lit", second_control_number)]
    )
    assert len(pair) == len(both_control_numbers)
    for hit in pair:
        assert hit.to_dict()["control_number"] in both_control_numbers
def query_report_number(report_number):
    """Look up the literature record matching a report number.

    Args:
        report_number: value matched against ``report_numbers.value.fuzzy``.

    Returns:
        The result of ``get_record_for_pid_or_none("lit", control_number)``
        when the search yields exactly one hit, otherwise ``None``.
    """
    fuzzy_match = Q("match", report_numbers__value__fuzzy=report_number)
    search = LiteratureSearch().query(fuzzy_match).source(["control_number"])
    response = search.execute()

    # Zero or several hits are both treated as "no unambiguous match".
    if len(response.hits) != 1:
        return None

    return get_record_for_pid_or_none("lit", response.hits[0]["control_number"])
def get_literature_recids_for_orcid(orcid):
    """Return the Literature recids that were claimed by an ORCiD.

    A claim of Literature record Y by Author record X is stored in Y as an
    author object whose ``$ref`` points to X and whose ``curated_relation``
    is ``True``. This function therefore looks up in the DB the Author record
    holding the given ORCiD, resolves its recid (following a redirection if
    the PID is redirected), and then asks ES for the Literature records that
    have such an author object.

    Both DB lookups end in ``.one()``, so they expect exactly one matching
    row each.

    Args:
        orcid (str): the ORCiD.

    Return:
        list(int): the recids of the Literature records that were claimed by
        that ORCiD.
    """
    orcid_ids_fragment = f'[{{"schema": "ORCID", "value": "{orcid}"}}]'

    # this first query is written in a way that can use the index on (json -> ids)
    ids_column = type_coerce(RecordMetadata.json, JSONB)["ids"]
    author_uuid_query = db.session.query(RecordMetadata.id).filter(
        ids_column.contains(orcid_ids_fragment)
    )
    author_rec_uuid = author_uuid_query.one().id

    author_pid_query = db.session.query(PersistentIdentifier).filter(
        PersistentIdentifier.object_type == "rec",
        PersistentIdentifier.object_uuid == author_rec_uuid,
        PersistentIdentifier.pid_type == "aut",
    )
    author_pid = author_pid_query.one()

    if author_pid.is_redirected():
        author_recid = InspireRedirect.get_redirect(author_pid).pid_value
    else:
        author_recid = author_pid.pid_value

    is_curated = Q("match", authors__curated_relation=True)
    points_to_author = Q("match", **{"authors.record.$ref": author_recid})
    claimed_by_author = is_curated & points_to_author

    search = LiteratureSearch().query(
        "nested", path="authors", query=claimed_by_author
    )
    search = search.params(_source=["control_number"], size=9999)

    return [hit["control_number"] for hit in search]
def test_reindex_one_type_of_record(inspire_app, cli):
    """Check ``index reindex -p lit`` indexes the literature record and no author."""
    literature = create_record_factory("lit")
    create_record_factory("aut")

    cli.invoke(["index", "reindex", "-p", "lit"])
    current_search.flush_and_refresh("*")

    literature_hits = LiteratureSearch().execute().hits.hits
    first_literature_uuid = literature_hits[0]["_id"]
    author_hit_count = len(AuthorsSearch().execute().hits.hits)

    assert str(literature.id) == first_literature_uuid
    # Only the "lit" pid type was reindexed, so no author may show up.
    assert 0 == author_hit_count
def test_migrate_from_mirror_removes_record_from_es(inspire_app, datadir):
    """Check migrating the ``dummy_deleted.xml`` fixture removes record 12345 from ES."""

    def count_es_hits_for_12345():
        # The uuid is resolved on every call, before and after the migration.
        uuid = LiteratureRecord.get_uuid_from_pid_value(12345)
        hits = LiteratureSearch().get_record(str(uuid)).execute().hits
        return len(hits)

    dummy_path = datadir / "dummy_record.json"
    create_record("lit", data=orjson.loads(dummy_path.read_text()))

    assert 1 == count_es_hits_for_12345()

    deleted_fixture = pkg_resources.resource_filename(
        __name__, os.path.join("fixtures", "dummy_deleted.xml")
    )
    migrate_from_file(deleted_fixture)
    current_search.flush_and_refresh("records-hep")

    assert 0 == count_es_hits_for_12345()
def assert_assign():
    """Assert the first author of both literature records was assigned to ``to_author``.

    ``literature_1``, ``literature_2`` and ``to_author`` come from the
    enclosing scope, which is not visible in this chunk.
    """
    for paper in (literature_1, literature_2):
        current_search.flush_and_refresh("*")
        paper_es = LiteratureSearch.get_record_data_from_es(paper)
        first_author = paper_es["authors"][0]
        wanted_ref = {
            "$ref": f"http://localhost:5000/api/authors/{to_author['control_number']}"
        }
        assert first_author["record"] == wanted_ref
        assert first_author["curated_relation"]
        assert first_author["ids"] == to_author["ids"]
def assert_assign():
    """Assert the second author of ``literature`` was assigned to ``to_author``.

    Also checks that the target author record is no longer flagged as a
    stub. ``literature`` and ``to_author`` come from the enclosing scope,
    which is not visible in this chunk.
    """
    current_search.flush_and_refresh("*")
    paper_es = LiteratureSearch.get_record_data_from_es(literature)
    second_author = paper_es["authors"][1]
    refreshed_to_author = AuthorsRecord.get_record_by_pid_value(
        to_author["control_number"]
    )
    wanted_ref = {
        "$ref": f"http://localhost:5000/api/authors/{to_author['control_number']}"
    }
    assert second_author["record"] == wanted_ref
    assert second_author["curated_relation"]
    assert second_author["ids"] == to_author["ids"]
    assert not refreshed_to_author["stub"]
def test_get_search_with_source_with_fields_query_param_and_wrong_mimetype(
    inspire_app,
):
    """Check ``?fields=`` combined with a bibtex/latex Accept header raises ``FieldsParamForbidden``."""
    forbidden_accept_headers = (
        {"Accept": "application/x-bibtex"},
        {"Accept": "application/vnd+inspire.latex.eu+x-latex"},
        {"Accept": "application/vnd+inspire.latex.us+x-latex"},
    )
    for accept_header in forbidden_accept_headers:
        with current_app.test_request_context(
            "?fields=authors,ids", headers=accept_header
        ):
            with pytest.raises(FieldsParamForbidden):
                search = LiteratureSearch()
                get_search_with_source(search)
def test_reference_convert_old_publication_info_to_new_with_exception(
    mock_convert_old_publication_info_to_new, inspire_app
):
    """Check the reference is returned as given when the mocked conversion raises."""
    mock_convert_old_publication_info_to_new.side_effect = Exception()
    publication_info = {
        "journal_title": "JHEP",
        "journal_volume": "06",
        "page_start": "131",
        "year": 2018,
    }
    reference = {"reference": {"publication_info": publication_info}}

    converted = LiteratureSearch().convert_old_publication_info_to_new(reference)

    assert reference == converted
def test_indexer_oai_set_CDS(inspire_app):
    """Check the ``_oai`` block of a record flagged for export to CDS."""
    record = LiteratureRecord.create(
        faker.record("lit", data={"_export_to": {"CDS": True}})
    )
    record.index(delay=False)

    oai_from_es = LiteratureSearch.get_record_data_from_es(record)

    wanted_id = f"oai:inspirehep.net:{record['control_number']}"
    # NOTE(review): a fixed timestamp suggests the clock is frozen by a
    # decorator or fixture outside this chunk — confirm.
    wanted_updated = "1994-12-19T00:00:00"
    wanted_sets = [inspire_app.config["OAI_SET_CDS"]]

    assert wanted_id == oai_from_es["_oai"]["id"]
    assert wanted_updated == oai_from_es["_oai"]["updated"]
    assert wanted_sets == oai_from_es["_oai"]["sets"]
def test_cli_reindex_deleted_and_redirected_records(inspire_app, cli):
    """Check ``index reindex -p lit`` drops deleted and redirected records from ES.

    With automatic indexing disabled, one record is redirected and one is
    deleted, so ES still holds all three records. After the CLI reindex only
    the surviving record is expected in ES.
    """
    redirected = create_record("lit")
    new_record = create_record("lit")
    deleted = create_record("lit")

    # disable signals so re-indexing won't run automatically after record update
    models_committed.disconnect(index_after_commit)
    try:
        # redirect one record
        new_record_data = dict(new_record)
        new_record_data["deleted_records"] = [redirected["self"]]
        new_record.update(new_record_data)

        # delete one record
        deleted.delete()
    finally:
        # re-enable signals, even if the update or delete above raised, so the
        # disabled signal does not leak into other tests
        models_committed.connect(index_after_commit)

    # check if deleted and redirected were left in ES
    current_search.flush_and_refresh("*")
    expected_control_numbers = [
        redirected.control_number,
        new_record.control_number,
        deleted.control_number,
    ]
    results = LiteratureSearch().query_from_iq("").execute()
    control_numbers_from_es = [x.control_number for x in results.hits]
    assert set(control_numbers_from_es) == set(expected_control_numbers)

    cli.invoke(["index", "reindex", "-p", "lit"])
    current_search.flush_and_refresh("*")

    expected_control_numbers = [new_record.control_number]
    results = LiteratureSearch().query_from_iq("").execute()
    control_numbers_from_es = [x.control_number for x in results.hits]
    assert set(control_numbers_from_es) == set(expected_control_numbers)
def test_reindex_all_types_records(inspire_app, cli):
    """Check ``index reindex --all`` indexes one record of each of four types."""
    # Creation order (lit, aut, job, con) is kept via dict insertion order.
    created = {
        "lit": create_record_factory("lit"),
        "aut": create_record_factory("aut"),
        "job": create_record_factory("job"),
        "con": create_record_factory("con"),
    }

    cli.invoke(["index", "reindex", "--all"])
    current_search.flush_and_refresh("*")

    search_classes = (
        ("lit", LiteratureSearch),
        ("aut", AuthorsSearch),
        ("con", ConferencesSearch),
        ("job", JobsSearch),
    )
    # Run all four searches first, then compare, as separate steps.
    first_hit_ids = {
        pid_type: search_cls().execute().hits.hits[0]["_id"]
        for pid_type, search_cls in search_classes
    }

    for pid_type, _ in search_classes:
        assert str(created[pid_type].id) == first_hit_ids[pid_type]
def test_migrate_from_mirror_doesnt_index_deleted_records(inspire_app):
    """Check that after migrating ``dummy.xml`` and ``deleted_record.xml`` record 12345 has one ES hit."""
    fixture_paths = [
        pkg_resources.resource_filename(__name__, os.path.join("fixtures", name))
        for name in ("dummy.xml", "deleted_record.xml")
    ]

    for fixture_path in fixture_paths:
        migrate_from_file(fixture_path)
    current_search.flush_and_refresh("records-hep")

    uuid = LiteratureRecord.get_uuid_from_pid_value(12345)
    hits = LiteratureSearch().get_record(str(uuid)).execute().hits

    assert 1 == len(hits)
def test_indexer_oai_set_CERN_arxiv(inspire_app):
    """Check the ``_oai`` block of a record with a CERN report number and an arXiv eprint."""
    record = LiteratureRecord.create(
        faker.record(
            "lit",
            data={
                "report_numbers": [{"value": "CERN-2020-001"}],
                "arxiv_eprints": [{"value": "2009.01484"}],
            },
        )
    )
    record.index(delay=False)

    oai_from_es = LiteratureSearch.get_record_data_from_es(record)

    wanted_id = f"oai:inspirehep.net:{record['control_number']}"
    # NOTE(review): a fixed timestamp suggests the clock is frozen by a
    # decorator or fixture outside this chunk — confirm.
    wanted_updated = "1994-12-19T00:00:00"
    wanted_sets = [inspire_app.config["OAI_SET_CERN_ARXIV"]]

    assert wanted_id == oai_from_es["_oai"]["id"]
    assert wanted_updated == oai_from_es["_oai"]["updated"]
    assert wanted_sets == oai_from_es["_oai"]["sets"]