def clean_stub_authors():
    """Delete stub authors (created by disambiguation) that have no linked papers.

    Candidates are collected from Elasticsearch and then re-checked against
    the database records, because the index may be outdated. Every verified
    stub whose BAI is not reported as having linked papers is deleted, and
    the session is committed once at the end.
    """
    # Ask ES for the control numbers of everything currently flagged as stub.
    search = AuthorsSearch().query(Q("term", stub=True)).source(["control_number"])
    candidate_pids = [("aut", str(hit["control_number"])) for hit in search.scan()]

    # Raise the isolation level to SERIALIZABLE to avoid race conditions
    # between the verification below and the deletion.
    db.session.connection(execution_options={"isolation_level": "SERIALIZABLE"})

    # Keep only records the database still marks as stubs, keyed by BAI.
    stubs_by_bai = {}
    for db_author in AuthorsRecord.get_records_by_pids(candidate_pids):
        if db_author.get("stub"):
            bai = get_values_for_schema(db_author["ids"], "INSPIRE BAI")[0]
            stubs_by_bai[bai] = db_author

    # Work out which of those BAIs still have papers attached.
    bais_with_papers = set(
        query_authors_with_linked_papers_by_bai(stubs_by_bai.keys())
    )

    # Whatever is left has no linked papers and gets deleted.
    orphan_bais = set(stubs_by_bai) - bais_with_papers
    click.echo(f"Removing {len(orphan_bais)} stub authors with no linked papers")
    for bai in orphan_bais:
        stubs_by_bai[bai].delete()
    db.session.commit()
    click.echo("Successfully removed stub authors")
def assert_disambiguation_task():
    """Check that the first literature author points at the "Michal Kowal" author hit.

    Reads ``literature_record`` from the enclosing scope.
    """
    lit_from_es = InspireSearch.get_record_data_from_es(literature_record)
    first_author_hit = AuthorsSearch().query_from_iq("").execute().hits[0]

    assert first_author_hit.name["value"] == "Michal Kowal"
    assert lit_from_es["authors"][0]["recid"] == first_author_hit.control_number
def test_authors_search_query(inspire_app):
    """A plain name query expands to two name matches plus a query_string clause."""
    name = "J Ellis"
    should_clauses = [
        {"match": {"names_analyzed": name}},
        {"match": {"names_analyzed_initials": name}},
        {"query_string": {"query": name}},
    ]
    expected_query = {
        "query": {"bool": {"should": should_clauses}},
        "track_total_hits": True,
    }

    actual_query = AuthorsSearch().query_from_iq(name).to_dict()

    assert expected_query == actual_query
def test_indexer_deletes_record_from_es(es_clear, db, datadir, create_record):
    """Indexing an author record marked as deleted leaves no hit for it in ES."""
    raw_json = (datadir / "999108.json").read_text()
    author = create_record("aut", data=json.loads(raw_json))

    # Flag the record as deleted and push that state to the index.
    author["deleted"] = True
    author._index()
    es_clear.indices.refresh("records-authors")

    author_hits = AuthorsSearch().get_record(str(author.id)).execute().hits
    assert 0 == len(author_hits)
def test_indexer_deletes_record_from_es(inspire_app, datadir):
    """Synchronously indexing a deleted author record leaves no hit for it in ES."""
    raw_json = (datadir / "999108.json").read_text()
    author = create_record("aut", data=json.loads(raw_json))

    # Flag the record as deleted and index it without delay.
    author["deleted"] = True
    author.index(delay=False)
    current_search.flush_and_refresh("records-authors")

    author_hits = AuthorsSearch().get_record(str(author.id)).execute().hits
    assert 0 == len(author_hits)
def test_reindex_one_type_of_record(inspire_app, cli):
    """Reindexing only ``lit`` indexes the literature record and no author."""
    literature = create_record_factory("lit")
    create_record_factory("aut")

    cli.invoke(["index", "reindex", "-p", "lit"])
    current_search.flush_and_refresh("*")

    indexed_lit_id = LiteratureSearch().execute().hits.hits[0]["_id"]
    indexed_authors = AuthorsSearch().execute().hits.hits

    assert str(literature.id) == indexed_lit_id
    assert 0 == len(indexed_authors)
def test_authors_query_for_query_with_colon(inspire_app):
    """A query containing a colon is passed through as a single query_string."""
    raw_query = "positions.record.$ref:905189"
    expected_query = {
        "query": {"query_string": {"query": raw_query}},
        "track_total_hits": True,
    }

    actual_query = AuthorsSearch().query_from_iq(raw_query).to_dict()

    assert expected_query == actual_query
def test_reindex_all_types_records(inspire_app, cli):
    """Reindexing with ``--all`` makes one record of each created type searchable."""
    # Dict literals evaluate in order, so records are created lit, aut, job, con.
    created = {
        "lit": create_record_factory("lit"),
        "aut": create_record_factory("aut"),
        "job": create_record_factory("job"),
        "con": create_record_factory("con"),
    }

    cli.invoke(["index", "reindex", "--all"])
    current_search.flush_and_refresh("*")

    # Run every search before asserting anything.
    indexed_ids = {
        "lit": LiteratureSearch().execute().hits.hits[0]["_id"],
        "aut": AuthorsSearch().execute().hits.hits[0]["_id"],
        "con": ConferencesSearch().execute().hits.hits[0]["_id"],
        "job": JobsSearch().execute().hits.hits[0]["_id"],
    }

    for pid_type in ("lit", "aut", "con", "job"):
        assert str(created[pid_type].id) == indexed_ids[pid_type]
def do(record, logger, state):
    """Attach an author record reference to advisors identified by INSPIRE ID.

    For every advisor in ``record["advisors"]`` accepted by
    ``advisor_has_inspire_id_but_no_record``, authors are searched by the
    advisor's first INSPIRE ID. When exactly one author matches, the
    advisor's ``record`` field is set to a reference to that author;
    otherwise a warning is logged and the advisor is left untouched.

    Args:
        record: Mapping with an ``advisors`` list; advisors are modified
            in place.
        logger: Object whose ``warning`` method accepts a message plus
            keyword context (presumably a structlog-style logger; confirm
            against the caller).
        state: Not used by this function; kept for the caller's interface.

    Returns:
        None.
    """
    for advisor in record["advisors"]:
        if not advisor_has_inspire_id_but_no_record(advisor):
            continue

        inspire_id = get_values_for_schema(advisor["ids"], "INSPIRE ID")[0]
        hits = AuthorsSearch().query_from_iq(f"ids.value:{inspire_id}").execute().hits
        recids = [hit.control_number for hit in hits]

        # Only link when the INSPIRE ID resolves to exactly one author.
        if len(recids) != 1:
            logger.warning(
                "No unique match for INSPIRE ID, skipping.",
                inspire_id=inspire_id,
                recids=recids,
            )
            continue

        advisor["record"] = get_ref_from_pid("aut", recids[0])
def test_empty_authors_search_query(inspire_app):
    """An empty query string is translated into a match_all query."""
    expected_query = {
        "query": {"match_all": {}},
        "track_total_hits": True,
    }

    actual_query = AuthorsSearch().query_from_iq("").to_dict()

    assert expected_query == actual_query
def assert_disambiguation_task():
    """Check that an empty authors query returns exactly two hits."""
    author_hits = AuthorsSearch().query_from_iq("").execute().hits
    assert len(author_hits) == 2
def assert_record():
    """Refresh the authors index and check the indexed control number.

    Reads ``record`` and ``expected_control_number`` from the enclosing scope.
    """
    current_search.flush_and_refresh("records-authors")
    es_data = AuthorsSearch().get_record_data_from_es(record)
    indexed_control_number = es_data["control_number"]
    assert expected_control_number == indexed_control_number
def assert_record():
    """Refresh the authors index and check the advisor's first student name.

    Reads ``advisor`` and ``expected_student_name`` from the enclosing scope.
    """
    current_search.flush_and_refresh("records-authors")
    es_data = AuthorsSearch().get_record_data_from_es(advisor)
    first_student = es_data["students"][0]
    assert first_student["name"] == expected_student_name
def assert_record():
    """Refresh the authors index and check an empty query returns two hits."""
    current_search.flush_and_refresh("records-authors")
    author_hits = AuthorsSearch().query_from_iq("").execute().hits
    assert len(author_hits) == 2
def assert_record_is_deleted_from_es():
    """Refresh the authors index and check ``record`` no longer has a hit.

    Reads ``record`` from the enclosing scope.
    """
    current_search.flush_and_refresh("records-authors")
    author_hits = AuthorsSearch().get_record(str(record.id)).execute().hits
    assert 0 == len(author_hits)
def authors():
    """Return a newly constructed ``AuthorsSearch``."""
    search = AuthorsSearch()
    return search
def assert_record():
    """Refresh the authors index and check the indexed death date.

    Reads ``rec`` and ``expected_death_date`` from the enclosing scope.
    """
    current_search.flush_and_refresh("records-authors")
    es_data = AuthorsSearch().get_record_data_from_es(rec)
    indexed_death_date = es_data["death_date"]
    assert expected_death_date == indexed_death_date