def before_dump(self, data):
    """Flatten an author record into the flat field layout used by the form."""
    family_name, given_name = self.get_name_splitted(data)
    ids = data.get('ids', [])
    urls = data.get('urls', [])

    def record_value(path):
        # Plain record lookup, falling back to ``missing``.
        return get_value(data, path, default=missing)

    def first_id(schema):
        # First identifier of the given schema, or ``missing``.
        return self.get_first_or_missing(get_values_for_schema(ids, schema))

    dump = {}
    dump['advisors'] = record_value('advisors')
    dump['acquisition_source'] = record_value('acquisition_source')
    dump['arxiv_categories'] = record_value('arxiv_categories')
    dump['blog'] = self.get_first_or_missing(
        self.get_value_by_description_key(urls, 'blog'))
    dump['display_name'] = record_value('name.preferred_name')
    dump['family_name'] = self.get_value_or_missing(family_name)
    dump['given_name'] = self.get_value_or_missing(given_name)
    dump['linkedin'] = first_id('LINKEDIN')
    dump['native_name'] = record_value('name.native_names[0]')
    dump['orcid'] = first_id('ORCID')
    dump['positions'] = record_value('positions')
    dump['project_membership'] = record_value('project_membership')
    dump['public_emails'] = record_value('email_addresses.value')
    dump['status'] = record_value('status')
    dump['twitter'] = first_id('TWITTER')
    dump['websites'] = record_value('urls.value')
    return dump
def get_orcids_for_push(record):
    """Obtain the ORCIDs associated to the list of authors in the Literature record.

    The ORCIDs are looked up both in the ``ids`` of the ``authors`` and in the
    Author records that have claimed the paper.

    Args:
        record(dict): metadata from a Literature record

    Returns:
        Iterator[str]: all ORCIDs associated to these authors
    """
    direct_orcids = []
    claimed_author_recids = []
    for author in record.get('authors', []):
        found = get_values_for_schema(author.get('ids', []), 'ORCID')
        if found:
            direct_orcids.extend(found)
        elif author.get('curated_relation') is True and 'record' in author:
            # No ORCID on the paper itself, but the author claimed it:
            # look the ORCID up on the Author record instead.
            claimed_author_recids.append(get_recid_from_ref(author['record']))

    claimed_authors = get_db_records(
        ('aut', recid) for recid in claimed_author_recids)
    claimed_orcids = chain.from_iterable(
        get_values_for_schema(author.get('ids', []), 'ORCID')
        for author in claimed_authors)
    return chain(direct_orcids, claimed_orcids)
def get_orcids_for_push(record):
    """Obtain the ORCIDs associated to the list of authors in the Literature record.

    The ORCIDs are looked up both in the ``ids`` of the ``authors`` and in the
    Author records that have claimed the paper.

    Args:
        record(dict): metadata from a Literature record

    Returns:
        Iterator[str]: all ORCIDs associated to these authors
    """
    direct_orcids = []
    claimed_recids = []
    for author in record.get("authors", []):
        found = get_values_for_schema(author.get("ids", []), "ORCID")
        if found:
            direct_orcids.extend(found)
        elif author.get("curated_relation") is True and "record" in author:
            # Claimed paper without an inline ORCID: resolve via the
            # referenced Author record below.
            claimed_recids.append(get_recid_from_ref(author["record"]))

    claimed_authors = AuthorsRecord.get_records_by_pids(
        ("aut", str(recid)) for recid in claimed_recids)
    claimed_orcids = chain.from_iterable(
        get_values_for_schema(author.get("ids", []), "ORCID")
        for author in claimed_authors)
    return chain(direct_orcids, claimed_orcids)
def before_dump(self, data):
    """Flatten an author record into the field layout expected by the form."""
    family_name, given_name = self.get_name_split(data)
    ids = data.get("ids", [])

    def record_value(path):
        # Plain record lookup, falling back to ``missing``.
        return get_value(data, path, default=missing)

    def first_id(schema):
        # First identifier of the given schema, or ``missing``.
        return self.get_first_or_missing(get_values_for_schema(ids, schema))

    dump = {
        "advisors": record_value("advisors"),
        "alternate_name": record_value("name.name_variants[0]"),
        "acquisition_source": record_value("acquisition_source"),
        "arxiv_categories": record_value("arxiv_categories"),
        "blog": self.get_first_or_missing(
            self.get_value_by_description_key(data.get("urls", []), "blog")
        ),
        "display_name": record_value("name.preferred_name"),
        "family_name": self.get_value_or_missing(family_name),
        "given_name": self.get_value_or_missing(given_name),
        "linkedin": first_id("LINKEDIN"),
        "native_name": record_value("name.native_names[0]"),
        "orcid": first_id("ORCID"),
        "positions": record_value("positions"),
        "project_membership": record_value("project_membership"),
        "emails": record_value("email_addresses"),
        "status": record_value("status"),
        "twitter": first_id("TWITTER"),
        "websites": record_value("urls.value"),
    }
    return dump
def test_id_is_not_written_to_record_for_stale_data_push(
    mock_hal_create, mock_update_record_with_new_ids, inspire_app, get_fixture
):
    """The HAL id ends up on the record even when the first write is stale.

    The patched ``update_record_with_new_ids`` raises ``StaleDataError`` on
    its first call only (simulating a concurrent update) and then delegates
    to the real implementation; presumably the push retries — TODO confirm.
    """
    hal_create_receipt = Deposit_Receipt()
    hal_create_receipt.id = "hal:123456"
    mock_hal_create.return_value = hal_create_receipt

    def side_effect(*args, **kwargs):
        # Fail exactly once, then fall through to the real function.
        if side_effect.counter == 0:
            side_effect.counter += 1
            raise StaleDataError
        else:
            return update_record_with_new_ids(*args, **kwargs)

    # Call counter stored on the function object itself.
    side_effect.counter = 0
    mock_update_record_with_new_ids.side_effect = side_effect
    record_json = orjson.loads(get_fixture("hal_preprod_record.json"))
    record_data = faker.record("lit", data=record_json)
    record = InspireRecord.create(record_data)
    institute_json = orjson.loads(get_fixture("hal_preprod_institute.json"))
    institute_data = faker.record("ins", data=institute_json)
    InspireRecord.create(institute_data)
    _hal_push(record)
    # Re-fetch the record: the HAL id must have been persisted despite the
    # StaleDataError raised on the first write attempt.
    record = InspireRecord.get_record_by_pid_value(record["control_number"], "lit")
    assert get_values_for_schema(record["external_system_identifiers"], "HAL") == [
        "hal:123456"
    ]
def clean_stub_authors(): """Removes all the authors created by disambiguation and having no linked papers.""" # We get all the stub authors (created by disambiguation) from ES and we verify # in db if the returned records are stub (ES data might be outdated) stub_authors_query = Q("term", stub=True) stub_authors_search = (AuthorsSearch().query(stub_authors_query).source( ["control_number"])) stub_authors_control_numbers = [("aut", str(author["control_number"])) for author in stub_authors_search.scan()] # We change isolation level in db to the higher one (serializable) to avoid # issues with race condition db.session.connection( execution_options={"isolation_level": "SERIALIZABLE"}) stub_authors_verified = AuthorsRecord.get_records_by_pids( stub_authors_control_numbers) stub_authors_bais = { get_values_for_schema(author["ids"], "INSPIRE BAI")[0]: author for author in stub_authors_verified if author.get("stub") } # We verify which authors have linked papers stub_authors_with_papers = set( query_authors_with_linked_papers_by_bai(stub_authors_bais.keys())) # For every author who has not linked papers we delete record authors_to_remove = set( stub_authors_bais.keys()).difference(stub_authors_with_papers) click.echo( f"Removing {len(authors_to_remove)} stub authors with no linked papers" ) for author_bai in authors_to_remove: author = stub_authors_bais[author_bai] author.delete() db.session.commit() click.echo("Successfully removed stub authors")
def test_get_values_for_schema():
    """Only values whose ``schema`` matches are returned, in input order."""
    records = [
        {'schema': 'good', 'value': 'first'},
        {'schema': 'bad', 'value': 'second'},
        {'schema': 'good', 'value': 'third'},
    ]
    expected = ['first', 'third']
    assert get_values_for_schema(records, 'good') == expected
def test_orcid_is_updated_if_was_moved(inspire_app, user_remote_account):
    """After ``update_moved_orcid`` the author record contains the new ORCID."""
    old_orcid = user_remote_account.remote_account.extra_data["orcid"]
    # Author record initially carrying the user's old ORCID.
    data = {
        "$schema": "http://localhost:5000/schemas/records/authors.json",
        "_collections": ["Authors"],
        "control_number": 123456789,
        "ids": [
            {
                "schema": "INSPIRE BAI",
                "value": "J.Smith.1"
            },
            {
                "schema": "ORCID",
                "value": old_orcid
            },
        ],
        "name": {
            "value": "Smith, John"
        },
    }
    rec = create_record("aut", data=data)
    db.session.commit()
    new_orcid = "0000-0003-4792-9178"
    update_moved_orcid(old_orcid, new_orcid)
    # Re-load the record and check the new ORCID is among its identifiers.
    author_record = InspireRecord.get_record_by_pid_value(
        rec["control_number"], "aut")
    assert new_orcid in get_values_for_schema(author_record.get("ids", []), "ORCID")
def assert_disambiguation_on_record_update():
    """Assert the first author's BAI in ES is no longer ``M.F.A.Hearn.1``."""
    # ``lit_record`` is taken from the enclosing test's scope.
    es_data = InspireSearch.get_record_data_from_es(lit_record)
    first_author_ids = es_data["authors"][0]["ids"]
    current_bai = get_values_for_schema(first_author_ids, "INSPIRE BAI")[0]
    assert current_bai != "M.F.A.Hearn.1"
def test_push_happy_flow(inspire_app, get_fixture):
    """First push creates a HAL deposit and stores its id; second push updates it."""
    record_json = orjson.loads(get_fixture("hal_preprod_record.json"))
    record_data = faker.record("lit", data=record_json)
    record = InspireRecord.create(record_data)
    institute_json = orjson.loads(get_fixture("hal_preprod_institute.json"))
    institute_data = faker.record("ins", data=institute_json)
    InspireRecord.create(institute_data)
    # hal create
    receipt = _hal_push(record)
    assert receipt
    assert receipt.parsed
    hal_id = receipt.id
    assert hal_id
    # The HAL id from the receipt must have been written back to the record.
    updated_record = InspireRecord.get_record_by_pid_value(
        record["control_number"], "lit"
    )
    assert (
        get_values_for_schema(
            get_value(updated_record, "external_system_identifiers", []), "HAL"
        )[0]
        == hal_id
    )
    # hal update
    receipt = _hal_push(record)
    assert receipt
    assert receipt.parsed
def get_reference_and_bai_if_unambiguous_match(matched_refs_ids):
    """Return the single candidate's reference and BAI, or ``None`` if ambiguous.

    Note: consumes the single entry from ``matched_refs_ids`` via ``popitem``.
    """
    if len(matched_refs_ids) != 1:
        # Zero or several candidates: no unambiguous match.
        return None
    reference, ids = matched_refs_ids.popitem()
    bais = get_values_for_schema(ids, "INSPIRE BAI")
    result = {"author_reference": reference, "author_bai": None}
    if bais:
        result["author_bai"] = bais[0]
    return result
def get_author_with_record_facet_author_name(author):
    """Build the ``<bai>_<display name>`` facet value for *author*.

    Args:
        author (dict): author metadata with optional ``ids`` and ``name``.

    Returns:
        unicode: the facet string, prefixed with the author's INSPIRE BAI or
        the literal ``'BAI'`` placeholder when no BAI is present.
    """
    author_ids = author.get('ids', [])
    author_bais = get_values_for_schema(author_ids, 'INSPIRE BAI')
    # Bug fix: the original indexed [0] unconditionally, raising IndexError
    # for authors without an INSPIRE BAI. Fall back to the 'BAI' placeholder,
    # consistent with the guarded variant of this function in the codebase.
    bai = author_bais[0] if author_bais else 'BAI'
    author_preferred_name = get_value(author, 'name.preferred_name')
    if author_preferred_name:
        return u'{}_{}'.format(bai, author_preferred_name)
    else:
        return u'{}_{}'.format(bai, get_author_display_name(author['name']['value']))
def can_user_edit_author_record(author_record):
    """True when the logged-in user is allowed to edit *author_record*."""
    # Catalogers and superusers may edit anything.
    if is_superuser_or_cataloger_logged_in():
        return True
    # Otherwise the user must own one of the record's ORCIDs.
    record_orcids = get_values_for_schema(author_record.get("ids", []), "ORCID")
    return get_current_user_orcid() in record_orcids
def get_bais_by_recid(record):
    """Map each author's recid to their INSPIRE BAI.

    Authors missing either a BAI or a recid are skipped.
    """
    mapping = {}
    for author in record.get("authors", []):
        bais = get_values_for_schema(author.get("ids", []), "INSPIRE BAI")
        bai = bais[0] if bais else None
        recid = get_author_recid(author)
        # Only keep authors that have both pieces of information.
        if bai and recid:
            mapping[recid] = bai
    return mapping
def get_author_with_record_facet_author_name(author):
    """Compose the ``<bai>_<name>`` facet string for *author*."""
    bais = get_values_for_schema(author.get('ids', []), 'INSPIRE BAI')
    # 'BAI' is the placeholder prefix for authors without an identifier.
    prefix = bais[0] if bais else 'BAI'
    preferred = get_value(author, 'name.preferred_name')
    if preferred:
        return u'{}_{}'.format(prefix, preferred)
    return u'{}_{}'.format(
        prefix, get_author_display_name(author['name']['value']))
def assert_first_disambiguation_no_match():
    """Assert the third record got a fresh BAI, distinct from both others."""
    # Closure over literature_record, literature_record_2, literature_record_3.
    es_data = InspireSearch.get_record_data_from_es(literature_record_3)
    ids_from_es = es_data["authors"][0]["ids"]
    # A BAI must have been assigned ...
    assert get_values_for_schema(ids_from_es, "INSPIRE BAI")
    # ... and it must not match the ids of either existing record.
    assert ids_from_es != literature_record["authors"][0]["ids"]
    assert ids_from_es != literature_record_2["authors"][0]["ids"]
def get_facet_author_name_for_author(author):
    """Compose the ``<bai>_<name>`` facet string for *author*."""
    bais = get_values_for_schema(author.get("ids", []), "INSPIRE BAI")
    # "BAI" is the placeholder prefix for authors without an identifier.
    prefix = bais[0] if bais else "BAI"
    preferred = get_value(author, "name.preferred_name")
    if preferred:
        return "{}_{}".format(prefix, preferred)
    return "{}_{}".format(
        prefix, get_display_name_for_author_name(get_value(author, "name.value")))
def assign_papers(
    from_author_recid, to_author_record, author_papers, is_stub_author=False
):
    """Re-point each paper's matching author entry at *to_author_record*."""
    target_bai = get_values_for_schema(to_author_record["ids"], "INSPIRE BAI")[0]
    for paper in author_papers:
        paper_author = get_author_by_recid(paper, from_author_recid)
        paper_author["record"] = get_record_ref(
            to_author_record["control_number"], endpoint="authors"
        )
        # A manual (non-stub) assignment counts as a curated claim.
        if not is_stub_author:
            paper_author["curated_relation"] = True
        paper_author["ids"] = update_author_bai(target_bai, paper_author)
        paper.update(dict(paper))
def _hal_push(record):
    """Create or update the HAL deposit for *record* and store the HAL id.

    Returns the SWORD deposit receipt, or ``None`` when the record is in
    neither the ``Literature`` nor the ``HAL Hidden`` collection.
    """
    if "Literature" in record["_collections"] or "HAL Hidden" in record["_collections"]:
        tei = convert_to_tei(record)
        ids = record.get("external_system_identifiers", [])
        hal_value = get_values_for_schema(ids, "HAL")
        hal_id = hal_value[0] if hal_value else ""
        # An existing HAL id means the deposit already exists remotely.
        if hal_id:
            receipt = _hal_update(tei, hal_id, record)
        else:
            receipt = _hal_create(tei, record)
        # Persist a newly assigned (or changed) HAL id back onto the record.
        # NOTE(review): no locking here — concurrent pushes of the same record
        # could race on hal_id; confirm callers serialize pushes (a locked
        # variant of this function exists elsewhere in the codebase).
        if receipt and receipt.id != hal_id:
            _write_hal_id_to_record(record, receipt.id)
        return receipt
def assign_author_to_papers(self):
    """Link every paper carrying this author's BAI back to this record."""
    # Local import, as in the surrounding module style — presumably to avoid
    # a circular import.
    from .literature import LiteratureRecord

    bais = get_values_for_schema(self.get("ids", []), "INSPIRE BAI")
    if not bais:
        # Without a BAI there is nothing to match papers against.
        return
    bai = bais[0]
    paper_ids = [
        str(control_number) for control_number in self.query_author_papers(bai)
    ]
    for paper in LiteratureRecord.get_records(paper_ids):
        matched_author = get_author_by_bai(paper, bai)
        matched_author["record"] = self.get("self")
        paper.update(dict(paper))
def add_ads_links_for_arxiv_papers(self, data):
    """Ensure arXiv papers carry an ADS identifier derived from the arXiv id."""
    arxiv_id = get_value(data, "arxiv_eprints[0].value")
    external_ids = get_value(data, "external_system_identifiers", default=[])
    existing_ads_ids = get_values_for_schema(external_ids, "ADS")
    if arxiv_id and not existing_ads_ids:
        # ADS mirrors arXiv papers under the ``arXiv:<id>`` identifier.
        external_ids.append({"schema": "ADS", "value": f"arXiv:{arxiv_id}"})
        data["external_system_identifiers"] = external_ids
    return data
def _hal_push(record):
    """Create or update the HAL deposit for *record*, serialized under a lock.

    Returns the SWORD deposit receipt, or ``None`` when the record is in
    neither the ``Literature`` nor the ``HAL Hidden`` collection.
    """
    collections = record["_collections"]
    if "Literature" not in collections and "HAL Hidden" not in collections:
        return
    tei = convert_to_tei(record)
    external_ids = record.get("external_system_identifiers", [])
    hal_ids = get_values_for_schema(external_ids, "HAL")
    hal_id = hal_ids[0] if hal_ids else ""
    # Per-record lock so concurrent workers cannot race on the HAL id.
    lock_name = f"hal:{record['control_number']}"
    with distributed_lock(lock_name, blocking=True):
        # An existing HAL id means the deposit already exists remotely.
        if hal_id:
            receipt = _hal_update(tei, hal_id, record)
        else:
            receipt = _hal_create(tei, record)
        # Persist a newly assigned (or changed) HAL id back onto the record.
        if receipt and receipt.id != hal_id:
            _write_hal_id_to_record(record, receipt.id)
        return receipt
def do(record, logger, state):
    """Attach an author ``$ref`` to every advisor uniquely matched by INSPIRE ID."""
    for advisor in record["advisors"]:
        # Only advisors carrying an INSPIRE ID but no record link are candidates.
        if not advisor_has_inspire_id_but_no_record(advisor):
            continue
        inspire_id = get_values_for_schema(advisor["ids"], "INSPIRE ID")[0]
        search = AuthorsSearch().query_from_iq(f"ids.value:{inspire_id}")
        matched_recids = [hit.control_number for hit in search.execute().hits]
        # Require exactly one hit; anything else is ambiguous or unknown.
        if len(matched_recids) != 1:
            logger.warning(
                "No unique match for INSPIRE ID, skipping.",
                inspire_id=inspire_id,
                recids=matched_recids,
            )
            continue
        advisor["record"] = get_ref_from_pid("aut", matched_recids[0])
def process_cds_record(cds_record):
    """Attach the CDS identifier from *cds_record* to the matching INSPIRE record.

    Returns ``None`` on every early exit (no CDS id, no matching record, or
    the id is already present); otherwise updates the matched record in place.
    """
    # Candidate identifiers used to locate the corresponding INSPIRE record.
    control_numbers = get_value(cds_record, "metadata.other_ids", [])
    arxivs = get_value(cds_record, "metadata.eprints", [])
    dois = get_value(cds_record, "metadata.dois.value", [])
    report_numbers = get_value(cds_record, "metadata.report_numbers.value", [])
    # Prefer the top-level id; fall back to the metadata control number.
    cds_id = cds_record.get("id") or get_value(cds_record, "metadata.control_number", [])
    if not cds_id:
        LOGGER.info(
            "Cannot extract CDS id from CDS response",
            cds_data=cds_record,
        )
        return
    record = get_record_for_provided_ids(control_numbers, arxivs, dois, report_numbers)
    if not record:
        LOGGER.warning(
            "Cannot find record with any of the provided IDS",
            control_numbers=control_numbers,
            arxivs=arxivs,
            dois=dois,
            report_numbers=report_numbers,
        )
        return None
    control_number = record.control_number
    ids = record.get("external_system_identifiers", [])
    values = get_values_for_schema(ids, "CDS")
    # Don't add a duplicate of an identifier already on the record.
    if cds_id in values:
        LOGGER.info(
            "Correct CDS identifier is already present in the record",
            recid=control_number,
            cds_id=cds_id,
        )
        return
    builder = LiteratureBuilder(record=record)
    builder.add_external_system_identifier(cds_id, "CDS")
    data = dict(builder.record)
    record.update(data)
def assign_papers(
    self,
    from_author_recid,
    to_author_record,
    author_papers_recids,
    is_stub_author=False,
):
    """Re-point the given papers' author entries at *to_author_record*."""
    target_bai = get_values_for_schema(to_author_record["ids"], "INSPIRE BAI")[0]
    for recid in author_papers_recids:
        paper = LiteratureRecord.get_record_by_pid_value(recid)
        paper_author = get_author_by_recid(paper, from_author_recid)
        paper_author["record"] = get_record_ref(
            to_author_record["control_number"], endpoint="authors")
        # A manual (non-stub) assignment counts as a curated claim.
        if not is_stub_author:
            paper_author["curated_relation"] = True
        paper_author["ids"] = update_author_bai(target_bai, paper_author)
        paper.update(dict(paper))
    db.session.commit()
def disambiguate_authors(self, record_uuid):
    """Match each modified, non-curated author of the record to an Author record.

    Unmatched authors without an existing link get a brand-new Author record;
    all new references are written back into the Literature record.
    """
    # handle case when we try to get a record which is deleted
    try:
        record = InspireRecord.get_record(record_uuid)
    except NoResultFound:
        return
    # Disambiguation only applies to Literature records.
    if "Literature" not in record["_collections"]:
        return
    authors = record.get_modified_authors()
    updated_authors = []
    for author in authors:
        # Curated links were set by a human: leave them untouched.
        if author.get("curated_relation"):
            continue
        matched_author_data = match_author(author)
        if not matched_author_data:
            matched_author_data = match_literature_author(author, record)
        if matched_author_data:
            author["record"] = {
                "$ref": matched_author_data["author_reference"]
            }
            assign_bai_to_literature_author(
                author, matched_author_data.get("author_bai"))
            # The recid is the last path component of the $ref URL.
            updated_authors.append(
                matched_author_data["author_reference"].split("/")[-1])
        elif "record" not in author:
            # No match and no existing link: create a stub Author record.
            new_author_record = create_new_author(author["full_name"],
                                                 record["control_number"])
            author["record"] = new_author_record["self"]
            new_author_bai = get_values_for_schema(new_author_record["ids"],
                                                   "INSPIRE BAI")[0]
            assign_bai_to_literature_author(author, new_author_bai)
            updated_authors.append(new_author_record["control_number"])
    if updated_authors:
        LOGGER.info(
            "Updated references for authors",
            {
                "uuid": str(record.id),
                "recid": record["control_number"],
                "authors_control_numbers": updated_authors,
            },
        )
        # Persist the reference changes only when something actually changed.
        record.update(dict(record))
        db.session.commit()
def test_disambiguation_on_record_update_unambiguous_match(
        inspire_app, clean_celery_session, enable_disambiguation):
    """An author with an unambiguous BAI keeps that BAI across record updates."""
    literature_data = faker.record("lit", with_control_number=True)
    literature_data.update({
        "authors": [{
            "full_name": "Kowalczyk, Elisabeth",
            "ids": [{
                "schema": "INSPIRE BAI",
                "value": "E.Kowalczyk.1"
            }],
        }]
    })
    literature_record = LiteratureRecord.create(data=literature_data)
    db.session.commit()

    def assert_first_disambiguation_no_match():
        # Wait until disambiguation has run and a BAI is visible in ES.
        literature_record_from_es = InspireSearch.get_record_data_from_es(
            literature_record)
        assert get_values_for_schema(
            literature_record_from_es["authors"][0]["ids"], "INSPIRE BAI")

    retry_until_pass(assert_first_disambiguation_no_match, retry_interval=2)
    old_bai = get_values_for_schema(literature_record["authors"][0]["ids"],
                                    "INSPIRE BAI")[0]
    # Touch the record (add an email) to trigger disambiguation again.
    db.session.expire_all()
    lit_record = InspireRecord.get_record(literature_record.id)
    lit_record["authors"][0]["emails"] = ["test.test@com"]
    lit_record.update(dict(lit_record))
    db.session.commit()

    def assert_disambiguation_on_record_update():
        # The author's BAI must be unchanged after the update.
        literature_record_from_es = InspireSearch.get_record_data_from_es(
            literature_record)
        assert (get_values_for_schema(
            literature_record_from_es["authors"][0]["ids"], "INSPIRE BAI")[0]
                == old_bai)

    retry_until_pass(assert_disambiguation_on_record_update, retry_interval=2)
def _disambiguate_authors(authors_to_disambiguate, record):
    """Resolve each non-curated author to an Author record reference.

    Matched authors are linked by ``$ref``; unmatched authors without an
    existing link get a brand-new Author record. Single-token family names
    are reordered against the canonical name on the linked Author record.

    Returns:
        list: recids of the Author records that were linked or created.
    """
    updated_authors = []
    for author in authors_to_disambiguate:
        # Curated links were set by a human: leave them untouched.
        if author.get("curated_relation"):
            continue
        assigned_author_recid = None
        matched_author_data = match_author(author)
        if not matched_author_data:
            matched_author_data = match_literature_author(author, record)
        if matched_author_data:
            author["record"] = {"$ref": matched_author_data["author_reference"]}
            assign_bai_to_literature_author(
                author, matched_author_data.get("author_bai")
            )
            # The recid is the last path component of the $ref URL.
            assigned_author_recid = matched_author_data["author_reference"].split("/")[
                -1
            ]
        elif "record" not in author:
            # No match and no existing link: create a stub Author record.
            linked_author_record = create_new_author(
                author["full_name"], record["control_number"]
            )
            author["record"] = linked_author_record["self"]
            new_author_bai = get_values_for_schema(
                linked_author_record["ids"], "INSPIRE BAI"
            )[0]
            assign_bai_to_literature_author(author, new_author_bai)
            assigned_author_recid = linked_author_record["control_number"]
        if assigned_author_recid:
            # A single-token family name may be mis-ordered; fix it against
            # the canonical name of the linked Author record.
            if len(author["full_name"].split(",")[0].split(" ")) == 1:
                if matched_author_data:
                    # In the created-author branch ``linked_author_record`` is
                    # already bound; only the matched branch needs a fetch.
                    linked_author_record = AuthorsRecord.get_record_by_pid_value(
                        assigned_author_recid
                    )
                author["full_name"] = reorder_lit_author_names(
                    author["full_name"], linked_author_record["name"]["value"]
                )
            updated_authors.append(assigned_author_recid)
    return updated_authors
def get_authors_bais(self):
    """Collect the INSPIRE BAIs of all authors on this record."""
    all_author_ids = flatten_list(self.get_value("authors.ids", []))
    return get_values_for_schema(all_author_ids, "INSPIRE BAI")
def does_current_user_own_author_record(author):
    """True when the logged-in user's ORCID matches the author record's."""
    orcids = get_values_for_schema(author.get("ids", []), "ORCID")
    if not orcids:
        return False
    # NOTE(review): only the last listed ORCID is compared — confirm records
    # with several ORCIDs are not expected here.
    return get_current_user_orcid() == orcids[-1]
def get_first_value_for_schema(elements, schema):
    """Return the first value for ``schema`` among ``elements``, or ``None``.

    Args:
        elements (list): dicts carrying ``schema``/``value`` keys.
        schema (str): schema name to filter on.

    Returns:
        The first matching value, or ``None`` when there is none.
    """
    # Fix: the first parameter was named ``list``, shadowing the builtin.
    # Renamed to ``elements``; positional callers are unaffected.
    ids_for_schema = get_values_for_schema(elements, schema)
    return ids_for_schema[0] if ids_for_schema else None
def get_pid_values(self):
    """Return the record's ORCID identifiers as a set."""
    orcids = get_values_for_schema(self.data.get("ids", []), "ORCID")
    return set(orcids)