def test_original_and_full_name(self): """Test that original and full name access works""" for n in namesA: name = ADSName.parse(n) self.assertEqual(name.original_name, n) self.assertEqual(name.bare_original_name, n) name = ADSName.parse(n.upper()) self.assertEqual(name.original_name, n.upper()) self.assertEqual(name.bare_original_name, n.upper()) self.assertNotEqual(name.original_name, n) self.assertNotEqual(name.bare_original_name, n) self.assertEqual(name.full_name, n) self.assertEqual(name.qualified_full_name, n) for modifier in ['=', '<', '>', '<=', '>=']: name = ADSName.parse(modifier + n) self.assertEqual(name.original_name, modifier + n) self.assertEqual(name.bare_original_name, n) self.assertEqual(name.full_name, n) self.assertEqual(name.qualified_full_name, modifier + n) for modifier, cor_modifier in zip(['=<', '=>'], ['<=', '>=']): name = ADSName.parse(modifier + n) self.assertEqual(name.original_name, modifier + n) self.assertEqual(name.bare_original_name, n) self.assertEqual(name.full_name, n) self.assertEqual(name.qualified_full_name, cor_modifier + n)
def _insert_document_data(pairings, doc_data, repo, excluded_names):
    """Stores all required document data and backfills author indices"""
    for k1 in pairings.keys():
        author1 = ADSName.parse(k1)
        for k2 in pairings[k1].keys():
            author2 = ADSName.parse(k2)
            replacement = []
            for bibcode in pairings[k1][k2]:
                if bibcode in doc_data:
                    doc_record = doc_data[bibcode]
                else:
                    doc_record = repo.get_document(bibcode).asdict()
                    lb.on_doc_loaded()
                    del doc_record['bibcode']
                    del doc_record['timestamp']
                    del doc_record['doctype']
                    doc_data[bibcode] = doc_record
                auth_1_idx, auth_2_idx = _find_indices(
                    doc_record['authors'], bibcode, author1, author2,
                    excluded_names)
                replacement.append((bibcode, auth_1_idx, auth_2_idx))
            pairings[k1][k2] = replacement
def get_papers_for_orcid_id(self, orcid_id): orcid_id = normalize_orcid_id(orcid_id) lb.i(f"Querying ADS for orcid id " + orcid_id) query = f"orcid:({orcid_id})" documents = self._inner_query_for_author(query, 1) author_record = AuthorRecord(name=ADSName.parse(orcid_id, preserve=True), documents=[]) names = set() for document in documents: try: i = document.orcid_ids.index(orcid_id) except ValueError: lb.w(f"ORCID ID not found in {document.bibcode}") continue author_record.documents.append(document.bibcode) names.add(document.authors[i]) # Find the most-detailed form of the name if len(names): names = [ADSName.parse(n) for n in names] intermed = [(n.level_of_detail, len(n.full_name), n) for n in names] intermed.sort(reverse=True) author_record.name = intermed[0][-1] return author_record, documents
def get_name_as_in_ADS(target_name, names_in_result: []):
    """For presentation in the UI, figures out how to capitalize a name

    The user may have typed in the query names in all lowercase. For the
    large banner at the top of the page, it would be nice to format the
    names more properly. Rather than just defaulting to
    first-letter-uppercase, we can use our ADS data to present the name in
    a form (or one of the forms) ADS has for the name. This means we may
    also pick up diacritics.

    Looks through all the publications belonging to the name and the forms
    in which the author's name appears in those publications. Grabs (one
    of) the most-detailed forms. If it contains more given names than the
    target name, truncates the list. Shortens given names to initials if
    the target name has an initial at that position."""
    # Unique-ify names_in_result
    names_in_result = list(set(names_in_result))
    repo = Repository(can_skip_refresh=True)
    names_in_result = [ADSName.parse(name) for name in names_in_result]
    orcid = is_orcid_id(target_name)
    if orcid:
        record = repo.get_author_record_by_orcid_id(target_name)
    else:
        target_name = ADSName.parse(target_name)
        record = repo.get_author_record(target_name)
    aliases = record.appears_as.keys()
    aliases = [ADSName.parse(alias) for alias in aliases]
    # Remove all aliases that aren't consistent with any of the name forms
    # used in the set of possible chains. E.g. if the user searched for
    # "Last" and all chains terminate at "Last, B.", then we shouldn't view
    # "Last, I." as a viable alias.
    aliases = [alias for alias in aliases if alias in names_in_result]
    # Grab the most-detailed alias. As tie-breaker, choose the form with the
    # most publications.
    alias = sorted([(a.level_of_detail,
                     len(record.appears_as[a.original_name]),
                     a.original_name)
                    for a in aliases])[-1][-1]
    alias = ADSName.parse(alias, preserve=True)
    if orcid:
        gns = alias.given_names
    else:
        # Trim it down to size
        gns = alias.given_names
        if len(gns) > len(target_name.given_names):
            gns = gns[:len(target_name.given_names)]
        # Ensure we have initials where we need them
        gns = [gn if len(tgn) > 1 else gn[0]
               for gn, tgn in zip(gns, target_name.given_names)]
    final_name = ADSName.parse(alias.last_name, *gns, preserve=True)
    return final_name.full_name
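# A minimal standalone sketch of the given-name trimming step above, using
# plain strings instead of ADSName objects (hypothetical inputs, not part of
# the codebase): keep no more given names than the user typed, and collapse
# a full given name to an initial wherever the user typed only an initial.
def trim_given_names(alias_gns, target_gns):
    gns = alias_gns[:len(target_gns)]
    return [gn if len(tgn) > 1 else gn[0]
            for gn, tgn in zip(gns, target_gns)]

print(trim_given_names(["Stephen", "Steve"], ["s"]))        # ['S']
print(trim_given_names(["Stephen", "Steve"], ["stephen"]))  # ['Stephen']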
def test_repr(self):
    """Test that string representations of ADSNames are as expected"""
    for name in namesA:
        for modifier in ['', '=', '>', '<', '<=', '>=']:
            name2 = modifier + name
            self.assertEqual(name2, repr(ADSName.parse(name2)))
        name2 = ">=" + name
        self.assertEqual(name2, repr(ADSName.parse("=>" + name)))
        name2 = "<=" + name
        self.assertEqual(name2, repr(ADSName.parse("=<" + name)))
def test_preserve_case(self): # Ensure the parsing cache is populated for name in namesA: parsed = ADSName.parse(name.upper()) self.assertEqual(parsed.full_name, name) for name in namesA: parsed = ADSName.parse(name.upper(), preserve=True) self.assertEqual(parsed.full_name, name.upper()) for name in namesA: parsed = ADSName.parse(name.upper()) parsed = ADSName.parse(parsed, preserve=True) self.assertEqual(parsed.full_name, name.upper())
def get_author_record(self, author: Name) -> AuthorRecord: author = ADSName.parse(author) try: author_record = cache_buddy.load_author(author) except CacheMiss: author_record = self._try_generating_author_record(author) if author_record is None: author_record, documents = \ self.ads_buddy.get_papers_for_author(author) cache_buddy.cache_documents(documents) if type(author_record) == AuthorRecord: self._fill_in_coauthors(author_record) if len(author_record.documents): cache_buddy.cache_author(author_record) else: for rec in author_record.values(): self._fill_in_coauthors(rec) cache_buddy.cache_authors([ ar for ar in author_record.values() if len(ar.documents) ]) author_record = author_record[author] lb.on_author_queried() lb.on_doc_queried(len(author_record.documents)) return author_record
def _find_indices(authors, bibcode, author1, author2, excluded_names): key1 = (bibcode, author1.original_name) key2 = (bibcode, author2.original_name) try: auth_1_idx = indices_cache[key1] except KeyError: auth_1_idx = None try: auth_2_idx = indices_cache[key2] except KeyError: auth_2_idx = None for i, author in enumerate(authors): if auth_1_idx is not None and auth_2_idx is not None: break author = ADSName.parse(author) if author in excluded_names: continue if auth_1_idx is None and author1 == author: auth_1_idx = i if auth_2_idx is None and author2 == author: auth_2_idx = i indices_cache[key1] = auth_1_idx indices_cache[key2] = auth_2_idx return auth_1_idx, auth_2_idx
def load_author(key): if key[0] in '<>=': raise CacheMiss(key) orcid = "ORCID" in key if orcid: name = None else: name = ADSName.parse(key) docs = [] coauthors = defaultdict(list) appears_as = defaultdict(list) for bibcode, document in documents.items(): matched = None # Go through the document's authors until/if we find our search author for orcid_id, author in zip_longest(document['orcid_ids'], document['authors']): if orcid and orcid_id == key: matched = author aname = ADSName.parse(author) if name is None or aname.is_more_specific_than(name): name = aname elif not orcid and name == author: matched = author if matched is not None: docs.append(bibcode) idx = len(docs) - 1 appears_as[matched].append(idx) for coauthor in document['authors']: if coauthor != matched: coauthors[coauthor].append(idx) if len(docs) or key.endswith("nodocs"): for coauthor, coauthor_dat in coauthors.items(): coauthors[coauthor] = ','.join(str(i) for i in coauthor_dat) for alias, alias_dat in appears_as.items(): appears_as[alias] = ','.join(str(i) for i in alias_dat) return { # defaultdict doesn't play nicely with AuthorRecord's asdict() 'name': name.qualified_full_name, 'documents': docs, 'coauthors': dict(**coauthors), 'appears_as': dict(**appears_as), 'timestamp': TIME, 'version': AUTHOR_VERSION_NUMBER, } else: raise CacheMiss(key)
def test_exact_equality(self):
    for i in range(len(namesA)):
        aname = ADSName.parse("=" + namesA[i])
        self.assertEqual(aname, namesA[i])
        self.assertEqual(aname, aname)
        for j in range(len(namesA)):
            if i != j:
                self.assertNotEqual(namesA[j], aname)
def test_full_name_formatting(self):
    """Test that name parsing is insensitive to spacing and periods"""
    for n in namesA:
        name1 = ADSName.parse(n)
        name2 = ADSName.parse(n.replace(", ", ",").replace(".", ""))
        self.assertEqual(str(name1), str(name2))
        self.assertEqual(name1, name2)

        for modifier in ['=', '<', '>', '<=', '>=']:
            name1 = ADSName.parse(modifier + n)
            name2 = ADSName.parse(
                (modifier + n).replace(", ", ",").replace(".", ""))
            self.assertEqual(str(name1), str(name2))
            if '=' in modifier:
                self.assertEqual(name1, name2)
            else:
                self.assertNotEqual(name1, name2)
def normalize_author_names(paper_choices, repo): """Re-builds a chain with names representative of the linking papers. Builds a new chain where each name is as seen in the top paper choice for that chain link. Names that aren't the first or last in the chain appear on two chosen papers, and of those two versions of the name, the least specific is chosen.""" new_chain = [] for i, pc in enumerate(zip(*paper_choices)): bibcode, a1idx, a2idx = pc[0] doc = repo.get_document(bibcode) a1name = doc.authors[a1idx] a2name = doc.authors[a2idx] if (i != 0 and ADSName.parse(a1name).level_of_detail < ADSName.parse( new_chain[-1]).level_of_detail): new_chain[-1] = a1name elif i == 0: new_chain.append(a1name) new_chain.append(a2name) return tuple(new_chain)
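# A toy illustration of the "least specific wins" rule described in the
# docstring above, using string length as a crude stand-in for
# ADSName.level_of_detail (hypothetical name forms, not codebase data):
form_on_left_paper = "Doe, Jane"   # how the middle author appears on paper i-1
form_on_right_paper = "Doe, J."    # how the same author appears on paper i
# The chain keeps the less specific of the two forms, since that form is
# consistent with both linking papers:
kept = min(form_on_left_paper, form_on_right_paper, key=len)
print(kept)  # 'Doe, J.'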
def notify_of_upcoming_author_request(self, *authors): authors = [ADSName.parse(author) for author in authors] # If appropriate, the backing cache will pre-fetch the data while # checking if it exists is_in_cache = cache_buddy.authors_are_in_cache(authors) authors = [a for a, iic in zip(authors, is_in_cache) if not iic] can_generate = self._can_generate_author_requests(authors) authors = [a for a, cg in zip(authors, can_generate) if not cg] self.ads_buddy.add_authors_to_prefetch_queue(*authors)
def test_level_of_detail(self): self.assertEqual(0, ADSName.parse("last").level_of_detail) self.assertEqual(3, ADSName.parse("last, f").level_of_detail) self.assertEqual(10, ADSName.parse("last, first").level_of_detail) self.assertEqual(6, ADSName.parse("last, f m").level_of_detail) self.assertEqual(13, ADSName.parse("last, f middle").level_of_detail) self.assertEqual(20, ADSName.parse("last, first middle").level_of_detail) self.assertEqual(23, ADSName.parse("last, first middle m").level_of_detail)
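# Judging from the expected values above, level_of_detail appears to award
# 10 points per spelled-out given name and 3 points per initial, with 0 for
# a bare last name. A minimal sketch of that scheme (an assumption inferred
# from the test cases, not the library's actual implementation):
def inferred_level_of_detail(name: str) -> int:
    parts = name.split(",")
    given = parts[1].split() if len(parts) > 1 else []
    return sum(10 if len(g) > 1 else 3 for g in given)

assert inferred_level_of_detail("last") == 0
assert inferred_level_of_detail("last, f m") == 6
assert inferred_level_of_detail("last, first middle m") == 23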
def test_equality(self): for data in namesB: nameB = ADSName.parse(data[0]) results = data[1:] for i, result in enumerate(results): if result: self.assertEqual(nameB, namesA[i]) else: self.assertNotEqual(nameB, namesA[i]) self.assertNotEqual(nameB, 1) self.assertNotEqual(nameB, "a string")
def _select_authors_to_prefetch(self): lb.d(f"{len(self.prefetch_queue)} authors in prefetch queue") n_prefetches = MAXIMUM_RESPONSE_SIZE // ESTIMATED_DOCUMENTS_PER_AUTHOR - 1 if n_prefetches > len(self.prefetch_queue): n_prefetches = len(self.prefetch_queue) if n_prefetches <= 0: return [] prefetches = [] for _ in range(n_prefetches): name = self.prefetch_queue.popleft() self.prefetch_set.remove(name) prefetches.append(ADSName.parse(name)) return prefetches
def graph_translation(chains, source, dest): # We have a list of chains---the table in the web view. These chains # may contain many different forms of a name, and it's important to # preserve that for the table display. But for the graph display, it's # better to collapse down to one node per person, not one node per # name form. So within each column, we want to canonicalize each name # to the least-detailed form of that name appearing in the column. source = get_name_as_in_ADS(source, [c[0] for c in chains]) dest = get_name_as_in_ADS(dest, [c[-1] for c in chains]) source = ADSName.parse(source) dest = ADSName.parse(dest) nads = [] for i in range(len(chains[0])): # This dict will map names to canonical forms nad = NameAwareDict() nads.append(nad) if i == 0: nad[source] = source if i == len(chains[0]) - 1: nad[dest] = dest for chain in chains: name = ADSName.parse(chain[i]) if name in nad: if name.level_of_detail < nad[name].level_of_detail: nad[name] = name else: nad[name] = name mappings = [] for i, nad in enumerate(nads): mapping = {} mappings.append(mapping) for chain in chains: name = chain[i].lower() mapping[name] = nad[name].original_name return mappings
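# A toy illustration of the per-column collapsing described in the comments
# above, without NameAwareDict. Matching here is naive (all forms are assumed
# to refer to one person) and string length stands in for level_of_detail;
# the chains are hypothetical:
chains = [("Doe, Jane", "Smith, A."), ("Doe, J.", "Smith, Alice B.")]
column = [c[0] for c in chains]      # first column: 'Doe, Jane', 'Doe, J.'
canonical = min(column, key=len)     # least-detailed form wins
mapping = {name.lower(): canonical for name in column}
print(mapping)  # {'doe, jane': 'Doe, J.', 'doe, j.': 'Doe, J.'}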
def test_modifier_functions(self): for mod, req_exact, req_less, req_more, allow_same in ( ['', False, False, False, True], ['>', False, False, True, False], ['<', False, True, False, False], ['=', True, False, False, False], ['>=', False, False, True, True], ['<=', False, True, False, True], ): name = ADSName.parse(mod + namesA[1]) self.assertEqual(name.require_exact_match, req_exact) self.assertEqual(name.require_less_specific, req_less) self.assertEqual(name.require_more_specific, req_more) self.assertEqual(name.allow_same_specific, allow_same) self.assertEqual(name.excludes_self, (req_less or req_more) and not allow_same) self.assertEqual(name.has_modifiers(), mod != '') self.assertEqual(name.modifiers, mod) self.assertEqual(name.without_modifiers().full_name, namesA[1])
def _score_author_chain_link(con1, con2, repo):
    """Scores the reliability of name matching between two papers

    Accepts two "connections", tuples containing a bibcode followed by two
    indices locating an author in the author list of the associated bibcode.
    The author in question will be indicated by the latter index in the
    first connection and the earlier index in the second connection.

    When ORCID IDs are available, they solely determine the score, which
    will fall between 0.7 and 1 depending on the source of the ORCID IDs.
    Otherwise the score will be derived from the fraction of overlap between
    the author's affiliations in the two papers, and the level of detail of
    the author's name as printed in the two papers. These scores will fall
    in the range (0, 0.4), with contributions of up to 0.3 from affiliation
    matching and up to 0.1 from name detail"""
    doc1 = repo.get_document(con1[0])
    doc2 = repo.get_document(con2[0])
    idx1 = con1[2]
    idx2 = con2[1]

    orcid_id_1 = doc1.orcid_ids[idx1]
    orcid_id_2 = doc2.orcid_ids[idx2]
    if orcid_id_1 != '' and orcid_id_2 != '':
        if orcid_id_1 == orcid_id_2:
            orcid_src_1 = doc1.orcid_id_src[idx1]
            orcid_src_2 = doc2.orcid_id_src[idx2]
            # Looking at the source of ADS's ORCID ID data, each score element
            # is 1 for orcid_pub, .92 for orcid_user, and .84 for orcid_other.
            # The values for the two ORCID IDs are multiplied together
            score1 = 1 - .08 * (orcid_src_1 - 1)
            score2 = 1 - .08 * (orcid_src_2 - 1)
            return score1 * score2
        else:
            # The ORCID IDs _don't_ match!
            return None

    # Attempt some affiliation fuzzy-matching
    # _process_affil will do some processing and return a list of the
    # comma-delimited chunks in the affiliation.
    affil1 = _process_affil(doc1.affils[idx1])
    affil2 = _process_affil(doc2.affils[idx2])
    # Compute the fraction of the chunks of each affil that are present
    # in the other
    try:
        one_in_two = sum(chunk in affil2 for chunk in affil1) / len(affil1)
        two_in_one = sum(chunk in affil1 for chunk in affil2) / len(affil2)
    except ZeroDivisionError:
        one_in_two = 0
        two_in_one = 0
    # Average these two fractions
    affil_frac_in_common = (one_in_two + two_in_one) / 2
    # Put the score in the range (0, 0.3)
    affil_score = affil_frac_in_common * .3

    name1 = ADSName.parse(doc1.authors[idx1])
    name2 = ADSName.parse(doc2.authors[idx2])
    if name1 != name2:
        # This can occur, e.g. if J. Doe was encountered first, creating a
        # J. Doe node in PathFinder, then Jane and John Doe were encountered
        # and added to that node, and now a proposed chain runs from Jane
        # to John.
        return None
    detail1 = name1.level_of_detail
    detail2 = name2.level_of_detail
    # level_of_detail examples:
    # Last, First Middle: 20
    # Last, First M: 13
    # Last, First: 10
    # Last, F: 3
    # Last: 0
    #
    # We'll score based on the less-detailed name, take 20 as the ideal value,
    # and put the name score in the range (0, 0.1)
    detail_score = min(detail1, detail2) / 20 * .1
    return detail_score + affil_score
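# A worked example of the non-ORCID scoring path above, with hypothetical
# pre-chunked affiliations (the real code obtains these via _process_affil):
affil1 = ["dept of physics", "univ a", "usa"]
affil2 = ["univ a", "usa"]
one_in_two = sum(c in affil2 for c in affil1) / len(affil1)   # 2/3
two_in_one = sum(c in affil1 for c in affil2) / len(affil2)   # 2/2
affil_score = (one_in_two + two_in_one) / 2 * .3              # 0.25
# Both papers print the name as "Doe, Jane" (level_of_detail 10):
detail_score = min(10, 10) / 20 * .1                          # 0.05
print(affil_score + detail_score)                             # ~0.30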
def test_with_synonyms(self): synonyms = [ "test_synAA; test_synAB", "test_synB, a; test_synB, b", "test_synCA, q; test_synCB, q", "test_synD, a; test_synD, b c", "test_synEB, b; test_synEA, a", "test_synFA, a b c d; test_synFB, a", "test_synGA, a b c d; test_synGB, a; test_synGC, b" ] # Hack: inject test synonyms ads_name._name_cache.clear() ads_name._parse_name_synonyms(synonyms) for synonym in synonyms: names = synonym.split(';') # The second copy is for the deletion tests later nad = NameAwareDict() nad2 = NameAwareDict() for i, name in enumerate(names): nad[name] = i nad2[name] = i # Do the insertion in both orders, to ensure we try both # "canonical first" and "canonical last" nad_rev = NameAwareDict() nad_rev2 = NameAwareDict() for i, name in enumerate(reversed(names)): nad_rev[name] = i nad_rev2[name] = i # Ensure that, after inserting under one form and updating under # the other form, we can get the latest value from either form. for name in names: self.assertEqual(nad[name], i) self.assertEqual(nad_rev[name], i) # Check other misc methods for name in names: self.assertIn(name, nad) self.assertIn(name, nad_rev) self.assertEqual(len(nad), 1) self.assertEqual(len(nad_rev), 1) self.assertEqual(nad.keys(), (ADSName.parse(names[-1]), )) self.assertEqual(nad_rev.keys(), (ADSName.parse(names[0]), )) self.assertEqual(nad.values(), (i, )) self.assertEqual(nad_rev.values(), (i, )) # Ensure that deleting one form deletes them all. del nad[names[0]] self.assertEqual(len(nad), 0) for name in names: self.assertNotIn(name, nad) del nad2[names[1]] self.assertEqual(len(nad2), 0) for name in names: self.assertNotIn(name, nad2) del nad_rev[names[0]] self.assertEqual(len(nad_rev), 0) for name in names: self.assertNotIn(name, nad_rev) del nad_rev2[names[1]] self.assertEqual(len(nad_rev2), 0) for name in names: self.assertNotIn(name, nad_rev2) # Verify functionality with '@' modifier for synonym in synonyms: names_orig = synonym.split(';') for names in [names_orig, list(reversed(names_orig))]: # We'll insert under one name, then verify we can't access # or delete under the other nad1 = NameAwareDict() nad2 = NameAwareDict() nad3 = NameAwareDict() nad4 = NameAwareDict() nad1[names[0]] = 1 nad2[names[-1]] = 1 nad3['@' + names[0]] = 1 nad4['@' + names[-1]] = 1 with self.assertRaises(KeyError): nad1['@' + names[-1]] with self.assertRaises(KeyError): nad2['@' + names[0]] with self.assertRaises(KeyError): nad3[names[-1]] with self.assertRaises(KeyError): nad4[names[0]] # I don't think it's worth it to test modification because # it's hard to define how it should work. If we store under # 'name' which has 'name2' as a synonym, we get the same # value for 'name' and 'name2'. If we then store under # '@name2', what should we get when retrieving as 'name2'? # If we then store again under 'name', what should we get # for 'name2'? Or for '@name2'? # nad1['@' + names[-1]] = 2 # self.assertEqual(nad1[names[0]], 1) # nad1['@' + names[0]] = 2 # self.assertEqual(nad1[names[-1]], 1) # nad1[names[-1]] = 2 # self.assertEqual('@' + nad1[names[0]], 1) # nad1[names[0]] = 2 # self.assertEqual('@' + nad1[names[-1]], 1) with self.assertRaises(KeyError): del nad1['@' + names[-1]] with self.assertRaises(KeyError): del nad2['@' + names[0]] with self.assertRaises(KeyError): del nad3[names[-1]] with self.assertRaises(KeyError): del nad4[names[0]] # Remove our test synonyms ads_name._name_cache.clear() ads_name._name_synonyms.clear() ads_name._load_synonyms()
def test_with_specificity(self): nad = NameAwareDict() for name in diff_names: nad[name] = PathNode(name) for i, name in enumerate(equal_names): lt = ADSName.parse("<" + str(name)) lte = ADSName.parse("<=" + str(name)) gt = ADSName.parse(">" + str(name)) gte = ADSName.parse(">=" + str(name)) ex = ADSName.parse("=" + str(name)) if i == 0: self.assertNotIn(lt, nad) self.assertNotIn(lte, nad) else: self.assertIn(lt, nad) self.assertIn(lte, nad) self.assertNotIn(gt, nad) self.assertNotIn(gte, nad) self.assertNotIn(ex, nad) # Node "Last, First" will match and overwrite an existing entry # for "Last, F" nad[name] = PathNode(name) self.assertNotIn(lt, nad) self.assertIn(gte, nad) self.assertIn(lte, nad) self.assertNotIn(gt, nad) self.assertIn(ex, nad) nad = NameAwareDict() for name in diff_names: nad[name] = PathNode(name) for i, name in enumerate(equal_names[::-1]): lt = ADSName.parse("<" + str(name)) lte = ADSName.parse("<=" + str(name)) gt = ADSName.parse(">" + str(name)) gte = ADSName.parse(">=" + str(name)) ex = ADSName.parse("=" + str(name)) if i == 0: self.assertNotIn(gt, nad) self.assertNotIn(gte, nad) else: self.assertIn(gt, nad) self.assertIn(gte, nad) self.assertNotIn(lt, nad) self.assertNotIn(lte, nad) self.assertNotIn(ex, nad) # Node "Last, First" will match and overwrite an existing entry # for "Last, F" nad[name] = PathNode(name) self.assertNotIn(lt, nad) self.assertIn(gte, nad) self.assertIn(lte, nad) self.assertNotIn(gt, nad) self.assertIn(ex, nad)
def test_synonyms(self): synonyms = [ "test_synAA;test_synAB", "test_synBB, ;test_synBA,", "test_synCA, q; test_synCB, q", "test_synD, a; test_synD, b c", "test_synEB, b; test_synEA, a", "test_synFA, a b c d; test_synFB, a" ] # Hack: inject test synonyms ads_name._name_cache.clear() ads_name._parse_name_synonyms(synonyms) for syn in synonyms: names = syn.split(';') self.assertEqual(ADSName.parse(names[0]), ADSName.parse(names[1])) self.assertNotEqual('@' + ADSName.parse(names[0]), '@' + ADSName.parse(names[1])) self.assertNotEqual('@' + ADSName.parse(names[0]), ADSName.parse(names[1])) self.assertNotEqual(ADSName.parse(names[0]), '@' + ADSName.parse(names[1])) for other_synonyms in synonyms: if other_synonyms != syn: other_names = other_synonyms.split(';') for other_name in other_names: self.assertNotEqual(ADSName.parse(names[0]), ADSName.parse(other_name)) self.assertNotEqual(ADSName.parse(names[1]), ADSName.parse(other_name)) # A synonym without given names should work with given names provided self.assertEqual(ADSName.parse("test_synAA, a"), ADSName.parse("test_synAB, abc")) # A synonym with given names should work without given names provided self.assertEqual(ADSName.parse("test_synEA"), ADSName.parse("test_synEB")) # "test_synD, b c" should be selected as canonical. self.assertEqual(ADSName.parse("test_synD, a b c d"), ADSName.parse("test_synD, b")) self.assertEqual( ADSName.parse("test_synD, a b c d").synonym, ADSName.parse("test_synD, b c")) self.assertIsNone(ADSName.parse("test_synD, b c d").synonym) # Names not matching a synonym should be unaffected self.assertIsNone(ADSName.parse("test_synD, e").synonym) self.assertIsNone(ADSName.parse("test_synEA, f").synonym) self.assertIsNone(ADSName.parse("test_synEA, f").synonym) # Synonyms should be possibilities, not mandatory. So 'test_synFB, q', # which is not synonym-ized due to the differing initial, should still # be equal to 'test_synFB', which gets synonym-ized to 'test_synFA' self.assertEqual(ADSName.parse("test_synFB"), ADSName.parse("test_synFB, q")) # Nothing should be changed when using the `preserve` flag self.assertIsNone( ADSName.parse("test_synEA, abc d.", preserve=True).synonym) self.assertIsNone( ADSName.parse("test_synEA, abc d.", preserve=True).synonym) self.assertNotEqual(ADSName.parse("test_synEA, abc d.", preserve=True), ADSName.parse("test_synEB, b", preserve=True)) # Remove our test synonyms ads_name._name_cache.clear() ads_name._name_synonyms.clear() ads_name._load_synonyms()
def get_papers_for_author(self, query_author): query_author = ADSName.parse(query_author) query_authors = self._select_authors_to_prefetch() if query_author not in query_authors: query_authors.append(query_author) lb.i(f"Querying ADS for author " + query_author.qualified_full_name) if len(query_authors) > 1: lb.i(" Also prefetching. Query: " + "; ".join([a.qualified_full_name for a in query_authors])) query_strings = [] for author in query_authors: query_string = '"' + author.full_name + '"' if author.require_exact_match: query_string = "=" + query_string query_strings.append(query_string) query = " OR ".join(query_strings) query = f"author:({query})" documents = self._inner_query_for_author(query, len(query_authors)) author_records = NameAwareDict() for author in query_authors: author_records[author] = AuthorRecord(name=author, documents=[]) # We need to go through all the documents and match them to our # author list. This is critically important if we're pre-fetching # authors, but it's also important to support the "<" and ">" # specificity selectors for author names for document in documents: matched = False names = [ADSName.parse(n) for n in document.authors] for name in names: try: author_records[name].documents.append(document.bibcode) matched = True except KeyError: pass if (not matched and all( not a.require_more_specific and not a.require_less_specific for a in query_authors)): # See if we can guess which names should have been matched guesses = [] doc_authors = [n.full_name for n in names] doc_authors_initialized = \ [n.convert_to_initials().full_name for n in names] for query_author in query_authors: guess = difflib.get_close_matches(query_author.full_name, doc_authors, n=1, cutoff=0.8) if len(guess): guesses.append( f"{query_author.full_name} -> {guess[0]}") else: # Try again, changing names to use initials throughout guess = difflib.get_close_matches( query_author.convert_to_initials().full_name, doc_authors_initialized, n=1, cutoff=0.7) if len(guess): # Having found a match with initialized names, # report using the full form of each name chosen_doc_author = doc_authors[ doc_authors_initialized.index(guess[0])] guesses.append(f"{query_author.full_name}" f" -> {chosen_doc_author}") msg = "ADS Buddy: No matches for " + document.bibcode if len(guesses): msg += " . Guesses: " + "; ".join(guesses) lb.w(msg) for author_record in author_records.values(): # Remove any duplicate document listings # Becomes important for papers with _many_ authors, e.g. LIGO # papers, which use only initials and so can have duplicate names author_record.documents = sorted(set(author_record.documents)) if len(query_authors) == 1: return author_records[query_author], documents else: return author_records, documents
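# The "guessing" step above relies on difflib's similarity cutoff to suggest
# which document author a query author probably should have matched. A small
# standalone illustration with made-up names:
import difflib

doc_authors = ["Doe, Jane", "Smith, John A."]
print(difflib.get_close_matches("Doe, Jan", doc_authors, n=1, cutoff=0.8))
# ['Doe, Jane']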
def test_specificity_equality(self): for i, name1 in enumerate(namesA): name1_lt = ADSName.parse("<" + name1) name1_gt = ADSName.parse(">" + name1) name1_lte = ADSName.parse("<=" + name1) name1_gte = ADSName.parse(">=" + name1) self.assertNotEqual(name1_lt, name1_lt) self.assertNotEqual(name1_gt, name1_gt) self.assertNotEqual(name1_lt, name1_gt) self.assertNotEqual(name1_gt, name1_lt) self.assertEqual(name1_lte, name1_lte) self.assertEqual(name1_gte, name1_gte) self.assertEqual(name1_lte, name1_gte) self.assertEqual(name1_gte, name1_lte) self.assertNotEqual(name1_lte, name1_lt) self.assertNotEqual(name1_gte, name1_gt) self.assertNotEqual(name1_lte, name1_gt) self.assertNotEqual(name1_gte, name1_lt) self.assertNotEqual(name1_lt, name1_lte) self.assertNotEqual(name1_gt, name1_gte) self.assertNotEqual(name1_lt, name1_gte) self.assertNotEqual(name1_gt, name1_lte) for j, name2 in enumerate(namesA): name2 = ADSName.parse(name2) # A larger index corresponds to more specificity, with a # few exceptions if i == j: self.assertNotEqual(name1_lt, name2) self.assertNotEqual(name2, name1_lt) self.assertNotEqual(name1_gt, name2) self.assertNotEqual(name2, name1_gt) self.assertEqual(name1_lte, name2) self.assertEqual(name2, name1_lte) self.assertEqual(name1_gte, name2) self.assertEqual(name2, name1_gte) elif ((i == 2 and j == 4) or (i == 3 and j in (4, 5)) or (i == 4 and j in (2, 3)) or (i == 5 and j == 3)): self.assertNotEqual(name1_lt, name2) self.assertNotEqual(name2, name1_lt) self.assertNotEqual(name1_gt, name2) self.assertNotEqual(name2, name1_gt) self.assertNotEqual(name1_lte, name2) self.assertNotEqual(name2, name1_lte) self.assertNotEqual(name1_gte, name2) self.assertNotEqual(name2, name1_gte) elif i > j: self.assertEqual(name1_lt, name2) self.assertEqual(name2, name1_lt) self.assertNotEqual(name1_gt, name2) self.assertNotEqual(name2, name1_gt) self.assertEqual(name1_lte, name2) self.assertEqual(name2, name1_lte) self.assertNotEqual(name1_gte, name2) self.assertNotEqual(name2, name1_gte) elif i < j: self.assertNotEqual(name1_lt, name2) self.assertNotEqual(name2, name1_lt) self.assertEqual(name1_gt, name2) self.assertEqual(name2, name1_gt) self.assertNotEqual(name1_lte, name2) self.assertNotEqual(name2, name1_lte) self.assertEqual(name1_gte, name2) self.assertEqual(name2, name1_gte) else: self.fail("Shouldn't get here")
def _article_to_record(self, article): # Not every ORCID ID field is returned for every document, and not # every returned list has an entry for each author for key in ('orcid_pub', 'orcid_user', 'orcid_other'): if key not in article: article[key] = [] article[key] = ['' if x == '-' else x for x in article[key]] article[key] += \ [''] * (len(article['author']) - len(article[key])) # Choose one ORCID ID for each author orcid_id = [] orcid_src = [] for op, ou, oo in zip(article['orcid_pub'], article['orcid_user'], article['orcid_other']): if op != '' and is_orcid_id(op): orcid_id.append(normalize_orcid_id(op)) orcid_src.append(1) elif ou != '' and is_orcid_id(ou): orcid_id.append(normalize_orcid_id(ou)) orcid_src.append(2) elif oo != '' and is_orcid_id(oo): orcid_id.append(normalize_orcid_id(oo)) orcid_src.append(3) else: orcid_id.append('') orcid_src.append(0) article['aff'] = ['' if x == '-' else x for x in article['aff']] document = DocumentRecord( bibcode=article["bibcode"], title=(unescape(article["title"][0]) if "title" in article else "[No title given]"), authors=[unescape(a) for a in article["author"]], affils=[unescape(a) for a in article["aff"]], doctype=article["doctype"], keywords=([unescape(k) for k in article["keyword"]] if "keyword" in article else []), publication=(unescape(article["pub"]) if "pub" in article else "[Publication not given]"), pubdate=article["date"], citation_count=(article["citation_count"] if "citation_count" in article else 0), read_count=(article["read_count"] if "read_count" in article else 0), orcid_ids=orcid_id, orcid_id_src=orcid_src) # Alter the DocumentRecord in-place to remove invalid author names bad_indices = [] names = [] for i, author in enumerate(document.authors): try: name = ADSName.parse(author) except InvalidName: lb.w(f"Invalid name for {document.bibcode}: {author}") bad_indices.append(i) continue if name.full_name in ("et al", "anonymous"): bad_indices.append(i) continue names.append(name) for i in reversed(bad_indices): document.delete_author(i) return document
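# A minimal sketch of the per-author ORCID selection above, preferring
# orcid_pub over orcid_user over orcid_other. Hypothetical inputs; the real
# code also validates with is_orcid_id and normalizes the ID before use.
def pick_orcid(op, ou, oo):
    for value, src in ((op, 1), (ou, 2), (oo, 3)):
        if value != '':
            return value, src
    return '', 0

print(pick_orcid('', '0000-0002-1825-0097', ''))  # ('0000-0002-1825-0097', 2)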
from unittest import TestCase import names.ads_name as ads_name from names.ads_name import ADSName from names.name_aware import NameAwareDict, NameAwareSet from path_node import PathNode equal_names_str = [ "Murray, S.", "Murray, Stephen", "Murray, Stephen S", "Murray, Stephen Steve" ] equal_names = [ADSName.parse(n) for n in equal_names_str] diff_names_str = ["Murray, Eva", "Burray, Eva", "Murray, Eric"] diff_names = [ADSName.parse(n) for n in diff_names_str] class TestNameAwareDict(TestCase): def test_get_set_item(self): nad = NameAwareDict() with self.assertRaises(KeyError): nad[diff_names[0]] with self.assertRaises(KeyError): nad[diff_names[1]] node = PathNode(equal_names[0]) nad[equal_names[0]] = node diff_nodes = [] for name in diff_names:
def find_path(self): lb.on_start_path_finding() self.n_iterations = 0 if is_orcid_id(self.orig_src): src_rec = self.repository.get_author_record_by_orcid_id( self.orig_src) self.src = PathNode(name=src_rec.name, dist_from_src=0, legal_bibcodes=set(src_rec.documents)) else: src_rec = self.repository.get_author_record(self.orig_src) self.src = PathNode(name=self.orig_src, dist_from_src=0) if is_orcid_id(self.orig_dest): dest_rec = self.repository.get_author_record_by_orcid_id( self.orig_dest) self.dest = PathNode(name=dest_rec.name, dist_from_dest=0, legal_bibcodes=set(dest_rec.documents)) else: dest_rec = self.repository.get_author_record(self.orig_dest) self.dest = PathNode(name=self.orig_dest, dist_from_dest=0) # If we were given a name and an ORCID ID and they turn out to refer # to the same person, error out. mixed_name_formats = ( (type(self.orig_src) == ADSName and type(self.orig_dest) == str) or (type(self.orig_src) == str and type(self.orig_dest) == ADSName)) if mixed_name_formats and src_rec.name == dest_rec.name: raise PathFinderError( "src_is_dest_after_orcid", 'After looking up the ORCID ID, the "source" and "destination"' ' identities are equal (or at least overlap).') self.nodes[src_rec.name] = self.src self.nodes[dest_rec.name] = self.dest self.authors_to_expand_src_next.append(self.src.name) self.authors_to_expand_dest_next.append(self.dest.name) if (len(src_rec.documents) == 0 or all( [d in self.excluded_bibcodes for d in src_rec.documents])): raise PathFinderError( "src_empty", "No documents found for " + self.src.name.original_name) if (len(dest_rec.documents) == 0 or all( [d in self.excluded_bibcodes for d in dest_rec.documents])): raise PathFinderError( "dest_empty", "No documents found for " + self.dest.name.original_name) while True: lb.d("Beginning new iteration") lb.d(f"{len(self.authors_to_expand_src_next)} " "authors on src side") lb.d(f"{len(self.authors_to_expand_dest_next)} " "authors on dest side") if (len(self.authors_to_expand_src_next) == 0 or len(self.authors_to_expand_dest_next) == 0): raise PathFinderError( "no_authors_to_expand", "No connections possible after " f"{self.n_iterations} iterations") # Of the two lists of authors we could expand, let's always # choose the shortest. This tends to get us to a solution # faster. expanding_from_src = (len(self.authors_to_expand_src_next) < len( self.authors_to_expand_dest_next)) lb.d("Expanding from " f"{'src' if expanding_from_src else 'dest'} side") authors = (self.authors_to_expand_src if expanding_from_src else self.authors_to_expand_dest) authors_next = (self.authors_to_expand_src_next if expanding_from_src else self.authors_to_expand_dest_next) authors.clear() authors.extend(authors_next) authors_next.clear() # There's no point pre-fetching for only one author, and this # ensures we don't re-fetch the src and dest authors if they # were provided by ORCID ID if len(authors) > 1: self.repository.notify_of_upcoming_author_request(*authors) for expand_author in authors: lb.d(f"Expanding author {expand_author}") expand_node = self.nodes[expand_author] expand_node_dist = expand_node.dist(expanding_from_src) # We already have src and dest records handy, and this special # handling is required if either was provided by ORCID ID if expand_node is self.src: record = src_rec elif expand_node is self.dest: record = dest_rec else: record = self.repository.get_author_record(expand_author) # Here's a tricky one. 
If "<=Last, F" is in the exclude # list, and if we previously came across "Last, First" and # we're now expanding that node, we're ok using papers # written under "Last, First" but we're _not_ ok using # papers written under "Last, F.". So we need to ensure # we're allowed to use each paper by ensuring Last, First's # name appears on it in a way that's not excluded. ok_aliases = [ name for name in record.appears_as if name not in self.excluded_names ] if (len(self.excluded_bibcodes) or len(ok_aliases) != len(record.appears_as)): ok_bibcodes = { bibcode for alias in ok_aliases for bibcode in record.appears_as[alias] if bibcode not in self.excluded_bibcodes } else: ok_bibcodes = None for coauthor, bibcodes in record.coauthors.items(): # lb.d(f" Checking coauthor {coauthor}") if ok_bibcodes is not None: bibcodes = [ bibcode for bibcode in bibcodes if bibcode in ok_bibcodes ] if len(bibcodes) == 0: continue coauthor = ADSName.parse(coauthor) if coauthor in self.excluded_names: # lb.d(" Author is excluded") continue try: node = self.nodes[coauthor] # lb.d(f" Author exists in graph") except KeyError: # lb.d(f" New author added to graph") lb.on_coauthor_seen() node = PathNode(name=coauthor) self.nodes[coauthor] = node node.set_dist(expand_node_dist + 1, expanding_from_src) node.neighbors(expanding_from_src).add(expand_node) links = node.links(expanding_from_src)[expand_node] links.update(bibcodes) authors_next.append(coauthor) continue # if (node.dist(expanding_from_src) # <= expand_node_dist): # This node is closer to the src/dest than we are # and must have been encountered in a # previous expansion cycle. Ignore it. # pass if (node.dist(expanding_from_src) > expand_node_dist): # We provide an equal-or-better route from the # src/dest than the route (if any) that this node # is aware of, meaning this node is a viable next # step along the chain from the src/dest through # us. That it already exists suggests it has # multiple chains of equal length connecting it to # the src or dest. # If the src or dest was given via ORCID ID, we need # to make sure we have a valid connection. (E.g. if # the given ID is for one J Doe and our expand_author # is connected to a different J Doe, we need to # exclude that. if len(node.legal_bibcodes): legal_bibcodes = set( bibcodes) & node.legal_bibcodes else: legal_bibcodes = bibcodes if len(legal_bibcodes): links = node.links(expanding_from_src)[expand_node] links.update(legal_bibcodes) node.set_dist(expand_node_dist + 1, expanding_from_src) node.neighbors(expanding_from_src).add(expand_node) # lb.d(f" Added viable step") if self.node_connects(node, expanding_from_src): self.connecting_nodes.add(node) lb.d(f" Connecting author found!") lb.d("All expansions complete") self.n_iterations += 1 if len(self.connecting_nodes) > 0: break elif self.n_iterations > 8: raise PathFinderError( "too_far", "The distance is >8, which is quite far. Giving up.") else: continue self.produce_final_graph() lb.set_n_connections(len(self.connecting_nodes)) lb.set_distance(self.src.dist_from_dest) lb.on_stop_path_finding()
def test_errors(self):
    with self.assertRaises(InvalidName):
        ADSName.parse(",last, first")
    with self.assertRaises(InvalidName):
        ADSName.parse(",last")
def __post_init__(self):
    if self.name is not None:
        self.name = ADSName.parse(self.name)
    if self.timestamp == -1:
        self.timestamp = int(time.time())
def __init__(self, src, dest, excluded_names=None): self.repository = Repository() if not key_is_valid(src) and not is_orcid_id(src): raise PathFinderError("invalid_char_in_name", 'The "source" name is invalid.') if not key_is_valid(dest) and not is_orcid_id(dest): raise PathFinderError("invalid_char_in_name", 'The "destination" name is invalid.') names_to_be_queried = [] if is_orcid_id(src): src = normalize_orcid_id(src) else: try: src = ADSName.parse(src) except InvalidName: raise PathFinderError("invalid_char_in_name", 'The "source" name is invalid.') if src.excludes_self: raise PathFinderError( "src_invalid_lt_gt", "'<' and '>' are invalid modifiers for the source and " "destination authors and can only be used in the " "exclusions " "list. Try '<=' or '>=' instead.") names_to_be_queried.append(src) if is_orcid_id(dest): dest = normalize_orcid_id(dest) else: try: dest = ADSName.parse(dest) except InvalidName: raise PathFinderError("invalid_char_in_name", 'The "destination" name is invalid.') if dest.excludes_self: raise PathFinderError( "dest_invalid_lt_gt", "'<' and '>' are invalid modifiers for the source and " "destination authors and can only be used in the " "exclusions " "list. Try '<=' or '>=' instead.") names_to_be_queried.append(dest) if type(src) == type(dest) and src == dest: raise PathFinderError( "src_is_dest", 'The "source" and "destination" names are equal (or at least' ' consistent). The distance is zero. APPA would like something' ' more challenging, please.') self.excluded_names = NameAwareSet() self.excluded_bibcodes = set() if excluded_names is not None: if type(excluded_names) is str: excluded_names = [excluded_names] for name in excluded_names: name = name.strip() if name == '': continue elif is_bibcode(name): self.excluded_bibcodes.add(name) else: try: self.excluded_names.add(ADSName.parse(name)) except InvalidName: raise PathFinderError( "invalid_excl", f"'{name}' is an invalid name to exclude.") self.repository.notify_of_upcoming_author_request(*names_to_be_queried) self.authors_to_expand_src = [] self.authors_to_expand_src_next = [] self.authors_to_expand_dest = [] self.authors_to_expand_dest_next = [] self.nodes = NameAwareDict() self.connecting_nodes = set() self.orig_src = src self.orig_dest = dest