def get_name_as_in_ADS(target_name, names_in_result: list):
    """For presentation in the UI, figures out how to capitalize a name

    The user may have typed in the query names in all lowercase. For the
    large banner at the top of the page, it would be nice to format the
    names more properly. Rather than just defaulting to
    first-letter-uppercase, we can use our ADS data to present the name in
    a form (or one of the forms) ADS has for the name. This means we may
    also pick up diacritics.

    Looks through all the publications belonging to the name and how the
    author's name appears in those publications. Grabs (one of) the
    most-detailed forms. If it contains more given names than the target
    name, truncates the list. Shortens given names to initials if the
    target name has an initial at that position."""
    # Unique-ify names_in_result
    names_in_result = list(set(names_in_result))
    repo = Repository(can_skip_refresh=True)
    names_in_result = [ADSName.parse(name) for name in names_in_result]
    orcid = is_orcid_id(target_name)
    if orcid:
        record = repo.get_author_record_by_orcid_id(target_name)
    else:
        target_name = ADSName.parse(target_name)
        record = repo.get_author_record(target_name)
    aliases = record.appears_as.keys()
    aliases = [ADSName.parse(alias) for alias in aliases]

    # Remove all aliases that aren't consistent with any of the name forms
    # used in the set of possible chains. E.g. if the user searched for
    # "Last" and all chains terminate at "Last, B.", then we shouldn't view
    # "Last, I." as a viable alias.
    aliases = [alias for alias in aliases if alias in names_in_result]

    # Grab the most-detailed alias. As tie-breaker, choose the form with
    # the most publications.
    alias = sorted(
        [(a.level_of_detail,
          len(record.appears_as[a.original_name]),
          a.original_name)
         for a in aliases])[-1][-1]
    alias = ADSName.parse(alias, preserve=True)

    if orcid:
        gns = alias.given_names
    else:
        # Trim it down to size
        gns = alias.given_names
        if len(gns) > len(target_name.given_names):
            gns = gns[:len(target_name.given_names)]
        # Ensure we have initials where we need them
        gns = [gn if len(tgn) > 1 else gn[0]
               for gn, tgn in zip(gns, target_name.given_names)]

    final_name = ADSName.parse(alias.last_name, *gns, preserve=True)
    return final_name.full_name
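
# To make the trimming-and-initials step above concrete, here is a
# minimal, self-contained sketch of that logic using plain lists instead
# of ADSName objects. The name forms are hypothetical, and
# trim_given_names() is an illustrative helper, not part of this module.
def trim_given_names(alias_gns, target_gns):
    # Keep no more given names than the target name supplied
    gns = alias_gns[:len(target_gns)]
    # Where the target gave only an initial, shorten to an initial
    return [gn if len(tgn) > 1 else gn[0]
            for gn, tgn in zip(gns, target_gns)]

# User typed "mueller, j. robert"; ADS has "Müller, Johann Robert Karl":
# trim_given_names(['Johann', 'Robert', 'Karl'], ['j', 'robert'])
# -> ['J', 'Robert'] (the extra given name is dropped, and an initial is
# used where the target name had an initial)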
class TestRecordCompression(TestCase):
    """Uses mock_backing_cache records to test compression & decompression

    Loads the author and document records. Checks that the decompressed
    records appear consistent. Re-compresses those records and checks that
    they match the compressed source records."""
    def setUp(self):
        self.real_backing_cache = cache_buddy.backing_cache
        cache_buddy.backing_cache = mock_backing_cache
        self.repository = Repository()

    def tearDown(self):
        cache_buddy.backing_cache = self.real_backing_cache
        self.real_backing_cache = None
        cache_buddy._loaded_authors = {}
        cache_buddy._loaded_documents = {}

    def test_author_record_compression(self):
        for author in mock_backing_cache.authors:
            raw_data = mock_backing_cache.load_author(author)
            raw_data = {**raw_data}
            del raw_data['version']
            record = self.repository.get_author_record(author)

            # We have an uncompressed record. Check it for consistency
            for alias in record.appears_as:
                raw_datum = raw_data['appears_as'][alias].split(',')
                self.assertEqual(len(record.appears_as[alias]),
                                 len(raw_datum))
                for idx, bibcode in zip(raw_datum,
                                        record.appears_as[alias]):
                    self.assertEqual(bibcode, record.documents[int(idx)])

            for coauthor in record.coauthors:
                raw_datum = raw_data['coauthors'][coauthor].split(',')
                self.assertEqual(len(record.coauthors[coauthor]),
                                 len(raw_datum))
                for idx, bibcode in zip(raw_datum,
                                        record.coauthors[coauthor]):
                    self.assertEqual(bibcode, record.documents[int(idx)])

            uncompressed_appears_as = copy.deepcopy(record.appears_as)
            uncompressed_coauthors = copy.deepcopy(record.coauthors)

            # Make sure the copy is independent of the original
            native_copy = record.copy()
            record.compress()
            self.assertNotEqual(record.asdict(), native_copy.asdict())

            for alias in record.appears_as:
                self.assertEqual(
                    len(record.appears_as[alias].split(',')),
                    len(uncompressed_appears_as[alias]))
            for coauthor in record.coauthors:
                self.assertEqual(
                    len(record.coauthors[coauthor].split(',')),
                    len(uncompressed_coauthors[coauthor]))

            # The source record in mock_backing_cache is compressed. Check
            # that it matches the re-compressed record
            self.assertEqual(raw_data, record.asdict())

            # Make sure the copy is independent of the original
            native_copy = record.copy()
            record.decompress()
            self.assertNotEqual(record.asdict(), native_copy.asdict())

    def test_document_record_compression(self):
        for document, raw_data in mock_backing_cache.documents.items():
            raw_data = {**raw_data}
            del raw_data['version']
            record = self.repository.get_document(document)

            # We have an uncompressed record. Check it for consistency
            self.assertEqual(len(record.authors), len(record.affils))
            self.assertEqual(len(record.authors), len(record.orcid_ids))
            self.assertEqual(len(record.authors),
                             len(record.orcid_id_src))
            for orcid_id, src in zip(record.orcid_ids,
                                     record.orcid_id_src):
                if orcid_id == '':
                    self.assertEqual(src, 0)
                if src != 0:
                    self.assertNotEqual('', orcid_id)

            uncompressed_affils = copy.deepcopy(record.affils)
            uncompressed_orcid_ids = copy.deepcopy(record.orcid_ids)
            uncompressed_orcid_srcs = copy.deepcopy(record.orcid_id_src)

            # Make sure the copy is independent of the original
            native_copy = record.copy()
            record.compress()
            self.assertNotEqual(record.asdict(), native_copy.asdict())

            # Ensure only empty items are removed
            for affil in uncompressed_affils[len(record.affils):]:
                self.assertEqual(affil, '')
            for orcid_id in uncompressed_orcid_ids[len(record.orcid_ids):]:
                self.assertEqual(orcid_id, '')
            for orcid_src in uncompressed_orcid_srcs[
                    len(record.orcid_id_src.split(',')):]:
                self.assertEqual(orcid_src, 0)

            # The source record in mock_backing_cache is compressed. Check
            # that it matches the re-compressed record
            self.assertEqual(raw_data, record.asdict())

            # Make sure the copy is independent of the original
            native_copy = record.copy()
            record.decompress()
            self.assertNotEqual(record.asdict(), native_copy.asdict())
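
# For orientation, the compressed layout these tests exercise looks
# roughly like the sketch below. This is an inference from the assertions
# above, and the record contents are hypothetical: each bibcode is stored
# once in 'documents', while 'appears_as' and 'coauthors' map to
# comma-joined strings of indices into that list.
_example_compressed_author = {
    'name': 'author, a.',
    'documents': ['paperAB', 'paperAE'],
    'appears_as': {'Author, A.': '0,1'},
    'coauthors': {'Coauthor, B.': '0'},
}

# Decompression would expand each index string back into a bibcode list:
_decompressed_appears_as = {
    alias: [_example_compressed_author['documents'][int(i)]
            for i in idxs.split(',')]
    for alias, idxs in _example_compressed_author['appears_as'].items()
}
# -> {'Author, A.': ['paperAB', 'paperAE']}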
class PathFinder:
    repository: Repository
    nodes: NameAwareDict
    src: PathNode
    dest: PathNode
    excluded_names: NameAwareSet
    excluded_bibcodes: set
    connecting_nodes: Set[PathNode]
    n_iterations: int

    authors_to_expand_src: List[ADSName]
    authors_to_expand_src_next: List[ADSName]
    authors_to_expand_dest: List[ADSName]
    authors_to_expand_dest_next: List[ADSName]

    def __init__(self, src, dest, excluded_names=None):
        self.repository = Repository()
        if not key_is_valid(src) and not is_orcid_id(src):
            raise PathFinderError(
                "invalid_char_in_name",
                'The "source" name is invalid.')
        if not key_is_valid(dest) and not is_orcid_id(dest):
            raise PathFinderError(
                "invalid_char_in_name",
                'The "destination" name is invalid.')

        names_to_be_queried = []
        if is_orcid_id(src):
            src = normalize_orcid_id(src)
        else:
            try:
                src = ADSName.parse(src)
            except InvalidName:
                raise PathFinderError(
                    "invalid_char_in_name",
                    'The "source" name is invalid.')
            if src.excludes_self:
                raise PathFinderError(
                    "src_invalid_lt_gt",
                    "'<' and '>' are invalid modifiers for the source and "
                    "destination authors and can only be used in the "
                    "exclusions list. Try '<=' or '>=' instead.")
            names_to_be_queried.append(src)

        if is_orcid_id(dest):
            dest = normalize_orcid_id(dest)
        else:
            try:
                dest = ADSName.parse(dest)
            except InvalidName:
                raise PathFinderError(
                    "invalid_char_in_name",
                    'The "destination" name is invalid.')
            if dest.excludes_self:
                raise PathFinderError(
                    "dest_invalid_lt_gt",
                    "'<' and '>' are invalid modifiers for the source and "
                    "destination authors and can only be used in the "
                    "exclusions list. Try '<=' or '>=' instead.")
            names_to_be_queried.append(dest)

        if type(src) == type(dest) and src == dest:
            raise PathFinderError(
                "src_is_dest",
                'The "source" and "destination" names are equal (or at'
                ' least consistent). The distance is zero. APPA would like'
                ' something more challenging, please.')

        self.excluded_names = NameAwareSet()
        self.excluded_bibcodes = set()
        if excluded_names is not None:
            if type(excluded_names) is str:
                excluded_names = [excluded_names]
            for name in excluded_names:
                name = name.strip()
                if name == '':
                    continue
                elif is_bibcode(name):
                    self.excluded_bibcodes.add(name)
                else:
                    try:
                        self.excluded_names.add(ADSName.parse(name))
                    except InvalidName:
                        raise PathFinderError(
                            "invalid_excl",
                            f"'{name}' is an invalid name to exclude.")

        self.repository.notify_of_upcoming_author_request(
            *names_to_be_queried)
        self.authors_to_expand_src = []
        self.authors_to_expand_src_next = []
        self.authors_to_expand_dest = []
        self.authors_to_expand_dest_next = []

        self.nodes = NameAwareDict()
        self.connecting_nodes = set()

        self.orig_src = src
        self.orig_dest = dest

    def find_path(self):
        lb.on_start_path_finding()
        self.n_iterations = 0

        if is_orcid_id(self.orig_src):
            src_rec = self.repository.get_author_record_by_orcid_id(
                self.orig_src)
            self.src = PathNode(name=src_rec.name, dist_from_src=0,
                                legal_bibcodes=set(src_rec.documents))
        else:
            src_rec = self.repository.get_author_record(self.orig_src)
            self.src = PathNode(name=self.orig_src, dist_from_src=0)

        if is_orcid_id(self.orig_dest):
            dest_rec = self.repository.get_author_record_by_orcid_id(
                self.orig_dest)
            self.dest = PathNode(name=dest_rec.name, dist_from_dest=0,
                                 legal_bibcodes=set(dest_rec.documents))
        else:
            dest_rec = self.repository.get_author_record(self.orig_dest)
            self.dest = PathNode(name=self.orig_dest, dist_from_dest=0)

        # If we were given a name and an ORCID ID and they turn out to
        # refer to the same person, error out.
        mixed_name_formats = (
            (type(self.orig_src) == ADSName
             and type(self.orig_dest) == str)
            or (type(self.orig_src) == str
                and type(self.orig_dest) == ADSName))
        if mixed_name_formats and src_rec.name == dest_rec.name:
            raise PathFinderError(
                "src_is_dest_after_orcid",
                'After looking up the ORCID ID, the "source" and'
                ' "destination" identities are equal (or at least'
                ' overlap).')

        self.nodes[src_rec.name] = self.src
        self.nodes[dest_rec.name] = self.dest
        self.authors_to_expand_src_next.append(self.src.name)
        self.authors_to_expand_dest_next.append(self.dest.name)

        if (len(src_rec.documents) == 0
                or all([d in self.excluded_bibcodes
                        for d in src_rec.documents])):
            raise PathFinderError(
                "src_empty",
                "No documents found for " + self.src.name.original_name)
        if (len(dest_rec.documents) == 0
                or all([d in self.excluded_bibcodes
                        for d in dest_rec.documents])):
            raise PathFinderError(
                "dest_empty",
                "No documents found for " + self.dest.name.original_name)

        while True:
            lb.d("Beginning new iteration")
            lb.d(f"{len(self.authors_to_expand_src_next)} "
                 "authors on src side")
            lb.d(f"{len(self.authors_to_expand_dest_next)} "
                 "authors on dest side")
            if (len(self.authors_to_expand_src_next) == 0
                    or len(self.authors_to_expand_dest_next) == 0):
                raise PathFinderError(
                    "no_authors_to_expand",
                    "No connections possible after "
                    f"{self.n_iterations} iterations")

            # Of the two lists of authors we could expand, let's always
            # choose the shortest. This tends to get us to a solution
            # faster.
            expanding_from_src = (
                len(self.authors_to_expand_src_next)
                < len(self.authors_to_expand_dest_next))
            lb.d("Expanding from "
                 f"{'src' if expanding_from_src else 'dest'} side")

            authors = (self.authors_to_expand_src
                       if expanding_from_src
                       else self.authors_to_expand_dest)
            authors_next = (self.authors_to_expand_src_next
                            if expanding_from_src
                            else self.authors_to_expand_dest_next)
            authors.clear()
            authors.extend(authors_next)
            authors_next.clear()

            # There's no point pre-fetching for only one author, and this
            # ensures we don't re-fetch the src and dest authors if they
            # were provided by ORCID ID
            if len(authors) > 1:
                self.repository.notify_of_upcoming_author_request(*authors)
            for expand_author in authors:
                lb.d(f"Expanding author {expand_author}")
                expand_node = self.nodes[expand_author]
                expand_node_dist = expand_node.dist(expanding_from_src)
                # We already have src and dest records handy, and this
                # special handling is required if either was provided by
                # ORCID ID
                if expand_node is self.src:
                    record = src_rec
                elif expand_node is self.dest:
                    record = dest_rec
                else:
                    record = self.repository.get_author_record(
                        expand_author)

                # Here's a tricky one. If "<=Last, F" is in the exclude
                # list, and if we previously came across "Last, First" and
                # we're now expanding that node, we're ok using papers
                # written under "Last, First" but we're _not_ ok using
                # papers written under "Last, F.". So we need to ensure
                # we're allowed to use each paper by ensuring Last, First's
                # name appears on it in a way that's not excluded.
                ok_aliases = [
                    name for name in record.appears_as
                    if name not in self.excluded_names]
                if (len(self.excluded_bibcodes)
                        or len(ok_aliases) != len(record.appears_as)):
                    ok_bibcodes = {
                        bibcode
                        for alias in ok_aliases
                        for bibcode in record.appears_as[alias]
                        if bibcode not in self.excluded_bibcodes}
                else:
                    ok_bibcodes = None

                for coauthor, bibcodes in record.coauthors.items():
                    # lb.d(f"  Checking coauthor {coauthor}")
                    if ok_bibcodes is not None:
                        bibcodes = [bibcode for bibcode in bibcodes
                                    if bibcode in ok_bibcodes]
                        if len(bibcodes) == 0:
                            continue

                    coauthor = ADSName.parse(coauthor)
                    if coauthor in self.excluded_names:
                        # lb.d("  Author is excluded")
                        continue

                    try:
                        node = self.nodes[coauthor]
                        # lb.d("  Author exists in graph")
                    except KeyError:
                        # lb.d("  New author added to graph")
                        lb.on_coauthor_seen()
                        node = PathNode(name=coauthor)
                        self.nodes[coauthor] = node
                        node.set_dist(expand_node_dist + 1,
                                      expanding_from_src)
                        node.neighbors(expanding_from_src).add(expand_node)
                        links = node.links(expanding_from_src)[expand_node]
                        links.update(bibcodes)
                        authors_next.append(coauthor)
                        continue

                    # if (node.dist(expanding_from_src)
                    #         <= expand_node_dist):
                        # This node is closer to the src/dest than we are
                        # and must have been encountered in a previous
                        # expansion cycle. Ignore it.
                        # pass
                    if node.dist(expanding_from_src) > expand_node_dist:
                        # We provide an equal-or-better route from the
                        # src/dest than the route (if any) that this node
                        # is aware of, meaning this node is a viable next
                        # step along the chain from the src/dest through
                        # us. That it already exists suggests it has
                        # multiple chains of equal length connecting it to
                        # the src or dest.
                        # If the src or dest was given via ORCID ID, we
                        # need to make sure we have a valid connection.
                        # (E.g. if the given ID is for one J Doe and our
                        # expand_author is connected to a different J Doe,
                        # we need to exclude that.)
                        if len(node.legal_bibcodes):
                            legal_bibcodes = (set(bibcodes)
                                              & node.legal_bibcodes)
                        else:
                            legal_bibcodes = bibcodes
                        if len(legal_bibcodes):
                            links = node.links(
                                expanding_from_src)[expand_node]
                            links.update(legal_bibcodes)
                            node.set_dist(expand_node_dist + 1,
                                          expanding_from_src)
                            node.neighbors(expanding_from_src).add(
                                expand_node)
                            # lb.d("  Added viable step")
                            if self.node_connects(node,
                                                  expanding_from_src):
                                self.connecting_nodes.add(node)
                                lb.d("  Connecting author found!")

            lb.d("All expansions complete")
            self.n_iterations += 1
            if len(self.connecting_nodes) > 0:
                break
            elif self.n_iterations > 8:
                raise PathFinderError(
                    "too_far",
                    "The distance is >8, which is quite far. Giving up.")
            else:
                continue

        self.produce_final_graph()
        lb.set_n_connections(len(self.connecting_nodes))
        lb.set_distance(self.src.dist_from_dest)
        lb.on_stop_path_finding()

    def node_connects(self, node: PathNode, expanding_from_src: bool):
        if (len(node.neighbors_toward_src) > 0
                and len(node.neighbors_toward_dest) > 0):
            return True
        if expanding_from_src and node is self.dest:
            return True
        if not expanding_from_src and node is self.src:
            return True
        return False

    def produce_final_graph(self):
        # Step one: Make all linkages bidirectional
        nodes_to_walk = deque(self.connecting_nodes)
        visited = set()
        while len(nodes_to_walk):
            node = nodes_to_walk.popleft()
            if node in visited:
                continue
            visited.add(node)
            for neighbor in node.neighbors_toward_src:
                if neighbor not in visited:
                    nodes_to_walk.append(neighbor)
                neighbor.neighbors_toward_dest.add(node)
                neighbor.dist_from_dest = min(node.dist_from_dest + 1,
                                              neighbor.dist_from_dest)
                neighbor.links_toward_dest[node] = \
                    node.links_toward_src[neighbor]
            for neighbor in node.neighbors_toward_dest:
                if neighbor not in visited:
                    nodes_to_walk.append(neighbor)
                neighbor.neighbors_toward_src.add(node)
                neighbor.dist_from_src = min(node.dist_from_src + 1,
                                             neighbor.dist_from_src)
                neighbor.links_toward_src[node] = \
                    node.links_toward_dest[neighbor]

        # Step two: Remove any links that aren't along the most direct
        # route
        nodes_to_walk = [self.src]
        while len(nodes_to_walk):
            node = nodes_to_walk.pop()
            if len(node.neighbors_toward_dest):
                dist_of_best_neighbor = min(
                    neighbor.dist_from_dest
                    for neighbor in node.neighbors_toward_dest)
                # Copy the set we're iterating over, since we mutate it
                # in the loop
                for neighbor in list(node.neighbors_toward_dest):
                    if neighbor.dist_from_dest != dist_of_best_neighbor:
                        node.neighbors_toward_dest.remove(neighbor)
                        node.links_toward_dest.pop(neighbor)
                        neighbor.neighbors_toward_src.remove(node)
                        neighbor.links_toward_src.pop(node)
                    else:
                        nodes_to_walk.append(neighbor)
            if len(node.neighbors_toward_src):
                dist_of_best_neighbor = min(
                    neighbor.dist_from_src
                    for neighbor in node.neighbors_toward_src)
                for neighbor in list(node.neighbors_toward_src):
                    if neighbor.dist_from_src != dist_of_best_neighbor:
                        node.neighbors_toward_src.remove(neighbor)
                        node.links_toward_src.pop(neighbor)
                        neighbor.neighbors_toward_dest.remove(node)
                        neighbor.links_toward_dest.pop(node)

        # Step three: Remove nodes that aren't on a path between src and
        # dest. Iterate over a copy, since we delete entries as we go.
        for name, node in list(self.nodes.items()):
            if node is self.src or node is self.dest:
                continue
            if (len(node.neighbors_toward_src) == 0
                    or len(node.neighbors_toward_dest) == 0):
                del self.nodes[name]
class TestRepository(TestCase):
    def setUp(self):
        self.real_backing_cache = cache_buddy.backing_cache
        cache_buddy.backing_cache = mock_backing_cache
        self.repository = Repository()
        mock_backing_cache.store_author.reset_mock()

    def tearDown(self):
        cache_buddy.backing_cache = self.real_backing_cache
        cache_buddy._loaded_authors = {}
        cache_buddy._loaded_documents = {}
        mock_backing_cache.store_author.reset_mock()

    def test_get_author(self):
        record = self.repository.get_author_record('author, a.')
        record.compress()
        record = record.asdict()
        record['version'] = cache_buddy.AUTHOR_VERSION_NUMBER
        self.assertEqual(record,
                         mock_backing_cache.load_author('author, a.'))

    def test_get_document(self):
        record = self.repository.get_document('paperAB')
        record.compress()
        record = record.asdict()
        record['version'] = cache_buddy.DOCUMENT_VERSION_NUMBER
        self.assertEqual(record, mock_backing_cache.documents['paperAB'])

    def test_author_record_generation(self):
        record = self.repository.get_author_record('>author, a.')
        self.assertEqual(len(record.documents), 3)
        self.assertEqual(record.documents[0], 'paperAB2')
        self.assertEqual(record.documents[1], 'paperAE')
        self.assertEqual(record.documents[2], 'paperAK')
        mock_backing_cache.store_author.assert_called_once()
        cached_record = mock_backing_cache.store_author.call_args[0][0]
        self.assertEqual(cached_record['name'], '>author, a.')
        self.assertEqual(cached_record['documents'], record.documents)

        mock_backing_cache.store_author.reset_mock()
        record = self.repository.get_author_record('=author, a.')
        self.assertEqual(len(record.documents), 1)
        self.assertEqual(sorted(record.documents)[0], 'paperAB')
        mock_backing_cache.store_author.assert_called_once()
        cached_record = mock_backing_cache.store_author.call_args[0][0]
        self.assertEqual(cached_record['name'], '=author, a.')
        self.assertEqual(cached_record['documents'], record.documents)

        mock_backing_cache.store_author.reset_mock()
        record = self.repository.get_author_record('<author, aa')
        self.assertEqual(len(record.documents), 1)
        self.assertEqual(sorted(record.documents)[0], 'paperAB')
        mock_backing_cache.store_author.assert_called_once()
        cached_record = mock_backing_cache.store_author.call_args[0][0]
        self.assertEqual(cached_record['name'], '<author, aa')
        self.assertEqual(cached_record['documents'], record.documents)
        mock_backing_cache.store_author.reset_mock()
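
# A note on the name-modifier syntax exercised above, hedged as an
# inference from these expectations rather than a spec: '=name' appears
# to match only the exact form, '>name' strictly more specific forms
# (extra or spelled-out given names), and '<name' strictly less specific
# forms, with '>=' / '<=' as the inclusive variants mentioned in
# PathFinder's error messages. A toy sketch of the "more specific"
# relation for initial-style given names (illustrative only, not
# ADSName's actual logic):
def _name_is_more_specific(general, specific):
    """True if `specific` could be a more detailed form of `general`."""
    glast, _, ggiven = general.partition(', ')
    slast, _, sgiven = specific.partition(', ')
    return (glast == slast
            and sgiven != ggiven
            and sgiven.rstrip('.').startswith(ggiven.rstrip('.')))

# _name_is_more_specific('author, a.', 'author, aa')  # -> True
# _name_is_more_specific('author, a.', 'author, a.')  # -> False (equal)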