def extract_viaf_info(self, cluster, working_sort_name=None, working_display_name=False):
    """Extract name info from a single VIAF cluster.

    :param cluster: XML element for one VIAF cluster; it is queried with
        XPath and handed to this object's extraction helpers.
    :param working_sort_name: The sort name being searched for, if known.
    :param working_display_name: The display name being searched for, if
        known. When the cluster yields a Wikipedia name, the display name
        derived from it takes over for the rest of the extraction.
    :return: a 3-tuple containing:
        - ContributorData object carrying the VIAF ID and whichever of the
          display, sort, family, and wikipedia names could be determined.
        - dictionary of ways the xml cluster data matched the names
          searched for.
        - list of titles attributed to the contributor in the cluster.
    """
    contributor_data = ContributorData()

    # Find out if one of the working names shows up in a name record.
    # Note: Potentially sets contributor_data.sort_name.
    match_confidences = self.cluster_has_record_for_named_author(
        cluster, working_sort_name, working_display_name, contributor_data
    )

    # Get the VIAF ID for this cluster, just in case we don't have one yet.
    viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
    contributor_data.viaf = None if viaf_tag is None else viaf_tag.text

    # Tally the sort names in this cluster. If no sort name has been
    # settled on by the time the UNIMARC records are processed, the most
    # popular one is used.
    # NOTE(review): presumably a collections.Counter -- it is indexed with
    # `+= 1` and asked for `most_common` below; confirm against the helper.
    sort_name_popularity = self.sort_names_by_popularity(cluster)

    # Does this cluster have a Wikipedia page?
    contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
    if contributor_data.wikipedia_name:
        contributor_data.display_name = self.wikipedia_name_to_display_name(
            contributor_data.wikipedia_name
        )
        working_display_name = contributor_data.display_name
        # TODO: There's a problem here when someone's record has a
        # Wikipedia page other than their personal page (e.g. for
        # a band they're in.)

    known_name = working_sort_name or working_display_name
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]'
    )
    candidates = []
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        # Some part of this name must also show up in the original
        # name for it to even be considered. Otherwise it's a
        # better bet to try to munge the original name.
        for v in (possible_given, possible_family, possible_extra):
            if not v:
                continue
            if not known_name or v in known_name:
                self.log.debug("FOUND %s in %s", v, known_name)
                candidates.append(
                    (possible_given, possible_family, possible_extra)
                )
                if possible_sort_name:
                    if possible_sort_name.endswith(","):
                        possible_sort_name = possible_sort_name[:-1]
                    sort_name_popularity[possible_sort_name] += 1
                # One resembling part is enough; stop checking this record.
                break
        else:
            # Runs only when the loop above never hit `break`, i.e. no
            # part of this record resembled the known name.
            self.log.debug(
                "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                possible_given, possible_family, possible_extra,
                known_name
            )

    if sort_name_popularity and not contributor_data.sort_name:
        contributor_data.sort_name, _count = sort_name_popularity.most_common(1)[0]

    if contributor_data.display_name:
        parts = contributor_data.display_name.split(" ")
        if len(parts) == 2:
            # Pretty clearly given name+family name.
            # If it gets more complicated than this we can't
            # be confident.
            # Stored as a tuple so every candidate has the same
            # (given, family, extra) shape as the UNIMARC ones.
            candidates.append((parts[0], parts[1], None))

    display_nameparts = self.best_choice(candidates)
    if display_nameparts[1]:  # Family name
        contributor_data.family_name = display_nameparts[1]

    # Fall back through progressively weaker sources. The trailing
    # `or None` keeps the `False` default of working_display_name (or an
    # empty string) from being stored as if it were a name.
    contributor_data.display_name = (
        contributor_data.display_name
        or self.combine_nameparts(*display_nameparts)
        or working_display_name
        or None
    )

    # Now go through the title elements, and make a list.
    titles = self._xpath(
        cluster,
        './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]'
    )
    contributor_titles = [title.text for title in titles]

    return contributor_data, match_confidences, contributor_titles
def extract_viaf_info(self, cluster, working_sort_name=None, working_display_name=False):
    """Extract name info from a single VIAF cluster.

    :param cluster: XML element for one VIAF cluster; it is queried with
        XPath and handed to this object's extraction helpers.
    :param working_sort_name: The sort name being searched for, if known.
    :param working_display_name: The display name being searched for, if
        known. When the cluster yields a Wikipedia name, the display name
        derived from it takes over for the rest of the extraction.
    :return: a 3-tuple containing:
        - ContributorData object carrying the VIAF ID and whichever of the
          display, sort, family, and wikipedia names could be determined.
        - dictionary of ways the xml cluster data matched the names
          searched for.
        - list of titles attributed to the contributor in the cluster.
    """
    contributor_data = ContributorData()

    # Find out if one of the working names shows up in a name record.
    # Note: Potentially sets contributor_data.sort_name.
    match_confidences = self.cluster_has_record_for_named_author(
        cluster, working_sort_name, working_display_name, contributor_data
    )

    # Get the VIAF ID for this cluster, just in case we don't have one yet.
    viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
    contributor_data.viaf = None if viaf_tag is None else viaf_tag.text

    # Tally the sort names in this cluster. If no sort name has been
    # settled on by the time the UNIMARC records are processed, the most
    # popular one is used.
    # NOTE(review): presumably a collections.Counter -- it is indexed with
    # `+= 1` and asked for `most_common` below; confirm against the helper.
    sort_name_popularity = self.sort_names_by_popularity(cluster)

    # Does this cluster have a Wikipedia page?
    contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
    if contributor_data.wikipedia_name:
        contributor_data.display_name = self.wikipedia_name_to_display_name(
            contributor_data.wikipedia_name
        )
        working_display_name = contributor_data.display_name
        # TODO: There's a problem here when someone's record has a
        # Wikipedia page other than their personal page (e.g. for
        # a band they're in.)

    known_name = working_sort_name or working_display_name
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]'
    )
    candidates = []
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        # Some part of this name must also show up in the original
        # name for it to even be considered. Otherwise it's a
        # better bet to try to munge the original name.
        for v in (possible_given, possible_family, possible_extra):
            if not v:
                continue
            if not known_name or v in known_name:
                self.log.debug("FOUND %s in %s", v, known_name)
                candidates.append(
                    (possible_given, possible_family, possible_extra)
                )
                if possible_sort_name:
                    if possible_sort_name.endswith(","):
                        possible_sort_name = possible_sort_name[:-1]
                    sort_name_popularity[possible_sort_name] += 1
                # One resembling part is enough; stop checking this record.
                break
        else:
            # Runs only when the loop above never hit `break`, i.e. no
            # part of this record resembled the known name.
            self.log.debug(
                "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                possible_given, possible_family, possible_extra,
                known_name
            )

    if sort_name_popularity and not contributor_data.sort_name:
        contributor_data.sort_name, _count = sort_name_popularity.most_common(1)[0]

    if contributor_data.display_name:
        parts = contributor_data.display_name.split(" ")
        if len(parts) == 2:
            # Pretty clearly given name+family name.
            # If it gets more complicated than this we can't
            # be confident.
            # Stored as a tuple so every candidate has the same
            # (given, family, extra) shape as the UNIMARC ones.
            candidates.append((parts[0], parts[1], None))

    display_nameparts = self.best_choice(candidates)
    if display_nameparts[1]:  # Family name
        contributor_data.family_name = display_nameparts[1]

    # Fall back through progressively weaker sources. The trailing
    # `or None` keeps the `False` default of working_display_name (or an
    # empty string) from being stored as if it were a name.
    contributor_data.display_name = (
        contributor_data.display_name
        or self.combine_nameparts(*display_nameparts)
        or working_display_name
        or None
    )

    # Now go through the title elements, and make a list.
    titles = self._xpath(
        cluster,
        './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]'
    )
    contributor_titles = [title.text for title in titles]

    return contributor_data, match_confidences, contributor_titles
def cluster_has_record_for_named_author(
        self, cluster, working_sort_name, working_display_name,
        contributor_data=None):
    """Look through the xml cluster for fields that match the author's name.

    The checks run from strongest to weakest evidence: cluster sort
    names, the Wikipedia-derived display name, UNIMARC records, a sort
    name guessed from the display name, and finally alternate name forms.
    The search stops at the first check that scores above the "sure
    match" threshold; the matching name is recorded on contributor_data
    before returning.

    :param cluster: XML element for one VIAF cluster.
    :param working_sort_name: The sort name being searched for, if any.
    :param working_display_name: The display name being searched for,
        if any.
    :param contributor_data: Optional object that receives whichever
        names were matched. A fresh ContributorData is used when omitted.
    :return: a dictionary mapping the kind of xml field examined
        ("sort_name", "display_name", "unimarc", "guessed_sort_name",
        "alternate_name") to a match score. Each key holds the score of
        the last candidate examined for that kind of field.
    """
    # fuzzy match filter may not always give a 100% match, so cap
    # arbitrarily at 90% as a "sure match"
    sure_match_threshold = 90

    match_confidences = {}
    # Compare against None rather than truthiness so that a
    # caller-supplied container is never silently swapped for a new one.
    if contributor_data is None:
        contributor_data = ContributorData()

    # If we have a sort name to look for, and it's in this cluster's
    # sort names, great.
    if working_sort_name:
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name
            )
            match_confidences["sort_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                contributor_data.sort_name = potential_match
                return match_confidences

    # If we have a display name to look for, and this cluster's
    # Wikipedia name converts to the display name, great.
    if working_display_name:
        wikipedia_name = self.extract_wikipedia_name(cluster)
        if wikipedia_name:
            contributor_data.wikipedia_name = wikipedia_name
            display_name = self.wikipedia_name_to_display_name(wikipedia_name)
            match_confidence = contributor_name_match_ratio(
                display_name, working_display_name
            )
            match_confidences["display_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                contributor_data.display_name = display_name
                return match_confidences

    # If there are UNIMARC records, and every part of the UNIMARC
    # record matches the sort name or the display name, great.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]'
    )
    for unimarc in unimarcs:
        (possible_given, possible_family, possible_extra,
         possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        if working_sort_name:
            # NOTE(review): possible_sort_name is not checked for None
            # before being scored -- confirm the match helper accepts it.
            match_confidence = contributor_name_match_ratio(
                possible_sort_name, working_sort_name
            )
            match_confidences["unimarc"] = match_confidence
            if match_confidence > sure_match_threshold:
                # NOTE(review): this stores a *sort* name in family_name;
                # kept as-is, but confirm that is intended.
                contributor_data.family_name = possible_sort_name
                return match_confidences

        for name in (working_sort_name, working_display_name):
            if not name:
                continue
            if (possible_given and possible_given in name
                    and possible_family and possible_family in name
                    and (not possible_extra or possible_extra in name)):
                # Every available part appears in the name: report a
                # fixed confidence of 90 and stop.
                match_confidences["unimarc"] = 90
                contributor_data.family_name = possible_family
                return match_confidences

    # Last-ditch effort. Guess at the sort name and see if *that's* one
    # of the cluster sort names.
    if working_display_name and not working_sort_name:
        test_sort_name = display_name_to_sort_name(working_display_name)
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, test_sort_name
            )
            match_confidences["guessed_sort_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                contributor_data.sort_name = potential_match
                return match_confidences

    # OK, last last-ditch effort. See if the alternate name forms
    # (pseudonyms) are it.
    if working_sort_name:
        for potential_match in self.alternate_name_forms_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name
            )
            match_confidences["alternate_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                # NOTE(review): an alternate name form is stored in
                # family_name here; kept as-is, confirm that is intended.
                contributor_data.family_name = potential_match
                return match_confidences

    return match_confidences
def cluster_has_record_for_named_author(self, cluster, working_sort_name,
                                        working_display_name,
                                        contributor_data=None):
    """Look through the xml cluster for fields that match the author's name.

    The checks run from strongest to weakest evidence: cluster sort
    names, the Wikipedia-derived display name, UNIMARC records, a sort
    name guessed from the display name, and finally alternate name forms.
    The search stops at the first check that scores above the "sure
    match" threshold; the matching name is recorded on contributor_data
    before returning.

    :param cluster: XML element for one VIAF cluster.
    :param working_sort_name: The sort name being searched for, if any.
    :param working_display_name: The display name being searched for,
        if any.
    :param contributor_data: Optional object that receives whichever
        names were matched. A fresh ContributorData is used when omitted.
    :return: a dictionary mapping the kind of xml field examined
        ("sort_name", "display_name", "unimarc", "guessed_sort_name",
        "alternate_name") to a match score. Each key holds the score of
        the last candidate examined for that kind of field.
    """
    # fuzzy match filter may not always give a 100% match, so cap
    # arbitrarily at 90% as a "sure match"
    sure_match_threshold = 90

    match_confidences = {}
    # Compare against None rather than truthiness so that a
    # caller-supplied container is never silently swapped for a new one.
    if contributor_data is None:
        contributor_data = ContributorData()

    # If we have a sort name to look for, and it's in this cluster's
    # sort names, great.
    if working_sort_name:
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name
            )
            match_confidences["sort_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                contributor_data.sort_name = potential_match
                return match_confidences

    # If we have a display name to look for, and this cluster's
    # Wikipedia name converts to the display name, great.
    if working_display_name:
        wikipedia_name = self.extract_wikipedia_name(cluster)
        if wikipedia_name:
            contributor_data.wikipedia_name = wikipedia_name
            display_name = self.wikipedia_name_to_display_name(wikipedia_name)
            match_confidence = contributor_name_match_ratio(
                display_name, working_display_name
            )
            match_confidences["display_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                contributor_data.display_name = display_name
                return match_confidences

    # If there are UNIMARC records, and every part of the UNIMARC
    # record matches the sort name or the display name, great.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]'
    )
    for unimarc in unimarcs:
        (possible_given, possible_family, possible_extra,
         possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        if working_sort_name:
            # NOTE(review): possible_sort_name is not checked for None
            # before being scored -- confirm the match helper accepts it.
            match_confidence = contributor_name_match_ratio(
                possible_sort_name, working_sort_name
            )
            match_confidences["unimarc"] = match_confidence
            if match_confidence > sure_match_threshold:
                # NOTE(review): this stores a *sort* name in family_name;
                # kept as-is, but confirm that is intended.
                contributor_data.family_name = possible_sort_name
                return match_confidences

        for name in (working_sort_name, working_display_name):
            if not name:
                continue
            if (possible_given and possible_given in name
                    and possible_family and possible_family in name
                    and (not possible_extra or possible_extra in name)):
                # Every available part appears in the name: report a
                # fixed confidence of 90 and stop.
                match_confidences["unimarc"] = 90
                contributor_data.family_name = possible_family
                return match_confidences

    # Last-ditch effort. Guess at the sort name and see if *that's* one
    # of the cluster sort names.
    if working_display_name and not working_sort_name:
        test_sort_name = display_name_to_sort_name(working_display_name)
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, test_sort_name
            )
            match_confidences["guessed_sort_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                contributor_data.sort_name = potential_match
                return match_confidences

    # OK, last last-ditch effort. See if the alternate name forms
    # (pseudonyms) are it.
    if working_sort_name:
        for potential_match in self.alternate_name_forms_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name
            )
            match_confidences["alternate_name"] = match_confidence
            if match_confidence > sure_match_threshold:
                # NOTE(review): an alternate name form is stored in
                # family_name here; kept as-is, confirm that is intended.
                contributor_data.family_name = potential_match
                return match_confidences

    return match_confidences