def test_name_tidy(self):
    """Check how display_name_to_sort_name tidies trailing punctuation."""
    cases = (
        # A stray trailing comma is removed.
        ("Bitshifter, Bob,", "Bitshifter, Bob"),
        # A period following a full word is removed.
        ("Bitshifter, Bober.", "Bitshifter, Bober"),
        # A period following a single initial is retained.
        ("Bitshifter, B.", "Bitshifter, B."),
    )
    for raw_name, expected in cases:
        tidied = display_name_to_sort_name(raw_name)
        assert expected == tidied
def default_name(self, display_name):
    """Return a sort name built from ``display_name``.

    The display name is first reduced with ``self.primary_author_name``
    and the result is then converted with ``display_name_to_sort_name``.

    :param display_name: the name to convert.
    :return: whatever ``display_name_to_sort_name`` produces for the
        reduced name.
    """
    return display_name_to_sort_name(self.primary_author_name(display_name))
def cluster_has_record_for_named_author(
        self, cluster, working_sort_name, working_display_name,
        contributor_data=None):
    """Look through the XML cluster for fields that match the author's name.

    The checks run in a fixed order: the cluster's sort names, the display
    name derived from its Wikipedia name, its UNIMARC datafields, a sort
    name guessed from the display name, and finally its alternate name
    forms.  The search stops at the first check whose confidence exceeds
    the sure-match threshold; later checks are then not run.

    :param cluster: the XML cluster to inspect.
    :param working_sort_name: sort name to look for; may be falsy.
    :param working_display_name: display name to look for; may be falsy.
    :param contributor_data: object that receives the matched name fields
        (``sort_name``, ``display_name``, ``wikipedia_name``,
        ``family_name``).  When falsy, a new ``ContributorData`` is created
        locally, so the caller cannot see what was written to it.
    :return: a dictionary mapping each check that ran ("sort_name",
        "display_name", "unimarc", "guessed_sort_name", "alternate_name")
        to the most recent confidence computed by that check.
    """
    # The fuzzy match filter may not always give a 100% match, so anything
    # above this arbitrary cut-off is treated as a "sure match".
    sure_match = 90

    match_confidences = {}
    if not contributor_data:
        contributor_data = ContributorData()

    # If we have a sort name to look for, and it's in this cluster's
    # sort names, great.
    if working_sort_name:
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["sort_name"] = match_confidence
            if match_confidence > sure_match:
                contributor_data.sort_name = potential_match
                return match_confidences

    # If we have a display name to look for, and this cluster's
    # Wikipedia name converts to the display name, great.
    if working_display_name:
        wikipedia_name = self.extract_wikipedia_name(cluster)
        if wikipedia_name:
            # The Wikipedia name is recorded even if the match below fails.
            contributor_data.wikipedia_name = wikipedia_name
            display_name = self.wikipedia_name_to_display_name(
                wikipedia_name)
            match_confidence = contributor_name_match_ratio(
                display_name, working_display_name)
            match_confidences["display_name"] = match_confidence
            if match_confidence > sure_match:
                contributor_data.display_name = display_name
                return match_confidences

    # If there are UNIMARC records, and every part of the UNIMARC
    # record matches the sort name or the display name, great.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(
             unimarc)
        if working_sort_name:
            match_confidence = contributor_name_match_ratio(
                possible_sort_name, working_sort_name)
            match_confidences["unimarc"] = match_confidence
            if match_confidence > sure_match:
                # NOTE(review): the matched *sort* name is stored in
                # family_name rather than sort_name -- confirm intended.
                contributor_data.family_name = possible_sort_name
                return match_confidences

        # Fall back to plain substring checks: given and family name must
        # both appear in the name, and so must the extra part if present.
        for name in (working_sort_name, working_display_name):
            if not name:
                continue
            if (possible_given and possible_given in name
                    and possible_family and possible_family in name
                    and (not possible_extra or possible_extra in name)):
                match_confidences["unimarc"] = sure_match
                contributor_data.family_name = possible_family
                return match_confidences

    # Last-ditch effort. Guess at the sort name and see if *that's* one
    # of the cluster sort names.
    if working_display_name and not working_sort_name:
        test_sort_name = display_name_to_sort_name(working_display_name)
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, test_sort_name)
            match_confidences["guessed_sort_name"] = match_confidence
            if match_confidence > sure_match:
                contributor_data.sort_name = potential_match
                return match_confidences

    # OK, last last-ditch effort. See if the alternate name forms
    # (pseudonyms) are it.
    if working_sort_name:
        for potential_match in self.alternate_name_forms_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["alternate_name"] = match_confidence
            if match_confidence > sure_match:
                # NOTE(review): an alternate name form is stored in
                # family_name -- confirm this is the intended field.
                contributor_data.family_name = potential_match
                return match_confidences

    return match_confidences
def cluster_has_record_for_named_author(self,
                                        cluster,
                                        working_sort_name,
                                        working_display_name,
                                        contributor_data=None):
    """Look through the XML cluster for fields that match the author's name.

    The checks run in a fixed order: the cluster's sort names, the display
    name derived from its Wikipedia name, its UNIMARC datafields, a sort
    name guessed from the display name, and finally its alternate name
    forms.  The search stops at the first check whose confidence exceeds
    the sure-match threshold; later checks are then not run.

    :param cluster: the XML cluster to inspect.
    :param working_sort_name: sort name to look for; may be falsy.
    :param working_display_name: display name to look for; may be falsy.
    :param contributor_data: object that receives the matched name fields
        (``sort_name``, ``display_name``, ``wikipedia_name``,
        ``family_name``).  When falsy, a new ``ContributorData`` is created
        locally, so the caller cannot see what was written to it.
    :return: a dictionary mapping each check that ran ("sort_name",
        "display_name", "unimarc", "guessed_sort_name", "alternate_name")
        to the most recent confidence computed by that check.
    """
    # The fuzzy match filter may not always give a 100% match, so anything
    # above this arbitrary cut-off is treated as a "sure match".
    sure_match = 90

    match_confidences = {}
    if not contributor_data:
        contributor_data = ContributorData()

    # If we have a sort name to look for, and it's in this cluster's
    # sort names, great.
    if working_sort_name:
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["sort_name"] = match_confidence
            if match_confidence > sure_match:
                contributor_data.sort_name = potential_match
                return match_confidences

    # If we have a display name to look for, and this cluster's
    # Wikipedia name converts to the display name, great.
    if working_display_name:
        wikipedia_name = self.extract_wikipedia_name(cluster)
        if wikipedia_name:
            # The Wikipedia name is recorded even if the match below fails.
            contributor_data.wikipedia_name = wikipedia_name
            display_name = self.wikipedia_name_to_display_name(
                wikipedia_name)
            match_confidence = contributor_name_match_ratio(
                display_name, working_display_name)
            match_confidences["display_name"] = match_confidence
            if match_confidence > sure_match:
                contributor_data.display_name = display_name
                return match_confidences

    # If there are UNIMARC records, and every part of the UNIMARC
    # record matches the sort name or the display name, great.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(
             unimarc)
        if working_sort_name:
            match_confidence = contributor_name_match_ratio(
                possible_sort_name, working_sort_name)
            match_confidences["unimarc"] = match_confidence
            if match_confidence > sure_match:
                # NOTE(review): the matched *sort* name is stored in
                # family_name rather than sort_name -- confirm intended.
                contributor_data.family_name = possible_sort_name
                return match_confidences

        # Fall back to plain substring checks: given and family name must
        # both appear in the name, and so must the extra part if present.
        for name in (working_sort_name, working_display_name):
            if not name:
                continue
            if (possible_given and possible_given in name
                    and possible_family and possible_family in name
                    and (not possible_extra or possible_extra in name)):
                match_confidences["unimarc"] = sure_match
                contributor_data.family_name = possible_family
                return match_confidences

    # Last-ditch effort. Guess at the sort name and see if *that's* one
    # of the cluster sort names.
    if working_display_name and not working_sort_name:
        test_sort_name = display_name_to_sort_name(working_display_name)
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, test_sort_name)
            match_confidences["guessed_sort_name"] = match_confidence
            if match_confidence > sure_match:
                contributor_data.sort_name = potential_match
                return match_confidences

    # OK, last last-ditch effort. See if the alternate name forms
    # (pseudonyms) are it.
    if working_sort_name:
        for potential_match in self.alternate_name_forms_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["alternate_name"] = match_confidence
            if match_confidence > sure_match:
                # NOTE(review): an alternate name form is stored in
                # family_name -- confirm this is the intended field.
                contributor_data.family_name = potential_match
                return match_confidences

    return match_confidences