def setup(self):
    """Run the parent fixture setup, then create the parser under test."""
    # The base-class setup runs first; the parser is built afterwards so
    # every test starts from a fresh VIAFParser instance.
    super(TestVIAFNameParser, self).setup()
    self.parser = VIAFParser()
def setup(self):
    """Run the parent fixture setup, then create the parser under test."""
    # The base-class setup runs first; the parser is built afterwards so
    # every test starts from a fresh VIAFParser instance.
    super(TestNameParser, self).setup()
    self.parser = VIAFParser()
class TestVIAFNameParser(DatabaseTest):
    """Test the name parsing code built into VIAFParser (as opposed to
    the simpler standalone code in the NameParser class).

    Each test loads a canned VIAF XML response from the "viaf" sample
    directory, runs it through a fresh VIAFParser and checks the
    contributor data that comes back.
    """

    def setup(self):
        """Run the parent fixture setup, then create the parser under test."""
        super(TestVIAFNameParser, self).setup()
        self.parser = VIAFParser()

    def sample_data(self, filename):
        """Return the contents of `filename` from the "viaf" sample directory."""
        return sample_data(filename, "viaf")

    def _check_library_popularity(self, candidates, viaf_id, expected):
        """Assert the 'library_popularity' confidence of one candidate.

        :param candidates: iterable of (contributor_data,
            match_confidences, contributor_titles) tuples.
        :param viaf_id: VIAF id of the candidate to inspect.
        :param expected: expected value of
            match_confidences['library_popularity'].

        Unlike a bare "loop and compare if the id matches", this fails
        when no candidate carries `viaf_id`, so the check cannot pass
        vacuously because the expected candidate went missing.
        """
        found = [
            confidences['library_popularity']
            for (data, confidences, titles) in candidates
            if data.viaf == viaf_id
        ]
        assert found, "No candidate with VIAF id %s" % viaf_id
        for popularity in found:
            eq_(popularity, expected)

    def test_entry_with_wikipedia_name(self):
        xml = self.sample_data("will_eisner.xml")
        contributor_data, _, _ = self.parser.parse(xml, None)
        eq_("10455", contributor_data.viaf)
        eq_("Will Eisner", contributor_data.display_name)
        eq_("Eisner", contributor_data.family_name)
        eq_("Will_Eisner", contributor_data.wikipedia_name)

    def test_entry_with_wikipedia_name_that_is_actually_wikidata_id(self):
        xml = self.sample_data("michelle_belanger.xml")
        contributor_data, _, _ = self.parser.parse(xml, None)
        eq_('38770861', contributor_data.viaf)
        eq_("Michelle A. Belanger", contributor_data.display_name)
        eq_("Belanger", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_entry_without_wikipedia_name(self):
        xml = self.sample_data("palmer.xml")
        contributor_data, _, _ = self.parser.parse(xml)
        eq_("2506349", contributor_data.viaf)
        eq_("Roy Ernest Palmer", contributor_data.display_name)
        eq_("Palmer", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_simple_corporate_entry(self):
        xml = self.sample_data("aquarius.xml")
        contributor_data, _, _ = self.parser.parse(xml)
        eq_("159591140", contributor_data.viaf)
        eq_("Aquarius Paris", contributor_data.display_name)
        eq_("Aquarius", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_many_names(self):
        # Even if we pass in "Sam Clemens" as the working name, the
        # family name we get back is "Twain", because we give very
        # high consideration to the Wikipedia name.
        xml = self.sample_data("mark_twain.xml")
        contributor_data, _, _ = self.parser.parse(
            xml, working_display_name="Sam Clemens"
        )
        eq_("50566653", contributor_data.viaf)
        eq_("Mark Twain", contributor_data.display_name)
        eq_("Mark_Twain", contributor_data.wikipedia_name)
        eq_("Twain", contributor_data.family_name)

        # Let's try again without the Wikipedia name.
        xml = self.sample_data("mark_twain_no_wikipedia.xml")

        # The author is better known as Mark Twain, so this
        # name wins by popularity if we don't specify a name going in.
        contributor_data, _, _ = self.parser.parse(xml, None)
        eq_("50566653", contributor_data.viaf)
        eq_("Mark Twain", contributor_data.display_name)
        eq_("Twain", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

        # NOTE: Old behavior: Even if we go in expecting something like
        # "Sam Clemens", we get the consensus result.
        # New behavior: If the wikipedia name is not there to correct our
        # working_display_name for us, then either
        # "Samuel Langhorne Clemens" or "Mark Twain" is acceptable to us
        # here now.
        contributor_data, _, _ = self.parser.parse(
            xml, working_display_name="Samuel Langhorne Clemens"
        )
        eq_("50566653", contributor_data.viaf)
        eq_("Samuel Langhorne Clemens", contributor_data.display_name)
        eq_("Twain, Mark", contributor_data.sort_name)
        eq_("Clemens", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_ignore_results_if_author_not_in_viaf(self):
        # This is the VIAF result for searching for "Howard,
        # J. J.". There are lots of results but none of them is the
        # correct one. This test verifies that we ignore all the
        # incorrect results.
        xml = self.sample_data("howard_j_j.xml")
        name = "Howard, J. J."
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        for contributor in contributor_candidates:
            match_quality = self.parser.weigh_contributor(
                candidate=contributor, working_sort_name=name, strict=True
            )
            eq_(match_quality, 0)

    def test_multiple_results_with_success(self):
        xml = self.sample_data("lancelyn_green.xml")
        name = "Green, Roger Lancelyn"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        contributor_data, _, _ = contributor_candidates[0]
        eq_("29620265", contributor_data.viaf)
        eq_("Roger Lancelyn Green", contributor_data.display_name)
        eq_("Green", contributor_data.family_name)
        eq_("Roger_Lancelyn_Green", contributor_data.wikipedia_name)

    def test_multiple_results_with_viaf_number_but_no_name(self):
        # This author's VIAF entry doesn't have any name information
        # we don't already have, but it does have a VIAF entry, and we
        # pick that up.
        #
        # No working name is handed to parse(), so the sort name
        # asserted below has to come out of the XML itself.
        xml = self.sample_data("kate_lister.xml")
        contributor_data, _, _ = self.parser.parse(xml)
        eq_("68169992", contributor_data.viaf)
        eq_(None, contributor_data.display_name)
        eq_(None, contributor_data.family_name)
        eq_("Lister, Kate", contributor_data.sort_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_library_popularity(self):
        # A good match higher in the list (in terms of library popularity)
        # penalizes a match lower in the list, but a bad match high in the
        # list doesn't penalize matches below it.

        # Our correct match Amy Levin is 4th down the list, so without the
        # title, she should be penalized.
        xml = self.sample_data("amy_levin_all_viaf.xml")
        name = "Levin, Amy"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_(contributor_data.viaf, '215866998')
        eq_(match_confidences['library_popularity'], 1)
        self._check_library_popularity(contributor_candidates, '315591707', 4)

        # With the title, the right Amy is selected, despite the library
        # unpopularity.
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
            known_titles=["Faithfully Feminist"],
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_(contributor_data.viaf, '315591707')
        eq_(match_confidences['library_popularity'], 4)
        self._check_library_popularity(contributor_candidates, '215866998', 1)

        # Has 10 clusters, first three bearing no resemblance to any John.
        xml = self.sample_data("john_jewel_all_viaf.xml")
        name = "Jewel, John"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
            known_titles=["The Apology of the Church of England"],
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_("176145857856923020062", contributor_data.viaf)
        eq_(match_confidences['library_popularity'], 4)

        # Without using the title, we still get the right John Jewel, even
        # though he's 4th down the list, he hasn't suffered.
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_("176145857856923020062", contributor_data.viaf)
        eq_(match_confidences['library_popularity'], 4)

    def test_birthdates(self):
        # TODO: waiting on https://github.com/NYPL-Simplified/Simplified/issues/61
        # Good for testing separating authors by birth dates -- VIAF has
        # several Amy Levins, with different birthdates.
        xml = self.sample_data("amy_levin_all_viaf.xml")
        name = "Levin, Amy"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        contributor_data, _, _ = contributor_candidates[0]
        eq_(contributor_data.viaf, '215866998')
        # make sure birthdate is 1957

        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
            known_titles=["Faithfully Feminist"],
        )
        contributor_data, _, _ = contributor_candidates[0]
        eq_(contributor_data.viaf, '315591707')
class TestNameParser(DatabaseTest):
    """Exercise VIAFParser's name parsing against canned VIAF XML.

    Each test loads a sample response from the "viaf" sample directory,
    runs it through a fresh VIAFParser and checks the contributor data
    that comes back.

    NOTE(review): these cases repeat the ones in TestVIAFNameParser and
    also drive VIAFParser -- confirm whether this class is still needed
    or should target a different parser.
    """

    def setup(self):
        """Run the parent fixture setup, then create the parser under test."""
        super(TestNameParser, self).setup()
        self.parser = VIAFParser()

    def sample_data(self, filename):
        """Return the contents of `filename` from the "viaf" sample directory."""
        return sample_data(filename, "viaf")

    def _check_library_popularity(self, candidates, viaf_id, expected):
        """Assert the 'library_popularity' confidence of one candidate.

        :param candidates: iterable of (contributor_data,
            match_confidences, contributor_titles) tuples.
        :param viaf_id: VIAF id of the candidate to inspect.
        :param expected: expected value of
            match_confidences['library_popularity'].

        Unlike a bare "loop and compare if the id matches", this fails
        when no candidate carries `viaf_id`, so the check cannot pass
        vacuously because the expected candidate went missing.
        """
        found = [
            confidences['library_popularity']
            for (data, confidences, titles) in candidates
            if data.viaf == viaf_id
        ]
        assert found, "No candidate with VIAF id %s" % viaf_id
        for popularity in found:
            eq_(popularity, expected)

    def test_entry_with_wikipedia_name(self):
        xml = self.sample_data("will_eisner.xml")
        contributor_data, _, _ = self.parser.parse(xml, None)
        eq_("10455", contributor_data.viaf)
        eq_("Will Eisner", contributor_data.display_name)
        eq_("Eisner", contributor_data.family_name)
        eq_("Will_Eisner", contributor_data.wikipedia_name)

    def test_entry_with_wikipedia_name_that_is_actually_wikidata_id(self):
        xml = self.sample_data("michelle_belanger.xml")
        contributor_data, _, _ = self.parser.parse(xml, None)
        eq_('38770861', contributor_data.viaf)
        eq_("Michelle A. Belanger", contributor_data.display_name)
        eq_("Belanger", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_entry_without_wikipedia_name(self):
        xml = self.sample_data("palmer.xml")
        contributor_data, _, _ = self.parser.parse(xml)
        eq_("2506349", contributor_data.viaf)
        eq_("Roy Ernest Palmer", contributor_data.display_name)
        eq_("Palmer", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_simple_corporate_entry(self):
        xml = self.sample_data("aquarius.xml")
        contributor_data, _, _ = self.parser.parse(xml)
        eq_("159591140", contributor_data.viaf)
        eq_("Aquarius Paris", contributor_data.display_name)
        eq_("Aquarius", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_many_names(self):
        # Even if we pass in "Sam Clemens" as the working name, the
        # family name we get back is "Twain", because we give very
        # high consideration to the Wikipedia name.
        xml = self.sample_data("mark_twain.xml")
        contributor_data, _, _ = self.parser.parse(
            xml, working_display_name="Sam Clemens"
        )
        eq_("50566653", contributor_data.viaf)
        eq_("Mark Twain", contributor_data.display_name)
        eq_("Mark_Twain", contributor_data.wikipedia_name)
        eq_("Twain", contributor_data.family_name)

        # Let's try again without the Wikipedia name.
        xml = self.sample_data("mark_twain_no_wikipedia.xml")

        # The author is better known as Mark Twain, so this
        # name wins by popularity if we don't specify a name going in.
        contributor_data, _, _ = self.parser.parse(xml, None)
        eq_("50566653", contributor_data.viaf)
        eq_("Mark Twain", contributor_data.display_name)
        eq_("Twain", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

        # NOTE: Old behavior: Even if we go in expecting something like
        # "Sam Clemens", we get the consensus result.
        # New behavior: If the wikipedia name is not there to correct our
        # working_display_name for us, then either
        # "Samuel Langhorne Clemens" or "Mark Twain" is acceptable to us
        # here now.
        contributor_data, _, _ = self.parser.parse(
            xml, working_display_name="Samuel Langhorne Clemens"
        )
        eq_("50566653", contributor_data.viaf)
        eq_("Samuel Langhorne Clemens", contributor_data.display_name)
        eq_("Twain, Mark", contributor_data.sort_name)
        eq_("Clemens", contributor_data.family_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_ignore_results_if_author_not_in_viaf(self):
        # This is the VIAF result for searching for "Howard,
        # J. J.". There are lots of results but none of them is the
        # correct one. This test verifies that we ignore all the
        # incorrect results.
        xml = self.sample_data("howard_j_j.xml")
        name = "Howard, J. J."
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        for contributor in contributor_candidates:
            match_quality = self.parser.weigh_contributor(
                candidate=contributor, working_sort_name=name, strict=True
            )
            eq_(match_quality, 0)

    def test_multiple_results_with_success(self):
        xml = self.sample_data("lancelyn_green.xml")
        name = "Green, Roger Lancelyn"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        contributor_data, _, _ = contributor_candidates[0]
        eq_("29620265", contributor_data.viaf)
        eq_("Roger Lancelyn Green", contributor_data.display_name)
        eq_("Green", contributor_data.family_name)
        eq_("Roger_Lancelyn_Green", contributor_data.wikipedia_name)

    def test_multiple_results_with_viaf_number_but_no_name(self):
        # This author's VIAF entry doesn't have any name information
        # we don't already have, but it does have a VIAF entry, and we
        # pick that up.
        #
        # No working name is handed to parse(), so the sort name
        # asserted below has to come out of the XML itself.
        xml = self.sample_data("kate_lister.xml")
        contributor_data, _, _ = self.parser.parse(xml)
        eq_("68169992", contributor_data.viaf)
        eq_(None, contributor_data.display_name)
        eq_(None, contributor_data.family_name)
        eq_("Lister, Kate", contributor_data.sort_name)
        eq_(None, contributor_data.wikipedia_name)

    def test_library_popularity(self):
        # A good match higher in the list (in terms of library popularity)
        # penalizes a match lower in the list, but a bad match high in the
        # list doesn't penalize matches below it.

        # Our correct match Amy Levin is 4th down the list, so without the
        # title, she should be penalized.
        xml = self.sample_data("amy_levin_all_viaf.xml")
        name = "Levin, Amy"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_(contributor_data.viaf, '215866998')
        eq_(match_confidences['library_popularity'], 1)
        self._check_library_popularity(contributor_candidates, '315591707', 4)

        # With the title, the right Amy is selected, despite the library
        # unpopularity.
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
            known_titles=["Faithfully Feminist"],
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_(contributor_data.viaf, '315591707')
        eq_(match_confidences['library_popularity'], 4)
        self._check_library_popularity(contributor_candidates, '215866998', 1)

        # Has 10 clusters, first three bearing no resemblance to any John.
        xml = self.sample_data("john_jewel_all_viaf.xml")
        name = "Jewel, John"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
            known_titles=["The Apology of the Church of England"],
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_("176145857856923020062", contributor_data.viaf)
        eq_(match_confidences['library_popularity'], 4)

        # Without using the title, we still get the right John Jewel, even
        # though he's 4th down the list, he hasn't suffered.
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        (contributor_data, match_confidences, _) = contributor_candidates[0]
        eq_("176145857856923020062", contributor_data.viaf)
        eq_(match_confidences['library_popularity'], 4)

    def test_birthdates(self):
        # TODO: waiting on https://github.com/NYPL-Simplified/Simplified/issues/61
        # Good for testing separating authors by birth dates -- VIAF has
        # several Amy Levins, with different birthdates.
        xml = self.sample_data("amy_levin_all_viaf.xml")
        name = "Levin, Amy"
        contributor_candidates = self.parser.parse_multiple(
            xml, working_sort_name=name
        )
        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
        )
        contributor_data, _, _ = contributor_candidates[0]
        eq_(contributor_data.viaf, '215866998')
        # make sure birthdate is 1957

        contributor_candidates = self.parser.order_candidates(
            working_sort_name=name,
            contributor_candidates=contributor_candidates,
            known_titles=["Faithfully Feminist"],
        )
        contributor_data, _, _ = contributor_candidates[0]
        eq_(contributor_data.viaf, '315591707')