def test_extract_multiple_works_with_author_restriction(self):
    """Passing ``authors`` to the parser keeps only works by those authors."""
    xml = self.sample_data("multi_work_response.xml")

    # Nobody in the dataset is credited under this name, so restricting
    # the parse to this contributor must yield no works at all.
    wrong_matches, _ = Contributor.lookup(self._db, sort_name="Wrong Author")
    [wrong_author] = wrong_matches
    status, swids = OCLCXMLParser.parse(
        self._db, xml, languages=["eng"], authors=[wrong_author]
    )
    eq_(0, len(swids))

    # Restricting to Herman Melville keeps 11 of the 25 works.
    melville_matches, _ = Contributor.lookup(
        self._db, sort_name="Melville, Herman"
    )
    [melville] = melville_matches
    status, swids = OCLCXMLParser.parse(
        self._db, xml, languages=["eng"], authors=[melville]
    )
    eq_(11, len(swids))

    # The works that were dropped (check oclc_multi_work_response.xml to
    # verify) fall into three groups:
    #  * Melville is not credited at all (the 1956 Gregory Peck movie
    #    "Moby Dick");
    #  * Melville is credited as "Associated name" rather than as an
    #    author (four books about "Moby Dick");
    #  * Melville is credited as an author, but not as the primary
    #    author (academic works and adaptations).
    excluded_swids = (
        "10798812",
        "13424036",
        "22658644",
        "250604212",
        "474972877",
        "13358012",
        "153927888",
        "13206523",
        "46935692",
        "14135019",
        "51088077",
        "105446800",
        "164732682",
        "26863225",
    )
    for swid in excluded_swids:
        assert swid not in swids
def test_to_edition_sets_sort_author_name_if_obvious(self):
    """The edition's author fields match the looked-up contributor's names."""
    # Look the contributor up by sort name and give them a display name
    # before the list title is converted.
    matches, _ = Contributor.lookup(self._db, u"Hawkins, Paula")
    [author] = matches
    author.display_name = u"Paula Hawkins"

    list_title = NYTBestSellerListTitle(self.one_list_title)
    edition = list_title.to_edition(self._db, self.metadata_client)

    # Both forms of the author's name end up on the edition.
    eq_(author.sort_name, edition.sort_author)
    eq_(author.display_name, edition.author)

    # The edition also received a permanent work ID.
    assert edition.permanent_work_id is not None
def test_to_edition_sets_sort_author_name_if_obvious(self):
    """The edition's author fields match the looked-up contributor's names."""
    # Look the contributor up by sort name and give them a display name
    # before the list title is converted.
    matches, _ = Contributor.lookup(self._db, u"Hawkins, Paula")
    [author] = matches
    author.display_name = u"Paula Hawkins"

    # The list title is built with the book medium.
    list_title = NYTBestSellerListTitle(
        self.one_list_title, Edition.BOOK_MEDIUM
    )
    edition = list_title.to_edition(self._db, self.metadata_client)

    # Both forms of the author's name end up on the edition.
    eq_(author.sort_name, edition.sort_author)
    eq_(author.display_name, edition.author)

    # The edition also received a permanent work ID.
    assert edition.permanent_work_id is not None
def test_extract_multiple_works_with_author_restriction(self):
    """Passing ``authors`` to the parser keeps only works by those authors."""
    xml = self.sample_data("multi_work_response.xml")

    def parse_restricted_to(sort_name):
        # Look up the single contributor with this sort name, then parse
        # the sample data accepting only works by that contributor.
        [author], _ = Contributor.lookup(self._db, sort_name=sort_name)
        return OCLCXMLParser.parse(
            self._db, xml, languages=["eng"], authors=[author]
        )

    # Nobody in the dataset is credited under this name, so no works
    # are picked up.
    status, swids = parse_restricted_to("Wrong Author")
    eq_(0, len(swids))

    # Restricting to Herman Melville keeps 11 of the 25 works.
    status, swids = parse_restricted_to("Melville, Herman")
    eq_(11, len(swids))

    # The works that were dropped (check oclc_multi_work_response.xml to
    # verify) fall into three groups:
    #  * Melville is not credited at all (the 1956 Gregory Peck movie
    #    "Moby Dick");
    #  * Melville is credited as "Associated name" rather than as an
    #    author (four books about "Moby Dick");
    #  * Melville is credited as an author, but not as the primary
    #    author (academic works and adaptations).
    excluded_swids = [
        "10798812",
        "13424036",
        "22658644",
        "250604212",
        "474972877",
        "13358012",
        "153927888",
        "13206523",
        "46935692",
        "14135019",
        "51088077",
        "105446800",
        "164732682",
        "26863225",
    ]
    for swid in excluded_swids:
        assert swid not in swids
def _parse_single_author(cls, _db, author, lc=None, viaf=None,
                         existing_authors=None,
                         default_role=Contributor.AUTHOR_ROLE,
                         primary_author=None):
    """Parse one author string into a contributor and a list of roles.

    The string may end with a bracketed, semicolon-separated role list
    and may contain a lifespan, e.g.
    "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]".
    Both are stripped off to leave the bare name.

    :param _db: Database handle handed to `Contributor.lookup`.
    :param author: The raw author string.
    :param lc: Optional LC identifier used when matching/looking up.
    :param viaf: Optional VIAF identifier used when matching/looking up.
    :param existing_authors: Optional iterable of already-known
        contributors that are checked (via `cls._contributor_match`)
        before falling back to a database lookup. Defaults to none.
    :param default_role: Role assigned when the string carries no
        bracketed role list. If falsy, the role list is left empty.
    :param primary_author: Optional contributor; if the parsed name
        equals its `sort_name`, the primary-author role is put first
        and the plain author / unknown roles are dropped.
    :return: A 3-tuple `(contributor, roles, default_role_used)`.
        `contributor` is None when no name remains after stripping.
    """
    default_role_used = False

    # First find roles if present:
    # "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
    author = author.strip()
    m = cls.ROLES.search(author)
    if m:
        author = author[:m.start()].strip()
        role_string = m.groups()[0]
        roles = [x.strip() for x in role_string.split(";")]
    elif default_role:
        roles = [default_role]
        default_role_used = True
    else:
        roles = []

    # Author string now looks like "Giles, Lionel, 1875-1958"
    m = cls.LIFESPAN.search(author)
    kwargs = dict()
    if m:
        author = author[:m.start()].strip()
        birth, death = m.groups()
        if birth:
            kwargs[Contributor.BIRTH_DATE] = birth
        if death:
            kwargs[Contributor.DEATH_DATE] = death

    # Author string now looks like "Giles, Lionel,"
    if author.endswith(","):
        author = author[:-1]

    if not author:
        # No name was given for the author.
        return None, roles, default_role_used

    if primary_author and author == primary_author.sort_name:
        # This is the primary author: replace the generic roles with
        # the primary-author role, listed first.
        if Contributor.AUTHOR_ROLE in roles:
            roles.remove(Contributor.AUTHOR_ROLE)
        if Contributor.UNKNOWN_ROLE in roles:
            roles.remove(Contributor.UNKNOWN_ROLE)
        roles.insert(0, Contributor.PRIMARY_AUTHOR_ROLE)

    contributor = None
    if existing_authors:
        # Calling Contributor.lookup will result in a database
        # hit, and looking up a contributor based on name may
        # result in multiple results (see below). We'll have no
        # way of distinguishing between those results. If
        # possible, it's much more reliable to look through
        # existing_authors (the authors derived from an entry's
        # <authors> tag).
        for candidate in existing_authors:
            if cls._contributor_match(candidate, author, lc, viaf):
                contributor = candidate
                break

    if not contributor:
        # The second return value of lookup() is not needed here.
        contributor, _ = Contributor.lookup(
            _db, author, viaf, lc, extra=kwargs)

    if isinstance(contributor, list):
        # We asked for an author based solely on the name, which makes
        # Contributor.lookup() return a list.
        if len(contributor) == 1:
            # Fortunately, either the database knows about only
            # one author with that name, or it didn't know about
            # any authors with that name and it just created one,
            # so we can unambiguously use it.
            contributor = contributor[0]
        else:
            # Uh-oh. The database knows about multiple authors
            # with that name. We have no basis for deciding which
            # author we mean. But we would prefer to identify with
            # an author who has a known LC or VIAF number.
            #
            # This should happen very rarely because of our check
            # against existing_authors above. But it will happen
            # for authors that have a work in Project Gutenberg.
            with_id = [x for x in contributor
                       if x.lc is not None or x.viaf is not None]
            if with_id:
                contributor = with_id[0]
            else:
                contributor = contributor[0]

    return contributor, roles, default_role_used
def _parse_single_author(
    cls,
    _db,
    author,
    lc=None,
    viaf=None,
    existing_authors=None,
    default_role=Contributor.AUTHOR_ROLE,
    primary_author=None,
):
    """Parse one author string into a contributor and a list of roles.

    The string may end with a bracketed, semicolon-separated role list
    and may contain a lifespan, e.g.
    "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]".
    Both are stripped off to leave the bare name.

    :param _db: Database handle handed to `Contributor.lookup`.
    :param author: The raw author string.
    :param lc: Optional LC identifier used when matching/looking up.
    :param viaf: Optional VIAF identifier used when matching/looking up.
    :param existing_authors: Optional iterable of already-known
        contributors that are checked (via `cls._contributor_match`)
        before falling back to a database lookup. Defaults to none.
    :param default_role: Role assigned when the string carries no
        bracketed role list. If falsy, the role list is left empty.
    :param primary_author: Optional contributor; if the parsed name
        equals its `sort_name`, the primary-author role is put first
        and the plain author / unknown roles are dropped.
    :return: A 3-tuple `(contributor, roles, default_role_used)`.
        `contributor` is None when no name remains after stripping.
    """
    default_role_used = False

    # First find roles if present:
    # "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
    author = author.strip()
    m = cls.ROLES.search(author)
    if m:
        author = author[: m.start()].strip()
        role_string = m.groups()[0]
        roles = [x.strip() for x in role_string.split(";")]
    elif default_role:
        roles = [default_role]
        default_role_used = True
    else:
        roles = []

    # Author string now looks like "Giles, Lionel, 1875-1958"
    m = cls.LIFESPAN.search(author)
    kwargs = dict()
    if m:
        author = author[: m.start()].strip()
        birth, death = m.groups()
        if birth:
            kwargs[Contributor.BIRTH_DATE] = birth
        if death:
            kwargs[Contributor.DEATH_DATE] = death

    # Author string now looks like "Giles, Lionel,"
    if author.endswith(","):
        author = author[:-1]

    if not author:
        # No name was given for the author.
        return None, roles, default_role_used

    if primary_author and author == primary_author.sort_name:
        # This is the primary author: replace the generic roles with
        # the primary-author role, listed first.
        if Contributor.AUTHOR_ROLE in roles:
            roles.remove(Contributor.AUTHOR_ROLE)
        if Contributor.UNKNOWN_ROLE in roles:
            roles.remove(Contributor.UNKNOWN_ROLE)
        roles.insert(0, Contributor.PRIMARY_AUTHOR_ROLE)

    contributor = None
    if existing_authors:
        # Calling Contributor.lookup will result in a database
        # hit, and looking up a contributor based on name may
        # result in multiple results (see below). We'll have no
        # way of distinguishing between those results. If
        # possible, it's much more reliable to look through
        # existing_authors (the authors derived from an entry's
        # <authors> tag).
        for candidate in existing_authors:
            if cls._contributor_match(candidate, author, lc, viaf):
                contributor = candidate
                break

    if not contributor:
        # The second return value of lookup() is not needed here.
        contributor, _ = Contributor.lookup(_db, author, viaf, lc, extra=kwargs)

    if isinstance(contributor, list):
        # We asked for an author based solely on the name, which makes
        # Contributor.lookup() return a list.
        if len(contributor) == 1:
            # Fortunately, either the database knows about only
            # one author with that name, or it didn't know about
            # any authors with that name and it just created one,
            # so we can unambiguously use it.
            contributor = contributor[0]
        else:
            # Uh-oh. The database knows about multiple authors
            # with that name. We have no basis for deciding which
            # author we mean. But we would prefer to identify with
            # an author who has a known LC or VIAF number.
            #
            # This should happen very rarely because of our check
            # against existing_authors above. But it will happen
            # for authors that have a work in Project Gutenberg.
            with_id = [
                x for x in contributor if x.lc is not None or x.viaf is not None
            ]
            if with_id:
                contributor = with_id[0]
            else:
                contributor = contributor[0]

    return contributor, roles, default_role_used