def test_initialization(self):
    """Verify ContributorLane's constructor wiring and contributor_key."""
    # A contributor is mandatory.
    assert_raises_regexp(
        ValueError,
        "ContributorLane can't be created without contributor",
        ContributorLane,
        self._default_library,
        None,
    )

    parent_worklist = WorkList()
    parent_worklist.initialize(self._default_library)
    contributor_lane = ContributorLane(
        self._default_library,
        self.contributor,
        parent_worklist,
        languages=['a'],
        audiences=['b'],
    )
    eq_(self.contributor, contributor_lane.contributor)
    eq_(['a'], contributor_lane.languages)
    eq_(['b'], contributor_lane.audiences)
    eq_([contributor_lane], parent_worklist.children)

    # contributor_key is what shows up in links to other pages of
    # this Lane.
    eq_("Lois Lane", contributor_lane.contributor_key)

    # A contributor with no display name falls back to the sort name
    # for contributor_key.
    sort_name_only = ContributorData(sort_name="Lane, Lois")
    contributor_lane = ContributorLane(self._default_library, sort_name_only)
    eq_(sort_name_only, contributor_lane.contributor)
    eq_("Lane, Lois", contributor_lane.contributor_key)
def test_circulationdata_can_be_deepcopied(self):
    """A fully-populated CirculationData must survive copy.deepcopy.

    Guards against accidentally storing an uncopyable attribute
    (e.g. self.log) on the object.
    """
    tag_subject = SubjectData(Subject.TAG, "subject")
    empty_contributor = ContributorData()
    gutenberg_id = IdentifierData(Identifier.GUTENBERG_ID, "1")
    download_link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    epub_format = FormatData(
        Representation.EPUB_MEDIA_TYPE, DeliveryMechanism.NO_DRM
    )
    open_access = RightsStatus.GENERIC_OPEN_ACCESS

    original = CirculationData(
        DataSource.GUTENBERG,
        primary_identifier=gutenberg_id,
        links=[download_link],
        licenses_owned=5,
        licenses_available=5,
        licenses_reserved=None,
        patrons_in_hold_queue=None,
        formats=[epub_format],
        default_rights_uri=open_access,
    )

    duplicate = deepcopy(original)
    # No exception from deepcopy means we're fine.
    assert duplicate is not None
def test_initialization(self):
    """Verify ContributorLane's constructor wiring and contributor_key."""
    # A contributor is mandatory.
    with pytest.raises(ValueError) as raised:
        ContributorLane(self._default_library, None)
    assert "ContributorLane can't be created without contributor" in str(
        raised.value
    )

    worklist = WorkList()
    worklist.initialize(self._default_library)
    lane = ContributorLane(
        self._default_library,
        self.contributor,
        worklist,
        languages=["a"],
        audiences=["b"],
    )
    assert lane.contributor == self.contributor
    assert lane.languages == ["a"]
    assert lane.audiences == ["b"]
    assert worklist.children == [lane]

    # contributor_key is what shows up in links to other pages of
    # this Lane.
    assert lane.contributor_key == "Lois Lane"

    # A contributor with no display name falls back to the sort name
    # for contributor_key.
    sort_name_only = ContributorData(sort_name="Lane, Lois")
    lane = ContributorLane(self._default_library, sort_name_only)
    assert lane.contributor == sort_name_only
    assert lane.contributor_key == "Lane, Lois"
def extract_bibliographic(self, element):
    """Turn one Enki title dictionary into a Metadata object with
    attached CirculationData.

    :param element: a dict of title information from the Enki API.
    :return: a Metadata whose .circulation is a CirculationData.
    """
    identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]
    # Enki provides at most one author string; substitute the
    # standard unknown-author marker when it's empty.
    contributors = [
        ContributorData(
            sort_name=element["author"] or Edition.UNKNOWN_AUTHOR
        )
    ]
    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

    # The same URL is used for both the full-size image and the
    # thumbnail.
    image_href = element["large_image"]
    image_links = [
        LinkData(
            rel=Hyperlink.THUMBNAIL_IMAGE,
            href=image_href,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
        LinkData(
            rel=Hyperlink.IMAGE,
            href=image_href,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
    ]

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="eng",
        medium=Edition.BOOK_MEDIUM,
        publisher=element["publisher"],
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=image_links,
    )

    availability = element["availability"]
    owned = availability["totalCopies"]
    available = availability["availableCopies"]
    on_hold = availability["onHold"]
    # 'acs' means Adobe Content Server protection; anything else is
    # treated as DRM-free.
    drm_scheme = (
        EnkiAPI.adobe_drm
        if availability["accessType"] == 'acs'
        else EnkiAPI.no_drm
    )
    metadata.circulation = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=primary_identifier,
        formats=[
            FormatData(
                content_type=Representation.EPUB_MEDIA_TYPE,
                drm_scheme=drm_scheme,
            )
        ],
        licenses_owned=int(owned),
        licenses_available=int(available),
        patrons_in_hold_queue=int(on_hold),
    )
    return metadata
def parse(cls, string):
    """Parse a string into a ContributorData object.

    The string may carry lifespan information (see ``_get_lifespan``);
    birth and death dates, when present, are stored in the
    ContributorData's ``extra`` dict.
    """
    sort_name, birth, death = cls._get_lifespan(string.strip())
    # Only record the dates that were actually found.
    extra = {
        key: value
        for key, value in (
            (Contributor.BIRTH_DATE, birth),
            (Contributor.DEATH_DATE, death),
        )
        if value is not None
    }
    return ContributorData(
        sort_name=sort_name,
        extra=extra,
    )
def cluster_has_record_for_named_author(
        self, cluster, working_sort_name, working_display_name,
        contributor_data=None):
    """
    Looks through the xml cluster for all fields that could indicate
    the author's name.

    Don't short-circuit the xml parsing process -- if found an author name
    match, keep parsing and see what else can find.

    As a side effect, fills in fields on ``contributor_data``
    (sort_name, display_name, family_name, wikipedia_name) as matches
    are found.

    :return: a dictionary containing description of xml field that matched
        author name searched for, mapping field label to a match ratio.
    """
    match_confidences = {}

    if not contributor_data:
        contributor_data = ContributorData()

    # If we have a sort name to look for, and it's in this cluster's
    # sort names, great.
    if working_sort_name:
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["sort_name"] = match_confidence
            # fuzzy match filter may not always give a 100% match, so cap arbitrarily at 90% as a "sure match"
            if match_confidence > 90:
                contributor_data.sort_name = potential_match
                return match_confidences

    # If we have a display name to look for, and this cluster's
    # Wikipedia name converts to the display name, great.
    if working_display_name:
        wikipedia_name = self.extract_wikipedia_name(cluster)
        if wikipedia_name:
            contributor_data.wikipedia_name = wikipedia_name
            display_name = self.wikipedia_name_to_display_name(wikipedia_name)
            match_confidence = contributor_name_match_ratio(
                display_name, working_display_name)
            match_confidences["display_name"] = match_confidence
            if match_confidence > 90:
                contributor_data.display_name = display_name
                return match_confidences

    # If there are UNIMARC records, and every part of the UNIMARC
    # record matches the sort name or the display name, great.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
    candidates = []
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        if working_sort_name:
            match_confidence = contributor_name_match_ratio(
                possible_sort_name, working_sort_name)
            match_confidences["unimarc"] = match_confidence
            if match_confidence > 90:
                # NOTE(review): the UNIMARC *sort name* is stored as the
                # family_name here -- confirm this is intentional.
                contributor_data.family_name = possible_sort_name
                return match_confidences
        for name in (working_sort_name, working_display_name):
            if not name:
                continue
            # All available name parts must appear in the working name.
            if (possible_given and possible_given in name
                    and possible_family and possible_family in name
                    and (not possible_extra or possible_extra in name)):
                match_confidences["unimarc"] = 90
                contributor_data.family_name = possible_family
                return match_confidences

    # Last-ditch effort. Guess at the sort name and see if *that's* one
    # of the cluster sort names.
    if working_display_name and not working_sort_name:
        test_sort_name = display_name_to_sort_name(working_display_name)
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, test_sort_name)
            match_confidences["guessed_sort_name"] = match_confidence
            if match_confidence > 90:
                contributor_data.sort_name = potential_match
                return match_confidences

    # OK, last last-ditch effort. See if the alternate name forms (pseudonyms) are it.
    if working_sort_name:
        for potential_match in self.alternate_name_forms_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["alternate_name"] = match_confidence
            if match_confidence > 90:
                contributor_data.family_name = potential_match
                return match_confidences

    return match_confidences
# NOTE(review): fragment of NYTBestSellerListTitle.__init__ -- the
# enclosing `def` and the close of the final super().__init__ call are
# outside this view.
# Secondary ISBNs are linked at low confidence (0.50) because NYT's
# isbn lists are noisy (see the sibling comment in the other copy of
# this code).
for isbn in d.get('isbns', []):
    isbn13 = isbn.get('isbn13', None)
    if isbn13:
        other_isbns.append(
            IdentifierData(Identifier.ISBN, isbn13, 0.50)
        )

# Prefer ISBN-13 over ISBN-10 as the primary identifier; it gets a
# higher (0.90) but still non-certain confidence.
primary_isbn = primary_isbn13 or primary_isbn10
if primary_isbn:
    primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

contributors = []
if display_author:
    contributors.append(
        ContributorData(display_name=display_author)
    )

metadata = Metadata(
    data_source=DataSource.NYT,
    title=title,
    language='eng',
    published=published_date,
    publisher=publisher,
    contributors=contributors,
    primary_identifier=primary_isbn,
    identifiers=other_isbns,
)
super(NYTBestSellerListTitle, self).__init__(
    metadata,
    first_appearance,
    most_recent_appearance,
# other books in the same series, as well as ISBNs that # are just wrong. Assign these equivalencies at a low # level of confidence. for isbn in d.get('isbns', []): isbn13 = isbn.get('isbn13', None) if isbn13: other_isbns.append( IdentifierData(Identifier.ISBN, isbn13, 0.50)) primary_isbn = primary_isbn13 or primary_isbn10 if primary_isbn: primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90) contributors = [] if display_author: contributors.append(ContributorData(display_name=display_author)) metadata = Metadata( data_source=DataSource.NYT, title=title, medium=medium, language='eng', published=published_date, publisher=publisher, contributors=contributors, primary_identifier=primary_isbn, identifiers=other_isbns, ) super(NYTBestSellerListTitle, self).__init__(metadata, first_appearance,
def extract_bibliographic(self, element):
    """Extract Metadata and CirculationData from a dictionary of
    information from Enki.

    :param element: a dict of title information from the Enki API.
    :return: A Metadata with attached CirculationData.
    """
    # TODO: it's not clear what these are or whether we'd find them
    # useful:
    # dateSaved
    # length
    # publishDate

    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
    identifiers = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

    contributors = []
    # A missing or empty author field becomes the standard
    # unknown-author marker.
    sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors.append(ContributorData(sort_name=sort_name))

    links = []
    description = element.get("description")
    if description:
        links.append(
            LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=description,
                media_type="text/html",
            )
        )

    # NOTE: When this method is called by, e.g. updated_titles(),
    # the large and small images are available separately. When
    # this method is called by get_item(), we only get a single
    # image, in 'cover'. In get_item() we ask that that image be 'large',
    # which means we'll be filing it as a normal-sized image.
    #
    full_image = None
    thumbnail_image = None
    for key, rel in (
        ("cover", Hyperlink.IMAGE),
        ("small_image", Hyperlink.THUMBNAIL_IMAGE),
        ("large_image", Hyperlink.IMAGE),
    ):
        url = element.get(key)
        if not url:
            continue
        link = LinkData(rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE)
        if rel == Hyperlink.THUMBNAIL_IMAGE:
            # Don't add a thumbnail to the list of links -- wait
            # until the end and then make it a thumbnail of the
            # primary image.
            thumbnail_image = link
        else:
            if rel == Hyperlink.IMAGE:
                # 'large_image', when present, overwrites 'cover' as
                # the full image since it comes later in the tuple.
                full_image = link
            links.append(link)

    if thumbnail_image:
        if full_image:
            # Set the thumbnail as the thumbnail _of_ the full image.
            full_image.thumbnail = thumbnail_image
        else:
            # Treat the thumbnail as the full image.
            thumbnail_image.rel = Hyperlink.IMAGE
            links.append(thumbnail_image)

    # We treat 'subject', 'topic', and 'genre' as interchangeable
    # sets of tags. This data is based on BISAC but it's not reliably
    # presented in a form that can be parsed as BISAC.
    subjects = []
    seen_topics = set()
    for key in ("subject", "topic", "genre"):
        for topic in element.get(key, []):
            if not topic or topic in seen_topics:
                continue
            subjects.append(
                SubjectData(
                    Subject.TAG,
                    topic,
                    weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                )
            )
            seen_topics.add(topic)

    # Enki reports language as a full name (e.g. "English"); map it
    # to a code, defaulting to "eng".
    language_code = element.get("language", "English")
    language = self.LANGUAGE_CODES.get(language_code, "eng")

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )
    circulationdata = self.extract_circulation(
        primary_identifier,
        element.get("availability", {}),
        element.get("formattype", None),
    )
    metadata.circulation = circulationdata
    return metadata
def cluster_has_record_for_named_author(self, cluster, working_sort_name,
                                        working_display_name,
                                        contributor_data=None):
    """
    Looks through the xml cluster for all fields that could indicate
    the author's name.

    Don't short-circuit the xml parsing process -- if found an author name
    match, keep parsing and see what else can find.

    Side effect: fills in name fields on ``contributor_data`` as
    matches are found.

    :return: a dictionary containing description of xml field that matched
        author name searched for, mapping field label to a match ratio.
    """
    match_confidences = {}

    if not contributor_data:
        contributor_data = ContributorData()

    # If we have a sort name to look for, and it's in this cluster's
    # sort names, great.
    if working_sort_name:
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["sort_name"] = match_confidence
            # fuzzy match filter may not always give a 100% match, so cap arbitrarily at 90% as a "sure match"
            if match_confidence > 90:
                contributor_data.sort_name = potential_match
                return match_confidences

    # If we have a display name to look for, and this cluster's
    # Wikipedia name converts to the display name, great.
    if working_display_name:
        wikipedia_name = self.extract_wikipedia_name(cluster)
        if wikipedia_name:
            contributor_data.wikipedia_name = wikipedia_name
            display_name = self.wikipedia_name_to_display_name(
                wikipedia_name)
            match_confidence = contributor_name_match_ratio(
                display_name, working_display_name)
            match_confidences["display_name"] = match_confidence
            if match_confidence > 90:
                contributor_data.display_name = display_name
                return match_confidences

    # If there are UNIMARC records, and every part of the UNIMARC
    # record matches the sort name or the display name, great.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
    candidates = []
    for unimarc in unimarcs:
        (possible_given, possible_family, possible_extra,
         possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        if working_sort_name:
            match_confidence = contributor_name_match_ratio(
                possible_sort_name, working_sort_name)
            match_confidences["unimarc"] = match_confidence
            if match_confidence > 90:
                # NOTE(review): the UNIMARC sort name is stored as the
                # family_name here -- confirm this is intentional.
                contributor_data.family_name = possible_sort_name
                return match_confidences
        for name in (working_sort_name, working_display_name):
            if not name:
                continue
            # All available name parts must appear in the working name.
            if (possible_given and possible_given in name
                    and possible_family and possible_family in name
                    and (not possible_extra or possible_extra in name)):
                match_confidences["unimarc"] = 90
                contributor_data.family_name = possible_family
                return match_confidences

    # Last-ditch effort. Guess at the sort name and see if *that's* one
    # of the cluster sort names.
    if working_display_name and not working_sort_name:
        test_sort_name = display_name_to_sort_name(working_display_name)
        for potential_match in self.sort_names_for_cluster(cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, test_sort_name)
            match_confidences["guessed_sort_name"] = match_confidence
            if match_confidence > 90:
                contributor_data.sort_name = potential_match
                return match_confidences

    # OK, last last-ditch effort. See if the alternate name forms (pseudonyms) are it.
    if working_sort_name:
        for potential_match in self.alternate_name_forms_for_cluster(
                cluster):
            match_confidence = contributor_name_match_ratio(
                potential_match, working_sort_name)
            match_confidences["alternate_name"] = match_confidence
            if match_confidence > 90:
                contributor_data.family_name = potential_match
                return match_confidences

    return match_confidences
def parse(cls, file, data_source_name):
    """Parse an ONIX feed into a list of Metadata objects.

    :param file: a file-like object containing ONIX XML.
    :param data_source_name: name used as each Metadata's data source.
    :return: a list of Metadata objects, one per <product> record.
    """
    metadata_records = []

    # TODO: ONIX has plain language 'reference names' and short tags that
    # may be used interchangably. This code currently only handles short tags,
    # and it's not comprehensive.

    parser = XMLParser()
    tree = etree.parse(file)
    root = tree.getroot()

    for record in root.findall('product'):
        # b203 is the full title; fall back to assembling it from the
        # prefix (b030) and the title-without-prefix (b031).
        title = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b203')
        if not title:
            title_prefix = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b030')
            title_without_prefix = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b031')
            if title_prefix and title_without_prefix:
                title = title_prefix + " " + title_without_prefix

        subtitle = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b029')
        language = parser.text_of_optional_subtag(record, 'descriptivedetail/language/b252') or "eng"
        publisher = parser.text_of_optional_subtag(record, 'publishingdetail/publisher/b081')
        imprint = parser.text_of_optional_subtag(record, 'publishingdetail/imprint/b079')
        # An imprint identical to the publisher adds no information.
        if imprint == publisher:
            imprint = None

        publishing_date = parser.text_of_optional_subtag(record, 'publishingdetail/publishingdate/b306')
        issued = None
        if publishing_date:
            # ONIX dates are YYYYMMDD.
            issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

        identifier_tags = parser._xpath(record, 'productidentifier')
        identifiers = []
        primary_identifier = None
        for tag in identifier_tags:
            type = parser.text_of_subtag(tag, "b221")
            # '02' = ISBN-10, '15' = ISBN-13 (ONIX ProductIDType codes).
            # NOTE(review): if a record carries both, the later tag
            # silently overwrites primary_identifier -- confirm this is
            # acceptable.
            if type == '02' or type == '15':
                primary_identifier = IdentifierData(Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                identifiers.append(primary_identifier)

        subject_tags = parser._xpath(record, 'descriptivedetail/subject')
        subjects = []
        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
        for tag in subject_tags:
            type = parser.text_of_subtag(tag, 'b067')
            if type in cls.SUBJECT_TYPES:
                subjects.append(
                    SubjectData(
                        cls.SUBJECT_TYPES[type],
                        parser.text_of_subtag(tag, 'b069'),
                        weight=weight
                    )
                )

        # Audience codes are also recorded as subjects.
        audience_tags = parser._xpath(record, 'descriptivedetail/audience/b204')
        audiences = []
        for tag in audience_tags:
            if tag.text in cls.AUDIENCE_TYPES:
                subjects.append(
                    SubjectData(
                        Subject.FREEFORM_AUDIENCE,
                        cls.AUDIENCE_TYPES[tag.text],
                        weight=weight
                    )
                )

        contributor_tags = parser._xpath(record, 'descriptivedetail/contributor')
        contributors = []
        for tag in contributor_tags:
            type = parser.text_of_subtag(tag, 'b035')
            if type in cls.CONTRIBUTOR_TYPES:
                display_name = parser.text_of_subtag(tag, 'b036')
                sort_name = parser.text_of_optional_subtag(tag, 'b037')
                family_name = parser.text_of_optional_subtag(tag, 'b040')
                bio = parser.text_of_optional_subtag(tag, 'b044')
                contributors.append(ContributorData(sort_name=sort_name,
                                                    display_name=display_name,
                                                    family_name=family_name,
                                                    roles=[cls.CONTRIBUTOR_TYPES[type]],
                                                    biography=bio))

        collateral_tags = parser._xpath(record, 'collateraldetail/textcontent')
        links = []
        for tag in collateral_tags:
            type = parser.text_of_subtag(tag, 'x426')
            # TODO: '03' is the summary in the example I'm testing, but that
            # might not be generally true.
            if type == '03':
                text = parser.text_of_subtag(tag, 'd104')
                links.append(LinkData(rel=Hyperlink.DESCRIPTION,
                                      media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                                      content=text))

        metadata_records.append(Metadata(
            data_source=data_source_name,
            title=title,
            subtitle=subtitle,
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=publisher,
            imprint=imprint,
            issued=issued,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            links=links
        ))

    return metadata_records
def __init__(self, product):
    """Parse a catalog record dictionary into subjects, identifiers,
    contributors, and links by walking its MARC variable fields.

    :param product: a dict with a 'varFields' list of MARC field dicts.
    """
    self.subjects = []
    self.identifiers = []
    self.contributors = []
    self.links = []
    self.product = product
    # Maps MARC tag number -> list of MarcTag objects with that number.
    self.var = defaultdict(list)
    # MARC tags present in this record but not in self.known_vars.
    self.unrecognized_tags = dict()
    self.title = None
    for f in self.product.get('varFields', []):
        marctag = MarcTag(f)
        self.var[marctag.marcTag].append(marctag)

    # Find a title: prefer 245 (title statement) over 240 (uniform title).
    for num in ('245', '240'):
        for tag in self.tags(num):
            self.title = tag.a
            if self.title:
                break
        if self.title:
            break

    # Contributors (100: main entry, personal name).
    for tag in self.tags('100'):
        role = tag.e or 'author.'
        sort_name = tag.a
        self.contributors.append(
            ContributorData(sort_name=sort_name, roles=[role]))

    # Subjects
    for number in ('050', '908'):
        for tag in self.tags(number):
            # Library of Congress classification
            if tag.a:
                self.subjects.append(
                    SubjectData(type=Subject.LCC, identifier=tag.a))
            # TODO: tag.b ("Pap 2014eb") includes potentially useful
            # date information.

    # 856: electronic location. Skip image links; keep eNYPL access links.
    for tag in self.tags('856'):
        if tag.subfields.get('3', {}).get('content') == 'Image':
            continue
        if tag.u:
            if tag.y == 'Access eNYPL' or tag.z == 'Access eNYPL':
                self.links.append(LinkData(rel='alternate', href=tag.u))

    # 082: Dewey Decimal classification.
    for tag in self.tags('082'):
        if tag.a:
            self.subjects.append(
                SubjectData(type=Subject.DDC, identifier=tag.a))

    # 650-655: subject added entries. Subfield '2' names the source
    # vocabulary, which we map to a native subject type when known.
    for v in range(650, 656):
        for tag in self.tags(v):
            type = getattr(tag, '2', None)
            native_type = Subject.TAG
            if type:
                if type.endswith('.'):
                    type = type[:-1]
                # NOTE(review): this mutates a class-level counter on
                # Representation -- confirm that's intended.
                Representation.tag_type[type] += 1
                native_type = self.shadowcat_subject_type_to_native_type.get(
                    type, Subject.TAG)
            identifiers = [x for x in [tag.a, tag.v] if x]
            for identifier in identifiers:
                self.subjects.append(
                    SubjectData(type=native_type, identifier=identifier))

    # Identifiers
    for tag in self.tags('037'):
        if tag.a and (tag.b in self.marc_037_b_to_identifier_type):
            t = self.marc_037_b_to_identifier_type[tag.b]
            self.identifiers.append(
                IdentifierData(type=t, identifier=tag.a))

    # 020: ISBN. Normalize via the first matching regex, then record.
    for tag in self.tags('020'):
        isbn = tag.a
        if not isbn:
            continue
        for r in self.isbn_res:
            m = r.search(isbn)
            if m:
                isbn = m.groups()[0]
        # NOTE(review): source formatting was ambiguous here -- the
        # append may originally have been inside the `if m:` branch;
        # confirm against the upstream repository.
        self.identifiers.append(
            IdentifierData(type=Identifier.ISBN, identifier=isbn))

    # 385/521: audience. Skip blacklisted values.
    for key in ['385', '521']:
        for tag in self.tags(key):
            identifier = tag.a
            if identifier.lower() in self.audience_blacklist:
                continue
            self.subjects.append(
                SubjectData(type=Subject.FREEFORM_AUDIENCE,
                            identifier=identifier))

    # 035: system control numbers; extract with per-type regexes.
    for tag in self.tags('035'):
        potential = tag.a
        identifier = None
        for r, type in self.marc_035_a_to_identifier_type.items():
            m = r.search(potential)
            if m:
                identifier = m.groups()[0]
                break
        if identifier:
            self.identifiers.append(
                IdentifierData(type=type, identifier=identifier))

    # Keep track of items we haven't seen before.
    for key, var in self.var.items():
        if key not in self.known_vars:
            self.unrecognized_tags[key] = var
def parse(cls, file, data_source_name):
    """Parse an ONIX feed into a list of Metadata objects.

    Each <product> record yields one Metadata with an attached
    CirculationData derived from its EpubUsageConstraint tags.

    :param file: a file-like object containing ONIX XML.
    :param data_source_name: name used as each Metadata's data source.
    :return: a list of Metadata objects.
    :raise Exception: if a record's usage status is PROHIBITED.
    """
    metadata_records = []

    # TODO: ONIX has plain language 'reference names' and short tags that
    # may be used interchangably. This code currently only handles short tags,
    # and it's not comprehensive.

    parser = XMLParser()
    tree = etree.parse(file)
    root = tree.getroot()

    for record in root.findall('product'):
        # b203 is the full title; fall back to assembling it from the
        # prefix (b030) and the title-without-prefix (b031).
        title = parser.text_of_optional_subtag(
            record, 'descriptivedetail/titledetail/titleelement/b203')
        if not title:
            title_prefix = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b030')
            title_without_prefix = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b031')
            if title_prefix and title_without_prefix:
                title = title_prefix + " " + title_without_prefix

        subtitle = parser.text_of_optional_subtag(
            record, 'descriptivedetail/titledetail/titleelement/b029')
        language = parser.text_of_optional_subtag(
            record, 'descriptivedetail/language/b252') or "eng"
        publisher = parser.text_of_optional_subtag(
            record, 'publishingdetail/publisher/b081')
        imprint = parser.text_of_optional_subtag(
            record, 'publishingdetail/imprint/b079')
        # An imprint identical to the publisher adds no information.
        if imprint == publisher:
            imprint = None

        publishing_date = parser.text_of_optional_subtag(
            record, 'publishingdetail/publishingdate/b306')
        issued = None
        if publishing_date:
            # ONIX dates are YYYYMMDD.
            issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

        identifier_tags = parser._xpath(record, 'productidentifier')
        identifiers = []
        primary_identifier = None
        for tag in identifier_tags:
            # '02' = ISBN-10, '15' = ISBN-13 (ONIX ProductIDType codes).
            id_type = parser.text_of_subtag(tag, "b221")
            if id_type == '02' or id_type == '15':
                primary_identifier = IdentifierData(
                    Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                identifiers.append(primary_identifier)

        subject_tags = parser._xpath(record, 'descriptivedetail/subject')
        subjects = []
        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
        for tag in subject_tags:
            scheme = parser.text_of_subtag(tag, 'b067')
            if scheme in cls.SUBJECT_TYPES:
                subjects.append(
                    SubjectData(cls.SUBJECT_TYPES[scheme],
                                parser.text_of_subtag(tag, 'b069'),
                                weight=weight))

        # Audience codes are recorded as additional subjects.
        audience_tags = parser._xpath(record,
                                      'descriptivedetail/audience/b204')
        for tag in audience_tags:
            if tag.text in cls.AUDIENCE_TYPES:
                subjects.append(
                    SubjectData(Subject.FREEFORM_AUDIENCE,
                                cls.AUDIENCE_TYPES[tag.text],
                                weight=weight))

        contributor_tags = parser._xpath(record,
                                         'descriptivedetail/contributor')
        contributors = []
        for tag in contributor_tags:
            role_code = parser.text_of_subtag(tag, 'b035')
            if role_code in cls.CONTRIBUTOR_TYPES:
                display_name = parser.text_of_subtag(tag, 'b036')
                sort_name = parser.text_of_optional_subtag(tag, 'b037')
                family_name = parser.text_of_optional_subtag(tag, 'b040')
                bio = parser.text_of_optional_subtag(tag, 'b044')
                contributors.append(
                    ContributorData(sort_name=sort_name,
                                    display_name=display_name,
                                    family_name=family_name,
                                    roles=[cls.CONTRIBUTOR_TYPES[role_code]],
                                    biography=bio))

        collateral_tags = parser._xpath(record,
                                        'collateraldetail/textcontent')
        links = []
        for tag in collateral_tags:
            text_type = parser.text_of_subtag(tag, 'x426')
            # TODO: '03' is the summary in the example I'm testing, but that
            # might not be generally true.
            if text_type == '03':
                text = parser.text_of_subtag(tag, 'd104')
                links.append(
                    LinkData(
                        rel=Hyperlink.DESCRIPTION,
                        media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                        content=text))

        # Determine how many licenses this record grants. With no
        # usage constraints at all, access is unlimited.
        usage_constraint_tags = parser._xpath(
            record, 'descriptivedetail/epubusageconstraint')
        licenses_owned = LicensePool.UNLIMITED_ACCESS

        if usage_constraint_tags:
            cls._logger.debug('Found {0} EpubUsageConstraint tags'.format(
                len(usage_constraint_tags)))

        for usage_constraint_tag in usage_constraint_tags:
            usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                 'x319')

            cls._logger.debug('EpubUsageStatus: {0}'.format(usage_status))

            if usage_status == UsageStatus.PROHIBITED.value:
                raise Exception('The content is prohibited')
            elif usage_status == UsageStatus.LIMITED.value:
                usage_limit_tags = parser._xpath(
                    record,
                    'descriptivedetail/epubusageconstraint/epubusagelimit')

                cls._logger.debug('Found {0} EpubUsageLimit tags'.format(
                    len(usage_limit_tags)))

                if not usage_limit_tags:
                    continue

                [usage_limit_tag] = usage_limit_tags

                usage_unit = parser.text_of_subtag(usage_limit_tag, 'x321')

                cls._logger.debug('EpubUsageUnit: {0}'.format(usage_unit))

                # BUG FIX: the second comparison previously tested
                # usage_status (a UsageStatus value) against a UsageUnit
                # value, which could never be true; it must test
                # usage_unit, so that concurrent-user limits are
                # counted just like copy limits.
                if (usage_unit == UsageUnit.COPIES.value
                        or usage_unit == UsageUnit.CONCURRENT_USERS.value):
                    quantity_limit = parser.text_of_subtag(
                        usage_limit_tag, 'x320')

                    cls._logger.debug(
                        'Quantity: {0}'.format(quantity_limit))

                    # Switch from "unlimited" to a concrete count on
                    # the first quantity we see, then accumulate.
                    if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                        licenses_owned = 0

                    licenses_owned += int(quantity_limit)

        metadata_records.append(
            Metadata(data_source=data_source_name,
                     title=title,
                     subtitle=subtitle,
                     language=language,
                     medium=Edition.BOOK_MEDIUM,
                     publisher=publisher,
                     imprint=imprint,
                     issued=issued,
                     primary_identifier=primary_identifier,
                     identifiers=identifiers,
                     subjects=subjects,
                     contributors=contributors,
                     links=links,
                     circulation=CirculationData(
                         data_source_name,
                         primary_identifier,
                         licenses_owned=licenses_owned,
                         licenses_available=licenses_owned,
                         licenses_reserved=0,
                         patrons_in_hold_queue=0)))

    return metadata_records
# NOTE(review): fragment -- the enclosing method, the `if` matching the
# `else:` below, and the close of the final extract_contributor call are
# all outside this view.
    subject_identifier = subject_detail.get('id')
    metadata.subjects.append(
        SubjectData(
            type=subject_type,
            identifier=subject_identifier,
            name=subject_name,
        ))
else:
    # subject_detail is a bare identifier rather than a dict.
    metadata.subjects.append(
        SubjectData(type=subject_type, identifier=subject_detail))

# Collect VIAF IDs from the creator URIs; each becomes a contributor.
viafs = [self.VIAF_ID.search(uri) for uri in creator_uris]
viafs = [viaf.groups()[0] for viaf in viafs if viaf is not None]
for viaf in viafs:
    metadata.contributors.append(ContributorData(viaf=viaf))

if creator_uris and not viafs:
    # We vastly prefer VIAF author information over OCLC.
    # We'll only extract OCLC author information if we have
    # _NO_ author information at all.
    contributors_data = []
    for uri in creator_uris:
        external = self.EXTERNAL_PERSON_URI.search(uri)
        if external:
            contributors_data += self.get_contributors(uri)
        internal = self.INTERNAL_PERSON_URI.search(uri)
        if internal:
            graphs = self.internal_lookup(subgraph, [uri])
            for person_graph in graphs:
                contributor_data = self.extract_contributor(
def __init__(self, data, medium):
    """Build a NYTBestSellerListTitle from one NYT API list entry.

    :param data: a dict for one title from the NYT bestseller API.
    :param medium: the Edition medium to record on the Metadata.
    """
    # Removed a no-op `data = data` assignment that was here.
    try:
        bestsellers_date = NYTAPI.parse_datetime(data.get("bestsellers_date"))
        first_appearance = bestsellers_date
        most_recent_appearance = bestsellers_date
    except ValueError:
        # Unparseable list date -- leave both appearances unknown.
        first_appearance = None
        most_recent_appearance = None

    try:
        # This is the date the _book_ was published, not the date
        # the _bestseller list_ was published.
        published_date = NYTAPI.parse_date(data.get("published_date"))
    except ValueError:
        published_date = None

    details = data["book_details"]
    other_isbns = []
    if not details:
        publisher = annotation = primary_isbn10 = primary_isbn13 = title = None
        display_author = None
    else:
        d = details[0]
        title = d.get("title", None)
        display_author = d.get("author", None)
        publisher = d.get("publisher", None)
        annotation = d.get("description", None)
        primary_isbn10 = d.get("primary_isbn10", None)
        primary_isbn13 = d.get("primary_isbn13", None)

        # The list of other ISBNs frequently contains ISBNs for
        # other books in the same series, as well as ISBNs that
        # are just wrong. Assign these equivalencies at a low
        # level of confidence.
        for isbn in d.get("isbns", []):
            isbn13 = isbn.get("isbn13", None)
            if isbn13:
                other_isbns.append(IdentifierData(Identifier.ISBN, isbn13, 0.50))

    # Prefer ISBN-13 over ISBN-10 as the primary identifier.
    primary_isbn = primary_isbn13 or primary_isbn10
    if primary_isbn:
        primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

    contributors = []
    if display_author:
        contributors.append(ContributorData(display_name=display_author))

    metadata = Metadata(
        data_source=DataSource.NYT,
        title=title,
        medium=medium,
        language="eng",
        published=published_date,
        publisher=publisher,
        contributors=contributors,
        primary_identifier=primary_isbn,
        identifiers=other_isbns,
    )
    super(NYTBestSellerListTitle, self).__init__(
        metadata, first_appearance, most_recent_appearance, annotation
    )
def extract_viaf_info(self, cluster, working_sort_name=None, working_display_name=False):
    """ Extract name info from a single VIAF cluster.

    :param cluster: An XML element for one VIAF cluster.
    :param working_sort_name: Sort name we already believe belongs to this
        contributor, if any.
    :param working_display_name: Display name we already believe belongs to
        this contributor, if any.  (NOTE(review): default is False rather
        than None -- presumably treated only as falsy; confirm.)
    :return: a tuple containing:
    - ContributorData object filled with display, sort, family, and wikipedia names.
    - dictionary of ways the xml cluster data matched the names searched for.
    - list of titles attributed to the contributor in the cluster.
    or Nones on error.
    """
    contributor_data = ContributorData()
    contributor_titles = []
    match_confidences = {}

    # Find out if one of the working names shows up in a name record.
    # Note: Potentially sets contributor_data.sort_name.
    match_confidences = self.cluster_has_record_for_named_author(
        cluster, working_sort_name, working_display_name, contributor_data
    )

    # Get the VIAF ID for this cluster, just in case we don't have one yet.
    viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
    if viaf_tag is None:
        contributor_data.viaf = None
    else:
        contributor_data.viaf = viaf_tag.text

    # If we don't have a working sort name, find the most popular
    # sort name in this cluster and use it as the sort name.
    # NOTE(review): presumably returns a Counter mapping sort name ->
    # popularity; the `+=` and `.most_common()` below rely on that.
    sort_name_popularity = self.sort_names_by_popularity(cluster)

    # Does this cluster have a Wikipedia page?
    contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
    if contributor_data.wikipedia_name:
        contributor_data.display_name = self.wikipedia_name_to_display_name(contributor_data.wikipedia_name)
        working_display_name = contributor_data.display_name
        # TODO: There's a problem here when someone's record has a
        # Wikipedia page other than their personal page (e.g. for
        # a band they're in.)

    known_name = working_sort_name or working_display_name

    # Walk the UNIMARC datafields, collecting plausible name-part
    # candidates for this contributor.
    unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
    candidates = []
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        # Some part of this name must also show up in the original
        # name for it to even be considered. Otherwise it's a
        # better bet to try to munge the original name.
        for v in (possible_given, possible_family, possible_extra):
            if not v:
                continue
            if not known_name or v in known_name:
                self.log.debug(
                    "FOUND %s in %s", v, known_name
                )
                candidates.append((possible_given, possible_family, possible_extra))
                if possible_sort_name:
                    # Trim a trailing comma before counting this sort name.
                    if possible_sort_name.endswith(","):
                        possible_sort_name = possible_sort_name[:-1]
                    sort_name_popularity[possible_sort_name] += 1
                break
        else:
            # for/else: reached only when no name part matched, i.e. the
            # UNIMARC record bears no resemblance to the name we know.
            self.log.debug(
                "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                possible_given, possible_family, possible_extra, known_name
            )
            pass

    # If nothing set a sort name earlier, fall back to the most
    # popular sort name seen in this cluster.
    if sort_name_popularity and not contributor_data.sort_name:
        contributor_data.sort_name, ignore = sort_name_popularity.most_common(1)[0]

    if contributor_data.display_name:
        parts = contributor_data.display_name.split(" ")
        if len(parts) == 2:
            # Pretty clearly given name+family name.
            # If it gets more complicated than this we can't
            # be confident.
            candidates.append(parts + [None])

    # Pick the best (given, family, extra) triple from the candidates.
    display_nameparts = self.best_choice(candidates)
    if display_nameparts[1]:  # Family name
        contributor_data.family_name = display_nameparts[1]

    contributor_data.display_name = contributor_data.display_name or self.combine_nameparts(*display_nameparts) or working_display_name

    # Now go through the title elements, and make a list.
    titles = self._xpath(cluster, './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]')
    for title in titles:
        contributor_titles.append(title.text)

    return contributor_data, match_confidences, contributor_titles
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.

    :param book: A dict of record-level book information from Odilo.
    :param availability: Availability information, passed through to
        record_info_to_circulation().
    :return: A (Metadata, active) tuple, or None when the record has no id.
    """
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    # 'active' means the book is currently part of the collection.
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(sort_name=sort_author,
                                      display_name=display_author,
                                      roles=roles,
                                      biography=None)
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            # BUG FIX: the original logged e.message, which does not exist
            # on Python 3 exceptions and would raise AttributeError here.
            cls.log.warn('Cannot parse publication date from: ' + published +
                         ', message: ' + str(e))

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
        except ValueError as e:
            # Same e.message fix as above.
            cls.log.warn('Cannot parse last update date from: ' + last_update +
                         ', message: ' + str(e))

    # Odilo is a Spanish distributor; default to Spanish.
    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(
            SubjectData(type=Subject.TAG, identifier=subject, weight=100))

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(
            SubjectData(type=Subject.BISAC, identifier=subjectBisacCode,
                        weight=100))

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(type=Subject.GRADE_LEVEL,
                              identifier=grade_level, weight=10)
        subjects.append(subject)

    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == cls.ACSM and file_format:
            # ACSM entries are qualified by the file format, e.g. ACSM_EPUB.
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats)
        else:
            cls.log.warn('Unrecognized format received: ' + format_received)

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        # Normalize to ISBN-13.
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(cover_image_url,
                                                Hyperlink.THUMBNAIL_IMAGE)
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(original_image_url,
                                                Hyperlink.IMAGE)
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(
            LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                     media_type="text/html"))

    metadata = Metadata(data_source=DataSource.ODILO,
                        title=title,
                        subtitle=subtitle,
                        language=language,
                        medium=medium,
                        series=series,
                        series_position=series_position,
                        publisher=publisher,
                        published=published,
                        primary_identifier=primary_identifier,
                        identifiers=identifiers,
                        subjects=subjects,
                        contributors=contributors,
                        links=links,
                        data_source_last_updated=last_update)

    metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
        availability)
    # 'active' --> means that the book exists but it's no longer in the collection
    # (it could be available again in the future)
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
def extract_viaf_info(self, cluster, working_sort_name=None, working_display_name=False):
    """ Extract name info from a single VIAF cluster.

    :param cluster: An XML element for one VIAF cluster.
    :param working_sort_name: Sort name we already believe belongs to this
        contributor, if any.
    :param working_display_name: Display name we already believe belongs to
        this contributor, if any.  (NOTE(review): default is False rather
        than None -- presumably treated only as falsy; confirm.)
    :return: a tuple containing:
    - ContributorData object filled with display, sort, family, and wikipedia names.
    - dictionary of ways the xml cluster data matched the names searched for.
    - list of titles attributed to the contributor in the cluster.
    or Nones on error.
    """
    contributor_data = ContributorData()
    contributor_titles = []
    match_confidences = {}

    # Find out if one of the working names shows up in a name record.
    # Note: Potentially sets contributor_data.sort_name.
    match_confidences = self.cluster_has_record_for_named_author(
        cluster, working_sort_name, working_display_name, contributor_data)

    # Get the VIAF ID for this cluster, just in case we don't have one yet.
    viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
    if viaf_tag is None:
        contributor_data.viaf = None
    else:
        contributor_data.viaf = viaf_tag.text

    # If we don't have a working sort name, find the most popular
    # sort name in this cluster and use it as the sort name.
    # NOTE(review): presumably returns a Counter mapping sort name ->
    # popularity; the `+=` and `.most_common()` below rely on that.
    sort_name_popularity = self.sort_names_by_popularity(cluster)

    # Does this cluster have a Wikipedia page?
    contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
    if contributor_data.wikipedia_name:
        contributor_data.display_name = self.wikipedia_name_to_display_name(
            contributor_data.wikipedia_name)
        working_display_name = contributor_data.display_name
        # TODO: There's a problem here when someone's record has a
        # Wikipedia page other than their personal page (e.g. for
        # a band they're in.)

    known_name = working_sort_name or working_display_name

    # Walk the UNIMARC datafields, collecting plausible name-part
    # candidates for this contributor.
    unimarcs = self._xpath(
        cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
    candidates = []
    for unimarc in unimarcs:
        (possible_given, possible_family,
         possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
        # Some part of this name must also show up in the original
        # name for it to even be considered. Otherwise it's a
        # better bet to try to munge the original name.
        for v in (possible_given, possible_family, possible_extra):
            if not v:
                continue
            if not known_name or v in known_name:
                self.log.debug("FOUND %s in %s", v, known_name)
                candidates.append(
                    (possible_given, possible_family, possible_extra))
                if possible_sort_name:
                    # Trim a trailing comma before counting this sort name.
                    if possible_sort_name.endswith(","):
                        possible_sort_name = possible_sort_name[:-1]
                    sort_name_popularity[possible_sort_name] += 1
                break
        else:
            # for/else: reached only when no name part matched, i.e. the
            # UNIMARC record bears no resemblance to the name we know.
            self.log.debug(
                "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                possible_given, possible_family, possible_extra, known_name)
            pass

    # If nothing set a sort name earlier, fall back to the most
    # popular sort name seen in this cluster.
    if sort_name_popularity and not contributor_data.sort_name:
        contributor_data.sort_name, ignore = sort_name_popularity.most_common(
            1)[0]

    if contributor_data.display_name:
        parts = contributor_data.display_name.split(" ")
        if len(parts) == 2:
            # Pretty clearly given name+family name.
            # If it gets more complicated than this we can't
            # be confident.
            candidates.append(parts + [None])

    # Pick the best (given, family, extra) triple from the candidates.
    display_nameparts = self.best_choice(candidates)
    if display_nameparts[1]:  # Family name
        contributor_data.family_name = display_nameparts[1]

    contributor_data.display_name = contributor_data.display_name or self.combine_nameparts(
        *display_nameparts) or working_display_name

    # Now go through the title elements, and make a list.
    titles = self._xpath(
        cluster,
        './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]'
    )
    for title in titles:
        contributor_titles.append(title.text)

    return contributor_data, match_confidences, contributor_titles
class TestCirculationMonitor(Axis360Test):
    """Tests for the Axis 360 circulation monitor's process_book()."""

    # Bibliographic data for one known Axis 360 title.
    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(type=Identifier.AXIS_360_ID,
                                          identifier=u'0003642860'),
        identifiers=[
            IdentifierData(type=Identifier.ISBN, identifier=u'9780375504587')
        ],
        contributors=[
            ContributorData(sort_name=u"McCain, John",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE]),
            ContributorData(sort_name=u"Salter, Mark",
                            roles=[Contributor.AUTHOR_ROLE]),
        ],
        subjects=[
            SubjectData(type=Subject.BISAC,
                        identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE, identifier=u'Adult'),
        ],
    )

    # Matching availability data for the same title.
    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def test_process_book(self):
        # Converted from nose eq_() to plain pytest asserts, consistent
        # with the rest of this file's pytest migration.
        integration, ignore = create(
            self._db,
            ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, license_pool = monitor.process_book(
            self.BIBLIOGRAPHIC_DATA, self.AVAILABILITY_DATA)

        # The Edition was created from the bibliographic data.
        assert u'Faith of My Fathers : A Family Memoir' == edition.title
        assert u'eng' == edition.language
        assert u'Random House Inc' == edition.publisher
        assert u'Random House Inc2' == edition.imprint

        assert Identifier.AXIS_360_ID == edition.primary_identifier.type
        assert u'0003642860' == edition.primary_identifier.identifier

        [isbn] = [
            x for x in edition.equivalent_identifiers()
            if x is not edition.primary_identifier
        ]
        assert Identifier.ISBN == isbn.type
        assert u'9780375504587' == isbn.identifier

        assert (["McCain, John", "Salter, Mark"] ==
                sorted([x.sort_name for x in edition.contributors]))

        subs = sorted((x.subject.type, x.subject.identifier)
                      for x in edition.primary_identifier.classifications)
        assert [(Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
                (Subject.FREEFORM_AUDIENCE, u'Adult')] == subs

        # The LicensePool was created from the availability data.
        assert 9 == license_pool.licenses_owned
        assert 8 == license_pool.licenses_available
        assert 0 == license_pool.patrons_in_hold_queue
        assert datetime.datetime(2015, 5, 20, 2, 9, 8) == license_pool.last_checked

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        events = license_pool.circulation_events
        assert [
            u'distributor_title_add', u'distributor_check_in',
            u'distributor_license_add'
        ] == [x.type for x in events]
        for e in events:
            assert e.start == license_pool.last_checked

        # A presentation-ready work has been created for the LicensePool.
        work = license_pool.work
        assert work.presentation_ready is True
        assert "Faith of My Fathers : A Family Memoir" == work.title

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        records = [
            x for x in license_pool.identifier.coverage_records
            if x.data_source.name == DataSource.AXIS_360 and x.operation is None
        ]
        assert 1 == len(records)

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860')

        # We start off with availability information based on the
        # default for test data.
        assert 1 == licensepool.licenses_owned

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier)
        metadata = Metadata(DataSource.AXIS_360, primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, licensepool = monitor.process_book(metadata,
                                                    self.AVAILABILITY_DATA)

        # Now we have information based on the CirculationData.
        assert 9 == licensepool.licenses_owned
def parse(cls, file, data_source_name, default_medium=None):
    """Parse an ONIX file into a list of Metadata objects.

    :param file: A file-like object containing ONIX XML.
    :param data_source_name: Name of the data source to attribute records to.
    :param default_medium: Medium to use when a product record carries no
        ProductContentType (<b385>) tag.
    :return: A list of Metadata objects, one per <product> record.
    :raise Exception: If a record's EpubUsageStatus is PROHIBITED.
    """
    metadata_records = []

    # TODO: ONIX has plain language 'reference names' and short tags that
    # may be used interchangably. This code currently only handles short tags,
    # and it's not comprehensive.

    parser = XMLParser()
    tree = etree.parse(file)
    root = tree.getroot()

    for record in root.findall("product"):
        # Title: prefer the full title element, otherwise reassemble
        # it from prefix + without-prefix parts.
        title = parser.text_of_optional_subtag(
            record, "descriptivedetail/titledetail/titleelement/b203")
        if not title:
            title_prefix = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b030")
            title_without_prefix = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b031")
            if title_prefix and title_without_prefix:
                title = title_prefix + " " + title_without_prefix

        medium = parser.text_of_optional_subtag(record, "b385")
        if not medium and default_medium:
            medium = default_medium
        else:
            medium = cls.PRODUCT_CONTENT_TYPES.get(
                medium, EditionConstants.BOOK_MEDIUM)

        subtitle = parser.text_of_optional_subtag(
            record, "descriptivedetail/titledetail/titleelement/b029")
        language = (parser.text_of_optional_subtag(
            record, "descriptivedetail/language/b252") or "eng")
        publisher = parser.text_of_optional_subtag(
            record, "publishingdetail/publisher/b081")
        imprint = parser.text_of_optional_subtag(
            record, "publishingdetail/imprint/b079")
        # Don't record an imprint identical to the publisher.
        if imprint == publisher:
            imprint = None

        publishing_date = parser.text_of_optional_subtag(
            record, "publishingdetail/publishingdate/b306")
        issued = None
        if publishing_date:
            issued = dateutil.parser.isoparse(publishing_date)
            if issued.tzinfo is None:
                cls._logger.warning(
                    "Publishing date {} does not contain timezone information. Assuming UTC."
                    .format(publishing_date))
                issued = to_utc(issued)

        # Identifiers: '02' is ISBN-10, '15' is ISBN-13; the last one
        # seen becomes the primary identifier.
        identifier_tags = parser._xpath(record, "productidentifier")
        identifiers = []
        primary_identifier = None
        for tag in identifier_tags:
            type = parser.text_of_subtag(tag, "b221")
            if type == "02" or type == "15":
                primary_identifier = IdentifierData(
                    Identifier.ISBN, parser.text_of_subtag(tag, "b244"))
                identifiers.append(primary_identifier)

        subject_tags = parser._xpath(record, "descriptivedetail/subject")
        subjects = []

        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
        for tag in subject_tags:
            type = parser.text_of_subtag(tag, "b067")
            if type in cls.SUBJECT_TYPES:
                b069 = parser.text_of_optional_subtag(tag, "b069")
                if b069:
                    subjects.append(
                        SubjectData(cls.SUBJECT_TYPES[type], b069,
                                    weight=weight))

        # Audience codes also become (freeform-audience) subjects.
        audience_tags = parser._xpath(record,
                                      "descriptivedetail/audience/b204")
        for tag in audience_tags:
            if tag.text in cls.AUDIENCE_TYPES:
                subjects.append(
                    SubjectData(
                        Subject.FREEFORM_AUDIENCE,
                        cls.AUDIENCE_TYPES[tag.text],
                        weight=weight,
                    ))

        # TODO: We don't handle ONIX unnamed and alternatively named
        # contributors.
        contributor_tags = parser._xpath(record,
                                         "descriptivedetail/contributor")
        contributors = []
        for tag in contributor_tags:
            type = parser.text_of_subtag(tag, "b035")
            if type in cls.CONTRIBUTOR_TYPES:
                person_name_display = parser.text_of_optional_subtag(
                    tag, "b036")
                person_name_inverted = parser.text_of_optional_subtag(
                    tag, "b037")
                corp_name_display = parser.text_of_optional_subtag(
                    tag, "b047")
                corp_name_inverted = parser.text_of_optional_subtag(
                    tag, "x443")
                bio = parser.text_of_optional_subtag(tag, "b044")
                family_name = None
                if person_name_display or person_name_inverted:
                    display_name = person_name_display
                    sort_name = person_name_inverted
                    family_name = parser.text_of_optional_subtag(tag, "b040")
                elif corp_name_display or corp_name_inverted:
                    display_name = corp_name_display
                    # Sort form for corporate name might just be the display name
                    sort_name = corp_name_inverted or corp_name_display
                else:
                    sort_name = display_name = None
                contributors.append(
                    ContributorData(
                        sort_name=sort_name,
                        display_name=display_name,
                        family_name=family_name,
                        roles=[cls.CONTRIBUTOR_TYPES[type]],
                        biography=bio,
                    ))

        collateral_tags = parser._xpath(record, "collateraldetail/textcontent")
        links = []
        for tag in collateral_tags:
            type = parser.text_of_subtag(tag, "x426")
            # TODO: '03' is the summary in the example I'm testing, but that
            # might not be generally true.
            if type == "03":
                text = parser.text_of_subtag(tag, "d104")
                links.append(
                    LinkData(
                        rel=Hyperlink.DESCRIPTION,
                        media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                        content=text,
                    ))

        usage_constraint_tags = parser._xpath(
            record, "descriptivedetail/epubusageconstraint")
        licenses_owned = LicensePool.UNLIMITED_ACCESS

        if usage_constraint_tags:
            cls._logger.debug("Found {0} EpubUsageConstraint tags".format(
                len(usage_constraint_tags)))

        for usage_constraint_tag in usage_constraint_tags:
            usage_status = parser.text_of_subtag(usage_constraint_tag, "x319")
            cls._logger.debug("EpubUsageStatus: {0}".format(usage_status))

            if usage_status == UsageStatus.PROHIBITED.value:
                raise Exception("The content is prohibited")
            elif usage_status == UsageStatus.LIMITED.value:
                usage_limit_tags = parser._xpath(
                    record,
                    "descriptivedetail/epubusageconstraint/epubusagelimit")
                cls._logger.debug("Found {0} EpubUsageLimit tags".format(
                    len(usage_limit_tags)))
                if not usage_limit_tags:
                    continue

                [usage_limit_tag] = usage_limit_tags
                usage_unit = parser.text_of_subtag(usage_limit_tag, "x321")
                cls._logger.debug("EpubUsageUnit: {0}".format(usage_unit))

                # BUG FIX: the original compared usage_status (a status
                # code) against UsageUnit.CONCURRENT_USERS.value, so the
                # concurrent-users branch could never match. We compare
                # usage_unit, which is the value actually read from x321.
                if (usage_unit == UsageUnit.COPIES.value
                        or usage_unit == UsageUnit.CONCURRENT_USERS.value):
                    quantity_limit = parser.text_of_subtag(
                        usage_limit_tag, "x320")
                    cls._logger.debug("Quantity: {0}".format(quantity_limit))

                    if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                        licenses_owned = 0
                    licenses_owned += int(quantity_limit)

        metadata_records.append(
            Metadata(
                data_source=data_source_name,
                title=title,
                subtitle=subtitle,
                language=language,
                medium=medium,
                publisher=publisher,
                imprint=imprint,
                issued=issued,
                primary_identifier=primary_identifier,
                identifiers=identifiers,
                subjects=subjects,
                contributors=contributors,
                links=links,
                circulation=CirculationData(
                    data_source_name,
                    primary_identifier,
                    licenses_owned=licenses_owned,
                    licenses_available=licenses_owned,
                    licenses_reserved=0,
                    patrons_in_hold_queue=0,
                ),
            ))

    return metadata_records
def test_viaf_authors_get_viaf_lookup(self):
    # Converted from nose eq_() to plain pytest asserts, consistent
    # with the rest of this file's pytest migration.
    #
    # TODO: The code this calls could be refactored quite a bit --
    # we don't really need to test all of process_item() here.
    # But ATM it does seem to be our only test of process_item().
    oclc = MockOCLCLinkedDataAPI()
    viaf = MockVIAFClient()
    provider = LinkedDataCoverageProvider(self._db, api=oclc, viaf_api=viaf)

    # Here's a placeholder that will be filled in with information from
    # OCLC Linked Data.
    edition = self._edition()
    for i in edition.contributions:
        self._db.delete(i)
    self._db.commit()
    identifier = edition.primary_identifier

    # OCLC Linked Data is going to mention two authors -- one with
    # a sort name + VIAF, and one with a VIAF but no sort name.
    contributor1 = ContributorData(viaf="1")
    contributor2 = ContributorData(viaf="2", sort_name="Jordan, Robert")
    contributor3 = ContributorData(sort_name="Rice, Anne",
                                   display_name="Anne Rice")
    idata = IdentifierData(type=identifier.type,
                           identifier=identifier.identifier)
    metadata = Metadata(
        DataSource.OCLC_LINKED_DATA,
        contributors=[contributor1, contributor2, contributor3],
        primary_identifier=idata,
        title=u"foo")
    oclc.queue_info_for(metadata)

    # Our OCLC Linked Data client is going to try to fill in the
    # data, asking VIAF about the contributors that have VIAF data,
    # and not those who do not.
    lookup1 = (ContributorData(viaf="1",
                               display_name="Display Name",
                               family_name="Family",
                               sort_name="Name, Sort",
                               wikipedia_name="Wikipedia_Name"), None, None)
    lookup2 = (ContributorData(viaf="2",
                               wikipedia_name="Robert_Jordan_(Author)",
                               biography="That guy."), None, None)
    viaf.queue_lookup(lookup1, lookup2, "Unrequested lookup")

    provider.process_item(identifier)

    # Both VIAF-identified authors have had their information updated
    # with the VIAF results.
    filled_in = sorted(
        [(x.sort_name, x.display_name, x.viaf, x.wikipedia_name, x.biography)
         for x in edition.contributors])
    assert [
        (u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)',
         u'That guy.'),
        (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None),
        (u'Rice, Anne', u'Anne Rice', None, None, None)
    ] == filled_in

    # The author without VIAF data didn't request a VIAF lookup.
    # Instead, that result is still in the mock VIAF queue.
    assert ["Unrequested lookup"] == viaf.results
def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog.

    Parses the request body as an OPDS feed; each entry's URN is
    cataloged (if not already present) and its title/author/cover
    metadata applied to an Edition. Returns an acquisition feed of
    per-URN status messages.

    :param collection_details: Details identifying the client's Collection.
    :return: A feed response, or a ProblemDetail when authentication fails.
    """
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(self._db, client,
                                         collection_details)
    data_source = DataSource.lookup(self._db, collection.name,
                                    autocreate=True)

    messages = []

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = {entry.get('id'): entry for entry in entries}
    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys())

    # NOTE(review): this re-initializes the `messages` list created
    # above; the first assignment is redundant.
    messages = list()

    # Every unparseable URN gets an error message in the feed.
    for urn in invalid_urns:
        messages.append(
            OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail))

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
        images = [
            l for l in entry.get("links", []) if l.get("rel") in image_types
        ]
        links = [
            LinkData(image.get("rel"), image.get("href")) for image in images
        ]

        # Create an edition to hold the title and author. LicensePool.calculate_work
        # refuses to create a Work when there's no title, and if we have a title, author
        # and language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE])
        language = entry.get("dcterms_language")

        # Apply only the raw metadata; skip all expensive presentation
        # recalculation steps.
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(
            presentation_calculation_policy=presentation)
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(identifier.type,
                                              identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    # NOTE(review): `title` is reused here for the feed title after
    # serving as the per-entry book title in the loop above.
    title = "%s Catalog Item Additions for %s" % (collection.protocol,
                                                  client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(self._db,
                                    title,
                                    url, [],
                                    VerboseAnnotator,
                                    precomposed_entries=messages)

    return feed_response(addition_feed)
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object.

    :param lookup_representation: A Representation whose content is the
        raw NoveList JSON response.
    :return: A Metadata object, or None when the response is empty,
        NoveList didn't know the ISBN, or nothing useful came back.
    """
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    # BUG FIX: the original did lookup_info['TitleInfo'] (KeyError when
    # the key is absent) and only bound novelist_identifier inside the
    # `if book_info:` branch, so the check below raised NameError
    # whenever TitleInfo was missing or empty.
    book_info = lookup_info.get('TitleInfo')
    novelist_identifier = None
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not book_info or not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier)
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(
            LinkData(rel=Hyperlink.DESCRIPTION,
                     content=description,
                     media_type=Representation.TEXT_PLAIN))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(
            SubjectData(Subject.FREEFORM_AUDIENCE, audience_level))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(
            MeasurementData(Measurement.RATING, novelist_rating))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info)
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(metadata.title,
                                               book_info.get('full_title'))
    metadata.subtitle = self._scrub_subtitle(subtitle)

    # TODO: How well do we trust this data? We could conceivably bump up
    # the weight here.
    if appeals_info:
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(
                        SubjectData(Subject.TAG, genre['Name']))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(
            SubjectData(Subject.LEXILE_SCORE, lexile_info['Lexile']))

    if goodreads_info:
        metadata.measurements.append(
            MeasurementData(Measurement.RATING,
                            goodreads_info['average_rating']))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position
            or metadata.series or metadata.subjects or metadata.links
            or metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
def parse(cls, file, data_source_name):
    """Parse a stream of MARC records into a list of Metadata objects.

    :param file: A file-like object containing binary MARC data.
    :param data_source_name: Name of the data source to attribute
        records to.
    :return: A list of Metadata objects, one per MARC record.
    """
    reader = MARCReader(file)
    metadata_records = []

    for record in reader:
        title = record.title()
        # Strip the ISBD ' /' separator before the statement of
        # responsibility. Guard against a record with no title field.
        if title and title.endswith(' /'):
            title = title[:-len(' /')]
        issued_year = datetime.datetime.strptime(record.pubyear(), "%Y.")
        publisher = record.publisher()
        if publisher and publisher.endswith(','):
            publisher = publisher[:-1]

        links = []
        # ROBUSTNESS FIX: the original indexed record.notes()[0]
        # unconditionally, raising IndexError on records without notes.
        notes = record.notes()
        summary = notes[0]['a'] if notes else None
        if summary:
            summary_link = LinkData(
                rel=Hyperlink.DESCRIPTION,
                media_type=Representation.TEXT_PLAIN,
                content=summary,
            )
            links.append(summary_link)

        # NOTE(review): assumes every record carries an 020 (ISBN)
        # field -- records without one will raise here; confirm the
        # input guarantees this.
        isbn = record['020']['a'].split(" ")[0]
        primary_identifier = IdentifierData(Identifier.ISBN, isbn)

        subjects = [
            SubjectData(
                Classifier.FAST,
                subject['a'],
            ) for subject in record.subjects()
        ]

        author = record.author()
        if author:
            # Turn 'Dante Alighieri, 1265-1321, author.'
            # into 'Dante Alighieri'. The metadata wrangler will
            # take it from there.
            for regex in cls.END_OF_AUTHOR_NAME_RES:
                match = regex.search(author)
                if match:
                    author = author[:match.start()]
                    break
            author_names = [author]
        else:
            author_names = ['Anonymous']

        contributors = [
            ContributorData(
                sort_name=author,
                roles=[Contributor.AUTHOR_ROLE],
            ) for author in author_names
        ]

        metadata_records.append(
            Metadata(data_source=data_source_name,
                     title=title,
                     language='eng',
                     medium=Edition.BOOK_MEDIUM,
                     publisher=publisher,
                     issued=issued_year,
                     primary_identifier=primary_identifier,
                     subjects=subjects,
                     contributors=contributors,
                     links=links))
    return metadata_records