def test_circulationdata_can_be_deepcopied(self):
    # Check that we didn't put something in the CirculationData that
    # will prevent it from being copied. (e.g., self.log)

    subject = SubjectData(Subject.TAG, "subject")
    contributor = ContributorData()
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    format = FormatData(Representation.EPUB_MEDIA_TYPE,
                        DeliveryMechanism.NO_DRM)
    rights_uri = RightsStatus.GENERIC_OPEN_ACCESS

    circulation_data = CirculationData(
        DataSource.GUTENBERG,
        primary_identifier=identifier,
        links=[link],
        licenses_owned=5,
        licenses_available=5,
        licenses_reserved=None,
        patrons_in_hold_queue=None,
        formats=[format],
        default_rights_uri=rights_uri,
    )

    circulation_data_copy = deepcopy(circulation_data)

    # If deepcopy didn't throw an exception we're ok.
    assert circulation_data_copy is not None
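
# Why this test exists, as a minimal stdlib-only sketch: deepcopy fails on
# objects that hold non-copyable state, such as a logger's thread lock. The
# class below is illustrative, not from this codebase.
import threading
from copy import deepcopy

class HoldsLogLikeState(object):
    def __init__(self):
        # A thread lock, like the ones inside logging handlers, cannot be
        # pickled or deep-copied.
        self.log_lock = threading.Lock()

try:
    deepcopy(HoldsLogLikeState())
except TypeError as e:
    print("deepcopy failed:", e)  # cannot pickle '_thread.lock' object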
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object"""

    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info['TitleInfo']
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not book_info or not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier)
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(
            LinkData(rel=Hyperlink.DESCRIPTION,
                     content=description,
                     media_type=Representation.TEXT_PLAIN))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(
            SubjectData(Subject.FREEFORM_AUDIENCE, audience_level))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(
            MeasurementData(Measurement.RATING, novelist_rating))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info)
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title'))
    metadata.subtitle = self._scrub_subtitle(subtitle)

    # TODO: How well do we trust this data? We could conceivably bump up
    # the weight here.
    if appeals_info:
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(
                        SubjectData(Subject.TAG, genre['Name']))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(
            SubjectData(Subject.LEXILE_SCORE, lexile_info['Lexile']))

    if goodreads_info:
        metadata.measurements.append(
            MeasurementData(Measurement.RATING,
                            goodreads_info['average_rating']))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
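
# For orientation, a hedged sketch of the JSON shape the method above walks.
# The keys are the ones the parser reads; the values are invented sample
# data, not a real NoveList response.
sample_lookup_info = {
    "TitleInfo": {
        "ui": "9780000000000",
        "title": "Example Title",
        "full_title": "Example Title: A Subtitle",
        "author": "Author, Example",
        "description": "An example description.",
        "audience_level": "Adult",
        "rating": 3.5,
    },
    "FeatureContent": {
        "SeriesInfo": {},
        "Appeals": [],
        "LexileInfo": {},
        "GoodReads": {},
        "SimilarTitles": {},
    },
}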
class TestCirculationMonitor(Axis360Test):

    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(
            type=Identifier.AXIS_360_ID,
            identifier=u'0003642860'),
        identifiers=[
            IdentifierData(type=Identifier.ISBN, identifier=u'9780375504587')
        ],
        contributors=[
            ContributorData(sort_name=u"McCain, John",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE]),
            ContributorData(sort_name=u"Salter, Mark",
                            roles=[Contributor.AUTHOR_ROLE]),
        ],
        subjects=[
            SubjectData(type=Subject.BISAC,
                        identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE, identifier=u'Adult'),
        ],
    )

    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def test_process_book(self):
        integration, ignore = create(
            self._db, ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = Axis360CirculationMonitor(
            self._db, self.collection, api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, license_pool = monitor.process_book(
            self.BIBLIOGRAPHIC_DATA, self.AVAILABILITY_DATA)
        eq_(u'Faith of My Fathers : A Family Memoir', edition.title)
        eq_(u'eng', edition.language)
        eq_(u'Random House Inc', edition.publisher)
        eq_(u'Random House Inc2', edition.imprint)

        eq_(Identifier.AXIS_360_ID, edition.primary_identifier.type)
        eq_(u'0003642860', edition.primary_identifier.identifier)

        [isbn] = [x for x in edition.equivalent_identifiers()
                  if x is not edition.primary_identifier]
        eq_(Identifier.ISBN, isbn.type)
        eq_(u'9780375504587', isbn.identifier)

        eq_(
            ["McCain, John", "Salter, Mark"],
            sorted([x.sort_name for x in edition.contributors]),
        )

        subs = sorted(
            (x.subject.type, x.subject.identifier)
            for x in edition.primary_identifier.classifications
        )
        eq_([(Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
             (Subject.FREEFORM_AUDIENCE, u'Adult')], subs)

        eq_(9, license_pool.licenses_owned)
        eq_(8, license_pool.licenses_available)
        eq_(0, license_pool.patrons_in_hold_queue)
        eq_(datetime.datetime(2015, 5, 20, 2, 9, 8),
            license_pool.last_checked)

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        events = license_pool.circulation_events
        eq_([u'distributor_title_add',
             u'distributor_check_in',
             u'distributor_license_add'],
            [x.type for x in events])
        for e in events:
            eq_(e.start, license_pool.last_checked)

        # A presentation-ready work has been created for the LicensePool.
        work = license_pool.work
        eq_(True, work.presentation_ready)
        eq_("Faith of My Fathers : A Family Memoir", work.title)

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        records = [
            x for x in license_pool.identifier.coverage_records
            if x.data_source.name == DataSource.AXIS_360
            and x.operation is None
        ]
        eq_(1, len(records))

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860')
        # We start off with availability information based on the
        # default for test data.
        eq_(1, licensepool.licenses_owned)

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier)
        metadata = Metadata(DataSource.AXIS_360,
                            primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db, self.collection, api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, licensepool = monitor.process_book(
            metadata, self.AVAILABILITY_DATA)

        # Now we have information based on the CirculationData.
        eq_(9, licensepool.licenses_owned)
def extract_bibliographic(self, element):
    """Extract Metadata and CirculationData from a dictionary
    of information from Enki.

    :return: A Metadata with attached CirculationData.
    """
    # TODO: it's not clear what these are or whether we'd find them
    # useful:
    #  dateSaved
    #  length
    #  publishDate
    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

    identifiers = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

    contributors = []
    sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors.append(ContributorData(sort_name=sort_name))

    links = []
    description = element.get("description")
    if description:
        links.append(
            LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=description,
                media_type="text/html",
            )
        )

    # NOTE: When this method is called by, e.g. updated_titles(),
    # the large and small images are available separately. When
    # this method is called by get_item(), we only get a single
    # image, in 'cover'. In get_item() we ask that that image be
    # 'large', which means we'll be filing it as a normal-sized image.
    #
    full_image = None
    thumbnail_image = None
    for key, rel in (
        ("cover", Hyperlink.IMAGE),
        ("small_image", Hyperlink.THUMBNAIL_IMAGE),
        ("large_image", Hyperlink.IMAGE),
    ):
        url = element.get(key)
        if not url:
            continue
        link = LinkData(rel=rel, href=url,
                        media_type=Representation.PNG_MEDIA_TYPE)
        if rel == Hyperlink.THUMBNAIL_IMAGE:
            # Don't add a thumbnail to the list of links -- wait
            # until the end and then make it a thumbnail of the
            # primary image.
            thumbnail_image = link
        else:
            if rel == Hyperlink.IMAGE:
                full_image = link
            links.append(link)

    if thumbnail_image:
        if full_image:
            # Set the thumbnail as the thumbnail _of_ the full image.
            full_image.thumbnail = thumbnail_image
        else:
            # Treat the thumbnail as the full image.
            thumbnail_image.rel = Hyperlink.IMAGE
            links.append(thumbnail_image)

    # We treat 'subject', 'topic', and 'genre' as interchangeable
    # sets of tags. This data is based on BISAC but it's not reliably
    # presented in a form that can be parsed as BISAC.
    subjects = []
    seen_topics = set()
    for key in ("subject", "topic", "genre"):
        for topic in element.get(key, []):
            if not topic or topic in seen_topics:
                continue
            subjects.append(
                SubjectData(
                    Subject.TAG,
                    topic,
                    weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                )
            )
            seen_topics.add(topic)

    language_code = element.get("language", "English")
    language = self.LANGUAGE_CODES.get(language_code, "eng")

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )
    circulationdata = self.extract_circulation(
        primary_identifier,
        element.get("availability", {}),
        element.get("formattype", None),
    )
    metadata.circulation = circulationdata
    return metadata
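
# The image-pairing rule above, restated as a self-contained sketch.
# SimpleLink stands in for LinkData, and the string rels stand in for the
# Hyperlink constants; none of these names come from the codebase.
class SimpleLink(object):
    def __init__(self, rel, href):
        self.rel = rel
        self.href = href
        self.thumbnail = None

def pair_images(full_image, thumbnail_image, links):
    # Mirrors extract_bibliographic(): attach the thumbnail to the full
    # image when both exist; otherwise promote the thumbnail to a full
    # image and add it to the link list.
    if thumbnail_image:
        if full_image:
            full_image.thumbnail = thumbnail_image
        else:
            thumbnail_image.rel = "IMAGE"
            links.append(thumbnail_image)
    return links

links = []
full = SimpleLink("IMAGE", "http://example.com/cover-large.png")
links.append(full)
thumb = SimpleLink("THUMBNAIL_IMAGE", "http://example.com/cover-small.png")
pair_images(full, thumb, links)
print(full.thumbnail.href)  # http://example.com/cover-small.png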
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.
    """
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(sort_name=sort_author,
                                      display_name=display_author,
                                      roles=roles,
                                      biography=None)
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            cls.log.warn('Cannot parse publication date from: ' + published
                         + ', message: ' + e.message)

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
        except ValueError as e:
            cls.log.warn('Cannot parse last update date from: ' + last_update
                         + ', message: ' + e.message)

    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(SubjectData(type=Subject.TAG,
                                    identifier=subject,
                                    weight=100))

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(SubjectData(type=Subject.BISAC,
                                    identifier=subjectBisacCode,
                                    weight=100))

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(type=Subject.GRADE_LEVEL,
                              identifier=grade_level,
                              weight=10)
        subjects.append(subject)

    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == cls.ACSM and file_format:
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats)
        else:
            cls.log.warn('Unrecognized format received: ' + format_received)

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(cover_image_url,
                                                Hyperlink.THUMBNAIL_IMAGE)
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(original_image_url,
                                                Hyperlink.IMAGE)
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(LinkData(rel=Hyperlink.DESCRIPTION,
                              content=description,
                              media_type="text/html"))

    metadata = Metadata(data_source=DataSource.ODILO,
                        title=title,
                        subtitle=subtitle,
                        language=language,
                        medium=medium,
                        series=series,
                        series_position=series_position,
                        publisher=publisher,
                        published=published,
                        primary_identifier=primary_identifier,
                        identifiers=identifiers,
                        subjects=subjects,
                        contributors=contributors,
                        links=links,
                        data_source_last_updated=last_update)

    metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
        availability)
    # 'active' --> means that the book exists but it's no longer in the
    # collection (it could be available again in the future)
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
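
# The ISBN normalization above in isolation, using real isbnlib calls; the
# sample ISBN-10 is arbitrary.
import isbnlib

isbn = "0306406152"
if isbnlib.is_isbn10(isbn):
    isbn = isbnlib.to_isbn13(isbn)
print(isbn)  # 9780306406157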
def parse(cls, file, data_source_name, default_medium=None):
    metadata_records = []

    # TODO: ONIX has plain language 'reference names' and short tags that
    # may be used interchangeably. This code currently only handles short
    # tags, and it's not comprehensive.
    parser = XMLParser()
    tree = etree.parse(file)
    root = tree.getroot()

    for record in root.findall("product"):
        title = parser.text_of_optional_subtag(
            record, "descriptivedetail/titledetail/titleelement/b203")
        if not title:
            title_prefix = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b030")
            title_without_prefix = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b031")
            if title_prefix and title_without_prefix:
                title = title_prefix + " " + title_without_prefix

        medium = parser.text_of_optional_subtag(record, "b385")
        if not medium and default_medium:
            medium = default_medium
        else:
            medium = cls.PRODUCT_CONTENT_TYPES.get(
                medium, EditionConstants.BOOK_MEDIUM)

        subtitle = parser.text_of_optional_subtag(
            record, "descriptivedetail/titledetail/titleelement/b029")
        language = (parser.text_of_optional_subtag(
            record, "descriptivedetail/language/b252") or "eng")
        publisher = parser.text_of_optional_subtag(
            record, "publishingdetail/publisher/b081")
        imprint = parser.text_of_optional_subtag(
            record, "publishingdetail/imprint/b079")
        if imprint == publisher:
            imprint = None

        publishing_date = parser.text_of_optional_subtag(
            record, "publishingdetail/publishingdate/b306")
        issued = None
        if publishing_date:
            issued = dateutil.parser.isoparse(publishing_date)
            if issued.tzinfo is None:
                cls._logger.warning(
                    "Publishing date {} does not contain timezone information. Assuming UTC."
                    .format(publishing_date))
            issued = to_utc(issued)

        identifier_tags = parser._xpath(record, "productidentifier")
        identifiers = []
        primary_identifier = None
        for tag in identifier_tags:
            type = parser.text_of_subtag(tag, "b221")
            if type == "02" or type == "15":
                primary_identifier = IdentifierData(
                    Identifier.ISBN, parser.text_of_subtag(tag, "b244"))
                identifiers.append(primary_identifier)

        subject_tags = parser._xpath(record, "descriptivedetail/subject")
        subjects = []

        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT

        for tag in subject_tags:
            type = parser.text_of_subtag(tag, "b067")
            if type in cls.SUBJECT_TYPES:
                b069 = parser.text_of_optional_subtag(tag, "b069")
                if b069:
                    subjects.append(
                        SubjectData(cls.SUBJECT_TYPES[type], b069,
                                    weight=weight))

        audience_tags = parser._xpath(record,
                                      "descriptivedetail/audience/b204")
        audiences = []
        for tag in audience_tags:
            if tag.text in cls.AUDIENCE_TYPES:
                subjects.append(
                    SubjectData(
                        Subject.FREEFORM_AUDIENCE,
                        cls.AUDIENCE_TYPES[tag.text],
                        weight=weight,
                    ))

        # TODO: We don't handle ONIX unnamed and alternatively named
        # contributors.
        contributor_tags = parser._xpath(record,
                                         "descriptivedetail/contributor")
        contributors = []
        for tag in contributor_tags:
            type = parser.text_of_subtag(tag, "b035")
            if type in cls.CONTRIBUTOR_TYPES:
                person_name_display = parser.text_of_optional_subtag(
                    tag, "b036")
                person_name_inverted = parser.text_of_optional_subtag(
                    tag, "b037")
                corp_name_display = parser.text_of_optional_subtag(
                    tag, "b047")
                corp_name_inverted = parser.text_of_optional_subtag(
                    tag, "x443")
                bio = parser.text_of_optional_subtag(tag, "b044")
                family_name = None
                if person_name_display or person_name_inverted:
                    display_name = person_name_display
                    sort_name = person_name_inverted
                    family_name = parser.text_of_optional_subtag(
                        tag, "b040")
                elif corp_name_display or corp_name_inverted:
                    display_name = corp_name_display
                    # Sort form for the corporate name might just be the
                    # display name.
                    sort_name = corp_name_inverted or corp_name_display
                else:
                    sort_name = display_name = None
                contributors.append(
                    ContributorData(
                        sort_name=sort_name,
                        display_name=display_name,
                        family_name=family_name,
                        roles=[cls.CONTRIBUTOR_TYPES[type]],
                        biography=bio,
                    ))

        collateral_tags = parser._xpath(record,
                                        "collateraldetail/textcontent")
        links = []
        for tag in collateral_tags:
            type = parser.text_of_subtag(tag, "x426")
            # TODO: '03' is the summary in the example I'm testing, but
            # that might not be generally true.
            if type == "03":
                text = parser.text_of_subtag(tag, "d104")
                links.append(
                    LinkData(
                        rel=Hyperlink.DESCRIPTION,
                        media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                        content=text,
                    ))

        usage_constraint_tags = parser._xpath(
            record, "descriptivedetail/epubusageconstraint")
        licenses_owned = LicensePool.UNLIMITED_ACCESS

        if usage_constraint_tags:
            cls._logger.debug("Found {0} EpubUsageConstraint tags".format(
                len(usage_constraint_tags)))

        for usage_constraint_tag in usage_constraint_tags:
            usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                 "x319")
            cls._logger.debug("EpubUsageStatus: {0}".format(usage_status))

            if usage_status == UsageStatus.PROHIBITED.value:
                raise Exception("The content is prohibited")
            elif usage_status == UsageStatus.LIMITED.value:
                usage_limit_tags = parser._xpath(
                    record,
                    "descriptivedetail/epubusageconstraint/epubusagelimit")
                cls._logger.debug("Found {0} EpubUsageLimit tags".format(
                    len(usage_limit_tags)))

                if not usage_limit_tags:
                    continue

                [usage_limit_tag] = usage_limit_tags

                usage_unit = parser.text_of_subtag(usage_limit_tag, "x321")
                cls._logger.debug("EpubUsageUnit: {0}".format(usage_unit))

                if (usage_unit == UsageUnit.COPIES.value
                        or usage_unit == UsageUnit.CONCURRENT_USERS.value):
                    quantity_limit = parser.text_of_subtag(
                        usage_limit_tag, "x320")
                    cls._logger.debug(
                        "Quantity: {0}".format(quantity_limit))

                    if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                        licenses_owned = 0

                    licenses_owned += int(quantity_limit)

        metadata_records.append(
            Metadata(
                data_source=data_source_name,
                title=title,
                subtitle=subtitle,
                language=language,
                medium=medium,
                publisher=publisher,
                imprint=imprint,
                issued=issued,
                primary_identifier=primary_identifier,
                identifiers=identifiers,
                subjects=subjects,
                contributors=contributors,
                links=links,
                circulation=CirculationData(
                    data_source_name,
                    primary_identifier,
                    licenses_owned=licenses_owned,
                    licenses_available=licenses_owned,
                    licenses_reserved=0,
                    patrons_in_hold_queue=0,
                ),
            ))

    return metadata_records
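
# The license-count arithmetic above, reduced to a self-contained sketch:
# the count starts at an "unlimited" sentinel, the first copies- or
# concurrent-user-limited constraint resets it to zero, and each constraint
# then adds its quantity. UNLIMITED here merely stands in for
# LicensePool.UNLIMITED_ACCESS; the real sentinel value may differ.
UNLIMITED = -1

def licenses_from_quantity_limits(quantity_limits):
    licenses_owned = UNLIMITED
    for quantity in quantity_limits:
        if licenses_owned == UNLIMITED:
            licenses_owned = 0
        licenses_owned += int(quantity)
    return licenses_owned

assert licenses_from_quantity_limits([]) == UNLIMITED
assert licenses_from_quantity_limits(["5", "3"]) == 8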
def parse(cls, file, data_source_name):
    metadata_records = []

    # TODO: ONIX has plain language 'reference names' and short tags that
    # may be used interchangeably. This code currently only handles short
    # tags, and it's not comprehensive.
    parser = XMLParser()
    tree = etree.parse(file)
    root = tree.getroot()

    for record in root.findall('product'):
        title = parser.text_of_optional_subtag(
            record, 'descriptivedetail/titledetail/titleelement/b203')
        if not title:
            title_prefix = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b030')
            title_without_prefix = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b031')
            if title_prefix and title_without_prefix:
                title = title_prefix + " " + title_without_prefix

        subtitle = parser.text_of_optional_subtag(
            record, 'descriptivedetail/titledetail/titleelement/b029')
        language = parser.text_of_optional_subtag(
            record, 'descriptivedetail/language/b252') or "eng"
        publisher = parser.text_of_optional_subtag(
            record, 'publishingdetail/publisher/b081')
        imprint = parser.text_of_optional_subtag(
            record, 'publishingdetail/imprint/b079')
        if imprint == publisher:
            imprint = None

        publishing_date = parser.text_of_optional_subtag(
            record, 'publishingdetail/publishingdate/b306')
        issued = None
        if publishing_date:
            issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

        identifier_tags = parser._xpath(record, 'productidentifier')
        identifiers = []
        primary_identifier = None
        for tag in identifier_tags:
            type = parser.text_of_subtag(tag, "b221")
            if type == '02' or type == '15':
                primary_identifier = IdentifierData(
                    Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                identifiers.append(primary_identifier)

        subject_tags = parser._xpath(record, 'descriptivedetail/subject')
        subjects = []

        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT

        for tag in subject_tags:
            type = parser.text_of_subtag(tag, 'b067')
            if type in cls.SUBJECT_TYPES:
                subjects.append(
                    SubjectData(
                        cls.SUBJECT_TYPES[type],
                        parser.text_of_subtag(tag, 'b069'),
                        weight=weight
                    )
                )

        audience_tags = parser._xpath(record,
                                      'descriptivedetail/audience/b204')
        audiences = []
        for tag in audience_tags:
            if tag.text in cls.AUDIENCE_TYPES:
                subjects.append(
                    SubjectData(
                        Subject.FREEFORM_AUDIENCE,
                        cls.AUDIENCE_TYPES[tag.text],
                        weight=weight
                    )
                )

        contributor_tags = parser._xpath(record,
                                         'descriptivedetail/contributor')
        contributors = []
        for tag in contributor_tags:
            type = parser.text_of_subtag(tag, 'b035')
            if type in cls.CONTRIBUTOR_TYPES:
                display_name = parser.text_of_subtag(tag, 'b036')
                sort_name = parser.text_of_optional_subtag(tag, 'b037')
                family_name = parser.text_of_optional_subtag(tag, 'b040')
                bio = parser.text_of_optional_subtag(tag, 'b044')
                contributors.append(
                    ContributorData(sort_name=sort_name,
                                    display_name=display_name,
                                    family_name=family_name,
                                    roles=[cls.CONTRIBUTOR_TYPES[type]],
                                    biography=bio))

        collateral_tags = parser._xpath(record,
                                        'collateraldetail/textcontent')
        links = []
        for tag in collateral_tags:
            type = parser.text_of_subtag(tag, 'x426')
            # TODO: '03' is the summary in the example I'm testing, but
            # that might not be generally true.
            if type == '03':
                text = parser.text_of_subtag(tag, 'd104')
                links.append(
                    LinkData(rel=Hyperlink.DESCRIPTION,
                             media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                             content=text))

        metadata_records.append(Metadata(
            data_source=data_source_name,
            title=title,
            subtitle=subtitle,
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=publisher,
            imprint=imprint,
            issued=issued,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            links=links
        ))

    return metadata_records
def __init__(self, product):
    self.subjects = []
    self.identifiers = []
    self.contributors = []
    self.links = []
    self.product = product
    self.var = defaultdict(list)
    self.unrecognized_tags = dict()
    self.title = None
    for f in self.product.get('varFields', []):
        marctag = MarcTag(f)
        self.var[marctag.marcTag].append(marctag)

    # Find a title.
    for num in ('245', '240'):
        for tag in self.tags(num):
            self.title = tag.a
            if self.title:
                break
        if self.title:
            break

    # Contributors
    for tag in self.tags('100'):
        role = tag.e or 'author.'
        sort_name = tag.a
        self.contributors.append(
            ContributorData(sort_name=sort_name, roles=[role]))

    # Subjects
    for number in ('050', '908'):
        for tag in self.tags(number):
            # Library of Congress classification
            if tag.a:
                self.subjects.append(
                    SubjectData(type=Subject.LCC, identifier=tag.a))
            # TODO: tag.b ("Pap 2014eb") includes potentially useful
            # date information.

    for tag in self.tags('856'):
        if tag.subfields.get('3', {}).get('content') == 'Image':
            continue
        if tag.u:
            if tag.y == 'Access eNYPL' or tag.z == 'Access eNYPL':
                self.links.append(LinkData(rel='alternate', href=tag.u))

    for tag in self.tags('082'):
        if tag.a:
            self.subjects.append(
                SubjectData(type=Subject.DDC, identifier=tag.a))

    for v in range(650, 656):
        for tag in self.tags(v):
            type = getattr(tag, '2', None)
            native_type = Subject.TAG
            if type:
                if type.endswith('.'):
                    type = type[:-1]
                Representation.tag_type[type] += 1
                native_type = self.shadowcat_subject_type_to_native_type.get(
                    type, Subject.TAG)
            identifiers = [x for x in [tag.a, tag.v] if x]
            for identifier in identifiers:
                self.subjects.append(
                    SubjectData(type=native_type, identifier=identifier))

    # Identifiers
    for tag in self.tags('037'):
        if tag.a and (tag.b in self.marc_037_b_to_identifier_type):
            t = self.marc_037_b_to_identifier_type[tag.b]
            self.identifiers.append(
                IdentifierData(type=t, identifier=tag.a))

    for tag in self.tags('020'):
        isbn = tag.a
        if not isbn:
            continue
        for r in self.isbn_res:
            m = r.search(isbn)
            if m:
                isbn = m.groups()[0]
                self.identifiers.append(
                    IdentifierData(type=Identifier.ISBN, identifier=isbn))

    for key in ['385', '521']:
        for tag in self.tags(key):
            identifier = tag.a
            if identifier.lower() in self.audience_blacklist:
                continue
            self.subjects.append(
                SubjectData(type=Subject.FREEFORM_AUDIENCE,
                            identifier=identifier))

    for tag in self.tags('035'):
        potential = tag.a
        identifier = None
        for r, type in self.marc_035_a_to_identifier_type.items():
            m = r.search(potential)
            if m:
                identifier = m.groups()[0]
                break
        if identifier:
            self.identifiers.append(
                IdentifierData(type=type, identifier=identifier))

    # Keep track of items we haven't seen before.
    for key, var in self.var.items():
        if key not in self.known_vars:
            self.unrecognized_tags[key] = var
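
# A hedged stand-in for the isbn_res cleanup above: MARC 020$a values often
# carry qualifiers ("9780375504587 (electronic bk.)"), and a regex peels off
# the bare ISBN. This single pattern is illustrative, not the class's actual
# list of regexes.
import re

ISBN_RE = re.compile(r"([0-9Xx-]{10,17})")
m = ISBN_RE.search("9780375504587 (electronic bk.)")
if m:
    print(m.groups()[0])  # 9780375504587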
def parse(cls, file, data_source_name):
    metadata_records = []

    # TODO: ONIX has plain language 'reference names' and short tags that
    # may be used interchangeably. This code currently only handles short
    # tags, and it's not comprehensive.
    parser = XMLParser()
    tree = etree.parse(file)
    root = tree.getroot()

    for record in root.findall('product'):
        title = parser.text_of_optional_subtag(
            record, 'descriptivedetail/titledetail/titleelement/b203')
        if not title:
            title_prefix = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b030')
            title_without_prefix = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b031')
            if title_prefix and title_without_prefix:
                title = title_prefix + " " + title_without_prefix

        subtitle = parser.text_of_optional_subtag(
            record, 'descriptivedetail/titledetail/titleelement/b029')
        language = parser.text_of_optional_subtag(
            record, 'descriptivedetail/language/b252') or "eng"
        publisher = parser.text_of_optional_subtag(
            record, 'publishingdetail/publisher/b081')
        imprint = parser.text_of_optional_subtag(
            record, 'publishingdetail/imprint/b079')
        if imprint == publisher:
            imprint = None

        publishing_date = parser.text_of_optional_subtag(
            record, 'publishingdetail/publishingdate/b306')
        issued = None
        if publishing_date:
            issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

        identifier_tags = parser._xpath(record, 'productidentifier')
        identifiers = []
        primary_identifier = None
        for tag in identifier_tags:
            type = parser.text_of_subtag(tag, "b221")
            if type == '02' or type == '15':
                primary_identifier = IdentifierData(
                    Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                identifiers.append(primary_identifier)

        subject_tags = parser._xpath(record, 'descriptivedetail/subject')
        subjects = []

        weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT

        for tag in subject_tags:
            type = parser.text_of_subtag(tag, 'b067')
            if type in cls.SUBJECT_TYPES:
                subjects.append(
                    SubjectData(cls.SUBJECT_TYPES[type],
                                parser.text_of_subtag(tag, 'b069'),
                                weight=weight))

        audience_tags = parser._xpath(record,
                                      'descriptivedetail/audience/b204')
        audiences = []
        for tag in audience_tags:
            if tag.text in cls.AUDIENCE_TYPES:
                subjects.append(
                    SubjectData(Subject.FREEFORM_AUDIENCE,
                                cls.AUDIENCE_TYPES[tag.text],
                                weight=weight))

        contributor_tags = parser._xpath(record,
                                         'descriptivedetail/contributor')
        contributors = []
        for tag in contributor_tags:
            type = parser.text_of_subtag(tag, 'b035')
            if type in cls.CONTRIBUTOR_TYPES:
                display_name = parser.text_of_subtag(tag, 'b036')
                sort_name = parser.text_of_optional_subtag(tag, 'b037')
                family_name = parser.text_of_optional_subtag(tag, 'b040')
                bio = parser.text_of_optional_subtag(tag, 'b044')
                contributors.append(
                    ContributorData(sort_name=sort_name,
                                    display_name=display_name,
                                    family_name=family_name,
                                    roles=[cls.CONTRIBUTOR_TYPES[type]],
                                    biography=bio))

        collateral_tags = parser._xpath(record,
                                        'collateraldetail/textcontent')
        links = []
        for tag in collateral_tags:
            type = parser.text_of_subtag(tag, 'x426')
            # TODO: '03' is the summary in the example I'm testing, but
            # that might not be generally true.
            if type == '03':
                text = parser.text_of_subtag(tag, 'd104')
                links.append(
                    LinkData(
                        rel=Hyperlink.DESCRIPTION,
                        media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                        content=text))

        usage_constraint_tags = parser._xpath(
            record, 'descriptivedetail/epubusageconstraint')
        licenses_owned = LicensePool.UNLIMITED_ACCESS

        if usage_constraint_tags:
            cls._logger.debug('Found {0} EpubUsageConstraint tags'.format(
                len(usage_constraint_tags)))

        for usage_constraint_tag in usage_constraint_tags:
            usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                 'x319')
            cls._logger.debug('EpubUsageStatus: {0}'.format(usage_status))

            if usage_status == UsageStatus.PROHIBITED.value:
                raise Exception('The content is prohibited')
            elif usage_status == UsageStatus.LIMITED.value:
                usage_limit_tags = parser._xpath(
                    record,
                    'descriptivedetail/epubusageconstraint/epubusagelimit')
                cls._logger.debug('Found {0} EpubUsageLimit tags'.format(
                    len(usage_limit_tags)))

                if not usage_limit_tags:
                    continue

                [usage_limit_tag] = usage_limit_tags

                usage_unit = parser.text_of_subtag(usage_limit_tag, 'x321')
                cls._logger.debug('EpubUsageUnit: {0}'.format(usage_unit))

                if (usage_unit == UsageUnit.COPIES.value
                        or usage_unit == UsageUnit.CONCURRENT_USERS.value):
                    quantity_limit = parser.text_of_subtag(
                        usage_limit_tag, 'x320')
                    cls._logger.debug(
                        'Quantity: {0}'.format(quantity_limit))

                    if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                        licenses_owned = 0

                    licenses_owned += int(quantity_limit)

        metadata_records.append(
            Metadata(data_source=data_source_name,
                     title=title,
                     subtitle=subtitle,
                     language=language,
                     medium=Edition.BOOK_MEDIUM,
                     publisher=publisher,
                     imprint=imprint,
                     issued=issued,
                     primary_identifier=primary_identifier,
                     identifiers=identifiers,
                     subjects=subjects,
                     contributors=contributors,
                     links=links,
                     circulation=CirculationData(
                         data_source_name,
                         primary_identifier,
                         licenses_owned=licenses_owned,
                         licenses_available=licenses_owned,
                         licenses_reserved=0,
                         patrons_in_hold_queue=0)))

    return metadata_records
        elif len(isbn) != 13:
            continue
        if isbn:
            metadata.identifiers.append(
                IdentifierData(type=Identifier.ISBN, identifier=isbn))

    for subject_type, subjects_details in subjects.items():
        for subject_detail in subjects_details:
            if isinstance(subject_detail, dict):
                subject_name = subject_detail.get('name')
                subject_identifier = subject_detail.get('id')
                metadata.subjects.append(
                    SubjectData(
                        type=subject_type,
                        identifier=subject_identifier,
                        name=subject_name,
                    ))
            else:
                metadata.subjects.append(
                    SubjectData(type=subject_type,
                                identifier=subject_detail))

    viafs = [self.VIAF_ID.search(uri) for uri in creator_uris]
    viafs = [viaf.groups()[0] for viaf in viafs if viaf is not None]
    for viaf in viafs:
        metadata.contributors.append(ContributorData(viaf=viaf))

    if creator_uris and not viafs:
        # We vastly prefer VIAF author information over OCLC.
        # We'll only extract OCLC author information if we have
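
# The VIAF extraction above, isolated with an illustrative pattern; the real
# VIAF_ID regex may differ. Only URIs that carry a VIAF id survive the
# filtering.
import re

VIAF_ID = re.compile(r"viaf/(\d+)")
creator_uris = [
    "http://viaf.org/viaf/97006051",
    "http://id.loc.gov/authorities/names/n79006977",
]
viafs = [VIAF_ID.search(uri) for uri in creator_uris]
viafs = [m.groups()[0] for m in viafs if m is not None]
print(viafs)  # ['97006051']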
def parse(cls, file, data_source_name):
    reader = MARCReader(file)
    metadata_records = []

    for record in reader:
        title = record.title()
        if title.endswith(' /'):
            title = title[:-len(' /')]
        issued_year = datetime.datetime.strptime(record.pubyear(), "%Y.")
        publisher = record.publisher()
        if publisher.endswith(','):
            publisher = publisher[:-1]

        links = []
        summary = record.notes()[0]['a']
        if summary:
            summary_link = LinkData(
                rel=Hyperlink.DESCRIPTION,
                media_type=Representation.TEXT_PLAIN,
                content=summary,
            )
            links.append(summary_link)

        isbn = record['020']['a'].split(" ")[0]
        primary_identifier = IdentifierData(Identifier.ISBN, isbn)

        subjects = [
            SubjectData(
                Classifier.FAST,
                subject['a'],
            ) for subject in record.subjects()
        ]

        author = record.author()
        if author:
            old_author = author
            # Turn 'Dante Alighieri, 1265-1321, author.'
            # into 'Dante Alighieri'. The metadata wrangler will
            # take it from there.
            for regex in cls.END_OF_AUTHOR_NAME_RES:
                match = regex.search(author)
                if match:
                    old_author = author
                    author = author[:match.start()]
                    break
            author_names = [author]
        else:
            author_names = ['Anonymous']

        contributors = [
            ContributorData(
                sort_name=author,
                roles=[Contributor.AUTHOR_ROLE],
            ) for author in author_names
        ]

        metadata_records.append(
            Metadata(data_source=data_source_name,
                     title=title,
                     language='eng',
                     medium=Edition.BOOK_MEDIUM,
                     publisher=publisher,
                     issued=issued_year,
                     primary_identifier=primary_identifier,
                     subjects=subjects,
                     contributors=contributors,
                     links=links))

    return metadata_records
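
# A hedged stand-in for END_OF_AUTHOR_NAME_RES above: trim birth/death dates
# and trailing relator terms from a MARC author string. The class uses a
# list of regexes; this single pattern is illustrative.
import re

END_OF_AUTHOR_NAME = re.compile(r", \d{4}")
author = "Dante Alighieri, 1265-1321, author."
match = END_OF_AUTHOR_NAME.search(author)
if match:
    author = author[:match.start()]
print(author)  # Dante Alighieri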