def test_apply(self):
    edition_old, pool = self._edition(with_license_pool=True)

    metadata = Metadata(
        data_source=DataSource.OVERDRIVE,
        title=u"The Harry Otter and the Seaweed of Ages",
        sort_title=u"Harry Otter and the Seaweed of Ages, The",
        subtitle=u"Kelp At It",
        series=u"The Harry Otter Sagas",
        series_position=u"4",
        language=u"eng",
        medium=u"Audio",
        publisher=u"Scholastic Inc",
        imprint=u"Follywood",
        published=datetime.date(1987, 5, 4),
        issued=datetime.date(1989, 4, 5),
    )

    edition_new, changed = metadata.apply(edition_old)

    eq_(changed, True)
    eq_(edition_new.title, u"The Harry Otter and the Seaweed of Ages")
    eq_(edition_new.sort_title, u"Harry Otter and the Seaweed of Ages, The")
    eq_(edition_new.subtitle, u"Kelp At It")
    eq_(edition_new.series, u"The Harry Otter Sagas")
    eq_(edition_new.series_position, u"4")
    eq_(edition_new.language, u"eng")
    eq_(edition_new.medium, u"Audio")
    eq_(edition_new.publisher, u"Scholastic Inc")
    eq_(edition_new.imprint, u"Follywood")
    eq_(edition_new.published, datetime.date(1987, 5, 4))
    eq_(edition_new.issued, datetime.date(1989, 4, 5))

    edition_new, changed = metadata.apply(edition_new)
    eq_(changed, False)
def test_non_open_access_book_not_mirrored(self):
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)
    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    content = "foo"
    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/",
        content=content,
        rights_uri=RightsStatus.IN_COPYRIGHT,
    )

    identifier = self._identifier()
    link_obj, is_new = identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        media_type=link.media_type,
        content=link.content,
    )

    # The Hyperlink object makes it look like an open-access book,
    # but the context we have from the OPDS feed says that it's
    # not.
    m.mirror_link(None, data_source, link, link_obj, policy)

    # No HTTP requests were made.
    eq_([], h.requests)

    # Nothing was uploaded.
    eq_([], mirror.uploaded)
def test_mirror_404_error(self):
    mirror = DummyS3Uploader()
    h = DummyHTTPClient()
    h.queue_response(404)
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    edition, pool = self._edition(with_license_pool=True)
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)

    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    m = Metadata(data_source=data_source)
    m.mirror_link(edition, data_source, link, link_obj, policy)

    # Since we got a 404 error, the cover image was not mirrored.
    eq_(404, link_obj.resource.representation.status_code)
    eq_(None, link_obj.resource.representation.mirror_url)
    eq_([], mirror.uploaded)
def test_make_thumbnail_assigns_pool(self):
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    edition = self._edition(identifier_id=identifier.identifier)

    link = LinkData(
        rel=Hyperlink.THUMBNAIL_IMAGE,
        href="http://thumbnail.com/",
        media_type=Representation.JPEG_MEDIA_TYPE,
    )

    metadata = Metadata(
        data_source=edition.data_source,
        primary_identifier=identifier,
        links=[link],
    )

    circulation = CirculationData(
        data_source=edition.data_source,
        primary_identifier=identifier,
    )
    metadata.circulation = circulation
    metadata.apply(edition)

    # The thumbnail link was assigned to the license pool created
    # from the circulation data.
    thumbnail_link = edition.primary_identifier.links[0]
    circulation_pool, is_new = circulation.license_pool(self._db)
    eq_(thumbnail_link.license_pool, circulation_pool)
def test_update_contributions(self):
    edition = self._edition()

    # A test edition is created with a test contributor. This
    # particular contributor is about to be destroyed and replaced by
    # new data.
    [old_contributor] = edition.contributors

    contributor = ContributorData(
        display_name="Robert Jordan",
        sort_name="Jordan, Robert",
        wikipedia_name="Robert_Jordan",
        viaf="79096089",
        lc="123",
        roles=[Contributor.PRIMARY_AUTHOR_ROLE])

    metadata = Metadata(DataSource.OVERDRIVE, contributors=[contributor])
    metadata.update_contributions(self._db, edition, replace=True)

    # The old contributor has been removed and replaced with the new
    # one.
    [contributor] = edition.contributors
    assert contributor != old_contributor

    # And the new one has all the information provided by
    # the Metadata object.
    eq_("Jordan, Robert", contributor.sort_name)
    eq_("Robert Jordan", contributor.display_name)
    eq_("79096089", contributor.viaf)
    eq_("123", contributor.lc)
    eq_("Robert_Jordan", contributor.wikipedia_name)
def test_success(self):
    pwid = 'pwid1'

    # Here's a print book.
    book = self._edition()
    book.medium = Edition.BOOK_MEDIUM
    book.permanent_work_id = pwid

    # Here's an audio book with the same PWID.
    audio = self._edition()
    audio.medium = Edition.AUDIO_MEDIUM
    audio.permanent_work_id = pwid

    # Here's a Metadata object for a second print book with the
    # same PWID.
    identifier = self._identifier()
    identifierdata = IdentifierData(type=identifier.type,
                                    identifier=identifier.identifier)
    metadata = Metadata(DataSource.GUTENBERG,
                        primary_identifier=identifierdata,
                        medium=Edition.BOOK_MEDIUM)
    metadata.permanent_work_id = pwid

    # Call the method we're testing.
    metadata.associate_with_identifiers_based_on_permanent_work_id(self._db)

    # The identifier of the second print book has been associated
    # with the identifier of the first print book, but not with the
    # identifier of the audiobook.
    equivalent_identifiers = [x.output for x in identifier.equivalencies]
    eq_([book.primary_identifier], equivalent_identifiers)
def test_apply_identifier_equivalency(self):
    # Set up a primary identifier along with matching and new
    # IdentifierData objects.
    edition, pool = self._edition(with_license_pool=True)
    primary = edition.primary_identifier
    primary_as_data = IdentifierData(type=primary.type,
                                     identifier=primary.identifier)
    other_data = IdentifierData(type=u"abc", identifier=u"def")

    # Prep the Metadata object.
    metadata = Metadata(data_source=DataSource.OVERDRIVE,
                        primary_identifier=primary,
                        identifiers=[primary_as_data, other_data])

    # The primary identifier is put into the identifiers array after
    # initialization.
    eq_(3, len(metadata.identifiers))
    assert primary in metadata.identifiers

    metadata.apply(edition)

    # Neither the primary identifier nor the identifier data that
    # represents it has become an equivalency.
    eq_(1, len(primary.equivalencies))
    [equivalency] = primary.equivalencies
    eq_(equivalency.output.type, u"abc")
    eq_(equivalency.output.identifier, u"def")
def test_coverage_record(self):
    edition, pool = self._edition(with_license_pool=True)
    data_source = edition.data_source

    # No preexisting coverage record.
    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(coverage, None)

    last_update = datetime.datetime(2015, 1, 1)

    m = Metadata(data_source=data_source,
                 title=u"New title",
                 data_source_last_updated=last_update)
    m.apply(edition)

    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(last_update, coverage.timestamp)
    eq_(u"New title", edition.title)

    older_last_update = datetime.datetime(2014, 1, 1)
    m = Metadata(data_source=data_source,
                 title=u"Another new title",
                 data_source_last_updated=older_last_update)
    m.apply(edition)
    eq_(u"New title", edition.title)

    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(last_update, coverage.timestamp)

    m.apply(edition, force=True)
    eq_(u"Another new title", edition.title)
    coverage = CoverageRecord.lookup(edition, data_source)
    eq_(older_last_update, coverage.timestamp)
def test_mirror_open_access_link_mirror_failure(self):
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    content = open(self.sample_cover_path("test-book-cover.png")).read()
    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
        content=content)

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.JPEG_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # But mirroring failed.
    assert representation.mirror_exception != None
    eq_(None, representation.mirrored_at)
    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/cover.jpg" % edition.primary_identifier.identifier)

    # Book content is still there since it wasn't mirrored.
    assert representation.content != None

    # The edition's identifier-associated license pool should not be
    # suppressed just because the fetch failed on getting the image.
    eq_(False, pool.suppressed)

    # The license pool only gets its license_exception column filled
    # in if the fetch failed on getting a
    # Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
    eq_(None, pool.license_exception)
def test_measurements(self):
    edition = self._edition()
    measurement = MeasurementData(
        quantity_measured=Measurement.POPULARITY, value=100)
    metadata = Metadata(measurements=[measurement],
                        data_source=edition.data_source)
    metadata.apply(edition)
    [m] = edition.primary_identifier.measurements
    eq_(Measurement.POPULARITY, m.quantity_measured)
    eq_(100, m.value)
def test_mirror_with_content_modifier(self):
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()

    def dummy_content_modifier(representation):
        representation.content = "Replaced Content"

    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror,
                               content_modifier=dummy_content_modifier,
                               http_get=h.do_get)

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/test.epub",
        content="I'm an epub",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at != None

    # The mirror url is set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/%s.epub" % (edition.primary_identifier.identifier, edition.title))

    # Content isn't there since it was mirrored.
    eq_(None, representation.content)

    # The representation was mirrored, with the modified content.
    eq_([representation], mirror.uploaded)
    eq_(["Replaced Content"], mirror.content)
def test_links(self):
    edition = self._edition()
    l1 = LinkData(rel=Hyperlink.IMAGE, href="http://example.com/")
    l2 = LinkData(rel=Hyperlink.DESCRIPTION, content="foo")
    metadata = Metadata(links=[l1, l2], data_source=edition.data_source)
    metadata.apply(edition)
    [image, description] = sorted(edition.primary_identifier.links,
                                  key=lambda x: x.rel)
    eq_(Hyperlink.IMAGE, image.rel)
    eq_("http://example.com/", image.resource.url)
    eq_(Hyperlink.DESCRIPTION, description.rel)
    eq_("foo", description.resource.representation.content)
def test_links_filtered(self):
    # Test that links are filtered down to only the
    # metadata-relevant ones.
    link1 = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    link2 = LinkData(rel=Hyperlink.IMAGE, href="http://example.com/")
    link3 = LinkData(rel=Hyperlink.DESCRIPTION, content="foo")
    link4 = LinkData(
        rel=Hyperlink.THUMBNAIL_IMAGE,
        href="http://thumbnail.com/",
        media_type=Representation.JPEG_MEDIA_TYPE,
    )
    link5 = LinkData(
        rel=Hyperlink.IMAGE,
        href="http://example.com/",
        thumbnail=link4,
        media_type=Representation.JPEG_MEDIA_TYPE,
    )
    links = [link1, link2, link3, link4, link5]

    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    metadata = Metadata(
        data_source=DataSource.GUTENBERG,
        primary_identifier=identifier,
        links=links,
    )

    filtered_links = sorted(metadata.links, key=lambda x: x.rel)

    eq_([link2, link5, link4, link3], filtered_links)
def test_mirror_open_access_link_fetch_failure(self):
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()
    h = DummyHTTPClient()
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(403)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The fetch failed, so we should have a fetch exception but no
    # mirror url.
    assert representation.fetch_exception != None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(link.href, representation.url)
    assert representation.fetched_at != None
    eq_(None, representation.mirrored_at)

    # The edition's identifier-associated license pool should not be
    # suppressed just because the fetch failed on getting the image.
    eq_(False, pool.suppressed)

    # The license pool only gets its license_exception column filled
    # in if the fetch failed on getting a
    # Hyperlink.OPEN_ACCESS_DOWNLOAD-type epub.
    eq_(None, pool.license_exception)
def test_update(self):
    # Tests that Metadata.update correctly prefers new fields to old,
    # unless the new fields aren't defined.

    edition_old, pool = self._edition(with_license_pool=True)
    edition_old.publisher = "test_old_publisher"
    edition_old.subtitle = "old_subtitle"
    metadata_old = Metadata.from_edition(edition_old)

    edition_new, pool = self._edition(with_license_pool=True)
    # Set fields on the new edition; the publisher is deliberately
    # left undefined.
    edition_new.publisher = None
    edition_new.subtitle = "new_updated_subtitle"
    metadata_new = Metadata.from_edition(edition_new)

    metadata_old.update(metadata_new)

    eq_(metadata_old.publisher, "test_old_publisher")
    eq_(metadata_old.subtitle, metadata_new.subtitle)
def test_image_and_thumbnail(self):
    edition = self._edition()
    l2 = LinkData(
        rel=Hyperlink.THUMBNAIL_IMAGE,
        href="http://thumbnail.com/",
        media_type=Representation.JPEG_MEDIA_TYPE,
    )
    l1 = LinkData(
        rel=Hyperlink.IMAGE,
        href="http://example.com/",
        thumbnail=l2,
        media_type=Representation.JPEG_MEDIA_TYPE,
    )
    metadata = Metadata(links=[l1, l2], data_source=edition.data_source)
    metadata.apply(edition)
    [image, thumbnail] = sorted(edition.primary_identifier.links,
                                key=lambda x: x.rel)
    eq_(Hyperlink.IMAGE, image.rel)
    eq_([thumbnail.resource.representation],
        image.resource.representation.thumbnails)
def test_classifications_from_another_source_not_updated(self):
    # Set up an edition whose primary identifier has two
    # classifications.
    source1 = DataSource.lookup(self._db, DataSource.AXIS_360)
    source2 = DataSource.lookup(self._db, DataSource.METADATA_WRANGLER)
    edition = self._edition()
    identifier = edition.primary_identifier
    c1 = identifier.classify(source1, Subject.TAG, "i will persist")
    c2 = identifier.classify(source2, Subject.TAG, "i will perish")

    # Now we get some new metadata from source #2.
    subjects = [SubjectData(type=Subject.TAG, identifier="i will conquer")]
    metadata = Metadata(subjects=subjects, data_source=source2)

    replace = ReplacementPolicy(subjects=True)
    metadata.apply(edition, replace=replace)

    # The old classification from source #2 has been destroyed.
    # The old classification from source #1 is still there.
    eq_(['i will conquer', 'i will persist'],
        sorted([x.subject.identifier for x in identifier.classifications]))
def test_apply_no_value(self):
    edition_old, pool = self._edition(with_license_pool=True)

    metadata = Metadata(data_source=DataSource.PRESENTATION_EDITION,
                        subtitle=NO_VALUE,
                        series=NO_VALUE,
                        series_position=NO_NUMBER)

    edition_new, changed = metadata.apply(edition_old)

    eq_(changed, True)
    eq_(edition_new.title, edition_old.title)
    eq_(edition_new.sort_title, edition_old.sort_title)
    eq_(edition_new.subtitle, None)
    eq_(edition_new.series, None)
    eq_(edition_new.series_position, None)
    eq_(edition_new.language, edition_old.language)
    eq_(edition_new.medium, edition_old.medium)
    eq_(edition_new.publisher, edition_old.publisher)
    eq_(edition_new.imprint, edition_old.imprint)
    eq_(edition_new.published, edition_old.published)
    eq_(edition_new.issued, edition_old.issued)
def extract_bibliographic(self, element, ns):
    identifiers = []
    contributors = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))
    sort_name = element["author"]
    if not sort_name:
        sort_name = "Unknown"
    contributors.append(ContributorData(sort_name=sort_name))
    primary_identifier = IdentifierData(Identifier.ENKI_ID, element["id"])

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="ENGLISH",
        medium=Edition.BOOK_MEDIUM,
        #series=series,
        publisher=element["publisher"],
        #imprint=imprint,
        #published=publication_date,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        #subjects=subjects,
        contributors=contributors,
    )

    # TODO: This should parse the content type and look it up in the
    # Enki Delivery Data above. Currently we assume everything is an
    # ePub that uses Adobe DRM, which is a safe assumption only for now.
    formats = []
    formats.append(
        FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                   drm_scheme=DeliveryMechanism.ADOBE_DRM))

    circulationdata = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=primary_identifier,
        formats=formats,
    )

    metadata.circulation = circulationdata
    return metadata
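# A minimal usage sketch (not from the original source; the `element` dict
# and the `extractor` name are hypothetical, but the keys match the lookups
# made in extract_bibliographic() above). It shows that the CirculationData
# rides along on the returned Metadata:
#
#     element = {
#         "id": "enki-id-123",          # becomes the ENKI_ID primary identifier
#         "isbn": "9781402550805",      # becomes an ISBN IdentifierData
#         "author": "Farmer, Nancy",    # empty values fall back to "Unknown"
#         "title": "A Girl Named Disaster",
#         "publisher": "Perfection Learning",
#     }
#     metadata = extractor.extract_bibliographic(element, ns=None)
#     assert metadata.circulation.formats[0].drm_scheme == DeliveryMechanism.ADOBE_DRM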
def test_metadata_can_be_deepcopied(self):
    # Check that we didn't put something in the metadata that
    # will prevent it from being copied. (e.g., self.log)

    subject = SubjectData(Subject.TAG, "subject")
    contributor = ContributorData()
    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
    link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
    measurement = MeasurementData(Measurement.RATING, 5)
    circulation = CirculationData(data_source=DataSource.GUTENBERG,
                                  primary_identifier=identifier,
                                  licenses_owned=0,
                                  licenses_available=0,
                                  licenses_reserved=0,
                                  patrons_in_hold_queue=0)
    primary_as_data = IdentifierData(type=identifier.type,
                                     identifier=identifier.identifier)
    other_data = IdentifierData(type=u"abc", identifier=u"def")

    m = Metadata(
        DataSource.GUTENBERG,
        subjects=[subject],
        contributors=[contributor],
        primary_identifier=identifier,
        links=[link],
        measurements=[measurement],
        circulation=circulation,
        title="Hello Title",
        subtitle="Subtle Hello",
        sort_title="Sorting Howdy",
        language="US English",
        medium=Edition.BOOK_MEDIUM,
        series="1",
        series_position=1,
        publisher="Hello World Publishing House",
        imprint=u"Follywood",
        issued=datetime.datetime.utcnow(),
        published=datetime.datetime.utcnow(),
        identifiers=[primary_as_data, other_data],
        data_source_last_updated=datetime.datetime.utcnow(),
    )

    m_copy = deepcopy(m)

    # If deepcopy didn't throw an exception we're ok.
    assert m_copy is not None
def test_from_edition(self):
    # Makes sure Metadata.from_edition copies all the fields over.

    edition, pool = self._edition(with_license_pool=True)
    edition.series = "Harry Otter and the Mollusk of Infamy"
    edition.series_position = "14"
    metadata = Metadata.from_edition(edition)

    # Make sure the metadata and the originating edition match.
    for field in Metadata.BASIC_EDITION_FIELDS:
        eq_(getattr(edition, field), getattr(metadata, field))

    e_contribution = edition.contributions[0]
    m_contributor_data = metadata.contributors[0]
    eq_(e_contribution.contributor.sort_name, m_contributor_data.sort_name)
    eq_(e_contribution.role, m_contributor_data.roles[0])

    eq_(edition.data_source, metadata.data_source(self._db))
    eq_(edition.primary_identifier.identifier,
        metadata.primary_identifier.identifier)
def test_filter_recommendations(self):
    metadata = Metadata(DataSource.OVERDRIVE)
    known_identifier = self._identifier()
    unknown_identifier = IdentifierData(Identifier.ISBN, "hey there")

    # Unknown identifiers are filtered out of the recommendations.
    metadata.recommendations += [known_identifier, unknown_identifier]
    metadata.filter_recommendations(self._db)
    eq_([known_identifier], metadata.recommendations)

    # It works with IdentifierData as well.
    known_identifier_data = IdentifierData(known_identifier.type,
                                           known_identifier.identifier)
    metadata.recommendations = [known_identifier_data, unknown_identifier]
    metadata.filter_recommendations(self._db)
    [result] = metadata.recommendations
    # The IdentifierData has been replaced by a bona fide Identifier.
    eq_(True, isinstance(result, Identifier))
    # The genuine article.
    eq_(known_identifier, result)
def book_info_to_metadata(cls, book, include_bibliographic=True,
                          include_formats=True):
    """Turn Overdrive's JSON representation of a book into a Metadata
    object.

    Note: the JSON data passed into this method is from a different
    file/stream than the JSON data that goes into the
    book_info_to_circulation() method.
    """
    if 'id' not in book:
        return None
    overdrive_id = book['id']
    primary_identifier = IdentifierData(
        Identifier.OVERDRIVE_ID, overdrive_id
    )

    if include_bibliographic:
        title = book.get('title', None)
        sort_title = book.get('sortTitle')
        subtitle = book.get('subtitle', None)
        series = book.get('series', None)
        publisher = book.get('publisher', None)
        imprint = book.get('imprint', None)

        if 'publishDate' in book:
            published = datetime.datetime.strptime(
                book['publishDate'][:10], cls.DATE_FORMAT)
        else:
            published = None

        languages = [l['code'] for l in book.get('languages', [])]
        if 'eng' in languages or not languages:
            language = 'eng'
        else:
            language = sorted(languages)[0]

        contributors = []
        for creator in book.get('creators', []):
            sort_name = creator['fileAs']
            display_name = creator['name']
            role = creator['role']
            roles = cls.parse_roles(overdrive_id, role) or [
                Contributor.UNKNOWN_ROLE]
            contributor = ContributorData(
                sort_name=sort_name, display_name=display_name,
                roles=roles, biography=creator.get('bioText', None))
            contributors.append(contributor)

        subjects = []
        for sub in book.get('subjects', []):
            subject = SubjectData(
                type=Subject.OVERDRIVE, identifier=sub['value'],
                weight=100)
            subjects.append(subject)

        for sub in book.get('keywords', []):
            subject = SubjectData(
                type=Subject.TAG, identifier=sub['value'], weight=1)
            subjects.append(subject)

        extra = dict()
        if 'grade_levels' in book:
            # n.b. Grade levels are measurements of reading level, not
            # age appropriateness. We can use them as a measure of age
            # appropriateness in a pinch, but we weight them less
            # heavily than other information from Overdrive.
            for i in book['grade_levels']:
                subject = SubjectData(
                    type=Subject.GRADE_LEVEL, identifier=i['value'],
                    weight=10)
                subjects.append(subject)

        overdrive_medium = book.get('mediaType', None)
        if (overdrive_medium and
                overdrive_medium not in cls.overdrive_medium_to_simplified_medium):
            cls.log.error("Could not process medium %s for %s",
                          overdrive_medium, overdrive_id)

        medium = cls.overdrive_medium_to_simplified_medium.get(
            overdrive_medium, Edition.BOOK_MEDIUM)

        measurements = []
        if 'awards' in book:
            extra['awards'] = book.get('awards', [])
            num_awards = len(extra['awards'])
            measurements.append(
                MeasurementData(Measurement.AWARDS, str(num_awards)))

        for name, subject_type in (
                ('ATOS', Subject.ATOS_SCORE),
                ('lexileScore', Subject.LEXILE_SCORE),
                ('interestLevel', Subject.INTEREST_LEVEL)):
            if name not in book:
                continue
            identifier = str(book[name])
            subjects.append(
                SubjectData(type=subject_type, identifier=identifier,
                            weight=100))

        for grade_level_info in book.get('gradeLevels', []):
            grade_level = grade_level_info.get('value')
            subjects.append(
                SubjectData(type=Subject.GRADE_LEVEL,
                            identifier=grade_level, weight=100))

        identifiers = []
        links = []
        for format in book.get('formats', []):
            for new_id in format.get('identifiers', []):
                t = new_id['type']
                v = new_id['value']
                orig_v = v
                type_key = None
                if t == 'ASIN':
                    type_key = Identifier.ASIN
                elif t == 'ISBN':
                    type_key = Identifier.ISBN
                    if len(v) == 10:
                        v = isbnlib.to_isbn13(v)
                    if v is None or not isbnlib.is_isbn13(v):
                        # Overdrive sometimes uses invalid values like
                        # "n/a" as placeholders. Ignore such values to
                        # avoid a situation where hundreds of books
                        # appear to have the same ISBN. ISBNs that
                        # fail the check-digit check or are otherwise
                        # invalid also occur. Log them for review.
                        cls.log.info("Bad ISBN value provided: %s",
                                     orig_v)
                        continue
                elif t == 'DOI':
                    type_key = Identifier.DOI
                elif t == 'UPC':
                    type_key = Identifier.UPC
                elif t == 'PublisherCatalogNumber':
                    continue
                if type_key and v:
                    identifiers.append(IdentifierData(type_key, v, 1))

            # Samples become links.
            if 'samples' in format:
                if format['id'] not in cls.format_data_for_overdrive_format:
                    # Useless to us.
                    continue
                content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                    format['id'])
                if Representation.is_media_type(content_type):
                    for sample_info in format['samples']:
                        href = sample_info['url']
                        links.append(
                            LinkData(rel=Hyperlink.SAMPLE,
                                     href=href,
                                     media_type=content_type))

        # A cover and its thumbnail become a single LinkData.
        if 'images' in book:
            images = book['images']
            image_data = cls.image_link_to_linkdata(
                images.get('cover'), Hyperlink.IMAGE)
            for name in ['cover300Wide', 'cover150Wide', 'thumbnail']:
                # Try to get a thumbnail that's as close as possible
                # to the size we use.
                image = images.get(name)
                thumbnail_data = cls.image_link_to_linkdata(
                    image, Hyperlink.THUMBNAIL_IMAGE)
                if not image_data:
                    image_data = cls.image_link_to_linkdata(
                        image, Hyperlink.IMAGE)
                if thumbnail_data:
                    break

            if image_data:
                if thumbnail_data:
                    image_data.thumbnail = thumbnail_data
                links.append(image_data)

        # Descriptions become links.
        short = book.get('shortDescription')
        full = book.get('fullDescription')
        if full:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=full,
                    media_type="text/html",
                ))

        if short and (not full or not full.startswith(short)):
            links.append(
                LinkData(
                    rel=Hyperlink.SHORT_DESCRIPTION,
                    content=short,
                    media_type="text/html",
                ))

        # Add measurements: rating and popularity.
        if book.get('starRating') is not None and book['starRating'] > 0:
            measurements.append(
                MeasurementData(quantity_measured=Measurement.RATING,
                                value=book['starRating']))

        if book.get('popularity'):
            measurements.append(
                MeasurementData(quantity_measured=Measurement.POPULARITY,
                                value=book['popularity']))

        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            title=title,
            subtitle=subtitle,
            sort_title=sort_title,
            language=language,
            medium=medium,
            series=series,
            publisher=publisher,
            imprint=imprint,
            published=published,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )
    else:
        metadata = Metadata(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
        )

    if include_formats:
        formats = []
        for format in book.get('formats', []):
            format_id = format['id']
            if format_id in cls.format_data_for_overdrive_format:
                content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                    format_id)
                formats.append(FormatData(content_type, drm_scheme))
            elif format_id not in cls.ignorable_overdrive_formats:
                cls.log.error(
                    "Could not process Overdrive format %s for %s",
                    format_id, overdrive_id)

        # Also make a CirculationData so we can write the formats.
        circulationdata = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=primary_identifier,
            formats=formats,
        )
        metadata.circulation = circulationdata

    return metadata
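# A minimal usage sketch (hypothetical input, not from the original source).
# Only the 'id' key is required; everything else is read with book.get().
# With the default include_formats=True, the returned Metadata also carries
# a CirculationData listing the recognized formats. The class name below is
# assumed to be the extractor class that defines this method:
#
#     book = {
#         "id": "ba9b3419-b0bd-4ca7-a24f-26c4246b6b44",
#         "title": "A Girl Named Disaster",
#         "languages": [{"code": "eng"}],
#         "starRating": 4,
#     }
#     metadata = OverdriveRepresentationExtractor.book_info_to_metadata(book)
#     assert metadata.primary_identifier.type == Identifier.OVERDRIVE_ID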
class ItemListParser(XMLParser):

    DATE_FORMAT = "%Y-%m-%d"
    YEAR_FORMAT = "%Y"

    NAMESPACES = {}

    def parse(self, xml):
        for i in self.process_all(xml, "//Item"):
            yield i

    parenthetical = re.compile(r" \([^)]+\)$")

    @classmethod
    def contributors_from_string(cls, string):
        contributors = []
        if not string:
            return contributors

        for sort_name in string.split(';'):
            sort_name = cls.parenthetical.sub("", sort_name.strip())
            contributors.append(
                ContributorData(
                    sort_name=sort_name.strip(),
                    roles=[Contributor.AUTHOR_ROLE]
                )
            )
        return contributors

    @classmethod
    def parse_genre_string(cls, s):
        genres = []
        if not s:
            return genres
        for i in s.split(","):
            i = i.strip()
            if not i:
                continue
            i = i.replace("&amp;amp;", "&amp;").replace(
                "&amp;", "&").replace("&#39;", "'")
            genres.append(SubjectData(Subject.THREEM, i, weight=15))
        return genres

    def process_one(self, tag, namespaces):
        """Turn an <Item> tag into a Metadata object (with an
        encompassed CirculationData object), and return the Metadata."""

        def value(threem_key):
            return self.text_of_optional_subtag(tag, threem_key)

        links = dict()
        identifiers = dict()
        subjects = []

        primary_identifier = IdentifierData(
            Identifier.THREEM_ID, value("ItemId")
        )

        identifiers = []
        for key in ('ISBN13', 'PhysicalISBN'):
            v = value(key)
            if v:
                identifiers.append(
                    IdentifierData(Identifier.ISBN, v)
                )

        subjects = self.parse_genre_string(value("Genre"))

        title = value("Title")
        subtitle = value("SubTitle")
        publisher = value("Publisher")
        language = value("Language")

        contributors = list(self.contributors_from_string(value('Authors')))

        published_date = None
        published = value("PubDate")
        if published:
            formats = [self.DATE_FORMAT, self.YEAR_FORMAT]
        else:
            published = value("PubYear")
            formats = [self.YEAR_FORMAT]

        for format in formats:
            try:
                published_date = datetime.strptime(published, format)
            except ValueError as e:
                pass

        links = []
        description = value("Description")
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION, content=description)
            )

        cover_url = value("CoverLinkURL").replace("&amp;", "&")
        links.append(LinkData(rel=Hyperlink.IMAGE, href=cover_url))

        alternate_url = value("BookLinkURL").replace("&amp;", "&")
        links.append(LinkData(rel='alternate', href=alternate_url))

        measurements = []
        pages = value("NumberOfPages")
        if pages:
            pages = int(pages)
            measurements.append(
                MeasurementData(quantity_measured=Measurement.PAGE_COUNT,
                                value=pages)
            )

        medium = Edition.BOOK_MEDIUM

        book_format = value("BookFormat")
        format = None
        if book_format == 'EPUB':
            format = FormatData(
                content_type=Representation.EPUB_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'PDF':
            format = FormatData(
                content_type=Representation.PDF_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'MP3':
            format = FormatData(
                content_type=Representation.MP3_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
            medium = Edition.AUDIO_MEDIUM

        formats = [format]

        metadata = Metadata(
            data_source=DataSource.THREEM,
            title=title,
            subtitle=subtitle,
            language=language,
            medium=medium,
            publisher=publisher,
            published=published_date,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )

        # Also make a CirculationData so we can write the formats.
        circulationdata = CirculationData(
            data_source=DataSource.THREEM,
            primary_identifier=primary_identifier,
            formats=formats,
            links=links,
        )

        metadata.circulation = circulationdata
        return metadata
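# A usage sketch for the class-method helpers above (the sample strings are
# hypothetical):
#
#     ItemListParser.contributors_from_string(
#         "Farmer, Nancy; Jordan, Robert (pseud.)")
#     # -> two ContributorData objects with AUTHOR_ROLE; the trailing
#     #    " (pseud.)" is stripped by the `parenthetical` regex.
#
#     ItemListParser.parse_genre_string("Action &amp; Adventure, Africa")
#     # -> two SubjectData objects of type Subject.THREEM, weight 15;
#     #    "&amp;" is unescaped to "&" before the subject is created.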
def extract_bibliographic(self, element, ns):
    """Turn bibliographic metadata into a Metadata object (with an
    encompassed CirculationData object), and return the Metadata."""

    # TODO: These are consistently empty (some are clearly for
    # audiobooks) so I don't know what they do and/or what format
    # they're in.
    #
    # annotation
    # edition
    # narrator
    # runtime

    identifier = self.text_of_subtag(element, 'axis:titleId', ns)
    isbn = self.text_of_optional_subtag(element, 'axis:isbn', ns)
    title = self.text_of_subtag(element, 'axis:productTitle', ns)

    contributor = self.text_of_optional_subtag(
        element, 'axis:contributor', ns)
    contributors = []
    found_primary_author = False
    if contributor:
        for c in self.parse_list(contributor):
            contributor = self.parse_contributor(c, found_primary_author)
            if Contributor.PRIMARY_AUTHOR_ROLE in contributor.roles:
                found_primary_author = True
            contributors.append(contributor)

    subject = self.text_of_optional_subtag(element, 'axis:subject', ns)
    subjects = []
    if subject:
        for subject_identifier in self.parse_list(subject):
            subjects.append(
                SubjectData(type=Subject.BISAC,
                            identifier=subject_identifier,
                            weight=1))

    publication_date = self.text_of_optional_subtag(
        element, 'axis:publicationDate', ns)
    if publication_date:
        publication_date = datetime.datetime.strptime(
            publication_date, self.SHORT_DATE_FORMAT)

    series = self.text_of_optional_subtag(element, 'axis:series', ns)
    publisher = self.text_of_optional_subtag(element, 'axis:publisher', ns)
    imprint = self.text_of_optional_subtag(element, 'axis:imprint', ns)

    audience = self.text_of_optional_subtag(element, 'axis:audience', ns)
    if audience:
        subjects.append(
            SubjectData(
                type=Subject.THETA_AUDIENCE,
                identifier=audience,
                weight=1,
            ))

    language = self.text_of_subtag(element, 'axis:language', ns)

    # We don't use this for anything.
    # file_size = self.int_of_optional_subtag(element, 'theta:fileSize', ns)

    primary_identifier = IdentifierData(Identifier.THETA_ID, identifier)
    identifiers = []
    if isbn:
        identifiers.append(IdentifierData(Identifier.ISBN, isbn))

    formats = []
    acceptable = False
    seen_formats = []
    for format_tag in self._xpath(
            element,
            'axis:availability/axis:availableFormats/axis:formatName',
            ns):
        informal_name = format_tag.text
        seen_formats.append(informal_name)
        if informal_name not in self.DELIVERY_DATA_FOR_THETA_FORMAT:
            self.log.warn("Unrecognized Theta format name for %s: %s" % (
                identifier, informal_name))
        elif self.DELIVERY_DATA_FOR_THETA_FORMAT.get(informal_name):
            content_type, drm_scheme = self.DELIVERY_DATA_FOR_THETA_FORMAT[
                informal_name]
            formats.append(
                FormatData(content_type=content_type,
                           drm_scheme=drm_scheme))

    if not formats:
        self.log.error("No supported format for %s (%s)! Saw: %s",
                       identifier, title, ", ".join(seen_formats))

    metadata = Metadata(
        data_source=DataSource.THETA,
        title=title,
        language=language,
        medium=Edition.BOOK_MEDIUM,
        series=series,
        publisher=publisher,
        imprint=imprint,
        published=publication_date,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        subjects=subjects,
        contributors=contributors,
    )

    circulationdata = CirculationData(
        data_source=DataSource.THETA,
        primary_identifier=primary_identifier,
        formats=formats,
    )

    metadata.circulation = circulationdata
    return metadata
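# A sketch of the XML shape this method expects. The element names come
# directly from the xpath lookups above; the enclosing tag name and sample
# values are hypothetical:
#
#     <axis:title>
#       <axis:titleId>0012533622</axis:titleId>
#       <axis:productTitle>A Girl Named Disaster</axis:productTitle>
#       <axis:contributor>Farmer, Nancy</axis:contributor>
#       <axis:language>ENGLISH</axis:language>
#       <axis:availability>
#         <axis:availableFormats>
#           <axis:formatName>ePub</axis:formatName>
#         </axis:availableFormats>
#       </axis:availability>
#     </axis:title>
#
# extract_bibliographic(element, ns) then returns a Metadata whose
# .circulation carries the FormatData parsed from availableFormats.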
def test_set_metadata_incorporates_replacement_policy(self):
    """Make sure that if a ReplacementPolicy is passed in to
    set_metadata(), the policy's settings (and those of its
    .presentation_calculation_policy) are respected.
    """
    edition, pool = self._edition(with_license_pool=True)
    identifier = edition.primary_identifier

    # All images and open-access content should be uploaded to
    # this 'mirror'.
    mirror = DummyS3Uploader()
    http = DummyHTTPClient()
    http.queue_response(
        200,
        content='I am an epub.',
        media_type=Representation.EPUB_MEDIA_TYPE,
    )

    class Tripwire(PresentationCalculationPolicy):
        # This class sets a variable if one of its properties is
        # accessed.
        def __init__(self, *args, **kwargs):
            self.tripped = False

        def __getattr__(self, name):
            self.tripped = True
            return True

    presentation_calculation_policy = Tripwire()

    metadata_replacement_policy = ReplacementPolicy(
        mirror=mirror,
        http_get=http.do_get,
        presentation_calculation_policy=presentation_calculation_policy)

    circulationdata_replacement_policy = ReplacementPolicy(
        mirror=mirror,
        http_get=http.do_get,
    )

    output_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    provider = CoverageProvider("service", [identifier.type],
                                output_source)

    metadata = Metadata(output_source)

    # We've got a CirculationData object that includes an open-access
    # download.
    link = LinkData(rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
                    href="http://foo.com/")
    circulationdata = CirculationData(
        output_source,
        primary_identifier=metadata.primary_identifier,
        links=[link])

    provider.set_metadata_and_circulation_data(
        identifier, metadata, circulationdata,
        metadata_replacement_policy=metadata_replacement_policy,
        circulationdata_replacement_policy=circulationdata_replacement_policy,
    )

    # The open-access download was 'downloaded' and 'mirrored'.
    [mirrored] = mirror.uploaded
    eq_("http://foo.com/", mirrored.url)
    assert mirrored.mirror_url.endswith(
        "/%s/%s.epub" % (identifier.identifier, edition.title))

    # The book content was removed from the db after it was
    # mirrored successfully.
    eq_(None, mirrored.content)

    # Our custom PresentationCalculationPolicy was used when
    # determining whether to recalculate the work's
    # presentation. We know this because the tripwire was
    # triggered.
    eq_(True, presentation_calculation_policy.tripped)
class TestBibliographicCoverageProvider(DatabaseTest):

    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.OVERDRIVE,
        publisher=u'Perfection Learning',
        language='eng',
        title=u'A Girl Named Disaster',
        published=datetime.datetime(1998, 3, 1, 0, 0),
        primary_identifier=IdentifierData(
            type=Identifier.OVERDRIVE_ID,
            identifier=u'ba9b3419-b0bd-4ca7-a24f-26c4246b6b44'),
        identifiers=[
            IdentifierData(
                type=Identifier.OVERDRIVE_ID,
                identifier=u'ba9b3419-b0bd-4ca7-a24f-26c4246b6b44'),
            IdentifierData(type=Identifier.ISBN,
                           identifier=u'9781402550805')
        ],
        contributors=[
            ContributorData(sort_name=u"Nancy Farmer",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE])
        ],
        subjects=[
            SubjectData(type=Subject.TOPIC,
                        identifier=u'Action & Adventure'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE,
                        identifier=u'Young Adult'),
            SubjectData(type=Subject.PLACE, identifier=u'Africa')
        ],
    )

    CIRCULATION_DATA = CirculationData(
        DataSource.OVERDRIVE,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
    )

    def test_edition(self):
        provider = MockBibliographicCoverageProvider(self._db)
        provider.CAN_CREATE_LICENSE_POOLS = False
        identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA

        # Returns a CoverageFailure if the identifier doesn't have a
        # license pool and none can be created.
        result = provider.work(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        # Returns an Edition otherwise, creating it if necessary.
        edition, lp = self._edition(with_license_pool=True)
        identifier = edition.primary_identifier
        eq_(edition, provider.edition(identifier))

        # The Edition will be created if necessary.
        lp.identifier.primarily_identifies = []
        e2 = provider.edition(identifier)
        assert edition != e2
        assert isinstance(e2, Edition)

    def test_work(self):
        provider = MockBibliographicCoverageProvider(self._db)
        identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA
        provider.CAN_CREATE_LICENSE_POOLS = False

        # Returns a CoverageFailure if the identifier doesn't have a
        # license pool.
        result = provider.work(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        # Returns a CoverageFailure if there's no work available.
        edition, lp = self._edition(with_license_pool=True)
        # Remove the edition so that the work won't be calculated.
        lp.identifier.primarily_identifies = []
        result = provider.work(lp.identifier)
        assert isinstance(result, CoverageFailure)
        eq_("Work could not be calculated", result.exception)

        # Returns the work if it can be created or found.
        ed, lp = self._edition(with_license_pool=True)
        result = provider.work(lp.identifier)
        eq_(result, lp.work)

    def test_set_metadata(self):
        provider = MockBibliographicCoverageProvider(self._db)
        provider.CAN_CREATE_LICENSE_POOLS = False
        identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA
        test_circulationdata = self.CIRCULATION_DATA

        # If there is no LicensePool and it can't be autocreated, a
        # CoverageFailure results.
        result = provider.work(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        edition, lp = self._edition(
            data_source_name=DataSource.OVERDRIVE,
            identifier_type=Identifier.OVERDRIVE_ID,
            identifier_id=self.BIBLIOGRAPHIC_DATA.primary_identifier.identifier,
            with_license_pool=True)

        # If no metadata is passed in, a CoverageFailure results.
        result = provider.set_metadata_and_circulation_data(
            edition.primary_identifier, None, None)
        assert isinstance(result, CoverageFailure)
        eq_("Received neither metadata nor circulation data from input source",
            result.exception)

        # If no work can be created (in this case, because there's no
        # title), a CoverageFailure results.
        edition.title = None
        old_title = test_metadata.title
        test_metadata.title = None
        result = provider.set_metadata_and_circulation_data(
            edition.primary_identifier, test_metadata,
            test_circulationdata)
        assert isinstance(result, CoverageFailure)
        eq_("Work could not be calculated", result.exception)
        test_metadata.title = old_title

        # Test success.
        result = provider.set_metadata_and_circulation_data(
            edition.primary_identifier, test_metadata,
            test_circulationdata)
        eq_(result, edition.primary_identifier)

        # If there's an exception setting the metadata, a
        # CoverageFailure results. This call raises a ValueError
        # because the metadata's primary identifier and the edition's
        # primary identifier don't match.
        test_metadata.primary_identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)
        result = provider.set_metadata_and_circulation_data(
            lp.identifier, test_metadata, test_circulationdata)
        assert isinstance(result, CoverageFailure)
        assert "ValueError" in result.exception

    def test_autocreate_licensepool(self):
        provider = MockBibliographicCoverageProvider(self._db)
        identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)

        # If this constant is set to False, the coverage provider
        # cannot autocreate LicensePools for identifiers.
        provider.CAN_CREATE_LICENSE_POOLS = False
        eq_(None, provider.license_pool(identifier))

        # If it's set to True, the coverage provider can autocreate
        # LicensePools for identifiers.
        provider.CAN_CREATE_LICENSE_POOLS = True
        pool = provider.license_pool(identifier)
        eq_(pool.data_source, provider.output_source)
        eq_(pool.identifier, identifier)

    def test_set_presentation_ready(self):
        provider = MockBibliographicCoverageProvider(self._db)
        identifier = self._identifier(
            identifier_type=Identifier.OVERDRIVE_ID)
        test_metadata = self.BIBLIOGRAPHIC_DATA

        # If the work can't be found, it can't be made presentation
        # ready.
        provider.CAN_CREATE_LICENSE_POOLS = False
        result = provider.set_presentation_ready(identifier)
        assert isinstance(result, CoverageFailure)
        eq_("No license pool available", result.exception)

        # Test success.
        ed, lp = self._edition(with_license_pool=True)
        result = provider.set_presentation_ready(ed.primary_identifier)
        eq_(result, ed.primary_identifier)

    def test_process_batch_sets_work_presentation_ready(self):
        work = self._work(with_license_pool=True,
                          with_open_access_download=True)
        identifier = work.license_pools[0].identifier
        work.presentation_ready = False
        provider = MockBibliographicCoverageProvider(self._db)
        [result] = provider.process_batch([identifier])
        eq_(result, identifier)
        eq_(True, work.presentation_ready)

        # ensure_coverage does the same thing.
        work.presentation_ready = False
        result = provider.ensure_coverage(identifier)
        assert isinstance(result, CoverageRecord)
        eq_(result.identifier, identifier)
        eq_(True, work.presentation_ready)

    def test_failure_does_not_set_work_presentation_ready(self):
        work = self._work(with_license_pool=True,
                          with_open_access_download=True)
        identifier = work.license_pools[0].identifier
        work.presentation_ready = False
        provider = MockFailureBibliographicCoverageProvider(self._db)
        [result] = provider.process_batch([identifier])
        assert isinstance(result, CoverageFailure)
        eq_(False, work.presentation_ready)
def test_open_access_content_mirrored(self):
    # Make sure that open-access material links are translated to our
    # S3 buckets, and that commercial material links are left as is.

    # Note: passing mirroring tests does not guarantee that all code
    # now correctly calls on CirculationData as well as Metadata. This
    # is a risk.

    mirror = DummyS3Uploader()

    # Here's a book.
    edition, pool = self._edition(with_license_pool=True)

    # Here's a link to the content of the book, which will be
    # mirrored.
    link_mirrored = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        href="http://example.com/",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a tiny book"
    )

    # This link will not be mirrored.
    link_unmirrored = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        href="http://example.com/2",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a pricy book"
    )

    # Apply the metadata.
    policy = ReplacementPolicy(mirror=mirror)
    metadata = Metadata(
        data_source=edition.data_source,
        links=[link_mirrored, link_unmirrored],
    )
    metadata.apply(edition, replace=policy)
    # Make sure the refactor is done right, and metadata does not
    # upload.
    eq_(0, len(mirror.uploaded))

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
        links=[link_mirrored, link_unmirrored],
    )
    circulation_data.apply(pool, replace=policy)
    # Make sure the refactor is done right, and circulation does
    # upload.
    eq_(1, len(mirror.uploaded))

    # Only the open-access link has been 'mirrored'.
    [book] = mirror.uploaded

    # It's remained an open-access link.
    eq_(
        [Hyperlink.OPEN_ACCESS_DOWNLOAD],
        [x.rel for x in book.resource.links]
    )

    # It's been 'mirrored' to the appropriate S3 bucket.
    assert book.mirror_url.startswith(
        'http://s3.amazonaws.com/test.content.bucket/')
    expect = '/%s/%s.epub' % (
        edition.primary_identifier.identifier,
        edition.title
    )
    assert book.mirror_url.endswith(expect)

    # Make sure the mirrored link is safely on the edition.
    sorted_edition_links = sorted(
        edition.license_pool.identifier.links, key=lambda x: x.rel)
    unmirrored_representation, mirrored_representation = [
        edlink.resource.representation
        for edlink in sorted_edition_links]
    assert mirrored_representation.mirror_url.startswith(
        'http://s3.amazonaws.com/test.content.bucket/')

    # Make sure the unmirrored link is safely on the edition.
    eq_('http://example.com/2', unmirrored_representation.url)
    # Make sure the unmirrored link has not been translated to an S3
    # URL.
    eq_(None, unmirrored_representation.mirror_url)
def test_image_scale_and_mirror(self):
    # Make sure that cover images are scaled and mirrored to our S3
    # buckets.

    # Note: mirroring links is now also CirculationData's job. So the
    # unit tests that tested for that have been changed to call to
    # mirror cover images. However, updated tests passing does not
    # guarantee that all code now correctly calls on CirculationData,
    # too. This is a risk.

    mirror = DummyS3Uploader()
    edition, pool = self._edition(with_license_pool=True)
    content = open(self.sample_cover_path("test-book-cover.png")).read()
    l1 = LinkData(rel=Hyperlink.IMAGE,
                  href="http://example.com/",
                  media_type=Representation.JPEG_MEDIA_TYPE,
                  content=content)
    thumbnail_content = open(
        self.sample_cover_path("tiny-image-cover.png")).read()
    l2 = LinkData(rel=Hyperlink.THUMBNAIL_IMAGE,
                  href="http://example.com/thumb.jpg",
                  media_type=Representation.JPEG_MEDIA_TYPE,
                  content=thumbnail_content)

    # When we call metadata.apply, all image links will be scaled and
    # 'mirrored'.
    policy = ReplacementPolicy(mirror=mirror)
    metadata = Metadata(links=[l1, l2], data_source=edition.data_source)
    metadata.apply(edition, replace=policy)

    # Two Representations were 'mirrored'.
    image, thumbnail = mirror.uploaded

    # The image...
    [image_link] = image.resource.links
    eq_(Hyperlink.IMAGE, image_link.rel)

    # ...and its thumbnail.
    eq_(image, thumbnail.thumbnail_of)

    # The original image is too big to be a thumbnail.
    eq_(600, image.image_height)
    eq_(400, image.image_width)

    # The thumbnail is the right height.
    eq_(Edition.MAX_THUMBNAIL_HEIGHT, thumbnail.image_height)
    eq_(Edition.MAX_THUMBNAIL_WIDTH, thumbnail.image_width)

    # The thumbnail is newly generated from the full-size
    # image--the thumbnail that came in from the OPDS feed was
    # ignored.
    assert thumbnail.url != l2.href
    assert thumbnail.content != l2.content

    # Both images have been 'mirrored' to Amazon S3.
    assert image.mirror_url.startswith(
        'http://s3.amazonaws.com/test.cover.bucket/')
    assert image.mirror_url.endswith('cover.jpg')

    # The thumbnail image has been converted to PNG.
    assert thumbnail.mirror_url.startswith(
        'http://s3.amazonaws.com/test.cover.bucket/scaled/300/')
    assert thumbnail.mirror_url.endswith('cover.png')
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata
    object.

    Note: the JSON data passed into this method is from a different
    file/stream than the JSON data that goes into the
    book_info_to_circulation() method.
    """
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(sort_name=sort_author,
                                      display_name=display_author,
                                      roles=roles,
                                      biography=None)
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            cls.log.warn('Cannot parse publication date from: ' +
                         published + ', message: ' + e.message)

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update,
                                                     "%Y%m%d")
        except ValueError as e:
            cls.log.warn('Cannot parse last update date from: ' +
                         last_update + ', message: ' + e.message)

    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(
            SubjectData(type=Subject.TAG, identifier=subject,
                        weight=100))

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(
            SubjectData(type=Subject.BISAC,
                        identifier=subjectBisacCode,
                        weight=100))

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(type=Subject.GRADE_LEVEL,
                              identifier=grade_level, weight=10)
        subjects.append(subject)

    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == OdiloAPI.ACSM and file_format:
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats)
        else:
            cls.log.warn('Unrecognized format received: ' +
                         format_received)

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover.
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(
            cover_image_url, Hyperlink.THUMBNAIL_IMAGE)
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(
            original_image_url, Hyperlink.IMAGE)
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(
            LinkData(rel=Hyperlink.DESCRIPTION,
                     content=description,
                     media_type="text/html"))

    metadata = Metadata(data_source=DataSource.ODILO,
                        title=title,
                        subtitle=subtitle,
                        language=language,
                        medium=medium,
                        series=series,
                        series_position=series_position,
                        publisher=publisher,
                        published=published,
                        primary_identifier=primary_identifier,
                        identifiers=identifiers,
                        subjects=subjects,
                        contributors=contributors,
                        links=links,
                        data_source_last_updated=last_update)

    metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
        availability)
    # A book that is not 'active' still exists, but it's no longer in
    # the collection (it could become available again in the future).
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
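# A minimal usage sketch (hypothetical input; the key names follow the
# book.get() calls above, and `availability` stands for whatever JSON
# record_info_to_circulation() expects). Note the two-value return: callers
# get both the Metadata and the 'active' flag, and inactive records come
# back with licenses_owned forced to 0:
#
#     book = {
#         "id": "odilo-record-1",
#         "title": "A Girl Named Disaster",
#         "publicationDate": "19980301",   # parsed with "%Y%m%d"
#         "language": "spa",
#         "active": False,
#     }
#     metadata, active = cls.record_info_to_metadata(book, availability)
#     assert active is False
#     assert metadata.circulation.licenses_owned == 0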