def test_non_open_access_book_not_mirrored(self):
    # mirror_link() must leave a link alone when its rights status says
    # the book is in copyright, whatever the link's rel claims.
    uploader = DummyS3Uploader(fail=True)
    http_client = DummyHTTPClient()
    replacement_policy = ReplacementPolicy(
        mirror=uploader, http_get=http_client.do_get
    )

    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    metadata = Metadata(data_source=gutenberg)

    copyrighted_link = LinkData(
        href="http://example.com/",
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        rights_uri=RightsStatus.IN_COPYRIGHT,
        content="foo",
    )

    book_identifier = self._identifier()
    hyperlink, ignored = book_identifier.add_link(
        rel=copyrighted_link.rel,
        href=copyrighted_link.href,
        data_source=gutenberg,
        media_type=copyrighted_link.media_type,
        content=copyrighted_link.content,
    )

    # Judging by the Hyperlink alone this would be an open-access
    # book; the LinkData built from the OPDS feed knows better.
    metadata.mirror_link(
        None, gutenberg, copyrighted_link, hyperlink, replacement_policy
    )

    # The HTTP client was never asked for anything...
    eq_([], http_client.requests)

    # ...and the mirror never received an upload.
    eq_([], uploader.uploaded)
def test_mirror_404_error(self):
    # A cover image whose download comes back 404 is recorded with that
    # status code and is never handed to the mirror.
    uploader = DummyS3Uploader()
    http_client = DummyHTTPClient()
    http_client.queue_response(404)
    replacement_policy = ReplacementPolicy(
        mirror=uploader, http_get=http_client.do_get
    )

    edition, pool = self._edition(with_license_pool=True)
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)

    cover_link = LinkData(
        href="http://example.com/",
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
    )
    hyperlink, ignored = edition.primary_identifier.add_link(
        rel=cover_link.rel,
        href=cover_link.href,
        data_source=gutenberg,
        license_pool=pool,
        media_type=cover_link.media_type,
        content=cover_link.content,
    )

    metadata = Metadata(data_source=gutenberg)
    metadata.mirror_link(
        edition, gutenberg, cover_link, hyperlink, replacement_policy
    )

    # The 404 was recorded, no mirror URL was assigned, and nothing
    # reached the uploader.
    representation = hyperlink.resource.representation
    eq_(404, representation.status_code)
    eq_(None, representation.mirror_url)
    eq_([], uploader.uploaded)
def test_mirror_open_access_link_mirror_failure(self):
    # A cover image that is fetched successfully but cannot be uploaded
    # to the mirror keeps its content, records the mirror exception, and
    # does not cause the license pool to be suppressed.
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    # This uploader is configured to fail every upload.
    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()

    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    # Read the sample image as bytes and close the file deterministically
    # instead of leaking the handle.
    with open(self.sample_cover_path("test-book-cover.png"), "rb") as cover:
        content = cover.read()
    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
        content=content,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.JPEG_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at is not None

    # But mirroring failed.
    assert representation.mirror_exception is not None
    eq_(None, representation.mirrored_at)
    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/cover.jpg" % edition.primary_identifier.identifier)

    # Book content is still there since it wasn't mirrored.
    assert representation.content is not None

    # The edition's identifier-associated license pool should not be
    # suppressed just because mirroring a cover image failed.
    eq_(False, pool.suppressed)

    # A failure on an image link leaves the license pool's
    # license_exception column empty.
    eq_(None, pool.license_exception)
def test_mirror_with_content_modifier(self):
    # When the ReplacementPolicy carries a content_modifier, the
    # modified content is what gets uploaded to the mirror.
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()

    def dummy_content_modifier(representation):
        # Overwrite whatever was fetched so the upload is recognizable.
        representation.content = "Replaced Content"

    h = DummyHTTPClient()

    policy = ReplacementPolicy(
        mirror=mirror,
        content_modifier=dummy_content_modifier,
        http_get=h.do_get,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href="http://example.com/test.epub",
        content="I'm an epub",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at is not None

    # The mirror url is set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith(
        "%s/%s.epub" % (edition.primary_identifier.identifier, edition.title))

    # Content isn't there since it was mirrored.
    eq_(None, representation.content)

    # The representation was mirrored, with the modified content.
    eq_([representation], mirror.uploaded)
    eq_(["Replaced Content"], mirror.content)
def test_mirror_open_access_link_mirror_failure(self):
    # When CirculationData fetches an open-access epub but cannot upload
    # it to the mirror, the content is kept, the mirror exception is
    # recorded, and the license pool is suppressed.

    # This uploader is configured to fail every upload.
    mirror = DummyS3Uploader(fail=True)
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    eq_(None, representation.fetch_exception)
    assert representation.fetched_at is not None

    # But mirroring failed.
    assert representation.mirror_exception is not None
    eq_(None, representation.mirrored_at)
    eq_(link.media_type, representation.media_type)
    eq_(link.href, representation.url)

    # The mirror url should still be set.
    assert "Gutenberg" in representation.mirror_url
    assert representation.mirror_url.endswith("%s.epub" % edition.title)

    # Book content is still there since it wasn't mirrored.
    assert representation.content is not None

    # The license pool is suppressed when mirroring fails, and the
    # mirror exception is reported in its license_exception.
    eq_(True, pool.suppressed)
    assert representation.mirror_exception in pool.license_exception
def test_mirror_open_access_link_fetch_failure(self):
    # When fetching a cover image fails (HTTP 403), the fetch exception
    # is recorded, nothing is mirrored, and the license pool is left
    # unsuppressed.
    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    m = Metadata(data_source=data_source)

    mirror = DummyS3Uploader()
    h = DummyHTTPClient()

    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    link = LinkData(
        rel=Hyperlink.IMAGE,
        media_type=Representation.JPEG_MEDIA_TYPE,
        href="http://example.com/",
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(403)

    m.mirror_link(edition, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception is not None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(link.href, representation.url)
    assert representation.fetched_at is not None
    eq_(None, representation.mirrored_at)

    # The edition's identifier-associated license pool should not be
    # suppressed just because fetch failed on getting image.
    eq_(False, pool.suppressed)

    # A failure on an image link leaves the license pool's
    # license_exception column empty.
    eq_(None, pool.license_exception)
def test_mirror_open_access_link_fetch_failure(self):
    # When CirculationData cannot fetch an open-access epub (HTTP 403),
    # the fetch exception is recorded, nothing is mirrored, and the
    # license pool is suppressed.
    mirror = DummyS3Uploader()
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirror=mirror, http_get=h.do_get)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        license_pool=pool,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(403)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # Fetch failed, so we should have a fetch exception but no mirror url.
    assert representation.fetch_exception is not None
    eq_(None, representation.mirror_exception)
    eq_(None, representation.mirror_url)
    eq_(link.href, representation.url)
    assert representation.fetched_at is not None
    eq_(None, representation.mirrored_at)

    # The license pool is suppressed when fetch fails, and the fetch
    # exception is reported in its license_exception.
    eq_(True, pool.suppressed)
    assert representation.fetch_exception in pool.license_exception
def test_image_scale_and_mirror(self):
    # Make sure that Metadata.apply() 'mirrors' a full-size cover image
    # and a thumbnail scaled down from it, ignoring any thumbnail link
    # supplied alongside the image.

    # Note: mirroring links is now also CirculationData's job.  So the unit tests
    # that test for that have been changed to call to mirror cover images.
    # However, updated tests passing does not guarantee that all code now
    # correctly calls on CirculationData, too.  This is a risk.

    mirror = DummyS3Uploader()
    edition, pool = self._edition(with_license_pool=True)

    # Read the sample images as bytes and close each file deterministically
    # instead of leaking the handles.
    with open(self.sample_cover_path("test-book-cover.png"), "rb") as cover:
        content = cover.read()
    l1 = LinkData(
        rel=Hyperlink.IMAGE,
        href="http://example.com/",
        media_type=Representation.JPEG_MEDIA_TYPE,
        content=content,
    )

    with open(self.sample_cover_path("tiny-image-cover.png"), "rb") as tiny:
        thumbnail_content = tiny.read()
    # The thumbnail link carries its own (tiny) image, so that the
    # "thumbnail from the feed was ignored" assertions below compare
    # against content that really differs from the full-size cover.
    l2 = LinkData(
        rel=Hyperlink.THUMBNAIL_IMAGE,
        href="http://example.com/thumb.jpg",
        media_type=Representation.JPEG_MEDIA_TYPE,
        content=thumbnail_content,
    )

    # When we call metadata.apply, all image links will be scaled and
    # 'mirrored'.
    policy = ReplacementPolicy(mirror=mirror)
    metadata = Metadata(links=[l1, l2], data_source=edition.data_source)
    metadata.apply(edition, replace=policy)

    # Two Representations were 'mirrored'.
    image, thumbnail = mirror.uploaded

    # The image...
    [image_link] = image.resource.links
    eq_(Hyperlink.IMAGE, image_link.rel)

    # And its thumbnail.
    eq_(image, thumbnail.thumbnail_of)

    # The original image is too big to be a thumbnail.
    eq_(600, image.image_height)
    eq_(400, image.image_width)

    # The thumbnail is the right height.
    eq_(Edition.MAX_THUMBNAIL_HEIGHT, thumbnail.image_height)
    eq_(Edition.MAX_THUMBNAIL_WIDTH, thumbnail.image_width)

    # The thumbnail is newly generated from the full-size
    # image--the thumbnail that came in from the OPDS feed was
    # ignored.
    assert thumbnail.url != l2.href
    assert thumbnail.content != l2.content

    # Both images have been 'mirrored' to Amazon S3.
    assert image.mirror_url.startswith(
        'http://s3.amazonaws.com/test.cover.bucket/')
    assert image.mirror_url.endswith('cover.jpg')

    # The thumbnail image has been converted to PNG.
    assert thumbnail.mirror_url.startswith(
        'http://s3.amazonaws.com/test.cover.bucket/scaled/300/')
    assert thumbnail.mirror_url.endswith('cover.png')
def test_open_access_content_mirrored(self):
    # Open-access links should end up pointing at our S3 buckets, while
    # commercial (DRM-encrypted) links must be left untouched.

    # Note: these mirroring tests passing is no guarantee that every
    # caller now goes through CirculationData as well as Metadata.
    # This is a risk.
    uploader = DummyS3Uploader()

    # A book to work with.
    edition, pool = self._edition(with_license_pool=True)

    # The book's open-access content; this link will be mirrored.
    link_mirrored = LinkData(
        href="http://example.com/",
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a tiny book",
    )

    # A DRM-encrypted link, which will not be mirrored.
    link_unmirrored = LinkData(
        href="http://example.com/2",
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a pricy book",
    )
    both_links = [link_mirrored, link_unmirrored]

    # Applying the Metadata must not upload anything: mirroring book
    # content was refactored out of Metadata.
    replacement_policy = ReplacementPolicy(mirror=uploader)
    metadata = Metadata(
        data_source=edition.data_source,
        links=both_links,
    )
    metadata.apply(edition, replace=replacement_policy)
    eq_(0, len(uploader.uploaded))

    # Applying the CirculationData is what performs the upload.
    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
        links=both_links,
    )
    circulation_data.apply(pool, replace=replacement_policy)
    eq_(1, len(uploader.uploaded))

    # The single upload is the open-access book...
    [book] = uploader.uploaded

    # ...whose link is still an open-access link...
    book_rels = [hyperlink.rel for hyperlink in book.resource.links]
    eq_([Hyperlink.OPEN_ACCESS_DOWNLOAD], book_rels)

    # ...and which went to the appropriate S3 bucket.
    assert book.mirror_url.startswith(
        'http://s3.amazonaws.com/test.content.bucket/')
    expected_suffix = '/%s/%s.epub' % (
        edition.primary_identifier.identifier,
        edition.title
    )
    assert book.mirror_url.endswith(expected_suffix)

    # Both links are attached to the edition's identifier.  Sorted by
    # rel, the unmirrored link comes first.
    ordered_links = sorted(
        edition.license_pool.identifier.links,
        key=lambda hyperlink: hyperlink.rel
    )
    representations = [
        hyperlink.resource.representation for hyperlink in ordered_links
    ]
    unmirrored_representation, mirrored_representation = representations

    # The mirrored link is safely on the edition.
    assert mirrored_representation.mirror_url.startswith(
        'http://s3.amazonaws.com/test.content.bucket/')

    # So is the unmirrored link, with its original URL...
    eq_('http://example.com/2', unmirrored_representation.url)

    # ...which has not been translated to an S3 URL.
    eq_(None, unmirrored_representation.mirror_url)
def test_set_metadata_incorporates_replacement_policy(self):
    """A ReplacementPolicy handed to set_metadata() must be honored,
    including the settings of its .presentation_calculation_policy.
    """
    edition, pool = self._edition(with_license_pool=True)
    identifier = edition.primary_identifier

    # Every image and every piece of open-access content is supposed
    # to land in this 'mirror'.
    uploader = DummyS3Uploader()

    http_client = DummyHTTPClient()
    http_client.queue_response(
        200,
        media_type=Representation.EPUB_MEDIA_TYPE,
        content='I am an epub.',
    )

    class Tripwire(PresentationCalculationPolicy):
        # Records the fact that any of its properties was looked up.
        def __init__(self, *args, **kwargs):
            self.tripped = False

        def __getattr__(self, name):
            self.tripped = True
            return True

    tripwire = Tripwire()

    metadata_replacement_policy = ReplacementPolicy(
        mirror=uploader,
        http_get=http_client.do_get,
        presentation_calculation_policy=tripwire,
    )

    circulationdata_replacement_policy = ReplacementPolicy(
        mirror=uploader,
        http_get=http_client.do_get,
    )

    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    provider = CoverageProvider("service", [identifier.type], gutenberg)

    metadata = Metadata(gutenberg)
    # The CirculationData carries a single open-access download link.
    oa_link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD, href="http://foo.com/"
    )
    circulationdata = CirculationData(
        gutenberg,
        primary_identifier=metadata.primary_identifier,
        links=[oa_link],
    )

    provider.set_metadata_and_circulation_data(
        identifier,
        metadata,
        circulationdata,
        metadata_replacement_policy=metadata_replacement_policy,
        circulationdata_replacement_policy=circulationdata_replacement_policy,
    )

    # The open-access download was 'downloaded' and 'mirrored'.
    [mirrored] = uploader.uploaded
    eq_("http://foo.com/", mirrored.url)
    expected_suffix = "/%s/%s.epub" % (identifier.identifier, edition.title)
    assert mirrored.mirror_url.endswith(expected_suffix)

    # After a successful mirror the book content is dropped from the
    # database.
    eq_(None, mirrored.content)

    # The tripwire went off, which shows that our custom
    # PresentationCalculationPolicy was consulted when deciding
    # whether to recalculate the work's presentation.
    eq_(True, tripwire.tripped)
def test_resources_are_mirrored_on_import(self):
    # Importing an OPDS feed mirrors the open-access books and cover
    # images it links to; re-importing only mirrors them again when the
    # remote content has changed.
    svg = """<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd"> <svg xmlns="http://www.w3.org/2000/svg" width="1000" height="500"> <ellipse cx="50" cy="25" rx="50" ry="25" style="fill:blue;"/> </svg>"""

    http = DummyHTTPClient()
    # NOTE: the responses are queued in the reverse of the order in
    # which the requests are expected to be made (compare with the
    # http.requests assertion below).

    # The request to http://root/full-cover-image.png
    # will result in a 404 error, and the image will not be mirrored.
    http.queue_response(404, media_type="text/plain")
    http.queue_response(
        200, content='I am 10557.epub.images',
        media_type=Representation.EPUB_MEDIA_TYPE,
    )
    http.queue_response(
        200, content=svg, media_type=Representation.SVG_MEDIA_TYPE
    )
    http.queue_response(
        200, content='I am 10441.epub.images',
        media_type=Representation.EPUB_MEDIA_TYPE
    )

    s3 = DummyS3Uploader()

    importer = OPDSImporter(
        self._db, data_source_name=DataSource.OA_CONTENT_SERVER,
        mirror=s3, http_get=http.do_get
    )

    imported_editions, pools, works, failures = (
        importer.import_from_feed(
            self.content_server_mini_feed, feed_url='http://root'
        )
    )
    e1 = imported_editions[0]
    e2 = imported_editions[1]

    # The import process requested each remote resource in the
    # order they appeared in the OPDS feed. The thumbnail
    # image was not requested, since we were going to make our own
    # thumbnail anyway.
    eq_(http.requests, [
        'http://www.gutenberg.org/ebooks/10441.epub.images',
        'https://s3.amazonaws.com/book-covers.nypl.org/Gutenberg-Illustrated/10441/cover_10441_9.png',
        'http://www.gutenberg.org/ebooks/10557.epub.images',
        'http://root/full-cover-image.png',
    ])

    # Sort the links by rel so the unpacking does not depend on the
    # order in which the relationship happens to return them.  As the
    # first unpack shows, the open-access link sorts before the image.
    [e1_oa_link, e1_image_link, e1_description_link] = sorted(
        e1.primary_identifier.links, key=lambda x: x.rel
    )
    [e2_oa_link, e2_image_link] = sorted(
        e2.primary_identifier.links, key=lambda x: x.rel
    )

    # The two open-access links were mirrored to S3, as was the
    # original SVG image and its PNG thumbnail. The PNG image was
    # not mirrored because our attempt to download it resulted in
    # a 404 error.
    imported_representations = [
        e1_oa_link.resource.representation,
        e1_image_link.resource.representation,
        e1_image_link.resource.representation.thumbnails[0],
        e2_oa_link.resource.representation,
    ]
    eq_(imported_representations, s3.uploaded)

    eq_(4, len(s3.uploaded))
    eq_("I am 10441.epub.images", s3.content[0])
    eq_(svg, s3.content[1])
    eq_("I am 10557.epub.images", s3.content[3])

    # Each resource was 'mirrored' to an Amazon S3 bucket.
    #
    # The "mouse" book was mirrored to a bucket corresponding to
    # Project Gutenberg, its data source.
    #
    # The images were mirrored to a bucket corresponding to the
    # open-access content server, _their_ data source.
    #
    # The "crow" book was mirrored to a bucket corresponding to
    # the open-access content source, the default data source used
    # when no distributor was specified for a book.
    url0 = 'http://s3.amazonaws.com/test.content.bucket/Gutenberg/Gutenberg%20ID/10441/The%20Green%20Mouse.epub.images'
    url1 = u'http://s3.amazonaws.com/test.cover.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
    url2 = u'http://s3.amazonaws.com/test.cover.bucket/scaled/300/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10441/cover_10441_9.png'
    url3 = 'http://s3.amazonaws.com/test.content.bucket/Library%20Simplified%20Open%20Access%20Content%20Server/Gutenberg%20ID/10557/Johnny%20Crow%27s%20Party.epub.images'
    uploaded_urls = [x.mirror_url for x in s3.uploaded]
    eq_([url0, url1, url2, url3], uploaded_urls)

    # If we fetch the feed again and the content of the open access
    # links hasn't changed (every request is answered with a 304), we
    # won't mirror them again.
    http.queue_response(
        304, media_type=Representation.EPUB_MEDIA_TYPE
    )
    http.queue_response(
        304, media_type=Representation.SVG_MEDIA_TYPE
    )
    http.queue_response(
        304, media_type=Representation.EPUB_MEDIA_TYPE
    )

    imported_editions, pools, works, failures = (
        importer.import_from_feed(self.content_server_mini_feed)
    )

    eq_([e1, e2], imported_editions)

    # Nothing new has been uploaded
    eq_(4, len(s3.uploaded))

    # If the content has changed, it will be mirrored again.
    http.queue_response(
        200, content="I am a new version of 10557.epub.images",
        media_type=Representation.EPUB_MEDIA_TYPE
    )
    http.queue_response(
        200, content=svg,
        media_type=Representation.SVG_MEDIA_TYPE
    )
    http.queue_response(
        200, content="I am a new version of 10441.epub.images",
        media_type=Representation.EPUB_MEDIA_TYPE
    )

    imported_editions, pools, works, failures = (
        importer.import_from_feed(self.content_server_mini_feed)
    )

    eq_([e1, e2], imported_editions)
    eq_(8, len(s3.uploaded))
    eq_("I am a new version of 10441.epub.images", s3.content[4])
    eq_(svg, s3.content[5])
    eq_("I am a new version of 10557.epub.images", s3.content[7])