def setup(self):
    super(TestFeedbooksOPDSImporter, self).setup()
    self.http = DummyHTTPClient()
    self.metadata = DummyMetadataClient()
    self.mirrors = dict(
        covers_mirror=MockS3Uploader(),
        books_mirror=MockS3Uploader(),
    )
    self.data_source = DataSource.lookup(self._db, DataSource.FEEDBOOKS)

    # Create a default importer that's good enough for most tests.
    self.collection, self.importer = self._importer()
def test_load_cover_link(self):
    # Create a directory import script with an empty mock filesystem.
    script = MockDirectoryImportScript(self._db, {})

    identifier = self._identifier(Identifier.GUTENBERG_ID, "2345")
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    mirror = MockS3Uploader()
    args = (identifier, gutenberg, "covers", mirror)

    # There is nothing on the mock filesystem, so in this case
    # load_cover_link returns None.
    eq_(None, script.load_cover_link(*args))

    # But we tried.
    eq_(
        ('2345', 'covers', Representation.COMMON_IMAGE_EXTENSIONS,
         'cover image'),
        script._locate_file_args
    )

    # Try another script that has a populated mock filesystem.
    mock_filesystem = {
        'covers': (
            'acover.jpeg', Representation.JPEG_MEDIA_TYPE, "I'm an image."
        )
    }
    script = MockDirectoryImportScript(self._db, mock_filesystem)
    link = script.load_cover_link(*args)
    eq_(Hyperlink.IMAGE, link.rel)
    assert link.href.endswith(
        '/test.cover.bucket/Gutenberg/Gutenberg+ID/2345/2345.jpg'
    )
    eq_(Representation.JPEG_MEDIA_TYPE, link.media_type)
    eq_("I'm an image.", link.content)
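# Several tests in this file rely on MockDirectoryImportScript. What
# follows is a minimal sketch, reconstructed purely from how the tests
# use it, of the behavior it needs: the 'filesystem' is a dict mapping a
# directory name to a (filename, media_type, content) tuple, and
# _locate_file records its arguments so a test can assert that a lookup
# was attempted. The method body and the miss value are assumptions, not
# the project's actual implementation.

class MockDirectoryImportScript(DirectoryImportScript):
    """Sketch: a DirectoryImportScript whose 'filesystem' is a dict
    mapping a directory name to (filename, media_type, content).
    """

    def __init__(self, _db, mock_filesystem=None):
        super(MockDirectoryImportScript, self).__init__(_db)
        self.mock_filesystem = mock_filesystem or {}
        self._locate_file_args = None

    def _locate_file(self, identifier, directory, extensions, file_type):
        # Record the arguments so tests can verify 'but we tried'.
        self._locate_file_args = (
            identifier, directory, extensions, file_type
        )
        # Assumption: a miss is represented as a tuple of Nones, which
        # makes load_cover_link/load_circulation_data return None.
        return self.mock_filesystem.get(directory, (None, None, None))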
def setup(self):
    super(TestIntegrationClientCoverImageCoverageProvider, self).setup()
    mirror = MockS3Uploader()
    replacement_policy = ReplacementPolicy.from_metadata_source(
        mirror=mirror
    )
    self.collection = self._collection(
        protocol=ExternalIntegration.OPDS_FOR_DISTRIBUTORS
    )
    self.provider = IntegrationClientCoverImageCoverageProvider(
        replacement_policy=replacement_policy, collection=self.collection
    )
def test_mirror_open_access_link_mirror_failure(self):
    mirrors = dict(books_mirror=MockS3Uploader(fail=True),
                   covers_mirror=None)
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(200, media_type=Representation.EPUB_MEDIA_TYPE)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The representation was fetched successfully.
    assert None == representation.fetch_exception
    assert representation.fetched_at != None

    # But mirroring failed.
    assert representation.mirror_exception != None
    assert None == representation.mirrored_at
    assert link.media_type == representation.media_type
    assert link.href == representation.url

    # The mirror URL was never set.
    assert None == representation.mirror_url

    # Book content is still there since it wasn't mirrored.
    assert representation.content != None

    # The license pool is suppressed when mirroring fails.
    assert True == pool.suppressed
    assert representation.mirror_exception in pool.license_exception
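# The failure branch above depends entirely on what MockS3Uploader does.
# Here is a minimal sketch, assuming a method name and internals beyond
# the attributes the tests actually read (uploaded, content, fail); the
# real class lives in the project's testing helpers and may differ.

import datetime

class MockS3Uploader(object):
    """Sketch: record 'uploads' in memory instead of talking to S3."""

    def __init__(self, fail=False):
        self.uploaded = []  # Representations passed in, in order
        self.content = []   # their content, snapshotted at upload time
        self.fail = fail

    def mirror_one(self, representation, mirror_url=None):
        # Hypothetical method name; the tests only observe its effects.
        self.uploaded.append(representation)
        self.content.append(representation.content)
        if self.fail:
            # Simulate an S3 failure: mirror_exception is set and
            # mirrored_at stays None, which is exactly what the
            # assertions in the test above check for.
            representation.mirror_exception = "Mock S3 upload failure"
        else:
            representation.mirror_url = mirror_url or representation.url
            representation.mirrored_at = datetime.datetime.utcnow()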
def test_replacement_policy_uses_provided_mirror(self):
    collection = MockOverdriveAPI.mock_collection(self._db)
    mirror = MockS3Uploader()
    replacement_policy = ReplacementPolicy.from_metadata_source(
        mirror=mirror
    )
    api = MockOverdriveAPI(self._db, collection)
    api.queue_collection_token()
    provider = OverdriveBibliographicCoverageProvider(
        collection, replacement_policy=replacement_policy, api_class=api
    )

    # Any resources discovered by Overdrive will be
    # sent through this mirror.
    eq_(mirror, provider.replacement_policy.mirror)

    http = DummyHTTPClient()
    provider.replacement_policy.http_get = http.do_get

    # Now let's try looking up a specific identifier through 'Overdrive'.
    identifier = self._identifier(
        Identifier.OVERDRIVE_ID, "3896665d-9d81-4cac-bd43-ffc5066de1f5"
    )

    body = self.data_file("overdrive/overdrive_metadata.json")
    provider.api.queue_response(200, {}, body)

    test_cover = self.data_file("covers/test-book-cover.png")
    test_small_cover = self.data_file("covers/tiny-image-cover.png")

    # Overdrive's full-sized image -- we will be creating our own
    # thumbnail from this.
    http.queue_response(200, "image/jpeg", {}, test_cover)

    # Overdrive's thumbnail image -- we will not be using this.
    http.queue_response(200, "image/jpeg", {}, test_small_cover)

    record = provider.ensure_coverage(identifier)
    eq_("success", record.status)

    # The full image and the thumbnail have been uploaded to
    # the fake S3.
    full, thumbnail = mirror.uploaded
    eq_(test_cover, full.content)

    # The URLs for the Resource objects are our S3 URLs, not Overdrive's
    # URLs.
    expect = "Overdrive/Overdrive+ID/%s" % identifier.identifier
    for url in [full.mirror_url, thumbnail.mirror_url]:
        assert expect in url
    assert "/scaled/" in thumbnail.mirror_url
    assert "/scaled/" not in full.mirror_url

    # The thumbnail is a newly created image that is not the
    # same as the full image or the test cover.
    assert thumbnail.content != test_small_cover
    assert thumbnail.content != test_cover
def test_load_circulation_data(self):
    # Create a directory import script with an empty mock filesystem.
    script = MockDirectoryImportScript(self._db, {})

    identifier = self._identifier(Identifier.GUTENBERG_ID, "2345")
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    mirror = MockS3Uploader()
    args = (identifier, gutenberg, "ebooks", mirror, "Name of book",
            "rights URI")

    # There is nothing on the mock filesystem, so in this case
    # load_circulation_data returns None.
    eq_(None, script.load_circulation_data(*args))

    # But we tried.
    eq_(
        ('2345', 'ebooks', Representation.COMMON_EBOOK_EXTENSIONS,
         'ebook file'),
        script._locate_file_args
    )

    # Try another script that has a populated mock filesystem.
    mock_filesystem = {
        'ebooks': (
            'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
        )
    }
    script = MockDirectoryImportScript(self._db, mock_filesystem)

    # Now _locate_file finds something on the mock filesystem, and
    # load_circulation_data loads it into a fully populated
    # CirculationData object.
    circulation = script.load_circulation_data(*args)
    eq_(identifier, circulation.primary_identifier(self._db))
    eq_(gutenberg, circulation.data_source(self._db))
    eq_("rights URI", circulation.default_rights_uri)

    # The CirculationData has an open-access link associated with it.
    [link] = circulation.links
    eq_(Hyperlink.OPEN_ACCESS_DOWNLOAD, link.rel)
    assert link.href.endswith(
        '/test.content.bucket/Gutenberg/Gutenberg+ID/2345/Name+of+book.epub'
    )
    eq_(Representation.EPUB_MEDIA_TYPE, link.media_type)
    eq_("I'm an EPUB.", link.content)

    # This open-access link will be made available through a
    # delivery mechanism described by this FormatData.
    [format] = circulation.formats
    eq_(link, format.link)
    eq_(link.media_type, format.content_type)
    eq_(DeliveryMechanism.NO_DRM, format.drm_scheme)
def test_mirror_open_access_link_fetch_failure(self):
    mirrors = dict(books_mirror=MockS3Uploader())
    h = DummyHTTPClient()

    edition, pool = self._edition(with_license_pool=True)

    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy(mirrors=mirrors, http_get=h.do_get)
    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
    )

    link = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        media_type=Representation.EPUB_MEDIA_TYPE,
        href=self._url,
    )

    link_obj, ignore = edition.primary_identifier.add_link(
        rel=link.rel,
        href=link.href,
        data_source=data_source,
        media_type=link.media_type,
        content=link.content,
    )

    h.queue_response(403)

    circulation_data.mirror_link(pool, data_source, link, link_obj, policy)

    representation = link_obj.resource.representation

    # The fetch failed, so we should have a fetch exception but no
    # mirror URL.
    assert representation.fetch_exception != None
    assert None == representation.mirror_exception
    assert None == representation.mirror_url
    assert link.href == representation.url
    assert representation.fetched_at != None
    assert None == representation.mirrored_at

    # The license pool is suppressed when the fetch fails.
    assert True == pool.suppressed
    assert representation.fetch_exception in pool.license_exception
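# Both fetch tests drive HTTP through DummyHTTPClient. Here is a sketch
# of the queue-based contract the calls above imply: queue_response takes
# a status code, an optional media type, optional extra headers, and a
# body, and do_get hands back the next queued response. The signature and
# FIFO ordering are inferred from usage, not taken from the real helper.

class DummyHTTPClient(object):
    """Sketch: serve canned responses instead of doing real HTTP."""

    def __init__(self):
        self.responses = []

    def queue_response(self, status_code, media_type=None,
                       other_headers=None, content=""):
        headers = dict(other_headers or {})
        if media_type:
            headers["content-type"] = media_type
        self.responses.append((status_code, headers, content))

    def do_get(self, url, *args, **kwargs):
        # Assumption: responses are consumed in the order queued, so
        # queueing the full cover before the thumbnail (as in
        # test_replacement_policy_uses_provided_mirror) means the full
        # cover is served to the first fetch.
        return self.responses.pop(0)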
def test_open_access_content_mirrored(self):
    # Make sure that open-access material links are translated to our
    # S3 buckets, and that commercial material links are left as is.

    # Note: these mirroring tests passing does not guarantee that all
    # code now correctly calls on CirculationData, as well as Metadata.
    # This is a risk.
    mirrors = dict(books_mirror=MockS3Uploader(), covers_mirror=None)
    mirror_type = ExternalIntegrationLink.OPEN_ACCESS_BOOKS

    # Here's a book.
    edition, pool = self._edition(with_license_pool=True)

    # Here's a link to the content of the book, which will be mirrored.
    link_mirrored = LinkData(
        rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
        href="http://example.com/",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a tiny book",
    )

    # This link will not be mirrored.
    link_unmirrored = LinkData(
        rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
        href="http://example.com/2",
        media_type=Representation.EPUB_MEDIA_TYPE,
        content="i am a pricy book",
    )

    # Apply the metadata.
    policy = ReplacementPolicy(mirrors=mirrors)
    metadata = Metadata(
        data_source=edition.data_source,
        links=[link_mirrored, link_unmirrored],
    )
    metadata.apply(edition, pool.collection, replace=policy)

    # Make sure the refactor is done right: Metadata does not upload.
    assert 0 == len(mirrors[mirror_type].uploaded)

    circulation_data = CirculationData(
        data_source=edition.data_source,
        primary_identifier=edition.primary_identifier,
        links=[link_mirrored, link_unmirrored],
    )
    circulation_data.apply(self._db, pool.collection, replace=policy)

    # Make sure the refactor is done right: CirculationData does upload.
    assert 1 == len(mirrors[mirror_type].uploaded)

    # Only the open-access link has been 'mirrored'.
    [book] = mirrors[mirror_type].uploaded

    # It's remained an open-access link.
    assert (
        [Hyperlink.OPEN_ACCESS_DOWNLOAD]
        == [x.rel for x in book.resource.links]
    )

    # It's been 'mirrored' to the appropriate S3 bucket.
    assert book.mirror_url.startswith(
        "https://test-content-bucket.s3.amazonaws.com/"
    )
    expect = "/%s/%s.epub" % (
        edition.primary_identifier.identifier, edition.title
    )
    assert book.mirror_url.endswith(expect)

    # Make sure the mirrored link is safely on the edition.
    sorted_edition_links = sorted(
        pool.identifier.links, key=lambda x: x.rel
    )
    unmirrored_representation, mirrored_representation = [
        edlink.resource.representation for edlink in sorted_edition_links
    ]
    assert mirrored_representation.mirror_url.startswith(
        "https://test-content-bucket.s3.amazonaws.com/"
    )

    # Make sure the unmirrored link is safely on the edition.
    assert "http://example.com/2" == unmirrored_representation.url

    # Make sure the unmirrored link has not been translated to an S3 URL.
    assert None == unmirrored_representation.mirror_url
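# One non-obvious detail in the test above: the mirrors dict is built
# with the keyword books_mirror but read back via
# mirrors[ExternalIntegrationLink.OPEN_ACCESS_BOOKS], so the constant
# must be that literal string. This equivalence is inferred from the
# test itself, not from the ExternalIntegrationLink source:

assert ExternalIntegrationLink.OPEN_ACCESS_BOOKS == "books_mirror"

# So the setup line is equivalent to:
mirrors = {
    ExternalIntegrationLink.OPEN_ACCESS_BOOKS: MockS3Uploader(),
    "covers_mirror": None,
}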
def test_records(self):
    integration = self._integration()
    now = utc_now()
    exporter = MARCExporter.from_config(self._default_library)
    annotator = Annotator()
    lane = self._lane("Test Lane", genres=["Mystery"])
    w1 = self._work(genre="Mystery", with_open_access_download=True)
    w2 = self._work(genre="Mystery", with_open_access_download=True)

    search_engine = MockExternalSearchIndex()
    search_engine.bulk_update([w1, w2])

    # If there's a storage protocol but no corresponding storage
    # integration, it raises an exception.
    pytest.raises(Exception, exporter.records, lane, annotator)

    # If there is a storage integration, the output file is mirrored.
    mirror_integration = self._external_integration(
        ExternalIntegration.S3,
        ExternalIntegration.STORAGE_GOAL,
        username="******",
        password="******",
    )

    mirror = MockS3Uploader()
    exporter.records(
        lane,
        annotator,
        mirror_integration,
        mirror=mirror,
        query_batch_size=1,
        upload_batch_size=1,
        search_engine=search_engine,
    )

    # The file was mirrored and a CachedMARCFile was created to track
    # the mirrored file.
    assert 1 == len(mirror.uploaded)
    [cache] = self._db.query(CachedMARCFile).all()
    assert self._default_library == cache.library
    assert lane == cache.lane
    assert mirror.uploaded[0] == cache.representation
    assert None == cache.representation.content
    assert (
        "https://test-marc-bucket.s3.amazonaws.com/%s/%s/%s.mrc" % (
            self._default_library.short_name,
            quote(str(cache.representation.fetched_at)),
            quote(lane.display_name),
        ) == mirror.uploaded[0].mirror_url
    )
    assert None == cache.start_time
    assert cache.end_time > now

    # The content was uploaded in two parts.
    assert 2 == len(mirror.content[0])
    complete_file = b"".join(mirror.content[0])
    records = list(MARCReader(complete_file))
    assert 2 == len(records)

    title_fields = [record.get_fields("245") for record in records]
    titles = [fields[0].get_subfields("a")[0] for fields in title_fields]
    assert set([w1.title, w2.title]) == set(titles)

    assert w1.title in w1.marc_record
    assert w2.title in w2.marc_record

    self._db.delete(cache)

    # It also works with a WorkList instead of a Lane, in which case
    # there will be no lane in the CachedMARCFile.
    worklist = WorkList()
    worklist.initialize(self._default_library, display_name="All Books")

    mirror = MockS3Uploader()
    exporter.records(
        worklist,
        annotator,
        mirror_integration,
        mirror=mirror,
        query_batch_size=1,
        upload_batch_size=1,
        search_engine=search_engine,
    )

    assert 1 == len(mirror.uploaded)
    [cache] = self._db.query(CachedMARCFile).all()
    assert self._default_library == cache.library
    assert None == cache.lane
    assert mirror.uploaded[0] == cache.representation
    assert None == cache.representation.content
    assert (
        "https://test-marc-bucket.s3.amazonaws.com/%s/%s/%s.mrc" % (
            self._default_library.short_name,
            quote(str(cache.representation.fetched_at)),
            quote(worklist.display_name),
        ) == mirror.uploaded[0].mirror_url
    )
    assert None == cache.start_time
    assert cache.end_time > now

    assert 2 == len(mirror.content[0])
    complete_file = b"".join(mirror.content[0])
    records = list(MARCReader(complete_file))
    assert 2 == len(records)

    self._db.delete(cache)

    # If a start time is set, it's used in the mirror URL.
    #
    # (Our mock search engine returns everything in its 'index', so
    # this doesn't test that the start time is actually used to find
    # works -- that's in the search index tests and the tests of
    # MARCExporterFacets.)
    start_time = now - datetime.timedelta(days=3)

    mirror = MockS3Uploader()
    exporter.records(
        lane,
        annotator,
        mirror_integration,
        start_time=start_time,
        mirror=mirror,
        query_batch_size=2,
        upload_batch_size=2,
        search_engine=search_engine,
    )
    [cache] = self._db.query(CachedMARCFile).all()

    assert self._default_library == cache.library
    assert lane == cache.lane
    assert mirror.uploaded[0] == cache.representation
    assert None == cache.representation.content
    assert (
        "https://test-marc-bucket.s3.amazonaws.com/%s/%s-%s/%s.mrc" % (
            self._default_library.short_name,
            quote(str(start_time)),
            quote(str(cache.representation.fetched_at)),
            quote(lane.display_name),
        ) == mirror.uploaded[0].mirror_url
    )
    assert start_time == cache.start_time
    assert cache.end_time > now
    self._db.delete(cache)

    # If the search engine returns no contents for the lane, nothing
    # will be mirrored, but a CachedMARCFile is still created to track
    # that we checked for updates.
    empty_search_engine = MockExternalSearchIndex()

    mirror = MockS3Uploader()
    exporter.records(
        lane,
        annotator,
        mirror_integration,
        mirror=mirror,
        search_engine=empty_search_engine,
    )

    assert [] == mirror.content[0]
    [cache] = self._db.query(CachedMARCFile).all()
    assert cache.representation == mirror.uploaded[0]
    assert self._default_library == cache.library
    assert lane == cache.lane
    assert None == cache.representation.content
    assert None == cache.start_time
    assert cache.end_time > now

    self._db.delete(cache)
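# The three mirror-URL assertions in test_records all encode the same
# naming convention. As a readable summary, here is a hypothetical
# helper (not part of MARCExporter's API) that reproduces it: one path
# segment for the library's short name, one for the time window
# (fetched_at alone, or start_time-fetched_at when a start time was
# given), and the quoted lane or worklist name as the filename.

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote        # Python 2

def expected_marc_url(bucket, short_name, fetched_at, name,
                      start_time=None):
    """Hypothetical helper mirroring the assertions above."""
    if start_time is not None:
        window = "%s-%s" % (quote(str(start_time)),
                            quote(str(fetched_at)))
    else:
        window = quote(str(fetched_at))
    return "https://%s.s3.amazonaws.com/%s/%s/%s.mrc" % (
        bucket, short_name, window, quote(name)
    )

# e.g. the first assertion in test_records is equivalent to:
#   expected_marc_url("test-marc-bucket", library.short_name,
#                     cache.representation.fetched_at, lane.display_name)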
def test_work_from_metadata(self):
    """Validate the ability to create a new Work from appropriate
    metadata.
    """

    class Mock(MockDirectoryImportScript):
        """In this test we need to verify that annotate_metadata
        was called but did nothing.
        """
        def annotate_metadata(self, metadata, *args, **kwargs):
            metadata.annotated = True
            return super(Mock, self).annotate_metadata(
                metadata, *args, **kwargs
            )

    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1003")
    identifier_obj, ignore = identifier.load(self._db)
    metadata = Metadata(
        DataSource.GUTENBERG,
        primary_identifier=identifier,
        title=u"A book"
    )
    metadata.annotated = False
    datasource = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy.from_license_source(self._db)
    mirror = MockS3Uploader()
    policy.mirror = mirror

    # Here, work_from_metadata calls annotate_metadata, but does not
    # actually import anything because there are no files 'on disk'
    # and thus no way to actually get the book.
    collection = self._default_collection
    args = (collection, metadata, policy, "cover directory",
            "ebook directory", RightsStatus.CC0)
    script = Mock(self._db)
    eq_(None, script.work_from_metadata(*args))
    eq_(True, metadata.annotated)

    # Now let's try it with some files 'on disk'.
    with open(self.sample_cover_path('test-book-cover.png'), 'rb') as fh:
        image = fh.read()
    mock_filesystem = {
        'cover directory': (
            'cover.jpg', Representation.JPEG_MEDIA_TYPE, image
        ),
        'ebook directory': (
            'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
        )
    }
    script = MockDirectoryImportScript(
        self._db, mock_filesystem=mock_filesystem
    )
    work = script.work_from_metadata(*args)

    # We have created a book. It has a cover image, which has a
    # thumbnail.
    eq_("A book", work.title)
    assert work.cover_full_url.endswith(
        '/test.cover.bucket/Gutenberg/Gutenberg+ID/1003/1003.jpg'
    )
    assert work.cover_thumbnail_url.endswith(
        '/test.cover.bucket/scaled/300/Gutenberg/Gutenberg+ID/1003/1003.png'
    )
    [pool] = work.license_pools
    assert pool.open_access_download_url.endswith(
        '/test.content.bucket/Gutenberg/Gutenberg+ID/1003/A+book.epub'
    )
    eq_(RightsStatus.CC0, pool.delivery_mechanisms[0].rights_status.uri)

    # The mock S3Uploader has a record of 'uploading' all these files
    # to S3.
    epub, full, thumbnail = mirror.uploaded
    eq_(epub.url, pool.open_access_download_url)
    eq_(full.url, work.cover_full_url)
    eq_(thumbnail.url, work.cover_thumbnail_url)

    # The EPUB Representation was cleared out after the upload, to
    # save database space.
    eq_("I'm an EPUB.", mirror.content[0])
    eq_(None, epub.content)