def create_missing_books(self, subset=None):
    """Finds books present in the PG catalog but missing from Edition.

    Yields (Edition, LicensePool) 2-tuples.
    """
    books = list(self.all_books())
    for pg_id, archive, archive_item in books:
        if subset is not None and not subset(pg_id, archive, archive_item):
            continue
        self.log.info("Considering %s" % pg_id)

        # Find an existing Edition for the book.
        book = Edition.for_foreign_id(
            self._db, self.source, Identifier.GUTENBERG_ID, pg_id,
            create_if_not_exists=False)

        if not book:
            # Create a new Edition object with bibliographic
            # information from the Project Gutenberg RDF file.
            fh = archive.extractfile(archive_item)
            data = fh.read()
            fake_fh = StringIO(data)
            book, license, new = GutenbergRDFExtractor.book_in(
                self.collection, pg_id, fake_fh)

            if book and license:
                yield (book, license)

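# Hedged usage sketch (not from the source): `monitor` stands in for whatever
# object defines create_missing_books(), and `wanted_ids` is a hypothetical
# filter set. Only the signature above, and the fact that the subset callable
# receives (pg_id, archive, archive_item) and returns a truthy value to keep
# the book, are taken from the code.
wanted_ids = {"84", "1342"}

def only_wanted(pg_id, archive, archive_item):
    # Keep only the Project Gutenberg IDs we care about.
    return pg_id in wanted_ids

for edition, license_pool in monitor.create_missing_books(subset=only_wanted):
    # Each yielded pair is a newly created Edition and its LicensePool.
    print(edition, license_pool)
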
def update_licensepool_with_book_info(self, book, license_pool, is_new_pool):
    """Update a book's LicensePool with information from a JSON
    representation of its circulation info.

    Then, create an Edition and make sure it has bibliographic
    coverage. If the new Edition is the only candidate for the
    pool's presentation_edition, promote it to presentation status.
    """
    circulation = OverdriveRepresentationExtractor.book_info_to_circulation(
        book
    )
    license_pool, circulation_changed = circulation.apply(license_pool)

    edition, is_new_edition = Edition.for_foreign_id(
        self._db, self.source, license_pool.identifier.type,
        license_pool.identifier.identifier)

    # If the pool does not already have a presentation edition,
    # and if this edition is newly made, then associate pool and edition
    # as presentation_edition.
    if (not license_pool.presentation_edition) and is_new_edition:
        edition_changed = license_pool.set_presentation_edition(
            policy=None
        )

    if is_new_pool:
        license_pool.open_access = False
        self.log.info("New Overdrive book discovered: %r", edition)

    return license_pool, is_new_pool, circulation_changed

def update_licensepool_with_book_info(self, book, license_pool, is_new_pool):
    """Update a book's LicensePool with information from a JSON
    representation of its circulation info.

    Then, create an Edition and make sure it has bibliographic
    coverage. If the new Edition is the only candidate for the
    pool's presentation_edition, promote it to presentation status.
    """
    circulation = OverdriveRepresentationExtractor.book_info_to_circulation(
        book)
    license_pool, circulation_changed = circulation.apply(
        self._db, license_pool.collection)

    edition, is_new_edition = Edition.for_foreign_id(
        self._db, self.source, license_pool.identifier.type,
        license_pool.identifier.identifier)

    # If the pool does not already have a presentation edition,
    # and if this edition is newly made, then associate pool and edition
    # as presentation_edition.
    if (not license_pool.presentation_edition) and is_new_edition:
        edition_changed = license_pool.set_presentation_edition()

    if is_new_pool:
        license_pool.open_access = False
        self.log.info("New Overdrive book discovered: %r", edition)

    return license_pool, is_new_pool, circulation_changed

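# Hedged usage sketch (not from the source): `api` stands in for the object
# that defines update_licensepool_with_book_info(), and `book_info` for the
# JSON (dict) circulation data Overdrive returns for one title. Only the
# call signature and return tuple above are assumed.
license_pool, is_new_pool, circulation_changed = api.update_licensepool_with_book_info(
    book_info, license_pool, is_new_pool
)
if circulation_changed:
    # The pool's circulation data was modified; a caller would typically
    # commit the session or kick off further processing here.
    pass
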
def process_item(self, identifier):
    """Associate bibliographic metadata with the given Identifier.

    :param identifier: Look up this Identifier on Content Cafe.
    """
    try:
        # Create a Metadata object.
        metadata = self.content_cafe.create_metadata(identifier)
        if not metadata:
            # TODO: The only time this is really a transient error
            # is when the book is too new for Content Cafe to know
            # about it, which isn't often. It would be best to
            # keep this as a transient failure but give it a relatively
            # long and exponentially increasing retry time.
            return self.failure(
                identifier,
                "Content Cafe has no knowledge of this identifier.",
                transient=True
            )
        edition, is_new = Edition.for_foreign_id(
            self._db, self.data_source, identifier.type,
            identifier.identifier
        )

        # We're passing in collection=None even though we
        # technically have a Collection available, because our
        # goal is to add metadata for the book without reference
        # to any particular collection.
        metadata.apply(
            edition,
            collection=None,
            replace=self.replacement_policy
        )
        return identifier
    except Exception as e:
        self.log.error('Coverage error for %r', identifier, exc_info=e)
        return self.failure(identifier, repr(e), transient=True)

def process_item(self, identifier):
    """Associate bibliographic metadata with the given Identifier.

    :param identifier: Look up this Identifier on Content Cafe.
    """
    try:
        # Create a Metadata object.
        metadata = self.content_cafe.create_metadata(identifier)
        if not metadata:
            # TODO: The only time this is really a transient error
            # is when the book is too new for Content Cafe to know
            # about it, which isn't often. It would be best to
            # keep this as a transient failure but give it a relatively
            # long and exponentially increasing retry time.
            return self.failure(
                identifier,
                "Content Cafe has no knowledge of this identifier.",
                transient=True)
        edition, is_new = Edition.for_foreign_id(
            self._db, self.data_source, identifier.type,
            identifier.identifier)

        # We're passing in collection=None even though we
        # technically have a Collection available, because our
        # goal is to add metadata for the book without reference
        # to any particular collection.
        metadata.apply(
            edition, collection=None, replace=self.replacement_policy)
        return identifier
    except Exception as e:
        self.log.error('Coverage error for %r', identifier, exc_info=e)
        return self.failure(identifier, repr(e), transient=True)

def test_finalize_edition(self):
    provider_no_presentation_ready = self._provider(
        presentation_ready_on_success=False)
    provider_presentation_ready = self._provider(
        presentation_ready_on_success=True)

    identifier = self._identifier()
    source = DataSource.lookup(self._db, DataSource.GUTENBERG)

    # Here's an Edition with no LicensePool.
    edition, is_new = Edition.for_foreign_id(
        self._db, source, identifier.type, identifier.identifier
    )
    edition.title = self._str

    # This will effectively do nothing.
    provider_no_presentation_ready.finalize_edition(edition)

    # No Works have been created.
    eq_(0, self._db.query(Work).count())

    # But if there's also a LicensePool...
    pool, is_new = LicensePool.for_foreign_id(
        self._db, source, identifier.type, identifier.identifier
    )

    # finalize_edition() will create a Work.
    provider_no_presentation_ready.finalize_edition(edition)

    work = pool.work
    eq_(work, edition.work)
    eq_(False, work.presentation_ready)

    # If the provider is configured to do so, finalize_edition()
    # will also set the Work as presentation-ready.
    provider_presentation_ready.finalize_edition(edition)
    eq_(True, work.presentation_ready)

def _edition(self, licensepool):
    """Find or create the Edition that would be used to contain
    Overdrive metadata for the given LicensePool.
    """
    return Edition.for_foreign_id(
        self._db, self.source, licensepool.identifier.type,
        licensepool.identifier.identifier
    )

def edition(self, identifier):
    """Finds or creates an edition with license-offering
    DataSource.GUTENBERG instead of the class's local
    DataSource.GUTENBERG_EPUB_GENERATOR.
    """
    edition, is_new = Edition.for_foreign_id(
        self._db, DataSource.GUTENBERG, identifier.type,
        identifier.identifier
    )
    return edition

def items_that_need_coverage(self):
    """Returns Editions associated with an open-access LicensePool
    but with no open-access download URL.
    """
    q = Edition.missing_coverage_from(self._db, [], self.coverage_source)
    clause = and_(
        Edition.data_source_id == LicensePool.data_source_id,
        Edition.primary_identifier_id == LicensePool.identifier_id)
    q = q.join(LicensePool, clause)
    q = q.filter(LicensePool.open_access == True).filter(
        Edition.open_access_download_url == None
    )
    return q

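# Sketch of how the returned query might be consumed. The `provider` name is
# an assumption standing in for whatever object defines
# items_that_need_coverage(); the query itself yields Edition rows.
for edition in provider.items_that_need_coverage():
    # Each Edition here belongs to an open-access LicensePool but has no
    # open_access_download_url yet.
    print(edition.primary_identifier)
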
def handle_event(self, bibliotheca_id, isbn, foreign_patron_id,
                 start_time, end_time, internal_event_type):
    # Find or look up the LicensePool for this event.
    license_pool, is_new = LicensePool.for_foreign_id(
        self._db, self.api.source, Identifier.BIBLIOTHECA_ID,
        bibliotheca_id, collection=self.collection)

    if is_new:
        # This is a new book. Immediately acquire bibliographic
        # coverage for it. This will set the
        # DistributionMechanisms and make the book
        # presentation-ready. However, its circulation information
        # might not be up to date until we process some more
        # events.
        record = self.bibliographic_coverage_provider.ensure_coverage(
            license_pool.identifier, force=True)

    bibliotheca_identifier = license_pool.identifier
    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, isbn)

    edition, ignore = Edition.for_foreign_id(
        self._db, self.api.source, Identifier.BIBLIOTHECA_ID,
        bibliotheca_id)

    # The ISBN and the Bibliotheca identifier are exactly equivalent.
    bibliotheca_identifier.equivalent_to(self.api.source, isbn, strength=1)

    # Log the event.
    start = start_time or CirculationEvent.NO_DATE

    # Make sure the effects of the event reported by Bibliotheca
    # are made visible on the LicensePool and turned into
    # analytics events. This is not 100% reliable, but it
    # should be mostly accurate, and the BibliothecaCirculationSweep
    # will periodically correct the errors.
    license_pool.update_availability_from_delta(
        internal_event_type, start_time, 1, self.analytics)

    if is_new:
        # This is our first time seeing this LicensePool. Log its
        # occurrence as a separate event.
        license_pool.collect_analytics_event(
            self.analytics, CirculationEvent.DISTRIBUTOR_TITLE_ADD,
            license_pool.last_checked or start_time,
            0, 1)

    title = edition.title or "[no title]"
    self.log.info("%r %s: %s", start_time, title, internal_event_type)
    return start_time

def handle_event(self, threem_id, isbn, foreign_patron_id,
                 start_time, end_time, internal_event_type):
    # Find or look up the LicensePool for this event.
    license_pool, is_new = LicensePool.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    if is_new:
        # Immediately acquire bibliographic coverage for this book.
        # This will set the DistributionMechanisms and make the
        # book presentation-ready. However, its circulation information
        # might not be up to date until we process some more events.
        record = self.bibliographic_coverage_provider.ensure_coverage(
            license_pool.identifier, force=True)

    threem_identifier = license_pool.identifier
    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, isbn)

    edition, ignore = Edition.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    # The ISBN and the 3M identifier are exactly equivalent.
    threem_identifier.equivalent_to(self.api.source, isbn, strength=1)

    # Log the event.
    event, was_new = get_one_or_create(
        self._db, CirculationEvent,
        license_pool=license_pool,
        type=internal_event_type,
        start=start_time,
        foreign_patron_id=foreign_patron_id,
        create_method_kwargs=dict(
            delta=1,
            end=end_time))

    # If this is our first time seeing this LicensePool, log its
    # occurrence as a separate event.
    if is_new:
        event = get_one_or_create(
            self._db, CirculationEvent,
            type=CirculationEvent.DISTRIBUTOR_TITLE_ADD,
            license_pool=license_pool,
            create_method_kwargs=dict(
                start=license_pool.last_checked or start_time,
                delta=1,
                end=license_pool.last_checked or end_time,
            ))

    title = edition.title or "[no title]"
    self.log.info("%r %s: %s", start_time, title, internal_event_type)
    return start_time

def handle_event(self, threem_id, isbn, foreign_patron_id,
                 start_time, end_time, internal_event_type):
    # Find or look up the LicensePool for this event.
    license_pool, is_new = LicensePool.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    if is_new:
        # Immediately acquire bibliographic coverage for this book.
        # This will set the DistributionMechanisms and make the
        # book presentation-ready. However, its circulation information
        # might not be up to date until we process some more events.
        record = self.bibliographic_coverage_provider.ensure_coverage(
            license_pool.identifier, force=True
        )

    threem_identifier = license_pool.identifier
    isbn, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, isbn)

    edition, ignore = Edition.for_foreign_id(
        self._db, self.api.source, Identifier.THREEM_ID, threem_id)

    # The ISBN and the 3M identifier are exactly equivalent.
    threem_identifier.equivalent_to(self.api.source, isbn, strength=1)

    # Log the event.
    event, was_new = get_one_or_create(
        self._db, CirculationEvent,
        license_pool=license_pool,
        type=internal_event_type,
        start=start_time,
        foreign_patron_id=foreign_patron_id,
        create_method_kwargs=dict(delta=1, end=end_time)
    )

    # If this is our first time seeing this LicensePool, log its
    # occurrence as a separate event.
    if is_new:
        event = get_one_or_create(
            self._db, CirculationEvent,
            type=CirculationEvent.TITLE_ADD,
            license_pool=license_pool,
            create_method_kwargs=dict(
                start=license_pool.last_checked or start_time,
                delta=1,
                end=license_pool.last_checked or end_time,
            )
        )

    title = edition.title or "[no title]"
    self.log.info("%r %s: %s", start_time, title, internal_event_type)
    return start_time

def test_register_equivalency(self):
    data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    id = "549"

    # We've got a record.
    record, was_new = Edition.for_foreign_id(
        self._db, data_source, Identifier.GUTENBERG_ID, id)

    # Then we look it up and discover another identifier for it.
    data_source_2 = DataSource.lookup(self._db, DataSource.OCLC)
    record2, was_new = Edition.for_foreign_id(
        self._db, data_source_2, Identifier.OCLC_NUMBER, "22")

    eq = record.primary_identifier.equivalent_to(
        data_source_2, record2.primary_identifier, 1)

    assert eq.input == record.primary_identifier
    assert eq.output == record2.primary_identifier
    assert eq.data_source == data_source_2

    assert [eq] == record.primary_identifier.equivalencies
    assert set([record, record2]) == set(record.equivalent_editions().all())

def test_finalize_license_pool(self):
    provider_no_presentation_ready = self._provider(
        presentation_ready_on_success=False)
    provider_presentation_ready = self._provider(
        presentation_ready_on_success=True)

    identifier = self._identifier()
    license_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    # Here's a LicensePool with no presentation edition.
    pool, is_new = LicensePool.for_foreign_id(
        self._db, license_source, identifier.type, identifier.identifier)
    eq_(None, pool.presentation_edition)

    # Calling finalize_license_pool() here won't do much.
    provider_no_presentation_ready.finalize_license_pool(pool)

    # A presentation edition has been created for the LicensePool,
    # but it has no title (in fact it has no data at all), so no
    # Work was created.
    eq_(None, pool.presentation_edition.title)
    eq_(0, self._db.query(Work).count())

    # Here's an Edition for the same book as the LicensePool but
    # from a different data source.
    edition, is_new = Edition.for_foreign_id(
        self._db, data_source, identifier.type, identifier.identifier)
    edition.title = self._str

    # Although Edition and LicensePool share an identifier, they
    # are not otherwise related.
    eq_(None, pool.presentation_edition.title)

    # finalize_license_pool() will create a Work and update the
    # LicensePool's presentation edition, based on the brand-new
    # Edition.
    provider_no_presentation_ready.finalize_license_pool(pool)
    work = pool.work
    eq_(edition.title, pool.presentation_edition.title)
    eq_(False, work.presentation_ready)

    # If the provider is configured to do so, finalize_license_pool()
    # will also set the Work as presentation-ready.
    provider_presentation_ready.finalize_license_pool(pool)
    eq_(True, work.presentation_ready)

def update_licensepool_with_book_info(self, book, license_pool, is_new):
    """Update a book's LicensePool with information from a JSON
    representation of its circulation info.

    Also creates an Edition and gives it very basic bibliographic
    information (the title), if possible.
    """
    circulation = OverdriveRepresentationExtractor.book_info_to_circulation(
        book
    )
    circulation_changed = circulation.update(license_pool, is_new)

    edition, ignore = Edition.for_foreign_id(
        self._db, self.source, license_pool.identifier.type,
        license_pool.identifier.identifier)
    edition.title = edition.title or book.get('title')

    if is_new:
        license_pool.open_access = False
        self.log.info("New Overdrive book discovered: %r", edition)

    return license_pool, is_new, circulation_changed

def test_finalize_license_pool(self):
    identifier = self._identifier()
    license_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
    data_source = DataSource.lookup(self._db, DataSource.OA_CONTENT_SERVER)

    # Here's a LicensePool with no presentation edition.
    pool, is_new = LicensePool.for_foreign_id(
        self._db, license_source, identifier.type, identifier.identifier,
        collection=self._default_collection)
    eq_(None, pool.presentation_edition)

    # Here's an Edition for the same book as the LicensePool but
    # from a different data source.
    edition, is_new = Edition.for_foreign_id(
        self._db, data_source, identifier.type, identifier.identifier)
    edition.title = self._str

    # Although Edition and LicensePool share an identifier, they
    # are not otherwise related.
    eq_(None, pool.presentation_edition)

    # finalize_license_pool() will create a Work and update the
    # LicensePool's presentation edition, based on the brand-new
    # Edition.
    lookup = MockSimplifiedOPDSLookup(self._url)
    provider = ContentServerBibliographicCoverageProvider(
        self._default_collection, lookup)
    provider.finalize_license_pool(pool)
    work = pool.work
    eq_(edition.title, pool.presentation_edition.title)
    eq_(True, work.presentation_ready)

def _extract_publication_metadata(self, feed, publication, data_source_name):
    """Extract a Metadata object from webpub-manifest-parser's publication.

    :param feed: Feed object
    :type feed: opds2_ast.OPDS2Feed

    :param publication: Publication object
    :type publication: opds2_ast.OPDS2Publication

    :param data_source_name: Data source's name
    :type data_source_name: str

    :return: Publication's metadata
    :rtype: Metadata
    """
    metadata = super(ODL2Importer, self)._extract_publication_metadata(
        feed, publication, data_source_name
    )
    formats = []
    licenses = []
    medium = None

    with self._get_configuration(self._db) as configuration:
        skipped_license_formats = configuration.skipped_license_formats

        if skipped_license_formats:
            skipped_license_formats = set(skipped_license_formats)

    if publication.licenses:
        for odl_license in publication.licenses:
            identifier = odl_license.metadata.identifier

            checkout_link = first_or_default(
                odl_license.links.get_by_rel(OPDS2LinkRelationsRegistry.BORROW.key)
            )
            if checkout_link:
                checkout_link = checkout_link.href

            license_info_document_link = first_or_default(
                odl_license.links.get_by_rel(OPDS2LinkRelationsRegistry.SELF.key)
            )
            if license_info_document_link:
                license_info_document_link = license_info_document_link.href

            expires = (
                to_utc(odl_license.metadata.terms.expires)
                if odl_license.metadata.terms
                else None
            )
            concurrency = (
                int(odl_license.metadata.terms.concurrency)
                if odl_license.metadata.terms
                else None
            )

            if not license_info_document_link:
                parsed_license = None
            else:
                parsed_license = ODLImporter.get_license_data(
                    license_info_document_link,
                    checkout_link,
                    identifier,
                    expires,
                    concurrency,
                    self.http_get,
                )

            if parsed_license is not None:
                licenses.append(parsed_license)

            # The DPLA feed doesn't have information about the DRM protection
            # used for audiobooks. We want to try to extract that information
            # from the License Info Document, if it's present there.
            license_formats = set(odl_license.metadata.formats)
            if parsed_license and parsed_license.content_types:
                license_formats |= set(parsed_license.content_types)

            for license_format in license_formats:
                if (
                    skipped_license_formats
                    and license_format in skipped_license_formats
                ):
                    continue

                if not medium:
                    medium = Edition.medium_from_media_type(license_format)

                if license_format in ODLImporter.LICENSE_FORMATS:
                    # Special case to handle DeMarque audiobooks which
                    # include the protection in the content type.
                    drm_schemes = [
                        ODLImporter.LICENSE_FORMATS[license_format][
                            ODLImporter.DRM_SCHEME
                        ]
                    ]
                    license_format = ODLImporter.LICENSE_FORMATS[license_format][
                        ODLImporter.CONTENT_TYPE
                    ]
                else:
                    drm_schemes = (
                        odl_license.metadata.protection.formats
                        if odl_license.metadata.protection
                        else []
                    )

                for drm_scheme in drm_schemes or [None]:
                    formats.append(
                        FormatData(
                            content_type=license_format,
                            drm_scheme=drm_scheme,
                            rights_uri=RightsStatus.IN_COPYRIGHT,
                        )
                    )

    metadata.circulation.licenses = licenses
    metadata.circulation.licenses_owned = None
    metadata.circulation.licenses_available = None
    metadata.circulation.licenses_reserved = None
    metadata.circulation.patrons_in_hold_queue = None
    metadata.circulation.formats.extend(formats)
    metadata.medium = medium

    return metadata

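# Illustrative sketch only: the nested-dict shape that the loop above assumes
# for ODLImporter.LICENSE_FORMATS. The top-level key and the mapped values
# below are hypothetical placeholders, not the library's real table; only the
# use of ODLImporter.CONTENT_TYPE and ODLImporter.DRM_SCHEME as inner keys is
# taken from the lookups in the code.
HYPOTHETICAL_LICENSE_FORMATS = {
    # A content type that bundles the DRM protection into the media type
    # (as described for the DeMarque audiobook special case) maps to the
    # bare content type plus a separate DRM scheme URI.
    "application/example-audiobook+drm": {
        ODLImporter.CONTENT_TYPE: "application/audiobook+json",
        ODLImporter.DRM_SCHEME: "http://example.org/drm-scheme",
    },
}
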
def test_recursively_equivalent_identifiers(self):
    # We start with a Gutenberg book.
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    record, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "100")
    gutenberg_id = record.primary_identifier

    # We use OCLC Classify to do a title/author lookup.
    oclc = DataSource.lookup(self._db, DataSource.OCLC)
    search_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_WORK, "60010")
    gutenberg_id.equivalent_to(oclc, search_id, 1)

    # The title/author lookup associates the search term with two
    # different OCLC Numbers.
    oclc_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "9999")
    oclc_id_2, ignore = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "1000")
    search_id.equivalent_to(oclc, oclc_id, 1)
    search_id.equivalent_to(oclc, oclc_id_2, 1)

    # We then use OCLC Linked Data to connect one of the OCLC
    # Numbers with an ISBN.
    linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
    isbn_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "900100434X")
    oclc_id.equivalent_to(linked_data, isbn_id, 1)

    # As it turns out, we have an Overdrive work record...
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    overdrive_record, ignore = Edition.for_foreign_id(
        self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
    overdrive_id = overdrive_record.primary_identifier

    # ...which is tied (by Overdrive) to the same ISBN.
    overdrive_id.equivalent_to(overdrive, isbn_id, 1)

    # Finally, here's a completely unrelated Edition, which
    # will not be showing up.
    gutenberg2, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "200")
    gutenberg2.title = "Unrelated Gutenberg record."

    levels = [
        record.equivalent_identifiers(
            policy=PresentationCalculationPolicy(
                equivalent_identifier_levels=i))
        for i in range(0, 5)
    ]

    # At level 0, the only identifier found is the Gutenberg ID.
    assert set([gutenberg_id]) == set(levels[0])

    # At level 1, we pick up the title/author lookup.
    assert set([gutenberg_id, search_id]) == set(levels[1])

    # At level 2, we pick up the title/author lookup and the two
    # OCLC Numbers.
    assert set([gutenberg_id, search_id, oclc_id, oclc_id_2]) == set(levels[2])

    # At level 3, we also pick up the ISBN.
    assert set(
        [gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id]
    ) == set(levels[3])

    # At level 4, the recursion starts to go in the other
    # direction: we pick up the Overdrive ID that's equivalent to
    # the same ISBN as the OCLC Number.
    assert set([
        gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id, overdrive_id
    ]) == set(levels[4])