class IdentifierResolutionCoverageProvider(CoverageProvider):
    """Resolve all Identifiers whose CoverageRecords are in a transient
    failure state, turning them into Editions with LicensePools.

    Creates CoverageProviders that contact third-party entities for
    information about the library item (book) each Identifier
    represents.

    For ISBNs, creates a number of Resources rather than LicensePooled
    Editions.
    """

    CAN_CREATE_LICENSE_POOLS = True
    LICENSE_SOURCE_NOT_ACCESSIBLE = (
        "Could not access underlying license source over the network.")
    UNKNOWN_FAILURE = "Unknown failure."

    def __init__(self, _db, batch_size=10, cutoff_time=None,
                 uploader=None, providers=None, **kwargs):
        output_source, made_new = get_one_or_create(
            _db, DataSource, name=DataSource.INTERNAL_PROCESSING
        )
        # Other components don't have INTERNAL_PROCESSING as offering
        # licenses, but we do, because we're responsible for managing
        # LicensePools.
        output_source.offers_licenses = True
        input_identifier_types = [Identifier.OVERDRIVE_ID, Identifier.ISBN]

        super(IdentifierResolutionCoverageProvider, self).__init__(
            service_name="Identifier Resolution Coverage Provider",
            input_identifier_types=input_identifier_types,
            output_source=output_source,
            batch_size=batch_size,
            operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        )

        # Since we are the metadata wrangler, we mirror any resources
        # we find to S3.
        mirror = uploader or S3Uploader()

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation_calculation_policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True,
            update_search_index=True
        )
        policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror, even_if_not_apparently_updated=True,
            presentation_calculation_policy=presentation_calculation_policy
        )

        if providers:
            # For testing purposes. Initializing the real coverage providers
            # during tests can cause requests to third parties.
            (self.required_coverage_providers,
             self.optional_coverage_providers) = providers
        else:
            overdrive = OverdriveBibliographicCoverageProvider(
                _db, metadata_replacement_policy=policy
            )
            content_cafe = ContentCafeCoverageProvider(self._db)
            content_server = ContentServerCoverageProvider(self._db)
            oclc_classify = OCLCClassifyCoverageProvider(self._db)

            self.required_coverage_providers = [
                overdrive, content_cafe, content_server, oclc_classify
            ]
            self.optional_coverage_providers = []

        self.viaf = VIAFClient(self._db)
        self.image_mirrors = {
            DataSource.OVERDRIVE: OverdriveCoverImageMirror(
                self._db, uploader=uploader
            )
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader
        )
        self.oclc_linked_data = LinkedDataCoverageProvider(self._db)

    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all identifiers lacking coverage from this CoverageProvider.

        Only identifiers that have been requested via the
        URNLookupController (and thus given 'transient failure'
        CoverageRecords) should be returned. Identifiers created
        through previous resolution processes can be ignored.
        """
        qu = super(IdentifierResolutionCoverageProvider, self).items_that_need_coverage(
            identifiers=identifiers, **kwargs
        )
        qu = qu.filter(CoverageRecord.id != None)
        return qu

    def process_item(self, identifier):
        """For this identifier, check that it has all of the available
        third-party metadata, and if not, obtain it.

        If metadata cannot be obtained and the coverage was deemed
        required, return a CoverageFailure.
""" self.log.info("Ensuring coverage for %r", identifier) license_pool = self.license_pool(identifier) if isinstance(license_pool, CoverageFailure): error = ValueError( "Could not generate LicensePool for %r" % identifier ) return self.transform_exception_into_failure(e, identifier) # Go through all relevant providers and tries to ensure coverage. # If there's a failure or an exception, create a CoverageFailure. for provider in self.required_coverage_providers: if not identifier.type in provider.input_identifier_types: continue try: record = provider.ensure_coverage(identifier, force=True) except Exception as e: return self.transform_exception_into_failure(e, identifier) if record.exception: error_msg = "500: " + record.exception transiency = True if record.status == CoverageRecord.PERSISTENT_FAILURE: transiency = False return CoverageFailure( identifier, error_msg, data_source=self.output_source, transient=transiency ) # Now go through the optional providers. It's the same deal, # but a CoverageFailure doesn't cause the entire identifier # resolution process to fail. for provider in self.optional_coverage_providers: if not identifier.type in provider.input_identifier_types: continue try: record = provider.ensure_coverage(identifier, force=True) except Exception as e: return self.transform_exception_into_failure(e, identifier) try: self.finalize(identifier) except Exception as e: return self.transform_exception_into_failure(e, identifier) return identifier def transform_exception_into_failure(self, error, identifier): """Ensures coverage of a given identifier by a given provider with appropriate error handling for broken providers. """ self.log.warn( "Error completing coverage for %r: %r", identifier, error, exc_info=error ) return CoverageFailure( identifier, repr(error), data_source=self.output_source, transient=True ) def finalize(self, identifier): """Sets equivalent identifiers from OCLC and processes the work.""" self.resolve_equivalent_oclc_identifiers(identifier) if identifier.type==Identifier.ISBN: # Currently we don't try to create Works for ISBNs, # we just make sure all the Resources associated with the # ISBN are properly handled. At this point, that has # completed successfully, so do nothing. pass else: self.process_work(identifier) def process_work(self, identifier): """Fill in VIAF data and cover images where possible before setting a previously-unresolved identifier's work as presentation ready. """ work = None license_pool = identifier.licensed_through if license_pool: work, created = license_pool.calculate_work(even_if_no_author=True) if work: self.resolve_viaf(work) self.resolve_cover_image(work) work.calculate_presentation() work.set_presentation_ready() else: error_msg = "500; " + "Work could not be calculated for %r" % identifier transiency = True return CoverageFailure( identifier, error_msg, data_source=self.output_source, transient=transiency ) def resolve_equivalent_oclc_identifiers(self, identifier): """Ensures OCLC coverage for an identifier. This has to be called after the OCLCClassify coverage is run to confirm that equivalent OCLC identifiers are available. 
""" oclc_ids = set() types = [Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN] for edition in identifier.primarily_identifies: oclc_ids = oclc_ids.union( edition.equivalent_identifiers(type=types) ) for oclc_id in oclc_ids: self.log.info("Currently processing equivalent identifier: %r", oclc_id) self.oclc_linked_data.ensure_coverage(oclc_id) def resolve_viaf(self, work): """Get VIAF data on all contributors.""" viaf = VIAFClient(self._db) for pool in work.license_pools: edition = pool.presentation_edition for contributor in edition.contributors: viaf.process_contributor(contributor) if not contributor.display_name: contributor.family_name, contributor.display_name = ( contributor.default_names()) def resolve_cover_image(self, work): """Make sure we have the cover for all editions.""" for pool in work.license_pools: edition = pool.presentation_edition data_source_name = edition.data_source.name if data_source_name in self.image_mirrors: self.image_mirrors[data_source_name].mirror_edition(edition) self.image_scaler.scale_edition(edition)
class ContentCafeAPI(object):
    """Associates up to four resources with an ISBN."""

    BASE_URL = "http://contentcafe2.btol.com/"
    ONE_YEAR_AGO = datetime.timedelta(days=365)

    image_url = BASE_URL + "ContentCafe/Jacket.aspx?userID=%(userid)s&password=%(password)s&Type=L&Value=%(isbn)s"
    overview_url = BASE_URL + "ContentCafeClient/ContentCafe.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    review_url = BASE_URL + "ContentCafeClient/ReviewsDetail.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    summary_url = BASE_URL + "ContentCafeClient/Summary.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    excerpt_url = BASE_URL + "ContentCafeClient/Excerpt.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    author_notes_url = BASE_URL + "ContentCafeClient/AuthorNotes.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"

    def __init__(self, db, mirror, user_id=None, password=None,
                 uploader=None, soap_client=None):
        self._db = db
        self.mirror = mirror
        if self.mirror:
            self.scaler = ImageScaler(db, [self.mirror], uploader=uploader)
        else:
            self.scaler = None
        integration = Configuration.integration("Content Cafe")
        self.user_id = user_id or integration['username']
        self.password = password or integration['password']
        self.log = logging.getLogger("Content Cafe API")
        self.soap_client = (
            soap_client or ContentCafeSOAPClient(self.user_id, self.password)
        )

    @property
    def data_source(self):
        return DataSource.lookup(self._db, DataSource.CONTENT_CAFE)

    def mirror_resources(self, isbn_identifier):
        """Associate a number of resources with the given ISBN."""
        isbn = isbn_identifier.identifier
        args = dict(userid=self.user_id, password=self.password, isbn=isbn)
        image_url = self.image_url % args

        hyperlink, is_new = isbn_identifier.add_link(
            Hyperlink.IMAGE, image_url, self.data_source)
        representation = self.mirror.mirror_hyperlink(hyperlink)
        if representation.status_code == 404:
            # Content Cafe served us an HTML page instead of an
            # image. This indicates that Content Cafe has no knowledge
            # of this ISBN. There is no need to make any more
            # requests.
            return True
        self.mirror.uploader.mirror_one(representation)
        self.scaler.scale_edition(isbn_identifier)
        self.get_descriptions(isbn_identifier, args)
        self.get_excerpt(isbn_identifier, args)
        self.get_reviews(isbn_identifier, args)
        self.get_author_notes(isbn_identifier, args)
        self.measure_popularity(
            isbn_identifier, self.soap_client.ONE_YEAR_AGO)

    def get_associated_web_resources(
            self, identifier, args, url, phrase_indicating_missing_data,
            rel, scrape_method):
        url = url % args
        self.log.debug("Getting associated resources for %s", url)
        response = requests.get(url)
        content_type = response.headers['Content-Type']
        hyperlinks = []
        already_seen = set()
        if phrase_indicating_missing_data not in response.content:
            self.log.info("Found %s %s Content!", identifier.identifier, rel)
            soup = BeautifulSoup(response.content, "lxml")
            resource_contents = scrape_method(soup)
            if resource_contents:
                for content in resource_contents:
                    if content in already_seen:
                        continue
                    already_seen.add(content)
                    hyperlink, is_new = identifier.add_link(
                        rel, None, self.data_source,
                        media_type="text/html", content=content)
                    hyperlinks.append(hyperlink)
                    self.log.debug(
                        "Content: %s",
                        hyperlink.resource.representation.content[:75])
        return hyperlinks

    def get_reviews(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.review_url,
            'No review info exists for this item',
            Hyperlink.REVIEW, self._scrape_list)

    def get_descriptions(self, identifier, args):
        hyperlinks = list(self.get_associated_web_resources(
            identifier, args, self.summary_url,
            'No annotation info exists for this item',
            Hyperlink.DESCRIPTION, self._scrape_list))
        if not hyperlinks:
            return hyperlinks

        # Since we get multiple descriptions, and there is no
        # associated Edition, now is a good time to evaluate the quality
        # of the descriptions. This will make it easy to pick the best one
        # when this identifier is looked up.
        evaluator = SummaryEvaluator(bad_phrases=[])
        by_content = dict()
        for link in hyperlinks:
            content = link.resource.representation.content
            evaluator.add(content)
        evaluator.ready()
        for link in hyperlinks:
            resource = link.resource
            content = resource.representation.content
            quality = evaluator.score(content)
            resource.set_estimated_quality(quality)
            resource.update_quality()
        return hyperlinks

    def get_author_notes(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.author_notes_url,
            'No author notes info exists for this item',
            Hyperlink.AUTHOR, self._scrape_one)

    def get_excerpt(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.excerpt_url,
            'No excerpt info exists for this item',
            Hyperlink.SAMPLE, self._scrape_one)

    def measure_popularity(self, identifier, cutoff=None):
        if identifier.type != Identifier.ISBN:
            raise ValueError("I can only measure the popularity of ISBNs.")
        value = self.soap_client.estimated_popularity(identifier.identifier)
        # Even a complete lack of popularity data is useful--it tells
        # us there's no need to check again anytime soon.
        measurement = identifier.add_measurement(
            self.data_source, Measurement.POPULARITY, value)
        # Since there is no associated Edition, now is a good time to
        # normalize the value.
        return measurement.normalized_value

    @classmethod
    def _scrape_list(cls, soup):
        table = soup.find('table', id='Table_Main')
        if table:
            for header in table.find_all('td', class_='SectionHeader'):
                content = header.parent.next_sibling
                if content.name != 'tr':
                    continue
                if not content.td:
                    continue
                yield content.td.encode_contents()

    @classmethod
    def _scrape_one(cls, soup):
        table = soup.find('table', id='Table_Main')
        if not table:
            return []
        if table.tr and table.tr.td:
            return [table.tr.td.encode_contents()]
        else:
            return []
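
# Illustrative usage sketch (not part of the original module): mirror
# all Content Cafe resources for one ISBN. `_db`, `mirror`, and
# `isbn_identifier` are assumed to exist; credentials fall back to the
# "Content Cafe" integration configuration when not passed in.
def _example_mirror_isbn(_db, mirror, isbn_identifier):
    api = ContentCafeAPI(_db, mirror)
    # Fetches the jacket image and, if the ISBN is known to Content
    # Cafe, the descriptions, excerpt, reviews, author notes, and a
    # popularity measurement.
    api.mirror_resources(isbn_identifier)
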
class IdentifierResolutionCoverageProvider(CatalogCoverageProvider):
    """Make sure all Identifiers registered as needing coverage by this
    CoverageProvider become Works with Editions and (probably dummy)
    LicensePools.

    Coverage happens by running the Identifier through _other_
    CoverageProviders, filling in the blanks with additional data from
    third-party entities.

    For ISBNs, we end up with a bunch of Resources rather than Works.

    TODO: This needs to change.
    """

    SERVICE_NAME = "Identifier Resolution Coverage Provider"
    DATA_SOURCE_NAME = DataSource.INTERNAL_PROCESSING
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OVERDRIVE_ID, Identifier.ISBN, Identifier.URI,
        Identifier.GUTENBERG_ID
    ]
    OPERATION = CoverageRecord.RESOLVE_IDENTIFIER_OPERATION

    LICENSE_SOURCE_NOT_ACCESSIBLE = (
        "Could not access underlying license source over the network.")
    UNKNOWN_FAILURE = "Unknown failure."
    DEFAULT_OVERDRIVE_COLLECTION_NAME = u'Default Overdrive'

    def __init__(self, collection, uploader=None, viaf_client=None,
                 linked_data_coverage_provider=None, content_cafe_api=None,
                 overdrive_api_class=OverdriveAPI, **kwargs):

        super(IdentifierResolutionCoverageProvider, self).__init__(
            collection, **kwargs)

        # Since we are the metadata wrangler, we mirror any resources
        # we find to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True)

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)
        self.content_cafe_api = content_cafe_api

        # Determine the required and optional coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        (self.required_coverage_providers,
         self.optional_coverage_providers) = self.providers()

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (
            linked_data_coverage_provider or LinkedDataCoverageProvider(
                self._db, viaf_api=self.viaf_client))

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE: OverdriveCoverImageMirror(
                self._db, uploader=uploader)
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader)

    def create_overdrive_api(self, overdrive_api_class):
        collection, is_new = Collection.by_name_and_protocol(
            self._db, self.DEFAULT_OVERDRIVE_COLLECTION_NAME,
            ExternalIntegration.OVERDRIVE)
        if is_new:
            raise ValueError(
                'Default Overdrive collection has not been configured.')
        return overdrive_api_class(self._db, collection)

    def providers(self):
        """Instantiate required and optional CoverageProviders.

        All Identifiers in this Collection's catalog will be run
        through each provider. If an optional provider fails, nothing
        will happen. If a required provider fails, the coverage
        operation as a whole will fail.

        NOTE: This method creates CoverageProviders that go against
        real servers. Because of this, tests must use a subclass that
        mocks providers(), such as
        MockIdentifierResolutionCoverageProvider.
        """
        # All books must be run through Content Cafe and OCLC
        # Classify, assuming their identifiers are of the right
        # type.
        content_cafe = ContentCafeCoverageProvider(
            self._db, api=self.content_cafe_api, uploader=self.uploader)
        oclc_classify = OCLCClassifyCoverageProvider(self._db)

        optional = []
        required = [content_cafe, oclc_classify]

        # All books derived from OPDS import against the open-access
        # content server must be looked up in that server.
        #
        # TODO: This could stand some generalization. Any OPDS server
        # that also supports the lookup protocol can be used here.
        if (self.collection.protocol == ExternalIntegration.OPDS_IMPORT
                and self.collection.data_source
                and self.collection.data_source.name == DataSource.OA_CONTENT_SERVER):
            required.append(LookupClientCoverageProvider(self.collection))

        # All books obtained from Overdrive must be looked up via the
        # Overdrive API.
        if self.collection.protocol == ExternalIntegration.OVERDRIVE:
            required.append(
                OverdriveBibliographicCoverageProvider(
                    self.collection, api_class=self.overdrive_api))

        # Return in the order __init__ unpacks: (required, optional).
        return required, optional

    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all identifiers lacking coverage from this CoverageProvider.

        Only identifiers that have CoverageRecords in the 'transient
        failure' state will be returned. Unlike with other
        CoverageProviders, Identifiers that have no CoverageRecord at
        all will not be processed.
        """
        qu = super(IdentifierResolutionCoverageProvider, self).items_that_need_coverage(
            identifiers=identifiers, **kwargs)
        qu = qu.filter(CoverageRecord.id != None)
        return qu

    def process_item(self, identifier):
        """For this identifier, check that it has all of the available
        third-party metadata, and if not, obtain it.

        If metadata cannot be obtained and the coverage was deemed
        required, return a CoverageFailure.
        """
        self.log.info("Ensuring coverage for %r", identifier)

        # Make sure there's a LicensePool for this Identifier in this
        # Collection. Since we're the metadata wrangler, the
        # LicensePool will probably be a stub that doesn't actually
        # represent the right to loan the book, but that's okay.
        license_pool = self.license_pool(identifier)
        if not license_pool.licenses_owned:
            license_pool.update_availability(1, 1, 0, 0)

        # Go through all relevant required providers and try to ensure
        # coverage.
        failure = self.run_through_relevant_providers(
            identifier, self.required_coverage_providers,
            fail_on_any_failure=True)
        if failure:
            return failure

        # Now go through relevant optional providers and try to ensure
        # coverage.
        failure = self.run_through_relevant_providers(
            identifier, self.optional_coverage_providers,
            fail_on_any_failure=False)
        if failure:
            return failure

        # We got coverage from all the required coverage providers,
        # and none of the optional coverage providers raised an exception,
        # so we're ready.
        try:
            self.finalize(identifier)
        except Exception as e:
            return self.transform_exception_into_failure(e, identifier)

        return identifier

    def run_through_relevant_providers(self, identifier, providers,
                                       fail_on_any_failure):
        """Run the given Identifier through a set of CoverageProviders.

        :param identifier: Process this Identifier.
        :param providers: Run `identifier` through every relevant
            CoverageProvider in this list.
        :param fail_on_any_failure: True means that each
            CoverageProvider must succeed or the whole operation
            fails. False means that if a CoverageProvider fails it's
            not a deal-breaker.
        :return: A CoverageFailure if there was an unrecoverable
            failure, None if everything went okay.
        """
        for provider in providers:
            if (provider.input_identifier_types
                    and identifier.type not in provider.input_identifier_types):
                # The CoverageProvider under consideration doesn't
                # handle Identifiers of this type.
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
                if fail_on_any_failure and record.exception:
                    # As the CoverageProvider under consideration has
                    # fallen, so must this CoverageProvider also fall.
                    error_msg = "500: " + record.exception
                    transient = (
                        record.status == CoverageRecord.TRANSIENT_FAILURE)
                    return self.failure(
                        identifier, error_msg, transient=transient)
            except Exception as e:
                # An uncaught exception becomes a CoverageFailure no
                # matter what.
                return self.transform_exception_into_failure(e, identifier)

        # Return None to indicate success.
        return None

    def transform_exception_into_failure(self, error, identifier):
        """Represent an exception raised by a broken provider as a
        transient CoverageFailure for the given identifier.
        """
        self.log.warn(
            "Error completing coverage for %r: %r", identifier, error,
            exc_info=error)
        return self.failure(identifier, repr(error), transient=True)

    def finalize(self, identifier):
        """Set equivalent identifiers from OCLC and process the work."""
        self.resolve_equivalent_oclc_identifiers(identifier)
        if identifier.type == Identifier.ISBN:
            # In order to create Works for ISBNs, we first have to
            # create an edition associated with the ISBN as a primary
            # identifier. At the moment, this is achieved via OCLC
            # Linked Data.
            self.generate_edition(identifier)
        self.process_work(identifier)

    def generate_edition(self, identifier):
        """Use an ISBN's equivalent identifiers (OCLC Number or Work IDs)
        to set an appropriate LicensePool presentation edition so a
        Work can later be created.
        """
        equivalent_ids = identifier.equivalent_identifier_ids()[identifier.id]

        # Get the editions of equivalent identifiers (OCLC Number or
        # Work IDs) to set as a presentation edition. These editions
        # can be lower quality, and it's important that they have a
        # title.
        titled_equivalent_editions = self._db.query(Edition).\
            join(Edition.primary_identifier).\
            filter(Identifier.id.in_(equivalent_ids)).\
            filter(Edition.title != None)

        # It's preferable that they have an author, too.
        authored_equivalent_editions = titled_equivalent_editions.filter(
            Edition.author != None, Edition.author != Edition.UNKNOWN_AUTHOR)

        if fast_query_count(authored_equivalent_editions):
            # Prioritize editions with both a title and an author if
            # available.
            equivalent_editions = authored_equivalent_editions.all()
        else:
            equivalent_editions = titled_equivalent_editions.all()

        if equivalent_editions:
            # Set the presentation edition.
            pool = identifier.licensed_through[0]
            pool.set_presentation_edition(
                equivalent_editions=equivalent_editions)

    def process_work(self, identifier):
        """Fill in VIAF data and cover images where possible before
        setting a previously-unresolved identifier's work as
        presentation ready.

        TODO: I think this should be split into a separate
        WorkCoverageProvider which runs last. That way we have a
        record of which Works have had this service.
        """
        work = None
        license_pools = identifier.licensed_through
        if license_pools:
            pool = license_pools[0]
            work, created = pool.calculate_work(
                even_if_no_author=True, exclude_search=True)
        if work:
            self.resolve_viaf(work)
            self.resolve_cover_image(work)
            work.calculate_presentation(
                policy=self.policy, exclude_search=True)
            work.set_presentation_ready(exclude_search=True)
        else:
            error_msg = "500: " + "Work could not be calculated for %r" % identifier
            raise RuntimeError(error_msg)

    def resolve_equivalent_oclc_identifiers(self, identifier):
        """Ensure OCLC coverage for an identifier.

        This has to be called after the OCLCClassify coverage is run
        to confirm that equivalent OCLC identifiers are available.
        """
        oclc_ids = set()

        if identifier.type == Identifier.ISBN:
            # ISBNs won't have editions, so they should be run through
            # OCLC to retrieve basic edition data (title, author).
            oclc_ids.add(identifier)

        types = [Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN]
        for edition in identifier.primarily_identifies:
            oclc_ids = oclc_ids.union(
                edition.equivalent_identifiers(type=types))

        for oclc_id in oclc_ids:
            self.log.info(
                "Currently processing equivalent identifier: %r", oclc_id)
            self.oclc_linked_data.ensure_coverage(oclc_id)

    def resolve_viaf(self, work):
        """Get VIAF data on all contributors."""
        for pool in work.license_pools:
            edition = pool.presentation_edition
            if not edition:
                continue
            for contributor in edition.contributors:
                self.viaf_client.process_contributor(contributor)
                if not contributor.display_name:
                    contributor.family_name, contributor.display_name = (
                        contributor.default_names())

    def resolve_cover_image(self, work):
        """Make sure we have the cover for all editions."""
        for pool in work.license_pools:
            edition = pool.presentation_edition
            data_source_name = pool.data_source.name
            if data_source_name in self.image_mirrors:
                self.image_mirrors[data_source_name].mirror_edition(edition)
                self.image_scaler.scale_edition(edition)
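
# Illustrative usage sketch (not part of the original module): resolve
# the catalog of a single Collection. Assumes `collection` is a
# Collection with a catalog of Identifiers and that the default
# Overdrive collection is configured (otherwise the constructor raises
# ValueError).
def _example_resolve_catalog(collection):
    provider = IdentifierResolutionCoverageProvider(collection)
    # Runs each cataloged Identifier with a transient-failure
    # CoverageRecord through the required and optional providers.
    provider.run()
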
class ContentCafeAPI(object):
    """Associates up to four resources with an ISBN."""

    BASE_URL = "http://contentcafe2.btol.com/"
    ONE_YEAR_AGO = datetime.timedelta(days=365)

    image_url = BASE_URL + "ContentCafe/Jacket.aspx?userID=%(userid)s&password=%(password)s&Type=L&Value=%(isbn)s"
    overview_url = BASE_URL + "ContentCafeClient/ContentCafe.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    review_url = BASE_URL + "ContentCafeClient/ReviewsDetail.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    summary_url = BASE_URL + "ContentCafeClient/Summary.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    excerpt_url = BASE_URL + "ContentCafeClient/Excerpt.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    author_notes_url = BASE_URL + "ContentCafeClient/AuthorNotes.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"

    log = logging.getLogger("Content Cafe API")

    @classmethod
    def from_config(cls, _db, mirror, **kwargs):
        integration = ExternalIntegration.lookup(
            _db, ExternalIntegration.CONTENT_CAFE,
            ExternalIntegration.METADATA_GOAL)
        if not integration or not (integration.username
                                   and integration.password):
            raise CannotLoadConfiguration(
                'Content Cafe not properly configured')
        return cls(_db, mirror, integration.username, integration.password,
                   **kwargs)

    def __init__(self, _db, mirror, user_id, password, uploader=None,
                 soap_client=None):
        self._db = _db
        self.mirror = mirror
        if self.mirror:
            self.scaler = ImageScaler(_db, [self.mirror], uploader=uploader)
        else:
            self.scaler = None
        self.user_id = user_id
        self.password = password
        self.soap_client = (
            soap_client or ContentCafeSOAPClient(user_id, password))

    @property
    def data_source(self):
        return DataSource.lookup(self._db, DataSource.CONTENT_CAFE)

    def mirror_resources(self, isbn_identifier):
        """Associate a number of resources with the given ISBN."""
        isbn = isbn_identifier.identifier
        args = dict(userid=self.user_id, password=self.password, isbn=isbn)
        image_url = self.image_url % args

        hyperlink, is_new = isbn_identifier.add_link(
            Hyperlink.IMAGE, image_url, self.data_source)
        representation = self.mirror.mirror_hyperlink(hyperlink)
        if representation.status_code == 404:
            # Content Cafe served us an HTML page instead of an
            # image. This indicates that Content Cafe has no knowledge
            # of this ISBN. There is no need to make any more
            # requests.
            return True
        self.mirror.uploader.mirror_one(representation)
        self.scaler.scale_edition(isbn_identifier)
        self.get_descriptions(isbn_identifier, args)
        self.get_excerpt(isbn_identifier, args)
        self.get_reviews(isbn_identifier, args)
        self.get_author_notes(isbn_identifier, args)
        self.measure_popularity(
            isbn_identifier, self.soap_client.ONE_YEAR_AGO)

    def get_associated_web_resources(
            self, identifier, args, url, phrase_indicating_missing_data,
            rel, scrape_method):
        url = url % args
        self.log.debug("Getting associated resources for %s", url)
        response = requests.get(url)
        content_type = response.headers['Content-Type']
        hyperlinks = []
        already_seen = set()
        if phrase_indicating_missing_data not in response.content:
            self.log.info("Found %s %s Content!", identifier.identifier, rel)
            soup = BeautifulSoup(response.content, "lxml")
            resource_contents = scrape_method(soup)
            if resource_contents:
                for content in resource_contents:
                    if content in already_seen:
                        continue
                    already_seen.add(content)
                    hyperlink, is_new = identifier.add_link(
                        rel, None, self.data_source,
                        media_type="text/html", content=content)
                    hyperlinks.append(hyperlink)
                    self.log.debug(
                        "Content: %s",
                        hyperlink.resource.representation.content[:75])
        return hyperlinks

    def get_reviews(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.review_url,
            'No review info exists for this item',
            Hyperlink.REVIEW, self._scrape_list)

    def get_descriptions(self, identifier, args):
        hyperlinks = list(
            self.get_associated_web_resources(
                identifier, args, self.summary_url,
                'No annotation info exists for this item',
                Hyperlink.DESCRIPTION, self._scrape_list))
        if not hyperlinks:
            return hyperlinks

        # Since we get multiple descriptions, and there is no
        # associated Edition, now is a good time to evaluate the quality
        # of the descriptions. This will make it easy to pick the best one
        # when this identifier is looked up.
        evaluator = SummaryEvaluator(bad_phrases=[])
        by_content = dict()
        for link in hyperlinks:
            content = link.resource.representation.content
            evaluator.add(content)
        evaluator.ready()
        for link in hyperlinks:
            resource = link.resource
            content = resource.representation.content
            quality = evaluator.score(content)
            resource.set_estimated_quality(quality)
            resource.update_quality()
        return hyperlinks

    def get_author_notes(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.author_notes_url,
            'No author notes info exists for this item',
            Hyperlink.AUTHOR, self._scrape_one)

    def get_excerpt(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.excerpt_url,
            'No excerpt info exists for this item',
            Hyperlink.SAMPLE, self._scrape_one)

    def measure_popularity(self, identifier, cutoff=None):
        if identifier.type != Identifier.ISBN:
            raise ValueError("I can only measure the popularity of ISBNs.")
        value = self.soap_client.estimated_popularity(identifier.identifier)
        # Even a complete lack of popularity data is useful--it tells
        # us there's no need to check again anytime soon.
        measurement = identifier.add_measurement(
            self.data_source, Measurement.POPULARITY, value)
        # Since there is no associated Edition, now is a good time to
        # normalize the value.
        return measurement.normalized_value

    @classmethod
    def _scrape_list(cls, soup):
        table = soup.find('table', id='Table_Main')
        if table:
            for header in table.find_all('td', class_='SectionHeader'):
                content = header.parent.next_sibling
                if content.name != 'tr':
                    continue
                if not content.td:
                    continue
                yield content.td.encode_contents()

    @classmethod
    def _scrape_one(cls, soup):
        table = soup.find('table', id='Table_Main')
        if not table:
            return []
        if table.tr and table.tr.td:
            return [table.tr.td.encode_contents()]
        else:
            return []
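
# Illustrative usage sketch (not part of the original module):
# instantiate the API from the Content Cafe ExternalIntegration rather
# than passing credentials directly. from_config raises
# CannotLoadConfiguration if the integration lacks a username or
# password.
def _example_content_cafe_from_config(_db, mirror, isbn_identifier):
    api = ContentCafeAPI.from_config(_db, mirror)
    api.mirror_resources(isbn_identifier)
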
def run(self):
    mirrors = [OverdriveCoverImageMirror]
    ImageScaler(self._db, mirrors).run(force=self.force)
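
# Context sketch (assumption, not in the original excerpt): run() above
# reads like a method on a script class that holds a database session
# and a `force` flag, along these hypothetical lines:
#
#     class ScaleCoverImagesScript(Script):
#         def __init__(self, force=False):
#             super(ScaleCoverImagesScript, self).__init__()
#             self.force = force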