Example #1
    def __init__(self,
                 _db,
                 mirror,
                 user_id,
                 password,
                 uploader=None,
                 soap_client=None):
        self._db = _db

        self.mirror = mirror
        if self.mirror:
            self.scaler = ImageScaler(_db, [self.mirror], uploader=uploader)
        else:
            self.scaler = None

        self.user_id = user_id
        self.password = password
        self.soap_client = (soap_client
                            or ContentCafeSOAPClient(user_id, password))
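A minimal construction sketch for the signature above; `session` and `mirror_instance` are placeholder names, not taken from the example:

api = ContentCafeAPI(
    session,          # database session (placeholder name)
    mirror_instance,  # or None, in which case no ImageScaler is created
    "my-user-id",     # Content Cafe credentials (placeholder values)
    "my-password",
)
# Omitting soap_client makes the constructor build a ContentCafeSOAPClient
# from the same credentials; tests can pass a stub to avoid network traffic.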
Example #2
    def __init__(self, db, mirror, user_id=None, password=None, uploader=None,
                 soap_client=None):
        self._db = db
        self.mirror = mirror
        if self.mirror:
            self.scaler = ImageScaler(db, [self.mirror], uploader=uploader)
        else:
            self.scaler = None
        integration = Configuration.integration("Content Cafe")
        self.user_id = user_id or integration['username']
        self.password = password or integration['password']
        self.log = logging.getLogger("Content Cafe API")
        self.soap_client = (
            soap_client or ContentCafeSOAPClient(self.user_id, self.password)
        )
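This variant makes the credentials optional, falling back to the "Content Cafe" integration configuration. A sketch of both call styles, assuming such an integration is configured (`session` and `mirror_instance` are placeholders):

# Explicit credentials win over the configuration lookup.
api = ContentCafeAPI(session, mirror_instance, user_id="u", password="p")

# Omitted credentials are read from
# Configuration.integration("Content Cafe")['username'] / ['password'].
api = ContentCafeAPI(session, mirror_instance)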
Example #3
    def __init__(self, _db, batch_size=10, cutoff_time=None,
                 uploader=None, providers=None, **kwargs):
        output_source, made_new = get_one_or_create(
            _db, DataSource,
            name=DataSource.INTERNAL_PROCESSING
        )
        # Other components don't treat INTERNAL_PROCESSING as a source
        # that offers licenses, but we do, because we're responsible for
        # managing LicensePools.
        output_source.offers_licenses = True
        input_identifier_types = [Identifier.OVERDRIVE_ID, Identifier.ISBN]

        super(IdentifierResolutionCoverageProvider, self).__init__(
            service_name="Identifier Resolution Coverage Provider",
            input_identifier_types=input_identifier_types,
            output_source=output_source,
            batch_size=batch_size,
            operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        )

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        mirror = uploader or S3Uploader()

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation_calculation_policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True,
            update_search_index=True
        )
        policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror, even_if_not_apparently_updated=True,
            presentation_calculation_policy=presentation_calculation_policy
        )
        if providers:
            # For testing purposes. Initializing the real coverage providers
            # during tests can cause requests to third-parties.
            (self.required_coverage_providers,
             self.optional_coverage_providers) = providers
        else:
            overdrive = OverdriveBibliographicCoverageProvider(
                _db, metadata_replacement_policy=policy
            )
            content_cafe = ContentCafeCoverageProvider(self._db)
            content_server = ContentServerCoverageProvider(self._db)
            oclc_classify = OCLCClassifyCoverageProvider(self._db)

            self.required_coverage_providers = [
                overdrive, content_cafe, content_server, oclc_classify
            ]
            self.optional_coverage_providers = []

        self.viaf = VIAFClient(self._db)
        self.image_mirrors = {
            DataSource.OVERDRIVE : OverdriveCoverImageMirror(
                self._db, uploader=uploader
            )
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader
        )
        self.oclc_linked_data = LinkedDataCoverageProvider(self._db)
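The `providers` tuple exists purely as a test seam: passing it skips construction of the real Overdrive, Content Cafe, content server, and OCLC providers, which would otherwise contact third parties. A sketch, with the stub and mock names assumed rather than taken from the example:

# Assumed stub: any object exposing `input_identifier_types` and
# `ensure_coverage()` will do for tests.
required_stubs = [AlwaysSuccessfulCoverageProvider(session)]
provider = IdentifierResolutionCoverageProvider(
    session,
    uploader=mock_uploader,          # hypothetical; avoids S3Uploader()
    providers=(required_stubs, []),  # (required, optional)
)
assert provider.optional_coverage_providers == []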
Example #4
class IdentifierResolutionCoverageProvider(CoverageProvider):
    """ Resolve all of the Identifiers with CoverageProviders in transient 
    failure states, turning them into Editions with LicensePools.
    Create CoverageProviders to contact 3rd party entities for information on 
    Identifier-represented library item (book).

    For ISBNs, make a bunch of Resources, rather than LicensePooled Editions.
    """

    CAN_CREATE_LICENSE_POOLS = True
    LICENSE_SOURCE_NOT_ACCESSIBLE = (
        "Could not access underlying license source over the network.")
    UNKNOWN_FAILURE = "Unknown failure."

    def __init__(self, _db, batch_size=10, cutoff_time=None,
                 uploader=None, providers=None, **kwargs):
        output_source, made_new = get_one_or_create(
            _db, DataSource,
            name=DataSource.INTERNAL_PROCESSING
        )
        # Other components don't treat INTERNAL_PROCESSING as a source
        # that offers licenses, but we do, because we're responsible for
        # managing LicensePools.
        output_source.offers_licenses = True
        input_identifier_types = [Identifier.OVERDRIVE_ID, Identifier.ISBN]

        super(IdentifierResolutionCoverageProvider, self).__init__(
            service_name="Identifier Resolution Coverage Provider",
            input_identifier_types=input_identifier_types,
            output_source=output_source,
            batch_size=batch_size,
            operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        )

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        mirror = uploader or S3Uploader()

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation_calculation_policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True,
            update_search_index=True
        )
        policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror, even_if_not_apparently_updated=True,
            presentation_calculation_policy=presentation_calculation_policy
        )
        if providers:
            # For testing purposes. Initializing the real coverage providers
            # during tests can cause requests to third-parties.
            (self.required_coverage_providers,
             self.optional_coverage_providers) = providers
        else:
            overdrive = OverdriveBibliographicCoverageProvider(
                _db, metadata_replacement_policy=policy
            )
            content_cafe = ContentCafeCoverageProvider(self._db)
            content_server = ContentServerCoverageProvider(self._db)
            oclc_classify = OCLCClassifyCoverageProvider(self._db)

            self.required_coverage_providers = [
                overdrive, content_cafe, content_server, oclc_classify
            ]
            self.optional_coverage_providers = []

        self.viaf = VIAFClient(self._db)
        self.image_mirrors = {
            DataSource.OVERDRIVE : OverdriveCoverImageMirror(
                self._db, uploader=uploader
            )
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader
        )
        self.oclc_linked_data = LinkedDataCoverageProvider(self._db)

    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all identifiers lacking coverage from this CoverageProvider.

        Only identifiers that have been requested via the URNLookupController
        (and thus given 'transient failure' CoverageRecords) should be
        returned. Identifiers created through previous resolution processes
        can be ignored.
        """
        qu = super(IdentifierResolutionCoverageProvider, self).items_that_need_coverage(
            identifiers=identifiers, **kwargs
        )
        qu = qu.filter(CoverageRecord.id != None)
        return qu

    def process_item(self, identifier):
        """For this identifier, checks that it has all of the available
        3rd party metadata, and if not, obtains it.

        If metadata failed to be obtained, and the coverage was deemed
        required, then returns a CoverageFailure.
        """
        self.log.info("Ensuring coverage for %r", identifier)

        license_pool = self.license_pool(identifier)
        if isinstance(license_pool, CoverageFailure):
            error = ValueError(
                "Could not generate LicensePool for %r" % identifier
            )
            return self.transform_exception_into_failure(error, identifier)

        # Go through all relevant providers and try to ensure coverage.
        # If there's a failure or an exception, create a CoverageFailure.
        for provider in self.required_coverage_providers:
            if identifier.type not in provider.input_identifier_types:
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
            except Exception as e:
                return self.transform_exception_into_failure(e, identifier)

            if record.exception:
                error_msg = "500: " + record.exception
                transiency = True
                if record.status == CoverageRecord.PERSISTENT_FAILURE:
                    transiency = False
                return CoverageFailure(
                    identifier, error_msg,
                    data_source=self.output_source, transient=transiency
                )

        # Now go through the optional providers. It's the same deal,
        # but a CoverageFailure doesn't cause the entire identifier
        # resolution process to fail.
        for provider in self.optional_coverage_providers:
            if identifier.type not in provider.input_identifier_types:
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
            except Exception as e:
                return self.transform_exception_into_failure(e, identifier)

        try:
            self.finalize(identifier)
        except Exception as e:
            return self.transform_exception_into_failure(e, identifier)

        return identifier

    def transform_exception_into_failure(self, error, identifier):
        """Ensures coverage of a given identifier by a given provider with
        appropriate error handling for broken providers.
        """
        self.log.warn(
            "Error completing coverage for %r: %r", identifier, error,
            exc_info=error
        )
        return CoverageFailure(
            identifier, repr(error),
            data_source=self.output_source, transient=True
        )

    def finalize(self, identifier):
        """Sets equivalent identifiers from OCLC and processes the work."""

        self.resolve_equivalent_oclc_identifiers(identifier)
        if identifier.type == Identifier.ISBN:
            # Currently we don't try to create Works for ISBNs,
            # we just make sure all the Resources associated with the
            # ISBN are properly handled. At this point, that has
            # completed successfully, so do nothing.
            pass
        else:
            self.process_work(identifier)

    def process_work(self, identifier):
        """Fill in VIAF data and cover images where possible before setting
        a previously-unresolved identifier's work as presentation ready.
        """
        work = None
        license_pool = identifier.licensed_through
        if license_pool:
            work, created = license_pool.calculate_work(even_if_no_author=True)
        if work:
            self.resolve_viaf(work)
            self.resolve_cover_image(work)
            work.calculate_presentation()
            work.set_presentation_ready()
        else:
            error_msg = "500; " + "Work could not be calculated for %r" % identifier
            transiency = True
            return CoverageFailure(
                identifier, error_msg,
                data_source=self.output_source, transient=transiency
            )

    def resolve_equivalent_oclc_identifiers(self, identifier):
        """Ensures OCLC coverage for an identifier.

        This has to be called after the OCLCClassify coverage is run to confirm
        that equivalent OCLC identifiers are available.
        """
        oclc_ids = set()
        types = [Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN]
        for edition in identifier.primarily_identifies:
            oclc_ids = oclc_ids.union(
                edition.equivalent_identifiers(type=types)
            )
        for oclc_id in oclc_ids:
            self.log.info("Currently processing equivalent identifier: %r", oclc_id)
            self.oclc_linked_data.ensure_coverage(oclc_id)

    def resolve_viaf(self, work):
        """Get VIAF data on all contributors."""

        viaf = VIAFClient(self._db)
        for pool in work.license_pools:
            edition = pool.presentation_edition
            for contributor in edition.contributors:
                viaf.process_contributor(contributor)
                if not contributor.display_name:
                    contributor.family_name, contributor.display_name = (
                        contributor.default_names())

    def resolve_cover_image(self, work):
        """Make sure we have the cover for all editions."""

        for pool in work.license_pools:
            edition = pool.presentation_edition
            data_source_name = edition.data_source.name
            if data_source_name in self.image_mirrors:
                self.image_mirrors[data_source_name].mirror_edition(edition)
                self.image_scaler.scale_edition(edition)
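Given the class above, one identifier can be resolved directly through `process_item`, which returns either the identifier on success or a CoverageFailure naming the step that broke. A sketch with placeholder names:

provider = IdentifierResolutionCoverageProvider(session, batch_size=5)
result = provider.process_item(overdrive_identifier)  # placeholder Identifier
if isinstance(result, CoverageFailure):
    # Required coverage failed; `transient` says whether a retry may help.
    handle_failure(result)   # placeholder error handling
else:
    # Success: the identifier's work is now presentation-ready.
    assert result is overdrive_identifier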
Example #5
class ContentCafeAPI(object):
    """Associates up to four resources with an ISBN."""

    BASE_URL = "http://contentcafe2.btol.com/"
    ONE_YEAR_AGO = datetime.timedelta(days=365)

    image_url = BASE_URL + "ContentCafe/Jacket.aspx?userID=%(userid)s&password=%(password)s&Type=L&Value=%(isbn)s"
    overview_url = BASE_URL + "ContentCafeClient/ContentCafe.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    review_url = BASE_URL + "ContentCafeClient/ReviewsDetail.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    summary_url = BASE_URL + "ContentCafeClient/Summary.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    excerpt_url = BASE_URL + "ContentCafeClient/Excerpt.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    author_notes_url = BASE_URL + "ContentCafeClient/AuthorNotes.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"

    def __init__(self, db, mirror, user_id=None, password=None, uploader=None,
                 soap_client=None):
        self._db = db
        self.mirror = mirror
        if self.mirror:
            self.scaler = ImageScaler(db, [self.mirror], uploader=uploader)
        else:
            self.scaler = None
        integration = Configuration.integration("Content Cafe")
        self.user_id = user_id or integration['username']
        self.password = password or integration['password']
        self.log = logging.getLogger("Content Cafe API")
        self.soap_client = (
            soap_client or ContentCafeSOAPClient(self.user_id, self.password)
        )

    @property
    def data_source(self):
        return DataSource.lookup(self._db, DataSource.CONTENT_CAFE)

    def mirror_resources(self, isbn_identifier):
        """Associate a number of resources with the given ISBN.
        """
        isbn = isbn_identifier.identifier

        args = dict(userid=self.user_id, password=self.password, isbn=isbn)
        image_url = self.image_url % args
        hyperlink, is_new = isbn_identifier.add_link(
            Hyperlink.IMAGE, image_url, self.data_source)
        representation = self.mirror.mirror_hyperlink(hyperlink)
        if representation.status_code == 404:
            # Content Cafe served us an HTML page instead of an
            # image. This indicates that Content Cafe has no knowledge
            # of this ISBN. There is no need to make any more
            # requests.
            return True

        self.mirror.uploader.mirror_one(representation)
        self.scaler.scale_edition(isbn_identifier)
        self.get_descriptions(isbn_identifier, args)
        self.get_excerpt(isbn_identifier, args)
        self.get_reviews(isbn_identifier, args)
        self.get_author_notes(isbn_identifier, args)
        self.measure_popularity(isbn_identifier, self.soap_client.ONE_YEAR_AGO)

    def get_associated_web_resources(
            self, identifier, args, url,
            phrase_indicating_missing_data,
            rel, scrape_method):
        url = url % args
        self.log.debug("Getting associated resources for %s", url)
        response = requests.get(url)
        content_type = response.headers['Content-Type']
        hyperlinks = []
        already_seen = set()
        if phrase_indicating_missing_data not in response.content:
            self.log.info("Found %s %s Content!", identifier.identifier, rel)
            soup = BeautifulSoup(response.content, "lxml")
            resource_contents = scrape_method(soup)
            if resource_contents:
                for content in resource_contents:
                    if content in already_seen:
                        continue
                    already_seen.add(content)
                    hyperlink, is_new = identifier.add_link(
                        rel, None, self.data_source, media_type="text/html",
                        content=content)
                    hyperlinks.append(hyperlink)
                    self.log.debug(
                        "Content: %s",
                        hyperlink.resource.representation.content[:75])
        return hyperlinks

    def get_reviews(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.review_url,
            'No review info exists for this item',
            Hyperlink.REVIEW, self._scrape_list)

    def get_descriptions(self, identifier, args):
        hyperlinks = list(self.get_associated_web_resources(
            identifier, args, self.summary_url,
            'No annotation info exists for this item',
            Hyperlink.DESCRIPTION, self._scrape_list))
        if not hyperlinks:
            return hyperlinks

        # Since we get multiple descriptions, and there is no
        # associated Edition, now is a good time to evaluate the quality
        # of the descriptions. This will make it easy to pick the best one
        # when this identifier is looked up.
        evaluator = SummaryEvaluator(bad_phrases=[])
        by_content = dict()
        for link in hyperlinks:
            content = link.resource.representation.content
            evaluator.add(content)
        evaluator.ready()
        for link in hyperlinks:
            resource = link.resource
            content = resource.representation.content
            quality = evaluator.score(content)
            resource.set_estimated_quality(quality)
            resource.update_quality()
        return hyperlinks

    def get_author_notes(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.author_notes_url,
            'No author notes info exists for this item',
            Hyperlink.AUTHOR, self._scrape_one)

    def get_excerpt(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.excerpt_url,
            'No excerpt info exists for this item', Hyperlink.SAMPLE,
            self._scrape_one)

    def measure_popularity(self, identifier, cutoff=None):
        if identifier.type != Identifier.ISBN:
            raise ValueError("I can only measure the popularity of ISBNs.")
        value = self.soap_client.estimated_popularity(identifier.identifier)
        # Even a complete lack of popularity data is useful--it tells
        # us there's no need to check again anytime soon.
        measurement = identifier.add_measurement(
            self.data_source, Measurement.POPULARITY, value)

        # Since there is no associated Edition, now is a good time to
        # normalize the value.
        return measurement.normalized_value

    @classmethod
    def _scrape_list(cls, soup):
        table = soup.find('table', id='Table_Main')
        if table:
            for header in table.find_all('td', class_='SectionHeader'):
                content = header.parent.next_sibling
                if content.name != 'tr':
                    continue
                if not content.td:
                    continue
                yield content.td.encode_contents()

    @classmethod
    def _scrape_one(cls, soup):
        table = soup.find('table', id='Table_Main')
        if not table:
            return []
        if table.tr and table.tr.td:
            return [table.tr.td.encode_contents()]
        else:
            return []
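A usage sketch for the class above: given an ISBN Identifier, `mirror_resources` fetches the jacket image and then the descriptions, excerpt, reviews, author notes, and a popularity measurement. `Identifier.for_foreign_id` is assumed from the surrounding model code; the session, mirror, and ISBN are placeholders:

api = ContentCafeAPI(session, mirror_instance)  # credentials from config
identifier, is_new = Identifier.for_foreign_id(
    session, Identifier.ISBN, "9780316044639"   # placeholder ISBN
)
api.mirror_resources(identifier)
# Returns True early if Content Cafe has never heard of the ISBN
# (the jacket-image request comes back 404).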
Example #6
    def __init__(self,
                 collection,
                 uploader=None,
                 viaf_client=None,
                 linked_data_coverage_provider=None,
                 content_cafe_api=None,
                 overdrive_api_class=OverdriveAPI,
                 **kwargs):

        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection, **kwargs)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True)

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)

        self.content_cafe_api = content_cafe_api

        # Determine the optional and required coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        (self.required_coverage_providers,
         self.optional_coverage_providers) = self.providers()

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (linked_data_coverage_provider
                                 or LinkedDataCoverageProvider(
                                     self._db, viaf_api=self.viaf_client))

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE:
            OverdriveCoverImageMirror(self._db, uploader=uploader)
        }
        self.image_scaler = ImageScaler(self._db,
                                        self.image_mirrors.values(),
                                        uploader=uploader)
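Because this revision takes all of its collaborators as constructor arguments, tests can inject doubles instead of touching S3, VIAF, OCLC, or Overdrive. A hedged sketch; every `mock_*` name and `MockOverdriveAPI` are hypothetical:

provider = IdentifierResolutionCoverageProvider(
    collection,                       # a Collection from the catalog
    uploader=mock_uploader,           # skips S3Uploader.from_config(...)
    viaf_client=mock_viaf,            # skips live VIAF lookups
    linked_data_coverage_provider=mock_linked_data,
    content_cafe_api=mock_content_cafe,
    overdrive_api_class=MockOverdriveAPI,  # hypothetical test double
)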
Example #7
class IdentifierResolutionCoverageProvider(CatalogCoverageProvider):
    """Make sure all Identifiers registered as needing coverage by this
    CoverageProvider become Works with Editions and (probably dummy)
    LicensePools.

    Coverage happens by running the Identifier through _other_
    CoverageProviders, filling in the blanks with additional data from
    third-party entities.

    For ISBNs, we end up with a bunch of Resources, rather than
    Works. TODO: This needs to change.
    """

    SERVICE_NAME = "Identifier Resolution Coverage Provider"
    DATA_SOURCE_NAME = DataSource.INTERNAL_PROCESSING
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OVERDRIVE_ID, Identifier.ISBN, Identifier.URI,
        Identifier.GUTENBERG_ID
    ]
    OPERATION = CoverageRecord.RESOLVE_IDENTIFIER_OPERATION

    LICENSE_SOURCE_NOT_ACCESSIBLE = (
        "Could not access underlying license source over the network.")
    UNKNOWN_FAILURE = "Unknown failure."

    DEFAULT_OVERDRIVE_COLLECTION_NAME = u'Default Overdrive'

    def __init__(self,
                 collection,
                 uploader=None,
                 viaf_client=None,
                 linked_data_coverage_provider=None,
                 content_cafe_api=None,
                 overdrive_api_class=OverdriveAPI,
                 **kwargs):

        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection, **kwargs)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True)

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)

        self.content_cafe_api = content_cafe_api

        # Determine the optional and required coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        (self.required_coverage_providers,
         self.optional_coverage_providers) = self.providers()

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (linked_data_coverage_provider
                                 or LinkedDataCoverageProvider(
                                     self._db, viaf_api=self.viaf_client))

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE:
            OverdriveCoverImageMirror(self._db, uploader=uploader)
        }
        self.image_scaler = ImageScaler(self._db,
                                        self.image_mirrors.values(),
                                        uploader=uploader)

    def create_overdrive_api(self, overdrive_api_class):
        collection, is_new = Collection.by_name_and_protocol(
            self._db, self.DEFAULT_OVERDRIVE_COLLECTION_NAME,
            ExternalIntegration.OVERDRIVE)
        if is_new:
            raise ValueError(
                'Default Overdrive collection has not been configured.')
        return overdrive_api_class(self._db, collection)

    def providers(self):
        """Instantiate required and optional CoverageProviders.

        All Identifiers in this Collection's catalog will be run
        through each provider. If an optional provider fails, nothing
        will happen.  If a required provider fails, the coverage
        operation as a whole will fail.

        NOTE: This method creates CoverageProviders that go against
        real servers. Because of this, tests must use a subclass that
        mocks providers(), such as
        MockIdentifierResolutionCoverageProvider.
        """
        # All books must be run through Content Cafe and OCLC
        # Classify, assuming their identifiers are of the right
        # type.
        content_cafe = ContentCafeCoverageProvider(self._db,
                                                   api=self.content_cafe_api,
                                                   uploader=self.uploader)
        oclc_classify = OCLCClassifyCoverageProvider(self._db)

        optional = []
        required = [content_cafe, oclc_classify]

        # All books derived from OPDS import against the open-access
        # content server must be looked up in that server.
        #
        # TODO: This could stand some generalization. Any OPDS server
        # that also supports the lookup protocol can be used here.
        if (self.collection.protocol == ExternalIntegration.OPDS_IMPORT
                and self.collection.data_source
                and self.collection.data_source.name
                == DataSource.OA_CONTENT_SERVER):
            required.append(LookupClientCoverageProvider(self.collection))

        # All books obtained from Overdrive must be looked up via the
        # Overdrive API.
        if self.collection.protocol == ExternalIntegration.OVERDRIVE:
            required.append(
                OverdriveBibliographicCoverageProvider(
                    self.collection, api_class=self.overdrive_api))
        return required, optional

    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all identifiers lacking coverage from this CoverageProvider.

        Only identifiers that have CoverageRecords in the 'transient
        failure' state will be returned. Unlike with other
        CoverageProviders, Identifiers that have no CoverageRecord at
        all will not be processed.
        """
        qu = super(IdentifierResolutionCoverageProvider,
                   self).items_that_need_coverage(identifiers=identifiers,
                                                  **kwargs)
        qu = qu.filter(CoverageRecord.id != None)
        return qu

    def process_item(self, identifier):
        """For this identifier, checks that it has all of the available
        3rd party metadata, and if not, obtains it.

        If metadata failed to be obtained, and the coverage was deemed
        required, then returns a CoverageFailure.
        """
        self.log.info("Ensuring coverage for %r", identifier)

        # Make sure there's a LicensePool for this Identifier in this
        # Collection. Since we're the metadata wrangler, the
        # LicensePool will probably be a stub that doesn't actually
        # represent the right to loan the book, but that's okay.
        license_pool = self.license_pool(identifier)
        if not license_pool.licenses_owned:
            license_pool.update_availability(1, 1, 0, 0)

        # Go through all relevant providers and try to ensure coverage.
        failure = self.run_through_relevant_providers(
            identifier,
            self.required_coverage_providers,
            fail_on_any_failure=True)
        if failure:
            return failure

        # Now go through relevant optional providers and try to ensure
        # coverage.
        failure = self.run_through_relevant_providers(
            identifier,
            self.optional_coverage_providers,
            fail_on_any_failure=False)
        if failure:
            return failure

        # We got coverage from all the required coverage providers,
        # and none of the optional coverage providers raised an exception,
        # so we're ready.
        try:
            self.finalize(identifier)
        except Exception as e:
            return self.transform_exception_into_failure(e, identifier)

        return identifier

    def run_through_relevant_providers(self, identifier, providers,
                                       fail_on_any_failure):
        """Run the given Identifier through a set of CoverageProviders.

        :param identifier: Process this Identifier.
        :param providers: Run `identifier` through every relevant
            CoverageProvider in this list.
        :param fail_on_any_failure: True means that each
            CoverageProvider must succeed or the whole operation
            fails. False means that if a CoverageProvider fails it's
            not a deal-breaker.
        :return: A CoverageFailure if there was an unrecoverable failure,
            None if everything went okay.
        """
        for provider in providers:
            if (provider.input_identifier_types and
                    identifier.type not in provider.input_identifier_types):
                # The CoverageProvider under consideration doesn't
                # handle Identifiers of this type.
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
                if fail_on_any_failure and record.exception:
                    # As the CoverageProvider under consideration has
                    # fallen, so must this CoverageProvider also fall.
                    error_msg = "500: " + record.exception
                    transient = (
                        record.status == CoverageRecord.TRANSIENT_FAILURE)
                    return self.failure(identifier,
                                        error_msg,
                                        transient=transient)
            except Exception as e:
                # An uncaught exception becomes a CoverageFailure no
                # matter what.
                return self.transform_exception_into_failure(e, identifier)

        # Return None to indicate success.
        return None

    def transform_exception_into_failure(self, error, identifier):
        """Ensures coverage of a given identifier by a given provider with
        appropriate error handling for broken providers.
        """
        self.log.warn("Error completing coverage for %r: %r",
                      identifier,
                      error,
                      exc_info=error)
        return self.failure(identifier, repr(error), transient=True)

    def finalize(self, identifier):
        """Sets equivalent identifiers from OCLC and processes the work."""

        self.resolve_equivalent_oclc_identifiers(identifier)
        if identifier.type == Identifier.ISBN:
            # In order to create Works for ISBNs, we first have to
            # create an edition associated with the ISBN as a primary
            # identifier. At the moment, this is achieved via OCLC
            # Linked Data.
            self.generate_edition(identifier)
        self.process_work(identifier)

    def generate_edition(self, identifier):
        """Utilizes an ISBN's equivalent identifiers (OCLC Number or Work IDs)
        to set an appropriate LicensePool presentation edition so a Work can
        later be created.
        """
        equivalent_ids = identifier.equivalent_identifier_ids()[identifier.id]

        # Get the editions of equivalent identifiers (OCLC Number or Work IDs)
        # to set as a presentation edition. These editions can be lower quality,
        # and it's important that they have a title.
        titled_equivalent_editions = self._db.query(Edition).\
            join(Edition.primary_identifier).\
            filter(Identifier.id.in_(equivalent_ids)).\
            filter(Edition.title != None)

        # It's preferable that they have an author, too.
        authored_equivalent_editions = titled_equivalent_editions.filter(
            Edition.author != None, Edition.author != Edition.UNKNOWN_AUTHOR)

        if fast_query_count(authored_equivalent_editions):
            # Prioritize editions with both a title and an author if available.
            equivalent_editions = authored_equivalent_editions.all()
        else:
            equivalent_editions = titled_equivalent_editions.all()

        if equivalent_editions:
            # Set the presentation edition.
            pool = identifier.licensed_through[0]
            pool.set_presentation_edition(
                equivalent_editions=equivalent_editions)

    def process_work(self, identifier):
        """Fill in VIAF data and cover images where possible before setting
        a previously-unresolved identifier's work as presentation ready.

        TODO: I think this should be split into a separate
        WorkCoverageProvider which runs last. That way we have a record
        of which Works have had this service.
        """
        work = None
        license_pools = identifier.licensed_through
        if license_pools:
            pool = license_pools[0]
            work, created = pool.calculate_work(even_if_no_author=True,
                                                exclude_search=True)
        if work:
            self.resolve_viaf(work)
            self.resolve_cover_image(work)

            work.calculate_presentation(policy=self.policy,
                                        exclude_search=True)
            work.set_presentation_ready(exclude_search=True)
        else:
            error_msg = "500; " + "Work could not be calculated for %r" % identifier
            raise RuntimeError(error_msg)

    def resolve_equivalent_oclc_identifiers(self, identifier):
        """Ensures OCLC coverage for an identifier.

        This has to be called after the OCLCClassify coverage is run to confirm
        that equivalent OCLC identifiers are available.
        """
        oclc_ids = set()
        if identifier.type == Identifier.ISBN:
            # ISBNs won't have editions, so they should be run through OCLC
            # to retrieve basic edition data (title, author).
            oclc_ids.add(identifier)

        types = [Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN]
        for edition in identifier.primarily_identifies:
            oclc_ids = oclc_ids.union(
                edition.equivalent_identifiers(type=types))
        for oclc_id in oclc_ids:
            self.log.info("Currently processing equivalent identifier: %r",
                          oclc_id)
            self.oclc_linked_data.ensure_coverage(oclc_id)

    def resolve_viaf(self, work):
        """Get VIAF data on all contributors."""

        for pool in work.license_pools:
            edition = pool.presentation_edition
            if not edition:
                continue
            for contributor in edition.contributors:
                self.viaf_client.process_contributor(contributor)
                if not contributor.display_name:
                    contributor.family_name, contributor.display_name = (
                        contributor.default_names())

    def resolve_cover_image(self, work):
        """Make sure we have the cover for all editions."""

        for pool in work.license_pools:
            edition = pool.presentation_edition
            data_source_name = pool.data_source.name
            if data_source_name in self.image_mirrors:
                self.image_mirrors[data_source_name].mirror_edition(edition)
                self.image_scaler.scale_edition(edition)
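As the NOTE in `providers()` warns, the real providers contact live servers, so the test suite is expected to subclass and override that method. A minimal sketch of the pattern; the stub class is assumed, not taken from the example:

class MockIdentifierResolutionCoverageProvider(
        IdentifierResolutionCoverageProvider):
    """Test double that never contacts a third party."""

    def providers(self):
        # Same (required, optional) contract as the parent method,
        # but no provider here touches the network.
        required = [AlwaysSuccessfulCoverageProvider(self._db)]
        optional = []
        return required, optional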
Example #8
class ContentCafeAPI(object):
    """Associates up to four resources with an ISBN."""

    BASE_URL = "http://contentcafe2.btol.com/"
    ONE_YEAR_AGO = datetime.timedelta(days=365)

    image_url = BASE_URL + "ContentCafe/Jacket.aspx?userID=%(userid)s&password=%(password)s&Type=L&Value=%(isbn)s"
    overview_url = BASE_URL + "ContentCafeClient/ContentCafe.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    review_url = BASE_URL + "ContentCafeClient/ReviewsDetail.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    summary_url = BASE_URL + "ContentCafeClient/Summary.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    excerpt_url = BASE_URL + "ContentCafeClient/Excerpt.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"
    author_notes_url = BASE_URL + "ContentCafeClient/AuthorNotes.aspx?UserID=%(userid)s&Password=%(password)s&ItemKey=%(isbn)s"

    log = logging.getLogger("Content Cafe API")

    @classmethod
    def from_config(cls, _db, mirror, **kwargs):
        integration = ExternalIntegration.lookup(
            _db, ExternalIntegration.CONTENT_CAFE,
            ExternalIntegration.METADATA_GOAL)

        if not integration or not (integration.username
                                   and integration.password):
            raise CannotLoadConfiguration(
                'Content Cafe not properly configured')

        return cls(_db, mirror, integration.username, integration.password,
                   **kwargs)

    def __init__(self,
                 _db,
                 mirror,
                 user_id,
                 password,
                 uploader=None,
                 soap_client=None):
        self._db = _db

        self.mirror = mirror
        if self.mirror:
            self.scaler = ImageScaler(_db, [self.mirror], uploader=uploader)
        else:
            self.scaler = None

        self.user_id = user_id
        self.password = password
        self.soap_client = (soap_client
                            or ContentCafeSOAPClient(user_id, password))

    @property
    def data_source(self):
        return DataSource.lookup(self._db, DataSource.CONTENT_CAFE)

    def mirror_resources(self, isbn_identifier):
        """Associate a number of resources with the given ISBN.
        """
        isbn = isbn_identifier.identifier

        args = dict(userid=self.user_id, password=self.password, isbn=isbn)
        image_url = self.image_url % args
        hyperlink, is_new = isbn_identifier.add_link(Hyperlink.IMAGE,
                                                     image_url,
                                                     self.data_source)
        representation = self.mirror.mirror_hyperlink(hyperlink)
        if representation.status_code == 404:
            # Content Cafe served us an HTML page instead of an
            # image. This indicates that Content Cafe has no knowledge
            # of this ISBN. There is no need to make any more
            # requests.
            return True

        self.mirror.uploader.mirror_one(representation)
        self.scaler.scale_edition(isbn_identifier)
        self.get_descriptions(isbn_identifier, args)
        self.get_excerpt(isbn_identifier, args)
        self.get_reviews(isbn_identifier, args)
        self.get_author_notes(isbn_identifier, args)
        self.measure_popularity(isbn_identifier, self.soap_client.ONE_YEAR_AGO)

    def get_associated_web_resources(self, identifier, args, url,
                                     phrase_indicating_missing_data, rel,
                                     scrape_method):
        url = url % args
        self.log.debug("Getting associated resources for %s", url)
        response = requests.get(url)
        content_type = response.headers['Content-Type']
        hyperlinks = []
        already_seen = set()
        if phrase_indicating_missing_data not in response.content:
            self.log.info("Found %s %s Content!", identifier.identifier, rel)
            soup = BeautifulSoup(response.content, "lxml")
            resource_contents = scrape_method(soup)
            if resource_contents:
                for content in resource_contents:
                    if content in already_seen:
                        continue
                    already_seen.add(content)
                    hyperlink, is_new = identifier.add_link(
                        rel,
                        None,
                        self.data_source,
                        media_type="text/html",
                        content=content)
                    hyperlinks.append(hyperlink)
                    self.log.debug(
                        "Content: %s",
                        hyperlink.resource.representation.content[:75])
        return hyperlinks

    def get_reviews(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.review_url,
            'No review info exists for this item', Hyperlink.REVIEW,
            self._scrape_list)

    def get_descriptions(self, identifier, args):
        hyperlinks = list(
            self.get_associated_web_resources(
                identifier, args, self.summary_url,
                'No annotation info exists for this item',
                Hyperlink.DESCRIPTION, self._scrape_list))
        if not hyperlinks:
            return hyperlinks

        # Since we get multiple descriptions, and there is no
        # associated Edition, now is a good time to evaluate the quality
        # of the descriptions. This will make it easy to pick the best one
        # when this identifier is looked up.
        evaluator = SummaryEvaluator(bad_phrases=[])
        by_content = dict()
        for link in hyperlinks:
            content = link.resource.representation.content
            evaluator.add(content)
        evaluator.ready()
        for link in hyperlinks:
            resource = link.resource
            content = resource.representation.content
            quality = evaluator.score(content)
            resource.set_estimated_quality(quality)
            resource.update_quality()
        return hyperlinks

    def get_author_notes(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.author_notes_url,
            'No author notes info exists for this item', Hyperlink.AUTHOR,
            self._scrape_one)

    def get_excerpt(self, identifier, args):
        return self.get_associated_web_resources(
            identifier, args, self.excerpt_url,
            'No excerpt info exists for this item', Hyperlink.SAMPLE,
            self._scrape_one)

    def measure_popularity(self, identifier, cutoff=None):
        if identifier.type != Identifier.ISBN:
            raise ValueError("I can only measure the popularity of ISBNs.")
        value = self.soap_client.estimated_popularity(identifier.identifier)
        # Even a complete lack of popularity data is useful--it tells
        # us there's no need to check again anytime soon.
        measurement = identifier.add_measurement(self.data_source,
                                                 Measurement.POPULARITY, value)

        # Since there is no associated Edition, now is a good time to
        # normalize the value.
        return measurement.normalized_value

    @classmethod
    def _scrape_list(cls, soup):
        table = soup.find('table', id='Table_Main')
        if table:
            for header in table.find_all('td', class_='SectionHeader'):
                content = header.parent.next_sibling
                if content.name != 'tr':
                    continue
                if not content.td:
                    continue
                yield content.td.encode_contents()

    @classmethod
    def _scrape_one(cls, soup):
        table = soup.find('table', id='Table_Main')
        if not table:
            return []
        if table.tr and table.tr.td:
            return [table.tr.td.encode_contents()]
        else:
            return []
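This final revision adds a `from_config` constructor backed by ExternalIntegration. A sketch of preferring it over direct construction; `session` and `mirror_instance` are placeholders:

try:
    api = ContentCafeAPI.from_config(session, mirror_instance)
except CannotLoadConfiguration:
    # No Content Cafe integration, or its username/password is unset.
    api = None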
Example #9
    def run(self):
        mirrors = [OverdriveCoverImageMirror]
        ImageScaler(self._db, mirrors).run(force=self.force)
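Example #9 is just the `run` body of some script; a hedged sketch of the wrapper class it implies, with the class name and `force` attribute assumed rather than known:

class OverdriveCoverRescaleScript(object):   # hypothetical name
    def __init__(self, _db, force=False):
        self._db = _db       # database session
        self.force = force   # True: rescale covers even if already processed

    def run(self):
        mirrors = [OverdriveCoverImageMirror]
        ImageScaler(self._db, mirrors).run(force=self.force)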