예제 #1
0
 def __init__(self, test_session=None):
     """Set up the coverage providers and VIAF client this script uses.

     :param test_session: optional database session override, so tests
         can run without db session overlap.
     """
     if test_session:
         # A test injected its own session; use it instead of the default.
         self._session = test_session
     db = self._db
     self.coverage = LinkedDataCoverageProvider(db)
     self.oclc_classify = OCLCClassifyCoverageProvider(db)
     self.viaf = VIAFClient(db)
예제 #2
0
    def test_all_authors_get_viaf_lookup(self):
        # TODO: The code this calls could be refactored quite a bit --
        # we don't really need to test all of process_item() here.
        # But ATM it does seem to be our only test of process_item().

        mock_oclc = MockOCLCLinkedDataAPI()
        mock_viaf = MockVIAFClient()
        provider = LinkedDataCoverageProvider(
            self._db, api=mock_oclc, viaf_api=mock_viaf
        )

        # A placeholder edition whose contributor data will be filled
        # in from OCLC Linked Data.
        edition = self._edition()
        for contribution in edition.contributions:
            self._db.delete(contribution)
        self._db.commit()
        identifier = edition.primary_identifier

        # OCLC Linked Data will mention two authors -- one with a
        # sort name + VIAF, and one with a VIAF but no sort name.
        author_with_viaf_only = ContributorData(viaf="1")
        author_with_sort_name = ContributorData(
            viaf="2", sort_name="Jordan, Robert"
        )
        metadata = Metadata(
            DataSource.OCLC_LINKED_DATA,
            contributors=[author_with_viaf_only, author_with_sort_name],
            primary_identifier=IdentifierData(
                type=identifier.type, identifier=identifier.identifier
            ),
            title=u"foo"
        )
        mock_oclc.queue_info_for(metadata)

        # The OCLC Linked Data client will try to fill in the data,
        # asking VIAF about each contributor (queued in the same
        # order as the contributors above).
        viaf_result_1 = (
            ContributorData(
                viaf="1", display_name="Display Name",
                family_name="Family", sort_name="Name, Sort",
                wikipedia_name="Wikipedia_Name"
            ),
            None, None
        )
        viaf_result_2 = (
            ContributorData(
                viaf="2", wikipedia_name="Robert_Jordan_(Author)",
                biography="That guy."
            ),
            None, None
        )
        mock_viaf.queue_lookup(viaf_result_1, viaf_result_2)

        provider.process_item(identifier)

        # Both authors have had their information updated with the
        # VIAF results.
        actual = sorted(
            (c.sort_name, c.display_name, c.viaf, c.wikipedia_name, c.biography)
            for c in edition.contributors
        )
        eq_(
            [(u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)', u'That guy.'),
             (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None)],
            actual
        )
예제 #3
0
class RedoOCLC(Explain):
    """One-off maintenance script: wipe OCLC Linked Data equivalencies
    for an identifier, re-run LinkedDataCoverageProvider over it, and
    recalculate presentation for the affected works.
    """

    def __init__(self):
        # Provider used to re-acquire OCLC Linked Data coverage.
        self.coverage = LinkedDataCoverageProvider(self._db)

    @property
    def oclcld(self):
        # The OCLC Linked Data DataSource record.
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def run(self):
        # Usage: <script> <identifier type> <identifier>
        id_type, identifier = sys.argv[1:]
        identifier, ignore = Identifier.for_foreign_id(
            self._db, id_type, identifier
        )
        self.fix_identifier(identifier)

    def fix_identifier(self, primary_identifier):
        # Collect every reachable identifier, however weak the chain
        # (threshold=0) or indirect (levels=6).
        equivalent_ids = primary_identifier.equivalent_identifier_ids(
            levels=6, threshold=0)
        return self.fix_identifier_with_equivalents(primary_identifier, equivalent_ids)

    def fix_identifier_with_equivalents(self, primary_identifier, equivalent_ids):
        """Delete existing OCLC-LD equivalencies for the given ids,
        re-run coverage, and recalculate work presentation.
        """
        for edition in primary_identifier.primarily_identifies:
            print "BEFORE"
            self.explain(self._db, edition)
            print "-" * 80

        # Perform the deletions inside a nested transaction.
        t1 = self._db.begin_nested()

        equivalencies = self._db.query(Equivalency).filter(
            Equivalency.data_source == self.oclcld).filter(
                Equivalency.input_id.in_(equivalent_ids)
            )
        print "DELETING %d" % equivalencies.count()
        for e in equivalencies:
            if e.strength == 0:
                print "DELETING %r" % e
            self._db.delete(e)
        t1.commit()

        # Re-acquire OCLC Linked Data coverage from scratch.
        self.coverage.process_item(primary_identifier)

        # NOTE(review): the two values computed below are never used
        # afterwards -- the query is never even executed. This looks
        # like dead code; confirm and remove.
        equivalent_ids = primary_identifier.equivalent_identifier_ids(
            levels=6, threshold=0)
        equivalencies = self._db.query(Equivalency).filter(
            Equivalency.data_source == self.oclcld).filter(
                Equivalency.input_id.in_(equivalent_ids),
            )

        for edition in primary_identifier.primarily_identifies:
            if edition.work:
                edition.work.calculate_presentation()
            self.explain(self._db, edition)
        print "I WOULD NOW EXPECT EVERYTHING TO BE FINE."
예제 #4
0
    def test_process_item_exception(self):
        # An OCLC Linked Data API whose lookups always blow up.
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Exception!")

        doomed_api = DoomedOCLCLinkedData(self._db)
        provider = LinkedDataCoverageProvider(self._db, api=doomed_api)

        identifier = self._edition().primary_identifier

        # The IOError is caught and surfaced as a CoverageFailure.
        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "Exception!" in result.exception
예제 #5
0
    def test_process_item_exception_missing_isbn(self):
        # An API that fails with the "couldn't find location" message.
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Tried, but couldn't find location")

        doomed_api = DoomedOCLCLinkedData(self._db)
        provider = LinkedDataCoverageProvider(self._db, api=doomed_api)

        identifier = self._edition().primary_identifier

        # That particular IOError is reported as OCLC not knowing
        # about the ISBN.
        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "OCLC doesn't know about this ISBN" in result.exception
예제 #6
0
    def test_process_item_exception(self):
        # Simulate a hard failure from the OCLC Linked Data API.
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Exception!")

        provider = LinkedDataCoverageProvider(
            self._db, api=DoomedOCLCLinkedData(self._db)
        )

        edition = self._edition()
        identifier = edition.primary_identifier

        # The exception becomes a CoverageFailure rather than
        # propagating out of process_item().
        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "Exception!" in result.exception
예제 #7
0
    def test_process_item_exception_missing_isbn(self):
        # An API whose failure message mentions a missing location.
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Tried, but couldn't find location")

        doomed = DoomedOCLCLinkedData(self._db)
        provider = LinkedDataCoverageProvider(self._db, api=doomed)

        edition = self._edition()
        identifier = edition.primary_identifier

        # The failure is reported as an unknown ISBN.
        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "OCLC doesn't know about this ISBN" in result.exception
예제 #8
0
 def __init__(self, test_session=None):
     """Initialize the shared coverage providers and VIAF client.

     :param test_session: optional database session override.
     """
     # Allows tests to run without db session overlap.
     if test_session:
         self._session = test_session
     # NOTE(review): self._db presumably derives from self._session --
     # confirm against the class that defines it.
     self.coverage = LinkedDataCoverageProvider(self._db)
     self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
     self.viaf = VIAFClient(self._db)
예제 #9
0
class RedoOCLC(Explain):
    """One-off maintenance script: delete OCLC Linked Data
    equivalencies for an identifier, re-run coverage, and recalculate
    presentation for the affected works.
    """

    def __init__(self):
        # Provider used to re-acquire OCLC Linked Data coverage.
        self.coverage = LinkedDataCoverageProvider(self._db)

    @property
    def oclcld(self):
        # The OCLC Linked Data DataSource record.
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def run(self):
        # Usage: <script> <identifier type> <identifier>
        id_type, identifier = sys.argv[1:]
        identifier, ignore = Identifier.for_foreign_id(self._db, id_type,
                                                       identifier)
        self.fix_identifier(identifier)

    def fix_identifier(self, primary_identifier):
        # Collect every reachable identifier, however weak the chain
        # (threshold=0) or indirect (levels=6).
        equivalent_ids = primary_identifier.equivalent_identifier_ids(
            levels=6, threshold=0)
        return self.fix_identifier_with_equivalents(primary_identifier,
                                                    equivalent_ids)

    def fix_identifier_with_equivalents(self, primary_identifier,
                                        equivalent_ids):
        """Delete existing OCLC-LD equivalencies for the given ids,
        re-run coverage, and recalculate work presentation.
        """
        for edition in primary_identifier.primarily_identifies:
            print "BEFORE"
            self.explain(self._db, edition)
            print "-" * 80

        # Perform the deletions inside a nested transaction.
        t1 = self._db.begin_nested()

        equivalencies = self._db.query(Equivalency).filter(
            Equivalency.data_source == self.oclcld).filter(
                Equivalency.input_id.in_(equivalent_ids))
        print "DELETING %d" % equivalencies.count()
        for e in equivalencies:
            if e.strength == 0:
                print "DELETING %r" % e
            self._db.delete(e)
        t1.commit()

        # Re-acquire coverage now that the old equivalencies are gone.
        self.coverage.process_item(primary_identifier)

        for edition in primary_identifier.primarily_identifies:
            if edition.work:
                edition.work.calculate_presentation()
            self.explain(self._db, edition)
        print "I WOULD NOW EXPECT EVERYTHING TO BE FINE."
예제 #10
0
    def setup(self):
        """Create the catalogued identifier, mock collection/APIs, and
        resolver shared by these tests.
        """
        super(TestIdentifierResolutionCoverageProvider, self).setup()
        self.identifier = self._identifier(Identifier.OVERDRIVE_ID)
        self._default_collection.catalog_identifier(self._db, self.identifier)
        self.source = DataSource.license_source_for(self._db, self.identifier)

        # Create mocks for the different collections and APIs used by
        # IdentifierResolutionCoverageProvider.
        overdrive_collection = MockOverdriveAPI.mock_collection(self._db)
        overdrive_collection.name = (IdentifierResolutionCoverageProvider.
                                     DEFAULT_OVERDRIVE_COLLECTION_NAME)
        self.viaf = MockVIAFClient(self._db)
        self.linked_data_client = MockOCLCLinkedData(self._db)
        self.linked_data_coverage_provider = LinkedDataCoverageProvider(
            self._db, None, self.viaf, api=self.linked_data_client)
        self.uploader = DummyS3Uploader()

        # Make the constructor arguments available in case a test
        # needs to create a different type of resolver.
        self.provider_kwargs = dict(
            uploader=self.uploader,
            viaf_client=self.viaf,
            overdrive_api_class=MockOverdriveAPI,
            linked_data_coverage_provider=self.linked_data_coverage_provider,
        )

        # But most tests will use this resolver.
        self.resolver = MockIdentifierResolutionCoverageProvider(
            self._default_collection, **self.provider_kwargs)

        # Create some useful CoverageProviders that can be inserted
        # into self.resolver.required_coverage_providers
        # and self.resolver.optional_coverage_providers
        self.always_successful = AlwaysSuccessfulCoverageProvider(self._db)
        self.never_successful = NeverSuccessfulCoverageProvider(self._db)
        self.broken = BrokenCoverageProvider(self._db)
예제 #11
0
 def __init__(self):
     # Provider used to (re)process identifiers via OCLC Linked Data.
     self.coverage = LinkedDataCoverageProvider(self._db)
예제 #12
0
 def setup(self):
     """Create a fresh LinkedDataCoverageProvider for each test."""
     super(TestLinkedDataCoverageProvider, self).setup()
     self.provider = LinkedDataCoverageProvider(self._db)
예제 #13
0
class TestLinkedDataCoverageProvider(DatabaseTest):
    """Tests for LinkedDataCoverageProvider: counting new ISBNs,
    setting identifier equivalencies, error handling in
    process_item(), and VIAF lookups for authors.
    """

    def setup(self):
        super(TestLinkedDataCoverageProvider, self).setup()
        # Default provider; tests that need mock APIs build their own.
        self.provider = LinkedDataCoverageProvider(self._db)

    def test_new_isbns(self):
        # Three identifiers, one of which already exists in the
        # database; new_isbns() reports 2.
        existing_id = self._identifier()
        metadata = Metadata(
            DataSource.lookup(self._db, DataSource.GUTENBERG),
            identifiers=[
                IdentifierData(type=Identifier.OCLC_WORK, identifier="abra"),
                IdentifierData(type=existing_id.type, identifier=existing_id.identifier),
                IdentifierData(type=Identifier.ISBN, identifier="kadabra"),
            ]
        )

        eq_(2, self.provider.new_isbns(metadata))

    def test_set_equivalence(self):
        # An edition with a known title and author...
        edition = self._edition()
        edition.title = "The House on Mango Street"
        edition.add_contributor(Contributor(viaf="112460612"), Contributor.AUTHOR_ROLE)
        identifier = edition.primary_identifier

        # ...metadata whose title and author match it...
        i1 = self._identifier()
        identifierdata1 = IdentifierData(type=i1.type, identifier=i1.identifier)
        good_metadata = Metadata(
            DataSource.lookup(self._db, DataSource.GUTENBERG),
            primary_identifier = identifierdata1,
            title = "The House on Mango Street",
            contributors = [Contributor(viaf="112460612")]
        )

        # ...and metadata that doesn't match at all.
        i2 = self._identifier()
        identifierdata2 = IdentifierData(type=i2.type, identifier=i2.identifier)
        bad_metadata = Metadata(
            DataSource.lookup(self._db, DataSource.GUTENBERG),
            primary_identifier = identifierdata2,
            title = "Calvin & Hobbes",
            contributors = [Contributor(viaf="101010")]
        )

        self.provider.set_equivalence(identifier, good_metadata)
        self.provider.set_equivalence(identifier, bad_metadata)
        equivalencies = Equivalency.for_identifiers(self._db, [identifier]).all()

        # The identifier for the bad metadata isn't made equivalent
        eq_([i1], [x.output for x in equivalencies])
        eq_([1], [x.strength for x in equivalencies])

        # But if the existing identifier has no editions, they're made equivalent.
        identifier = self._identifier()
        self.provider.set_equivalence(identifier, bad_metadata)
        equivalencies = Equivalency.for_identifiers(self._db, [identifier]).all()
        eq_([i2], [x.output for x in equivalencies])
        eq_([1], [x.strength for x in equivalencies])

    def test_process_item_exception(self):
        # An API whose lookups always raise.
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Exception!")

        provider = LinkedDataCoverageProvider(self._db, api=DoomedOCLCLinkedData(self._db))

        edition = self._edition()
        identifier = edition.primary_identifier

        # The exception is captured in a CoverageFailure rather than
        # propagating.
        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "Exception!" in result.exception

    def test_process_item_exception_missing_isbn(self):
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Tried, but couldn't find location")

        provider = LinkedDataCoverageProvider(
            self._db, api=DoomedOCLCLinkedData(self._db)
        )

        edition = self._edition()
        identifier = edition.primary_identifier

        # The "couldn't find location" IOError is reported as OCLC
        # not knowing about the ISBN.
        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "OCLC doesn't know about this ISBN" in result.exception

    def test_all_authors_get_viaf_lookup(self):
        # TODO: The code this calls could be refactored quite a bit --
        # we don't really need to test all of process_item() here.
        # But ATM it does seem to be our only test of process_item().

        oclc = MockOCLCLinkedDataAPI()
        viaf = MockVIAFClient()
        provider = LinkedDataCoverageProvider(
            self._db, api=oclc, viaf_api=viaf
        )

        # Here's a placeholder that will be filled in with information from
        # OCLC Linked Data.
        edition = self._edition()
        for i in edition.contributions:
            self._db.delete(i)
        self._db.commit()
        identifier = edition.primary_identifier

        # OCLC Linked Data is going to mention two authors -- one with
        # a sort name + VIAF, and one with a VIAF but no sort name.
        contributor1 = ContributorData(viaf="1")
        contributor2 = ContributorData(viaf="2", sort_name="Jordan, Robert")
        idata = IdentifierData(type=identifier.type,
                               identifier=identifier.identifier)
        metadata = Metadata(
            DataSource.OCLC_LINKED_DATA,
            contributors=[contributor1, contributor2],
            primary_identifier=idata,
            title=u"foo"
        )
        oclc.queue_info_for(metadata)

        # Our OCLC Linked Data client is going to try to fill in the
        # data, asking VIAF about the contributors.
        lookup1 = (ContributorData(
                  viaf="1", display_name="Display Name",
                  family_name="Family", sort_name="Name, Sort",
                  wikipedia_name="Wikipedia_Name"), None, None)
        lookup2 = (ContributorData(
                   viaf="2", wikipedia_name="Robert_Jordan_(Author)",
                   biography="That guy."), None, None)
        viaf.queue_lookup(lookup1, lookup2)

        provider.process_item(identifier)

        # Both authors have had their information updated with the
        # VIAF results.
        filled_in = sorted(
            [(x.sort_name, x.display_name, x.viaf, x.wikipedia_name, x.biography)
             for x in edition.contributors]
        )
        eq_(
            [(u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)', u'That guy.'),
            (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None)],
            filled_in
        )
예제 #14
0
    def __init__(self, _db, batch_size=10, cutoff_time=None,
                 uploader=None, providers=None, **kwargs):
        """Wire up the coverage providers, mirrors and clients used to
        resolve identifiers.

        :param _db: database session.
        :param batch_size: passed through to the superclass.
        :param cutoff_time: NOTE(review): accepted but never used or
            forwarded to the superclass -- confirm whether it can be
            dropped or should be passed along.
        :param uploader: optional S3 uploader; a real S3Uploader is
            created when omitted.
        :param providers: optional (required, optional) coverage
            provider lists, used by tests to avoid third-party calls.
        """
        output_source, made_new = get_one_or_create(
            _db, DataSource,
            name=DataSource.INTERNAL_PROCESSING
        )
        # Other components don't have INTERNAL_PROCESSING as offering
        # licenses, but we do, because we're responsible for managing
        # LicensePools.
        output_source.offers_licenses=True
        input_identifier_types = [Identifier.OVERDRIVE_ID, Identifier.ISBN]

        super(IdentifierResolutionCoverageProvider, self).__init__(
            service_name="Identifier Resolution Coverage Provider",
            input_identifier_types=input_identifier_types,
            output_source=output_source,
            batch_size=batch_size,
            operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        )

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        mirror = uploader or S3Uploader()

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation_calculation_policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True,
            update_search_index=True
        )
        policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror, even_if_not_apparently_updated=True,
            presentation_calculation_policy=presentation_calculation_policy
        )
        if providers:
            # For testing purposes. Initializing the real coverage providers
            # during tests can cause requests to third-parties.
            (self.required_coverage_providers,
            self.optional_coverage_providers) = providers
        else:
            overdrive = OverdriveBibliographicCoverageProvider(
                _db, metadata_replacement_policy=policy
            )
            content_cafe = ContentCafeCoverageProvider(self._db)
            content_server = ContentServerCoverageProvider(self._db)
            oclc_classify = OCLCClassifyCoverageProvider(self._db)

            self.required_coverage_providers = [
                overdrive, content_cafe, content_server, oclc_classify
            ]
            self.optional_coverage_providers = []

        self.viaf = VIAFClient(self._db)
        # NOTE(review): the image mirror and scaler receive `uploader`
        # (possibly None) rather than the `mirror` fallback built
        # above -- confirm this is intentional.
        self.image_mirrors = {
            DataSource.OVERDRIVE : OverdriveCoverImageMirror(
                self._db, uploader=uploader
            )
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader
        )
        self.oclc_linked_data = LinkedDataCoverageProvider(self._db)
예제 #15
0
class IdentifierResolutionCoverageProvider(CoverageProvider):
    """ Resolve all of the Identifiers with CoverageProviders in transient 
    failure states, turning them into Editions with LicensePools.
    Create CoverageProviders to contact 3rd party entities for information on 
    Identifier-represented library item (book).

    For ISBNs, make a bunch of Resources, rather than LicensePooled Editions.
    """

    CAN_CREATE_LICENSE_POOLS = True
    LICENSE_SOURCE_NOT_ACCESSIBLE = (
        "Could not access underlying license source over the network.")
    UNKNOWN_FAILURE = "Unknown failure."

    def __init__(self, _db, batch_size=10, cutoff_time=None,
                 uploader=None, providers=None, **kwargs):
        """Wire up the coverage providers, mirrors and clients used to
        resolve identifiers.

        :param _db: database session.
        :param batch_size: passed through to the superclass.
        :param cutoff_time: NOTE(review): accepted but never used --
            confirm whether it should be forwarded to the superclass.
        :param uploader: optional S3 uploader; a real S3Uploader is
            created when omitted.
        :param providers: optional (required, optional) coverage
            provider lists, used by tests to avoid third-party calls.
        """
        output_source, made_new = get_one_or_create(
            _db, DataSource,
            name=DataSource.INTERNAL_PROCESSING
        )
        # Other components don't have INTERNAL_PROCESSING as offering
        # licenses, but we do, because we're responsible for managing
        # LicensePools.
        output_source.offers_licenses=True
        input_identifier_types = [Identifier.OVERDRIVE_ID, Identifier.ISBN]

        super(IdentifierResolutionCoverageProvider, self).__init__(
            service_name="Identifier Resolution Coverage Provider",
            input_identifier_types=input_identifier_types,
            output_source=output_source,
            batch_size=batch_size,
            operation=CoverageRecord.RESOLVE_IDENTIFIER_OPERATION,
        )

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        mirror = uploader or S3Uploader()

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation_calculation_policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True,
            update_search_index=True
        )
        policy = ReplacementPolicy.from_metadata_source(
            mirror=mirror, even_if_not_apparently_updated=True,
            presentation_calculation_policy=presentation_calculation_policy
        )
        if providers:
            # For testing purposes. Initializing the real coverage providers
            # during tests can cause requests to third-parties.
            (self.required_coverage_providers,
            self.optional_coverage_providers) = providers
        else:
            overdrive = OverdriveBibliographicCoverageProvider(
                _db, metadata_replacement_policy=policy
            )
            content_cafe = ContentCafeCoverageProvider(self._db)
            content_server = ContentServerCoverageProvider(self._db)
            oclc_classify = OCLCClassifyCoverageProvider(self._db)

            self.required_coverage_providers = [
                overdrive, content_cafe, content_server, oclc_classify
            ]
            self.optional_coverage_providers = []

        self.viaf = VIAFClient(self._db)
        self.image_mirrors = {
            DataSource.OVERDRIVE : OverdriveCoverImageMirror(
                self._db, uploader=uploader
            )
        }
        self.image_scaler = ImageScaler(
            self._db, self.image_mirrors.values(), uploader=uploader
        )
        self.oclc_linked_data = LinkedDataCoverageProvider(self._db)

    def items_that_need_coverage(self, identifiers=None, **kwargs):
        """Find all identifiers lacking coverage from this CoverageProvider.

        Only identifiers that have been requested via the URNLookupController
        (and thus given 'transient failure' CoverageRecords) should be
        returned. Identifiers created through previous resolution processes
        can be ignored.
        """
        qu = super(IdentifierResolutionCoverageProvider, self).items_that_need_coverage(
            identifiers=identifiers, **kwargs
        )
        # Restrict to identifiers that already have a CoverageRecord,
        # i.e. ones explicitly requested for resolution.
        qu = qu.filter(CoverageRecord.id != None)
        return qu

    def process_item(self, identifier):
        """For this identifier, checks that it has all of the available
        3rd party metadata, and if not, obtains it.

        If metadata failed to be obtained, and the coverage was deemed
        required, then returns a CoverageFailure.
        """
        self.log.info("Ensuring coverage for %r", identifier)

        license_pool = self.license_pool(identifier)
        if isinstance(license_pool, CoverageFailure):
            error = ValueError(
                "Could not generate LicensePool for %r" % identifier
            )
            # BUG FIX: this previously passed the undefined name `e`,
            # which raised NameError instead of returning a failure.
            return self.transform_exception_into_failure(error, identifier)

        # Go through all relevant providers and tries to ensure coverage.
        # If there's a failure or an exception, create a CoverageFailure.
        for provider in self.required_coverage_providers:
            if identifier.type not in provider.input_identifier_types:
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
            except Exception as e:
                return self.transform_exception_into_failure(e, identifier)

            if record.exception:
                error_msg = "500: " + record.exception
                transiency = True
                if record.status == CoverageRecord.PERSISTENT_FAILURE:
                    transiency = False
                return CoverageFailure(
                    identifier, error_msg,
                    data_source=self.output_source, transient=transiency
                )

        # Now go through the optional providers. It's the same deal,
        # but a CoverageFailure doesn't cause the entire identifier
        # resolution process to fail.
        for provider in self.optional_coverage_providers:
            if identifier.type not in provider.input_identifier_types:
                continue
            try:
                record = provider.ensure_coverage(identifier, force=True)
            except Exception as e:
                return self.transform_exception_into_failure(e, identifier)

        try:
            self.finalize(identifier)
        except Exception as e:
            return self.transform_exception_into_failure(e, identifier)

        return identifier

    def transform_exception_into_failure(self, error, identifier):
        """Log the given exception and wrap it in a transient
        CoverageFailure for the identifier.
        """
        self.log.warn(
            "Error completing coverage for %r: %r", identifier, error,
            exc_info=error
        )
        return CoverageFailure(
            identifier, repr(error),
            data_source=self.output_source, transient=True
        )

    def finalize(self, identifier):
        """Sets equivalent identifiers from OCLC and processes the work."""

        self.resolve_equivalent_oclc_identifiers(identifier)
        if identifier.type==Identifier.ISBN:
            # Currently we don't try to create Works for ISBNs,
            # we just make sure all the Resources associated with the
            # ISBN are properly handled. At this point, that has
            # completed successfully, so do nothing.
            pass
        else:
            self.process_work(identifier)

    def process_work(self, identifier):
        """Fill in VIAF data and cover images where possible before setting
        a previously-unresolved identifier's work as presentation ready.
        """
        work = None
        license_pool = identifier.licensed_through
        if license_pool:
            work, created = license_pool.calculate_work(even_if_no_author=True)
        if work:
            self.resolve_viaf(work)
            self.resolve_cover_image(work)
            work.calculate_presentation()
            work.set_presentation_ready()
        else:
            # FIX: was "500; " -- normalized to "500: " to match the
            # error messages produced by process_item().
            error_msg = "500: " + "Work could not be calculated for %r" % identifier
            transiency = True
            return CoverageFailure(
                identifier, error_msg,
                data_source=self.output_source, transient=transiency
            )

    def resolve_equivalent_oclc_identifiers(self, identifier):
        """Ensures OCLC coverage for an identifier.

        This has to be called after the OCLCClassify coverage is run to confirm
        that equivalent OCLC identifiers are available.
        """
        oclc_ids = set()
        types = [Identifier.OCLC_WORK, Identifier.OCLC_NUMBER, Identifier.ISBN]
        for edition in identifier.primarily_identifies:
            oclc_ids = oclc_ids.union(
                edition.equivalent_identifiers(type=types)
            )
        for oclc_id in oclc_ids:
            self.log.info("Currently processing equivalent identifier: %r", oclc_id)
            self.oclc_linked_data.ensure_coverage(oclc_id)

    def resolve_viaf(self, work):
        """Get VIAF data on all contributors."""

        viaf = VIAFClient(self._db)
        for pool in work.license_pools:
            edition = pool.presentation_edition
            for contributor in edition.contributors:
                viaf.process_contributor(contributor)
                if not contributor.display_name:
                    contributor.family_name, contributor.display_name = (
                        contributor.default_names())

    def resolve_cover_image(self, work):
        """Make sure we have the cover for all editions."""

        for pool in work.license_pools:
            edition = pool.presentation_edition
            data_source_name = edition.data_source.name
            if data_source_name in self.image_mirrors:
                self.image_mirrors[data_source_name].mirror_edition(edition)
                self.image_scaler.scale_edition(edition)
예제 #16
0
    def __init__(self,
                 collection,
                 uploader=None,
                 viaf_client=None,
                 linked_data_coverage_provider=None,
                 content_cafe_api=None,
                 overdrive_api_class=OverdriveAPI,
                 **kwargs):
        """Wire up the uploader, APIs, coverage providers and image
        pipeline used to resolve identifiers in `collection`.

        :param collection: the Collection whose catalog is resolved.
        :param uploader: optional S3 uploader; built from config when
            omitted.
        :param viaf_client: optional VIAFClient override (for tests).
        :param linked_data_coverage_provider: optional
            LinkedDataCoverageProvider override (for tests).
        :param content_cafe_api: optional Content Cafe API client.
        :param overdrive_api_class: class used to build the Overdrive
            API client.
        """

        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection, **kwargs)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True)

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)

        self.content_cafe_api = content_cafe_api

        # Determine the optional and required coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        self.required_coverage_providers, self.optional_coverage_providers = self.providers(
        )

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (linked_data_coverage_provider
                                 or LinkedDataCoverageProvider(
                                     self._db, viaf_api=self.viaf_client))

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE:
            OverdriveCoverImageMirror(self._db, uploader=uploader)
        }
        self.image_scaler = ImageScaler(self._db,
                                        self.image_mirrors.values(),
                                        uploader=uploader)
예제 #17
0
    def test_viaf_authors_get_viaf_lookup(self):
        """process_item() should ask VIAF only about contributors that
        carry a VIAF identifier; contributors without one are left as-is."""
        # TODO: The code this calls could be refactored quite a bit --
        # we don't really need to test all of process_item() here.
        # But ATM it does seem to be our only test of process_item().

        oclc = MockOCLCLinkedDataAPI()
        viaf = MockVIAFClient()
        provider = LinkedDataCoverageProvider(self._db,
                                              api=oclc,
                                              viaf_api=viaf)

        # Here's a placeholder that will be filled in with information from
        # OCLC Linked Data.
        edition = self._edition()
        for i in edition.contributions:
            self._db.delete(i)
        self._db.commit()
        identifier = edition.primary_identifier

        # OCLC Linked Data is going to mention three authors -- one with
        # a VIAF but no sort name, one with a sort name + VIAF, and one
        # with a sort name but no VIAF.
        contributor1 = ContributorData(viaf="1")
        contributor2 = ContributorData(viaf="2", sort_name="Jordan, Robert")
        contributor3 = ContributorData(sort_name="Rice, Anne",
                                       display_name="Anne Rice")
        idata = IdentifierData(type=identifier.type,
                               identifier=identifier.identifier)
        metadata = Metadata(
            DataSource.OCLC_LINKED_DATA,
            contributors=[contributor1, contributor2, contributor3],
            primary_identifier=idata,
            title=u"foo")
        oclc.queue_info_for(metadata)

        # Our OCLC Linked Data client is going to try to fill in the
        # data, asking VIAF about the contributors that have VIAF data,
        # and not those who do not.
        lookup1 = (ContributorData(viaf="1",
                                   display_name="Display Name",
                                   family_name="Family",
                                   sort_name="Name, Sort",
                                   wikipedia_name="Wikipedia_Name"), None,
                   None)
        lookup2 = (ContributorData(viaf="2",
                                   wikipedia_name="Robert_Jordan_(Author)",
                                   biography="That guy."), None, None)
        # Queue one extra result that must NOT be consumed -- proof that
        # only two VIAF lookups were made.
        viaf.queue_lookup(lookup1, lookup2, "Unrequested lookup")

        provider.process_item(identifier)

        # Both VIAF-identified authors have had their information updated
        # with the VIAF results.
        filled_in = sorted([(x.sort_name, x.display_name, x.viaf,
                             x.wikipedia_name, x.biography)
                            for x in edition.contributors])
        eq_([(u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)',
              u'That guy.'),
             (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None),
             (u'Rice, Anne', u'Anne Rice', None, None, None)], filled_in)
        # The author without VIAF data didn't request a VIAF lookup.
        # Instead, that result is still in the mock VIAF queue.
        eq_(viaf.results, ["Unrequested lookup"])
예제 #18
0
 def __init__(self):
     """Create the OCLC Linked Data coverage provider this object uses.

     NOTE(review): self._db is never assigned here -- presumably supplied
     by a base class or mixin; confirm against the enclosing class.
     """
     self.coverage = LinkedDataCoverageProvider(self._db)
예제 #19
0
 def setup(self):
     """Run the standard test setup, then build the provider under test."""
     super(TestLinkedDataCoverageProvider, self).setup()
     self.provider = LinkedDataCoverageProvider(self._db)
예제 #20
0
class TestLinkedDataCoverageProvider(DatabaseTest):
    """Tests for LinkedDataCoverageProvider: ISBN discovery, identifier
    equivalence, error handling, and VIAF contributor lookups."""

    def setup(self):
        """Run the standard database test setup, then build the provider."""
        super(TestLinkedDataCoverageProvider, self).setup()
        self.provider = LinkedDataCoverageProvider(self._db)

    def test_new_isbns(self):
        """new_isbns() counts the identifiers in the metadata that are not
        already present in the database -- presumably only new ISBN-like
        ones; here two of the three identifiers are new."""
        existing_id = self._identifier()
        metadata = Metadata(DataSource.lookup(self._db, DataSource.GUTENBERG),
                            identifiers=[
                                IdentifierData(type=Identifier.OCLC_WORK,
                                               identifier="abra"),
                                IdentifierData(
                                    type=existing_id.type,
                                    identifier=existing_id.identifier),
                                IdentifierData(type=Identifier.ISBN,
                                               identifier="kadabra"),
                            ])

        eq_(2, self.provider.new_isbns(metadata))

    def test_set_equivalence(self):
        """set_equivalence() links an identifier to a metadata record's
        identifier only when the metadata plausibly describes the same
        book (matching title/contributors), unless the identifier has no
        editions at all."""
        edition = self._edition()
        edition.title = "The House on Mango Street"
        edition.add_contributor(Contributor(viaf="112460612"),
                                Contributor.AUTHOR_ROLE)
        identifier = edition.primary_identifier

        # Metadata that agrees with the edition's title and author.
        i1 = self._identifier()
        identifierdata1 = IdentifierData(type=i1.type,
                                         identifier=i1.identifier)
        good_metadata = Metadata(DataSource.lookup(self._db,
                                                   DataSource.GUTENBERG),
                                 primary_identifier=identifierdata1,
                                 title="The House on Mango Street",
                                 contributors=[Contributor(viaf="112460612")])

        # Metadata for an entirely different book.
        i2 = self._identifier()
        identifierdata2 = IdentifierData(type=i2.type,
                                         identifier=i2.identifier)
        bad_metadata = Metadata(DataSource.lookup(self._db,
                                                  DataSource.GUTENBERG),
                                primary_identifier=identifierdata2,
                                title="Calvin & Hobbes",
                                contributors=[Contributor(viaf="101010")])

        self.provider.set_equivalence(identifier, good_metadata)
        self.provider.set_equivalence(identifier, bad_metadata)
        equivalencies = Equivalency.for_identifiers(self._db,
                                                    [identifier]).all()

        # The identifier for the bad metadata isn't made equivalent
        eq_([i1], [x.output for x in equivalencies])
        eq_([1], [x.strength for x in equivalencies])

        # But if the existing identifier has no editions, they're made equivalent.
        identifier = self._identifier()
        self.provider.set_equivalence(identifier, bad_metadata)
        equivalencies = Equivalency.for_identifiers(self._db,
                                                    [identifier]).all()
        eq_([i2], [x.output for x in equivalencies])
        eq_([1], [x.strength for x in equivalencies])

    def test_process_item_exception(self):
        """An exception raised while fetching OCLC data is converted into a
        CoverageFailure that carries the exception text."""
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Exception!")

        provider = LinkedDataCoverageProvider(self._db,
                                              api=DoomedOCLCLinkedData(
                                                  self._db))

        edition = self._edition()
        identifier = edition.primary_identifier

        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "Exception!" in result.exception

    def test_process_item_exception_missing_isbn(self):
        """A 'couldn't find location' error from OCLC is translated into a
        friendlier 'OCLC doesn't know about this ISBN' failure message."""
        class DoomedOCLCLinkedData(OCLCLinkedData):
            def info_for(self, identifier):
                raise IOError("Tried, but couldn't find location")

        provider = LinkedDataCoverageProvider(self._db,
                                              api=DoomedOCLCLinkedData(
                                                  self._db))

        edition = self._edition()
        identifier = edition.primary_identifier

        result = provider.process_item(identifier)
        assert isinstance(result, CoverageFailure)
        assert "OCLC doesn't know about this ISBN" in result.exception

    def test_viaf_authors_get_viaf_lookup(self):
        """process_item() should ask VIAF only about contributors that
        carry a VIAF identifier; contributors without one are left as-is."""
        # TODO: The code this calls could be refactored quite a bit --
        # we don't really need to test all of process_item() here.
        # But ATM it does seem to be our only test of process_item().

        oclc = MockOCLCLinkedDataAPI()
        viaf = MockVIAFClient()
        provider = LinkedDataCoverageProvider(self._db,
                                              api=oclc,
                                              viaf_api=viaf)

        # Here's a placeholder that will be filled in with information from
        # OCLC Linked Data.
        edition = self._edition()
        for i in edition.contributions:
            self._db.delete(i)
        self._db.commit()
        identifier = edition.primary_identifier

        # OCLC Linked Data is going to mention three authors -- one with
        # a VIAF but no sort name, one with a sort name + VIAF, and one
        # with a sort name but no VIAF.
        contributor1 = ContributorData(viaf="1")
        contributor2 = ContributorData(viaf="2", sort_name="Jordan, Robert")
        contributor3 = ContributorData(sort_name="Rice, Anne",
                                       display_name="Anne Rice")
        idata = IdentifierData(type=identifier.type,
                               identifier=identifier.identifier)
        metadata = Metadata(
            DataSource.OCLC_LINKED_DATA,
            contributors=[contributor1, contributor2, contributor3],
            primary_identifier=idata,
            title=u"foo")
        oclc.queue_info_for(metadata)

        # Our OCLC Linked Data client is going to try to fill in the
        # data, asking VIAF about the contributors that have VIAF data,
        # and not those who do not.
        lookup1 = (ContributorData(viaf="1",
                                   display_name="Display Name",
                                   family_name="Family",
                                   sort_name="Name, Sort",
                                   wikipedia_name="Wikipedia_Name"), None,
                   None)
        lookup2 = (ContributorData(viaf="2",
                                   wikipedia_name="Robert_Jordan_(Author)",
                                   biography="That guy."), None, None)
        # Queue one extra result that must NOT be consumed -- proof that
        # only two VIAF lookups were made.
        viaf.queue_lookup(lookup1, lookup2, "Unrequested lookup")

        provider.process_item(identifier)

        # Both VIAF-identified authors have had their information updated
        # with the VIAF results.
        filled_in = sorted([(x.sort_name, x.display_name, x.viaf,
                             x.wikipedia_name, x.biography)
                            for x in edition.contributors])
        eq_([(u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)',
              u'That guy.'),
             (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None),
             (u'Rice, Anne', u'Anne Rice', None, None, None)], filled_in)
        # The author without VIAF data didn't request a VIAF lookup.
        # Instead, that result is still in the mock VIAF queue.
        eq_(viaf.results, ["Unrequested lookup"])
예제 #21
0
class RedoOCLCForThreeMScript(Script):
    """One-off maintenance script: re-run OCLC Linked Data coverage for
    ThreeM (3M/Bibliotheca) books that have no contributors, then pull in
    author data and recalculate presentation."""

    def __init__(self, test_session=None):
        # Allows tests to run without db session overlap.
        if test_session:
            self._session = test_session
        # NOTE(review): self._db is presumably provided by the Script base
        # class (possibly derived from self._session) -- confirm.
        self.coverage = LinkedDataCoverageProvider(self._db)
        self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
        self.viaf = VIAFClient(self._db)

    @property
    def input_data_source(self):
        """The DataSource whose coverage records this script deletes and
        re-creates (OCLC Linked Data)."""
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def do_run(self):
        """Re-runs OCLC Linked Data coverage provider to get viafs. Fetches
        author information and recalculates presentation."""
        identifiers = self.fetch_authorless_threem_identifiers()
        self.delete_coverage_records(identifiers)
        self.ensure_isbn_identifier(identifiers)
        for identifier in identifiers:
            self.coverage.ensure_coverage(identifier)
            self.merge_contributors(identifier)
            # Recalculate everything so the contributors can be seen.
            for contributor in identifier.primary_edition.contributors:
                self.viaf.process_contributor(contributor)
            identifier.primary_edition.calculate_presentation()
            if identifier.licensed_through:
                identifier.licensed_through.calculate_work()

    def fetch_authorless_threem_identifiers(self):
        """Returns a list of ThreeM identifiers that don't have contributors"""
        qu = self._db.query(Identifier).join(Identifier.primarily_identifies)
        # == None (rather than `is None`) is required by SQLAlchemy to
        # generate an IS NULL clause.
        qu = qu.outerjoin(Edition.contributions).filter(Contribution.id==None)
        qu = qu.filter(Identifier.type == Identifier.THREEM_ID)
        return qu.all()

    def delete_coverage_records(self, identifiers):
        """Deletes existing OCLC Linked Data coverage records to re-run and
        capture author data"""
        # Wrap the deletions in a savepoint so they commit as a unit.
        t1 = self._db.begin_nested()

        for identifier in identifiers:
            for coverage_record in identifier.coverage_records:
                if coverage_record.data_source == self.input_data_source:
                    self._db.delete(coverage_record)

        t1.commit()

    def ensure_isbn_identifier(self, identifiers):
        """Runs OCLCClassify to get ISBN numbers if they're not available."""
        identifiers_without_isbn = []
        for identifier in identifiers:
            equivalencies = identifier.equivalencies
            equivalent_types = [eq.output.type for eq in equivalencies]
            if Identifier.ISBN not in equivalent_types:
                identifiers_without_isbn.append(identifier)

        for identifier in identifiers_without_isbn:
            self.oclc_classify.ensure_coverage(identifier)

    def merge_contributors(self, identifier):
        """Gives a ThreeM primary edition any contributors found via OCLC-LD"""
        # Find OCLC Work/Number identifiers equivalent to this identifier.
        qu = self._db.query(Identifier).join(Identifier.inbound_equivalencies)
        qu = qu.filter(or_(
            Identifier.type == Identifier.OCLC_WORK,
            Identifier.type == Identifier.OCLC_NUMBER
        )).filter(Equivalency.input_id == identifier.id)

        # Collect every contribution attached to those OCLC identifiers'
        # editions...
        oclc_contributions = []
        for oclc_identifier in qu.all():
            editions = oclc_identifier.primarily_identifies
            for edition in editions:
                oclc_contributions += edition.contributions

        # ...and copy each one onto the ThreeM identifier's editions.
        for contribution in oclc_contributions:
            for edition in identifier.primarily_identifies:
                edition.add_contributor(contribution.contributor, contribution.role)
예제 #22
0
class RedoOCLCForThreeMScript(Script):
    """Maintenance script that re-runs OCLC Linked Data coverage for
    contributor-less ThreeM books, merges in any authors found, and
    recalculates presentation."""

    def __init__(self, test_session=None):
        # Allows tests to run without db session overlap.
        if test_session:
            self._session = test_session
        self.viaf = VIAFClient(self._db)
        self.oclc_classify = OCLCClassifyCoverageProvider(self._db)
        self.coverage = LinkedDataCoverageProvider(self._db)

    @property
    def input_data_source(self):
        """The DataSource whose coverage records get wiped and re-created."""
        return DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)

    def do_run(self):
        """Re-runs OCLC Linked Data coverage provider to get viafs. Fetches
        author information and recalculates presentation."""
        targets = self.fetch_authorless_threem_identifiers()
        self.delete_coverage_records(targets)
        self.ensure_isbn_identifier(targets)
        for target in targets:
            self.coverage.ensure_coverage(target)
            self.merge_contributors(target)
            # Recalculate everything so the contributors can be seen.
            edition = target.primary_edition
            for contributor in edition.contributors:
                self.viaf.process_contributor(contributor)
            edition.calculate_presentation()
            pool = target.licensed_through
            if pool:
                pool.calculate_work()

    def fetch_authorless_threem_identifiers(self):
        """Returns a list of ThreeM identifiers that don't have contributors"""
        # `== None` is deliberate: SQLAlchemy turns it into IS NULL.
        query = (
            self._db.query(Identifier)
            .join(Identifier.primarily_identifies)
            .outerjoin(Edition.contributions)
            .filter(Contribution.id == None)
            .filter(Identifier.type == Identifier.THREEM_ID)
        )
        return query.all()

    def delete_coverage_records(self, identifiers):
        """Deletes existing OCLC Linked Data coverage records to re-run and
        capture author data"""
        # Use a savepoint so all deletions commit together.
        savepoint = self._db.begin_nested()
        for identifier in identifiers:
            doomed = [
                record for record in identifier.coverage_records
                if record.data_source == self.input_data_source
            ]
            for record in doomed:
                self._db.delete(record)
        savepoint.commit()

    def ensure_isbn_identifier(self, identifiers):
        """Runs OCLCClassify to get ISBN numbers if they're not available."""
        missing_isbn = [
            identifier for identifier in identifiers
            if Identifier.ISBN not in [
                equivalency.output.type
                for equivalency in identifier.equivalencies
            ]
        ]
        for identifier in missing_isbn:
            self.oclc_classify.ensure_coverage(identifier)

    def merge_contributors(self, identifier):
        """Gives a ThreeM primary edition any contributors found via OCLC-LD"""
        # Equivalent OCLC Work/Number identifiers pointing at this one.
        is_oclc = or_(Identifier.type == Identifier.OCLC_WORK,
                      Identifier.type == Identifier.OCLC_NUMBER)
        query = (
            self._db.query(Identifier)
            .join(Identifier.inbound_equivalencies)
            .filter(is_oclc)
            .filter(Equivalency.input_id == identifier.id)
        )

        # Gather every contribution attached to those identifiers' editions.
        oclc_contributions = [
            contribution
            for oclc_identifier in query.all()
            for edition in oclc_identifier.primarily_identifies
            for contribution in edition.contributions
        ]

        # Copy each contribution onto the ThreeM identifier's editions.
        for contribution in oclc_contributions:
            for edition in identifier.primarily_identifies:
                edition.add_contributor(contribution.contributor,
                                        contribution.role)