Example #1
    def test_equivalent_identifiers(self):

        edition = self._edition()
        identifier = self._identifier()
        data_source = DataSource.lookup(self._db, DataSource.OCLC)

        identifier.equivalent_to(data_source, edition.primary_identifier, 0.6)

        policy = PresentationCalculationPolicy(
            equivalent_identifier_threshold=0.5)
        assert set([identifier, edition.primary_identifier
                    ]) == set(edition.equivalent_identifiers(policy=policy))

        policy.equivalent_identifier_threshold = 0.7
        assert set([edition.primary_identifier
                    ]) == set(edition.equivalent_identifiers(policy=policy))
Example #2
    def test_policy_can_be_customized(self):
        original_policy = self.provider.policy
        new_policy = PresentationCalculationPolicy.reset_cover()

        self.provider._policy = new_policy
        eq_(new_policy, self.provider.policy)
        eq_(False, self.provider.policy.regenerate_opds_entries)
        eq_(False, self.provider.policy.choose_edition)
Example #3
    def test_policy_can_be_customized(self):
        original_policy = self.provider.policy
        new_policy = PresentationCalculationPolicy.reset_cover()

        self.provider._policy = new_policy
        eq_(new_policy, self.provider.policy)
        eq_(False, self.provider.policy.regenerate_opds_entries)
        eq_(False, self.provider.policy.choose_edition)
Example #4
    @property
    def policy(self):
        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not calculated
        # at all, or new metadata has been added that may impact the work, or
        # something went wrong trying to calculate it last time.
        if not self._policy:
            self._policy = PresentationCalculationPolicy(
                regenerate_opds_entries=True)
        return self._policy
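
Across these examples, PresentationCalculationPolicy is driven entirely by keyword flags. As orientation before the longer examples, here is a minimal sketch of the two recurring configurations: an aggressive policy that regenerates derived records, and a quiet policy that suppresses every optional step. The flag names are taken verbatim from the examples on this page; the import path follows the script in Example #16.

from core.model import PresentationCalculationPolicy

# Aggressive: rebuild cached OPDS entries and push the work to the
# search index (the combination used in Example #12; Examples #8 and
# #9 extend it with classify and regenerate_marc_record).
aggressive = PresentationCalculationPolicy(
    regenerate_opds_entries=True,
    update_search_index=True,
)

# Quiet: suppress every optional step; Example #6 uses this to apply
# metadata without recomputing the presentation.
quiet = PresentationCalculationPolicy(
    choose_edition=False,
    set_edition_metadata=False,
    classify=False,
    choose_summary=False,
    calculate_quality=False,
    choose_cover=False,
    regenerate_opds_entries=False,
)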
Example #5
    def test_work_entry_includes_contributor_links(self):
        """ContributorLane links are added to works with contributors"""
        work = self._work(with_open_access_download=True)
        contributor1 = work.presentation_edition.author_contributors[0]
        feed = self.get_parsed_feed([work])
        [entry] = feed.entries

        expected_rel_and_partial = dict(contributor='/contributor')
        self.assert_link_on_entry(
            entry,
            link_type=OPDSFeed.ACQUISITION_FEED_TYPE,
            partials_by_rel=expected_rel_and_partial,
        )

        # When there are two authors, they each get a contributor link.
        work.presentation_edition.add_contributor(u'Oprah',
                                                  Contributor.AUTHOR_ROLE)
        work.calculate_presentation(
            PresentationCalculationPolicy(regenerate_opds_entries=True),
            DummyExternalSearchIndex())
        [entry] = self.get_parsed_feed([work]).entries
        contributor_links = [l for l in entry.links if l.rel == 'contributor']
        eq_(2, len(contributor_links))
        contributor_links.sort(key=lambda l: l.href)
        for l in contributor_links:
            assert l.type == OPDSFeed.ACQUISITION_FEED_TYPE
            assert '/contributor' in l.href
        assert contributor1.sort_name in contributor_links[0].href
        assert 'Oprah' in contributor_links[1].href

        # When there's no author, there's no contributor link.
        self._db.delete(work.presentation_edition.contributions[0])
        self._db.delete(work.presentation_edition.contributions[1])
        self._db.commit()
        work.calculate_presentation(
            PresentationCalculationPolicy(regenerate_opds_entries=True),
            DummyExternalSearchIndex())
        feed = self.get_parsed_feed([work])
        [entry] = feed.entries
        eq_([], filter(lambda l: l.rel == 'contributor', entry.links))
Example #6
    def add_with_metadata(self, collection_details):
        """Adds identifiers with their metadata to a Collection's catalog"""
        client = authenticated_client_from_request(self._db)
        if isinstance(client, ProblemDetail):
            return client

        collection = collection_from_details(self._db, client,
                                             collection_details)

        data_source = DataSource.lookup(self._db,
                                        collection.name,
                                        autocreate=True)

        feed = feedparser.parse(request.data)
        entries = feed.get("entries", [])
        entries_by_urn = {entry.get('id'): entry for entry in entries}

        identifiers_by_urn, invalid_urns = Identifier.parse_urns(
            self._db, entries_by_urn.keys())

        messages = list()

        for urn in invalid_urns:
            messages.append(
                OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail))

        for urn, identifier in identifiers_by_urn.items():
            entry = entries_by_urn[urn]
            status = HTTP_OK
            description = "Already in catalog"

            if identifier not in collection.catalog:
                collection.catalog_identifier(identifier)
                status = HTTP_CREATED
                description = "Successfully added"

            message = OPDSMessage(urn, status, description)

            # Get a cover if it exists.
            image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
            images = [
                l for l in entry.get("links", [])
                if l.get("rel") in image_types
            ]
            links = [
                LinkData(image.get("rel"), image.get("href"))
                for image in images
            ]

            # Create an edition to hold the title and author. LicensePool.calculate_work
            # refuses to create a Work when there's no title, and if we have a title, author
            # and language we can attempt to look up the edition in OCLC.
            title = entry.get("title") or "Unknown Title"
            author = ContributorData(sort_name=(entry.get("author")
                                                or Edition.UNKNOWN_AUTHOR),
                                     roles=[Contributor.PRIMARY_AUTHOR_ROLE])
            language = entry.get("dcterms_language")

            presentation = PresentationCalculationPolicy(
                choose_edition=False,
                set_edition_metadata=False,
                classify=False,
                choose_summary=False,
                calculate_quality=False,
                choose_cover=False,
                regenerate_opds_entries=False,
            )
            replace = ReplacementPolicy(
                presentation_calculation_policy=presentation)
            metadata = Metadata(
                data_source,
                primary_identifier=IdentifierData(identifier.type,
                                                  identifier.identifier),
                title=title,
                language=language,
                contributors=[author],
                links=links,
            )

            edition, ignore = metadata.edition(self._db)
            metadata.apply(edition, collection, replace=replace)

            messages.append(message)

        title = "%s Catalog Item Additions for %s" % (collection.protocol,
                                                      client.url)
        url = self.collection_feed_url("add_with_metadata", collection)
        addition_feed = AcquisitionFeed(self._db,
                                        title,
                                        url, [],
                                        VerboseAnnotator,
                                        precomposed_entries=messages)

        return feed_response(addition_feed)
Example #7
    def change_book_cover(self, identifier_type, identifier, mirrors=None):
        """Save a new book cover based on the submitted form."""
        self.require_librarian(flask.request.library)

        data_source = DataSource.lookup(self._db, DataSource.LIBRARY_STAFF)

        work = self.load_work(flask.request.library, identifier_type,
                              identifier)
        if isinstance(work, ProblemDetail):
            return work

        rights_uri = flask.request.form.get("rights_status")
        rights_explanation = flask.request.form.get("rights_explanation")

        if not rights_uri:
            return INVALID_IMAGE.detailed(
                _("You must specify the image's license."))

        collection = self._get_collection_from_pools(identifier_type,
                                                     identifier)
        if isinstance(collection, ProblemDetail):
            return collection

        # Look for an appropriate mirror to store this cover image. Since
        # this mirror is only used for covers, we don't need a books mirror.
        mirrors = mirrors or dict(covers_mirror=MirrorUploader.for_collection(
            collection, ExternalIntegrationLink.COVERS),
                                  books_mirror=None)
        if not mirrors.get(ExternalIntegrationLink.COVERS):
            return INVALID_CONFIGURATION_OPTION.detailed(
                _("Could not find a storage integration for uploading the cover."
                  ))

        image = self.generate_cover_image(work, identifier_type, identifier)
        if isinstance(image, ProblemDetail):
            return image

        original, derivation_settings, cover_href, cover_rights_explanation = self._original_cover_info(
            image, work, data_source, rights_uri, rights_explanation)

        buffer = StringIO()
        image.save(buffer, format="PNG")
        content = buffer.getvalue()

        if not cover_href:
            cover_href = Hyperlink.generic_uri(
                data_source,
                work.presentation_edition.primary_identifier,
                Hyperlink.IMAGE,
                content=content)

        cover_data = LinkData(
            Hyperlink.IMAGE,
            href=cover_href,
            media_type=Representation.PNG_MEDIA_TYPE,
            content=content,
            rights_uri=rights_uri,
            rights_explanation=cover_rights_explanation,
            original=original,
            transformation_settings=derivation_settings,
        )

        presentation_policy = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=True,
            regenerate_opds_entries=True,
            regenerate_marc_record=True,
            update_search_index=False,
        )

        replacement_policy = ReplacementPolicy(
            links=True,
            # link_content is false because we already have the content.
            # We don't want the metadata layer to try to fetch it again.
            link_content=False,
            mirrors=mirrors,
            presentation_calculation_policy=presentation_policy,
        )

        metadata = Metadata(data_source, links=[cover_data])
        metadata.apply(work.presentation_edition,
                       collection,
                       replace=replacement_policy)

        # metadata.apply only updates the edition, so we also need
        # to update the work.
        work.calculate_presentation(policy=presentation_policy)

        return Response(_("Success"), 200)
Example #8
    def edit_classifications(self, identifier_type, identifier):
        """Edit a work's audience, target age, fiction status, and genres."""
        self.require_librarian(flask.request.library)

        work = self.load_work(flask.request.library, identifier_type,
                              identifier)
        if isinstance(work, ProblemDetail):
            return work

        staff_data_source = DataSource.lookup(self._db,
                                              DataSource.LIBRARY_STAFF)

        # Previous staff classifications
        primary_identifier = work.presentation_edition.primary_identifier
        old_classifications = self._db \
            .query(Classification) \
            .join(Subject) \
            .filter(
                Classification.identifier == primary_identifier,
                Classification.data_source == staff_data_source
            )
        old_genre_classifications = old_classifications \
            .filter(Subject.genre_id != None)
        old_staff_genres = [
            c.subject.genre.name for c in old_genre_classifications
            if c.subject.genre
        ]
        old_computed_genres = [
            work_genre.genre.name for work_genre in work.work_genres
        ]

        # New genres should be compared to previously computed genres
        new_genres = flask.request.form.getlist("genres")
        genres_changed = sorted(new_genres) != sorted(old_computed_genres)

        # Update audience
        new_audience = flask.request.form.get("audience")
        if new_audience != work.audience:
            # Delete all previous staff audience classifications
            for c in old_classifications:
                if c.subject.type == Subject.FREEFORM_AUDIENCE:
                    self._db.delete(c)

            # Create a new classification with a high weight
            primary_identifier.classify(
                data_source=staff_data_source,
                subject_type=Subject.FREEFORM_AUDIENCE,
                subject_identifier=new_audience,
                weight=WorkController.STAFF_WEIGHT,
            )

        # Update target age if present
        new_target_age_min = flask.request.form.get("target_age_min")
        new_target_age_min = int(
            new_target_age_min) if new_target_age_min else None
        new_target_age_max = flask.request.form.get("target_age_max")
        new_target_age_max = int(
            new_target_age_max) if new_target_age_max else None
        if (new_target_age_min is not None and new_target_age_max is not None
                and new_target_age_max < new_target_age_min):
            return INVALID_EDIT.detailed(
                _("Minimum target age must be less than maximum target age."))

        if work.target_age:
            old_target_age_min = work.target_age.lower
            old_target_age_max = work.target_age.upper
        else:
            old_target_age_min = None
            old_target_age_max = None
        if new_target_age_min != old_target_age_min or new_target_age_max != old_target_age_max:
            # Delete all previous staff target age classifications
            for c in old_classifications:
                if c.subject.type == Subject.AGE_RANGE:
                    self._db.delete(c)

            # Create a new classification with a high weight - higher than audience
            if new_target_age_min and new_target_age_max:
                age_range_identifier = "%s-%s" % (new_target_age_min,
                                                  new_target_age_max)
                primary_identifier.classify(
                    data_source=staff_data_source,
                    subject_type=Subject.AGE_RANGE,
                    subject_identifier=age_range_identifier,
                    weight=WorkController.STAFF_WEIGHT * 100,
                )

        # Update fiction status
        # If fiction status hasn't changed but genres have changed,
        # we still want to ensure that there's a staff classification
        new_fiction = flask.request.form.get("fiction") == "fiction"
        if new_fiction != work.fiction or genres_changed:
            # Delete previous staff fiction classifications
            for c in old_classifications:
                if c.subject.type == Subject.SIMPLIFIED_FICTION_STATUS:
                    self._db.delete(c)

            # Create a new classification with a high weight (higher than genre)
            fiction_term = "Fiction" if new_fiction else "Nonfiction"
            classification = primary_identifier.classify(
                data_source=staff_data_source,
                subject_type=Subject.SIMPLIFIED_FICTION_STATUS,
                subject_identifier=fiction_term,
                weight=WorkController.STAFF_WEIGHT,
            )
            classification.subject.fiction = new_fiction

        # Update genres
        # make sure all new genres are legit
        for name in new_genres:
            genre, is_new = Genre.lookup(self._db, name)
            if not isinstance(genre, Genre):
                return GENRE_NOT_FOUND
            if genres[name].is_fiction is not None and genres[
                    name].is_fiction != new_fiction:
                return INCOMPATIBLE_GENRE
            if name == "Erotica" and new_audience != "Adults Only":
                return EROTICA_FOR_ADULTS_ONLY

        if genres_changed:
            # delete existing staff classifications for genres that aren't being kept
            for c in old_genre_classifications:
                if c.subject.genre.name not in new_genres:
                    self._db.delete(c)

            # add new staff classifications for new genres
            for genre in new_genres:
                if genre not in old_staff_genres:
                    classification = primary_identifier.classify(
                        data_source=staff_data_source,
                        subject_type=Subject.SIMPLIFIED_GENRE,
                        subject_identifier=genre,
                        weight=WorkController.STAFF_WEIGHT)

            # add NONE genre classification if we aren't keeping any genres
            if len(new_genres) == 0:
                primary_identifier.classify(
                    data_source=staff_data_source,
                    subject_type=Subject.SIMPLIFIED_GENRE,
                    subject_identifier=SimplifiedGenreClassifier.NONE,
                    weight=WorkController.STAFF_WEIGHT)
            else:
                # otherwise delete existing NONE genre classification
                none_classifications = self._db \
                    .query(Classification) \
                    .join(Subject) \
                    .filter(
                        Classification.identifier == primary_identifier,
                        Subject.identifier == SimplifiedGenreClassifier.NONE
                    ) \
                    .all()
                for c in none_classifications:
                    self._db.delete(c)

        # Update presentation
        policy = PresentationCalculationPolicy(classify=True,
                                               regenerate_opds_entries=True,
                                               regenerate_marc_record=True,
                                               update_search_index=True)
        work.calculate_presentation(policy=policy)

        return Response("", 200)
Example #9
    def edit(self, identifier_type, identifier):
        """Edit a work's metadata."""
        self.require_librarian(flask.request.library)

        # TODO: It would be nice to use the metadata layer for this, but
        # this code handles empty values differently than other metadata
        # sources. When a staff member deletes a value, that indicates
        # they think it should be empty. This needs to be indicated in the
        # db so that it can overrule other data sources that set a value,
        # unlike other sources which set empty fields to None.

        work = self.load_work(flask.request.library, identifier_type,
                              identifier)
        if isinstance(work, ProblemDetail):
            return work

        changed = False

        staff_data_source = DataSource.lookup(self._db,
                                              DataSource.LIBRARY_STAFF)
        primary_identifier = work.presentation_edition.primary_identifier
        staff_edition, is_new = get_one_or_create(
            self._db,
            Edition,
            primary_identifier_id=primary_identifier.id,
            data_source_id=staff_data_source.id)
        self._db.expire(primary_identifier)

        new_title = flask.request.form.get("title")
        if new_title and work.title != new_title:
            staff_edition.title = unicode(new_title)
            changed = True

        new_subtitle = flask.request.form.get("subtitle")
        if work.subtitle != new_subtitle:
            if work.subtitle and not new_subtitle:
                new_subtitle = NO_VALUE
            staff_edition.subtitle = unicode(new_subtitle)
            changed = True

        # The form data includes roles and names for contributors in the same order.
        new_contributor_roles = flask.request.form.getlist("contributor-role")
        new_contributor_names = [
            unicode(n) for n in flask.request.form.getlist("contributor-name")
        ]
        # The first author in the form is considered the primary author, even
        # though there's no separate MARC code for that.
        for i, role in enumerate(new_contributor_roles):
            if role == Contributor.AUTHOR_ROLE:
                new_contributor_roles[i] = Contributor.PRIMARY_AUTHOR_ROLE
                break
        roles_and_names = zip(new_contributor_roles, new_contributor_names)

        # Remove any contributions that weren't in the form, and remove contributions
        # that already exist from the list so they won't be added again.
        deleted_contributions = False
        for contribution in staff_edition.contributions:
            if (contribution.role, contribution.contributor.display_name
                ) not in roles_and_names:
                self._db.delete(contribution)
                deleted_contributions = True
                changed = True
            else:
                roles_and_names.remove(
                    (contribution.role, contribution.contributor.display_name))
        if deleted_contributions:
            # Ensure the staff edition's contributions are up-to-date when
            # calculating the presentation edition later.
            self._db.refresh(staff_edition)

        # Any remaining roles and names are new contributions.
        for role, name in roles_and_names:
            # There may be one extra role at the end from the input for
            # adding a contributor, in which case it will have no
            # corresponding name and can be ignored.
            if name:
                if role not in Contributor.MARC_ROLE_CODES.keys():
                    self._db.rollback()
                    return UNKNOWN_ROLE.detailed(
                        _("Role %(role)s is not one of the known contributor roles.",
                          role=role))
                contributor = staff_edition.add_contributor(name=name,
                                                            roles=[role])
                contributor.display_name = name
                changed = True

        new_series = flask.request.form.get("series")
        if work.series != new_series:
            if work.series and not new_series:
                new_series = NO_VALUE
            staff_edition.series = unicode(new_series)
            changed = True

        new_series_position = flask.request.form.get("series_position")
        if new_series_position != None and new_series_position != '':
            try:
                new_series_position = int(new_series_position)
            except ValueError:
                self._db.rollback()
                return INVALID_SERIES_POSITION
        else:
            new_series_position = None
        if work.series_position != new_series_position:
            if work.series_position and new_series_position == None:
                new_series_position = NO_NUMBER
            staff_edition.series_position = new_series_position
            changed = True

        new_medium = flask.request.form.get("medium")
        if new_medium:
            if new_medium not in Edition.medium_to_additional_type.keys():
                self._db.rollback()
                return UNKNOWN_MEDIUM.detailed(
                    _("Medium %(medium)s is not one of the known media.",
                      medium=new_medium))
            staff_edition.medium = new_medium
            changed = True

        new_language = flask.request.form.get("language")
        if new_language != None and new_language != '':
            new_language = LanguageCodes.string_to_alpha_3(new_language)
            if not new_language:
                self._db.rollback()
                return UNKNOWN_LANGUAGE
        else:
            new_language = None
        if new_language != staff_edition.language:
            staff_edition.language = new_language
            changed = True

        new_publisher = flask.request.form.get("publisher")
        if new_publisher != staff_edition.publisher:
            if staff_edition.publisher and not new_publisher:
                new_publisher = NO_VALUE
            staff_edition.publisher = unicode(new_publisher)
            changed = True

        new_imprint = flask.request.form.get("imprint")
        if new_imprint != staff_edition.imprint:
            if staff_edition.imprint and not new_imprint:
                new_imprint = NO_VALUE
            staff_edition.imprint = unicode(new_imprint)
            changed = True

        new_issued = flask.request.form.get("issued")
        if new_issued != None and new_issued != '':
            try:
                new_issued = datetime.strptime(new_issued, '%Y-%m-%d')
            except ValueError:
                self._db.rollback()
                return INVALID_DATE_FORMAT
        else:
            new_issued = None
        if new_issued != staff_edition.issued:
            staff_edition.issued = new_issued
            changed = True

        # TODO: This lets library staff add a 1-5 rating, which is used in the
        # quality calculation. However, this doesn't work well if there are any
        # other measurements that contribute to the quality. The form will show
        # the calculated quality rather than the staff rating, which will be
        # confusing. It might also be useful to make it more clear how this
        # relates to the quality threshold in the library settings.
        changed_rating = False
        new_rating = flask.request.form.get("rating")
        if new_rating != None and new_rating != '':
            try:
                new_rating = float(new_rating)
            except ValueError:
                self._db.rollback()
                return INVALID_RATING
            scale = Measurement.RATING_SCALES[DataSource.LIBRARY_STAFF]
            if new_rating < scale[0] or new_rating > scale[1]:
                self._db.rollback()
                return INVALID_RATING.detailed(
                    _("The rating must be a number between %(low)s and %(high)s.",
                      low=scale[0],
                      high=scale[1]))
            if (new_rating - scale[0]) / (scale[1] - scale[0]) != work.quality:
                primary_identifier.add_measurement(
                    staff_data_source,
                    Measurement.RATING,
                    new_rating,
                    weight=WorkController.STAFF_WEIGHT)
                changed = True
                changed_rating = True

        changed_summary = False
        new_summary = flask.request.form.get("summary") or ""
        if new_summary != work.summary_text:
            old_summary = None
            if work.summary and work.summary.data_source == staff_data_source:
                old_summary = work.summary

            work.presentation_edition.primary_identifier.add_link(
                Hyperlink.DESCRIPTION,
                None,
                staff_data_source,
                content=new_summary)

            # Delete previous staff summary
            if old_summary:
                for link in old_summary.links:
                    self._db.delete(link)
                self._db.delete(old_summary)

            changed = True
            changed_summary = True

        if changed:
            # Even if the presentation doesn't visibly change, we want
            # to regenerate the OPDS entries and update the search
            # index for the work, because that might be the 'real'
            # problem the user is trying to fix.
            policy = PresentationCalculationPolicy(
                classify=True,
                regenerate_opds_entries=True,
                regenerate_marc_record=True,
                update_search_index=True,
                calculate_quality=changed_rating,
                choose_summary=changed_summary,
            )
            work.calculate_presentation(policy=policy)

        return Response("", 200)
Example #10
    def __init__(self,
                 collection,
                 uploader=None,
                 viaf_client=None,
                 linked_data_coverage_provider=None,
                 content_cafe_api=None,
                 overdrive_api_class=OverdriveAPI,
                 **kwargs):

        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection, **kwargs)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror to S3.
        if not uploader:
            uploader = S3Uploader.from_config(self._db)
        self.uploader = uploader

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        self.policy = PresentationCalculationPolicy(
            regenerate_opds_entries=True)

        self.overdrive_api = self.create_overdrive_api(overdrive_api_class)

        self.content_cafe_api = content_cafe_api

        # Determine the optional and required coverage providers.
        # Each Identifier in this Collection's catalog will be run
        # through all relevant providers.
        (self.required_coverage_providers,
         self.optional_coverage_providers) = self.providers()

        # When we need to look up a contributor via VIAF we will use this
        # client.
        self.viaf_client = viaf_client or VIAFClient(self._db)

        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        #
        # TODO: We get many books identified by ISBN, and those books
        # _could_ be run through a LinkedDataCoverageProvider if it
        # worked a little differently. However, I don't think this
        # would be very useful, since those books will get looked up
        # through OCLC Classify, which will probably result in us
        # finding that same ISBN via OCLC Number.
        self.oclc_linked_data = (linked_data_coverage_provider
                                 or LinkedDataCoverageProvider(
                                     self._db, viaf_api=self.viaf_client))

        # The ordinary OverdriveBibliographicCoverageProvider
        # doesn't upload images, so we need to create our own
        # mirror and scaler.
        #
        # TODO: This class would be neater if we were to subclass
        # OverdriveBibliographicCoverageProvider to do the scaling and
        # uploading.
        self.image_mirrors = {
            DataSource.OVERDRIVE:
            OverdriveCoverImageMirror(self._db, uploader=uploader)
        }
        self.image_scaler = ImageScaler(self._db,
                                        self.image_mirrors.values(),
                                        uploader=uploader)
Example #11
    def test_recursively_equivalent_identifier_ids(self):
        identifier = self._identifier()
        data_source = DataSource.lookup(self._db, DataSource.MANUAL)

        strong_equivalent = self._identifier()
        identifier.equivalent_to(data_source, strong_equivalent, 0.9)

        weak_equivalent = self._identifier()
        identifier.equivalent_to(data_source, weak_equivalent, 0.2)

        level_2_equivalent = self._identifier()
        strong_equivalent.equivalent_to(data_source, level_2_equivalent, 0.5)

        level_3_equivalent = self._identifier()
        level_2_equivalent.equivalent_to(data_source, level_3_equivalent, 0.9)

        level_4_equivalent = self._identifier()
        level_3_equivalent.equivalent_to(data_source, level_4_equivalent, 0.6)

        unrelated = self._identifier()

        # With a low threshold and enough levels, we find all the identifiers.
        high_levels_low_threshold = PresentationCalculationPolicy(
            equivalent_identifier_levels=5,
            equivalent_identifier_threshold=0.1)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=high_levels_low_threshold)
        assert (set([
            identifier.id,
            strong_equivalent.id,
            weak_equivalent.id,
            level_2_equivalent.id,
            level_3_equivalent.id,
            level_4_equivalent.id,
        ]) == set(equivs[identifier.id]))

        # If we only look at one level, we don't find the level 2, 3, or 4 identifiers.
        one_level = PresentationCalculationPolicy(
            equivalent_identifier_levels=1,
            equivalent_identifier_threshold=0.1)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=one_level)
        assert set([identifier.id, strong_equivalent.id,
                    weak_equivalent.id]) == set(equivs[identifier.id])

        # If we raise the threshold, we don't find the weak identifier.
        one_level_high_threshold = PresentationCalculationPolicy(
            equivalent_identifier_levels=1,
            equivalent_identifier_threshold=0.4)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=one_level_high_threshold)
        assert set([identifier.id,
                    strong_equivalent.id]) == set(equivs[identifier.id])

        # For deeper levels, the strength is the product of the strengths
        # of all the equivalencies in between the two identifiers.

        # In this example:
        # identifier - level_2_equivalent = 0.9 * 0.5 = 0.45
        # identifier - level_3_equivalent = 0.9 * 0.5 * 0.9 = 0.405
        # identifier - level_4_equivalent = 0.9 * 0.5 * 0.9 * 0.6 = 0.243

        # With a threshold of 0.5, level 2 and all subsequent levels are too weak.
        high_levels_high_threshold = PresentationCalculationPolicy(
            equivalent_identifier_levels=5,
            equivalent_identifier_threshold=0.5)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=high_levels_high_threshold)
        assert set([identifier.id,
                    strong_equivalent.id]) == set(equivs[identifier.id])

        # With a threshold of 0.25, level 2 is strong enough, but level
        # 4 is too weak.
        high_levels_lower_threshold = PresentationCalculationPolicy(
            equivalent_identifier_levels=5,
            equivalent_identifier_threshold=0.25)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=high_levels_lower_threshold)
        assert (set([
            identifier.id,
            strong_equivalent.id,
            level_2_equivalent.id,
            level_3_equivalent.id,
        ]) == set(equivs[identifier.id]))

        # It also works if we start from other identifiers.
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [strong_equivalent.id], policy=high_levels_low_threshold)
        assert (set([
            identifier.id,
            strong_equivalent.id,
            weak_equivalent.id,
            level_2_equivalent.id,
            level_3_equivalent.id,
            level_4_equivalent.id,
        ]) == set(equivs[strong_equivalent.id]))

        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [level_4_equivalent.id],
            policy=high_levels_low_threshold)
        assert (set([
            identifier.id,
            strong_equivalent.id,
            level_2_equivalent.id,
            level_3_equivalent.id,
            level_4_equivalent.id,
        ]) == set(equivs[level_4_equivalent.id]))

        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [level_4_equivalent.id],
            policy=high_levels_high_threshold)
        assert set([
            level_2_equivalent.id, level_3_equivalent.id, level_4_equivalent.id
        ]) == set(equivs[level_4_equivalent.id])

        # A chain of very strong equivalents can keep a high strength
        # even at deep levels. This wouldn't work if we changed the strength
        # threshold by level instead of accumulating a strength product.
        another_identifier = self._identifier()
        l2 = self._identifier()
        l3 = self._identifier()
        l4 = self._identifier()
        l2.equivalent_to(data_source, another_identifier, 1)
        l3.equivalent_to(data_source, l2, 1)
        l4.equivalent_to(data_source, l3, 0.9)
        high_levels_fairly_high_threshold = PresentationCalculationPolicy(
            equivalent_identifier_levels=5,
            equivalent_identifier_threshold=0.89)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [another_identifier.id],
            high_levels_fairly_high_threshold)
        assert set([another_identifier.id, l2.id, l3.id,
                    l4.id]) == set(equivs[another_identifier.id])

        # We can look for multiple identifiers at once.
        two_levels_high_threshold = PresentationCalculationPolicy(
            equivalent_identifier_levels=2,
            equivalent_identifier_threshold=0.8)
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db,
            [identifier.id, level_3_equivalent.id],
            policy=two_levels_high_threshold,
        )
        assert set([identifier.id,
                    strong_equivalent.id]) == set(equivs[identifier.id])
        assert set([level_2_equivalent.id, level_3_equivalent.id
                    ]) == set(equivs[level_3_equivalent.id])

        # By setting a cutoff, you can say to look deep in the tree,
        # but stop looking as soon as you have a certain number of
        # equivalents.
        with_cutoff = PresentationCalculationPolicy(
            equivalent_identifier_levels=5,
            equivalent_identifier_threshold=0.1,
            equivalent_identifier_cutoff=1,
        )
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=with_cutoff)

        # The cutoff was set to 1, but we always go at least one level
        # deep, and that gives us three equivalent identifiers. We
        # don't artificially trim it back down to 1.
        assert 3 == len(equivs[identifier.id])

        # Increase the cutoff, and we get more identifiers.
        with_cutoff.equivalent_identifier_cutoff = 5
        equivs = Identifier.recursively_equivalent_identifier_ids(
            self._db, [identifier.id], policy=with_cutoff)
        assert len(equivs[identifier.id]) > 3

        # The query() method uses the same db function, but returns
        # equivalents for all identifiers together so it can be used
        # as a subquery.
        query = Identifier.recursively_equivalent_identifier_ids_query(
            Identifier.id, policy=high_levels_low_threshold)
        query = query.where(Identifier.id == identifier.id)
        results = self._db.execute(query)
        equivalent_ids = [r[0] for r in results]
        assert (set([
            identifier.id,
            strong_equivalent.id,
            weak_equivalent.id,
            level_2_equivalent.id,
            level_3_equivalent.id,
            level_4_equivalent.id,
        ]) == set(equivalent_ids))

        query = Identifier.recursively_equivalent_identifier_ids_query(
            Identifier.id, policy=two_levels_high_threshold)
        query = query.where(
            Identifier.id.in_([identifier.id, level_3_equivalent.id]))
        results = self._db.execute(query)
        equivalent_ids = [r[0] for r in results]
        assert (set([
            identifier.id,
            strong_equivalent.id,
            level_2_equivalent.id,
            level_3_equivalent.id,
        ]) == set(equivalent_ids))
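
The comments in the test above treat the strength between two identifiers as the product of the strengths along the path connecting them. A standalone arithmetic check of those numbers (plain Python, independent of the library):

# Edge strengths along the chain in Example #11:
# identifier -> strong_equivalent -> level_2 -> level_3 -> level_4.
path = [0.9, 0.5, 0.9, 0.6]
cumulative = []
product = 1.0
for strength in path:
    product *= strength
    cumulative.append(round(product, 3))
print cumulative  # [0.9, 0.45, 0.405, 0.243]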
Example #12
# Find all books where the edition associated with the LicensePool has a
# different medium from the presentation edition.
_db = production_session()

# Find all the LicensePools that aren't books.
subq = select([LicensePool.id]).select_from(
    join(
        LicensePool, Edition,
        and_(LicensePool.data_source_id == Edition.data_source_id,
             LicensePool.identifier_id == Edition.primary_identifier_id))
).where(Edition.medium != Edition.BOOK_MEDIUM)

# Of those LicensePools, find every LicensePool whose presentation
# edition says it _is_ a book.
qu = _db.query(LicensePool).join(
    Edition, LicensePool.presentation_edition_id == Edition.id).filter(
        LicensePool.id.in_(subq)).filter(Edition.medium == Edition.BOOK_MEDIUM)

print "Recalculating presentation edition for %d LicensePools." % qu.count()

for lp in qu:
    # Recalculate that LicensePool's presentation edition, and then its
    # work presentation.
    lp.set_presentation_edition()
    policy = PresentationCalculationPolicy(regenerate_opds_entries=True,
                                           update_search_index=True)
    work, is_new = lp.calculate_work()
    work.calculate_presentation(policy)
    print "New medium: %s" % lp.presentation_edition.medium
    _db.commit()
Example #13
    def test_recursively_equivalent_identifiers(self):

        # We start with a Gutenberg book.
        gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
        record, ignore = Edition.for_foreign_id(self._db, gutenberg,
                                                Identifier.GUTENBERG_ID, "100")
        gutenberg_id = record.primary_identifier

        # We use OCLC Classify to do a title/author lookup.
        oclc = DataSource.lookup(self._db, DataSource.OCLC)
        search_id, ignore = Identifier.for_foreign_id(self._db,
                                                      Identifier.OCLC_WORK,
                                                      "60010")
        gutenberg_id.equivalent_to(oclc, search_id, 1)

        # The title/author lookup associates the search term with two
        # different OCLC Numbers.
        oclc_id, ignore = Identifier.for_foreign_id(self._db,
                                                    Identifier.OCLC_NUMBER,
                                                    "9999")
        oclc_id_2, ignore = Identifier.for_foreign_id(self._db,
                                                      Identifier.OCLC_NUMBER,
                                                      "1000")

        search_id.equivalent_to(oclc, oclc_id, 1)
        search_id.equivalent_to(oclc, oclc_id_2, 1)

        # We then use OCLC Linked Data to connect one of the OCLC
        # Numbers with an ISBN.
        linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
        isbn_id, ignore = Identifier.for_foreign_id(self._db, Identifier.ISBN,
                                                    "900100434X")
        oclc_id.equivalent_to(linked_data, isbn_id, 1)

        # As it turns out, we have an Overdrive work record...
        overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
        overdrive_record, ignore = Edition.for_foreign_id(
            self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
        overdrive_id = overdrive_record.primary_identifier

        # ...which is tied (by Overdrive) to the same ISBN.
        overdrive_id.equivalent_to(overdrive, isbn_id, 1)

        # Finally, here's a completely unrelated Edition, which
        # will not show up.
        gutenberg2, ignore = Edition.for_foreign_id(self._db, gutenberg,
                                                    Identifier.GUTENBERG_ID,
                                                    "200")
        gutenberg2.title = "Unrelated Gutenberg record."

        levels = [
            record.equivalent_identifiers(policy=PresentationCalculationPolicy(
                equivalent_identifier_levels=i)) for i in range(0, 5)
        ]

        # At level 0, the only identifier found is the Gutenberg ID.
        assert set([gutenberg_id]) == set(levels[0])

        # At level 1, we pick up the title/author lookup.
        assert set([gutenberg_id, search_id]) == set(levels[1])

        # At level 2, we pick up the title/author lookup and the two
        # OCLC Numbers.
        assert set([gutenberg_id, search_id, oclc_id,
                    oclc_id_2]) == set(levels[2])

        # At level 3, we also pick up the ISBN.
        assert set([gutenberg_id, search_id, oclc_id, oclc_id_2,
                    isbn_id]) == set(levels[3])

        # At level 4, the recursion starts to go in the other
        # direction: we pick up the Overdrive ID that's equivalent to
        # the same ISBN as the OCLC Number.
        assert set([
            gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id, overdrive_id
        ]) == set(levels[4])
Example #14
class IdentifierResolutionCoverageProvider(CatalogCoverageProvider):
    """Make sure all Identifiers associated with some Collection become
    Works.

    Coverage happens by running the Identifier through _other_
    CoverageProviders, which fill in the blanks with data from
    third-party entities.

    This CoverageProvider may force those other CoverageProviders to
    do their work for each Identifier immediately, or it may simply
    register its Identifiers with those CoverageProviders and allow
    them to complete the work at their own pace.

    Unlike most CoverageProviders, which are invoked from a script,
    this CoverageProvider is invoked from
    URNLookupController.process_urns, and only when a client expresses
    a desire that we look into a specific identifier.
    """

    SERVICE_NAME = "Identifier Resolution Coverage Provider"
    DATA_SOURCE_NAME = DataSource.INTERNAL_PROCESSING

    # These are the only identifier types we have any hope of providing
    # insight into.
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OVERDRIVE_ID,
        Identifier.ISBN,
        Identifier.URI,
    ]
    OPERATION = CoverageRecord.RESOLVE_IDENTIFIER_OPERATION

    # We cover all Collections, regardless of their protocol.
    PROTOCOL = None

    def __init__(self,
                 collection,
                 mirror=None,
                 http_get=None,
                 viaf=None,
                 provide_coverage_immediately=False,
                 force=False,
                 provider_kwargs=None,
                 **kwargs):
        """Constructor.

        :param collection: Handle all Identifiers from this Collection
        that were previously registered with this CoverageProvider.

        :param mirror: A MirrorUploader to use if coverage requires
        uploading any cover images to external storage.

        :param http_get: A drop-in replacement for
        Representation.simple_http_get, to be used if any information
        (such as a book cover) needs to be obtained from the public
        Internet.

        :param viaf: A VIAFClient to use if coverage requires
        gathering information about authors from VIAF.

        :param force: Force CoverageProviders to cover identifiers
        even if they believe they have already done the work.

        :param provide_coverage_immediately: If this is True, then
        resolving an identifier means registering it with all of its
        other CoverageProviders *and then attempting to provide
        coverage*.  Registration is considered a success even if the
        other CoverageProviders fail, but the attempt must be made
        immediately.

        If this is False (the default), then resolving an identifier
        just means registering it with all other relevant
        CoverageProviders.

        :param provider_kwargs: Pass this object in as provider_kwargs
        when calling gather_providers at the end of the
        constructor. Used only in testing.

        """
        _db = Session.object_session(collection)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror using the sitewide MirrorUploader.
        if not mirror:
            try:
                mirror = MirrorUploader.sitewide(_db)
            except CannotLoadConfiguration, e:
                logging.error(
                    "No storage integration is configured. Cover images will not be stored anywhere.",
                    exc_info=e)
        self.mirror = mirror

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation = PresentationCalculationPolicy(
            regenerate_opds_entries=True)
        replacement_policy = ReplacementPolicy.from_metadata_source(
            presentation_calculation_policy=presentation,
            mirror=self.mirror,
            http_get=http_get,
        )
        super(IdentifierResolutionCoverageProvider,
              self).__init__(collection,
                             replacement_policy=replacement_policy,
                             **kwargs)

        self.provide_coverage_immediately = provide_coverage_immediately
        self.force = force or provide_coverage_immediately

        self.viaf = viaf or VIAFClient(self._db)

        # Instantiate the coverage providers that may be relevant
        # to any given Identifier.
        #
        # Each Identifier in this Collection's catalog will be registered
        # with all relevant providers (if provide_coverage_immediately
        # is False) or immediately covered by all relevant providers
        # (if provide_coverage_immediately is True).
        self.providers = self.gather_providers(provider_kwargs)
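
Given the constructor above, a minimal usage sketch; the collection variable is hypothetical, and only the keyword names come from the signature and docstring:

# Hypothetical invocation; `collection` is assumed to be a Collection
# already loaded in a database session.
provider = IdentifierResolutionCoverageProvider(
    collection,
    provide_coverage_immediately=True,  # register and attempt coverage now
)
# Note: the constructor sets self.force to True whenever
# provide_coverage_immediately is True.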
Example #15
    def edit(self, data_source, identifier_type, identifier):
        """Edit a work's metadata."""

        pool = self.load_licensepool(data_source, identifier_type, identifier)
        if isinstance(pool, ProblemDetail):
            return pool
        work = pool.work
        changed = False

        staff_data_source = DataSource.lookup(self._db, DataSource.LIBRARY_STAFF)
        primary_identifier = work.presentation_edition.primary_identifier
        staff_edition, is_new = get_one_or_create(
            self._db, Edition,
            primary_identifier_id=primary_identifier.id,
            data_source_id=staff_data_source.id
        )
        self._db.expire(primary_identifier)

        new_title = flask.request.form.get("title")
        if new_title and work.title != new_title:
            staff_edition.title = unicode(new_title)
            changed = True

        new_subtitle = flask.request.form.get("subtitle")
        if work.subtitle != new_subtitle:
            if work.subtitle and not new_subtitle:
                new_subtitle = NO_VALUE
            staff_edition.subtitle = unicode(new_subtitle)
            changed = True

        new_series = flask.request.form.get("series")
        if work.series != new_series:
            if work.series and not new_series:
                new_series = NO_VALUE
            staff_edition.series = unicode(new_series)
            changed = True

        new_series_position = flask.request.form.get("series_position")
        if new_series_position:
            try:
                new_series_position = int(new_series_position)
            except ValueError:
                return INVALID_SERIES_POSITION
        else:
            new_series_position = None
        if work.series_position != new_series_position:
            if work.series_position and not new_series_position:
                new_series_position = NO_NUMBER
            staff_edition.series_position = new_series_position
            changed = True

        new_summary = flask.request.form.get("summary") or ""
        if new_summary != work.summary_text:
            old_summary = None
            if work.summary and work.summary.data_source == staff_data_source:
                old_summary = work.summary

            work.presentation_edition.primary_identifier.add_link(
                Hyperlink.DESCRIPTION, None,
                staff_data_source, content=new_summary)

            # Delete previous staff summary
            if old_summary:
                for link in old_summary.links:
                    self._db.delete(link)
                self._db.delete(old_summary)

            changed = True

        if changed:
            # Even if the presentation doesn't visibly change, we want
            # to regenerate the OPDS entries and update the search
            # index for the work, because that might be the 'real'
            # problem the user is trying to fix.
            policy = PresentationCalculationPolicy(
                classify=True,
                regenerate_opds_entries=True,
                update_search_index=True,
                choose_summary=True
            )
            work.calculate_presentation(policy=policy)
        return Response("", 200)
Example #16
from core.config import Configuration
from core.model import (
    production_session,
    Representation,
    get_one,
    PresentationCalculationPolicy,
)

_db = production_session()
qu = _db.query(Representation).filter(
    Representation.image_height ==
    120).filter(Representation.image_width == 80).filter(
        Representation.url.like("http://contentcafe2.btol.com/%")).order_by(
            Representation.id)
policy = PresentationCalculationPolicy(regenerate_opds_entries=True,
                                       classify=False,
                                       choose_summary=False,
                                       calculate_quality=False)
for rep in qu:
    print rep.id
    identifiers = [h.identifier for h in rep.resource.links]
    fix_editions = []
    for identifier in identifiers:
        print identifier
        for edition in identifier.primarily_identifies:
            if (edition.cover_thumbnail_url
                    and 'Content' in edition.cover_thumbnail_url) or (
                        edition.cover_full_url
                        and 'Content' in edition.cover_full_url):
                fix_editions.append(edition)

    # Delete the hyperlinks so we don't use these images anymore.