def test_equivalent_identifiers(self):
    """An equivalency weaker than the policy threshold is excluded
    from an edition's set of equivalent identifiers."""
    edition = self._edition()
    other_identifier = self._identifier()
    oclc = DataSource.lookup(self._db, DataSource.OCLC)

    # Tie the new identifier to the edition's primary identifier
    # with a strength of 0.6.
    other_identifier.equivalent_to(oclc, edition.primary_identifier, 0.6)

    # With a threshold below 0.6, both identifiers are found.
    calc_policy = PresentationCalculationPolicy(
        equivalent_identifier_threshold=0.5)
    found = set(edition.equivalent_identifiers(policy=calc_policy))
    assert {other_identifier, edition.primary_identifier} == found

    # Raising the threshold above 0.6 drops the weaker equivalency.
    calc_policy.equivalent_identifier_threshold = 0.7
    found = set(edition.equivalent_identifiers(policy=calc_policy))
    assert {edition.primary_identifier} == found
def test_policy_can_be_customized(self):
    """Assigning to the provider's private _policy attribute replaces
    the default policy returned by the `policy` property."""
    # The unused `original_policy` local from the previous version has
    # been removed -- the test never compared against it.
    new_policy = PresentationCalculationPolicy.reset_cover()
    self.provider._policy = new_policy

    # The property now serves the custom policy, and that policy's
    # flags are visible through it.
    eq_(new_policy, self.provider.policy)
    eq_(False, self.provider.policy.regenerate_opds_entries)
    eq_(False, self.provider.policy.choose_edition)
def test_policy_can_be_customized(self):
    # NOTE(review): this appears to be a byte-for-byte duplicate of
    # another test_policy_can_be_customized definition in this source;
    # if both live in the same class, the later definition silently
    # shadows the earlier one -- confirm and deduplicate.
    # `original_policy` is captured but never used.
    original_policy = self.provider.policy
    new_policy = PresentationCalculationPolicy.reset_cover()
    self.provider._policy = new_policy
    # Setting _policy makes the `policy` property serve the custom
    # policy, with its reset-cover flags.
    eq_(new_policy, self.provider.policy)
    eq_(False, self.provider.policy.regenerate_opds_entries)
    eq_(False, self.provider.policy.choose_edition)
def policy(self):
    """Lazily build and cache the presentation-calculation policy.

    We're going to be aggressive about recalculating the presentation
    for this work because either the work is currently not calculated
    at all, or new metadata has been added that may impact the work, or
    something went wrong trying to calculate it last time.
    """
    cached = self._policy
    if not cached:
        cached = PresentationCalculationPolicy(
            regenerate_opds_entries=True)
        self._policy = cached
    return cached
def test_work_entry_includes_contributor_links(self):
    """ContributorLane links are added to works with contributors"""
    work = self._work(with_open_access_download=True)
    contributor1 = work.presentation_edition.author_contributors[0]
    feed = self.get_parsed_feed([work])
    [entry] = feed.entries

    # A single-author work gets one 'contributor' link whose href
    # contains the '/contributor' path fragment.
    expected_rel_and_partial = dict(contributor='/contributor')
    self.assert_link_on_entry(
        entry,
        link_type=OPDSFeed.ACQUISITION_FEED_TYPE,
        partials_by_rel=expected_rel_and_partial,
    )

    # When there are two authors, they each get a contributor link.
    work.presentation_edition.add_contributor(
        u'Oprah', Contributor.AUTHOR_ROLE)
    # Regenerate the OPDS entry so the new contributor shows up.
    work.calculate_presentation(
        PresentationCalculationPolicy(regenerate_opds_entries=True),
        DummyExternalSearchIndex())
    [entry] = self.get_parsed_feed([work]).entries
    contributor_links = [l for l in entry.links if l.rel == 'contributor']
    eq_(2, len(contributor_links))
    # Sort by href so the assertions below see a deterministic order.
    contributor_links.sort(key=lambda l: l.href)
    for l in contributor_links:
        assert l.type == OPDSFeed.ACQUISITION_FEED_TYPE
        assert '/contributor' in l.href
    assert contributor1.sort_name in contributor_links[0].href
    assert 'Oprah' in contributor_links[1].href

    # When there's no author, there's no contributor link.
    self._db.delete(work.presentation_edition.contributions[0])
    self._db.delete(work.presentation_edition.contributions[1])
    self._db.commit()
    work.calculate_presentation(
        PresentationCalculationPolicy(regenerate_opds_entries=True),
        DummyExternalSearchIndex())
    feed = self.get_parsed_feed([work])
    [entry] = feed.entries
    # Python 2 `filter` returns a list here, so the comparison to []
    # is meaningful.
    eq_([], filter(lambda l: l.rel == 'contributor', entry.links))
def add_with_metadata(self, collection_details):
    """Add identifiers, with their metadata, to a Collection's catalog.

    The request body is an OPDS feed; each entry's URN is resolved to
    an Identifier, cataloged if necessary, and its title/author/cover
    metadata applied to a staff-created Edition.

    :param collection_details: Identifies the Collection whose catalog
        is being modified.
    :return: An OPDS feed response of per-URN status messages, or a
        ProblemDetail if the client could not be authenticated.
    """
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details)

    # Metadata submitted through this endpoint is attributed to a
    # DataSource named after the collection itself.
    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True)

    # Fix: the original initialized `messages` twice ([] and then
    # list()); a single initialization suffices.
    messages = []

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = {entry.get('id'): entry for entry in entries}

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys())

    for urn in invalid_urns:
        messages.append(
            OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail))

    # These values don't vary per entry, so build them once instead of
    # inside the loop (behavior is unchanged).
    image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
    presentation = PresentationCalculationPolicy(
        choose_edition=False,
        set_edition_metadata=False,
        classify=False,
        choose_summary=False,
        calculate_quality=False,
        choose_cover=False,
        regenerate_opds_entries=False,
    )
    replace = ReplacementPolicy(
        presentation_calculation_policy=presentation)

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        images = [
            l for l in entry.get("links", [])
            if l.get("rel") in image_types
        ]
        links = [
            LinkData(image.get("rel"), image.get("href"))
            for image in images
        ]

        # Create an edition to hold the title and author.
        # LicensePool.calculate_work refuses to create a Work when
        # there's no title, and if we have a title, author and language
        # we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE])
        language = entry.get("dcterms_language")

        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(
                identifier.type, identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (
        collection.protocol, client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages)

    return feed_response(addition_feed)
def change_book_cover(self, identifier_type, identifier, mirrors=None):
    """Save a new book cover based on the submitted form."""
    # Only librarians (and above) may change covers.
    self.require_librarian(flask.request.library)

    data_source = DataSource.lookup(self._db, DataSource.LIBRARY_STAFF)

    work = self.load_work(flask.request.library, identifier_type,
                          identifier)
    if isinstance(work, ProblemDetail):
        return work

    rights_uri = flask.request.form.get("rights_status")
    rights_explanation = flask.request.form.get("rights_explanation")

    # A license for the new image is mandatory.
    if not rights_uri:
        return INVALID_IMAGE.detailed(
            _("You must specify the image's license."))

    collection = self._get_collection_from_pools(identifier_type,
                                                 identifier)
    if isinstance(collection, ProblemDetail):
        return collection

    # Look for an appropriate mirror to store this cover image. Since the
    # mirror should be used for covers, we don't need a mirror for books.
    # NOTE(review): the dict key here is the literal 'covers_mirror',
    # while the lookup below uses ExternalIntegrationLink.COVERS --
    # this only works if that constant equals 'covers_mirror'; confirm.
    mirrors = mirrors or dict(
        covers_mirror=MirrorUploader.for_collection(
            collection, ExternalIntegrationLink.COVERS),
        books_mirror=None)
    if not mirrors.get(ExternalIntegrationLink.COVERS):
        return INVALID_CONFIGURATION_OPTION.detailed(
            _("Could not find a storage integration for uploading the cover."
              ))

    image = self.generate_cover_image(work, identifier_type, identifier)
    if isinstance(image, ProblemDetail):
        return image

    original, derivation_settings, cover_href, cover_rights_explanation = self._original_cover_info(
        image, work, data_source, rights_uri, rights_explanation)

    # Serialize the image to PNG bytes in memory.
    # NOTE(review): `buffer` shadows the Python 2 builtin of the same
    # name; harmless here but worth renaming in a behavior change.
    buffer = StringIO()
    image.save(buffer, format="PNG")
    content = buffer.getvalue()

    if not cover_href:
        # No href was derived from the original cover, so mint a
        # content-based generic URI for the new image.
        cover_href = Hyperlink.generic_uri(
            data_source, work.presentation_edition.primary_identifier,
            Hyperlink.IMAGE, content=content)

    cover_data = LinkData(
        Hyperlink.IMAGE,
        href=cover_href,
        media_type=Representation.PNG_MEDIA_TYPE,
        content=content,
        rights_uri=rights_uri,
        rights_explanation=cover_rights_explanation,
        original=original,
        transformation_settings=derivation_settings,
    )

    # Only cover-related presentation steps are enabled; the search
    # index is deliberately left alone here.
    presentation_policy = PresentationCalculationPolicy(
        choose_edition=False,
        set_edition_metadata=False,
        classify=False,
        choose_summary=False,
        calculate_quality=False,
        choose_cover=True,
        regenerate_opds_entries=True,
        regenerate_marc_record=True,
        update_search_index=False,
    )

    replacement_policy = ReplacementPolicy(
        links=True,
        # link_content is false because we already have the content.
        # We don't want the metadata layer to try to fetch it again.
        link_content=False,
        mirrors=mirrors,
        presentation_calculation_policy=presentation_policy,
    )

    metadata = Metadata(data_source, links=[cover_data])
    metadata.apply(work.presentation_edition, collection,
                   replace=replacement_policy)

    # metadata.apply only updates the edition, so we also need
    # to update the work.
    work.calculate_presentation(policy=presentation_policy)

    return Response(_("Success"), 200)
def edit_classifications(self, identifier_type, identifier):
    """Edit a work's audience, target age, fiction status, and genres."""
    self.require_librarian(flask.request.library)
    work = self.load_work(flask.request.library, identifier_type,
                          identifier)
    if isinstance(work, ProblemDetail):
        return work

    staff_data_source = DataSource.lookup(self._db,
                                          DataSource.LIBRARY_STAFF)

    # Previous staff classifications
    primary_identifier = work.presentation_edition.primary_identifier
    old_classifications = self._db \
        .query(Classification) \
        .join(Subject) \
        .filter(
            Classification.identifier == primary_identifier,
            Classification.data_source == staff_data_source
        )
    old_genre_classifications = old_classifications \
        .filter(Subject.genre_id != None)
    old_staff_genres = [
        c.subject.genre.name
        for c in old_genre_classifications
        if c.subject.genre
    ]
    old_computed_genres = [
        work_genre.genre.name
        for work_genre in work.work_genres
    ]

    # New genres should be compared to previously computed genres
    new_genres = flask.request.form.getlist("genres")
    genres_changed = sorted(new_genres) != sorted(old_computed_genres)

    # Update audience
    new_audience = flask.request.form.get("audience")
    if new_audience != work.audience:
        # Delete all previous staff audience classifications
        for c in old_classifications:
            if c.subject.type == Subject.FREEFORM_AUDIENCE:
                self._db.delete(c)

        # Create a new classification with a high weight
        primary_identifier.classify(
            data_source=staff_data_source,
            subject_type=Subject.FREEFORM_AUDIENCE,
            subject_identifier=new_audience,
            weight=WorkController.STAFF_WEIGHT,
        )

    # Update target age if present
    new_target_age_min = flask.request.form.get("target_age_min")
    new_target_age_min = int(
        new_target_age_min) if new_target_age_min else None
    new_target_age_max = flask.request.form.get("target_age_max")
    new_target_age_max = int(
        new_target_age_max) if new_target_age_max else None
    # NOTE(review): this comparison relies on Python 2 semantics where
    # None compares less than any int; if only a minimum is supplied
    # (max is None) this branch fires. Do not port as-is to Python 3.
    if new_target_age_max < new_target_age_min:
        return INVALID_EDIT.detailed(
            _("Minimum target age must be less than maximum target age."))

    if work.target_age:
        old_target_age_min = work.target_age.lower
        old_target_age_max = work.target_age.upper
    else:
        old_target_age_min = None
        old_target_age_max = None
    if new_target_age_min != old_target_age_min or new_target_age_max != old_target_age_max:
        # Delete all previous staff target age classifications
        for c in old_classifications:
            if c.subject.type == Subject.AGE_RANGE:
                self._db.delete(c)

        # Create a new classification with a high weight - higher than audience
        if new_target_age_min and new_target_age_max:
            age_range_identifier = "%s-%s" % (new_target_age_min,
                                              new_target_age_max)
            primary_identifier.classify(
                data_source=staff_data_source,
                subject_type=Subject.AGE_RANGE,
                subject_identifier=age_range_identifier,
                weight=WorkController.STAFF_WEIGHT * 100,
            )

    # Update fiction status
    # If fiction status hasn't changed but genres have changed,
    # we still want to ensure that there's a staff classification
    new_fiction = True if flask.request.form.get(
        "fiction") == "fiction" else False
    if new_fiction != work.fiction or genres_changed:
        # Delete previous staff fiction classifications
        for c in old_classifications:
            if c.subject.type == Subject.SIMPLIFIED_FICTION_STATUS:
                self._db.delete(c)

        # Create a new classification with a high weight (higher than genre)
        fiction_term = "Fiction" if new_fiction else "Nonfiction"
        classification = primary_identifier.classify(
            data_source=staff_data_source,
            subject_type=Subject.SIMPLIFIED_FICTION_STATUS,
            subject_identifier=fiction_term,
            weight=WorkController.STAFF_WEIGHT,
        )
        classification.subject.fiction = new_fiction

    # Update genres
    # make sure all new genres are legit
    for name in new_genres:
        genre, is_new = Genre.lookup(self._db, name)
        if not isinstance(genre, Genre):
            return GENRE_NOT_FOUND
        # NOTE(review): `genres` is not defined in this method --
        # presumably a module-level mapping of genre name to genre
        # data imported elsewhere in the file; confirm.
        if genres[name].is_fiction is not None and genres[
                name].is_fiction != new_fiction:
            return INCOMPATIBLE_GENRE
        if name == "Erotica" and new_audience != "Adults Only":
            return EROTICA_FOR_ADULTS_ONLY

    if genres_changed:
        # delete existing staff classifications for genres that aren't
        # being kept
        for c in old_genre_classifications:
            if c.subject.genre.name not in new_genres:
                self._db.delete(c)

        # add new staff classifications for new genres
        for genre in new_genres:
            if genre not in old_staff_genres:
                classification = primary_identifier.classify(
                    data_source=staff_data_source,
                    subject_type=Subject.SIMPLIFIED_GENRE,
                    subject_identifier=genre,
                    weight=WorkController.STAFF_WEIGHT)

        # add NONE genre classification if we aren't keeping any genres
        if len(new_genres) == 0:
            primary_identifier.classify(
                data_source=staff_data_source,
                subject_type=Subject.SIMPLIFIED_GENRE,
                subject_identifier=SimplifiedGenreClassifier.NONE,
                weight=WorkController.STAFF_WEIGHT)
        else:
            # otherwise delete existing NONE genre classification
            none_classifications = self._db \
                .query(Classification) \
                .join(Subject) \
                .filter(
                    Classification.identifier == primary_identifier,
                    Subject.identifier == SimplifiedGenreClassifier.NONE
                ) \
                .all()
            for c in none_classifications:
                self._db.delete(c)

    # Update presentation
    policy = PresentationCalculationPolicy(classify=True,
                                           regenerate_opds_entries=True,
                                           regenerate_marc_record=True,
                                           update_search_index=True)
    work.calculate_presentation(policy=policy)

    return Response("", 200)
def edit(self, identifier_type, identifier):
    """Edit a work's metadata."""
    self.require_librarian(flask.request.library)

    # TODO: It would be nice to use the metadata layer for this, but
    # this code handles empty values differently than other metadata
    # sources. When a staff member deletes a value, that indicates
    # they think it should be empty. This needs to be indicated in the
    # db so that it can overrule other data sources that set a value,
    # unlike other sources which set empty fields to None.
    work = self.load_work(flask.request.library, identifier_type,
                          identifier)
    if isinstance(work, ProblemDetail):
        return work

    changed = False

    staff_data_source = DataSource.lookup(self._db,
                                          DataSource.LIBRARY_STAFF)
    primary_identifier = work.presentation_edition.primary_identifier
    # All staff edits accumulate on a dedicated staff Edition tied to
    # the work's primary identifier.
    staff_edition, is_new = get_one_or_create(
        self._db, Edition,
        primary_identifier_id=primary_identifier.id,
        data_source_id=staff_data_source.id)
    self._db.expire(primary_identifier)

    new_title = flask.request.form.get("title")
    if new_title and work.title != new_title:
        staff_edition.title = unicode(new_title)
        changed = True

    new_subtitle = flask.request.form.get("subtitle")
    if work.subtitle != new_subtitle:
        # An explicitly cleared field is stored as the NO_VALUE
        # sentinel so it can overrule other data sources.
        if work.subtitle and not new_subtitle:
            new_subtitle = NO_VALUE
        staff_edition.subtitle = unicode(new_subtitle)
        changed = True

    # The form data includes roles and names for contributors in the same order.
    new_contributor_roles = flask.request.form.getlist("contributor-role")
    new_contributor_names = [
        unicode(n) for n in flask.request.form.getlist("contributor-name")
    ]
    # The first author in the form is considered the primary author, even
    # though there's no separate MARC code for that.
    for i, role in enumerate(new_contributor_roles):
        if role == Contributor.AUTHOR_ROLE:
            new_contributor_roles[i] = Contributor.PRIMARY_AUTHOR_ROLE
            break
    # Python 2 zip returns a list, which is mutated (remove) below.
    roles_and_names = zip(new_contributor_roles, new_contributor_names)

    # Remove any contributions that weren't in the form, and remove contributions
    # that already exist from the list so they won't be added again.
    deleted_contributions = False
    for contribution in staff_edition.contributions:
        if (contribution.role, contribution.contributor.display_name
                ) not in roles_and_names:
            self._db.delete(contribution)
            deleted_contributions = True
            changed = True
        else:
            roles_and_names.remove(
                (contribution.role, contribution.contributor.display_name))
    if deleted_contributions:
        # Ensure the staff edition's contributions are up-to-date when
        # calculating the presentation edition later.
        self._db.refresh(staff_edition)

    # Any remaining roles and names are new contributions.
    for role, name in roles_and_names:
        # There may be one extra role at the end from the input for
        # adding a contributor, in which case it will have no
        # corresponding name and can be ignored.
        if name:
            if role not in Contributor.MARC_ROLE_CODES.keys():
                self._db.rollback()
                return UNKNOWN_ROLE.detailed(
                    _("Role %(role)s is not one of the known contributor roles.",
                      role=role))
            contributor = staff_edition.add_contributor(name=name,
                                                        roles=[role])
            contributor.display_name = name
            changed = True

    new_series = flask.request.form.get("series")
    if work.series != new_series:
        if work.series and not new_series:
            new_series = NO_VALUE
        staff_edition.series = unicode(new_series)
        changed = True

    new_series_position = flask.request.form.get("series_position")
    if new_series_position != None and new_series_position != '':
        try:
            new_series_position = int(new_series_position)
        except ValueError:
            self._db.rollback()
            return INVALID_SERIES_POSITION
    else:
        new_series_position = None
    if work.series_position != new_series_position:
        # NO_NUMBER is the numeric analogue of the NO_VALUE sentinel.
        if work.series_position and new_series_position == None:
            new_series_position = NO_NUMBER
        staff_edition.series_position = new_series_position
        changed = True

    new_medium = flask.request.form.get("medium")
    if new_medium:
        if new_medium not in Edition.medium_to_additional_type.keys():
            self._db.rollback()
            return UNKNOWN_MEDIUM.detailed(
                _("Medium %(medium)s is not one of the known media.",
                  medium=new_medium))
        staff_edition.medium = new_medium
        changed = True

    new_language = flask.request.form.get("language")
    if new_language != None and new_language != '':
        # Normalize to an ISO alpha-3 language code.
        new_language = LanguageCodes.string_to_alpha_3(new_language)
        if not new_language:
            self._db.rollback()
            return UNKNOWN_LANGUAGE
    else:
        new_language = None
    if new_language != staff_edition.language:
        staff_edition.language = new_language
        changed = True

    new_publisher = flask.request.form.get("publisher")
    if new_publisher != staff_edition.publisher:
        if staff_edition.publisher and not new_publisher:
            new_publisher = NO_VALUE
        staff_edition.publisher = unicode(new_publisher)
        changed = True

    new_imprint = flask.request.form.get("imprint")
    if new_imprint != staff_edition.imprint:
        if staff_edition.imprint and not new_imprint:
            new_imprint = NO_VALUE
        staff_edition.imprint = unicode(new_imprint)
        changed = True

    new_issued = flask.request.form.get("issued")
    if new_issued != None and new_issued != '':
        try:
            new_issued = datetime.strptime(new_issued, '%Y-%m-%d')
        except ValueError:
            self._db.rollback()
            return INVALID_DATE_FORMAT
    else:
        new_issued = None
    if new_issued != staff_edition.issued:
        staff_edition.issued = new_issued
        changed = True

    # TODO: This lets library staff add a 1-5 rating, which is used in the
    # quality calculation. However, this doesn't work well if there are any
    # other measurements that contribute to the quality. The form will show
    # the calculated quality rather than the staff rating, which will be
    # confusing. It might also be useful to make it more clear how this
    # relates to the quality threshold in the library settings.
    changed_rating = False
    new_rating = flask.request.form.get("rating")
    if new_rating != None and new_rating != '':
        try:
            new_rating = float(new_rating)
        except ValueError:
            self._db.rollback()
            return INVALID_RATING
        scale = Measurement.RATING_SCALES[DataSource.LIBRARY_STAFF]
        if new_rating < scale[0] or new_rating > scale[1]:
            self._db.rollback()
            return INVALID_RATING.detailed(
                _("The rating must be a number between %(low)s and %(high)s.",
                  low=scale[0], high=scale[1]))
        # Only record a measurement when the normalized rating differs
        # from the work's current quality.
        if (new_rating - scale[0]) / (scale[1] - scale[0]) != work.quality:
            primary_identifier.add_measurement(
                staff_data_source, Measurement.RATING, new_rating,
                weight=WorkController.STAFF_WEIGHT)
            changed = True
            changed_rating = True

    changed_summary = False
    new_summary = flask.request.form.get("summary") or ""
    if new_summary != work.summary_text:
        old_summary = None
        if work.summary and work.summary.data_source == staff_data_source:
            old_summary = work.summary

        work.presentation_edition.primary_identifier.add_link(
            Hyperlink.DESCRIPTION, None, staff_data_source,
            content=new_summary)

        # Delete previous staff summary
        if old_summary:
            for link in old_summary.links:
                self._db.delete(link)
            self._db.delete(old_summary)

        changed = True
        changed_summary = True

    if changed:
        # Even if the presentation doesn't visibly change, we want
        # to regenerate the OPDS entries and update the search
        # index for the work, because that might be the 'real'
        # problem the user is trying to fix.
        policy = PresentationCalculationPolicy(
            classify=True,
            regenerate_opds_entries=True,
            regenerate_marc_record=True,
            update_search_index=True,
            calculate_quality=changed_rating,
            choose_summary=changed_summary,
        )
        work.calculate_presentation(policy=policy)
    return Response("", 200)
def __init__(self, collection, uploader=None, viaf_client=None, linked_data_coverage_provider=None, content_cafe_api=None, overdrive_api_class=OverdriveAPI, **kwargs): super(IdentifierResolutionCoverageProvider, self).__init__(collection, **kwargs) # Since we are the metadata wrangler, any resources we find, # we mirror to S3. if not uploader: uploader = S3Uploader.from_config(self._db) self.uploader = uploader # We're going to be aggressive about recalculating the presentation # for this work because either the work is currently not set up # at all, or something went wrong trying to set it up. self.policy = PresentationCalculationPolicy( regenerate_opds_entries=True) self.overdrive_api = self.create_overdrive_api(overdrive_api_class) self.content_cafe_api = content_cafe_api # Determine the optional and required coverage providers. # Each Identifier in this Collection's catalog will be run # through all relevant providers. self.required_coverage_providers, self.optional_coverage_providers = self.providers( ) # When we need to look up a contributor via VIAF we will use this # client. self.viaf_client = viaf_client or VIAFClient(self._db) # Books are not looked up in OCLC Linked Data directly, since # there is no Collection that identifies a book by its OCLC Number. # However, when a book is looked up through OCLC Classify, some # OCLC Numbers may be associated with it, and _those_ numbers # can be run through OCLC Linked Data. # # TODO: We get many books identified by ISBN, and those books # _could_ be run through a LinkedDataCoverageProvider if it # worked a little differently. However, I don't think this # would be very useful, since those books will get looked up # through OCLC Classify, which will probably result in us # finding that same ISBN via OCLC Number. 
self.oclc_linked_data = (linked_data_coverage_provider or LinkedDataCoverageProvider( self._db, viaf_api=self.viaf_client)) # The ordinary OverdriveBibliographicCoverageProvider # doesn't upload images, so we need to create our own # mirror and scaler. # # TODO: This class would be neater if we were to subclass # OverdriveBibliographicCoverageProvider to do the scaling and # uploading. self.image_mirrors = { DataSource.OVERDRIVE: OverdriveCoverImageMirror(self._db, uploader=uploader) } self.image_scaler = ImageScaler(self._db, self.image_mirrors.values(), uploader=uploader)
def test_recursively_equivalent_identifier_ids(self):
    """Exercise threshold, level, cutoff and multi-seed behavior of
    Identifier.recursively_equivalent_identifier_ids."""
    identifier = self._identifier()
    data_source = DataSource.lookup(self._db, DataSource.MANUAL)

    # Build a chain: identifier -(0.9)-> strong -(0.5)-> level 2
    # -(0.9)-> level 3 -(0.6)-> level 4, plus a weak (0.2) sibling
    # and one unrelated identifier.
    strong_equivalent = self._identifier()
    identifier.equivalent_to(data_source, strong_equivalent, 0.9)

    weak_equivalent = self._identifier()
    identifier.equivalent_to(data_source, weak_equivalent, 0.2)

    level_2_equivalent = self._identifier()
    strong_equivalent.equivalent_to(data_source, level_2_equivalent, 0.5)

    level_3_equivalent = self._identifier()
    level_2_equivalent.equivalent_to(data_source, level_3_equivalent, 0.9)

    level_4_equivalent = self._identifier()
    level_3_equivalent.equivalent_to(data_source, level_4_equivalent, 0.6)

    unrelated = self._identifier()

    # With a low threshold and enough levels, we find all the identifiers.
    high_levels_low_threshold = PresentationCalculationPolicy(
        equivalent_identifier_levels=5,
        equivalent_identifier_threshold=0.1)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=high_levels_low_threshold)
    assert (set([
        identifier.id,
        strong_equivalent.id,
        weak_equivalent.id,
        level_2_equivalent.id,
        level_3_equivalent.id,
        level_4_equivalent.id,
    ]) == set(equivs[identifier.id]))

    # If we only look at one level, we don't find the level 2, 3, or 4 identifiers.
    one_level = PresentationCalculationPolicy(
        equivalent_identifier_levels=1,
        equivalent_identifier_threshold=0.1)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=one_level)
    assert set([identifier.id, strong_equivalent.id,
                weak_equivalent.id]) == set(equivs[identifier.id])

    # If we raise the threshold, we don't find the weak identifier.
    one_level_high_threshold = PresentationCalculationPolicy(
        equivalent_identifier_levels=1,
        equivalent_identifier_threshold=0.4)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=one_level_high_threshold)
    assert set([identifier.id,
                strong_equivalent.id]) == set(equivs[identifier.id])

    # For deeper levels, the strength is the product of the strengths
    # of all the equivalencies in between the two identifiers.
    # In this example:
    # identifier - level_2_equivalent = 0.9 * 0.5 = 0.45
    # identifier - level_3_equivalent = 0.9 * 0.5 * 0.9 = 0.405
    # identifier - level_4_equivalent = 0.9 * 0.5 * 0.9 * 0.6 = 0.243

    # With a threshold of 0.5, level 2 and all subsequent levels are too weak.
    high_levels_high_threshold = PresentationCalculationPolicy(
        equivalent_identifier_levels=5,
        equivalent_identifier_threshold=0.5)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=high_levels_high_threshold)
    assert set([identifier.id,
                strong_equivalent.id]) == set(equivs[identifier.id])

    # With a threshold of 0.25, level 2 is strong enough, but level
    # 4 is too weak.
    high_levels_lower_threshold = PresentationCalculationPolicy(
        equivalent_identifier_levels=5,
        equivalent_identifier_threshold=0.25)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=high_levels_lower_threshold)
    assert (set([
        identifier.id,
        strong_equivalent.id,
        level_2_equivalent.id,
        level_3_equivalent.id,
    ]) == set(equivs[identifier.id]))

    # It also works if we start from other identifiers.
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [strong_equivalent.id],
        policy=high_levels_low_threshold)
    assert (set([
        identifier.id,
        strong_equivalent.id,
        weak_equivalent.id,
        level_2_equivalent.id,
        level_3_equivalent.id,
        level_4_equivalent.id,
    ]) == set(equivs[strong_equivalent.id]))

    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [level_4_equivalent.id],
        policy=high_levels_low_threshold)
    assert (set([
        identifier.id,
        strong_equivalent.id,
        level_2_equivalent.id,
        level_3_equivalent.id,
        level_4_equivalent.id,
    ]) == set(equivs[level_4_equivalent.id]))

    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [level_4_equivalent.id],
        policy=high_levels_high_threshold)
    assert set([
        level_2_equivalent.id, level_3_equivalent.id,
        level_4_equivalent.id
    ]) == set(equivs[level_4_equivalent.id])

    # A chain of very strong equivalents can keep a high strength
    # even at deep levels. This wouldn't work if we changed the strength
    # threshold by level instead of accumulating a strength product.
    another_identifier = self._identifier()
    l2 = self._identifier()
    l3 = self._identifier()
    l4 = self._identifier()
    l2.equivalent_to(data_source, another_identifier, 1)
    l3.equivalent_to(data_source, l2, 1)
    l4.equivalent_to(data_source, l3, 0.9)
    high_levels_fairly_high_threshold = PresentationCalculationPolicy(
        equivalent_identifier_levels=5,
        equivalent_identifier_threshold=0.89)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [another_identifier.id],
        high_levels_fairly_high_threshold)
    assert set([another_identifier.id, l2.id, l3.id,
                l4.id]) == set(equivs[another_identifier.id])

    # We can look for multiple identifiers at once.
    two_levels_high_threshold = PresentationCalculationPolicy(
        equivalent_identifier_levels=2,
        equivalent_identifier_threshold=0.8)
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db,
        [identifier.id, level_3_equivalent.id],
        policy=two_levels_high_threshold,
    )
    assert set([identifier.id,
                strong_equivalent.id]) == set(equivs[identifier.id])
    assert set([level_2_equivalent.id, level_3_equivalent.id
                ]) == set(equivs[level_3_equivalent.id])

    # By setting a cutoff, you can say to look deep in the tree,
    # but stop looking as soon as you have a certain number of
    # equivalents.
    with_cutoff = PresentationCalculationPolicy(
        equivalent_identifier_levels=5,
        equivalent_identifier_threshold=0.1,
        equivalent_identifier_cutoff=1,
    )
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=with_cutoff)

    # The cutoff was set to 1, but we always go at least one level
    # deep, and that gives us three equivalent identifiers. We
    # don't artificially trim it back down to 1.
    assert 3 == len(equivs[identifier.id])

    # Increase the cutoff, and we get more identifiers.
    with_cutoff.equivalent_identifier_cutoff = 5
    equivs = Identifier.recursively_equivalent_identifier_ids(
        self._db, [identifier.id], policy=with_cutoff)
    assert len(equivs[identifier.id]) > 3

    # The query() method uses the same db function, but returns
    # equivalents for all identifiers together so it can be used
    # as a subquery.
    query = Identifier.recursively_equivalent_identifier_ids_query(
        Identifier.id, policy=high_levels_low_threshold)
    query = query.where(Identifier.id == identifier.id)
    results = self._db.execute(query)
    equivalent_ids = [r[0] for r in results]
    assert (set([
        identifier.id,
        strong_equivalent.id,
        weak_equivalent.id,
        level_2_equivalent.id,
        level_3_equivalent.id,
        level_4_equivalent.id,
    ]) == set(equivalent_ids))

    query = Identifier.recursively_equivalent_identifier_ids_query(
        Identifier.id, policy=two_levels_high_threshold)
    query = query.where(
        Identifier.id.in_([identifier.id, level_3_equivalent.id]))
    results = self._db.execute(query)
    equivalent_ids = [r[0] for r in results]
    assert (set([
        identifier.id,
        strong_equivalent.id,
        level_2_equivalent.id,
        level_3_equivalent.id,
    ]) == set(equivalent_ids))
# Find all books where the edition associated with the LicensePool has a # different medium from the presentation edition. _db = production_session() # Find all the LicensePools that aren't books. subq = select([LicensePool.id]).select_from( join( LicensePool, Edition, and_(LicensePool.data_source_id == Edition.data_source_id, LicensePool.identifier_id == Edition.primary_identifier_id)) ).where(Edition.medium != Edition.BOOK_MEDIUM) # Of those LicensePools, find every LicensePool whose presentation # edition says it _is_ a book. qu = _db.query(LicensePool).join( Edition, LicensePool.presentation_edition_id == Edition.id).filter( LicensePool.id.in_(subq)).filter(Edition.medium == Edition.BOOK_MEDIUM) print "Recalculating presentation edition for %d LicensePools." % qu.count() for lp in qu: # Recalculate that LicensePool's presentation edition, and then its # work presentation. lp.set_presentation_edition() policy = PresentationCalculationPolicy(regenerate_opds_entries=True, update_search_index=True) work, is_new = lp.calculate_work() work.calculate_presentation(policy) print "New medium: %s" % lp.presentation_edition.medium _db.commit()
def test_recursively_equivalent_identifiers(self):
    """Each increment of equivalent_identifier_levels follows the
    identifier-equivalency graph one hop further.
    """
    # We start with a Gutenberg book.
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    record, _ = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "100")
    gutenberg_id = record.primary_identifier

    # A title/author lookup through OCLC Classify associates the
    # Gutenberg ID with a search identifier.
    oclc = DataSource.lookup(self._db, DataSource.OCLC)
    search_id, _ = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_WORK, "60010")
    gutenberg_id.equivalent_to(oclc, search_id, 1)

    # That lookup in turn maps to two different OCLC Numbers.
    oclc_id, _ = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "9999")
    oclc_id_2, _ = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, "1000")
    search_id.equivalent_to(oclc, oclc_id, 1)
    search_id.equivalent_to(oclc, oclc_id_2, 1)

    # OCLC Linked Data connects one of the OCLC Numbers with an ISBN.
    linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
    isbn_id, _ = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "900100434X")
    oclc_id.equivalent_to(linked_data, isbn_id, 1)

    # An Overdrive work record is tied (by Overdrive) to the same ISBN.
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    overdrive_record, _ = Edition.for_foreign_id(
        self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
    overdrive_id = overdrive_record.primary_identifier
    overdrive_id.equivalent_to(overdrive, isbn_id, 1)

    # Finally, a completely unrelated Edition, which must never show up.
    gutenberg2, _ = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "200")
    gutenberg2.title = "Unrelated Gutenberg record."

    levels = [
        record.equivalent_identifiers(
            policy=PresentationCalculationPolicy(
                equivalent_identifier_levels=depth))
        for depth in range(0, 5)
    ]

    # Level 0: only the Gutenberg ID itself.
    # Level 1: plus the title/author lookup.
    # Level 2: plus the two OCLC Numbers.
    # Level 3: plus the ISBN.
    # Level 4: the recursion comes back the other way and also picks up
    #          the Overdrive ID equivalent to the same ISBN.
    expected_by_level = [
        set([gutenberg_id]),
        set([gutenberg_id, search_id]),
        set([gutenberg_id, search_id, oclc_id, oclc_id_2]),
        set([gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id]),
        set([gutenberg_id, search_id, oclc_id, oclc_id_2, isbn_id,
             overdrive_id]),
    ]
    for expected, found in zip(expected_by_level, levels):
        assert expected == set(found)
class IdentifierResolutionCoverageProvider(CatalogCoverageProvider):
    """Make sure all Identifiers associated with some Collection
    become Works.

    Coverage happens by running the Identifier through _other_
    CoverageProviders, which fill in the blanks with data from
    third-party entities.

    This CoverageProvider may force those other CoverageProviders to
    do their work for each Identifier immediately, or it may simply
    register its Identifiers with those CoverageProviders and allow
    them to complete the work at their own pace.

    Unlike most CoverageProviders, which are invoked from a script,
    this CoverageProvider is invoked from
    URNLookupController.process_urns, and only when a client expresses
    a desire that we look into a specific identifier.
    """

    SERVICE_NAME = "Identifier Resolution Coverage Provider"
    DATA_SOURCE_NAME = DataSource.INTERNAL_PROCESSING

    # These are the only identifier types we have any hope of providing
    # insight into.
    INPUT_IDENTIFIER_TYPES = [
        Identifier.OVERDRIVE_ID, Identifier.ISBN, Identifier.URI,
    ]
    OPERATION = CoverageRecord.RESOLVE_IDENTIFIER_OPERATION

    # We cover all Collections, regardless of their protocol.
    PROTOCOL = None

    def __init__(self, collection, mirror=None, http_get=None, viaf=None,
                 provide_coverage_immediately=False, force=False,
                 provider_kwargs=None, **kwargs):
        """Constructor.

        :param collection: Handle all Identifiers from this Collection
            that were previously registered with this CoverageProvider.

        :param mirror: A MirrorUploader to use if coverage requires
            uploading any cover images to external storage. Defaults to
            the sitewide MirrorUploader, if one is configured.

        :param http_get: A drop-in replacement for
            Representation.simple_http_get, to be used if any information
            (such as a book cover) needs to be obtained from the public
            Internet.

        :param viaf: A VIAFClient to use if coverage requires gathering
            information about authors from VIAF. Defaults to a newly
            created VIAFClient.

        :param provide_coverage_immediately: If this is True, then
            resolving an identifier means registering it with all of its
            other CoverageProviders *and then attempting to provide
            coverage*.  Registration is considered a success even if the
            other CoverageProviders fail, but the attempt must be made
            immediately.

            If this is False (the default), then resolving an identifier
            just means registering it with all other relevant
            CoverageProviders.

        :param force: Force CoverageProviders to cover identifiers even
            if they believe they have already done the work. Implied by
            provide_coverage_immediately=True.

        :param provider_kwargs: Pass this object in as provider_kwargs
            when calling gather_providers at the end of the
            constructor. Used only in testing.
        """
        _db = Session.object_session(collection)

        # Since we are the metadata wrangler, any resources we find,
        # we mirror using the sitewide MirrorUploader.
        if not mirror:
            try:
                mirror = MirrorUploader.sitewide(_db)
            except CannotLoadConfiguration, e:
                # Best-effort: coverage can still proceed with no mirror,
                # but cover images won't be stored anywhere.
                logging.error(
                    "No storage integration is configured. Cover images will not be stored anywhere.",
                    exc_info=e)
        self.mirror = mirror

        # We're going to be aggressive about recalculating the presentation
        # for this work because either the work is currently not set up
        # at all, or something went wrong trying to set it up.
        presentation = PresentationCalculationPolicy(
            regenerate_opds_entries=True)
        replacement_policy = ReplacementPolicy.from_metadata_source(
            presentation_calculation_policy=presentation, mirror=self.mirror,
            http_get=http_get,
        )
        super(IdentifierResolutionCoverageProvider, self).__init__(
            collection, replacement_policy=replacement_policy, **kwargs)

        self.provide_coverage_immediately = provide_coverage_immediately
        # Immediate coverage implies force=True.
        self.force = force or provide_coverage_immediately

        self.viaf = viaf or VIAFClient(self._db)

        # Instantiate the coverage providers that may be relevant to
        # any given Identifier.
        #
        # Each Identifier in this Collection's catalog will be registered
        # with all relevant providers (if provide_coverage_immediately
        # is False) or immediately covered by all relevant providers
        # (if provide_coverage_immediately is True).
        self.providers = self.gather_providers(provider_kwargs)
def edit(self, data_source, identifier_type, identifier):
    """Edit a work's metadata.

    New values are taken from the Flask request form. Rather than
    modifying the work's presentation edition directly, changes are
    written to a LIBRARY_STAFF Edition that shares the presentation
    edition's primary identifier, so staff overrides are layered on
    top of source metadata.

    :param data_source: Used (with identifier_type and identifier) to
        look up the LicensePool whose work is being edited.
    :return: A ProblemDetail if the LicensePool can't be loaded or a
        form value is invalid; otherwise an empty 200 Response.
    """
    pool = self.load_licensepool(data_source, identifier_type, identifier)
    if isinstance(pool, ProblemDetail):
        return pool
    work = pool.work
    changed = False

    staff_data_source = DataSource.lookup(self._db,
                                          DataSource.LIBRARY_STAFF)
    primary_identifier = work.presentation_edition.primary_identifier
    # Find (or create) the staff override edition for this identifier.
    staff_edition, is_new = get_one_or_create(
        self._db, Edition,
        primary_identifier_id=primary_identifier.id,
        data_source_id=staff_data_source.id
    )
    # Expire the identifier so its relations (including the new staff
    # edition) are reloaded from the database on next access.
    self._db.expire(primary_identifier)

    new_title = flask.request.form.get("title")
    if new_title and work.title != new_title:
        staff_edition.title = unicode(new_title)
        changed = True

    new_subtitle = flask.request.form.get("subtitle")
    if work.subtitle != new_subtitle:
        # An existing subtitle cleared by the form is recorded as the
        # NO_VALUE sentinel (presumably "explicitly removed", as
        # opposed to "no staff opinion" — defined elsewhere in this
        # module).
        if work.subtitle and not new_subtitle:
            new_subtitle = NO_VALUE
        staff_edition.subtitle = unicode(new_subtitle)
        changed = True

    new_series = flask.request.form.get("series")
    if work.series != new_series:
        # Same sentinel convention as subtitle.
        if work.series and not new_series:
            new_series = NO_VALUE
        staff_edition.series = unicode(new_series)
        changed = True

    new_series_position = flask.request.form.get("series_position")
    if new_series_position:
        try:
            new_series_position = int(new_series_position)
        except ValueError:
            return INVALID_SERIES_POSITION
    else:
        new_series_position = None
    if work.series_position != new_series_position:
        # Numeric fields use NO_NUMBER as the "explicitly removed"
        # sentinel.
        if work.series_position and not new_series_position:
            new_series_position = NO_NUMBER
        staff_edition.series_position = new_series_position
        changed = True

    new_summary = flask.request.form.get("summary") or ""
    if new_summary != work.summary_text:
        old_summary = None
        if work.summary and work.summary.data_source == staff_data_source:
            old_summary = work.summary

        # The new summary is attached as a staff DESCRIPTION link on
        # the presentation edition's primary identifier.
        work.presentation_edition.primary_identifier.add_link(
            Hyperlink.DESCRIPTION, None,
            staff_data_source, content=new_summary)

        # Delete previous staff summary
        if old_summary:
            for link in old_summary.links:
                self._db.delete(link)
            self._db.delete(old_summary)

        changed = True

    if changed:
        # Even if the presentation doesn't visibly change, we want
        # to regenerate the OPDS entries and update the search
        # index for the work, because that might be the 'real'
        # problem the user is trying to fix.
        policy = PresentationCalculationPolicy(
            classify=True,
            regenerate_opds_entries=True,
            update_search_index=True,
            choose_summary=True
        )
        work.calculate_presentation(policy=policy)

    return Response("", 200)
from core.config import Configuration from core.model import ( production_session, Representation, get_one, PresentationCalculationPolicy, ) _db = production_session() qu = _db.query(Representation).filter( Representation.image_height == 120).filter(Representation.image_width == 80).filter( Representation.url.like("http://contentcafe2.btol.com/%")).order_by( Representation.id) policy = PresentationCalculationPolicy(regenerate_opds_entries=True, classify=False, choose_summary=False, calculate_quality=False) for rep in qu: print rep.id identifiers = [h.identifier for h in rep.resource.links] fix_editions = [] for identifier in identifiers: print identifier for edition in identifier.primarily_identifies: if (edition.cover_thumbnail_url and 'Content' in edition.cover_thumbnail_url) or ( edition.cover_full_url and 'Content' in edition.cover_full_url): fix_editions.append(edition) # Delete the hyperlinks so we don't use these images anymore.