def test_choose_best_metadata(self):
    """choose_best_metadata picks the Metadata whose primary identifier
    is most heavily represented, along with a confidence score."""
    winner = self._identifier(identifier_type=Identifier.NOVELIST_ID)
    runner_up = self._identifier(identifier_type=Identifier.NOVELIST_ID)
    candidates = [Metadata(DataSource.NOVELIST, primary_identifier=winner)]

    # A single candidate is returned as-is, with a default
    # confidence of 1.0.
    result = self.novelist.choose_best_metadata(candidates, self._identifier())
    assert True == isinstance(result, tuple)
    assert candidates[0] == result[0]
    assert 1.0 == result[1]

    # A tie between the top identifiers yields (None, None).
    candidates.append(
        Metadata(DataSource.NOVELIST, primary_identifier=runner_up)
    )
    assert (None, None) == self.novelist.choose_best_metadata(
        candidates, self._identifier()
    )

    # Once one identifier pulls ahead, its metadata wins again --
    # this time with 2-out-of-3 confidence.
    candidates.append(
        Metadata(DataSource.NOVELIST, primary_identifier=winner)
    )
    result = self.novelist.choose_best_metadata(candidates, self._identifier())
    assert True == isinstance(result, tuple)
    best, confidence = result
    assert True == isinstance(best, Metadata)
    assert 0.67 == round(confidence, 2)
    assert winner == best.primary_identifier
def test_choose_best_metadata(self):
    """choose_best_metadata returns the best-represented Metadata and a
    confidence value, or (None, None) on a tie."""
    dominant = self._identifier(identifier_type=Identifier.NOVELIST_ID)
    minority = self._identifier(identifier_type=Identifier.NOVELIST_ID)
    candidates = [Metadata(DataSource.NOVELIST, primary_identifier=dominant)]

    # With a single Metadata object, it is returned with the default
    # confidence of 1.0.
    result = self.novelist.choose_best_metadata(candidates, self._identifier())
    eq_(True, isinstance(result, tuple))
    eq_(candidates[0], result[0])
    eq_(1.0, result[1])

    # Equal representation between top identifiers returns (None, None).
    candidates.append(
        Metadata(DataSource.NOVELIST, primary_identifier=minority)
    )
    eq_((None, None),
        self.novelist.choose_best_metadata(candidates, self._identifier()))

    # When one identifier pulls ahead, its metadata is chosen again.
    candidates.append(
        Metadata(DataSource.NOVELIST, primary_identifier=dominant)
    )
    result = self.novelist.choose_best_metadata(candidates, self._identifier())
    eq_(True, isinstance(result, tuple))
    chosen, confidence = result
    eq_(True, isinstance(chosen, Metadata))
    eq_(0.67, round(confidence, 2))
    eq_(dominant, chosen.primary_identifier)
def book_info_to_metadata(self, subgraph, book_info):
    """Filters raw book information to exclude irrelevant or unhelpful data.

    :param subgraph: Parsed graph data the OCLC details live in.
    :param book_info: Raw book information extracted from the graph.
    :returns: None if information is unhelpful; metadata object otherwise.
    """
    if not self._has_relevant_types(book_info):
        # This book is not available in any format we're
        # interested in from a metadata perspective.
        return None
    (oclc_id_type, oclc_id, titles, descriptions, subjects,
     creator_uris, publisher_names, publication_dates,
     example_uris) = self.extract_useful_data(subgraph, book_info)
    if not oclc_id_type or not oclc_id:
        # Without an identifier there is nothing to attach metadata to.
        return None

    self.log.info("Processing edition %s: %r", oclc_id, titles)
    metadata = Metadata(self.source)
    metadata.primary_identifier = IdentifierData(
        type=oclc_id_type, identifier=oclc_id
    )

    if titles:
        metadata.title = titles[0]

    for d in publication_dates:
        try:
            # Only the year is trusted; anything past the first four
            # characters is ignored.
            metadata.published = datetime.datetime.strptime(d[:4], "%Y")
        except Exception:
            # Best-effort: skip unparseable dates. (Was the Python 2-only
            # `except Exception, e` syntax; `e` was never used.)
            pass
def book_info_to_metadata(self, subgraph, book_info):
    """Filters raw book information to exclude irrelevant or unhelpful data.

    :param subgraph: Parsed graph data the OCLC details live in.
    :param book_info: Raw book information extracted from the graph.
    :returns: None if information is unhelpful; metadata object otherwise.
    """
    if not self._has_relevant_types(book_info):
        # This book is not available in any format we're
        # interested in from a metadata perspective.
        return None
    (oclc_id_type, oclc_id, titles, descriptions, subjects,
     creator_uris, publisher_names, publication_dates,
     example_uris) = self.extract_useful_data(subgraph, book_info)
    if not oclc_id_type or not oclc_id:
        # No usable identifier -- nothing to attach metadata to.
        return None

    self.log.info("Processing edition %s: %r", oclc_id, titles)
    metadata = Metadata(self.source)
    metadata.primary_identifier = IdentifierData(
        type=oclc_id_type, identifier=oclc_id)

    if titles:
        metadata.title = titles[0]

    for d in publication_dates:
        try:
            # Only the four-digit year portion of the date is parsed.
            metadata.published = datetime.datetime.strptime(d[:4], "%Y")
        except Exception:
            # Best-effort: ignore unparseable dates. (Replaced the
            # Python 2-only `except Exception, e` syntax; `e` was unused.)
            pass
def test_related_books(self):
    """Exercise the 'related books' grouped feed: a 404 when nothing is
    related, then a grouped feed combining series mates and NoveList
    recommendations."""
    # A book with no related books returns a ProblemDetail.
    with temp_config() as config:
        config['integrations'][Configuration.NOVELIST_INTEGRATION] = {}
        with self.app.test_request_context('/'):
            response = self.manager.work_controller.related(
                self.datasource, self.identifier.type,
                self.identifier.identifier
            )
    eq_(404, response.status_code)
    eq_("http://librarysimplified.org/terms/problem/unknown-lane",
        response.uri)

    # Prep book with a book in its series and a recommendation.
    self.lp.presentation_edition.series = "Around the World"
    self.french_1.presentation_edition.series = "Around the World"
    SessionManager.refresh_materialized_views(self._db)

    source = DataSource.lookup(self._db, self.datasource)
    metadata = Metadata(source)
    mock_api = MockNoveListAPI()
    metadata.recommendations = [self.english_2.license_pools[0].identifier]
    mock_api.setup(metadata)

    # A grouped feed is returned with both of these related books
    with self.app.test_request_context('/'):
        response = self.manager.work_controller.related(
            self.datasource, self.identifier.type,
            self.identifier.identifier, novelist_api=mock_api
        )
    eq_(200, response.status_code)
    feed = feedparser.parse(response.data)
    eq_(3, len(feed['entries']))

    # One book is in the recommendations feed.
    [e1] = [e for e in feed['entries']
            if e['title'] == self.english_2.title]
    [collection_link] = [link for link in e1['links']
                         if link['rel'] == 'collection']
    eq_("Recommended Books", collection_link['title'])
    work_url = "/works/%s/%s/%s/" % (
        self.datasource, self.identifier.type, self.identifier.identifier)
    expected = urllib.quote(work_url + 'recommendations')
    eq_(True, collection_link['href'].endswith(expected))

    # Two books are in the series feed: the original work and its
    # companion.
    [e2] = [e for e in feed['entries']
            if e['title'] == self.french_1.title]
    [collection_link] = [link for link in e2['links']
                         if link['rel'] == 'collection']
    eq_("Around the World", collection_link['title'])
    expected = urllib.quote(work_url + 'series')
    eq_(True, collection_link['href'].endswith(expected))

    [e3] = [e for e in feed['entries']
            if e['title'] == self.english_1.title]
    [collection_link] = [link for link in e3['links']
                         if link['rel'] == 'collection']
    eq_("Around the World", collection_link['title'])
    expected = urllib.quote(work_url + 'series')
    eq_(True, collection_link['href'].endswith(expected))
def test_confirm_same_identifier(self):
    """_confirm_same_identifier is True only when every Metadata object
    shares the same primary identifier."""
    source = DataSource.lookup(self._db, DataSource.NOVELIST)
    shared, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '84752928')
    unrelated, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '23781947')

    metadata = Metadata(source, primary_identifier=shared)
    match = Metadata(source, primary_identifier=shared)
    mistake = Metadata(source, primary_identifier=unrelated)

    # A mismatched identifier fails; a shared one passes.
    eq_(False, self.novelist._confirm_same_identifier([metadata, mistake]))
    eq_(True, self.novelist._confirm_same_identifier([metadata, match]))
def test_confirm_same_identifier(self):
    """Only lists of Metadata sharing one primary identifier pass
    _confirm_same_identifier."""
    source = DataSource.lookup(self._db, DataSource.NOVELIST)
    common_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, "84752928"
    )
    stray_id, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, "23781947"
    )

    metadata = Metadata(source, primary_identifier=common_id)
    match = Metadata(source, primary_identifier=common_id)
    mistake = Metadata(source, primary_identifier=stray_id)

    # Mixed identifiers -> False; identical identifiers -> True.
    assert False == self.novelist._confirm_same_identifier([metadata, mistake])
    assert True == self.novelist._confirm_same_identifier([metadata, match])
def extract_bibliographic(self, element):
    """Turn one Enki catalog entry into a Metadata object carrying
    bibliographic data plus CirculationData availability."""
    # ISBN plus author, falling back to the unknown-author placeholder.
    identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]
    author = element["author"] or Edition.UNKNOWN_AUTHOR
    contributors = [ContributorData(sort_name=author)]

    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

    # Enki serves one large image; it is used for both the full-size
    # cover and the thumbnail.
    cover_url = element["large_image"]
    images = [
        LinkData(
            rel=Hyperlink.THUMBNAIL_IMAGE,
            href=cover_url,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
        LinkData(
            rel=Hyperlink.IMAGE,
            href=cover_url,
            media_type=Representation.PNG_MEDIA_TYPE,
        ),
    ]

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element["title"],
        language="eng",
        medium=Edition.BOOK_MEDIUM,
        publisher=element["publisher"],
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=images,
    )

    availability = element["availability"]
    # 'acs' means Adobe DRM; anything else is treated as DRM-free.
    if availability["accessType"] == 'acs':
        drm_scheme = EnkiAPI.adobe_drm
    else:
        drm_scheme = EnkiAPI.no_drm
    formats = [
        FormatData(
            content_type=Representation.EPUB_MEDIA_TYPE,
            drm_scheme=drm_scheme,
        )
    ]

    metadata.circulation = CirculationData(
        data_source=DataSource.ENKI,
        primary_identifier=primary_identifier,
        formats=formats,
        licenses_owned=int(availability["totalCopies"]),
        licenses_available=int(availability["availableCopies"]),
        patrons_in_hold_queue=int(availability["onHold"]),
    )
    return metadata
def test_process_book_updates_old_licensepool(self):
    """If the LicensePool already exists, the circulation monitor
    updates it.
    """
    edition, licensepool = self._edition(
        with_license_pool=True, identifier_type=Identifier.AXIS_360_ID,
        identifier_id=u'0003642860')
    # We start off with availability information based on the
    # default for test data.
    eq_(1, licensepool.licenses_owned)

    identifier = IdentifierData(
        type=licensepool.identifier.type,
        identifier=licensepool.identifier.identifier)
    metadata = Metadata(DataSource.AXIS_360, primary_identifier=identifier)
    monitor = Axis360CirculationMonitor(
        self._db, self.collection, api_class=MockAxis360API,
        metadata_client=MockMetadataWranglerOPDSLookup('url'))
    edition, licensepool = monitor.process_book(
        metadata, self.AVAILABILITY_DATA)

    # Now we have information based on the CirculationData.
    eq_(9, licensepool.licenses_owned)
def test_recommendations(self):
    """The recommendations endpoint serves a feed of NoveList
    recommendations (empty feeds are cached), or a 404 ProblemDetail
    when NoveList isn't configured."""
    # Prep an empty recommendation.
    source = DataSource.lookup(self._db, self.datasource)
    metadata = Metadata(source)
    mock_api = MockNoveListAPI()
    mock_api.setup(metadata)

    SessionManager.refresh_materialized_views(self._db)
    with self.app.test_request_context('/'):
        response = self.manager.work_controller.recommendations(
            self.datasource, self.identifier.type,
            self.identifier.identifier, novelist_api=mock_api
        )
    # An empty but valid feed comes back.
    eq_(200, response.status_code)
    feed = feedparser.parse(response.data)
    eq_('Recommended Books', feed['feed']['title'])
    eq_(0, len(feed['entries']))

    # Delete the cache and prep a recommendation result.
    [cached_empty_feed] = self._db.query(CachedFeed).all()
    self._db.delete(cached_empty_feed)
    metadata.recommendations = [self.english_2.license_pools[0].identifier]
    mock_api.setup(metadata)

    SessionManager.refresh_materialized_views(self._db)
    with self.app.test_request_context('/'):
        response = self.manager.work_controller.recommendations(
            self.datasource, self.identifier.type,
            self.identifier.identifier, novelist_api=mock_api
        )
    # A feed is returned with the proper recommendation.
    eq_(200, response.status_code)
    feed = feedparser.parse(response.data)
    eq_('Recommended Books', feed['feed']['title'])
    eq_(1, len(feed['entries']))
    [entry] = feed['entries']
    eq_(self.english_2.title, entry['title'])
    eq_(self.english_2.author, entry['author'])

    # Without a NoveList integration configured, the lane can't be
    # built and a 404 ProblemDetail is returned.
    with temp_config() as config:
        with self.app.test_request_context('/'):
            config['integrations'][Configuration.NOVELIST_INTEGRATION] = {}
            response = self.manager.work_controller.recommendations(
                self.datasource, self.identifier.type,
                self.identifier.identifier
            )
    eq_(404, response.status_code)
    eq_("http://librarysimplified.org/terms/problem/unknown-lane",
        response.uri)
def test_initialization(self):
    """Asserts that a RelatedBooksLane won't be initialized for a work
    without related books
    """
    # A book without a series or a contributor on a circ manager without
    # NoveList recommendations raises an error.
    self._db.delete(self.edition.contributions[0])
    self._db.commit()
    assert_raises(
        ValueError, RelatedBooksLane, self._default_library, self.work, ""
    )

    # A book with a contributor initializes a RelatedBooksLane.
    luthor, i = self._contributor('Luthor, Lex')
    self.edition.add_contributor(luthor, [Contributor.EDITOR_ROLE])
    result = RelatedBooksLane(self._default_library, self.work, '')
    eq_(self.work, result.work)
    [sublane] = result.children
    eq_(True, isinstance(sublane, ContributorLane))
    eq_(sublane.contributors, [luthor])

    # As does a book in a series.
    self.edition.series = "All By Myself"
    result = RelatedBooksLane(self._default_library, self.work, "")
    eq_(2, len(result.children))
    [contributor, series] = result.children
    eq_(True, isinstance(series, SeriesLane))

    # When NoveList is configured and recommendations are available,
    # a RecommendationLane will be included.
    self._external_integration(
        ExternalIntegration.NOVELIST,
        goal=ExternalIntegration.METADATA_GOAL,
        username=u'library', password=u'sure',
        libraries=[self._default_library]
    )
    mock_api = MockNoveListAPI(self._db)
    response = Metadata(
        self.edition.data_source, recommendations=[self._identifier()]
    )
    mock_api.setup(response)
    result = RelatedBooksLane(self._default_library, self.work, "",
                              novelist_api=mock_api)
    eq_(3, len(result.children))

    # The book's language and audience list is passed down to all sublanes.
    eq_(['eng'], result.languages)
    for sublane in result.children:
        eq_(result.languages, sublane.languages)
        if isinstance(sublane, SeriesLane):
            eq_([result.source_audience], sublane.audiences)
        else:
            eq_(sorted(list(result.audiences)),
                sorted(list(sublane.audiences)))

    contributor, recommendations, series = result.children
    eq_(True, isinstance(recommendations, RecommendationLane))
    eq_(True, isinstance(series, SeriesLane))
    eq_(True, isinstance(contributor, ContributorLane))
def metadata(self):
    """Bundle this record's extracted fields into a Metadata object
    sourced to NYPL Shadowcat."""
    fields = dict(
        data_source=DataSource.NYPL_SHADOWCAT,
        title=self.title,
        identifiers=self.identifiers,
        subjects=self.subjects,
        links=self.links,
    )
    return Metadata(**fields)
def generate_mock_api(self):
    """Prep an empty NoveList result."""
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    empty_metadata = Metadata(overdrive)
    api = MockNoveListAPI(self._db)
    api.setup_method(empty_metadata)
    return api
def test_annotate_with_web_resources(self):
    """annotate_with_web_resources fetches a page, skips scraping when a
    'no data' phrase is present, and otherwise extracts a title and a
    LinkData from the scraped content."""
    metadata = Metadata(DataSource.CONTENT_CAFE)
    rel = self._str

    # We're going to be grabbing this URL and
    # scraping it.
    url_template = "http://url/%(arg1)s"
    args = dict(arg1='value')

    # A couple of useful functions for scraping.
    class MockScrapers(object):
        scrape_called = False
        explode_called = False

        def scrape(self, soup):
            self.scrape_called = True
            return [soup.find('content').string]

        def explode(self, soup):
            self.explode_called = True
            raise Exception("I'll never be called")

    scrapers = MockScrapers()

    # When the result of the HTTP request contains a certain phrase,
    # we don't even bother scraping.
    m = self.api.annotate_with_web_resources
    http = self.http
    http.queue_requests_response(
        200, 'text/html', content='There is no data!')
    m(metadata, self.identifier, args, url_template, "no data!", rel,
      scrapers.explode)
    # We made the request but nothing happened.
    expect_url = url_template % args
    eq_(expect_url, self.http.requests.pop())
    eq_(False, scrapers.explode_called)
    eq_(None, metadata.title)
    eq_([], metadata.links)

    # Otherwise, we try to scrape.
    good_content = '<html><span class="PageHeader2">Book title</span><content>Here you go</content>'
    http.queue_requests_response(200, 'text/html', content=good_content)
    m(metadata, self.identifier, args, url_template, "no data!", rel,
      scrapers.scrape)
    eq_(True, scrapers.scrape_called)

    # We called _extract_title and took a Content Cafe title out
    # for the Metadata object.
    eq_("Book title", metadata.title)

    # Then we called mock_scrape, which gave us the content for
    # one LinkData.
    [link] = metadata.links
    eq_(rel, link.rel)
    eq_(None, link.href)
    eq_("text/html", link.media_type)
    eq_("Here you go", link.content)
def test_get_series_information(self):
    """get_series_information extracts series name/position and reports
    whether 'full_title' or 'main_title' best identifies the volume."""
    metadata = Metadata(data_source=DataSource.NOVELIST)
    vampire = json.loads(self.sample_data("vampire_kisses.json"))
    book_info = vampire['TitleInfo']
    series_info = vampire['FeatureContent']['SeriesInfo']

    (metadata, ideal_title_key) = self.novelist.get_series_information(
        metadata, series_info, book_info
    )
    # Relevant series information is extracted
    eq_('Vampire kisses manga', metadata.series)
    eq_(1, metadata.series_position)
    # The 'full_title' key should be returned as ideal because
    # all the volumes have the same 'main_title'
    eq_('full_title', ideal_title_key)

    watchman = json.loads(
        self.sample_data("alternate_series_example.json"))
    book_info = watchman['TitleInfo']
    series_info = watchman['FeatureContent']['SeriesInfo']
    # Confirms that the new example doesn't match any volume's full title
    eq_([], [v for v in series_info['series_titles']
             if v.get('full_title') == book_info.get('full_title')])

    # But it still finds its matching volume
    (metadata, ideal_title_key) = self.novelist.get_series_information(
        metadata, series_info, book_info
    )
    eq_('Elvis Cole/Joe Pike novels', metadata.series)
    eq_(11, metadata.series_position)
    # And recommends using the main_title
    eq_('main_title', ideal_title_key)

    # If the volume is found in the series more than once...
    book_info = dict(
        main_title='The Baby-Sitters Club',
        full_title='The Baby-Sitters Club: Claudia and Mean Janine'
    )
    series_info = dict(
        full_title='The Baby-Sitters Club series',
        series_titles=[
            # The volume is here twice!
            book_info,
            book_info,
            dict(
                full_title='The Baby-Sitters Club',
                main_title='The Baby-Sitters Club: Claudia and Mean Janine',
                series_position='3.'
            )
        ]
    )
    # An error is raised.
    assert_raises(
        ValueError, self.novelist.get_series_information,
        metadata, series_info, book_info
    )
def process_item(self, identifier):
    """Reapply an edition's own metadata, then queue its work for
    recalculation; returns the identifier on success or a failure
    object otherwise."""
    edition = self.edition(identifier)
    Metadata.from_edition(edition).apply(
        edition, self.collection, replace=self.replacement_policy
    )
    failure = self.register_work_for_calculation(identifier)
    return failure if failure else identifier
def test_set_equivalence(self):
    """set_equivalence links an identifier only to metadata that matches
    its existing edition -- unless no edition exists to compare against."""
    edition = self._edition()
    edition.title = "The House on Mango Street"
    edition.add_contributor(Contributor(viaf="112460612"),
                            Contributor.AUTHOR_ROLE)
    identifier = edition.primary_identifier

    # Metadata that agrees with the edition's title and author.
    i1 = self._identifier()
    identifierdata1 = IdentifierData(type=i1.type, identifier=i1.identifier)
    good_metadata = Metadata(
        DataSource.lookup(self._db, DataSource.GUTENBERG),
        primary_identifier=identifierdata1,
        title="The House on Mango Street",
        contributors=[Contributor(viaf="112460612")])

    # Metadata for an entirely different book.
    i2 = self._identifier()
    identifierdata2 = IdentifierData(type=i2.type, identifier=i2.identifier)
    bad_metadata = Metadata(
        DataSource.lookup(self._db, DataSource.GUTENBERG),
        primary_identifier=identifierdata2,
        title="Calvin & Hobbes",
        contributors=[Contributor(viaf="101010")])

    self.provider.set_equivalence(identifier, good_metadata)
    self.provider.set_equivalence(identifier, bad_metadata)
    equivalencies = Equivalency.for_identifiers(
        self._db, [identifier]).all()

    # The identifier for the bad metadata isn't made equivalent
    eq_([i1], [x.output for x in equivalencies])
    eq_([1], [x.strength for x in equivalencies])

    # But if the existing identifier has no editions, they're made
    # equivalent.
    identifier = self._identifier()
    self.provider.set_equivalence(identifier, bad_metadata)
    equivalencies = Equivalency.for_identifiers(
        self._db, [identifier]).all()
    eq_([i2], [x.output for x in equivalencies])
    eq_([1], [x.strength for x in equivalencies])
def test_add_author_notes(self):
    """Verify that add_author_notes works in a real case."""
    metadata = Metadata(DataSource.CONTENT_CAFE)
    html = self.data_file("author_notes.html")
    self.http.queue_requests_response(200, 'text/html', content=html)

    self.api.add_author_notes(metadata, self.identifier, self.args)

    # The single link hanging off the Metadata is the author notes.
    [notes] = metadata.links
    eq_(Hyperlink.AUTHOR, notes.rel)
    assert 'Brenda researched turtles' in notes.content

    # We incidentally figured out the book's title.
    eq_("Franklin's Christmas Gift", metadata.title)
def test_add_excerpt(self):
    """Verify that add_excerpt works in a real case."""
    metadata = Metadata(DataSource.CONTENT_CAFE)
    html = self.data_file("excerpt.html")
    self.http.queue_requests_response(200, 'text/html', content=html)

    self.api.add_excerpt(metadata, self.identifier, self.args)

    # The single link hanging off the Metadata is the excerpt.
    [excerpt] = metadata.links
    eq_(Hyperlink.SAMPLE, excerpt.rel)
    assert 'Franklin loved his marbles.' in excerpt.content

    # We incidentally figured out the book's title.
    eq_("Franklin's Christmas Gift", metadata.title)
def test_new_isbns(self):
    """new_isbns counts identifiers not already present in the database."""
    existing_id = self._identifier()
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    metadata = Metadata(
        gutenberg,
        identifiers=[
            IdentifierData(type=Identifier.OCLC_WORK, identifier="abra"),
            IdentifierData(
                type=existing_id.type, identifier=existing_id.identifier),
            IdentifierData(type=Identifier.ISBN, identifier="kadabra"),
        ],
    )

    # Two of the three identifiers are new to the database.
    eq_(2, self.provider.new_isbns(metadata))
def setup(self):
    """Configure a NoveListCoverageProvider backed by a mock API and a
    reusable Metadata fixture."""
    super(TestNoveListCoverageProvider, self).setup()
    with temp_config() as config:
        config['integrations'][Configuration.NOVELIST_INTEGRATION] = {
            Configuration.NOVELIST_PROFILE: "library",
            Configuration.NOVELIST_PASSWORD: "******"
        }
        self.novelist = NoveListCoverageProvider(self._db)
    # Replace the real API with a mock so no network calls happen.
    self.novelist.api = MockNoveListAPI()

    self.metadata = Metadata(
        data_source=self.novelist.source,
        primary_identifier=self._identifier(
            identifier_type=Identifier.NOVELIST_ID),
        title=u"The Great American Novel")
def test_add_reviews(self):
    """Verify that add_reviews works in a real case."""
    metadata = Metadata(DataSource.CONTENT_CAFE)
    html = self.data_file("reviews.html")
    self.http.queue_requests_response(200, 'text/html', content=html)

    self.api.add_reviews(metadata, self.identifier, self.args)

    # We extracted six reviews from the sample file.
    reviews = metadata.links
    eq_(6, len(reviews))
    assert all([x.rel == Hyperlink.REVIEW for x in reviews])
    assert "isn't a myth!" in reviews[0].content

    # We incidentally figured out the book's title.
    eq_("Shadow Thieves", metadata.title)
def setup(self):
    """Create a real NoveList integration, then swap in a mock API and a
    reusable Metadata fixture."""
    super(TestNoveListCoverageProvider, self).setup()
    self.integration = self._external_integration(
        ExternalIntegration.NOVELIST, ExternalIntegration.METADATA_GOAL,
        username=u'library', password=u'yep',
        libraries=[self._default_library])

    self.novelist = NoveListCoverageProvider(self._db)
    # Replace the real API with a mock so no network calls happen.
    self.novelist.api = MockNoveListAPI.from_config(self._default_library)
    self.metadata = Metadata(
        data_source=self.novelist.data_source,
        primary_identifier=self._identifier(
            identifier_type=Identifier.NOVELIST_ID),
        title=u"The Great American Novel")
def _fetch_remote_availability(self, identifiers):
    """Simulate an Axis 360 availability response in which only the
    first identifier is still known to the distributor.

    :param identifiers: A list of Identifier objects.
    :yield: At most one (Metadata, CirculationData) pair -- for the
        first identifier only. An empty list yields nothing.
    """
    # (The original looped with enumerate() but never used the index;
    # a plain loop with a trailing break preserves the exact behavior.)
    for identifier in identifiers:
        # The first identifier in the list is still available.
        identifier_data = IdentifierData(
            type=identifier.type, identifier=identifier.identifier)
        metadata = Metadata(
            data_source=DataSource.AXIS_360,
            primary_identifier=identifier_data)
        availability = CirculationData(
            data_source=DataSource.AXIS_360,
            primary_identifier=identifier_data,
            licenses_owned=7,
            licenses_available=6)
        yield metadata, availability

        # The rest have been 'forgotten' by Axis 360.
        break
def create_metadata(self, isbn_identifier):
    """Make a Metadata object for the given Identifier.

    The Metadata object may include a cover image, descriptions,
    reviews, an excerpt, author notes, and a popularity measurement.

    :return: A Metadata object, or None if Content Cafe has no
        knowledge of this ISBN.
    """
    isbn = isbn_identifier.identifier
    args = dict(userid=self.user_id, password=self.password, isbn=isbn)
    image_url = self.image_url % args
    response = self.do_get(image_url)
    if response.status_code == 404:
        # Content Cafe served us an HTML page instead of an
        # image. This indicates that Content Cafe has no knowledge
        # of this ISBN -- if it knew _anything_ it would have a
        # cover image. There is no need to build a Metadata object.
        return None
    media_type = response.headers.get('Content-Type', 'image/jpeg')

    # Start building a Metadata object.
    metadata = Metadata(self.data_source,
                        primary_identifier=isbn_identifier)

    # Add the cover image to it
    image = response.content
    if self.is_suitable_image(image):
        metadata.links.append(
            LinkData(rel=Hyperlink.IMAGE, href=image_url,
                     media_type=media_type, content=response.content))

    # Scrape each remaining resource type from its respective page.
    for annotator in (self.add_descriptions, self.add_excerpt,
                      self.add_reviews, self.add_author_notes):
        annotator(metadata, isbn_identifier, args)

    popularity = self.measure_popularity(
        isbn_identifier, self.ONE_YEAR_AGO)
    if popularity:
        metadata.measurements.append(popularity)
    return metadata
# level of confidence. for isbn in d.get('isbns', []): isbn13 = isbn.get('isbn13', None) if isbn13: other_isbns.append( IdentifierData(Identifier.ISBN, isbn13, 0.50)) primary_isbn = primary_isbn13 or primary_isbn10 if primary_isbn: primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90) contributors = [] if display_author: contributors.append(ContributorData(display_name=display_author)) metadata = Metadata( data_source=DataSource.NYT, title=title, medium=medium, language='eng', published=published_date, publisher=publisher, contributors=contributors, primary_identifier=primary_isbn, identifiers=other_isbns, ) super(NYTBestSellerListTitle, self).__init__(metadata, first_appearance, most_recent_appearance, annotation)
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object

    :return: A Metadata object, or None if the response is empty, the
        ISBN was unknown to NoveList, or nothing of interest came back.
    """
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info['TitleInfo']
    if book_info:
        novelist_identifier = book_info.get('ui')
    if not book_info or not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None
    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier)
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(
            LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                     media_type=Representation.TEXT_PLAIN))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(
            SubjectData(Subject.FREEFORM_AUDIENCE, audience_level))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(
            MeasurementData(Measurement.RATING, novelist_rating))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info)
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title'))
    metadata.subtitle = self._scrub_subtitle(subtitle)

    # TODO: How well do we trust this data? We could conceivably bump up
    # the weight here.
    if appeals_info:
        extracted_genres = False
        # Only the first appeal that carries genres contributes tags.
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(
                        SubjectData(Subject.TAG, genre['Name']))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(
            SubjectData(Subject.LEXILE_SCORE, lexile_info['Lexile']))

    if goodreads_info:
        metadata.measurements.append(
            MeasurementData(Measurement.RATING,
                            goodreads_info['average_rating']))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
def test_initialization(self):
    # Asserts that a RelatedBooksLane won't be initialized for a work
    # without related books

    # A book without a series or a contributor on a circ manager without
    # NoveList recommendations raises an error.
    self._db.delete(self.edition.contributions[0])
    self._db.commit()
    pytest.raises(
        ValueError, RelatedBooksLane, self._default_library, self.work, ""
    )

    # A book with a contributor initializes a RelatedBooksLane.
    luthor, i = self._contributor("Luthor, Lex")
    self.edition.add_contributor(luthor, [Contributor.EDITOR_ROLE])
    result = RelatedBooksLane(self._default_library, self.work, "")
    assert self.work == result.work
    [sublane] = result.children
    assert True == isinstance(sublane, ContributorLane)
    assert sublane.contributor == luthor

    # As does a book in a series.
    self.edition.series = "All By Myself"
    result = RelatedBooksLane(self._default_library, self.work, "")
    assert 2 == len(result.children)
    [contributor, series] = result.children
    assert True == isinstance(series, SeriesLane)

    # When NoveList is configured and recommendations are available,
    # a RecommendationLane will be included.
    self._external_integration(
        ExternalIntegration.NOVELIST,
        goal=ExternalIntegration.METADATA_GOAL,
        username="******",
        password="******",
        libraries=[self._default_library],
    )
    mock_api = MockNoveListAPI(self._db)
    response = Metadata(
        self.edition.data_source, recommendations=[self._identifier()]
    )
    mock_api.setup_method(response)
    result = RelatedBooksLane(
        self._default_library, self.work, "", novelist_api=mock_api
    )
    assert 3 == len(result.children)

    [novelist_recommendations] = [
        x for x in result.children if isinstance(x, RecommendationLane)
    ]
    assert (
        "Similar titles recommended by NoveList"
        == novelist_recommendations.display_name
    )

    # The book's language and audience list is passed down to all sublanes.
    assert ["eng"] == result.languages
    for sublane in result.children:
        assert result.languages == sublane.languages
        if isinstance(sublane, SeriesLane):
            assert [result.source_audience] == sublane.audiences
        else:
            assert sorted(list(result.audiences)) == sorted(
                list(sublane.audiences))

    contributor, recommendations, series = result.children
    assert True == isinstance(recommendations, RecommendationLane)
    assert True == isinstance(series, SeriesLane)
    assert True == isinstance(contributor, ContributorLane)
def lookup_info_to_metadata(self, lookup_representation):
    """Transform a NoveList JSON representation into a Metadata object.

    :param lookup_representation: A Representation whose content is
        the raw JSON document returned by the NoveList lookup API.
    :return: A Metadata object, or None when the response is empty,
        NoveList didn't recognize the ISBN, or nothing of interest
        came back from the API.
    """
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    # Use .get() so a response with no 'TitleInfo' key is treated the
    # same as an empty one, instead of raising KeyError.
    book_info = lookup_info.get('TitleInfo')
    novelist_identifier = book_info.get('ui') if book_info else None
    if not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier
    )
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(LinkData(
            rel=Hyperlink.DESCRIPTION, content=description,
            media_type=Representation.TEXT_PLAIN
        ))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(SubjectData(
            Subject.FREEFORM_AUDIENCE, audience_level
        ))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, novelist_rating
        ))

    # Extract feature content if it is available.
    series_info = None
    appeals_info = None
    lexile_info = None
    goodreads_info = None
    recommendations_info = None
    feature_content = lookup_info.get('FeatureContent')
    if feature_content:
        series_info = feature_content.get('SeriesInfo')
        appeals_info = feature_content.get('Appeals')
        lexile_info = feature_content.get('LexileInfo')
        goodreads_info = feature_content.get('GoodReads')
        recommendations_info = feature_content.get('SimilarTitles')

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title')
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    if appeals_info:
        # Only the first appeal that carries genres contributes tags.
        extracted_genres = False
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(SubjectData(
                        Subject.TAG, genre['Name']
                    ))
                    extracted_genres = True
            if extracted_genres:
                break

    if lexile_info:
        metadata.subjects.append(SubjectData(
            Subject.LEXILE_SCORE, lexile_info['Lexile']
        ))

    if goodreads_info:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, goodreads_info['average_rating']
        ))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
def test_work_from_metadata(self):
    """Validate the ability to create a new Work from appropriate
    metadata.
    """
    class Mock(MockDirectoryImportScript):
        """In this test we need to verify that annotate_metadata
        was called but did nothing.
        """
        def annotate_metadata(self, metadata, *args, **kwargs):
            metadata.annotated = True
            return super(Mock, self).annotate_metadata(
                metadata, *args, **kwargs
            )

    identifier = IdentifierData(Identifier.GUTENBERG_ID, "1003")
    identifier_obj, ignore = identifier.load(self._db)
    metadata = Metadata(
        DataSource.GUTENBERG,
        primary_identifier=identifier,
        title=u"A book"
    )
    metadata.annotated = False
    datasource = DataSource.lookup(self._db, DataSource.GUTENBERG)
    policy = ReplacementPolicy.from_license_source(self._db)
    mirror = MockS3Uploader()
    policy.mirror = mirror

    # Here, work_from_metadata calls annotate_metadata, but does
    # not actually import anything because there are no files 'on
    # disk' and thus no way to actually get the book.
    collection = self._default_collection
    args = (collection, metadata, policy, "cover directory",
            "ebook directory", RightsStatus.CC0)
    script = Mock(self._db)
    eq_(None, script.work_from_metadata(*args))
    eq_(True, metadata.annotated)

    # Now let's try it with some files 'on disk'.
    #
    # Open the cover in binary mode -- it's a PNG, and reading it in
    # text mode would corrupt it on Windows and fail outright on
    # Python 3.
    with open(self.sample_cover_path('test-book-cover.png'), 'rb') as fh:
        image = fh.read()
    mock_filesystem = {
        'cover directory': (
            'cover.jpg', Representation.JPEG_MEDIA_TYPE, image
        ),
        'ebook directory': (
            'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
        )
    }
    script = MockDirectoryImportScript(
        self._db, mock_filesystem=mock_filesystem
    )
    work = script.work_from_metadata(*args)

    # We have created a book. It has a cover image, which has a
    # thumbnail.
    eq_("A book", work.title)
    assert work.cover_full_url.endswith(
        '/test.cover.bucket/Gutenberg/Gutenberg+ID/1003/1003.jpg'
    )
    assert work.cover_thumbnail_url.endswith(
        '/test.cover.bucket/scaled/300/Gutenberg/Gutenberg+ID/1003/1003.png'
    )
    [pool] = work.license_pools
    assert pool.open_access_download_url.endswith(
        '/test.content.bucket/Gutenberg/Gutenberg+ID/1003/A+book.epub'
    )
    eq_(RightsStatus.CC0, pool.delivery_mechanisms[0].rights_status.uri)

    # The mock S3Uploader has a record of 'uploading' all these files
    # to S3.
    epub, full, thumbnail = mirror.uploaded
    eq_(epub.url, pool.open_access_download_url)
    eq_(full.url, work.cover_full_url)
    eq_(thumbnail.url, work.cover_thumbnail_url)

    # The EPUB Representation was cleared out after the upload, to
    # save database space.
    eq_("I'm an EPUB.", mirror.content[0])
    eq_(None, epub.content)
def test_annotate_metadata(self): """Verify that annotate_metadata calls load_circulation_data and load_cover_link appropriately. """ # First, test an unsuccessful annotation. class MockNoCirculationData(DirectoryImportScript): """Do nothing when load_circulation_data is called. Explode if load_cover_link is called. """ def load_circulation_data(self, *args): self.load_circulation_data_args = args return None def load_cover_link(self, *args): raise Exception("Explode!") gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG) identifier = IdentifierData(Identifier.GUTENBERG_ID, "11111") identifier_obj, ignore = identifier.load(self._db) metadata = Metadata( title=self._str, data_source=gutenberg, primary_identifier=identifier ) mirror = object() policy = ReplacementPolicy(mirror=mirror) cover_directory = object() ebook_directory = object() rights_uri = object() script = MockNoCirculationData(self._db) args = (metadata, policy, cover_directory, ebook_directory, rights_uri) script.annotate_metadata(*args) # load_circulation_data was called. eq_( (identifier_obj, gutenberg, ebook_directory, mirror, metadata.title, rights_uri), script.load_circulation_data_args ) # But because load_circulation_data returned None, # metadata.circulation_data was not modified and # load_cover_link was not called (which would have raised an # exception). eq_(None, metadata.circulation) # Test a successful annotation with no cover image. class MockNoCoverLink(DirectoryImportScript): """Return an object when load_circulation_data is called. Do nothing when load_cover_link is called. """ def load_circulation_data(self, *args): return "Some circulation data" def load_cover_link(self, *args): self.load_cover_link_args = args return None script = MockNoCoverLink(self._db) script.annotate_metadata(*args) # The Metadata object was annotated with the return value of # load_circulation_data. eq_("Some circulation data", metadata.circulation) # load_cover_link was called. 
eq_( (identifier_obj, gutenberg, cover_directory, mirror), script.load_cover_link_args ) # But since it provided no cover link, metadata.links was empty. eq_([], metadata.links) # Finally, test a completely successful annotation. class MockWithCoverLink(DirectoryImportScript): """Mock success for both load_circulation_data and load_cover_link. """ def load_circulation_data(self, *args): return "Some circulation data" def load_cover_link(self, *args): return "A cover link" metadata.circulation = None script = MockWithCoverLink(self._db) script.annotate_metadata(*args) eq_("Some circulation data", metadata.circulation) eq_(['A cover link'], metadata.links)
def extract_bibliographic(self, element):
    """Extract Metadata and CirculationData from a dictionary
    of information from Enki.

    :param element: One title dictionary from an Enki API response.
    :return: A Metadata with attached CirculationData.
    """
    # TODO: it's not clear what these are or whether we'd find them
    # useful:
    # dateSaved
    # length
    # publishDate
    primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

    identifiers = []
    identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

    contributors = []
    sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors.append(ContributorData(sort_name=sort_name))

    links = []
    description = element.get('description')
    if description:
        links.append(
            LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                     media_type="text/html")
        )

    # NOTE: When this method is called by, e.g. updated_titles(),
    # the large and small images are available separately. When
    # this method is called by get_item(), we only get a single
    # image, in 'cover'. In get_item() we ask that that image be 'large',
    # which means we'll be filing it as a normal-sized image.
    #
    full_image = None
    thumbnail_image = None
    for key, rel in (
        ('cover', Hyperlink.IMAGE),
        ('small_image', Hyperlink.THUMBNAIL_IMAGE),
        ('large_image', Hyperlink.IMAGE)
    ):
        url = element.get(key)
        if not url:
            continue
        link = LinkData(
            rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE
        )
        if rel == Hyperlink.THUMBNAIL_IMAGE:
            # Don't add a thumbnail to the list of links -- wait
            # until the end and then make it a thumbnail of the
            # primary image.
            thumbnail_image = link
        else:
            if rel == Hyperlink.IMAGE:
                full_image = link
            links.append(link)

    if thumbnail_image:
        if full_image:
            # Set the thumbnail as the thumbnail _of_ the full image.
            full_image.thumbnail = thumbnail_image
        else:
            # Treat the thumbnail as the full image.
            thumbnail_image.rel = Hyperlink.IMAGE
            links.append(thumbnail_image)

    # We treat 'subject', 'topic', and 'genre' as interchangeable
    # sets of tags. This data is based on BISAC but it's not reliably
    # presented in a form that can be parsed as BISAC.
    subjects = []
    seen_topics = set()
    for key in ('subject', 'topic', 'genre'):
        for topic in element.get(key, []):
            if not topic or topic in seen_topics:
                continue
            subjects.append(SubjectData(Subject.TAG, topic))
            seen_topics.add(topic)

    # Enki reports language by English name; default everything
    # unrecognized (and the default "English") to 'eng'.
    language_code = element.get("language", "English")
    language = self.LANGUAGE_CODES.get(language_code, "eng")

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )
    circulationdata = self.extract_circulation(
        primary_identifier,
        element.get('availability', {}),
        element.get('formattype', None)
    )

    metadata.circulation = circulationdata
    return metadata
def test_improve_description(self):
    """improve_description should replace a truncated description with
    the better one found by following an ALTERNATE link.
    """
    # A Metadata whose only descriptions are a truncated one and an
    # irrelevant one.
    metadata = Metadata(self.data_source)
    truncated_description = LinkData(
        rel=Hyperlink.DESCRIPTION, media_type="text/plain",
        content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour..."
    )
    irrelevant_description = LinkData(
        rel=Hyperlink.DESCRIPTION, media_type="text/plain",
        content="Don't look at me; I'm irrelevant!"
    )

    # Sending an HTTP request to this URL is going to give a 404 error.
    alternate_404 = LinkData(
        rel=Hyperlink.ALTERNATE, href="http://foo/",
        media_type=OPDSFeed.ENTRY_TYPE
    )

    # We're not even going to try to send an HTTP request to this URL
    # because it doesn't promise an OPDS entry.
    alternate_non_opds = LinkData(
        rel=Hyperlink.ALTERNATE, href="http://bar/",
        media_type="text/html"
    )

    # But this URL will give us full information about this entry,
    # including a better description.
    alternate_good = LinkData(
        rel=Hyperlink.ALTERNATE, href="http://baz/",
        media_type=OPDSFeed.ENTRY_TYPE
    )

    # This URL will not be requested because the third alternate URL
    # gives us the answer we're looking for.
    alternate_unused = LinkData(
        rel=Hyperlink.ALTERNATE, href="http://qux/",
        media_type=OPDSFeed.ENTRY_TYPE
    )

    # Two requests will be made. The first will result in a 404
    # error. The second will give us an OPDS entry.
    self.http.queue_response(404, content="Not found")
    self.http.queue_response(
        200, OPDSFeed.ENTRY_TYPE, content=self.sample_file("677.atom")
    )

    metadata.links = [
        truncated_description, irrelevant_description, alternate_404,
        alternate_non_opds, alternate_good, alternate_unused,
    ]
    self.importer.improve_description("some ID", metadata)

    # Both old descriptions were dropped from metadata.links, because
    # 677.atom included a description we know was better -- the
    # truncated one was removed even though 677.atom also included a
    # copy of it.
    assert truncated_description not in metadata.links
    assert irrelevant_description not in metadata.links

    # The more complete description from 677.atom has been added.
    [good_description] = [
        x for x in metadata.links if x.rel == Hyperlink.DESCRIPTION
    ]

    # None of the four alternate links were touched.
    for alternate_link in (alternate_404, alternate_non_opds,
                           alternate_good, alternate_unused):
        assert alternate_link in metadata.links

    # Two HTTP requests were made.
    eq_(['http://foo/', 'http://baz/'], self.http.requests)
def parse_book(cls, collection, g, uri, title):
    """Turn an RDF graph into a Edition for the given `uri` and
    `title`.

    :param collection: The Collection the resulting LicensePool will
        belong to.
    :param g: An rdflib Graph of Project Gutenberg catalog data.
    :param uri: The subject URI identifying one book in the graph.
    :param title: The book's title, as found in the graph.
    :return: A 3-tuple (edition, license_pool, new) where `new` is
        whether the Edition was newly created.
    """
    source_id = unicode(cls.ID_IN_URI.search(uri).groups()[0])
    primary_identifier = IdentifierData(
        Identifier.GUTENBERG_ID, source_id
    )

    # Split a subtitle out from the main title.
    title = unicode(title)
    subtitle = None
    for separator in "\r\n", "\n":
        if separator in title:
            parts = title.split(separator)
            title = parts[0]
            subtitle = "\n".join(parts[1:])
            break

    issued = cls._value(g, (uri, cls.dcterms.issued, None))
    issued = datetime.datetime.strptime(issued, cls.DATE_FORMAT).date()

    rights = cls._value(g, (uri, cls.dcterms.rights, None))
    if rights:
        rights = str(rights)
    else:
        rights = ''
    rights_uri = RightsStatus.rights_uri_from_string(rights)

    # As far as I can tell, Gutenberg descriptions are 100%
    # useless for our purposes. They should not be used, even if
    # no other description is available.

    publisher = cls._value(g, (uri, cls.dcterms.publisher, None))

    # Collect every language attached to the book, preferring
    # English if it's among them.
    languages = []
    for ignore, ignore, language_uri in g.triples(
            (uri, cls.dcterms.language, None)):
        code = str(cls._value(g, (language_uri, cls.rdf.value, None)))
        code = LanguageCodes.two_to_three[code]
        if code:
            languages.append(code)

    if 'eng' in languages:
        language = 'eng'
    elif languages:
        language = languages[0]
    else:
        language = None

    contributors = []
    for ignore, ignore, author_uri in g.triples((uri, cls.dcterms.creator, None)):
        name = cls._value(g, (author_uri, cls.gutenberg.name, None))
        aliases = cls._values(g, (author_uri, cls.gutenberg.alias, None))
        contributors.append(ContributorData(
            sort_name=name,
            aliases=aliases,
            roles=[Contributor.AUTHOR_ROLE],
        ))

    subjects = []
    subject_links = cls._values(g, (uri, cls.dcterms.subject, None))
    for subject in subject_links:
        value = cls._value(g, (subject, cls.rdf.value, None))
        vocabulary = cls._value(g, (subject, cls.dcam.memberOf, None))
        vocabulary = Subject.by_uri[str(vocabulary)]
        subjects.append(SubjectData(vocabulary, value))

    medium = Edition.BOOK_MEDIUM

    # Turn the Gutenberg download links into Hyperlinks associated
    # with the new Edition. They will serve either as open access
    # downloads or cover images.
    download_links = cls._values(g, (uri, cls.dcterms.hasFormat, None))
    links = [LinkData(
        rel=Hyperlink.CANONICAL,
        href=str(uri),
    )]

    # Gutenberg won't allow us to use any of the download or image
    # links--we have to make our own from an rsynced mirror--but
    # we can look through the links to determine which medium to
    # assign to this book.
    formats = []
    for href in download_links:
        for format_uri in cls._values(
                g, (href, cls.dcterms['format'], None)):
            media_type = unicode(
                cls._value(g, (format_uri, cls.rdf.value, None)))
            if media_type.startswith('audio/'):
                medium = Edition.AUDIO_MEDIUM
                formats.append(FormatData(
                    content_type=Representation.MP3_MEDIA_TYPE,
                    drm_scheme=DeliveryMechanism.NO_DRM,
                ))
            elif media_type.startswith('video/'):
                # Videos get a medium but no FormatData.
                medium = Edition.VIDEO_MEDIUM
            else:
                formats.append(FormatData(
                    content_type=Representation.EPUB_MEDIA_TYPE,
                    drm_scheme=DeliveryMechanism.NO_DRM,
                    rights_uri=rights_uri,
                ))

    _db = Session.object_session(collection)
    metadata = Metadata(
        data_source=DataSource.GUTENBERG,
        title=title,
        subtitle=subtitle,
        language=language,
        publisher=publisher,
        issued=issued,
        medium=medium,
        primary_identifier=primary_identifier,
        subjects=subjects,
        contributors=contributors,
        links=links,
    )
    edition, new = metadata.edition(_db)
    metadata.apply(edition, collection)

    # Ensure that an open-access LicensePool exists for this book.
    circulation_data = CirculationData(
        data_source=DataSource.GUTENBERG,
        primary_identifier=primary_identifier,
        formats=formats,
        default_rights_uri=rights_uri,
        links=links,
    )

    license_pool, new_license_pool = circulation_data.license_pool(
        _db, collection
    )
    replace = ReplacementPolicy(formats=True)
    circulation_data.apply(_db, collection, replace=replace)
    license_pool.calculate_work()
    return edition, license_pool, new
class TestCirculationMonitor(Axis360Test):

    # Canned bibliographic information for one Axis 360 title,
    # used as input to process_book() in the tests below.
    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(
            type=Identifier.AXIS_360_ID,
            identifier=u'0003642860'
        ),
        identifiers=[
            IdentifierData(type=Identifier.ISBN, identifier=u'9780375504587')
        ],
        contributors=[
            ContributorData(sort_name=u"McCain, John",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE]),
            ContributorData(sort_name=u"Salter, Mark",
                            roles=[Contributor.AUTHOR_ROLE]),
        ],
        subjects=[
            SubjectData(type=Subject.BISAC,
                        identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE,
                        identifier=u'Adult'),
        ],
    )

    # Matching canned availability information for the same title.
    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def test_process_book(self):
        """process_book() turns the canned Metadata/CirculationData into
        an Edition, a LicensePool, and a presentation-ready Work.
        """
        integration, ignore = create(
            self._db, ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = Axis360CirculationMonitor(
            self._db, self.collection, api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url')
        )
        edition, license_pool = monitor.process_book(
            self.BIBLIOGRAPHIC_DATA, self.AVAILABILITY_DATA
        )

        # The Edition reflects the bibliographic data.
        eq_(u'Faith of My Fathers : A Family Memoir', edition.title)
        eq_(u'eng', edition.language)
        eq_(u'Random House Inc', edition.publisher)
        eq_(u'Random House Inc2', edition.imprint)

        eq_(Identifier.AXIS_360_ID, edition.primary_identifier.type)
        eq_(u'0003642860', edition.primary_identifier.identifier)

        [isbn] = [
            x for x in edition.equivalent_identifiers()
            if x is not edition.primary_identifier
        ]
        eq_(Identifier.ISBN, isbn.type)
        eq_(u'9780375504587', isbn.identifier)

        eq_(
            ["McCain, John", "Salter, Mark"],
            sorted([x.sort_name for x in edition.contributors]),
        )

        subs = sorted(
            (x.subject.type, x.subject.identifier)
            for x in edition.primary_identifier.classifications
        )
        eq_([(Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
             (Subject.FREEFORM_AUDIENCE, u'Adult')], subs)

        # The LicensePool reflects the availability data.
        eq_(9, license_pool.licenses_owned)
        eq_(8, license_pool.licenses_available)
        eq_(0, license_pool.patrons_in_hold_queue)
        eq_(datetime.datetime(2015, 5, 20, 2, 9, 8),
            license_pool.last_checked)

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        events = license_pool.circulation_events
        eq_([u'distributor_title_add', u'distributor_check_in',
             u'distributor_license_add'],
            [x.type for x in events])
        for e in events:
            eq_(e.start, license_pool.last_checked)

        # A presentation-ready work has been created for the LicensePool.
        work = license_pool.work
        eq_(True, work.presentation_ready)
        eq_("Faith of My Fathers : A Family Memoir", work.title)

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        records = [
            x for x in license_pool.identifier.coverage_records
            if x.data_source.name == DataSource.AXIS_360
            and x.operation is None
        ]
        eq_(1, len(records))

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860'
        )
        # We start off with availability information based on the
        # default for test data.
        eq_(1, licensepool.licenses_owned)

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier
        )
        metadata = Metadata(DataSource.AXIS_360, primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db, self.collection, api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url')
        )
        edition, licensepool = monitor.process_book(
            metadata, self.AVAILABILITY_DATA
        )

        # Now we have information based on the CirculationData.
        eq_(9, licensepool.licenses_owned)
def extract_bibliographic(self, element):
    """Build a Metadata object, with CirculationData attached, out of
    one Enki title dictionary.

    :return: A Metadata with attached CirculationData.
    """
    # TODO: it's not clear what these are or whether we'd find them
    # useful:
    # dateSaved
    # length
    # publishDate
    enki_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
    identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

    author_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
    contributors = [ContributorData(sort_name=author_name)]

    links = []
    summary = element.get("description")
    if summary:
        links.append(
            LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=summary,
                media_type="text/html",
            )
        )

    # NOTE: When this method is called by, e.g. updated_titles(),
    # the large and small images are available separately. When
    # this method is called by get_item(), we only get a single
    # image, in 'cover'. In get_item() we ask that that image be
    # 'large', which means we'll be filing it as a normal-sized image.
    full_image = None
    thumbnail_image = None
    image_fields = (
        ("cover", Hyperlink.IMAGE),
        ("small_image", Hyperlink.THUMBNAIL_IMAGE),
        ("large_image", Hyperlink.IMAGE),
    )
    for field, rel in image_fields:
        image_url = element.get(field)
        if not image_url:
            continue
        image_link = LinkData(
            rel=rel, href=image_url,
            media_type=Representation.PNG_MEDIA_TYPE,
        )
        if rel == Hyperlink.THUMBNAIL_IMAGE:
            # Hold the thumbnail back -- at the end it either becomes
            # a thumbnail of the primary image, or is promoted to the
            # primary image itself.
            thumbnail_image = image_link
        else:
            full_image = image_link
            links.append(image_link)

    if thumbnail_image:
        if full_image:
            # The thumbnail is a thumbnail _of_ the full image.
            full_image.thumbnail = thumbnail_image
        else:
            # There is no full-size image; promote the thumbnail.
            thumbnail_image.rel = Hyperlink.IMAGE
            links.append(thumbnail_image)

    # 'subject', 'topic', and 'genre' are interchangeable sets of
    # tags. The data is based on BISAC but isn't reliably presented
    # in a form that can be parsed as BISAC.
    subjects = []
    seen_topics = set()
    for field in ("subject", "topic", "genre"):
        for tag in element.get(field, []):
            if not tag or tag in seen_topics:
                continue
            subjects.append(
                SubjectData(
                    Subject.TAG,
                    tag,
                    weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                )
            )
            seen_topics.add(tag)

    # Enki reports the language by English name; anything
    # unrecognized falls back to 'eng'.
    language = self.LANGUAGE_CODES.get(
        element.get("language", "English"), "eng"
    )

    metadata = Metadata(
        data_source=DataSource.ENKI,
        title=element.get("title"),
        language=language,
        medium=Edition.BOOK_MEDIUM,
        publisher=element.get("publisher"),
        primary_identifier=enki_identifier,
        identifiers=identifiers,
        contributors=contributors,
        links=links,
        subjects=subjects,
    )
    metadata.circulation = self.extract_circulation(
        enki_identifier,
        element.get("availability", {}),
        element.get("formattype", None),
    )
    return metadata
def record_info_to_metadata(cls, book, availability):
    """Turn Odilo's JSON representation of a book into a Metadata
    object.

    Note: The json data passed into this method is from a different
    file/stream from the json data that goes into the
    book_info_to_circulation() method.

    :param book: Dictionary of record information from Odilo.
    :param availability: Availability information, passed through to
        record_info_to_circulation().
    :return: A 2-tuple (metadata, active), or None if the record has
        no 'id'. `active` is Odilo's flag for whether the book is
        currently in the collection.
    """
    if 'id' not in book:
        return None

    odilo_id = book['id']
    primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
    active = book.get('active')

    title = book.get('title')
    subtitle = book.get('subtitle')
    series = book.get('series')
    series_position = book.get('seriesPosition')

    contributors = []
    sort_author = book.get('author')
    if sort_author:
        roles = [Contributor.AUTHOR_ROLE]
        display_author = sort_name_to_display_name(sort_author)
        contributor = ContributorData(
            sort_name=sort_author, display_name=display_author,
            roles=roles, biography=None
        )
        contributors.append(contributor)

    publisher = book.get('publisher')

    # Metadata --> Marc21 260$c
    published = book.get('publicationDate')
    if not published:
        # yyyyMMdd --> record creation date
        published = book.get('releaseDate')

    if published:
        try:
            published = datetime.datetime.strptime(published, "%Y%m%d")
        except ValueError as e:
            # Lazy %-style logging args; this also avoids the
            # Python-2-only `e.message` attribute, which does not
            # exist on Python 3.
            cls.log.warn(
                'Cannot parse publication date from: %s, message: %s',
                published, e
            )

    # yyyyMMdd --> record last modification date
    last_update = book.get('modificationDate')
    if last_update:
        try:
            last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
        except ValueError as e:
            cls.log.warn(
                'Cannot parse last update date from: %s, message: %s',
                last_update, e
            )

    language = book.get('language', 'spa')

    subjects = []
    for subject in book.get('subjects', []):
        subjects.append(
            SubjectData(type=Subject.TAG, identifier=subject, weight=100)
        )

    for subjectBisacCode in book.get('subjectsBisacCodes', []):
        subjects.append(
            SubjectData(type=Subject.BISAC, identifier=subjectBisacCode,
                        weight=100)
        )

    grade_level = book.get('gradeLevel')
    if grade_level:
        subject = SubjectData(
            type=Subject.GRADE_LEVEL, identifier=grade_level, weight=10
        )
        subjects.append(subject)

    # Work out the medium from the delivery formats.
    medium = None
    file_format = book.get('fileFormat')
    formats = []
    for format_received in book.get('formats', []):
        if format_received in cls.format_data_for_odilo_format:
            medium = cls.set_format(format_received, formats)
        elif format_received == cls.ACSM and file_format:
            medium = cls.set_format(
                format_received + '_' + file_format.upper(), formats
            )
        else:
            cls.log.warn('Unrecognized format received: %s', format_received)

    if not medium:
        medium = Edition.BOOK_MEDIUM

    identifiers = []
    isbn = book.get('isbn')
    if isbn:
        # Normalize ISBN-10 to ISBN-13.
        if isbnlib.is_isbn10(isbn):
            isbn = isbnlib.to_isbn13(isbn)
        identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

    # A cover
    links = []
    cover_image_url = book.get('coverImageUrl')
    if cover_image_url:
        image_data = cls.image_link_to_linkdata(
            cover_image_url, Hyperlink.THUMBNAIL_IMAGE
        )
        if image_data:
            links.append(image_data)

    original_image_url = book.get('originalImageUrl')
    if original_image_url:
        image_data = cls.image_link_to_linkdata(
            original_image_url, Hyperlink.IMAGE
        )
        if image_data:
            links.append(image_data)

    # Descriptions become links.
    description = book.get('description')
    if description:
        links.append(
            LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                     media_type="text/html")
        )

    metadata = Metadata(
        data_source=DataSource.ODILO,
        title=title,
        subtitle=subtitle,
        language=language,
        medium=medium,
        series=series,
        series_position=series_position,
        publisher=publisher,
        published=published,
        primary_identifier=primary_identifier,
        identifiers=identifiers,
        subjects=subjects,
        contributors=contributors,
        links=links,
        data_source_last_updated=last_update
    )

    metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
        availability
    )
    # 'active' --> means that the book exists but it's no longer in the collection
    # (it could be available again in the future)
    if not active:
        metadata.circulation.licenses_owned = 0
    metadata.circulation.formats = formats

    return metadata, active
def test_parse(self):
    """Parsing a single-work OCLC response populates identifiers,
    contributors, measurements and subjects on the Metadata.
    """
    identifier = self._identifier()
    tree = self.tree("single_work_response.xml")
    metadata = Metadata(
        data_source=DataSource.OCLC,
        primary_identifier=identifier
    )
    result = self.parser.parse(tree, metadata)
    eq_([identifier], result.identifiers)

    # Contributors
    [parker, tanner, hayford, melville] = result.contributors
    eq_('4947338', parker.viaf)
    eq_('n50050335', parker.lc)
    eq_([Contributor.EDITOR_ROLE], parker.roles)

    eq_('51716047', tanner.viaf)
    eq_('n79059764', tanner.lc)
    eq_(
        set([
            Contributor.UNKNOWN_ROLE, Contributor.EDITOR_ROLE,
            Contributor.INTRODUCTION_ROLE, Contributor.AUTHOR_ROLE
        ]),
        set(tanner.roles))

    eq_('34482742', hayford.viaf)
    eq_('n50025038', hayford.lc)
    eq_(set([Contributor.ASSOCIATED_ROLE, Contributor.EDITOR_ROLE]),
        set(hayford.roles))

    eq_('27068555', melville.viaf)
    eq_('n79006936', melville.lc)
    eq_([Contributor.AUTHOR_ROLE], melville.roles)
    eq_({'deathDate': '1891', 'birthDate': '1819'}, melville.extra)

    # Measurements
    def get_measurement(quantity):
        # Look up a measurement value by its human-readable name.
        [measurement] = [
            m.value for m in result.measurements
            if m.quantity_measured == self.parser.MEASUREMENT_MAPPING[quantity]
        ]
        return measurement

    eq_(46983, get_measurement("holdings"))
    eq_(2781, get_measurement("editions"))

    # Subjects
    def get_subjects(type):
        # Yield every subject of the given type, preserving order.
        for s in result.subjects:
            if s.type == type:
                yield s

    [ddc] = get_subjects("DDC")
    eq_("813.3", ddc.identifier)
    eq_(21183, ddc.weight)

    [lcc] = get_subjects("LCC")
    eq_("PS2384", lcc.identifier)
    eq_(22460, lcc.weight)

    fasts = list(get_subjects("FAST"))
    eq_([
        '1174284', '1174266', '801923', '1116147', '1174307',
        '1016699', '1110122', '1356235'
    ], [x.identifier for x in fasts])
    eq_([32058, 31482, 29933, 19086, 18913, 17294, 6893, 4512],
        [x.weight for x in fasts])
    eq_([
        'Whaling', 'Whales', 'Ahab, Captain (Fictitious character)',
        'Ship captains', 'Whaling ships', 'Mentally ill', 'Sea stories',
        'Moby Dick (Melville, Herman)'
    ], [x.name for x in fasts])
def add_with_metadata(self, collection_details):
    """Adds identifiers with their metadata to a Collection's catalog.

    :param collection_details: Identifies which Collection's catalog
        to modify.
    :return: An OPDS feed response listing one OPDSMessage per
        incoming URN, or a ProblemDetail if authentication fails.
    """
    client = authenticated_client_from_request(self._db)
    if isinstance(client, ProblemDetail):
        return client

    collection = collection_from_details(
        self._db, client, collection_details
    )

    data_source = DataSource.lookup(
        self._db, collection.name, autocreate=True
    )

    feed = feedparser.parse(request.data)
    entries = feed.get("entries", [])
    entries_by_urn = {entry.get('id'): entry for entry in entries}

    identifiers_by_urn, invalid_urns = Identifier.parse_urns(
        self._db, entries_by_urn.keys()
    )

    # One OPDSMessage per incoming URN, reporting what happened to it.
    # (A single initialization -- a previous revision created this
    # list twice, leaving the first assignment dead.)
    messages = []

    for urn in invalid_urns:
        messages.append(OPDSMessage(
            urn, INVALID_URN.status_code, INVALID_URN.detail
        ))

    for urn, identifier in identifiers_by_urn.items():
        entry = entries_by_urn[urn]
        status = HTTP_OK
        description = "Already in catalog"

        if identifier not in collection.catalog:
            collection.catalog_identifier(identifier)
            status = HTTP_CREATED
            description = "Successfully added"

        message = OPDSMessage(urn, status, description)

        # Get a cover if it exists.
        image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
        images = [
            l for l in entry.get("links", [])
            if l.get("rel") in image_types
        ]
        links = [
            LinkData(image.get("rel"), image.get("href"))
            for image in images
        ]

        # Create an edition to hold the title and author. LicensePool.calculate_work
        # refuses to create a Work when there's no title, and if we have a title, author
        # and language we can attempt to look up the edition in OCLC.
        title = entry.get("title") or "Unknown Title"
        author = ContributorData(
            sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
            roles=[Contributor.PRIMARY_AUTHOR_ROLE]
        )
        language = entry.get("dcterms_language")

        # Skip all presentation recalculation; the metadata is applied
        # purely to record the bibliographic facts.
        presentation = PresentationCalculationPolicy(
            choose_edition=False,
            set_edition_metadata=False,
            classify=False,
            choose_summary=False,
            calculate_quality=False,
            choose_cover=False,
            regenerate_opds_entries=False,
        )
        replace = ReplacementPolicy(presentation_calculation_policy=presentation)
        metadata = Metadata(
            data_source,
            primary_identifier=IdentifierData(identifier.type, identifier.identifier),
            title=title,
            language=language,
            contributors=[author],
            links=links,
        )

        edition, ignore = metadata.edition(self._db)
        metadata.apply(edition, collection, replace=replace)

        messages.append(message)

    title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
    url = self.collection_feed_url("add_with_metadata", collection)
    addition_feed = AcquisitionFeed(
        self._db, title, url, [], VerboseAnnotator,
        precomposed_entries=messages
    )

    return feed_response(addition_feed)