Example #1
    def test_choose_best_metadata(self):
        more_identifier = self._identifier(identifier_type=Identifier.NOVELIST_ID)
        less_identifier = self._identifier(identifier_type=Identifier.NOVELIST_ID)
        metadatas = [Metadata(DataSource.NOVELIST, primary_identifier=more_identifier)]

        # When only one Metadata object is given, that object is returned.
        result = self.novelist.choose_best_metadata(metadatas, self._identifier())
        assert True == isinstance(result, tuple)
        assert metadatas[0] == result[0]
        # A default confidence of 1.0 is returned.
        assert 1.0 == result[1]

        # When the top identifiers have equal representation, the method returns (None, None).
        metadatas.append(
            Metadata(DataSource.NOVELIST, primary_identifier=less_identifier)
        )
        assert (None, None) == self.novelist.choose_best_metadata(
            metadatas, self._identifier()
        )

        # But when one pulls ahead, we get the metadata object again.
        metadatas.append(
            Metadata(DataSource.NOVELIST, primary_identifier=more_identifier)
        )
        result = self.novelist.choose_best_metadata(metadatas, self._identifier())
        assert True == isinstance(result, tuple)
        metadata, confidence = result
        assert True == isinstance(metadata, Metadata)
        assert 0.67 == round(confidence, 2)
        assert more_identifier == metadata.primary_identifier
Example #2
    def test_choose_best_metadata(self):
        more_identifier = self._identifier(
            identifier_type=Identifier.NOVELIST_ID)
        less_identifier = self._identifier(
            identifier_type=Identifier.NOVELIST_ID)
        metadatas = [
            Metadata(DataSource.NOVELIST, primary_identifier=more_identifier)
        ]

        # When only one Metadata object is given, that object is returned.
        result = self.novelist.choose_best_metadata(metadatas,
                                                    self._identifier())
        eq_(True, isinstance(result, tuple))
        eq_(metadatas[0], result[0])
        # A default confidence of 1.0 is returned.
        eq_(1.0, result[1])

        # When the top identifiers have equal representation, the method returns (None, None).
        metadatas.append(
            Metadata(DataSource.NOVELIST, primary_identifier=less_identifier))
        eq_((None, None),
            self.novelist.choose_best_metadata(metadatas, self._identifier()))

        # But when one pulls ahead, we get the metadata object again.
        metadatas.append(
            Metadata(DataSource.NOVELIST, primary_identifier=more_identifier))
        result = self.novelist.choose_best_metadata(metadatas,
                                                    self._identifier())
        eq_(True, isinstance(result, tuple))
        metadata, confidence = result
        eq_(True, isinstance(metadata, Metadata))
        eq_(0.67, round(confidence, 2))
        eq_(more_identifier, metadata.primary_identifier)
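
A note on the contract these two tests pin down: choose_best_metadata returns the single candidate with confidence 1.0, returns (None, None) when the top primary identifiers are tied, and otherwise returns the winner with a confidence equal to its share of the candidates. A minimal sketch consistent with those assertions might look like the following; it is inferred from the tests, not taken from the production implementation, and the Counter-based tally is an assumption.

    from collections import Counter

    def choose_best_metadata(metadata_objects, identifier):
        # Sketch inferred from the tests above. `identifier` is unused
        # here; the real method may use it for tie-breaking or logging.
        tally = Counter(m.primary_identifier for m in metadata_objects)
        ranked = tally.most_common()
        top_identifier, top_count = ranked[0]
        if len(ranked) > 1 and ranked[1][1] == top_count:
            # The top identifiers have equal representation: no winner.
            return (None, None)
        confidence = top_count / float(len(metadata_objects))
        best = next(m for m in metadata_objects
                    if m.primary_identifier == top_identifier)
        return (best, confidence)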
Example #3
    def book_info_to_metadata(self, subgraph, book_info):
        """Filters raw book information to exclude irrelevant or unhelpful data.

        :returns: None if information is unhelpful; metadata object otherwise.
        """
        if not self._has_relevant_types(book_info):
            # This book is not available in any format we're
            # interested in from a metadata perspective.
            return None

        (oclc_id_type,
         oclc_id,
         titles,
         descriptions,
         subjects,
         creator_uris,
         publisher_names,
         publication_dates,
         example_uris) = self.extract_useful_data(subgraph, book_info)

        if not oclc_id_type or not oclc_id:
            return None

        self.log.info("Processing edition %s: %r", oclc_id, titles)
        metadata = Metadata(self.source)
        metadata.primary_identifier = IdentifierData(
            type=oclc_id_type, identifier=oclc_id
        )
        if titles:
            metadata.title = titles[0]
        for d in publication_dates:
            try:
                metadata.published = datetime.datetime.strptime(d[:4], "%Y")
            except Exception:
                # Not a parseable year; ignore this value.
                pass
Example #4
    def book_info_to_metadata(self, subgraph, book_info):
        """Filters raw book information to exclude irrelevant or unhelpful data.

        :returns: None if information is unhelpful; metadata object otherwise.
        """
        if not self._has_relevant_types(book_info):
            # This book is not available in any format we're
            # interested in from a metadata perspective.
            return None

        (oclc_id_type, oclc_id, titles, descriptions, subjects, creator_uris,
         publisher_names, publication_dates,
         example_uris) = self.extract_useful_data(subgraph, book_info)

        if not oclc_id_type or not oclc_id:
            return None

        self.log.info("Processing edition %s: %r", oclc_id, titles)
        metadata = Metadata(self.source)
        metadata.primary_identifier = IdentifierData(type=oclc_id_type,
                                                     identifier=oclc_id)
        if titles:
            metadata.title = titles[0]
        for d in publication_dates:
            try:
                metadata.published = datetime.datetime.strptime(d[:4], "%Y")
            except Exception:
                # Not a parseable year; ignore this value.
                pass
Example #5
    def test_related_books(self):
        # A book with no related books returns a ProblemDetail.
        with temp_config() as config:
            config['integrations'][Configuration.NOVELIST_INTEGRATION] = {}
            with self.app.test_request_context('/'):
                response = self.manager.work_controller.related(
                    self.datasource, self.identifier.type, self.identifier.identifier
                )
        eq_(404, response.status_code)
        eq_("http://librarysimplified.org/terms/problem/unknown-lane", response.uri)

        # Prep book with a book in its series and a recommendation.
        self.lp.presentation_edition.series = "Around the World"
        self.french_1.presentation_edition.series = "Around the World"
        SessionManager.refresh_materialized_views(self._db)

        source = DataSource.lookup(self._db, self.datasource)
        metadata = Metadata(source)
        mock_api = MockNoveListAPI()
        metadata.recommendations = [self.english_2.license_pools[0].identifier]
        mock_api.setup(metadata)

        # A grouped feed is returned with both of these related books
        with self.app.test_request_context('/'):
            response = self.manager.work_controller.related(
                self.datasource, self.identifier.type, self.identifier.identifier,
                novelist_api=mock_api
            )
        eq_(200, response.status_code)
        feed = feedparser.parse(response.data)
        eq_(3, len(feed['entries']))

        # One book is in the recommendations feed.
        [e1] = [e for e in feed['entries'] if e['title'] == self.english_2.title]
        [collection_link] = [link for link in e1['links'] if link['rel']=='collection']
        eq_("Recommended Books", collection_link['title'])
        work_url = "/works/%s/%s/%s/" % (self.datasource, self.identifier.type, self.identifier.identifier)
        expected = urllib.quote(work_url + 'recommendations')
        eq_(True, collection_link['href'].endswith(expected))

        # Two books are in the series feed: the original work and its companion.
        [e2] = [e for e in feed['entries'] if e['title'] == self.french_1.title]
        [collection_link] = [link for link in e2['links'] if link['rel']=='collection']
        eq_("Around the World", collection_link['title'])
        expected = urllib.quote(work_url + 'series')
        eq_(True, collection_link['href'].endswith(expected))

        [e3] = [e for e in feed['entries'] if e['title'] == self.english_1.title]
        [collection_link] = [link for link in e3['links'] if link['rel']=='collection']
        eq_("Around the World", collection_link['title'])
        expected = urllib.quote(work_url + 'series')
        eq_(True, collection_link['href'].endswith(expected))
Example #6
    def test_confirm_same_identifier(self):
        source = DataSource.lookup(self._db, DataSource.NOVELIST)
        identifier, ignore = Identifier.for_foreign_id(self._db,
                                                       Identifier.NOVELIST_ID,
                                                       '84752928')
        unmatched_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, '23781947')
        metadata = Metadata(source, primary_identifier=identifier)
        match = Metadata(source, primary_identifier=identifier)
        mistake = Metadata(source, primary_identifier=unmatched_identifier)

        eq_(False, self.novelist._confirm_same_identifier([metadata, mistake]))
        eq_(True, self.novelist._confirm_same_identifier([metadata, match]))
Example #7
    def test_confirm_same_identifier(self):
        source = DataSource.lookup(self._db, DataSource.NOVELIST)
        identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, "84752928"
        )
        unmatched_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, "23781947"
        )
        metadata = Metadata(source, primary_identifier=identifier)
        match = Metadata(source, primary_identifier=identifier)
        mistake = Metadata(source, primary_identifier=unmatched_identifier)

        assert False == self.novelist._confirm_same_identifier([metadata, mistake])
        assert True == self.novelist._confirm_same_identifier([metadata, match])
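
Both versions of this test pin down the same behavior: _confirm_same_identifier is true only when every Metadata object shares a single primary identifier. A minimal sketch, assuming nothing beyond what the assertions show:

    def _confirm_same_identifier(self, metadata_objects):
        # True only if every Metadata shares one primary identifier.
        return len(set(m.primary_identifier for m in metadata_objects)) == 1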
Example #8
    def extract_bibliographic(self, element):
        identifiers = []
        contributors = []
        identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))
        sort_name = element["author"]
        if not sort_name:
            sort_name = Edition.UNKNOWN_AUTHOR
        contributors.append(ContributorData(sort_name=sort_name))
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
        image_url = element["large_image"]
        thumbnail_url = element["large_image"]
        images = [
            LinkData(rel=Hyperlink.THUMBNAIL_IMAGE,
                     href=thumbnail_url,
                     media_type=Representation.PNG_MEDIA_TYPE),
            LinkData(rel=Hyperlink.IMAGE,
                     href=image_url,
                     media_type=Representation.PNG_MEDIA_TYPE)
        ]
        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element["title"],
            language="eng",
            medium=Edition.BOOK_MEDIUM,
            publisher=element["publisher"],
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=images,
        )
        licenses_owned = element["availability"]["totalCopies"]
        licenses_available = element["availability"]["availableCopies"]
        hold = element["availability"]["onHold"]
        drm_type = EnkiAPI.adobe_drm if (element["availability"]["accessType"]
                                         == 'acs') else EnkiAPI.no_drm
        formats = []
        formats.append(
            FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                       drm_scheme=drm_type))

        circulationdata = CirculationData(
            data_source=DataSource.ENKI,
            primary_identifier=primary_identifier,
            formats=formats,
            licenses_owned=int(licenses_owned),
            licenses_available=int(licenses_available),
            patrons_in_hold_queue=int(hold))

        metadata.circulation = circulationdata
        return metadata
Example #9
    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860')
        # We start off with availability information based on the
        # default for test data.
        eq_(1, licensepool.licenses_owned)

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier)
        metadata = Metadata(DataSource.AXIS_360, primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, licensepool = monitor.process_book(metadata,
                                                    self.AVAILABILITY_DATA)

        # Now we have information based on the CirculationData.
        eq_(9, licensepool.licenses_owned)
Example #10
    def test_recommendations(self):
        # Prep an empty recommendation.
        source = DataSource.lookup(self._db, self.datasource)
        metadata = Metadata(source)
        mock_api = MockNoveListAPI()
        mock_api.setup(metadata)

        SessionManager.refresh_materialized_views(self._db)
        with self.app.test_request_context('/'):
            response = self.manager.work_controller.recommendations(
                self.datasource, self.identifier.type, self.identifier.identifier,
                novelist_api=mock_api
            )
        eq_(200, response.status_code)
        feed = feedparser.parse(response.data)
        eq_('Recommended Books', feed['feed']['title'])
        eq_(0, len(feed['entries']))

        # Delete the cache and prep a recommendation result.
        [cached_empty_feed] = self._db.query(CachedFeed).all()
        self._db.delete(cached_empty_feed)
        metadata.recommendations = [self.english_2.license_pools[0].identifier]
        mock_api.setup(metadata)

        SessionManager.refresh_materialized_views(self._db)
        with self.app.test_request_context('/'):
            response = self.manager.work_controller.recommendations(
                self.datasource, self.identifier.type, self.identifier.identifier,
                novelist_api=mock_api
            )
        # A feed is returned with the proper recommendation.
        eq_(200, response.status_code)
        feed = feedparser.parse(response.data)
        eq_('Recommended Books', feed['feed']['title'])
        eq_(1, len(feed['entries']))
        [entry] = feed['entries']
        eq_(self.english_2.title, entry['title'])
        eq_(self.english_2.author, entry['author'])

        with temp_config() as config:
            with self.app.test_request_context('/'):
                config['integrations'][Configuration.NOVELIST_INTEGRATION] = {}
                response = self.manager.work_controller.recommendations(
                    self.datasource, self.identifier.type, self.identifier.identifier
                )
            eq_(404, response.status_code)
            eq_("http://librarysimplified.org/terms/problem/unknown-lane", response.uri)
Example #11
    def test_initialization(self):
        """Asserts that a RelatedBooksLane won't be initialized for a work
        without related books
        """

        # A book without a series or a contributor on a circ manager without
        # NoveList recommendations raises an error.
        self._db.delete(self.edition.contributions[0])
        self._db.commit()

        assert_raises(
            ValueError, RelatedBooksLane, self._default_library, self.work, ""
        )

        # A book with a contributor initializes a RelatedBooksLane.
        luthor, i = self._contributor('Luthor, Lex')
        self.edition.add_contributor(luthor, [Contributor.EDITOR_ROLE])

        result = RelatedBooksLane(self._default_library, self.work, '')
        eq_(self.work, result.work)
        [sublane] = result.children
        eq_(True, isinstance(sublane, ContributorLane))
        eq_(sublane.contributors, [luthor])

        # As does a book in a series.
        self.edition.series = "All By Myself"
        result = RelatedBooksLane(self._default_library, self.work, "")
        eq_(2, len(result.children))
        [contributor, series] = result.children
        eq_(True, isinstance(series, SeriesLane))

        # When NoveList is configured and recommendations are available,
        # a RecommendationLane will be included.
        self._external_integration(
            ExternalIntegration.NOVELIST,
            goal=ExternalIntegration.METADATA_GOAL, username=u'library',
            password=u'sure', libraries=[self._default_library]
        )
        mock_api = MockNoveListAPI(self._db)
        response = Metadata(
            self.edition.data_source, recommendations=[self._identifier()]
        )
        mock_api.setup(response)
        result = RelatedBooksLane(self._default_library, self.work, "", novelist_api=mock_api)
        eq_(3, len(result.children))

        # The book's language and audience list are passed down to all sublanes.
        eq_(['eng'], result.languages)
        for sublane in result.children:
            eq_(result.languages, sublane.languages)
            if isinstance(sublane, SeriesLane):
                eq_([result.source_audience], sublane.audiences)
            else:
                eq_(sorted(list(result.audiences)), sorted(list(sublane.audiences)))

        contributor, recommendations, series = result.children
        eq_(True, isinstance(recommendations, RecommendationLane))
        eq_(True, isinstance(series, SeriesLane))
        eq_(True, isinstance(contributor, ContributorLane))
Example #12
    def metadata(self):
        return Metadata(
            data_source=DataSource.NYPL_SHADOWCAT,
            title=self.title,
            identifiers=self.identifiers,
            subjects=self.subjects,
            links=self.links,
        )
Example #13
    def generate_mock_api(self):
        """Prep an empty NoveList result."""
        source = DataSource.lookup(self._db, DataSource.OVERDRIVE)
        metadata = Metadata(source)

        mock_api = MockNoveListAPI(self._db)
        mock_api.setup_method(metadata)
        return mock_api
Example #14
    def test_annotate_with_web_resources(self):
        metadata = Metadata(DataSource.CONTENT_CAFE)
        rel = self._str

        # We're going to be grabbing this URL and
        # scraping it.
        url_template = "http://url/%(arg1)s"
        args = dict(arg1='value')

        # A couple of useful functions for scraping.
        class MockScrapers(object):
            scrape_called = False
            explode_called = False

            def scrape(self, soup):
                self.scrape_called = True
                return [soup.find('content').string]

            def explode(self, soup):
                self.explode_called = True
                raise Exception("I'll never be called")

        scrapers = MockScrapers()

        # When the result of the HTTP request contains a certain phrase,
        # we don't even bother scraping.
        m = self.api.annotate_with_web_resources
        http = self.http
        http.queue_requests_response(200,
                                     'text/html',
                                     content='There is no data!')
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.explode)
        # We made the request but nothing happened.
        expect_url = url_template % args
        eq_(expect_url, self.http.requests.pop())
        eq_(False, scrapers.explode_called)
        eq_(None, metadata.title)
        eq_([], metadata.links)

        # Otherwise, we try to scrape.
        good_content = '<html><span class="PageHeader2">Book title</span><content>Here you go</content>'
        http.queue_requests_response(200, 'text/html', content=good_content)
        m(metadata, self.identifier, args, url_template, "no data!", rel,
          scrapers.scrape)
        eq_(True, scrapers.scrape_called)

        # We called _extract_title and took a Content Cafe title out
        # for the Metadata object.
        eq_("Book title", metadata.title)

        # Then we called the mock scrape() method, which gave us the content for
        # one LinkData.
        [link] = metadata.links
        eq_(rel, link.rel)
        eq_(None, link.href)
        eq_("text/html", link.media_type)
        eq_("Here you go", link.content)
Example #15
    def test_get_series_information(self):

        metadata = Metadata(data_source=DataSource.NOVELIST)
        vampire = json.loads(self.sample_data("vampire_kisses.json"))
        book_info = vampire['TitleInfo']
        series_info = vampire['FeatureContent']['SeriesInfo']

        (metadata, ideal_title_key) = self.novelist.get_series_information(
            metadata, series_info, book_info
        )
        # Relevant series information is extracted
        eq_('Vampire kisses manga', metadata.series)
        eq_(1, metadata.series_position)
        # The 'full_title' key should be returned as ideal because
        # all the volumes have the same 'main_title'
        eq_('full_title', ideal_title_key)


        watchman = json.loads(self.sample_data("alternate_series_example.json"))
        book_info = watchman['TitleInfo']
        series_info = watchman['FeatureContent']['SeriesInfo']
        # Confirms that the new example doesn't match any volume's full title
        eq_([], [v for v in series_info['series_titles']
                if v.get('full_title')==book_info.get('full_title')])

        # But it still finds its matching volume
        (metadata, ideal_title_key) = self.novelist.get_series_information(
            metadata, series_info, book_info
        )
        eq_('Elvis Cole/Joe Pike novels', metadata.series)
        eq_(11, metadata.series_position)
        # And recommends using the main_title
        eq_('main_title', ideal_title_key)

        # If the volume is found in the series more than once...
        book_info = dict(
            main_title='The Baby-Sitters Club',
            full_title='The Baby-Sitters Club: Claudia and Mean Janine'
        )
        series_info = dict(
            full_title='The Baby-Sitters Club series',
            series_titles=[
                # The volume is here twice!
                book_info,
                book_info,
                dict(
                    full_title='The Baby-Sitters Club',
                    main_title='The Baby-Sitters Club: Claudia and Mean Janine',
                    series_position='3.'
                )
            ]
        )
        # An error is raised.
        assert_raises(
            ValueError, self.novelist.get_series_information,
            metadata, series_info, book_info
        )
Example #16
    def process_item(self, identifier):
        edition = self.edition(identifier)
        metadata = Metadata.from_edition(edition)
        metadata.apply(edition, self.collection,
                       replace=self.replacement_policy)

        failure = self.register_work_for_calculation(identifier)
        if failure:
            return failure

        return identifier
Example #17
    def test_set_equivalence(self):
        edition = self._edition()
        edition.title = "The House on Mango Street"
        edition.add_contributor(Contributor(viaf="112460612"),
                                Contributor.AUTHOR_ROLE)
        identifier = edition.primary_identifier

        i1 = self._identifier()
        identifierdata1 = IdentifierData(type=i1.type,
                                         identifier=i1.identifier)
        good_metadata = Metadata(DataSource.lookup(self._db,
                                                   DataSource.GUTENBERG),
                                 primary_identifier=identifierdata1,
                                 title="The House on Mango Street",
                                 contributors=[Contributor(viaf="112460612")])

        i2 = self._identifier()
        identifierdata2 = IdentifierData(type=i2.type,
                                         identifier=i2.identifier)
        bad_metadata = Metadata(DataSource.lookup(self._db,
                                                  DataSource.GUTENBERG),
                                primary_identifier=identifierdata2,
                                title="Calvin & Hobbes",
                                contributors=[Contributor(viaf="101010")])

        self.provider.set_equivalence(identifier, good_metadata)
        self.provider.set_equivalence(identifier, bad_metadata)
        equivalencies = Equivalency.for_identifiers(self._db,
                                                    [identifier]).all()

        # The identifier for the bad metadata isn't made equivalent
        eq_([i1], [x.output for x in equivalencies])
        eq_([1], [x.strength for x in equivalencies])

        # But if the existing identifier has no editions, they're made equivalent.
        identifier = self._identifier()
        self.provider.set_equivalence(identifier, bad_metadata)
        equivalencies = Equivalency.for_identifiers(self._db,
                                                    [identifier]).all()
        eq_([i2], [x.output for x in equivalencies])
        eq_([1], [x.strength for x in equivalencies])
Example #18
    def process_item(self, identifier):
        edition = self.edition(identifier)
        metadata = Metadata.from_edition(edition)
        metadata.apply(edition,
                       self.collection,
                       replace=self.replacement_policy)

        failure = self.register_work_for_calculation(identifier)
        if failure:
            return failure

        return identifier
Example #19
    def test_add_author_notes(self):
        """Verify that add_author_notes works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("author_notes.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_author_notes(metadata, self.identifier, self.args)

        [notes] = metadata.links
        eq_(Hyperlink.AUTHOR, notes.rel)
        assert 'Brenda researched turtles' in notes.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)
    def test_add_excerpt(self):
        """Verify that add_excerpt works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("excerpt.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_excerpt(metadata, self.identifier, self.args)

        [excerpt] = metadata.links
        eq_(Hyperlink.SAMPLE, excerpt.rel)
        assert 'Franklin loved his marbles.' in excerpt.content

        # We incidentally figured out the book's title.
        eq_("Franklin's Christmas Gift", metadata.title)
Example #21
    def test_new_isbns(self):
        existing_id = self._identifier()
        metadata = Metadata(DataSource.lookup(self._db, DataSource.GUTENBERG),
                            identifiers=[
                                IdentifierData(type=Identifier.OCLC_WORK,
                                               identifier="abra"),
                                IdentifierData(
                                    type=existing_id.type,
                                    identifier=existing_id.identifier),
                                IdentifierData(type=Identifier.ISBN,
                                               identifier="kadabra"),
                            ])

        eq_(2, self.provider.new_isbns(metadata))
Example #22
    def setup(self):
        super(TestNoveListCoverageProvider, self).setup()
        with temp_config() as config:
            config['integrations'][Configuration.NOVELIST_INTEGRATION] = {
                Configuration.NOVELIST_PROFILE: "library",
                Configuration.NOVELIST_PASSWORD: "******"
            }
            self.novelist = NoveListCoverageProvider(self._db)
        self.novelist.api = MockNoveListAPI()

        self.metadata = Metadata(data_source=self.novelist.source,
                                 primary_identifier=self._identifier(
                                     identifier_type=Identifier.NOVELIST_ID),
                                 title=u"The Great American Novel")
    def test_add_reviews(self):
        """Verify that add_reviews works in a real case."""
        metadata = Metadata(DataSource.CONTENT_CAFE)
        content = self.data_file("reviews.html")
        self.http.queue_requests_response(200, 'text/html', content=content)
        self.api.add_reviews(metadata, self.identifier, self.args)

        # We extracted six reviews from the sample file.
        reviews = metadata.links
        eq_(6, len(reviews))
        assert all([x.rel == Hyperlink.REVIEW for x in reviews])
        assert "isn't a myth!" in reviews[0].content

        # We incidentally figured out the book's title.
        eq_("Shadow Thieves", metadata.title)
Example #24
    def setup(self):
        super(TestNoveListCoverageProvider, self).setup()
        self.integration = self._external_integration(
            ExternalIntegration.NOVELIST,
            ExternalIntegration.METADATA_GOAL,
            username=u'library',
            password=u'yep',
            libraries=[self._default_library])

        self.novelist = NoveListCoverageProvider(self._db)
        self.novelist.api = MockNoveListAPI.from_config(self._default_library)

        self.metadata = Metadata(data_source=self.novelist.data_source,
                                 primary_identifier=self._identifier(
                                     identifier_type=Identifier.NOVELIST_ID),
                                 title=u"The Great American Novel")
Example #25
            def _fetch_remote_availability(self, identifiers):
                for i, identifier in enumerate(identifiers):
                    # The first identifier in the list is still
                    # available.
                    identifier_data = IdentifierData(
                        type=identifier.type, identifier=identifier.identifier)
                    metadata = Metadata(data_source=DataSource.AXIS_360,
                                        primary_identifier=identifier_data)
                    availability = CirculationData(
                        data_source=DataSource.AXIS_360,
                        primary_identifier=identifier_data,
                        licenses_owned=7,
                        licenses_available=6)
                    yield metadata, availability

                    # The rest have been 'forgotten' by Axis 360.
                    break
Example #26
    def create_metadata(self, isbn_identifier):
        """Make a Metadata object for the given Identifier.

        The Metadata object may include a cover image, descriptions,
        reviews, an excerpt, author notes, and a popularity measurement.

        :return: A Metadata object, or None if Content Cafe has no
        knowledge of this ISBN.
        """
        isbn = isbn_identifier.identifier

        args = dict(userid=self.user_id, password=self.password, isbn=isbn)
        image_url = self.image_url % args
        response = self.do_get(image_url)
        if response.status_code == 404:
            # Content Cafe served us an HTML page instead of an
            # image. This indicates that Content Cafe has no knowledge
            # of this ISBN -- if it knew _anything_ it would have a
            # cover image. There is no need to build a Metadata object.
            return None

        media_type = response.headers.get('Content-Type', 'image/jpeg')

        # Start building a Metadata object.
        metadata = Metadata(self.data_source,
                            primary_identifier=isbn_identifier)

        # Add the cover image to it
        image = response.content
        if self.is_suitable_image(image):
            metadata.links.append(
                LinkData(rel=Hyperlink.IMAGE,
                         href=image_url,
                         media_type=media_type,
                         content=response.content))

        for annotator in (self.add_descriptions, self.add_excerpt,
                          self.add_reviews, self.add_author_notes):
            annotator(metadata, isbn_identifier, args)

        popularity = self.measure_popularity(isbn_identifier,
                                             self.ONE_YEAR_AGO)
        if popularity:
            metadata.measurements.append(popularity)
        return metadata
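
is_suitable_image and measure_popularity are used above but not shown in this excerpt. As a loudly hypothetical illustration only, the image check could be a simple size heuristic; the real rule for rejecting Content Cafe's stock placeholder image is not visible here.

    def is_suitable_image(self, image):
        # Hypothetical heuristic: reject empty or tiny payloads on the
        # assumption that they are placeholders rather than real covers.
        return image is not None and len(image) > 1024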
Example #27
            # level of confidence.
            for isbn in d.get('isbns', []):
                isbn13 = isbn.get('isbn13', None)
                if isbn13:
                    other_isbns.append(
                        IdentifierData(Identifier.ISBN, isbn13, 0.50))

        primary_isbn = primary_isbn13 or primary_isbn10
        if primary_isbn:
            primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

        contributors = []
        if display_author:
            contributors.append(ContributorData(display_name=display_author))

        metadata = Metadata(
            data_source=DataSource.NYT,
            title=title,
            medium=medium,
            language='eng',
            published=published_date,
            publisher=publisher,
            contributors=contributors,
            primary_identifier=primary_isbn,
            identifiers=other_isbns,
        )

        super(NYTBestSellerListTitle,
              self).__init__(metadata, first_appearance,
                             most_recent_appearance, annotation)
Example #28
    def lookup_info_to_metadata(self, lookup_representation):
        """Transforms a NoveList JSON representation into a Metadata object"""

        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)
        book_info = lookup_info['TitleInfo']
        if book_info:
            novelist_identifier = book_info.get('ui')
        if not book_info or not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier)
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get('author')
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get('description')
        if description:
            metadata.links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type=Representation.TEXT_PLAIN))

        audience_level = book_info.get('audience_level')
        if audience_level:
            metadata.subjects.append(
                SubjectData(Subject.FREEFORM_AUDIENCE, audience_level))

        novelist_rating = book_info.get('rating')
        if novelist_rating:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING, novelist_rating))

        # Extract feature content if it is available.
        series_info = None
        appeals_info = None
        lexile_info = None
        goodreads_info = None
        recommendations_info = None
        feature_content = lookup_info.get('FeatureContent')
        if feature_content:
            series_info = feature_content.get('SeriesInfo')
            appeals_info = feature_content.get('Appeals')
            lexile_info = feature_content.get('LexileInfo')
            goodreads_info = feature_content.get('GoodReads')
            recommendations_info = feature_content.get('SimilarTitles')

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info)
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(metadata.title,
                                                   book_info.get('full_title'))
        metadata.subtitle = self._scrub_subtitle(subtitle)

        # TODO: How well do we trust this data? We could conceivably bump up
        # the weight here.
        if appeals_info:
            extracted_genres = False
            for appeal in appeals_info:
                genres = appeal.get('genres')
                if genres:
                    for genre in genres:
                        metadata.subjects.append(
                            SubjectData(Subject.TAG, genre['Name']))
                        extracted_genres = True
                if extracted_genres:
                    break

        if lexile_info:
            metadata.subjects.append(
                SubjectData(Subject.LEXILE_SCORE, lexile_info['Lexile']))

        if goodreads_info:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING,
                                goodreads_info['average_rating']))

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (metadata.measurements or metadata.series_position
                or metadata.series or metadata.subjects or metadata.links
                or metadata.subtitle or metadata.recommendations):
            metadata = None
        return metadata
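
get_recommendations is called here but not shown. A hypothetical sketch of its likely shape, assuming the 'SimilarTitles' payload is a list of titles keyed by the same 'ui' NoveList identifier used elsewhere in this method (the actual payload format is an assumption):

    def get_recommendations(self, metadata, recommendations_info):
        # Hypothetical sketch; the SimilarTitles structure is assumed.
        if not recommendations_info:
            return metadata
        for title in recommendations_info.get('titles', []):
            ui = title.get('ui')
            if not ui:
                continue
            identifier, ignore = Identifier.for_foreign_id(
                self._db, Identifier.NOVELIST_ID, ui)
            metadata.recommendations.append(identifier)
        return metadata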
Example #29
    def test_initialization(self):
        # Asserts that a RelatedBooksLane won't be initialized for a work
        # without related books

        # A book without a series or a contributor on a circ manager without
        # NoveList recommendations raises an error.
        self._db.delete(self.edition.contributions[0])
        self._db.commit()

        pytest.raises(
            ValueError, RelatedBooksLane, self._default_library, self.work, ""
        )

        # A book with a contributor initializes a RelatedBooksLane.
        luthor, i = self._contributor("Luthor, Lex")
        self.edition.add_contributor(luthor, [Contributor.EDITOR_ROLE])

        result = RelatedBooksLane(self._default_library, self.work, "")
        assert self.work == result.work
        [sublane] = result.children
        assert True == isinstance(sublane, ContributorLane)
        assert sublane.contributor == luthor

        # As does a book in a series.
        self.edition.series = "All By Myself"
        result = RelatedBooksLane(self._default_library, self.work, "")
        assert 2 == len(result.children)
        [contributor, series] = result.children
        assert True == isinstance(series, SeriesLane)

        # When NoveList is configured and recommendations are available,
        # a RecommendationLane will be included.
        self._external_integration(
            ExternalIntegration.NOVELIST,
            goal=ExternalIntegration.METADATA_GOAL,
            username="******",
            password="******",
            libraries=[self._default_library],
        )
        mock_api = MockNoveListAPI(self._db)
        response = Metadata(
            self.edition.data_source, recommendations=[self._identifier()]
        )
        mock_api.setup_method(response)
        result = RelatedBooksLane(
            self._default_library, self.work, "", novelist_api=mock_api
        )
        assert 3 == len(result.children)

        [novelist_recommendations] = [
            x for x in result.children if isinstance(x, RecommendationLane)
        ]
        assert (
            "Similar titles recommended by NoveList"
            == novelist_recommendations.display_name
        )

        # The book's language and audience list are passed down to all sublanes.
        assert ["eng"] == result.languages
        for sublane in result.children:
            assert result.languages == sublane.languages
            if isinstance(sublane, SeriesLane):
                assert [result.source_audience] == sublane.audiences
            else:
                assert sorted(list(result.audiences)) == sorted(list(sublane.audiences))

        contributor, recommendations, series = result.children
        assert True == isinstance(recommendations, RecommendationLane)
        assert True == isinstance(series, SeriesLane)
        assert True == isinstance(contributor, ContributorLane)
Example #30
    def lookup_info_to_metadata(self, lookup_representation):
        """Transforms a NoveList JSON representation into a Metadata object"""

        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)
        book_info = lookup_info['TitleInfo']
        if book_info:
            novelist_identifier = book_info.get('ui')
        if not book_info or not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier
        )
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get('author')
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get('description')
        if description:
            metadata.links.append(LinkData(
                rel=Hyperlink.DESCRIPTION, content=description,
                media_type=Representation.TEXT_PLAIN
            ))

        audience_level = book_info.get('audience_level')
        if audience_level:
            metadata.subjects.append(SubjectData(
                Subject.FREEFORM_AUDIENCE, audience_level
            ))

        novelist_rating = book_info.get('rating')
        if novelist_rating:
            metadata.measurements.append(MeasurementData(
                Measurement.RATING, novelist_rating
            ))

        # Extract feature content if it is available.
        series_info = None
        appeals_info = None
        lexile_info = None
        goodreads_info = None
        recommendations_info = None
        feature_content = lookup_info.get('FeatureContent')
        if feature_content:
            series_info = feature_content.get('SeriesInfo')
            appeals_info = feature_content.get('Appeals')
            lexile_info = feature_content.get('LexileInfo')
            goodreads_info = feature_content.get('GoodReads')
            recommendations_info = feature_content.get('SimilarTitles')

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info
        )
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(
            metadata.title, book_info.get('full_title')
        )
        metadata.subtitle = self._scrub_subtitle(subtitle)

        if appeals_info:
            extracted_genres = False
            for appeal in appeals_info:
                genres = appeal.get('genres')
                if genres:
                    for genre in genres:
                        metadata.subjects.append(SubjectData(
                            Subject.TAG, genre['Name']
                        ))
                        extracted_genres = True
                if extracted_genres:
                    break

        if lexile_info:
            metadata.subjects.append(SubjectData(
                Subject.LEXILE_SCORE, lexile_info['Lexile']
            ))

        if goodreads_info:
            metadata.measurements.append(MeasurementData(
                Measurement.RATING, goodreads_info['average_rating']
            ))

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations
        ):
            metadata = None
        return metadata
Example #31
    def test_work_from_metadata(self):
        """Validate the ability to create a new Work from appropriate metadata.
        """

        class Mock(MockDirectoryImportScript):
            """In this test we need to verify that annotate_metadata
            was called but did nothing.
            """
            def annotate_metadata(self, metadata, *args, **kwargs):
                metadata.annotated = True
                return super(Mock, self).annotate_metadata(
                    metadata, *args, **kwargs
                )

        identifier = IdentifierData(Identifier.GUTENBERG_ID, "1003")
        identifier_obj, ignore = identifier.load(self._db)
        metadata = Metadata(
            DataSource.GUTENBERG,
            primary_identifier=identifier,
            title=u"A book"
        )
        metadata.annotated = False
        datasource = DataSource.lookup(self._db, DataSource.GUTENBERG)
        policy = ReplacementPolicy.from_license_source(self._db)
        mirror = MockS3Uploader()
        policy.mirror = mirror

        # Here, work_from_metadata calls annotate_metadata, but does
        # not actually import anything because there are no files 'on
        # disk' and thus no way to actually get the book.
        collection = self._default_collection
        args = (collection, metadata, policy, "cover directory",
                "ebook directory", RightsStatus.CC0)
        script = Mock(self._db)
        eq_(None, script.work_from_metadata(*args))
        eq_(True, metadata.annotated)

        # Now let's try it with some files 'on disk'.
        with open(self.sample_cover_path('test-book-cover.png')) as fh:
            image = fh.read()
        mock_filesystem = {
            'cover directory' : (
                'cover.jpg', Representation.JPEG_MEDIA_TYPE, image
            ),
            'ebook directory' : (
                'book.epub', Representation.EPUB_MEDIA_TYPE, "I'm an EPUB."
            )
        }
        script = MockDirectoryImportScript(
            self._db, mock_filesystem=mock_filesystem
        )
        work = script.work_from_metadata(*args)

        # We have created a book. It has a cover image, which has a
        # thumbnail.
        eq_("A book", work.title)
        assert work.cover_full_url.endswith(
            '/test.cover.bucket/Gutenberg/Gutenberg+ID/1003/1003.jpg'
        )
        assert work.cover_thumbnail_url.endswith(
            '/test.cover.bucket/scaled/300/Gutenberg/Gutenberg+ID/1003/1003.png'
        )
        [pool] = work.license_pools
        assert pool.open_access_download_url.endswith(
            '/test.content.bucket/Gutenberg/Gutenberg+ID/1003/A+book.epub'
        )

        eq_(RightsStatus.CC0,
            pool.delivery_mechanisms[0].rights_status.uri)

        # The mock S3Uploader has a record of 'uploading' all these files
        # to S3.
        epub, full, thumbnail = mirror.uploaded
        eq_(epub.url, pool.open_access_download_url)
        eq_(full.url, work.cover_full_url)
        eq_(thumbnail.url, work.cover_thumbnail_url)

        # The EPUB Representation was cleared out after the upload, to
        # save database space.
        eq_("I'm an EPUB.", mirror.content[0])
        eq_(None, epub.content)
Example #32
    def test_annotate_metadata(self):
        """Verify that annotate_metadata calls load_circulation_data
        and load_cover_link appropriately.
        """

        # First, test an unsuccessful annotation.
        class MockNoCirculationData(DirectoryImportScript):
            """Do nothing when load_circulation_data is called. Explode if
            load_cover_link is called.
            """
            def load_circulation_data(self, *args):
                self.load_circulation_data_args = args
                return None

            def load_cover_link(self, *args):
                raise Exception("Explode!")

        gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
        identifier = IdentifierData(Identifier.GUTENBERG_ID, "11111")
        identifier_obj, ignore = identifier.load(self._db)
        metadata = Metadata(
            title=self._str,
            data_source=gutenberg,
            primary_identifier=identifier
        )
        mirror = object()
        policy = ReplacementPolicy(mirror=mirror)
        cover_directory = object()
        ebook_directory = object()
        rights_uri = object()

        script = MockNoCirculationData(self._db)
        args = (metadata, policy, cover_directory, ebook_directory, rights_uri)
        script.annotate_metadata(*args)

        # load_circulation_data was called.
        eq_(
            (identifier_obj, gutenberg, ebook_directory, mirror,
             metadata.title, rights_uri),
            script.load_circulation_data_args
        )

        # But because load_circulation_data returned None,
        # metadata.circulation_data was not modified and
        # load_cover_link was not called (which would have raised an
        # exception).
        eq_(None, metadata.circulation)

        # Test a successful annotation with no cover image.
        class MockNoCoverLink(DirectoryImportScript):
            """Return an object when load_circulation_data is called.
            Do nothing when load_cover_link is called.
            """
            def load_circulation_data(self, *args):
                return "Some circulation data"

            def load_cover_link(self, *args):
                self.load_cover_link_args = args
                return None

        script = MockNoCoverLink(self._db)
        script.annotate_metadata(*args)

        # The Metadata object was annotated with the return value of
        # load_circulation_data.
        eq_("Some circulation data", metadata.circulation)

        # load_cover_link was called.
        eq_(
            (identifier_obj, gutenberg, cover_directory, mirror),
            script.load_cover_link_args
        )

        # But since it provided no cover link, metadata.links was empty.
        eq_([], metadata.links)

        # Finally, test a completely successful annotation.
        class MockWithCoverLink(DirectoryImportScript):
            """Mock success for both load_circulation_data
            and load_cover_link.
            """
            def load_circulation_data(self, *args):
                return "Some circulation data"

            def load_cover_link(self, *args):
                return "A cover link"

        metadata.circulation = None
        script = MockWithCoverLink(self._db)
        script.annotate_metadata(*args)

        eq_("Some circulation data", metadata.circulation)
        eq_(['A cover link'], metadata.links)
Example #33
    def extract_bibliographic(self, element):
        """Extract Metadata and CirculationData from a dictionary
        of information from Enki.

        :return: A Metadata with attached CirculationData.
        """
        # TODO: it's not clear what these are or whether we'd find them
        # useful:
        #  dateSaved
        #  length
        #  publishDate
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

        identifiers = []
        identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

        contributors = []
        sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
        contributors.append(ContributorData(sort_name=sort_name))

        links = []
        description = element.get('description')
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION, content=description,
                         media_type="text/html")
            )

        # NOTE: When this method is called by, e.g. updated_titles(),
        # the large and small images are available separately. When
        # this method is called by get_item(), we only get a single
        # image, in 'cover'. In get_item() we request the 'large' version
        # of that image, which means we'll be filing it as a normal-sized
        # image.
        #
        full_image = None
        thumbnail_image = None
        for key, rel in (
                ('cover', Hyperlink.IMAGE),
                ('small_image', Hyperlink.THUMBNAIL_IMAGE),
                ('large_image', Hyperlink.IMAGE)
        ):
            url = element.get(key)
            if not url:
                continue
            link = LinkData(
                rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE
            )
            if rel == Hyperlink.THUMBNAIL_IMAGE:
                # Don't add a thumbnail to the list of links -- wait
                # until the end and then make it a thumbnail of the
                # primary image.
                thumbnail_image = link
            else:
                if rel == Hyperlink.IMAGE:
                    full_image = link
                links.append(link)

        if thumbnail_image:
            if full_image:
                # Set the thumbnail as the thumbnail _of_ the full image.
                full_image.thumbnail = thumbnail_image
            else:
                # Treat the thumbnail as the full image.
                thumbnail_image.rel = Hyperlink.IMAGE
                links.append(thumbnail_image)

        # We treat 'subject', 'topic', and 'genre' as interchangeable
        # sets of tags. This data is based on BISAC but it's not reliably
        # presented in a form that can be parsed as BISAC.
        subjects = []
        seen_topics = set()
        for key in ('subject', 'topic', 'genre'):
            for topic in element.get(key, []):
                if not topic or topic in seen_topics:
                    continue
                subjects.append(SubjectData(Subject.TAG, topic))
                seen_topics.add(topic)

        language_code = element.get("language", "English")
        language = self.LANGUAGE_CODES.get(language_code, "eng")

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element.get("title"),
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=element.get("publisher"),
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=links,
            subjects=subjects,
        )
        circulationdata = self.extract_circulation(
            primary_identifier, element.get('availability', {}), element.get('formattype', None)
        )
        metadata.circulation = circulationdata
        return metadata
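
extract_circulation is not shown in this excerpt, but Example #8 builds the same CirculationData inline, so a faithful sketch can reuse those availability fields (totalCopies, availableCopies, onHold, and the 'acs' accessType flag). How the real method uses formattype is an assumption.

    def extract_circulation(self, primary_identifier, availability,
                            formattype):
        # Sketch based on the inline construction in Example #8.
        if not availability:
            return None
        drm_type = EnkiAPI.no_drm
        if availability.get("accessType") == "acs":
            drm_type = EnkiAPI.adobe_drm
        formats = [
            FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                       drm_scheme=drm_type),
        ]
        return CirculationData(
            data_source=DataSource.ENKI,
            primary_identifier=primary_identifier,
            formats=formats,
            licenses_owned=int(availability.get("totalCopies", 0)),
            licenses_available=int(availability.get("availableCopies", 0)),
            patrons_in_hold_queue=int(availability.get("onHold", 0)),
        )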
Example #34
    def test_improve_description(self):
        # Here's a Metadata that has a bad (truncated) description.
        metadata = Metadata(self.data_source)

        bad_description = LinkData(
            rel=Hyperlink.DESCRIPTION,
            media_type="text/plain",
            content=u"The Discourse on the Method is a philosophical and mathematical treatise published by Ren\xe9 Descartes in 1637. Its full name is Discourse on the Method of Rightly Conducting the Reason, and Searching for Truth in the Sciences (French title: Discour...",
        )

        irrelevant_description = LinkData(
            rel=Hyperlink.DESCRIPTION, media_type="text/plain",
            content="Don't look at me; I'm irrelevant!"
        )

        # Sending an HTTP request to this URL is going to give a 404 error.
        alternate = LinkData(rel=Hyperlink.ALTERNATE, href="http://foo/",
                             media_type=OPDSFeed.ENTRY_TYPE)

        # We're not even going to try to send an HTTP request to this URL
        # because it doesn't promise an OPDS entry.
        alternate2 = LinkData(rel=Hyperlink.ALTERNATE, href="http://bar/",
                             media_type="text/html")

        # But this URL will give us full information about this
        # entry, including a better description.
        alternate3 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://baz/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # This URL will not be requested because the third alternate URL
        # gives us the answer we're looking for.
        alternate4 = LinkData(
            rel=Hyperlink.ALTERNATE, href="http://qux/",
            media_type=OPDSFeed.ENTRY_TYPE
        )

        # Two requests will be made. The first will result in a 404
        # error. The second will give us an OPDS entry.
        self.http.queue_response(404, content="Not found")
        self.http.queue_response(200, OPDSFeed.ENTRY_TYPE,
                                 content=self.sample_file("677.atom"))

        metadata.links = [bad_description, irrelevant_description,
                          alternate, alternate2, alternate3, alternate4]

        self.importer.improve_description("some ID", metadata)

        # The descriptions have been removed from metadata.links,
        # because 677.atom included a description we know was better.
        #
        # The incomplete description was removed even though 677.atom
        # also included a copy of it.
        assert bad_description not in metadata.links
        assert irrelevant_description not in metadata.links

        # The more complete description from 677.atom has been added.
        [good_description] = [
            x for x in metadata.links if x.rel == Hyperlink.DESCRIPTION
        ]

        # The four alternate links have not been touched.
        assert alternate in metadata.links
        assert alternate2 in metadata.links
        assert alternate3 in metadata.links
        assert alternate4 in metadata.links

        # Two HTTP requests were made.
        assert ['http://foo/', 'http://baz/'] == self.http.requests
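The lookup order this test pins down can be sketched in isolation. In this minimal sketch the Link namedtuple, the fetch() stub, and the ENTRY_TYPE string are stand-ins, not the importer's real code; the media type is the value OPDSFeed.ENTRY_TYPE is assumed to hold:

from collections import namedtuple

Link = namedtuple("Link", ["rel", "href", "media_type"])

# Assumed value of OPDSFeed.ENTRY_TYPE.
ENTRY_TYPE = "application/atom+xml;type=entry;profile=opds-catalog"

links = [
    Link("alternate", "http://foo/", ENTRY_TYPE),   # tried first; 404 in the test
    Link("alternate", "http://bar/", "text/html"),  # skipped: wrong media type
    Link("alternate", "http://baz/", ENTRY_TYPE),   # tried second; succeeds
    Link("alternate", "http://qux/", ENTRY_TYPE),   # never reached
]

def fetch(url):
    # Stub standing in for the mock HTTP client in the test above.
    return (404, None) if url == "http://foo/" else (200, "<entry/>")

requested, entry = [], None
for link in links:
    if link.media_type != ENTRY_TYPE:
        continue
    requested.append(link.href)
    status, body = fetch(link.href)
    if status == 200:
        entry = body
        break

assert requested == ["http://foo/", "http://baz/"]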
Example #36
    def parse_book(cls, collection, g, uri, title):
        """Turn an RDF graph into a Edition for the given `uri` and
        `title`.
        """
        source_id = unicode(cls.ID_IN_URI.search(uri).groups()[0])
        primary_identifier = IdentifierData(
            Identifier.GUTENBERG_ID, source_id
        )

        # Split a subtitle out from the main title.
        title = unicode(title)
        subtitle = None
        for separator in "\r\n", "\n":
            if separator in title:
                parts = title.split(separator)
                title = parts[0]
                subtitle = "\n".join(parts[1:])
                break

        issued = cls._value(g, (uri, cls.dcterms.issued, None))
        issued = datetime.datetime.strptime(issued, cls.DATE_FORMAT).date()

        rights = cls._value(g, (uri, cls.dcterms.rights, None))
        if rights:
            rights = str(rights)
        else:
            rights = ''
        rights_uri = RightsStatus.rights_uri_from_string(rights)

        # As far as I can tell, Gutenberg descriptions are 100%
        # useless for our purposes. They should not be used, even if
        # no other description is available.

        publisher = cls._value(g, (uri, cls.dcterms.publisher, None))

        languages = []
        for ignore, ignore, language_uri in g.triples(
                (uri, cls.dcterms.language, None)):
            code = str(cls._value(g, (language_uri, cls.rdf.value, None)))
            code = LanguageCodes.two_to_three[code]
            if code:
                languages.append(code)

        if 'eng' in languages:
            language = 'eng'
        elif languages:
            language = languages[0]
        else:
            language = None

        contributors = []
        for ignore, ignore, author_uri in g.triples((uri, cls.dcterms.creator, None)):
            name = cls._value(g, (author_uri, cls.gutenberg.name, None))
            aliases = cls._values(g, (author_uri, cls.gutenberg.alias, None))
            contributors.append(ContributorData(
                sort_name=name,
                aliases=aliases,
                roles=[Contributor.AUTHOR_ROLE],
            ))

        subjects = []
        subject_links = cls._values(g, (uri, cls.dcterms.subject, None))
        for subject in subject_links:
            value = cls._value(g, (subject, cls.rdf.value, None))
            vocabulary = cls._value(g, (subject, cls.dcam.memberOf, None))
            vocabulary = Subject.by_uri[str(vocabulary)]
            subjects.append(SubjectData(vocabulary, value))

        medium = Edition.BOOK_MEDIUM

        # Turn the Gutenberg download links into Hyperlinks associated 
        # with the new Edition. They will serve either as open access
        # downloads or cover images.
        download_links = cls._values(g, (uri, cls.dcterms.hasFormat, None))
        links = [LinkData(
            rel=Hyperlink.CANONICAL,
            href=str(uri),
        )]

        # Gutenberg won't allow us to use any of the download or image
        # links--we have to make our own from an rsynced mirror--but
        # we can look through the links to determine which medium to
        # assign to this book.
        formats = []
        for href in download_links:
            for format_uri in cls._values(
                    g, (href, cls.dcterms['format'], None)):
                media_type = unicode(
                    cls._value(g, (format_uri, cls.rdf.value, None)))
                if media_type.startswith('audio/'):
                    medium = Edition.AUDIO_MEDIUM
                    formats.append(FormatData(
                        content_type=Representation.MP3_MEDIA_TYPE,
                        drm_scheme=DeliveryMechanism.NO_DRM,
                    ))
                elif media_type.startswith('video/'):
                    medium = Edition.VIDEO_MEDIUM
                else:
                    formats.append(FormatData(
                        content_type=Representation.EPUB_MEDIA_TYPE,
                        drm_scheme=DeliveryMechanism.NO_DRM,
                        rights_uri=rights_uri,
                    ))

        _db = Session.object_session(collection)
        metadata = Metadata(
            data_source=DataSource.GUTENBERG,
            title=title,
            subtitle=subtitle,
            language=language,
            publisher=publisher,
            issued=issued,
            medium=medium,
            primary_identifier=primary_identifier,
            subjects=subjects,
            contributors=contributors,
            links=links,
        )
        edition, new = metadata.edition(_db)
        metadata.apply(edition, collection)

        # Ensure that an open-access LicensePool exists for this book.
        circulation_data = CirculationData(
            data_source=DataSource.GUTENBERG,
            primary_identifier=primary_identifier,
            formats=formats,
            default_rights_uri=rights_uri,
            links=links,
        )

        license_pool, new_license_pool = circulation_data.license_pool(
            _db, collection
        )
        replace = ReplacementPolicy(formats=True)
        circulation_data.apply(_db, collection, replace=replace)
        license_pool.calculate_work()
        return edition, license_pool, new
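The subtitle-splitting step in parse_book is easy to exercise on its own; a minimal, self-contained sketch:

def split_title(title):
    # The first line of a multi-line Gutenberg title becomes the title;
    # the remaining lines become the subtitle.
    subtitle = None
    for separator in ("\r\n", "\n"):
        if separator in title:
            parts = title.split(separator)
            title = parts[0]
            subtitle = "\n".join(parts[1:])
            break
    return title, subtitle

assert split_title("Moby Dick\r\nOr, The Whale") == ("Moby Dick", "Or, The Whale")
assert split_title("Moby Dick") == ("Moby Dick", None)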
Example #37
class TestCirculationMonitor(Axis360Test):

    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(type=Identifier.AXIS_360_ID,
                                          identifier=u'0003642860'),
        identifiers=[
            IdentifierData(type=Identifier.ISBN, identifier=u'9780375504587')
        ],
        contributors=[
            ContributorData(sort_name=u"McCain, John",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE]),
            ContributorData(sort_name=u"Salter, Mark",
                            roles=[Contributor.AUTHOR_ROLE]),
        ],
        subjects=[
            SubjectData(type=Subject.BISAC,
                        identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE, identifier=u'Adult'),
        ],
    )

    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def test_process_book(self):
        integration, ignore = create(
            self._db,
            ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, license_pool = monitor.process_book(self.BIBLIOGRAPHIC_DATA,
                                                     self.AVAILABILITY_DATA)
        eq_(u'Faith of My Fathers : A Family Memoir', edition.title)
        eq_(u'eng', edition.language)
        eq_(u'Random House Inc', edition.publisher)
        eq_(u'Random House Inc2', edition.imprint)

        eq_(Identifier.AXIS_360_ID, edition.primary_identifier.type)
        eq_(u'0003642860', edition.primary_identifier.identifier)

        [isbn] = [
            x for x in edition.equivalent_identifiers()
            if x is not edition.primary_identifier
        ]
        eq_(Identifier.ISBN, isbn.type)
        eq_(u'9780375504587', isbn.identifier)

        eq_(
            ["McCain, John", "Salter, Mark"],
            sorted([x.sort_name for x in edition.contributors]),
        )

        subs = sorted((x.subject.type, x.subject.identifier)
                      for x in edition.primary_identifier.classifications)
        eq_([(Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
             (Subject.FREEFORM_AUDIENCE, u'Adult')], subs)

        eq_(9, license_pool.licenses_owned)
        eq_(8, license_pool.licenses_available)
        eq_(0, license_pool.patrons_in_hold_queue)
        eq_(datetime.datetime(2015, 5, 20, 2, 9, 8), license_pool.last_checked)

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        events = license_pool.circulation_events
        eq_([
            u'distributor_title_add', u'distributor_check_in',
            u'distributor_license_add'
        ], [x.type for x in events])
        for e in events:
            eq_(e.start, license_pool.last_checked)

        # A presentation-ready work has been created for the LicensePool.
        work = license_pool.work
        eq_(True, work.presentation_ready)
        eq_("Faith of My Fathers : A Family Memoir", work.title)

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        records = [
            x for x in license_pool.identifier.coverage_records if
            x.data_source.name == DataSource.AXIS_360 and x.operation is None
        ]
        eq_(1, len(records))

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860')
        # We start off with availability information based on the
        # default for test data.
        eq_(1, licensepool.licenses_owned)

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier)
        metadata = Metadata(DataSource.AXIS_360, primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, licensepool = monitor.process_book(metadata,
                                                    self.AVAILABILITY_DATA)

        # Now we have information based on the CirculationData.
        eq_(9, licensepool.licenses_owned)
Example #38
    def extract_bibliographic(self, element):
        """Extract Metadata and CirculationData from a dictionary
        of information from Enki.

        :return: A Metadata with attached CirculationData.
        """
        # TODO: it's not clear what these are or whether we'd find them
        # useful:
        #  dateSaved
        #  length
        #  publishDate
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

        identifiers = []
        identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

        contributors = []
        sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
        contributors.append(ContributorData(sort_name=sort_name))

        links = []
        description = element.get("description")
        if description:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=description,
                    media_type="text/html",
                )
            )

        # NOTE: When this method is called by, e.g., updated_titles(),
        # the large and small images are available separately. When
        # this method is called by get_item(), we only get a single
        # image, in 'cover'. In get_item() we ask for the 'large' version
        # of that image, which means we'll file it as a normal-sized image.
        #
        full_image = None
        thumbnail_image = None
        for key, rel in (
            ("cover", Hyperlink.IMAGE),
            ("small_image", Hyperlink.THUMBNAIL_IMAGE),
            ("large_image", Hyperlink.IMAGE),
        ):
            url = element.get(key)
            if not url:
                continue
            link = LinkData(rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE)
            if rel == Hyperlink.THUMBNAIL_IMAGE:
                # Don't add a thumbnail to the list of links -- wait
                # until the end and then make it a thumbnail of the
                # primary image.
                thumbnail_image = link
            else:
                if rel == Hyperlink.IMAGE:
                    full_image = link
                links.append(link)

        if thumbnail_image:
            if full_image:
                # Set the thumbnail as the thumbnail _of_ the full image.
                full_image.thumbnail = thumbnail_image
            else:
                # Treat the thumbnail as the full image.
                thumbnail_image.rel = Hyperlink.IMAGE
                links.append(thumbnail_image)

        # We treat 'subject', 'topic', and 'genre' as interchangeable
        # sets of tags. This data is based on BISAC but it's not reliably
        # presented in a form that can be parsed as BISAC.
        subjects = []
        seen_topics = set()
        for key in ("subject", "topic", "genre"):
            for topic in element.get(key, []):
                if not topic or topic in seen_topics:
                    continue
                subjects.append(
                    SubjectData(
                        Subject.TAG,
                        topic,
                        weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                    )
                )
                seen_topics.add(topic)

        language_code = element.get("language", "English")
        language = self.LANGUAGE_CODES.get(language_code, "eng")

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element.get("title"),
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=element.get("publisher"),
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=links,
            subjects=subjects,
        )
        circulationdata = self.extract_circulation(
            primary_identifier,
            element.get("availability", {}),
            element.get("formattype", None),
        )
        metadata.circulation = circulationdata
        return metadata
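A self-contained sketch of the image-pairing rule above; the Link class and the rel strings are stand-ins for LinkData and the Hyperlink constants:

class Link(object):
    def __init__(self, rel, href):
        self.rel = rel
        self.href = href
        self.thumbnail = None

IMAGE, THUMBNAIL = "image", "thumbnail"

def pair_images(full_image, thumbnail_image, links):
    if thumbnail_image:
        if full_image:
            # Attach the thumbnail to the full image.
            full_image.thumbnail = thumbnail_image
        else:
            # Promote the thumbnail to serve as the full image.
            thumbnail_image.rel = IMAGE
            links.append(thumbnail_image)
    return links

# Both present: the thumbnail hangs off the full image.
full = Link(IMAGE, "http://img/large.png")
thumb = Link(THUMBNAIL, "http://img/small.png")
assert pair_images(full, thumb, [full]) == [full] and full.thumbnail is thumb

# Thumbnail only: it is promoted to full-image status.
thumb = Link(THUMBNAIL, "http://img/small.png")
links = pair_images(None, thumb, [])
assert links == [thumb] and thumb.rel == IMAGE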
Example #39
    def record_info_to_metadata(cls, book, availability):
        """Turn Odilo's JSON representation of a book into a Metadata
        object.

        Note: the JSON passed into this method comes from a different
        file/stream than the JSON that goes into book_info_to_circulation().
        """
        if 'id' not in book:
            return None

        odilo_id = book['id']
        primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
        active = book.get('active')

        title = book.get('title')
        subtitle = book.get('subtitle')
        series = book.get('series')
        series_position = book.get('seriesPosition')

        contributors = []
        sort_author = book.get('author')
        if sort_author:
            roles = [Contributor.AUTHOR_ROLE]
            display_author = sort_name_to_display_name(sort_author)
            contributor = ContributorData(sort_name=sort_author,
                                          display_name=display_author,
                                          roles=roles,
                                          biography=None)
            contributors.append(contributor)

        publisher = book.get('publisher')

        # Metadata --> Marc21 260$c
        published = book.get('publicationDate')
        if not published:
            # yyyyMMdd --> record creation date
            published = book.get('releaseDate')

        if published:
            try:
                published = datetime.datetime.strptime(published, "%Y%m%d")
            except ValueError as e:
                cls.log.warn('Cannot parse publication date from: ' +
                             published + ', message: ' + e.message)

        # yyyyMMdd --> record last modification date
        last_update = book.get('modificationDate')
        if last_update:
            try:
                last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
            except ValueError as e:
                cls.log.warn('Cannot parse last update date from: ' +
                             last_update + ', message: ' + e.message)

        language = book.get('language', 'spa')

        subjects = []
        for subject in book.get('subjects', []):
            subjects.append(
                SubjectData(type=Subject.TAG, identifier=subject, weight=100))

        for subjectBisacCode in book.get('subjectsBisacCodes', []):
            subjects.append(
                SubjectData(type=Subject.BISAC,
                            identifier=subjectBisacCode,
                            weight=100))

        grade_level = book.get('gradeLevel')
        if grade_level:
            subject = SubjectData(type=Subject.GRADE_LEVEL,
                                  identifier=grade_level,
                                  weight=10)
            subjects.append(subject)

        medium = None
        file_format = book.get('fileFormat')
        formats = []
        for format_received in book.get('formats', []):
            if format_received in cls.format_data_for_odilo_format:
                medium = cls.set_format(format_received, formats)
            elif format_received == cls.ACSM and file_format:
                medium = cls.set_format(
                    format_received + '_' + file_format.upper(), formats)
            else:
                cls.log.warn('Unrecognized format received: ' +
                             format_received)

        if not medium:
            medium = Edition.BOOK_MEDIUM

        identifiers = []
        isbn = book.get('isbn')
        if isbn:
            if isbnlib.is_isbn10(isbn):
                isbn = isbnlib.to_isbn13(isbn)
            identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

        # A cover
        links = []
        cover_image_url = book.get('coverImageUrl')
        if cover_image_url:
            image_data = cls.image_link_to_linkdata(cover_image_url,
                                                    Hyperlink.THUMBNAIL_IMAGE)
            if image_data:
                links.append(image_data)

        original_image_url = book.get('originalImageUrl')
        if original_image_url:
            image_data = cls.image_link_to_linkdata(original_image_url,
                                                    Hyperlink.IMAGE)
            if image_data:
                links.append(image_data)

        # Descriptions become links.
        description = book.get('description')
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type="text/html"))

        metadata = Metadata(data_source=DataSource.ODILO,
                            title=title,
                            subtitle=subtitle,
                            language=language,
                            medium=medium,
                            series=series,
                            series_position=series_position,
                            publisher=publisher,
                            published=published,
                            primary_identifier=primary_identifier,
                            identifiers=identifiers,
                            subjects=subjects,
                            contributors=contributors,
                            links=links,
                            data_source_last_updated=last_update)

        metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
            availability)
        # A book that is not 'active' still exists but is no longer in the
        # collection (it could become available again in the future).
        if not active:
            metadata.circulation.licenses_owned = 0
        metadata.circulation.formats = formats

        return metadata, active
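The two normalization steps above, date parsing and ISBN upgrading, can be demonstrated with the real isbnlib package; the sample values reuse the ISBN from Example #37:

import datetime
import isbnlib  # the same third-party package the extractor uses

# Odilo dates arrive as yyyyMMdd strings.
published = datetime.datetime.strptime("20000307", "%Y%m%d")
assert published == datetime.datetime(2000, 3, 7)

# ISBN-10s are upgraded to ISBN-13 before being stored as identifiers.
isbn = "0375504583"
if isbnlib.is_isbn10(isbn):
    isbn = isbnlib.to_isbn13(isbn)
assert isbn == "9780375504587"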
Example #40
    def test_parse(self):
        identifier = self._identifier()
        tree = self.tree("single_work_response.xml")
        metadata = Metadata(data_source=DataSource.OCLC,
                            primary_identifier=identifier)
        result = self.parser.parse(tree, metadata)
        eq_([identifier], result.identifiers)

        # Contributors
        [parker, tanner, hayford, melville] = result.contributors
        eq_('4947338', parker.viaf)
        eq_('n50050335', parker.lc)
        eq_([Contributor.EDITOR_ROLE], parker.roles)

        eq_('51716047', tanner.viaf)
        eq_('n79059764', tanner.lc)
        eq_(
            set([
                Contributor.UNKNOWN_ROLE, Contributor.EDITOR_ROLE,
                Contributor.INTRODUCTION_ROLE, Contributor.AUTHOR_ROLE
            ]), set(tanner.roles))

        eq_('34482742', hayford.viaf)
        eq_('n50025038', hayford.lc)
        eq_(set([Contributor.ASSOCIATED_ROLE, Contributor.EDITOR_ROLE]),
            set(hayford.roles))

        eq_('27068555', melville.viaf)
        eq_('n79006936', melville.lc)
        eq_([Contributor.AUTHOR_ROLE], melville.roles)
        eq_({'deathDate': '1891', 'birthDate': '1819'}, melville.extra)

        # Measurements
        def get_measurement(quantity):
            [measurement] = [
                m.value for m in result.measurements if m.quantity_measured ==
                self.parser.MEASUREMENT_MAPPING[quantity]
            ]
            return measurement

        eq_(46983, get_measurement("holdings"))
        eq_(2781, get_measurement("editions"))

        # Subjects
        def get_subjects(type):
            for s in result.subjects:
                if s.type == type:
                    yield s

        [ddc] = get_subjects("DDC")
        eq_("813.3", ddc.identifier)
        eq_(21183, ddc.weight)

        [lcc] = get_subjects("LCC")
        eq_("PS2384", lcc.identifier)
        eq_(22460, lcc.weight)

        fasts = list(get_subjects("FAST"))
        eq_([
            '1174284', '1174266', '801923', '1116147', '1174307', '1016699',
            '1110122', '1356235'
        ], [x.identifier for x in fasts])
        eq_([32058, 31482, 29933, 19086, 18913, 17294, 6893, 4512],
            [x.weight for x in fasts])
        eq_([
            'Whaling', 'Whales', 'Ahab, Captain (Fictitious character)',
            'Ship captains', 'Whaling ships', 'Mentally ill', 'Sea stories',
            'Moby Dick (Melville, Herman)'
        ], [x.name for x in fasts])
Example #41
    def add_with_metadata(self, collection_details):
        """Adds identifiers with their metadata to a Collection's catalog"""
        client = authenticated_client_from_request(self._db)
        if isinstance(client, ProblemDetail):
            return client

        collection = collection_from_details(
            self._db, client, collection_details
        )

        data_source = DataSource.lookup(
            self._db, collection.name, autocreate=True
        )

        messages = []

        feed = feedparser.parse(request.data)
        entries = feed.get("entries", [])
        entries_by_urn = {entry.get('id'): entry for entry in entries}

        identifiers_by_urn, invalid_urns = Identifier.parse_urns(
            self._db, entries_by_urn.keys()
        )

        for urn in invalid_urns:
            messages.append(OPDSMessage(
                urn, INVALID_URN.status_code, INVALID_URN.detail
            ))

        for urn, identifier in identifiers_by_urn.items():
            entry = entries_by_urn[urn]
            status = HTTP_OK
            description = "Already in catalog"

            if identifier not in collection.catalog:
                collection.catalog_identifier(identifier)
                status = HTTP_CREATED
                description = "Successfully added"

            message = OPDSMessage(urn, status, description)

            # Get a cover if it exists.
            image_types = set([Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE])
            images = [l for l in entry.get("links", []) if l.get("rel") in image_types]
            links = [LinkData(image.get("rel"), image.get("href")) for image in images]

            # Create an edition to hold the title and author. LicensePool.calculate_work
            # refuses to create a Work when there's no title, and if we have a title, author
            # and language we can attempt to look up the edition in OCLC.
            title = entry.get("title") or "Unknown Title"
            author = ContributorData(
                sort_name=(entry.get("author") or Edition.UNKNOWN_AUTHOR),
                roles=[Contributor.PRIMARY_AUTHOR_ROLE]
            )
            language = entry.get("dcterms_language")

            presentation = PresentationCalculationPolicy(
                choose_edition=False,
                set_edition_metadata=False,
                classify=False,
                choose_summary=False,
                calculate_quality=False,
                choose_cover=False,
                regenerate_opds_entries=False,
            )
            replace = ReplacementPolicy(presentation_calculation_policy=presentation)
            metadata = Metadata(
                data_source,
                primary_identifier=IdentifierData(identifier.type, identifier.identifier),
                title=title,
                language=language,
                contributors=[author],
                links=links,
            )

            edition, ignore = metadata.edition(self._db)
            metadata.apply(edition, collection, replace=replace)

            messages.append(message)

        title = "%s Catalog Item Additions for %s" % (collection.protocol, client.url)
        url = self.collection_feed_url("add_with_metadata", collection)
        addition_feed = AcquisitionFeed(
            self._db, title, url, [], VerboseAnnotator,
            precomposed_entries=messages
        )

        return feed_response(addition_feed)
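The cover-link extraction above can be sketched end to end with the real feedparser package; the rel URIs below are assumed values for Hyperlink.IMAGE and Hyperlink.THUMBNAIL_IMAGE:

import feedparser

ATOM = """<?xml version="1.0"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <entry>
    <id>urn:isbn:9780375504587</id>
    <title>Faith of My Fathers</title>
    <link rel="http://opds-spec.org/image" href="http://example.com/cover.png"/>
    <link rel="alternate" href="http://example.com/entry"/>
  </entry>
</feed>"""

feed = feedparser.parse(ATOM)
entries_by_urn = {entry.get("id"): entry for entry in feed.get("entries", [])}

# Assumed values of Hyperlink.IMAGE and Hyperlink.THUMBNAIL_IMAGE.
image_types = {"http://opds-spec.org/image",
               "http://opds-spec.org/image/thumbnail"}

entry = entries_by_urn["urn:isbn:9780375504587"]
covers = [l for l in entry.get("links", []) if l.get("rel") in image_types]
assert [c.get("href") for c in covers] == ["http://example.com/cover.png"]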