예제 #1
0
    def extract_bibliographic(self, element):
        """Turn one Enki title record into a Metadata object.

        :param element: A dictionary describing a single title. The keys
            "isbn", "id", "large_image", "title", "publisher" and
            "availability" are required; "availability" must itself
            contain "totalCopies", "availableCopies", "onHold" and
            "accessType". The "author" key is optional.
        :return: A Metadata whose ``circulation`` attribute is set to a
            CirculationData built from the "availability" information.
        :raises KeyError: If one of the required keys is missing.
        """
        identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

        # A missing or empty author falls back to the "unknown author"
        # placeholder. Using .get() means a record with no "author" key
        # at all is handled the same way as one with an empty value,
        # instead of raising KeyError.
        sort_name = element.get("author") or Edition.UNKNOWN_AUTHOR
        contributors = [ContributorData(sort_name=sort_name)]

        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

        # NOTE(review): the thumbnail link is built from the same
        # "large_image" URL as the full-size image -- confirm whether a
        # smaller image should be used here instead.
        image_url = element["large_image"]
        thumbnail_url = element["large_image"]
        images = [
            LinkData(rel=Hyperlink.THUMBNAIL_IMAGE,
                     href=thumbnail_url,
                     media_type=Representation.PNG_MEDIA_TYPE),
            LinkData(rel=Hyperlink.IMAGE,
                     href=image_url,
                     media_type=Representation.PNG_MEDIA_TYPE),
        ]

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element["title"],
            language="eng",
            medium=Edition.BOOK_MEDIUM,
            publisher=element["publisher"],
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=images,
        )

        availability = element["availability"]
        licenses_owned = availability["totalCopies"]
        licenses_available = availability["availableCopies"]
        hold = availability["onHold"]

        # An access type of 'acs' selects Adobe DRM; anything else is
        # treated as DRM-free.
        if availability["accessType"] == 'acs':
            drm_type = EnkiAPI.adobe_drm
        else:
            drm_type = EnkiAPI.no_drm
        formats = [
            FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                       drm_scheme=drm_type)
        ]

        circulationdata = CirculationData(
            data_source=DataSource.ENKI,
            primary_identifier=primary_identifier,
            formats=formats,
            licenses_owned=int(licenses_owned),
            licenses_available=int(licenses_available),
            patrons_in_hold_queue=int(hold))

        metadata.circulation = circulationdata
        return metadata
예제 #2
0
    def extract_bibliographic(self, element):
        """Convert a dictionary of Enki title information into a
        Metadata object that carries its own CirculationData.

        :param element: A dictionary of information from Enki.
        :return: A Metadata with attached CirculationData.
        """
        # TODO: the purpose of these fields is unclear, so they are
        # currently ignored:
        #  dateSaved
        #  length
        #  publishDate
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
        identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

        author = element.get("author", None)
        if not author:
            author = Edition.UNKNOWN_AUTHOR
        contributors = [ContributorData(sort_name=author)]

        links = []
        summary = element.get("description")
        if summary:
            description_link = LinkData(
                rel=Hyperlink.DESCRIPTION,
                content=summary,
                media_type="text/html",
            )
            links.append(description_link)

        # NOTE: Callers such as updated_titles() supply the large and
        # small images under separate keys. get_item() supplies just
        # one image, under 'cover', and requests the 'large' size for
        # it, so that image is filed as a normal-sized image.
        image_keys = (
            ("cover", Hyperlink.IMAGE),
            ("small_image", Hyperlink.THUMBNAIL_IMAGE),
            ("large_image", Hyperlink.IMAGE),
        )
        full_image = None
        thumbnail_image = None
        for key, rel in image_keys:
            url = element.get(key)
            if url:
                link = LinkData(
                    rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE
                )
                if rel == Hyperlink.THUMBNAIL_IMAGE:
                    # Hold the thumbnail back for now; it is attached
                    # to the primary image (or promoted) after the loop.
                    thumbnail_image = link
                else:
                    full_image = link
                    links.append(link)

        if thumbnail_image:
            if not full_image:
                # No full-size image was found, so the thumbnail
                # stands in for it.
                thumbnail_image.rel = Hyperlink.IMAGE
                links.append(thumbnail_image)
            else:
                # Hang the thumbnail off the full-size image.
                full_image.thumbnail = thumbnail_image

        # 'subject', 'topic' and 'genre' are all treated as one pool of
        # tags. The data derives from BISAC, but it is not reliably in
        # a form that can be parsed as BISAC.
        subjects = []
        seen_topics = set()
        for key in ("subject", "topic", "genre"):
            for topic in element.get(key, []):
                if topic and topic not in seen_topics:
                    seen_topics.add(topic)
                    subjects.append(
                        SubjectData(
                            Subject.TAG,
                            topic,
                            weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                        )
                    )

        language = self.LANGUAGE_CODES.get(
            element.get("language", "English"), "eng"
        )

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element.get("title"),
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=element.get("publisher"),
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=links,
            subjects=subjects,
        )

        availability = element.get("availability", {})
        format_type = element.get("formattype", None)
        metadata.circulation = self.extract_circulation(
            primary_identifier, availability, format_type
        )
        return metadata
예제 #3
0
    def record_info_to_metadata(cls, book, availability):
        """Turn Odilo's JSON representation of a book into a Metadata
        object.

        Note: `book` and `availability` come from different
        files/streams. `availability` is not inspected here; it is
        handed as-is to record_info_to_circulation().

        :param book: A dictionary of bibliographic information about
            one record.
        :param availability: Availability information, passed through to
            OdiloRepresentationExtractor.record_info_to_circulation().
        :return: None if `book` has no 'id' key. Otherwise a 2-tuple
            (metadata, active), where `active` is the value of the
            record's 'active' field (None if absent).
        """
        if 'id' not in book:
            return None

        def parse_date(raw, label):
            """Parse a yyyyMMdd string into a datetime.

            Empty values are returned unchanged. An unparseable value is
            logged and replaced with None so that a raw string is never
            passed on where a datetime is expected.
            """
            if not raw:
                return raw
            try:
                return datetime.datetime.strptime(raw, "%Y%m%d")
            except ValueError as e:
                # str(e) is used via %s because exceptions have no
                # .message attribute on Python 3.
                cls.log.warn(
                    'Cannot parse %s date from: %s, message: %s'
                    % (label, raw, e)
                )
                return None

        odilo_id = book['id']
        primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
        active = book.get('active')

        title = book.get('title')
        subtitle = book.get('subtitle')
        series = book.get('series')
        series_position = book.get('seriesPosition')

        contributors = []
        sort_author = book.get('author')
        if sort_author:
            roles = [Contributor.AUTHOR_ROLE]
            display_author = sort_name_to_display_name(sort_author)
            contributor = ContributorData(sort_name=sort_author,
                                          display_name=display_author,
                                          roles=roles,
                                          biography=None)
            contributors.append(contributor)

        publisher = book.get('publisher')

        # Metadata --> Marc21 260$c
        # Fall back to the record creation date (yyyyMMdd) when there is
        # no publication date.
        published = parse_date(
            book.get('publicationDate') or book.get('releaseDate'),
            'publication'
        )

        # yyyyMMdd --> record last modification date
        last_update = parse_date(book.get('modificationDate'), 'last update')

        language = book.get('language', 'spa')

        subjects = [
            SubjectData(type=Subject.TAG, identifier=subject, weight=100)
            for subject in book.get('subjects', [])
        ]
        subjects.extend(
            SubjectData(type=Subject.BISAC,
                        identifier=bisac_code,
                        weight=100)
            for bisac_code in book.get('subjectsBisacCodes', [])
        )

        grade_level = book.get('gradeLevel')
        if grade_level:
            subjects.append(
                SubjectData(type=Subject.GRADE_LEVEL,
                            identifier=grade_level,
                            weight=10))

        # set_format() is given the `formats` list and returns the
        # medium; the medium from the last recognized format wins.
        medium = None
        file_format = book.get('fileFormat')
        formats = []
        for format_received in book.get('formats', []):
            if format_received in cls.format_data_for_odilo_format:
                medium = cls.set_format(format_received, formats)
            elif format_received == cls.ACSM and file_format:
                medium = cls.set_format(
                    format_received + '_' + file_format.upper(), formats)
            else:
                cls.log.warn('Unrecognized format received: ' +
                             format_received)

        if not medium:
            medium = Edition.BOOK_MEDIUM

        identifiers = []
        isbn = book.get('isbn')
        if isbn:
            # Normalize ISBN-10 values to ISBN-13.
            if isbnlib.is_isbn10(isbn):
                isbn = isbnlib.to_isbn13(isbn)
            identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

        # Cover images: 'coverImageUrl' is filed as the thumbnail and
        # 'originalImageUrl' as the full-size image.
        links = []
        for key, rel in (
            ('coverImageUrl', Hyperlink.THUMBNAIL_IMAGE),
            ('originalImageUrl', Hyperlink.IMAGE),
        ):
            url = book.get(key)
            if url:
                image_data = cls.image_link_to_linkdata(url, rel)
                if image_data:
                    links.append(image_data)

        # Descriptions become links.
        description = book.get('description')
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type="text/html"))

        metadata = Metadata(data_source=DataSource.ODILO,
                            title=title,
                            subtitle=subtitle,
                            language=language,
                            medium=medium,
                            series=series,
                            series_position=series_position,
                            publisher=publisher,
                            published=published,
                            primary_identifier=primary_identifier,
                            identifiers=identifiers,
                            subjects=subjects,
                            contributors=contributors,
                            links=links,
                            data_source_last_updated=last_update)

        metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
            availability)
        # A record that is not 'active' still exists but is no longer in
        # the collection (it could be available again in the future), so
        # report zero owned licenses for it.
        if not active:
            metadata.circulation.licenses_owned = 0
        metadata.circulation.formats = formats

        return metadata, active
예제 #4
0
    def test_annotate_metadata(self):
        """annotate_metadata should call load_circulation_data and then,
        only if that produced something, load_cover_link.
        """

        # Scenario 1: no circulation data can be loaded.
        class NoCirculation(DirectoryImportScript):
            """load_circulation_data records its arguments and finds
            nothing; load_cover_link must never be reached.
            """
            def load_circulation_data(self, *args):
                self.load_circulation_data_args = args
                return None

            def load_cover_link(self, *args):
                raise Exception("Explode!")

        data_source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        identifier_data = IdentifierData(Identifier.GUTENBERG_ID, "11111")
        identifier, _ = identifier_data.load(self._db)
        metadata = Metadata(
            title=self._str,
            data_source=data_source,
            primary_identifier=identifier_data,
        )
        mirror = object()
        policy = ReplacementPolicy(mirror=mirror)
        cover_directory = object()
        ebook_directory = object()
        rights_uri = object()
        call_args = (
            metadata, policy, cover_directory, ebook_directory, rights_uri
        )

        script = NoCirculation(self._db)
        script.annotate_metadata(*call_args)

        # load_circulation_data received the expected arguments.
        expected_circulation_args = (
            identifier, data_source, ebook_directory, mirror,
            metadata.title, rights_uri,
        )
        eq_(expected_circulation_args, script.load_circulation_data_args)

        # Since nothing was loaded, metadata.circulation is still unset.
        # Reaching this line at all shows load_cover_link was skipped,
        # because calling it would have raised.
        eq_(None, metadata.circulation)

        # Scenario 2: circulation data loads, but there is no cover.
        class NoCover(DirectoryImportScript):
            """load_circulation_data returns a value; load_cover_link
            records its arguments and finds nothing.
            """
            def load_circulation_data(self, *args):
                return "Some circulation data"

            def load_cover_link(self, *args):
                self.load_cover_link_args = args
                return None

        script = NoCover(self._db)
        script.annotate_metadata(*call_args)

        # The circulation data ended up on the Metadata object.
        eq_("Some circulation data", metadata.circulation)

        # load_cover_link received the expected arguments...
        expected_cover_args = (identifier, data_source, cover_directory, mirror)
        eq_(expected_cover_args, script.load_cover_link_args)

        # ...but returned no link, so metadata.links stays empty.
        eq_([], metadata.links)

        # Scenario 3: both the circulation data and the cover load.
        class EverythingLoads(DirectoryImportScript):
            """Both loaders return a value."""
            def load_circulation_data(self, *args):
                return "Some circulation data"

            def load_cover_link(self, *args):
                return "A cover link"

        metadata.circulation = None
        script = EverythingLoads(self._db)
        script.annotate_metadata(*call_args)

        eq_("Some circulation data", metadata.circulation)
        eq_(['A cover link'], metadata.links)
예제 #5
0
    def test_annotate_metadata(self):
        """Check how annotate_metadata uses load_circulation_data and
        load_cover_link across three outcomes: nothing loads, only the
        circulation data loads, and both load.
        """

        # Outcome 1: the circulation data cannot be loaded.
        class CirculationMissing(DirectoryImportScript):
            """Remember the load_circulation_data arguments and return
            nothing. Blow up if load_cover_link is ever invoked.
            """
            def load_circulation_data(self, *args):
                self.load_circulation_data_args = args
                return None

            def load_cover_link(self, *args):
                raise Exception("Explode!")

        source = DataSource.lookup(self._db, DataSource.GUTENBERG)
        primary = IdentifierData(Identifier.GUTENBERG_ID, "11111")
        primary_obj, _ = primary.load(self._db)
        metadata = Metadata(
            title=self._str,
            data_source=source,
            primary_identifier=primary,
        )
        mirror = object()
        policy = ReplacementPolicy(mirror=mirror)
        cover_directory = object()
        ebook_directory = object()
        rights_uri = object()
        annotate_args = (
            metadata, policy, cover_directory, ebook_directory, rights_uri
        )

        script = CirculationMissing(self._db)
        script.annotate_metadata(*annotate_args)

        # The circulation loader was invoked with these arguments.
        eq_(
            (primary_obj, source, ebook_directory, mirror,
             metadata.title, rights_uri),
            script.load_circulation_data_args
        )

        # It returned None, so metadata.circulation was left alone. No
        # exception was raised, so load_cover_link was not called.
        eq_(None, metadata.circulation)

        # Outcome 2: circulation data is found but no cover image is.
        class CoverMissing(DirectoryImportScript):
            """Return circulation data; remember the load_cover_link
            arguments and return nothing.
            """
            def load_circulation_data(self, *args):
                return "Some circulation data"

            def load_cover_link(self, *args):
                self.load_cover_link_args = args
                return None

        script = CoverMissing(self._db)
        script.annotate_metadata(*annotate_args)

        # The loader's return value was stored on the Metadata.
        eq_("Some circulation data", metadata.circulation)

        # The cover loader was invoked with these arguments.
        eq_(
            (primary_obj, source, cover_directory, mirror),
            script.load_cover_link_args
        )

        # With no cover link available, no links were added.
        eq_([], metadata.links)

        # Outcome 3: both loaders succeed.
        class BothPresent(DirectoryImportScript):
            """Return a value from both loaders."""
            def load_circulation_data(self, *args):
                return "Some circulation data"

            def load_cover_link(self, *args):
                return "A cover link"

        metadata.circulation = None
        script = BothPresent(self._db)
        script.annotate_metadata(*annotate_args)

        eq_("Some circulation data", metadata.circulation)
        eq_(['A cover link'], metadata.links)
예제 #6
0
    def extract_bibliographic(self, element):
        """Build a Metadata object, with CirculationData attached, out
        of a dictionary of information from Enki.

        :param element: A dictionary of information from Enki.
        :return: A Metadata with attached CirculationData.
        """
        # TODO: these fields are not used because it is unclear what
        # they mean or whether they would be useful:
        #  dateSaved
        #  length
        #  publishDate
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
        identifiers = [IdentifierData(Identifier.ISBN, element["isbn"])]

        author_name = element.get("author", None)
        if not author_name:
            author_name = Edition.UNKNOWN_AUTHOR
        contributors = [ContributorData(sort_name=author_name)]

        links = []
        blurb = element.get('description')
        if blurb:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=blurb,
                    media_type="text/html",
                )
            )

        # NOTE: updated_titles() and similar callers provide separate
        # large and small images. get_item() provides a single image
        # under 'cover' and asks for it at 'large' size, so it is filed
        # as a normal-sized image.
        image_sources = (
            ('cover', Hyperlink.IMAGE),
            ('small_image', Hyperlink.THUMBNAIL_IMAGE),
            ('large_image', Hyperlink.IMAGE),
        )
        full_image = None
        thumbnail_image = None
        for key, rel in image_sources:
            url = element.get(key)
            if url:
                image_link = LinkData(
                    rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE
                )
                if rel == Hyperlink.THUMBNAIL_IMAGE:
                    # Keep the thumbnail out of the link list for now;
                    # it is attached to the primary image afterwards.
                    thumbnail_image = image_link
                else:
                    full_image = image_link
                    links.append(image_link)

        if thumbnail_image:
            if not full_image:
                # Only a thumbnail exists, so use it as the full image.
                thumbnail_image.rel = Hyperlink.IMAGE
                links.append(thumbnail_image)
            else:
                # Record the thumbnail as belonging to the full image.
                full_image.thumbnail = thumbnail_image

        # 'subject', 'topic' and 'genre' are handled as interchangeable
        # tag lists. The data is BISAC-based but is not reliably in a
        # form that can be parsed as BISAC.
        subjects = []
        seen_topics = set()
        for key in ('subject', 'topic', 'genre'):
            for topic in element.get(key, []):
                if topic and topic not in seen_topics:
                    seen_topics.add(topic)
                    subjects.append(SubjectData(Subject.TAG, topic))

        language = self.LANGUAGE_CODES.get(
            element.get("language", "English"), "eng"
        )

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element.get("title"),
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=element.get("publisher"),
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=links,
            subjects=subjects,
        )

        availability = element.get('availability', {})
        format_type = element.get('formattype', None)
        metadata.circulation = self.extract_circulation(
            primary_identifier, availability, format_type
        )
        return metadata