Exemplo n.º 1
0
    def test_explicit_formatdata(self):
        # Creating an edition with an open-access download will
        # automatically create a delivery mechanism.
        edition, pool = self._edition(with_open_access_download=True)

        # Let's also add a DRM format.
        drm_format = FormatData(
            content_type=Representation.PDF_MEDIA_TYPE,
            drm_scheme=DeliveryMechanism.ADOBE_DRM,
        )

        circulation_data = CirculationData(formats=[drm_format],
                            data_source=edition.data_source, 
                            primary_identifier=edition.primary_identifier)
        circulation_data.apply(pool)

        [epub, pdf] = sorted(pool.delivery_mechanisms, 
                             key=lambda x: x.delivery_mechanism.content_type)
        eq_(epub.resource, edition.license_pool.best_open_access_link)

        eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
        eq_(DeliveryMechanism.ADOBE_DRM, pdf.delivery_mechanism.drm_scheme)

        # If we tell Metadata to replace the list of formats, we only
        # have the one format we manually created.
        replace = ReplacementPolicy(
                formats=True,
            )
        circulation_data.apply(pool, replace=replace)
        [pdf] = pool.delivery_mechanisms
        eq_(Representation.PDF_MEDIA_TYPE, pdf.delivery_mechanism.content_type)
Exemplo n.º 2
0
    def test_rights_status_commercial_link_with_rights(self):
        identifier = IdentifierData(
            Identifier.OVERDRIVE_ID,
            "abcd",
        )
        link = LinkData(
            rel=Hyperlink.DRM_ENCRYPTED_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
            rights_uri=RightsStatus.IN_COPYRIGHT,
        )
        format = FormatData(
            content_type=link.media_type,
            drm_scheme=DeliveryMechanism.ADOBE_DRM,
            link=link,
            rights_uri=RightsStatus.IN_COPYRIGHT,
        )

        circulation_data = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=identifier,
            links=[link],
            formats=[format],
        )

        replace = ReplacementPolicy(
            formats=True,
        )

        pool, ignore = circulation_data.license_pool(self._db)
        circulation_data.apply(pool, replace)
        eq_(False, pool.open_access)
        eq_(1, len(pool.delivery_mechanisms))
        eq_(RightsStatus.IN_COPYRIGHT, pool.delivery_mechanisms[0].rights_status.uri)
Exemplo n.º 3
0
    def test_circulationdata_can_be_deepcopied(self):
        # Check that we didn't put something in the CirculationData that
        # will prevent it from being copied. (e.g., self.log)

        subject = SubjectData(Subject.TAG, "subject")
        contributor = ContributorData()
        identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
        link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
        format = FormatData(Representation.EPUB_MEDIA_TYPE, DeliveryMechanism.NO_DRM)
        rights_uri = RightsStatus.GENERIC_OPEN_ACCESS

        circulation_data = CirculationData(
            DataSource.GUTENBERG,
            primary_identifier=identifier,
            links=[link],
            licenses_owned=5,
            licenses_available=5,
            licenses_reserved=None,
            patrons_in_hold_queue=None,
            formats=[format],
            default_rights_uri=rights_uri,
        )

        circulation_data_copy = deepcopy(circulation_data)

        # If deepcopy didn't throw an exception we're ok.
        assert circulation_data_copy is not None
Exemplo n.º 4
0
    def test_license_pool_sets_default_license_values(self):
        """We have no information about how many copies of the book we've
        actually licensed, but a LicensePool can be created anyway,
        so we can store format information.
        """
        identifier = IdentifierData(Identifier.OVERDRIVE_ID, "1")
        drm_format = FormatData(
            content_type=Representation.PDF_MEDIA_TYPE,
            drm_scheme=DeliveryMechanism.ADOBE_DRM,
        )
        circulation = CirculationData(
            data_source=DataSource.OVERDRIVE,
            primary_identifier=identifier,
            formats=[drm_format],
        )
        pool, is_new = circulation.license_pool(
            self._db,
        )
        eq_(True, is_new)

        # We start with the conservative assumption that we own no
        # licenses for the book.
        eq_(0, pool.licenses_owned)
        eq_(0, pool.licenses_available)
        eq_(0, pool.licenses_reserved)
        eq_(0, pool.patrons_in_hold_queue)
Exemplo n.º 5
0
    def test_apply_removes_old_formats_based_on_replacement_policy(self):
        edition, pool = self._edition(with_license_pool=True)

        # Start with one delivery mechanism for this pool.
        for lpdm in pool.delivery_mechanisms:
            self._db.delete(lpdm)

        old_lpdm = pool.set_delivery_mechanism(Representation.PDF_MEDIA_TYPE,
                                               DeliveryMechanism.ADOBE_DRM,
                                               RightsStatus.IN_COPYRIGHT, None)

        # And it has been loaned.
        patron = self._patron()
        loan, ignore = pool.loan_to(patron, fulfillment=old_lpdm)
        eq_(old_lpdm, loan.fulfillment)

        # We have new circulation data that has a different format.
        format = FormatData(
            content_type=Representation.EPUB_MEDIA_TYPE,
            drm_scheme=DeliveryMechanism.ADOBE_DRM,
        )
        circulation_data = CirculationData(
            formats=[format],
            data_source=edition.data_source,
            primary_identifier=edition.primary_identifier,
        )

        # If we apply the new CirculationData with formats false in the policy,
        # we'll add the new format, but keep the old one as well.
        replacement_policy = ReplacementPolicy(formats=False)
        circulation_data.apply(self._db, pool.collection, replacement_policy)

        eq_(2, pool.delivery_mechanisms.count())
        eq_(
            set([
                Representation.PDF_MEDIA_TYPE, Representation.EPUB_MEDIA_TYPE
            ]),
            set([
                lpdm.delivery_mechanism.content_type
                for lpdm in pool.delivery_mechanisms
            ]))
        eq_(old_lpdm, loan.fulfillment)

        # But if we make formats true in the policy, we'll delete the old format
        # and remove it from its loan.
        replacement_policy = ReplacementPolicy(formats=True)
        circulation_data.apply(self._db, pool.collection, replacement_policy)

        eq_(1, pool.delivery_mechanisms.count())
        eq_(Representation.EPUB_MEDIA_TYPE,
            pool.delivery_mechanisms[0].delivery_mechanism.content_type)
        eq_(None, loan.fulfillment)
Exemplo n.º 6
0
    def test_format_change_may_change_open_access_status(self):

        # In this test, whenever we call CirculationData.apply(), we
        # want to destroy the old list of formats and recreate it.
        replace_formats = ReplacementPolicy(formats=True)

        # Here's a seemingly ordinary non-open-access LicensePool.
        edition, pool = self._edition(with_license_pool=True)
        eq_(False, pool.open_access)

        # One day, we learn that it has an open-access delivery mechanism.
        link = LinkData(
            rel=Hyperlink.OPEN_ACCESS_DOWNLOAD,
            media_type=Representation.EPUB_MEDIA_TYPE,
            href=self._url,
            rights_uri=RightsStatus.CC_BY_ND,
        )

        circulation_data = CirculationData(
            data_source=pool.data_source,
            primary_identifier=pool.identifier,
            links=[link],
        )

        # Applying this information turns the pool into an open-access pool.
        circulation_data.apply(self._db,
                               pool.collection,
                               replace=replace_formats)
        eq_(True, pool.open_access)

        # Then we find out it was a mistake -- the book is in copyright.
        format = FormatData(Representation.EPUB_MEDIA_TYPE,
                            DeliveryMechanism.NO_DRM,
                            rights_uri=RightsStatus.IN_COPYRIGHT)
        circulation_data = CirculationData(data_source=pool.data_source,
                                           primary_identifier=pool.identifier,
                                           formats=[format])
        circulation_data.apply(self._db,
                               pool.collection,
                               replace=replace_formats)

        # The original LPDM has been removed and only the new one remains.
        eq_(False, pool.open_access)
        eq_(1, pool.delivery_mechanisms.count())
Exemplo n.º 7
0
    def internal_formats(cls, book_format):
        """Convert the term Bibliotheca uses to refer to a book
        format into a (medium [formats]) 2-tuple.
        """
        medium = Edition.BOOK_MEDIUM
        format = None
        if book_format not in cls.format_data_for_bibliotheca_format:
            logging.error("Unrecognized BookFormat: %s", book_format)
            return medium, []

        content_type, drm_scheme = cls.format_data_for_bibliotheca_format[
            book_format]

        format = FormatData(content_type=content_type, drm_scheme=drm_scheme)
        if book_format == 'MP3':
            medium = Edition.AUDIO_MEDIUM
        else:
            medium = Edition.BOOK_MEDIUM
        return medium, [format]
Exemplo n.º 8
0
    def extract_bibliographic(self, element, ns):
        identifiers = []
        contributors = []
        identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))
        sort_name = element["author"]
        if not sort_name:
            sort_name = "Unknown"
        contributors.append(ContributorData(sort_name=sort_name))
        primary_identifier = IdentifierData(Identifier.ENKI_ID, element["id"])
        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element["title"],
            language="ENGLISH",
            medium=Edition.BOOK_MEDIUM,
            #series=series,
            publisher=element["publisher"],
            #imprint=imprint,
            #published=publication_date,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            #subjects=subjects,
            contributors=contributors,
        )
        #TODO: This should parse the content type and look it up in the Enki Delivery Data above. Currently,
        # we assume everything is an ePub that uses Adobe DRM, which is a safe assumption only for now.
        formats = []
        formats.append(
            FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                       drm_scheme=DeliveryMechanism.ADOBE_DRM))

        circulationdata = CirculationData(
            data_source=DataSource.ENKI,
            primary_identifier=primary_identifier,
            formats=formats,
        )

        metadata.circulation = circulationdata
        return metadata
Exemplo n.º 9
0
    def test_circulationdata_may_require_collection(self):
        """Depending on the information provided in a CirculationData
        object, it might or might not be possible to call apply()
        without providing a Collection.
        """

        identifier = IdentifierData(Identifier.OVERDRIVE_ID, "1")
        format = FormatData(Representation.EPUB_MEDIA_TYPE,
                            DeliveryMechanism.NO_DRM,
                            rights_uri=RightsStatus.IN_COPYRIGHT)
        circdata = CirculationData(DataSource.OVERDRIVE,
                                   primary_identifier=identifier,
                                   formats=[format])
        circdata.apply(self._db, collection=None)

        # apply() has created a LicensePoolDeliveryMechanism for this
        # title, even though there are no LicensePools for it.
        identifier_obj, ignore = identifier.load(self._db)
        eq_([], identifier_obj.licensed_through)
        [lpdm] = identifier_obj.delivery_mechanisms
        eq_(DataSource.OVERDRIVE, lpdm.data_source.name)
        eq_(RightsStatus.IN_COPYRIGHT, lpdm.rights_status.uri)

        mechanism = lpdm.delivery_mechanism
        eq_(Representation.EPUB_MEDIA_TYPE, mechanism.content_type)
        eq_(DeliveryMechanism.NO_DRM, mechanism.drm_scheme)

        # But if we put some information in the CirculationData
        # that can only be stored in a LicensePool, there's trouble.
        circdata.licenses_owned = 0
        assert_raises_regexp(
            ValueError,
            'Cannot store circulation information because no Collection was provided.',
            circdata.apply,
            self._db,
            collection=None)
Exemplo n.º 10
0
    def book_info_to_metadata(cls,
                              book,
                              include_bibliographic=True,
                              include_formats=True):
        """Turn Overdrive's JSON representation of a book into a Metadata
        object.

        Note:  The json data passed into this method is from a different file/stream
        from the json data that goes into the book_info_to_circulation() method.
        """
        if not 'id' in book:
            return None
        overdrive_id = book['id']
        primary_identifier = IdentifierData(Identifier.OVERDRIVE_ID,
                                            overdrive_id)

        if include_bibliographic:
            title = book.get('title', None)
            sort_title = book.get('sortTitle')
            subtitle = book.get('subtitle', None)
            series = book.get('series', None)
            publisher = book.get('publisher', None)
            imprint = book.get('imprint', None)

            if 'publishDate' in book:
                published = datetime.datetime.strptime(
                    book['publishDate'][:10], cls.DATE_FORMAT)
            else:
                published = None

            languages = [l['code'] for l in book.get('languages', [])]
            if 'eng' in languages or not languages:
                language = 'eng'
            else:
                language = sorted(languages)[0]

            contributors = []
            for creator in book.get('creators', []):
                sort_name = creator['fileAs']
                display_name = creator['name']
                role = creator['role']
                roles = cls.parse_roles(overdrive_id,
                                        role) or [Contributor.UNKNOWN_ROLE]
                contributor = ContributorData(sort_name=sort_name,
                                              display_name=display_name,
                                              roles=roles,
                                              biography=creator.get(
                                                  'bioText', None))
                contributors.append(contributor)

            subjects = []
            for sub in book.get('subjects', []):
                subject = SubjectData(type=Subject.OVERDRIVE,
                                      identifier=sub['value'],
                                      weight=100)
                subjects.append(subject)

            for sub in book.get('keywords', []):
                subject = SubjectData(type=Subject.TAG,
                                      identifier=sub['value'],
                                      weight=1)
                subjects.append(subject)

            extra = dict()
            if 'grade_levels' in book:
                # n.b. Grade levels are measurements of reading level, not
                # age appropriateness. We can use them as a measure of age
                # appropriateness in a pinch, but we weight them less
                # heavily than other information from Overdrive.
                for i in book['grade_levels']:
                    subject = SubjectData(type=Subject.GRADE_LEVEL,
                                          identifier=i['value'],
                                          weight=10)
                    subjects.append(subject)

            overdrive_medium = book.get('mediaType', None)
            if overdrive_medium and overdrive_medium not in cls.overdrive_medium_to_simplified_medium:
                cls.log.error("Could not process medium %s for %s",
                              overdrive_medium, overdrive_id)

            medium = cls.overdrive_medium_to_simplified_medium.get(
                overdrive_medium, Edition.BOOK_MEDIUM)

            measurements = []
            if 'awards' in book:
                extra['awards'] = book.get('awards', [])
                num_awards = len(extra['awards'])
                measurements.append(
                    MeasurementData(Measurement.AWARDS, str(num_awards)))

            for name, subject_type in (('ATOS', Subject.ATOS_SCORE),
                                       ('lexileScore', Subject.LEXILE_SCORE),
                                       ('interestLevel',
                                        Subject.INTEREST_LEVEL)):
                if not name in book:
                    continue
                identifier = str(book[name])
                subjects.append(
                    SubjectData(type=subject_type,
                                identifier=identifier,
                                weight=100))

            for grade_level_info in book.get('gradeLevels', []):
                grade_level = grade_level_info.get('value')
                subjects.append(
                    SubjectData(type=Subject.GRADE_LEVEL,
                                identifier=grade_level,
                                weight=100))

            identifiers = []
            links = []
            for format in book.get('formats', []):
                for new_id in format.get('identifiers', []):
                    t = new_id['type']
                    v = new_id['value']
                    orig_v = v
                    type_key = None
                    if t == 'ASIN':
                        type_key = Identifier.ASIN
                    elif t == 'ISBN':
                        type_key = Identifier.ISBN
                        if len(v) == 10:
                            v = isbnlib.to_isbn13(v)
                        if v is None or not isbnlib.is_isbn13(v):
                            # Overdrive sometimes uses invalid values
                            # like "n/a" as placeholders. Ignore such
                            # values to avoid a situation where hundreds of
                            # books appear to have the same ISBN. ISBNs
                            # which fail check digit checks or are invalid
                            # also can occur. Log them for review.
                            cls.log.info("Bad ISBN value provided: %s", orig_v)
                            continue
                    elif t == 'DOI':
                        type_key = Identifier.DOI
                    elif t == 'UPC':
                        type_key = Identifier.UPC
                    elif t == 'PublisherCatalogNumber':
                        continue
                    if type_key and v:
                        identifiers.append(IdentifierData(type_key, v, 1))

                # Samples become links.
                if 'samples' in format:

                    if not format['id'] in cls.format_data_for_overdrive_format:
                        # Useless to us.
                        continue
                    content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                        format['id'])
                    if Representation.is_media_type(content_type):
                        for sample_info in format['samples']:
                            href = sample_info['url']
                            links.append(
                                LinkData(rel=Hyperlink.SAMPLE,
                                         href=href,
                                         media_type=content_type))

            # A cover and its thumbnail become a single LinkData.
            if 'images' in book:
                images = book['images']
                image_data = cls.image_link_to_linkdata(
                    images.get('cover'), Hyperlink.IMAGE)
                for name in ['cover300Wide', 'cover150Wide', 'thumbnail']:
                    # Try to get a thumbnail that's as close as possible
                    # to the size we use.
                    image = images.get(name)
                    thumbnail_data = cls.image_link_to_linkdata(
                        image, Hyperlink.THUMBNAIL_IMAGE)
                    if not image_data:
                        image_data = cls.image_link_to_linkdata(
                            image, Hyperlink.IMAGE)
                    if thumbnail_data:
                        break

                if image_data:
                    if thumbnail_data:
                        image_data.thumbnail = thumbnail_data
                    links.append(image_data)

            # Descriptions become links.
            short = book.get('shortDescription')
            full = book.get('fullDescription')
            if full:
                links.append(
                    LinkData(
                        rel=Hyperlink.DESCRIPTION,
                        content=full,
                        media_type="text/html",
                    ))

            if short and (not full or not full.startswith(short)):
                links.append(
                    LinkData(
                        rel=Hyperlink.SHORT_DESCRIPTION,
                        content=short,
                        media_type="text/html",
                    ))

            # Add measurements: rating and popularity
            if book.get('starRating') is not None and book['starRating'] > 0:
                measurements.append(
                    MeasurementData(quantity_measured=Measurement.RATING,
                                    value=book['starRating']))

            if book.get('popularity'):
                measurements.append(
                    MeasurementData(quantity_measured=Measurement.POPULARITY,
                                    value=book['popularity']))

            metadata = Metadata(
                data_source=DataSource.OVERDRIVE,
                title=title,
                subtitle=subtitle,
                sort_title=sort_title,
                language=language,
                medium=medium,
                series=series,
                publisher=publisher,
                imprint=imprint,
                published=published,
                primary_identifier=primary_identifier,
                identifiers=identifiers,
                subjects=subjects,
                contributors=contributors,
                measurements=measurements,
                links=links,
            )
        else:
            metadata = Metadata(
                data_source=DataSource.OVERDRIVE,
                primary_identifier=primary_identifier,
            )

        if include_formats:
            formats = []
            for format in book.get('formats', []):
                format_id = format['id']
                if format_id in cls.format_data_for_overdrive_format:
                    content_type, drm_scheme = cls.format_data_for_overdrive_format.get(
                        format_id)
                    formats.append(FormatData(content_type, drm_scheme))
                elif format_id not in cls.ignorable_overdrive_formats:
                    cls.log.error(
                        "Could not process Overdrive format %s for %s",
                        format_id, overdrive_id)

            # Also make a CirculationData so we can write the formats,
            circulationdata = CirculationData(
                data_source=DataSource.OVERDRIVE,
                primary_identifier=primary_identifier,
                formats=formats,
            )

            metadata.circulation = circulationdata

        return metadata
Exemplo n.º 11
0
class ItemListParser(XMLParser):

    DATE_FORMAT = "%Y-%m-%d"
    YEAR_FORMAT = "%Y"

    NAMESPACES = {}

    def parse(self, xml):
        for i in self.process_all(xml, "//Item"):
            yield i

    parenthetical = re.compile(" \([^)]+\)$")

    @classmethod
    def contributors_from_string(cls, string):
        contributors = []
        if not string:
            return contributors
        
        for sort_name in string.split(';'):
            sort_name = cls.parenthetical.sub("", sort_name.strip())
            contributors.append(
                ContributorData(
                    sort_name=sort_name.strip(),
                    roles=[Contributor.AUTHOR_ROLE]
                )
            )
        return contributors

    @classmethod
    def parse_genre_string(self, s):           
        genres = []
        if not s:
            return genres
        for i in s.split(","):
            i = i.strip()
            if not i:
                continue
            i = i.replace("&", "&").replace("&", "&").replace("'", "'")
            genres.append(SubjectData(Subject.THREEM, i, weight=15))
        return genres


    def process_one(self, tag, namespaces):
        """Turn an <item> tag into a Metadata and an encompassed CirculationData 
        objects, and return the Metadata."""

        def value(threem_key):
            return self.text_of_optional_subtag(tag, threem_key)

        links = dict()
        identifiers = dict()
        subjects = []

        primary_identifier = IdentifierData(
            Identifier.THREEM_ID, value("ItemId")
        )

        identifiers = []
        for key in ('ISBN13', 'PhysicalISBN'):
            v = value(key)
            if v:
                identifiers.append(
                    IdentifierData(Identifier.ISBN, v)
                )

        subjects = self.parse_genre_string(value("Genre"))

        title = value("Title")
        subtitle = value("SubTitle")
        publisher = value("Publisher")
        language = value("Language")

        contributors = list(self.contributors_from_string(value('Authors')))

        published_date = None
        published = value("PubDate")
        if published:
            formats = [self.DATE_FORMAT, self.YEAR_FORMAT]
        else:
            published = value("PubYear")
            formats = [self.YEAR_FORMAT]

        for format in formats:
            try:
                published_date = datetime.strptime(published, format)
            except ValueError, e:
                pass

        links = []
        description = value("Description")
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION, content=description)
            )

        cover_url = value("CoverLinkURL").replace("&amp;", "&")
        links.append(LinkData(rel=Hyperlink.IMAGE, href=cover_url))

        alternate_url = value("BookLinkURL").replace("&amp;", "&")
        links.append(LinkData(rel='alternate', href=alternate_url))

        measurements = []
        pages = value("NumberOfPages")
        if pages:
            pages = int(pages)
            measurements.append(
                MeasurementData(quantity_measured=Measurement.PAGE_COUNT,
                                value=pages)
            )

        medium = Edition.BOOK_MEDIUM

        book_format = value("BookFormat")
        format = None
        if book_format == 'EPUB':
            format = FormatData(
                content_type=Representation.EPUB_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'PDF':
            format = FormatData(
                content_type=Representation.PDF_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
        elif book_format == 'MP3':
            format = FormatData(
                content_type=Representation.MP3_MEDIA_TYPE,
                drm_scheme=DeliveryMechanism.ADOBE_DRM
            )
            medium = Edition.AUDIO_MEDIUM

        formats = [format]

        metadata = Metadata(
            data_source=DataSource.THREEM,
            title=title,
            subtitle=subtitle,
            language=language,
            medium=medium,
            publisher=publisher,
            published=published_date,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
            measurements=measurements,
            links=links,
        )

        # Also make a CirculationData so we can write the formats, 
        circulationdata = CirculationData(
            data_source=DataSource.THREEM,
            primary_identifier=primary_identifier,
            formats=formats,
            links=links,
        )

        metadata.circulation = circulationdata
        return metadata
Exemplo n.º 12
0
    def extract_bibliographic(self, element, ns):
        """Turn bibliographic metadata into a Metadata and a CirculationData objects, 
        and return them as a tuple."""

        # TODO: These are consistently empty (some are clearly for
        # audiobooks) so I don't know what they do and/or what format
        # they're in.
        #
        # annotation
        # edition
        # narrator
        # runtime

        identifier = self.text_of_subtag(element, 'axis:titleId', ns)
        isbn = self.text_of_optional_subtag(element, 'axis:isbn', ns)
        title = self.text_of_subtag(element, 'axis:productTitle', ns)

        contributor = self.text_of_optional_subtag(element, 'axis:contributor',
                                                   ns)
        contributors = []
        found_primary_author = False
        if contributor:
            for c in self.parse_list(contributor):
                contributor = self.parse_contributor(c, found_primary_author)
                if Contributor.PRIMARY_AUTHOR_ROLE in contributor.roles:
                    found_primary_author = True
                contributors.append(contributor)

        subject = self.text_of_optional_subtag(element, 'axis:subject', ns)
        subjects = []
        if subject:
            for subject_identifier in self.parse_list(subject):
                subjects.append(
                    SubjectData(type=Subject.BISAC,
                                identifier=subject_identifier,
                                weight=1))

        publication_date = self.text_of_optional_subtag(
            element, 'axis:publicationDate', ns)
        if publication_date:
            publication_date = datetime.datetime.strptime(
                publication_date, self.SHORT_DATE_FORMAT)

        series = self.text_of_optional_subtag(element, 'axis:series', ns)
        publisher = self.text_of_optional_subtag(element, 'axis:publisher', ns)
        imprint = self.text_of_optional_subtag(element, 'axis:imprint', ns)

        audience = self.text_of_optional_subtag(element, 'axis:audience', ns)
        if audience:
            subjects.append(
                SubjectData(
                    type=Subject.THETA_AUDIENCE,
                    identifier=audience,
                    weight=1,
                ))

        language = self.text_of_subtag(element, 'axis:language', ns)

        # We don't use this for anything.
        # file_size = self.int_of_optional_subtag(element, 'theta:fileSize', ns)
        primary_identifier = IdentifierData(Identifier.THETA_ID, identifier)
        identifiers = []
        if isbn:
            identifiers.append(IdentifierData(Identifier.ISBN, isbn))

        formats = []
        acceptable = False
        seen_formats = []
        for format_tag in self._xpath(
                element,
                'axis:availability/axis:availableFormats/axis:formatName', ns):
            informal_name = format_tag.text
            seen_formats.append(informal_name)
            if informal_name not in self.DELIVERY_DATA_FOR_THETA_FORMAT:
                self.log("Unrecognized Theta format name for %s: %s" %
                         (identifier, informal_name))
            elif self.DELIVERY_DATA_FOR_THETA_FORMAT.get(informal_name):
                content_type, drm_scheme = self.DELIVERY_DATA_FOR_THETA_FORMAT[
                    informal_name]
                formats.append(
                    FormatData(content_type=content_type,
                               drm_scheme=drm_scheme))

        if not formats:
            self.log.error("No supported format for %s (%s)! Saw: %s",
                           identifier, title, ", ".join(seen_formats))

        metadata = Metadata(
            data_source=DataSource.THETA,
            title=title,
            language=language,
            medium=Edition.BOOK_MEDIUM,
            series=series,
            publisher=publisher,
            imprint=imprint,
            published=publication_date,
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            subjects=subjects,
            contributors=contributors,
        )

        circulationdata = CirculationData(
            data_source=DataSource.THETA,
            primary_identifier=primary_identifier,
            formats=formats,
        )

        metadata.circulation = circulationdata
        return metadata
Exemplo n.º 13
0
 def set_format(cls, format_received, formats):
     content_type, drm_scheme = cls.format_data_for_odilo_format.get(
         format_received)
     formats.append(FormatData(content_type, drm_scheme))
     return cls.odilo_medium_to_simplified_medium.get(format_received)
Exemplo n.º 14
0
    def isbn_info_to_metadata(cls,
                              book,
                              include_bibliographic=True,
                              include_formats=True):
        """Turn OneClick's JSON representation of a book into a Metadata object.
        Assumes the JSON is in the format that comes from the media/{isbn} endpoint.

        TODO:  Use the seriesTotal field.

        :param book a json response-derived dictionary of book attributes
        """
        if not 'isbn' in book:
            return None
        oneclick_id = book['isbn']
        primary_identifier = IdentifierData(Identifier.ONECLICK_ID,
                                            oneclick_id)

        metadata = Metadata(
            data_source=DataSource.ONECLICK,
            primary_identifier=primary_identifier,
        )

        if include_bibliographic:
            title = book.get('title', None)
            # NOTE: An item that's part of a series, will have the seriesName field, and
            # will have its seriesPosition and seriesTotal fields set to >0.
            # An item not part of a series will have the seriesPosition and seriesTotal fields
            # set to 0, and will not have a seriesName at all.
            # Sometimes, series position and total == 0, for many series items (ex: "seriesName": "EngLits").
            # Sometimes, seriesName is set to "Default Blank", meaning "not actually a series".
            series_name = book.get('seriesName', None)

            series_position = book.get('seriesPosition', None)
            if series_position:
                try:
                    series_position = int(series_position)
                except ValueError:
                    # not big enough deal to stop the whole process
                    series_position = None

            # ignored for now
            series_total = book.get('seriesTotal', None)
            # ignored for now
            has_digital_rights = book.get('hasDigitalRights', None)

            publisher = book.get('publisher', None)
            if 'publicationDate' in book:
                published = datetime.datetime.strptime(
                    book['publicationDate'][:10], cls.DATE_FORMAT)
            else:
                published = None

            if 'language' in book:
                language = LanguageCodes.string_to_alpha_3(book['language'])
            else:
                language = 'eng'

            contributors = []
            if 'authors' in book:
                authors = book['authors']
                for author in authors.split(";"):
                    sort_name = author.strip()
                    if sort_name:
                        sort_name = name_tidy(sort_name)
                        display_name = sort_name_to_display_name(sort_name)
                        roles = [Contributor.AUTHOR_ROLE]
                        contributor = ContributorData(
                            sort_name=sort_name,
                            display_name=display_name,
                            roles=roles)
                        contributors.append(contributor)

            if 'narrators' in book:
                narrators = book['narrators']
                for narrator in narrators.split(";"):
                    sort_name = narrator.strip()
                    if sort_name:
                        sort_name = name_tidy(sort_name)
                        display_name = sort_name_to_display_name(sort_name)
                        roles = [Contributor.NARRATOR_ROLE]
                        contributor = ContributorData(
                            sort_name=sort_name,
                            display_name=display_name,
                            roles=roles)
                        contributors.append(contributor)

            subjects = []
            if 'genres' in book:
                # example: "FICTION / Humorous / General"
                genres = book['genres']
                subject = SubjectData(type=Subject.BISAC,
                                      identifier=genres,
                                      weight=100)
                subjects.append(subject)

            if 'primaryGenre' in book:
                # example: "humorous-fiction,mystery,womens-fiction"
                genres = book['primaryGenre']
                for genre in genres.split(","):
                    subject = SubjectData(type=Subject.ONECLICK,
                                          identifier=genre.strip(),
                                          weight=100)
                    subjects.append(subject)

            # audience options are: adult, beginning-reader, childrens, young-adult
            # NOTE: In OneClick metadata, audience can be set to "Adult" while publisher is "HarperTeen".
            audience = book.get('audience', None)
            if audience:
                subject = SubjectData(type=Subject.ONECLICK_AUDIENCE,
                                      identifier=audience.strip().lower(),
                                      weight=10)
                subjects.append(subject)

            # options are: "eBook", "eAudio"
            oneclick_medium = book.get('mediaType', None)
            if oneclick_medium and oneclick_medium not in cls.oneclick_medium_to_simplified_medium:
                cls.log.error("Could not process medium %s for %s",
                              oneclick_medium, oneclick_id)

            medium = cls.oneclick_medium_to_simplified_medium.get(
                oneclick_medium, Edition.BOOK_MEDIUM)

            # passed to metadata.apply, the isbn_identifier will create an equivalency
            # between the OneClick-labeled and the ISBN-labeled identifier rows, which
            # will in turn allow us to ask the MetadataWrangler for more info about the book.
            isbn_identifier = IdentifierData(Identifier.ISBN, oneclick_id)

            identifiers = [primary_identifier, isbn_identifier]

            links = []
            # A cover and its thumbnail become a single LinkData.
            # images come in small (ex: 71x108px), medium (ex: 95x140px),
            # and large (ex: 128x192px) sizes
            if 'images' in book:
                images = book['images']
                for image in images:
                    if image['name'] == "large":
                        image_data = cls.image_link_to_linkdata(
                            image['url'], Hyperlink.IMAGE)
                    if image['name'] == "medium":
                        thumbnail_data = cls.image_link_to_linkdata(
                            image['url'], Hyperlink.THUMBNAIL_IMAGE)
                    if image['name'] == "small":
                        thumbnail_data_backup = cls.image_link_to_linkdata(
                            image['url'], Hyperlink.THUMBNAIL_IMAGE)

                if not thumbnail_data and thumbnail_data_backup:
                    thumbnail_data = thumbnail_data_backup

                if image_data:
                    if thumbnail_data:
                        image_data.thumbnail = thumbnail_data
                    links.append(image_data)

            # Descriptions become links.
            description = book.get('description', None)
            if description:
                links.append(
                    LinkData(
                        # there can be fuller descriptions in the search endpoint output
                        rel=Hyperlink.SHORT_DESCRIPTION,
                        content=description,
                        media_type="text/html",
                    ))

            metadata.title = title
            metadata.language = language
            metadata.medium = medium
            metadata.series = series_name
            metadata.series_position = series_position
            metadata.publisher = publisher
            metadata.published = published
            metadata.identifiers = identifiers
            metadata.subjects = subjects
            metadata.contributors = contributors
            metadata.links = links

        if include_formats:
            formats = []
            if metadata.medium == Edition.BOOK_MEDIUM:
                content_type, drm_scheme = cls.oneclick_formats.get(
                    "ebook-epub-oneclick")
                formats.append(FormatData(content_type, drm_scheme))
            elif metadata.medium == Edition.AUDIO_MEDIUM:
                content_type, drm_scheme = cls.oneclick_formats.get(
                    "audiobook-mp3-oneclick")
                formats.append(FormatData(content_type, drm_scheme))
            else:
                cls.log.warn("Unfamiliar format: %s", format_id)

            # Make a CirculationData so we can write the formats,
            circulationdata = CirculationData(
                data_source=DataSource.ONECLICK,
                primary_identifier=primary_identifier,
                formats=formats,
            )

            metadata.circulation = circulationdata

        return metadata