Example #1
    def test_initialization(self):
        assert_raises_regexp(
            ValueError, "ContributorLane can't be created without contributor",
            ContributorLane, self._default_library, None)

        parent = WorkList()
        parent.initialize(self._default_library)

        lane = ContributorLane(
            self._default_library,
            self.contributor,
            parent,
            languages=['a'],
            audiences=['b'],
        )
        eq_(self.contributor, lane.contributor)
        eq_(['a'], lane.languages)
        eq_(['b'], lane.audiences)
        eq_([lane], parent.children)

        # The contributor_key will be used in links to other pages
        # of this Lane and so on.
        eq_("Lois Lane", lane.contributor_key)

        # If the contributor used to create a ContributorLane has no
        # display name, their sort name is used as the
        # contributor_key.
        contributor = ContributorData(sort_name="Lane, Lois")
        lane = ContributorLane(self._default_library, contributor)
        eq_(contributor, lane.contributor)
        eq_("Lane, Lois", lane.contributor_key)
Example #2
    def test_circulationdata_can_be_deepcopied(self):
        # Check that we didn't put something in the CirculationData that
        # will prevent it from being copied. (e.g., self.log)

        subject = SubjectData(Subject.TAG, "subject")
        contributor = ContributorData()
        identifier = IdentifierData(Identifier.GUTENBERG_ID, "1")
        link = LinkData(Hyperlink.OPEN_ACCESS_DOWNLOAD, "example.epub")
        format = FormatData(Representation.EPUB_MEDIA_TYPE,
                            DeliveryMechanism.NO_DRM)
        rights_uri = RightsStatus.GENERIC_OPEN_ACCESS

        circulation_data = CirculationData(
            DataSource.GUTENBERG,
            primary_identifier=identifier,
            links=[link],
            licenses_owned=5,
            licenses_available=5,
            licenses_reserved=None,
            patrons_in_hold_queue=None,
            formats=[format],
            default_rights_uri=rights_uri,
        )

        circulation_data_copy = deepcopy(circulation_data)

        # If deepcopy didn't throw an exception we're ok.
        assert circulation_data_copy is not None
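
The comment above alludes to why this test exists: deepcopy falls back on
pickle semantics, so an attribute holding something unpicklable -- a logger
whose handlers hold thread locks, for example -- makes the whole object
uncopyable. A minimal sketch of the failure mode (hypothetical class, not
part of the codebase):

    import copy
    import threading

    class HasLock:
        def __init__(self):
            # Locks (like the ones buried in logging handlers) can't be
            # deep-copied; deepcopy raises a TypeError when it hits one.
            self.lock = threading.Lock()

    copy.deepcopy(HasLock())  # TypeError: cannot pickle '_thread.lock' object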
Example #3
    def test_initialization(self):
        with pytest.raises(ValueError) as excinfo:
            ContributorLane(self._default_library, None)
        assert "ContributorLane can't be created without contributor" in str(
            excinfo.value
        )

        parent = WorkList()
        parent.initialize(self._default_library)

        lane = ContributorLane(
            self._default_library,
            self.contributor,
            parent,
            languages=["a"],
            audiences=["b"],
        )
        assert self.contributor == lane.contributor
        assert ["a"] == lane.languages
        assert ["b"] == lane.audiences
        assert [lane] == parent.children

        # The contributor_key will be used in links to other pages
        # of this Lane and so on.
        assert "Lois Lane" == lane.contributor_key

        # If the contributor used to create a ContributorLane has no
        # display name, their sort name is used as the
        # contributor_key.
        contributor = ContributorData(sort_name="Lane, Lois")
        lane = ContributorLane(self._default_library, contributor)
        assert contributor == lane.contributor
        assert "Lane, Lois" == lane.contributor_key
Example #4
    def extract_bibliographic(self, element):
        identifiers = []
        contributors = []
        identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))
        sort_name = element["author"]
        if not sort_name:
            sort_name = Edition.UNKNOWN_AUTHOR
        contributors.append(ContributorData(sort_name=sort_name))
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])
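        # Enki appears to supply only the large image here, so it doubles
        # as the thumbnail.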
        image_url = element["large_image"]
        thumbnail_url = element["large_image"]
        images = [
            LinkData(rel=Hyperlink.THUMBNAIL_IMAGE,
                     href=thumbnail_url,
                     media_type=Representation.PNG_MEDIA_TYPE),
            LinkData(rel=Hyperlink.IMAGE,
                     href=image_url,
                     media_type=Representation.PNG_MEDIA_TYPE)
        ]
        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element["title"],
            language="eng",
            medium=Edition.BOOK_MEDIUM,
            publisher=element["publisher"],
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=images,
        )
        licenses_owned = element["availability"]["totalCopies"]
        licenses_available = element["availability"]["availableCopies"]
        hold = element["availability"]["onHold"]
        drm_type = EnkiAPI.adobe_drm if (element["availability"]["accessType"]
                                         == 'acs') else EnkiAPI.no_drm
        formats = []
        formats.append(
            FormatData(content_type=Representation.EPUB_MEDIA_TYPE,
                       drm_scheme=drm_type))

        circulationdata = CirculationData(
            data_source=DataSource.ENKI,
            primary_identifier=primary_identifier,
            formats=formats,
            licenses_owned=int(licenses_owned),
            licenses_available=int(licenses_available),
            patrons_in_hold_queue=int(hold))

        metadata.circulation = circulationdata
        return metadata
Example #5
    def parse(cls, string):
        """Parse a string into a ContributorData object.

        This may include sort_name, birth_date, and death_date.
        """
        string = string.strip()
        sort_name, birth, death = cls._get_lifespan(string)
        extra = dict()
        if birth is not None:
            extra[Contributor.BIRTH_DATE] = birth
        if death is not None:
            extra[Contributor.DEATH_DATE] = death
        return ContributorData(
            sort_name=sort_name,
            extra=extra,
        )
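
A possible usage sketch (assuming _get_lifespan splits a trailing lifespan
like "1892-1973" off the name, as the field names suggest; the input string
and the exact output formats here are hypothetical):

    data = ContributorData.parse("Tolkien, J. R. R., 1892-1973")
    # data.sort_name == "Tolkien, J. R. R."
    # data.extra[Contributor.BIRTH_DATE] == "1892"
    # data.extra[Contributor.DEATH_DATE] == "1973"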
Example #6
    def cluster_has_record_for_named_author(
            self, cluster, working_sort_name, working_display_name, contributor_data=None):
        """  Looks through the xml cluster for all fields that could indicate the
        author's name.

        Don't short-circuit the xml parsing process -- if found an author name
        match, keep parsing and see what else can find.

        :return: a dictionary containing description of xml field
        that matched author name searched for.
        """
        match_confidences = {}
        if not contributor_data:
            contributor_data = ContributorData()

        # If we have a sort name to look for, and it's in this cluster's
        # sort names, great.
        if working_sort_name:
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
                match_confidences["sort_name"] = match_confidence
                # The fuzzy matcher may not give a 100% match even for a
                # good hit, so treat anything above 90 as a "sure match".
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # If we have a display name to look for, and this cluster's
        # Wikipedia name converts to the display name, great.
        if working_display_name:
            wikipedia_name = self.extract_wikipedia_name(cluster)
            if wikipedia_name:
                contributor_data.wikipedia_name = wikipedia_name
                display_name = self.wikipedia_name_to_display_name(wikipedia_name)
                match_confidence = contributor_name_match_ratio(display_name, working_display_name)
                match_confidences["display_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.display_name = display_name
                    return match_confidences

        # If there are UNIMARC records, and every part of the UNIMARC
        # record matches the sort name or the display name, great.
        unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        candidates = []
        for unimarc in unimarcs:
            (possible_given, possible_family,
             possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            if working_sort_name:
                match_confidence = contributor_name_match_ratio(possible_sort_name, working_sort_name)
                match_confidences["unimarc"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = possible_sort_name
                    return match_confidences

            for name in (working_sort_name, working_display_name):
                if not name:
                    continue
                if (possible_given and possible_given in name
                    and possible_family and possible_family in name and (
                        not possible_extra or possible_extra in name)):
                    match_confidences["unimarc"] = 90
                    contributor_data.family_name = possible_family
                    return match_confidences

        # Last-ditch effort. Guess at the sort name and see if *that's* one
        # of the cluster sort names.
        if working_display_name and not working_sort_name:
            test_sort_name = display_name_to_sort_name(working_display_name)
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, test_sort_name)
                match_confidences["guessed_sort_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # OK, last last-ditch effort.  See if the alternate name forms (pseudonyms) are it.
        if working_sort_name:
            for potential_match in self.alternate_name_forms_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(potential_match, working_sort_name)
                match_confidences["alternate_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = potential_match
                    return match_confidences

        return match_confidences
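
contributor_name_match_ratio is used here as a 0-100 similarity score, with
90 as the "sure match" threshold. A rough stand-in built on difflib (an
assumption -- the real project likely uses a dedicated fuzzy matcher)
behaves the same way:

    from difflib import SequenceMatcher

    def contributor_name_match_ratio(name1, name2):
        # Stand-in: scale difflib's 0.0-1.0 ratio to the 0-100 range the
        # threshold check above expects.
        return int(SequenceMatcher(None, name1.lower(), name2.lower()).ratio() * 100)

    contributor_name_match_ratio("Lane, Lois", "Lane, Lois")   # 100
    contributor_name_match_ratio("Lane, Lois", "Lane, Louis")  # above 90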
Example #7
            for isbn in d.get('isbns', []):
                isbn13 = isbn.get('isbn13', None)
                if isbn13:
                    other_isbns.append(
                        IdentifierData(Identifier.ISBN, isbn13, 0.50)
                    )

        primary_isbn = primary_isbn13 or primary_isbn10
        if primary_isbn:
            primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

        contributors = []
        if display_author:
            contributors.append(
                ContributorData(display_name=display_author)
            )

        metadata = Metadata(
            data_source=DataSource.NYT,
            title=title, 
            language='eng',
            published=published_date,
            publisher=publisher,
            contributors=contributors,
            primary_identifier=primary_isbn,
            identifiers=other_isbns,
        )

        super(NYTBestSellerListTitle, self).__init__(
            metadata, first_appearance, most_recent_appearance,
Example #8
            # other books in the same series, as well as ISBNs that
            # are just wrong. Assign these equivalencies at a low
            # level of confidence.
            for isbn in d.get('isbns', []):
                isbn13 = isbn.get('isbn13', None)
                if isbn13:
                    other_isbns.append(
                        IdentifierData(Identifier.ISBN, isbn13, 0.50))

        primary_isbn = primary_isbn13 or primary_isbn10
        if primary_isbn:
            primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

        contributors = []
        if display_author:
            contributors.append(ContributorData(display_name=display_author))

        metadata = Metadata(
            data_source=DataSource.NYT,
            title=title,
            medium=medium,
            language='eng',
            published=published_date,
            publisher=publisher,
            contributors=contributors,
            primary_identifier=primary_isbn,
            identifiers=other_isbns,
        )

        super(NYTBestSellerListTitle,
              self).__init__(metadata, first_appearance,
Example #9
    def extract_bibliographic(self, element):
        """Extract Metadata and CirculationData from a dictionary
        of information from Enki.

        :return: A Metadata with attached CirculationData.
        """
        # TODO: it's not clear what these are or whether we'd find them
        # useful:
        #  dateSaved
        #  length
        #  publishDate
        primary_identifier = IdentifierData(EnkiAPI.ENKI_ID, element["id"])

        identifiers = []
        identifiers.append(IdentifierData(Identifier.ISBN, element["isbn"]))

        contributors = []
        sort_name = element.get("author", None) or Edition.UNKNOWN_AUTHOR
        contributors.append(ContributorData(sort_name=sort_name))

        links = []
        description = element.get("description")
        if description:
            links.append(
                LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    content=description,
                    media_type="text/html",
                )
            )

        # NOTE: When this method is called by, e.g., updated_titles(),
        # the large and small images are available separately. When this
        # method is called by get_item(), we only get a single image, in
        # 'cover'. In get_item() we ask for that image to be 'large', which
        # means we'll be filing it as a normal-sized image.
        full_image = None
        thumbnail_image = None
        for key, rel in (
            ("cover", Hyperlink.IMAGE),
            ("small_image", Hyperlink.THUMBNAIL_IMAGE),
            ("large_image", Hyperlink.IMAGE),
        ):
            url = element.get(key)
            if not url:
                continue
            link = LinkData(rel=rel, href=url, media_type=Representation.PNG_MEDIA_TYPE)
            if rel == Hyperlink.THUMBNAIL_IMAGE:
                # Don't add a thumbnail to the list of links -- wait
                # until the end and then make it a thumbnail of the
                # primary image.
                thumbnail_image = link
            else:
                if rel == Hyperlink.IMAGE:
                    full_image = link
                links.append(link)

        if thumbnail_image:
            if full_image:
                # Set the thumbnail as the thumbnail _of_ the full image.
                full_image.thumbnail = thumbnail_image
            else:
                # Treat the thumbnail as the full image.
                thumbnail_image.rel = Hyperlink.IMAGE
                links.append(thumbnail_image)

        # We treat 'subject', 'topic', and 'genre' as interchangeable
        # sets of tags. This data is based on BISAC but it's not reliably
        # presented in a form that can be parsed as BISAC.
        subjects = []
        seen_topics = set()
        for key in ("subject", "topic", "genre"):
            for topic in element.get(key, []):
                if not topic or topic in seen_topics:
                    continue
                subjects.append(
                    SubjectData(
                        Subject.TAG,
                        topic,
                        weight=Classification.TRUSTED_DISTRIBUTOR_WEIGHT,
                    )
                )
                seen_topics.add(topic)

        language_code = element.get("language", "English")
        language = self.LANGUAGE_CODES.get(language_code, "eng")

        metadata = Metadata(
            data_source=DataSource.ENKI,
            title=element.get("title"),
            language=language,
            medium=Edition.BOOK_MEDIUM,
            publisher=element.get("publisher"),
            primary_identifier=primary_identifier,
            identifiers=identifiers,
            contributors=contributors,
            links=links,
            subjects=subjects,
        )
        circulationdata = self.extract_circulation(
            primary_identifier,
            element.get("availability", {}),
            element.get("formattype", None),
        )
        metadata.circulation = circulationdata
        return metadata
Example #10
    def cluster_has_record_for_named_author(self,
                                            cluster,
                                            working_sort_name,
                                            working_display_name,
                                            contributor_data=None):
        """  Looks through the xml cluster for all fields that could indicate the 
        author's name.

        Don't short-circuit the xml parsing process -- if found an author name 
        match, keep parsing and see what else can find.

        :return: a dictionary containing description of xml field 
        that matched author name searched for.
        """
        match_confidences = {}
        if not contributor_data:
            contributor_data = ContributorData()

        # If we have a sort name to look for, and it's in this cluster's
        # sort names, great.
        if working_sort_name:
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, working_sort_name)
                match_confidences["sort_name"] = match_confidence
                # The fuzzy matcher may not give a 100% match even for a
                # good hit, so treat anything above 90 as a "sure match".
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # If we have a display name to look for, and this cluster's
        # Wikipedia name converts to the display name, great.
        if working_display_name:
            wikipedia_name = self.extract_wikipedia_name(cluster)
            if wikipedia_name:
                contributor_data.wikipedia_name = wikipedia_name
                display_name = self.wikipedia_name_to_display_name(
                    wikipedia_name)
                match_confidence = contributor_name_match_ratio(
                    display_name, working_display_name)
                match_confidences["display_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.display_name = display_name
                    return match_confidences

        # If there are UNIMARC records, and every part of the UNIMARC
        # record matches the sort name or the display name, great.
        unimarcs = self._xpath(
            cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        candidates = []
        for unimarc in unimarcs:
            (possible_given, possible_family, possible_extra,
             possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            if working_sort_name:
                match_confidence = contributor_name_match_ratio(
                    possible_sort_name, working_sort_name)
                match_confidences["unimarc"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = possible_sort_name
                    return match_confidences

            for name in (working_sort_name, working_display_name):
                if not name:
                    continue
                if (possible_given and possible_given in name
                        and possible_family and possible_family in name
                        and (not possible_extra or possible_extra in name)):
                    match_confidences["unimarc"] = 90
                    contributor_data.family_name = possible_family
                    return match_confidences

        # Last-ditch effort. Guess at the sort name and see if *that's* one
        # of the cluster sort names.
        if working_display_name and not working_sort_name:
            test_sort_name = display_name_to_sort_name(working_display_name)
            for potential_match in self.sort_names_for_cluster(cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, test_sort_name)
                match_confidences["guessed_sort_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.sort_name = potential_match
                    return match_confidences

        # OK, last last-ditch effort.  See if the alternate name forms (pseudonyms) are it.
        if working_sort_name:
            for potential_match in self.alternate_name_forms_for_cluster(
                    cluster):
                match_confidence = contributor_name_match_ratio(
                    potential_match, working_sort_name)
                match_confidences["alternate_name"] = match_confidence
                if match_confidence > 90:
                    contributor_data.family_name = potential_match
                    return match_confidences

        return match_confidences
Example #11
    def parse(cls, file, data_source_name):
        metadata_records = []

        # TODO: ONIX has plain language 'reference names' and short tags that
        # may be used interchangeably. This code currently only handles short tags,
        # and it's not comprehensive.

        parser = XMLParser()
        tree = etree.parse(file)
        root = tree.getroot()

        for record in root.findall('product'):
            title = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b203')
            if not title:
                title_prefix = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b030')
                title_without_prefix = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b031')
                if title_prefix and title_without_prefix:
                    title = title_prefix + " " + title_without_prefix

            subtitle = parser.text_of_optional_subtag(record, 'descriptivedetail/titledetail/titleelement/b029')
            language = parser.text_of_optional_subtag(record, 'descriptivedetail/language/b252') or "eng"
            publisher = parser.text_of_optional_subtag(record, 'publishingdetail/publisher/b081')
            imprint = parser.text_of_optional_subtag(record, 'publishingdetail/imprint/b079')
            if imprint == publisher:
                imprint = None

            publishing_date = parser.text_of_optional_subtag(record, 'publishingdetail/publishingdate/b306')
            issued = None
            if publishing_date:
                issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

            identifier_tags = parser._xpath(record, 'productidentifier')
            identifiers = []
            primary_identifier = None
            for tag in identifier_tags:
                type = parser.text_of_subtag(tag, "b221")
                if type == '02' or type == '15':
                    primary_identifier = IdentifierData(Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                    identifiers.append(primary_identifier)

            subject_tags = parser._xpath(record, 'descriptivedetail/subject')
            subjects = []

            weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
            for tag in subject_tags:
                type = parser.text_of_subtag(tag, 'b067')
                if type in cls.SUBJECT_TYPES:
                    subjects.append(
                        SubjectData(
                            cls.SUBJECT_TYPES[type],
                            parser.text_of_subtag(tag, 'b069'),
                            weight=weight
                        )
                    )

            audience_tags = parser._xpath(record, 'descriptivedetail/audience/b204')
            audiences = []
            for tag in audience_tags:
                if tag.text in cls.AUDIENCE_TYPES:
                    subjects.append(
                        SubjectData(
                            Subject.FREEFORM_AUDIENCE,
                            cls.AUDIENCE_TYPES[tag.text],
                            weight=weight
                        )
                    )

            contributor_tags = parser._xpath(record, 'descriptivedetail/contributor')
            contributors = []
            for tag in contributor_tags:
                type = parser.text_of_subtag(tag, 'b035')
                if type in cls.CONTRIBUTOR_TYPES:
                    display_name = parser.text_of_subtag(tag, 'b036')
                    sort_name = parser.text_of_optional_subtag(tag, 'b037')
                    family_name = parser.text_of_optional_subtag(tag, 'b040')
                    bio = parser.text_of_optional_subtag(tag, 'b044')
                    contributors.append(ContributorData(sort_name=sort_name,
                                                        display_name=display_name,
                                                        family_name=family_name,
                                                        roles=[cls.CONTRIBUTOR_TYPES[type]],
                                                        biography=bio))

            collateral_tags = parser._xpath(record, 'collateraldetail/textcontent')
            links = []
            for tag in collateral_tags:
                type = parser.text_of_subtag(tag, 'x426')
                # TODO: '03' is the summary in the example I'm testing, but that
                # might not be generally true.
                if type == '03':
                    text = parser.text_of_subtag(tag, 'd104')
                    links.append(LinkData(rel=Hyperlink.DESCRIPTION,
                                          media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                                          content=text))

            metadata_records.append(Metadata(
                data_source=data_source_name,
                title=title,
                subtitle=subtitle,
                language=language,
                medium=Edition.BOOK_MEDIUM,
                publisher=publisher,
                imprint=imprint,
                issued=issued,
                primary_identifier=primary_identifier,
                identifiers=identifiers,
                subjects=subjects,
                contributors=contributors,
                links=links
            ))
        return metadata_records
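
For orientation, the ONIX short tags consulted above map to these ONIX 3.0
reference names (as given in the ONIX short-tag schema; worth double-checking
against the spec for your release):

    ONIX_SHORT_TAGS = {
        'b203': 'TitleText',
        'b030': 'TitlePrefix',
        'b031': 'TitleWithoutPrefix',
        'b029': 'Subtitle',
        'b252': 'LanguageCode',
        'b081': 'PublisherName',
        'b079': 'ImprintName',
        'b306': 'Date',  # inside the PublishingDate composite
        'b221': 'ProductIDType',
        'b244': 'IDValue',
        'b067': 'SubjectSchemeIdentifier',
        'b069': 'SubjectCode',
        'b204': 'AudienceCodeValue',
        'b035': 'ContributorRole',
        'b036': 'PersonName',
        'b037': 'PersonNameInverted',
        'b040': 'KeyNames',
        'b044': 'BiographicalNote',
        'x426': 'TextType',
        'd104': 'Text',
    }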
Example #12
    def __init__(self, product):
        self.subjects = []
        self.identifiers = []
        self.contributors = []
        self.links = []
        self.product = product
        self.var = defaultdict(list)
        self.unrecognized_tags = dict()
        self.title = None
        for f in self.product.get('varFields', []):
            marctag = MarcTag(f)
            self.var[marctag.marcTag].append(marctag)

        # Find a title.
        for num in ('245', '240'):
            for tag in self.tags(num):
                self.title = tag.a
                if self.title:
                    break
            if self.title:
                break

        # Contributors
        for tag in self.tags('100'):
            role = tag.e or 'author.'
            sort_name = tag.a
            self.contributors.append(
                ContributorData(sort_name=sort_name, roles=[role]))

        # Subjects
        for number in ('050', '908'):
            for tag in self.tags(number):
                # Library of Congress classification
                if tag.a:
                    self.subjects.append(
                        SubjectData(type=Subject.LCC, identifier=tag.a))
                # TODO: tag.b ("Pap 2014eb") includes potentially useful
                # date information.

        for tag in self.tags('856'):
            if tag.subfields.get('3', {}).get('content') == 'Image':
                continue
            if tag.u:
                if tag.y == 'Access eNYPL' or tag.z == 'Access eNYPL':
                    self.links.append(LinkData(rel='alternate', href=tag.u))

        for tag in self.tags('082'):
            if tag.a:
                self.subjects.append(
                    SubjectData(type=Subject.DDC, identifier=tag.a))

        # MARC 650-655 are the topical/genre subject fields. Tag numbers
        # are stored as strings, so convert before lookup.
        for v in range(650, 656):
            for tag in self.tags(str(v)):
                type = getattr(tag, '2', None)
                native_type = Subject.TAG
                if type:
                    if type.endswith('.'):
                        type = type[:-1]
                    Representation.tag_type[type] += 1
                    native_type = self.shadowcat_subject_type_to_native_type.get(
                        type, Subject.TAG)

                identifiers = [x for x in [tag.a, tag.v] if x]
                for identifier in identifiers:
                    self.subjects.append(
                        SubjectData(type=native_type, identifier=identifier))

        # Identifiers
        for tag in self.tags('037'):
            if tag.a and (tag.b in self.marc_037_b_to_identifier_type):
                t = self.marc_037_b_to_identifier_type[tag.b]
                self.identifiers.append(
                    IdentifierData(type=t, identifier=tag.a))

        for tag in self.tags('020'):
            isbn = tag.a
            if not isbn:
                continue
            for r in self.isbn_res:
                m = r.search(isbn)
                if m:
                    isbn = m.groups()[0]
                    self.identifiers.append(
                        IdentifierData(type=Identifier.ISBN, identifier=isbn))

        for key in ['385', '521']:
            for tag in self.tags(key):
                identifier = tag.a
                if identifier.lower() in self.audience_blacklist:
                    continue
                self.subjects.append(
                    SubjectData(type=Subject.FREEFORM_AUDIENCE,
                                identifier=identifier))

        for tag in self.tags('035'):
            potential = tag.a
            identifier = None
            for r, type in self.marc_035_a_to_identifier_type.items():
                m = r.search(potential)
                if m:
                    identifier = m.groups()[0]
                    break
            if identifier:
                self.identifiers.append(
                    IdentifierData(type=type, identifier=identifier))

        # Keep track of items we haven't seen before.
        for key, var in self.var.items():
            if key not in self.known_vars:
                self.unrecognized_tags[key] = var
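
For reference, the tags this constructor walks are the standard MARC 21
bibliographic fields (908 and the eNYPL handling of 856 are local
conventions):

    MARC_TAGS_CONSULTED = {
        '245': 'Title statement',
        '240': 'Uniform title',
        '100': 'Main entry -- personal name (author)',
        '050': 'Library of Congress call number',
        '082': 'Dewey Decimal call number',
        '650': 'Subject added entry -- topical term',  # through 655
        '856': 'Electronic location and access',
        '020': 'ISBN',
        '037': 'Source of acquisition',
        '035': 'System control number',
        '385': 'Audience characteristics',
        '521': 'Target audience note',
    }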
Example #13
File: onix.py Project: lhuabu/circulation
    def parse(cls, file, data_source_name):
        metadata_records = []

        # TODO: ONIX has plain language 'reference names' and short tags that
        # may be used interchangeably. This code currently only handles short tags,
        # and it's not comprehensive.

        parser = XMLParser()
        tree = etree.parse(file)
        root = tree.getroot()

        for record in root.findall('product'):
            title = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b203')
            if not title:
                title_prefix = parser.text_of_optional_subtag(
                    record, 'descriptivedetail/titledetail/titleelement/b030')
                title_without_prefix = parser.text_of_optional_subtag(
                    record, 'descriptivedetail/titledetail/titleelement/b031')
                if title_prefix and title_without_prefix:
                    title = title_prefix + " " + title_without_prefix

            subtitle = parser.text_of_optional_subtag(
                record, 'descriptivedetail/titledetail/titleelement/b029')
            language = parser.text_of_optional_subtag(
                record, 'descriptivedetail/language/b252') or "eng"
            publisher = parser.text_of_optional_subtag(
                record, 'publishingdetail/publisher/b081')
            imprint = parser.text_of_optional_subtag(
                record, 'publishingdetail/imprint/b079')
            if imprint == publisher:
                imprint = None

            publishing_date = parser.text_of_optional_subtag(
                record, 'publishingdetail/publishingdate/b306')
            issued = None
            if publishing_date:
                issued = datetime.datetime.strptime(publishing_date, "%Y%m%d")

            identifier_tags = parser._xpath(record, 'productidentifier')
            identifiers = []
            primary_identifier = None
            for tag in identifier_tags:
                type = parser.text_of_subtag(tag, "b221")
                if type == '02' or type == '15':
                    primary_identifier = IdentifierData(
                        Identifier.ISBN, parser.text_of_subtag(tag, 'b244'))
                    identifiers.append(primary_identifier)

            subject_tags = parser._xpath(record, 'descriptivedetail/subject')
            subjects = []

            weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
            for tag in subject_tags:
                type = parser.text_of_subtag(tag, 'b067')
                if type in cls.SUBJECT_TYPES:
                    subjects.append(
                        SubjectData(cls.SUBJECT_TYPES[type],
                                    parser.text_of_subtag(tag, 'b069'),
                                    weight=weight))

            audience_tags = parser._xpath(record,
                                          'descriptivedetail/audience/b204')
            audiences = []
            for tag in audience_tags:
                if tag.text in cls.AUDIENCE_TYPES:
                    subjects.append(
                        SubjectData(Subject.FREEFORM_AUDIENCE,
                                    cls.AUDIENCE_TYPES[tag.text],
                                    weight=weight))

            contributor_tags = parser._xpath(record,
                                             'descriptivedetail/contributor')
            contributors = []
            for tag in contributor_tags:
                type = parser.text_of_subtag(tag, 'b035')
                if type in cls.CONTRIBUTOR_TYPES:
                    display_name = parser.text_of_subtag(tag, 'b036')
                    sort_name = parser.text_of_optional_subtag(tag, 'b037')
                    family_name = parser.text_of_optional_subtag(tag, 'b040')
                    bio = parser.text_of_optional_subtag(tag, 'b044')
                    contributors.append(
                        ContributorData(sort_name=sort_name,
                                        display_name=display_name,
                                        family_name=family_name,
                                        roles=[cls.CONTRIBUTOR_TYPES[type]],
                                        biography=bio))

            collateral_tags = parser._xpath(record,
                                            'collateraldetail/textcontent')
            links = []
            for tag in collateral_tags:
                type = parser.text_of_subtag(tag, 'x426')
                # TODO: '03' is the summary in the example I'm testing, but that
                # might not be generally true.
                if type == '03':
                    text = parser.text_of_subtag(tag, 'd104')
                    links.append(
                        LinkData(
                            rel=Hyperlink.DESCRIPTION,
                            media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                            content=text))

            usage_constraint_tags = parser._xpath(
                record, 'descriptivedetail/epubusageconstraint')
            licenses_owned = LicensePool.UNLIMITED_ACCESS

            if usage_constraint_tags:
                cls._logger.debug('Found {0} EpubUsageConstraint tags'.format(
                    len(usage_constraint_tags)))

            for usage_constraint_tag in usage_constraint_tags:
                usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                     'x319')

                cls._logger.debug('EpubUsageStatus: {0}'.format(usage_status))

                if usage_status == UsageStatus.PROHIBITED.value:
                    raise Exception('The content is prohibited')
                elif usage_status == UsageStatus.LIMITED.value:
                    usage_limit_tags = parser._xpath(
                        record,
                        'descriptivedetail/epubusageconstraint/epubusagelimit')

                    cls._logger.debug('Found {0} EpubUsageLimit tags'.format(
                        len(usage_limit_tags)))

                    if not usage_limit_tags:
                        continue

                    [usage_limit_tag] = usage_limit_tags

                    usage_unit = parser.text_of_subtag(usage_limit_tag, 'x321')

                    cls._logger.debug('EpubUsageUnit: {0}'.format(usage_unit))

                    if usage_unit in (UsageUnit.COPIES.value,
                                      UsageUnit.CONCURRENT_USERS.value):
                        quantity_limit = parser.text_of_subtag(
                            usage_limit_tag, 'x320')

                        cls._logger.debug(
                            'Quantity: {0}'.format(quantity_limit))

                        if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                            licenses_owned = 0

                        licenses_owned += int(quantity_limit)

            metadata_records.append(
                Metadata(data_source=data_source_name,
                         title=title,
                         subtitle=subtitle,
                         language=language,
                         medium=Edition.BOOK_MEDIUM,
                         publisher=publisher,
                         imprint=imprint,
                         issued=issued,
                         primary_identifier=primary_identifier,
                         identifiers=identifiers,
                         subjects=subjects,
                         contributors=contributors,
                         links=links,
                         circulation=CirculationData(
                             data_source_name,
                             primary_identifier,
                             licenses_owned=licenses_owned,
                             licenses_available=licenses_owned,
                             licenses_reserved=0,
                             patrons_in_hold_queue=0)))

        return metadata_records
Example #14
                    subject_identifier = subject_detail.get('id')
                    metadata.subjects.append(
                        SubjectData(
                            type=subject_type,
                            identifier=subject_identifier,
                            name=subject_name,
                        ))
                else:
                    metadata.subjects.append(
                        SubjectData(type=subject_type,
                                    identifier=subject_detail))

        viafs = [self.VIAF_ID.search(uri) for uri in creator_uris]
        viafs = [viaf.groups()[0] for viaf in viafs if viaf is not None]
        for viaf in viafs:
            metadata.contributors.append(ContributorData(viaf=viaf))

        if creator_uris and not viafs:
            # We vastly prefer VIAF author information over OCLC.
            # We'll only extract OCLC author information if we have
            # _NO_ author information at all.
            contributors_data = []
            for uri in creator_uris:
                external = self.EXTERNAL_PERSON_URI.search(uri)
                if external:
                    contributors_data += self.get_contributors(uri)
                internal = self.INTERNAL_PERSON_URI.search(uri)
                if internal:
                    graphs = self.internal_lookup(subgraph, [uri])
                    for person_graph in graphs:
                        contributor_data = self.extract_contributor(
Example #15
    def __init__(self, data, medium):
        try:
            bestsellers_date = NYTAPI.parse_datetime(data.get("bestsellers_date"))
            first_appearance = bestsellers_date
            most_recent_appearance = bestsellers_date
        except ValueError:
            first_appearance = None
            most_recent_appearance = None

        try:
            # This is the date the _book_ was published, not the date
            # the _bestseller list_ was published.
            published_date = NYTAPI.parse_date(data.get("published_date"))
        except ValueError:
            published_date = None

        details = data["book_details"]
        other_isbns = []
        if len(details) == 0:
            publisher = annotation = primary_isbn10 = primary_isbn13 = title = None
            display_author = None
        else:
            d = details[0]
            title = d.get("title", None)
            display_author = d.get("author", None)
            publisher = d.get("publisher", None)
            annotation = d.get("description", None)
            primary_isbn10 = d.get("primary_isbn10", None)
            primary_isbn13 = d.get("primary_isbn13", None)

            # The list of other ISBNs frequently contains ISBNs for
            # other books in the same series, as well as ISBNs that
            # are just wrong. Assign these equivalencies at a low
            # level of confidence.
            for isbn in d.get("isbns", []):
                isbn13 = isbn.get("isbn13", None)
                if isbn13:
                    other_isbns.append(IdentifierData(Identifier.ISBN, isbn13, 0.50))

        primary_isbn = primary_isbn13 or primary_isbn10
        if primary_isbn:
            primary_isbn = IdentifierData(Identifier.ISBN, primary_isbn, 0.90)

        contributors = []
        if display_author:
            contributors.append(ContributorData(display_name=display_author))

        metadata = Metadata(
            data_source=DataSource.NYT,
            title=title,
            medium=medium,
            language="eng",
            published=published_date,
            publisher=publisher,
            contributors=contributors,
            primary_identifier=primary_isbn,
            identifiers=other_isbns,
        )

        super(NYTBestSellerListTitle, self).__init__(
            metadata, first_appearance, most_recent_appearance, annotation
        )
Example #16
    def extract_viaf_info(self, cluster, working_sort_name=None,
                          working_display_name=False):
        """ Extract name info from a single VIAF cluster.

        :return: a tuple containing:
        - ContributorData object filled with display, sort, family, and wikipedia names.
        - dictionary of ways the xml cluster data matched the names searched for.
        - list of titles attributed to the contributor in the cluster.
        or Nones on error.
        """
        contributor_data = ContributorData()
        contributor_titles = []
        match_confidences = {}

        # Find out if one of the working names shows up in a name record.
        # Note: Potentially sets contributor_data.sort_name.
        match_confidences = self.cluster_has_record_for_named_author(
                cluster, working_sort_name, working_display_name,
                contributor_data
        )

        # Get the VIAF ID for this cluster, just in case we don't have one yet.
        viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
        if viaf_tag is None:
            contributor_data.viaf = None
        else:
            contributor_data.viaf = viaf_tag.text

        # If we don't have a working sort name, find the most popular
        # sort name in this cluster and use it as the sort name.
        sort_name_popularity = self.sort_names_by_popularity(cluster)

        # Does this cluster have a Wikipedia page?
        contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
        if contributor_data.wikipedia_name:
            contributor_data.display_name = self.wikipedia_name_to_display_name(contributor_data.wikipedia_name)
            working_display_name = contributor_data.display_name
            # TODO: There's a problem here when someone's record has a
            # Wikipedia page other than their personal page (e.g. for
            # a band they're in.)

        known_name = working_sort_name or working_display_name
        unimarcs = self._xpath(cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        candidates = []
        for unimarc in unimarcs:
            (possible_given, possible_family,
             possible_extra, possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            # Some part of this name must also show up in the original
            # name for it to even be considered. Otherwise it's a
            # better bet to try to munge the original name.
            for v in (possible_given, possible_family, possible_extra):
                if not v:
                    continue
                if not known_name or v in known_name:
                    self.log.debug(
                        "FOUND %s in %s", v, known_name
                    )
                    candidates.append((possible_given, possible_family, possible_extra))
                    if possible_sort_name:
                        if possible_sort_name.endswith(","):
                            possible_sort_name = possible_sort_name[:-1]
                        sort_name_popularity[possible_sort_name] += 1
                    break
            else:
                self.log.debug(
                    "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                    possible_given, possible_family, possible_extra,
                    known_name
                )

        if sort_name_popularity and not contributor_data.sort_name:
            contributor_data.sort_name, ignore = sort_name_popularity.most_common(1)[0]

        if contributor_data.display_name:
            parts = contributor_data.display_name.split(" ")
            if len(parts) == 2:
                # Pretty clearly given name+family name.
                # If it gets more complicated than this we can't
                # be confident.
                candidates.append(parts + [None])

        display_nameparts = self.best_choice(candidates)
        if display_nameparts[1]: # Family name
            contributor_data.family_name = display_nameparts[1]

        contributor_data.display_name = (
            contributor_data.display_name
            or self.combine_nameparts(*display_nameparts)
            or working_display_name
        )

        # Now go through the title elements, and make a list.
        titles = self._xpath(cluster, './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]')
        for title in titles:
            contributor_titles.append(title.text)

        return contributor_data, match_confidences, contributor_titles
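
sort_names_by_popularity evidently returns a collections.Counter keyed by
sort name: the loop above bumps counts with += 1, and most_common(1) pulls
out the most frequent spelling. In isolation:

    from collections import Counter

    sort_name_popularity = Counter({"Lane, Lois": 3, "Lane, L.": 1})
    # most_common(1) returns [(name, count)] for the top entry.
    best_sort_name, ignore = sort_name_popularity.most_common(1)[0]
    # best_sort_name == "Lane, Lois"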
Example #17
    def record_info_to_metadata(cls, book, availability):
        """Turn Odilo's JSON representation of a book into a Metadata
        object.

        Note:  The json data passed into this method is from a different file/stream
        from the json data that goes into the book_info_to_circulation() method.
        """
        if 'id' not in book:
            return None

        odilo_id = book['id']
        primary_identifier = IdentifierData(Identifier.ODILO_ID, odilo_id)
        active = book.get('active')

        title = book.get('title')
        subtitle = book.get('subtitle')
        series = book.get('series')
        series_position = book.get('seriesPosition')

        contributors = []
        sort_author = book.get('author')
        if sort_author:
            roles = [Contributor.AUTHOR_ROLE]
            display_author = sort_name_to_display_name(sort_author)
            contributor = ContributorData(sort_name=sort_author,
                                          display_name=display_author,
                                          roles=roles,
                                          biography=None)
            contributors.append(contributor)

        publisher = book.get('publisher')

        # Metadata --> Marc21 260$c
        published = book.get('publicationDate')
        if not published:
            # yyyyMMdd --> record creation date
            published = book.get('releaseDate')

        if published:
            try:
                published = datetime.datetime.strptime(published, "%Y%m%d")
            except ValueError as e:
                cls.log.warn('Cannot parse publication date from: ' +
                             published + ', message: ' + str(e))

        # yyyyMMdd --> record last modification date
        last_update = book.get('modificationDate')
        if last_update:
            try:
                last_update = datetime.datetime.strptime(last_update, "%Y%m%d")
            except ValueError as e:
                cls.log.warn('Cannot parse last update date from: ' +
                             last_update + ', message: ' + str(e))

        language = book.get('language', 'spa')

        subjects = []
        for subject in book.get('subjects', []):
            subjects.append(
                SubjectData(type=Subject.TAG, identifier=subject, weight=100))

        for subjectBisacCode in book.get('subjectsBisacCodes', []):
            subjects.append(
                SubjectData(type=Subject.BISAC,
                            identifier=subjectBisacCode,
                            weight=100))

        grade_level = book.get('gradeLevel')
        if grade_level:
            subject = SubjectData(type=Subject.GRADE_LEVEL,
                                  identifier=grade_level,
                                  weight=10)
            subjects.append(subject)

        medium = None
        file_format = book.get('fileFormat')
        formats = []
        for format_received in book.get('formats', []):
            if format_received in cls.format_data_for_odilo_format:
                medium = cls.set_format(format_received, formats)
            elif format_received == cls.ACSM and file_format:
                medium = cls.set_format(
                    format_received + '_' + file_format.upper(), formats)
            else:
                cls.log.warn('Unrecognized format received: ' +
                             format_received)

        if not medium:
            medium = Edition.BOOK_MEDIUM

        identifiers = []
        isbn = book.get('isbn')
        if isbn:
            if isbnlib.is_isbn10(isbn):
                isbn = isbnlib.to_isbn13(isbn)
            identifiers.append(IdentifierData(Identifier.ISBN, isbn, 1))

        # A cover
        links = []
        cover_image_url = book.get('coverImageUrl')
        if cover_image_url:
            image_data = cls.image_link_to_linkdata(cover_image_url,
                                                    Hyperlink.THUMBNAIL_IMAGE)
            if image_data:
                links.append(image_data)

        original_image_url = book.get('originalImageUrl')
        if original_image_url:
            image_data = cls.image_link_to_linkdata(original_image_url,
                                                    Hyperlink.IMAGE)
            if image_data:
                links.append(image_data)

        # Descriptions become links.
        description = book.get('description')
        if description:
            links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type="text/html"))

        metadata = Metadata(data_source=DataSource.ODILO,
                            title=title,
                            subtitle=subtitle,
                            language=language,
                            medium=medium,
                            series=series,
                            series_position=series_position,
                            publisher=publisher,
                            published=published,
                            primary_identifier=primary_identifier,
                            identifiers=identifiers,
                            subjects=subjects,
                            contributors=contributors,
                            links=links,
                            data_source_last_updated=last_update)

        metadata.circulation = OdiloRepresentationExtractor.record_info_to_circulation(
            availability)
        # If the book is not active, it still exists in Odilo but is no
        # longer part of the collection (it could become available again
        # in the future), so report zero owned licenses.
        if not active:
            metadata.circulation.licenses_owned = 0
        metadata.circulation.formats = formats

        return metadata, active
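
The ISBN normalization above leans on isbnlib, which converts any ISBN-10
Odilo hands back into the ISBN-13 form the rest of the pipeline expects.
For example (sample ISBN):

    import isbnlib

    isbn = "0375504583"
    if isbnlib.is_isbn10(isbn):
        isbn = isbnlib.to_isbn13(isbn)
    # isbn is now "9780375504587"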
Example #18
    def extract_viaf_info(self,
                          cluster,
                          working_sort_name=None,
                          working_display_name=False):
        """ Extract name info from a single VIAF cluster.

        :return: a tuple containing: 
        - ContributorData object filled with display, sort, family, and wikipedia names.
        - dictionary of ways the xml cluster data matched the names searched for.
        - list of titles attributed to the contributor in the cluster.
        or Nones on error.
        """
        contributor_data = ContributorData()
        contributor_titles = []
        match_confidences = {}

        # Find out if one of the working names shows up in a name record.
        # Note: Potentially sets contributor_data.sort_name.
        match_confidences = self.cluster_has_record_for_named_author(
            cluster, working_sort_name, working_display_name, contributor_data)

        # Get the VIAF ID for this cluster, just in case we don't have one yet.
        viaf_tag = self._xpath1(cluster, './/*[local-name()="viafID"]')
        if viaf_tag is None:
            contributor_data.viaf = None
        else:
            contributor_data.viaf = viaf_tag.text

        # If we don't have a working sort name, find the most popular
        # sort name in this cluster and use it as the sort name.
        sort_name_popularity = self.sort_names_by_popularity(cluster)

        # Does this cluster have a Wikipedia page?
        contributor_data.wikipedia_name = self.extract_wikipedia_name(cluster)
        if contributor_data.wikipedia_name:
            contributor_data.display_name = self.wikipedia_name_to_display_name(
                contributor_data.wikipedia_name)
            working_display_name = contributor_data.display_name
            # TODO: There's a problem here when someone's record has a
            # Wikipedia page other than their personal page (e.g. for
            # a band they're in.)

        known_name = working_sort_name or working_display_name
        unimarcs = self._xpath(
            cluster, './/*[local-name()="datafield"][@dtype="UNIMARC"]')
        candidates = []
        for unimarc in unimarcs:
            (possible_given, possible_family, possible_extra,
             possible_sort_name) = self.extract_name_from_unimarc(unimarc)
            # Some part of this name must also show up in the original
            # name for it to even be considered. Otherwise it's a
            # better bet to try to munge the original name.
            for v in (possible_given, possible_family, possible_extra):
                if not v:
                    continue
                if not known_name or v in known_name:
                    self.log.debug("FOUND %s in %s", v, known_name)
                    candidates.append(
                        (possible_given, possible_family, possible_extra))
                    if possible_sort_name:
                        if possible_sort_name.endswith(","):
                            possible_sort_name = possible_sort_name[:-1]
                        sort_name_popularity[possible_sort_name] += 1
                    break
            else:
                self.log.debug(
                    "EXCLUDED %s/%s/%s for lack of resemblance to %s",
                    possible_given, possible_family, possible_extra,
                    known_name)

        if sort_name_popularity and not contributor_data.sort_name:
            contributor_data.sort_name, ignore = sort_name_popularity.most_common(
                1)[0]

        if contributor_data.display_name:
            parts = contributor_data.display_name.split(" ")
            if len(parts) == 2:
                # Pretty clearly given name+family name.
                # If it gets more complicated than this we can't
                # be confident.
                candidates.append(parts + [None])

        display_nameparts = self.best_choice(candidates)
        if display_nameparts[1]:  # Family name
            contributor_data.family_name = display_nameparts[1]

        contributor_data.display_name = contributor_data.display_name or self.combine_nameparts(
            *display_nameparts) or working_display_name

        # Now go through the title elements, and make a list.
        titles = self._xpath(
            cluster,
            './/*[local-name()="titles"]/*[local-name()="work"]/*[local-name()="title"]'
        )
        for title in titles:
            contributor_titles.append(title.text)

        return contributor_data, match_confidences, contributor_titles
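
A minimal usage sketch for the extractor above, assuming `parser` is an instance of the class that defines extract_viaf_info and `cluster` is a single parsed VIAF cluster element; the names and arguments here are illustrative only.

# Hypothetical call; only extract_viaf_info and its return shape come
# from the code above.
contributor_data, match_confidences, titles = parser.extract_viaf_info(
    cluster, working_sort_name="Lane, Lois")
if contributor_data is not None:
    print(contributor_data.sort_name, contributor_data.viaf)
    print(titles)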
Example #19
class TestCirculationMonitor(Axis360Test):

    BIBLIOGRAPHIC_DATA = Metadata(
        DataSource.AXIS_360,
        publisher=u'Random House Inc',
        language='eng',
        title=u'Faith of My Fathers : A Family Memoir',
        imprint=u'Random House Inc2',
        published=datetime.datetime(2000, 3, 7, 0, 0),
        primary_identifier=IdentifierData(type=Identifier.AXIS_360_ID,
                                          identifier=u'0003642860'),
        identifiers=[
            IdentifierData(type=Identifier.ISBN, identifier=u'9780375504587')
        ],
        contributors=[
            ContributorData(sort_name=u"McCain, John",
                            roles=[Contributor.PRIMARY_AUTHOR_ROLE]),
            ContributorData(sort_name=u"Salter, Mark",
                            roles=[Contributor.AUTHOR_ROLE]),
        ],
        subjects=[
            SubjectData(type=Subject.BISAC,
                        identifier=u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
            SubjectData(type=Subject.FREEFORM_AUDIENCE, identifier=u'Adult'),
        ],
    )

    AVAILABILITY_DATA = CirculationData(
        data_source=DataSource.AXIS_360,
        primary_identifier=BIBLIOGRAPHIC_DATA.primary_identifier,
        licenses_owned=9,
        licenses_available=8,
        licenses_reserved=0,
        patrons_in_hold_queue=0,
        last_checked=datetime.datetime(2015, 5, 20, 2, 9, 8),
    )

    def test_process_book(self):
        integration, ignore = create(
            self._db,
            ExternalIntegration,
            goal=ExternalIntegration.ANALYTICS_GOAL,
            protocol="core.local_analytics_provider",
        )

        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, license_pool = monitor.process_book(self.BIBLIOGRAPHIC_DATA,
                                                     self.AVAILABILITY_DATA)
        eq_(u'Faith of My Fathers : A Family Memoir', edition.title)
        eq_(u'eng', edition.language)
        eq_(u'Random House Inc', edition.publisher)
        eq_(u'Random House Inc2', edition.imprint)

        eq_(Identifier.AXIS_360_ID, edition.primary_identifier.type)
        eq_(u'0003642860', edition.primary_identifier.identifier)

        [isbn] = [
            x for x in edition.equivalent_identifiers()
            if x is not edition.primary_identifier
        ]
        eq_(Identifier.ISBN, isbn.type)
        eq_(u'9780375504587', isbn.identifier)

        eq_(
            ["McCain, John", "Salter, Mark"],
            sorted([x.sort_name for x in edition.contributors]),
        )

        subs = sorted((x.subject.type, x.subject.identifier)
                      for x in edition.primary_identifier.classifications)
        eq_([(Subject.BISAC, u'BIOGRAPHY & AUTOBIOGRAPHY / Political'),
             (Subject.FREEFORM_AUDIENCE, u'Adult')], subs)

        eq_(9, license_pool.licenses_owned)
        eq_(8, license_pool.licenses_available)
        eq_(0, license_pool.patrons_in_hold_queue)
        eq_(datetime.datetime(2015, 5, 20, 2, 9, 8), license_pool.last_checked)

        # Three circulation events were created, backdated to the
        # last_checked date of the license pool.
        events = license_pool.circulation_events
        eq_([
            u'distributor_title_add', u'distributor_check_in',
            u'distributor_license_add'
        ], [x.type for x in events])
        for e in events:
            eq_(e.start, license_pool.last_checked)

        # A presentation-ready work has been created for the LicensePool.
        work = license_pool.work
        eq_(True, work.presentation_ready)
        eq_("Faith of My Fathers : A Family Memoir", work.title)

        # A CoverageRecord has been provided for this book in the Axis
        # 360 bibliographic coverage provider, so that in the future
        # it doesn't have to make a separate API request to ask about
        # this book.
        records = [
            x for x in license_pool.identifier.coverage_records if
            x.data_source.name == DataSource.AXIS_360 and x.operation is None
        ]
        eq_(1, len(records))

    def test_process_book_updates_old_licensepool(self):
        """If the LicensePool already exists, the circulation monitor
        updates it.
        """
        edition, licensepool = self._edition(
            with_license_pool=True,
            identifier_type=Identifier.AXIS_360_ID,
            identifier_id=u'0003642860')
        # We start off with availability information based on the
        # default for test data.
        eq_(1, licensepool.licenses_owned)

        identifier = IdentifierData(
            type=licensepool.identifier.type,
            identifier=licensepool.identifier.identifier)
        metadata = Metadata(DataSource.AXIS_360, primary_identifier=identifier)
        monitor = Axis360CirculationMonitor(
            self._db,
            self.collection,
            api_class=MockAxis360API,
            metadata_client=MockMetadataWranglerOPDSLookup('url'))
        edition, licensepool = monitor.process_book(metadata,
                                                    self.AVAILABILITY_DATA)

        # Now we have information based on the CirculationData.
        eq_(9, licensepool.licenses_owned)
Example #20
    def parse(cls, file, data_source_name, default_medium=None):
        metadata_records = []

        # TODO: ONIX has plain-language 'reference names' and short tags that
        # may be used interchangeably. This code currently handles only short
        # tags, and it's not comprehensive.

        parser = XMLParser()
        tree = etree.parse(file)
        root = tree.getroot()

        for record in root.findall("product"):
            title = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b203")
            if not title:
                title_prefix = parser.text_of_optional_subtag(
                    record, "descriptivedetail/titledetail/titleelement/b030")
                title_without_prefix = parser.text_of_optional_subtag(
                    record, "descriptivedetail/titledetail/titleelement/b031")
                if title_prefix and title_without_prefix:
                    title = title_prefix + " " + title_without_prefix

            medium = parser.text_of_optional_subtag(record, "b385")

            # Map the ONIX product content type to one of our medium
            # constants; if the record has no content type and the caller
            # supplied a default, the default is used as-is.
            if not medium and default_medium:
                medium = default_medium
            else:
                medium = cls.PRODUCT_CONTENT_TYPES.get(
                    medium, EditionConstants.BOOK_MEDIUM)

            subtitle = parser.text_of_optional_subtag(
                record, "descriptivedetail/titledetail/titleelement/b029")
            language = (parser.text_of_optional_subtag(
                record, "descriptivedetail/language/b252") or "eng")
            publisher = parser.text_of_optional_subtag(
                record, "publishingdetail/publisher/b081")
            imprint = parser.text_of_optional_subtag(
                record, "publishingdetail/imprint/b079")
            if imprint == publisher:
                imprint = None

            publishing_date = parser.text_of_optional_subtag(
                record, "publishingdetail/publishingdate/b306")
            issued = None
            if publishing_date:
                issued = dateutil.parser.isoparse(publishing_date)
                if issued.tzinfo is None:
                    cls._logger.warning(
                        "Publishing date {} does not contain timezone information. Assuming UTC."
                        .format(publishing_date))
                issued = to_utc(issued)

            identifier_tags = parser._xpath(record, "productidentifier")
            identifiers = []
            primary_identifier = None
            for tag in identifier_tags:
                type = parser.text_of_subtag(tag, "b221")
                # ONIX identifier types "02" and "15" are ISBN-10 and
                # ISBN-13 respectively.
                if type in ("02", "15"):
                    primary_identifier = IdentifierData(
                        Identifier.ISBN, parser.text_of_subtag(tag, "b244"))
                    identifiers.append(primary_identifier)

            subject_tags = parser._xpath(record, "descriptivedetail/subject")
            subjects = []

            weight = Classification.TRUSTED_DISTRIBUTOR_WEIGHT
            for tag in subject_tags:
                type = parser.text_of_subtag(tag, "b067")
                if type in cls.SUBJECT_TYPES:
                    b069 = parser.text_of_optional_subtag(tag, "b069")

                    if b069:
                        subjects.append(
                            SubjectData(cls.SUBJECT_TYPES[type],
                                        b069,
                                        weight=weight))

            audience_tags = parser._xpath(record,
                                          "descriptivedetail/audience/b204")
            for tag in audience_tags:
                if tag.text in cls.AUDIENCE_TYPES:
                    subjects.append(
                        SubjectData(
                            Subject.FREEFORM_AUDIENCE,
                            cls.AUDIENCE_TYPES[tag.text],
                            weight=weight,
                        ))

            # TODO: We don't handle ONIX unnamed and alternatively named contributors.
            contributor_tags = parser._xpath(record,
                                             "descriptivedetail/contributor")
            contributors = []
            for tag in contributor_tags:
                type = parser.text_of_subtag(tag, "b035")
                if type in cls.CONTRIBUTOR_TYPES:
                    person_name_display = parser.text_of_optional_subtag(
                        tag, "b036")
                    person_name_inverted = parser.text_of_optional_subtag(
                        tag, "b037")
                    corp_name_display = parser.text_of_optional_subtag(
                        tag, "b047")
                    corp_name_inverted = parser.text_of_optional_subtag(
                        tag, "x443")
                    bio = parser.text_of_optional_subtag(tag, "b044")
                    family_name = None
                    if person_name_display or person_name_inverted:
                        display_name = person_name_display
                        sort_name = person_name_inverted
                        family_name = parser.text_of_optional_subtag(
                            tag, "b040")
                    elif corp_name_display or corp_name_inverted:
                        display_name = corp_name_display
                        # Sort form for corporate name might just be the display name
                        sort_name = corp_name_inverted or corp_name_display
                    else:
                        sort_name = display_name = None
                    contributors.append(
                        ContributorData(
                            sort_name=sort_name,
                            display_name=display_name,
                            family_name=family_name,
                            roles=[cls.CONTRIBUTOR_TYPES[type]],
                            biography=bio,
                        ))

            collateral_tags = parser._xpath(record,
                                            "collateraldetail/textcontent")
            links = []
            for tag in collateral_tags:
                type = parser.text_of_subtag(tag, "x426")
                # TODO: '03' is the summary in the example I'm testing, but that
                # might not be generally true.
                if type == "03":
                    text = parser.text_of_subtag(tag, "d104")
                    links.append(
                        LinkData(
                            rel=Hyperlink.DESCRIPTION,
                            media_type=Representation.TEXT_HTML_MEDIA_TYPE,
                            content=text,
                        ))

            usage_constraint_tags = parser._xpath(
                record, "descriptivedetail/epubusageconstraint")
            licenses_owned = LicensePool.UNLIMITED_ACCESS

            if usage_constraint_tags:
                cls._logger.debug("Found {0} EpubUsageConstraint tags".format(
                    len(usage_constraint_tags)))

            for usage_constraint_tag in usage_constraint_tags:
                usage_status = parser.text_of_subtag(usage_constraint_tag,
                                                     "x319")

                cls._logger.debug("EpubUsageStatus: {0}".format(usage_status))

                if usage_status == UsageStatus.PROHIBITED.value:
                    raise Exception("The content is prohibited")
                elif usage_status == UsageStatus.LIMITED.value:
                    usage_limit_tags = parser._xpath(
                        record,
                        "descriptivedetail/epubusageconstraint/epubusagelimit")

                    cls._logger.debug("Found {0} EpubUsageLimit tags".format(
                        len(usage_limit_tags)))

                    if not usage_limit_tags:
                        continue

                    [usage_limit_tag] = usage_limit_tags

                    usage_unit = parser.text_of_subtag(usage_limit_tag, "x321")

                    cls._logger.debug("EpubUsageUnit: {0}".format(usage_unit))

                    if usage_unit in (UsageUnit.COPIES.value,
                                      UsageUnit.CONCURRENT_USERS.value):
                        quantity_limit = parser.text_of_subtag(
                            usage_limit_tag, "x320")

                        cls._logger.debug(
                            "Quantity: {0}".format(quantity_limit))

                        if licenses_owned == LicensePool.UNLIMITED_ACCESS:
                            licenses_owned = 0

                        licenses_owned += int(quantity_limit)

            metadata_records.append(
                Metadata(
                    data_source=data_source_name,
                    title=title,
                    subtitle=subtitle,
                    language=language,
                    medium=medium,
                    publisher=publisher,
                    imprint=imprint,
                    issued=issued,
                    primary_identifier=primary_identifier,
                    identifiers=identifiers,
                    subjects=subjects,
                    contributors=contributors,
                    links=links,
                    circulation=CirculationData(
                        data_source_name,
                        primary_identifier,
                        licenses_owned=licenses_owned,
                        licenses_available=licenses_owned,
                        licenses_reserved=0,
                        patrons_in_hold_queue=0,
                    ),
                ))

        return metadata_records
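
A brief sketch of driving this parser, assuming the method is a classmethod on an extractor class (named ONIXExtractor here for illustration); the file name and data source name are placeholders.

# Hypothetical usage; ONIXExtractor and the file/data-source names are
# assumptions for illustration.
with open("onix_feed.xml", "rb") as fh:
    records = ONIXExtractor.parse(fh, "ONIX Importer",
                                  default_medium=EditionConstants.BOOK_MEDIUM)
for metadata in records:
    print(metadata.title, metadata.circulation.licenses_owned)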
Example #21
    def test_viaf_authors_get_viaf_lookup(self):
        # TODO: The code this calls could be refactored quite a bit --
        # we don't really need to test all of process_item() here.
        # But ATM it does seem to be our only test of process_item().

        oclc = MockOCLCLinkedDataAPI()
        viaf = MockVIAFClient()
        provider = LinkedDataCoverageProvider(self._db,
                                              api=oclc,
                                              viaf_api=viaf)

        # Here's a placeholder that will be filled in with information from
        # OCLC Linked Data.
        edition = self._edition()
        for i in edition.contributions:
            self._db.delete(i)
        self._db.commit()
        identifier = edition.primary_identifier

        # OCLC Linked Data is going to mention three authors -- one with
        # a VIAF but no sort name, one with a sort name + VIAF, and one
        # with no VIAF at all.
        contributor1 = ContributorData(viaf="1")
        contributor2 = ContributorData(viaf="2", sort_name="Jordan, Robert")
        contributor3 = ContributorData(sort_name="Rice, Anne",
                                       display_name="Anne Rice")
        idata = IdentifierData(type=identifier.type,
                               identifier=identifier.identifier)
        metadata = Metadata(
            DataSource.OCLC_LINKED_DATA,
            contributors=[contributor1, contributor2, contributor3],
            primary_identifier=idata,
            title=u"foo")
        oclc.queue_info_for(metadata)

        # Our OCLC Linked Data client is going to try to fill in the
        # data, asking VIAF only about the contributors that have
        # VIAF IDs.
        lookup1 = (ContributorData(viaf="1",
                                   display_name="Display Name",
                                   family_name="Family",
                                   sort_name="Name, Sort",
                                   wikipedia_name="Wikipedia_Name"), None,
                   None)
        lookup2 = (ContributorData(viaf="2",
                                   wikipedia_name="Robert_Jordan_(Author)",
                                   biography="That guy."), None, None)
        viaf.queue_lookup(lookup1, lookup2, "Unrequested lookup")

        provider.process_item(identifier)

        # Both VIAF-identified authors have had their information updated
        # with the VIAF results.
        filled_in = sorted([(x.sort_name, x.display_name, x.viaf,
                             x.wikipedia_name, x.biography)
                            for x in edition.contributors])
        eq_([(u'Jordan, Robert', None, u'2', u'Robert_Jordan_(Author)',
              u'That guy.'),
             (u'Name, Sort', u'Display Name', u'1', u'Wikipedia_Name', None),
             (u'Rice, Anne', u'Anne Rice', None, None, None)], filled_in)
        # The author without VIAF data didn't request a VIAF lookup.
        # Instead, that result is still in the mock VIAF queue.
        eq_(["Unrequested lookup"], viaf.results)
Example #22
    def add_with_metadata(self, collection_details):
        """Adds identifiers with their metadata to a Collection's catalog"""
        client = authenticated_client_from_request(self._db)
        if isinstance(client, ProblemDetail):
            return client

        collection = collection_from_details(self._db, client,
                                             collection_details)

        data_source = DataSource.lookup(self._db,
                                        collection.name,
                                        autocreate=True)

        feed = feedparser.parse(request.data)
        entries = feed.get("entries", [])
        entries_by_urn = {entry.get('id'): entry for entry in entries}

        identifiers_by_urn, invalid_urns = Identifier.parse_urns(
            self._db, entries_by_urn.keys())

        messages = []

        for urn in invalid_urns:
            messages.append(
                OPDSMessage(urn, INVALID_URN.status_code, INVALID_URN.detail))

        for urn, identifier in identifiers_by_urn.items():
            entry = entries_by_urn[urn]
            status = HTTP_OK
            description = "Already in catalog"

            if identifier not in collection.catalog:
                collection.catalog_identifier(identifier)
                status = HTTP_CREATED
                description = "Successfully added"

            message = OPDSMessage(urn, status, description)

            # Get a cover if it exists.
            image_types = {Hyperlink.IMAGE, Hyperlink.THUMBNAIL_IMAGE}
            images = [
                link for link in entry.get("links", [])
                if link.get("rel") in image_types
            ]
            links = [
                LinkData(image.get("rel"), image.get("href"))
                for image in images
            ]

            # Create an edition to hold the title and author. LicensePool.calculate_work
            # refuses to create a Work when there's no title, and if we have a title, author
            # and language we can attempt to look up the edition in OCLC.
            title = entry.get("title") or "Unknown Title"
            author = ContributorData(sort_name=(entry.get("author")
                                                or Edition.UNKNOWN_AUTHOR),
                                     roles=[Contributor.PRIMARY_AUTHOR_ROLE])
            language = entry.get("dcterms_language")

            presentation = PresentationCalculationPolicy(
                choose_edition=False,
                set_edition_metadata=False,
                classify=False,
                choose_summary=False,
                calculate_quality=False,
                choose_cover=False,
                regenerate_opds_entries=False,
            )
            replace = ReplacementPolicy(
                presentation_calculation_policy=presentation)
            metadata = Metadata(
                data_source,
                primary_identifier=IdentifierData(identifier.type,
                                                  identifier.identifier),
                title=title,
                language=language,
                contributors=[author],
                links=links,
            )

            edition, ignore = metadata.edition(self._db)
            metadata.apply(edition, collection, replace=replace)

            messages.append(message)

        title = "%s Catalog Item Additions for %s" % (collection.protocol,
                                                      client.url)
        url = self.collection_feed_url("add_with_metadata", collection)
        addition_feed = AcquisitionFeed(self._db,
                                        title,
                                        url, [],
                                        VerboseAnnotator,
                                        precomposed_entries=messages)

        return feed_response(addition_feed)
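
For reference, a sketch of the kind of feedparser entry this controller consumes; the field names follow the lookups above, and the values are invented for illustration.

# Hypothetical entry, shaped the way feedparser surfaces it; only the
# keys read above (id, title, author, dcterms_language, links) matter.
entry = {
    "id": "urn:isbn:9780375504587",
    "title": "Faith of My Fathers",
    "author": "McCain, John",
    "dcterms_language": "eng",
    "links": [{"rel": Hyperlink.IMAGE, "href": "http://example.com/cover.jpg"}],
}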
Example #23
    def lookup_info_to_metadata(self, lookup_representation):
        """Transforms a NoveList JSON representation into a Metadata object"""

        if not lookup_representation.content:
            return None

        lookup_info = json.loads(lookup_representation.content)
        book_info = lookup_info.get('TitleInfo')
        novelist_identifier = book_info.get('ui') if book_info else None
        if not novelist_identifier:
            # NoveList didn't know the ISBN.
            return None

        primary_identifier, ignore = Identifier.for_foreign_id(
            self._db, Identifier.NOVELIST_ID, novelist_identifier)
        metadata = Metadata(self.source, primary_identifier=primary_identifier)

        # Get the equivalent ISBN identifiers.
        metadata.identifiers += self._extract_isbns(book_info)

        author = book_info.get('author')
        if author:
            metadata.contributors.append(ContributorData(sort_name=author))

        description = book_info.get('description')
        if description:
            metadata.links.append(
                LinkData(rel=Hyperlink.DESCRIPTION,
                         content=description,
                         media_type=Representation.TEXT_PLAIN))

        audience_level = book_info.get('audience_level')
        if audience_level:
            metadata.subjects.append(
                SubjectData(Subject.FREEFORM_AUDIENCE, audience_level))

        novelist_rating = book_info.get('rating')
        if novelist_rating:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING, novelist_rating))

        # Extract feature content if it is available.
        series_info = None
        appeals_info = None
        lexile_info = None
        goodreads_info = None
        recommendations_info = None
        feature_content = lookup_info.get('FeatureContent')
        if feature_content:
            series_info = feature_content.get('SeriesInfo')
            appeals_info = feature_content.get('Appeals')
            lexile_info = feature_content.get('LexileInfo')
            goodreads_info = feature_content.get('GoodReads')
            recommendations_info = feature_content.get('SimilarTitles')

        metadata, title_key = self.get_series_information(
            metadata, series_info, book_info)
        metadata.title = book_info.get(title_key)
        subtitle = TitleProcessor.extract_subtitle(metadata.title,
                                                   book_info.get('full_title'))
        metadata.subtitle = self._scrub_subtitle(subtitle)

        # TODO: How well do we trust this data? We could conceivably bump up
        # the weight here.
        if appeals_info:
            extracted_genres = False
            for appeal in appeals_info:
                genres = appeal.get('genres')
                if genres:
                    for genre in genres:
                        metadata.subjects.append(
                            SubjectData(Subject.TAG, genre['Name']))
                        extracted_genres = True
                if extracted_genres:
                    break

        if lexile_info:
            metadata.subjects.append(
                SubjectData(Subject.LEXILE_SCORE, lexile_info['Lexile']))

        if goodreads_info:
            metadata.measurements.append(
                MeasurementData(Measurement.RATING,
                                goodreads_info['average_rating']))

        metadata = self.get_recommendations(metadata, recommendations_info)

        # If nothing interesting comes from the API, ignore it.
        if not (metadata.measurements or metadata.series_position
                or metadata.series or metadata.subjects or metadata.links
                or metadata.subtitle or metadata.recommendations):
            metadata = None
        return metadata
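
A minimal sketch of the payload shape this method consumes, assuming `api` is an instance of the class defining it; SimpleNamespace stands in for the cached Representation object, and the keys mirror the lookups above.

import json
from types import SimpleNamespace

# Hypothetical payload containing only keys the method above reads.
payload = {
    "TitleInfo": {"ui": "10001234", "author": "McCain, John",
                  "description": "A family memoir."},
    "FeatureContent": {},
}
lookup = SimpleNamespace(content=json.dumps(payload))
metadata = api.lookup_info_to_metadata(lookup)  # `api` is assumed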
Example #24
    def parse(cls, file, data_source_name):
        reader = MARCReader(file)
        metadata_records = []

        for record in reader:
            title = record.title()
            if title.endswith(' /'):
                title = title[:-len(' /')]
            issued_year = datetime.datetime.strptime(record.pubyear(), "%Y.")
            publisher = record.publisher()
            if publisher.endswith(','):
                publisher = publisher[:-1]

            links = []
            notes = record.notes()
            summary = notes[0]['a'] if notes else None

            if summary:
                summary_link = LinkData(
                    rel=Hyperlink.DESCRIPTION,
                    media_type=Representation.TEXT_PLAIN,
                    content=summary,
                )
                links.append(summary_link)

            isbn = record['020']['a'].split(" ")[0]
            primary_identifier = IdentifierData(Identifier.ISBN, isbn)

            subjects = [
                SubjectData(
                    Classifier.FAST,
                    subject['a'],
                ) for subject in record.subjects()
            ]

            author = record.author()
            if author:
                # Turn 'Dante Alighieri,   1265-1321, author.'
                # into 'Dante Alighieri'. The metadata wrangler will
                # take it from there.
                for regex in cls.END_OF_AUTHOR_NAME_RES:
                    match = regex.search(author)
                    if match:
                        author = author[:match.start()]
                        break
                author_names = [author]
            else:
                author_names = ['Anonymous']
            contributors = [
                ContributorData(
                    sort_name=author,
                    roles=[Contributor.AUTHOR_ROLE],
                ) for author in author_names
            ]

            metadata_records.append(
                Metadata(data_source=data_source_name,
                         title=title,
                         language='eng',
                         medium=Edition.BOOK_MEDIUM,
                         publisher=publisher,
                         issued=issued_year,
                         primary_identifier=primary_identifier,
                         subjects=subjects,
                         contributors=contributors,
                         links=links))
        return metadata_records
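
Finally, a sketch of invoking this MARC parser, assuming it too is a classmethod on an extractor class (MARCExtractor here for illustration); the file name and data source name are placeholders.

# Hypothetical usage; MARCExtractor and the file name are assumptions.
with open("records.mrc", "rb") as fh:
    for metadata in MARCExtractor.parse(fh, "Example Data Source"):
        print(metadata.title, metadata.primary_identifier.identifier)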