def process_work(self, work):
     primary_identifier_ids = [
         x.primary_identifier.id for x in work.editions]
     data = Identifier.recursively_equivalent_identifier_ids(
         self._db, primary_identifier_ids, 5, threshold=0.5)
     flattened_data = Identifier.flatten_identifier_ids(data)
     workgenres, work.fiction, work.audience, target_age = work.assign_genres(
         flattened_data)
     old_target_age = work.target_age
     work.target_age = NumericRange(*target_age)
     if work.target_age != old_target_age and work.target_age.lower != None:
         print "%r: %r->%r" % (work.title, old_target_age, work.target_age)
示例#2
0
 def process_work(self, work):
     primary_identifier_ids = [
         x.primary_identifier.id for x in work.editions
     ]
     data = Identifier.recursively_equivalent_identifier_ids(
         self._db, primary_identifier_ids, 5, threshold=0.5)
     flattened_data = Identifier.flatten_identifier_ids(data)
     workgenres, work.fiction, work.audience, target_age = work.assign_genres(
         flattened_data)
     old_target_age = work.target_age
     work.target_age = NumericRange(*target_age)
     if work.target_age != old_target_age and work.target_age.lower != None:
         print "%r: %r->%r" % (work.title, old_target_age, work.target_age)
示例#3
0
    def process_item(self, identifier):
        try:
            new_info_counter = Counter()
            self.log.info("Processing identifier %r", identifier)
            metadatas = [m for m in self.api.info_for(identifier)]

            if identifier.type == Identifier.ISBN:
                # Currently info_for seeks the results of OCLC Work IDs only
                # This segment will get the metadata of any equivalent OCLC Numbers
                # as well.
                equivalents = Identifier.recursively_equivalent_identifier_ids(
                    self._db, [identifier.id])
                oclc_numbers = self._db.query(Identifier).\
                    filter(Identifier.id.in_(equivalents)).\
                    filter(Identifier.type==Identifier.OCLC_NUMBER).all()
                for oclc_number in oclc_numbers:
                    more_metadata = [m for m in self.api.info_for(oclc_number)]
                    metadatas += more_metadata
                    metadatas = [m for m in metadatas if m]

            for metadata in metadatas:
                other_identifier, ignore = metadata.primary_identifier.load(
                    self._db)
                oclc_editions = other_identifier.primarily_identifies

                # Keep track of the number of editions OCLC associates
                # with this identifier.
                other_identifier.add_measurement(
                    self.data_source, Measurement.PUBLISHED_EDITIONS,
                    len(oclc_editions))

                # Clean up contributor information.
                self.apply_viaf_to_contributor_data(metadata)
                # Remove any empty ContributorData objects that may have
                # been created.
                metadata.contributors = filter(
                    lambda c: c.sort_name or c.display_name,
                    metadata.contributors)

                # When metadata is applied, it must be given a client that can
                # response to 'canonicalize_author_name'. Usually this is an
                # OPDSImporter that reaches out to the Metadata Wrangler, but
                # in the case of being _on_ the Metadata Wrangler...:
                from canonicalize import AuthorNameCanonicalizer
                metadata_client = AuthorNameCanonicalizer(self._db,
                                                          oclcld=self.api,
                                                          viaf=self.viaf)

                num_new_isbns = self.new_isbns(metadata)
                new_info_counter['isbns'] += num_new_isbns
                if oclc_editions:
                    # There are existing OCLC editions. Apply any new information to them.
                    for edition in oclc_editions:
                        metadata, new_info_counter = self.apply_metadata_to_edition(
                            edition, metadata, metadata_client,
                            new_info_counter)
                else:
                    # Create a new OCLC edition to hold the information.
                    edition, ignore = get_one_or_create(
                        self._db,
                        Edition,
                        data_source=self.data_source,
                        primary_identifier=other_identifier)
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter)
                    # Set the new OCLC edition's identifier equivalent to this
                    # identifier so we know they're related.
                    self.set_equivalence(identifier, metadata)

                self.log.info(
                    "Total: %(editions)d editions, %(isbns)d ISBNs, "\
                    "%(descriptions)d descriptions, %(subjects)d classifications.",
                    new_info_counter
                )
        except IOError as e:
            if ", but couldn't find location" in e.message:
                exception = "OCLC doesn't know about this ISBN: %r" % e
                transient = False
            else:
                exception = "OCLC raised an error: %r" % e
                transient = True
            return self.failure(identifier, exception, transient=transient)
        return identifier
    def process_item(self, identifier):
        # Books are not looked up in OCLC Linked Data directly, since
        # there is no Collection that identifies a book by its OCLC Number.
        # However, when a book is looked up through OCLC Classify, some
        # OCLC Numbers may be associated with it, and _those_ numbers
        # can be run through OCLC Linked Data.
        try:
            new_info_counter = Counter()
            self.log.info("Processing identifier %r", identifier)
            metadatas = [m for m in self.api.info_for(identifier)]

            if identifier.type==Identifier.ISBN:
                # Currently info_for seeks the results of OCLC Work IDs only
                # This segment will get the metadata of any equivalent OCLC Numbers
                # as well.
                equivalents = Identifier.recursively_equivalent_identifier_ids(
                    self._db, [identifier.id]
                )
                oclc_numbers = self._db.query(Identifier).\
                    filter(Identifier.id.in_(equivalents)).\
                    filter(Identifier.type==Identifier.OCLC_NUMBER).all()
                for oclc_number in oclc_numbers:
                    more_metadata = [m for m in self.api.info_for(oclc_number)]
                    metadatas += more_metadata
                    metadatas = [m for m in metadatas if m]

            for metadata in metadatas:
                other_identifier, ignore = metadata.primary_identifier.load(self._db)
                oclc_editions = other_identifier.primarily_identifies

                # Keep track of the number of editions OCLC associates
                # with this identifier.
                other_identifier.add_measurement(
                    self.data_source, Measurement.PUBLISHED_EDITIONS,
                    len(oclc_editions)
                )

                # Clean up contributor information.
                self.apply_viaf_to_contributor_data(metadata)
                # Remove any empty ContributorData objects that may have
                # been created.
                metadata.contributors = filter(
                    lambda c: c.sort_name or c.display_name,
                    metadata.contributors
                )

                # When metadata is applied, it must be given a client that can
                # response to 'canonicalize_author_name'. Usually this is an
                # OPDSImporter that reaches out to the Metadata Wrangler, but
                # in the case of being _on_ the Metadata Wrangler...:
                from canonicalize import AuthorNameCanonicalizer
                metadata_client = AuthorNameCanonicalizer(
                    self._db, oclcld=self.api, viaf=self.viaf
                )

                num_new_isbns = self.new_isbns(metadata)
                new_info_counter['isbns'] += num_new_isbns
                if oclc_editions:
                    # There are existing OCLC editions. Apply any new information to them.
                    for edition in oclc_editions:
                        metadata, new_info_counter = self.apply_metadata_to_edition(
                            edition, metadata, metadata_client, new_info_counter
                        )
                else:
                    # Create a new OCLC edition to hold the information.
                    edition, ignore = get_one_or_create(
                        self._db, Edition, data_source=self.data_source,
                        primary_identifier=other_identifier
                    )
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter
                    )
                    # Set the new OCLC edition's identifier equivalent to this
                    # identifier so we know they're related.
                    self.set_equivalence(identifier, metadata)

                self.log.info(
                    "Total: %(editions)d editions, %(isbns)d ISBNs, "\
                    "%(descriptions)d descriptions, %(subjects)d classifications.",
                    new_info_counter
                )
        except IOError as e:
            if ", but couldn't find location" in e.message:
                exception = "OCLC doesn't know about this ISBN: %r" % e
                transient = False
            else:
                exception = "OCLC raised an error: %r" % e
                transient = True
            return self.failure(identifier, exception, transient=transient)

        # Try to calculate or recalculate a work for ISBNs.
        #
        # We won't do this for other Identifier types because we don't want
        # to overwrite the high-quality metadata direct from the source.
        # With ISBNs, that higher-quality metadata is not available, so we
        # depend on OCLC for title and author information.
        if identifier.type == Identifier.ISBN:
            self.calculate_work_for_isbn(identifier)

        return identifier