def process_work(self, work):
    """Recalculate genre/audience/target-age assignments for one Work.

    Gathers the primary identifiers of all the work's editions, expands
    them through the identifier equivalency graph (depth 5, threshold
    0.5), and feeds the flattened ID set to Work.assign_genres, which
    updates fiction status, audience, and target age in place.

    Side effects: mutates work.fiction, work.audience and
    work.target_age; prints a line when the target age changed.
    """
    primary_identifier_ids = [
        x.primary_identifier.id for x in work.editions]
    data = Identifier.recursively_equivalent_identifier_ids(
        self._db, primary_identifier_ids, 5, threshold=0.5)
    flattened_data = Identifier.flatten_identifier_ids(data)
    workgenres, work.fiction, work.audience, target_age = work.assign_genres(
        flattened_data)
    old_target_age = work.target_age
    work.target_age = NumericRange(*target_age)
    # Use an identity test for None -- `!= None` invokes rich comparison,
    # which is both unidiomatic and fragile if the bound type overloads it.
    if work.target_age != old_target_age and work.target_age.lower is not None:
        print("%r: %r->%r" % (work.title, old_target_age, work.target_age))
def process_work(self, work):
    """Re-derive this Work's genres, audience and target age.

    Collects the primary-identifier IDs of every edition belonging to
    the work, recursively expands them to equivalent identifiers
    (5 levels deep, 0.5 confidence threshold), flattens the result, and
    hands it to Work.assign_genres.

    Side effects: reassigns work.fiction, work.audience and
    work.target_age; prints a diagnostic line whenever the target age
    actually changed.
    """
    edition_identifier_ids = [
        edition.primary_identifier.id for edition in work.editions
    ]
    equivalency_data = Identifier.recursively_equivalent_identifier_ids(
        self._db, edition_identifier_ids, 5, threshold=0.5
    )
    flattened_ids = Identifier.flatten_identifier_ids(equivalency_data)
    workgenres, work.fiction, work.audience, target_age = work.assign_genres(
        flattened_ids
    )
    old_target_age = work.target_age
    work.target_age = NumericRange(*target_age)
    # `is not None` rather than `!= None`: identity is the correct (and
    # idiomatic) way to test for None.
    if work.target_age != old_target_age and work.target_age.lower is not None:
        print("%r: %r->%r" % (work.title, old_target_age, work.target_age))
def process_item(self, identifier):
    """Look up `identifier` in OCLC Linked Data and apply the results.

    For ISBNs, also expands to equivalent OCLC Numbers and looks those
    up as well. Each returned metadata object is applied to the existing
    OCLC editions for its identifier, or to a newly created edition.

    Returns the identifier on success; on IOError from OCLC, returns a
    failure record via self.failure (non-transient when OCLC reports it
    cannot find the location for the ISBN).
    """
    try:
        new_info_counter = Counter()
        self.log.info("Processing identifier %r", identifier)
        metadatas = [m for m in self.api.info_for(identifier)]

        if identifier.type == Identifier.ISBN:
            # Currently info_for seeks the results of OCLC Work IDs only.
            # This segment will get the metadata of any equivalent OCLC
            # Numbers as well.
            equivalents = Identifier.recursively_equivalent_identifier_ids(
                self._db, [identifier.id])
            oclc_numbers = self._db.query(Identifier).\
                filter(Identifier.id.in_(equivalents)).\
                filter(Identifier.type==Identifier.OCLC_NUMBER).all()
            for oclc_number in oclc_numbers:
                more_metadata = [m for m in self.api.info_for(oclc_number)]
                metadatas += more_metadata

        # Discard empty lookup results.
        metadatas = [m for m in metadatas if m]

        # When metadata is applied, it must be given a client that can
        # respond to 'canonicalize_author_name'. Usually this is an
        # OPDSImporter that reaches out to the Metadata Wrangler, but
        # in the case of being _on_ the Metadata Wrangler...:
        #
        # The client does not depend on any loop variable, so build it
        # once here instead of re-importing and re-instantiating it for
        # every metadata object.
        from canonicalize import AuthorNameCanonicalizer
        metadata_client = AuthorNameCanonicalizer(
            self._db, oclcld=self.api, viaf=self.viaf)

        for metadata in metadatas:
            other_identifier, ignore = metadata.primary_identifier.load(
                self._db)
            oclc_editions = other_identifier.primarily_identifies

            # Keep track of the number of editions OCLC associates
            # with this identifier.
            other_identifier.add_measurement(
                self.data_source, Measurement.PUBLISHED_EDITIONS,
                len(oclc_editions))

            # Clean up contributor information.
            self.apply_viaf_to_contributor_data(metadata)
            # Remove any empty ContributorData objects that may have
            # been created.
            metadata.contributors = filter(
                lambda c: c.sort_name or c.display_name,
                metadata.contributors)

            num_new_isbns = self.new_isbns(metadata)
            new_info_counter['isbns'] += num_new_isbns

            if oclc_editions:
                # There are existing OCLC editions. Apply any new
                # information to them.
                for edition in oclc_editions:
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter)
            else:
                # Create a new OCLC edition to hold the information.
                edition, ignore = get_one_or_create(
                    self._db, Edition, data_source=self.data_source,
                    primary_identifier=other_identifier)
                metadata, new_info_counter = self.apply_metadata_to_edition(
                    edition, metadata, metadata_client, new_info_counter)

            # Set the new OCLC edition's identifier equivalent to this
            # identifier so we know they're related.
            self.set_equivalence(identifier, metadata)

        self.log.info(
            "Total: %(editions)d editions, %(isbns)d ISBNs, "\
            "%(descriptions)d descriptions, %(subjects)d classifications.",
            new_info_counter
        )
    except IOError as e:
        if ", but couldn't find location" in e.message:
            exception = "OCLC doesn't know about this ISBN: %r" % e
            transient = False
        else:
            exception = "OCLC raised an error: %r" % e
            transient = True
        return self.failure(identifier, exception, transient=transient)
    return identifier
def process_item(self, identifier):
    """Look up `identifier` in OCLC Linked Data and apply the metadata.

    Returns the identifier on success, or a failure record (via
    self.failure) when OCLC raises an IOError. For ISBNs, afterwards
    tries to calculate a Work from the newly applied metadata.
    """
    # Books are not looked up in OCLC Linked Data directly, since
    # there is no Collection that identifies a book by its OCLC Number.
    # However, when a book is looked up through OCLC Classify, some
    # OCLC Numbers may be associated with it, and _those_ numbers
    # can be run through OCLC Linked Data.
    try:
        # Tracks how many editions/ISBNs/descriptions/subjects were added.
        new_info_counter = Counter()
        self.log.info("Processing identifier %r", identifier)
        metadatas = [m for m in self.api.info_for(identifier)]
        if identifier.type==Identifier.ISBN:
            # Currently info_for seeks the results of OCLC Work IDs only
            # This segment will get the metadata of any equivalent OCLC
            # Numbers as well.
            equivalents = Identifier.recursively_equivalent_identifier_ids(
                self._db, [identifier.id]
            )
            oclc_numbers = self._db.query(Identifier).\
                filter(Identifier.id.in_(equivalents)).\
                filter(Identifier.type==Identifier.OCLC_NUMBER).all()
            for oclc_number in oclc_numbers:
                more_metadata = [m for m in self.api.info_for(oclc_number)]
                metadatas += more_metadata
        # Drop empty lookup results.
        metadatas = [m for m in metadatas if m]
        for metadata in metadatas:
            other_identifier, ignore = metadata.primary_identifier.load(self._db)
            oclc_editions = other_identifier.primarily_identifies
            # Keep track of the number of editions OCLC associates
            # with this identifier.
            other_identifier.add_measurement(
                self.data_source, Measurement.PUBLISHED_EDITIONS,
                len(oclc_editions)
            )
            # Clean up contributor information.
            self.apply_viaf_to_contributor_data(metadata)
            # Remove any empty ContributorData objects that may have
            # been created.
            metadata.contributors = filter(
                lambda c: c.sort_name or c.display_name,
                metadata.contributors
            )
            # When metadata is applied, it must be given a client that can
            # respond to 'canonicalize_author_name'. Usually this is an
            # OPDSImporter that reaches out to the Metadata Wrangler, but
            # in the case of being _on_ the Metadata Wrangler...:
            from canonicalize import AuthorNameCanonicalizer
            metadata_client = AuthorNameCanonicalizer(
                self._db, oclcld=self.api, viaf=self.viaf
            )
            num_new_isbns = self.new_isbns(metadata)
            new_info_counter['isbns'] += num_new_isbns
            if oclc_editions:
                # There are existing OCLC editions. Apply any new
                # information to them.
                for edition in oclc_editions:
                    metadata, new_info_counter = self.apply_metadata_to_edition(
                        edition, metadata, metadata_client, new_info_counter
                    )
            else:
                # Create a new OCLC edition to hold the information.
                edition, ignore = get_one_or_create(
                    self._db, Edition, data_source=self.data_source,
                    primary_identifier=other_identifier
                )
                metadata, new_info_counter = self.apply_metadata_to_edition(
                    edition, metadata, metadata_client, new_info_counter
                )
            # Set the new OCLC edition's identifier equivalent to this
            # identifier so we know they're related.
            self.set_equivalence(identifier, metadata)
        self.log.info(
            "Total: %(editions)d editions, %(isbns)d ISBNs, "\
            "%(descriptions)d descriptions, %(subjects)d classifications.",
            new_info_counter
        )
    except IOError as e:
        # NOTE(review): e.message is Python-2-only; presumably this file
        # targets Python 2 (print statement used elsewhere) -- confirm
        # before any Python 3 migration.
        if ", but couldn't find location" in e.message:
            exception = "OCLC doesn't know about this ISBN: %r" % e
            transient = False
        else:
            exception = "OCLC raised an error: %r" % e
            transient = True
        return self.failure(identifier, exception, transient=transient)
    # Try to calculate or recalculate a work for ISBNs.
    #
    # We won't do this for other Identifier types because we don't want
    # to overwrite the high-quality metadata direct from the source.
    # With ISBNs, that higher-quality metadata is not available, so we
    # depend on OCLC for title and author information.
    if identifier.type == Identifier.ISBN:
        self.calculate_work_for_isbn(identifier)
    return identifier