def test_confirm_same_identifier(self):
    """_confirm_same_identifier() accepts a list of Metadata objects only
    when they all share the same primary identifier.
    """
    novelist = DataSource.lookup(self._db, DataSource.NOVELIST)

    # Two distinct NoveList identifiers.
    shared_id, _ = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '84752928'
    )
    other_id, _ = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, '23781947'
    )

    # Two Metadata objects agree on the identifier; the third does not.
    first = Metadata(novelist, primary_identifier=shared_id)
    twin = Metadata(novelist, primary_identifier=shared_id)
    stray = Metadata(novelist, primary_identifier=other_id)

    confirm = self.novelist._confirm_same_identifier
    eq_(False, confirm([first, stray]))
    eq_(True, confirm([first, twin]))
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier.

    No lookup is performed: the ISBN's own identifier string is reused
    verbatim as the OCLC Number.
    """
    # Let's pretend any id can be an oclc id.
    oclc_identifier, _is_new = Identifier.for_foreign_id(
        self._db,
        Identifier.OCLC_NUMBER,
        isbn.identifier,
        autocreate=True,
    )
    return oclc_identifier
def handle_event(self, threem_id, isbn, foreign_patron_id,
                 start_time, end_time, internal_event_type):
    """Record one circulation event for a 3M title.

    Finds (or creates) the LicensePool for `threem_id`, links the 3M
    identifier to the given ISBN, logs a CirculationEvent of type
    `internal_event_type`, and -- for a newly created pool -- also logs
    a TITLE_ADD event.

    :return: `start_time`, unchanged.
    """
    source = self.api.source

    # Find or lookup the LicensePool for this event.
    license_pool, is_new = LicensePool.for_foreign_id(
        self._db, source, Identifier.THREEM_ID, threem_id
    )

    if is_new:
        # Immediately acquire bibliographic coverage for this book.
        # This will set the DistributionMechanisms and make the
        # book presentation-ready. However, its circulation information
        # might not be up to date until we process some more events.
        self.bibliographic_coverage_provider.ensure_coverage(
            license_pool.identifier, force=True
        )

    threem_identifier = license_pool.identifier
    isbn_identifier, _ = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, isbn
    )
    edition, _ = Edition.for_foreign_id(
        self._db, source, Identifier.THREEM_ID, threem_id
    )

    # The ISBN and the 3M identifier are exactly equivalent.
    threem_identifier.equivalent_to(source, isbn_identifier, strength=1)

    # Log the event.
    get_one_or_create(
        self._db, CirculationEvent,
        license_pool=license_pool,
        type=internal_event_type,
        start=start_time,
        foreign_patron_id=foreign_patron_id,
        create_method_kwargs=dict(delta=1, end=end_time),
    )

    # If this is our first time seeing this LicensePool, log its
    # occurrence as a separate event.
    if is_new:
        last_checked = license_pool.last_checked
        get_one_or_create(
            self._db, CirculationEvent,
            type=CirculationEvent.TITLE_ADD,
            license_pool=license_pool,
            create_method_kwargs=dict(
                start=last_checked or start_time,
                delta=1,
                end=last_checked or end_time,
            ),
        )

    display_title = edition.title or "[no title]"
    self.log.info(
        "%r %s: %s", start_time, display_title, internal_event_type
    )
    return start_time
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier.

    Requests the ISBN lookup URL without following redirects and reads
    the OCLC Number out of the redirect location.

    :raise IOError: if the response carries no redirect location, or the
        location does not match URI_WITH_OCLC_NUMBER.
    """
    url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, url, Representation.http_get_no_redirect
    )

    location = representation.location
    if not location:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % url
        )

    match = self.URI_WITH_OCLC_NUMBER.match(location)
    if not match:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % location
        )

    oclc_number = match.groups()[0]
    lookup_result = Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number
    )
    return lookup_result[0]
def process_item(self, work):
    """Create a Bibblio content item for `work` and link the resulting
    content item ID to the work's primary identifier.

    :return: `work` on success, or a transient CoverageFailure if
        building or creating the content item raised an exception.
    """
    try:
        content_item = self.content_item_from_work(work)
        response = self.api.create_content_item(content_item)
    except Exception as error:
        # Any failure here is reported as transient.
        return CoverageFailure(
            work, str(error), data_source=self.data_source,
            transient=True
        )

    bibblio_identifier, _is_new = Identifier.for_foreign_id(
        self._db,
        Identifier.BIBBLIO_CONTENT_ITEM_ID,
        response.get('contentItemId'),
    )

    # Record the two identifiers as equivalent (strength 1).
    primary_identifier = work.presentation_edition.primary_identifier
    primary_identifier.equivalent_to(
        self.data_source, bibblio_identifier, 1
    )
    return work
def test_process_urn_isbn(self):
    """Walk an ISBN through process_urn(): registered (201), still
    working (202), and finally an OPDS entry once coverage exists.
    """
    controller = self.controller
    isbn, _ = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, self._isbn
    )
    urn = isbn.urn

    # The first time we look up an ISBN a CoverageRecord is created
    # representing the work to be done.
    controller.process_urn(urn)
    self.assert_one_message(
        urn, HTTP_CREATED, controller.IDENTIFIER_REGISTERED
    )
    [record] = isbn.coverage_records
    eq_(record.exception, controller.NO_WORK_DONE_EXCEPTION)
    eq_(record.status, CoverageRecord.TRANSIENT_FAILURE)

    # So long as the necessary coverage is not provided,
    # future lookups will not provide useful information.
    controller.precomposed_entries = []
    controller.process_urn(urn)
    self.assert_one_message(
        urn, HTTP_ACCEPTED, controller.WORKING_TO_RESOLVE_IDENTIFIER
    )

    # Let's provide the coverage.
    for source in DataSource.metadata_sources_for(self._db, isbn):
        CoverageRecord.add_for(isbn, source)

    # Process the ISBN again, and we get an <entry> tag with the
    # information.
    controller.precomposed_entries = []
    controller.process_urn(urn)
    expected_entry = isbn.opds_entry()
    [actual_entry] = controller.precomposed_entries
    eq_(etree.tostring(expected_entry), etree.tostring(actual_entry))
def lookup(self, identifier_or_uri, processed_uris=None):
    """Perform an OCLC Open Data lookup for the given identifier.

    :param identifier_or_uri: Either a string URI containing an OCLC
        Number (anything URI_WITH_OCLC_NUMBER can find a number in), or
        an identifier object with a `type` attribute.
    :param processed_uris: A set passed through to
        lookup_by_identifier(). When omitted, a fresh set is created for
        this call; previously a single default set was shared by every
        call that omitted the argument.
    :return: Whatever lookup_by_identifier() returns, or (None, None)
        if no usable identifier could be derived from the input.
    """
    if processed_uris is None:
        processed_uris = set()

    if isinstance(identifier_or_uri, basestring):
        # e.g. http://experiment.worldcat.org/oclc/1862341597.json
        match = self.URI_WITH_OCLC_NUMBER.search(identifier_or_uri)
        if not match:
            return None, None
        oclc_number = match.groups()[0]
        if not oclc_number:
            return None, None
        identifier, is_new = Identifier.for_foreign_id(
            self._db, Identifier.OCLC_NUMBER, oclc_number
        )
        if not identifier:
            return None, None
    else:
        identifier = identifier_or_uri
        # A missing identifier, or one without a type, can't be looked up.
        if not identifier or not identifier.type:
            return None, None

    return self.lookup_by_identifier(identifier, processed_uris)
def test_lookup_info_to_metadata(self):
    """lookup_info_to_metadata() turns sample NoveList JSON into a
    fully populated Metadata object.
    """
    # Basic book information is returned
    # NOTE(review): this ISBN identifier is created but never used
    # below -- confirm whether the fixture is still needed.
    identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.ISBN, "9780804171335"
    )
    bad_character = self.sample_representation("a_bad_character.json")
    metadata = self.novelist.lookup_info_to_metadata(bad_character)

    eq_(True, isinstance(metadata, Metadata))
    primary = metadata.primary_identifier
    eq_(Identifier.NOVELIST_ID, primary.type)
    eq_('10392078', primary.identifier)
    eq_("A bad character", metadata.title)
    eq_(None, metadata.subtitle)

    eq_(1, len(metadata.contributors))
    [contributor] = metadata.contributors
    eq_("Kapoor, Deepti", contributor.sort_name)

    eq_(4, len(metadata.identifiers))
    eq_(4, len(metadata.subjects))
    eq_(2, len(metadata.measurements))
    low_rating, high_rating = sorted(
        metadata.measurements, key=lambda m: m.value
    )[:2]
    eq_(2, low_rating.value)
    eq_(3.27, high_rating.value)
    eq_(625, len(metadata.recommendations))

    # Confirm that Lexile and series data is extracted with a
    # different sample.
    vampire = self.sample_representation("vampire_kisses.json")
    metadata = self.novelist.lookup_info_to_metadata(vampire)

    [lexile] = [s for s in metadata.subjects if s.type == 'Lexile']
    eq_(u'630', lexile.identifier)
    eq_(u'Vampire kisses manga', metadata.series)
    # The full title should be selected, since every volume
    # has the same main title: 'Vampire kisses'
    eq_(u'Vampire kisses: blood relatives. Volume 1', metadata.title)
    eq_(1, metadata.series_position)
    eq_(5, len(metadata.recommendations))
def oclc_works_for_isbn(self, isbn, processed_uris=None):
    """Yield every OCLC Work graph for the given ISBN.

    :param isbn: An ISBN identifier, handed to oclc_number_for_isbn().
    :param processed_uris: A set passed through to every
        lookup_by_identifier() call. When omitted, a fresh set is
        created for this invocation; previously a single default set
        was shared by every call that omitted the argument.
    :return: A generator yielding the data returned by
        lookup_by_identifier() for each referenced OCLC Work. Yields
        nothing if the OCLC Number lookup returns no data.
    """
    if processed_uris is None:
        processed_uris = set()

    # Find the OCLC Number for this ISBN.
    oclc_number = self.oclc_number_for_isbn(isbn)

    # Retrieve the OCLC Linked Data document for that OCLC Number.
    oclc_number_data, was_new = self.lookup_by_identifier(
        oclc_number, processed_uris
    )
    if not oclc_number_data:
        return

    # Look up every work referenced in that document and yield its data.
    graph = OCLCLinkedData.graph(oclc_number_data)
    for work_uri in OCLCLinkedData.extract_works(graph):
        match = self.URI_WITH_OCLC_WORK_ID.match(work_uri)
        if not match:
            # Not a URI we know how to turn into an OCLC Work ID.
            continue
        work_id = match.groups()[0]
        identifier, was_new = Identifier.for_foreign_id(
            self._db, Identifier.OCLC_WORK, work_id
        )
        oclc_work_data, cached = self.lookup_by_identifier(
            identifier, processed_uris
        )
        yield oclc_work_data
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object

    :return: A Metadata object, or None when the representation is
        empty, NoveList supplied no title identifier, or nothing of
        interest could be extracted.
    """
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)
    book_info = lookup_info["TitleInfo"]
    novelist_identifier = book_info.get("ui") if book_info else None
    if not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier
    )
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get("author")
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get("description")
    if description:
        description_link = LinkData(
            rel=Hyperlink.DESCRIPTION,
            content=description,
            media_type=Representation.TEXT_PLAIN,
        )
        metadata.links.append(description_link)

    audience_level = book_info.get("audience_level")
    if audience_level:
        audience = SubjectData(Subject.FREEFORM_AUDIENCE, audience_level)
        metadata.subjects.append(audience)

    novelist_rating = book_info.get("rating")
    if novelist_rating:
        rating = MeasurementData(Measurement.RATING, novelist_rating)
        metadata.measurements.append(rating)

    # Extract feature content if it is available. An absent block
    # leaves every feature set to None.
    feature_content = lookup_info.get("FeatureContent") or {}
    series_info = feature_content.get("SeriesInfo")
    appeals_info = feature_content.get("Appeals")
    lexile_info = feature_content.get("LexileInfo")
    goodreads_info = feature_content.get("GoodReads")
    recommendations_info = feature_content.get("SimilarTitles")

    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get("full_title")
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    # TODO: How well do we trust this data? We could conceivably bump up
    # the weight here.
    if appeals_info:
        # Only the first appeal that lists any genres is used.
        for appeal in appeals_info:
            genres = appeal.get("genres")
            if not genres:
                continue
            for genre in genres:
                metadata.subjects.append(
                    SubjectData(Subject.TAG, genre["Name"])
                )
            break

    if lexile_info:
        lexile = SubjectData(Subject.LEXILE_SCORE, lexile_info["Lexile"])
        metadata.subjects.append(lexile)

    if goodreads_info:
        goodreads_rating = MeasurementData(
            Measurement.RATING, goodreads_info["average_rating"]
        )
        metadata.measurements.append(goodreads_rating)

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    has_something_interesting = (
        metadata.measurements
        or metadata.series_position
        or metadata.series
        or metadata.subjects
        or metadata.links
        or metadata.subtitle
        or metadata.recommendations
    )
    if not has_something_interesting:
        return None
    return metadata
class OCLCXMLParser(XMLParser):
    """Turn XML responses from the OCLC lookup service into either a
    list of SWIDs (multi-work responses) or Edition objects
    (single-work responses).
    """

    # OCLC in-representation 'status codes'
    SINGLE_WORK_SUMMARY_STATUS = 0
    SINGLE_WORK_DETAIL_STATUS = 2
    MULTI_WORK_STATUS = 4
    NO_INPUT_STATUS = 100
    INVALID_INPUT_STATUS = 101
    NOT_FOUND_STATUS = 102
    UNEXPECTED_ERROR_STATUS = 200

    INTS = set([OCLC.HOLDING_COUNT, OCLC.EDITION_COUNT])

    NAMESPACES = {'oclc': 'http://classify.oclc.org'}

    LIST_TYPE = "works"
    log = logging.getLogger("OCLC XML Parser")

    @classmethod
    def parse(cls, _db, xml, **restrictions):
        """Turn XML data from the OCLC lookup service into a list of SWIDs
        (for a multi-work response) or a list of Edition objects (for
        a single-work response).

        :param restrictions: Keyword restrictions ('title',
            'title_similarity', 'language', 'authors') forwarded to
            _extract_basic_info().
        :return: A 2-tuple (status code, list of records).
        :raise IOError: On an unexpected-error status, a single-work
            summary, an unrecognized status code, or a single-work
            detail response that contains no <work> tag.
        """
        tree = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
        response = cls._xpath1(tree, "oclc:response")
        representation_type = int(response.get('code'))

        if representation_type == cls.UNEXPECTED_ERROR_STATUS:
            raise IOError("Unexpected error from OCLC API: %s" % xml)
        elif representation_type in (
                cls.NO_INPUT_STATUS, cls.INVALID_INPUT_STATUS):
            return representation_type, []
        elif representation_type == cls.SINGLE_WORK_SUMMARY_STATUS:
            raise IOError(
                "Got single-work summary from OCLC despite requesting detail: %s" % xml)

        # The real action happens here.
        if representation_type == cls.SINGLE_WORK_DETAIL_STATUS:
            authors_tag = cls._xpath1(tree, "//oclc:authors")
            work_tag = cls._xpath1(tree, "//oclc:work")
            if work_tag is None:
                # Without a <work> tag there is nothing to extract.
                # (This case used to fail with a NameError on
                # primary_author.)
                raise IOError(
                    "OCLC single-work detail response contains no <work> tag: %s" % xml)

            author_string = work_tag.get('author')
            primary_author = cls.primary_author_from_author_string(
                _db, author_string)
            existing_authors = cls.extract_authors(
                _db, authors_tag, primary_author=primary_author)

            # The representation lists a single work, its authors, its
            # editions, plus summary classification information for
            # the work.
            edition, ignore = cls.extract_edition(
                _db, work_tag, existing_authors, **restrictions)
            records = []
            if edition:
                cls.log.info("EXTRACTED %r", edition)
                records.append(edition)
            # Otherwise the work record itself failed one of the
            # restrictions. None of its editions are likely to
            # succeed either, so the list stays empty.
        elif representation_type == cls.MULTI_WORK_STATUS:
            # The representation lists a set of works that match the
            # search query.
            cls.log.debug("Extracting SWIDs from search results.")
            records = cls.extract_swids(_db, tree, **restrictions)
        elif representation_type == cls.NOT_FOUND_STATUS:
            # No problem; OCLC just doesn't have any data.
            records = []
        else:
            raise IOError(
                "Unrecognized status code from OCLC API: %s (%s)" % (
                    representation_type, xml))

        return representation_type, records

    @classmethod
    def extract_swids(cls, _db, tree, **restrictions):
        """Turn a multi-work response into a list of SWIDs.

        Only works that pass `restrictions` are included.
        """
        swids = []
        for work_tag in cls._xpath(tree, "//oclc:work"):
            # We're not calling extract_basic_info because we care about
            # the info, we're calling it to make sure this work meets
            # the restriction. If this work meets the restriction,
            # we'll store its info when we look up the SWID.
            response = cls._extract_basic_info(
                _db, work_tag, **restrictions)
            if response:
                title, author_names, language = response
                # TODO: 'swid' is what it's called in older representations.
                # That code can be removed once we replace all
                # representations.
                work_identifier = work_tag.get('wi') or work_tag.get('swid')
                cls.log.debug(
                    "WORK ID %s (%s, %r, %s)",
                    work_identifier, title, author_names, language)
                swids.append(work_identifier)
        return swids

    # A trailing bracketed list of roles, e.g. "[Editor; Translator]".
    ROLES = re.compile(r"\[([^]]+)\]$")
    # A trailing lifespan, e.g. "1875-1958" or "1950-".
    LIFESPAN = re.compile(r"([0-9]+)-([0-9]*)[.;]?$")

    @classmethod
    def extract_authors(cls, _db, authors_tag, primary_author=None):
        """Return a list of contributors for every author tag found
        under `authors_tag` (empty if `authors_tag` is None).
        """
        results = []
        if authors_tag is not None:
            for author_tag in cls._xpath(authors_tag, "//oclc:author"):
                lc = author_tag.get('lc', None)
                viaf = author_tag.get('viaf', None)
                contributor, roles, default_role_used = (
                    cls._parse_single_author(
                        _db, author_tag.text, lc=lc, viaf=viaf,
                        primary_author=primary_author))
                if contributor:
                    results.append(contributor)
        return results

    @classmethod
    def _contributor_match(cls, contributor, name, lc, viaf):
        """Does `contributor` have this sort name and, where given,
        this LC and VIAF identifier?
        """
        return (
            contributor.sort_name == name
            and (lc is None or contributor.lc == lc)
            and (viaf is None or contributor.viaf == viaf)
        )

    @classmethod
    def _parse_single_author(cls, _db, author,
                             lc=None, viaf=None,
                             existing_authors=None,
                             default_role=Contributor.AUTHOR_ROLE,
                             primary_author=None):
        """Parse one author string into a contributor and its roles.

        :param author: A string such as
            "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]".
            None (e.g. the text of an empty tag) is treated as an
            empty name.
        :param existing_authors: Optional list of already-known
            contributors to match against before hitting the database.
        :param default_role: Role assigned when the string names none.
        :param primary_author: If the parsed name equals this
            contributor's sort name, the primary author role is put
            first in the role list.
        :return: A 3-tuple (contributor or None, list of roles, whether
            the default role was used).
        """
        default_role_used = False
        # First find roles if present
        # "Giles, Lionel, 1875-1958 [Writer of added commentary; Translator]"
        author = (author or "").strip()
        m = cls.ROLES.search(author)
        if m:
            author = author[:m.start()].strip()
            role_string = m.groups()[0]
            roles = [x.strip() for x in role_string.split(";")]
        elif default_role:
            roles = [default_role]
            default_role_used = True
        else:
            roles = []

        # Author string now looks like
        # "Giles, Lionel, 1875-1958"
        m = cls.LIFESPAN.search(author)
        kwargs = dict()
        if m:
            author = author[:m.start()].strip()
            birth, death = m.groups()
            if birth:
                kwargs[Contributor.BIRTH_DATE] = birth
            if death:
                kwargs[Contributor.DEATH_DATE] = death

        # Author string now looks like
        # "Giles, Lionel,"
        if author.endswith(","):
            author = author[:-1]

        if not author:
            # No name was given for the author.
            return None, roles, default_role_used

        if primary_author and author == primary_author.sort_name:
            if Contributor.AUTHOR_ROLE in roles:
                roles.remove(Contributor.AUTHOR_ROLE)
            if Contributor.UNKNOWN_ROLE in roles:
                roles.remove(Contributor.UNKNOWN_ROLE)
            roles.insert(0, Contributor.PRIMARY_AUTHOR_ROLE)

        contributor = None
        if existing_authors:
            # Calling Contributor.lookup will result in a database
            # hit, and looking up a contributor based on name may
            # result in multiple results (see below). We'll have no
            # way of distinguishing between those results. If
            # possible, it's much more reliable to look through
            # existing_authors (the authors derived from an entry's
            # <authors> tag).
            for candidate in existing_authors:
                if cls._contributor_match(candidate, author, lc, viaf):
                    contributor = candidate
                    break

        if not contributor:
            contributor, was_new = Contributor.lookup(
                _db, author, viaf, lc, extra=kwargs)

        if isinstance(contributor, list):
            # We asked for an author based solely on the name, which makes
            # Contributor.lookup() return a list.
            if len(contributor) == 1:
                # Fortunately, either the database knows about only
                # one author with that name, or it didn't know about
                # any authors with that name and it just created one,
                # so we can unambiguously use it.
                contributor = contributor[0]
            else:
                # Uh-oh. The database knows about multiple authors
                # with that name. We have no basis for deciding which
                # author we mean. But we would prefer to identify with
                # an author who has a known LC or VIAF number.
                #
                # This should happen very rarely because of our check
                # against existing_authors above. But it will happen
                # for authors that have a work in Project Gutenberg.
                with_id = [
                    x for x in contributor
                    if x.lc is not None or x.viaf is not None
                ]
                if with_id:
                    contributor = with_id[0]
                else:
                    contributor = contributor[0]
        return contributor, roles, default_role_used

    @classmethod
    def primary_author_from_author_string(cls, _db, author_string):
        """Return the first author in a "|"-separated author string if
        they have no explicit role, otherwise None.
        """
        # If the first author mentioned in the author string
        # does not have an explicit role set, treat them as the primary
        # author.
        if not author_string:
            return None
        first_author = author_string.split("|")[0]
        author, roles, default_role_used = cls._parse_single_author(
            _db, first_author,
            default_role=Contributor.PRIMARY_AUTHOR_ROLE)
        if roles == [Contributor.PRIMARY_AUTHOR_ROLE]:
            return author
        return None

    @classmethod
    def parse_author_string(cls, _db, author_string, existing_authors=None,
                            primary_author=None):
        """Parse a "|"-separated author string.

        :return: A list of (contributor, roles) 2-tuples, one per author
            for whom a contributor could be determined.
        """
        default_role = Contributor.PRIMARY_AUTHOR_ROLE
        authors = []
        if not author_string:
            return authors
        for author in author_string.split("|"):
            author, roles, default_role_used = cls._parse_single_author(
                _db, author, existing_authors=existing_authors,
                default_role=default_role,
                primary_author=primary_author)
            if roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    # That was the primary author. If we see someone
                    # with no explicit role after this point, they're
                    # just a regular author.
                    default_role = Contributor.AUTHOR_ROLE
                elif not default_role_used:
                    # We're dealing with someone whose role was
                    # explicitly specified. If we see someone with no
                    # explicit role after this point, it's probably
                    # because their role is so minor as to not be
                    # worth mentioning, not because it's so major that
                    # we can assume they're an author.
                    default_role = Contributor.UNKNOWN_ROLE
            roles = roles or [default_role]
            if author:
                authors.append((author, roles))
        return authors

    @classmethod
    def _extract_basic_info(cls, _db, tag, existing_authors=None,
                            **restrictions):
        """Extract information common to work tag and edition tag.

        :param restrictions: Optional 'title' (with 'title_similarity'
            threshold, default 0.25), 'language' (a single language
            string or an iterable of them) and 'authors' restrictions.
        :return: A 3-tuple (title, list of (contributor, roles),
            language), or None if the tag fails a restriction.
        """
        title = tag.get('title')
        author_string = tag.get('author')
        authors_and_roles = cls.parse_author_string(
            _db, author_string, existing_authors)
        if 'language' in tag.keys():
            language = tag.get('language')
        else:
            language = None

        if title and 'title' in restrictions:
            must_resemble_title = restrictions['title']
            threshold = restrictions.get('title_similarity', 0.25)
            similarity = MetadataSimilarity.title_similarity(
                must_resemble_title, title)
            if similarity < threshold:
                # The title of the book under consideration is not
                # similar enough to the given title.
                cls.log.debug(
                    "FAILURE TO RESEMBLE: %s vs %s (%.2f)",
                    title, must_resemble_title, similarity)
                return None

            # The semicolon is frequently used to separate multiple
            # works in an anthology. If there is no semicolon in the
            # original title, do not consider titles that contain
            # semicolons.
            if (' ; ' not in must_resemble_title
                    and ' ; ' in title
                    and threshold > 0):
                cls.log.debug("SEMICOLON DISQUALIFICATION: %s", title)
                return None

        # Apply restrictions. If they're not met, return None.
        restrict_to_language = restrictions.get('language')
        if restrict_to_language and language:
            # We know which language this record is for. Match it
            # against the language used in the Edition we're
            # matching against.
            #
            # The restriction may be one language code or several. A
            # bare string must not be split into its characters, and a
            # string must be tested for membership rather than compared
            # to the set itself (which never matched).
            if isinstance(restrict_to_language, basestring):
                acceptable_languages = set([restrict_to_language])
            else:
                acceptable_languages = set(restrict_to_language)
            if language not in acceptable_languages:
                # This record is for a book in a different language
                cls.log.debug("WRONG LANGUAGE: %s", language)
                return None

        if 'authors' in restrictions:
            restrict_to_authors = restrictions['authors']
            if restrict_to_authors and isinstance(
                    restrict_to_authors[0], Contributor):
                restrict_to_authors = [
                    x.sort_name for x in restrict_to_authors]
            primary_author = None
            for a, roles in authors_and_roles:
                if Contributor.PRIMARY_AUTHOR_ROLE in roles:
                    primary_author = a
                    break
            if (not primary_author
                    or (primary_author not in restrict_to_authors
                        and primary_author.sort_name
                        not in restrict_to_authors)):
                # None of the given authors showed up as the
                # primary author of this book. They may have had
                # some other role in it, or the book may be about
                # them, or incorporate their work, but this book
                # is not *by* them.
                return None

        return title, authors_and_roles, language

    UNUSED_MEDIA = set([
        "itemtype-intmm",
        "itemtype-msscr",
        "itemtype-artchap-artcl",
        "itemtype-jrnl",
        "itemtype-map",
        "itemtype-vis",
        "itemtype-jrnl-digital",
        "itemtype-image-2d",
        "itemtype-artchap-digital",
        "itemtype-intmm-digital",
        "itemtype-archv",
        "itemtype-msscr-digital",
        "itemtype-game",
        "itemtype-web-digital",
        "itemtype-map-digital",
    ])

    @classmethod
    def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
        """Create a new Edition object with information about a
        work (identified by OCLC Work ID).

        :return: A 2-tuple (Edition, is_new), or (None, False) if the
            work is not a book or fails one of `restrictions`.
        :raise ValueError: If the tag has neither an 'owi' nor a 'pswid'
            attribute.
        """
        # TODO: 'pswid' is what it's called in older representations.
        # That code can be removed once we replace all representations.
        raw_work_id = work_tag.get('owi') or work_tag.get('pswid')
        if not raw_work_id:
            # Check before converting to unicode: unicode(None) is the
            # truthy string u'None', which would slip past this test.
            raise ValueError("Work has no owi")

        # Only create Editions for books. Audiobooks, music, video, the
        # item types in UNUSED_MEDIA and anything unrecognized (or a
        # missing itemtype) are all skipped.
        #
        # Pretty much all Gutenberg texts, even the audio texts,
        # are based on a book, and the ones that aren't
        # (recordings of individual songs) probably aren't in OCLC
        # anyway. So we just want to get the books.
        item_type = work_tag.get("itemtype") or ""
        if not item_type.startswith(('itemtype-book', 'itemtype-compfile')):
            return None, False

        result = cls._extract_basic_info(
            _db, work_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False
        title, authors_and_roles, language = result

        # Record some extra OCLC-specific information
        editions = work_tag.get('editions')
        holdings = work_tag.get('holdings')

        # Get an identifier for this work.
        oclc_work_id = unicode(raw_work_id)
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_WORK, oclc_work_id
        )

        data_source = DataSource.lookup(_db, DataSource.OCLC)
        identifier.add_measurement(
            data_source, Measurement.HOLDINGS, holdings)
        identifier.add_measurement(
            data_source, Measurement.PUBLISHED_EDITIONS, editions)

        # Create a Edition for source + identifier
        edition, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
            )
        )

        # Get the most popular Dewey and LCC classification for this
        # work.
        for tag_name, subject_type in (
                ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
            tag = cls._xpath1(
                work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
            if tag is not None:
                subject_id = tag.get('nsfa') or tag.get('sfa')
                weight = int(tag.get('holdings'))
                identifier.classify(
                    data_source, subject_type, subject_id, weight=weight)

        # Find FAST subjects for the work.
        for heading in cls._xpath(work_tag, "//oclc:fast//oclc:heading"):
            subject_id = heading.get('ident')
            weight = int(heading.get('heldby'))
            value = heading.text
            identifier.classify(
                data_source, Subject.FAST, subject_id, value, weight)

        # Associate the authors with the Edition.
        for contributor, roles in authors_and_roles:
            edition.add_contributor(contributor, roles)
        return edition, new

    @classmethod
    def extract_edition_record(cls, _db, edition_tag, existing_authors,
                               **restrictions):
        """Create a new Edition object with information about an
        edition of a book (identified by OCLC Number).

        :return: A 2-tuple (Edition, is_new), or (None, False) if the
            tag has no integer OCLC number or fails one of
            `restrictions`.
        """
        oclc_number = unicode(edition_tag.get('oclc'))
        try:
            int(oclc_number)
        except ValueError:
            # This record does not have a valid OCLC number.
            return None, False

        # Fill in some basic information about this new record.
        result = cls._extract_basic_info(
            _db, edition_tag, existing_authors, **restrictions)
        if not result:
            # This record did not meet one of the restrictions.
            return None, False

        title, authors_and_roles, language = result

        # Add a couple extra bits of OCLC-specific information.
        extra = {
            OCLC.HOLDING_COUNT: edition_tag.get('holdings'),
            OCLC.FORMAT: edition_tag.get('itemtype'),
        }

        # Get an identifier for this edition.
        identifier, ignore = Identifier.for_foreign_id(
            _db, Identifier.OCLC_NUMBER, oclc_number
        )

        # Create a Edition for source + identifier
        #
        # NOTE(review): this call used to pass subjects=subjects, but
        # `subjects` was only assigned (as an empty, never-filled dict)
        # after the call, so it always raised NameError. The argument
        # has been dropped; classifications are recorded on the
        # identifier below.
        data_source = DataSource.lookup(_db, DataSource.OCLC)
        edition_record, new = get_one_or_create(
            _db, Edition,
            data_source=data_source,
            primary_identifier=identifier,
            create_method_kwargs=dict(
                title=title,
                language=language,
                extra=extra,
            )
        )

        for subject_type, oclc_code in (
                (Subject.LCC, "050"), (Subject.DDC, "082")):
            classification = cls._xpath1(
                edition_tag,
                "oclc:classifications/oclc:class[@tag=%s]" % oclc_code)
            if classification is not None:
                value = (classification.get("nsfa")
                         or classification.get('sfa'))
                identifier.classify(data_source, subject_type, value)

        # Associated each contributor with the new record.
        for author, roles in authors_and_roles:
            edition_record.add_contributor(author, roles)
        return edition_record, new
def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
    """Create a new Edition object with information about a
    work (identified by OCLC Work ID).

    :param work_tag: The work tag; read through `.get()` for its
        attributes and searched with XPath for classifications.
    :param existing_authors: Already-known contributors, forwarded to
        _extract_basic_info().
    :param restrictions: Keyword restrictions forwarded to
        _extract_basic_info().
    :return: A 2-tuple (Edition, is_new), or (None, False) if the work
        is not a book or fails one of `restrictions`.
    :raise ValueError: If the tag has neither an 'owi' nor a 'pswid'
        attribute.
    """
    # TODO: 'pswid' is what it's called in older representations.
    # That code can be removed once we replace all representations.
    raw_work_id = work_tag.get('owi') or work_tag.get('pswid')
    if not raw_work_id:
        # Check before converting to unicode: unicode(None) is the
        # truthy string u'None', which would slip past this test.
        raise ValueError("Work has no owi")

    # Only create Editions for books. Audiobooks, music, video, the
    # item types in UNUSED_MEDIA and anything unrecognized (or a
    # missing itemtype) are all skipped.
    #
    # Pretty much all Gutenberg texts, even the audio texts,
    # are based on a book, and the ones that aren't
    # (recordings of individual songs) probably aren't in OCLC
    # anyway. So we just want to get the books.
    item_type = work_tag.get("itemtype") or ""
    if not item_type.startswith(('itemtype-book', 'itemtype-compfile')):
        return None, False

    result = cls._extract_basic_info(
        _db, work_tag, existing_authors, **restrictions)
    if not result:
        # This record did not meet one of the restrictions.
        return None, False
    title, authors_and_roles, language = result

    # Record some extra OCLC-specific information
    editions = work_tag.get('editions')
    holdings = work_tag.get('holdings')

    # Get an identifier for this work.
    oclc_work_id = unicode(raw_work_id)
    identifier, ignore = Identifier.for_foreign_id(
        _db, Identifier.OCLC_WORK, oclc_work_id
    )

    data_source = DataSource.lookup(_db, DataSource.OCLC)
    identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
    identifier.add_measurement(
        data_source, Measurement.PUBLISHED_EDITIONS, editions)

    # Create a Edition for source + identifier
    edition, new = get_one_or_create(
        _db, Edition,
        data_source=data_source,
        primary_identifier=identifier,
        create_method_kwargs=dict(
            title=title,
            language=language,
        )
    )

    # Get the most popular Dewey and LCC classification for this
    # work.
    for tag_name, subject_type in (
            ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
        tag = cls._xpath1(
            work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
        if tag is not None:
            subject_id = tag.get('nsfa') or tag.get('sfa')
            weight = int(tag.get('holdings'))
            identifier.classify(
                data_source, subject_type, subject_id, weight=weight)

    # Find FAST subjects for the work.
    for heading in cls._xpath(work_tag, "//oclc:fast//oclc:heading"):
        subject_id = heading.get('ident')
        weight = int(heading.get('heldby'))
        value = heading.text
        identifier.classify(
            data_source, Subject.FAST, subject_id, value, weight)

    # Associate the authors with the Edition.
    for contributor, roles in authors_and_roles:
        edition.add_contributor(contributor, roles)
    return edition, new
def run(self):
    """Fix the identifier named on the command line.

    Expects exactly two command-line arguments: the identifier type
    followed by the identifier itself.
    """
    id_type, foreign_id = sys.argv[1:]
    lookup_result = Identifier.for_foreign_id(
        self._db, id_type, foreign_id
    )
    identifier, _is_new = lookup_result
    self.fix_identifier(identifier)
def extract_edition(cls, _db, work_tag, existing_authors, **restrictions):
    """Create a new Edition object with information about a
    work (identified by OCLC Work ID).

    :param work_tag: The work tag; read through `.get()` for its
        attributes and searched with XPath for classifications.
    :param existing_authors: Already-known contributors, forwarded to
        _extract_basic_info().
    :param restrictions: Keyword restrictions forwarded to
        _extract_basic_info().
    :return: A 2-tuple (Edition, is_new), or (None, False) if the work
        is not a book or fails one of `restrictions`.
    :raise ValueError: If the tag has neither an 'owi' nor a 'pswid'
        attribute.
    """
    # TODO: 'pswid' is what it's called in older representations.
    # That code can be removed once we replace all representations.
    raw_work_id = work_tag.get('owi') or work_tag.get('pswid')
    if not raw_work_id:
        # Validate the raw attribute, not its unicode() form:
        # unicode(None) yields u'None', which is truthy and used to
        # make this check unreachable.
        raise ValueError("Work has no owi")

    # Only books get an Edition. Audiobooks, music, video, the item
    # types in UNUSED_MEDIA, unrecognized types and a missing itemtype
    # are all skipped.
    #
    # Pretty much all Gutenberg texts, even the audio texts,
    # are based on a book, and the ones that aren't
    # (recordings of individual songs) probably aren't in OCLC
    # anyway. So we just want to get the books.
    item_type = work_tag.get("itemtype") or ""
    is_book = item_type.startswith(('itemtype-book', 'itemtype-compfile'))
    if not is_book:
        return None, False

    result = cls._extract_basic_info(
        _db, work_tag, existing_authors, **restrictions)
    if not result:
        # This record did not meet one of the restrictions.
        return None, False
    title, authors_and_roles, language = result

    # Record some extra OCLC-specific information
    editions = work_tag.get('editions')
    holdings = work_tag.get('holdings')

    # Get an identifier for this work.
    oclc_work_id = unicode(raw_work_id)
    identifier, ignore = Identifier.for_foreign_id(
        _db, Identifier.OCLC_WORK, oclc_work_id
    )

    data_source = DataSource.lookup(_db, DataSource.OCLC)
    identifier.add_measurement(data_source, Measurement.HOLDINGS, holdings)
    identifier.add_measurement(
        data_source, Measurement.PUBLISHED_EDITIONS, editions)

    # Create a Edition for source + identifier
    edition, new = get_one_or_create(
        _db, Edition,
        data_source=data_source,
        primary_identifier=identifier,
        create_method_kwargs=dict(
            title=title,
            language=language,
        )
    )

    # Get the most popular Dewey and LCC classification for this
    # work.
    for tag_name, subject_type in (
            ("ddc", Subject.DDC), ("lcc", Subject.LCC)):
        tag = cls._xpath1(
            work_tag, "//oclc:%s/oclc:mostPopular" % tag_name)
        if tag is not None:
            subject_id = tag.get('nsfa') or tag.get('sfa')
            weight = int(tag.get('holdings'))
            identifier.classify(
                data_source, subject_type, subject_id, weight=weight)

    # Find FAST subjects for the work.
    for heading in cls._xpath(
            work_tag, "//oclc:fast//oclc:heading"):
        subject_id = heading.get('ident')
        weight = int(heading.get('heldby'))
        value = heading.text
        identifier.classify(
            data_source, Subject.FAST, subject_id, value, weight)

    # Associate the authors with the Edition.
    for contributor, roles in authors_and_roles:
        edition.add_contributor(contributor, roles)
    return edition, new
def lookup_info_to_metadata(self, lookup_representation):
    """Transforms a NoveList JSON representation into a Metadata object.

    :param lookup_representation: Object whose ``content`` attribute
        holds the JSON text of a NoveList lookup response.
    :return: A Metadata object, or None when the content is empty, when
        the response has no usable 'TitleInfo' (missing, empty, or
        without a 'ui' value), or when nothing of interest was extracted.
    """
    if not lookup_representation.content:
        return None

    lookup_info = json.loads(lookup_representation.content)

    # Use .get() so that a response with no 'TitleInfo' key at all is
    # handled like one whose TitleInfo is null, instead of raising
    # KeyError.
    book_info = lookup_info.get('TitleInfo')
    novelist_identifier = book_info.get('ui') if book_info else None
    if not novelist_identifier:
        # NoveList didn't know the ISBN.
        return None

    primary_identifier, ignore = Identifier.for_foreign_id(
        self._db, Identifier.NOVELIST_ID, novelist_identifier
    )
    metadata = Metadata(self.source, primary_identifier=primary_identifier)

    # Get the equivalent ISBN identifiers.
    metadata.identifiers += self._extract_isbns(book_info)

    author = book_info.get('author')
    if author:
        metadata.contributors.append(ContributorData(sort_name=author))

    description = book_info.get('description')
    if description:
        metadata.links.append(LinkData(
            rel=Hyperlink.DESCRIPTION, content=description,
            media_type=Representation.TEXT_PLAIN
        ))

    audience_level = book_info.get('audience_level')
    if audience_level:
        metadata.subjects.append(SubjectData(
            Subject.FREEFORM_AUDIENCE, audience_level
        ))

    novelist_rating = book_info.get('rating')
    if novelist_rating:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, novelist_rating
        ))

    # Extract feature content if it is available. With no feature
    # content, every section below is simply None.
    feature_content = lookup_info.get('FeatureContent') or {}
    series_info = feature_content.get('SeriesInfo')
    appeals_info = feature_content.get('Appeals')
    lexile_info = feature_content.get('LexileInfo')
    goodreads_info = feature_content.get('GoodReads')
    recommendations_info = feature_content.get('SimilarTitles')

    # get_series_information also reports which book_info key holds
    # the title to use.
    metadata, title_key = self.get_series_information(
        metadata, series_info, book_info
    )
    metadata.title = book_info.get(title_key)
    subtitle = TitleProcessor.extract_subtitle(
        metadata.title, book_info.get('full_title')
    )
    metadata.subtitle = self._scrub_subtitle(subtitle)

    if appeals_info:
        # Only the first appeal that lists any genres contributes tags.
        for appeal in appeals_info:
            genres = appeal.get('genres')
            if genres:
                for genre in genres:
                    metadata.subjects.append(SubjectData(
                        Subject.TAG, genre['Name']
                    ))
                break

    if lexile_info:
        metadata.subjects.append(SubjectData(
            Subject.LEXILE_SCORE, lexile_info['Lexile']
        ))

    if goodreads_info:
        metadata.measurements.append(MeasurementData(
            Measurement.RATING, goodreads_info['average_rating']
        ))

    metadata = self.get_recommendations(metadata, recommendations_info)

    # If nothing interesting comes from the API, ignore it.
    if not (metadata.measurements or metadata.series_position or
            metadata.series or metadata.subjects or metadata.links or
            metadata.subtitle or metadata.recommendations):
        metadata = None
    return metadata
# by now we can assume response is either empty or a list for item in resp_obj: # go through patron's checkouts and generate LoanInfo objects, # with FulfillmentInfo objects included media_type = item.get('mediaType', 'eBook') isbn = item.get('isbn', None) can_renew = item.get('canRenew', None) title = item.get('title', None) authors = item.get('authors', None) # refers to checkout expiration date, not the downloadUrl's expires = item.get('expiration', None) if expires: expires = datetime.datetime.strptime(expires, self.EXPIRATION_DATE_FORMAT).date() identifier, made_new = Identifier.for_foreign_id(self._db, foreign_identifier_type=Identifier.RB_DIGITAL_ID, foreign_id=isbn, autocreate=False) # Note: if OneClick knows about a patron's checked-out item that wasn't # checked out through us, we ignore it if not identifier: continue files = item.get('files', None) for file in files: filename = file.get('filename', None) # assume fileFormat is same for all files associated with this checkout # and use the last one mentioned. Ex: "fileFormat": "EPUB". # note: audio books don't list fileFormat field, just the filename, and the mediaType. file_format = file.get('fileFormat', None) if file_format == 'EPUB':
# by now we can assume response is either empty or a list for item in resp_obj: # go through patron's checkouts and generate LoanInfo objects, # with FulfillmentInfo objects included media_type = item.get('mediaType', 'eBook') isbn = item.get('isbn', None) can_renew = item.get('canRenew', None) title = item.get('title', None) authors = item.get('authors', None) # refers to checkout expiration date, not the downloadUrl's expires = item.get('expiration', None) if expires: expires = datetime.datetime.strptime(expires, self.EXPIRATION_DATE_FORMAT).date() identifier, made_new = Identifier.for_foreign_id(self._db, foreign_identifier_type=Identifier.ONECLICK_ID, foreign_id=isbn, autocreate=False) # Note: if OneClick knows about a patron's checked-out item that wasn't # checked out through us, we ignore it if not identifier: continue files = item.get('files', None) for file in files: filename = file.get('filename', None) # assume fileFormat is same for all files associated with this checkout # and use the last one mentioned. Ex: "fileFormat": "EPUB". # note: audio books don't list fileFormat field, just the filename, and the mediaType. file_format = file.get('fileFormat', None) if file_format == 'EPUB':
def test_recursively_equivalent_identifiers(self):
    """Each extra level of recursion pulls in one more ring of
    equivalent identifiers around a Gutenberg record.
    """

    def identifier_for(identifier_type, foreign_id):
        # Look up the Identifier, discarding the second element of
        # the returned pair.
        found, ignore = Identifier.for_foreign_id(
            self._db, identifier_type, foreign_id)
        return found

    # We start with a Gutenberg book.
    gutenberg = DataSource.lookup(self._db, DataSource.GUTENBERG)
    record, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "100")
    gutenberg_id = record.primary_identifier

    # We use OCLC Classify to do a title/author lookup.
    oclc = DataSource.lookup(self._db, DataSource.OCLC)
    search_id = identifier_for(Identifier.OCLC_WORK, "60010")
    gutenberg_id.equivalent_to(oclc, search_id, 1)

    # The title/author lookup associates the search term with two
    # different OCLC Numbers.
    oclc_id = identifier_for(Identifier.OCLC_NUMBER, "9999")
    oclc_id_2 = identifier_for(Identifier.OCLC_NUMBER, "1000")
    for oclc_number in (oclc_id, oclc_id_2):
        search_id.equivalent_to(oclc, oclc_number, 1)

    # We then use OCLC Linked Data to connect one of the OCLC
    # Numbers with an ISBN.
    linked_data = DataSource.lookup(self._db, DataSource.OCLC_LINKED_DATA)
    isbn_id = identifier_for(Identifier.ISBN, "900100434X")
    oclc_id.equivalent_to(linked_data, isbn_id, 1)

    # As it turns out, we have an Overdrive work record...
    overdrive = DataSource.lookup(self._db, DataSource.OVERDRIVE)
    overdrive_record, ignore = Edition.for_foreign_id(
        self._db, overdrive, Identifier.OVERDRIVE_ID, "{111-222}")
    overdrive_id = overdrive_record.primary_identifier

    # ...which is tied (by Overdrive) to the same ISBN.
    overdrive_id.equivalent_to(overdrive, isbn_id, 1)

    # Finally, here's a completely unrelated Edition, which
    # will not be showing up.
    gutenberg2, ignore = Edition.for_foreign_id(
        self._db, gutenberg, Identifier.GUTENBERG_ID, "200")
    gutenberg2.title = "Unrelated Gutenberg record."

    levels = []
    for depth in range(0, 5):
        policy = PresentationCalculationPolicy(
            equivalent_identifier_levels=depth)
        levels.append(record.equivalent_identifiers(policy=policy))

    # At level 0, the only identifier found is the Gutenberg ID.
    expected = set([gutenberg_id])
    assert expected == set(levels[0])

    # At level 1, we pick up the title/author lookup.
    expected = expected | set([search_id])
    assert expected == set(levels[1])

    # At level 2, we pick up the title/author lookup and the two
    # OCLC Numbers.
    expected = expected | set([oclc_id, oclc_id_2])
    assert expected == set(levels[2])

    # At level 3, we also pick up the ISBN.
    expected = expected | set([isbn_id])
    assert expected == set(levels[3])

    # At level 4, the recursion starts to go in the other
    # direction: we pick up the Overdrive ID that's equivalent to
    # the same ISBN as the OCLC Number.
    expected = expected | set([overdrive_id])
    assert expected == set(levels[4])