def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    """Make an authenticated GET request against the API and decode the JSON.

    :param path: A path fragment, or a full URL under BASE_URL.
    :param identifier: Unused here; accepted for interface compatibility.
    :param max_age: Maximum acceptable age of a cached representation.
    :return: The decoded JSON payload.
    :raise IntegrationException: on a 403 (bad API key) or any other
        non-200 response.
    """
    # Normalize to an absolute URL rooted at BASE_URL.
    if path.startswith(self.BASE_URL):
        url = path
    else:
        if not path.startswith("/"):
            path = "/" + path
        url = self.BASE_URL + path

    # Append the API key with the appropriate query-string separator.
    separator = '&' if '?' in url else '?'
    url += separator + "api-key=" + self.api_key

    representation, cached = Representation.get(
        self._db, url, do_get=self.do_get, max_age=max_age,
        debug=True, pause_before=0.1)

    status = representation.status_code
    if status == 200:
        # Everything's fine.
        return json.loads(representation.content)

    diagnostic = "Response from %s was: %r" % (url, representation.content)
    if status == 403:
        raise IntegrationException(
            "API authentication failed",
            "API key is most likely wrong. %s" % diagnostic)
    raise IntegrationException(
        "Unknown API error (status %s)" % status, diagnostic)
def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    """Fetch a JSON document from the API, attaching the api-key parameter.

    :param path: A path fragment, or a full URL under BASE_URL.
    :param identifier: Unused; kept for interface compatibility.
    :param max_age: Maximum acceptable age of a cached representation.
    :return: The parsed JSON response body.
    :raise IntegrationException: for any non-200 response.
    """
    if not path.startswith(self.BASE_URL):
        if not path.startswith("/"):
            path = "/" + path
        target = self.BASE_URL + path
    else:
        target = path

    # Attach the api-key, respecting any existing query string.
    if '?' in target:
        target += '&' + "api-key=" + self.api_key
    else:
        target += '?' + "api-key=" + self.api_key

    representation, cached = Representation.get(
        self._db, target, do_get=self.do_get, max_age=max_age,
        debug=True, pause_before=0.1)

    status = representation.status_code
    if status != 200:
        diagnostic = "Response from %s was: %r" % (
            target, representation.content)
        if status == 403:
            raise IntegrationException(
                "API authentication failed",
                "API key is most likely wrong. %s" % diagnostic)
        raise IntegrationException(
            "Unknown API error (status %s)" % status, diagnostic)

    # Everything's fine.
    return json.loads(representation.content)
def lookup_by_viaf(self, viaf, working_sort_name=None, working_display_name=None, do_get=None):
    """Look up a single VIAF cluster by its VIAF ID and parse it.

    :param viaf: The VIAF ID to look up.
    :param working_sort_name: Name hint passed through to the parser.
    :param working_display_name: Name hint passed through to the parser.
    :param do_get: Optional HTTP GET implementation for Representation.get.
    :return: Whatever self.parser.parse returns for the cluster XML.
    """
    lookup_url = self.LOOKUP_URL % dict(viaf=viaf)
    representation, cached = Representation.get(
        self._db, lookup_url, do_get=do_get,
        max_age=self.REPRESENTATION_MAX_AGE)
    return self.parser.parse(
        representation.content, working_sort_name, working_display_name)
def lookup_by_name(self, sort_name, display_name=None, do_get=None, known_titles=None):
    """
    Asks VIAF for a list of author clusters, matching the passed-in author name.
    Selects the cluster we deem the best match for the author we mean.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format.
    :param do_get: Ask Representation to use Http GET?
    :param known_titles: A list of titles we know this author wrote.
    :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
    """
    author_name = sort_name or display_name

    # from OCLC tech support:
    # VIAF's SRU endpoint can only return a maximum number of 10 records
    # when the recordSchema is http://viaf.org/VIAFCluster
    maximum_records = 10  # viaf maximum that's not ignored
    contributor_candidates = []

    # limit ourselves to reading the first 500 viaf clusters, on the
    # assumption that search match quality is unlikely to be usable after that.
    # NOTE: the redundant manual `page` bookkeeping (a pre-loop `page = 1`
    # and a no-op `page += 1` inside the for loop) has been removed -- the
    # for statement already controls `page`.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)

        scope = 'local.personalNames'
        if is_corporate_name(author_name):
            scope = 'local.corporateNames'

        url = self.SEARCH_URL.format(
            scope=scope,
            author_name=author_name.encode("utf8"),
            maximum_records=maximum_records,
            start_record=start_record)
        representation, cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE)
        xml = representation.content

        candidates = self.parser.parse_multiple(
            xml, sort_name, display_name, page)
        if not any(candidates):
            # Delete the representation so it's not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id).delete()
            # We ran out of clusters, so we can relax and move on to
            # ordering the returned results
            break
        contributor_candidates.extend(candidates)

    best_match = self.select_best_match(
        candidates=contributor_candidates,
        working_sort_name=author_name,
        known_titles=known_titles)
    return best_match
def get_jsonld(self, url):
    """Retrieve *url* and package it as a pyld-style document dict.

    :return: (doc, cached) -- doc is None when the document cannot be
        loaded at all or has no content.
    """
    representation, cached = Representation.get(self._db, url)
    try:
        # Probe the URL with pyld; if the document can't be loaded,
        # give up immediately. (The loaded value itself is not used.)
        jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False

    # A cached representation with no content is useless; force a re-fetch.
    if cached and not representation.content:
        representation, cached = Representation.get(self._db, url, max_age=0)

    if not representation.content:
        return None, False

    return {
        'contextUrl': None,
        'documentUrl': url,
        'document': representation.content.decode('utf8'),
    }, cached
def get_jsonld(self, url):
    """Load *url* as a JSON-LD document dict.

    :return: (doc, cached) -- doc is None when the document cannot be
        loaded or is empty.
    """
    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False

    if cached and not representation.content:
        # Cached but empty -- bypass the cache and fetch again.
        representation, cached = Representation.get(
            self._db, url, max_age=0)

    if not representation.content:
        return None, False

    doc = dict(
        contextUrl=None,
        documentUrl=url,
        document=representation.content.decode('utf8'),
    )
    return doc, cached
def lookup_name_title(self, viaf, do_get=None):
    """Fetch the VIAF cluster for *viaf* and extract its name-title entries.

    :param viaf: A VIAF ID.
    :param do_get: Optional HTTP GET implementation for Representation.get.
    :return: A list of potential titles found in the cluster.
    """
    url = self.LOOKUP_URL % dict(viaf=viaf)
    r, cached = Representation.get(
        self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE)
    # recover=True: VIAF XML is occasionally malformed; parse what we can.
    cluster = etree.fromstring(
        r.content, parser=etree.XMLParser(recover=True))
    # Materialize the parser's results directly instead of the original
    # manual append loop.
    return list(self.parser.name_titles_for_cluster(cluster))
def lookup_by_viaf(self, viaf, working_sort_name=None, working_display_name=None, do_get=None):
    """Fetch the VIAF cluster for *viaf* and parse it into contributor data.

    :param viaf: The VIAF ID to look up.
    :param working_sort_name: Name hint forwarded to the parser.
    :param working_display_name: Name hint forwarded to the parser.
    :param do_get: Optional HTTP GET implementation.
    """
    representation, cached = Representation.get(
        self._db,
        self.LOOKUP_URL % dict(viaf=viaf),
        do_get=do_get,
        max_age=self.REPRESENTATION_MAX_AGE)
    xml = representation.content
    return self.parser.parse(xml, working_sort_name, working_display_name)
def lookup_name_title(self, viaf, do_get=None):
    """Return the list of name-title entries in the VIAF cluster for *viaf*.

    :param viaf: A VIAF ID.
    :param do_get: Optional HTTP GET implementation for Representation.get.
    """
    url = self.LOOKUP_URL % dict(viaf=viaf)
    r, cached = Representation.get(
        self._db, url, do_get=do_get, max_age=self.REPRESENTATION_MAX_AGE)
    xml = r.content
    # recover=True tolerates the occasional malformed VIAF payload.
    cluster = etree.fromstring(xml, parser=etree.XMLParser(recover=True))
    # list(...) replaces the original manual append loop (same result).
    return list(self.parser.name_titles_for_cluster(cluster))
def request(self, path, identifier=None, max_age=LIST_MAX_AGE):
    """Fetch *path* from the API and return the decoded JSON payload.

    :param path: A path fragment, or a full URL under BASE_URL.
    :param identifier: Unused; kept for interface compatibility.
    :param max_age: Maximum acceptable age of a cached representation.
    """
    if path.startswith(self.BASE_URL):
        url = path
    else:
        if not path.startswith("/"):
            path = "/" + path
        url = self.BASE_URL + path

    # Append the api-key with the right query-string separator.
    url += ('&' if '?' in url else '?') + "api-key=" + self.api_key

    representation, cached = Representation.get(
        self._db, url, do_get=self.do_get, max_age=max_age,
        debug=True, pause_before=0.1)
    return json.loads(representation.content)
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier.

    Relies on OCLC's ISBN endpoint issuing a redirect whose Location
    header contains the OCLC Number.

    :raise IOError: if there was no redirect, or the redirect target
        doesn't look like an OCLC Number URI.
    """
    url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, url, Representation.http_get_no_redirect)

    location = representation.location
    if not location:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % url)

    match = self.URI_WITH_OCLC_NUMBER.match(location)
    if not match:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % location)

    oclc_number = match.group(1)
    return Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
def oclc_number_for_isbn(self, isbn):
    """Turn an ISBN identifier into an OCLC Number identifier.

    :raise IOError: if the lookup didn't redirect, or redirected
        somewhere unrecognizable.
    """
    lookup_url = self.ISBN_BASE_URL % dict(id=isbn.identifier)
    representation, cached = Representation.get(
        self._db, lookup_url, Representation.http_get_no_redirect)

    redirect_target = representation.location
    if not redirect_target:
        raise IOError(
            "Expected %s to redirect, but couldn't find location." % lookup_url)

    match = self.URI_WITH_OCLC_NUMBER.match(redirect_target)
    if match is None:
        raise IOError(
            "OCLC redirected ISBN lookup, but I couldn't make sense of the destination, %s" % redirect_target)

    # The first capture group is the OCLC Number itself.
    oclc_number = match.groups()[0]
    return Identifier.for_foreign_id(
        self._db, Identifier.OCLC_NUMBER, oclc_number)[0]
def mirror_hyperlink(self, hyperlink):
    """Ensure the hyperlink's resource has a representation, and assign
    the mirror URL where the cover image should live.

    If the representation turns out not to be an image, a fetch
    exception is recorded instead and no mirror URL is set.

    :return: The (possibly newly fetched) representation.
    """
    resource = hyperlink.resource
    if not resource.representation:
        resource.representation, cached = Representation.get(
            self._db, resource.url, max_age=self.ONE_YEAR)
    representation = resource.representation

    media_type = representation.media_type
    if not (media_type and media_type.startswith('image/')):
        representation.fetch_exception = (
            'Representation is not an image as expected.')
        return representation

    extension = self.image_extensions_for_types.get(media_type, '')
    filename = "cover" + extension
    representation.mirror_url = self.uploader.cover_image_url(
        hyperlink.data_source, hyperlink.identifier, filename)
    self._db.commit()
    return resource.representation
def mirror_hyperlink(self, hyperlink):
    """Fetch the hyperlink's resource if needed and point it at its
    cover-image mirror location.

    Non-image representations get a fetch exception recorded and are
    returned without a mirror URL.
    """
    resource = hyperlink.resource
    if not resource.representation:
        resource.representation, cached = Representation.get(
            self._db, resource.url, max_age=self.ONE_YEAR)
    representation = resource.representation

    is_image = (representation.media_type
                and representation.media_type.startswith('image/'))
    if not is_image:
        representation.fetch_exception = (
            'Representation is not an image as expected.')
        return representation

    extension = self.image_extensions_for_types.get(
        representation.media_type, '')
    representation.mirror_url = self.uploader.cover_image_url(
        hyperlink.data_source, hyperlink.identifier, "cover" + extension)
    self._db.commit()
    return resource.representation
def lookup_by_identifier(self, identifier, processed_uris=None):
    """Turn an Identifier into a JSON-LD document.

    :param identifier: An Identifier of type OCLC_WORK or OCLC_NUMBER.
    :param processed_uris: A set of URIs already handled, used to skip
        duplicates during a crawl.
    :return: (None, True) if the URI was already processed;
        (None, False) if the document could not be loaded.
    """
    # BUG FIX: the original default was the mutable `processed_uris=set()`,
    # which is created once and shared across every call, silently
    # accumulating URIs. Use the None-sentinel idiom instead.
    if processed_uris is None:
        processed_uris = set()

    if identifier.type == Identifier.OCLC_WORK:
        foreign_type = 'work'
        url = self.WORK_BASE_URL
    elif identifier.type == Identifier.OCLC_NUMBER:
        foreign_type = "oclc"
        url = self.BASE_URL

    url = url % dict(id=identifier.identifier, type=foreign_type)
    if url in processed_uris:
        self.log.debug("SKIPPING %s, already processed.", url)
        return None, True
    processed_uris.add(url)

    representation, cached = Representation.get(self._db, url)
    try:
        data = jsonld.load_document(url)
    # BUG FIX: `except Exception, e` is Python-2-only syntax (a
    # SyntaxError on Python 3); `as e` works on both.
    except Exception as e:
        self.log.error("EXCEPTION on %s: %s", url, e, exc_info=e)
        return None, False
def open(self):
    """Return a file-like object containing the staff-picks CSV.

    If a filename was given on the command line, open that file.
    Otherwise download the configured URL; a configured value that is
    not already an absolute URL is expanded through
    DEFAULT_URL_TEMPLATE.

    :raise ValueError: if the response is not a 200 text/csv document.
    """
    if len(sys.argv) > 1:
        return open(sys.argv[1])
    url = Configuration.integration_url(
        Configuration.STAFF_PICKS_INTEGRATION, True)
    # BUG FIX: the original condition was
    #     not url.startswith('https://') or url.startswith('http://')
    # which, due to precedence, wrapped every http:// URL in the
    # template. Only expand values that are not already absolute URLs.
    if not url.startswith(('https://', 'http://')):
        url = self.DEFAULT_URL_TEMPLATE % url
    self.log.info("Retrieving %s", url)
    representation, cached = Representation.get(
        self._db, url, do_get=Representation.browser_http_get,
        accept="text/csv", max_age=timedelta(days=1))
    if representation.status_code != 200:
        raise ValueError(
            "Unexpected status code %s" % representation.status_code)
    if not representation.media_type.startswith("text/csv"):
        raise ValueError(
            "Unexpected media type %s" % representation.media_type)
    return StringIO(representation.content)
def lookup_by_name(self, sort_name, display_name=None, do_get=None, best_match=False):
    """Search VIAF's SRU endpoint for author clusters matching the name
    and return the best match.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format.
    :param do_get: Ask Representation to use Http GET?
    :param best_match: Accepted for interface compatibility (not read here).
    """
    sort_name = sort_name or display_name

    # from OCLC tech support:
    # VIAF's SRU endpoint can only return a maximum number of 10 records
    # when the recordSchema is http://viaf.org/VIAFCluster
    maximum_records = 10  # viaf maximum that's not ignored
    contributor_candidates = []

    # limit ourselves to reading the first 500 viaf clusters, on the
    # assumption that search match quality is unlikely to be usable after that.
    # NOTE: the redundant manual `page` bookkeeping (pre-loop `page = 1`
    # and in-loop `page += 1`) has been removed; the for statement
    # already controls `page`.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)

        scope = 'local.personalNames'
        if is_corporate_name(sort_name):
            scope = 'local.corporateNames'

        url = self.SEARCH_URL.format(
            scope=scope,
            sort_name=sort_name.encode("utf8"),
            maximum_records=maximum_records,
            start_record=start_record)
        representation, cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE)
        xml = representation.content

        candidates = self.parser.parse_multiple(
            xml, sort_name, display_name, page)
        if not any(candidates):
            # Delete the representation so it's not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id).delete()
            # We ran out of clusters, so we can relax and move on to
            # ordering the returned results
            break
        contributor_candidates.extend(candidates)

    return self.select_best_match(contributor_candidates, sort_name)
def lookup_by(self, **kwargs):
    """Perform an OCLC Classify lookup.

    Keyword arguments are turned into a query string by
    self.query_string; the raw response body is returned.
    """
    url = self.BASE_URL + self.query_string(**kwargs)
    representation, cached = Representation.get(self._db, url)
    return representation.content
def lookup_by_name(self, sort_name, display_name=None, do_get=None, known_titles=None):
    """
    Asks VIAF for a list of author clusters, matching the passed-in author name.
    Selects the cluster we deem the best match for the author we mean.

    :param sort_name: Author name in Last, First format.
    :param display_name: Author name in First Last format.
    :param do_get: Ask Representation to use Http GET?
    :param known_titles: A list of titles we know this author wrote.
    :return: (selected_candidate, match_confidences, contributor_titles) for selected ContributorData.
    """
    author_name = sort_name or display_name

    # from OCLC tech support:
    # VIAF's SRU endpoint can only return a maximum number of 10 records
    # when the recordSchema is http://viaf.org/VIAFCluster
    maximum_records = 10  # viaf maximum that's not ignored
    contributor_candidates = []

    # limit ourselves to reading the first 500 viaf clusters, on the
    # assumption that search match quality is unlikely to be usable after that.
    # Removed the redundant `page = 1` initializer and the no-op
    # `page += 1` at the bottom of the loop -- the for statement
    # already drives `page`.
    for page in range(1, 51):
        start_record = 1 + maximum_records * (page - 1)

        scope = 'local.personalNames'
        if is_corporate_name(author_name):
            scope = 'local.corporateNames'

        url = self.SEARCH_URL.format(
            scope=scope,
            author_name=author_name.encode("utf8"),
            maximum_records=maximum_records,
            start_record=start_record)
        representation, cached = Representation.get(
            self._db, url, do_get=do_get,
            max_age=self.REPRESENTATION_MAX_AGE)
        xml = representation.content

        candidates = self.parser.parse_multiple(
            xml, sort_name, display_name, page)
        if not any(candidates):
            # Delete the representation so it's not cached.
            self._db.query(Representation).filter(
                Representation.id == representation.id).delete()
            # We ran out of clusters, so we can relax and move on to
            # ordering the returned results
            break
        contributor_candidates.extend(candidates)

    best_match = self.select_best_match(
        candidates=contributor_candidates,
        working_sort_name=author_name,
        known_titles=known_titles)
    return best_match
def improve_description(self, id, metadata):
    """Improve the description associated with a book, if possible.

    This involves fetching an alternate OPDS entry that might contain
    more detailed descriptions than those available in the main feed.

    :param id: Identifier of the book (not currently used for matching;
        see the TODO below).
    :param metadata: A metadata object whose links may be replaced.
    :return: The same metadata object, possibly with better descriptions.
    """
    alternate_links = []
    existing_descriptions = []
    everything_except_descriptions = []
    for x in metadata.links:
        if (x.rel == Hyperlink.ALTERNATE and x.href
            and x.media_type == OPDSFeed.ENTRY_TYPE):
            alternate_links.append(x)
        if x.rel == Hyperlink.DESCRIPTION:
            existing_descriptions.append((x.media_type, x.content))
        else:
            everything_except_descriptions.append(x)

    # Removed the unused local `better_descriptions = []` -- nothing
    # ever read or wrote it.
    for alternate_link in alternate_links:
        # There should only be one alternate link, but we'll keep
        # processing them until we get a good description.

        # Fetch the alternate entry.
        representation, is_new = Representation.get(
            self._db, alternate_link.href, max_age=self.THIRTY_DAYS,
            do_get=self.http_get)
        if representation.status_code != 200:
            continue

        # Parse the alternate entry with feedparser and run it through
        # data_detail_for_feedparser_entry().
        parsed = feedparser.parse(representation.content)
        if len(parsed['entries']) != 1:
            # This is supposed to be a single entry, and it's not.
            continue
        [entry] = parsed['entries']
        data_source = self.data_source
        detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
            entry, data_source)
        if failure:
            # There was a problem parsing the entry.
            self.log.error(failure.exception)
            continue

        # TODO: Ideally we could verify that detail_id == id, but
        # right now they are always different -- one is an HTTPS
        # URI and one is an HTTP URI. So we omit this step and
        # assume the documents at both ends of the 'alternate'
        # link identify the same resource.

        # Find any descriptions present in the alternate view which
        # are not present in the original.
        new_descriptions = [
            x for x in new_detail['links']
            if x.rel == Hyperlink.DESCRIPTION
            and (x.media_type, x.content) not in existing_descriptions
        ]
        if new_descriptions:
            # Replace old descriptions with new descriptions.
            metadata.links = (
                everything_except_descriptions + new_descriptions)
            break

    return metadata
def improve_description(self, id, metadata):
    """Improve the description associated with a book, if possible.

    This involves fetching an alternate OPDS entry that might contain
    more detailed descriptions than those available in the main feed.

    :param id: Identifier of the book (not currently used for matching;
        see the TODO below).
    :param metadata: A metadata object whose links may be replaced.
    :return: The same metadata object, possibly with better descriptions.
    """
    alternate_links = []
    existing_descriptions = []
    everything_except_descriptions = []
    for x in metadata.links:
        if (x.rel == Hyperlink.ALTERNATE and x.href
            and x.media_type == OPDSFeed.ENTRY_TYPE):
            alternate_links.append(x)
        if x.rel == Hyperlink.DESCRIPTION:
            existing_descriptions.append((x.media_type, x.content))
        else:
            everything_except_descriptions.append(x)

    # The unused local `better_descriptions = []` was removed -- it was
    # never read or written after initialization.
    for alternate_link in alternate_links:
        # There should only be one alternate link, but we'll keep
        # processing them until we get a good description.

        # Fetch the alternate entry.
        representation, is_new = Representation.get(
            self._db, alternate_link.href, max_age=self.THIRTY_DAYS,
            do_get=self.http_get)
        if representation.status_code != 200:
            continue

        # Parse the alternate entry with feedparser and run it through
        # data_detail_for_feedparser_entry().
        parsed = feedparser.parse(representation.content)
        if len(parsed['entries']) != 1:
            # This is supposed to be a single entry, and it's not.
            continue
        [entry] = parsed['entries']
        data_source = self.data_source
        detail_id, new_detail, failure = self.data_detail_for_feedparser_entry(
            entry, data_source)
        if failure:
            # There was a problem parsing the entry.
            self.log.error(failure.exception)
            continue

        # TODO: Ideally we could verify that detail_id == id, but
        # right now they are always different -- one is an HTTPS
        # URI and one is an HTTP URI. So we omit this step and
        # assume the documents at both ends of the 'alternate'
        # link identify the same resource.

        # Find any descriptions present in the alternate view which
        # are not present in the original.
        new_descriptions = [
            x for x in new_detail['links']
            if x.rel == Hyperlink.DESCRIPTION
            and (x.media_type, x.content) not in existing_descriptions
        ]
        if new_descriptions:
            # Replace old descriptions with new descriptions.
            metadata.links = (
                everything_except_descriptions + new_descriptions)
            break

    return metadata