def _load_solrdata(self):
    if self.type == "edition":
        return {
            'ebook_count': int(bool(self.document.ocaid)),
            'edition_count': 1,
            'work_count': 1,
            'last_update': self.document.last_modified,
        }
    else:
        q = self.get_solr_query_term()
        if q:
            solr = get_solr()
            result = solr.select(q, fields=["edition_count", "ebook_count_i"])
            last_update_i = [
                doc['last_update_i'] for doc in result.docs if 'last_update_i' in doc
            ]
            if last_update_i:
                # use the most recent update across the matching documents
                last_update = self._inttime_to_datetime(max(last_update_i))
            else:
                # if last_update is not present in solr, consider last_modified
                # of this document as last_update
                if self.type in ['work', 'author']:
                    last_update = self.document.last_modified
                else:
                    last_update = None
            return {
                'ebook_count': sum(doc.get('ebook_count_i', 0) for doc in result.docs),
                'edition_count': sum(doc.get('edition_count', 0) for doc in result.docs),
                'work_count': result.num_found,
                'last_update': last_update,
            }
        return {}

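# A minimal sketch, assuming _inttime_to_datetime() simply converts solr's
# integer last_update_i field (seconds since the Unix epoch) to a datetime;
# the real helper lives on the same class and may differ.
import datetime

def _inttime_to_datetime(self, t):
    # hypothetical implementation: interpret t as a UTC epoch timestamp
    return datetime.datetime.utcfromtimestamp(t)
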
def get_solr_works(work_key: Iterable[str]) -> dict[str, dict]:
    from openlibrary.plugins.worksearch.search import get_solr

    return {
        doc['key']: doc
        for doc in get_solr().get_many(set(work_key), fields=DEFAULT_SEARCH_FIELDS)
    }

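# Usage sketch (hypothetical keys; assumes a configured solr backend with
# these works indexed and 'title' among DEFAULT_SEARCH_FIELDS):
def example_get_solr_works():
    docs = get_solr_works(["/works/OL45883W", "/works/OL27448W"])
    for key, doc in docs.items():
        print(key, doc.get('title'))
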
def GET(self):
    i = web.input(q="", limit=5)
    i.limit = safeint(i.limit, 5)
    solr = get_solr()
    q = solr.escape(i.q).strip()
    if is_work_olid(q.upper()):
        # ensure uppercase; key is case sensitive in solr
        solr_q = 'key:"/works/%s"' % q.upper()
    else:
        solr_q = 'title:"%s"^2 OR title:(%s*)' % (q, q)
    params = {
        'q_op': 'AND',
        'sort': 'edition_count desc',
        'rows': i.limit,
        'fq': 'type:work',
        # limit the fields returned for better performance
        'fl': 'key,title,subtitle,cover_i,first_publish_year,author_name,edition_count',
    }
    data = solr.select(solr_q, **params)
    # exclude fake works that actually have an edition key
    docs = [d for d in data['docs'] if d['key'][-1] == 'W']
    for d in docs:
        # Required by the frontend
        d['name'] = d['key'].split('/')[-1]
        d['full_title'] = d['title']
        if 'subtitle' in d:
            d['full_title'] += ": " + d['subtitle']
    return to_json(docs)

def _get_solr_data(self):
    fields = [
        "cover_edition_key", "cover_id", "edition_key", "first_publish_year",
        "has_fulltext", "lending_edition_s", "checked_out", "public_scan_b", "ia",
    ]
    solr = get_solr()
    stats.begin("solr", query={"key": self.key}, fields=fields)
    try:
        d = solr.select({"key": self.key}, fields=fields)
    except Exception:
        logging.getLogger("openlibrary").exception("Failed to get solr data")
        return None
    finally:
        stats.end()

    if d.num_found > 0:
        w = d.docs[0]
    else:
        w = None

    # Replace the _solr_data property with the attribute
    self.__dict__['_solr_data'] = w
    return w

def GET(self):
    i = web.input(q="", limit=5)
    i.limit = safeint(i.limit, 5)
    solr = get_solr()
    q = solr.escape(i.q).strip()
    solr_q = ''
    if is_author_olid(q.upper()):
        # ensure uppercase; key is case sensitive in solr
        solr_q = 'key:"/authors/%s"' % q.upper()
    else:
        prefix_q = q + "*"
        solr_q = 'name:(%s) OR alternate_names:(%s)' % (prefix_q, prefix_q)
    params = {
        'q_op': 'AND',
        'sort': 'work_count desc',
        'rows': i.limit,
        'fq': 'type:author',
    }
    data = solr.select(solr_q, **params)
    docs = data['docs']
    for d in docs:
        if 'top_work' in d:
            d['works'] = [d.pop('top_work')]
        else:
            d['works'] = []
        d['subjects'] = d.pop('top_subjects', [])
    return to_json(docs)

def find_matches(self, i):
    """
    Tries to find an edition, or work, or multiple work candidates that match
    the given input data.

    Case#1: No match. None is returned.
    Case#2: Work match but not edition. Work is returned.
    Case#3: Work match and edition match. Edition is returned.
    Case#4: Multiple work match. List of works is returned.

    :param web.utils.Storage i: addbook user supplied formdata
    :rtype: None or list or Work or Edition
    :return: None or Work or Edition or list of Works that are likely matches.
    """
    i.publish_year = i.publish_date and self.extract_year(i.publish_date)
    author_key = i.authors and i.authors[0].author.key

    # work is set from the templates/books/check.html page.
    work_key = i.get('work')

    # work_key is set to none-of-these when the user selects the none-of-these link.
    if work_key == 'none-of-these':
        return None  # Case 1, from check page

    work = work_key and web.ctx.site.get(work_key)
    if work:
        edition = self.try_edition_match(
            work=work,
            publisher=i.publisher,
            publish_year=i.publish_year,
            id_name=i.id_name,
            id_value=i.id_value,
        )
        return edition or work  # Case 3 or 2, from check page

    edition = self.try_edition_match(
        title=i.title,
        author_key=author_key,
        publisher=i.publisher,
        publish_year=i.publish_year,
        id_name=i.id_name,
        id_value=i.id_value,
    )
    if edition:
        return edition  # Case 2 or 3 or 4, from add page

    solr = get_solr()
    # Less exact solr search than try_edition_match(): search by the
    # supplied title and author only.
    result = solr.select(
        {'title': i.title, 'author_key': author_key.split("/")[-1]},
        doc_wrapper=make_work,
        q_op="AND",
    )

    if result.num_found == 0:
        return None  # Case 1, from add page
    elif result.num_found == 1:
        return result.docs[0]  # Case 2
    else:
        return result.docs  # Case 4

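# A minimal sketch, assuming extract_year() just pulls the first 4-digit
# run out of a free-form publish date; the actual helper may differ.
import re

def extract_year(self, value):
    # e.g. "New York, 1999" -> "1999"; None when no 4-digit run exists
    m = re.search(r"\d{4}", value)
    return m and m.group()
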
def _get_edition_keys_from_solr(self, query_terms):
    if not query_terms:
        return
    q = " OR ".join(query_terms)
    solr = get_solr()
    result = solr.select(q, fields=["edition_key"], rows=10000)
    for doc in result['docs']:
        if 'edition_key' not in doc:
            continue
        for k in doc['edition_key']:
            yield "/books/" + k

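# Usage sketch for the generator above (hypothetical query terms; assumes
# an index where edition_key is a multi-valued field on work docs):
def example_edition_keys(self):
    terms = ["author_key:OL18319A", "key:/works/OL45883W"]
    return list(self._get_edition_keys_from_solr(terms))
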
def get_solr_query_term(self):
    if self.type == 'edition':
        return "edition_key:" + self._get_document_basekey()
    elif self.type == 'work':
        return 'key:/works/' + self._get_document_basekey()
    elif self.type == 'author':
        return "author_key:" + self._get_document_basekey()
    elif self.type == 'subject':
        typ, value = self.key.split(":", 1)
        # escaping value as it can have special chars like : etc.
        value = get_solr().escape(value)
        return "%s_key:%s" % (typ, value)

def _get_all_subjects(self):
    solr = get_solr()
    q = self._get_solr_query_for_subjects()

    # Solr has a maxBooleanClauses constraint; with too many seeds, the
    # query would exceed it.
    if len(self.seeds) > 500:
        logger.warning("More than 500 seeds. Skipping solr query for finding subjects.")
        return []

    facet_names = ['subject_facet', 'place_facet', 'person_facet', 'time_facet']
    try:
        result = solr.select(
            q, fields=[], facets=facet_names, facet_limit=20, facet_mincount=1
        )
    except OSError:
        logger.error("Error in finding subjects of list %s", self.key, exc_info=True)
        return []

    def get_subject_prefix(facet_name):
        name = facet_name.replace("_facet", "")
        if name == 'subject':
            return ''
        else:
            return name + ":"

    def process_subject(facet_name, title, count):
        prefix = get_subject_prefix(facet_name)
        key = prefix + title.lower().replace(" ", "_")
        url = "/subjects/" + key
        return web.storage(
            {"title": title, "name": title, "count": count, "key": key, "url": url}
        )

    def process_all():
        facets = result['facets']
        for k in facet_names:
            for f in facets.get(k, []):
                yield process_subject(f.name, f.value, f.count)

    return sorted(process_all(), reverse=True, key=lambda s: s["count"])

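# Shape sketch of what process_subject() yields (hypothetical values):
def example_subject_entry():
    return web.storage({
        'title': 'Science Fiction',
        'name': 'Science Fiction',
        'count': 42,
        'key': 'science_fiction',           # e.g. 'place:london' for a place facet
        'url': '/subjects/science_fiction',
    })
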
def _solr_data(self):
    fields = [
        "cover_edition_key", "cover_id", "edition_key", "first_publish_year",
        "has_fulltext", "lending_edition_s", "public_scan_b", "ia",
    ]
    solr = get_solr()
    stats.begin("solr", get=self.key, fields=fields)
    try:
        return solr.get(self.key, fields=fields)
    except Exception:
        logging.getLogger("openlibrary").exception("Failed to get solr data")
        return None
    finally:
        stats.end()

def GET(self):
    i = web.input(q="", limit=5)
    i.limit = safeint(i.limit, 5)
    solr = get_solr()
    q = solr.escape(i.q).strip()
    query_is_key = is_work_olid(q.upper())
    if query_is_key:
        # ensure uppercase; key is case sensitive in solr
        solr_q = 'key:"/works/%s"' % q.upper()
    else:
        solr_q = f'title:"{q}"^2 OR title:({q}*)'
    params = {
        'q_op': 'AND',
        'sort': 'edition_count desc',
        'rows': i.limit,
        'fq': 'type:work',
        # limit the fields returned for better performance
        'fl': 'key,title,subtitle,cover_i,first_publish_year,author_name,edition_count',
    }
    data = solr.select(solr_q, **params)
    # exclude fake works that actually have an edition key
    docs = [d for d in data['docs'] if d['key'][-1] == 'W']

    if query_is_key and not docs:
        # Grumble! Work not in solr yet. Create a dummy.
        key = '/works/%s' % q.upper()
        work = web.ctx.site.get(key)
        if work:
            docs = [work.as_fake_solr_record()]

    for d in docs:
        # Required by the frontend
        d['name'] = d['key'].split('/')[-1]
        d['full_title'] = d['title']
        if 'subtitle' in d:
            d['full_title'] += ": " + d['subtitle']
    return to_json(docs)

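# Response-shape sketch for the handler above (hypothetical values; the
# route itself is registered elsewhere in the plugin):
def example_work_autocomplete_doc():
    return {
        'key': '/works/OL27448W',
        'title': 'The Lord of the Rings',
        'edition_count': 120,
        # added by the loop above:
        'name': 'OL27448W',
        'full_title': 'The Lord of the Rings',
    }
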
def random_ebooks(limit=2000):
    solr = search.get_solr()
    sort = "edition_count desc"
    result = solr.select(
        query='has_fulltext:true -public_scan_b:false',
        rows=limit,
        sort=sort,
        fields=[
            'has_fulltext',
            'key',
            'ia',
            'title',
            'cover_edition_key',
            'author_key',
            'author_name',
        ],
    )
    return [format_work_data(doc) for doc in result.get('docs', []) if doc.get('ia')]

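# format_work_data() is assumed to reshape a solr work doc for display; a
# fuller inline variant appears as process_doc() in the longer
# random_ebooks() below.
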
def GET(self):
    from openlibrary.plugins.worksearch.search import get_solr

    result = get_solr().select(
        query='borrowed_b:false', fields=['key', 'lending_edition_s'], limit=100
    )

    def make_doc(d):
        # Makes a store doc from a solr doc
        return {
            "_key": "ebooks/books/" + d['lending_edition_s'],
            "_rev": None,  # Don't worry about consistency
            "type": "ebook",
            "book_key": "/books/" + d['lending_edition_s'],
            "borrowed": "false",
        }

    docs = [make_doc(d) for d in result['docs']]
    docdict = {d['_key']: d for d in docs}
    web.ctx.site.store.update(docdict)
    return delegate.RawText("ok\n")

def random_ebooks(limit=2000):
    solr = search.get_solr()
    sort = "edition_count desc"
    result = solr.select(
        query='has_fulltext:true -public_scan_b:false',
        rows=limit,
        sort=sort,
        fields=[
            'has_fulltext',
            'key',
            'ia',
            'title',
            'cover_edition_key',
            'author_key',
            'author_name',
        ],
    )

    def process_doc(doc):
        d = {}
        key = doc.get('key', '')
        # New solr stores the key as /works/OLxxxW
        if not key.startswith("/works/"):
            key = "/works/" + key
        d['url'] = key
        d['title'] = doc.get('title', '')
        if 'author_key' in doc and 'author_name' in doc:
            d['authors'] = [
                {"key": akey, "name": aname}
                for akey, aname in zip(doc['author_key'], doc['author_name'])
            ]
        if 'cover_edition_key' in doc:
            d['cover_url'] = (
                h.get_coverstore_url() + "/b/olid/%s-M.jpg" % doc['cover_edition_key']
            )
        d['read_url'] = "//archive.org/stream/" + doc['ia'][0]
        return d

    return [process_doc(doc) for doc in result.get('docs', []) if doc.get('ia')]

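# Shape sketch of one processed doc (hypothetical values):
def example_random_ebook():
    return {
        'url': '/works/OL45883W',
        'title': 'The Adventures of Tom Sawyer',
        'authors': [{'key': 'OL18319A', 'name': 'Mark Twain'}],
        'cover_url': 'https://covers.openlibrary.org/b/olid/OL7353617M-M.jpg',
        'read_url': '//archive.org/stream/someiaidentifier',
    }
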
def get_solr_query_term(self):
    if self.type == 'subject':
        typ, value = self.key.split(":", 1)
        # escaping value as it can have special chars like : etc.
        value = get_solr().escape(value)
        return f"{typ}_key:{value}"
    else:
        doc_basekey = self.document.key.split("/")[-1]
        if self.type == 'edition':
            return f"edition_key:{doc_basekey}"
        elif self.type == 'work':
            return f'key:/works/{doc_basekey}'
        elif self.type == 'author':
            return f"author_key:{doc_basekey}"
        else:
            logger.warning(
                f"Cannot get solr query term for seed type {self.type}",
                extra={'list': self._list.key, 'seed': self.key},
            )
            return None

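# Examples of the query terms produced above, one per seed type
# (hypothetical keys):
#
#   edition  /books/OL7353617M     -> edition_key:OL7353617M
#   work     /works/OL45883W       -> key:/works/OL45883W
#   author   /authors/OL18319A     -> author_key:OL18319A
#   subject  place:san_francisco   -> place_key:san_francisco
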
def GET(self):
    i = web.input(q="", limit=5)
    i.limit = safeint(i.limit, 5)
    solr = get_solr()
    q = solr.escape(i.q).strip()
    query_is_key = is_author_olid(q.upper())
    if query_is_key:
        # ensure uppercase; key is case sensitive in solr
        solr_q = 'key:"/authors/%s"' % q.upper()
    else:
        prefix_q = q + "*"
        solr_q = f'name:({prefix_q}) OR alternate_names:({prefix_q})'
    params = {
        'q_op': 'AND',
        'sort': 'work_count desc',
        'rows': i.limit,
        'fq': 'type:author',
    }
    data = solr.select(solr_q, **params)
    docs = data['docs']

    if query_is_key and not docs:
        # Grumble! Must be a new author. Fetch from db, and build a "fake" solr resp
        key = '/authors/%s' % q.upper()
        author = web.ctx.site.get(key)
        if author:
            docs = [author.as_fake_solr_record()]

    for d in docs:
        if 'top_work' in d:
            d['works'] = [d.pop('top_work')]
        else:
            d['works'] = []
        d['subjects'] = d.pop('top_subjects', [])
    return to_json(docs)

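# A minimal sketch of the as_fake_solr_record() contract assumed above
# (assumption; the real method lives on the db model). For an author it
# needs just enough fields for the response-shaping loop; a work's record
# would carry 'title' instead of 'name'.
def as_fake_solr_record(self):
    return {
        'key': self.key,
        'name': self.name,
        'type': 'author',
        'work_count': 0,
    }
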
def try_edition_match(
    self,
    work=None,
    title=None,
    author_key=None,
    publisher=None,
    publish_year=None,
    id_name=None,
    id_value=None,
):
    """
    Searches solr for potential edition matches.

    :param web.Storage work:
    :param str title:
    :param str author_key: e.g. /author/OL1234A
    :param str publisher:
    :param str publish_year: yyyy
    :param str id_name: one of the keys in the mapping below
    :param str id_value:
    :rtype: None or Edition or list
    :return: None, an Edition, or a list of Works
    """
    # insufficient data
    if not publisher and not publish_year and not id_value:
        return

    q = {}
    work and q.setdefault('key', work.key.split("/")[-1])
    title and q.setdefault('title', title)
    author_key and q.setdefault('author_key', author_key.split('/')[-1])
    publisher and q.setdefault('publisher', publisher)
    # There are some errors in the indexing of publish_year.
    # Use publish_date until it is fixed.
    publish_year and q.setdefault('publish_date', publish_year)

    mapping = {
        'isbn_10': 'isbn',
        'isbn_13': 'isbn',
        'lccn': 'lccn',
        'oclc_numbers': 'oclc',
        'ocaid': 'ia',
    }
    if id_value and id_name in mapping:
        if id_name.startswith('isbn'):
            id_value = id_value.replace('-', '')
        q[mapping[id_name]] = id_value

    solr = get_solr()
    result = solr.select(q, doc_wrapper=make_work, q_op="AND")

    if len(result.docs) > 1:
        # found multiple work matches
        return result.docs
    elif len(result.docs) == 1:
        # found one work match
        work = result.docs[0]
        publisher = publisher and fuzzy_find(
            publisher,
            work.publisher,
            stopwords=("publisher", "publishers", "and"),
        )

        editions = web.ctx.site.get_many(
            ["/books/" + key for key in work.edition_key]
        )
        for e in editions:
            if publisher:
                if not e.publishers or e.publishers[0] != publisher:
                    continue
            if publish_year:
                if not e.publish_date or publish_year != self.extract_year(e.publish_date):
                    continue
            if id_value and id_name in mapping:
                if id_name not in e or id_value not in e[id_name]:
                    continue
            # return the first likely matching Edition
            return e

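# A hedged sketch of the fuzzy_find() behaviour relied on above (assumption;
# the project's actual implementation may differ): pick the candidate
# closest to the target string after dropping stopwords.
import difflib

def fuzzy_find(text, candidates, stopwords=()):
    stopwords = set(stopwords)

    def normalize(s):
        # lowercase and drop stopword tokens before comparing
        return " ".join(w for w in s.lower().split() if w not in stopwords)

    target = normalize(text)
    return max(
        candidates,
        key=lambda c: difflib.SequenceMatcher(None, target, normalize(c)).ratio(),
        default=None,
    )
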