Example #1
def get_meta_xml(itemid):
    """Returns the contents of meta_xml as JSON.
    """
    itemid = itemid.strip()
    
    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
    try:
        stats.begin("archive.org", url=url)
        metaxml = urllib2.urlopen(url).read()
        stats.end()
    except IOError:
        stats.end()
        return web.storage()
        
    # archive.org returns html on internal errors. 
    # Checking for valid xml before trying to parse it.
    if not metaxml.strip().startswith("<?xml"):
        return web.storage()
    
    try:
        defaults = {"collection": [], "external-identifier": []}
        return web.storage(xml2dict(metaxml, **defaults))
    except Exception, e:
        print >> web.debug, "Failed to parse metaxml for %s: %s" % (itemid, str(e)) 
        return web.storage()
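Every example on this page brackets an external call between stats.begin() and stats.end(); note how Example #1 has to call stats.end() on the error path as well. A minimal sketch of a context manager that makes the pairing automatic, assuming only that `stats` exposes begin(name, **kw) and end(**kw) as used above (it does not cover the variants that pass result keywords such as found= or hit= to end()):

from contextlib import contextmanager

@contextmanager
def timed(name, **kw):
    # open the timer exactly as the examples do by hand
    stats.begin(name, **kw)
    try:
        yield
    finally:
        # runs on success and on exceptions alike
        stats.end()

# usage:
#   with timed("archive.org", url=url):
#       metaxml = urllib2.urlopen(url).read()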
Example #2
    def get_many(self, sitename, data):
        keys = simplejson.loads(data['keys'])

        stats.begin("memcache.get_multi")
        result = self.memcache.get_multi(keys)
        stats.end(found=len(result))

        keys2 = [k for k in keys if k not in result]
        if keys2:
            data['keys'] = simplejson.dumps(keys2)
            result2 = ConnectionMiddleware.get_many(self, sitename, data)
            result2 = simplejson.loads(result2)

            # Memcache expects dict with (key, json) mapping and we have (key, doc) mapping.
            # Converting the docs to json before passing to memcache.
            self.mc_set_multi(dict((key, simplejson.dumps(doc)) for key, doc in result2.items()))

            result.update(result2)

        #@@ too many JSON conversions
        for k in result:
            if isinstance(result[k], six.string_types):
                result[k] = simplejson.loads(result[k])

        return simplejson.dumps(result)
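Example #2 is a read-through cache: get_multi() returns only the keys memcache already holds, the misses are fetched upstream, written back with set_multi(), and merged into the result. A standalone sketch of the same pattern against the python-memcached client, where fetch_from_backend is a hypothetical stand-in for ConnectionMiddleware.get_many:

import json
import memcache

mc = memcache.Client(['127.0.0.1:11211'])

def get_many_cached(keys, fetch_from_backend):
    result = mc.get_multi(keys)            # only the cached keys come back
    missing = [k for k in keys if k not in result]
    if missing:
        fetched = fetch_from_backend(missing)   # {key: doc}
        # memcache stores strings, so serialize each doc before writing back
        mc.set_multi(dict((k, json.dumps(doc)) for k, doc in fetched.items()))
        result.update(fetched)
    # cached entries are JSON strings, freshly fetched ones are docs
    return dict((k, json.loads(v) if isinstance(v, str) else v)
                for k, v in result.items())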
Example #3
def works_by_author(akey, sort='editions', page=1, rows=100):
    q='author_key:' + akey
    offset = rows * (page - 1)
    fields = ['key', 'author_name', 'author_key', 'title', 'subtitle',
        'edition_count', 'ia', 'cover_edition_key', 'has_fulltext',
        'first_publish_year', 'public_scan_b', 'lending_edition_s',
        'overdrive_s', 'ia_collection_s']
    fl = ','.join(fields)
    solr_select = solr_select_url + "?q.op=AND&q=%s&fq=&start=%d&rows=%d&fl=%s&wt=json" % (q, offset, rows, fl)
    facet_fields = ["author_facet", "language", "publish_year", "publisher_facet", "subject_facet", "person_facet", "place_facet", "time_facet"]
    if sort == 'editions':
        solr_select += '&sort=edition_count+desc'
    elif sort.startswith('old'):
        solr_select += '&sort=first_publish_year+asc'
    elif sort.startswith('new'):
        solr_select += '&sort=first_publish_year+desc'
    elif sort.startswith('title'):
        solr_select += '&sort=title+asc'
    solr_select += "&facet=true&facet.mincount=1&f.author_facet.facet.sort=count&f.publish_year.facet.limit=-1&facet.limit=25&" + '&'.join("facet.field=" + f for f in facet_fields)
    stats.begin("solr", url=solr_select)
    reply = json.load(urllib.urlopen(solr_select))
    stats.end()
    facets = reply['facet_counts']['facet_fields']
    works = [work_object(w) for w in reply['response']['docs']]

    def get_facet(f, limit=None):
        return list(web.group(facets[f][:limit * 2] if limit else facets[f], 2))

    return web.storage(
        num_found = int(reply['response']['numFound']),
        works = works,
        years = [(int(k), v) for k, v in get_facet('publish_year')],
        get_facet = get_facet,
        sort = sort,
    )
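Example #3 splices query parameters into the URL by hand, which leaves escaping to the caller. The same request can be assembled with urllib.urlencode, which percent-escapes each value automatically; this is essentially what Example #7's parameter list and the requests-based Example #48 do. A sketch reusing the variables from Example #3 (Python 2, to match the surrounding code):

import urllib

params = [
    ('q.op', 'AND'),
    ('q', 'author_key:' + akey),
    ('start', rows * (page - 1)),
    ('rows', rows),
    ('fl', fl),
    ('wt', 'json'),
]
# urlencode quotes each value, so no manual escaping is needed
solr_select = solr_select_url + '?' + urllib.urlencode(params)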
Example #4
def get_meta_xml(itemid):
    """Returns the contents of meta_xml as JSON.
    """
    itemid = itemid.strip()

    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
    try:
        stats.begin("archive.org", url=url)
        metaxml = urllib2.urlopen(url).read()
        stats.end()
    except IOError:
        stats.end()
        return web.storage()

    # archive.org returns html on internal errors.
    # Checking for valid xml before trying to parse it.
    if not metaxml.strip().startswith("<?xml"):
        return web.storage()

    try:
        defaults = {"collection": [], "external-identifier": []}
        return web.storage(xml2dict(metaxml, **defaults))
    except Exception, e:
        print >> web.debug, "Failed to parse metaxml for %s: %s" % (itemid,
                                                                    str(e))
        return web.storage()
Example #5
def work_search(query, limit=20, offset=0, **kw):
    """Search for works."""

    kw.setdefault("doc_wrapper", work_wrapper)
    fields = [
        "key", 
        "author_name", 
        "author_key", 
        "title",
        "edition_count",
        "ia",
        "cover_edition_key",
        "has_fulltext",
        "subject",
        "ia_collection_s",
        "public_scan_b",
        "overdrive_s",
        "lending_edition_s",
    ]
    kw.setdefault("fields", fields)

    if config.get('single_core_solr'):
        kw.setdefault("fq", "type:work")

    query = process_work_query(query)
    solr = get_works_solr()
    
    stats.begin("solr", query=query, start=offset, rows=limit, kw=kw)
    try:
        result = solr.select(query, start=offset, rows=limit, **kw)
    finally:
        stats.end()
    
    return result
Example #6
def get_results(q, offset=0, limit=100):
    q = escape_bracket(q)
    solr_select = solr_edition_select_url + "?q.op=AND&q=%s&fq=&start=%d&rows=%d&fl=*&qt=standard&wt=json" % (web.urlquote(q), offset, limit)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    return json.loads(json_data)
Example #7
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join("%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
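Because Example #7 sets hl.simple.pre and hl.simple.post to {{{ and }}}, Solr returns each highlighted fragment with the matched terms wrapped in triple braces. A small sketch of pulling the matches back out of a snippet:

import re

snippet = 'The {{{whale}}} surfaced near the {{{ship}}}.'
terms = re.findall(r'\{\{\{(.*?)\}\}\}', snippet)
# terms == ['whale', 'ship']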
Example #8
    def _editions_view(self, seeds, **kw):
        reverse = str(kw.pop("reverse", "")).lower()
        if 'sort' in kw and reverse == "true":
            # sort=\field is the couchdb-lucene's way of telling ORDER BY field DESC
            kw['sort'] = '\\' + kw['sort']
        view_url = config.get("lists", {}).get("editions_view")
        if not view_url:
            return {}

        def escape(value):
            special_chars = '+-&|!(){}[]^"~*?:\\'
            pattern = "([%s])" % re.escape(special_chars)
            
            quote = '"'
            return quote + web.re_compile(pattern).sub(r'\\\1', value) + quote
        
        q = " OR ".join("seed:" + escape(seed.encode('utf-8')) for seed in seeds)
        url = view_url + "?" + urllib.urlencode(dict(kw, q=q))
        
        stats.begin("couchdb", url=url)
        try:
            json = urllib2.urlopen(url).read()
        finally:
            stats.end()
        return simplejson.loads(json)
Example #9
def work_search(query, limit=20, offset=0, **kw):
    """Search for works."""

    kw.setdefault("doc_wrapper", work_wrapper)
    fields = [
        "key", 
        "author_name", 
        "author_key", 
        "title",
        "edition_count",
        "ia",
        "cover_edition_key",
        "has_fulltext",
        "subject",
        "ia_collection_s",
        "public_scan_b",
        "overdrive_s",
        "lending_edition_s",
    ]
    kw.setdefault("fields", fields)

    query = process_work_query(query)
    solr = get_works_solr()
    
    stats.begin("solr", query=query, start=offset, rows=limit, kw=kw)
    try:
        result = solr.select(query, start=offset, rows=limit, **kw)
    finally:
        stats.end()
    
    return result
Example #10
    def add(self, key, value, expires=0):
        key = web.safestr(key)
        value = simplejson.dumps(value)
        stats.begin("memcache.add", key=key)
        value = self.memcache.add(key, value, expires)
        stats.end()
        return value
Example #11
    def _get_solr_data(self):
        if config.get("single_core_solr"):
            key = self.key
        else:
            key = self.get_olid()

        fields = [
            "cover_edition_key", "cover_id", "edition_key", "first_publish_year",
            "has_fulltext", "lending_edition", "checked_out", "public_scan_b", "ia"]
        
        solr = get_works_solr()
        stats.begin("solr", query={"key": key}, fields=fields)
        try:
            d = solr.select({"key": key}, fields=fields)
        finally:
            stats.end()
            
        if d.num_found > 0:
            w = d.docs[0]
        else:
            w = None
                
        # Replace _solr_data property with the attribute
        self.__dict__['_solr_data'] = w
        return w
Example #12
    def _couchdb_view(self, db, viewname, **kw):
        stats.begin("couchdb", db=db.name, view=viewname, kw=kw)
        try:
            result = db.view(viewname, **kw)
        finally:
            stats.end()
        return result
Example #13
    def set(self, key, value, expires=0):
        key = web.safestr(key)
        value = simplejson.dumps(value)
        stats.begin("memcache.set", key=key)
        value = self.memcache.set(key, value, expires)
        stats.end()
        return value
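Examples #10 and #13 differ only in the memcache operation: set() overwrites unconditionally, while add() stores only when the key is absent, which also makes add() usable as a first-writer-wins guard. A quick illustration with the python-memcached client:

import memcache

mc = memcache.Client(['127.0.0.1:11211'])

mc.set('k', '1')   # True: unconditional write
mc.add('k', '2')   # falsy: key already exists, value left unchanged
mc.get('k')        # '1'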
Example #14
    def _get_solr_data(self):
        if config.get("single_core_solr"):
            key = self.key
        else:
            key = self.get_olid()

        fields = [
            "cover_edition_key", "cover_id", "edition_key",
            "first_publish_year", "has_fulltext", "lending_edition_s",
            "checked_out", "public_scan_b", "ia"
        ]

        solr = get_works_solr()
        stats.begin("solr", query={"key": key}, fields=fields)
        try:
            d = solr.select({"key": key}, fields=fields)
        except Exception as e:
            logging.getLogger("openlibrary").exception(
                "Failed to get solr data")
            return None
        finally:
            stats.end()

        if d.num_found > 0:
            w = d.docs[0]
        else:
            w = None

        # Replace _solr_data property with the attribute
        self.__dict__['_solr_data'] = w
        return w
Example #15
    def get_ia_meta_fields(self):
        # Check for cached value
        # $$$ we haven't assigned _ia_meta_fields the first time around but there's apparently
        #     some magic that lets us check this way (and breaks using hasattr to check if defined)
        if self._ia_meta_fields:
            return self._ia_meta_fields
            
        if not self.get('ocaid', None):
            return {}
        ia = self.ocaid
        url = 'http://www.archive.org/download/%s/%s_meta.xml' % (ia, ia)
        reply = dict([ (set_name, set()) for set_name in ia_meta_sets ]) # create empty sets
        try:
            stats.begin("archive.org", url=url)
            f = urllib2.urlopen(url)
            stats.end()
        except:
            stats.end()
            return reply
        for line in f:
            m = re_meta_field.search(line)
            if not m:
                continue
            k = m.group(1).lower()
            v = m.group(2)
            if k == 'collection':
                reply[k].add(v.lower())
            elif k in ia_meta_sets:
                reply[k].add(v)
            else:
                if k in ia_meta_fields:
                    reply[k] = v

        self._ia_meta_fields = reply
        return self._ia_meta_fields
Example #16
    def get_ia_meta_fields(self):
        if not self.get('ocaid', None):
            return {}
        ia = self.ocaid
        url = 'http://www.archive.org/download/%s/%s_meta.xml' % (ia, ia)
        reply = { 'collection': set() }
        try:
            stats.begin("archive.org", url=url)
            f = urllib2.urlopen(url)
            stats.end()
        except:
            stats.end()
            return reply
        for line in f:
            m = re_meta_field.search(line)
            if not m:
                continue
            k = m.group(1).lower()
            v = m.group(2)
            if k == 'collection':
                reply[k].add(v.lower())
            else:
                assert k == 'contributor'
                reply[k] = v

        return reply
Example #17
    def add(self, key, value, expires=0):
        key = web.safestr(key)
        value = json.dumps(value)
        stats.begin("memcache.add", key=key)
        value = self.memcache.add(key, value, expires)
        stats.end()
        return value
Example #18
    def get_couchdb_docs(self, db, keys):
        try:
            stats.begin(name="_all_docs", keys=keys, include_docs=True)
            docs = dict((row.id, row.doc) for row in db.view("_all_docs", keys=keys, include_docs=True))
        finally:
            stats.end()
        return docs
Example #19
def read_from_archive(ia):
    meta_xml = 'http://archive.org/download/' + ia + '/' + ia + '_meta.xml'
    stats.begin("archive.org", url=meta_xml)
    xml_data = urllib.urlopen(meta_xml)
    item = {}
    try:
        tree = etree.parse(xml_data)
    except etree.XMLSyntaxError:
        return {}
    finally:
        stats.end()
    root = tree.getroot()

    fields = ['title', 'creator', 'publisher', 'date', 'language']

    for k in 'title', 'date', 'publisher':
        v = root.find(k)
        if v is not None:
            item[k] = v.text

    for k in 'creator', 'language', 'collection':
        v = root.findall(k)
        if len(v):
            item[k] = [i.text for i in v if i.text]
    return item
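For reference, an abridged, illustrative _meta.xml shape (hypothetical values) of the kind the find/findall calls in Example #19 are walking; single-valued fields like <title> are read with find(), repeatable ones like <collection> with findall():

from lxml import etree

sample = b"""<metadata>
  <title>Example item</title>
  <collection>opensource</collection>
  <collection>texts</collection>
</metadata>"""
root = etree.fromstring(sample)
root.find('title').text                        # 'Example item'
[c.text for c in root.findall('collection')]   # ['opensource', 'texts']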
Example #20
def _old_get_meta_xml(itemid):
    """Returns the contents of meta_xml as JSON.
    """
    itemid = web.safestr(itemid.strip())
    url = 'http://www.archive.org/download/%s/%s_meta.xml' % (itemid, itemid)
    try:
        stats.begin('archive.org', url=url)
        metaxml = urllib2.urlopen(url).read()
        stats.end()
    except IOError:
        logger.error("Failed to download _meta.xml for %s", itemid, exc_info=True)
        stats.end()
        return web.storage()

    # archive.org returns html on internal errors.
    # Checking for valid xml before trying to parse it.
    if not metaxml.strip().startswith("<?xml"):
        return web.storage()

    try:
        defaults = {"collection": [], "external-identifier": []}
        return web.storage(xml2dict(metaxml, **defaults))
    except Exception as e:
        logger.error("Failed to parse metaxml for %s", itemid, exc_info=True)
        return web.storage()
Example #21
    def get_many(self, sitename, data):
        keys = simplejson.loads(data['keys'])

        stats.begin("memcache.get_multi")
        result = self.memcache.get_multi(keys)
        stats.end(found=len(result))

        keys2 = [k for k in keys if k not in result]
        if keys2:
            data['keys'] = simplejson.dumps(keys2)
            result2 = ConnectionMiddleware.get_many(self, sitename, data)
            result2 = simplejson.loads(result2)

            # Memcache expects dict with (key, json) mapping and we have (key, doc) mapping.
            # Converting the docs to json before passing to memcache.
            self.mc_set_multi(
                dict((key, simplejson.dumps(doc))
                     for key, doc in result2.items()))

            result.update(result2)

        #@@ too many JSON conversions
        for k in result:
            if isinstance(result[k], basestring):
                result[k] = simplejson.loads(result[k])

        return simplejson.dumps(result)
Example #22
def read_from_archive(ia):
    meta_xml = 'http://www.archive.org/download/' + ia + '/' + ia + '_meta.xml'
    stats.begin("archive.org", url=meta_xml)
    xml_data = urllib.urlopen(meta_xml)
    item = {}
    try:
        tree = etree.parse(xml_data)
    except etree.XMLSyntaxError:
        return {}
    finally:
        stats.end()
    root = tree.getroot()

    fields = ['title', 'creator', 'publisher', 'date', 'language']

    for k in 'title', 'date', 'publisher':
        v = root.find(k)
        if v is not None:
            item[k] = v.text

    for k in 'creator', 'language', 'collection':
        v = root.findall(k)
        if len(v):
            item[k] = [i.text for i in v if i.text]
    return item
Example #23
    def _editions_view(self, seeds, **kw):
        reverse = str(kw.pop("reverse", "")).lower()
        if 'sort' in kw and reverse == "true":
            # sort=\field is the couchdb-lucene's way of telling ORDER BY field DESC
            kw['sort'] = '\\' + kw['sort']
        view_url = config.get("lists", {}).get("editions_view")
        if not view_url:
            return {}

        def escape(value):
            special_chars = '+-&|!(){}[]^"~*?:\\'
            pattern = "([%s])" % re.escape(special_chars)

            quote = '"'
            return quote + web.re_compile(pattern).sub(r'\\\1', value) + quote

        q = " OR ".join("seed:" + escape(seed.encode('utf-8'))
                        for seed in seeds)
        url = view_url + "?" + urllib.urlencode(dict(kw, q=q))

        stats.begin("couchdb", url=url)
        try:
            json = urllib2.urlopen(url).read()
        finally:
            stats.end()
        return simplejson.loads(json)
Example #24
    def memcache_set(self, args, kw, value, time):
        """Adds value and time to memcache. Key is computed from the arguments."""
        key = self.compute_key(args, kw)
        json_data = self.json_encode([value, time])

        stats.begin("memcache.set", key=key)
        self.memcache.set(key, json_data)
        stats.end()
Example #25
        def get_results(q,
                        offset=0,
                        limit=100,
                        snippets=3,
                        fragsize=200,
                        hl_phrase=False):
            m = re_bad_fields.match(q)
            if m:
                return {'error': m.group(1) + ' search not supported'}
            q = escape_q(q)
            solr_params = [
                ('fl', 'ia,body_length,page_count'),
                ('hl', 'true'),
                ('hl.fl', 'body'),
                ('hl.snippets', snippets),
                ('hl.mergeContiguous', 'true'),
                ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
                ('hl.simple.pre', '{{{'),
                ('hl.simple.post', '}}}'),
                ('hl.fragsize', fragsize),
                ('q.op', 'AND'),
                ('q', web.urlquote(q)),
                ('start', offset),
                ('rows', limit),
                ('qf', 'body'),
                ('qt', 'standard'),
                ('hl.maxAnalyzedChars', '-1'),
                ('wt', 'json'),
            ]
            solr_select = solr_select_url + '?' + '&'.join(
                "%s=%s" % (k, unicode(v)) for k, v in solr_params)
            stats.begin("solr", url=solr_select)
            json_data = urllib.urlopen(solr_select).read()
            stats.end()

            try:
                results = simplejson.loads(json_data)
            except:
                m = re_query_parser_error.search(json_data)
                return {'error': web.htmlunquote(m.group(1))}

            ekey_doc = {}
            for doc in results['response']['docs']:
                ia = doc['ia']
                q = {'type': '/type/edition', 'ocaid': ia}
                ekeys = web.ctx.site.things(q)
                if not ekeys:
                    del q['ocaid']
                    q['source_records'] = 'ia:' + ia
                    ekeys = web.ctx.site.things(q)
                if ekeys:
                    ekey_doc[ekeys[0]] = doc

            editions = web.ctx.site.get_many(ekey_doc.keys())
            for e in editions:
                ekey_doc[e['key']]['edition'] = e
            return results
Example #26
def get_results(q, offset=0, limit=100):
    valid_fields = ['key', 'name', 'alternate_names', 'birth_date', 'death_date', 'date', 'work_count']
    q = escape_colon(escape_bracket(q), valid_fields)
    solr_select = solr_author_select_url + "?q.op=AND&q=%s&fq=&start=%d&rows=%d&fl=*&qt=standard&wt=json" % (web.urlquote(q), offset, limit)
    solr_select += '&sort=work_count+desc'
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    return json.loads(json_data)
Example #27
    def memcache_set(self, args, kw, value, time):
        """Adds value and time to memcache. Key is computed from the arguments.
        """
        key = self.compute_key(args, kw)
        json = self.json_encode([value, time])

        stats.begin("memcache.set", key=key)
        self.memcache.set(key, json)
        stats.end()
Example #28
    def get_couchdb_docs(self, db, keys):
        try:
            stats.begin(name="_all_docs", keys=keys, include_docs=True)
            docs = dict(
                (row.id, row.doc)
                for row in db.view("_all_docs", keys=keys, include_docs=True))
        finally:
            stats.end()
        return docs
Example #29
def simple_search(q, offset=0, rows=20, sort=None):
    solr_select = solr_select_url + "?version=2.2&q.op=AND&q=%s&fq=&start=%d&rows=%d&fl=*%%2Cscore&qt=standard&wt=json" % (web.urlquote(q), offset, rows)
    if sort:
        solr_select += "&sort=" + web.urlquote(sort)

    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select)
    stats.end()
    return json.load(json_data)
Example #30
def get_results(q, offset=0, limit=100):
    valid_fields = ['key', 'name', 'type', 'count']
    q = escape_colon(escape_bracket(q), valid_fields)
    solr_select = solr_subject_select_url + "?q.op=AND&q=%s&fq=&start=%d&rows=%d&fl=name,type,count&qt=standard&wt=json" % (web.urlquote(q), offset, limit)
    solr_select += '&sort=count+desc'
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    return json.loads(json_data)
Example #31
    def _couchdb_view(self, db, viewname, **kw):
        stats.begin("couchdb", db=db.name, view=viewname, kw=kw)
        try:
            result = db.view(viewname, **kw)

            # force fetching the results
            result.rows
        finally:
            stats.end()
        return result
Example #32
def execute_solr_query(url):
    stats.begin("solr", url=url)
    try:
        solr_result = urllib2.urlopen(url, timeout=3)
    except Exception as e:
        logger.exception("Failed solr query")
        return None
    finally:
        stats.end()
    return solr_result
Example #33
def top_books_from_author(akey, rows=5, offset=0):
    q = 'author_key:(' + akey + ')'
    solr_select = solr_select_url + "?q=%s&start=%d&rows=%d&fl=key,title,edition_count,first_publish_year&wt=json&sort=edition_count+desc" % (q, offset, rows)
    stats.begin("solr", url=solr_select)
    response = json.load(urllib.urlopen(solr_select))['response']
    stats.end()
    return {
        'books': [web.storage(doc) for doc in response['docs']],
        'total': response['numFound'],
    }
Example #34
    def _couchdb_view(self, db, viewname, **kw):
        stats.begin("couchdb", db=db.name, view=viewname, kw=kw)
        try:
            result = db.view(viewname, **kw)

            # force fetching the results
            result.rows
        finally:
            stats.end()
        return result
Example #35
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200):
    q = escape_bracket(q)
    solr_select = solr_select_url + "?fl=ia,body_length,page_count&hl=true&hl.fl=body&hl.snippets=%d&hl.mergeContiguous=true&hl.usePhraseHighlighter=false&hl.simple.pre={{{&hl.simple.post=}}}&hl.fragsize=%d&q.op=AND&q=%s&start=%d&rows=%d&qf=body&qt=standard&hl.maxAnalyzedChars=1000000&wt=json" % (snippets, fragsize, web.urlquote(q), offset, limit)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
Example #36
def run_solr_query(param=None, rows=100, page=1, sort=None, spellcheck_count=None):
    # called by do_search
    if param is None:
        param = {}
    if spellcheck_count is None:
        spellcheck_count = default_spellcheck_count
    offset = rows * (page - 1)

    (q_list, use_dismax) = build_q_list(param)

    fields = ['key', 'author_name', 'author_key', 'title', 'subtitle', 'edition_count', 'ia', 'has_fulltext', 'first_publish_year', 'cover_edition_key', 'public_scan_b', 'lending_edition_s', 'overdrive_s', 'ia_collection_s']
    fl = ','.join(fields)
    if use_dismax:
        q = web.urlquote(' '.join(q_list))
        solr_select = solr_select_url + "?defType=dismax&q.op=AND&q=%s&qf=text+title^5+author_name^5&bf=sqrt(edition_count)^10&start=%d&rows=%d&fl=%s&wt=standard" % (q, offset, rows, fl)
    else:
        q = web.urlquote(' '.join(q_list + ['_val_:"sqrt(edition_count)"^10']))
        solr_select = solr_select_url + "?q.op=AND&q=%s&start=%d&rows=%d&fl=%s&wt=standard" % (q, offset, rows, fl)
    solr_select += '&spellcheck=true&spellcheck.count=%d' % spellcheck_count
    solr_select += "&facet=true&" + '&'.join("facet.field=" + f for f in facet_fields)

    if 'public_scan' in param:
        v = param.pop('public_scan').lower()
        if v in ('true', 'false'):
            if v == 'false':
                # also constrain on print disabled since the index may not be in sync
                param.setdefault('print_disabled', 'false')
            solr_select += '&fq=public_scan_b:%s' % v

    if 'print_disabled' in param:
        v = param.pop('print_disabled').lower()
        if v in ('true', 'false'):
            solr_select += '&fq=%ssubject_key:protected_daisy' % ('-' if v == 'false' else '')

    k = 'has_fulltext'
    if k in param:
        v = param[k].lower()
        if v not in ('true', 'false'):
            del param[k]
        else:
            param[k] = v
            solr_select += '&fq=%s:%s' % (k, v)

    for k in facet_list_fields:
        if k == 'author_facet':
            k = 'author_key'
        if k not in param:
            continue
        v = param[k]
        solr_select += ''.join('&fq=%s:"%s"' % (k, url_quote(l)) for l in v if l)
    if sort:
        solr_select += "&sort=" + url_quote(sort)

    stats.begin("solr", url=solr_select)
    reply = urllib.urlopen(solr_select).read()
    stats.end()
    return (reply, solr_select, q_list)
Example #37
    def memcache_get(self, args, kw):
        """Reads the value from memcache. Key is computed from the arguments.

        Returns (value, time) when the value is available, None otherwise.
        """
        key = self.compute_key(args, kw)
        stats.begin("memcache.get", key=key)
        json = self.memcache.get(key)
        stats.end(hit=bool(json))

        return json and self.json_decode(json)
Example #38
def get_item_manifest(item_id, item_server, item_path):
    url = 'https://%s/BookReader/BookReaderJSON.php' % item_server
    url += '?itemPath=%s&itemId=%s&server=%s' % (item_path, item_id, item_server)
    try:
        stats.begin('archive.org', url=url)
        manifest = requests.get(url)
        stats.end()
        return manifest.json()
    except IOError:
        stats.end()
        return {}
Example #39
    def memcache_get(self, args, kw):
        """Reads the value from memcache. Key is computed from the arguments.

        Returns (value, time) when the value is available, None otherwise.
        """
        key = self.compute_key(args, kw)
        stats.begin("memcache.get", key=key)
        json = self.memcache.get(key)
        stats.end(hit=bool(json))

        return json and self.json_decode(json)
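Examples #24 and #27 write, and Examples #37 and #39 read, the same [value, time] envelope: the timestamp is stored next to the value, presumably so the memoizer can judge freshness at read time (the expiry policy itself is not shown on this page). A stdlib-only sketch of that envelope:

import json
import time

def encode_entry(value):
    # store the value together with the moment it was computed
    return json.dumps([value, time.time()])

def decode_entry(data, max_age):
    value, stored_at = json.loads(data)
    if time.time() - stored_at > max_age:
        return None  # too old: treat as a cache miss
    return value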
Example #40
def get_item_json(itemid):
    itemid = web.safestr(itemid.strip())
    url = 'http://archive.org/metadata/%s' % itemid
    try:
        stats.begin('archive.org', url=url)
        metadata_json = urllib2.urlopen(url).read()
        stats.end()
        return simplejson.loads(metadata_json)
    except IOError:
        stats.end()
        return {}
Example #41
def search_inside_result_count(q):
    q = escape_q(q)
    solr_select = solr_select_url + "?fl=ia&q.op=AND&wt=json&q=" + web.urlquote(q)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        results = simplejson.loads(json_data)
    except:
        return None
    return results['response']['numFound']
Example #42
def _get_metadata(itemid):
    """Returns metadata by querying the archive.org metadata API.
    """
    url = "http://www.archive.org/metadata/%s" % itemid
    try:
        stats.begin("archive.org", url=url)
        text = urllib2.urlopen(url).read()
        stats.end()
    except IOError:
        stats.end()
        return None
    try:
        return simplejson.loads(text)
    except ValueError:
        return None
Example #43
def get_item_manifest(item_id, item_server, item_path):
    url = 'https://%s/BookReader/BookReaderJSON.php' % item_server
    url += "?itemPath=%s&itemId=%s&server=%s" % (item_path, item_id, item_server)
    try:
        stats.begin("archive.org", url=url)
        manifest_json = urllib2.urlopen(url).read()
        stats.end()
        return simplejson.loads(manifest_json)
    except IOError:
        stats.end()
        return {}
Example #44
def sorted_work_editions(wkey, json_data=None):
    q='key:' + wkey
    if not json_data: # for testing
        solr_select = solr_select_url + "?version=2.2&q.op=AND&q=%s&rows=10&fl=edition_key&qt=standard&wt=json" % q
        stats.begin("solr", url=solr_select)
        json_data = urllib.urlopen(solr_select).read()
        stats.end()
    reply = json.loads(json_data)

    if reply['response']['numFound'] == 0:
        return []
    return reply["response"]['docs'][0].get('edition_key', [])
Example #45
def search_inside_result_count(q):
    q = escape_q(q)
    solr_select = solr_select_url + "?fl=ia&q.op=AND&wt=json&q=" + web.urlquote(
        q)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        results = simplejson.loads(json_data)
    except:
        return None
    return results['response']['numFound']
Example #46
def sorted_work_editions(wkey, json_data=None):
    q='key:' + wkey
    if not json_data: # for testing
        solr_select = solr_select_url + "?version=2.2&q.op=AND&q=%s&rows=10&fl=edition_key&qt=standard&wt=json" % q
        stats.begin("solr", url=solr_select)
        json_data = urllib.urlopen(solr_select).read()
        stats.end()
    reply = json.loads(json_data)

    if reply['response']['numFound'] == 0:
        return []
    return reply["response"]['docs'][0].get('edition_key', [])
Example #47
def _get_metadata(itemid):
    """Returns metadata by querying the archive.org metadata API.
    """
    itemid = web.safestr(itemid.strip())
    url = '%s/metadata/%s' % (IA_BASE_URL, itemid)
    try:
        stats.begin('archive.org', url=url)
        metadata = requests.get(url)
        stats.end()
        return metadata.json()
    except IOError:
        stats.end()
        return {}
Example #48
def execute_solr_query(
        solr_path: str,
        params: Union[dict, list[tuple[str, Any]]]) -> Optional[Response]:
    stats.begin("solr", url=f'{solr_path}?{urlencode(params)}')
    try:
        response = requests.get(solr_path, params=params, timeout=10)
        response.raise_for_status()
    except requests.HTTPError:
        logger.exception("Failed solr query")
        return None
    finally:
        stats.end()
    return response
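Example #48 is the typed, requests-based variant; the params mapping is encoded by requests itself and failures surface as None. A hypothetical call (the Solr endpoint below is an assumed placeholder, not one defined by these examples):

solr_base = 'http://localhost:8983/solr/openlibrary/select'
resp = execute_solr_query(solr_base, {'q': 'key:/works/OL45883W', 'wt': 'json'})
if resp is not None:
    docs = resp.json()['response']['docs']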
Example #49
    def request(self, sitename, path, method='GET', data=None):
        path = "/" + sitename + path
        web.ctx.infobase_auth_token = self.get_auth_token()
        try:
            stats.begin("infobase", path=path, method=method, data=data)
            out = server.request(path, method, data)
            stats.end()
            if 'infobase_auth_token' in web.ctx:
                self.set_auth_token(web.ctx.infobase_auth_token)
        except common.InfobaseException as e:
            stats.end(error=True)
            self.handle_error(e.status, str(e))
        return out
Example #50
    def _solr_data(self):
        fields = [
            "cover_edition_key", "cover_id", "edition_key", "first_publish_year",
            "has_fulltext", "lending_edition_s", "public_scan_b", "ia"]

        solr = get_solr()
        stats.begin("solr", get=self.key, fields=fields)
        try:
            return solr.get(self.key, fields=fields)
        except Exception as e:
            logging.getLogger("openlibrary").exception("Failed to get solr data")
            return None
        finally:
            stats.end()
Example #51
def get_work_iaids(wkey):
    #wid = wkey.split('/')[2]
    solr_select_url = get_works_solr_select_url()
    filter = 'ia'
    q = 'key:' + wkey
    stats.begin('solr', url=wkey)
    solr_select = solr_select_url + "?version=2.2&q.op=AND&q=%s&rows=10&fl=%s&qt=standard&wt=json&fq=type:work" % (q, filter)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    print json_data
    reply = simplejson.loads(json_data)
    if reply['response']['numFound'] == 0:
        return []
    return reply["response"]['docs'][0].get(filter, [])
Example #52
def get_work_iaids(wkey):
    #wid = wkey.split('/')[2]
    solr_select_url = get_solr_select_url()
    filter = 'ia'
    q = 'key:' + wkey
    stats.begin('solr', url=wkey)
    solr_select = solr_select_url + "?version=2.2&q.op=AND&q=%s&rows=10&fl=%s&qt=standard&wt=json&fq=type:work" % (
        q, filter)
    reply = requests.get(solr_select).json()
    stats.end()
    print(reply)
    if reply['response']['numFound'] == 0:
        return []
    return reply["response"]['docs'][0].get(filter, [])
Example #53
def execute_solr_query(url):
    """
    Returns a requests.Response or None
    """
    stats.begin("solr", url=url)
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.HTTPError:
        logger.exception("Failed solr query")
        return None
    finally:
        stats.end()
    return response
Example #54
def _get_count_docs(ndays):
    """Returns the count docs from admin couchdb database.
    
    This function is memoized to avoid accessing couchdb for every request.
    """
    admin_db = couchdb.Database(config.admin.counts_db)
    end      = datetime.datetime.now().strftime("counts-%Y-%m-%d")
    start    = (datetime.datetime.now() - datetime.timedelta(days = ndays)).strftime("counts-%Y-%m-%d")
        
    stats.begin("couchdb")
    docs = [x.doc for x in admin_db.view("_all_docs",
                                         startkey_docid = start,
                                         endkey_docid   = end,
                                         include_docs = True)]
    stats.end()
    return docs