Example #1
def parse_error(path):
    html = open(path).read(10000)
    soup = BeautifulSoup(html)

    h1 = web.htmlunquote(soup.body.h1.string or "")
    h2 = web.htmlunquote(soup.body.h2.string or "")
    message = h1.split("at")[0].strip() + ": " + (h2 and h2.splitlines()[0])

    code, url = [
        web.htmlunquote(td.string) for td in soup.body.table.findAll("td")
    ]

    # strip common prefixes
    code = web.re_compile(
        ".*/(?:staging|production)/(openlibrary|infogami|web)").sub(
            r"\1", code)

    m = web.re_compile(r"(\d\d)(\d\d)(\d\d)(\d{6})").match(
        web.numify(os.path.basename(path)))
    hh, mm, ss, microsec = m.groups()

    return web.storage(url=url,
                       message=message,
                       code=code,
                       time="%s:%s:%s" % (hh, mm, ss))
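The regex at the end slices the numified file name into a time of day. A small illustration, assuming the error files are named by timestamp; the file name below is invented:

import web  # web.py: numify drops every non-digit character

web.numify("123456.789012.html")
# -> '123456789012'
# The regex then captures ('12', '34', '56', '789012'),
# which parse_error formats as the time 12:34:56.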
Example #2
def parse_module(module_url):
    module = web.storage(
        url=module_url,
        cached_xml="",
        screenshot="",
        title="",
        title_url="",
        directory_title="",
        description="",
        author="",
        author_email="",
        author_affiliation="",
        author_location="",
        render_inline="",
    )

    if not module_url.startswith("http://"):
        raise ValueError("Ooops! Submission has failed – the URL seems to be invalid.")

    try:
        html = utils.dnl(module_url)
        html = web.htmlunquote(html)  # this may confuse the parser
        xml = utils.parse_xml(html)
    except Exception:
        raise ValueError("Ooops! Submission has failed – the XML or HTML page could not be loaded successfully.")

    xnodes = xml.xpath("//ModulePrefs")
    if not xnodes:
        raise ValueError("Ooops! The XML is valid, but we cannot find the module.")
    xnodes = xnodes[0]

    for attr in module:
        module[attr] = xnodes.get(attr) or module[attr]

    return module
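parse_module expects gadget-style XML whose ModulePrefs element carries the metadata as attributes named after the keys of the module storage object. A minimal sketch of that shape; the document below is invented for illustration:

from lxml import etree

doc = etree.XML("""
<Module>
  <ModulePrefs title="Clock" author="Jane Doe" author_email="jane@example.com"/>
  <Content type="html"/>
</Module>
""")

prefs = doc.xpath('//ModulePrefs')[0]
prefs.get('title')                # 'Clock'
prefs.get('author_email')         # 'jane@example.com'
prefs.get('description') is None  # True, so the `or` fallback keeps the '' default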
Example #3
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join("%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
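The hl.simple.pre and hl.simple.post parameters wrap each highlighted term in {{{ and }}}, which a caller can later swap for real markup. A hypothetical post-processing step:

import re

snippet = 'Call me {{{Ishmael}}}. Some years ago...'
re.sub(r'\{\{\{(.*?)\}\}\}', r'<b>\1</b>', snippet)
# -> 'Call me <b>Ishmael</b>. Some years ago...'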
Example #4
def do_search(param, sort, page=1, rows=100):
    (reply, solr_select, q_list) = run_solr_query(param, rows, page, sort)
    is_bad = False
    if reply.startswith('<html'):
        is_bad = True
    if not is_bad:
        try:
            root = XML(reply)
        except XMLSyntaxError:
            is_bad = True
    if is_bad:
        m = re_pre.search(reply)
        return web.storage(
            facet_counts = None,
            docs = [],
            is_advanced = bool(param.get('q')),
            num_found = None,
            solr_select = solr_select,
            q_list = q_list,
            error = (web.htmlunquote(m.group(1)) if m else reply),
        )

    docs = root.find('result')
    return web.storage(
        facet_counts = read_facets(root),
        docs = docs,
        is_advanced = bool(param.get('q')),
        num_found = (int(docs.attrib['numFound']) if docs is not None else None),
        solr_select = solr_select,
        q_list = q_list,
        error = None,
    )
Example #5
def parse_module(module_url):
    module = web.storage(
        url=module_url, cached_xml='', screenshot='', title='', title_url='',
        directory_title='', description='', author='', author_email='',
        author_affiliation='', author_location='', render_inline='')

    if not module_url.startswith('http://'):
        raise ValueError('Ooops! Submission has failed – the URL seems to be invalid.')

    try:
        html = utils.dnl(module_url)
        html = web.htmlunquote(html)  # this may confuse the parser
        xml = utils.parse_xml(html)
    except Exception:
        raise ValueError('Ooops! Submission has failed – the XML or HTML page could not be loaded successfully.')

    xnodes = xml.xpath('//ModulePrefs')
    if not xnodes:
        raise ValueError('Ooops! The XML is valid, but we cannot find the module.')
    xnodes = xnodes[0]
    
    for attr in module:
        module[attr] = xnodes.get(attr) or module[attr]
    
    return module
Example #6
def parse_error(path):
    html = open(path).read(10000)
    soup = BeautifulSoup(html)

    h1 = web.htmlunquote(soup.body.h1.string or "")
    h2 = web.htmlunquote(soup.body.h2.string or "")
    message = h1.split('at')[0].strip() + ': ' + (h2 and h2.splitlines()[0])

    code, url = [web.htmlunquote(td.string) for td in soup.body.table.findAll('td')]

    # strip common prefixes
    code = web.re_compile(".*/(?:staging|production)/(openlibrary|infogami|web)").sub(r'\1', code)

    m = web.re_compile(r'(\d\d)(\d\d)(\d\d)(\d{6})').match(web.numify(os.path.basename(path)))
    hh, mm, ss, microsec = m.groups()

    return web.storage(url=url, message=message, code=code, time="%s:%s:%s" % (hh, mm, ss))
Example #7
def find_matches(ia, q):
    host, ia_path = ia_lookup('/download/' + ia)
    url = 'http://' + host + '/fulltext/inside.php?item_id=' + ia + '&doc=' + ia + '&path=' + ia_path + '&q=' + web.urlquote(q)
    ret = urllib.urlopen(url).read()  # read once so the body can be re-searched on error
    try:
        return simplejson.loads(ret)
    except ValueError:
        m = re_h1_error.search(ret)
        return {'error': web.htmlunquote(m.group(1))}
Example #8
def get_results(q,
                offset=0,
                limit=100,
                snippets=3,
                fragsize=200,
                hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join(
        "%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()

    try:
        results = simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}

    ekey_doc = {}
    for doc in results['response']['docs']:
        ia = doc['ia']
        q = {'type': '/type/edition', 'ocaid': ia}
        ekeys = web.ctx.site.things(q)
        if not ekeys:
            del q['ocaid']
            q['source_records'] = 'ia:' + ia
            ekeys = web.ctx.site.things(q)
        if ekeys:
            ekey_doc[ekeys[0]] = doc

    editions = web.ctx.site.get_many(ekey_doc.keys())
    for e in editions:
        ekey_doc[e['key']]['edition'] = e
    return results
Example #9
def parse_search_response(json_data):
    try:
        return json.loads(json_data)
    except json.JSONDecodeError:
        m = re_pre.search(json_data)
        error = web.htmlunquote(m.group(1))
        solr_error = 'org.apache.lucene.queryParser.ParseException: '
        if error.startswith(solr_error):
            error = error[len(solr_error):]
        return {'error': error}
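All of these snippets lean on web.htmlunquote to turn HTML-escaped error text back into readable plain text. A minimal sketch of its behavior, assuming web.py's implementation, where htmlunquote reverses htmlquote:

import web  # web.py

web.htmlquote("1 < 2 & 3 > 0")
# -> '1 &lt; 2 &amp; 3 &gt; 0'

web.htmlunquote("Cannot parse &#39;body:&#39; near &lt;EOF&gt;")
# -> "Cannot parse 'body:' near <EOF>"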
Example #10
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200):
    q = escape_bracket(q)
    solr_select = solr_select_url + "?fl=ia,body_length,page_count&hl=true&hl.fl=body&hl.snippets=%d&hl.mergeContiguous=true&hl.usePhraseHighlighter=false&hl.simple.pre={{{&hl.simple.post=}}}&hl.fragsize=%d&q.op=AND&q=%s&start=%d&rows=%d&qf=body&qt=standard&hl.maxAnalyzedChars=1000000&wt=json" % (snippets, fragsize, web.urlquote(q), offset, limit)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
Example #11
def find_matches(ia, q):
    q = escape_q(q)
    host, ia_path = ia_lookup('/download/' + ia)
    url = 'http://' + host + '/fulltext/inside.php?item_id=' + ia + '&doc=' + ia + '&path=' + ia_path + '&q=' + web.urlquote(q)
    ret = urllib.urlopen(url).read()  # read once so the body can be re-searched on error
    try:
        return simplejson.loads(ret)
    except ValueError:
        m = re_h1_error.search(ret)
        return {'error': web.htmlunquote(m.group(1))}
Example #12
def find_matches(ia, q):
    q = escape_q(q)
    host, ia_path = ia_lookup('/download/' + ia)
    doc = find_doc(ia, host, ia_path) or ia

    url = 'http://' + host + '/fulltext/inside.php?item_id=' + ia + '&doc=' + doc + '&path=' + ia_path + '&q=' + web.urlquote(q)
    ret = urllib.urlopen(url).read().replace('"matches": [],\n}', '"matches": []\n}')
    try:
        return simplejson.loads(ret)
    except ValueError:
        m = re_h1_error.search(ret)
        return {'error': web.htmlunquote(m.group(1))}
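The .replace call patches the response in flight: inside.php can emit a trailing comma after an empty matches array, which is not valid JSON. A quick illustration:

import simplejson

simplejson.loads('{"matches": []\n}')   # -> {'matches': []}
simplejson.loads('{"matches": [],\n}')  # raises JSONDecodeError: trailing comma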
Example #13
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join("%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()

    try:
        results = simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}

    ekey_doc = {}
    for doc in results['response']['docs']:
        ia = doc['ia']
        q = {'type': '/type/edition', 'ocaid': ia}
        ekeys = web.ctx.site.things(q)
        if not ekeys:
            del q['ocaid']
            q['source_records'] = 'ia:' + ia
            ekeys = web.ctx.site.things(q)
        if ekeys:
            ekey_doc[ekeys[0]] = doc

    editions = web.ctx.site.get_many(ekey_doc.keys())
    for e in editions:
        ekey_doc[e['key']]['edition'] = e
    return results
Example #14
def parse_search_response(json_data):
    """Construct response for any input"""
    if json_data is None:
        return {'error': 'Error parsing empty search engine response'}
    try:
        return json.loads(json_data)
    except json.JSONDecodeError:
        logger.exception("Error parsing search engine response")
        m = re_pre.search(json_data)
        if m is None:
            return {'error': 'Error parsing search engine response'}
        error = web.htmlunquote(m.group(1))
        solr_error = 'org.apache.lucene.queryParser.ParseException: '
        if error.startswith(solr_error):
            error = error[len(solr_error):]
        return {'error': error}
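A quick sketch of the three paths the guarded version handles; the inputs are invented, and it assumes re_pre captures the body of the pre element in Solr's HTML error page:

parse_search_response(None)
# -> {'error': 'Error parsing empty search engine response'}

parse_search_response('{"response": {"numFound": 0, "docs": []}}')
# -> {'response': {'numFound': 0, 'docs': []}}

parse_search_response('<html><body><pre>org.apache.lucene.queryParser.'
                      'ParseException: Cannot parse query</pre></body></html>')
# -> {'error': 'Cannot parse query'}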
Example #15
def do_search(param, sort, page=1, rows=100, spellcheck_count=None):
    if sort:
        sort = process_sort(sort)
    (solr_result, solr_select, q_list) = run_solr_query(
        param, rows, page, sort, spellcheck_count
    )
    is_bad = False
    if not solr_result or solr_result.startswith(b'<html'):
        is_bad = True
    if not is_bad:
        try:
            root = XML(solr_result)
        except XMLSyntaxError:
            is_bad = True
    if is_bad:
        m = re_pre.search(solr_result)
        return web.storage(
            facet_counts=None,
            docs=[],
            is_advanced=bool(param.get('q')),
            num_found=None,
            solr_select=solr_select,
            q_list=q_list,
            error=(web.htmlunquote(m.group(1)) if m else solr_result),
        )

    spellcheck = root.find("lst[@name='spellcheck']")
    spell_map = {}
    if spellcheck is not None and len(spellcheck):
        for e in spellcheck.find("lst[@name='suggestions']"):
            assert e.tag == 'lst'
            a = e.attrib['name']
            if a in spell_map or a in ('sqrt', 'edition_count'):
                continue
            spell_map[a] = [i.text for i in e.find("arr[@name='suggestion']")]

    docs = root.find('result')
    return web.storage(
        facet_counts=read_facets(root),
        docs=docs,
        is_advanced=bool(param.get('q')),
        num_found=(int(docs.attrib['numFound']) if docs is not None else None),
        solr_select=solr_select,
        q_list=q_list,
        error=None,
        spellcheck=spell_map,
    )
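The spellcheck loop walks Solr's legacy XML spellcheck component. A minimal sketch of the response shape it expects, with invented contents:

from lxml.etree import XML

root = XML("""
<response>
  <lst name="spellcheck">
    <lst name="suggestions">
      <lst name="whale">
        <arr name="suggestion"><str>whales</str></arr>
      </lst>
    </lst>
  </lst>
</response>
""")

spellcheck = root.find("lst[@name='spellcheck']")
spell_map = {
    e.attrib['name']: [i.text for i in e.find("arr[@name='suggestion']")]
    for e in spellcheck.find("lst[@name='suggestions']")
}
# -> {'whale': ['whales']}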
Example #16
def do_search(param, sort, page=1, rows=100, spellcheck_count=None):
    (reply, solr_select, q_list) = run_solr_query(
        param, rows, page, sort, spellcheck_count)
    is_bad = False
    if not reply or reply.startswith('<html'):
        is_bad = True
    if not is_bad:
        try:
            root = XML(reply)
        except XMLSyntaxError:
            is_bad = True
    if is_bad:
        m = re_pre.search(reply)
        return web.storage(
            facet_counts = None,
            docs = [],
            is_advanced = bool(param.get('q')),
            num_found = None,
            solr_select = solr_select,
            q_list = q_list,
            error = (web.htmlunquote(m.group(1)) if m else reply),
        )

    spellcheck = root.find("lst[@name='spellcheck']")
    spell_map = {}
    if spellcheck is not None and len(spellcheck):
        for e in spellcheck.find("lst[@name='suggestions']"):
            assert e.tag == 'lst'
            a = e.attrib['name']
            if a in spell_map or a in ('sqrt', 'edition_count'):
                continue
            spell_map[a] = [i.text for i in e.find("arr[@name='suggestion']")]

    docs = root.find('result')
    return web.storage(
        facet_counts = read_facets(root),
        docs = docs,
        is_advanced = bool(param.get('q')),
        num_found = (int(docs.attrib['numFound']) if docs is not None else None),
        solr_select = solr_select,
        q_list = q_list,
        error = None,
        spellcheck = spell_map,
    )
Example #17
def get_results(q,
                offset=0,
                limit=100,
                snippets=3,
                fragsize=200,
                hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join(
        "%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
Example #18
    solr_select = solr_select_url + "?" + urllib.urlencode(params)
    stats.begin("solr", url=solr_select)

    try:
        json_data = urlopen(solr_select).read()
    except IOError, e:
        logger.error("Unable to query search inside solr", exc_info=True)
        return {"error": web.htmlquote(str(e))}
    finally:
        stats.end()

    try:
        return simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}


def editions_from_ia(ia):
    q = {
        'type': '/type/edition',
        'ocaid': ia,
        'title': None,
        'covers': None,
        'works': None,
        'authors': None
    }
    editions = web.ctx.site.things(q)
    if not editions:
        del q['ocaid']
        q['source_records'] = 'ia:' + ia
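Unlike the earlier examples that joined k=v pairs by hand, this version builds the query string with urllib.urlencode, which percent-encodes every value itself, so the caller no longer needs web.urlquote on q. A small illustration:

import urllib

urllib.urlencode([('q', 'moby dick'), ('rows', 100)])
# -> 'q=moby+dick&rows=100'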
Example #19
    solr_select = solr_select_url + "?" + urllib.urlencode(params)
    stats.begin("solr", url=solr_select)

    try:
        json_data = urlopen(solr_select).read()
    except IOError, e:
        logger.error("Unable to query search inside solr", exc_info=True)
        return {"error": web.htmlquote(str(e))}
    finally:
        stats.end()
   
    try:
        return simplejson.loads(json_data)
    except ValueError:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}

def editions_from_ia(ia):
    q = {'type': '/type/edition', 'ocaid': ia, 'title': None, 'covers': None, 'works': None, 'authors': None}
    editions = web.ctx.site.things(q)
    if not editions:
        del q['ocaid']
        q['source_records'] = 'ia:' + ia
        editions = web.ctx.site.things(q)
    return editions

def read_from_archive(ia):
    meta_xml = 'http://archive.org/download/' + ia + '/' + ia + '_meta.xml'
    stats.begin("archive.org", url=meta_xml)
    xml_data = urllib.urlopen(meta_xml)
    item = {}