def parse_error(path):
    html = open(path).read(10000)
    soup = BeautifulSoup(html)
    h1 = web.htmlunquote(soup.body.h1.string or "")
    h2 = web.htmlunquote(soup.body.h2.string or "")
    message = h1.split("at")[0].strip() + ": " + (h2 and h2.splitlines()[0])
    code, url = [
        web.htmlunquote(td.string) for td in soup.body.table.findAll("td")
    ]
    # strip common prefixes
    code = web.re_compile(
        ".*/(?:staging|production)/(openlibrary|infogami|web)").sub(
        r"\1", code)
    m = web.re_compile(r"(\d\d)(\d\d)(\d\d)(\d{6})").match(
        web.numify(os.path.basename(path)))
    hh, mm, ss, microsec = m.groups()
    return web.storage(url=url, message=message, code=code,
                       time="%s:%s:%s" % (hh, mm, ss))
def parse_module(module_url):
    module = web.storage(
        url=module_url,
        cached_xml="",
        screenshot="",
        title="",
        title_url="",
        directory_title="",
        description="",
        author="",
        author_email="",
        author_affiliation="",
        author_location="",
        render_inline="",
    )
    if not module_url.startswith("http://"):
        # raising a bare string is not valid; wrap the message in an exception
        raise Exception("Ooops! Submission has failed – the URL seems to be invalid.")
    try:
        html = utils.dnl(module_url)
        html = web.htmlunquote(html)  # this may confuse the parser
        xml = utils.parse_xml(html)
    except:
        raise Exception("Ooops! Submission has failed – the XML or HTML page could not be loaded successfully.")
    xnodes = xml.xpath("//ModulePrefs")
    if not xnodes:
        raise Exception("Ooops! The XML is valid, but we cannot find the module.")
    xnodes = xnodes[0]
    for attr in module:
        module[attr] = xnodes.get(attr) or module[attr]
    return module
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return { 'error': m.group(1) + ' search not supported' }
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join(
        "%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return { 'error': web.htmlunquote(m.group(1)) }
def do_search(param, sort, page=1, rows=100):
    (reply, solr_select, q_list) = run_solr_query(param, rows, page, sort)
    is_bad = False
    if reply.startswith('<html'):
        is_bad = True
    if not is_bad:
        try:
            root = XML(reply)
        except XMLSyntaxError:
            is_bad = True
    if is_bad:
        m = re_pre.search(reply)
        return web.storage(
            facet_counts = None,
            docs = [],
            is_advanced = bool(param.get('q', 'None')),
            num_found = None,
            solr_select = solr_select,
            q_list = q_list,
            error = (web.htmlunquote(m.group(1)) if m else reply),
        )
    docs = root.find('result')
    return web.storage(
        facet_counts = read_facets(root),
        docs = docs,
        is_advanced = bool(param.get('q', 'None')),
        num_found = (int(docs.attrib['numFound']) if docs is not None else None),
        solr_select = solr_select,
        q_list = q_list,
        error = None,
    )
def parse_module(module_url):
    module = web.storage(
        url=module_url,
        cached_xml='',
        screenshot='',
        title='',
        title_url='',
        directory_title='',
        description='',
        author='',
        author_email='',
        author_affiliation='',
        author_location='',
        render_inline='')
    if not module_url.startswith('http://'):
        # raising a bare string is not valid; wrap the message in an exception
        raise Exception('Ooops! Submission has failed – the URL seems to be invalid.')
    try:
        html = utils.dnl(module_url)
        html = web.htmlunquote(html)  # this may confuse the parser
        xml = utils.parse_xml(html)
    except:
        raise Exception('Ooops! Submission has failed – the XML or HTML page could not be loaded successfully.')
    xnodes = xml.xpath('//ModulePrefs')
    if not xnodes:
        raise Exception('Ooops! The XML is valid, but we cannot find the module.')
    xnodes = xnodes[0]
    for attr in module:
        module[attr] = xnodes.get(attr) or module[attr]
    return module
def parse_error(path):
    html = open(path).read(10000)
    soup = BeautifulSoup(html)
    h1 = web.htmlunquote(soup.body.h1.string or "")
    h2 = web.htmlunquote(soup.body.h2.string or "")
    message = h1.split('at')[0].strip() + ': ' + (h2 and h2.splitlines()[0])
    code, url = [web.htmlunquote(td.string) for td in soup.body.table.findAll('td')]
    # strip common prefixes
    code = web.re_compile(".*/(?:staging|production)/(openlibrary|infogami|web)").sub(r'\1', code)
    m = web.re_compile(r'(\d\d)(\d\d)(\d\d)(\d{6})').match(web.numify(os.path.basename(path)))
    hh, mm, ss, microsec = m.groups()
    return web.storage(url=url, message=message, code=code, time="%s:%s:%s" % (hh, mm, ss))
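# Illustrative use of parse_error() above, assuming nothing beyond what the parser
# itself reads: an <h1> "Exception at /path" header, an <h2> carrying the first line
# of the message, a two-cell table with the code location and request URL, and a
# basename encoding HHMMSS plus microseconds. The fixture below is invented for
# demonstration only; it is not real log data.
import os
import tempfile

tmpdir = tempfile.mkdtemp()
path = os.path.join(tmpdir, "104530123456.html")
with open(path, "w") as f:
    f.write(
        "<html><body>"
        "<h1>ValueError at /books/OL1M</h1>"
        "<h2>bad identifier</h2>"
        "<table><tr>"
        "<td>/production/openlibrary/plugins/books/code.py in read</td>"
        "<td>http://openlibrary.org/books/OL1M</td>"
        "</tr></table>"
        "</body></html>")

print(parse_error(path))
# -> storage with message='ValueError: bad identifier',
#    code='openlibrary/plugins/books/code.py in read',
#    url='http://openlibrary.org/books/OL1M', time='10:45:30'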
def find_matches(ia, q):
    host, ia_path = ia_lookup('/download/' + ia)
    url = 'http://' + host + '/fulltext/inside.php?item_id=' + ia + '&doc=' + ia \
        + '&path=' + ia_path + '&q=' + web.urlquote(q)
    # read the body once so the error-page fallback below can search the text
    ret = urllib.urlopen(url).read()
    try:
        return simplejson.loads(ret)
    except:
        m = re_h1_error.search(ret)
        return { 'error': web.htmlunquote(m.group(1)) }
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join(
        "%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        results = simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
    ekey_doc = {}
    for doc in results['response']['docs']:
        ia = doc['ia']
        q = {'type': '/type/edition', 'ocaid': ia}
        ekeys = web.ctx.site.things(q)
        if not ekeys:
            del q['ocaid']
            q['source_records'] = 'ia:' + ia
            ekeys = web.ctx.site.things(q)
        if ekeys:
            ekey_doc[ekeys[0]] = doc
    editions = web.ctx.site.get_many(ekey_doc.keys())
    for e in editions:
        ekey_doc[e['key']]['edition'] = e
    return results
def parse_search_response(json_data):
    try:
        return json.loads(json_data)
    except json.JSONDecodeError:
        m = re_pre.search(json_data)
        error = web.htmlunquote(m.group(1))
        solr_error = 'org.apache.lucene.queryParser.ParseException: '
        if error.startswith(solr_error):
            error = error[len(solr_error):]
        return {'error': error}
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200):
    q = escape_bracket(q)
    solr_select = solr_select_url + (
        "?fl=ia,body_length,page_count&hl=true&hl.fl=body&hl.snippets=%d"
        "&hl.mergeContiguous=true&hl.usePhraseHighlighter=false"
        "&hl.simple.pre={{{&hl.simple.post=}}}&hl.fragsize=%d"
        "&q.op=AND&q=%s&start=%d&rows=%d&qf=body&qt=standard"
        "&hl.maxAnalyzedChars=1000000&wt=json"
    ) % (snippets, fragsize, web.urlquote(q), offset, limit)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return { 'error': web.htmlunquote(m.group(1)) }
def find_matches(ia, q):
    q = escape_q(q)
    host, ia_path = ia_lookup('/download/' + ia)
    url = 'http://' + host + '/fulltext/inside.php?item_id=' + ia + '&doc=' + ia \
        + '&path=' + ia_path + '&q=' + web.urlquote(q)
    # read the body once so the error-page fallback below can search the text
    ret = urllib.urlopen(url).read()
    try:
        return simplejson.loads(ret)
    except:
        m = re_h1_error.search(ret)
        return {'error': web.htmlunquote(m.group(1))}
def find_matches(ia, q):
    q = escape_q(q)
    host, ia_path = ia_lookup('/download/' + ia)
    doc = find_doc(ia, host, ia_path) or ia
    url = 'http://' + host + '/fulltext/inside.php?item_id=' + ia + '&doc=' + doc \
        + '&path=' + ia_path + '&q=' + web.urlquote(q)
    # work around a trailing comma that would make the response invalid JSON
    ret = urllib.urlopen(url).read().replace('"matches": [],\n}', '"matches": []\n}')
    try:
        return simplejson.loads(ret)
    except:
        m = re_h1_error.search(ret)
        return { 'error': web.htmlunquote(m.group(1)) }
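# The find_matches() variants above lean on module-level helpers that these excerpts
# do not define (ia_lookup, find_doc, escape_q, re_h1_error). Two of them are
# sketched below purely as assumptions about their shape, not the real definitions:
# re_h1_error pulls the headline out of an HTML error page, and escape_q
# backslash-escapes characters that are special to the Solr/Lucene query parser.
import re

re_h1_error = re.compile(r'<h1>(.*?)</h1>', re.DOTALL)

def escape_q(q):
    # Hypothetical implementation: escape Lucene query-syntax metacharacters.
    return re.sub(r'([+\-!(){}\[\]^"~*?:\\])', r'\\\1', q)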
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return { 'error': m.group(1) + ' search not supported' }
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join(
        "%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        results = simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return { 'error': web.htmlunquote(m.group(1)) }
    ekey_doc = {}
    for doc in results['response']['docs']:
        ia = doc['ia']
        q = {'type': '/type/edition', 'ocaid': ia}
        ekeys = web.ctx.site.things(q)
        if not ekeys:
            del q['ocaid']
            q['source_records'] = 'ia:' + ia
            ekeys = web.ctx.site.things(q)
        if ekeys:
            ekey_doc[ekeys[0]] = doc
    editions = web.ctx.site.get_many(ekey_doc.keys())
    for e in editions:
        ekey_doc[e['key']]['edition'] = e
    return results
def parse_search_response(json_data):
    """Construct response for any input"""
    if json_data is None:
        return {'error': 'Error parsing empty search engine response'}
    try:
        return json.loads(json_data)
    except json.JSONDecodeError:
        logger.exception("Error parsing search engine response")
        m = re_pre.search(json_data)
        if m is None:
            return {'error': 'Error parsing search engine response'}
        error = web.htmlunquote(m.group(1))
        solr_error = 'org.apache.lucene.queryParser.ParseException: '
        if error.startswith(solr_error):
            error = error[len(solr_error):]
        return {'error': error}
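# A small demonstration of the fallback path in parse_search_response() above.
# re_pre and logger are defined elsewhere in the real module; the definitions and
# the error body used here are assumptions made only so the sketch is self-contained.
import json
import logging
import re
import web

logger = logging.getLogger(__name__)
re_pre = re.compile(r'<pre>(.*?)</pre>', re.DOTALL)

body = ('<html><body><pre>org.apache.lucene.queryParser.ParseException: '
        'Cannot parse query</pre></body></html>')
print(parse_search_response(body))
# -> {'error': 'Cannot parse query'}  (the ParseException prefix is stripped)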
def do_search(param, sort, page=1, rows=100, spellcheck_count=None):
    if sort:
        sort = process_sort(sort)
    (solr_result, solr_select, q_list) = run_solr_query(
        param, rows, page, sort, spellcheck_count
    )
    is_bad = False
    if not solr_result or solr_result.startswith(b'<html'):
        is_bad = True
    if not is_bad:
        try:
            root = XML(solr_result)
        except XMLSyntaxError:
            is_bad = True
    if is_bad:
        m = re_pre.search(solr_result)
        return web.storage(
            facet_counts=None,
            docs=[],
            is_advanced=bool(param.get('q')),
            num_found=None,
            solr_select=solr_select,
            q_list=q_list,
            error=(web.htmlunquote(m.group(1)) if m else solr_result),
        )
    spellcheck = root.find("lst[@name='spellcheck']")
    spell_map = {}
    if spellcheck is not None and len(spellcheck):
        for e in spellcheck.find("lst[@name='suggestions']"):
            assert e.tag == 'lst'
            a = e.attrib['name']
            if a in spell_map or a in ('sqrt', 'edition_count'):
                continue
            spell_map[a] = [i.text for i in e.find("arr[@name='suggestion']")]
    docs = root.find('result')
    return web.storage(
        facet_counts=read_facets(root),
        docs=docs,
        is_advanced=bool(param.get('q')),
        num_found=(int(docs.attrib['numFound']) if docs is not None else None),
        solr_select=solr_select,
        q_list=q_list,
        error=None,
        spellcheck=spell_map,
    )
def do_search(param, sort, page=1, rows=100, spellcheck_count=None):
    (reply, solr_select, q_list) = run_solr_query(
        param, rows, page, sort, spellcheck_count)
    is_bad = False
    if not reply or reply.startswith('<html'):
        is_bad = True
    if not is_bad:
        try:
            root = XML(reply)
        except XMLSyntaxError:
            is_bad = True
    if is_bad:
        m = re_pre.search(reply)
        return web.storage(
            facet_counts = None,
            docs = [],
            is_advanced = bool(param.get('q')),
            num_found = None,
            solr_select = solr_select,
            q_list = q_list,
            error = (web.htmlunquote(m.group(1)) if m else reply),
        )
    spellcheck = root.find("lst[@name='spellcheck']")
    spell_map = {}
    if spellcheck is not None and len(spellcheck):
        for e in spellcheck.find("lst[@name='suggestions']"):
            assert e.tag == 'lst'
            a = e.attrib['name']
            if a in spell_map or a in ('sqrt', 'edition_count'):
                continue
            spell_map[a] = [i.text for i in e.find("arr[@name='suggestion']")]
    docs = root.find('result')
    return web.storage(
        facet_counts = read_facets(root),
        docs = docs,
        is_advanced = bool(param.get('q')),
        num_found = (int(docs.attrib['numFound']) if docs is not None else None),
        solr_select = solr_select,
        q_list = q_list,
        error = None,
        spellcheck = spell_map,
    )
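# Synthetic Solr spellcheck fragment, only to illustrate how the spell_map loop in
# the do_search() variants above walks the XML. The element layout is a trimmed-down
# version of Solr's XML response writer output; the misspelled term and the
# suggestions are invented for the example.
from lxml.etree import XML

root = XML(
    b"<response>"
    b"<lst name='spellcheck'><lst name='suggestions'>"
    b"<lst name='wrod'><arr name='suggestion'><str>word</str><str>wood</str></arr></lst>"
    b"</lst></lst>"
    b"</response>")

spellcheck = root.find("lst[@name='spellcheck']")
spell_map = {}
for e in spellcheck.find("lst[@name='suggestions']"):
    spell_map[e.attrib['name']] = [i.text for i in e.find("arr[@name='suggestion']")]
print(spell_map)
# -> {'wrod': ['word', 'wood']}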
def get_results(q, offset=0, limit=100, snippets=3, fragsize=200, hl_phrase=False):
    m = re_bad_fields.match(q)
    if m:
        return {'error': m.group(1) + ' search not supported'}
    q = escape_q(q)
    solr_params = [
        ('fl', 'ia,body_length,page_count'),
        ('hl', 'true'),
        ('hl.fl', 'body'),
        ('hl.snippets', snippets),
        ('hl.mergeContiguous', 'true'),
        ('hl.usePhraseHighlighter', 'true' if hl_phrase else 'false'),
        ('hl.simple.pre', '{{{'),
        ('hl.simple.post', '}}}'),
        ('hl.fragsize', fragsize),
        ('q.op', 'AND'),
        ('q', web.urlquote(q)),
        ('start', offset),
        ('rows', limit),
        ('qf', 'body'),
        ('qt', 'standard'),
        ('hl.maxAnalyzedChars', '-1'),
        ('wt', 'json'),
    ]
    solr_select = solr_select_url + '?' + '&'.join(
        "%s=%s" % (k, unicode(v)) for k, v in solr_params)
    stats.begin("solr", url=solr_select)
    json_data = urllib.urlopen(solr_select).read()
    stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}
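# The get_results() variants above also depend on module-level regexes that the
# excerpts do not show. The definitions below are guesses at their intent, based on
# how the match objects are used, and should not be read as the real module code:
# re_bad_fields rejects fielded queries the inside-search index cannot serve, and
# re_query_parser_error digs the ParseException text out of Solr's HTML error page.
import re

re_bad_fields = re.compile(r'^\s*(authors?|subjects?|title)\s*:', re.I)
re_query_parser_error = re.compile(
    r'<pre>(org\.apache\.lucene\.queryParser\.ParseException: [^<]*)', re.DOTALL)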
    solr_select = solr_select_url + "?" + urllib.urlencode(params)
    stats.begin("solr", url=solr_select)
    try:
        json_data = urlopen(solr_select).read()
    except IOError, e:
        logger.error("Unable to query search inside solr", exc_info=True)
        return {"error": web.htmlquote(str(e))}
    finally:
        stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return {'error': web.htmlunquote(m.group(1))}


def editions_from_ia(ia):
    q = {
        'type': '/type/edition',
        'ocaid': ia,
        'title': None,
        'covers': None,
        'works': None,
        'authors': None
    }
    editions = web.ctx.site.things(q)
    if not editions:
        del q['ocaid']
        q['source_records'] = 'ia:' + ia
    solr_select = solr_select_url + "?" + urllib.urlencode(params)
    stats.begin("solr", url=solr_select)
    try:
        json_data = urlopen(solr_select).read()
    except IOError, e:
        logger.error("Unable to query search inside solr", exc_info=True)
        return {"error": web.htmlquote(str(e))}
    finally:
        stats.end()
    try:
        return simplejson.loads(json_data)
    except:
        m = re_query_parser_error.search(json_data)
        return { 'error': web.htmlunquote(m.group(1)) }


def editions_from_ia(ia):
    q = {'type': '/type/edition', 'ocaid': ia, 'title': None,
         'covers': None, 'works': None, 'authors': None}
    editions = web.ctx.site.things(q)
    if not editions:
        del q['ocaid']
        q['source_records'] = 'ia:' + ia
        editions = web.ctx.site.things(q)
    return editions


def read_from_archive(ia):
    meta_xml = 'http://archive.org/download/' + ia + '/' + ia + '_meta.xml'
    stats.begin("archive.org", url=meta_xml)
    xml_data = urllib.urlopen(meta_xml)
    item = {}