def response(resp):
    """Parse a Sogou video search response into searx video results.

    :param resp: HTTP response whose body is the Sogou video result page.
    :return: list of result dicts using the ``videos.html`` template.
    """
    from searx.webapp import sentry
    results = []
    dom = fromstring(resp.text)
    for result in dom.xpath(
            '//div[@class="srch-all-result"]//li[@class="sort_lst_li"]'):
        try:
            # The first <a> carries both the relative link and the title.
            link = result.xpath('./a')[0]
            url = 'https://v.sogou.com' + link.attrib.get('href')
            title = link.attrib.get('title')
            thumbnail = result.xpath('./a/img')[0].attrib.get('src')
            results.append({
                'url': url,
                'title': title,
                # The page exposes no snippet; reuse the title as content.
                'content': title,
                'thumbnail': thumbnail,
                'template': 'videos.html'
            })
        except Exception:
            # Narrowed from a bare except so KeyboardInterrupt/SystemExit
            # are not swallowed; malformed entries are reported and skipped.
            sentry.captureException()
    return results
def response(resp):
    """Parse a Bing video search response into searx video results.

    :param resp: HTTP response whose body is the Bing video result page.
    :return: list of result dicts using the ``videos.html`` template,
             truncated at ``number_of_results`` entries.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    for result in dom.xpath('//div[@class="dg_u"]'):
        try:
            # The anchor lives under one of two container classes.
            url = (result.xpath('./div[@class="mc_vtvc"]/a/@href') or
                   result.xpath('./div[@class="mc_vtvc mc_vtvc_fh"]/a/@href'))[0]
            title = extract_text(
                result.xpath('./div/a/div/div[@class="mc_vtvc_title"]/@title'))
            content = extract_text(
                result.xpath('./div/a/div/div/div/div/text()'))
            thumbnail = result.xpath('./div/a/div/div/img/@src')[0]
            results.append({'url': url,
                            'title': title,
                            'content': content,
                            'thumbnail': thumbnail,
                            'template': 'videos.html'})
            # Stop once the configured maximum is collected.
            if len(results) >= number_of_results:
                break
        except Exception:
            # Narrowed from a bare except; malformed entries are reported
            # and skipped instead of aborting the parse.
            sentry.captureException()
    return results
def response(resp):
    """Parse a video search response page into searx video results.

    :param resp: HTTP response whose body is the HTML result page.
    :return: list of result dicts using the ``videos.html`` template,
             capped at ``imageLength`` entries.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    for item in dom.xpath('//li[@class="result"]'):
        try:
            anchor = item.xpath('./a')[0]
            link = anchor.attrib.get('href')
            caption = anchor.attrib.get('title')
            preview = item.xpath(
                './/div[@class="view"]/img[@class="img-blur-layer"]'
            )[0].attrib.get('src')
            results.append({
                'url': link,
                'title': caption,
                # No snippet on the page, so the title doubles as content.
                'content': caption,
                'thumbnail': preview,
                'template': 'videos.html'
            })
        except Exception:
            sentry.captureException()
        # Stop once enough entries are collected.
        if len(results) >= imageLength:
            break
    return results
def response(resp):
    """Parse a JSON image search response into searx image results.

    :param resp: HTTP response whose body is a JSON document with a
                 ``list`` of image records.
    :return: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    results = []
    for entry in loads(resp.text)["list"]:
        try:
            # Strip the highlight markup the backend embeds in titles.
            clean_title = entry["title"].replace("<em>", "").replace("</em>", "")
            results.append({
                'template': 'images.html',
                'url': entry["link"],
                'title': clean_title,
                'content': '',
                'thumbnail_src': entry["thumb"],
                'img_src': entry["img"],
                'width': entry["width"],
                'height': entry["height"]
            })
        except Exception:
            # Records missing expected keys are reported and skipped.
            sentry.captureException()
    return results
def response(resp):
    """Parse a JSON image search response into searx image results.

    :param resp: HTTP response whose body is a JSON document with an
                 ``items`` list of image records.
    :return: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    # json.loads accepts the decoded text directly; the previous
    # str -> utf-8 bytes round-trip was redundant.
    resultdic = loads(resp.text)
    results = []
    for image in resultdic["items"]:
        try:
            results.append({
                'template': 'images.html',
                'url': image["page_url"],
                'title': image["title"],
                'content': '',
                'thumbnail_src': image["thumbUrl"],
                'width': image["width"],
                'height': image["height"],
                'img_src': image["pic_url"]
            })
        except Exception:
            # Records missing expected keys are reported and skipped.
            sentry.captureException()
    return results
def response(resp):
    """Parse a Bing image search response into searx image results.

    :param resp: HTTP response whose body is the Bing images result page.
    :return: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    # parse results
    for result in dom.xpath(
            '//div[@id="mmComponent_images_1"]/ul/li/div/div[@class="imgpt"]'):
        try:
            link = result.xpath('./a')[0]
            # TODO find actual title
            title = link.xpath('.//img/@alt')[0]
            # The 'm'/'mad' attributes hold almost-JSON with unquoted keys;
            # _quote_keys_regex makes them parsable.
            json_data = loads(
                _quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('m')))
            url = json_data.get('purl')
            img_src = json_data.get('murl')
            thumb_json_data = loads(
                _quote_keys_regex.sub(r'\1"\2": \3', link.attrib.get('mad')))
            width = int(thumb_json_data.get('max'))
            height = int(thumb_json_data.get('mah'))
            thumbnail = thumb_json_data.get('turl')
            # append result
            results.append({
                'template': 'images.html',
                'url': url,
                'width': width,
                'height': height,
                'title': title,
                'content': '',
                'thumbnail_src': thumbnail,
                'img_src': img_src
            })
        except Exception:
            # Narrowed from a bare except; malformed entries are reported
            # and skipped instead of silently swallowing system exits.
            sentry.captureException()
    # return results
    return results
def response(resp):
    """Parse a so.com (360 search) response into searx web results.

    :param resp: HTTP response whose body is the so.com result page.
    :return: list of result dicts; the first entry may carry
             ``number_of_results``.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    try:
        # Banner reads "约 N 个" — extract N between those markers.
        results.append({'number_of_results':
                        int(dom.xpath('//span[@class="nums"]/text()')[0]
                            .split(u'\u7ea6')[1].split(u'\u4e2a')[0]
                            .replace(',', ''))})
    except Exception:
        sentry.captureException()
    # parse results
    for result in dom.xpath('//li[@class="res-list"]'):
        try:
            title = extract_text(result.xpath('.//h3')[0])
            url = result.xpath('.//h3/a')[0].attrib.get('href')
            # Default to an empty snippet; previously `content` could be
            # unbound (NameError) when no selector below matched, which
            # silently dropped the result.
            content = ''
            try:
                if result.xpath('.//p[@class="res-desc"]'):
                    content = extract_text(
                        result.xpath('.//p[@class="res-desc"]'))
                if result.xpath('.//div[starts-with(@class,"res-rich")]'):
                    content = extract_text(
                        result.xpath('.//div[starts-with(@class,"res-rich")]'))
                if result.xpath('.//div[@class="cont mh-pc-hover"]'):
                    content = extract_text(
                        result.xpath('.//div[@class="cont mh-pc-hover"]'))
                if result.xpath('.//div[@class="g-card g-shadow"]'):
                    content = extract_text(
                        result.xpath('.//div[@class="g-card g-shadow"]'))
                if result.xpath('.//p[@class="mh-more"]'):
                    content = extract_text(
                        result.xpath('.//p[@class="mh-more"]'))
            except Exception:
                content = ''
                sentry.captureException()
            # Redirect links are routed through the local result proxy.
            if 'www.so.com/link?' in url:
                url = settings['result_proxy'].get('server_name') \
                    + "/url_proxy?proxyurl=" + parse.quote(url) \
                    + "&token=" + new_hmac(settings['result_proxy']['key'],
                                           url.encode("utf-8"))
                try:
                    showurl = extract_text(
                        result.xpath(".//p[@class='res-linkinfo']/cite"))
                    if len(showurl) == 0:
                        showurl = url
                except Exception:
                    showurl = url
                    sentry.captureException()
            else:
                showurl = url
            # append result
            results.append({'url': url,
                            'showurl': showurl,
                            'title': title,
                            'content': content})
        except Exception:
            sentry.captureException()
    # return results
    return results
def response(resp):
    """Parse a Wikipedia API response into a searx infobox result.

    :param resp: HTTP response whose body is a MediaWiki query JSON.
    :return: list with a plain link result and an infobox result, or []
             when no valid article was returned.
    """
    from searx.webapp import sentry
    results = []
    search_result = loads(resp.text)
    # wikipedia article's unique id
    # first valid id is assumed to be the requested article
    pages = search_result['query']['pages']
    if not pages:
        # No pages at all -> nothing to show (previously `article_id`
        # would be unbound and raise a NameError).
        return []
    for article_id in pages:
        page = pages[article_id]
        if int(article_id) > 0:
            break
    if int(article_id) < 0:
        # Negative ids mark "missing page" placeholders.
        return []
    try:
        title = page.get('title')
        image = page.get('thumbnail')
        if image:
            image = image.get('source')
        extract = page.get('extract')
        summary = extract_first_paragraph(extract, title, image)
        # link to wikipedia article
        wikipedia_link = base_url.format(
            language=url_lang(resp.search_params['language'])) \
            + 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
        results.append({'url': wikipedia_link, 'title': title})
        results.append({'infobox': title,
                        'id': wikipedia_link,
                        'content': summary,
                        'img_src': image,
                        'urls': [{'title': 'Wikipedia',
                                  'url': wikipedia_link}]})
    except Exception:
        # Narrowed from a bare except; report and return what we have.
        sentry.captureException()
    return results
def response(resp):
    """Parse a Baidu image search response into searx image results.

    :param resp: HTTP response; the body is JSON, sometimes containing
                 invalid backslash escapes that must be stripped first.
    :return: list of result dicts using the ``images.html`` template.
    """
    from searx.webapp import sentry
    use_resp = resp.content
    try:
        resultdic = loads(use_resp)
    except Exception:
        # Drop backslashes that are not part of a valid JSON escape
        # sequence, then retry parsing.
        resultdic = loads(
            re.sub(r'(?<!\\)\\(?!["\\/bfnrt]|u[0-9a-fA-F]{4})', r'',
                   resp.text).encode(encoding="utf-8"))
    results = []
    for image in resultdic["data"]:
        try:
            title = image["fromPageTitle"].replace("<strong>", "").replace(
                "</strong>", "")
            # Only the thumbnail URL is available here, so it serves as
            # both preview and full image source.
            thumbnail = image["thumbURL"]
            # append result
            results.append({
                'template': 'images.html',
                'url': image["replaceUrl"][0]["FromURL"],
                'title': title,
                'content': '',
                'thumbnail_src': thumbnail,
                'width': image["width"],
                'height': image["height"],
                'img_src': thumbnail
            })
        except Exception:
            # Unused `as e` binding removed; report and skip the record.
            sentry.captureException()
    # return results
    return results
def response(resp):
    """Parse a Bing web search response into searx results.

    :param resp: HTTP response whose body is the Bing result page.
    :return: list of result dicts; the first entry may carry
             ``number_of_results``.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    try:
        results.append({
            'number_of_results':
            int(dom.xpath('//span[@class="sb_count"]/text()')[0]
                .split()[0].replace(',', ''))
        })
    except Exception:
        # Narrowed from a bare except; the count is best-effort only.
        sentry.captureException()

    def _parse(container_xpath, link_xpath):
        # One pass over a result-container layout; both Bing layouts share
        # the same inner structure apart from the heading tag.
        for result in dom.xpath(container_xpath):
            try:
                link = result.xpath(link_xpath)[0]
                # append result
                results.append({
                    'url': link.attrib.get('href'),
                    'title': extract_text(link),
                    'content': extract_text(result.xpath('.//p'))
                })
            except Exception:
                sentry.captureException()

    # parse results
    _parse('//div[@class="sa_cc"]', './/h3/a')
    # parse results again if nothing is found yet
    _parse('//li[@class="b_algo"]', './/h2/a')
    # return results
    return results
def response(resp):
    """Parse a Baidu web search response into searx results.

    Result links are rewritten through the configured result proxy.
    Two page layouts are handled: the Chinese-query layout is tried
    first, with the English-query layout as fallback.

    :param resp: HTTP response whose body is the Baidu result page.
    :return: list of result dicts; the first entry may carry
             ``number_of_results``.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    try:
        # Banner reads "约 N 个" — extract N between those markers.
        results.append({
            'number_of_results':
            int(dom.xpath('//span[@class="nums_text"]/text()')[0]
                .split(u'\u7ea6')[1].split(u'\u4e2a')[0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()
    # parse results
    for result in dom.xpath('//div[@class="result c-container "]'):
        # Previously an IndexError here propagated out of the loop and
        # aborted the whole parse; skip entries without a heading instead.
        title_nodes = result.xpath('.//h3/a')
        if not title_nodes:
            continue
        title = extract_text(title_nodes[0])
        # when search query is Chinese words
        try:
            url = result.xpath('.//div[@class="f13"]/a')[0].attrib.get('href')
            # To generate miji url with baidu url
            url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
            content = extract_text(
                (result.xpath('.//div[@class="c-abstract"]') or
                 result.xpath('.//div[@class="c-abstract c-abstract-en"]'))[0])
            showurl = extract_text(
                result.xpath('.//div[@class="f13"]/a')).replace('百度快照', '')
            if len(showurl.strip()) == 0:
                # Fall back to the first URL mentioned in the snippet.
                showurl = re.findall(WEB_URL_REGEX, content)[0]
                showurl = showurl.lstrip('.')
                if len(showurl.strip()) == 0:
                    showurl = url
            # append result
            results.append({
                'url': url,
                'showurl': showurl,
                'title': title,
                'content': content
            })
        # when search query is English words
        except Exception:
            try:
                url = result.xpath('.//h3[@class="t"]/a')[0].attrib.get('href')
                showurl = extract_text(
                    result.xpath('.//div[@class="f13"]/a')).replace(
                        '百度快照', '').replace('翻译此页', '')
                content = extract_text(
                    result.xpath('.//div[@class="c-span18 c-span-last"]')[0])
                # To generate miji url with baidu url
                url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                    url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
                if len(showurl.strip()) == 0:
                    showurl = re.findall(WEB_URL_REGEX, content)[0]
                    showurl = showurl.lstrip('.')
                    if len(showurl.strip()) == 0:
                        showurl = url
                # append result
                results.append({
                    'url': url,
                    'showurl': showurl,
                    'title': title,
                    'content': content
                })
            except Exception:
                sentry.captureException()
    # return results
    return results
def response(resp):
    """Parse a Sogou web search response into searx results.

    Two page layouts ("vrwrap" and "rb" containers) share the same
    per-entry structure, handled by the ``_parse_result`` helper.
    Sogou redirect links are rewritten through the result proxy.

    :param resp: HTTP response whose body is the Sogou result page.
    :return: list of result dicts; the first entry may carry
             ``number_of_results``.
    """
    from searx.webapp import sentry
    results = []
    dom = html.fromstring(resp.text)
    try:
        # Banner reads "约 N 条" — extract N between those markers.
        results.append({
            'number_of_results':
            int(dom.xpath('//p[@class="num-tips"]/text()')[0]
                .split(u'\u7ea6')[1].split(u'\u6761')[0].replace(',', ''))
        })
    except Exception:
        sentry.captureException()

    def _parse_result(result):
        # Parse one result container into a result dict; raises on
        # malformed entries (handled by the caller).
        href = result.xpath('.//a')[0].attrib.get('href')
        url = href if href.startswith("http") else "https://sogou.com" + href
        # parse weixin.sogou html
        if "http://weixin.sogou.com/" == url.strip():
            url = result.xpath(
                './/div[@class="str-pd-box str-pd-none"]//a'
            )[0].attrib.get('href')
            title = extract_text(result.xpath(
                './/div[@class="str-pd-box str-pd-none"]//p[@class="str_time"]/a'
            )[0])
            content = extract_text(result.xpath(
                './/div[@class="str-pd-box str-pd-none"]//p[@class="str_info"]'
            )[0])
        else:
            title = extract_text(result.xpath('.//h3/a')[0])
            content = extract_text(result.xpath('.//div')[0])
        if 'sogou.com/link?url' in url:
            # Route redirect links through the local result proxy.
            url = settings['result_proxy'].get('server_name') + "/url_proxy?proxyurl=" + \
                url + "&token=" + new_hmac(settings['result_proxy']['key'], url.encode("utf-8"))
            showurl = re.findall(
                WEB_URL_REGEX,
                extract_text(result.xpath('.//div[@class="fb"]')))[0]
            showurl = showurl.lstrip('.')
        else:
            showurl = url
        return {'url': url,
                'showurl': showurl,
                'title': title,
                'content': content}

    # The two page layouts differ only in the container class; both loops
    # always run, matching the original behavior.
    for container_xpath in ('//div[@class="vrwrap"]', '//div[@class="rb"]'):
        try:
            for result in dom.xpath(container_xpath):
                try:
                    # append result
                    results.append(_parse_result(result))
                except Exception:
                    # Unused `as e` bindings removed; report and skip.
                    sentry.captureException()
                    continue
        except Exception:
            sentry.captureException()
    # return results
    return results