def raise_for_cloudflare_captcha(resp):
    if resp.headers.get('Server', '').startswith('cloudflare'):
        if is_cloudflare_challenge(resp):
            # https://support.cloudflare.com/hc/en-us/articles/200170136-Understanding-Cloudflare-Challenge-Passage-Captcha-
            # suspend for 2 weeks
            raise SearxEngineCaptchaException(message='Cloudflare CAPTCHA',
                                              suspended_time=3600 * 24 * 15)

        if is_cloudflare_firewall(resp):
            raise SearxEngineAccessDeniedException(message='Cloudflare Firewall',
                                                   suspended_time=3600 * 24)
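The two predicates used above, is_cloudflare_challenge and is_cloudflare_firewall, are defined elsewhere and not shown here. A minimal sketch of what such checks might look like, assuming the challenge page is an HTML response with a Cloudflare challenge token and the firewall block page answers with HTTP 403 and error code 1020 (these markers are assumptions, not taken from the snippet):

def is_cloudflare_challenge(resp):
    # assumption: a JS/CAPTCHA challenge is served as an HTML page with 429/503
    # and carries a Cloudflare challenge token in the body
    return resp.status_code in (429, 503) \
        and resp.headers.get('Content-Type', '').startswith('text/html') \
        and '__cf_chl' in resp.text


def is_cloudflare_firewall(resp):
    # assumption: the firewall block page answers with 403 and error code 1020
    return resp.status_code == 403 and 'cf-error-code">1020' in resp.text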
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/showcaptcha'):
        raise SearxEngineCaptchaException()

    dom = html.fromstring(resp.text)
    results = []

    for result in dom.xpath(results_xpath):
        try:
            res = {'url': result.xpath(url_xpath)[0],
                   'title': ''.join(result.xpath(title_xpath)),
                   'content': ''.join(result.xpath(content_xpath))}
        except Exception:
            logger.exception('yandex parse crash')
            continue

        results.append(res)

    return results
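The selectors used above (results_xpath, url_xpath, title_xpath, content_xpath) are module-level constants of the engine. The values below are purely illustrative placeholders showing their expected shape, not taken from the source:

# placeholder XPath selectors, for illustration only
results_xpath = '//li[contains(@class, "serp-item")]'
url_xpath = './/h2/a/@href'
title_xpath = './/h2/a//text()'
content_xpath = './/div[contains(@class, "organic__text")]//text()'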
def response(resp):
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/nocaptcha'):
        raise SearxEngineCaptchaException()

    results = []
    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        link = result.xpath(link_xpath)[0]
        href = urljoin(url, link.attrib.get('href'))
        title = extract_text(link)
        content = extract_text(result.xpath(content_xpath))

        # append result
        results.append({'url': href,
                        'title': title,
                        'content': content})

    # return results
    return results
def response(resp):
    results = []

    # According to https://www.qwant.com/js/app.js
    if resp.status_code == 429:
        raise SearxEngineCaptchaException()

    # raise for other errors
    raise_for_httperror(resp)

    # load JSON result
    search_results = loads(resp.text)

    # check for an API error
    if search_results.get('status') != 'success':
        raise SearxEngineAPIException('API error ' + str(search_results.get('error', '')))

    # return empty array if there are no results
    if 'data' not in search_results:
        return []

    data = search_results.get('data', {})
    res = data.get('result', {})

    # parse results
    for result in res.get('items', {}):

        title = html_to_text(result['title'])
        res_url = result['url']
        content = html_to_text(result['desc'])

        if category_to_keyword.get(categories[0], '') == 'web':
            results.append({'title': title,
                            'content': content,
                            'url': res_url})

        elif category_to_keyword.get(categories[0], '') == 'images':
            thumbnail_src = result['thumbnail']
            img_src = result['media']
            results.append({'template': 'images.html',
                            'url': res_url,
                            'title': title,
                            'content': '',
                            'thumbnail_src': thumbnail_src,
                            'img_src': img_src})

        elif category_to_keyword.get(categories[0], '') == 'news':
            published_date = datetime.fromtimestamp(result['date'], None)
            media = result.get('media', [])
            if len(media) > 0:
                img_src = media[0].get('pict', {}).get('url', None)
            else:
                img_src = None
            results.append({'url': res_url,
                            'title': title,
                            'publishedDate': published_date,
                            'content': content,
                            'img_src': img_src})

    return results
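The parser above branches on category_to_keyword.get(categories[0]); a minimal sketch of the module-level mapping it presupposes (the exact entries are an assumption, shown only to make the branching readable):

# assumed shape of the engine's category configuration; values are illustrative
categories = ['general']

category_to_keyword = {
    'general': 'web',
    'images': 'images',
    'news': 'news',
}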
def raise_for_recaptcha(resp):
    if resp.status_code == 503 \
       and '"https://www.google.com/recaptcha/' in resp.text:
        raise SearxEngineCaptchaException(message='ReCAPTCHA',
                                          suspended_time=3600 * 24 * 7)
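Plausibly, detectors such as raise_for_recaptcha and raise_for_cloudflare_captcha are chained behind a single entry point that engines call on HTTP errors (raise_for_httperror appears above in the Qwant parser). A sketch under that assumption; the raise_for_captcha name and the exact ordering are illustrative, not confirmed by the snippets:

def raise_for_captcha(resp):
    # run every CAPTCHA detector; each one raises SearxEngineCaptchaException itself
    raise_for_cloudflare_captcha(resp)
    raise_for_recaptcha(resp)


def raise_for_httperror(resp):
    # sketch: look for CAPTCHA pages before surfacing a plain HTTP error
    if resp.status_code and resp.status_code >= 400:
        raise_for_captcha(resp)
        resp.raise_for_status()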
def response(resp): """Get response from google's search request""" results = [] # detect google sorry resp_url = urlparse(resp.url) if resp_url.netloc == "sorry.google.com" or resp_url.path == "/sorry/IndexRedirect": raise SearxEngineCaptchaException() if resp_url.path.startswith("/sorry"): raise SearxEngineCaptchaException() # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) img_bas64_map = scrap_out_thumbs(dom) img_src_script = eval_xpath( dom, '//script[contains(., "AF_initDataCallback({key: ")]')[1].text # parse results # # root element:: # <div id="islmp" ..> # result div per image:: # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..." # The data-id matches to a item in a json-data structure in:: # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ... # In this structure the link to the origin PNG, JPG or whatever is given # first link per image-div contains a <img> with the data-iid for bas64 encoded image data:: # <img class="rg_i Q4LuWd" data-iid="0" # second link per image-div is the target link:: # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper"> # the second link also contains two div tags with the *description* and *publisher*:: # <div class="WGvvNb">The Sacrament of the Last Supper ...</div> # <div class="fxgdke">en.wikipedia.org</div> root = eval_xpath(dom, '//div[@id="islmp"]') if not root: logger.error("did not find root element id='islmp'") return results root = root[0] for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'): try: img_alt = eval_xpath(img_node, "@alt")[0] img_base64_id = eval_xpath(img_node, "@data-iid") if img_base64_id: img_base64_id = img_base64_id[0] thumbnail_src = img_bas64_map[img_base64_id] else: thumbnail_src = eval_xpath(img_node, "@src") if not thumbnail_src: thumbnail_src = eval_xpath(img_node, "@data-src") if thumbnail_src: thumbnail_src = thumbnail_src[0] else: thumbnail_src = "" link_node = eval_xpath(img_node, "../../../a[2]")[0] url = eval_xpath(link_node, "@href")[0] pub_nodes = eval_xpath(link_node, "./div/div") pub_descr = img_alt pub_source = "" if pub_nodes: pub_descr = extract_text(pub_nodes[0]) pub_source = extract_text(pub_nodes[1]) img_src_id = eval_xpath(img_node, "../../../@data-id")[0] src_url = scrap_img_by_id(img_src_script, img_src_id) if not src_url: src_url = thumbnail_src results.append({ "url": url, "title": img_alt, "content": pub_descr, "source": pub_source, "img_src": src_url, "img_format": { "width": int(eval_xpath(img_node, "@width")[0]), "height": int(eval_xpath(img_node, "@height")[0]), }, "thumbnail_src": thumbnail_src, "template": "images.html", }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(img_node, pretty_print=True)) # import pdb # pdb.set_trace() continue return results
def detect_google_sorry(resp):
    resp_url = urlparse(resp.url)
    if resp_url.netloc == 'sorry.google.com' or resp_url.path.startswith('/sorry'):
        raise SearxEngineCaptchaException()
def raise_captcha(resp):
    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
        # suspend the engine for 7 days on a CAPTCHA page
        raise SearxEngineCaptchaException(suspended_time=7 * 24 * 3600)
def detect_google_sorry(resp):
    if resp.url.host == 'sorry.google.com' or resp.url.path.startswith('/sorry'):
        raise SearxEngineCaptchaException()
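Unlike the earlier variant, this version does not call urlparse; it presumably receives resp.url as an already-parsed URL object (for example httpx.URL) exposing .host and .path. Either way, the detector is meant to run at the very top of a parser, before any DOM work. A minimal usage sketch; parse_results is a hypothetical helper named only for illustration:

from lxml import html

def response(resp):
    # raise early if Google redirected the request to its "sorry"/CAPTCHA page
    detect_google_sorry(resp)
    dom = html.fromstring(resp.text)
    return parse_results(dom)  # hypothetical helper, not part of the snippets above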
def response(resp): """Get response from google's search request""" results = [] # detect google sorry resp_url = urlparse(resp.url) if resp_url.netloc == 'sorry.google.com' or resp_url.path == '/sorry/IndexRedirect': raise SearxEngineCaptchaException() if resp_url.path.startswith('/sorry'): raise SearxEngineCaptchaException() # which subdomain ? # subdomain = resp.search_params.get('google_subdomain') # convert the text to dom dom = html.fromstring(resp.text) # results --> answer answer = eval_xpath(dom, '//div[contains(@class, "LGOjhe")]//text()') if answer: results.append({'answer': ' '.join(answer)}) else: logger.debug("did not found 'answer'") # results --> number_of_results try: _txt = eval_xpath_getindex(dom, '//div[@id="result-stats"]//text()', 0) _digit = ''.join([n for n in _txt if n.isdigit()]) number_of_results = int(_digit) results.append({'number_of_results': number_of_results}) except Exception as e: # pylint: disable=broad-except logger.debug("did not 'number_of_results'") logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue try: title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug('ingoring <div class="g" ../> section: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0) content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) results.append({ 'url': url, 'title': title, 'content': content }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(result, pretty_print=True)) # import pdb # pdb.set_trace() continue # parse suggestion for suggestion in eval_xpath_list(dom, suggestion_xpath): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) # return results return results