def search(query, request, selected_engines):
    global engines, categories, number_of_searches

    requests = []
    results = {}
    suggestions = set()

    number_of_searches += 1

    #user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    for selected_engine in selected_engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = datetime.now()
        request_params = engine.request(query, request_params)

        callback = make_callback(
            selected_engine['name'],
            results,
            suggestions,
            engine.response,
            request_params
        )

        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=settings['server']['request_timeout']
        )

        if request_params['method'] == 'GET':
            req = grequests.get
        else:
            req = grequests.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        requests.append(req(request_params['url'], **request_args))

    grequests.map(requests)

    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    results = score_results(results)

    for result in results:
        for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']

    return results, suggestions
def image_proxy():
    url = request.args.get('url').encode('utf-8')

    if not url:
        return '', 400

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = requests.get(url,
                        stream=True,
                        timeout=settings['outgoing']['request_timeout'],
                        headers=headers,
                        proxies=outgoing_proxies)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    img = ''
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers,
                          {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
def image_proxy():
    url = request.args.get('url').encode('utf-8')

    if not url:
        return '', 400

    h = hashlib.sha256(url + settings['server']['secret_key'].encode('utf-8')).hexdigest()

    if h != request.args.get('h'):
        return '', 400

    headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
    headers['User-Agent'] = gen_useragent()

    resp = http_get(url,
                    stream=True,
                    timeout=settings['server'].get('request_timeout', 2),
                    headers=headers)

    if resp.status_code == 304:
        return '', resp.status_code

    if resp.status_code != 200:
        logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
        if resp.status_code >= 400:
            return '', resp.status_code
        return '', 400

    if not resp.headers.get('content-type', '').startswith('image/'):
        logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        return '', 400

    img = ''
    chunk_counter = 0

    for chunk in resp.iter_content(1024 * 1024):
        chunk_counter += 1
        if chunk_counter > 5:
            return '', 502  # Bad gateway - file is too big (>5M)
        img += chunk

    headers = dict_subset(resp.headers,
                          {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

    return Response(img, mimetype=resp.headers['content-type'], headers=headers)
def image_proxy(): url = request.args.get("url").encode("utf-8") if not url: return "", 400 h = hashlib.sha256(url + settings["server"]["secret_key"].encode("utf-8")).hexdigest() if h != request.args.get("h"): return "", 400 headers = dict_subset(request.headers, {"If-Modified-Since", "If-None-Match"}) headers["User-Agent"] = gen_useragent() resp = requests.get( url, stream=True, timeout=settings["outgoing"]["request_timeout"], headers=headers, proxies=outgoing_proxies ) if resp.status_code == 304: return "", resp.status_code if resp.status_code != 200: logger.debug("image-proxy: wrong response code: {0}".format(resp.status_code)) if resp.status_code >= 400: return "", resp.status_code return "", 400 if not resp.headers.get("content-type", "").startswith("image/"): logger.debug("image-proxy: wrong content-type: {0}".format(resp.headers.get("content-type"))) return "", 400 img = "" chunk_counter = 0 for chunk in resp.iter_content(1024 * 1024): chunk_counter += 1 if chunk_counter > 5: return "", 502 # Bad gateway - file is too big (>5M) img += chunk headers = dict_subset(resp.headers, {"Content-Length", "Length", "Date", "Last-Modified", "Expires", "Etag"}) return Response(img, mimetype=resp.headers["content-type"], headers=headers)
def request(query, params):
    offset = (params['pageno'] - 1) * 10 + 1

    if params['language'] == 'all':
        lang = 'EN'
    else:
        lang = match_language(params['language'], supported_languages, language_aliases)

    query = u'language:{} {}'.format(lang.split('-')[0].upper(),
                                     query.decode('utf-8')).encode('utf-8')

    search_path = search_string.format(query=urlencode({'q': query}), offset=offset)

    params['url'] = base_url + search_path
    params['headers']['User-Agent'] = gen_useragent('Windows NT 6.3; WOW64')

    return params
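# Hedged usage sketch for the request() hook above: it only fills in params['url'] and a
# User-Agent header. The base_url / search_string values below are illustrative
# assumptions (a typical Bing-style layout), not taken from the snippet itself.
from urllib.parse import urlencode

base_url = 'https://www.bing.com/'
search_string = 'search?{query}&first={offset}'

pageno = 2
offset = (pageno - 1) * 10 + 1          # page 2 starts at result 11
query = 'language:EN searx'             # the language filter is prepended to the query text
url = base_url + search_string.format(query=urlencode({'q': query}), offset=offset)
print(url)  # https://www.bing.com/search?q=language%3AEN+searx&first=11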
def search(self):
    global number_of_searches

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if search_query.lang != 'all' and not engine.language_support:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = search_query.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = search_query.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = search_query.safesearch
        request_params['time_range'] = search_query.time_range

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(search_query.query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(
            selected_engine['name'],
            engine.response,
            request_params,
            self.result_container)

        # create dictionary which contains all
        # information about the request
        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout,
            verify=request_params['verify']
        )

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'], request_args, selected_engine['name']))

    if not requests:
        return self.result_container

    # send all search-requests
    threaded_requests(requests)

    start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
def test_gen_useragent(self):
    self.assertIsInstance(utils.gen_useragent(), str)
    self.assertIsNotNone(utils.gen_useragent())
    self.assertTrue(utils.gen_useragent().startswith('Mozilla'))
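# The test above only checks that gen_useragent() returns a str starting with 'Mozilla'.
# A minimal sketch satisfying that contract is shown below; the real implementation
# presumably draws browser/OS strings from a data file, so treat this as an assumption.
import random

def gen_useragent():
    os_string = random.choice(['Windows NT 10.0; Win64; x64', 'X11; Linux x86_64'])
    version = random.choice(['115.0', '116.0'])
    return 'Mozilla/5.0 ({os}; rv:{ver}) Gecko/20100101 Firefox/{ver}'.format(os=os_string, ver=version)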
def search(self, request):
    global number_of_searches

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    # start search-request for all selected engines
    for selected_engine in self.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # if paging is not supported, skip
        if self.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if self.lang != 'all' and not engine.language_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = self.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = self.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = request.preferences.get_value('safesearch')

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(self.query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(selected_engine['name'],
                                 engine.response,
                                 request_params,
                                 self.result_container)

        # create dictionary which contains all
        # information about the request
        request_args = dict(headers=request_params['headers'],
                            hooks=dict(response=callback),
                            cookies=request_params['cookies'],
                            timeout=engine.timeout,
                            verify=request_params['verify'])

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'], request_args, selected_engine['name']))

    if not requests:
        return self

    # send all search-requests
    threaded_requests(requests)

    # return results, suggestions, answers and infoboxes
    return self
def search(self, task):
    global number_of_searches

    # init vars
    requests = []
    results_queue = Queue()
    results = {}

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    # start search-request for all selected engines
    for selected_engine in self.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # if paging is not supported, skip
        if self.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if self.lang != 'all' and not engine.language_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = self.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = self.lang

        # try:
        #     # 0 = None, 1 = Moderate, 2 = Strict
        #     request_params['safesearch'] = int(request.cookies.get('safesearch'))
        # except Exception:
        request_params['safesearch'] = settings['search']['safe_search']

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(task['query'].encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(
            selected_engine['name'],
            results_queue,
            engine.response,
            request_params)

        # create dictionary which contains all
        # information about the request
        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout,
            verify=request_params['verify']
        )

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'], request_args, selected_engine['name']))

    if not requests:
        return self

    # send all search-requests
    threaded_requests(requests)

    while not results_queue.empty():
        engine_name, engine_results = results_queue.get_nowait()

        # TODO type checks
        [self.suggestions.append(x['suggestion'])
         for x in list(engine_results)
         if 'suggestion' in x
         and engine_results.remove(x) is None]

        [self.answers.append(x['answer'])
         for x in list(engine_results)
         if 'answer' in x
         and engine_results.remove(x) is None]

        self.infoboxes.extend(x for x in list(engine_results)
                              if 'infobox' in x
                              and engine_results.remove(x) is None)

        results[engine_name] = engine_results

    # update engine-specific stats
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    # score results and remove duplications
    self.results = score_results(results)

    # merge infoboxes according to their ids
    self.infoboxes = merge_infoboxes(self.infoboxes)

    # update engine stats, using calculated score
    for result in self.results:
        plugins.callAPI('on_result', self.plugins, locals())

        for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']

        result['pretty_url'] = prettify_url(result['url'])

        # TODO, check if timezone is calculated right
        if 'publishedDate' in result:
            result['pubdate'] = result['publishedDate'].strftime('%Y-%m-%d %H:%M:%S%z')

        if not self.paging and engines[result['engine']].paging:
            self.paging = True

        if 'content' in result:
            result['content_html'] = highlight_content(result['content'], self.query.encode('utf-8'))  # noqa
            result['title_html'] = highlight_content(result['title'], self.query.encode('utf-8'))

        if result.get('content'):
            result['content'] = html_to_text(result['content']).strip()
        # removing html content and whitespace duplications
        result['title'] = ' '.join(html_to_text(result['title']).strip().split())

    # return results, suggestions, answers and infoboxes
    return self
def search(self, request):
    global number_of_searches

    # init vars
    requests = []
    results_queue = Queue()
    results = {}
    suggestions = set()
    answers = set()
    infoboxes = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    # start search-request for all selected engines
    for selected_engine in self.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # if paging is not supported, skip
        if self.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if self.lang != 'all' and not engine.language_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = self.pageno
        request_params['language'] = self.lang

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(self.query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(
            selected_engine['name'],
            results_queue,
            engine.response,
            request_params)

        # create dictionary which contains all
        # information about the request
        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout,
            verify=request_params['verify']
        )

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'], request_args, selected_engine['name']))

    if not requests:
        return results, suggestions, answers, infoboxes

    # send all search-requests
    threaded_requests(requests)

    while not results_queue.empty():
        engine_name, engine_results = results_queue.get_nowait()

        # TODO type checks
        [suggestions.add(x['suggestion'])
         for x in list(engine_results)
         if 'suggestion' in x
         and engine_results.remove(x) is None]

        [answers.add(x['answer'])
         for x in list(engine_results)
         if 'answer' in x
         and engine_results.remove(x) is None]

        infoboxes.extend(x for x in list(engine_results)
                         if 'infobox' in x
                         and engine_results.remove(x) is None)

        results[engine_name] = engine_results

    # update engine-specific stats
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    # score results and remove duplications
    results = score_results(results)

    # merge infoboxes according to their ids
    infoboxes = merge_infoboxes(infoboxes)

    # update engine stats, using calculated score
    for result in results:
        for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']

    # return results, suggestions, answers and infoboxes
    return results, suggestions, answers, infoboxes
def image_proxy():
    url = request.args.get('url')

    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())

    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024

    try:
        headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
        headers['User-Agent'] = gen_useragent()

        stream = http_stream(method='GET',
                             url=url,
                             headers=headers,
                             timeout=settings['outgoing']['request_timeout'],
                             follow_redirects=True,
                             max_redirects=20)

        resp = next(stream)
        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code == 304:
            return '', resp.status_code

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('content-type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
            return '', 400

        headers = dict_subset(resp.headers,
                              {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})

        total_length = 0

        def forward_chunk():
            nonlocal total_length
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
    except httpx.HTTPError:
        return '', 400
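# This variant validates the url with a new_hmac() helper instead of a bare sha256 of
# url + secret. A plausible sketch of such a helper is below (assumption: HMAC-SHA256
# hex digest keyed with the server secret; the real helper may differ).
import hashlib
import hmac

def new_hmac(secret_key, url_bytes):
    return hmac.new(secret_key.encode(), url_bytes, hashlib.sha256).hexdigest()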
def search(self, request):
    global number_of_searches

    # init vars
    requests = []
    results_queue = Queue()
    results = {}

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    # start search-request for all selected engines
    for selected_engine in self.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # if paging is not supported, skip
        if self.pageno > 1 and not engine.paging:
            continue

        # if search-language is set and engine does not
        # provide language-support, skip
        if self.lang != 'all' and not engine.language_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = time()
        request_params['pageno'] = self.pageno

        if hasattr(engine, 'language'):
            request_params['language'] = engine.language
        else:
            request_params['language'] = self.lang

        try:
            # 0 = None, 1 = Moderate, 2 = Strict
            request_params['safesearch'] = int(request.cookies.get('safesearch', 1))
        except ValueError:
            request_params['safesearch'] = 1

        # update request parameters dependent on
        # search-engine (contained in engines folder)
        engine.request(self.query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        # create a callback wrapper for the search engine results
        callback = make_callback(selected_engine['name'],
                                 results_queue,
                                 engine.response,
                                 request_params)

        # create dictionary which contains all
        # information about the request
        request_args = dict(headers=request_params['headers'],
                            hooks=dict(response=callback),
                            cookies=request_params['cookies'],
                            timeout=engine.timeout,
                            verify=request_params['verify'])

        # specific type of request (GET or POST)
        if request_params['method'] == 'GET':
            req = requests_lib.get
        else:
            req = requests_lib.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        # append request to list
        requests.append((req, request_params['url'], request_args, selected_engine['name']))

    if not requests:
        return self

    # send all search-requests
    threaded_requests(requests)

    while not results_queue.empty():
        engine_name, engine_results = results_queue.get_nowait()

        # TODO type checks
        [
            self.suggestions.add(x['suggestion'])
            for x in list(engine_results)
            if 'suggestion' in x and engine_results.remove(x) is None
        ]
        [
            self.answers.add(x['answer'])
            for x in list(engine_results)
            if 'answer' in x and engine_results.remove(x) is None
        ]
        self.infoboxes.extend(
            x for x in list(engine_results)
            if 'infobox' in x and engine_results.remove(x) is None)

        results[engine_name] = engine_results

    # update engine-specific stats
    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    # score results and remove duplications
    self.results = score_results(results)

    # merge infoboxes according to their ids
    self.infoboxes = merge_infoboxes(self.infoboxes)

    # update engine stats, using calculated score
    for result in self.results:
        for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']

    # return results, suggestions, answers and infoboxes
    return self
def image_proxy():
    # pylint: disable=too-many-return-statements, too-many-branches

    url = request.args.get('url')
    if not url:
        return '', 400

    h = new_hmac(settings['server']['secret_key'], url.encode())

    if h != request.args.get('h'):
        return '', 400

    maximum_size = 5 * 1024 * 1024
    forward_resp = False
    resp = None

    try:
        request_headers = {
            'User-Agent': gen_useragent(),
            'Accept': 'image/webp,*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'DNT': '1',
        }
        set_context_network_name('image_proxy')
        stream = http_stream(method='GET',
                             url=url,
                             headers=request_headers,
                             timeout=settings['outgoing']['request_timeout'],
                             follow_redirects=True,
                             max_redirects=20)
        resp = next(stream)

        content_length = resp.headers.get('Content-Length')
        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('Content-Type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
            return '', 400

        forward_resp = True
    except httpx.HTTPError:
        logger.exception('HTTP error')
        return '', 400
    finally:
        if resp and not forward_resp:
            # the code is about to return an HTTP 400 error to the browser
            # we make sure to close the response between searxng and the HTTP server
            try:
                resp.close()
            except httpx.HTTPError:
                logger.exception('HTTP error on closing')

    try:
        headers = dict_subset(resp.headers,
                              {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'})

        def forward_chunk():
            total_length = 0
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
                    break
                yield chunk

        return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
    except httpx.HTTPError:
        return '', 400
def load_engine(engine_data):
    engine_name = engine_data['name']
    if '_' in engine_name:
        logger.error('Engine name contains underscore: "{}"'.format(engine_name))
        sys.exit(1)

    if engine_name.lower() != engine_name:
        logger.warn('Engine name is not lowercase: "{}", converting to lowercase'.format(engine_name))
        engine_name = engine_name.lower()
        engine_data['name'] = engine_name

    engine_module = engine_data['engine']

    try:
        engine = load_module(engine_module + '.py', engine_dir)
    except (SyntaxError, KeyboardInterrupt, SystemExit, SystemError, ImportError, RuntimeError):
        logger.exception('Fatal exception in engine "{}"'.format(engine_module))
        sys.exit(1)
    except:
        logger.exception('Cannot load engine "{}"'.format(engine_module))
        return None

    for param_name, param_value in engine_data.items():
        if param_name == 'engine':
            pass
        elif param_name == 'categories':
            if param_value == 'none':
                engine.categories = []
            else:
                engine.categories = list(map(str.strip, param_value.split(',')))
        else:
            setattr(engine, param_name, param_value)

    for arg_name, arg_value in engine_default_args.items():
        if not hasattr(engine, arg_name):
            setattr(engine, arg_name, arg_value)

    # checking required variables
    for engine_attr in dir(engine):
        if engine_attr.startswith('_'):
            continue
        if engine_attr == 'inactive' and getattr(engine, engine_attr) is True:
            return None
        if getattr(engine, engine_attr) is None:
            logger.error('Missing engine config attribute: "{0}.{1}"'.format(engine.name, engine_attr))
            sys.exit(1)

    # assign supported languages from json file
    if engine_data['name'] in ENGINES_LANGUAGES:
        setattr(engine, 'supported_languages', ENGINES_LANGUAGES[engine_data['name']])

    # find custom aliases for non standard language codes
    if hasattr(engine, 'supported_languages'):
        if hasattr(engine, 'language_aliases'):
            language_aliases = getattr(engine, 'language_aliases')
        else:
            language_aliases = {}

        for engine_lang in getattr(engine, 'supported_languages'):
            iso_lang = match_language(engine_lang, babel_langs, fallback=None)
            if iso_lang and iso_lang != engine_lang and not engine_lang.startswith(iso_lang) and \
               iso_lang not in getattr(engine, 'supported_languages'):
                language_aliases[iso_lang] = engine_lang

        setattr(engine, 'language_aliases', language_aliases)

    # language_support
    setattr(engine, 'language_support', len(getattr(engine, 'supported_languages', [])) > 0)

    # assign language fetching method if auxiliary method exists
    if hasattr(engine, '_fetch_supported_languages'):
        headers = {
            'User-Agent': gen_useragent(),
            'Accept-Language': 'ja-JP,ja;q=0.8,en-US;q=0.5,en;q=0.3',  # bing needs a non-English language
        }
        setattr(engine, 'fetch_supported_languages',
                lambda: engine._fetch_supported_languages(get(engine.supported_languages_url, headers=headers)))

    # tor related settings
    if settings['outgoing'].get('using_tor_proxy'):
        # use onion url if using tor.
        if hasattr(engine, 'onion_url'):
            engine.search_url = engine.onion_url + getattr(engine, 'search_path', '')
    elif 'onions' in engine.categories:
        # exclude onion engines if not using tor.
        return None

    engine.timeout += settings['outgoing'].get('extra_proxy_timeout', 0)

    for category_name in engine.categories:
        categories.setdefault(category_name, []).append(engine)

    if engine.shortcut in engine_shortcuts:
        logger.error('Engine config error: ambiguous shortcut: {0}'.format(engine.shortcut))
        sys.exit(1)

    engine_shortcuts[engine.shortcut] = engine.name

    return engine
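# Illustrative call to load_engine() with an engine_data dict as it might come out of a
# parsed settings file; every name and value here is an assumption made for the example.
engine_data = {
    'name': 'example engine',
    'engine': 'example',        # loaded as example.py from engine_dir
    'shortcut': 'ex',
    'categories': 'general',    # comma separated string, split by load_engine
    'timeout': 3.0,
}
engine = load_engine(engine_data)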
def search(self):
    global number_of_searches

    # start time
    start_time = time()

    # answerers?
    answerers_results = ask(self.search_query)

    if answerers_results:
        for results in answerers_results:
            self.result_container.extend('answer', results)
        return self.result_container

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # max of all selected engine timeouts
    timeout_limit = 0

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['pageno'] = search_query.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = search_query.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = search_query.safesearch
        request_params['time_range'] = search_query.time_range

        # append request to list
        requests.append((selected_engine['name'], search_query.query, request_params))

        # update timeout_limit
        timeout_limit = max(timeout_limit, engine.timeout)

    if requests:
        # send all search-requests
        search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
        start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
def search(self):
    global number_of_searches

    # start time
    start_time = time()

    # answerers?
    answerers_results = ask(self.search_query)

    if answerers_results:
        for results in answerers_results:
            self.result_container.extend('answer', results)
        return self.result_container

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # max of all selected engine timeouts
    timeout_limit = 0

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['pageno'] = search_query.pageno

        if hasattr(engine, 'language') and engine.language:
            request_params['language'] = engine.language
        else:
            request_params['language'] = search_query.lang

        # 0 = None, 1 = Moderate, 2 = Strict
        request_params['safesearch'] = search_query.safesearch
        request_params['time_range'] = search_query.time_range

        # append request to list
        requests.append((selected_engine['name'], search_query.query.encode('utf-8'), request_params))

        # update timeout_limit
        timeout_limit = max(timeout_limit, engine.timeout)

    if requests:
        # send all search-requests
        search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
        start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
def search(self):
    global number_of_searches

    # Check if there is an external bang. After that we can stop because the search will terminate.
    if self.search_query.external_bang:
        self.result_container.redirect_url = get_bang_url(self.search_query)

        # This means there was a valid bang and the
        # rest of the search does not need to be continued
        if isinstance(self.result_container.redirect_url, str):
            return self.result_container

    # start time
    start_time = time()

    # answerers?
    answerers_results = ask(self.search_query)

    if answerers_results:
        for results in answerers_results:
            self.result_container.extend('answer', results)
        return self.result_container

    # init vars
    requests = []

    # increase number of searches
    number_of_searches += 1

    # set default useragent
    # user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    search_query = self.search_query

    # max of all selected engine timeouts
    default_timeout = 0

    # start search-request for all selected engines
    for selected_engine in search_query.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        if not search_query.preferences.validate_token(engine):
            continue

        # skip suspended engines
        if engine.suspend_end_time >= time():
            logger.debug('Engine currently suspended: %s', selected_engine['name'])
            continue

        # if paging is not supported, skip
        if search_query.pageno > 1 and not engine.paging:
            continue

        # if time_range is not supported, skip
        if search_query.time_range and not engine.time_range_support:
            continue

        # set default request parameters
        request_params = {}
        if not engine.offline:
            request_params = default_request_params()
            request_params['headers']['User-Agent'] = user_agent

            if hasattr(engine, 'language') and engine.language:
                request_params['language'] = engine.language
            else:
                request_params['language'] = search_query.lang

            request_params['safesearch'] = search_query.safesearch
            request_params['time_range'] = search_query.time_range

        request_params['category'] = selected_engine['category']
        request_params['pageno'] = search_query.pageno

        # append request to list
        requests.append((selected_engine['name'], search_query.query, request_params))

        # update default_timeout
        default_timeout = max(default_timeout, engine.timeout)

    # adjust timeout
    self.actual_timeout = default_timeout
    query_timeout = self.search_query.timeout_limit

    if max_request_timeout is None and query_timeout is None:
        # No max, no user query: default_timeout
        pass
    elif max_request_timeout is None and query_timeout is not None:
        # No max, but user query: From user query except if above default
        self.actual_timeout = min(default_timeout, query_timeout)
    elif max_request_timeout is not None and query_timeout is None:
        # Max, no user query: Default except if above max
        self.actual_timeout = min(default_timeout, max_request_timeout)
    elif max_request_timeout is not None and query_timeout is not None:
        # Max & user query: From user query except if above max
        self.actual_timeout = min(query_timeout, max_request_timeout)

    logger.debug("actual_timeout={0} (default_timeout={1}, ?timeout_limit={2}, max_request_timeout={3})"
                 .format(self.actual_timeout, default_timeout, query_timeout, max_request_timeout))

    # send all search-requests
    if requests:
        search_multiple_requests(requests, self.result_container, start_time, self.actual_timeout)
        start_new_thread(gc.collect, tuple())

    # return results, suggestions, answers and infoboxes
    return self.result_container
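# Worked example of the timeout clamping above, with illustrative values:
default_timeout = 3.0        # max() over the selected engines' timeouts
query_timeout = 5.0          # user-supplied timeout_limit from the query
max_request_timeout = 4.0    # instance-wide ceiling
actual_timeout = min(query_timeout, max_request_timeout)  # both set -> clamp the user value
print(actual_timeout)  # 4.0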
def search(self, request):
    global number_of_searches

    requests = []
    results = {}
    suggestions = set()

    number_of_searches += 1

    #user_agent = request.headers.get('User-Agent', '')
    user_agent = gen_useragent()

    for selected_engine in self.engines:
        if selected_engine['name'] not in engines:
            continue

        engine = engines[selected_engine['name']]

        if self.pageno > 1 and not engine.paging:
            continue

        if self.lang != 'all' and not engine.language_support:
            continue

        request_params = default_request_params()
        request_params['headers']['User-Agent'] = user_agent
        request_params['category'] = selected_engine['category']
        request_params['started'] = datetime.now()
        request_params['pageno'] = self.pageno
        request_params['language'] = self.lang
        request_params = engine.request(self.query.encode('utf-8'), request_params)

        if request_params['url'] is None:
            # TODO add support of offline engines
            pass

        callback = make_callback(
            selected_engine['name'],
            results,
            suggestions,
            engine.response,
            request_params
        )

        request_args = dict(
            headers=request_params['headers'],
            hooks=dict(response=callback),
            cookies=request_params['cookies'],
            timeout=engine.timeout
        )

        if request_params['method'] == 'GET':
            req = grequests.get
        else:
            req = grequests.post
            request_args['data'] = request_params['data']

        # ignoring empty urls
        if not request_params['url']:
            continue

        requests.append(req(request_params['url'], **request_args))

    grequests.map(requests)

    for engine_name, engine_results in results.items():
        engines[engine_name].stats['search_count'] += 1
        engines[engine_name].stats['result_count'] += len(engine_results)

    results = score_results(results)

    for result in results:
        for res_engine in result['engines']:
            engines[result['engine']].stats['score_count'] += result['score']

    return results, suggestions