async def fetch_one(instance: str) -> dict:
    timings = {}
    try:
        user_pool_limits = httpx.PoolLimits(soft_limit=10, hard_limit=300)
        network_type = get_network_type(instance)
        async with new_client(pool_limits=user_pool_limits, network_type=network_type) as session:
            # check index with a new connection each time
            print('🏠 ' + instance)
            await request_stat_with_exception(timings, 'index',
                                              session, instance,
                                              REQUEST_COUNT, 20, 40, None)
            # check wikipedia engine with a new connection each time
            print('🔎 ' + instance)
            await request_stat_with_exception(timings, 'search_wp',
                                              session, instance,
                                              REQUEST_COUNT, 30, 60, check_wikipedia_result,
                                              params={'q': '!wp time'})
            # check google engine with a new connection each time
            print('🔍 ' + instance)
            await request_stat_with_exception(timings, 'search_go',
                                              session, instance,
                                              2, 60, 80, check_google_result,
                                              params={'q': '!google time'})
    except RequestErrorException as ex:
        print('❌ {0}: {1}'.format(str(instance), str(ex)))
    except Exception as ex:
        print('❌❌ {0}: unexpected {1} {2}'.format(str(instance), type(ex), str(ex)))
        timings['error'] = exception_to_str(ex)
        traceback.print_exc(file=sys.stdout)
    else:
        print('🏁 {0}'.format(str(instance)))
    return timings
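# Illustrative sketch (not part of the original module): fetch_one() can be
# driven concurrently over a list of instances with asyncio.gather. The
# fetch_all() helper, the concurrency limit and the `instances` argument are
# assumptions for illustration only.
import asyncio

async def fetch_all(instances, limit=10):
    semaphore = asyncio.Semaphore(limit)

    async def bounded_fetch(instance):
        # bound the number of instances checked at the same time
        async with semaphore:
            return instance, await fetch_one(instance)

    results = await asyncio.gather(*(bounded_fetch(i) for i in instances))
    return dict(results)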
async def fetch_one(searx_stats_result: SearxStatisticsResult, url: str, detail):
    network_type = get_network_type(url)
    async with new_client(network_type=network_type) as session:
        # get status and config
        result_status = await get_status(session, url)
        result_config, result_instance = await get_config(session, url)
        if result_status is None:
            result_stats = await get_stats_multi(session, url)
            result_status = get_status_from_stats(result_stats)

        # update config and status for the instance
        detail_engines = detail.setdefault('engines', dict())
        if result_instance is not None:
            dict_merge(detail_engines, result_instance)
        if result_status is not None:
            dict_merge(detail_engines, result_status)

        # update the existing engine and category lists
        if result_config is not None:
            # engines
            searx_stats_result.engines.update(result_config['engines'])
            # categories
            for category in result_config['categories']:
                if category not in searx_stats_result.categories:
                    searx_stats_result.categories.append(category)
    print('💡 {0:30}'.format(url))
async def get_ip(url):
    async with new_client() as session:
        response, error = await get(session, url, timeout=10.0)
        if error is None:
            return response.text, None
        else:
            return False, error
async def get_instance_urls():
    instance_urls = []

    # fetch the .rst source
    async with new_client() as session:
        response = await session.get(SEARX_INSTANCES_URL, timeout=10)

    # get source after 'Alive and running'
    match = re.search(AFTER_ALIVE_AND_RUNNING, response.text)
    if match:
        # for each item of a list
        lines = re.findall(ITEM_RE, match.group(0))
        for line in lines:
            # for each link
            links = re.findall(LINK_RE, line)
            for link in links:
                # normalize the link
                url = normalize_url(link[1])
                if url:
                    # add it
                    instance_urls.append(url)

    # remove duplicates
    instance_urls = list(set(instance_urls))
    # sort list
    instance_urls.sort()
    return instance_urls
async def analyze(host):
    user_url = USER_ENDPOINT.format(host)
    json = None
    try:
        # get the result from cryptcheck.fr
        async with new_client() as session:
            json, pending = await get_existing_result(session, host, CACHE_EXPIRE_TIME)
            if json is None:
                # no existing result, or it is too old
                if not pending:
                    # ask for a refresh
                    await refresh_result(session, host)
                # poll the response
                json = await pool_result(session, host)

        # get the ranks from the result
        if json is not None and json.get('result') is not None:
            # get the grades from the different IPs (use a set to remove duplicates)
            ranks = list(set(map(lambda r: r.get('grade', '?'), json['result'])))
            # concatenate all the grades on one line, worst grade first
            ranks.sort(reverse=True)
            ranks = ', '.join(ranks)
            return (ranks, user_url)
        else:
            return ('?', user_url)
    except Exception as ex:
        print(host, exception_to_str(ex))
        return ('?', user_url)
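# Minimal usage sketch, assuming the module above is importable; the host
# below is a placeholder. analyze() yields a (ranks, user_url) tuple where
# ranks is a comma-separated list of TLS grades.
async def example_analyze():
    ranks, user_url = await analyze('searx.example.com')
    print('grades: {0} (details at {1})'.format(ranks, user_url))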
async def test_do_get_404(httpserver: pytest_httpserver.HTTPServer):
    httpserver.expect_request('/404.html').\
        respond_with_data('Not Found', content_type='text/html', status=404)
    async with http.new_client() as session:
        response, error = await http.get(session, httpserver.url_for('/404.html'))
        assert response.text == 'Not Found'
        assert error == 'HTTP status code 404'
async def test_do_get_ok(httpserver: pytest_httpserver.HTTPServer):
    httpserver.expect_request('/index.html').\
        respond_with_data('OK', content_type='text/html')
    async with http.new_client() as session:
        response, error = await http.get(session, httpserver.url_for('/index.html'))
        assert response.text == 'OK'
        assert error is None
async def fetch_one(instance_url):
    detail = dict()
    # no cookie (the cookies=DEFAULT_COOKIES argument is deliberately omitted)
    # error stays None if new_client() raises before get() is reached
    error = None
    try:
        network_type = get_network_type(instance_url)
        async with new_client(network_type=network_type) as session:
            response, error = await get(session, instance_url, headers=DEFAULT_HEADERS, timeout=10)
            if response is not None:
                version = await get_searx_version(response)
                detail = {
                    'network_type': network_type.name.lower(),
                    'http': {
                        'status_code': response.status_code,
                        'error': error
                    },
                    'version': version,
                    'timing': {
                        'initial': response.elapsed.total_seconds()
                    },
                    'alternativeUrls': {},
                }
                response_url = str(response.url)
                # add a trailing slash
                if not response_url.endswith('/'):
                    response_url = response_url + '/'
                # redirect
                if response_url != instance_url:
                    if 'redirect_from' not in detail:
                        detail['redirect_from'] = []
                    detail['alternativeUrls'][instance_url] = 'redirect'
                    instance_url = response_url
            else:
                detail = {
                    'network_type': network_type.name.lower(),
                    'http': {
                        'status_code': None,
                        'error': error
                    },
                    'version': None,
                    'timing': {},
                    'alternativeUrls': {},
                }
    except concurrent.futures.TimeoutError:
        # this exception occurs on new_client()
        detail['error'] = 'Timeout error'
    if error is not None:
        detail['error'] = error
    if network_type == NetworkType.NORMAL:
        detail['tls'] = get_ssl_info(get_host(instance_url))
    return instance_url, detail
async def fetch_one(instance_url: str, private: bool) -> tuple:
    # no cookie (the cookies=DEFAULT_COOKIES argument is deliberately omitted)
    network_type = get_network_type(instance_url)
    detail = {
        'network_type': network_type.name.lower(),
        'http': {},
        'version': None,
    }
    try:
        async with new_client(network_type=network_type) as session:
            response, error = await get(session, instance_url, headers=DEFAULT_HEADERS, timeout=10)
            status_code = response.status_code if response is not None else None
            detail['http'] = {
                'status_code': status_code,
                'error': error,
            }
            if response is not None:
                response_url = str(response.url)
                # add a trailing slash
                if not response_url.endswith('/'):
                    response_url = response_url + '/'
                # redirect
                if 'alternativeUrls' not in detail:
                    detail['alternativeUrls'] = dict()
                if response_url != instance_url:
                    detail['alternativeUrls'][instance_url] = 'redirect from'
                    instance_url = response_url
                # get the searx version
                if error is None:
                    await asyncio.sleep(0.5)
                    await set_searx_version(detail, session, response_url, response)
                # set the initial response time
                detail['timing'] = {}
                response_time_stats = ResponseTimeStats()
                response_time_stats.add_response(response)
                detail['timing']['initial'] = response_time_stats.get()
    except concurrent.futures.TimeoutError:
        # this exception occurs on new_client()
        error = 'Timeout error'
    if (detail['version'] is not None or private) and network_type == NetworkType.NORMAL:
        detail['tls'] = get_ssl_info(get_host(instance_url))
    if error is not None:
        detail['http']['error'] = error
        detail['error'] = error
    return instance_url, detail
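# Hedged sketch of how a caller might aggregate the (url, detail) tuples
# returned by fetch_one(); fetch_details() and its parameters are assumptions,
# not part of the source.
async def fetch_details(instance_urls, private=False):
    details = {}
    for instance_url in instance_urls:
        # fetch_one() may rewrite the URL when the instance redirects,
        # so the result is keyed on the returned URL
        url, detail = await fetch_one(instance_url, private)
        details[url] = detail
    return details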
async def test_do_get_connection_refused(
        httpserver: pytest_httpserver.HTTPServer):
    httpserver.expect_request('/index.html').\
        respond_with_data('Not Found', content_type='text/html', status=404)
    # close HTTP server on purpose: make sure the connection will be refused
    httpserver.stop()
    try:
        async with http.new_client() as session:
            response, error = await http.get(session, httpserver.url_for('/index.html'))
    finally:
        # start again to avoid side effect
        httpserver.start()
    assert response is None
    assert error == 'Connection refused'
async def get_instance_urls():
    instance_urls = []

    # fetch the html page
    async with new_client() as session:
        response = await session.get(SEARX_INSTANCES_URL, headers=DEFAULT_HEADERS,
                                     cookies=DEFAULT_COOKIES, timeout=10)
    html = await html_fromstring(response.text)

    # remove all content before the REMOVE_BEFORE_LOWER_CASE marker
    for element in MARKDOWN_ELEMENTS_XPATH(html)[0].getchildren():
        text = stringify_children(element)
        if text.lower().find(REMOVE_BEFORE_LOWER_CASE) >= 0:
            break
        element.clear()

    # check all links
    for aelement in INSTANCES_XPATH(html):
        ahref = aelement.get('href')
        if ahref.startswith('https://www.ssllabs.com/') or \
           ahref.startswith('https://hstspreload.org/') or \
           ahref.startswith('https://geti2p.net/') or \
           ahref.endswith('/cert/'):
            continue
        if ahref.endswith('/'):
            ahref = ahref[:-1]
        if ahref.endswith('/search'):
            ahref = ahref[:-7]
        # remove .i2p (keep .onion URLs)
        host = get_host(ahref)
        if host.endswith('.i2p'):
            continue
        ahref = ahref + '/'
        instance_urls.append(ahref)

    # remove duplicates
    instance_urls = list(set(instance_urls))
    # sort list
    instance_urls.sort()
    return instance_urls
async def fetch_one(instance: str) -> dict:
    timings = {}
    try:
        network_type = get_network_type(instance)
        timeout = 15 if network_type == NetworkType.NORMAL else 30
        async with new_client(timeout=timeout, network_type=network_type) as client:
            # check if the cookie settings are supported
            # intended side effect: add one HTTP connection to the pool
            cookies = await get_cookie_settings(client, instance)
            # check the default engines
            print('🔎 ' + instance)
            await request_stat_with_log(instance, timings, 'search',
                                        client, instance,
                                        3, 120, 160, check_search_result,
                                        params={'q': 'time'},
                                        cookies=cookies, headers=DEFAULT_HEADERS)
            # check the wikipedia engine
            print('🐘 ' + instance)
            await request_stat_with_log(instance, timings, 'search_wp',
                                        client, instance,
                                        2, 60, 160, check_wikipedia_result,
                                        params={'q': '!wp time'},
                                        cookies=cookies, headers=DEFAULT_HEADERS)
            # check the google engine
            # the default engines may include google results too,
            # so the wikipedia engine is checked before this one
            print('🔍 ' + instance)
            await request_stat_with_log(instance, timings, 'search_go',
                                        client, instance,
                                        2, 60, 160, check_google_result,
                                        params={'q': '!google time'},
                                        cookies=cookies, headers=DEFAULT_HEADERS)
    except Exception as ex:
        print('❌❌ {0}: unexpected {1} {2}'.format(str(instance), type(ex), str(ex)))
        timings['error'] = exception_to_str(ex)
        traceback.print_exc(file=sys.stdout)
    else:
        print('🏁 {0}'.format(str(instance)))
    return timings
async def analyze(host):
    grade_url = USER_ENDPOINT.format(host)
    try:
        async with new_client() as session:
            response = await session.post(API_NEW.format(host))
            json = response.json()
            if json.get('error') == 'rescan-attempt-too-soon':
                return False
            finished = False
            grade = None
            remaining_tries = MAX_RETRY
            while not finished:
                await asyncio.sleep(TIME_BETWEEN_RETRY)
                response = await session.get(API_GET.format(host), timeout=5)
                json = response.json()
                state = json.get('state', '')
                if state == 'FINISHED':
                    finished = True
                    grade = json.get('grade')
                elif state in ['ABORTED', 'FAILED']:
                    finished = True
                    grade = None
                elif state not in ['PENDING', 'STARTING', 'RUNNING']:
                    print(host, 'unknown state', state)
                    finished = True
                    grade = None
                if remaining_tries == 0:
                    finished = True
                    grade = None
                else:
                    remaining_tries = remaining_tries - 1
    except Exception as ex:
        print(host, exception_to_str(ex))
        grade = None
    return (grade, grade_url)
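# Illustrative sketch of the polling pattern used above, factored into a
# generic helper; poll_until() and its parameters are assumptions for
# illustration only.
import asyncio

async def poll_until(fetch_state, done_states, delay, max_retry):
    # call fetch_state() up to max_retry times, sleeping `delay` seconds
    # before each call, until it reports one of the done_states
    for _ in range(max_retry):
        await asyncio.sleep(delay)
        state, payload = await fetch_state()
        if state in done_states:
            return state, payload
    return None, None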
async def test_new_client():
    async with http.new_client() as session:
        cookies = session.cookies
        assert cookies is not None