def fetch(searx_stats_result: SearxStatisticsResult):
    ressource_hashes = {
        'index': 0
    }

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # create searx_json['hashes']
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for ressource_hash, ressource_desc in ressource_hashes.items():
        if ressource_hash != 'index':
            i = ressource_desc['index']
            del ressource_desc['index']
            ressource_desc['hash'] = ressource_hash
            searx_stats_result.hashes[i] = ressource_desc

    # detect forks using the static files
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        ressources = detail.get('html', {}).get('ressources')
        if ressources:
            found_forks = find_forks(detail['html']['ressources'],
                                     searx_stats_result.hashes,
                                     searx_stats_result.forks)
            if found_forks and detail['git_url'] not in found_forks:
                detail['git_url'] = found_forks[0]

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if 'html' in detail:
            html = detail['html']
            html['grade'] = get_grade(html['ressources'], searx_stats_result.hashes)
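# Worked example of the hash-list transformation above, with invented data (the
# 'count' field is an assumption for illustration; only 'index' and 'hash' are
# used by the code itself). 'index' counts the distinct ressources seen; every
# other key is a hash whose 'index' entry becomes its position in the list.
ressource_hashes = {
    'index': 2,
    'abc123': {'index': 0, 'count': 5},
    'def456': {'index': 1, 'count': 1},
}
# After the loop, searx_stats_result.hashes would be:
# [{'count': 5, 'hash': 'abc123'}, {'count': 1, 'hash': 'def456'}]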
async def _find_similar_instances(searx_stats_result: SearxStatisticsResult):
    # group instance urls per set of IPs
    all_ips_set = dict()
    for url, detail in searx_stats_result.iter_instances(
            valid_or_private=True, network_type=NetworkType.NORMAL):
        ips = set(detail.get('network', {}).get('ips', {}).keys())
        # at least one IP
        if len(ips) > 0:
            # frozenset so it can be used as a key of all_ips_set
            ips = frozenset(ips)
            urls = all_ips_set.setdefault(ips, set())
            urls.add(url)

    # set alternativeUrls
    for ips, urls in all_ips_set.items():
        if len(urls) > 1:
            # only if there are two or more instances sharing the same IPs
            for url in urls:
                # for each url, create a reference to all other urls
                detail = searx_stats_result.get_instance(url)
                if 'alternativeUrls' not in detail:
                    detail['alternativeUrls'] = dict()
                for url2 in urls:
                    if url2 != url and url2 not in detail['alternativeUrls']:
                        detail['alternativeUrls'][url2] = 'same IP'
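# Minimal, self-contained sketch of the grouping trick used above: a frozenset is
# hashable (unlike a plain set), so a set of IPs can serve as a dict key. The
# helper name and data below are invented for illustration only.
def _group_urls_by_ip_set(details: dict) -> dict:
    groups = dict()
    for url, ips in details.items():
        if ips:
            groups.setdefault(frozenset(ips), set()).add(url)
    return groups

# Example: both URLs resolve to the same pair of IPs, so they land in one group
# regardless of iteration order.
assert _group_urls_by_ip_set({
    'https://a.example': {'192.0.2.1', '192.0.2.2'},
    'https://b.example': {'192.0.2.2', '192.0.2.1'},
}) == {frozenset({'192.0.2.1', '192.0.2.2'}): {'https://a.example', 'https://b.example'}}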
async def fetch(searx_stats_result: SearxStatisticsResult):
    url_to_deleted = []
    url_to_update = OrderedDict()

    # fetch and store the changes in url_to_deleted and url_to_update;
    # do not modify searx_stats_result.instances to avoid changing it
    # while it is being iterated
    async def fetch_and_store_change(url: str, detail, *_, **__):
        if 'version' not in detail:
            r_url, r_detail = await fetch_one_display(url, searx_stats_result.private)
            dict_merge(r_detail, detail)
            if r_url != url:
                # r_url is the URL after following an HTTP redirect;
                # in this case searx_stats_result.instances[url] must be deleted.
                url_to_deleted.append(url)
            url_to_update[r_url] = r_detail

    instance_iterator = searx_stats_result.iter_instances(only_valid=False, valid_or_private=False)
    await for_each(instance_iterator, fetch_and_store_change, limit=1)

    # apply the changes
    for url in url_to_deleted:
        del searx_stats_result.instances[url]
    for url, detail in url_to_update.items():
        searx_stats_result.update_instance(url, detail)
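# Sketch of what dict_merge(destination, source) plausibly does in the function
# above: recursively copy entries of `source` into `destination`, so values
# already stored in `detail` end up overriding the freshly fetched `r_detail`.
# This is an assumption about the helper, not the project's actual implementation.
def dict_merge(destination: dict, source: dict) -> dict:
    for key, value in source.items():
        if isinstance(value, dict) and isinstance(destination.get(key), dict):
            # merge nested dicts instead of replacing them wholesale
            dict_merge(destination[key], value)
        else:
            destination[key] = value
    return destination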
async def fetch(searx_stats_result: SearxStatisticsResult):
    seen_git_url = set()
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        git_url = normalize_git_url(detail['git_url'])
        if git_url and git_url not in seen_git_url:
            try:
                await fetch_hashes_from_url(git_url)
            except Exception as ex:
                print(exception_to_str(ex))
            else:
                if git_url not in searx_stats_result.forks:
                    searx_stats_result.forks.append(git_url)
            seen_git_url.add(git_url)
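# normalize_git_url is used above to deduplicate repositories; its exact rules
# are project-specific. A minimal sketch, assuming the goal is only that
# equivalent clone URLs compare equal: drop a trailing slash and a '.git' suffix.
def normalize_git_url(git_url):
    if not git_url:
        return git_url
    git_url = git_url.rstrip('/')
    if git_url.endswith('.git'):
        git_url = git_url[:-len('.git')]
    return git_url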
async def fetch(searx_stats_result: SearxStatisticsResult):
    url_to_deleted = []

    async def fetch_and_set_async(url: str, detail, *_, **__):
        if 'version' not in detail:
            r_url, r_detail = await fetch_one_display(url, searx_stats_result.private)
            dict_merge(r_detail, detail)
            if r_url != url:
                # another instance's r_url will never be this url (the variable),
                # since r_url is the result of following an HTTP redirect
                url_to_deleted.append(url)
            searx_stats_result.update_instance(r_url, r_detail)

    instance_iterator = searx_stats_result.iter_instances(only_valid=False, valid_or_private=False)
    await for_each(instance_iterator, fetch_and_set_async, limit=1)

    for url in url_to_deleted:
        del searx_stats_result.instances[url]
def fetch_instances(searx_stats_result: SearxStatisticsResult, network_type: NetworkType, ressource_hashes):
    driver = new_driver(network_type=network_type)
    try:
        for url, detail in searx_stats_result.iter_instances(only_valid=True, network_type=network_type):
            if get_network_type(url) == network_type:
                ressources = fetch_ressource_hashes(driver, url, ressource_hashes, searx_stats_result.forks)
                if 'error' in ressources:
                    # don't reuse the browser if there was an error
                    driver.quit()
                    driver = new_driver(network_type=network_type)
                # temporary storage
                detail['html'] = {
                    'ressources': ressources
                }
                # output progress
                external_js = len(ressources.get('script', []))
                inline_js = len(ressources.get('inline_script', []))
                error_msg = ressources.get('error', '').strip()
                print('🔗 {0:60} {1:3} loaded js {2:3} inline js {3}'.format(url, external_js, inline_js, error_msg))
    finally:
        driver.quit()
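# new_driver above hands out a fresh browser per network type; a plausible
# sketch using Selenium headless Firefox, assuming the enum also defines a TOR
# member that should route through a local Tor SOCKS proxy (both assumptions,
# not the project's actual code).
from selenium import webdriver

def new_driver(network_type=NetworkType.NORMAL):
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    if network_type == NetworkType.TOR:
        # manual proxy config pointed at the default Tor client port,
        # resolving DNS through the proxy as well
        options.set_preference('network.proxy.type', 1)
        options.set_preference('network.proxy.socks', '127.0.0.1')
        options.set_preference('network.proxy.socks_port', 9050)
        options.set_preference('network.proxy.socks_remote_dns', True)
    return webdriver.Firefox(options=options)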
def fetch(searx_stats_result: SearxStatisticsResult):
    ressource_hashes = {
        'index': 0
    }

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # create searx_json['hashes']
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for ressource_hash, ressource_desc in ressource_hashes.items():
        if ressource_hash != 'index':
            i = ressource_desc['index']
            del ressource_desc['index']
            ressource_desc['hash'] = ressource_hash
            searx_stats_result.hashes[i] = ressource_desc

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if 'html' in detail:
            html = detail['html']
            html['grade'] = get_grade(html['ressources'], searx_stats_result.hashes)
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(valid_or_private=True, network_type=NetworkType.NORMAL),
                   fetch_one, searx_stats_result)
async def fetch(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(only_valid=True),
                   fetch_one, searx_stats_result)
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(only_valid=False, network_type=NetworkType.NORMAL),
                   fetch_one, searx_stats_result)
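# for_each(iterator, handler, *args, limit=N) is used throughout this section.
# A minimal sketch of such a helper, assuming its semantics are: call
# handler(url, detail, *args) for each (url, detail) pair, with at most `limit`
# coroutines running concurrently (the extra *_, **__ parameters on the
# handlers above suggest they tolerate additional arguments).
import asyncio

async def for_each(iterator, handler, *args, limit=1):
    semaphore = asyncio.Semaphore(limit)

    async def guarded(item):
        # the semaphore caps how many handler calls run at the same time
        async with semaphore:
            await handler(*item, *args)

    await asyncio.gather(*[guarded(item) for item in iterator])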