Example #1
def fetch(searx_stats_result: SearxStatisticsResult):
    ressource_hashes = {
        'index': 0
    }

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # create searx_json['hashes']
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for ressource_hash, ressource_desc in ressource_hashes.items():
        if ressource_hash != 'index':
            i = ressource_desc['index']
            del ressource_desc['index']
            ressource_desc['hash'] = ressource_hash
            searx_stats_result.hashes[i] = ressource_desc

    # detect fork using the static files
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        ressources = detail.get('html', {}).get('ressources')
        if ressources:
            found_forks = find_forks(ressources, searx_stats_result.hashes, searx_stats_result.forks)
            if found_forks and detail['git_url'] not in found_forks:
                detail['git_url'] = found_forks[0]

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if 'html' in detail:
            html = detail['html']
            html['grade'] = get_grade(html['ressources'], searx_stats_result.hashes)
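
A note on the hash bookkeeping above: ressource_hashes maps every hash to a descriptor carrying its position, and the special 'index' key holds the running counter, so the loop simply inverts the dict into a flat list. A self-contained demonstration of the same inversion, with made-up descriptor fields:

ressource_hashes = {
    'index': 2,  # running counter: two distinct hashes were registered
    'aaa111': {'index': 0, 'count': 3},  # 'count' is an illustrative field
    'bbb222': {'index': 1, 'count': 1},
}
hashes = [None] * ressource_hashes['index']
for ressource_hash, ressource_desc in ressource_hashes.items():
    if ressource_hash != 'index':
        i = ressource_desc.pop('index')
        ressource_desc['hash'] = ressource_hash
        hashes[i] = ressource_desc
assert hashes == [{'count': 3, 'hash': 'aaa111'}, {'count': 1, 'hash': 'bbb222'}]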
Example #2
async def _find_similar_instances(searx_stats_result: SearxStatisticsResult):
    # group instance urls per ip set
    all_ips_set = dict()
    for url, detail in searx_stats_result.iter_instances(
            valid_or_private=True, network_type=NetworkType.NORMAL):
        ips = set(detail.get('network', {}).get('ips', {}).keys())
        # at least one IP
        if len(ips) > 0:
            # frozenset so it can be used as a key of all_ips_set
            ips = frozenset(ips)
            urls = all_ips_set.setdefault(ips, set())
            urls.add(url)
    # set alternativeUrls
    for ips, urls in all_ips_set.items():
        if len(urls) > 1:
            # only if there are two or more instances sharing the same ips
            for url in urls:
                # for each url, create a reference to all other urls
                detail = searx_stats_result.get_instance(url)
                if 'alternativeUrls' not in detail:
                    detail['alternativeUrls'] = dict()

                for url2 in urls:
                    if url2 != url and url2 not in detail['alternativeUrls']:
                        detail['alternativeUrls'][url2] = 'same IP'
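
The frozenset conversion above is what makes the grouping work: a plain set is unhashable and cannot serve as a dict key, while frozensets can, and two frozensets with the same members are equal. A tiny standalone illustration (URLs are made up):

groups = {}
ips_a = frozenset({'1.2.3.4', '5.6.7.8'})
ips_b = frozenset({'5.6.7.8', '1.2.3.4'})  # same members, different build order
groups.setdefault(ips_a, set()).add('https://instance-one.example')
groups.setdefault(ips_b, set()).add('https://instance-two.example')
assert len(groups) == 1  # both URLs land in the same bucket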
Example #3
async def fetch(searx_stats_result: SearxStatisticsResult):

    url_to_deleted = []
    url_to_update = OrderedDict()

    # fetch and store the changes in url_to_deleted and url_to_update;
    # do not modify searx_stats_result.instances while it is being iterated
    async def fetch_and_store_change(url: str, detail, *_, **__):
        if 'version' not in detail:
            r_url, r_detail = await fetch_one_display(
                url, searx_stats_result.private)
            dict_merge(r_detail, detail)
            if r_url != url:
                # r_url is the URL after following an HTTP redirect;
                # in this case searx_stats_result.instances[url] must be deleted.
                url_to_deleted.append(url)
            url_to_update[r_url] = r_detail

    instance_iterator = searx_stats_result.iter_instances(
        only_valid=False, valid_or_private=False)
    await for_each(instance_iterator, fetch_and_store_change, limit=1)

    # apply the changes
    for url in url_to_deleted:
        del searx_stats_result.instances[url]
    for url, detail in url_to_update.items():
        searx_stats_result.update_instance(url, detail)
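
The snippet assumes a dict_merge helper that merges its second argument into the first in place; here it carries the already-known fields of detail over into the freshly fetched r_detail. A minimal sketch of such a helper, as an assumption rather than the project's actual implementation:

from collections.abc import Mapping

def dict_merge(dct: dict, merge_dct: Mapping) -> dict:
    # Recursively merge merge_dct into dct, in place: scalar values from
    # merge_dct win, nested mappings are merged key by key.
    for key, value in merge_dct.items():
        if isinstance(dct.get(key), dict) and isinstance(value, Mapping):
            dict_merge(dct[key], value)
        else:
            dct[key] = value
    return dct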
Example #4
async def fetch(searx_stats_result: SearxStatisticsResult):
    seen_git_url = set()
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        git_url = normalize_git_url(detail.get('git_url'))
        if git_url and git_url not in seen_git_url:
            try:
                await fetch_hashes_from_url(git_url)
            except Exception as ex:
                print(exception_to_str(ex))
            else:
                if git_url not in searx_stats_result.forks:
                    searx_stats_result.forks.append(git_url)
            seen_git_url.add(git_url)
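
normalize_git_url is not shown here; judging from the "if git_url" guard it returns a canonical form, or a falsy value when there is nothing usable. A purely illustrative guess at its behavior:

def normalize_git_url(git_url):
    # Hypothetical sketch: lower-case the URL and drop a trailing slash
    # and ".git" suffix so equivalent repository URLs compare equal.
    if not git_url:
        return None
    url = git_url.strip().lower().rstrip('/')
    if url.endswith('.git'):
        url = url[:-len('.git')]
    return url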
Example #5
async def fetch(searx_stats_result: SearxStatisticsResult):

    url_to_deleted = []

    async def fetch_and_set_async(url: str, detail, *_, **__):
        if 'version' not in detail:
            r_url, r_detail = await fetch_one_display(
                url, searx_stats_result.private)
            dict_merge(r_detail, detail)
            if r_url != url:
                # no other instance's r_url will ever equal this url, since
                # every r_url is the result of following HTTP redirects,
                # so it is safe to delete the stale entry after the loop
                url_to_deleted.append(url)
            searx_stats_result.update_instance(r_url, r_detail)

    instance_iterator = searx_stats_result.iter_instances(
        only_valid=False, valid_or_private=False)
    await for_each(instance_iterator, fetch_and_set_async, limit=1)

    for url in url_to_deleted:
        del searx_stats_result.instances[url]
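
Deferring the deletions through url_to_deleted is not cosmetic: assuming iter_instances yields lazily from the underlying dict, removing entries inside the loop would change the dict's size mid-iteration, which Python forbids:

d = {'a': 1, 'b': 2}
try:
    for key in d:
        del d[key]  # mutating the dict while iterating over it
except RuntimeError as ex:
    print(ex)  # dictionary changed size during iteration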
Example #6
def fetch_instances(searx_stats_result: SearxStatisticsResult, network_type: NetworkType, ressource_hashes):
    driver = new_driver(network_type=network_type)
    try:
        for url, detail in searx_stats_result.iter_instances(only_valid=True, network_type=network_type):
            if get_network_type(url) == network_type:
                ressources = fetch_ressource_hashes(driver, url, ressource_hashes, searx_stats_result.forks)
                if 'error' in ressources:
                    # don't reuse the browser if there was an error
                    driver.quit()
                    driver = new_driver(network_type=network_type)
                # temporary storage
                detail['html'] = {
                    'ressources': ressources
                }
                # output progress
                external_js = len(ressources.get('script', []))
                inline_js = len(ressources.get('inline_script', []))
                error_msg = ressources.get('error', '').strip()
                print('🔗 {0:60} {1:3} loaded js {2:3} inline js  {3}'.format(url, external_js, inline_js, error_msg))
    finally:
        driver.quit()
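
new_driver is assumed to return a fresh browser bound to the given network type. A minimal sketch of what it could look like with Selenium and headless Firefox, routing NetworkType.TOR through a local Tor SOCKS proxy on port 9050; all of this is an assumption, not the project's code:

from selenium import webdriver
from selenium.webdriver.firefox.options import Options

def new_driver(network_type=None):
    options = Options()
    options.add_argument('-headless')
    if network_type == NetworkType.TOR:  # NetworkType comes from the project
        # hypothetical: send all traffic through a local Tor SOCKS proxy
        options.set_preference('network.proxy.type', 1)
        options.set_preference('network.proxy.socks', '127.0.0.1')
        options.set_preference('network.proxy.socks_port', 9050)
        options.set_preference('network.proxy.socks_remote_dns', True)
    return webdriver.Firefox(options=options)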
Example #7
def fetch(searx_stats_result: SearxStatisticsResult):
    ressource_hashes = {'index': 0}

    for network_type in NetworkType:
        fetch_instances(searx_stats_result, network_type, ressource_hashes)

    # create searx_json['hashes']
    searx_stats_result.hashes = [None] * ressource_hashes['index']
    for ressource_hash, ressource_desc in ressource_hashes.items():
        if ressource_hash != 'index':
            i = ressource_desc['index']
            del ressource_desc['index']
            ressource_desc['hash'] = ressource_hash
            searx_stats_result.hashes[i] = ressource_desc

    # get grade
    for _, detail in searx_stats_result.iter_instances(only_valid=True):
        if 'html' in detail:
            html = detail['html']
            html['grade'] = get_grade(html['ressources'],
                                      searx_stats_result.hashes)
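
This is the same post-processing as in Example #1, minus the fork detection. The 'index' counter is expected to be advanced by fetch_ressource_hashes whenever it meets a hash for the first time, presumably with a registration pattern along these lines (register_hash is a hypothetical name):

def register_hash(ressource_hash, ressource_hashes):
    # Give each previously unseen hash the next free slot and return a
    # stable index on every later call.
    if ressource_hash not in ressource_hashes:
        ressource_hashes[ressource_hash] = {'index': ressource_hashes['index']}
        ressource_hashes['index'] += 1
    return ressource_hashes[ressource_hash]['index']

ressource_hashes = {'index': 0}
assert register_hash('aaa111', ressource_hashes) == 0
assert register_hash('bbb222', ressource_hashes) == 1
assert register_hash('aaa111', ressource_hashes) == 0  # stable on repeat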
Example #8
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    await for_each(
        searx_stats_result.iter_instances(valid_or_private=True,
                                          network_type=NetworkType.NORMAL),
        fetch_one, searx_stats_result)
Example #9
async def fetch(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(only_valid=True),
                   fetch_one, searx_stats_result)
Example #10
async def _fetch_network(searx_stats_result: SearxStatisticsResult):
    await for_each(searx_stats_result.iter_instances(only_valid=False, network_type=NetworkType.NORMAL),
                   fetch_one, searx_stats_result)
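
Examples #3, #5 and #8-#10 all rely on a for_each helper that fans an async callback out over the (url, detail) pairs, forwarding extra positional arguments and honoring a concurrency limit. A minimal sketch consistent with those call sites, as an assumption rather than the project's implementation:

import asyncio

async def for_each(iterator, func, *args, limit=1):
    # Run func(url, detail, *args) for every pair, keeping at most
    # `limit` coroutines in flight; the iterator is consumed lazily.
    pending = set()
    for url, detail in iterator:
        if len(pending) >= limit:
            _, pending = await asyncio.wait(
                pending, return_when=asyncio.FIRST_COMPLETED)
        pending.add(asyncio.ensure_future(func(url, detail, *args)))
    if pending:
        await asyncio.wait(pending)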