Exemplo n.º 1
0
def retrieve_all(hostname: str) -> dict:
    """Fetch the base page and a fixed set of well-known files from a host.

    A session is attempted over plain HTTP and over HTTPS, using the headers
    and cookies that ``select_site_headers`` returns for the host. The HTTPS
    session is preferred whenever it was created; otherwise the HTTP one is
    used.

    Args:
        hostname: Host to contact; it is embedded as-is into
            ``http://<hostname>/`` and ``https://<hostname>/``.

    Returns:
        A dict with the keys ``hostname``, ``resources`` (path -> page text),
        ``responses`` (``auto``, ``cors``, ``http``, ``https``) and
        ``session``. When neither session could be created, ``resources`` is
        empty and every response, as well as ``session``, is ``None``.
    """
    # Paths fetched in addition to the base page
    well_known_paths = ('/clientaccesspolicy.xml', '/contribute.json',
                        '/crossdomain.xml', '/robots.txt')

    # Headers and cookies come from the database
    # TODO: Allow headers to be overridden on a per-scan basis?
    site_settings = select_site_headers(hostname)

    # One reusable session per scheme: plain HTTP first, then HTTPS
    plain = __create_session('http://' + hostname + '/',
                             headers=site_settings['headers'],
                             cookies=site_settings['cookies'])
    secure = __create_session('https://' + hostname + '/',
                              headers=site_settings['headers'],
                              cookies=site_settings['cookies'])

    responses = {
        'auto': None,  # whichever of 'http' or 'https' actually works, 'https' taking priority
        'cors': None,  # CORS preflight test
        'http': None,
        'https': None,
    }
    pages = {}
    result = {
        'hostname': hostname,
        'resources': pages,
        'responses': responses,
        'session': None,
    }

    # Nothing could be loaded over either scheme: hand back the empty skeleton
    if not (plain['session'] or secure['session']):
        return result

    # Keep both scheme-specific responses (some things are only retrievable over one of them)
    responses['http'] = plain['response']
    responses['https'] = secure['response']

    # HTTPS wins whenever its session exists
    preferred = secure if secure['session'] else plain
    responses['auto'] = preferred['response']
    session = result['session'] = preferred['session']

    # Contents of the base page
    pages['/'] = __get_page_text(responses['auto'])

    # Request carrying an Origin header, stored as the CORS test response
    responses['cors'] = __get(session,
                              headers={'Origin': RETRIEVER_CORS_ORIGIN})

    # Text of every additional file we retrieve
    for path in well_known_paths:
        pages[path] = __get_page_text(__get(session, path))

    return result
Exemplo n.º 2
0
def retrieve_all(hostname: str) -> dict:
    """Retrieve the base page plus a handful of well-known files for a host.

    Sessions are attempted over both HTTP and HTTPS with the headers returned
    by ``select_site_headers``. If the HTTPS session exists it is the one used
    for all follow-up requests; otherwise the HTTP session is used.

    Args:
        hostname: Host to contact; concatenated directly into the
            ``http://`` and ``https://`` base URLs.

    Returns:
        A dict with ``hostname``, ``resources`` (path -> page text),
        ``responses`` (``auto``, ``cors``, ``http``, ``https``) and
        ``session``. If neither session could be created, the response
        entries and ``session`` stay ``None`` and ``resources`` stays empty.
    """
    # Extra files to download once a working session is available
    extra_paths = ('/clientaccesspolicy.xml', '/contribute.json', '/crossdomain.xml', '/robots.txt')

    # Headers come from the database
    # TODO: Allow headers to be overridden on a per-scan basis?
    site_headers = select_site_headers(hostname)

    # Reusable sessions, created in the order HTTP then HTTPS
    over_http = __create_session('http://' + hostname + '/', headers=site_headers)
    over_https = __create_session('https://' + hostname + '/', headers=site_headers)

    outcome = {
        'hostname': hostname,
        'resources': {},
        'responses': {
            'auto': None,  # whichever of 'http' or 'https' actually works, with 'https' as higher priority
            'cors': None,  # CORS preflight test
            'http': None,
            'https': None,
        },
        'session': None,
    }

    # The site cannot be loaded at all when both sessions are missing
    if not (over_http['session'] or over_https['session']):
        return outcome

    collected = outcome['responses']
    texts = outcome['resources']

    # Record the per-scheme responses (some things can only be retrieved over one or the other)
    collected['http'] = over_http['response']
    collected['https'] = over_https['response']

    # Prefer HTTPS when its session was created
    chosen = over_https if over_https['session'] else over_http
    collected['auto'] = chosen['response']
    active_session = chosen['session']
    outcome['session'] = active_session

    # Contents of the base page
    texts['/'] = __get_page_text(collected['auto'])

    # Request with an Origin header, recorded as the CORS test response
    cors_headers = {'Origin': 'https://http-observatory.security.mozilla.org'}
    collected['cors'] = __get(active_session, headers=cors_headers)

    # Fetch each extra file and keep its text
    for path in extra_paths:
        reply = __get(active_session, path)
        texts[path] = __get_page_text(reply)

    return outcome
Exemplo n.º 3
0
def scan(hostname: str, site_id: int, scan_id: int):
    """Scan ``hostname`` and record the outcome against ``scan_id``.

    The scan is marked RUNNING, the site's resources are retrieved, every
    callable in ``tests`` is run against the retrieved data, and the results
    are stored with ``insert_test_results``. Failures are reported through
    ``update_scan_state`` rather than raised, except for the worker-level
    exceptions that are deliberately re-raised.

    Args:
        hostname: Host to scan.
        site_id: Identifier passed to ``insert_test_results`` for the site.
        scan_id: Identifier of the scan whose state is updated.

    Returns:
        None. All results are side effects (state updates, stored results).

    Raises:
        TimeLimitExceeded, WorkerLostError, WorkerShutdown, WorkerTerminate:
            Propagated unchanged to the caller.
    """
    try:
        # Once celery kicks off the task, let's update the scan state from PENDING to RUNNING
        update_scan_state(scan_id, STATE_RUNNING)

        # Get the site's cookies and headers
        headers = select_site_headers(hostname)

        # Attempt to retrieve all the resources
        reqs = retrieve_all(hostname, cookies=headers['cookies'], headers=headers['headers'])

        # If we can't connect at all, let's abort the test
        if reqs['responses']['auto'] is None:
            update_scan_state(scan_id, STATE_FAILED, error='site down')

            return

        # Run every test against the retrieved data and store the results together
        # with the sanitized headers and status code of the preferred response
        # TODO: Get overridden expectations
        insert_test_results(site_id,
                            scan_id,
                            [test(reqs) for test in tests],
                            sanitize_headers(reqs['responses']['auto'].headers),
                            reqs['responses']['auto'].status_code)

    # catch the celery timeout, which will almost certainly occur in retrieve_all()
    except SoftTimeLimitExceeded:
        update_scan_state(scan_id, STATE_ABORTED, error='site unresponsive')
    # Worker-level failures are not ours to handle; this clause must stay ahead of
    # the generic handler so they are never recorded as an ordinary scan failure
    except (TimeLimitExceeded, WorkerLostError, WorkerShutdown, WorkerTerminate):
        raise
    # the database is down, oh no!
    except IOError:
        print('database down, aborting scan on {hostname}'.format(hostname=hostname), file=sys.stderr)
    # Only ordinary errors are treated as scan failures. A bare ``except:`` would also
    # swallow KeyboardInterrupt / SystemExit / GeneratorExit, which must propagate.
    except Exception as e:
        # TODO: have more specific error messages

        # If we are unsuccessful, close out the scan in the database
        update_scan_state(scan_id, STATE_FAILED, error=repr(e))

        # Print the exception to stderr if we're in dev
        if DEVELOPMENT_MODE:
            import traceback
            print('Error detected in scan for : ' + hostname, file=sys.stderr)
            traceback.print_exc(file=sys.stderr)