示例#1
0
def empty_requests(http_equiv_file=None) -> dict:
    """Build a canned retrieval dict filled with fixed placeholder values.

    The result has the keys 'hostname', 'resources', 'responses' and
    'session'. Every entry in 'responses' is a UserDict carrying the same
    hard-coded attributes (headers, history, request, status_code, url,
    verified); 'cors', 'http' and 'https' are independent deep copies of
    'auto'.

    Args:
        http_equiv_file: Optional name of a file located in the
            'unittests/files' directory next to this module. When given, its
            text is stored under resources['__path__'] and is also handed to
            parse_http_equiv_headers to fill the 'auto' response's
            ``http_equiv`` attribute.

    Returns:
        The assembled dict.
    """
    # Load the HTML fixture up front (if one was requested) so it can be
    # dropped straight into the resources mapping below.
    page_html = None
    if http_equiv_file:
        base_dir = os.path.abspath(os.path.dirname(__file__))
        fixture_path = os.path.join(base_dir, 'unittests', 'files',
                                    http_equiv_file)
        with open(fixture_path, 'r') as fixture:
            page_html = fixture.read()

    # The template response that the other three responses are copied from.
    auto_response = UserDict()
    auto_response.headers = {'Content-Type': 'text/html'}
    auto_response.history = []
    auto_response.request = UserDict()
    auto_response.request.headers = UserDict()
    auto_response.status_code = 200
    auto_response.url = 'https://http-observatory.security.mozilla.org/'
    auto_response.verified = True

    stub_session = UserDict()
    stub_session.cookies = RequestsCookieJar()

    req = {
        'hostname': 'http-observatory.security.mozilla.org',
        'resources': {
            '__path__': page_html,
            '/': None,
            '/clientaccesspolicy.xml': None,
            '/contribute.json': None,
            '/crossdomain.xml': None,
            '/robots.txt': None,
        },
        'responses': {
            'auto': auto_response,
            'cors': deepcopy(auto_response),
            'http': deepcopy(auto_response),
            'https': deepcopy(auto_response),
        },
        'session': stub_session,
    }

    # http_equiv is attached only after the copies above were taken, so it
    # exists on the 'auto' response alone.
    if http_equiv_file:
        auto_response.http_equiv = parse_http_equiv_headers(page_html)
    else:
        auto_response.http_equiv = {}

    return req
示例#2
0
def retrieve_all(hostname, **kwargs):
    """Retrieve a host's base page and a fixed set of extra resources.

    One session is created per scheme via ``__create_session``. The HTTPS
    session is used for all further requests when it exists, otherwise the
    HTTP one.

    Args:
        hostname: Host to retrieve; concatenated into the request URLs.
        **kwargs: Optional settings, all forwarded to ``__create_session``:
            ``cookies`` (default ``{}``), ``headers`` (default ``{}``),
            ``http_port`` / ``https_port`` (no port suffix when absent),
            ``path`` (default ``'/'``) and ``verify`` (default ``True``).

    Returns:
        A dict with the keys 'hostname', 'resources', 'responses' and
        'session'. When neither session could be created, 'resources' is
        empty and every response and the session are ``None``.
    """
    # HTTP cookies / headers to send, instead of from the database
    kwargs.setdefault('cookies', {})
    kwargs.setdefault('headers', {})

    # Turn each port into a ':<port>' URL suffix, or '' when none was given;
    # this keeps the URLs tidy.
    for port_key in ('http_port', 'https_port'):
        kwargs[port_key] = (':' + str(kwargs[port_key])
                            if port_key in kwargs else '')

    kwargs.setdefault('path', '/')
    kwargs.setdefault('verify', True)

    fetched = {}
    responses = {
        'auto': None,  # whichever of 'http' or 'https' actually works, 'https' preferred
        'cors': None,  # CORS preflight test
        'http': None,
        'https': None,
    }
    retrievals = {
        'hostname': hostname,
        'resources': fetched,
        'responses': responses,
        'session': None,
    }

    # Create some reusable sessions, one for HTTP and one for HTTPS
    http_url = 'http://' + hostname + kwargs['http_port'] + kwargs['path']
    http_session = __create_session(http_url, **kwargs)
    https_url = 'https://' + hostname + kwargs['https_port'] + kwargs['path']
    https_session = __create_session(https_url, **kwargs)

    # If neither one works, then the site just can't be loaded
    if http_session['session'] is None and https_session['session'] is None:
        return retrievals

    # Keep both per-scheme responses (some things can only be retrieved over
    # one or the other)
    responses['http'] = http_session['response']
    responses['https'] = https_session['response']

    # HTTPS takes priority whenever its session exists
    if https_session['session'] is not None:
        preferred = https_session
    else:
        preferred = http_session
    auto = preferred['response']
    session = preferred['session']
    responses['auto'] = auto
    retrievals['session'] = session

    # Store the contents of the "base" page
    fetched['__path__'] = __get_page_text(auto, force=True)

    # Do a CORS preflight request
    responses['cors'] = __get(
        session, kwargs['path'], headers={'Origin': RETRIEVER_CORS_ORIGIN})

    # Store all the files we retrieve
    for resource in ('/clientaccesspolicy.xml', '/contribute.json',
                     '/crossdomain.xml', '/robots.txt'):
        fetched[resource] = __get_page_text(__get(session, resource))

    # Parse out the HTTP meta-equiv headers, but only for a non-empty base
    # page whose media type (parameters stripped) is listed in HTML_TYPES
    media_type = auto.headers.get('Content-Type', '').split(';')[0]
    if media_type in HTML_TYPES and fetched['__path__']:
        auto.http_equiv = parse_http_equiv_headers(fetched['__path__'])
    else:
        auto.http_equiv = {}

    return retrievals
示例#3
0
def retrieve_all(hostname, **kwargs):
    """Collect the responses and resource bodies for ``hostname``.

    Sessions for 'http://' and 'https://' are created with
    ``__create_session``; the HTTPS one is preferred for every follow-up
    request, with HTTP as the fallback.

    Args:
        hostname: Host name used to build the request URLs.
        **kwargs: Optional ``cookies``, ``headers``, ``http_port``,
            ``https_port``, ``path`` and ``verify``. Missing values default
            to ``{}``, ``{}``, no port, no port, ``'/'`` and ``True``; the
            normalised mapping is passed on to ``__create_session``.

    Returns:
        dict: 'hostname', 'resources', 'responses' ('auto', 'cors', 'http',
        'https') and 'session'. If no session could be created the dict is
        returned with empty resources and ``None`` placeholders.
    """
    # HTTP cookies to send, instead of from the database
    if 'cookies' not in kwargs:
        kwargs['cookies'] = {}
    # HTTP headers to send, instead of from the database
    if 'headers' not in kwargs:
        kwargs['headers'] = {}

    # A supplied port becomes ':<port>'; a missing one becomes the empty
    # string so the URL stays tidy.
    if 'http_port' in kwargs:
        kwargs['http_port'] = ':' + str(kwargs['http_port'])
    else:
        kwargs['http_port'] = ''
    if 'https_port' in kwargs:
        kwargs['https_port'] = ':' + str(kwargs['https_port'])
    else:
        kwargs['https_port'] = ''

    if 'path' not in kwargs:
        kwargs['path'] = '/'
    if 'verify' not in kwargs:
        kwargs['verify'] = True

    # 'auto' ends up being whichever of 'http' or 'https' actually works
    # (with 'https' as higher priority); 'cors' is the CORS preflight test.
    retrievals = {
        'hostname': hostname,
        'resources': {},
        'responses': dict.fromkeys(('auto', 'cors', 'http', 'https')),
        'session': None,
    }

    # Create some reusable sessions, one for HTTP and one for HTTPS
    http_attempt = __create_session('http://' + hostname + kwargs['http_port'] + kwargs['path'], **kwargs)
    https_attempt = __create_session('https://' + hostname + kwargs['https_port'] + kwargs['path'], **kwargs)

    http_ok = http_attempt['session'] is not None
    https_ok = https_attempt['session'] is not None

    # If neither one works, then the site just can't be loaded
    if not (http_ok or https_ok):
        return retrievals

    primary = https_attempt if https_ok else http_attempt

    # Store the HTTP only and HTTPS only responses (some things can only be
    # retrieved over one or the other), plus the preferred one as 'auto'
    retrievals['responses']['http'] = http_attempt['response']
    retrievals['responses']['https'] = https_attempt['response']
    retrievals['responses']['auto'] = primary['response']
    retrievals['session'] = primary['session']

    # Store the contents of the "base" page
    base_page = __get_page_text(retrievals['responses']['auto'], force=True)
    retrievals['resources']['__path__'] = base_page

    # Do a CORS preflight request
    cors_headers = {'Origin': RETRIEVER_CORS_ORIGIN}
    retrievals['responses']['cors'] = __get(retrievals['session'], kwargs['path'], headers=cors_headers)

    # Store all the files we retrieve
    resources = (
        '/clientaccesspolicy.xml',
        '/contribute.json',
        '/crossdomain.xml',
        '/robots.txt',
    )
    for resource in resources:
        retrievals['resources'].update({
            resource: __get_page_text(__get(retrievals['session'], resource)),
        })

    # Parse out the HTTP meta-equiv headers when the base page is non-empty
    # and its Content-Type (without parameters) is one of HTML_TYPES
    auto_response = retrievals['responses']['auto']
    is_html = auto_response.headers.get('Content-Type', '').split(';')[0] in HTML_TYPES
    if is_html and base_page:
        auto_response.http_equiv = parse_http_equiv_headers(base_page)
    else:
        auto_response.http_equiv = {}

    return retrievals