def empty_requests(http_equiv_file=None) -> dict:
    """Build a stub requests-style structure for unit tests.

    Returns a dict shaped like the retriever's output ('hostname',
    'resources', 'responses', 'session'), with UserDict stand-ins for
    real response/session objects.  If *http_equiv_file* is given, that
    HTML file (from unittests/files/) is loaded as the base page and
    parsed for its <meta http-equiv> headers.
    """
    req = {
        'hostname': 'http-observatory.security.mozilla.org',
        'resources': {
            '__path__': None,
            '/': None,
            '/clientaccesspolicy.xml': None,
            '/contribute.json': None,
            '/crossdomain.xml': None,
            '/robots.txt': None,
        },
        'responses': {
            'auto': UserDict(),
            'cors': None,
            'http': None,
            'https': None,
        },
        'session': UserDict(),
    }

    # Load the HTML file into the object for content tests, if requested.
    if http_equiv_file:
        base_dir = os.path.abspath(os.path.dirname(__file__))
        with open(os.path.join(base_dir, 'unittests', 'files', http_equiv_file), 'r') as f:
            req['resources']['__path__'] = f.read()

    # Flesh out the fake "auto" response with the attributes the scanner reads.
    auto = req['responses']['auto']
    auto.headers = {
        'Content-Type': 'text/html',
    }
    auto.history = []
    auto.request = UserDict()
    auto.request.headers = UserDict()
    auto.status_code = 200
    auto.url = 'https://http-observatory.security.mozilla.org/'
    auto.verified = True

    req['session'].cookies = RequestsCookieJar()

    # Copies are made before http_equiv is attached below, so only the
    # 'auto' response ends up carrying an http_equiv attribute.
    for key in ('cors', 'http', 'https'):
        req['responses'][key] = deepcopy(auto)

    # Parse the HTML file for its own <meta http-equiv> headers, if requested.
    auto.http_equiv = (parse_http_equiv_headers(req['resources']['__path__'])
                       if http_equiv_file else {})

    return req
def retrieve_all(hostname, **kwargs):
    """Retrieve *hostname* over HTTP and HTTPS, plus its well-known resources.

    Keyword options: 'cookies' / 'headers' (sent instead of values from the
    database), 'http_port' / 'https_port' (non-default ports), 'path'
    (defaults to '/'), and 'verify' (TLS verification, defaults to True).

    Returns a dict with the hostname, fetched resource bodies, the raw
    responses ('auto' preferring HTTPS over HTTP, plus 'cors', 'http',
    'https'), and the reusable session — all None/empty if the site could
    not be loaded at all.
    """
    # HTTP cookies and headers to send, instead of from the database.
    kwargs.setdefault('cookies', {})
    kwargs.setdefault('headers', {})

    # This way of doing it keeps the urls tidy even if makes the code ugly
    kwargs['http_port'] = ':' + str(kwargs['http_port']) if 'http_port' in kwargs else ''
    kwargs['https_port'] = ':' + str(kwargs['https_port']) if 'https_port' in kwargs else ''
    kwargs.setdefault('path', '/')
    kwargs.setdefault('verify', True)

    retrievals = {
        'hostname': hostname,
        'resources': {},
        'responses': {
            'auto': None,   # whichever of 'http' or 'https' actually works, with 'https' as higher priority
            'cors': None,   # CORS preflight test
            'http': None,
            'https': None,
        },
        'session': None,
    }

    # The list of resources to get
    resources = (
        '/clientaccesspolicy.xml',
        '/contribute.json',
        '/crossdomain.xml',
        '/robots.txt',
    )

    # Create some reusable sessions, one for HTTP and one for HTTPS
    http_session = __create_session(
        'http://' + hostname + kwargs['http_port'] + kwargs['path'], **kwargs)
    https_session = __create_session(
        'https://' + hostname + kwargs['https_port'] + kwargs['path'], **kwargs)

    # If neither one works, then the site just can't be loaded.
    if http_session['session'] is None and https_session['session'] is None:
        return retrievals

    # Store the HTTP only and HTTPS only responses (some things can only be
    # retrieved over one or the other).
    retrievals['responses']['http'] = http_session['response']
    retrievals['responses']['https'] = https_session['response']

    # Prefer the HTTPS session for everything else, falling back to HTTP.
    preferred = https_session if https_session['session'] is not None else http_session
    retrievals['responses']['auto'] = preferred['response']
    retrievals['session'] = preferred['session']

    # Store the contents of the "base" page.
    retrievals['resources']['__path__'] = __get_page_text(
        retrievals['responses']['auto'], force=True)

    # Do a CORS preflight request.
    retrievals['responses']['cors'] = __get(
        retrievals['session'],
        kwargs['path'],
        headers={'Origin': RETRIEVER_CORS_ORIGIN})

    # Store all the files we retrieve.
    for resource in resources:
        retrievals['resources'][resource] = __get_page_text(
            __get(retrievals['session'], resource))

    # Parse out the HTTP meta-equiv headers, but only when the base page is
    # HTML and was actually retrieved.
    content_type = retrievals['responses']['auto'].headers.get('Content-Type', '').split(';')[0]
    if content_type in HTML_TYPES and retrievals['resources']['__path__']:
        retrievals['responses']['auto'].http_equiv = parse_http_equiv_headers(
            retrievals['resources']['__path__'])
    else:
        retrievals['responses']['auto'].http_equiv = {}

    return retrievals
def retrieve_all(hostname, **kwargs): kwargs['cookies'] = kwargs.get('cookies', {}) # HTTP cookies to send, instead of from the database kwargs['headers'] = kwargs.get('headers', {}) # HTTP headers to send, instead of from the database # This way of doing it keeps the urls tidy even if makes the code ugly kwargs['http_port'] = ':' + str(kwargs.get('http_port', '')) if 'http_port' in kwargs else '' kwargs['https_port'] = ':' + str(kwargs.get('https_port', '')) if 'https_port' in kwargs else '' kwargs['path'] = kwargs.get('path', '/') kwargs['verify'] = kwargs.get('verify', True) retrievals = { 'hostname': hostname, 'resources': { }, 'responses': { 'auto': None, # whichever of 'http' or 'https' actually works, with 'https' as higher priority 'cors': None, # CORS preflight test 'http': None, 'https': None, }, 'session': None, } # The list of resources to get resources = ( '/clientaccesspolicy.xml', '/contribute.json', '/crossdomain.xml', '/robots.txt' ) # Create some reusable sessions, one for HTTP and one for HTTPS http_session = __create_session('http://' + hostname + kwargs['http_port'] + kwargs['path'], **kwargs) https_session = __create_session('https://' + hostname + kwargs['https_port'] + kwargs['path'], **kwargs) # If neither one works, then the site just can't be loaded if http_session['session'] is None and https_session['session'] is None: return retrievals else: # Store the HTTP only and HTTPS only responses (some things can only be retrieved over one or the other) retrievals['responses']['http'] = http_session['response'] retrievals['responses']['https'] = https_session['response'] if https_session['session'] is not None: retrievals['responses']['auto'] = https_session['response'] retrievals['session'] = https_session['session'] else: retrievals['responses']['auto'] = http_session['response'] retrievals['session'] = http_session['session'] # Store the contents of the "base" page retrievals['resources']['__path__'] = 
__get_page_text(retrievals['responses']['auto'], force=True) # Do a CORS preflight request retrievals['responses']['cors'] = __get(retrievals['session'], kwargs['path'], headers={'Origin': RETRIEVER_CORS_ORIGIN}) # Store all the files we retrieve for resource in resources: resp = __get(retrievals['session'], resource) retrievals['resources'][resource] = __get_page_text(resp) # Parse out the HTTP meta-equiv headers if (retrievals['responses']['auto'].headers.get('Content-Type', '').split(';')[0] in HTML_TYPES and retrievals['resources']['__path__']): retrievals['responses']['auto'].http_equiv = parse_http_equiv_headers(retrievals['resources']['__path__']) else: retrievals['responses']['auto'].http_equiv = {} return retrievals