def retrieve_all(hostname: str, *, cookies=None, headers=None) -> dict:
    """Retrieve the base page and a fixed set of well-known files from a host.

    A session is attempted over both HTTP and HTTPS. When the HTTPS session
    exists it is the one used for every follow-up request; otherwise the HTTP
    session is used.

    Args:
        hostname: Host to contact; the base URLs are built as
            ``'http://' + hostname + '/'`` and ``'https://' + hostname + '/'``.
        cookies: Optional cookies to send. When ``None``, the ``'cookies'``
            entry returned by ``select_site_headers(hostname)`` is used.
        headers: Optional headers to send. When ``None``, the ``'headers'``
            entry returned by ``select_site_headers(hostname)`` is used.

    Returns:
        A dict with the keys:

        * ``'hostname'``: the hostname that was passed in.
        * ``'resources'``: maps ``'/'`` and each retrieved path to whatever
          ``__get_page_text`` returns for its response.
        * ``'responses'``: the ``'http'`` and ``'https'`` responses, ``'auto'``
          (the HTTPS response when an HTTPS session exists, else the HTTP one)
          and ``'cors'`` (the response to a request carrying an ``Origin``
          header).
        * ``'session'``: the session used for the follow-up requests.

        If neither session could be created, ``'resources'`` is empty and every
        other entry except ``'hostname'`` is ``None``.
    """
    retrievals = {
        'hostname': hostname,
        'resources': {},
        'responses': {
            'auto': None,   # whichever of 'http' or 'https' actually works, with 'https' as higher priority
            'cors': None,   # response to the request sent with an Origin header
            'http': None,
            'https': None,
        },
        'session': None,
    }

    # The list of resources to get
    resources = (
        '/clientaccesspolicy.xml',
        '/contribute.json',
        '/crossdomain.xml',
        '/robots.txt',
    )

    # Only consult the database for whatever the caller did not supply
    if cookies is None or headers is None:
        stored = select_site_headers(hostname)
        if cookies is None:
            cookies = stored['cookies']
        if headers is None:
            headers = stored['headers']

    # Create some reusable sessions, one for HTTP and one for HTTPS
    http_session = __create_session('http://' + hostname + '/', headers=headers, cookies=cookies)
    https_session = __create_session('https://' + hostname + '/', headers=headers, cookies=cookies)

    # If neither one works, then the site just can't be loaded
    if not http_session['session'] and not https_session['session']:
        return retrievals

    # Store the HTTP only and HTTPS only responses (some things can only be retrieved over one or the other)
    retrievals['responses']['http'] = http_session['response']
    retrievals['responses']['https'] = https_session['response']

    # HTTPS wins whenever its session exists
    preferred = https_session if https_session['session'] else http_session
    retrievals['responses']['auto'] = preferred['response']
    retrievals['session'] = preferred['session']

    # Store the contents of the base page
    retrievals['resources']['/'] = __get_page_text(retrievals['responses']['auto'])

    # Do a CORS request: same session, with an Origin header attached
    retrievals['responses']['cors'] = __get(retrievals['session'],
                                            headers={'Origin': RETRIEVER_CORS_ORIGIN})

    # Store all the files we retrieve
    for resource in resources:
        resp = __get(retrievals['session'], resource)
        retrievals['resources'][resource] = __get_page_text(resp)

    return retrievals
def retrieve_all(hostname: str, *, cookies=None, headers=None) -> dict:
    """Retrieve the base page and a fixed set of well-known files from a host.

    A session is attempted over both HTTP and HTTPS. When the HTTPS session
    exists it is the one used for every follow-up request; otherwise the HTTP
    session is used.

    Args:
        hostname: Host to contact; the base URLs are built as
            ``'http://' + hostname + '/'`` and ``'https://' + hostname + '/'``.
        cookies: Optional cookies to send. Forwarded to ``__create_session``
            only when supplied, so a call without it behaves as before.
        headers: Optional headers to send. When ``None``, the value returned
            by ``select_site_headers(hostname)`` is used.

    Returns:
        A dict with the keys:

        * ``'hostname'``: the hostname that was passed in.
        * ``'resources'``: maps ``'/'`` and each retrieved path to whatever
          ``__get_page_text`` returns for its response.
        * ``'responses'``: the ``'http'`` and ``'https'`` responses, ``'auto'``
          (the HTTPS response when an HTTPS session exists, else the HTTP one)
          and ``'cors'`` (the response to a request carrying an ``Origin``
          header).
        * ``'session'``: the session used for the follow-up requests.

        If neither session could be created, ``'resources'`` is empty and every
        other entry except ``'hostname'`` is ``None``.
    """
    retrievals = {
        'hostname': hostname,
        'resources': {},
        'responses': {
            'auto': None,   # whichever of 'http' or 'https' actually works, with 'https' as higher priority
            'cors': None,   # response to the request sent with an Origin header
            'http': None,
            'https': None,
        },
        'session': None,
    }

    # The list of resources to get
    resources = (
        '/clientaccesspolicy.xml',
        '/contribute.json',
        '/crossdomain.xml',
        '/robots.txt',
    )

    # Get the headers from the database unless the caller supplied them
    # NOTE(review): the lookup result is passed through whole here, whereas other
    # callers of select_site_headers in this file index it with 'headers' and
    # 'cookies' - confirm which return shape is current.
    if headers is None:
        headers = select_site_headers(hostname)

    session_kwargs = {'headers': headers}
    if cookies is not None:
        # NOTE(review): assumes __create_session accepts a `cookies` keyword - confirm
        session_kwargs['cookies'] = cookies

    # Create some reusable sessions, one for HTTP and one for HTTPS
    http_session = __create_session('http://' + hostname + '/', **session_kwargs)
    https_session = __create_session('https://' + hostname + '/', **session_kwargs)

    # If neither one works, then the site just can't be loaded
    if not http_session['session'] and not https_session['session']:
        return retrievals

    # Store the HTTP only and HTTPS only responses (some things can only be retrieved over one or the other)
    retrievals['responses']['http'] = http_session['response']
    retrievals['responses']['https'] = https_session['response']

    # HTTPS wins whenever its session exists
    preferred = https_session if https_session['session'] else http_session
    retrievals['responses']['auto'] = preferred['response']
    retrievals['session'] = preferred['session']

    # Store the contents of the base page
    retrievals['resources']['/'] = __get_page_text(retrievals['responses']['auto'])

    # Do a CORS request: same session, with an Origin header attached
    retrievals['responses']['cors'] = __get(retrievals['session'],
                                            headers={'Origin': 'https://http-observatory.security.mozilla.org'})

    # Store all the files we retrieve
    for resource in resources:
        resp = __get(retrievals['session'], resource)
        retrievals['resources'][resource] = __get_page_text(resp)

    return retrievals
def scan(hostname: str, site_id: int, scan_id: int):
    """Run every test against a host and record the outcome for one scan.

    The scan is first marked ``STATE_RUNNING``. The site's stored cookies and
    headers are then used to retrieve its resources, each callable in ``tests``
    is applied to the retrieved data, and the results are written with
    ``insert_test_results``.

    Failure handling:

    * No usable response: the scan is marked ``STATE_FAILED`` with the error
      ``'site down'``.
    * ``SoftTimeLimitExceeded``: the scan is marked ``STATE_ABORTED`` with the
      error ``'site unresponsive'``.
    * ``TimeLimitExceeded``, ``WorkerLostError``, ``WorkerShutdown`` and
      ``WorkerTerminate`` are re-raised untouched.
    * ``IOError``: a message is printed to stderr and the scan state is left
      as it is.
    * Any other ``Exception``: the scan is marked ``STATE_FAILED`` with the
      ``repr`` of the exception. Interpreter-exit signals such as
      ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught.

    Args:
        hostname: Host to scan.
        site_id: Site identifier passed to ``insert_test_results``.
        scan_id: Scan identifier whose state is updated and whose results are
            stored.
    """
    try:
        # Once celery kicks off the task, let's update the scan state from PENDING to RUNNING
        update_scan_state(scan_id, STATE_RUNNING)

        # Get the site's cookies and headers
        headers = select_site_headers(hostname)

        # Attempt to retrieve all the resources
        reqs = retrieve_all(hostname, cookies=headers['cookies'], headers=headers['headers'])
        auto_response = reqs['responses']['auto']

        # If we can't connect at all, let's abort the test
        if auto_response is None:
            update_scan_state(scan_id, STATE_FAILED, error='site down')
            return

        # Run each test against the retrieved data and store the results
        # TODO: Get overridden expectations
        insert_test_results(site_id,
                            scan_id,
                            [test(reqs) for test in tests],
                            sanitize_headers(auto_response.headers),
                            auto_response.status_code)

    # catch the celery timeout, which will almost certainly occur in retrieve_all()
    except SoftTimeLimitExceeded:
        update_scan_state(scan_id, STATE_ABORTED, error='site unresponsive')

    # these must reach celery untouched, so keep them ahead of the generic handler
    except (TimeLimitExceeded, WorkerLostError, WorkerShutdown, WorkerTerminate):
        raise

    # the database is down, oh no!
    # NOTE(review): on Python 3 IOError is an alias of OSError, so this branch is
    # broader than database failures - confirm that is intended.
    except IOError:
        print('database down, aborting scan on {hostname}'.format(hostname=hostname), file=sys.stderr)

    # Exception rather than a bare except, so interpreter-exit signals are not recorded as scan failures
    except Exception as exc:
        # TODO: have more specific error messages
        # If we are unsuccessful, close out the scan in the database
        update_scan_state(scan_id, STATE_FAILED, error=repr(exc))

        # Print the exception to stderr if we're in dev
        if DEVELOPMENT_MODE:
            import traceback
            print('Error detected in scan for : ' + hostname, file=sys.stderr)
            traceback.print_exc(file=sys.stderr)