def process_url(self, url):
	"""
	Takes a specified url, loads it in the browser(s), and hands the
	json-formatted output with relevant request data, etc. to
	OutputStore, which puts this data in the db for later analysis.

	Returns None; prints per-browser success/failure and logs errors
	to the db.
	"""

	# set up sql connection used to log errors and do checks
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		# fix: original silently left sql_driver undefined for any other
		# engine, causing a NameError on the first error-logging call
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		exit()

	# output store does the heavy lifting of analyzing browser output and storing to db
	output_store = OutputStore(self.db_engine, self.db_name)

	# support for loading same page with multiple browsers - purposefully undocumented
	for browser_type in self.browser_types:
		# note we need to set up a new browser each time to
		# get a fresh profile
		if browser_type == 'chrome':
			browser_driver = ChromeDriver(ua=self.chrome_ua)

		# attempt to load the page, fail gracefully
		try:
			browser_output = browser_driver.get_webxray_scan_data(url, self.browser_wait)
		except Exception:
			print('\t\t%-50s Browser %s Did Not Return' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# if there was a problem we log the error
		if browser_output['success'] == False:
			print('\t\t%-50s Browser %s Error: %s' % (url[:50], browser_type, browser_output['result']))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

		# no error, treat result as browser output
		browser_output = browser_output['result']

		# attempt to store the output
		if output_store.store(url, browser_output):
			print('\t\t%-50s Success with %s' % (url[:50], browser_type))
		else:
			print('\t\t%-50s Fail with %s' % (url[:50], browser_type))
			sql_driver.log_error(url, 'Unable to load page')
			sql_driver.close()
			return

	# fix: close the error-logging connection on the success path too
	# (the original leaked it when all browsers succeeded)
	sql_driver.close()
	return
def store(self, url, browser_output, store_source=False, store_1p=True):
	"""
	Primary function of this class: takes the url of the given page and
	the request and cookie data generated by the browser, cleans the
	data up with some minor analysis (eg file types), and stores it for
	later in-depth analysis.

	store_source: also keep the page source in the db (off by default).
	store_1p:     store first-party cookies/elements as well as third
	              party (on by default; turn off to save disk space).

	Returns True on success, False if the page domain could not be
	parsed.
	"""

	# open up a sql connection
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	else:
		# fix: original referenced an undefined global 'db_engine' here
		# (NameError) and misspelled 'INVALID' in the message
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		exit()

	# get the ip, fqdn, domain, pubsuffix, and tld
	# we need the domain to figure out if cookies/elements are third-party
	origin_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(url)

	# if we can't get page domain info we fail gracefully
	if origin_info is None:
		sql_driver.log_error(url, 'Could not parse TLD for %s' % url)
		# fix: close the connection before bailing out (was leaked)
		sql_driver.close()
		return False

	origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld = origin_info

	# sql_driver.add_domain both stores the new domain and returns its db
	# row id; if it is already in db it just returns the existing id
	page_domain_id = sql_driver.add_domain(origin_ip, origin_fqdn, origin_domain, origin_pubsuffix, origin_tld)

	# if the final page is https (often after a redirect), mark it appropriately
	page_is_ssl = browser_output['final_url'][:5] == 'https'

	# page source is only kept on request
	source = browser_output['source'] if store_source else None

	# add page
	page_id = sql_driver.add_page(
		browser_output['browser_type'],
		browser_output['browser_version'],
		browser_output['browser_wait'],
		browser_output['title'],
		browser_output['meta_desc'],
		url,
		browser_output['final_url'],
		page_is_ssl,
		source,
		browser_output['load_time'],
		page_domain_id
	)

	# store cookies
	for cookie in browser_output['cookies']:
		self._store_cookie(sql_driver, url, page_id, origin_domain, cookie, store_1p)

	# process requests now
	for request in browser_output['processed_requests']:
		# if the request starts with the following we can't parse anyway, so skip
		if re.match(r'^(data|about|chrome|blob).+', request):
			continue
		self._store_element(sql_driver, url, page_id, origin_domain, request, browser_output['processed_requests'][request], store_1p)

	# close db connection
	sql_driver.close()
	return True

def _store_cookie(self, sql_driver, url, page_id, origin_domain, cookie, store_1p):
	"""
	Clean up and store a single browser cookie record; silently skips
	the cookie when its domain cannot be parsed, it has no name, or it
	is first-party while store_1p is off.
	"""

	# get the ip, fqdn, domain, pubsuffix, and tld so we can figure out
	# if the cookie is third-party
	# note: url_parser fails on non-http, we should fix this, right now
	# a lame hack is to prepend http://
	cookie_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld('http://'+cookie['domain'])

	# something went wrong, log and fail gracefully
	if cookie_info is None:
		sql_driver.log_error(url, 'Error parsing cookie with domain: '+cookie['domain'])
		return

	# otherwise, everything went fine
	cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld = cookie_info

	# mark third-party cookies
	is_3p_cookie = origin_domain != cookie_domain

	# this is a first party cookie, see if we want to store it
	if is_3p_cookie is False and store_1p is False:
		return

	# sql_driver.add_domain both stores the new domain and returns its id
	cookie_domain_id = sql_driver.add_domain(cookie_ip, cookie_fqdn, cookie_domain, cookie_pubsuffix, cookie_tld)

	# name is required, skip the cookie when missing; the remaining
	# fields are optional and default to null
	if 'name' not in cookie:
		return

	sql_driver.add_cookie(
		page_id,
		cookie['name'],
		cookie.get('secure'),
		cookie.get('path'),
		cookie_domain,
		cookie.get('httponly'),
		cookie.get('expiry'),
		cookie.get('value'),
		is_3p_cookie,
		cookie_domain_id
	)

def _store_element(self, sql_driver, url, page_id, origin_domain, request, request_data, store_1p):
	"""
	Clean up and store a single request/element record; silently skips
	the element when its url cannot be parsed or it is first-party
	while store_1p is off.
	"""

	# get the ip, fqdn, domain, pubsuffix, and tld so we can figure out
	# if the element is third-party
	element_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(request)

	# problem with this request, log and fail gracefully
	if element_info is None:
		sql_driver.log_error(url, 'Error parsing element request: '+request)
		return

	element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld = element_info

	# sql_driver.add_domain both stores the new domain and returns its db row id
	element_domain_id = sql_driver.add_domain(element_ip, element_fqdn, element_domain, element_pubsuffix, element_tld)

	# mark third-party elements based on domain
	is_3p_element = origin_domain != element_domain

	# if we are not storing 1p elements skip
	if is_3p_element is False and store_1p is False:
		return

	# https and secure websockets both count as ssl
	element_is_ssl = request[:5] == 'https' or request[:3] == 'wss'

	received = request_data.get('received')

	# get domain of referer and determine if page leaked by referer
	referer = request_data.get('referer')
	if referer:
		referer_info = self.url_parser.get_ip_fqdn_domain_pubsuffix_tld(referer)
		if referer_info:
			page_domain_in_referer = referer_info[2] == origin_domain
		else:
			page_domain_in_referer = None
			sql_driver.log_error(url, 'Error parsing referer header: '+referer)
	else:
		page_domain_in_referer = None

	# optional timing/response fields default to null
	start_time_offset = request_data.get('start_time_offset')
	load_time = request_data.get('load_time')
	status = request_data.get('status')
	status_text = request_data.get('status_text')
	content_type = request_data.get('content_type')
	body_size = request_data.get('body_size')

	# headers are stored as their string representation when present
	request_headers = str(request_data['request_headers']) if 'request_headers' in request_data else None
	response_headers = str(request_data['response_headers']) if 'response_headers' in request_data else None

	# consider anything before the "?" to be the element_url and
	# anything after (including the "?") to be the args
	url_match = re.search(r'^(.+?)\?.+$', request)
	element_url = url_match.group(1) if url_match else request
	args_match = re.search(r'^.+(\?.+)$', request)
	element_args = args_match.group(1) if args_match else None

	# attempt to parse off the extension
	extension_match = re.search(r'\.([0-9A-Za-z]+)$', element_url)
	element_extension = extension_match.group(1).lower() if extension_match else None

	# lists of common extensions, can be expanded
	image_extensions = ['png', 'jpg', 'jpgx', 'jpeg', 'gif', 'svg', 'bmp', 'tif', 'tiff', 'webp', 'srf']
	script_extensions = ['js', 'javascript']
	data_extensions = ['json', 'jsonp', 'xml']
	font_extentions = ['woff', 'ttf', 'otf']
	static_extentions = ['html', 'htm', 'shtml']
	dynamic_extentions = ['php', 'asp', 'jsp', 'aspx', 'ashx', 'pl', 'cgi', 'fcgi']

	# figure out what type of element it is
	if element_extension in image_extensions:
		element_type = 'image'
	elif element_extension in script_extensions:
		element_type = 'javascript'
	elif element_extension in data_extensions:
		element_type = 'data_structured'
	elif element_extension == 'css':
		element_type = 'style_sheet'
	elif element_extension in font_extentions:
		element_type = 'font'
	elif element_extension in static_extentions:
		element_type = 'page_static'
	elif element_extension in dynamic_extentions:
		# fix: original compared the string to the whole list with '==',
		# so 'page_dynamic' was never assigned
		element_type = 'page_dynamic'
	elif element_extension == 'swf' or element_extension == 'fla':
		element_type = 'Shockwave Flash'
	else:
		element_type = None

	# final task is to truncate the request if it is over 2k characters
	# as it is likely binary data and may cause problems inserting
	# into TEXT fields in database
	#
	# TODO: better handle binary data in general
	if len(request) >= 2000: request = request[:2000]
	if len(element_url) >= 2000: element_url = element_url[:2000]

	# store request
	sql_driver.add_element(
		page_id,
		request,
		element_url,
		is_3p_element,
		element_is_ssl,
		received,
		referer,
		page_domain_in_referer,
		start_time_offset,
		load_time,
		status,
		status_text,
		content_type,
		body_size,
		request_headers,
		response_headers,
		element_extension,
		element_type,
		element_args,
		element_domain_id
	)
def process_tasks_from_queue(self, process_num):
	"""
	Selects the next page from the task_queue and passes it to the
	browser driver, storing the result via store_result.  If a load is
	unsuccessful the page is placed back into the queue and attempts
	are updated.  Returns once there are no pages in the queue under
	max_attempts.
	"""

	print('\t[p.%s]\t🏃♂️ Starting process' % process_num)

	# need a local connection for each queue manager
	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(self.db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(self.db_name)
	else:
		# fix: original referenced an undefined global 'db_engine' here
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# keep getting tasks from queue until none are left at max attempt level
	while sql_driver.get_task_queue_length(max_attempts=self.config['max_attempts'], unlocked_only=True) != 0:
		# it is possible for two processes to both pass the above
		# conditional and then race for the same task; the loser gets an
		# empty result (and raises), so we treat that as "queue drained"
		# and exit gracefully
		try:
			target, task = sql_driver.get_task_from_queue(max_attempts=self.config['max_attempts'], client_id=self.client_id)
		except Exception:
			break

		print('\t[p.%s]\t👉 Initializing: %s for target %s' % (process_num, task, target[:50]))

		# import and set up specified browser driver
		# note we set up a new browser each time to get a fresh profile
		if self.browser_config['client_browser_type'] == 'chrome':
			browser_driver = ChromeDriver(self.browser_config, port_offset=process_num)
		else:
			print(f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!")
			return

		# does the webxray scan or policy capture
		if task == 'get_scan':
			task_result = browser_driver.get_scan(target)
		elif task == 'get_crawl':
			task_result = browser_driver.get_crawl(json.loads(target))
		elif task == 'get_policy':
			task_result = browser_driver.get_scan(target, get_text_only=True)
		elif task == 'get_random_crawl':
			task_result = browser_driver.get_random_crawl(target)
		else:
			# fix: original fell through with task_result undefined and
			# crashed with a NameError on an unknown task type; unlock
			# the task and keep draining the queue instead
			print('\t[p.%s]\t👎 Error: unknown task %s for target %s' % (process_num, task, target[:50]))
			sql_driver.unlock_task_in_queue(target, task)
			del browser_driver
			continue

		# kill browser
		del browser_driver

		# browser has failed to get result, unlock and continue
		if task_result['success'] == False:
			print('\t[p.%s]\t👎 Error: %s %s' % (process_num, target[:50], task_result['result']))

			# for times we don't want to retry, such as a rejected
			# redirect or network resolution failure, this could be expanded
			fail_cases = [
				'reached fail limit',
				'rejecting redirect',
				'did not find enough internal links'
			]

			if task_result['result'] in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result['result']:
				sql_driver.set_task_as_failed(target, task)
			else:
				sql_driver.unlock_task_in_queue(target, task)

			# keep track of error regardless of fail/unlock
			sql_driver.log_error({
				'client_id': 'localhost',
				'target': target,
				'task': task,
				'msg': task_result['result']
			})
			continue

		# debug
		if self.debug:
			print('\t[p.%s]\t📥 Got browser result on task %s, going to store: %s' % (process_num, task, target[:50]))

		# store_result also handles task queue management
		store_result = self.store_result({
			'target': target,
			'task': task,
			'task_result': task_result['result'],
			'client_id': self.client_id
		})

		if store_result['success'] == True:
			print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
		else:
			print(f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}')

	# tidy up
	sql_driver.close()
	del sql_driver

	print('\t[p.%s]\t✋ Completed process' % process_num)
	return
def store_result(self, params):
	"""
	Handles storing a task_result and removing jobs from the task_queue.

	params keys:
		target:      url (or json url list) the task was run against
		task:        one of get_scan/get_crawl/get_policy/get_random_crawl
		task_result: browser output to be stored
		client_id:   id of the client which produced the result
		client_ip:   optional, ip of the client
		db_name:     optional, set only when running in server mode

	Returns {'success': True} or {'success': False, 'result': msg}.
	"""

	# unpack params
	target = params['target']
	task = params['task']
	task_result = params['task_result']
	client_id = params['client_id']

	# client_ip is optional
	client_ip = params.get('client_ip')

	# if db_name is specified we are running in server mode and we
	# connect to the db which corresponds to the result being
	# processed. otherwise, we use the global db_name as we are
	# running in non-server mode.
	db_name = params['db_name'] if 'db_name' in params else self.db_name

	if self.db_engine == 'sqlite':
		from webxray.SQLiteDriver import SQLiteDriver
		sql_driver = SQLiteDriver(db_name)
	elif self.db_engine == 'postgres':
		from webxray.PostgreSQLDriver import PostgreSQLDriver
		sql_driver = PostgreSQLDriver(db_name)
	else:
		# fix: original referenced an undefined global 'db_engine' here
		# (in both duplicated branches, now consolidated)
		print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
		quit()

	# NOTE(review): process_url() constructs OutputStore(engine, name)
	# while here the order is (name, engine) — presumably one call site
	# targets an older OutputStore signature; verify which is current
	output_store = OutputStore(db_name, self.db_engine)

	if task == 'get_policy':
		policy_result = output_store.store_policy(task_result, client_id, client_ip=client_ip)

		# we never retry policies
		sql_driver.remove_task_from_queue(target, task)

		if policy_result['success']:
			result = {'success': True}
		else:
			# log error
			sql_driver.log_error({
				'client_id': client_id,
				'task': task,
				'target': target,
				'msg': 'output_store fail on ' + policy_result['result']
			})
			result = {'success': False, 'result': policy_result['result']}
	else:
		# get_scan, get_crawl, or get_random_crawl
		all_crawls_ok = True

		# we want to be able to re-run random crawls, and to do so we
		# make sure the crawl_id will match
		if task == 'get_crawl' or task == 'get_scan':
			crawl_id = target
		elif task == 'get_random_crawl':
			# fix: original reused the name 'result' for this loop
			# variable, shadowing the eventual return value
			crawl_id = json.dumps([scan['start_url'] for scan in task_result])

		# tweak to account for differences between scans/crawls
		if task == 'get_scan':
			task_result = [task_result]

		# keep track of 3p domains seen across all loads in the crawl
		all_3p_cookie_domains = set()
		all_3p_dom_storage_domains = set()
		all_3p_request_domains = set()
		all_3p_response_domains = set()
		all_3p_websocket_domains = set()

		# When we store a crawl we add optional fields in the page table
		# that allow us to connect the page loads into a single crawl.
		# the crawl_id is a hash of the target (which is a json string
		# derived from the url_list), and the crawl_timestamp which is the
		# first accessed time from the crawl.
		for crawl_sequence, browser_result in enumerate(task_result):
			scan_result = output_store.store_scan({
				'browser_output': browser_result,
				'client_id': client_id,
				'crawl_id': crawl_id,
				'crawl_timestamp': task_result[0]['accessed'],
				'crawl_sequence': crawl_sequence,
				'client_ip': client_ip
			})

			if scan_result['success'] != True:
				all_crawls_ok = False
				continue

			# we are successful, create entries in page_lookup table
			# NOTE(review): 'cookies' is fed the dom_storage domain set
			# here and in the accumulation below — this looks like a
			# copy-paste error; confirm whether store_scan returns a
			# 'page_3p_cookie_domains' key that should be used instead
			page_lookup_table = self.build_lookup_table(
				'page', scan_result['page_id'], {
					'requests': scan_result['page_3p_request_domains'],
					'responses': scan_result['page_3p_response_domains'],
					'websockets': scan_result['page_3p_websocket_domains'],
					'dom_storage': scan_result['page_3p_dom_storage_domains'],
					'cookies': scan_result['page_3p_dom_storage_domains']
				})

			for lookup_item in page_lookup_table:
				sql_driver.add_page_id_domain_lookup_item(page_lookup_table[lookup_item])

			# we are also making a lookup table for the crawl, keep
			# joining the sets as we go along
			all_3p_request_domains.update(scan_result['page_3p_request_domains'])
			all_3p_response_domains.update(scan_result['page_3p_response_domains'])
			all_3p_websocket_domains.update(scan_result['page_3p_websocket_domains'])
			all_3p_dom_storage_domains.update(scan_result['page_3p_dom_storage_domains'])
			all_3p_cookie_domains.update(scan_result['page_3p_dom_storage_domains'])

		if all_crawls_ok:
			sql_driver.remove_task_from_queue(target, task)
			result = {'success': True}

			# build crawl lookup table
			crawl_lookup_table = self.build_lookup_table(
				'crawl', crawl_id, {
					'requests': all_3p_request_domains,
					'responses': all_3p_response_domains,
					'websockets': all_3p_websocket_domains,
					'dom_storage': all_3p_dom_storage_domains,
					'cookies': all_3p_cookie_domains
				})

			# patch lookup table
			for lookup_item in crawl_lookup_table:
				sql_driver.add_crawl_id_domain_lookup_item(crawl_lookup_table[lookup_item])
		else:
			sql_driver.unlock_task_in_queue(target, task)

			# log error
			sql_driver.log_error({
				'client_id': client_id,
				'task': task,
				'target': target,
				'msg': 'output_store fail to store all scans for crawl_id_target ' + target
			})
			result = {
				'success': False,
				'result': 'unable to store all crawl loads'
			}

	# tidy up
	output_store.close()
	sql_driver.close()

	# done
	return result