# assumes module-level imports of re and the webxray MySQLDriver,
# OutputStore, and PhantomDriver classes
def process_uri(self, uri):
    sql_driver     = MySQLDriver(self.db_name)
    output_store   = OutputStore(self.db_name)
    phantom_driver = PhantomDriver('--ignore-ssl-errors=true --ssl-protocol=any', 'wbxr_logger.js')

    # this can be higher or lower depending on network load
    # generally, 90 seems to be fine, so keep with it
    try:
        phantom_output = phantom_driver.execute(uri, 90)
    except:
        print("\t\t%-50s Phantomjs Did Not Return." % uri[:50])
        sql_driver.log_error(uri, "FAIL: Phantomjs Did Not Return.")
        return

    if re.match('^FAIL.+', phantom_output):
        print("\t\t%-50s Phantom Error\n\t%s" % (uri[:50], phantom_output))
        sql_driver.log_error(uri, phantom_output)
    else:
        print("\t\t%-50s %s" % (uri[:50], output_store.store(uri, phantom_output)))

    # closes our db connections
    sql_driver.close()
    output_store.close()
    return
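# ------------------------------------------------------------------
# Hedged sketch, not webxray's actual PhantomDriver: one way a driver
# like the one used above could shell out to phantomjs with a per-page
# timeout. The flags and the 'wbxr_logger.js' script name are taken
# from the call in process_uri(); the function name run_phantom and
# the subprocess approach are assumptions for illustration only.
# ------------------------------------------------------------------
import subprocess

def run_phantom(uri, timeout_seconds=90):
    try:
        output = subprocess.check_output(
            ['phantomjs', '--ignore-ssl-errors=true', '--ssl-protocol=any',
             'wbxr_logger.js', uri],
            timeout=timeout_seconds)
        return output.decode('utf-8', 'ignore')
    except subprocess.TimeoutExpired:
        # mirrors the "Phantomjs Did Not Return" branch in process_uri()
        return 'FAIL: phantomjs did not return within %s seconds' % timeout_seconds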
# assumes module-level imports of json and the webxray OutputStore class
def store_result(self, params):
    """
    Handles storing task_result and removing jobs from the task_queue.
    """

    # unpack params
    target      = params['target']
    task        = params['task']
    task_result = params['task_result']
    client_id   = params['client_id']

    # client_ip is optional
    if 'client_ip' in params:
        client_ip = params['client_ip']
    else:
        client_ip = None

    # if db_name is specified we are running in server mode and we
    #   connect to the db which corresponds to the result being
    #   processed.  otherwise, we use the global db_name as we are
    #   running in non-server mode.
    if 'db_name' in params:
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(params['db_name'])
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(params['db_name'])
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()
        output_store = OutputStore(params['db_name'], self.db_engine)
    else:
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % self.db_engine)
            quit()
        output_store = OutputStore(self.db_name, self.db_engine)

    if task == 'get_policy':
        store_result = output_store.store_policy(task_result, client_id, client_ip=client_ip)

        # we never retry policies
        sql_driver.remove_task_from_queue(target, task)

        if store_result['success']:
            result = {'success': True}
        else:
            # log error
            sql_driver.log_error({
                'client_id' : client_id,
                'task'      : task,
                'target'    : target,
                'msg'       : 'output_store fail on ' + store_result['result']
            })
            result = {'success': False, 'result': store_result['result']}
    # elif task == 'get_crawl' or task == 'get_random_crawl':
    else:
        all_crawls_ok = True

        # We want to be able to re-run random crawls, and to do so we make sure
        #   the crawl_id will match
        if task == 'get_crawl' or task == 'get_scan':
            crawl_id = target
        elif task == 'get_random_crawl':
            crawl_id = []
            for result in task_result:
                crawl_id.append(result['start_url'])
            crawl_id = json.dumps(crawl_id)

        # tweak to account for differences between scans/crawls
        if task == 'get_scan':
            task_result = [task_result]

        # keep track of domains
        all_3p_cookie_domains      = set()
        all_3p_dom_storage_domains = set()
        all_3p_request_domains     = set()
        all_3p_response_domains    = set()
        all_3p_websocket_domains   = set()

        # When we store a crawl we add optional fields in the page table
        #   that allow us to connect the page loads into a single crawl.
        #   the crawl_id is a hash of the target (which is a json string
        #   derived from the url_list), and the crawl_timestamp which is
        #   the first accessed time from the crawl.
        for crawl_sequence, result in enumerate(task_result):
            store_result = output_store.store_scan({
                'browser_output'  : result,
                'client_id'       : client_id,
                'crawl_id'        : crawl_id,
                'crawl_timestamp' : task_result[0]['accessed'],
                'crawl_sequence'  : crawl_sequence,
                'client_ip'       : client_ip
            })

            if store_result['success'] != True:
                all_crawls_ok = False
            else:
                # we are successful, create entries in page_lookup table
                page_lookup_table = self.build_lookup_table('page', store_result['page_id'], {
                    'requests'    : store_result['page_3p_request_domains'],
                    'responses'   : store_result['page_3p_response_domains'],
                    'websockets'  : store_result['page_3p_websocket_domains'],
                    'dom_storage' : store_result['page_3p_dom_storage_domains'],
                    'cookies'     : store_result['page_3p_dom_storage_domains']
                })

                for lookup_item in page_lookup_table:
                    sql_driver.add_page_id_domain_lookup_item(page_lookup_table[lookup_item])

                # we are also making a lookup table for the crawl, keep joining the
                #   sets as we go along
                all_3p_request_domains.update(store_result['page_3p_request_domains'])
                all_3p_response_domains.update(store_result['page_3p_response_domains'])
                all_3p_websocket_domains.update(store_result['page_3p_websocket_domains'])
                all_3p_dom_storage_domains.update(store_result['page_3p_dom_storage_domains'])
                all_3p_cookie_domains.update(store_result['page_3p_dom_storage_domains'])

        if all_crawls_ok:
            sql_driver.remove_task_from_queue(target, task)
            result = {'success': True}

            # build crawl lookup table
            crawl_lookup_table = self.build_lookup_table('crawl', crawl_id, {
                'requests'    : all_3p_request_domains,
                'responses'   : all_3p_response_domains,
                'websockets'  : all_3p_websocket_domains,
                'dom_storage' : all_3p_dom_storage_domains,
                'cookies'     : all_3p_cookie_domains
            })

            # patch lookup table
            for lookup_item in crawl_lookup_table:
                sql_driver.add_crawl_id_domain_lookup_item(crawl_lookup_table[lookup_item])
        else:
            sql_driver.unlock_task_in_queue(target, task)

            # log error
            sql_driver.log_error({
                'client_id' : client_id,
                'task'      : task,
                'target'    : target,
                'msg'       : 'output_store fail to store all scans for crawl_id_target ' + target
            })
            result = {'success': False, 'result': 'unable to store all crawl loads'}

    # tidy up
    output_store.close()
    sql_driver.close()

    # done
    return result
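# ------------------------------------------------------------------
# Hedged usage sketch: the shape of the params dict that store_result()
# consumes, based only on the keys unpacked above. All values are
# placeholders; 'client_ip' and 'db_name' are optional, and 'db_name'
# is only passed when running in server mode. `collector` stands in
# for a hypothetical instance of the class defining store_result().
# ------------------------------------------------------------------
example_params = {
    'target'      : 'https://example.com',  # url, or json url_list for crawls
    'task'        : 'get_scan',             # also: 'get_crawl', 'get_random_crawl', 'get_policy'
    'task_result' : {},                     # placeholder for the browser output
    'client_id'   : 'local_client',
    'client_ip'   : None,                   # optional
    # 'db_name'   : 'wbxr_demo',            # optional, server mode only
}
# result = collector.store_result(example_params)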