示例#1
0
    def process_tasks_from_queue(self, process_num):
        """
		Selects the next page from the task_queue and passes to 
			process_url.  If load is unsucessful places page
			back into queue and updates attempts.  Returns once 
			when there are no pages in the queue under max_attempts.
		"""

        print('\t[p.%s]\t🏃‍♂️ Starting process' % process_num)

        # need a local connection for each queue manager
        if self.db_engine == 'sqlite':
            from webxray.SQLiteDriver import SQLiteDriver
            sql_driver = SQLiteDriver(self.db_name)
        elif self.db_engine == 'postgres':
            from webxray.PostgreSQLDriver import PostgreSQLDriver
            sql_driver = PostgreSQLDriver(self.db_name)
        else:
            print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
            quit()

        # keep getting tasks from queue until none are left at max attempt level
        while sql_driver.get_task_queue_length(
                max_attempts=self.config['max_attempts'],
                unlocked_only=True) != 0:
            # it is possible for two processes to both pass the above conditional
            #	and then try to get a task from the queue at the same time.
            #	however, the second process that attempts to get a task will
            #	get an empty result (and crash), so we have a try/except block here
            #	to handle that condition gracefully
            try:
                target, task = sql_driver.get_task_from_queue(
                    max_attempts=self.config['max_attempts'],
                    client_id=self.client_id)
            except:
                break

            print('\t[p.%s]\t👉 Initializing: %s for target %s' %
                  (process_num, task, target[:50]))

            # import and set up specified browser driver
            # 	note we set up a new browser each time to
            #	get a fresh profile
            if self.browser_config['client_browser_type'] == 'chrome':
                browser_driver = ChromeDriver(self.browser_config,
                                              port_offset=process_num)
            else:
                print(
                    f"🥴 INVALID BROWSER TYPE for {self.browser_config['client_browser_type']}!"
                )
                return

            # does the webxray scan or policy capture
            if task == 'get_scan':
                task_result = browser_driver.get_scan(target)
            elif task == 'get_crawl':
                task_result = browser_driver.get_crawl(json.loads(target))
            elif task == 'get_policy':
                task_result = browser_driver.get_scan(target,
                                                      get_text_only=True)
            elif task == 'get_random_crawl':
                task_result = browser_driver.get_random_crawl(target)

            # kill browser
            del browser_driver

            # browser has failed to get result, unlock and continue
            if task_result['success'] == False:
                print('\t[p.%s]\t👎 Error: %s %s' %
                      (process_num, target[:50], task_result['result']))

                # for times we don't want to retry, such as a rejected
                #	redirect or network resolution failure, this could be expanded
                fail_cases = [
                    'reached fail limit', 'rejecting redirect',
                    'did not find enough internal links'
                ]

                if task_result[
                        'result'] in fail_cases or 'ERR_NAME_NOT_RESOLVED' in task_result[
                            'result']:
                    sql_driver.set_task_as_failed(target, task)
                else:
                    sql_driver.unlock_task_in_queue(target, task)

                # keep track of error regardless of fail/unlock
                sql_driver.log_error({
                    'client_id': 'localhost',
                    'target': target,
                    'task': task,
                    'msg': task_result['result']
                })
                continue

            # debug
            if self.debug:
                print(
                    '\t[p.%s]\t📥 Got browser result on task %s, going to store: %s'
                    % (process_num, task, target[:50]))

            # store_result also handles task queue mangement
            store_result = self.store_result({
                'target':
                target,
                'task':
                task,
                'task_result':
                task_result['result'],
                'client_id':
                self.client_id
            })

            if store_result['success'] == True:
                print(f'\t[p.{process_num}]\t👍 Success: {target[:50]}')
            else:
                print(
                    f'\t[p.{process_num}]\t👎 Error: {target[:50]} {store_result["result"]}'
                )

        # tidy up
        sql_driver.close()
        del sql_driver

        print('\t[p.%s]\t✋ Completed process' % process_num)
        return
示例#2
0
    def store_result(self, params):
        """
		Handles storing task_result and removing jobs
			from the task_queue.
		"""

        # unpack params
        target = params['target']
        task = params['task']
        task_result = params['task_result']
        client_id = params['client_id']

        # client_ip is optional
        if 'client_ip' in params:
            client_ip = params['client_ip']
        else:
            client_ip = None

        # if db_name is specified we are running in server mode and we
        #	connect to the db which corresponds to the result being
        #	processed.  otherwise, we use the global db_name as we are
        #	running in non-server mode.
        if 'db_name' in params:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(params['db_name'])
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(params['db_name'])
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()
            output_store = OutputStore(params['db_name'], self.db_engine)
        else:
            if self.db_engine == 'sqlite':
                from webxray.SQLiteDriver import SQLiteDriver
                sql_driver = SQLiteDriver(self.db_name)
            elif self.db_engine == 'postgres':
                from webxray.PostgreSQLDriver import PostgreSQLDriver
                sql_driver = PostgreSQLDriver(self.db_name)
            else:
                print('INVALID DB ENGINE FOR %s, QUITTING!' % db_engine)
                quit()

            output_store = OutputStore(self.db_name, self.db_engine)

        if task == 'get_policy':
            store_result = output_store.store_policy(task_result,
                                                     client_id,
                                                     client_ip=client_ip)
            # we never retry policies
            sql_driver.remove_task_from_queue(target, task)
            if store_result['success']:
                result = {'success': True}
            else:
                # log error
                sql_driver.log_error({
                    'client_id':
                    client_id,
                    'task':
                    task,
                    'target':
                    target,
                    'msg':
                    'output_store fail on ' + store_result['result']
                })
                result = {'success': False, 'result': store_result['result']}
        # elif task == 'get_crawl' or task == 'get_random_crawl':
        else:
            all_crawls_ok = True

            # We want to be able to re-run random crawls, and to do so we make sure
            #	the crawl_id will match
            if task == 'get_crawl' or task == 'get_scan':
                crawl_id = target
            elif task == 'get_random_crawl':
                crawl_id = []
                for result in task_result:
                    crawl_id.append(result['start_url'])
                crawl_id = json.dumps(crawl_id)

            # tweak to account for differences between scans/crawls
            if task == 'get_scan': task_result = [task_result]

            # keep track of domains
            all_3p_cookie_domains = set()
            all_3p_dom_storage_domains = set()
            all_3p_request_domains = set()
            all_3p_response_domains = set()
            all_3p_websocket_domains = set()

            # When we store a crawl we add optional fields in the page table
            #	that allow us to connect the page loads into a single crawl.
            #	the crawl_id is a hash of the target (which is a json string
            #	derived from the url_list), and the crawl_timestamp which is the
            #	first accessed time from the crawl.
            for crawl_sequence, result in enumerate(task_result):
                store_result = output_store.store_scan({
                    'browser_output':
                    result,
                    'client_id':
                    client_id,
                    'crawl_id':
                    crawl_id,
                    'crawl_timestamp':
                    task_result[0]['accessed'],
                    'crawl_sequence':
                    crawl_sequence,
                    'client_ip':
                    client_ip
                })

                if store_result['success'] != True:
                    all_crawls_ok = False
                else:
                    # we are successful, create entries in page_lookup table
                    page_lookup_table = self.build_lookup_table(
                        'page', store_result['page_id'], {
                            'requests':
                            store_result['page_3p_request_domains'],
                            'responses':
                            store_result['page_3p_response_domains'],
                            'websockets':
                            store_result['page_3p_websocket_domains'],
                            'dom_storage':
                            store_result['page_3p_dom_storage_domains'],
                            'cookies':
                            store_result['page_3p_dom_storage_domains']
                        })

                    for lookup_item in page_lookup_table:
                        sql_driver.add_page_id_domain_lookup_item(
                            page_lookup_table[lookup_item])

                    # we are also making a lookup table for the crawl, keep joing the
                    #	sets as we go along
                    all_3p_request_domains.update(
                        store_result['page_3p_request_domains'])
                    all_3p_response_domains.update(
                        store_result['page_3p_response_domains'])
                    all_3p_websocket_domains.update(
                        store_result['page_3p_websocket_domains'])
                    all_3p_dom_storage_domains.update(
                        store_result['page_3p_dom_storage_domains'])
                    all_3p_cookie_domains.update(
                        store_result['page_3p_dom_storage_domains'])

            if all_crawls_ok:
                sql_driver.remove_task_from_queue(target, task)
                result = {'success': True}

                # build crawl lookup table
                crawl_lookup_table = self.build_lookup_table(
                    'crawl', crawl_id, {
                        'requests': all_3p_request_domains,
                        'responses': all_3p_response_domains,
                        'websockets': all_3p_websocket_domains,
                        'dom_storage': all_3p_dom_storage_domains,
                        'cookies': all_3p_cookie_domains
                    })

                # patch lookup table
                for lookup_item in crawl_lookup_table:
                    sql_driver.add_crawl_id_domain_lookup_item(
                        crawl_lookup_table[lookup_item])

            else:
                sql_driver.unlock_task_in_queue(target, task)
                # log error
                sql_driver.log_error({
                    'client_id':
                    client_id,
                    'task':
                    task,
                    'target':
                    target,
                    'msg':
                    'output_store fail to store all scans for crawl_id_target '
                    + target
                })
                result = {
                    'success': False,
                    'result': 'unable to store all crawl loads'
                }

        # tidy up
        output_store.close()
        sql_driver.close()

        # done
        return result