def test_increase_number_of_workers(self):
    """
    Verify that set_worker_count() grows the pool from 2 to 4 workers.

    The new size is not visible right after the call: get_worker_count()
    still reports 2 until another batch of tasks has been processed.
    NOTE(review): presumably the extra workers are only started when the
    pool replaces workers that reached maxtasksperchild -- confirm against
    the Pool implementation.
    """
    def noop():
        return 1 + 2

    def run_batch():
        # Run the tasks one at a time, waiting for each result
        for _ in xrange(12):
            result = worker_pool.apply_async(func=noop)
            self.assertEqual(result.get(), 3)

    worker_pool = Pool(processes=2,
                       worker_names='WorkerThread',
                       maxtasksperchild=3)

    # The try/finally guarantees the workers are stopped even when one of
    # the assertions fails, otherwise they would outlive this test
    try:
        self.assertEqual(worker_pool.get_worker_count(), 2)

        run_batch()
        self.assertEqual(worker_pool.get_worker_count(), 2)

        worker_pool.set_worker_count(4)

        # It takes some time...
        self.assertEqual(worker_pool.get_worker_count(), 2)

        run_batch()
        self.assertEqual(worker_pool.get_worker_count(), 4)
    finally:
        worker_pool.terminate()
        worker_pool.join()
def test_multiple_append_uniq_group(self):
    """
    Run kb.append_uniq_group() from several pool tasks and check that all
    the appended items end up in exactly one info set, which holds
    InfoSet.MAX_INFO_INSTANCES infos.
    """
    def multi_append():
        # Append twice as many items as MAX_INFO_INSTANCES
        for _ in xrange(InfoSet.MAX_INFO_INSTANCES * 2):
            vuln = MockVuln()
            kb.append_uniq_group('a', 'b', vuln, group_klass=MockInfoSetTrue)

        info_set_list = kb.get('a', 'b')
        self.assertEqual(len(info_set_list), 1)

        info_set = info_set_list[0]
        self.assertEqual(len(info_set.infos), InfoSet.MAX_INFO_INSTANCES)

        return True

    pool = Pool(2)

    # r.get() fails the test when multi_append() failed in a worker, the
    # try/finally makes sure the pool is stopped in that case too
    try:
        r1 = pool.apply_async(multi_append)
        r2 = pool.apply_async(multi_append)
        r3 = pool.apply_async(multi_append)

        self.assertTrue(r1.get())
        self.assertTrue(r2.get())
        self.assertTrue(r3.get())
    finally:
        pool.terminate()
        pool.join()
def test_pickleable_shells(self):
    """
    Store a Shell in the knowledge base and read it back: the retrieved
    shell must compare equal to the original one, but its worker_pool and
    _uri_opener attributes must be None.
    """
    pool = Pool(1)
    xurllib = ExtendedUrllib()

    # Release the pool and the url opener even when an assertion fails
    try:
        original_shell = Shell(MockVuln(), xurllib, pool)

        kb.append('a', 'b', original_shell)
        unpickled_shell = kb.get('a', 'b')[0]

        self.assertEqual(original_shell, unpickled_shell)
        self.assertEqual(unpickled_shell.worker_pool, None)
        self.assertEqual(unpickled_shell._uri_opener, None)
    finally:
        pool.terminate()
        pool.join()
        xurllib.end()
def test_max_queued_tasks(self):
    """
    With one worker and max_queued_tasks=2, queueing a fifth two-second
    task must block the calling thread for roughly two seconds.
    """
    sleep_args = (2, )
    worker_pool = Pool(processes=1, max_queued_tasks=2)

    # These tasks should be queued very fast
    for _ in range(4):
        worker_pool.apply_async(func=time.sleep, args=sleep_args)

    # Now the pool is full and we need to wait in the main
    # thread to get the task queued
    start = time.time()
    worker_pool.apply_async(func=time.sleep, args=sleep_args)
    spent = time.time() - start

    worker_pool.close()
    worker_pool.join()

    self.assertLess(spent, 2.1)
    self.assertGreater(spent, 1.9)
def test_max_queued_tasks(self):
    """
    Check the back-pressure applied by max_queued_tasks: once the pool is
    full, apply_async() blocks the caller until a slot is available.
    """
    worker_pool = Pool(processes=1, max_queued_tasks=2)

    def queue_sleep_task():
        worker_pool.apply_async(func=time.sleep, args=(2,))

    # These tasks should be queued very fast
    queue_sleep_task()
    queue_sleep_task()
    queue_sleep_task()
    queue_sleep_task()

    # Now the pool is full and we need to wait in the main
    # thread to get the task queued
    start = time.time()
    queue_sleep_task()
    spent = time.time() - start

    worker_pool.close()
    worker_pool.join()

    # The wait is expected to last about as long as one sleep task
    self.assertLess(spent, 2.1)
    self.assertGreater(spent, 1.9)
def get_fingerprint(url, threads):
    """
    Run every HMap probe against the URL using a worker pool and return
    the fingerprint.

    NOTE(review): `fingerprint` is not created here, it is a name defined
    outside of this function; presumably the probes store their findings
    in it -- confirm against the probe implementations.

    :param url: The URL which is passed as the only argument to each probe
    :param threads: Number of worker processes for the pool
    :return: The fingerprint, after winnowing its SYNTACTIC/HEADER_ORDER
             entry
    """
    probes = {
        basic_get,
        basic_options,
        unknown_method,
        unauthorized_activity,
        nonexistant_object,
        malformed_method_line,
        long_url_ranges,
        long_default_ranges,
        many_header_ranges,
        large_header_ranges,
        unavailable_accept,
        fake_content_length
    }

    pool = Pool(worker_names='HMap',
                maxtasksperchild=2,
                processes=threads,
                max_queued_tasks=5)

    # The AsyncResult objects are discarded, only the pool is waited for
    for probe in probes:
        pool.apply_async(func=probe, args=(url, ))

    pool.close()
    pool.join()
    pool.terminate()

    syntactic = fingerprint['SYNTACTIC']
    syntactic['HEADER_ORDER'] = winnow_ordered_list(syntactic['HEADER_ORDER'])

    return fingerprint
class BaseConsumer(Process):
    """
    Consumer thread that takes fuzzable requests from a Queue that's populated
    by the crawl plugins and identifies vulnerabilities by performing various
    requests.

    Subclasses need to implement get_name(), _teardown() and _consume().
    """
    # Pool size used when no thread_pool_size is received in __init__
    THREAD_POOL_SIZE = 10

    def __init__(self, consumer_plugins, w3af_core, thread_name,
                 create_pool=True, max_pool_queued_tasks=0,
                 max_in_queue_size=0, thread_pool_size=None):
        """
        :param consumer_plugins: Instances of base_consumer plugins in a list
        :param w3af_core: The w3af core that we'll use for status reporting
        :param thread_name: How to name the current thread, eg. Auditor
        :param create_pool: True to create a worker pool for this consumer
        :param max_pool_queued_tasks: Sent to the worker pool as
                                      max_queued_tasks
        :param max_in_queue_size: maxsize for the input CachedQueue
        :param thread_pool_size: Number of workers in the pool, when it is
                                 None (or 0) THREAD_POOL_SIZE is used
        """
        super(BaseConsumer, self).__init__(name='%sController' % thread_name)

        self.in_queue = CachedQueue(maxsize=max_in_queue_size,
                                    name=thread_name + 'In')

        #
        # Crawl and infrastructure plugins write to this queue using:
        #
        #       self.output_queue.put(fuzz_req)
        #
        # The strategy will read items from this queue in a tight loop using:
        #
        #       result_item = url_producer.get_result(timeout=0.1)
        #
        # And write them to self.in_queue (defined above) for all the url
        # consumers
        #
        # Since this queue is read in a tight loop, items that are written
        # here will, in theory, not stay in memory for long.
        #
        # Also, items written here are fuzzable requests, which shouldn't use
        # a lot of memory.
        #
        # The only scenario I can think of where this queue is full of items
        # is one where the strategy loop is slow / delayed and the crawl
        # plugins are all finding many new URLs and forms.
        #
        # Tests showed something like this for a common site:
        #
        # [Thu Feb 15 16:45:36 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 19.
        # [Thu Feb 15 16:45:36 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 28.
        # [Thu Feb 15 16:45:37 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 27.
        # ...
        # [Thu Feb 15 16:45:52 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 1.
        #
        # This was with a max_in_queue_size of 100 set for the CachedQueue
        # defined below.
        #
        # Meaning that:
        #   * There were 119 items in the queue (100 in memory) in the first
        #     log line
        #   * Also at 16:45:36, there were 128 items in the queue (100 in
        #     memory)
        #   * It took 16 seconds to consume 28 items from the queue (from
        #     second 36 to second 52)
        #
        # This surprises me a little bit. I expected this queue to have fewer
        # items in memory. Since I want to reduce the memory usage in the
        # framework, the maxsize sent to this CachedQueue was lowered.
        #
        # NOTE(review): this comment used to announce a maxsize of 50, the
        # value in the code is 75 -- confirm which one is the intended one.
        #
        # But just in case I'm using a CachedQueue!
        self._out_queue = CachedQueue(maxsize=75, name=thread_name + 'Out')

        self._thread_name = thread_name
        self._consumer_plugins = consumer_plugins
        self._w3af_core = w3af_core
        self._observers = []
        # function_id -> 1 for each task that was added and is not done yet
        self._tasks_in_progress = {}
        # Set in join(), makes in_queue_put() ignore any new work
        self._poison_pill_sent = False
        # Set by run() after the POISON_PILL is processed
        self._has_finished = False

        self._threadpool = None

        if create_pool:
            self._threadpool = Pool(thread_pool_size or self.THREAD_POOL_SIZE,
                                    worker_names='%sWorker' % thread_name,
                                    max_queued_tasks=max_pool_queued_tasks)

    def get_pool(self):
        """
        :return: The worker pool, None if it was never created or if it
                 was already closed by run()
        """
        return self._threadpool

    def get_name(self):
        raise NotImplementedError

    def set_has_finished(self):
        """
        Flag this consumer as finished and clear the queue speed data that
        is kept by the strategy.
        """
        self._has_finished = True
        self._w3af_core.strategy.clear_queue_speed_data()

    def run(self):
        """
        Consume the queue items, sending them to the plugins which are then
        going to find vulnerabilities, new URLs, etc.

        The loop ends when POISON_PILL is read from the input queue.
        """
        while True:

            try:
                work_unit = self.in_queue.get()
            except KeyboardInterrupt:
                # https://github.com/andresriancho/w3af/issues/9587
                #
                # If we don't do this, the thread will die and will never
                # process the POISON_PILL, which will end up in an endless
                # wait for .join()
                continue

            if work_unit == POISON_PILL:

                try:
                    # Close the pool and wait for everyone to finish
                    if self._threadpool is not None:
                        self._threadpool.close()
                        self._threadpool.join()
                        self._threadpool = None

                    self._teardown()
                finally:
                    # Finish this consumer and everyone consuming the output
                    self._out_queue.put(POISON_PILL)
                    self.in_queue.task_done()
                    self.set_has_finished()

                    # Leaving the loop from inside the finally clause also
                    # discards any exception raised in the try block above
                    break

            else:
                # pylint: disable=E1120
                try:
                    self._consume_wrapper(work_unit)
                finally:
                    # Always balance the get() to avoid locking .join()
                    self.in_queue.task_done()

    def get_running_task_count(self):
        """
        :return: The number of tasks which are currently running in the
                 threadpool. This is commonly used for measuring ETA.
        """
        # run() sets self._threadpool to None from the consumer thread while
        # this method is called from other threads. Reading the attribute
        # only once avoids calling a method on None.
        pool = self._threadpool

        if pool is None:
            return 0

        return pool.get_running_task_count()

    def _teardown(self):
        raise NotImplementedError

    def _consume(self, work_unit):
        raise NotImplementedError

    def has_finished(self):
        """
        :return: True after set_has_finished() was called
        """
        return self._has_finished

    @task_decorator
    def _consume_wrapper(self, function_id, work_unit):
        """
        Just makes sure that all _consume methods are decorated as tasks.
        """
        return self._consume(work_unit)

    def _task_done(self, function_id):
        """
        The task_in_progress_counter is needed because we want to know if the
        consumer is processing something and let it finish. It is mainly used
        in the has_pending_work().

        For example:

            * You can have pending work if there are items in the input_queue

            * You can have pending work if there are still items to be read
              from the output_queue by one of the consumers that reads our
              output.

            * You can have pending work when there are no items in
              input_queue and no items in output_queue but the threadpool
              inside the consumer is processing something.

        This situation is handled by the self._tasks_in_progress attribute
        and the _add_task and _task_done methods.

        So, for each _add_task() there has to be a _task_done() even if the
        task ends in an error or exception.

        Recommendation: Do NOT set the callback for apply_async to call
        _task_done, the Python2.7 pool implementation won't call it if the
        function raised an exception and you'll end up with tasks in progress
        that finished with an exception.

        :param function_id: The same id which was sent to _add_task()
        :raises AssertionError: When function_id is not a task in progress
        """
        try:
            self._tasks_in_progress.pop(function_id)
        except KeyError:
            raise AssertionError('The function %s was not found!' % function_id)

    def _add_task(self, function_id):
        """
        :param function_id: Just for debugging

        @see: _task_done()'s documentation.
        """
        self._tasks_in_progress[function_id] = 1

    def in_queue_put(self, work, force=False):
        """
        Add work to the queue

        :param work: Work item, None is ignored
        :param force: Add to the queue even when the poison pill was already
                      sent, this should NEVER be used unless you know what
                      you are doing!
        :return: The result of in_queue.put() when the work was queued, None
                 when the work item was ignored
        """
        if work is None:
            return

        # Force the queue not to accept anything after POISON_PILL is sent.
        #
        # If anything is put to the queue after POISON_PILL, a race condition
        # might happen and the consumer might never stop
        #
        # https://github.com/andresriancho/w3af/pull/16063
        if self._poison_pill_sent and not force:
            return

        return self.in_queue.put(work)

    def in_queue_put_iter(self, work_iter):
        """
        Call in_queue_put() for each item in work_iter

        :param work_iter: An iterable with work items, None is ignored
        """
        if work_iter is not None:
            for work in work_iter:
                self.in_queue_put(work)

    def has_pending_work(self):
        """
        @see: _task_done() documentation

        :return: True if the in_queue_size is != 0 OR if one of the pool
                 workers is still doing something that might impact on
                 out_queue.
        """
        if self.in_queue_size() > 0:
            return True

        if self.out_queue.qsize() > 0:
            return True

        if len(self._tasks_in_progress) > 0:
            return True

        # This is a special case which loosely translates to: "If there are
        # any pending tasks in the threadpool, even if they haven't yet
        # called the _add_task method, we know we have pending work to do".
        #
        # The attribute is read only once because run() sets it to None from
        # the consumer thread when the POISON_PILL is processed.
        pool = self._threadpool

        if pool is not None:
            if pool._inqueue.qsize() > 0:
                return True

            if pool._outqueue.qsize() > 0:
                return True

        return False

    @property
    def out_queue(self):
        # This output queue can contain one of the following:
        #    * POISON_PILL
        #    * (plugin_name, fuzzable_request, AsyncResult)
        #    * An ExceptionData instance
        return self._out_queue

    def in_queue_size(self):
        """
        :return: The qsize() of the input queue
        """
        return self.in_queue.qsize()

    def join(self):
        """
        Poison the loop and wait for all queued work to finish this might take
        some time to process.
        """
        start_time = time.time()

        if not self.is_alive():
            # This return has a long history, follow it here:
            # https://github.com/andresriancho/w3af/issues/1172
            return

        if not self._poison_pill_sent:
            # https://github.com/andresriancho/w3af/issues/9587
            # let put() know that all new tasks should be ignored
            self._poison_pill_sent = True

            # send the poison pill
            self.in_queue_put(POISON_PILL, force=True)

        self.in_queue.join()

        # Same single read as in has_pending_work()
        pool = self._threadpool

        if pool is not None:
            pool.close()
            pool.join()

        spent_time = time.time() - start_time
        om.out.debug('%s took %.2f seconds to join()' % (self._thread_name,
                                                         spent_time))

    def terminate(self):
        """
        Remove all queued work from in_queue and poison the loop so the consumer
        exits. Should be very fast and called only if we don't care about the
        queued work anymore (ie. user clicked stop in the UI).
        """
        while not self.in_queue.empty():
            try:
                self.in_queue.get_nowait()
            except Empty:
                # We get here in very rare cases where:
                #
                #  * Another thread (T1) is running and reading from in_queue
                #  * Our thread (T2) asks if the queue is empty and gets False
                #  * T1 reads from in_queue
                #  * T2 reads from the queue but there are no more tasks there
                #  * T2 locks for ever (at least that is what happened when
                #    self.in_queue.get() was used instead of get_nowait())
                #
                msg = 'Handled race condition in %s consumer terminate()'
                args = (self._thread_name, )
                om.out.debug(msg % args)
                continue

            self.in_queue.task_done()

        om.out.debug('No more tasks in %s consumer input queue.' % self._thread_name)

        self.join()

    def get_result(self, timeout=0.5):
        """
        :param timeout: Sent to the get() method of the output queue
        :return: The first result from the output Queue.
        """
        return self._out_queue.get(timeout=timeout)

    def handle_exception(self, phase, plugin_name, fuzzable_request,
                         _exception):
        """
        Get the exception information, and put it into the output queue
        then, the strategy will get the items from the output queue and
        handle the exceptions.

        This needs to be called while the exception is being handled, since
        the traceback is read from sys.exc_info().

        :param phase: Sent to the CoreStatus together with plugin_name
        :param plugin_name: The plugin that generated the exception
        :param fuzzable_request: The fuzzable request that was sent as input to
                                 the plugin when the exception was raised
        :param _exception: The exception object
        """
        # Only the traceback is used
        tb = sys.exc_info()[2]

        enabled_plugins = pprint_plugins(self._w3af_core)

        status = CoreStatus(self._w3af_core)
        status.set_running_plugin(phase, plugin_name, log=False)
        status.set_current_fuzzable_request(phase, fuzzable_request)

        exception_data = ExceptionData(status, _exception, tb,
                                       enabled_plugins)
        self._out_queue.put(exception_data)

    def add_observer(self, observer):
        """
        :param observer: Appended to the list of observers
        """
        self._observers.append(observer)

    def _log_end_took(self, msg_fmt, start_time, plugin):
        """
        Write a debug message with the time spent since start_time

        :param msg_fmt: Format string which receives (spent_time, plugin name)
        :param start_time: A time.time() timestamp
        :param plugin: The plugin, only plugin.get_name() is used
        """
        spent_time = time.time() - start_time
        args = (spent_time, plugin.get_name())

        om.out.debug(msg_fmt % args)
class BaseConsumer(Process):
    """
    Consumer thread that takes fuzzable requests from a Queue that's populated
    by the crawl plugins and identifies vulnerabilities by performing various
    requests.

    Subclasses need to implement _teardown() and _consume().
    """

    def __init__(self, consumer_plugins, w3af_core, thread_name,
                 create_pool=True):
        """
        :param consumer_plugins: Instances of base_consumer plugins in a list
        :param w3af_core: The w3af core that we'll use for status reporting
        :param thread_name: How to name the current thread
        :param create_pool: True to create a worker pool for this consumer
        """
        super(BaseConsumer, self).__init__(name='%sController' % thread_name)

        self.in_queue = QueueSpeed()
        self._out_queue = Queue.Queue()

        self._consumer_plugins = consumer_plugins
        self._w3af_core = w3af_core
        # function_id -> 1 for each task that was added and is not done yet
        self._tasks_in_progress = {}

        self._threadpool = None

        if create_pool:
            self._threadpool = Pool(10, worker_names='%sWorker' % thread_name)

    def run(self):
        """
        Consume the queue items, sending them to the plugins which are then
        going to find vulnerabilities, new URLs, etc.

        The loop ends when POISON_PILL is read from the input queue.
        """
        while True:

            work_unit = self.in_queue.get()

            if work_unit == POISON_PILL:

                try:
                    # Close the pool and wait for everyone to finish. There
                    # is no pool when create_pool=False was used.
                    pool = getattr(self, '_threadpool', None)

                    if pool is not None:
                        pool.close()
                        pool.join()

                    self._threadpool = None

                    self._teardown()
                finally:
                    # Finish this consumer and everyone consuming the output,
                    # even if the teardown failed. Without the task_done()
                    # call join() would wait for ever.
                    self._out_queue.put(POISON_PILL)
                    self.in_queue.task_done()

                break

            # pylint: disable=E1120
            try:
                self._consume_wrapper(work_unit)
            finally:
                # Each get() needs its task_done(), also on errors
                self.in_queue.task_done()

    def _teardown(self):
        raise NotImplementedError

    def _consume(self, work_unit):
        raise NotImplementedError

    @task_decorator
    def _consume_wrapper(self, function_id, work_unit):
        """
        Just makes sure that all _consume methods are decorated as tasks.
        """
        return self._consume(work_unit)

    def _task_done(self, function_id):
        """
        The task_in_progress_counter is needed because we want to know if the
        consumer is processing something and let it finish. It is mainly used
        in the has_pending_work().

        For example:

            * You can have pending work if there are items in the input_queue

            * You can have pending work if there are still items to be read
              from the output_queue by one of the consumers that reads our
              output.

            * You can have pending work when there are no items in
              input_queue and no items in output_queue but the threadpool
              inside the consumer is processing something.

        This situation is handled by the self._tasks_in_progress attribute
        and the _add_task and _task_done methods.

        So, for each _add_task() there has to be a _task_done() even if the
        task ends in an error or exception.

        Recommendation: Do NOT set the callback for apply_async to call
        _task_done, the Python2.7 pool implementation won't call it if the
        function raised an exception and you'll end up with tasks in progress
        that finished with an exception.

        :param function_id: The same id which was sent to _add_task()
        :raises AssertionError: When function_id is not a task in progress
        """
        try:
            self._tasks_in_progress.pop(function_id)
        except KeyError:
            raise AssertionError('The function %s was not found!' % function_id)

    def _add_task(self, function_id):
        """
        :param function_id: Just for debugging

        @see: _task_done()'s documentation.
        """
        self._tasks_in_progress[function_id] = 1

    def in_queue_put(self, work):
        """
        :param work: The work item to add to the input queue, None is ignored
        :return: The result of in_queue.put(), None when work is None
        """
        if work is not None:
            return self.in_queue.put(work)

    def in_queue_put_iter(self, work_iter):
        """
        Call in_queue_put() for each item in work_iter

        :param work_iter: An iterable with work items, None is ignored
        """
        if work_iter is not None:
            for work in work_iter:
                self.in_queue_put(work)

    def has_pending_work(self):
        """
        @see: _task_done() documentation

        :return: True if the in_queue_size is != 0 OR if one of the pool
                 workers is still doing something that might impact on
                 out_queue.
        """
        if self.in_queue_size() > 0 or self.out_queue.qsize() > 0:
            return True

        if len(self._tasks_in_progress) > 0:
            return True

        # This is a special case which loosely translates to: "If there are
        # any pending tasks in the threadpool, even if they haven't yet
        # called the _add_task method, we know we have pending work to do".
        #
        # The attribute is read only once because run() changes it from the
        # consumer thread when the POISON_PILL is processed.
        pool = getattr(self, '_threadpool', None)

        if pool is not None:
            if pool._inqueue.qsize() > 0 or pool._outqueue.qsize() > 0:
                return True

        return False

    @property
    def out_queue(self):
        #
        # This output queue can contain one of the following:
        #    * POISON_PILL
        #    * (plugin_name, fuzzable_request, AsyncResult)
        #    * An ExceptionData instance
        return self._out_queue

    def in_queue_size(self):
        """
        :return: The qsize() of the input queue
        """
        return self.in_queue.qsize()

    def join(self):
        """
        Poison the loop and wait for all queued work to finish this might take
        some time to process.
        """
        if not self.is_alive():
            # This return has a long history, follow it here:
            # https://github.com/andresriancho/w3af/issues/1172
            return

        self.in_queue_put(POISON_PILL)
        self.in_queue.join()

    def terminate(self):
        """
        Remove all queued work from in_queue and poison the loop so the consumer
        exits. Should be very fast and called only if we don't care about the
        queued work anymore (ie. user clicked stop in the UI).
        """
        while not self.in_queue.empty():
            try:
                # The run() loop reads from the same queue, it might consume
                # the last item after empty() returned False. A blocking
                # get() would then wait for ever.
                self.in_queue.get_nowait()
            except Queue.Empty:
                continue

            self.in_queue.task_done()

        self.join()

    def get_result(self, timeout=0.5):
        """
        :param timeout: Sent to the get() method of the output queue
        :return: The first result from the output Queue.
        """
        return self._out_queue.get(timeout=timeout)

    def handle_exception(self, phase, plugin_name, fuzzable_request,
                         _exception):
        """
        Get the exception information, and put it into the output queue
        then, the strategy will get the items from the output queue and
        handle the exceptions.

        This needs to be called while the exception is being handled, since
        the traceback is read from sys.exc_info().

        :param phase: Sent to the core status together with plugin_name
        :param plugin_name: The plugin that generated the exception
        :param fuzzable_request: The fuzzable request that was sent as input to
                                 the plugin when the exception was raised
        :param _exception: The exception object
        """
        # Only the traceback is used
        tb = sys.exc_info()[2]

        enabled_plugins = pprint_plugins(self._w3af_core)

        status = w3af_core_status(self._w3af_core)
        status.set_running_plugin(phase, plugin_name, log=False)
        status.set_current_fuzzable_request(phase, fuzzable_request)

        exception_data = ExceptionData(status, _exception, tb,
                                       enabled_plugins)
        self._out_queue.put(exception_data)
def test_terminate_join(self):
    """
    A pool which never received a task can be terminated and joined.
    """
    idle_pool = Pool(1, worker_names='WorkerThread')

    idle_pool.terminate()
    idle_pool.join()
class BaseConsumer(Process):
    """
    Consumer thread that takes fuzzable requests from a Queue that's populated
    by the crawl plugins and identifies vulnerabilities by performing various
    requests.

    Subclasses need to implement _teardown() and _consume().
    """
    # Pool size used when no thread_pool_size is received in __init__
    THREAD_POOL_SIZE = 10

    def __init__(self, consumer_plugins, w3af_core, thread_name,
                 create_pool=True, max_pool_queued_tasks=0,
                 max_in_queue_size=0, thread_pool_size=None):
        """
        :param consumer_plugins: Instances of base_consumer plugins in a list
        :param w3af_core: The w3af core that we'll use for status reporting
        :param thread_name: How to name the current thread, eg. Auditor
        :param create_pool: True to create a worker pool for this consumer
        :param max_pool_queued_tasks: Sent to the worker pool as
                                      max_queued_tasks
        :param max_in_queue_size: maxsize for the input CachedQueue
        :param thread_pool_size: Number of workers in the pool, when it is
                                 None (or 0) THREAD_POOL_SIZE is used
        """
        super(BaseConsumer, self).__init__(name='%sController' % thread_name)

        self.in_queue = CachedQueue(maxsize=max_in_queue_size,
                                    name=thread_name)
        self._out_queue = Queue.Queue()

        self._thread_name = thread_name
        self._consumer_plugins = consumer_plugins
        self._w3af_core = w3af_core
        self._observers = []
        # function_id -> 1 for each task that was added and is not done yet
        self._tasks_in_progress = {}
        # Set in join(), makes in_queue_put() ignore any new work
        self._poison_pill_sent = False

        self._threadpool = None

        if create_pool:
            self._threadpool = Pool(thread_pool_size or self.THREAD_POOL_SIZE,
                                    worker_names='%sWorker' % thread_name,
                                    max_queued_tasks=max_pool_queued_tasks)

    def run(self):
        """
        Consume the queue items, sending them to the plugins which are then
        going to find vulnerabilities, new URLs, etc.

        The loop ends when POISON_PILL is read from the input queue.
        """
        while True:

            try:
                work_unit = self.in_queue.get()
            except KeyboardInterrupt:
                # https://github.com/andresriancho/w3af/issues/9587
                #
                # If we don't do this, the thread will die and will never
                # process the POISON_PILL, which will end up in an endless
                # wait for .join()
                continue

            if work_unit == POISON_PILL:

                try:
                    # Close the pool and wait for everyone to finish. There
                    # is no pool when create_pool=False was used.
                    pool = self._threadpool

                    if pool is not None:
                        pool.close()
                        pool.join()
                        self._threadpool = None

                    self._teardown()
                finally:
                    # Finish this consumer and everyone consuming the output,
                    # even if the teardown failed. Without the task_done()
                    # call join() would wait for ever.
                    self._out_queue.put(POISON_PILL)
                    self.in_queue.task_done()

                break

            # pylint: disable=E1120
            try:
                self._consume_wrapper(work_unit)
            finally:
                self.in_queue.task_done()

    def _teardown(self):
        raise NotImplementedError

    def _consume(self, work_unit):
        raise NotImplementedError

    @task_decorator
    def _consume_wrapper(self, function_id, work_unit):
        """
        Just makes sure that all _consume methods are decorated as tasks.
        """
        return self._consume(work_unit)

    def _task_done(self, function_id):
        """
        The task_in_progress_counter is needed because we want to know if the
        consumer is processing something and let it finish. It is mainly used
        in the has_pending_work().

        For example:

            * You can have pending work if there are items in the input_queue

            * You can have pending work if there are still items to be read
              from the output_queue by one of the consumers that reads our
              output.

            * You can have pending work when there are no items in
              input_queue and no items in output_queue but the threadpool
              inside the consumer is processing something.

        This situation is handled by the self._tasks_in_progress attribute
        and the _add_task and _task_done methods.

        So, for each _add_task() there has to be a _task_done() even if the
        task ends in an error or exception.

        Recommendation: Do NOT set the callback for apply_async to call
        _task_done, the Python2.7 pool implementation won't call it if the
        function raised an exception and you'll end up with tasks in progress
        that finished with an exception.

        :param function_id: The same id which was sent to _add_task()
        :raises AssertionError: When function_id is not a task in progress
        """
        try:
            self._tasks_in_progress.pop(function_id)
        except KeyError:
            raise AssertionError('The function %s was not found!' % function_id)

    def _add_task(self, function_id):
        """
        :param function_id: Just for debugging

        @see: _task_done()'s documentation.
        """
        self._tasks_in_progress[function_id] = 1

    def in_queue_put(self, work, force=False):
        """
        Add work to the queue

        :param work: Work item, None is ignored
        :param force: Add to the queue even when the poison pill was already
                      sent. Only join() should use this, to send the pill
                      itself.
        :return: The result of in_queue.put() when the work was queued, None
                 when the work item was ignored
        """
        if work is None:
            return

        # Force the queue not to accept anything after POISON_PILL is sent.
        #
        # If anything is put to the queue after POISON_PILL, a race condition
        # might happen and the consumer might never stop
        #
        # https://github.com/andresriancho/w3af/pull/16063
        if self._poison_pill_sent and not force:
            return

        return self.in_queue.put(work)

    def in_queue_put_iter(self, work_iter):
        """
        Call in_queue_put() for each item in work_iter

        :param work_iter: An iterable with work items, None is ignored
        """
        if work_iter is not None:
            for work in work_iter:
                self.in_queue_put(work)

    def has_pending_work(self):
        """
        @see: _task_done() documentation

        :return: True if the in_queue_size is != 0 OR if one of the pool
                 workers is still doing something that might impact on
                 out_queue.
        """
        if self.in_queue_size() > 0 or self.out_queue.qsize() > 0:
            return True

        if len(self._tasks_in_progress) > 0:
            return True

        # This is a special case which loosely translates to: "If there are
        # any pending tasks in the threadpool, even if they haven't yet
        # called the _add_task method, we know we have pending work to do".
        #
        # The attribute is read only once because run() sets it to None from
        # the consumer thread when the POISON_PILL is processed.
        pool = self._threadpool

        if pool is not None:
            if pool._inqueue.qsize() > 0:
                return True

            if pool._outqueue.qsize() > 0:
                return True

        return False

    @property
    def out_queue(self):
        # This output queue can contain one of the following:
        #    * POISON_PILL
        #    * (plugin_name, fuzzable_request, AsyncResult)
        #    * An ExceptionData instance
        return self._out_queue

    def in_queue_size(self):
        """
        :return: The qsize() of the input queue
        """
        return self.in_queue.qsize()

    def join(self):
        """
        Poison the loop and wait for all queued work to finish this might take
        some time to process.
        """
        start_time = time.time()

        if not self.is_alive():
            # This return has a long history, follow it here:
            # https://github.com/andresriancho/w3af/issues/1172
            return

        if not self._poison_pill_sent:
            # https://github.com/andresriancho/w3af/issues/9587
            # let put() know that all new tasks should be ignored
            #
            # The flag is set BEFORE sending the pill, otherwise another
            # thread could still queue work behind the POISON_PILL
            self._poison_pill_sent = True

            # send the poison pill
            self.in_queue_put(POISON_PILL, force=True)

        self.in_queue.join()

        # Same single read as in has_pending_work()
        pool = self._threadpool

        if pool is not None:
            pool.close()
            pool.join()

        spent_time = time.time() - start_time
        om.out.debug('%s took %.2f seconds to join()' % (self._thread_name,
                                                         spent_time))

    def terminate(self):
        """
        Remove all queued work from in_queue and poison the loop so the consumer
        exits. Should be very fast and called only if we don't care about the
        queued work anymore (ie. user clicked stop in the UI).
        """
        while not self.in_queue.empty():
            try:
                # The run() loop reads from the same queue, it might consume
                # the last item after empty() returned False. A blocking
                # get() would then wait for ever.
                self.in_queue.get_nowait()
            except Queue.Empty:
                continue

            self.in_queue.task_done()

        self.join()

    def get_result(self, timeout=0.5):
        """
        :param timeout: Sent to the get() method of the output queue
        :return: The first result from the output Queue.
        """
        return self._out_queue.get(timeout=timeout)

    def handle_exception(self, phase, plugin_name, fuzzable_request,
                         _exception):
        """
        Get the exception information, and put it into the output queue
        then, the strategy will get the items from the output queue and
        handle the exceptions.

        This needs to be called while the exception is being handled, since
        the traceback is read from sys.exc_info().

        :param phase: Sent to the core status together with plugin_name
        :param plugin_name: The plugin that generated the exception
        :param fuzzable_request: The fuzzable request that was sent as input to
                                 the plugin when the exception was raised
        :param _exception: The exception object
        """
        # Only the traceback is used
        tb = sys.exc_info()[2]

        enabled_plugins = pprint_plugins(self._w3af_core)

        status = w3af_core_status(self._w3af_core)
        status.set_running_plugin(phase, plugin_name, log=False)
        status.set_current_fuzzable_request(phase, fuzzable_request)

        exception_data = ExceptionData(status, _exception, tb,
                                       enabled_plugins)
        self._out_queue.put(exception_data)

    def add_observer(self, observer):
        """
        :param observer: Appended to the list of observers
        """
        self._observers.append(observer)
class BaseConsumer(Process):
    """
    Consumer thread that takes fuzzable requests from a Queue that's populated
    by the crawl plugins and identifies vulnerabilities by performing various
    requests.

    Subclasses need to implement _teardown() and _consume().
    """

    def __init__(self, consumer_plugins, w3af_core, thread_name,
                 create_pool=True):
        """
        :param consumer_plugins: Instances of base_consumer plugins in a list
        :param w3af_core: The w3af core that we'll use for status reporting
        :param thread_name: How to name the current thread
        :param create_pool: True to create a worker pool for this consumer
        """
        super(BaseConsumer, self).__init__(name='%sController' % thread_name)

        self.in_queue = QueueSpeed()
        self._out_queue = Queue.Queue()

        self._consumer_plugins = consumer_plugins
        self._w3af_core = w3af_core
        # Holds one item for each task that was added and is not done yet
        self._tasks_in_progress_counter = Queue.Queue()

        self._threadpool = None

        if create_pool:
            self._threadpool = Pool(10, worker_names='%sWorker' % thread_name)

    def run(self):
        """
        Consume the queue items, sending them to the plugins which are then
        going to find vulnerabilities, new URLs, etc.

        The loop ends when POISON_PILL is read from the input queue.
        """
        while True:

            work_unit = self.in_queue.get()

            if work_unit == POISON_PILL:

                try:
                    # Close the pool and wait for everyone to finish. There
                    # is no pool when create_pool=False was used.
                    pool = getattr(self, '_threadpool', None)

                    if pool is not None:
                        pool.close()
                        pool.join()

                    self._threadpool = None

                    self._teardown()
                finally:
                    # Finish this consumer and everyone consuming the output,
                    # even if the teardown failed. Without the task_done()
                    # call join() would wait for ever.
                    self._out_queue.put(POISON_PILL)
                    self.in_queue.task_done()

                break

            try:
                self._consume_wrapper(work_unit)
            finally:
                # Each get() needs its task_done(), also on errors
                self.in_queue.task_done()

    def _teardown(self):
        raise NotImplementedError

    def _consume(self, work_unit):
        raise NotImplementedError

    @task_decorator
    def _consume_wrapper(self, work_unit):
        """
        Just makes sure that all _consume methods are decorated as tasks.
        """
        return self._consume(work_unit)

    def _task_done(self, result):
        """
        The task_in_progress_counter is needed because we want to know if the
        consumer is processing something and let it finish. It is mainly used
        in the has_pending_work().

        For example:

            * You can have pending work if there are items in the input_queue

            * You can have pending work if there are still items to be read
              from the output_queue by one of the consumers that reads our
              output.

            * You can have pending work when there are no items in
              input_queue and no items in output_queue but the threadpool
              inside the consumer is processing something.

        This situation is handled by the self._tasks_in_progress_counter
        attribute and the _add_task and _task_done methods.

        So, for each _add_task() there has to be a _task_done() even if the
        task ends in an error or exception.

        Recommendation: Do NOT set the callback for apply_async to call
        _task_done, the Python2.7 pool implementation won't call it if the
        function raised an exception and you'll end up with tasks in progress
        that finished with an exception.

        :param result: Not used
        :raises AssertionError: When there are no tasks in progress
        """
        try:
            self._tasks_in_progress_counter.get_nowait()
        except Queue.Empty:
            raise AssertionError('You can not _task_done()'
                                 ' more than you _add_task().')

    def _add_task(self):
        """
        @see: _task_done()'s documentation.
        """
        self._tasks_in_progress_counter.put(None)

    def in_queue_put(self, work):
        """
        :param work: The work item to add to the input queue, None is ignored
        :return: The result of in_queue.put(), None when work is None
        """
        if work is not None:
            return self.in_queue.put(work)

    def in_queue_put_iter(self, work_iter):
        """
        Call in_queue_put() for each item in work_iter

        :param work_iter: An iterable with work items, None is ignored
        """
        if work_iter is not None:
            for work in work_iter:
                self.in_queue_put(work)

    def has_pending_work(self):
        """
        @see: _task_done() documentation

        :return: True if the in_queue_size is != 0 OR if one of the pool
                 workers is still doing something that might impact on
                 out_queue.
        """
        if self.in_queue_size() > 0 or self.out_queue.qsize() > 0:
            return True

        if self._tasks_in_progress_counter.qsize() > 0:
            return True

        # This is a special case which loosely translates to: "If there are
        # any pending tasks in the threadpool, even if they haven't yet
        # called the _add_task method, we know we have pending work to do".
        #
        # The attribute is read only once because run() changes it from the
        # consumer thread when the POISON_PILL is processed.
        pool = getattr(self, '_threadpool', None)

        if pool is not None:
            if pool._inqueue.qsize() > 0 or pool._outqueue.qsize() > 0:
                return True

        return False

    @property
    def out_queue(self):
        #
        # This output queue can contain one of the following:
        #    * POISON_PILL
        #    * (plugin_name, fuzzable_request, AsyncResult)
        #    * An ExceptionData instance
        return self._out_queue

    def in_queue_size(self):
        """
        :return: The qsize() of the input queue
        """
        return self.in_queue.qsize()

    def join(self):
        """
        Poison the loop and wait for all queued work to finish this might take
        some time to process.
        """
        if not self.is_alive():
            # This return has a long history, follow it here:
            # https://github.com/andresriancho/w3af/issues/1172
            return

        self.in_queue_put(POISON_PILL)
        self.in_queue.join()

    def terminate(self):
        """
        Remove all queued work from in_queue and poison the loop so the consumer
        exits. Should be very fast and called only if we don't care about the
        queued work anymore (ie. user clicked stop in the UI).
        """
        while not self.in_queue.empty():
            try:
                # The run() loop reads from the same queue, it might consume
                # the last item after empty() returned False. A blocking
                # get() would then wait for ever.
                self.in_queue.get_nowait()
            except Queue.Empty:
                continue

            self.in_queue.task_done()

        self.join()

    def get_result(self, timeout=0.5):
        """
        :param timeout: Sent to the get() method of the output queue
        :return: The first result from the output Queue.
        """
        return self._out_queue.get(timeout=timeout)

    def handle_exception(self, phase, plugin_name, fuzzable_request,
                         _exception):
        """
        Get the exception information, and put it into the output queue
        then, the strategy will get the items from the output queue and
        handle the exceptions.

        This needs to be called while the exception is being handled, since
        the traceback is read from sys.exc_info().

        :param phase: Sent to the core status together with plugin_name
        :param plugin_name: The plugin that generated the exception
        :param fuzzable_request: The fuzzable request that was sent as input to
                                 the plugin when the exception was raised
        :param _exception: The exception object
        """
        # Only the traceback is used
        tb = sys.exc_info()[2]

        enabled_plugins = pprint_plugins(self._w3af_core)

        status = w3af_core_status(self._w3af_core)
        status.set_running_plugin(phase, plugin_name, log=False)
        status.set_current_fuzzable_request(phase, fuzzable_request)

        exception_data = ExceptionData(status, _exception, tb,
                                       enabled_plugins)
        self._out_queue.put(exception_data)
class BaseConsumer(Process):
    """
    Consumer thread that takes fuzzable requests from a Queue that's populated
    by the crawl plugins and identified vulnerabilities by performing various
    requests.

    This is a base class: subclasses must implement _consume() (called for
    each work unit read from in_queue) and _teardown() (called once, when the
    POISON_PILL is read from in_queue).

    Results, exceptions (as ExceptionData instances) and the final POISON_PILL
    are written to the output queue, see the out_queue property.
    """

    # Number of workers in the pool when thread_pool_size is not specified
    THREAD_POOL_SIZE = 10

    def __init__(self, consumer_plugins, w3af_core, thread_name,
                 create_pool=True, max_pool_queued_tasks=0,
                 max_in_queue_size=0, thread_pool_size=None):
        """
        :param consumer_plugins: Instances of base_consumer plugins in a list
        :param w3af_core: The w3af core that we'll use for status reporting
        :param thread_name: How to name the current thread, eg. Auditor
        :param create_pool: True to create a worker pool for this consumer
        :param max_pool_queued_tasks: Passed to the worker pool as
                                      max_queued_tasks (only used when
                                      create_pool is True)
        :param max_in_queue_size: The maxsize for the input queue
        :param thread_pool_size: Number of workers in the pool, when it
                                 evaluates to False THREAD_POOL_SIZE is used
        """
        super(BaseConsumer, self).__init__(name='%sController' % thread_name)

        self.in_queue = CachedQueue(maxsize=max_in_queue_size,
                                    name=thread_name + 'In')

        #
        # Crawl and infrastructure plugins write to this queue using:
        #
        #       self.output_queue.put(fuzz_req)
        #
        # The strategy will read items from this queue in a tight loop using:
        #
        #       result_item = url_producer.get_result(timeout=0.1)
        #
        # And write them to self.in_queue (defined above) for all the url
        # consumers
        #
        # Since this queue is read in a tight loop, items that are written
        # here will, in theory, not stay in memory for long.
        #
        # Also, items written here are fuzzable requests, which shouldn't use
        # a lot of memory.
        #
        # The only scenario I can think of where this queue is full of items
        # is one where the strategy loop is slow / delayed and the crawl
        # plugins are all finding many new URLs and forms.
        #
        # Tests showed something like this for a common site:
        #
        # [Thu Feb 15 16:45:36 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 19.
        # [Thu Feb 15 16:45:36 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 28.
        # [Thu Feb 15 16:45:37 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 27.
        # ...
        # [Thu Feb 15 16:45:52 2018 - debug] CachedQueue.get() ... CrawlInfraOut DiskDict size is 1.
        #
        # This was with a max_in_queue_size of 100 set for the CachedQueue
        # defined below.
        #
        # Meaning that:
        #   * There were 119 items in the queue (100 in memory) in the first
        #     log line
        #   * Also at 16:45:36, there were 128 items in the queue (100 in
        #     memory)
        #   * It took 16 seconds to consume 28 items from the queue (from
        #     second 36 to second 52)
        #
        # This surprises me a little bit. I expected this queue to have less
        # items in memory. Since I want to reduce the memory usage in the
        # framework, the maxsize sent to this CachedQueue is lower than the
        # 100 used in that test (it is currently set to 75).
        #
        # But just in case I'm using a CachedQueue!
        self._out_queue = CachedQueue(maxsize=75, name=thread_name + 'Out')

        self._thread_name = thread_name
        self._consumer_plugins = consumer_plugins
        self._w3af_core = w3af_core
        self._observers = []

        # Keys are the function ids of the tasks which are currently running,
        # see _add_task() and _task_done()
        self._tasks_in_progress = {}

        # Set to True by join(), after that in_queue_put() ignores new work
        self._poison_pill_sent = False

        self._threadpool = None

        if create_pool:
            self._threadpool = Pool(thread_pool_size or self.THREAD_POOL_SIZE,
                                    worker_names='%sWorker' % thread_name,
                                    max_queued_tasks=max_pool_queued_tasks)

    def get_pool(self):
        """
        :return: The worker pool, or None if it was never created or if it
                 was already closed by run().
        """
        return self._threadpool

    def run(self):
        """
        Consume the queue items, sending them to the plugins which are then
        going to find vulnerabilities, new URLs, etc.

        The loop ends when the POISON_PILL is read from in_queue.
        """
        while True:

            try:
                work_unit = self.in_queue.get()
            except KeyboardInterrupt:
                # https://github.com/andresriancho/w3af/issues/9587
                #
                # If we don't do this, the thread will die and will never
                # process the POISON_PILL, which will end up in an endless
                # wait for .join()
                continue

            if work_unit == POISON_PILL:

                try:
                    # Close the pool and wait for everyone to finish
                    if self._threadpool is not None:
                        self._threadpool.close()
                        self._threadpool.join()
                        self._threadpool = None

                    self._teardown()
                finally:
                    # Finish this consumer and everyone consuming the output
                    #
                    # This runs even if closing the pool or _teardown()
                    # raised, otherwise the callers of in_queue.join() and
                    # the readers of the output queue would wait for ever
                    self._out_queue.put(POISON_PILL)
                    self.in_queue.task_done()
                    break

            else:
                # pylint: disable=E1120
                try:
                    self._consume_wrapper(work_unit)
                finally:
                    # Each get() needs a matching task_done(), even when the
                    # consumer raised an exception
                    self.in_queue.task_done()

    def _teardown(self):
        """
        Called once by run() when the POISON_PILL is received, needs to be
        implemented by the subclasses.
        """
        raise NotImplementedError

    def _consume(self, work_unit):
        """
        Process one work unit read from in_queue, needs to be implemented by
        the subclasses.

        :param work_unit: The item which was read from in_queue
        """
        raise NotImplementedError

    @task_decorator
    def _consume_wrapper(self, function_id, work_unit):
        """
        Just makes sure that all _consume methods are decorated as tasks.

        Note that run() calls this method with only the work_unit argument.
        """
        return self._consume(work_unit)

    def _task_done(self, function_id):
        """
        The task_in_progress_counter is needed because we want to know if the
        consumer is processing something and let it finish. It is mainly used
        in has_pending_work().

        For example:

            * You can have pending work if there are items in the input_queue

            * You can have pending work if there are still items to be read
              from the output_queue by one of the consumers that reads our
              output.

            * You can have pending work when there are no items in
              input_queue and no items in output_queue but the threadpool
              inside the consumer is processing something. This situation is
              handled by the self._tasks_in_progress attribute and the
              _add_task and _task_done methods.

        So, for each _add_task() there has to be a _task_done() even if the
        task ends in an error or exception.

        Recommendation: Do NOT set the callback for apply_async to call
        _task_done, the Python2.7 pool implementation won't call it if the
        function raised an exception and you'll end up with tasks in progress
        that finished with an exception.

        :param function_id: The id which was previously sent to _add_task()
        :raises AssertionError: When function_id is not a task in progress
        """
        try:
            self._tasks_in_progress.pop(function_id)
        except KeyError:
            raise AssertionError('The function %s was not found!' % function_id)

    def _add_task(self, function_id):
        """
        Register a task as being in progress.

        :param function_id: Just for debugging

        @see: _task_done()'s documentation.
        """
        self._tasks_in_progress[function_id] = 1

    def in_queue_put(self, work, force=False):
        """
        Add work to the queue

        :param work: Work item, None is ignored
        :param force: Add to the queue even when the poison pill was already
                      sent, this should NEVER be used unless you know what
                      you are doing!
        :return: None when the work item was ignored (work is None, or the
                 poison pill was already sent and force is False), otherwise
                 the value returned by self.in_queue.put()
        """
        if work is None:
            return

        # Force the queue not to accept anything after POISON_PILL is sent.
        #
        # If anything is put to the queue after POISON_PILL, a race condition
        # might happen and the consumer might never stop
        #
        # https://github.com/andresriancho/w3af/pull/16063
        if self._poison_pill_sent and not force:
            return

        return self.in_queue.put(work)

    def in_queue_put_iter(self, work_iter):
        """
        Call in_queue_put() for each item in work_iter.

        :param work_iter: An iterable with work items, None is ignored
        """
        if work_iter is not None:
            for work in work_iter:
                self.in_queue_put(work)

    def has_pending_work(self):
        """
        @see: _task_done() documentation

        :return: True if the in_queue_size is != 0 OR if one of the pool
                 workers is still doing something that might impact on
                 out_queue.
        """
        if self.in_queue_size() > 0:
            return True

        if self.out_queue.qsize() > 0:
            return True

        if len(self._tasks_in_progress) > 0:
            return True

        # This is a special case which loosely translates to: "If there are
        # any pending tasks in the threadpool, even if they haven't yet called
        # the _add_task method, we know we have pending work to do".
        #
        # Keep a local reference to the pool: run() sets self._threadpool to
        # None from the consumer thread after the poison pill is processed,
        # reading the attribute again after the None check could raise an
        # AttributeError.
        pool = self._threadpool

        if pool is not None:
            if pool._inqueue.qsize() > 0:
                return True

            if pool._outqueue.qsize() > 0:
                return True

        return False

    @property
    def out_queue(self):
        # This output queue can contain one of the following:
        #    * POISON_PILL
        #    * (plugin_name, fuzzable_request, AsyncResult)
        #    * An ExceptionData instance
        return self._out_queue

    def in_queue_size(self):
        """
        :return: The number of items in the input queue, as reported by
                 its qsize() method.
        """
        return self.in_queue.qsize()

    def join(self):
        """
        Poison the loop and wait for all queued work to finish this might take
        some time to process.
        """
        start_time = time.time()

        if not self.is_alive():
            # This return has a long history, follow it here:
            # https://github.com/andresriancho/w3af/issues/1172
            return

        if not self._poison_pill_sent:
            # https://github.com/andresriancho/w3af/issues/9587
            # let put() know that all new tasks should be ignored
            self._poison_pill_sent = True

            # send the poison pill
            self.in_queue_put(POISON_PILL, force=True)

        self.in_queue.join()

        # run() closes the pool and sets self._threadpool to None when it
        # processes the poison pill, this is only needed if that didn't
        # happen. The local reference makes sure that we don't call methods
        # on None if the attribute is cleared after the check.
        pool = self._threadpool

        if pool is not None:
            pool.close()
            pool.join()

        spent_time = time.time() - start_time
        om.out.debug('%s took %.2f seconds to join()' % (self._thread_name,
                                                         spent_time))

    def terminate(self):
        """
        Remove all queued work from in_queue and poison the loop so the
        consumer exits. Should be very fast and called only if we don't care
        about the queued work anymore (ie. user clicked stop in the UI).
        """
        while not self.in_queue.empty():
            try:
                self.in_queue.get_nowait()
            except Empty:
                # We get here in very rare cases where:
                #
                #  * Another thread (T1) is running and reading from in_queue
                #  * Our thread (T2) asks if the queue is empty and gets False
                #  * T1 reads from in_queue
                #  * T2 reads from the queue but there are no more tasks there
                #  * T2 locks for ever (at least that is what happened when
                #    self.in_queue.get() was used instead of get_nowait())
                #
                msg = 'Handled race condition in %s consumer terminate()'
                args = (self._thread_name,)
                om.out.debug(msg % args)
                continue

            # Only reached when an item was removed from the queue
            self.in_queue.task_done()

        om.out.debug('No more tasks in %s consumer input queue.' % self._thread_name)

        self.join()

    def get_result(self, timeout=0.5):
        """
        :param timeout: Passed to the output queue's get() as its timeout
        :return: The first result from the output Queue.
        """
        return self._out_queue.get(timeout=timeout)

    def handle_exception(self, phase, plugin_name, fuzzable_request,
                         _exception):
        """
        Get the exception information, and put it into the output queue
        then, the strategy will get the items from the output queue and
        handle the exceptions.

        Reads the traceback from sys.exc_info().

        :param phase: Passed, together with plugin_name, to the status
                      object's set_running_plugin() and, together with
                      fuzzable_request, to set_current_fuzzable_request()
        :param plugin_name: The plugin that generated the exception
        :param fuzzable_request: The fuzzable request that was sent as input
                                 to the plugin when the exception was raised
        :param _exception: The exception object
        """
        # Only the traceback is needed, the exception object is a parameter
        tb = sys.exc_info()[2]

        enabled_plugins = pprint_plugins(self._w3af_core)

        status = w3af_core_status(self._w3af_core)
        status.set_running_plugin(phase, plugin_name, log=False)
        status.set_current_fuzzable_request(phase, fuzzable_request)

        exception_data = ExceptionData(status, _exception, tb,
                                       enabled_plugins)
        self._out_queue.put(exception_data)

    def add_observer(self, observer):
        """
        Append an observer to the internal observer list.

        :param observer: The observer instance to store
        """
        self._observers.append(observer)

    def _log_end_took(self, msg_fmt, start_time, plugin):
        """
        Write a debug message with the time spent since start_time.

        :param msg_fmt: Format string which receives two arguments, in this
                        order: the spent time in seconds and the plugin name
        :param start_time: Timestamp taken with time.time() when the work
                           started
        :param plugin: The plugin instance, get_name() is called on it
        """
        spent_time = time.time() - start_time
        args = (spent_time, plugin.get_name())

        om.out.debug(msg_fmt % args)