def test_process_pool_join_workers(self):
    """Process Pool Spawn no worker is running after join.

    Schedules a single task on a four-worker pool, stops and joins the
    pool, then checks that the worker manager tracks no workers.
    """
    spawn_pool = ProcessPool(max_workers=4)
    spawn_pool.schedule(function, args=[1])

    # Stop first, then wait for the pool to wind down.
    spawn_pool.stop()
    spawn_pool.join()

    remaining_workers = spawn_pool._pool_manager.worker_manager.workers
    self.assertEqual(len(remaining_workers), 0)
def test_process_pool_stop_stopped(self):
    """Process Pool Spawn is stopped after stop.

    Schedules one task on a default pool, stops and joins it, then checks
    that the pool no longer reports itself as active.
    """
    spawn_pool = ProcessPool()
    spawn_pool.schedule(function, args=[1])

    spawn_pool.stop()
    spawn_pool.join()

    still_active = spawn_pool.active
    self.assertFalse(still_active)
def test_process_pool_stop_stopped(self):
    """Process Pool Fork is stopped after stop.

    Schedules one task on a single-worker pool, stops and joins it, then
    checks that the pool no longer reports itself as active.
    """
    fork_pool = ProcessPool(max_workers=1)
    fork_pool.schedule(function, args=[1])

    fork_pool.stop()
    fork_pool.join()

    still_active = fork_pool.active
    self.assertFalse(still_active)
def test_process_pool_close_stopped(self):
    """Process Pool Forkserver is stopped after close.

    Schedules one task on a single-worker pool built with ``mp_context``,
    closes and joins it, then checks that the pool is no longer active.
    """
    forkserver_pool = ProcessPool(max_workers=1, context=mp_context)
    forkserver_pool.schedule(function, args=[1])

    forkserver_pool.close()
    forkserver_pool.join()

    still_active = forkserver_pool.active
    self.assertFalse(still_active)
def test_process_pool_close_stopped(self):
    """Process Pool Fork is stopped after close.

    Schedules one task on a single-worker pool, closes and joins it, then
    checks that the pool is no longer active.
    """
    fork_pool = ProcessPool(max_workers=1)
    fork_pool.schedule(function, args=[1])

    fork_pool.close()
    fork_pool.join()

    still_active = fork_pool.active
    self.assertFalse(still_active)
def test_process_pool_join_workers(self):
    """Process Pool Fork no worker is running after join.

    Schedules a single task on a four-worker pool, stops and joins the
    pool, then checks that the worker manager tracks no workers.
    """
    fork_pool = ProcessPool(max_workers=4)
    fork_pool.schedule(function, args=[1])

    # Stop first, then wait for the pool to wind down.
    fork_pool.stop()
    fork_pool.join()

    remaining_workers = fork_pool._pool_manager.worker_manager.workers
    self.assertEqual(len(remaining_workers), 0)
def test_process_pool_close_stopped(self):
    """Process Pool Spawn is stopped after close.

    Schedules one task on a default pool, closes and joins it, then checks
    that the pool is no longer active.
    """
    spawn_pool = ProcessPool()
    spawn_pool.schedule(function, args=[1])

    spawn_pool.close()
    spawn_pool.join()

    still_active = spawn_pool.active
    self.assertFalse(still_active)
def test_process_pool_stop_large_data(self):
    """Process Pool Fork is stopped if large data is sent on the channel.

    Sends one very large string argument to a single-worker pool, then
    stops and joins the pool and checks it is no longer active.
    """
    payload = "a" * 1098 * 1024 * 50  # 50 Mb
    fork_pool = ProcessPool(max_workers=1)
    fork_pool.schedule(function, args=[payload])

    fork_pool.stop()
    fork_pool.join()

    still_active = fork_pool.active
    self.assertFalse(still_active)
def test_process_pool_close_futures(self):
    """Process Pool Spawn all futures are performed on close.

    Schedules ten tasks, closes and joins the pool, then asserts that
    every returned future reports ``done()``.
    """
    pool = ProcessPool()
    futures = [pool.schedule(function, args=[index]) for index in range(10)]

    pool.close()
    pool.join()

    # An explicit loop is required: on Python 3 ``map`` is lazy, so
    # ``map(self.assertTrue, ...)`` would never run a single assertion.
    for future in futures:
        self.assertTrue(future.done())
def test_process_pool_join_futures_timeout(self):
    """Process Pool Fork TimeoutError is raised if join on long futures.

    Schedules two long tasks on a single-worker pool, closes it and checks
    that joining with a 0.4 timeout raises ``TimeoutError``. The pool is
    then stopped and joined to clean up.
    """
    fork_pool = ProcessPool(max_workers=1)
    for _ in range(2):
        fork_pool.schedule(long_function)
    fork_pool.close()

    with self.assertRaises(TimeoutError):
        fork_pool.join(0.4)

    # Clean up the still-busy pool.
    fork_pool.stop()
    fork_pool.join()
def test_process_pool_join_futures_timeout(self):
    """Process Pool Spawn TimeoutError is raised if join on long tasks.

    Schedules two long tasks on a default pool, closes it and checks that
    joining with a 0.4 timeout raises ``TimeoutError``. The pool is then
    stopped and joined to clean up.
    """
    spawn_pool = ProcessPool()
    for _ in range(2):
        spawn_pool.schedule(long_function)
    spawn_pool.close()

    with self.assertRaises(TimeoutError):
        spawn_pool.join(0.4)

    # Clean up the still-busy pool.
    spawn_pool.stop()
    spawn_pool.join()
def test_process_pool_close_futures(self):
    """Process Pool Fork all futures are performed on close.

    Schedules ten tasks on a single-worker pool, closes and joins the pool,
    then asserts that every returned future reports ``done()``.
    """
    pool = ProcessPool(max_workers=1)
    futures = [pool.schedule(function, args=[index]) for index in range(10)]

    pool.close()
    pool.join()

    # An explicit loop is required: on Python 3 ``map`` is lazy, so
    # ``map(self.assertTrue, ...)`` would never run a single assertion.
    for future in futures:
        self.assertTrue(future.done())
def test_process_pool_stop_futures(self):
    """Process Pool Fork not all futures are performed on stop.

    Schedules ten tasks on a single-worker pool, stops and joins it, then
    checks that at least one future is not done.
    """
    fork_pool = ProcessPool(max_workers=1)
    scheduled = [fork_pool.schedule(function, args=[index])
                 for index in range(10)]

    fork_pool.stop()
    fork_pool.join()

    some_unfinished = any(not future.done() for future in scheduled)
    self.assertTrue(some_unfinished)
def test_process_pool_stop_large_data(self):
    """Process Pool Spawn is stopped if large data is sent on the channel.

    Sends one large string argument to a pool whose workers use
    ``long_initializer``, then stops and joins the pool and checks it is
    no longer active.
    """
    payload = "a" * 4098 * 1024
    spawn_pool = ProcessPool(initializer=long_initializer)
    spawn_pool.schedule(function, args=[payload])

    spawn_pool.stop()
    spawn_pool.join()

    still_active = spawn_pool.active
    self.assertFalse(still_active)
def test_process_pool_stop_futures(self):
    """Process Pool Spawn not all futures are performed on stop.

    Schedules ten tasks on a default pool, stops and joins it, then checks
    that at least one future is not done.
    """
    spawn_pool = ProcessPool()
    scheduled = [spawn_pool.schedule(function, args=[index])
                 for index in range(10)]

    spawn_pool.stop()
    spawn_pool.join()

    some_unfinished = any(not future.done() for future in scheduled)
    self.assertTrue(some_unfinished)
class CounterDaemon(object):
    """Daemon that repeatedly schedules ``incr`` tasks on a process pool.

    Each round picks ``workers`` bucket ids from ``BUCKET_RANGE`` and
    schedules one ``incr`` call per bucket with a randomised ``sleep``
    keyword argument and a per-task timeout.
    """

    def __init__(self, workers=1, poll_interval=None, max_tasks=100,
                 task_timeout=0.1, task_default_sleep=0.01,
                 task_sleep_rand_range=(1, 20)):
        """Store the settings and start the pool.

        :param workers: pool size; also the number of buckets per round.
        :param poll_interval: seconds to sleep between rounds; when falsy,
            ``3 * task_timeout`` is used instead.
        :param max_tasks: forwarded to ``ProcessPool(max_tasks=...)``.
        :param task_timeout: timeout passed to every scheduled task.
        :param task_default_sleep: base value of the ``sleep`` argument.
        :param task_sleep_rand_range: ``(low, high)`` bounds passed to
            ``random.randint`` to scale ``task_default_sleep``.
        """
        self.workers = workers
        self.poll_interval = poll_interval
        self.max_tasks = max_tasks
        self.task_timeout = task_timeout
        self.task_default_sleep = task_default_sleep
        self.task_sleep_rand_range = task_sleep_rand_range
        self.pool = ProcessPool(max_workers=self.workers,
                                max_tasks=self.max_tasks)
        self.pool._start_pool()

    def done_callback(self, bucket_id, future):
        """Log the outcome of a finished task.

        Cancelled futures are ignored; timeouts are logged as warnings and
        any other failure is logged with its traceback.
        """
        pid = os.getpid()
        try:
            result = future.result()
            logger.debug('Result: {}\tpid: {}\tbucket: {}'.format(result, pid, bucket_id))
        except futures.TimeoutError:
            logger.warning('TimeoutError\tpid: {}\tbucket: {}'.format(pid, bucket_id))
        except futures.CancelledError:
            return
        except Exception as e:
            logger.exception('TaskError\t pid: {}\tbucket: {}\tError: {}'.format(pid, bucket_id, e))

    def run_once(self):
        """Schedule one ``incr`` task for each randomly sampled bucket."""
        for bucket_id in random.sample(BUCKET_RANGE, self.workers):
            sleep = self.task_default_sleep * random.randint(*self.task_sleep_rand_range)
            future = self.pool.schedule(
                incr,
                args=(bucket_id,),
                kwargs={'sleep': sleep},
                timeout=self.task_timeout
            )
            future.add_done_callback(functools.partial(self.done_callback, bucket_id))

    def run_forever(self):
        """Call :meth:`run_once` in an endless loop, sleeping between rounds.

        Exceptions raised by a round are logged and do not stop the loop.
        """
        while True:
            try:
                self.run_once()
            except Exception as e:
                logger.exception('RunOnceError: {}'.format(e))
            time.sleep(self.poll_interval or (3 * self.task_timeout))

    def start(self):
        """Run the daemon loop and shut the pool down when it ends.

        On an ``Exception`` the pool is closed, given ten seconds, and then
        stopped. The pool is always joined before returning or propagating.
        """
        try:
            self.run_forever()
        except Exception as e:
            logger.exception('Error during running daemon: {}'.format(e))
            self.pool.close()
            time.sleep(10)
            self.pool.stop()
        finally:
            # Anything that is not an ``Exception`` (e.g. KeyboardInterrupt)
            # skips the handler above and leaves the pool running. Joining
            # a running pool is rejected, which would hide the original
            # error, so stop the pool first in that case.
            if self.pool.active:
                self.pool.stop()
            self.pool.join()
def find_tlds(self):
    """Check ``known_domain`` against every TLD in ``tld_list`` on a thread pool.

    Builds one ``<known_domain>.<tld>`` name per TLD, maps
    ``self.check_tld`` over them with ``self.timeout`` and prints what
    ``pool.map`` returned. Any exception is printed, not raised.
    """
    dom_list = [self.known_domain + '.' + tld for tld in self.tld_list]
    try:
        pool = ThreadPool(max_workers=self.max_workers,
                          max_tasks=self.max_tasks)
        try:
            results = pool.map(self.check_tld, dom_list, timeout=self.timeout)
        finally:
            # Always release the pool, even when scheduling fails;
            # otherwise its worker threads would be leaked.
            pool.close()
            pool.join()
        # NOTE(review): ``results`` is whatever ``pool.map`` returns, which
        # with Pebble is presumably a future rather than the result values
        # themselves. Confirm that this is the output you want printed.
        print(results)
    except Exception as e:
        print(repr(e))
def parallel_checks() -> None:
    """
    Do all the checks that don't change code and can run in parallel.

    When ``IS_GITLAB`` is set the chores run one after another in this
    process. Otherwise each chore is scheduled on a process pool, and the
    process exits with -1 if any task raises or reports "Abnormal".
    """
    chores = [
        do_mypy,
        do_detect_secrets,
        do_git_secrets,
        vulture,
        do_compile_py,
        do_lint,
        do_flake8,
        do_dodgy,
        do_bandit,
        do_python_taint,
        do_mccabe,
        do_check_manifest,
        do_liccheck,
    ]

    if IS_GITLAB:
        # other tasks assume there will be a LOC file by now.
        do_count_lines_of_code()
        for chore in chores:
            print(chore())
        return

    # can't do pyroma because that needs a package, which might not exist yet.
    pool = ProcessPool(12)
    scheduled = [pool.schedule(chore, args=()) for chore in chores]

    print("close & join")
    pool.close()
    pool.join()

    for finished in scheduled:
        # pylint: disable=broad-except
        try:
            outcome = finished.result()
            if finished.exception():
                print(finished.exception())
            print(outcome)
            if "Abnormal" in str(outcome):
                print("One or more parallel tasks failed.")
                sys.exit(-1)
        except Exception as ex:
            print(ex)
            sys.exit(-1)
def handle(self, *args, **options):
    """Reset a counter bucket and run two workloads against it in parallel.

    Reads ``trials`` and ``bucket`` from ``options``, zeroes the matching
    ``Counter`` row, then schedules ``run_atomic_transactions`` ("T1") and
    ``run_savepoints`` ("T2") on a two-worker pool and waits for both.
    """
    trials, bucket_id = options['trials'], options['bucket']

    pool = ProcessPool(max_workers=2)
    pool._start_pool()

    # Start from a clean counter for the selected bucket.
    bucket, _ = Counter.objects.get_or_create(bucket=bucket_id)
    bucket.count = 0
    bucket.save()

    workloads = (('T1', run_atomic_transactions), ('T2', run_savepoints))
    for label, workload in workloads:
        pool.schedule(workload, args=(label, bucket_id, trials))

    pool.close()
    pool.join()
class PebbleMap(PySAT):
    """PySAT concurrency backend that maps tasks over a Pebble process pool."""

    name = 'PySAT Concurrency: PebbleMap'

    def __init__(self, **kwargs):
        # The pool is created lazily by ``initialize``.
        self.pool = None
        super().__init__(**kwargs)

    def initialize(self, solver, **kwargs):
        """Create the process pool unless one already exists.

        Expects ``output`` and ``instance`` entries in ``kwargs``.
        """
        if self.pool is not None:
            kwargs['output'].debug(2, 2, 'Pool already inited')
            return

        self.pool = ProcessPool(
            max_workers=self.processes,
            initializer=self.init_func,
            initargs=(solver, kwargs['instance']),
        )
        kwargs['output'].debug(
            2, 2, 'Init pool with %d processes' % self.processes)

    def process(self, tasks: List[Task], **kwargs) -> List[Result]:
        """Solve ``tasks`` on the pool and return the measured results.

        Results are collected until the first error, which is reported via
        ``output`` and not raised. The pool is terminated afterwards
        unless ``self.keep`` is set.
        """
        output = kwargs['output']
        collected = []
        map_future = self.pool.map(self.solve_func, tasks)

        try:
            for solved, item in enumerate(map_future.result(), start=1):
                collected.append(item)
                output.debug(2, 3, 'Already solved %d tasks' % solved)
        except Exception as e:
            output.debug(0, 1, 'Error while fetching pool results: %s' % e)

        if not self.keep:
            self.terminate()

        measured = []
        for item in collected:
            measured.append(item.set_value(self.measure.get(item)))
        return measured

    def terminate(self):
        """Stop and join the pool, if any, and forget it."""
        if not self.pool:
            return
        self.pool.stop()
        self.pool.join()
        self.pool = None
class PebbleExecutor(concurrent.futures.Executor):
    """``concurrent.futures.Executor`` facade over a Pebble ``ProcessPool``.

    Every submitted call is scheduled with the timeout given at
    construction time. ``map`` is not supported.
    """

    def __init__(self, max_workers, timeout=None):
        """Create the pool.

        :param max_workers: number of worker processes.
        :param timeout: timeout applied to every submitted call.
        """
        self.pool = ProcessPool(max_workers=max_workers)
        self.timeout = timeout

    def submit(self, fn, *args, **kwargs):
        """Schedule ``fn(*args, **kwargs)`` on the pool and return its future."""
        # Forward keyword arguments too; dropping them would silently call
        # ``fn`` with different arguments than the caller asked for.
        return self.pool.schedule(fn, args=args, kwargs=kwargs,
                                  timeout=self.timeout)

    def map(self, func, *iterables, timeout=None, chunksize=1):
        """Not supported.

        :raises NotImplementedError: always.
        """
        raise NotImplementedError("This wrapper does not support `map`.")

    def shutdown(self, wait=True):
        """Shut the pool down and join it.

        :param wait: when true the pool is closed; when false it is
            stopped. In both cases the pool is joined afterwards.
        """
        if wait:
            log.info("Closing workers...")
            self.pool.close()
        else:
            log.info("Ending workers...")
            self.pool.stop()
        self.pool.join()
        log.info("Workers joined.")
def propagate(self, tasks: List[Task], **kwargs) -> List[Result]:
    """Run ``propagate_solve`` over ``tasks`` on a throw-away process pool.

    Expects ``output`` and ``instance`` entries in ``kwargs``. Results are
    collected until the first error, which is reported via ``output`` and
    not raised. The pool is then stopped and joined, and every collected
    result gets its value set from ``self.measure``.
    """
    output, instance = kwargs['output'], kwargs['instance']

    worker_pool = ProcessPool(
        max_workers=self.processes,
        initializer=propagate_init,
        initargs=(self.propagator, instance),
    )

    collected = []
    map_future = worker_pool.map(propagate_solve, tasks)
    try:
        for solved, item in enumerate(map_future.result(), start=1):
            collected.append(item)
            output.debug(2, 3, 'Already solved %d tasks' % solved)
    except Exception as e:
        output.debug(0, 1, 'Error while fetching pool results: %s' % e)

    worker_pool.stop()
    worker_pool.join()

    measured = []
    for item in collected:
        measured.append(item.set_value(self.measure.get(item)))
    return measured
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer even where `/` is
    # true division (Python 3, or `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() // 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. The response is
        handed to the worker through a temp file, and the worker's result
        is loaded back with load_object_from_temp_file.

        This method has two features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raises ScanMustStopException: When the pool refuses new tasks
        :raises TimeoutError: When the parser times out or its process dies
        :raises MemoryError: When the parser exceeded the memory limit
        """
        # Start the worker processes if needed
        self.start_workers()

        filename = write_http_response_to_temp_file(http_response)

        apply_args = (process_document_parser, filename, self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args,),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # Remove the temp file used to send data to the process
            remove_file_if_exists(filename)

            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            process_result = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)
        except ProcessExpired:
            # We reach here when the process died because of an error, we
            # handle this just like when the parser takes a lot of time and
            # we're unable to retrieve an answer from it
            msg = ('One of the parser processes died unexpectedly, this could'
                   ' be because of a bug, the operating system triggering OOM'
                   ' kills, etc. The scanner will continue with the next'
                   ' document, but the scan results might be inconsistent.')
            raise TimeoutError(msg)
        finally:
            # Remove the temp file used to send data to the process, we already
            # have the result, so this file is not needed anymore
            remove_file_if_exists(filename)

        # We still need to perform some error handling here...
        if isinstance(process_result, Error):
            if isinstance(process_result.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            process_result.reraise()

        try:
            parser_output = load_object_from_temp_file(process_result)
        except Exception as e:
            msg = 'Failed to deserialize sub-process result. Exception: "%s"'
            args = (e,)
            raise Exception(msg % args)

        # Hand the deserialized parser back to the caller, as the docstring
        # promises; without this the method would always return None.
        return parser_output
class Wintermute(discord.Client):
    """Discord client that answers messages on configured channels.

    Incoming messages are parsed with ``BotGram``; the response is rendered
    in a process pool with a timeout and sent back to the channel.
    """

    __channels = None
    __bot_prelude = None
    __timeout = None
    __parser = None
    __pool = None

    def __init__(self, channels=None, bot_prelude='[bot] ', timeout=10,
                 multiprocessing=1, loglevel=logging.INFO):
        """Configure the bot and start its process pool.

        :param channels: mapping of server name to the channel name the bot
            answers on; defaults to an empty mapping.
        :param bot_prelude: prefix used for bot messages.
        :param timeout: timeout for each scheduled response task.
        :param multiprocessing: number of pool workers.
        :param loglevel: level passed to ``logging.basicConfig``.
        """
        super().__init__()
        # A fresh dict per instance avoids sharing one mutable default.
        self.__channels = {} if channels is None else channels
        self.__bot_prelude = bot_prelude
        self.__timeout = timeout
        logging.basicConfig(level=loglevel)
        self.__parser = BotGram(prelude=bot_prelude)
        self.__pool = ProcessPool(max_workers=multiprocessing,
                                  initializer=seed)

    def __del__(self):
        # The pool is still None when __init__ failed before creating it.
        if self.__pool is not None:
            self.__pool.close()
            self.__pool.join()

    async def on_ready(self):
        """Log the login and set the bot's presence."""
        logging.info("Online as " + str(self.user.name))
        game = discord.Game()
        game.name = "Manipulating humanity"
        await self.change_presence(game=game)
        logging.info('Setup done')

    async def on_message(self, mess):
        """Log every message and answer those on a configured channel.

        Private messages and the bot's own messages are ignored.
        """
        if mess.channel.is_private:
            logging.info("new private message from " + str(mess.author))
        else:
            logging.info("new message on server " + str(mess.server) +
                         " and channel " + str(mess.channel) +
                         " from " + str(mess.author))
        logging.info("len: " + str(len(mess.content)))
        logging.info("content: " + str(mess.content))

        if mess.channel.is_private:
            return
        if mess.author == self.user:
            return

        # ``.get`` keeps servers missing from the channel map from raising
        # KeyError; such servers simply never match.
        if (mess.channel.permissions_for(mess.channel.server.me).send_messages
                and mess.channel.name == self.__channels.get(mess.server.name)):
            resp = self.__parser.parse(mess)
            if resp is not None:
                task = self.__pool.schedule(str,
                                            args=(resp, ),
                                            timeout=self.__timeout)
                asyncio.ensure_future(self.collect_response(
                    mess.channel, mess.author.mention, task),
                    loop=self.loop)

    async def collect_response(self, channel, mention, task):
        """Wait for ``task`` to finish and send its result to ``channel``.

        The task is polled every 0.1 seconds so the event loop is never
        blocked. Timeouts and HTTP 400 errors are reported to the channel.
        """
        try:
            while not task.done():
                await asyncio.sleep(0.1)
            result = task.result()
            await self.send_message(channel, result)
        except TimeoutError:
            await self.send_message(
                channel,
                self.__bot_prelude + mention + ' Your request timed out')
            task.cancel()
        except HTTPException as e:
            if e.response.status == 400:
                await self.send_message(
                    channel,
                    self.__bot_prelude + mention +
                    ' Error: Request reply was probably too long')
            else:
                # Log other HTTP failures so they are not silently lost.
                logging.warning("Failed to send response: %s", e)
class EvaluationPool:
    """Wrapper around ProcessPool.

    Essentially a ProcessPool that can only evaluate configs,
    with some tweaks and caching. Aims to do the following:

    - Instantiate evaluators once in the workers to reduce overhead
    - Use a cache to not have to run evaluations twice
    - Provide common format for results, with support for "ok" and "error" status
    - Catches specified, but not all, exceptions
    - Provides timeouts backed by a sufficiently brutal approach to killing processes*

    We therefore sacrifice a little bit of generality for convenience in our
    particular domain, which is just how we like it.

    Note that this deliberately doesn't do any event loop management, it simply
    provides the `.schedule` function which schedules the evaluation of a config,
    and a `.finish` function which obtains the result, with error checking, of that
    evaluation.

    *** WARNING: hash keys in the cache (called "evals") are solely based on the
    config to be evaluated. So if some `evals` with a different underlying evaluator
    get passed, things will break in undefined ways. Since you are not expected to
    ever touch this class without `Run` mediating, I don't think this is a problem.
    But be careful out there.

    *** WARNING: macOS has some issues with multiprocessing and fork safety. This
    should not be a problem with this implementation, but if the models evaluated
    do something fancy, this might be the problem. So if you encounter something
    like `RuntimeError: Unexpected error within the Pool`, please check whether it
    persists on Linux. (I've observed this in particular with using something that
    relies on sqlite3 and attempts to write things to disk concurrently.)

    ---

    * We interface with external code that doesn't always play by the rules, and in
    particular is quite fond of not reacting to SIGTERM. The `concurrent.futures`
    `ProcessPoolExecutor` doesn't seem to be able to enforce a timeout in such cases.
    """

    def __init__(
        self,
        max_workers,
        evaluator_config,
        evaluator_context=None,
        evals=None,
        trial_timeout=None,
        caught_exceptions=(TimeoutError, ),
    ):
        """Create the worker pool and the result cache.

        Args:
            max_workers: number of worker processes.
            evaluator_config: passed to the worker ``initializer``.
            evaluator_context: passed to the worker ``initializer``;
                defaults to an empty dict.
            evals: result cache; a new ``ResultDB`` when ``None``.
            trial_timeout: timeout applied to each scheduled evaluation.
            caught_exceptions: exception types that ``finish`` turns into
                an error result, not a raised exception.
        """
        # A fresh dict per instance avoids sharing one mutable default.
        if evaluator_context is None:
            evaluator_context = {}

        self.trial_timeout = trial_timeout
        self.pool = ProcessPool(
            initializer=initializer,
            initargs=(evaluator_config, evaluator_context),
            max_workers=max_workers,
        )

        if platform.system() == "Darwin" and max_workers > 1:
            logger.warning(
                "Parallel support on macOS is a bit wonky. Proceed with caution."
            )

        if evals is None:
            evals = ResultDB()

        self.evals = evals
        self.caught_exceptions = caught_exceptions

    def schedule(self, suggestion):
        """Schedule evaluation of a suggestion.

        This also checks the cache in the background, and
        creates a faux future to return the cached result.
        This is slightly inefficient, but it substantially
        reduces the complexity of the interface: We can now
        always expect a future as a result, and the re-submission
        can be handled in a unified way by the `Run`.

        (You can't simply keep requesting suggestions until you
        hit something is not in the cache, this leads to deadlocks
        when the search space has been exhausted.)
        """
        eid = compute_hash(suggestion)

        if eid in self.evals:
            result = self.evals.get_result(eid)
            future = self.pool.schedule(passthrough, args=(result, ))
        else:
            future = self.pool.schedule(evaluate,
                                        args=(eid, suggestion),
                                        timeout=self.trial_timeout)

        future.eid = eid  # annotate with hash key in evals
        future.suggestion = suggestion

        return future

    def finish(self, future):
        """Obtain result of an evaluation future, catch errors, update caches.

        Should only be called with a finished future... but it's not a problem
        if it's not. The call to `future.result()` will trigger execution.

        Returns the evaluation result, or an ``{"error": ...}`` dict when one
        of ``caught_exceptions`` was raised. Any other exception is logged
        and re-raised.
        """
        try:
            result = future.result()
            self.evals.submit_result(future.eid, result)
            return result

        except self.caught_exceptions as e:
            trace = traceback.format_exc()
            result = {
                "error": {
                    "error": e.__class__.__name__,
                    "error_text": str(e),
                    "traceback": trace,
                    "suggestion": future.suggestion,
                }
            }
            self.evals.submit_result(future.eid, result)
            return result

        except Exception as e:
            # uncaught exception, print suggestion and exit
            trace = traceback.format_exc()

            message = f"Unexpected error {e.__class__.__name__} evaluating a trial.\n"
            message += f"Error string: {e}.\n"
            message += f"Suggestion: {future.suggestion}.\n"
            message += f"Traceback:\n{trace}."

            logger.error(message)

            # Bare raise re-raises the active exception unchanged.
            raise

    def shutdown(self):
        """Stop the pool and give it one second to join."""
        self.pool.stop()  # no point in waiting for things
        try:
            self.pool.join(timeout=1.0)
            logger.info("Successfully and peacefully shut down pool.")
        except TimeoutError:
            logger.info(
                "Failed to peacefully shut down pool... but no worries.")
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer even where `/` is
    # true division (Python 3, or `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() // 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raises BaseFrameworkException: When the parser hits the timeout
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            # The worker returns an Error instance when the parser raised
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Use floor division so the worker count stays an integer even where
    # `/` is true division (Python 3, or `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() // 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raises BaseFrameworkException: When the parser hits the timeout
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            # The worker returns an Error instance when the parser raised
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer even where `/` is
    # true division (Python 3, or `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() // 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raises ScanMustStopException: When the pool refuses new tasks
        :raises TimeoutError: When the parser hits the timeout
        :raises MemoryError: When the parser exceeded the memory limit
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args, ),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            parser_output = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)

        # We still need to perform some error handling here...
        if isinstance(parser_output, Error):
            if isinstance(parser_output.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            parser_output.reraise()

        # Success!
        return parser_output