class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns the results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this
    # constant.
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,
                                                   self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args,),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError, rte:
            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            #   RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more HTTP
            # responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            parser_output = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds to'
                   ' complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)

        # We still need to perform some error handling here...
        if isinstance(parser_output, Error):
            if isinstance(parser_output.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s'
                       ' bytes while trying to parse "%s". The parser was'
                       ' stopped in order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            parser_output.reraise()

        # Success!
        return parser_output
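    # A minimal sketch of the calling contract for get_document_parser_for(),
    # written as a comment to keep this module importable. The `http_response`
    # name stands in for a w3af HTTPResponse instance obtained elsewhere:
    #
    #   parser = MultiProcessingDocumentParser()
    #   try:
    #       document_parser = parser.get_document_parser_for(http_response)
    #   except TimeoutError:
    #       pass    # the worker was killed after PARSER_TIMEOUT seconds
    #   except MemoryError:
    #       pass    # the worker exceeded MEMORY_LIMIT bytes
    #   finally:
    #       parser.stop_workers()
    #
    # Note that other exceptions raised inside the worker are re-raised in
    # the caller via parser_output.reraise()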
    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter. The
        parsing and all the lxml work is done in another process, and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe.

        Some things to note:

            * Not all responses can be parsed, so I need to call
              DocumentParser and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In that case just return
              an empty list

            * Just like get_document_parser_for we have a timeout in place;
              when we hit the timeout we just return an empty list. This is
              not the best thing to do, but it makes the plugin code easier
              to write (plugins would ignore the error anyway)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't HTML at all
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
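

# A minimal, runnable sketch of how this class is meant to be used end to
# end. The HTTPResponse / Headers / URL import paths and constructor
# arguments below are assumptions based on the rest of the w3af code base;
# adjust them to your checkout if they differ.
if __name__ == '__main__':
    from w3af.core.data.url.HTTPResponse import HTTPResponse
    from w3af.core.data.dc.headers import Headers
    from w3af.core.data.parsers.doc.url import URL

    url = URL('http://w3af.org/')
    headers = Headers([(u'content-type', u'text/html')])
    http_response = HTTPResponse(200,
                                 u'<html><a href="/about">About</a></html>',
                                 headers, url, url)

    parser = MultiProcessingDocumentParser()

    try:
        # Parse in a worker process; raises TimeoutError / MemoryError when
        # the worker exceeds PARSER_TIMEOUT / MEMORY_LIMIT
        document_parser = parser.get_document_parser_for(http_response)
        print document_parser.get_references()

        # Extract <a> tags without building a full DocumentParser; this
        # returns an empty list on timeout or parser error
        for tag in parser.get_tags_by_filter(http_response, ('a',),
                                             yield_text=True):
            print tag
    finally:
        # Always stop the workers to avoid leaking child processes
        parser.stop_workers()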