class ParserCache(object):
    """
    This class is a document parser cache.

    :author: Andres Riancho ([email protected])
    """
    LRU_LENGTH = 40
    MAX_CACHEABLE_BODY_LEN = 1024 * 1024
    PARSER_TIMEOUT = 60  # in seconds
    DEBUG = False
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    def __init__(self):
        self._cache = SynchronizedLRUDict(self.LRU_LENGTH)
        self._pool = None
        self._processes = None
        self._parser_finished_events = {}
        self._start_lock = threading.RLock()

        # These are here for debugging:
        self._archive = set()
        self._from_LRU = 0.0
        self._calculated_more_than_once = 0.0
        self._total = 0.0

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # Keep track of which pid is processing which http response
                # pylint: disable=E1101
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=25,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None
            self._processes = None

        # We don't need this data anymore
        self._cache.clear()

        if self.DEBUG:
            re_calc_rate = (self._calculated_more_than_once / self._total)
            print('parser_cache LRU rate: %s' % (self._from_LRU / self._total))
            print('parser_cache re-calculation rate: %s' % re_calc_rate)
            print('parser_cache size: %s' % self.LRU_LENGTH)

    def get_cache_key(self, http_response):
        """
        Before I used md5, but I realized that it was unnecessary. I
        experimented a little bit with python's hash functions and the builtin
        hash was the fastest.

        At first I thought that the built-in hash wasn't good enough, as it
        could create collisions... but... given that the LRU has only 40
        positions, the real probability of a collision is too low.

        :return: The key to be used in the cache for storing this http_response
        """
        # @see: test_bug_13_Dec_2012 to understand why we concat the uri to the
        #       body before hashing
        uri_str = http_response.get_uri().url_string.encode('utf-8')

        body_str = http_response.body
        if isinstance(body_str, unicode):
            body_str = body_str.encode('utf-8', 'replace')

        _to_hash = body_str + uri_str

        # Added adler32 after finding some hash() collisions in builds
        hash_string = str(hash(_to_hash))
        hash_string += str(zlib.adler32(_to_hash))
        return hash_string

    def should_cache(self, http_response):
        """
        Defines if this http_response parser should be cached or not

        :param http_response: The http response instance
        :return: True if we should cache the parser for this response
        """
        return len(http_response.get_body()) < self.MAX_CACHEABLE_BODY_LEN

    def _test_parse_http_response(self, http_response, *args):
        """
        Left here for testing!
        """
        return DocumentParser(http_response)

    def _parse_http_response_in_worker(self, http_response, hash_string):
        """
        This parses the http_response in a pool worker.

        This has two features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :return: The DocumentParser instance
        """
        event = multiprocessing.Event()
        self._parser_finished_events[hash_string] = event

        # Start the worker processes if needed
        self.start_workers()

        apply_args = (ProcessDocumentParser,
                      http_response,
                      self._processes,
                      hash_string)

        # Push the task to the workers
        result = self._pool.apply_async(apply_with_return_error, (apply_args,))

        try:
            parser_output = result.get(timeout=self.PARSER_TIMEOUT)
        except multiprocessing.TimeoutError:
            # Near the timeout error, so we make sure that the pid is still
            # running our "buggy" input
            pid = self._processes.pop(hash_string, None)
            if pid is not None:
                try:
                    os.kill(pid, signal.SIGTERM)
                except OSError, ose:
                    msg = ('An error occurred while killing the parser'
                           ' process: "%s"')
                    om.out.debug(msg % ose)

            msg = ('[timeout] The parser took more than %s seconds to complete'
                   ' parsing of "%s", killed it!')
            om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url()))

            # Act just like when there is no parser
            msg = 'There is no parser for "%s".' % http_response.get_url()
            raise BaseFrameworkException(msg)
        else:
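
# A minimal, standalone sketch of the cache-key technique described in
# get_cache_key() above: concatenate the response body and URI, then combine
# the builtin hash() with zlib.adler32() to reduce the chance of collisions.
# This is Python 2 (it uses `unicode`, like the code above); the sample
# body/URI values and the standalone function name are illustrative only.
import zlib


def build_cache_key(body_str, uri_str):
    # Mirror of ParserCache.get_cache_key(), but taking plain strings
    if isinstance(body_str, unicode):
        body_str = body_str.encode('utf-8', 'replace')

    _to_hash = body_str + uri_str.encode('utf-8')

    # Two cheap hashes concatenated: builtin hash() plus adler32
    return str(hash(_to_hash)) + str(zlib.adler32(_to_hash))


print(build_cache_key(u'<html>...</html>', u'http://example.com/'))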
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._processes = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # pylint: disable=E1101
                # Keep track of which pid is processing which http response
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None

        if self._processes is not None:
            self._processes.clear()
            self._processes = None

    def _kill_parser_process(self, hash_string, http_response):
        """
        Kill the process that's handling the parsing of http_response which
        can be identified by hash_string

        :param hash_string: The hash for the http_response
        :param http_response: The HTTP response which is being parsed
        :return: None
        """
        # Near the timeout error, so we make sure that the pid is still
        # running our "buggy" input
        pid = self._processes.pop(hash_string, None)
        if pid is not None:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError, ose:
                msg = ('An error occurred while killing the parser'
                       ' process: "%s"')
                om.out.debug(msg % ose)

        msg = ('[timeout] The parser took more than %s seconds to complete'
               ' parsing of "%s", killed it!')

        if self.PROFILING_ENABLED:
            msg += (' You are running a profiling session which requires more'
                    ' CPU and resources to be run; the'
                    ' MultiProcessingDocumentParser failed to parse the HTML'
                    ' document. Try to increase the PARSER_TIMEOUT and try'
                    ' again.\n\n'
                    'This issue invalidates the profiling session!\n\n'
                    'See issue #9713 for more information'
                    ' https://github.com/andresriancho/w3af/issues/9713')

        log_function = om.out.error if self.PROFILING_ENABLED else om.out.debug
        log_function(msg % (self.PARSER_TIMEOUT, http_response.get_url()))
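
# A self-contained sketch of the timeout-and-kill pattern used by
# _parse_http_response_in_worker() and _kill_parser_process(): the worker
# records its pid in a shared dict, the parent waits on the AsyncResult with
# a timeout, and on TimeoutError it SIGTERMs the exact worker that took the
# task. `hung_parser`, the key 'abc123' and the 2 second timeout are
# illustrative only; os.kill() with SIGTERM assumes a POSIX platform.
import os
import signal
import time
import multiprocessing


def hung_parser(worker_pids, hash_string):
    # Record which pid is parsing which response, then simulate a parser
    # that never finishes
    worker_pids[hash_string] = os.getpid()
    time.sleep(60)
    return 'parsed'


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    worker_pids = manager.dict()
    pool = multiprocessing.Pool(processes=1)

    result = pool.apply_async(hung_parser, (worker_pids, 'abc123'))

    try:
        parser_output = result.get(timeout=2)
    except multiprocessing.TimeoutError:
        # The worker exceeded the timeout: kill the pid that took the task
        pid = worker_pids.pop('abc123', None)
        if pid is not None:
            os.kill(pid, signal.SIGTERM)
        print('[timeout] parser killed, pid was %s' % pid)

    pool.terminate()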
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    # in seconds
    PARSER_TIMEOUT = 10
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    def __init__(self):
        self._pool = None
        self._processes = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:
                # pylint: disable=E1101
                # Keep track of which pid is processing which http response
                self._processes = manager.dict()
                # pylint: enable=E1101

                # The pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         maxtasksperchild=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.terminate()
            self._pool = None

        if self._processes is not None:
            self._processes.clear()
            self._processes = None

    def _kill_parser_process(self, hash_string, http_response):
        """
        Kill the process that's handling the parsing of http_response which
        can be identified by hash_string

        :param hash_string: The hash for the http_response
        :param http_response: The HTTP response which is being parsed
        :return: None
        """
        # Near the timeout error, so we make sure that the pid is still
        # running our "buggy" input
        pid = self._processes.pop(hash_string, None)
        if pid is not None:
            try:
                os.kill(pid, signal.SIGTERM)
            except OSError, ose:
                msg = ('An error occurred while killing the parser'
                       ' process: "%s"')
                om.out.debug(msg % ose)

        msg = ('[timeout] The parser took more than %s seconds to complete'
               ' parsing of "%s", killed it!')

        if user_wants_memory_profiling() or user_wants_pytracemalloc():
            msg += (' Keep in mind that you\'re profiling memory usage and'
                    ' there is a known bug where memory profilers break the'
                    ' parser cache. See issue #9713 for more information'
                    ' https://github.com/andresriancho/w3af/issues/9713')

        om.out.debug(msg % (self.PARSER_TIMEOUT, http_response.get_url()))
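
# A minimal sketch of the worker-initializer pattern used in start_workers():
# every pool process runs the initializer once at startup and keeps a handle
# to the parent's log queue, so workers can push log messages back to the
# main process. `init_worker` and `parse_item` below are illustrative
# stand-ins for the real w3af functions, and the stdlib multiprocessing.Pool
# stands in for w3af's ProcessPool.
import multiprocessing

_log_queue = None


def init_worker(log_queue):
    # Executed once in each freshly spawned worker process
    global _log_queue
    _log_queue = log_queue


def parse_item(item):
    pid = multiprocessing.current_process().pid
    _log_queue.put('worker %s parsed %r' % (pid, item))
    return item * 2


if __name__ == '__main__':
    manager = multiprocessing.Manager()
    log_queue = manager.Queue()

    pool = multiprocessing.Pool(processes=2,
                                maxtasksperchild=20,
                                initializer=init_worker,
                                initargs=(log_queue,))

    print(pool.map(parse_item, range(4)))

    # Drain the log messages the workers sent back to us
    while not log_queue.empty():
        print(log_queue.get())

    pool.terminate()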