Example #1
 def test_process_pool_join_workers(self):
     """Process Pool Spawn no worker is running after join."""
     pool = ProcessPool(max_workers=4)
     pool.schedule(function, args=[1])
     pool.stop()
     pool.join()
     self.assertEqual(len(pool._pool_manager.worker_manager.workers), 0)
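
These pool tests call module-level helpers such as `function` and `long_function` (and, in later examples, fixtures like `mp_context` and `long_initializer`) that are not shown in the snippets. The definitions below are only a minimal sketch of plausible stand-ins together with the `ProcessPool` import the snippets assume; the real test suite's helpers may differ.

import time

from pebble import ProcessPool


def function(argument):
    # Trivial task: simply echo the argument back through the pool.
    return argument


def long_function(value=1):
    # Long-running task used to exercise join()/timeout behaviour.
    time.sleep(value)
    return value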
Example #2
 def test_process_pool_stop_stopped(self):
     """Process Pool Spawn is stopped after stop."""
     pool = ProcessPool()
     pool.schedule(function, args=[1])
     pool.stop()
     pool.join()
     self.assertFalse(pool.active)
Example #3
 def test_process_pool_stop_stopped(self):
     """Process Pool Fork is stopped after stop."""
     pool = ProcessPool(max_workers=1)
     pool.schedule(function, args=[1])
     pool.stop()
     pool.join()
     self.assertFalse(pool.active)
Example #4
 def test_process_pool_close_stopped(self):
     """Process Pool Forkserver is stopped after close."""
     pool = ProcessPool(max_workers=1, context=mp_context)
     pool.schedule(function, args=[1])
     pool.close()
     pool.join()
     self.assertFalse(pool.active)
Example #5
 def test_process_pool_close_stopped(self):
     """Process Pool Fork is stopped after close."""
     pool = ProcessPool(max_workers=1)
     pool.schedule(function, args=[1])
     pool.close()
     pool.join()
     self.assertFalse(pool.active)
Example #6
 def test_process_pool_join_workers(self):
     """Process Pool Fork no worker is running after join."""
     pool = ProcessPool(max_workers=4)
     pool.schedule(function, args=[1])
     pool.stop()
     pool.join()
     self.assertEqual(len(pool._pool_manager.worker_manager.workers), 0)
Example #7
 def test_process_pool_close_stopped(self):
     """Process Pool Spawn is stopped after close."""
     pool = ProcessPool()
     pool.schedule(function, args=[1])
     pool.close()
     pool.join()
     self.assertFalse(pool.active)
Example #8
    def test_process_pool_stop_large_data(self):
        """Process Pool Fork is stopped if large data is sent on the channel."""
        data = "a" * 1098 * 1024 * 50  # 50 Mb
        pool = ProcessPool(max_workers=1)
        pool.schedule(function, args=[data])
        pool.stop()
        pool.join()

        self.assertFalse(pool.active)
Example #9
 def test_process_pool_close_futures(self):
     """Process Pool Spawn all futures are performed on close."""
     futures = []
     pool = ProcessPool()
     for index in range(10):
         futures.append(pool.schedule(function, args=[index]))
     pool.close()
     pool.join()
     # Assert explicitly: a bare map() is lazy in Python 3 and would never run the checks.
     for future in futures:
         self.assertTrue(future.done())
Example #10
 def test_process_pool_join_futures_timeout(self):
     """Process Pool Fork TimeoutError is raised if join on long futures."""
     pool = ProcessPool(max_workers=1)
     for _ in range(2):
         pool.schedule(long_function)
     pool.close()
     self.assertRaises(TimeoutError, pool.join, 0.4)
     pool.stop()
     pool.join()
Example #11
 def test_process_pool_join_futures_timeout(self):
     """Process Pool Spawn TimeoutError is raised if join on long tasks."""
     pool = ProcessPool()
     for _ in range(2):
         pool.schedule(long_function)
     pool.close()
     self.assertRaises(TimeoutError, pool.join, 0.4)
     pool.stop()
     pool.join()
Example #12
 def test_process_pool_close_futures(self):
     """Process Pool Fork all futures are performed on close."""
     futures = []
     pool = ProcessPool(max_workers=1)
     for index in range(10):
         futures.append(pool.schedule(function, args=[index]))
     pool.close()
     pool.join()
     # Assert explicitly: a bare map() is lazy in Python 3 and would never run the checks.
     for future in futures:
         self.assertTrue(future.done())
Example #13
 def test_process_pool_stop_futures(self):
     """Process Pool Fork not all futures are performed on stop."""
     futures = []
     pool = ProcessPool(max_workers=1)
     for index in range(10):
         futures.append(pool.schedule(function, args=[index]))
     pool.stop()
     pool.join()
     self.assertTrue(len([f for f in futures if not f.done()]) > 0)
Example #14
    def test_process_pool_stop_large_data(self):
        """Process Pool Spawn is stopped if large data is sent on the channel."""
        data = "a" * 4098 * 1024
        pool = ProcessPool(initializer=long_initializer)
        pool.schedule(function, args=[data])
        pool.stop()
        pool.join()

        self.assertFalse(pool.active)
Example #15
 def test_process_pool_stop_futures(self):
     """Process Pool Spawn not all futures are performed on stop."""
     futures = []
     pool = ProcessPool()
     for index in range(10):
         futures.append(pool.schedule(function, args=[index]))
     pool.stop()
     pool.join()
     self.assertTrue(len([f for f in futures if not f.done()]) > 0)
Example #16
class CounterDaemon(object):

    def __init__(self, workers=1, poll_interval=None, max_tasks=100, task_timeout=0.1,
                 task_default_sleep=0.01, task_sleep_rand_range=(1, 20)):
        self.workers = workers
        self.poll_interval = poll_interval
        self.max_tasks = max_tasks
        self.task_timeout = task_timeout
        self.task_default_sleep = task_default_sleep
        self.task_sleep_rand_range = task_sleep_rand_range

        self.pool = ProcessPool(max_workers=self.workers, max_tasks=self.max_tasks)
        self.pool._start_pool()

    def done_callback(self, bucket_id, future):
        pid = os.getpid()
        try:
            result = future.result()
            logger.debug('Result: {}\tpid: {}\tbucket: {}'.format(result, pid, bucket_id))
        except futures.TimeoutError as e:
            logger.warning('TimeoutError\tpid: {}\tbucket: {}'.format(pid, bucket_id))
        except futures.CancelledError:
            return
        except Exception as e:
            logger.exception('TaskError\t pid: {}\tbucket: {}\tError: {}'.format(pid, bucket_id, e))

    def run_once(self):
        for bucket_id in random.sample(BUCKET_RANGE, self.workers):
            sleep = self.task_default_sleep * random.randint(*self.task_sleep_rand_range)
            future = self.pool.schedule(
                incr,
                args=(bucket_id,),
                kwargs={'sleep': sleep},
                timeout=self.task_timeout
            )
            future.add_done_callback(functools.partial(self.done_callback, bucket_id))

    def run_forever(self):
        while True:
            try:
                self.run_once()
            except Exception as e:
                logger.exception('RunOnceError: {}'.format(e))
            time.sleep(self.poll_interval or (3 * self.task_timeout))

    def start(self):
        try:
            self.run_forever()
        except Exception as e:
            logger.exception('Error during running daemon: {}'.format(e))
            self.pool.close()
            time.sleep(10)
            self.pool.stop()
        finally:
            self.pool.join()
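
`incr`, `BUCKET_RANGE` and `logger` are module-level names the daemon above relies on but which are not part of the snippet. The sketch below only shows an assumed shape for them, not the original project code.

import logging
import time

BUCKET_RANGE = range(10)  # assumed set of bucket ids to sample from
logger = logging.getLogger(__name__)


def incr(bucket_id, sleep=0.0):
    # Assumed task body: simulate some work for `sleep` seconds, then report
    # which bucket was processed. The real project presumably increments a
    # shared counter keyed by bucket_id here.
    time.sleep(sleep)
    return 'bucket {} incremented'.format(bucket_id)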
Example #17
 def find_tlds(self):
     dom_list = [self.known_domain + '.' + tld for tld in self.tld_list]
     try:
         pool = ThreadPool(max_workers=self.max_workers,
                           max_tasks=self.max_tasks)
         results = pool.map(self.check_tld, dom_list, timeout=self.timeout)
         pool.close()
         pool.join()
         print(results)
     except Exception as e:
         print(repr(e))
         pass
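
Note that pebble's `map` returns a single future, so `print(results)` above prints the future object rather than the individual outcomes. To consume per-domain results (and surface per-item timeouts) one would typically iterate over `results.result()`, roughly as in this standalone sketch; `check` is only a stand-in for `self.check_tld`.

from concurrent.futures import TimeoutError

from pebble import ThreadPool


def check(domain):
    # Stand-in for the real per-domain check.
    return domain


with ThreadPool(max_workers=4) as pool:
    future = pool.map(check, ['example.com', 'example.org'], timeout=5)
    iterator = future.result()
    while True:
        try:
            print(next(iterator))
        except StopIteration:
            break
        except TimeoutError:
            print('one of the checks timed out')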
Example #18
def parallel_checks() -> None:
    """
    Do all the checks that don't change code and can run in parallel.
    """
    chores = [
        do_mypy,
        do_detect_secrets,
        do_git_secrets,
        vulture,
        do_compile_py,
        do_lint,
        do_flake8,
        do_dodgy,
        do_bandit,
        do_python_taint,
        do_mccabe,
        do_check_manifest,
        do_liccheck,
    ]
    if IS_GITLAB:
        # other tasks assume there will be a LOC file by now.
        do_count_lines_of_code()
        for chore in chores:
            print(chore())
        return

    # can't do pyroma because that needs a package, which might not exist yet.

    pool = ProcessPool(12)  # max_workers=len(chores))  # cpu_count())
    # log_to_stderr(logging.DEBUG)
    tasks = []
    for chore in chores:
        tasks.append(pool.schedule(chore, args=()))

    print("close & join")
    pool.close()
    pool.join()

    for current_task in tasks:
        # pylint: disable=broad-except
        try:
            result = current_task.result()
            exception = current_task.exception()
            if exception:
                print(current_task.exception())
            print(result)
            if "Abnormal" in str(result):
                print("One or more parallel tasks failed.")
                sys.exit(-1)
        except Exception as ex:
            print(ex)
            sys.exit(-1)
Example #19
    def handle(self, *args, **options):
        trials = options['trials']
        bucket_id = options['bucket']

        pool = ProcessPool(max_workers=2)
        pool._start_pool()

        bucket, _ = Counter.objects.get_or_create(bucket=bucket_id)
        bucket.count = 0
        bucket.save()

        future_1 = pool.schedule(run_atomic_transactions,
                                 args=('T1', bucket_id, trials))
        future_2 = pool.schedule(run_savepoints,
                                 args=('T2', bucket_id, trials))
        pool.close()
        pool.join()
Example #20
class PebbleMap(PySAT):
    name = 'PySAT Concurrency: PebbleMap'

    def __init__(self, **kwargs):
        self.pool = None
        super().__init__(**kwargs)

    def initialize(self, solver, **kwargs):
        if self.pool is not None:
            kwargs['output'].debug(2, 2, 'Pool already inited')
        else:
            self.pool = ProcessPool(max_workers=self.processes,
                                    initializer=self.init_func,
                                    initargs=(solver, kwargs['instance']))
            kwargs['output'].debug(
                2, 2, 'Init pool with %d processes' % self.processes)

    def process(self, tasks: List[Task], **kwargs) -> List[Result]:
        output = kwargs['output']
        results = []
        future = self.pool.map(self.solve_func, tasks)

        # timer = Timer(20., future.cancel, ())
        # timer.start()
        try:
            for result in future.result():
                results.append(result)
                output.debug(2, 3, 'Already solved %d tasks' % len(results))
        except Exception as e:
            output.debug(0, 1, 'Error while fetching pool results: %s' % e)

        # if timer.is_alive():
        #     timer.cancel()

        if not self.keep:
            self.terminate()
        return [
            result.set_value(self.measure.get(result)) for result in results
        ]

    def terminate(self):
        if self.pool:
            self.pool.stop()
            self.pool.join()
            self.pool = None
Example #21
class PebbleExecutor(concurrent.futures.Executor):
    def __init__(self, max_workers, timeout=None):
        self.pool = ProcessPool(max_workers=max_workers)
        self.timeout = timeout

    def submit(self, fn, *args, **kwargs):
        return self.pool.schedule(fn, args=args, timeout=self.timeout)

    def map(self, func, *iterables, timeout=None, chunksize=1):
        raise NotImplementedError("This wrapper does not support `map`.")

    def shutdown(self, wait=True):
        if wait:
            log.info("Closing workers...")
            self.pool.close()
        else:
            log.info("Ending workers...")
            self.pool.stop()
        self.pool.join()
        log.info("Workers joined.")
Example #22
    def propagate(self, tasks: List[Task], **kwargs) -> List[Result]:
        output, instance = kwargs['output'], kwargs['instance']

        pool = ProcessPool(
            max_workers=self.processes,
            initializer=propagate_init,
            initargs=(self.propagator, instance)
        )
        results = []
        future = pool.map(propagate_solve, tasks)
        try:
            for result in future.result():
                results.append(result)
                output.debug(2, 3, 'Already solved %d tasks' % len(results))
        except Exception as e:
            output.debug(0, 1, 'Error while fetching pool results: %s' % e)
        pool.stop()
        pool.join()

        return [result.set_value(self.measure.get(result)) for result in results]
Example #23
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        filename = write_http_response_to_temp_file(http_response)

        apply_args = (process_document_parser,
                      filename,
                      self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args,),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # Remove the temp file used to send data to the process
            remove_file_if_exists(filename)

            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            process_result = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)
        except ProcessExpired:
            # We reach here when the process died because of an error, we
            # handle this just like when the parser takes a lot of time and
            # we're unable to retrieve an answer from it
            msg = ('One of the parser processes died unexpectedly, this could'
                   ' be because of a bug, the operating system triggering OOM'
                   ' kills, etc. The scanner will continue with the next'
                   ' document, but the scan results might be inconsistent.')
            raise TimeoutError(msg)
        finally:
            # Remove the temp file used to send data to the process, we already
            # have the result, so this file is not needed anymore
            remove_file_if_exists(filename)

        # We still need to perform some error handling here...
        if isinstance(process_result, Error):
            if isinstance(process_result.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            process_result.reraise()

        try:
            parser_output = load_object_from_temp_file(process_result)
        except Exception as e:
            msg = 'Failed to deserialize sub-process result. Exception: "%s"'
            args = (e,)
            raise Exception(msg % args)
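
`init_worker` itself is not included in the example; judging from the `initargs`, it receives the log queue and the memory limit. The sketch below is only a guess at its shape: the logging hook is a hypothetical helper, and the `resource`-based cap is one common way to make a runaway parser fail with `MemoryError` instead of tripping the OS OOM killer.

import resource


def init_worker(log_queue, mem_limit=None):
    # Hypothetical initializer matching initargs=(log_queue, MEMORY_LIMIT):
    # attach the worker's logging to the queue shared with the main process,
    # then cap the worker's address space.
    _attach_worker_logging(log_queue)  # hypothetical helper, not w3af's actual code
    if mem_limit:
        resource.setrlimit(resource.RLIMIT_AS, (mem_limit, mem_limit))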
Example #24
class Wintermute(discord.Client):
    __channels = None
    __bot_prelude = None
    __timeout = None
    __parser = None
    __pool = None

    def __init__(self,
                 channels={},
                 bot_prelude='[bot] ',
                 timeout=10,
                 multiprocessing=1,
                 loglevel=logging.INFO):
        super().__init__()

        self.__channels = channels
        self.__bot_prelude = bot_prelude
        self.__timeout = timeout
        logging.basicConfig(level=loglevel)

        self.__parser = BotGram(prelude=bot_prelude)
        self.__pool = ProcessPool(max_workers=multiprocessing,
                                  initializer=seed)

    def __del__(self):
        self.__pool.close()
        self.__pool.join()

    async def on_ready(self):
        logging.info("Online as " + str(self.user.name))
        game = discord.Game()
        game.name = "Manipulating humanity"
        await self.change_presence(game=game)
        logging.info('Setup done')

    async def on_message(self, mess):
        if mess.channel.is_private:
            logging.info("new private message from " + str(mess.author))
        else:
            logging.info("new message on server " + str(mess.server) +
                         " and channel " + str(mess.channel) + " from " +
                         str(mess.author))
        logging.info("len: " + str(len(mess.content)))
        logging.info("content: " + str(mess.content))
        if mess.channel.is_private:
            return

        if mess.author == self.user:
            return

        if (mess.channel.permissions_for(mess.channel.server.me).send_messages
                and mess.channel.name == self.__channels[mess.server.name]):
            resp = self.__parser.parse(mess)
            if resp is not None:
                task = self.__pool.schedule(str,
                                            args=(resp, ),
                                            timeout=self.__timeout)
                asyncio.ensure_future(self.collect_response(
                    mess.channel, mess.author.mention, task),
                                      loop=self.loop)

    async def collect_response(self, channel, mention, task):
        try:
            while not task.done():
                await asyncio.sleep(0.1)
            result = task.result()
            await self.send_message(channel, result)
        except TimeoutError:
            await self.send_message(
                channel,
                self.__bot_prelude + mention + ' Your request timed out')
            task.cancel()
        except HTTPException as e:
            if e.response.status == 400:
                await self.send_message(
                    channel, self.__bot_prelude + mention +
                    ' Error: Request reply was probably too long')
Example #25
class EvaluationPool:
    """Wrapper around ProcessPool.

    Essentially a ProcessPool that can only evaluate configs,
    with some tweaks and caching.

    Aims to do the following:
        - Instantiate evaluators once in the workers to reduce overhead
        - Use a cache to not have to run evaluations twice
        - Provide common format for results, with support for "ok" and "error" status
        - Catches specified, but not all, exceptions
        - Provides timeouts backed by a sufficiently brutal approach to killing processes*

    We therefore sacrifice a little bit of generality for convenience
    in our particular domain, which is just how we like it.

    Note that this deliberately doesn't do any event loop management, it simply provides
    the `.schedule` function which schedules the evaluation of a config, and a `.finish`
    function with obtains the result, with error checking, of that evaluation.

    ***

    WARNING: hash keys in the cache (called "evals") are solely based on
    the config to be evaluated. So if some `evals` with a different underlying evaluator
    get passed, things will break in undefined ways. Since you are not expected
    to ever touch this class without `Run` mediating, I don't think this is a problem.

    But be careful out there.

    ***

    WARNING: macOS has some issues with multiprocessing and fork safety. This
    should not be a problem with this implementation, but if the models evaluated do
    something fancy, this might be the problem. So if you encounter something like
    `RuntimeError: Unexpected error within the Pool`, please check whether it persists
    on Linux. (I've observed this in particular with using something that relies on
    sqlite3 and attempts to write things to disk concurrently.)

    ---
    * We interface with external code that doesn't always play by the rules, and in
      particular is quite fond of not reacting to SIGTERM. The `concurrent.futures`
      `ProcessPoolExecutor` doesn't seem to be able to enforce a timeout in such cases.

    """
    def __init__(
            self,
            max_workers,
            evaluator_config,
            evaluator_context={},
            evals=None,
            trial_timeout=None,
            caught_exceptions=(TimeoutError, ),
    ):

        self.trial_timeout = trial_timeout
        self.pool = ProcessPool(
            initializer=initializer,
            initargs=(evaluator_config, evaluator_context),
            max_workers=max_workers,
        )

        if platform.system() == "Darwin" and max_workers > 1:
            logger.warning(
                "Parallel support on macOS is a bit wonky. Proceed with caution."
            )

        if evals is None:
            evals = ResultDB()

        self.evals = evals
        self.caught_exceptions = caught_exceptions

    def schedule(self, suggestion):
        """Schedule evaluation of a suggestion.

        This also checks the cache in the background, and creates a faux
        future to return the cached result. This is slightly inefficient,
        but it substantially reduces the complexity of the interface: We
        can now always expect a future as a result, and the re-submission
        can be handled in a unified way by the `Run`. (You can't simply
        keep requesting suggestions until you hit something is not in the
        cache, this leads to deadlocks when the search space has been exhausted.)
        """
        eid = compute_hash(suggestion)

        if eid in self.evals:
            result = self.evals.get_result(eid)
            future = self.pool.schedule(passthrough, args=(result, ))
        else:
            future = self.pool.schedule(evaluate,
                                        args=(eid, suggestion),
                                        timeout=self.trial_timeout)

        future.eid = eid  # annotate with hash key in evals
        future.suggestion = suggestion

        return future

    def finish(self, future):
        """Obtain result of an evaluation future, catch errors, update caches.

        Should only be called with a finished future... but it's not a problem
        if it's not. The call to `future.result()` will trigger execution."""

        try:
            result = future.result()

            self.evals.submit_result(future.eid, result)
            return result
        except self.caught_exceptions as e:
            trace = traceback.format_exc()
            result = {
                "error": {
                    "error": e.__class__.__name__,
                    "error_text": str(e),
                    "traceback": trace,
                    "suggestion": future.suggestion,
                }
            }

            self.evals.submit_result(future.eid, result)
            return result
        except Exception as e:
            # uncaught exception, print suggestion and exit
            trace = traceback.format_exc()
            message = f"Unexpected error {e.__class__.__name__} evaluating a trial.\n"
            message += f"Error string: {e}.\n"
            message += f"Suggestion: {future.suggestion}.\n"
            message += f"Traceback:\n{trace}."

            logger.error(message)
            raise e

    def shutdown(self):
        self.pool.stop()  # no point in waiting for things
        try:
            self.pool.join(timeout=1.0)
            logger.info("Successfully and peacefully shut down pool.")
        except TimeoutError:
            logger.info(
                "Failed to peacefully shut down pool... but no worries.")
Example #26
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
Example #27
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling()
                         or user_wants_pytracemalloc()
                         or user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser, http_response, self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter, http_response, tags,
                      yield_text, self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
Example #28
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling()
                         or user_wants_pytracemalloc()
                         or user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,
                                                   self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser, http_response, self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args, ),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            parser_output = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)

        # We still need to perform some error handling here...
        if isinstance(parser_output, Error):
            if isinstance(parser_output.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            parser_output.reraise()

        # Success!
        return parser_output