Example #1
    def test_process_pool_stop_stopped(self):
        """Process Pool Spawn is stopped after stop."""
        pool = ProcessPool()
        pool.schedule(function, args=[1])
        pool.stop()
        pool.join()
        self.assertFalse(pool.active)
Example #2
    def test_process_pool_stop_stopped(self):
        """Process Pool Forkserver is stopped after stop."""
        pool = ProcessPool(max_workers=1)
        pool.schedule(function, args=[1])
        pool.stop()
        pool.join()
        self.assertFalse(pool.active)
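The test snippets in Examples #1 through #12 are shown without their surrounding module. They assume `ProcessPool` is imported from pebble and that module-level fixtures such as `function`, `long_function` and `long_initializer` exist; the sketch below shows plausible minimal definitions of those assumed helpers, for reference only.

import time
import unittest

from pebble import ProcessPool


def function(argument, keyword_argument=0):
    """Trivial task: returns almost immediately."""
    return argument + keyword_argument


def long_function(value=1):
    """Slow task, used to exercise join() timeouts."""
    time.sleep(value)
    return value


def long_initializer():
    """Worker initializer that deliberately takes a long time."""
    time.sleep(60)


class TestProcessPool(unittest.TestCase):
    """The test methods shown in these examples live on a class like this."""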
Example #3
    def test_process_pool_join_workers(self):
        """Process Pool Spawn no worker is running after join."""
        pool = ProcessPool(max_workers=4)
        pool.schedule(function, args=[1])
        pool.stop()
        pool.join()
        self.assertEqual(len(pool._pool_manager.worker_manager.workers), 0)
Example #4
    def test_process_pool_stop_stopped(self):
        """Process Pool Fork is stopped after stop."""
        pool = ProcessPool(max_workers=1)
        pool.schedule(function, args=[1])
        pool.stop()
        pool.join()
        self.assertFalse(pool.active)
Example #5
    def test_process_pool_join_workers(self):
        """Process Pool Fork no worker is running after join."""
        pool = ProcessPool(max_workers=4)
        pool.schedule(function, args=[1])
        pool.stop()
        pool.join()
        self.assertEqual(len(pool._pool_manager.worker_manager.workers), 0)
Example #6
    def test_process_pool_stop_large_data(self):
        """Process Pool Fork is stopped if large data is sent on the channel."""
        data = "a" * 1098 * 1024 * 50  # 50 Mb
        pool = ProcessPool(max_workers=1)
        pool.schedule(function, args=[data])
        pool.stop()
        pool.join()

        self.assertFalse(pool.active)
Example #8
    def test_process_pool_join_futures_timeout(self):
        """Process Pool Fork TimeoutError is raised if join on long futures."""
        pool = ProcessPool(max_workers=1)
        for _ in range(2):
            pool.schedule(long_function)
        pool.close()
        self.assertRaises(TimeoutError, pool.join, 0.4)
        pool.stop()
        pool.join()
Example #9
    def test_process_pool_stop_futures(self):
        """Process Pool Spawn not all futures are performed on stop."""
        futures = []
        pool = ProcessPool()
        for index in range(10):
            futures.append(pool.schedule(function, args=[index]))
        pool.stop()
        pool.join()
        self.assertTrue(len([f for f in futures if not f.done()]) > 0)
Example #10
    def test_process_pool_stop_futures(self):
        """Process Pool Fork not all futures are performed on stop."""
        futures = []
        pool = ProcessPool(max_workers=1)
        for index in range(10):
            futures.append(pool.schedule(function, args=[index]))
        pool.stop()
        pool.join()
        self.assertTrue(len([f for f in futures if not f.done()]) > 0)
Example #11
    def test_process_pool_join_futures_timeout(self):
        """Process Pool Spawn TimeoutError is raised if join on long tasks."""
        pool = ProcessPool()
        for _ in range(2):
            pool.schedule(long_function)
        pool.close()
        self.assertRaises(TimeoutError, pool.join, 0.4)
        pool.stop()
        pool.join()
Example #12
    def test_process_pool_stop_large_data(self):
        """Process Pool Spawn is stopped if large data is sent on the channel."""
        data = "a" * 4098 * 1024
        pool = ProcessPool(initializer=long_initializer)
        pool.schedule(function, args=[data])
        pool.stop()
        pool.join()

        self.assertFalse(pool.active)
Example #13
class CounterDaemon(object):

    def __init__(self, workers=1, poll_interval=None, max_tasks=100, task_timeout=0.1,
                 task_default_sleep=0.01, task_sleep_rand_range=(1, 20)):
        self.workers = workers
        self.poll_interval = poll_interval
        self.max_tasks = max_tasks
        self.task_timeout = task_timeout
        self.task_default_sleep = task_default_sleep
        self.task_sleep_rand_range = task_sleep_rand_range

        self.pool = ProcessPool(max_workers=self.workers, max_tasks=self.max_tasks)
        self.pool._start_pool()

    def done_callback(self, bucket_id, future):
        pid = os.getpid()
        try:
            result = future.result()
            logger.debug('Result: {}\tpid: {}\tbucket: {}'.format(result, pid, bucket_id))
        except futures.TimeoutError as e:
            logger.warning('TimeoutError\tpid: {}\tbucket: {}'.format(pid, bucket_id))
        except futures.CancelledError:
            return
        except Exception as e:
            logger.exception('TaskError\t pid: {}\tbucket: {}\tError: {}'.format(pid, bucket_id, e))

    def run_once(self):
        for bucket_id in random.sample(BUCKET_RANGE, self.workers):
            sleep = self.task_default_sleep * random.randint(*self.task_sleep_rand_range)
            future = self.pool.schedule(
                incr,
                args=(bucket_id,),
                kwargs={'sleep': sleep},
                timeout=self.task_timeout
            )
            future.add_done_callback(functools.partial(self.done_callback, bucket_id))

    def run_forever(self):
        while True:
            try:
                self.run_once()
            except Exception as e:
                logger.exception('RunOnceError: {}'.format(e))
            time.sleep(self.poll_interval or (3 * self.task_timeout))

    def start(self):
        try:
            self.run_forever()
        except Exception as e:
            logger.exception('Error during running daemon: {}'.format(e))
            self.pool.close()
            time.sleep(10)
            self.pool.stop()
        finally:
            self.pool.join()
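A possible entry point for the daemon above is sketched here; it assumes the same module defines the `incr` task, `BUCKET_RANGE` and `logger` that `CounterDaemon` references, and the worker count and poll interval are illustrative.

# Hypothetical launcher; `incr`, `BUCKET_RANGE` and `logger` must already be
# defined in this module for CounterDaemon to work.
if __name__ == '__main__':
    daemon = CounterDaemon(workers=4, poll_interval=0.5, max_tasks=200)
    daemon.start()  # blocks, scheduling a batch of `incr` tasks per interval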
Example #14
class PebbleMap(PySAT):
    name = 'PySAT Concurrency: PebbleMap'

    def __init__(self, **kwargs):
        self.pool = None
        super().__init__(**kwargs)

    def initialize(self, solver, **kwargs):
        if self.pool is not None:
            kwargs['output'].debug(2, 2, 'Pool already inited')
        else:
            self.pool = ProcessPool(max_workers=self.processes,
                                    initializer=self.init_func,
                                    initargs=(solver, kwargs['instance']))
            kwargs['output'].debug(
                2, 2, 'Init pool with %d processes' % self.processes)

    def process(self, tasks: List[Task], **kwargs) -> List[Result]:
        output = kwargs['output']
        results = []
        future = self.pool.map(self.solve_func, tasks)

        # timer = Timer(20., future.cancel, ())
        # timer.start()
        try:
            for result in future.result():
                results.append(result)
                output.debug(2, 3, 'Already solved %d tasks' % len(results))
        except Exception as e:
            output.debug(0, 1, 'Error while fetching pool results: %s' % e)

        # if timer.is_alive():
        #     timer.cancel()

        if not self.keep:
            self.terminate()
        return [
            result.set_value(self.measure.get(result)) for result in results
        ]

    def terminate(self):
        if self.pool:
            self.pool.stop()
            self.pool.join()
            self.pool = None
Example #15
class PebbleExecutor(concurrent.futures.Executor):
    def __init__(self, max_workers, timeout=None):
        self.pool = ProcessPool(max_workers=max_workers)
        self.timeout = timeout

    def submit(self, fn, *args, **kwargs):
        return self.pool.schedule(fn, args=args, timeout=self.timeout)

    def map(self, func, *iterables, timeout=None, chunksize=1):
        raise NotImplementedError("This wrapper does not support `map`.")

    def shutdown(self, wait=True):
        if wait:
            log.info("Closing workers...")
            self.pool.close()
        else:
            log.info("Ending workers...")
            self.pool.stop()
        self.pool.join()
        log.info("Workers joined.")
Example #16
    def propagate(self, tasks: List[Task], **kwargs) -> List[Result]:
        output, instance = kwargs['output'], kwargs['instance']

        pool = ProcessPool(
            max_workers=self.processes,
            initializer=propagate_init,
            initargs=(self.propagator, instance)
        )
        results = []
        future = pool.map(propagate_solve, tasks)
        try:
            for result in future.result():
                results.append(result)
                output.debug(2, 3, 'Already solved %d tasks' % len(results))
        except Exception as e:
            output.debug(0, 1, 'Error while fetching pool results: %s' % e)
        pool.stop()
        pool.join()

        return [result.set_value(self.measure.get(result)) for result in results]
Example #17
class StreamClient():
    '''
    Client managing all stream pipelines
    '''

    # Maximum number of cameras that can be configured
    MAX_PIPELINE_COUNT = 20

    def __init__(self):
        # Initialize concurrent engine
        self.pool = ProcessPool(
            max_workers=StreamClient.MAX_PIPELINE_COUNT + 1,
            max_tasks=StreamClient.MAX_PIPELINE_COUNT)
        self.futures = dict()
        self.set_env_variables()
        self.config = dict()
        self.pipelines = dict()

    def set_env_variables(self):
        '''
        Sets GStreamer environmental variables
        '''
        # Logging level
        if 'GST_DEBUG' not in os.environ:
            os.environ['GST_DEBUG'] = '2,splitmuxsink:4'

        # Plugin path
        plugin_path = "../gst_plugins"
        cwd_path = os.getcwd()
        file_path = os.path.abspath(os.path.dirname(__file__))
        if len(file_path) > len(cwd_path):
            diff_path = file_path.replace(cwd_path, '')
            diff_path = os.path.normpath(diff_path)
            plugin_path = os.path.join(diff_path, plugin_path)[1:]

        # Python bindings path
        plugin_path += ":/usr/lib/gstreamer-1.0/"

        logger.info("GST_PLUGIN_PATH: %s" % plugin_path)
        os.environ["GST_PLUGIN_PATH"] = plugin_path

        # Set LIB_GSTREAMER_PATH
        if 'LIB_GSTREAMER_PATH' not in os.environ:
            logger.info(
                "Gstreamer version: {}.{}.{}.{}".format(*Gst.version()))
            lib_gst_path = None
            if sys.platform == "darwin":
                lib_gst_path = "/usr/local/Cellar/gstreamer/{}.{}.{}/lib/libgstreamer-1.0.dylib".format(
                    *Gst.version())
            elif sys.platform == "win32":
                lib_gst_path = "C:\\msys64\\mingw64\\bin\\libgstreamer-1.0-0.dll"
            else:
                lib_gst_paths = [
                    "/usr/lib/aarch64-linux-gnu/libgstreamer-1.0.so.0",
                    "/usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0"
                ]
                for path in lib_gst_paths:
                    if os.path.exists(path):
                        lib_gst_path = path
                        break

            if lib_gst_path is None:
                logger.warning(
                    "libgstreamer-1.0 not found! Please export LIB_GSTREAMER_PATH env var manually."
                )
            else:
                logger.info("LIB_GSTREAMER_PATH: %s" % lib_gst_path)
                os.environ['LIB_GSTREAMER_PATH'] = lib_gst_path

    def update(self, config):
        '''
        Merge new config with existing one
        '''
        self.config.update(config)

    def get_config(self, config_or_filename):
        '''
        Load config from file and return it in dictionary format
        '''
        if config_or_filename is None or isinstance(config_or_filename, str):
            return StreamConfig.LoadConfigFromFile(config_or_filename)
        else:
            return config_or_filename

    def get_pipeline(self, name):
        '''
        Returns pipeline with a given name
        '''
        if name in self.pipelines:
            return self.pipelines[name]
        return None

    def add(self, config_or_filename=None):
        '''
        Build pipeline and add to the registry
        '''
        # Get config in the proper format
        config = self.get_config(config_or_filename)

        # Add default id if config is flat
        if config is not None and "pipeline" in config:
            key = "default_%d" % len(self.pipelines.keys())
            config = {key: config}

        for key in config:
            # Skip those sources that are disabled in configuration
            if "enabled" in config[key] and not config[key]["enabled"]:
                logger.info("Skipping %s (disabled)" % key)
                continue

            # Parse config
            config[key]["id"] = key
            pipeline_config = StreamConfig(config[key])
            logger.info(pformat(pipeline_config))

            # Create pipeline
            pipeline = PipelineFactory.createPipeline(pipeline_config)
            self.pipelines[key] = pipeline

        if len(self.pipelines.keys()) == 1:
            return tuple(self.pipelines.values())[0]

        return tuple(self.pipelines.values())

    def run(self, loop=None):
        '''
        Run pipelines that have been added with add()
        '''
        if len(self.pipelines.keys()) == 0:
            logger.error("No pipelines added")
            return

        # Main application loop
        if loop is None:
            loop = GLib.MainLoop()

        # Start each pipeline
        for k, v in self.pipelines.items():
            v.start(loop)

        # Run the main loop
        loop.run()

        # Cleanup when main loop has ended
        for k, v in self.pipelines.items():
            v.stop()

    def start(self, config_or_filename=None, wait_for_finish=True):
        '''
        Start single pipeline
        '''
        pipeline = self.add(config_or_filename)
        if wait_for_finish:
            self.run()
        else:
            thread = Thread(target=self.run)
            thread.start()
            return pipeline, thread

    def stop(self):
        self.pool.stop()

    def schedule(self,
                 config_or_filename=None,
                 wait_for_finish=False,
                 restart_on_exception=False):
        '''
        Start one or more pipelines asynchronously, in parallel
        '''
        try:
            # Get config in the proper format
            config = self.get_config(config_or_filename)

            # Update cached config
            self.update(config)

            # Check for misuse of schedule()
            for i in ['debug', 'enabled']:
                if i in config and isinstance(config[i], bool):
                    logger.error(
                        "Either use start() instead of schedule() or nest the configuration into a pipeline."
                    )
                    return config

            # Stop pipelines that are going to be reconfigured
            for key in self.futures:
                if key in config:
                    logger.info("Cancelling the following pipeline: %s" % key)
                    if not self.futures[key].cancel():
                        logger.error(
                            "Failed to cancel the following pipeline: %s" %
                            key)
                        continue

            # Spin off each camera pipeline in a separate thread/process
            for key in config:
                # Cap at MAX_PIPELINE_COUNT
                if len(self.futures.keys()) >= StreamClient.MAX_PIPELINE_COUNT:
                    logger.error(
                        "Maximum number of pipelines reached. Not configuring %s"
                        % key)
                    continue

                # Skip those sources that are disabled in configuration
                if "enabled" in config[key] and not config[key]["enabled"]:
                    logger.info("Skipping %s (disabled)" % key)
                    del self.config[key]
                    continue

                # Start new process/thread
                future = self.pool.schedule(stream_pipeline,
                                            args=[key, config[key]])
                if restart_on_exception:
                    future.add_done_callback(self.catch_and_restart)
                self.futures[key] = future

            # This is a blocking call, therefore use with caution (it will prevent parallel execution!)
            if wait_for_finish:
                for key in self.futures:
                    logger.info(self.futures[key].result())

        except Exception as e:
            logger.error("Failed to start pipeline(s): " + repr(e))
            self.config["error"] = repr(e)

        return self.config

    def catch_and_restart(self, future):
        '''
        Catches any exception from the future and restarts the process/thread
        '''
        try:
            result = future.result()
            logger.info(result)
        except TimeoutError as error:
            logger.error("Function took longer than %d seconds" %
                         error.args[1])
        except Exception as error:
            logger.error("Function raised %s" % error)

            if "CancelledError" in error.__class__.__name__:
                logger.error(
                    "Function has been already cancelled. Doing nothing...")
                return

            if hasattr(error, 'traceback'):
                logger.error(error.traceback)

            logger.info("Restarting all pipelines after 5 sec...")
            time.sleep(5)
            self.schedule(self.config,
                          wait_for_finish=False,
                          restart_on_exception=True)
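A hypothetical way to drive this client; the configuration file name is a placeholder, and `stream_pipeline`, `StreamConfig` and `PipelineFactory` are assumed to be provided by the surrounding package.

# Illustrative only; "cameras.json" is a placeholder configuration file.
client = StreamClient()
client.schedule("cameras.json", restart_on_exception=True)
# ... application keeps running; pipelines execute in pool workers ...
client.stop()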
Example #18
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        filename = write_http_response_to_temp_file(http_response)

        apply_args = (process_document_parser,
                      filename,
                      self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args,),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # Remove the temp file used to send data to the process
            remove_file_if_exists(filename)

            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            process_result = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)
        except ProcessExpired:
            # We reach here when the process died because of an error, we
            # handle this just like when the parser takes a lot of time and
            # we're unable to retrieve an answer from it
            msg = ('One of the parser processes died unexpectedly, this could'
                   ' be because of a bug, the operating system triggering OOM'
                   ' kills, etc. The scanner will continue with the next'
                   ' document, but the scan results might be inconsistent.')
            raise TimeoutError(msg)
        finally:
            # Remove the temp file used to send data to the process, we already
            # have the result, so this file is not needed anymore
            remove_file_if_exists(filename)

        # We still need to perform some error handling here...
        if isinstance(process_result, Error):
            if isinstance(process_result.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            process_result.reraise()

        try:
            parser_output = load_object_from_temp_file(process_result)
        except Exception as e:
            msg = 'Failed to deserialize sub-process result. Exception: "%s"'
            args = (e,)
            raise Exception(msg % args)
Example #19
class StreamClient():
    '''
    Client managing all stream pipelines
    '''

    # Maximum number of cameras that can be configured
    MAX_PIPELINE_COUNT = 20

    def __init__(self):
        # Initialize concurrent engine
        self.pool = ProcessPool(
            max_workers=StreamClient.MAX_PIPELINE_COUNT + 1,
            max_tasks=StreamClient.MAX_PIPELINE_COUNT)
        self.futures = dict()
        self.set_env_variables()
        self.config = dict()

    def set_env_variables(self):
        '''
        Sets GStreamer environmental variables
        '''
        # Logging level
        if 'GST_DEBUG' not in os.environ:
            os.environ['GST_DEBUG'] = '2,splitmuxsink:4'

        # Plugin path
        plugin_path = "../gst_plugins"
        cwd_path = os.getcwd()
        file_path = os.path.abspath(os.path.dirname(__file__))
        if len(file_path) > len(cwd_path):
            diff_path = file_path.replace(cwd_path, '')
            diff_path = os.path.normpath(diff_path)
            plugin_path = os.path.join(diff_path, plugin_path)[1:]

        # Python bindings path
        plugin_path += ":/usr/lib/gstreamer-1.0/"

        logger.info("GST_PLUGIN_PATH: %s" % plugin_path)
        os.environ["GST_PLUGIN_PATH"] = plugin_path

        # Set LIB_GSTREAMER_PATH
        if 'LIB_GSTREAMER_PATH' not in os.environ:
            logger.info(
                "Gstreamer version: {}.{}.{}.{}".format(*Gst.version()))
            lib_gst_path = None
            if sys.platform == "darwin":
                lib_gst_path = "/usr/local/Cellar/gstreamer/{}.{}.{}/lib/libgstreamer-1.0.dylib".format(
                    *Gst.version())
            else:
                lib_gst_paths = [
                    "/usr/lib/aarch64-linux-gnu/libgstreamer-1.0.so.0",
                    "/usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0"
                ]
                for path in lib_gst_paths:
                    if os.path.exists(path):
                        lib_gst_path = path
                        break

            if lib_gst_path is None:
                logger.warning(
                    "libgstreamer-1.0 not found! Please export LIB_GSTREAMER_PATH env var manually."
                )
            else:
                logger.info("LIB_GSTREAMER_PATH: %s" % lib_gst_path)
                os.environ['LIB_GSTREAMER_PATH'] = lib_gst_path

    def update(self, config):
        '''
        Merge new config with existing one
        '''
        self.config.update(config)

    def get_config(self, config_or_filename):
        '''
        Load config from file and return it in dictionary format
        '''
        if config_or_filename is None or isinstance(config_or_filename, str):
            return StreamConfig.LoadConfigFromFile(config_or_filename)
        else:
            return config_or_filename

    def start(self, config_or_filename=None, key=None):
        '''
        Start single pipeline synchronously
        '''
        # Get config in the proper format
        config = self.get_config(config_or_filename)

        # Flatten the config
        if config is not None and len(config.keys()) == 1:
            key = list(config.keys())[0]
            config = config[key]

        # Start the pipeline
        future = self.pool.schedule(stream_pipeline, args=[key, config])
        future.result()

        return config

    def stop(self):
        self.pool.stop()

    def schedule(self, config_or_filename=None, wait_for_finish=False):
        '''
        Start one or more pipelines asynchronously, in parallel
        '''
        try:
            # Get config in the proper format
            config = self.get_config(config_or_filename)

            # Update cached config
            self.update(config)

            # Check for misuse of schedule()
            for i in ['debug', 'enabled']:
                if i in config and isinstance(config[i], bool):
                    logger.error(
                        "Either use start() instead of schedule() or nest the configuration into a pipeline."
                    )
                    return config

            # Stop pipelines that are going to be reconfigured
            for key in self.futures:
                if key in config:
                    logger.info("Cancelling the following pipeline: %s" % key)
                    if not self.futures[key].cancel():
                        logger.error(
                            "Failed to cancel the following pipeline: %s" %
                            key)
                        continue

            # Spin off each camera pipeline in a separate thread/process
            for key in config:
                # Cap at MAX_PIPELINE_COUNT
                if len(self.futures.keys()) >= StreamClient.MAX_PIPELINE_COUNT:
                    logger.error(
                        "Maximum number of pipelines reached. Not configuring %s"
                        % key)
                    continue

                # Skip those sources that are disabled in configuration
                if "enabled" in config[key] and not config[key]["enabled"]:
                    logger.info("Skipping %s (disabled)" % key)
                    del self.config[key]
                    continue

                # Start new process/thread
                future = self.pool.schedule(stream_pipeline,
                                            args=[key, config[key]])
                self.futures[key] = future

            # This is a blocking call, therefore use with caution (it will prevent parallel execution!)
            if wait_for_finish:
                for key in self.futures:
                    logger.info(self.futures[key].result())

        except Exception as e:
            logger.error("Failed to start pipeline(s): " + repr(e))
            self.config["error"] = repr(e)

        return self.config
Example #20
class EvaluationPool:
    """Wrapper around ProcessPool.

    Essentially a ProcessPool that can only evaluate configs,
    with some tweaks and caching.

    Aims to do the following:
        - Instantiate evaluators once in the workers to reduce overhead
        - Use a cache to not have to run evaluations twice
        - Provide common format for results, with support for "ok" and "error" status
        - Catches specified, but not all, exceptions
        - Provides timeouts backed by a sufficiently brutal approach to killing processes*

    We therefore sacrifice a little bit of generality for convenience
    in our particular domain, which is just how we like it.

    Note that this deliberately doesn't do any event loop management, it simply provides
    the `.schedule` function which schedules the evaluation of a config, and a `.finish`
    function which obtains the result, with error checking, of that evaluation.

    ***

    WARNING: hash keys in the cache (called "evals") are solely based on
    the config to be evaluated. So if some `evals` with a different underlying evaluator
    get passed, things will break in undefined ways. Since you are not expected
    to ever touch this class without `Run` mediating, I don't think this is a problem.

    But be careful out there.

    ***

    WARNING: macOS has some issues with multiprocessing and fork safety. This
    should not be a problem with this implementation, but if the models evaluated do
    something fancy, this might be the problem. So if you encounter something like
    `RuntimeError: Unexpected error within the Pool`, please check whether it persists
    on Linux. (I've observed this in particular with using something that relies on
    sqlite3 and attempts to write things to disk concurrently.)

    ---
    * We interface with external code that doesn't always play by the rules, and in
      particular is quite fond of not reacting to SIGTERM. The `concurrent.futures`
      `ProcessPoolExecutor` doesn't seem to be able to enforce a timeout in such cases.

    """
    def __init__(
            self,
            max_workers,
            evaluator_config,
            evaluator_context={},
            evals=None,
            trial_timeout=None,
            caught_exceptions=(TimeoutError, ),
    ):

        self.trial_timeout = trial_timeout
        self.pool = ProcessPool(
            initializer=initializer,
            initargs=(evaluator_config, evaluator_context),
            max_workers=max_workers,
        )

        if platform.system() == "Darwin" and max_workers > 1:
            logger.warning(
                "Parallel support on macOS is a bit wonky. Proceed with caution."
            )

        if evals is None:
            evals = ResultDB()

        self.evals = evals
        self.caught_exceptions = caught_exceptions

    def schedule(self, suggestion):
        """Schedule evaluation of a suggestion.

        This also checks the cache in the background, and creates a faux
        future to return the cached result. This is slightly inefficient,
        but it substantially reduces the complexity of the interface: We
        can now always expect a future as a result, and the re-submission
        can be handled in a unified way by the `Run`. (You can't simply
        keep requesting suggestions until you hit something that is not in the
        cache; this leads to deadlocks when the search space has been exhausted.)
        """
        eid = compute_hash(suggestion)

        if eid in self.evals:
            result = self.evals.get_result(eid)
            future = self.pool.schedule(passthrough, args=(result, ))
        else:
            future = self.pool.schedule(evaluate,
                                        args=(eid, suggestion),
                                        timeout=self.trial_timeout)

        future.eid = eid  # annotate with hash key in evals
        future.suggestion = suggestion

        return future

    def finish(self, future):
        """Obtain result of an evaluation future, catch errors, update caches.

        Should only be called with a finished future... but it's not a problem
        if it's not. The call to `future.result()` will trigger execution."""

        try:
            result = future.result()

            self.evals.submit_result(future.eid, result)
            return result
        except self.caught_exceptions as e:
            trace = traceback.format_exc()
            result = {
                "error": {
                    "error": e.__class__.__name__,
                    "error_text": str(e),
                    "traceback": trace,
                    "suggestion": future.suggestion,
                }
            }

            self.evals.submit_result(future.eid, result)
            return result
        except Exception as e:
            # uncaught exception, print suggestion and exit
            trace = traceback.format_exc()
            message = f"Unexpected error {e.__class__.__name__} evaluating a trial.\n"
            message += f"Error string: {e}.\n"
            message += f"Suggestion: {future.suggestion}.\n"
            message += f"Traceback:\n{trace}."

            logger.error(message)
            raise e

    def shutdown(self):
        self.pool.stop()  # no point in waiting for things
        try:
            self.pool.join(timeout=1.0)
            logger.info("Successfully and peacefully shut down pool.")
        except TimeoutError:
            logger.info(
                "Failed to peacefully shut down pool... but no worries.")
Example #21
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling()
                         or user_wants_pytracemalloc()
                         or user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,
                                                   self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser, http_response, self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args, ),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            parser_output = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)

        # We still need to perform some error handling here...
        if isinstance(parser_output, Error):
            if isinstance(parser_output.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            parser_output.reraise()

        # Success!
        return parser_output
Example #22
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() / 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args,),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
Example #23
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an infinite
    loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() /
                                                2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling()
                         or user_wants_pytracemalloc()
                         or user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()
                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_document_parser, http_response, self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed
        self.start_workers()

        apply_args = (process_get_tags_by_filter, http_response, tags,
                      yield_text, self.DEBUG)

        # Push the task to the workers
        future = self._pool.schedule(apply_with_return_error,
                                     args=(apply_args, ),
                                     timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
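A sketch of how callers might use this parser; `http_response` stands in for an HTTPResponse object produced elsewhere by the framework, and the tag filter is illustrative.

# Hypothetical call site; `http_response` is assumed to exist already.
parser = MultiProcessingDocumentParser()
try:
    document_parser = parser.get_document_parser_for(http_response)
    anchors = parser.get_tags_by_filter(http_response, ('a',), yield_text=True)
finally:
    parser.stop_workers()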