def test_process_pool_join_workers(self):
    """Process Pool Spawn no worker is running after join.

    Schedules one task on a four-worker pool, stops and joins the pool,
    then checks that the worker manager holds no workers any more.
    """
    pool = ProcessPool(max_workers=4)
    pool.schedule(function, args=[1])

    pool.stop()
    pool.join()

    # Reaches into pool internals on purpose: there is no public accessor
    # used here for the set of live workers.
    remaining_workers = pool._pool_manager.worker_manager.workers
    self.assertEqual(len(remaining_workers), 0)
def test_process_pool_stop_stopped(self):
    """Process Pool Spawn is stopped after stop.

    A default-sized pool receives one task, is stopped and joined, and must
    then report itself as inactive.
    """
    pool = ProcessPool()
    pool.schedule(function, args=[1])

    pool.stop()
    pool.join()

    still_active = pool.active
    self.assertFalse(still_active)
def test_process_pool_close_stopped(self):
    """Process Pool Fork is stopped after close.

    A single-worker pool receives one task, is closed and joined, and must
    then report itself as inactive.
    """
    pool = ProcessPool(max_workers=1)
    pool.schedule(function, args=[1])

    pool.close()
    pool.join()

    still_active = pool.active
    self.assertFalse(still_active)
def test_process_pool_stop_large_data(self):
    """Process Pool Spawn is stopped if large data is sent on the channel.

    Schedules a task whose argument is a multi-megabyte string on a pool
    built with ``long_initializer``, then stops and joins the pool and
    checks that it is no longer active.
    """
    payload = "a" * (4098 * 1024)
    pool = ProcessPool(initializer=long_initializer)
    pool.schedule(function, args=[payload])

    pool.stop()
    pool.join()

    self.assertFalse(pool.active)
def test_process_pool_stop_large_data(self):
    """Process Pool Fork is stopped if large data is sent on the channel.

    Schedules a task whose argument is a string of roughly 50 MB on a
    single-worker pool, then stops and joins the pool and checks that it is
    no longer active.
    """
    payload = "a" * (1098 * 1024 * 50)  # roughly 50 MB of characters
    pool = ProcessPool(max_workers=1)
    pool.schedule(function, args=[payload])

    pool.stop()
    pool.join()

    self.assertFalse(pool.active)
def test_process_pool_join_futures_timeout(self):
    """Process Pool Spawn TimeoutError is raised if join on long tasks.

    Two long-running tasks are queued and the pool is closed; joining with
    a 0.4 second timeout must raise ``TimeoutError``.
    """
    pool = ProcessPool()
    for _attempt in (0, 1):
        pool.schedule(long_function)
    pool.close()

    with self.assertRaises(TimeoutError):
        pool.join(0.4)

    # Tear the pool down so the long tasks do not outlive the test.
    pool.stop()
    pool.join()
def test_process_pool_join_futures_timeout(self):
    """Process Pool Fork TimeoutError is raised if join on long futures.

    Two long-running tasks are queued on a single-worker pool and the pool
    is closed; joining with a 0.4 second timeout must raise
    ``TimeoutError``.
    """
    pool = ProcessPool(max_workers=1)
    for _attempt in (0, 1):
        pool.schedule(long_function)
    pool.close()

    with self.assertRaises(TimeoutError):
        pool.join(0.4)

    # Tear the pool down so the long tasks do not outlive the test.
    pool.stop()
    pool.join()
def handle(self, *args, **options):
    """Reset one counter bucket and run both transaction scenarios on it.

    Reads ``trials`` and ``bucket`` from ``options``, zeroes the matching
    ``Counter`` row (creating it if needed), then schedules
    ``run_atomic_transactions`` (labelled ``'T1'``) and ``run_savepoints``
    (labelled ``'T2'``) on a two-worker pool and waits for the pool to
    drain.  The results of the two tasks are not inspected.
    """
    trials = options['trials']
    bucket_id = options['bucket']

    pool = ProcessPool(max_workers=2)
    # The pool is started explicitly, before the counter row is reset.
    pool._start_pool()

    counter_row, _created = Counter.objects.get_or_create(bucket=bucket_id)
    counter_row.count = 0
    counter_row.save()

    scenarios = (
        (run_atomic_transactions, 'T1'),
        (run_savepoints, 'T2'),
    )
    for scenario, label in scenarios:
        pool.schedule(scenario, args=(label, bucket_id, trials))

    pool.close()
    pool.join()
def test_process_pool_close_futures(self):
    """Process Pool Fork all futures are performed on close.

    Ten tasks are scheduled on a single-worker pool; after ``close`` and
    ``join`` every returned future must be done.
    """
    pool = ProcessPool(max_workers=1)
    futures = [pool.schedule(function, args=[index]) for index in range(10)]

    pool.close()
    pool.join()

    # An explicit loop is required: on Python 3 ``map`` is lazy, so
    # ``map(self.assertTrue, ...)`` would never execute a single assertion.
    for future in futures:
        self.assertTrue(future.done())
def test_process_pool_stop_futures(self):
    """Process Pool Spawn not all futures are performed on stop.

    Ten tasks are scheduled and the pool is stopped right away; at least
    one of the returned futures must still be not done after the join.
    """
    pool = ProcessPool()
    futures = [pool.schedule(function, args=[index]) for index in range(10)]

    pool.stop()
    pool.join()

    unfinished = [future for future in futures if not future.done()]
    self.assertTrue(len(unfinished) > 0)
def test_process_pool_close_futures(self):
    """Process Pool Spawn all futures are performed on close.

    Ten tasks are scheduled on a default-sized pool; after ``close`` and
    ``join`` every returned future must be done.
    """
    pool = ProcessPool()
    futures = [pool.schedule(function, args=[index]) for index in range(10)]

    pool.close()
    pool.join()

    # An explicit loop is required: on Python 3 ``map`` is lazy, so
    # ``map(self.assertTrue, ...)`` would never execute a single assertion.
    for future in futures:
        self.assertTrue(future.done())
def test_process_pool_stop_futures(self):
    """Process Pool Fork not all futures are performed on stop.

    Ten tasks are scheduled on a single-worker pool and the pool is stopped
    right away; at least one of the returned futures must still be not done
    after the join.
    """
    pool = ProcessPool(max_workers=1)
    futures = [pool.schedule(function, args=[index]) for index in range(10)]

    pool.stop()
    pool.join()

    unfinished = [future for future in futures if not future.done()]
    self.assertTrue(len(unfinished) > 0)
def hello_world(query, _id):
    """Start a tweet-fetching task for ``query`` unless one is registered.

    Looks through the module-level ``processes`` list for an entry with the
    same ``id`` and ``query``; if one exists a failure dict is returned.
    Otherwise ``twitter_worker.getTweets`` is scheduled on a fresh pool, the
    resulting future is recorded in ``processes`` and an ack is returned.

    NOTE(review): a new ProcessPool is created on every call and never
    closed here - confirm that this is intended.
    """
    already_registered = any(
        entry["id"] == _id and entry["query"] == query
        for entry in processes
    )
    if already_registered:
        return {'failed': 'query already exists'}

    executor = ProcessPool()
    task = executor.schedule(twitter_worker.getTweets, args=(query, _id))
    processes.append({'process': task, 'query': query, 'id': _id})
    print(processes)
    return {'success': 'ack'}
class CounterDaemon(object):
    """Daemon that keeps scheduling ``incr`` tasks on a process pool.

    Each polling round picks ``workers`` random buckets out of
    ``BUCKET_RANGE`` and schedules one ``incr`` task per bucket, each with a
    randomised sleep and a per-task timeout.
    """

    def __init__(self, workers=1, poll_interval=None, max_tasks=100,
                 task_timeout=0.1, task_default_sleep=0.01,
                 task_sleep_rand_range=(1, 20)):
        """Store the settings and start the worker pool.

        :param workers: pool size, and number of buckets sampled per round.
        :param poll_interval: seconds between rounds; when falsy,
            ``3 * task_timeout`` is used instead.
        :param max_tasks: forwarded to ``ProcessPool(max_tasks=...)``.
        :param task_timeout: timeout passed to every scheduled task.
        :param task_default_sleep: base value of the ``sleep`` keyword
            argument handed to ``incr``.
        :param task_sleep_rand_range: ``(low, high)`` bounds of the random
            integer factor applied to ``task_default_sleep``.
        """
        self.workers = workers
        self.poll_interval = poll_interval
        self.max_tasks = max_tasks
        self.task_timeout = task_timeout
        self.task_default_sleep = task_default_sleep
        self.task_sleep_rand_range = task_sleep_rand_range
        self.pool = ProcessPool(max_workers=self.workers,
                                max_tasks=self.max_tasks)
        self.pool._start_pool()

    def done_callback(self, bucket_id, future):
        """Log the outcome of one finished task.

        :param bucket_id: bucket the task was scheduled for.
        :param future: the completed future.
        """
        pid = os.getpid()
        try:
            result = future.result()
            logger.debug('Result: {}\tpid: {}\tbucket: {}'.format(result, pid, bucket_id))
        except futures.TimeoutError:
            logger.warning('TimeoutError\tpid: {}\tbucket: {}'.format(pid, bucket_id))
        except futures.CancelledError:
            # Cancelled tasks are ignored on purpose.
            return
        except Exception as e:
            logger.exception('TaskError\t pid: {}\tbucket: {}\tError: {}'.format(pid, bucket_id, e))

    def run_once(self):
        """Schedule one ``incr`` task for each of ``workers`` random buckets."""
        for bucket_id in random.sample(BUCKET_RANGE, self.workers):
            sleep = self.task_default_sleep * random.randint(*self.task_sleep_rand_range)
            future = self.pool.schedule(
                incr,
                args=(bucket_id,),
                kwargs={'sleep': sleep},
                timeout=self.task_timeout
            )
            future.add_done_callback(functools.partial(self.done_callback, bucket_id))

    def run_forever(self):
        """Call :meth:`run_once` in an endless loop, pausing between rounds.

        Errors raised by a round are logged and do not end the loop.
        """
        while True:
            try:
                self.run_once()
            except Exception as e:
                logger.exception('RunOnceError: {}'.format(e))
            time.sleep(self.poll_interval or (3 * self.task_timeout))

    def start(self):
        """Run the daemon until it fails or is interrupted, then shut down.

        On an ``Exception`` the pool is closed and given ten seconds to
        drain.  In every case, including ``KeyboardInterrupt`` and
        ``SystemExit``, the pool is stopped before it is joined, so the
        join in ``finally`` never runs against a pool that is still
        running.
        """
        try:
            self.run_forever()
        except Exception as e:
            logger.exception('Error during running daemon: {}'.format(e))
            self.pool.close()
            # Grace period for tasks that are already queued.
            time.sleep(10)
        finally:
            self.pool.stop()
            self.pool.join()
def parallel_checks() -> None:
    """
    Run every read-only check, in parallel where possible.

    On GitLab (``IS_GITLAB``) the lines-of-code count is produced first and
    the checks then run one after another, printing each result.  Elsewhere
    each check is scheduled on a process pool; once the pool has drained,
    every result is printed and the process exits with ``-1`` if a result
    contains ``"Abnormal"`` or a task raised.
    """
    chores = [
        do_mypy,
        do_detect_secrets,
        do_git_secrets,
        vulture,
        do_compile_py,
        do_lint,
        do_flake8,
        do_dodgy,
        do_bandit,
        do_python_taint,
        do_mccabe,
        do_check_manifest,
        do_liccheck,
    ]

    if IS_GITLAB:
        # other tasks assume there will be a LOC file by now.
        do_count_lines_of_code()
        for chore in chores:
            print(chore())
        return

    # can't do pyroma because that needs a package, which might not exist yet.
    pool = ProcessPool(12)
    scheduled = [pool.schedule(chore, args=()) for chore in chores]

    print("close & join")
    pool.close()
    pool.join()

    for task in scheduled:
        # pylint: disable=broad-except
        try:
            outcome = task.result()
            failure = task.exception()
            if failure:
                print(task.exception())
            print(outcome)
            if "Abnormal" in str(outcome):
                print("One or more parallel tasks failed.")
                sys.exit(-1)
        except Exception as ex:
            print(ex)
            sys.exit(-1)
def dump_packs(artifact_manager: ArtifactsManager, pool: ProcessPool) -> List[ProcessFuture]:
    """ Schedule one ``dump_pack`` task per selected content pack.

    When ``'all'`` is among ``artifact_manager.pack_names`` every pack in
    ``artifact_manager.content.packs`` is selected; otherwise only the named
    packs that exist there.  Packs listed in ``IGNORED_PACKS`` are always
    left out.

    Args:
        artifact_manager: Artifacts manager object.
        pool: Process pool to schedule new processes.

    Returns:
        List[ProcessFuture]: List of pebble futures to wait for.
    """
    if 'all' in artifact_manager.pack_names:
        return [
            pool.schedule(dump_pack, args=(artifact_manager, pack))
            for name, pack in artifact_manager.content.packs.items()
            if name not in IGNORED_PACKS
        ]

    return [
        pool.schedule(dump_pack, args=(artifact_manager, artifact_manager.content.packs[name]))
        for name in artifact_manager.pack_names
        if name not in IGNORED_PACKS and name in artifact_manager.content.packs
    ]
class Processor:
    """Feeds recordings of one type and processing state to a worker pool.

    ``conf``, ``api`` and ``log_q`` are class-level slots that start as
    ``None``; ``log_q`` is handed to the pool initializer and ``api`` /
    ``conf`` are read while polling.
    """

    conf = None
    api = None
    log_q = None

    def __init__(self, recording_type, processing_state, process_func, num_workers):
        """Remember the job selector and create the worker pool.

        :param recording_type: recording type passed to ``api.next_job``.
        :param processing_state: processing state passed to ``api.next_job``.
        :param process_func: callable scheduled for every recording.
        :param num_workers: pool size and cap on concurrent jobs.
        """
        self.recording_type = recording_type
        self.processing_state = processing_state
        self.process_func = process_func
        self.num_workers = num_workers
        self.pool = ProcessPool(
            num_workers,
            initializer=logs.init_worker,
            initargs=(self.log_q, ),
        )
        # Maps recording id -> future of the job processing it.
        self.in_progress = {}

    def poll(self):
        """Reap finished jobs, then try to schedule one more recording.

        :return: ``True`` when the processor is at capacity or a new job was
            scheduled, ``False`` when the API had nothing to hand out.
        """
        self.reap_completed()

        if len(self.in_progress) >= self.num_workers:
            return True

        recording = self.api.next_job(self.recording_type, self.processing_state)
        if not recording:
            return False

        logger.debug(
            "scheduling %s (%s: %s)",
            recording["id"],
            recording["type"],
            self.processing_state,
        )
        self.in_progress[recording["id"]] = self.pool.schedule(
            self.process_func, (recording, self.conf)
        )
        return True

    def reap_completed(self):
        """Forget every finished job and log those that ended with an error."""
        # Iterate over a snapshot because entries are removed along the way.
        for recording_id, future in list(self.in_progress.items()):
            if not future.done():
                continue

            del self.in_progress[recording_id]
            err = future.exception()
            if not err:
                continue

            msg = f"{self.recording_type}.{self.processing_state} processing of {recording_id} failed: {err}"
            tb = getattr(err, "traceback", None)
            if tb:
                msg += f":\n{tb}"
            logger.error(msg)
def start_ranking_stream():
    """Schedule ``rank_work`` once per ticker and return the futures.

    Any exception raised while scheduling is logged and re-raised.
    """
    print('Starting stream...')
    tickers = [
        "$GILD", "$UNP", "$UTX", "$HPQ", "$V",
        "$CSCO", "$SLB", "$AMGN", "$BA", "$COP",
        "$CMCSA", "$BMY", "$VZ", "$T", "$UNH",
    ]
    executor = ProcessPool()
    futures = []
    try:
        for symbol in tickers:
            pending = executor.schedule(rank_work, args=[symbol])
            futures.append(pending)
        print(futures)
        return futures
    except Exception as e:
        logger.exception(e)
        raise e
class PebbleExecutor(concurrent.futures.Executor):
    """``concurrent.futures.Executor`` facade over a pebble ``ProcessPool``.

    Every submitted call is scheduled with the same optional timeout.
    """

    def __init__(self, max_workers, timeout=None):
        """Create the underlying pool.

        :param max_workers: forwarded to ``ProcessPool(max_workers=...)``.
        :param timeout: timeout applied to every task scheduled through
            :meth:`submit`; ``None`` schedules tasks without one.
        """
        self.pool = ProcessPool(max_workers=max_workers)
        self.timeout = timeout

    def submit(self, fn, *args, **kwargs):
        """Schedule ``fn(*args, **kwargs)`` on the pool.

        Keyword arguments are forwarded to the task; they used to be
        accepted and then silently dropped.

        :return: the future returned by ``pool.schedule``.
        """
        return self.pool.schedule(fn, args=args, kwargs=kwargs,
                                  timeout=self.timeout)

    def map(self, func, *iterables, timeout=None, chunksize=1):
        """Not supported by this wrapper; always raises NotImplementedError."""
        raise NotImplementedError("This wrapper does not support `map`.")

    def shutdown(self, wait=True):
        """Shut the pool down and wait for it to be joined.

        :param wait: when true the pool is closed, otherwise it is stopped.
            Either way the call then joins the pool.
        """
        if wait:
            log.info("Closing workers...")
            self.pool.close()
        else:
            log.info("Ending workers...")
            self.pool.stop()
        self.pool.join()
        log.info("Workers joined.")
plot_update_time = time() continue # This is the first call to the server if not responses: latency = 0 last_staging_request_time = time() t_stage = t_now # You can use limited channels to do the scoring as well # for channels which are missing use blank_data which is simply # a vector of 0s of 60 second duration responses.append( executor.schedule( process_and_stage, (running_window[0, :], running_window[1, :], running_window[2, :], running_window[3, :], running_window[4, :], np.ones(5) * sampling_rate, token), timeout=SCORING_FREQUENCY)) if responses[-1].done() and not last_call_success: stage = responses[-1].result() #sleep_stages.append(np.hstack((t_stage, stage))) sleep_stages.append([t_stage, stage_keys[stage[0]], stage[1]]) print("Time: %0.2f Stage: %s Confidence: %0.2f" % (t_stage, stage_keys[stage[0]], stage[1])) latency = (time() - last_staging_request_time) * 1000 last_call_success = True if time() - last_staging_request_time > SCORING_FREQUENCY: # last request did not complete in time! if not responses[-1].done():
class Wintermute(discord.Client):
    """Discord client that answers channel messages via a process pool.

    Incoming messages are parsed with ``BotGram``; when the parser yields a
    response object it is rendered with ``str`` in a worker process (with a
    timeout) and the text is sent back to the channel.
    """

    __channels = None
    __bot_prelude = None
    __timeout = None
    __parser = None
    __pool = None

    def __init__(self, channels=None, bot_prelude='[bot] ', timeout=10,
                 multiprocessing=1, loglevel=logging.INFO):
        """Configure logging, the parser and the worker pool.

        :param channels: mapping of server name to the channel name the bot
            listens on; ``None`` means no server is configured.
        :param bot_prelude: prefix for the bot's own error replies, also
            handed to ``BotGram``.
        :param timeout: per-task timeout used when scheduling a response.
        :param multiprocessing: number of pool workers.
        :param loglevel: level passed to ``logging.basicConfig``.
        """
        super().__init__()
        # A fresh dict per instance instead of a shared mutable default.
        self.__channels = {} if channels is None else channels
        self.__bot_prelude = bot_prelude
        self.__timeout = timeout
        logging.basicConfig(level=loglevel)
        self.__parser = BotGram(prelude=bot_prelude)
        self.__pool = ProcessPool(max_workers=multiprocessing,
                                  initializer=seed)

    def __del__(self):
        """Close and join the pool, if one was ever created."""
        # If __init__ failed early, the class-level default None is still
        # in place and there is nothing to shut down.
        if self.__pool is not None:
            self.__pool.close()
            self.__pool.join()

    async def on_ready(self):
        """Log the login and set the presence shown for the bot."""
        logging.info("Online as " + str(self.user.name))
        game = discord.Game()
        game.name = "Manipulating humanity"
        await self.change_presence(game=game)
        logging.info('Setup done')

    async def on_message(self, mess):
        """Log every message and answer those on a configured channel.

        Private messages and the bot's own messages are logged and then
        ignored.
        """
        if mess.channel.is_private:
            logging.info("new private message from " + str(mess.author))
        else:
            logging.info("new message on server " + str(mess.server)
                         + " and channel " + str(mess.channel)
                         + " from " + str(mess.author))
        logging.info("len: " + str(len(mess.content)))
        logging.info("content: " + str(mess.content))

        if mess.channel.is_private:
            return
        if mess.author == self.user:
            return

        # .get() keeps a message from an unconfigured server from raising
        # KeyError: it simply never matches a channel name.
        if (mess.channel.permissions_for(mess.channel.server.me).send_messages
                and mess.channel.name == self.__channels.get(mess.server.name)):
            resp = self.__parser.parse(mess)
            if resp is not None:
                task = self.__pool.schedule(str, args=(resp, ),
                                            timeout=self.__timeout)
                asyncio.ensure_future(self.collect_response(
                    mess.channel, mess.author.mention, task), loop=self.loop)

    async def collect_response(self, channel, mention, task):
        """Wait for ``task`` without blocking the loop, then send its result.

        :param channel: channel that receives the reply.
        :param mention: mention string of the requesting user, used in
            error replies.
        :param task: future returned by the pool.
        """
        try:
            # Poll instead of blocking on result() so the event loop stays
            # responsive.
            while not task.done():
                await asyncio.sleep(0.1)
            result = task.result()
            await self.send_message(channel, result)
        except TimeoutError:
            await self.send_message(
                channel,
                self.__bot_prelude + mention + ' Your request timed out')
            task.cancel()
        except HTTPException as e:
            if e.response.status == 400:
                await self.send_message(
                    channel,
                    self.__bot_prelude + mention
                    + ' Error: Request reply was probably too long')
class StreamClient():
    ''' Client managing all stream pipelines '''

    # Maximum number of cameras that can be configured
    MAX_PIPELINE_COUNT = 20

    def __init__(self):
        ''' Create the process pool and the empty registries '''
        # Initialize concurrent engine
        self.pool = ProcessPool(max_workers=StreamClient.MAX_PIPELINE_COUNT + 1,
                                max_tasks=StreamClient.MAX_PIPELINE_COUNT)
        # Maps pipeline key -> future of the process running it
        self.futures = dict()
        self.set_env_variables()
        # Cached configuration, merged from every schedule() call
        self.config = dict()
        # Maps pipeline key -> pipeline object built by add()
        self.pipelines = dict()

    def set_env_variables(self):
        ''' Sets GStreamer environmental variables '''
        # Logging level
        if 'GST_DEBUG' not in os.environ:
            os.environ['GST_DEBUG'] = '2,splitmuxsink:4'

        # Plugin path
        plugin_path = "../gst_plugins"
        cwd_path = os.getcwd()
        file_path = os.path.abspath(os.path.dirname(__file__))
        if len(file_path) > len(cwd_path):
            # Strip the working directory from this file's directory, then
            # drop the first character of the joined result.
            diff_path = file_path.replace(cwd_path, '')
            diff_path = os.path.normpath(diff_path)
            plugin_path = os.path.join(diff_path, plugin_path)[1:]
        # Python bindings path
        plugin_path += ":/usr/lib/gstreamer-1.0/"
        logger.info("GST_PLUGIN_PATH: %s" % plugin_path)
        os.environ["GST_PLUGIN_PATH"] = plugin_path

        # Set LIB_GSTREAMER_PATH
        if 'LIB_GSTREAMER_PATH' not in os.environ:
            logger.info(
                "Gstreamer version: {}.{}.{}.{}".format(*Gst.version()))
            lib_gst_path = None
            if sys.platform == "darwin":
                lib_gst_path = "/usr/local/Cellar/gstreamer/{}.{}.{}/lib/libgstreamer-1.0.dylib".format(
                    *Gst.version())
            elif sys.platform == "win32":
                lib_gst_path = "C:\\msys64\\mingw64\\bin\\libgstreamer-1.0-0.dll"
            else:
                lib_gst_paths = [
                    "/usr/lib/aarch64-linux-gnu/libgstreamer-1.0.so.0",
                    "/usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0"
                ]
                for path in lib_gst_paths:
                    if os.path.exists(path):
                        lib_gst_path = path
                        break
            if lib_gst_path is None:
                logger.warning(
                    "libgstreamer-1.0 not found! Please export LIB_GSTREAMER_PATH env var manually."
                )
            else:
                logger.info("LIB_GSTREAMER_PATH: %s" % lib_gst_path)
                os.environ['LIB_GSTREAMER_PATH'] = lib_gst_path

    def update(self, config):
        ''' Merge new config with existing one '''
        self.config.update(config)

    def get_config(self, config_or_filename):
        ''' Load config from file and return it in dictionary format.

        ``None`` and strings are passed to StreamConfig.LoadConfigFromFile;
        anything else is returned unchanged.
        '''
        if config_or_filename is None or isinstance(config_or_filename, str):
            return StreamConfig.LoadConfigFromFile(config_or_filename)
        return config_or_filename

    def get_pipeline(self, name):
        ''' Returns pipeline with a given name, or None if it is unknown '''
        return self.pipelines.get(name)

    def add(self, config_or_filename=None):
        ''' Build pipeline(s) and add them to the registry.

        Returns the single registered pipeline when exactly one exists,
        otherwise a tuple of all registered pipelines.
        '''
        # Get config in the proper format
        config = self.get_config(config_or_filename)

        # Add default id if config is flat
        if config is not None and "pipeline" in config:
            key = "default_%d" % len(self.pipelines)
            config = {key: config}

        for key in config:
            # Skip those sources that are disabled in configuration
            if "enabled" in config[key] and not config[key]["enabled"]:
                logger.info("Skipping %s (disabled)" % key)
                continue

            # Parse config
            config[key]["id"] = key
            pipeline_config = StreamConfig(config[key])
            logger.info(pformat(pipeline_config))

            # Create pipeline
            pipeline = PipelineFactory.createPipeline(pipeline_config)
            self.pipelines[key] = pipeline

        if len(self.pipelines) == 1:
            return tuple(self.pipelines.values())[0]
        return tuple(self.pipelines.values())

    def run(self, loop=None):
        ''' Run pipelines that have been added with add().

        Blocks in ``loop.run()``; a GLib.MainLoop is created when no loop is
        given.
        '''
        if not self.pipelines:
            logger.error("No pipelines added")
            return

        # Main application loop
        if loop is None:
            loop = GLib.MainLoop()

        # Start each pipeline
        for pipeline in self.pipelines.values():
            pipeline.start(loop)

        # Run the main loop
        loop.run()

        # Cleanup when main loop has ended
        for pipeline in self.pipelines.values():
            pipeline.stop()

    def start(self, config_or_filename=None, wait_for_finish=True):
        ''' Start single pipeline.

        Returns ``(pipeline, thread)``.  ``thread`` is the background Thread
        executing run() when ``wait_for_finish`` is false, and None when
        run() was executed in the calling thread.
        '''
        pipeline = self.add(config_or_filename)
        # Bound on both paths so the return below is always well defined.
        thread = None
        if wait_for_finish:
            self.run()
        else:
            thread = Thread(target=self.run)
            thread.start()
        return pipeline, thread

    def stop(self):
        ''' Stop the process pool '''
        self.pool.stop()

    def schedule(self,
                 config_or_filename=None,
                 wait_for_finish=False,
                 restart_on_exception=False):
        ''' Start one or more pipelines asynchronously, in parallel.

        Returns the cached configuration; when scheduling failed it carries
        an "error" entry with the repr of the exception.
        '''
        try:
            # Get config in the proper format
            config = self.get_config(config_or_filename)

            # Update cached config
            self.update(config)

            # Check for misuse of schedule()
            for i in ['debug', 'enabled']:
                if i in config and isinstance(config[i], bool):
                    logger.error(
                        "Either use start() instead of schedule() or nest the configuration into a pipeline."
                    )
                    return config

            # Stop pipelines that are going to be reconfigured
            for key in self.futures:
                if key in config:
                    logger.info("Cancelling the following pipeline: %s" % key)
                    if not self.futures[key].cancel():
                        logger.error(
                            "Failed to cancel the following pipeline: %s" %
                            key)

            # Spin off each camera pipeline in a separate thread/process
            for key in config:
                # Cap at MAX_PIPELINE_COUNT
                if len(self.futures) >= StreamClient.MAX_PIPELINE_COUNT:
                    logger.error(
                        "Maximum number of pipelines reached. Not configuring %s"
                        % key)
                    continue

                # Skip those sources that are disabled in configuration
                if "enabled" in config[key] and not config[key]["enabled"]:
                    logger.info("Skipping %s (disabled)" % key)
                    del self.config[key]
                    continue

                # Start new process/thread
                future = self.pool.schedule(stream_pipeline,
                                            args=[key, config[key]])
                if restart_on_exception:
                    future.add_done_callback(self.catch_and_restart)
                self.futures[key] = future

            # This is a blocking call, therefore use with caution (it will prevent parallel execution!)
            if wait_for_finish:
                for key in self.futures:
                    logger.info(self.futures[key].result())

        except Exception as e:
            logger.error("Failed to start pipeline(s): " + repr(e))
            self.config["error"] = repr(e)

        return self.config

    def catch_and_restart(self, future):
        ''' Catches any exception from the future and restarts the process/thread.

        A cancelled future ends the callback without restarting.
        '''
        try:
            result = future.result()
            logger.info(result)
        except TimeoutError as error:
            logger.error("Function took longer than %d seconds" %
                         error.args[1])
        except Exception as error:
            logger.error("Function raised %s" % error)
            if "CancelledError" in error.__class__.__name__:
                logger.error(
                    "Function has been already cancelled. Doing nothing...")
                return
            if hasattr(error, 'traceback'):
                logger.error(error.traceback)

        logger.info("Restarting all pipelines after 5 sec...")
        time.sleep(5)
        self.schedule(self.config,
                      wait_for_finish=False,
                      restart_on_exception=True)
def byDays(startDate, endDate, query, _id):
    """Schedule ``twitter_worker.getByDay`` on a fresh pool and acknowledge.

    The future returned by the pool is not kept; the function only reports
    that the job was handed over.
    """
    worker_args = [startDate, endDate, query, _id]
    executor = ProcessPool()
    executor.schedule(twitter_worker.getByDay, args=worker_args)
    return {'sucess': 'days'}
class Processor:
    """Feeds recordings of one type, in several states, to a worker pool.

    ``conf``, ``api`` and ``log_q`` are class-level slots that start as
    ``None``; ``log_q`` is handed to the pool initializer and ``api`` /
    ``conf`` are read while polling.
    """

    conf = None
    api = None
    log_q = None

    def __init__(self, recording_type, processing_states, process_func, num_workers):
        """Remember the job selector and create the worker pool.

        :param recording_type: recording type passed to ``api.next_job``.
        :param processing_states: iterable of states polled in order.
        :param process_func: callable scheduled for every recording.
        :param num_workers: pool size and cap on concurrent jobs.
        """
        self.recording_type = recording_type
        self.processing_states = processing_states
        self.process_func = process_func
        self.num_workers = num_workers
        self.pool = ProcessPool(
            num_workers, initializer=logs.init_worker, initargs=(self.log_q,)
        )
        # Maps recording id -> (job key, future).
        self.in_progress = {}

    def full(self):
        """Return True when as many jobs are running as there are workers."""
        return len(self.in_progress) >= self.num_workers

    def has_no_work(self):
        """Return True when no job is in progress."""
        return len(self.in_progress) == 0

    def has_work(self):
        """Return True when at least one job is in progress."""
        return len(self.in_progress) > 0

    def poll(self):
        """Reap finished jobs, then ask the API for one job per state.

        :return: ``True`` when the processor is full or at least one new job
            was scheduled during this call, ``False`` otherwise.
        """
        self.reap_completed()
        if self.full():
            return True

        working = False
        for state in self.processing_states:
            response = self.api.next_job(self.recording_type, state)
            if not response:
                continue

            recording = response["recording"]
            rawJWT = response["rawJWT"]
            if recording.get("id", 0) in self.in_progress:
                logger.debug(
                    "Recording %s (%s: %s) is already scheduled",
                    recording["id"],
                    recording["type"],
                    state,
                )
                continue

            logger.debug(
                "scheduling %s (%s: %s)",
                recording["id"],
                recording["type"],
                state,
            )
            future = self.pool.schedule(
                self.process_func, (recording, rawJWT, self.conf)
            )
            self.in_progress[recording["id"]] = (recording["jobKey"], future)
            working = True
        return working

    def reap_completed(self):
        """Forget finished jobs; log and report those that failed.

        Reporting a failure to the API is best effort: an ``Exception``
        raised by ``report_failed`` is logged and the job is still removed.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not
        swallowed.
        """
        # Iterate over a snapshot because entries are removed along the way.
        for recording_id, job in list(self.in_progress.items()):
            job_key, future = job[0], job[1]
            if not future.done():
                continue

            err = future.exception()
            if err:
                msg = f"{self.recording_type}.{self.processing_states} processing of {recording_id} failed: {err}"
                tb = getattr(err, "traceback", None)
                if tb:
                    msg += f":\n{tb}"
                logger.error(msg)
                try:
                    self.api.report_failed(recording_id, job_key)
                except Exception:
                    logger.error(
                        "Could not set %s to failed state",
                        recording_id,
                        exc_info=True,
                    )
            del self.in_progress[recording_id]
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer on Python 3 as well
    MAX_WORKERS = 2 if is_running_on_ci() else (multiprocessing.cpu_count() // 2) or 1

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        self._pool = None
        # Serializes the lazy creation of the pool in start_workers()
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers
        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, self.MEMORY_LIMIT))

        return self._pool

    def stop_workers(self):
        """
        Stop the pool workers
        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raises ScanMustStopException: When the task can not be scheduled
        :raises TimeoutError: When the parser times out or its process dies
        :raises MemoryError: When the parser exceeded the memory limit
        """
        # Start the worker processes if needed
        self.start_workers()

        filename = write_http_response_to_temp_file(http_response)

        apply_args = (process_document_parser,
                      filename,
                      self.DEBUG)

        # Push the task to the workers
        try:
            future = self._pool.schedule(apply_with_return_error,
                                         args=(apply_args,),
                                         timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # Remove the temp file used to send data to the process
            remove_file_if_exists(filename)

            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            process_result = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')
            args = (self.PARSER_TIMEOUT, http_response.get_url())
            raise TimeoutError(msg % args)
        except ProcessExpired:
            # We reach here when the process died because of an error, we
            # handle this just like when the parser takes a lot of time and
            # we're unable to retrieve an answer from it
            msg = ('One of the parser processes died unexpectedly, this could'
                   ' be because of a bug, the operating system triggering OOM'
                   ' kills, etc. The scanner will continue with the next'
                   ' document, but the scan results might be inconsistent.')
            raise TimeoutError(msg)
        finally:
            # Remove the temp file used to send data to the process, we already
            # have the result, so this file is not needed anymore
            remove_file_if_exists(filename)

        # We still need to perform some error handling here...
        if isinstance(process_result, Error):
            if isinstance(process_result.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')
                args = (self.MEMORY_LIMIT, http_response.get_url())
                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            process_result.reraise()

        # NOTE(review): process_result is presumably the path of a temp file
        # written by the worker - confirm whether it must be removed once it
        # has been loaded.
        try:
            parser_output = load_object_from_temp_file(process_result)
        except Exception as e:
            msg = 'Failed to deserialize sub-process result. Exception: "%s"'
            args = (e,)
            raise Exception(msg % args)

        # Hand the deserialized parser back, as the docstring promises
        return parser_output
class StreamClient():
    ''' Client managing all stream pipelines '''

    # Maximum number of cameras that can be configured
    MAX_PIPELINE_COUNT = 20

    def __init__(self):
        ''' Create the process pool and the empty registries '''
        limit = StreamClient.MAX_PIPELINE_COUNT
        # Initialize concurrent engine
        self.pool = ProcessPool(max_workers=limit + 1, max_tasks=limit)
        # Maps pipeline key -> future of the process running it
        self.futures = dict()
        self.set_env_variables()
        # Cached configuration, merged from every schedule() call
        self.config = dict()

    def set_env_variables(self):
        ''' Sets GStreamer environmental variables '''
        # Logging level: only applied when the user has not chosen one
        os.environ.setdefault('GST_DEBUG', '2,splitmuxsink:4')

        # Plugin path
        plugin_path = "../gst_plugins"
        working_dir = os.getcwd()
        module_dir = os.path.abspath(os.path.dirname(__file__))
        if len(module_dir) > len(working_dir):
            # Strip the working directory from this file's directory, then
            # drop the first character of the joined result.
            relative_dir = os.path.normpath(module_dir.replace(working_dir, ''))
            plugin_path = os.path.join(relative_dir, plugin_path)[1:]
        # Python bindings path
        plugin_path += ":/usr/lib/gstreamer-1.0/"
        logger.info("GST_PLUGIN_PATH: %s" % plugin_path)
        os.environ["GST_PLUGIN_PATH"] = plugin_path

        # Set LIB_GSTREAMER_PATH, unless it has been exported already
        if 'LIB_GSTREAMER_PATH' in os.environ:
            return

        logger.info("Gstreamer version: {}.{}.{}.{}".format(*Gst.version()))
        if sys.platform == "darwin":
            lib_gst_path = "/usr/local/Cellar/gstreamer/{}.{}.{}/lib/libgstreamer-1.0.dylib".format(
                *Gst.version())
        else:
            candidates = (
                "/usr/lib/aarch64-linux-gnu/libgstreamer-1.0.so.0",
                "/usr/lib/x86_64-linux-gnu/libgstreamer-1.0.so.0",
            )
            # First candidate that exists on disk, or None
            lib_gst_path = next(
                (candidate for candidate in candidates
                 if os.path.exists(candidate)),
                None)

        if lib_gst_path is not None:
            logger.info("LIB_GSTREAMER_PATH: %s" % lib_gst_path)
            os.environ['LIB_GSTREAMER_PATH'] = lib_gst_path
        else:
            logger.warning(
                "libgstreamer-1.0 not found! Please export LIB_GSTREAMER_PATH env var manually."
            )

    def update(self, config):
        ''' Merge new config with existing one '''
        self.config.update(config)

    def get_config(self, config_or_filename):
        ''' Load config from file and return it in dictionary format.

        ``None`` and strings are passed to StreamConfig.LoadConfigFromFile;
        anything else is returned unchanged.
        '''
        already_loaded = not (config_or_filename is None
                              or isinstance(config_or_filename, str))
        if already_loaded:
            return config_or_filename
        return StreamConfig.LoadConfigFromFile(config_or_filename)

    def start(self, config_or_filename=None, key=None):
        ''' Start single pipeline synchronously and return its config '''
        # Get config in the proper format
        config = self.get_config(config_or_filename)

        # Flatten the config: a single nested entry supplies key and body
        if config is not None and len(config.keys()) == 1:
            (key,) = config.keys()
            config = config[key]

        # Start the pipeline and block until it has finished
        self.pool.schedule(stream_pipeline, args=[key, config]).result()
        return config

    def stop(self):
        ''' Stop the process pool '''
        self.pool.stop()

    def schedule(self, config_or_filename=None, wait_for_finish=False):
        ''' Start one or more pipelines asynchronously, in parallel.

        Returns the cached configuration; when scheduling failed it carries
        an "error" entry with the repr of the exception.
        '''
        try:
            # Get config in the proper format
            config = self.get_config(config_or_filename)

            # Update cached config
            self.update(config)

            # Check for misuse of schedule()
            for flag in ('debug', 'enabled'):
                if flag in config and isinstance(config[flag], bool):
                    logger.error(
                        "Either use start() instead of schedule() or nest the configuration into a pipeline."
                    )
                    return config

            # Stop pipelines that are going to be reconfigured
            for name, running in self.futures.items():
                if name not in config:
                    continue
                logger.info("Cancelling the following pipeline: %s" % name)
                cancelled = running.cancel()
                if not cancelled:
                    logger.error(
                        "Failed to cancel the following pipeline: %s" % name)

            # Spin off each camera pipeline in a separate thread/process
            for name in config:
                # Cap at MAX_PIPELINE_COUNT
                if len(self.futures) >= StreamClient.MAX_PIPELINE_COUNT:
                    logger.error(
                        "Maximum number of pipelines reached. Not configuring %s"
                        % name)
                    continue

                # Skip those sources that are disabled in configuration
                entry = config[name]
                if "enabled" in entry and not entry["enabled"]:
                    logger.info("Skipping %s (disabled)" % name)
                    del self.config[name]
                    continue

                # Start new process/thread
                self.futures[name] = self.pool.schedule(stream_pipeline,
                                                        args=[name, entry])

            # This is a blocking call, therefore use with caution (it will prevent parallel execution!)
            if wait_for_finish:
                for running in self.futures.values():
                    logger.info(running.result())

        except Exception as e:
            description = repr(e)
            logger.error("Failed to start pipeline(s): " + description)
            self.config["error"] = description

        return self.config
class EvaluationPool:
    """Wrapper around ProcessPool.

    Essentially a ProcessPool that can only evaluate configs,
    with some tweaks and caching. Aims to do the following:

    - Instantiate evaluators once in the workers to reduce overhead
    - Use a cache to not have to run evaluations twice
    - Provide common format for results, with support for "ok" and "error" status
    - Catches specified, but not all, exceptions
    - Provides timeouts backed by a sufficiently brutal approach to killing processes*

    We therefore sacrifice a little bit of generality for convenience in our
    particular domain, which is just how we like it.

    Note that this deliberately doesn't do any event loop management, it simply
    provides the `.schedule` function which schedules the evaluation of a config,
    and a `.finish` function which obtains the result, with error checking, of
    that evaluation.

    ***

    WARNING: hash keys in the cache (called "evals") are solely based on the
    config to be evaluated. So if some `evals` with a different underlying
    evaluator get passed, things will break in undefined ways. Since you are not
    expected to ever touch this class without `Run` mediating, I don't think this
    is a problem. But be careful out there.

    ***

    WARNING: macOS has some issues with multiprocessing and fork safety. This
    should not be a problem with this implementation, but if the models evaluated
    do something fancy, this might be the problem. So if you encounter something
    like `RuntimeError: Unexpected error within the Pool`, please check whether it
    persists on Linux. (I've observed this in particular with using something that
    relies on sqlite3 and attempts to write things to disk concurrently.)

    ---

    * We interface with external code that doesn't always play by the rules, and
    in particular is quite fond of not reacting to SIGTERM. The `concurrent.futures`
    `ProcessPoolExecutor` doesn't seem to be able to enforce a timeout in such cases.
    """

    def __init__(
        self,
        max_workers,
        evaluator_config,
        evaluator_context=None,
        evals=None,
        trial_timeout=None,
        caught_exceptions=(TimeoutError, ),
    ):
        """Create the worker pool and the result cache.

        Args:
            max_workers: Number of worker processes.
            evaluator_config: Passed to the worker initializer.
            evaluator_context: Passed to the worker initializer; an empty
                dict is used when omitted.
            evals: Result cache; a new `ResultDB` is created when omitted.
            trial_timeout: Timeout applied to every real evaluation.
            caught_exceptions: Exception types that `finish` turns into an
                "error" result instead of re-raising.
        """
        # A fresh dict per instance instead of a shared mutable default.
        if evaluator_context is None:
            evaluator_context = {}

        self.trial_timeout = trial_timeout
        self.pool = ProcessPool(
            initializer=initializer,
            initargs=(evaluator_config, evaluator_context),
            max_workers=max_workers,
        )

        if platform.system() == "Darwin" and max_workers > 1:
            logger.warning(
                "Parallel support on macOS is a bit wonky. Proceed with caution."
            )

        if evals is None:
            evals = ResultDB()
        self.evals = evals
        self.caught_exceptions = caught_exceptions

    def schedule(self, suggestion):
        """Schedule evaluation of a suggestion.

        This also checks the cache in the background, and
        creates a faux future to return the cached result.
        This is slightly inefficient, but it substantially
        reduces the complexity of the interface: We can now
        always expect a future as a result, and the re-submission
        can be handled in a unified way by the `Run`.

        (You can't simply keep requesting suggestions until you hit
        something is not in the cache, this leads to deadlocks when
        the search space has been exhausted.)

        Returns:
            The scheduled future, annotated with `eid` and `suggestion`.
        """
        eid = compute_hash(suggestion)

        if eid in self.evals:
            # Cached: schedule a trivial task that hands the result back.
            result = self.evals.get_result(eid)
            future = self.pool.schedule(passthrough, args=(result, ))
        else:
            future = self.pool.schedule(evaluate,
                                        args=(eid, suggestion),
                                        timeout=self.trial_timeout)

        future.eid = eid  # annotate with hash key in evals
        future.suggestion = suggestion

        return future

    def finish(self, future):
        """Obtain result of an evaluation future, catch errors, update caches.

        Should only be called with a finished future... but it's not a
        problem if it's not. The call to `future.result()` will trigger
        execution.

        Returns:
            The evaluation result, or a dict with a single "error" entry
            when one of `caught_exceptions` was raised.

        Raises:
            Exception: Any exception not listed in `caught_exceptions`, after
                it has been logged.
        """
        try:
            result = future.result()
            self.evals.submit_result(future.eid, result)
            return result

        except self.caught_exceptions as e:
            trace = traceback.format_exc()
            result = {
                "error": {
                    "error": e.__class__.__name__,
                    "error_text": str(e),
                    "traceback": trace,
                    "suggestion": future.suggestion,
                }
            }
            # Errors are cached too, so the same config is not retried.
            self.evals.submit_result(future.eid, result)
            return result

        except Exception as e:
            # uncaught exception, print suggestion and exit
            trace = traceback.format_exc()
            message = f"Unexpected error {e.__class__.__name__} evaluating a trial.\n"
            message += f"Error string: {e}.\n"
            message += f"Suggestion: {future.suggestion}.\n"
            message += f"Traceback:\n{trace}."
            logger.error(message)
            raise

    def shutdown(self):
        """Stop the pool and give it one second to be joined."""
        self.pool.stop()  # no point in waiting for things
        try:
            self.pool.join(timeout=1.0)
            logger.info("Successfully and peacefully shut down pool.")
        except TimeoutError:
            logger.info(
                "Failed to peacefully shut down pool... but no worries.")
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer regardless of the
    # interpreter's division semantics (`/` yields a float on Python 3 or
    # under `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else ((multiprocessing.cpu_count() // 2) or 1)

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        # The pool is created lazily by start_workers()
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers (only the first call creates the pool)

        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue,))

            return self._pool

    def stop_workers(self):
        """
        Stop the pool workers

        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raise BaseFrameworkException: When the parser does not finish in
                                       PARSER_TIMEOUT seconds
        """
        # Start the worker processes if needed. Keep the returned reference
        # instead of re-reading self._pool, which stop_workers() may set to
        # None in the meantime.
        pool = self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        future = pool.schedule(apply_with_return_error,
                               args=(apply_args,),
                               timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            # Errors raised in the worker come back wrapped in an Error
            # instance, re-raise them in this process
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param http_response: The http response instance
        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed. Keep the returned reference
        # instead of re-reading self._pool, which stop_workers() may set to
        # None in the meantime.
        pool = self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = pool.schedule(apply_with_return_error,
                               args=(apply_args,),
                               timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer regardless of the
    # interpreter's division semantics (`/` yields a float on Python 3 or
    # under `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else ((multiprocessing.cpu_count() // 2) or 1)

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    # Document parsers can go crazy on memory usage when parsing some very
    # specific HTML / PDF documents. Sometimes when this happens the operating
    # system does an out of memory (OOM) kill of a "randomly chosen" process.
    #
    # We limit the memory which can be used by parsing processes to this constant
    #
    # The feature was tested in test_pebble_limit_memory_usage.py
    MEMORY_LIMIT = get_memory_limit()

    def __init__(self):
        # The pool is created lazily by start_workers()
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers (only the first call creates the pool)

        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, self.MEMORY_LIMIT))

            return self._pool

    def stop_workers(self):
        """
        Stop the pool workers

        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raise ScanMustStopException: When the pool refuses to schedule the
                                      task with a RuntimeError
        :raise TimeoutError: When the parser does not finish in
                             PARSER_TIMEOUT seconds
        :raise MemoryError: When the worker reported a MemoryError
        """
        # Start the worker processes if needed. Keep the returned reference
        # instead of re-reading self._pool, which stop_workers() may set to
        # None in the meantime.
        pool = self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        try:
            future = pool.schedule(apply_with_return_error,
                                   args=(apply_args,),
                                   timeout=self.PARSER_TIMEOUT)
        except RuntimeError as rte:
            # We get here when the pebble pool management thread dies and
            # suddenly starts answering all calls with:
            #
            # RuntimeError('Unexpected error within the Pool')
            #
            # The scan needs to stop because we can't parse any more
            # HTTP responses, which is a very critical part of the process
            msg = str(rte)
            raise ScanMustStopException(msg)

        try:
            parser_output = future.result()
        except TimeoutError:
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise TimeoutError(msg % args)

        # We still need to perform some error handling here...
        if isinstance(parser_output, Error):

            if isinstance(parser_output.exc_value, MemoryError):
                msg = ('The parser exceeded the memory usage limit of %s bytes'
                       ' while trying to parse "%s". The parser was stopped in'
                       ' order to prevent OOM issues.')

                args = (self.MEMORY_LIMIT, http_response.get_url())

                om.out.debug(msg % args)
                raise MemoryError(msg % args)

            # Any other error raised in the worker is re-raised here
            parser_output.reraise()

        # Success!
        return parser_output
class MultiProcessingDocumentParser(object):
    """
    A document parser that performs all its tasks in different processes and
    returns results to the main process.

    Also implements a parsing timeout just in case the parser enters an
    infinite loop.

    :author: Andres Riancho ([email protected])
    """
    DEBUG = core_profiling_is_enabled()

    # Floor division keeps the worker count an integer regardless of the
    # interpreter's division semantics (`/` yields a float on Python 3 or
    # under `from __future__ import division`).
    MAX_WORKERS = 2 if is_running_on_ci() else ((multiprocessing.cpu_count() // 2) or 1)

    # Increasing the timeout when profiling is enabled seems to fix issue #9713
    #
    # https://github.com/andresriancho/w3af/issues/9713
    PROFILING_ENABLED = (user_wants_memory_profiling() or
                         user_wants_pytracemalloc() or
                         user_wants_cpu_profiling())

    # in seconds
    PARSER_TIMEOUT = 60 * 3 if PROFILING_ENABLED else 10

    def __init__(self):
        # The pool is created lazily by start_workers()
        self._pool = None
        self._start_lock = threading.RLock()

    def start_workers(self):
        """
        Start the pool and workers (only the first call creates the pool)

        :return: The pool instance
        """
        with self._start_lock:
            if self._pool is None:

                # Start the process pool
                log_queue = om.manager.get_in_queue()

                self._pool = ProcessPool(self.MAX_WORKERS,
                                         max_tasks=20,
                                         initializer=init_worker,
                                         initargs=(log_queue, ))

            return self._pool

    def stop_workers(self):
        """
        Stop the pool workers

        :return: None
        """
        if self._pool is not None:
            self._pool.stop()
            self._pool.join()
            self._pool = None

    def get_document_parser_for(self, http_response):
        """
        Get a document parser for http_response

        This parses the http_response in a pool worker. This method has two
        features:
            * We can kill the worker if the parser is taking too long
            * We can have different parsers

        :param http_response: The http response instance
        :return: An instance of DocumentParser
        :raise BaseFrameworkException: When the parser does not finish in
                                       PARSER_TIMEOUT seconds
        """
        # Start the worker processes if needed. Keep the returned reference
        # instead of re-reading self._pool, which stop_workers() may set to
        # None in the meantime.
        pool = self.start_workers()

        apply_args = (process_document_parser,
                      http_response,
                      self.DEBUG)

        # Push the task to the workers
        future = pool.schedule(apply_with_return_error,
                               args=(apply_args, ),
                               timeout=self.PARSER_TIMEOUT)

        try:
            parser_output = future.result()
        except TimeoutError:
            # Act just like when there is no parser
            msg = ('[timeout] The parser took more than %s seconds'
                   ' to complete parsing of "%s", killed it!')

            args = (self.PARSER_TIMEOUT, http_response.get_url())

            raise BaseFrameworkException(msg % args)
        else:
            # Errors raised in the worker come back wrapped in an Error
            # instance, re-raise them in this process
            if isinstance(parser_output, Error):
                parser_output.reraise()

        return parser_output

    def get_tags_by_filter(self, http_response, tags, yield_text=False):
        """
        Return Tag instances for the tags which match the `tags` filter,
        parsing and all lxml stuff is done in another process and the Tag
        instances are sent to the main process (the one calling this method)
        through a pipe

        Some things to note:
            * Not all responses can be parsed, so I need to call DocumentParser
              and handle exceptions

            * The parser selected by DocumentParser might not have tags, and
              it might not have get_tags_by_filter. In this case just return an
              empty list

            * Just like get_document_parser_for we have a timeout in place,
              when we hit the timeout just return an empty list, this is not
              the best thing to do, but makes the plugin code easier to write
              (plugins would ignore this anyways)

        :param http_response: The http response instance
        :param tags: The filter
        :param yield_text: Should we yield the tag text?
        :return: A list of Tag instances as defined in sgml.py

        :see: SGMLParser.get_tags_by_filter
        """
        # Start the worker processes if needed. Keep the returned reference
        # instead of re-reading self._pool, which stop_workers() may set to
        # None in the meantime.
        pool = self.start_workers()

        apply_args = (process_get_tags_by_filter,
                      http_response,
                      tags,
                      yield_text,
                      self.DEBUG)

        # Push the task to the workers
        future = pool.schedule(apply_with_return_error,
                               args=(apply_args, ),
                               timeout=self.PARSER_TIMEOUT)

        try:
            filtered_tags = future.result()
        except TimeoutError:
            # We hit a timeout, return an empty list
            return []
        else:
            # There was an exception in the parser, maybe the HTML was really
            # broken, or it wasn't an HTML at all.
            if isinstance(filtered_tags, Error):
                return []

        return filtered_tags