def optimize_parallel_gpu(
        self,
        train_function,
        gpu_ids,
        max_nb_trials=None,
):
    """
    Runs optimization across gpus with cuda drivers
    :param train_function:
    :param max_nb_trials:
    :param gpu_ids: List of strings like: ['0', '1, 3']
    :return:
    """
    self.trials = strategies.generate_trials(
        strategy=self.strategy,
        flat_params=self.__flatten_params(self.opt_args),
        nb_trials=max_nb_trials,
    )

    self.trials = [(self.__namespace_from_trial(x), train_function) for x in self.trials]

    # build q of gpu ids so we can use them in each process
    # this is thread safe so each process can pull out a gpu id, run its task and put it back when done
    if self.pool is None:
        gpu_q = Queue()
        for gpu_id in gpu_ids:
            gpu_q.put(gpu_id)

        # init a pool with the nb of worker threads we want
        nb_workers = len(gpu_ids)
        self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,))

    # apply parallelization
    results = self.pool.map(optimize_parallel_gpu_private, self.trials)
    return results
def _create_tensor_dicts_from_queue(input_queue: Queue, output_queue: Queue,
                                    iterator: DataIterator, shuffle: bool,
                                    index: int) -> None:
    """
    Pulls instances from ``input_queue``, converts them into ``TensorDict``s
    using ``iterator``, and puts them on the ``output_queue``.
    """
    logger.info(f"Iterator worker: {index} PID: {os.getpid()}")

    def instances() -> Iterator[Instance]:
        instance = input_queue.get()
        while instance is not None:
            yield instance
            instance = input_queue.get()

    for tensor_dict in iterator(instances(), num_epochs=1, shuffle=shuffle):
        output_queue.put(tensor_dict)

    output_queue.put(index)

    # We need to ensure we've gotten all the tensors out of this queue before
    # this process ends. Otherwise we'll crash. See
    # https://github.com/pytorch/pytorch/issues/7181. This appears to be an
    # issue specifically with tensors, perhaps due to the refcounting involved
    # in managing them in shared memory. If you're working on this code, be
    # aware that I've only been able to reproduce this issue on Linux. Testing
    # on a Mac alone is not sufficient.
    output_queue.join()
def optimize_trials_parallel_gpu(
        self,
        train_function,
        nb_trials,
        trials,
        gpu_ids,
        nb_workers=4,
):
    """
    Runs optimization across gpus with cuda drivers
    :param train_function:
    :param nb_trials:
    :param gpu_ids: List of strings like: ['0', '1, 3']
    :param nb_workers:
    :return:
    """
    self.trials = trials
    self.trials = [(x, train_function) for x in self.trials]

    # build q of gpu ids so we can use them in each process
    # this is thread safe so each process can pull out a gpu id, run its task and put it back when done
    if self.pool is None:
        gpu_q = Queue()
        for gpu_id in gpu_ids:
            gpu_q.put(gpu_id)

        # init a pool with the nb of worker threads we want
        self.pool = Pool(processes=nb_workers, initializer=init, initargs=(gpu_q,))

    # apply parallelization
    results = self.pool.map(optimize_parallel_gpu_private, self.trials)
    return results
def _worker_loop(dataset, job_queue: mp.Queue, result_queue: mp.Queue,
                 interrupt_event: mp.Event):
    logger = logging.getLogger("worker_loop")
    logger.debug("Worker started.")
    while True:
        logger.debug("Trying to fetch from job_queue.")
        if interrupt_event.is_set():
            logger.debug("Received interrupt signal, breaking.")
            break
        try:
            # This assumes that the job_queue is fully populated before the worker is started.
            index = job_queue.get_nowait()
            logger.debug("Fetch successful.")
        except Empty:
            logger.debug("Queue empty, setting up poison pill.")
            index = None
        if index is None or interrupt_event.is_set():
            logger.debug("Fetched poison pill or received interrupt signal, breaking.")
            break
        try:
            logger.debug("Sampling index {} from dataset.".format(index))
            sample = dataset[index]
        except Exception:
            logger.debug("Dataset threw an exception at index {}.".format(index),
                         exc_info=True)
            result_queue.put((index, ExceptionWrapper(sys.exc_info())))
        else:
            logger.debug("Putting sample at index {} in the result queue.".format(index))
            result_queue.put((index, sample))
def work(self, job_q: Queue, done_q: Queue, num_jobs_to_perform: int,
         device: torch.device, logger: MLgymLoggerIF):
    logger.log(LogLevel.INFO, f"Process {self.process_id} started working.")
    jobs_done_count = 0
    for job in iter(job_q.get, None):  # https://stackoverflow.com/a/21157892
        job.device = device
        job.executing_process_id = self.process_id
        logger.log(
            LogLevel.INFO,
            f"Process {job.executing_process_id} started job {job.job_id} on {job.device}."
        )
        job.starting_time = time.time()
        if job.job_type == JobType.CALC:
            self._do_calc(job)
        job.finishing_time = time.time()
        job.done = True
        jobs_done_count += 1
        if job.job_type == JobType.TERMINATE or num_jobs_to_perform == jobs_done_count:
            logger.log(LogLevel.DEBUG, f"Process {self.process_id} terminated.")
            done_q.put(job)
            break
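# --- Added sketch (not from the original source): the sentinel pattern the loop
# above relies on. `iter(job_q.get, None)` keeps calling `get` until it returns
# the None "poison pill", which is how the parent tells a worker to exit.
# All names below are illustrative.
import multiprocessing as mp


def _echo_worker(job_q: mp.Queue, done_q: mp.Queue) -> None:
    for job in iter(job_q.get, None):  # blocks until a job or the None sentinel arrives
        done_q.put(job * 2)


if __name__ == "__main__":
    job_q, done_q = mp.Queue(), mp.Queue()
    worker = mp.Process(target=_echo_worker, args=(job_q, done_q))
    worker.start()
    for i in range(3):
        job_q.put(i)
    job_q.put(None)  # poison pill: ends the worker's iter() loop
    worker.join()
    print(sorted(done_q.get() for _ in range(3)))  # [0, 2, 4]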
def read_data(dataset: Union[Video_2D_Inference, Video_3D_Inference],
              batch_size: int, num_worker: int, data_queue: mp.Queue):
    mp.set_sharing_strategy('file_system')
    for item in DataLoader(dataset, batch_size=batch_size, num_workers=num_worker):
        data_queue.put(item)
def start_processes_zombie_test(
    idx: int,
    entrypoint: Union[str, Callable],
    mp_queue: mp.Queue,
    log_dir: str,
    nproc: int = 2,
) -> None:
    """
    Starts processes
    """
    args = {}
    envs = {}
    for idx in range(nproc):
        args[idx] = ()
        envs[idx] = {}

    pc = start_processes(
        name="zombie_test",
        entrypoint=entrypoint,
        args=args,
        envs=envs,
        log_dir=log_dir,
        redirects=Std.NONE,
    )

    my_pid = os.getpid()
    mp_queue.put(my_pid)
    for child_pid in pc.pids().values():
        mp_queue.put(child_pid)

    try:
        pc.wait(period=1, timeout=300)
    except SignalException as e:
        pc.close(e.sigval)
def train(self, data_loaders, num_updates=5, tb=None, num_iters=250000):
    data_queue = Queue()
    # for notifying when to receive data
    data_event = Event()
    # for notifying this method when to send new data
    process_event = Event()
    # so doesn't hang on first iteration
    process_event.set()
    num_tasks = len(data_loaders)

    processes = []
    for process_id in range(self.world_size):
        processes.append(Process(target=self.init_process,
                                 args=(process_id, data_queue, data_event,
                                       process_event, num_updates,
                                       tb if process_id == 0 else None)))
        processes[-1].start()

    for num_iter in range(num_iters):
        print("num iter:", num_iter)
        process_event.wait()
        process_event.clear()
        tasks = np.random.randint(0, num_tasks, (self.world_size))
        for task in tasks:
            task_data = next(data_loaders[task])  # place holder for sampling data from dataset
            data_queue.put((task_data[0].numpy()[0], task_data[1].numpy()[0],
                            task_data[2].numpy()[0], task_data[3].numpy()[0]))
        data_event.set()

    for p in processes:
        p.terminate()
        p.join()
def update_runner(num_states: int, back_max: int, update_batch_size: int,
                  heur_fn_i_q, heur_fn_o_q, proc_id: int, env: Environment,
                  result_queue: Queue, num_steps: int, update_method: str,
                  eps_max: float):
    heuristic_fn = nnet_utils.heuristic_fn_queue(heur_fn_i_q, heur_fn_o_q, proc_id, env)

    start_idx: int = 0
    while start_idx < num_states:
        end_idx: int = min(start_idx + update_batch_size, num_states)

        states_itr, _ = env.generate_states(end_idx - start_idx, (0, back_max))

        if update_method.upper() == "GBFS":
            states_update, cost_to_go_update, is_solved = gbfs_update(
                states_itr, env, num_steps, heuristic_fn, eps_max)
        elif update_method.upper() == "ASTAR":
            states_update, cost_to_go_update, is_solved = astar_update(
                states_itr, env, num_steps, heuristic_fn)
        else:
            raise ValueError("Unknown update method %s" % update_method)

        states_update_nnet: List[np.ndarray] = env.state_to_nnet_input(states_update)

        result_queue.put((states_update_nnet, cost_to_go_update, is_solved))

        start_idx = end_idx

    result_queue.put(None)
def _run_game(process_id: int, game_factory: GameExecutorFactory, network: nn.Module,
              device: torch.device, request_queue: Queue, experience_queue: Queue,
              batch_size: int, transfer_blocks: int, transfer_to_device: bool) -> None:
    exploration_rate = 1.
    game = game_factory.create()
    print('* worker %d started' % process_id)
    while True:
        try:
            if not request_queue.empty():
                request: _RunGameRequest = request_queue.get(block=False)
                if request.do_terminate:
                    print('* game worker %d terminated' % process_id)
                    experience_queue.close()
                    request_queue.close()
                    return
                if request.set_exploration_rate is not None:
                    exploration_rate = request.set_exploration_rate

            block = []
            for _ in range(transfer_blocks):
                eps, exps = game.multi_step(network, device, exploration_rate, batch_size)
                if transfer_to_device:
                    exps = [e.to_device(device, non_blocking=False) for e in exps]
                block.append((eps, exps))
            experience_queue.put(block, block=True)
        except Exception as e:
            print('error in worker %d: ' % process_id, e)
def mixup_process_worker_wrapper(q_input: mp.Queue, q_output: mp.Queue, device: int):
    """
    :param q_input: input queue
    :param q_output: output queue
    :param device: running gpu device
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{device}"
    print(f"Process generated with cuda:{device}")
    device = torch.device(f"cuda:{device}")
    while True:
        # Get and load on gpu
        out, target_reweighted, hidden, args, sc, A_dist, debug = q_input.get()
        out = out.to(device)
        target_reweighted = target_reweighted.to(device)
        sc = sc.to(device)
        A_dist = A_dist.to(device)

        # Run
        out, target_reweighted = mixup_process_worker(out, target_reweighted, hidden,
                                                      args, sc, A_dist, debug)

        # To cpu and return
        out = out.cpu()
        target_reweighted = target_reweighted.cpu()
        q_output.put([out, target_reweighted])
def _create_tensor_dicts_from_qiterable(qiterable: QIterable, output_queue: Queue,
                                        iterator: DataIterator, shuffle: bool,
                                        index: int) -> None:
    """
    Pulls instances from ``qiterable.output_queue``, converts them into
    ``TensorDict``s using ``iterator``, and puts them on the ``output_queue``.
    """
    logger.info(f"Iterator worker: {index} PID: {os.getpid()}")

    def instances() -> Iterator[Instance]:
        while qiterable.num_active_workers.value > 0 or qiterable.num_inflight_items.value > 0:
            while True:
                try:
                    yield qiterable.output_queue.get(block=False, timeout=1.0)
                    with qiterable.num_inflight_items.get_lock():
                        qiterable.num_inflight_items.value -= 1
                except Empty:
                    break

    for tensor_dict in iterator(instances(), num_epochs=1, shuffle=shuffle):
        output_queue.put(tensor_dict)

    output_queue.put(index)

    # See the note above in _create_tensor_dicts_from_queue.
    output_queue.join()
def loop_test(network, device, transformer, img_q: Queue, bbox_q: Queue, threshold=0.35):
    scale = None
    print(f"NETWORK IS NONE {type(network)}")
    print("STARTING TO SPIN DETECT LOOP")
    while True:
        print("WAIT")
        image = img_q.get()
        print("RECV")
        if type(image) is str and image == "DONE":
            del image
            break
        print("CHECK")
        boxes = detect_face(image, network, transformer, device, threshold)
        print("SENDING")
        bbox_q.put(boxes)
        print("SENT")
        # DONT FORGET TO CLEANUP
        del image

    img_q.close()
    bbox_q.close()
    print("BYE")
def _prefetch(in_queue: mp.Queue, out_queue: mp.Queue, batchsize: int,
              shutdown_event: mp.Event, target_device, waiting_time=5):
    """Continuously prefetches complete trajectories dropped by the
    :py:class:`~.TrajectoryStore` for training.

    As long as shutdown is not set, this method pulls :py:attr:`batchsize`
    trajectories from :py:attr:`in_queue`, transforms them into batches using
    :py:meth:`~_to_batch()` and puts them onto the :py:attr:`out_queue`.

    This usually runs as an asynchronous :py:obj:`multiprocessing.Process`.

    Parameters
    ----------
    in_queue: :py:obj:`multiprocessing.Queue`
        A queue that delivers dropped trajectories from :py:class:`~.TrajectoryStore`.
    out_queue: :py:obj:`multiprocessing.Queue`
        A queue that delivers batches to :py:meth:`_loop()`.
    batchsize: `int`
        The number of trajectories that shall be processed into a batch.
    shutdown_event: :py:obj:`multiprocessing.Event`
        An event that breaks this method's internal loop.
    target_device: :py:obj:`torch.device`
        The target device of the batch.
    waiting_time: `float`
        Time the method's loop sleeps between each iteration.
    """
    while not shutdown_event.is_set():
        try:
            trajectories = [in_queue.get(timeout=waiting_time) for _ in range(batchsize)]
        except queue.Empty:
            continue

        batch = Learner._to_batch(trajectories, target_device)

        # delete Tensors after usage to free memory (see torch multiprocessing)
        del trajectories

        try:
            out_queue.put(batch)
        except (AssertionError, ValueError):  # queue closed
            continue

        # delete Tensors after usage to free memory (see torch multiprocessing)
        del batch

    try:
        del trajectories
    except UnboundLocalError:  # already deleted
        pass
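# --- Added sketch (not from the original source): how the prefetcher above is
# presumably launched, assuming `_prefetch` is a staticmethod on `Learner` (as
# the call to `Learner._to_batch` suggests). Queue sizes and the batch size of
# 8 are illustrative assumptions, not values from the source.
import torch
import torch.multiprocessing as mp

in_queue = mp.Queue()
out_queue = mp.Queue(maxsize=4)
shutdown_event = mp.Event()

prefetch_proc = mp.Process(
    target=Learner._prefetch,
    args=(in_queue, out_queue, 8, shutdown_event, torch.device("cuda:0")),
    daemon=True,
)
prefetch_proc.start()
# ... the training loop consumes batches from out_queue here ...
shutdown_event.set()  # breaks the prefetch loop
prefetch_proc.join()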
def dynamic_power(model, input_shape):
    q = Queue()
    power_return = Queue()
    interval_return = Queue()
    latency_return = Queue()
    input_tensor_queue = Queue()
    model_queue = Queue()

    input_tensor = torch.ones([*input_shape])
    input_tensor_queue.put(input_tensor)

    model.share_memory()
    model_queue.put(model)

    context = torch.multiprocessing.get_context('spawn')
    p_thread = context.Process(target=power_thread,
                               args=(power_return, interval_return, q))
    l_thread = context.Process(target=latency_thread,
                               args=(model_queue, input_tensor_queue, latency_return, q))
    l_thread.start()
    p_thread.start()

    power_l = list()     # GPU power list
    interval_l = list()  # power interval list
    latency_l = list()   # latency list

    l_thread.join()

    while True:
        if not power_return.empty():
            power_l.append(power_return.get())
        if not interval_return.empty():
            interval_l.append(interval_return.get())
        if not latency_return.empty():
            latency_l.append(latency_return.get())
        if power_return.empty() and interval_return.empty() and latency_return.empty():
            break

    power_return.close()
    interval_return.close()
    latency_return.close()
    q.close()

    del q
    del power_return
    del latency_return
    del interval_return

    return latency_l, power_l, interval_l
def evaluate_single_thread(p_id, model, config, seeds_per_thread, output: Queue):
    rewards = []
    modified_rewards = []
    steps_counts = []
    infos = []
    for seed_plus in range(p_id * seeds_per_thread, (p_id + 1) * seeds_per_thread):
        explorer_seed = 721 + seed_plus * 29
        set_seeds(explorer_seed)

        internal_env_args = {'env_type': 'virtual',
                             'env_init_args': {
                                 'host_tcp': config['training']['client']['host_tcp'],
                                 'port_tcp': config['training']['client']['port_tcp_start'] + p_id
                             },
                             'env_config': config['environment']['core']}
        internal_env_args['env_config']['seed'] = explorer_seed

        env = create_env(config, internal_env_args, transfer=config['training']['transfer'])
        observation = env.reset()

        done = False
        steps = 0
        reward_sum = 0.0
        reward_modified_sum = 0.0

        while not done:
            observation_transformed, _ = observation
            observation, (reward, reward_modified), done, _ = env.step(model.act(observation_transformed))
            reward_sum += reward
            reward_modified_sum += reward_modified
            steps += config["environment"]["wrapper"]["repeat_actions"]

        target_velocities = [[float(v) for v in tv]
                             for tv in np.unique([obs["target_vel"] for obs in env.observations], axis=0)]
        velocity_similarity_measure = [np.linalg.norm(np.array(obs["target_vel"])[[0, 2]]
                                                      - np.array(obs["body_vel"]["pelvis"])[[0, 2]])
                                       for obs in env.observations]
        velocity_confidence_intervals = [mean_confidence_interval(velocity_similarity_measure, 0.95),
                                         mean_confidence_interval(velocity_similarity_measure, 0.99)]

        rewards.append(reward_sum)
        modified_rewards.append(reward_modified_sum)
        steps_counts.append(steps)

        print(explorer_seed, ':', reward_sum, ':', steps)

        infos.append({"target": target_velocities,
                      "target_similarity_confidence_intervals": velocity_confidence_intervals,
                      "seed": explorer_seed})

    output.put((rewards, modified_rewards, steps_counts, infos))
def multiprocess_training_loader(process_number: int, _config, _queue: mp.Queue,
                                 _wait_for_exit: mp.Event, _local_file,
                                 _fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data):

    # workflow: we tokenize the data files with the costly spacy before training in a preprocessing step
    # (and concat the tokens with single whitespaces), so here we only split on the whitespaces
    _tokenizer = None
    if _config["preprocessed_tokenized"] == True:
        _tokenizer = WordTokenizer(word_splitter=JustSpacesWordSplitter())

    if _config["token_embedder_type"] == "embedding":
        _token_indexers = {"tokens": SingleIdTokenIndexer(lowercase_tokens=True)}
        _vocab = Vocabulary.from_files(_config["vocab_directory"])

    elif _config["token_embedder_type"] == "fasttext":
        _token_indexers = {"tokens": FastTextNGramIndexer(_config["fasttext_max_subwords"])}
        _vocab = FastTextVocab(_fasttext_vocab_cached_mapping, _fasttext_vocab_cached_data,
                               _config["fasttext_max_subwords"])

    elif _config["token_embedder_type"] == "elmo":
        _token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
        _vocab = None

    _triple_loader = IrTripleDatasetReader(lazy=True, tokenizer=_tokenizer,
                                           token_indexers=_token_indexers,
                                           max_doc_length=_config["max_doc_length"],
                                           max_query_length=_config["max_query_length"])

    _iterator = BucketIterator(batch_size=int(_config["batch_size_train"]),
                               sorting_keys=[("doc_pos_tokens", "num_tokens"),
                                             ("doc_neg_tokens", "num_tokens")])

    _iterator.index_with(_vocab)

    for training_batch in _iterator(_triple_loader.read(_local_file), num_epochs=1):
        _queue.put(training_batch)  # this moves the tensors in to shared memory

    _queue.close()  # indicate this local thread is done
    _wait_for_exit.wait()  # keep this process alive until all the shared memory is used and not needed anymore
def ReceivePlayouts(worker: socket.socket, worker_id: int, out_queue: Queue):
    worker.setblocking(True)
    while True:
        try:
            msg: bytes = communication.Receive(worker)
        except Exception as err:
            print(f"Error with worker {worker_id}, ending connection")
            worker.close()
            return
        buffer = io.BytesIO(msg)
        tensor = torch.load(buffer)
        print(f"Received message {tensor}")
        out_queue.put(tensor)
def ReceiveParams(server: socket.socket, param_queue: Queue):
    server.setblocking(True)
    print("Listening for network updates...")
    while True:
        try:
            msg = communication.Receive(server)
        except Exception as err:
            print("Error with server connection, ending connection")
            server.close()
            return
        buffer = io.BytesIO(msg)
        state_dict = torch.load(buffer)
        param_queue.put(state_dict)
def _queuer(instances: Iterable[Instance], input_queue: Queue,
            num_workers: int, num_epochs: Optional[int]) -> None:
    """
    Reads Instances from the iterable and puts them in the input_queue.
    """
    epoch = 0
    while num_epochs is None or epoch < num_epochs:
        epoch += 1
        for instance in instances:
            input_queue.put(instance)

    # Now put a None for each worker, since each needs to receive one
    # to know that it's done.
    for _ in range(num_workers):
        input_queue.put(None)
def inference_video(
    model: torch.nn.Module,
    gpu_id: int,
    data_queue: mp.Queue,
    result_queue: mp.Queue,
    # dataset: Union[Video_2D_Inference, Video_3D_Inference],
    # batch_size: int, num_worker: int,
):
    model = model.eval().cuda(device=gpu_id)
    with torch.no_grad():
        # for data, fn, idx, done in DataLoader(dataset, batch_size=batch_size, num_workers=num_worker):
        while True:
            data, fn, idx, done = data_queue.get()
            out = model(data.cuda(device=gpu_id)).detach().cpu()
            result_queue.put((out, fn, idx.clone(), done.clone()))
            del data, idx, done
def _create_tensor_dicts(input_queue: Queue, output_queue: Queue,
                         iterator: DataIterator, shuffle: bool, index: int) -> None:
    """
    Pulls at most ``max_instances_in_memory`` from the input_queue,
    groups them into batches of size ``batch_size``, converts them
    to ``TensorDict`` s, and puts them on the ``output_queue``.
    """
    def instances() -> Iterator[Instance]:
        instance = input_queue.get()
        while instance is not None:
            yield instance
            instance = input_queue.get()

    for tensor_dict in iterator(instances(), num_epochs=1, shuffle=shuffle):
        output_queue.put(tensor_dict)

    output_queue.put(index)
def _worker(reader: DatasetReader, input_queue: Queue, output_queue: Queue, index: int) -> None:
    """
    A worker that pulls filenames off the input queue, uses the dataset reader
    to read them, and places the generated instances on the output queue. When
    there are no filenames left on the input queue, it puts its ``index`` on
    the output queue and doesn't do anything else.
    """
    # Keep going until you get a file_path that's None.
    while True:
        file_path = input_queue.get()
        if file_path is None:
            # Put my index on the queue to signify that I'm finished
            output_queue.put(index)
            break

        logger.info(f"reading instances from {file_path}")
        for instance in reader.read(file_path):
            output_queue.put(instance)
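# --- Added sketch (not from the original source): one way `_queuer` and
# `_worker` above could be wired together. The real driver in the library may
# differ; the point is the sentinel accounting: one None per worker on the
# input queue, and one integer `index` per worker coming back on the output
# queue to signal completion. `_read_in_parallel` is a hypothetical helper.
from multiprocessing import Process, Queue


def _read_in_parallel(reader, file_paths, num_workers: int):
    input_queue: Queue = Queue()
    output_queue: Queue = Queue()

    workers = [
        Process(target=_worker, args=(reader, input_queue, output_queue, i))
        for i in range(num_workers)
    ]
    for w in workers:
        w.start()

    for path in file_paths:
        input_queue.put(path)
    for _ in range(num_workers):
        input_queue.put(None)  # one sentinel per worker

    finished = 0
    while finished < num_workers:
        item = output_queue.get()
        if isinstance(item, int):  # a worker put its index: it is done
            finished += 1
        else:
            yield item  # an Instance

    for w in workers:
        w.join()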
def run_workers_in_parallel(task_queue: mp.Queue, worker):
    NUMBER_OF_PROCESSES = min(int(mp.cpu_count() * 1.1), task_queue.qsize())

    # TODO: We've noticed that on certain 2-core machines parallelizing the tests
    # makes the llvm backend legacy pass manager 20x slower than using a
    # single process. Need to investigate the root cause eventually. This is a
    # hack to work around this issue.
    if mp.cpu_count() == 2:
        NUMBER_OF_PROCESSES = 1

    processes = []
    for i in range(NUMBER_OF_PROCESSES):
        p = mp.get_context("fork").Process(target=worker, args=(task_queue,))
        p.start()
        processes.append(p)
    for i in range(NUMBER_OF_PROCESSES):
        task_queue.put(queue_sentinel)
    for p in processes:
        p.join()
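# --- Added sketch (not from the original source): a worker compatible with
# `run_workers_in_parallel`. `queue_sentinel` is referenced but not defined in
# the snippet above, so a simple module-level marker is assumed here; the
# worker name and task payloads are illustrative.
import multiprocessing as mp

queue_sentinel = "QUEUE_SENTINEL"


def example_worker(task_queue: mp.Queue) -> None:
    while True:
        task = task_queue.get()
        if task == queue_sentinel:  # one sentinel per process ends the loop
            break
        print(f"processing {task}")


if __name__ == "__main__":
    tasks = mp.Queue()
    for t in range(8):
        tasks.put(t)
    run_workers_in_parallel(tasks, example_worker)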
class RemoteTaskManager:
    def __init__(self, factory_env, factory_mgr, n_tasks):
        self.pipe_cmd = Queue()  # we want to queue more data in a row
        self.pipe_data = [SimpleQueue() for _ in range(n_tasks + 1)]

        self.factory_mgr = factory_mgr

        # create thread ( in main process!! ) which will handle requests!
        self.com = RemoteTaskServer(factory_mgr, factory_env, self.pipe_cmd, self.pipe_data)

        self.dtb = {}
        self.lock = threading.RLock()

        # def turnon():
        self.com.start()

    def _ind(self, bot_id, objective_id):
        key = (bot_id, objective_id)
        assert key in self.dtb, "you forgot to register your environment [remote] in ctor!!"
        with self.lock:
            return self.dtb[key]

    def register(self, bot_id, objective_id):
        key = (bot_id, objective_id)
        with self.lock:  # multibot register
            assert key not in self.dtb, "double initialization of your environment [remote]!!"
            self.dtb[key] = len(self.dtb)
            assert len(self.dtb) <= len(self.pipe_data), "#tasks > #pipes [remote task manager]"

    def reset(self, bot_id, objective_id, seed):
        ind = self._ind(bot_id, objective_id)
        args = (ind, (bot_id, objective_id), seed)
        self.pipe_cmd.put(["reset", args])
        return self.pipe_data[ind].get()

    def step(self, bot_id, objective_id, action):
        ind = self._ind(bot_id, objective_id)
        args = (ind, (bot_id, objective_id), action)
        self.pipe_cmd.put(["step", args])
        return self.pipe_data[ind].get()
def __backbone_process(backbone_cfg: list, recivq: Queue, sendq: Queue, timeout,
                       run_semaphore, pause_event):
    # instantiate every component configured for this backbone
    backbone_components = [__build_backbone_component(bbcfg) for bbcfg in backbone_cfg]
    logger = get_logger()
    logger.info('create backbone')
    try:
        while True:
            if not run_semaphore.value:
                logger.info('backbone stopped via the run semaphore')
                break
            pause_event.wait()
            kwargs = recivq.get(timeout=timeout)
            # the first component in the pipeline processes the data first
            kwargs = backbone_components[0](**kwargs)
            if len(backbone_components) > 1:
                # if the pipeline has more than one component, hand the data to each following component in turn
                for backbone_component in backbone_components[1:]:
                    kwargs = backbone_component(**kwargs)
            # print('backbone sendq len is {}'.format(sendq.qsize()))
            if kwargs is not None:
                for img_info in kwargs['imgs_info']:
                    sendq.put(img_info, timeout=timeout)
    except KeyboardInterrupt:
        logger.info('user stopped a backbone_process process')
    except Empty:
        logger.info('backbone stopped normally')
    except Full as e:
        logger.exception(e)
        logger.warning('the queue to the main process is full; check that the main process is still consuming data')
    except Exception as e:
        logger.exception(e)
        logger.info('hit a non-recoverable error, forcing the whole background pipeline to stop; check the log output to locate the error')
        # import signal
        # os.killpg(os.getpgid(os.getpid()), signal.SIGKILL)
    finally:
        logger.info('release backbone resources')
        del logger
        recivq.cancel_join_thread()
        sendq.cancel_join_thread()
        recivq.close()
        sendq.close()
    return
def mixup_process_worker_wrapper(q_input: mp.Queue, q_output: mp.Queue):
    """
    :param q_input: input queue
    :param q_output: output queue
    """
    # os.environ["CUDA_VISIBLE_DEVICES"] = f"{device}"  # not to call torch.cuda initializer in device-0
    # print(f"cuda visible devices = {device}")
    # device = torch.device(f"cuda:0")
    while True:
        # get args
        key, out, target_reweighted, param_list, sc, A_dist, device = q_input.get()

        # run
        out, target_reweighted = mixup_match(out, target_reweighted, param_list,
                                             sc, A_dist, device)

        # return args
        q_output.put([key, out, target_reweighted])
def _worker(
    reader: DatasetReader,
    input_queue: Queue,
    output_queue: Queue,
    num_active_workers: Value,
    num_inflight_items: Value,
    worker_id: int,
) -> None:
    """
    A worker that pulls filenames off the input queue, uses the dataset reader
    to read them, and places the generated instances on the output queue. When
    there are no filenames left on the input queue, it decrements
    num_active_workers to signal completion.
    """
    logger.info(f"Reader worker: {worker_id} PID: {os.getpid()}")

    # Keep going until you get a file_path that's None.
    while True:
        file_path = input_queue.get()
        if file_path is None:
            # It's important that we close and join the queue here before
            # decrementing num_active_workers. Otherwise our parent may join us
            # before the queue's feeder thread has passed all buffered items to
            # the underlying pipe resulting in a deadlock.
            #
            # See:
            # https://docs.python.org/3.6/library/multiprocessing.html?highlight=process#pipes-and-queues
            # https://docs.python.org/3.6/library/multiprocessing.html?highlight=process#programming-guidelines
            output_queue.close()
            output_queue.join_thread()

            # Decrementing is not atomic.
            # See https://docs.python.org/2/library/multiprocessing.html#multiprocessing.Value.
            with num_active_workers.get_lock():
                num_active_workers.value -= 1

            logger.info(f"Reader worker {worker_id} finished")
            break

        logger.info(f"reading instances from {file_path}")
        for instance in reader.read(file_path):
            with num_inflight_items.get_lock():
                num_inflight_items.value += 1
            output_queue.put(instance)
def parallel_work_func(evaluator: BaseEvaluator, tracker: BaseTracker, dataset: Dataset,
                       gpu_id: int, prefetch: bool, work_dir: str,
                       idx_queue: multiprocessing.Queue, result_queue: multiprocessing.Queue):
    torch.cuda.set_device(gpu_id)
    tracker.cuda()
    tracker.eval()
    while True:
        idx = idx_queue.get()
        seq = dataset[idx]
        i_result = evaluator.run_sequence(tracker, seq, use_gpu=True,
                                          zero_based_index=dataset.zero_based_index,
                                          prefetch=prefetch, work_dir=work_dir)
        result_queue.put((idx, i_result))
def subprocess_prefetch(generator: Iterable[Union[np.array, Iterable[np.array]]],
                        prefetch_buffer_size: int,
                        ) -> Iterable[Union[np.array, Iterable[np.array]]]:
    """
    Wraps a generator to prefetch batches in a separate subprocess. It can be
    used in a `with` block (which grants proper resource cleanup) or directly
    as a normal generator. It relies on the ability of torch.multiprocessing
    to load Tensors in shared memory; this way, the subprocess loads the numpy
    array from disk, creates a torch Tensor from it and then sends it through
    a Queue to the main process, which consumes it.

    :param generator: Generator to wrap.
    :param prefetch_buffer_size: Size of the prefetch buffer.
    :return: Wrapped generator.
    """
    batch_queue = Queue(prefetch_buffer_size)
    control_queue = Queue()
    Process(target=_enqueue_loader_output,
            args=(batch_queue, control_queue, generator)).start()
    control_queue.put(True)
    return _BatchIterator(batch_queue, control_queue)
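# --- Added sketch (not from the original source): minimal usage of
# `subprocess_prefetch`. `load_batches` is a stand-in generator; in practice it
# would read arrays from disk while the main process trains on the previous
# batch. The shapes and batch count are illustrative.
import numpy as np


def load_batches():
    for _ in range(10):
        yield np.random.rand(32, 3, 224, 224).astype(np.float32)


for batch in subprocess_prefetch(load_batches(), prefetch_buffer_size=4):
    pass  # train on `batch` while the next one is prefetched in the background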
{'popsize': pop_size})

epoch = 0
log_step = 3
while not es.stop():
    if cur_best is not None and -cur_best > args.target_return:
        print("Already better than target, breaking...")
        break

    r_list = [0] * pop_size  # result list
    solutions = es.ask()

    # push parameters to queue
    for s_id, s in enumerate(solutions):
        for _ in range(n_samples):
            p_queue.put((s_id, s))

    # retrieve results
    if args.display:
        pbar = tqdm(total=pop_size * n_samples)
    for _ in range(pop_size * n_samples):
        while r_queue.empty():
            sleep(.1)
        r_s_id, r = r_queue.get()
        r_list[r_s_id] += r / n_samples
        if args.display:
            pbar.update(1)
    if args.display:
        pbar.close()

    es.tell(solutions, r_list)