def __init__(self, dataset, maxsize=2):
    self.queue = mp.Queue(maxsize=maxsize)
    self.dataset = dataset
    self.is_done = mp.Event()
    self.is_shutdown = mp.Event()
    self.process = mp.Process(target=self._run)
    self.process.start()

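# Hypothetical sketch of the `_run` target launched above (not part of the original
# snippet). Assuming the class is a simple background prefetcher, the worker iterates
# over the dataset, feeds the bounded queue, and signals `is_done` when the data are
# exhausted or a shutdown is requested.
def _run(self):
    try:
        for item in self.dataset:
            if self.is_shutdown.is_set():
                break
            self.queue.put(item)  # blocks once the queue holds `maxsize` items
    finally:
        self.is_done.set()
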
def __init__(self, env, hidden_layer=[64, 64]):
    self.env = env
    # self.env.env.disableViewer = False
    self.num_inputs = env.observation_space.shape[0]
    self.num_outputs = env.action_space.shape[0]
    self.hidden_layer = hidden_layer
    self.params = Params()
    self.Net = ActorCriticNet
    self.model = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
    self.model.share_memory()
    self.shared_obs_stats = Shared_obs_stats(self.num_inputs)
    self.memory = ReplayMemory(10000000)
    self.value_memory = ReplayMemory(10000000)
    self.test_mean = []
    self.test_std = []
    self.noisy_test_mean = []
    self.noisy_test_std = []
    self.fig = plt.figure()
    # self.fig2 = plt.figure()
    self.lr = self.params.lr
    plt.show(block=False)
    self.test_list = []
    self.noisy_test_list = []
    self.queue = mp.Queue()
    self.value_queue = mp.Queue()
    self.mpdone = [mp.Event(), mp.Event(), mp.Event(), mp.Event()]
    self.process = []
    self.traffic_light = TrafficLight()
    self.counter = Counter()
    self.best_trajectory = ReplayMemory(5000)
    self.best_score_queue = mp.Queue()
    self.best_score = mp.Value("f", 0)
    self.max_reward = mp.Value("f", 1)
    self.expert_trajectory = ReplayMemory(1e7)
    self.validation_trajectory = ReplayMemory(6000 * 9)
    self.best_validation = 1.0
    self.current_best_validation = 1.0
    self.return_obs_stats = Shared_obs_stats(1)
    self.gpu_model = self.Net(self.num_inputs, self.num_outputs, self.hidden_layer)
    self.base_controller = None

def __init__(self, num_processes: int):
    self.writer = SummaryWriter("tensorboard/{}".format(uuid.uuid4()))
    self.queue = mp.Queue()
    self.evaluation_queue = mp.Queue()
    self.buffer = deque(maxlen=config.BUFFER_SIZE)
    self.trigger = mp.Event()
    self.network = create_network().to(th.device("cuda:0"))
    self.optimizer = torch.optim.SGD(self.network.parameters(),
                                     lr=config.LEARNING_RATE,
                                     momentum=config.MOMENTUM)
    self.scheduler = LambdaLR(self.optimizer, self.scheduler_fn)
    self.evaluator = Evaluator(create_gomoku(),
                               create_network().to(th.device("cuda:0")),
                               self.network,
                               self.trigger,
                               self.evaluation_queue,
                               num_rounds=config.NUM_ROUNDS)
    self.selfplayers = []
    self.rounds_selfplay = 0
    self.loss = None
    self.steps = 0
    for i in range(num_processes):
        game = create_gomoku()
        network = create_network().to(th.device("cuda:1"))
        player = AlphaPlayer(game, network)
        selfplayer = SelfPlayer(game, player, network, self.network, self.queue)
        self.selfplayers.append(selfplayer)

def _test_proper_exit(use_workers, pin_memory, exit_method, hold_iter_reference,
                      loader_setup_event, tester_setup_event):
    num_workers = 2 if use_workers else 0

    if exit_method == 'worker_error' or exit_method == 'worker_kill':
        assert use_workers is True

    if exit_method == 'worker_error':
        worker_error_event = mp.Event()
    else:
        worker_error_event = None

    ds = TestProperExitDataset(12, worker_error_event)

    loader = DataLoader(ds, batch_size=1, shuffle=False,
                        num_workers=num_workers, pin_memory=pin_memory)
    error_it = 2

    if use_workers:
        # 2 is the magical per-worker prefetch number...
        # FIXME: change this after the number becomes configurable.
        assert len(loader) > (error_it + 2 + 1) * num_workers

    it = iter(loader)
    if use_workers:
        workers = it.workers

    def kill_pid(pid):
        psutil_p = psutil.Process(pid)
        psutil_p.kill()
        psutil_p.wait(JOIN_TIMEOUT)
        assert not psutil_p.is_running()

    for i, _ in enumerate(it):
        if i == 0:
            if not hold_iter_reference:
                del it
            loader_setup_event.set()
            tester_setup_event.wait()
            # ensure that the workers are still alive
            if use_workers:
                for w in workers:
                    assert w.is_alive()
            if worker_error_event is not None:
                worker_error_event.set()

        if i == error_it:
            if exit_method == 'loader_error':
                raise RuntimeError('Loader error')
            elif exit_method == 'loader_kill':
                kill_pid(os.getpid())
            elif exit_method == 'worker_kill':
                kill_pid(workers[0].pid)

    if not hold_iter_reference:
        # Tries to trigger the __del__ clean-up rather than the automatic
        # exiting of daemonic children. Technically it should be automatically
        # triggered, but I don't want to rely on the implementation detail of
        # Python gc.
        gc.collect()

def __init__(self, port='2222', width=640, height=480, depth=3, num=2,
             VERBOSE=False, BGR2RGB=False, saveRoot=None, feedProxy=None):
    super(VideoStreamClient, self).__init__()
    self.port = port
    self.width = width
    self.height = height
    self.depth = depth
    self.num = num
    self.bufsize = width * height * depth * num
    self.ts = 0
    self.VERBOSE = VERBOSE
    self.BGR2RGB = BGR2RGB
    self.saveRoot = saveRoot
    self.frameLock = mp.Lock()
    self.frameNotifier = mp.Event()
    self.sharedFrame = torch.ByteTensor(height, width, depth)
    self.sharedFrame.storage().share_memory_()
    # If a RobotCameraFeed Pyro4 proxy has been provided, use it
    if feedProxy is not None:
        self.feed = feedProxy
    else:
        self.feed = None

def _collect_traj_parallel(self, task, debug=False):
    workers = []
    event = mp.Event()
    queue = mp.Queue()
    rollout_nums = np.full(self.num_threads,
                           self.rollout_num // self.num_threads,
                           dtype=int)
    rollout_nums[:self.rollout_num % self.num_threads] += 1
    for pid, rollout_num_per_thread in zip(range(self.num_threads), rollout_nums):
        if rollout_num_per_thread > 0:
            worker_args = (pid, event, queue, task, self.controller, self.theta,
                           rollout_num_per_thread, self.rollout_len, self.M,
                           self.phi, self.adaptation_update_num, self.loss_func,
                           debug)
            workers.append(
                mp.Process(target=_collect_traj_per_thread, args=worker_args))
    for worker in workers:
        worker.start()
    rollouts = []
    for _ in workers:
        rollouts_per_thread, _n_model_steps_total, _n_task_steps_total = queue.get()
        rollouts.extend(rollouts_per_thread)
        self._n_model_steps_total += _n_model_steps_total
        self._n_task_steps_total += _n_task_steps_total
    event.set()
    return rollouts

def test_main_process_unclean_exit(self):
    '''There might be ConnectionResetError or leaked semaphore warnings
    (due to dirty process exit), but they are all safe to ignore.'''
    worker_pids = mp.Array('i', [0] * 4)

    manager_exit_event = mp.Event()
    p = mp.Process(target=TestDataLoader._manager_process,
                   args=(self.dataset, worker_pids, manager_exit_event))
    p.start()

    manager_exit_event.wait()

    exit_status = [False] * len(worker_pids)
    start_time = time.time()
    pname = 'python'
    while True:
        for i in range(len(worker_pids)):
            pid = worker_pids[i]
            if not exit_status[i]:
                if not TestDataLoader._is_process_alive(pid, pname):
                    exit_status[i] = True
        if all(exit_status):
            break
        else:
            time.sleep(1)
            self.assertFalse(
                time.time() - start_time > MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT,
                'subprocess not terminated')

def __init__(
    self,
    id: int,
    hparams: utils.Hyperparameters,
    policy: MlpPolicy,
    learner: Learner,
    q: mp.Queue,
    update_counter: utils.Counter,
    log_path: Union[Path, str, None] = None,
    timeout=10,
):
    self.id = id
    self.hp = hparams
    self.policy = policy
    for p in self.policy.parameters():
        p.requires_grad = False
    self.learner = learner
    self.timeout = timeout
    self.q = q
    self.update_counter = update_counter
    self.log_path = log_path
    if self.log_path is not None:
        self.log_path = Path(self.log_path) / Path(f"a{self.id}")
        self.log_path.mkdir(parents=True, exist_ok=False)
    self.completion = mp.Event()
    self.p = mp.Process(target=self._act, name=f"actor_{self.id}")
    print(f"[main] actor_{self.id} Initialized")

def __init__(self, data_dir, batch_size, queue):
    super().__init__()
    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__name__)
    self.generator = DataGenerator(data_dir, batch_size, self.logger)
    self.queue = queue
    self.exit = mp.Event()

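# Hypothetical sketch of the `run` method of this producer process (not shown in the
# original snippet). Assuming `DataGenerator` is iterable, the loop pushes batches onto
# the shared queue until the `exit` event asks the process to stop.
def run(self):
    self.logger.info("data producer started")
    for batch in self.generator:
        if self.exit.is_set():
            break
        self.queue.put(batch)
    self.logger.info("data producer finished")
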
def __init__(self, loader):
    self.dataset = loader.dataset
    self.scale = loader.scale
    self.collate_fn = loader.collate_fn
    self.batch_sampler = loader.batch_sampler
    self.num_workers = loader.num_workers
    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
    self.timeout = loader.timeout

    self.sample_iter = iter(self.batch_sampler)

    base_seed = torch.LongTensor(1).random_().item()

    if self.num_workers > 0:
        self.worker_init_fn = loader.worker_init_fn
        self.worker_queue_idx = 0
        self.worker_result_queue = multiprocessing.Queue()
        self.batches_outstanding = 0
        self.worker_pids_set = False
        self.shutdown = False
        self.send_idx = 0
        self.rcvd_idx = 0
        self.reorder_dict = {}
        self.done_event = multiprocessing.Event()

        self.index_queues = []
        self.workers = []
        for i in range(self.num_workers):
            index_queue = multiprocessing.Queue()
            index_queue.cancel_join_thread()
            w = multiprocessing.Process(
                target=_ms_loop,
                args=(self.dataset, index_queue, self.worker_result_queue,
                      self.done_event, self.collate_fn, self.scale,
                      base_seed + i, self.worker_init_fn, i))
            w.start()
            self.index_queues.append(index_queue)
            self.workers.append(w)

        if self.pin_memory:
            self.data_queue = queue.Queue()
            pin_memory_thread = threading.Thread(
                target=_pin_memory_loop,
                args=(self.worker_result_queue, self.data_queue,
                      torch.cuda.current_device(), self.done_event))
            pin_memory_thread.daemon = True
            pin_memory_thread.start()
            self.pin_memory_thread = pin_memory_thread
        else:
            self.data_queue = self.worker_result_queue

        _update_worker_pids(id(self), tuple(w.pid for w in self.workers))
        _set_SIGCHLD_handler()
        self.worker_pids_set = True

        # prime the prefetch loop
        for _ in range(2 * self.num_workers):
            self._put_indices()

def __init__(self, loader):
    self.dataset = loader.dataset
    self.collate_fn = loader.collate_fn
    self.batch_sampler = loader.batch_sampler
    self.num_workers = loader.num_workers
    self.pin_memory = loader.pin_memory and torch.cuda.is_available()
    self.timeout = loader.timeout

    self.sample_iter = iter(self.batch_sampler)

    base_seed = torch.LongTensor(1).random_().item()

    if self.num_workers > 0:
        self.worker_init_fn = loader.worker_init_fn
        self.index_queues = [
            multiprocessing.Queue() for _ in range(self.num_workers)
        ]
        self.worker_queue_idx = 0
        self.worker_result_queue = multiprocessing.Queue()
        self.batches_outstanding = 0
        self.worker_pids_set = False
        self.shutdown = False
        self.send_idx = 0
        self.rcvd_idx = 0
        self.reorder_dict = {}
        self.done_event = multiprocessing.Event()

        self.workers = [
            multiprocessing.Process(
                target=_worker_loop,
                args=(self.dataset, self.index_queues[i],
                      self.worker_result_queue, self.done_event,
                      self.collate_fn, base_seed + i,
                      self.worker_init_fn, i))
            for i in range(self.num_workers)
        ]

        if self.pin_memory:
            self.data_queue = queue.Queue()
            self.pin_memory_thread = threading.Thread(
                target=_pin_memory_loop,
                args=(self.worker_result_queue, self.data_queue,
                      self.done_event, self.pin_memory,
                      torch.cuda.current_device()))
            self.pin_memory_thread.daemon = True
            self.pin_memory_thread.start()
        else:
            self.data_queue = self.worker_result_queue

        for w in self.workers:
            w.daemon = True  # ensure that the worker exits on process exit
            w.start()

        _update_worker_pids(id(self), tuple(w.pid for w in self.workers))
        _set_SIGCHLD_handler()
        self.worker_pids_set = True

        # prime the prefetch loop
        for _ in range(2 * self.num_workers):
            self._put_indices()

def __init__(self, wait=15):
    self._wait = wait
    self.canceller = mp.Event()
    self._coroutines = {}
    self._exited = []
    self.serial = True

def __init__(self, wait=15):
    self._wait = wait
    self._processes = {}
    self._references = []
    self.canceller = mp.Event()
    set_start_method()
    self.serial = False

def test_needs_reset(self):
    outdir = tempfile.mkdtemp()

    agent = mock.Mock()
    env = mock.Mock()
    # First episode: 0 -> 1 -> 2 -> 3 (reset)
    # Second episode: 4 -> 5 -> 6 -> 7 (done)
    env.reset.side_effect = [("state", 0), ("state", 4)]
    env.step.side_effect = [
        (("state", 1), 0, False, {}),
        (("state", 2), 0, False, {}),
        (("state", 3), 0, False, {"needs_reset": True}),
        (("state", 5), -0.5, False, {}),
        (("state", 6), 0, False, {}),
        (("state", 7), 1, True, {}),
    ]

    counter = mp.Value("i", 0)
    episodes_counter = mp.Value("i", 0)
    stop_event = mp.Event()
    exception_event = mp.Event()

    train_loop(
        process_idx=0,
        env=env,
        agent=agent,
        steps=5,
        outdir=outdir,
        counter=counter,
        episodes_counter=episodes_counter,
        stop_event=stop_event,
        exception_event=exception_event,
    )

    self.assertEqual(agent.act.call_count, 5)
    self.assertEqual(agent.observe.call_count, 5)
    # done=False and reset=True at state 3
    self.assertFalse(agent.observe.call_args_list[2][0][2])
    self.assertTrue(agent.observe.call_args_list[2][0][3])
    self.assertEqual(env.reset.call_count, 2)
    self.assertEqual(env.step.call_count, 5)

def __init__(self, trainable_gan, devices):
    self.trainable_gan = trainable_gan
    self.sync = 0
    self.processes = []
    self.report_weights_event = []
    self.report_weights_queue = []
    self.set_weights_queue = []
    loaded_events = []
    head_device = torch.device(list(self.trainable_gan.parameters())[0].device).index
    self.head_device = head_device
    if devices == "-1":
        print("Running on all available devices: ", torch.cuda.device_count())
        devices = list(range(torch.cuda.device_count()))
    else:
        devices = [int(d) for d in devices.split(",")]
    self.devices = devices
    print("Devices:", devices)
    save_complete_event = mp.Event()
    save_event = mp.Event()
    self.save_event = save_event
    self.save_complete_event = save_complete_event
    for device in devices:
        loaded_event = mp.Event()
        report_weights_event = mp.Event()
        set_weights_queue = mp.Queue()
        report_weights_queue = mp.Queue()
        inputs = self.trainable_gan.gan.inputs.to(device)
        p = mp.Process(target=train,
                       args=(device, head_device, trainable_gan.gan, inputs,
                             loaded_event, report_weights_queue,
                             set_weights_queue, report_weights_event,
                             save_event, save_complete_event,
                             self.trainable_gan.save_file))
        p.start()
        self.processes.append(p)
        self.report_weights_event.append(report_weights_event)
        self.report_weights_queue.append(report_weights_queue)
        self.set_weights_queue.append(set_weights_queue)
        loaded_event.wait()
        save_event = None

def __init__(self, dataset, num_workers):
    # Publics
    self.dataset = dataset
    self.num_workers = num_workers
    self.job_queue = mp.Queue()
    self.result_queue = mp.Queue()
    # Privates
    self._processes = []
    self._workers_started = False
    self._workers_killed = False
    self._interrupt_event = mp.Event()
    self._samples_outstanding = len(dataset)

def _test_autograd_sharing(self, var):
    ready = mp.Event()
    master_modified = mp.Event()
    queue = mp.Queue()
    p = mp.Process(target=autograd_sharing, args=(queue, ready, master_modified))
    p.start()
    queue.put(var)

    ready.wait()
    var.data[0, 0] = 1000
    if var.grad is not None:
        var.grad.data[:] = torch.ones(5, 5) * 4
    master_modified.set()

    worker_ok = queue.get()
    self.assertTrue(worker_ok)

    self.assertEqual(var.data, torch.ones(5, 5))
    if var.grad is not None:
        self.assertEqual(var.grad.data, torch.ones(5, 5))
    p.join()

def _test_autograd_sharing(self, var):
    ready = mp.Event()
    master_modified = mp.Event()
    queue = mp.Queue()
    p = mp.Process(target=autograd_sharing, args=(queue, ready, master_modified))
    p.start()

    var._grad = Variable(torch.zeros(5, 5), requires_grad=False)
    queue.put(var)

    ready.wait()
    var.data[0, 0] = 1000
    var.grad.data[:] = torch.ones(5, 5) * 4
    master_modified.set()

    worker_ok = queue.get()
    self.assertTrue(worker_ok)

    self.assertEqual(var.data, torch.ones(5, 5))
    self.assertEqual(var.grad.data, torch.ones(5, 5) * 4)
    p.join()

def train(rank, model, train_data, val_data, test_data, optimizer, epochs=2, log_every=100):
    ema_loss = None
    criterion = nn.BCEWithLogitsLoss()
    best_iter = (0., 0, 0)
    best_test = 0.
    embeds = None
    for epoch in range(epochs):
        random.shuffle(train_data)
        for i, batch in enumerate(train_data):
            _, text, users, subs, lengths, metafeats, labels = batch
            text, users, subs, metafeats, labels = (Variable(text), Variable(users),
                                                    Variable(subs), Variable(metafeats),
                                                    Variable(labels))
            # if os.path.isfile('checkpoint.pt'):
            #     checkpoint = torch.load('checkpoint.pt')
            #     model.load_state_dict(checkpoint['model'])
            #     optimizer.load_state_dict(checkpoint['optimizer'])
            optimizer.zero_grad()
            if constants.CUDA:
                outputs = model(text, users, subs, metafeats, lengths).cuda()
            else:
                outputs = model(text, users, subs, metafeats, lengths)
            loss = criterion(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()
            if ema_loss is None:
                ema_loss = loss.data
            else:
                ema_loss = 0.01 * loss.data + 0.99 * ema_loss
            if i % 10 == 0:
                print((epoch, i, ema_loss))
            if i % log_every == 0:
                auc = evaluate_auc(model, val_data)
                print(("Val AUC", epoch, i, auc))
                if auc > best_iter[0]:
                    best_iter = (auc, epoch, i)
                    print(("New best val!", best_iter))
                    best_test = evaluate_auc(model, test_data)
                    if auc > 0.7:
                        ids, embeds = get_embeddings(train_data + val_data + test_data)
    print(("Overall best val:", best_iter))
    # save_checkpoint(model, optimizer)
    mp.Event().wait  # `wait` is referenced but not called, so this line is a no-op
    return best_iter[0]

def validate(evm_model, args_evm, class_partition, feature_dic, data_name, gpu):
    with torch.no_grad():
        print(f"start evaluating {data_name}")
        t1 = time.time()
        NG = min(len(gpu), number_of_classes)
        assert NG > 0
        classes_to_process = list(range(1, int(number_of_classes) + 1))
        list_acc = [0.0] * NG
        list_count = [0] * NG
        Q = mp.Queue()
        done_event = [mp.Event() for k in range(NG)]
        process_list = []
        if data_name != "unknown":
            for k in range(NG):
                p = mp.Process(
                    target=val_process,
                    args=(class_partition[k], feature_dic[k], evm_model,
                          args_evm, gpu[k], Q, done_event[k]),
                )
                p.start()
                process_list.append(p)
        else:
            for k in range(NG):
                p = mp.Process(
                    target=val_process_unknown,
                    args=(class_partition[k], feature_dic[k], evm_model,
                          args_evm, gpu[k], Q, done_event[k]),
                )
                p.start()
                process_list.append(p)
        for k in range(NG):
            g, a, c = Q.get()
            print(g, a, c)
            i = gpu.index(g)
            list_acc[i] = a
            list_count[i] = c
            done_event[i].set()
        for p in process_list:
            p.join()
        print(data_name, "total accuracy = ",
              np.average(np.array(list_acc), weights=np.array(list_count)))
        del Q, done_event
        del p, process_list, g, a, c, list_acc, list_count
        t2 = time.time()
        print("validation time = ", t2 - t1)
        return

def do_test():
    x = torch.zeros(5, 5)
    q = mp.Queue()
    e = mp.Event()
    data = [x, x[:, 1]]
    q.put(data)

    p = mp.Process(target=simple_fill, args=(q, e))
    lc.check_pid(p.pid)
    p.start()

    e.wait()
    self.assertTrue(data[0].eq(4).all())
    self.assertTrue(data[1].eq(4).all())

    p.join(1)
    self.assertFalse(p.is_alive())

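# A minimal `simple_fill` worker consistent with the assertions above (hypothetical; the
# real helper is defined elsewhere in the test module). Because `data[1]` is a view into
# `data[0]`, filling the first shared tensor with 4 also satisfies the second assertion.
def simple_fill(queue, event):
    data = queue.get()
    data[0][:] = 4
    event.set()
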
def sample(env, policy, batchsz, process_num, warm_up=False):
    """
    Given batchsz number of tasks, batchsz will be split equally among the processes,
    and when the processes return, all data are merged and returned.
    :param env:
    :param policy:
    :param batchsz:
    :param process_num:
    :return: batch
    """

    # batchsz will be split across the processes, so the final batchsz may be
    # larger than the batchsz parameter.
    process_batchsz = np.ceil(batchsz / process_num).astype(np.int32)
    # buffer to save all data
    queue = mp.Queue()

    # start processes for pid in range(1, process_num);
    # if process_num = 1, this part will be ignored.
    # when tensors are saved in a Queue, the process should stay alive until Queue.get(),
    # please refer to: https://discuss.pytorch.org/t/using-torch-tensor-over-multiprocessing-queue-process-fails/2847
    # however, there are still problems with CUDA tensors on a multiprocessing queue,
    # please refer to: https://discuss.pytorch.org/t/cuda-tensors-on-multiprocessing-queue/28626
    # so tensors are converted to numpy before they are put on the queue.
    evt = mp.Event()
    processes = []
    for i in range(process_num):
        process_args = (i, queue, evt, env, policy, process_batchsz)
        if warm_up:
            processes.append(mp.Process(target=warmupsampler, args=process_args))
        else:
            processes.append(mp.Process(target=sampler, args=process_args))
    for p in processes:
        # set the process as a daemon so it is killed once the main process stops.
        p.daemon = True
        p.start()

    # we need to get the first Memory object, then merge the others into it with its append function.
    pid0, buff0 = queue.get()
    for _ in range(1, process_num):
        pid, buff_ = queue.get()
        buff0.append(buff_)  # merge current Memory into buff0
    evt.set()

    # now buff holds all the sampled data
    buff = buff0

    return buff

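# Hypothetical sketch of the `sampler` worker referenced above (the real one is not part
# of this snippet). It illustrates the pattern the comments describe: store plain
# Python/numpy data only, put it on the queue tagged with the worker id, then wait on
# `evt` so the process stays alive until the parent has drained the queue. The
# `policy.select_action` call and the Gym-style `env` API are assumptions.
def sampler(pid, queue, evt, env, policy, batchsz):
    buff = []  # stand-in for the Memory object the parent merges
    sampled = 0
    s = env.reset()
    while sampled < batchsz:
        a = policy.select_action(s)           # assumed policy API
        next_s, r, done, _ = env.step(a)
        buff.append((s, a, r, next_s, done))  # plain-Python data, no CUDA tensors
        s = env.reset() if done else next_s
        sampled += 1
    queue.put((pid, buff))
    evt.wait()  # keep the process alive until the parent has finished Queue.get()
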
def __init__(self, args, runner_maker):
    self.state_queues = []
    self.prev = []
    self.episodes = []
    self.stop_queueing = mp.Event()
    self.done = mp.Event()
    self.stop_queueing.clear()
    self.done.clear()
    self.num = args.nthreads
    self.processes = []
    for i in range(self.num):
        # a queue size of 1 seems to be ideal; with a longer queue, resetting takes more time
        queue = mp.Queue(1)
        self.state_queues.append(queue)
        self.prev.append(None)
        self.episodes.append([])
        worker = ThreadedWorker(runner_maker, queue, self.done, self.stop_queueing)
        worker.start()
        self.processes.append(worker)
    self.args = args
    runner_temp = runner_maker()
    self.policy_net = runner_temp.policy_net
    self.value_net = runner_temp.value_net

def __init__(self, env, actor, config, memory=None, logwriter=None, name=""):
    self.config = deepcopy(config)
    self.logwriter = logwriter
    self.memory = memory
    self.episode_num = mp.Value(c_uint)
    self.runcondition = mp.Event()
    self.memory_queue = mp.Queue(maxsize=config['replay_skip'] + 1) if memory is not None else None
    self.logqueue = mp.Queue(maxsize=1) if logwriter is not None else None
    with self.episode_num.get_lock():
        self.episode_num.value = 1
    self.runner = mp.Process(target=envrun,
                             args=(actor, env, self.episode_num, config, self.runcondition),
                             kwargs={'iterations': config['replay_skip'] + 1,
                                     'memoryqueue': self.memory_queue,
                                     'logqueue': self.logqueue,
                                     'name': name})
    self.runner.start()

def __init__(self, n_processes: int = 1, process_cls=None, cancel=None,
             verbose: bool = False, regular_get: bool = True, tracker=None):
    store_attr(but='process_cls')
    self.process_cls = ifnone(process_cls, DataFitProcess)
    self.cancel = ifnone(self.cancel, mp.Event())
    self.pipe_in, self.pipe_out = mp.Pipe(False) if self.verbose else (None, None)
    self.cached_items = []
    self._place_holder_out = None
    self.step_idx = 0

def sample(self, batchsz):
    """
    Given batchsz number of tasks, batchsz will be split equally among the threads,
    and when the threads return, all data are merged and returned.
    :param batchsz:
    :return: batch
    """

    # batchsz will be split across the threads, so the final batchsz may be
    # larger than the batchsz parameter.
    thread_batchsz = np.ceil(batchsz / self.thread_num).astype(np.int32)
    # buffer to save all data
    queue = multiprocessing.Queue()

    # start threads for pid in range(1, thread_num);
    # if thread_num = 1, this part will be ignored.
    # when tensors are saved in a Queue, the thread should stay alive until Queue.get(),
    # please refer to: https://discuss.pytorch.org/t/using-torch-tensor-over-multiprocessing-queue-process-fails/2847/2
    evt = multiprocessing.Event()
    threads = []
    for i in range(self.thread_num):
        thread_args = (i, queue, self.env_list[i], self.policy, thread_batchsz)
        threads.append(multiprocessing.Process(target=sampler, args=thread_args))
    for t in threads:
        # set the thread as a daemon so it is killed once the main thread stops.
        t.daemon = True
        t.start()

    # we need to get the first ReplayMemory object, then merge the others into it with its append function.
    pid0, buff0, avg_reward0 = queue.get()
    avg_reward = [avg_reward0]
    for _ in range(1, self.thread_num):
        pid, buff_, avg_reward_ = queue.get()
        buff0.append(buff_)  # merge current ReplayMemory into buff0
        avg_reward.append(avg_reward_)

    # now buff holds all the sampled data and avg_reward is the average reward of the current sample
    buff = buff0
    avg_reward = np.array(avg_reward).mean()
    print('avg reward:', avg_reward)

    return buff.sample()

def init(self, evaluator, ckpt_dir):
    self.ckpt_dir = os.path.abspath(ckpt_dir)
    self.logger.info("checkpoint dir: %s", self.ckpt_dir)
    self.evaluator = evaluator
    self.stop_event = multiprocessing.Event()
    self.req_queue = multiprocessing.Queue()
    self.ans_queue = multiprocessing.Queue()
    self._register_signal_handler()

    backup_handlers = _logger.handlers
    _logger.handlers = [logging.NullHandler()]
    for gpu_id in self.gpu_ids:
        worker_p = multiprocessing.Process(
            target=self._worker,
            args=(self.evaluator, gpu_id, self.ckpt_dir, self.stop_event,
                  self.req_queue, self.ans_queue))
        self.workers.append(worker_p)
        worker_p.start()
    _logger.handlers = backup_handlers
    self._inited = True

def __init__(self, model: torch.nn.Module, device: torch.device,
             input_queues: mp.Queue, output_queues: mp.Queue,
             scenes, h5_file_path, evt):
    super(GPUThread, self).__init__()
    self.model = model.eval()
    self.model = self.model.to(device)
    self.device = device
    self.i_queues = input_queues
    self.o_queues = output_queues
    self.exit = mp.Event()
    self.scenes = scenes
    self.evt = evt
    self.preprocess = transforms.Normalize(mean=[123.68, 116.779, 103.939],
                                           std=[1.0, 1.0, 1.0])

def get_queue_feeder(batch_size=1, maxsize_queue=10, n_process=1):
    if batch_size != 1:
        raise NotImplementedError()
    if n_process < 1:
        raise ValueError(
            "n_process should be positive. Got {}".format(n_process))

    queue_feed = mp.Queue(maxsize=maxsize_queue)
    stop_event = mp.Event()
    batch_loaders = []
    for _ in range(n_process):
        batch_loader = mp.Process(target=feeder,
                                  args=(queue_feed, stop_event, batch_size))
        batch_loader.start()
        batch_loaders.append(batch_loader)
    return queue_feed, stop_event, batch_loaders

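# Hypothetical sketch of the `feeder` worker launched by get_queue_feeder (the real
# implementation is not shown in this snippet). It illustrates the intended protocol:
# keep producing batches until `stop_event` is set, and use a timeout on `put` so the
# worker can re-check the event instead of blocking forever on a full queue. The random
# placeholder batch is an assumption for illustration only.
import queue as pyqueue

import numpy as np
import torch.multiprocessing as mp


def feeder(queue_feed, stop_event, batch_size):
    while not stop_event.is_set():
        batch = np.random.randn(batch_size, 32)  # placeholder batch
        try:
            queue_feed.put(batch, timeout=1)
        except pyqueue.Full:
            continue  # queue is full; loop back and re-check stop_event
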
def test_main_process_unclean_exit(self):
    r'''There might be ConnectionResetError or leaked semaphore warnings
    (due to dirty process exit), but they are all safe to ignore.'''
    # `raise_error` controls if the main process is KILL-ed by the OS or just
    # simply raises an error. Both cases are interesting because
    # 1. If it is KILL-ed by the OS, the workers need to automatically
    #    discover that their parent is dead and exit gracefully.
    # 2. If it raises an error itself, the parent process needs to take care
    #    of exiting the workers and then exit itself gracefully.
    for raise_error in (True, False):
        worker_pids = mp.Array('i', [0] * 4)

        main_exit_event = mp.Event()
        p = mp.Process(target=TestDataLoader._main_process,
                       args=(self.dataset, worker_pids, main_exit_event, raise_error))
        p.start()
        worker_pids[-1] = p.pid

        main_exit_event.wait()
        exit_status = [False] * len(worker_pids)
        start_time = time.time()
        pname = 'python'
        while True:
            for i in range(len(worker_pids)):
                pid = worker_pids[i]
                if not exit_status[i]:
                    if not TestDataLoader._is_process_alive(pid, pname):
                        exit_status[i] = True
            if all(exit_status):
                break
            else:
                if time.time() - start_time > MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT:
                    self.fail('subprocess not terminated')
                time.sleep(1)
        p.join(MANAGER_STATUS_CHECK_INTERVAL + JOIN_TIMEOUT - (time.time() - start_time))
        self.assertFalse(p.is_alive(), 'main process not terminated')
