def _instances(self, file_path: str, manager: Manager, output_queue: Queue) -> Iterator[Instance]:
    """
    A generator that reads instances off the output queue and yields them up
    until none are left (signified by all ``num_workers`` workers putting their
    ids into the queue).
    """
    shards = glob.glob(file_path)
    num_shards = len(shards)

    # If we want multiple epochs per read, put shards in the queue multiple times.
    input_queue = manager.Queue(num_shards * self.epochs_per_read + self.num_workers)
    for _ in range(self.epochs_per_read):
        random.shuffle(shards)
        for shard in shards:
            input_queue.put(shard)

    # Then put a None per worker to signify no more files.
    for _ in range(self.num_workers):
        input_queue.put(None)

    processes: List[Process] = []
    num_finished = 0
    for worker_id in range(self.num_workers):
        process = Process(target=_worker, args=(self.reader, input_queue, output_queue, worker_id))
        logger.info(f"starting worker {worker_id}")
        process.start()
        processes.append(process)

    # Keep going as long as not all the workers have finished.
    while num_finished < self.num_workers:
        item = output_queue.get()
        if isinstance(item, int):
            # Means a worker has finished, so increment the finished count.
            num_finished += 1
            logger.info(f"worker {item} finished ({num_finished}/{self.num_workers})")
        else:
            # Otherwise it's an ``Instance``, so yield it up.
            yield item

    for process in processes:
        process.join()
    processes.clear()
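# A hedged sketch of the ``_worker`` contract this generator relies on (the
# real implementation lives in the same module and may differ in details):
# each worker drains shard paths from the input queue until it hits the
# ``None`` sentinel, pushes every Instance it reads onto the output queue, and
# finally pushes its integer ``worker_id`` so the consumer above can count
# finished workers via the ``isinstance(item, int)`` check.
def _worker(reader, input_queue, output_queue, worker_id):
    while True:
        shard = input_queue.get()
        if shard is None:
            break
        for instance in reader.read(shard):
            output_queue.put(instance)
    # Signal completion by putting the (integer) worker id on the queue.
    output_queue.put(worker_id)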
def main(args):
    """
    Data layout in embedding_path:
      - [sentence#1, vector#1]
      - [sentence#2, vector#2]
      ...
    """
    print('> START')
    print('> parameters')
    for k, v in args._get_kwargs():
        print('> {} : {}'.format(k, v))
    print('')
    print('> Action')

    # 0. Prepare the sentence-embedding settings.
    embedding_type_name = args.embedding_type_name
    topk = args.topk
    target_data_path = args.target_data_path
    ground_data_path = args.ground_data_path
    source_embedding_path = args.source_embedding_path
    number_of_processes = args.gpu_num

    # 1. Load the source pool: [(vector#1, sentence#1), (vector#2, sentence#2), ...]
    source_pool = load_embedding_data(source_embedding_path)
    src_embeddings = [_[0] for _ in source_pool]
    src_sentences = [_[1] for _ in source_pool]

    # 2. Split the target data into one chunk per process.
    target_data_list = [
        _.strip()
        for _ in open(target_data_path, mode='r', encoding='utf-8')
    ]
    number_of_processes = min(number_of_processes, len(target_data_list))
    num_of_tasks = len(target_data_list) // number_of_processes
    tasks = [
        target_data_list[_ * num_of_tasks:(_ + 1) * num_of_tasks]
        for _ in range(number_of_processes)
    ]

    # 3. Prepare the queues.
    tasks_to_accomplish = Manager().Queue()
    tasks_finished = Manager().Queue()
    for task in tasks:
        tasks_to_accomplish.put(task)

    processes = []
    # Spawn the workers: each one encodes its targets and finds the most
    # similar source vector for each.
    for i in range(number_of_processes):
        p = Process(target=multi_inference,
                    args=(embedding_type_name, tasks_to_accomplish,
                          tasks_finished, i + 1, src_sentences, src_embeddings))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # Collect the results and write them to a file.
    store_target = []
    while not tasks_finished.empty():
        store_target.append(tasks_finished.get_nowait())

    if ground_data_path:
        gt_data = [
            _.strip()
            for _ in open(ground_data_path, mode='r', encoding='utf-8')
        ]
        # NOTE: rows come off the queue in nondeterministic order, so pairing
        # them with gt_data assumes the targets were processed in file order.
        for row, gt in zip(store_target, gt_data):
            row.append(gt)

    time_tag = datetime.datetime.now().strftime('%Y%m%d%H%M%S')  # currently unused
    head_line = ['<target>', '<inference>', '<ground_truth>']
    with open(args.output_data_path, mode='w', encoding='utf-8') as wdesc:
        wdesc.write('\t'.join(head_line))
        wdesc.write('\n')
        for i in store_target:
            # columns: target, inference, ground_truth (when ground_data_path is given)
            wdesc.write('\t'.join(i))
            wdesc.write('\n')
    print('> FINISH - result file : {}'.format(args.output_data_path))
    return True
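# A hedged sketch of the worker contract ``main`` assumes (the real
# ``multi_inference`` is defined elsewhere; ``load_encoder`` is a hypothetical
# helper standing in for whatever encoder ``embedding_type_name`` selects).
# Exactly one task (a list of target sentences) is queued per worker, so a
# single ``get()`` suffices; each result row is ``[target, inference]``, which
# matches the tab-separated output format above.
import numpy as np

def multi_inference(embedding_type_name, tasks_to_accomplish, tasks_finished,
                    worker_id, src_sentences, src_embeddings):
    encode = load_encoder(embedding_type_name)  # hypothetical helper
    src_matrix = np.asarray(src_embeddings, dtype=np.float32)
    src_matrix /= np.linalg.norm(src_matrix, axis=1, keepdims=True)
    for target in tasks_to_accomplish.get():
        vec = np.asarray(encode(target), dtype=np.float32)
        vec /= np.linalg.norm(vec)
        best = int(np.argmax(src_matrix @ vec))  # cosine similarity on unit vectors
        tasks_finished.put([target, src_sentences[best]])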
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vac')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--actor_learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--critic_learning_rate', '-clr', type=float)
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--num_target_updates', '-ntu', type=int, default=10)
    parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--actor_n_layers', '-l', type=int, default=2)
    parser.add_argument('--critic_n_layers', '-cl', type=int)
    parser.add_argument('--size', '-s', type=int, default=64)
    args = parser.parse_args()

    if not os.path.exists('data'):
        os.makedirs('data')
    logdir = 'ac_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    max_path_length = args.ep_len if args.ep_len > 0 else None

    # Fall back to the actor's hyperparameters when the critic's are not given.
    if not args.critic_learning_rate:
        args.critic_learning_rate = args.actor_learning_rate
    if not args.critic_n_layers:
        args.critic_n_layers = args.actor_n_layers

    processes = []
    for e in range(args.n_experiments):
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        def train_func():
            train_AC(
                exp_name=args.exp_name,
                env_name=args.env_name,
                n_iter=args.n_iter,
                gamma=args.discount,
                min_timesteps_per_batch=args.batch_size,
                max_path_length=max_path_length,
                actor_learning_rate=args.actor_learning_rate,
                critic_learning_rate=args.critic_learning_rate,
                num_target_updates=args.num_target_updates,
                num_grad_steps_per_target_update=args.num_grad_steps_per_target_update,
                animate=args.render,
                logdir=os.path.join(logdir, '%d' % seed),
                normalize_advantages=not args.dont_normalize_advantages,
                seed=seed,
                actor_n_layers=args.actor_n_layers,
                critic_n_layers=args.critic_n_layers,
                size=args.size)

        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # If you uncomment the line below, the loop will block until this
        # process finishes.
        # p.join()

    for p in processes:
        p.join()
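# Portability sketch (assumption: the script above relies on the default
# "fork" start method on Linux). Under "spawn" (Windows, and macOS on recent
# Pythons) a closure defined inside the loop cannot be pickled as a Process
# target; a module-level function taking explicit arguments avoids that.
# ``experiment_kwargs`` is a hypothetical dict holding the keyword arguments
# built per seed in the loop above.
from multiprocessing import Process

def run_experiment(experiment_kwargs):
    # Module-level, so it is picklable under any start method.
    train_AC(**experiment_kwargs)

# p = Process(target=run_experiment, args=(experiment_kwargs,))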
def NCS_MP(crates, ncs_stepsize, masked_models, valid, corpus, acc_constraint,
           orig_fitvalue, num_runs=0):
    total_time = 0
    total_iteration = 100
    itr_count = 0
    popsize = len(other_GPU_IDs) + 1
    __C = edict()
    __C.parameters = {
        'reset_xl_to_pop': False,
        'init_value': crates,
        'stepsize': ncs_stepsize,
        'bounds': [0.1, 0.99999999],
        'ftarget': 0,
        'tmax': total_iteration * popsize,
        'popsize': popsize,
        'best_k': 1
    }
    es = ncs.NCS(__C.parameters)

    start_t = time.time()
    print('***************NCS initialization***************')
    ref_net = masked_models[0]
    # A mask of all 0.0 means no parameters have been pruned, so this measures
    # the original fitness.
    ref_net.change_mask(len(crates) * [0.0], apply_MP_on_mask)
    ref_net.apply_mask()
    start_fit = evaluate_lm(ref_net.masked_model, valid, corpus, TEST_BATCH_SIZE)
    original_fit = orig_fitvalue
    print('start fit: {}'.format(start_fit))
    print('orig fit: {}'.format(original_fit))

    ref_net = masked_models[0]
    ref_net.change_mask(crates, apply_MP_on_mask)
    ref_net.apply_mask()
    tmp_fit = evaluate_lm(ref_net.masked_model, valid, corpus, TEST_BATCH_SIZE)
    print("start init threshold:", crates)
    print('Start sparsity: {}%'.format(ref_net.get_sparsity() * 100))
    # Assume the initial crates store the size of each tensor.
    es.set_initFitness(es.popsize * [ref_net.get_sparsity()])
    #es.ask()
    #tmp_fit = torch.FloatTensor([0,0,0])
    end_t = time.time()
    total_time = (end_t - start_t)
    print('fit:{}'.format(tmp_fit))
    print('time {}min elapse'.format(total_time / 60.))
    print('***************NCS initialization***************')

    ref_net.clear_cache()
    processes = []
    results = {'result_NCS': torch.FloatTensor(crates)}
    results['result_NCS'].share_memory_()
    # Evaluate the individuals in parallel, one process per population member.
    for rank in range(popsize):
        p = Process(target=init_processes,
                    args=(rank, popsize, original_fit, acc_constraint,
                          prune_and_eval, valid, corpus, es, masked_models,
                          num_runs, results))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    ref_net.change_mask(results['result_NCS'].numpy(), apply_MP_on_mask)
    ref_net.apply_mask()
    best_prune = evaluate_lm(ref_net.masked_model, valid, corpus, TEST_BATCH_SIZE)
    print('Accuracy:{}=>{}, ppl:{}=>{}, sparsity: {}%'.format(
        original_fit[1], best_prune[1], original_fit[0], best_prune[0],
        ref_net.get_sparsity() * 100.))
    logger.scalar_summary('ncs_start_acc', tmp_fit[1], num_runs)
    logger.scalar_summary('ncs_start_ppl', tmp_fit[0], num_runs)
    logger.scalar_summary('ncs_best_acc', best_prune[1], num_runs)
    logger.scalar_summary('ncs_best_ppl', best_prune[0], num_runs)

    saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (
        name_mark, num_runs, Model_type, layer_group_type, str(acc_constraint))
    torch.save(ref_net, cfg.LM_MODEL_TMP_FOLDER + saved_model_name)
    return results['result_NCS'].numpy(), saved_model_name, ref_net
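# The shared-result pattern used above, in isolation (a minimal sketch, not
# the real ``init_processes``/``prune_and_eval``): a tensor whose storage has
# been moved to shared memory with ``share_memory_()`` is visible to every
# worker process, so any rank can write the winning pruning rates in place.
import torch
from torch.multiprocessing import Process

def _write_slot(rank, result):
    # A real worker would evaluate a candidate here and write only if it wins.
    result[rank] = float(rank)

if __name__ == '__main__':
    result = torch.zeros(4)
    result.share_memory_()  # move the backing storage into shared memory
    procs = [Process(target=_write_slot, args=(r, result)) for r in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(result)  # tensor([0., 1., 2., 3.]) -- children's writes are visible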
# (Snippet truncated: the branches selecting sizes for ranks 0 and 1 precede this.)
    elif dist.get_rank() == 2:
        input_size = 10
        output_size = 6
    elif dist.get_rank() == 3:
        input_size = 6
        output_size = 2
    elif dist.get_rank() == 4:
        input_size = 2
        output_size = 1
    layer = Layer(input_size=input_size, output_size=output_size)
    fn(layer, size, batcher)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-size', type=int, help='the total number of nodes')
    parser.add_argument('-path', help='the path of the shared file system')
    args = parser.parse_args()
    processes = []
    print("size:" + str(args.size))
    print("path:" + args.path)
    batcher = Batcher()
    for rank in range(args.size):
        # Note: ``rank`` is not passed to ``init_processes`` here, so the rank
        # must be assigned inside it (e.g. via the shared-file rendezvous).
        p = Process(target=init_processes, args=(run, args.path, args.size, batcher))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
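# A hedged sketch of what ``init_processes`` presumably does (the real helper
# is defined elsewhere): rendezvous over the shared file system, then run the
# pipeline stage. The explicit ``rank`` parameter is an assumption -- the call
# above does not pass one -- and the call ``fn(size, batcher)`` is guessed
# from context, since only the tail of ``run`` is shown.
import torch.distributed as dist

def init_processes(fn, path, size, batcher, rank=0):
    # With a file:// init method, both world_size and rank must be supplied.
    dist.init_process_group(backend='gloo',
                            init_method='file://{}'.format(path),
                            world_size=size,
                            rank=rank)
    fn(size, batcher)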
def test_sampling_with_distributed_sampler(self, decoder):
    # Make one video with 15 frames and one with 10 frames, producing 3 clips
    # and 2 clips respectively.
    num_frames = 10
    fps = 5
    with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
        video_file_name_1,
        data_1,
    ):
        with temp_encoded_video(num_frames=num_frames, fps=fps) as (
            video_file_name_2,
            data_2,
        ):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as f:
                f.write(f"{video_file_name_1} 0\n".encode())
                f.write(f"{video_file_name_2} 1\n".encode())

            total_duration = num_frames / fps
            half_duration = total_duration / 2 - self._EPS

            # Create several processes initialized in a PyTorch distributed
            # process group so that the distributed sampler is set up correctly
            # when the dataset is constructed.
            num_processes = 2
            processes = []
            return_dict = multiprocessing.Manager().dict()
            for rank in range(num_processes):
                p = Process(
                    target=run_distributed,
                    args=(
                        rank,
                        num_processes,
                        decoder,
                        half_duration,
                        f.name,
                        return_dict,
                    ),
                )
                p.start()
                processes.append(p)
            for p in processes:
                p.join()

            # After joining all distributed processes we expect all these
            # (label, clip) pairs to be returned in random order.
            half_frames = num_frames // 2
            expected = {
                (0, data_1[:, :half_frames]),                 # 1/3 clip
                (0, data_1[:, half_frames:half_frames * 2]),  # 2/3 clip
                (0, data_1[:, half_frames * 2:]),             # 3/3 clip
                (1, data_2[:, :half_frames]),                 # first half
                (1, data_2[:, half_frames:]),                 # second half
            }
            epoch_results = collections.defaultdict(list)
            for v in return_dict.values():
                for k_2, v_2 in v.items():
                    epoch_results[k_2].extend(v_2)

            assert_unordered_list_compare_true(self, expected, epoch_results["epoch_1"])
            assert_unordered_list_compare_true(self, expected, epoch_results["epoch_2"])
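# A hedged sketch of the per-rank setup ``run_distributed`` needs (the real
# helper belongs to the test module and also iterates the dataset): each rank
# joins a gloo process group so the dataset's DistributedSampler sees the
# correct rank and world size, then records its clips per epoch in the
# managed dict.
import os
import torch.distributed as dist

def run_distributed(rank, world_size, decoder, clip_duration, data_name, return_dict):
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # Dataset construction and the two epochs of iteration are elided here;
    # the expected result shape is {rank: {"epoch_1": [...], "epoch_2": [...]}}.
    return_dict[rank] = {"epoch_1": [], "epoch_2": []}
    dist.destroy_process_group()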
def NCS_MP(crates, ncs_stepsize, fields, masked_models, valid, acc_constraint,
           num_runs=0, checkpoint=None):
    total_time = 0
    total_iteration = 100
    itr_count = 0
    popsize = len(other_GPU_IDs) + 1
    __C = edict()
    __C.parameters = {
        'reset_xl_to_pop': False,
        'init_value': crates,
        'stepsize': ncs_stepsize,
        'bounds': [0., 0.95],
        'ftarget': 0,
        'tmax': total_iteration * popsize,
        'popsize': popsize,
        'best_k': 1
    }
    es = ncs.NCS(__C.parameters)

    start_t = time.time()
    print('***************NCS initialization***************')
    ref_net = masked_models[0]
    # A mask of all 0.0 means no parameters have been pruned, so this measures
    # the original fitness.
    ref_net.change_mask(len(crates) * [0.0], apply_MP_on_mask)
    ref_net.apply_mask()
    original_fit = evaluate(ref_net, valid, fields)
    print('original fit: {}'.format(original_fit))

    ref_net = masked_models[0]
    ref_net.change_mask(crates, apply_MP_on_mask)
    ref_net.apply_mask()
    tmp_fit = evaluate(ref_net, valid, fields)
    print("start init threshold:", crates)
    print('Start sparsity: {}%'.format(ref_net.get_sparsity() * 100))
    # Assume the initial crates store the size of each tensor.
    es.set_initFitness(es.popsize * [ref_net.get_sparsity()])
    #es.ask()
    #tmp_fit = torch.FloatTensor([0,0,0])
    end_t = time.time()
    total_time = (end_t - start_t)
    print('fit:{}'.format(tmp_fit))
    print('time {}min elapse'.format(total_time / 60.))
    print('***************NCS initialization***************')

    ref_net.clear_cache()
    valid.fields = []  # clear fields so ``valid`` can be sent to the workers
    processes = []
    results = {'result_NCS': torch.FloatTensor(crates)}
    results['result_NCS'].share_memory_()
    # Evaluate the individuals in parallel, one process per population member.
    for rank in range(popsize):
        p = Process(target=init_processes,
                    args=(rank, popsize, original_fit, acc_constraint,
                          prune_and_eval, valid, es, masked_models, num_runs,
                          results))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    valid.fields = fields

    ref_net.change_mask(results['result_NCS'].numpy(), apply_MP_on_mask)
    ref_net.apply_mask()
    best_prune = evaluate(ref_net, valid, fields)
    print('Accuracy:{}=>{}, ppl:{}=>{}, sparsity: {}%'.format(
        original_fit[1], best_prune[1], original_fit[0], best_prune[0],
        ref_net.get_sparsity() * 100.))
    logger.scalar_summary('ncs_start_acc', tmp_fit[1], num_runs)
    logger.scalar_summary('ncs_start_ppl', tmp_fit[0], num_runs)
    logger.scalar_summary('ncs_best_acc', best_prune[1], num_runs)
    logger.scalar_summary('ncs_best_ppl', best_prune[0], num_runs)

    saved_model_name = None  # only set when a checkpoint is written below
    if checkpoint is not None:
        # Unwrap nn.DataParallel (if present) before saving so the state-dict
        # keys are not prefixed with ``module.``.
        real_model = (ref_net.masked_model.module
                      if isinstance(ref_net.masked_model, nn.DataParallel)
                      else ref_net.masked_model)
        real_generator = (real_model.generator.module
                          if isinstance(real_model.generator, nn.DataParallel)
                          else real_model.generator)
        model_state_dict = real_model.state_dict()
        model_state_dict = {
            k: v for k, v in model_state_dict.items() if 'generator' not in k
        }
        generator_state_dict = real_generator.state_dict()
        checkpoint['model'] = model_state_dict
        checkpoint['generator'] = generator_state_dict
        saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (
            name_mark, num_runs, Model_type, layer_group_type, str(acc_constraint))
        print("saved model:", saved_model_name)
        torch.save(checkpoint, SAVE_MODEL_TMP_FOLDER + saved_model_name)
    return results['result_NCS'].numpy(), saved_model_name, ref_net
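# The unwrap-before-save pattern used above, in isolation: ``nn.DataParallel``
# prefixes every parameter name with ``module.``, so saving the wrapped model
# would bake that prefix into the checkpoint. Unwrapping first keeps the state
# dict loadable by a plain model. A minimal sketch:
import torch.nn as nn

model = nn.Linear(4, 2)
wrapped = nn.DataParallel(model)
real_model = wrapped.module if isinstance(wrapped, nn.DataParallel) else wrapped
print(list(wrapped.state_dict())[0])     # 'module.weight'
print(list(real_model.state_dict())[0])  # 'weight'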
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    # parser.add_argument('--env_name', type=str, default='CartPole-v0')
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    # parser.add_argument('--reward_to_go', '-rtg', type=bool, default=True)
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=64)
    parser.add_argument('--dir', '-d', type=str, default='test')
    args = parser.parse_args()

    if not os.path.exists(args.dir):
        os.makedirs(args.dir)
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join(args.dir, logdir)
    if not os.path.exists(logdir):
        os.makedirs(logdir)

    max_path_length = args.ep_len if args.ep_len > 0 else None

    processes = []
    for e in range(args.n_experiments):
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        def train_func():
            train_PG(exp_name=args.exp_name,
                     env_name=args.env_name,
                     n_iter=args.n_iter,
                     gamma=args.discount,
                     min_timesteps_per_batch=args.batch_size,
                     max_path_length=max_path_length,
                     learning_rate=args.learning_rate,
                     reward_to_go=args.reward_to_go,
                     animate=args.render,
                     logdir=os.path.join(logdir, '%d' % seed),
                     normalize_advantages=not args.dont_normalize_advantages,
                     nn_baseline=args.nn_baseline,
                     seed=seed,
                     n_layers=args.n_layers,
                     size=args.size)

        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # If you uncomment the line below, the loop will block until this
        # process finishes.
        # p.join()

    for p in processes:
        p.join()
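# Why ``action='store_true'`` is used instead of the commented-out
# ``type=bool`` alternative above: argparse applies ``bool()`` to the raw
# string, and every non-empty string is truthy, so ``--reward_to_go False``
# would still parse as True. A quick demonstration:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--flag_bool', type=bool, default=False)   # pitfall
parser.add_argument('--flag_store', action='store_true')       # correct
args = parser.parse_args(['--flag_bool', 'False'])
print(args.flag_bool)   # True -- bool('False') is True
print(args.flag_store)  # False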
def train_ai2thor(model, args, rank=0, b=None):
    seed = args.seed + 10000 * rank
    torch.manual_seed(seed)
    np.random.seed(seed)
    # torch.cuda.set_device(rank)
    device = torch.device(f'cuda:{rank}')
    os.environ['DISPLAY'] = f':{rank}'
    model = model.to(device)
    model.share_memory()

    # Experience buffer, shared with the worker processes.
    storage = PPOBuffer(model.obs_shape, args.steps, args.num_workers,
                        args.state_size, args.gamma, device=device)
    storage.share_memory()

    #torch.multiprocessing.set_start_method('spawn')
    # Start the worker processes.
    ready_to_works = [Event() for _ in range(args.num_workers)]
    exit_flag = Value('i', 0)
    queue = SimpleQueue()
    processes = []
    task_config_file = "config_files/multiMugTaskTrain.json"
    for worker_id in range(args.num_workers):
        p = Process(target=worker,
                    args=(worker_id, model, storage, ready_to_works[worker_id],
                          queue, exit_flag, task_config_file))
        p.start()
        processes.append(p)

    # Start the trainer.
    train_params = {
        "epochs": args.epochs,
        "steps": args.steps,
        "world_size": args.world_size,
        "num_workers": args.num_workers
    }
    ppo_params = {
        "clip_param": args.clip_param,
        "train_iters": args.train_iters,
        "mini_batch_size": args.mini_batch_size,
        "value_loss_coef": args.value_loss_coef,
        "entropy_coef": args.entropy_coef,
        "rnn_steps": args.rnn_steps,
        "lr": args.lr,
        "max_kl": args.max_kl
    }
    distributed = False
    if args.world_size > 1:
        distributed = True
        # Initialize the process group: distributed backend type and the URL
        # used to set up distributed training.
        dist_backend = 'nccl'
        dist_url = "tcp://127.0.0.1:23456"
        print("Initialize Process Group... pid:", os.getpid())
        dist.init_process_group(backend=dist_backend, init_method=dist_url,
                                rank=rank, world_size=args.world_size)
        # Wrap the model in DistributedDataParallel.
        model = DistributedDataParallel(model, device_ids=[rank], output_device=rank)

    learner(model, storage, train_params, ppo_params, ready_to_works, queue,
            exit_flag, rank, distributed, b)

    for p in processes:
        p.join()
        print("process", p.pid, "joined")
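# A hedged sketch of the worker-side handshake implied above (the real
# ``worker`` is defined elsewhere; the rollout itself is elided): block on the
# per-worker Event until the learner requests a rollout, fill this worker's
# slice of the shared PPOBuffer, then report through the queue. ``exit_flag``
# lets the learner stop all workers after the last epoch.
def worker(worker_id, model, storage, ready_to_work, queue, exit_flag,
           task_config_file):
    while exit_flag.value == 0:
        ready_to_work.wait()    # the learner sets the event to request a rollout
        ready_to_work.clear()
        # ... collect one rollout into storage for this worker_id ...
        queue.put(worker_id)    # tell the learner this worker has finished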
class MultiprocessIterator(DataIterator):
    """
    Wraps another `DataIterator` and uses it to generate tensor dicts
    using multiple processes.

    # Parameters

    base_iterator : `DataIterator`
        The `DataIterator` for generating tensor dicts. It will be shared among
        processes, so it should not be stateful in any way.
    num_workers : `int`, optional (default = 1)
        The number of processes used for generating tensor dicts.
    output_queue_size : `int`, optional (default = 1000)
        The size of the output queue on which tensor dicts are placed to be consumed.
        You might need to increase this if you're generating tensor dicts too quickly.
    """

    def __init__(
        self, base_iterator: DataIterator, num_workers: int = 1, output_queue_size: int = 1000
    ) -> None:
        super().__init__()
        self.num_workers = num_workers
        self.batch_size = base_iterator._batch_size
        self.output_queue_size = output_queue_size

        # These two options make the iterator stateful, which means it can't be shared
        # across multiple processes.
        if base_iterator._cache_instances:
            raise ConfigurationError("cannot use Multiprocess iterator with cache_instances")
        if base_iterator._instances_per_epoch:
            raise ConfigurationError("cannot use instances_per_epoch with Multiprocess iterator")

        self.iterator = base_iterator
        self.processes: List[Process] = []
        self.queuer: Optional[Process] = None

    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        raise RuntimeError("MultiprocessIterator doesn't use create_batches")

    def index_with(self, vocab: Vocabulary):
        self.iterator.index_with(vocab)

    def _call_with_instances(
        self, instances: Iterable[Instance], num_epochs: int, shuffle: bool
    ) -> Iterator[TensorDict]:
        # JoinableQueue needed here as sharing tensors across processes
        # requires that the creating process not exit prematurely.
        output_queue = JoinableQueue(self.output_queue_size)
        input_queue = Queue(self.output_queue_size * self.batch_size)

        # Start the process that populates the input queue.
        self.queuer = Process(
            target=_queuer, args=(instances, input_queue, self.num_workers, num_epochs)
        )
        self.queuer.start()

        # Start the tensor-dict workers.
        for i in range(self.num_workers):
            args = (input_queue, output_queue, self.iterator, shuffle, i)
            process = Process(target=_create_tensor_dicts_from_queue, args=args)
            process.start()
            self.processes.append(process)

        num_finished = 0
        while num_finished < self.num_workers:
            item = output_queue.get()
            output_queue.task_done()
            if isinstance(item, int):
                num_finished += 1
                logger.info(f"worker {item} finished ({num_finished} / {self.num_workers})")
            else:
                yield item

        for process in self.processes:
            process.join()
        self.processes.clear()

        if self.queuer is not None:
            self.queuer.join()
            self.queuer = None

    def _call_with_qiterable(
        self, qiterable: QIterable, num_epochs: int, shuffle: bool
    ) -> Iterator[TensorDict]:
        # JoinableQueue needed here as sharing tensors across processes
        # requires that the creating process not exit prematurely.
        output_queue = JoinableQueue(self.output_queue_size)

        for _ in range(num_epochs):
            qiterable.start()

            # Start the tensor-dict workers.
            for i in range(self.num_workers):
                args = (qiterable, output_queue, self.iterator, shuffle, i)
                process = Process(target=_create_tensor_dicts_from_qiterable, args=args)
                process.start()
                self.processes.append(process)

            num_finished = 0
            while num_finished < self.num_workers:
                item = output_queue.get()
                output_queue.task_done()
                if isinstance(item, int):
                    num_finished += 1
                    logger.info(f"worker {item} finished ({num_finished} / {self.num_workers})")
                else:
                    yield item

            for process in self.processes:
                process.join()
            self.processes.clear()

            qiterable.join()

    def __call__(
        self, instances: Iterable[Instance], num_epochs: int = None, shuffle: bool = True
    ) -> Iterator[TensorDict]:
        # If it runs forever, the worker processes won't shut down correctly.
        # TODO(joelgrus) find a solution for this
        if num_epochs is None:
            raise ConfigurationError(
                "Multiprocess Iterator must be run for a fixed number of epochs"
            )

        if isinstance(instances, QIterable):
            return self._call_with_qiterable(instances, num_epochs, shuffle)
        else:
            return self._call_with_instances(instances, num_epochs, shuffle)

    def __del__(self) -> None:
        """
        Terminate processes if the user hasn't joined implicitly by consuming all
        the tensors. This is necessary as leaving stray processes running can
        corrupt shared state. In brief, we've observed shared memory counters
        being reused (when the memory was free from the perspective of the parent
        process) while the stray workers still held a reference to them.

        For a discussion of using destructors in Python in this manner, see
        https://eli.thegreenplace.net/2009/06/12/safely-using-destructors-in-python/.
        """
        for process in self.processes:
            process.terminate()

        if self.queuer is not None:
            self.queuer.terminate()
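# A hedged usage sketch (assumes the AllenNLP-style surroundings this class
# comes from; ``BasicIterator``, ``vocab``, and ``instances`` stand in for
# whatever the caller already has, and ``process_batch`` is hypothetical):
base_iterator = BasicIterator(batch_size=32)
iterator = MultiprocessIterator(base_iterator, num_workers=4)
iterator.index_with(vocab)
# num_epochs must be finite; otherwise __call__ raises ConfigurationError.
for tensor_dict in iterator(instances, num_epochs=1):
    process_batch(tensor_dict)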