def __init__(
    self, cfg, obs_space, action_space, num_agents, worker_idx, shared_buffers,
    task_queue, policy_queues, report_queue, learner_queues,
):
    """
    Store the worker's settings and queues, then launch its background process.

    The constructor only records its arguments, validates how the env vector is
    split, and immediately starts a daemon ``TorchProcess`` running ``self._run``.

    :param cfg: config object; ``num_envs_per_worker``, ``worker_num_splits`` and
        ``num_policies`` are read from it here
    :param obs_space: stored as ``self.obs_space``
    :param action_space: stored as ``self.action_space``
    :param num_agents: stored as ``self.num_agents``
    :param worker_idx: stored as ``self.worker_idx``
    :param shared_buffers: stored as ``self.shared_buffers``
    :param task_queue: stored as ``self.task_queue``
    :param policy_queues: stored as ``self.policy_queues``
    :param report_queue: stored as ``self.report_queue``
    :param learner_queues: stored as ``self.learner_queues``
    :raises AssertionError: if the number of envs per worker is smaller than, or
        not a multiple of, the number of splits
    """
    # What this worker is and what it operates on.
    self.cfg = cfg
    self.obs_space = obs_space
    self.action_space = action_space
    self.num_agents = num_agents
    self.worker_idx = worker_idx
    self.shared_buffers = shared_buffers

    # Run-time bookkeeping.
    self.terminate = False
    self.num_complete_rollouts = 0

    # The env vector must break into equally sized, non-empty splits.
    vector_size = cfg.num_envs_per_worker
    num_splits = cfg.worker_num_splits
    self.vector_size = vector_size
    self.num_splits = num_splits
    assert vector_size >= num_splits
    assert vector_size % num_splits == 0, 'Vector size should be divisible by num_splits'

    # Not created in the constructor; starts out as None.
    self.env_runners = None

    # Communication channels.
    self.policy_queues = policy_queues
    self.report_queue = report_queue
    self.learner_queues = learner_queues
    self.task_queue = task_queue

    # One (initially empty) slot per policy.
    self.reward_shaping = [None] * cfg.num_policies

    # Start running self._run in a daemon process right away.
    self.process = TorchProcess(target=self._run, daemon=True)
    self.process.start()
def __init__(
    self, cfg, obs_space, action_space, num_agents, worker_idx, shared_buffers,
    task_queue, policy_queues, report_queue, learner_queues,
):
    """
    Set up the actor worker and start its process.

    :param cfg: the global configuration (every CLI parameter)
    :param obs_space: the environment's observation space (or spaces)
    :param action_space: the environment's action space (or spaces)
    :param num_agents: how many agents each env has (for now every env is expected to have the same
    number of agents, although lifting that restriction should be easy)
    :param worker_idx: the index of this worker process
    :param shared_buffers: shared-memory data structures that the main process initialized
    (see shared_buffers.py)
    :param task_queue: incoming messages addressed to THIS actor worker. The task types are listed in the
    worker loop; the most frequent one is ROLLOUT_STEP, i.e. "here are your actions, advance the
    simulation by one step".
    :param policy_queues: FIFO queues, one per policy taking part in training. A request is put on policy
    queue #N to obtain actions for the envs (agents) controlled by policy #N.
    :param report_queue: one-way channel to the main process, used for various stats and the like
    :param learner_queues: one-way channel to the learner, used to send trajectory buffers for learning
    """
    # What this worker is and what it operates on.
    self.cfg = cfg
    self.obs_space = obs_space
    self.action_space = action_space
    self.num_agents = num_agents
    self.worker_idx = worker_idx
    self.shared_buffers = shared_buffers

    # Run-time bookkeeping.
    self.terminate = False
    self.num_complete_rollouts = 0

    # The env vector must break into equally sized, non-empty splits.
    vector_size = cfg.num_envs_per_worker
    num_splits = cfg.worker_num_splits
    self.vector_size = vector_size
    self.num_splits = num_splits
    assert vector_size >= num_splits
    assert vector_size % num_splits == 0, 'Vector size should be divisible by num_splits'

    # Not created in the constructor; starts out as None.
    self.env_runners = None

    # Communication channels.
    self.policy_queues = policy_queues
    self.report_queue = report_queue
    self.learner_queues = learner_queues
    self.task_queue = task_queue

    # One (initially empty) slot per policy.
    self.reward_shaping = [None] * cfg.num_policies

    # Start running self._run in a daemon process right away.
    self.process = TorchProcess(target=self._run, daemon=True)
    self.process.start()
def __init__(
    self, worker_idx, policy_id, cfg, obs_space, action_space, shared_buffers,
    policy_queue, actor_queues, report_queue, task_queue, policy_lock,
    resume_experience_collection_cv,
):
    """
    Record the policy worker's settings and prepare (but do not start) its process.

    The model-related attributes (``device``, ``actor_critic``,
    ``shared_model_weights``) are left as ``None`` here. A daemon ``TorchProcess``
    targeting ``self._run`` is created and stored in ``self.process``; this
    constructor does not call ``start()`` on it.

    :param worker_idx: index of this worker (logged and stored)
    :param policy_id: id of the policy this worker belongs to (logged and stored)
    :param cfg: stored as ``self.cfg``
    :param obs_space: stored as ``self.obs_space``
    :param action_space: stored as ``self.action_space``
    :param shared_buffers: object providing ``tensors_individual_transitions``,
        ``policy_versions`` and ``stop_experience_collection``, which are aliased
        onto this instance
    :param policy_queue: stored as ``self.policy_queue``
    :param actor_queues: stored as ``self.actor_queues``
    :param report_queue: stored as ``self.report_queue``
    :param task_queue: queue other components use to talk to this particular worker
    :param policy_lock: stored as ``self.policy_lock``
    :param resume_experience_collection_cv: stored under the same attribute name
    """
    super().__init__()
    log.info('Initializing policy worker %d for policy %d', worker_idx, policy_id)

    # Identity and configuration.
    self.worker_idx = worker_idx
    self.policy_id = policy_id
    self.cfg = cfg
    self.obs_space = obs_space
    self.action_space = action_space

    # Model state is not set up in the constructor.
    self.device = None
    self.actor_critic = None
    self.shared_model_weights = None

    # Synchronization primitives handed in by the caller.
    self.policy_lock = policy_lock
    self.resume_experience_collection_cv = resume_experience_collection_cv

    # Communication channels.
    self.policy_queue = policy_queue
    self.actor_queues = actor_queues
    self.report_queue = report_queue
    # queue other components use to talk to this particular worker
    self.task_queue = task_queue

    # Lifecycle flags; the event starts out explicitly cleared.
    self.initialized = False
    self.terminate = False
    initialized_event = multiprocessing.Event()
    self.initialized_event = initialized_event
    initialized_event.clear()

    # Keep the shared buffers and alias the pieces used directly.
    self.shared_buffers = shared_buffers
    self.tensors_individual_transitions = shared_buffers.tensors_individual_transitions
    self.policy_versions = shared_buffers.policy_versions
    self.stop_experience_collection = shared_buffers.stop_experience_collection

    # Counters and pending work.
    self.latest_policy_version = -1
    self.num_policy_updates = 0
    self.requests = []
    self.total_num_samples = 0

    # Created here, started elsewhere.
    self.process = TorchProcess(target=self._run, daemon=True)
this_rank, test_data, batch_size=test_bsz) world_size = len(workers) + 1 class MyManager(BaseManager): pass MyManager.register('get_queue') MyManager.register('get_param') MyManager.register('get_stop_signal') manager = MyManager(address=(args.ps_ip, 5000), authkey=b'queue') manager.connect() q = manager.get_queue() # Queue receiving the model param_q = manager.get_param() # Queue reveiving the initial model stop_signal = manager.get_stop_signal() # Queue receiving the stop signal stop_flag = Value(c_bool, False) # Define a process monitoring the stop signal stop_p = Process(target=capture_stop, args=(stop_signal, stop_flag)) p = TorchProcess(target=init_processes, args=(this_rank, world_size, model, train_data, test_data, q, param_q, stop_flag, run)) p.start() stop_p.start() p.join() stop_p.join()
transform=test_transform) else: print('Model must be {} or {}!'.format('MnistCNN', 'AlexNet')) sys.exit(-1) models.append(model) train_bsz = args.train_bsz train_bsz /= len(workers) train_bsz = int(train_bsz) train_data = partition_dataset(train_dataset, workers) train_data_list = [] for i in workers: train_data_sub = select_dataset(workers, i, train_data, batch_size=train_bsz) train_data_list.append(train_data_sub) test_bsz = 400 # 用所有的测试数据测试 test_data = DataLoader(test_dataset, batch_size=test_bsz, shuffle = False) iterations_epoch = int(len(train_dataset) / args.train_bsz) save_path = str(args.save_path) save_path = save_path.rstrip('/') p = TorchProcess(target=init_processes, args=(workers, models, save_path, train_data_list, test_data,iterations_epoch, run)) p.start() p.join()
# Split the training batch size evenly across workers (truncated to an int).
train_bsz = int(train_bsz / len(workers))

# Partition both datasets across the workers ...
train_data = partition_dataset(train_dataset, workers)
test_data = partition_dataset(test_dataset, workers)

# ... and keep only the share that belongs to this rank.
this_rank = args.this_rank
train_data = select_dataset(workers, this_rank, train_data, batch_size=train_bsz)
test_data = select_dataset(workers, this_rank, test_data, batch_size=test_bsz)

# Disabled alternative for initializing the test dataset:
# test_data = DataLoader(test_dataset, batch_size=test_bsz, shuffle=True)

# One extra rank on top of the listed workers.
world_size = len(workers) + 1

# Normalize the save path: no trailing slashes.
save_path = str(args.save_path).rstrip('/')

# Run init_processes in a separate process and wait for it to finish.
p = TorchProcess(
    target=init_processes,
    args=(this_rank, world_size, workers, model, save_path, train_data, test_data, run),
)
p.start()
p.join()
parser = argparse.ArgumentParser()
# Cluster information
parser.add_argument('--ps-ip', type=str, default='127.0.0.1')
parser.add_argument('--ps-port', type=str, default='29000')
parser.add_argument('--this-rank', type=int, default=0)
parser.add_argument('--learners', type=str, default='1-2-3-4')
args = parser.parse_args()

# Disabled stub kept from the original file:
#     def run(rank, workers):
#         pass


def init_processes(rank, size, backend='tcp'):
    """Export the rendezvous address and join the torch.distributed process group.

    MASTER_ADDR and MASTER_PORT are taken from the module-level ``args``
    (``--ps-ip`` / ``--ps-port``) and written to the process environment before
    ``dist.init_process_group`` is called.

    :param rank: rank of the calling process within the group
    :param size: total number of processes in the group (world size)
    :param backend: backend name passed to ``dist.init_process_group``
    """
    os.environ.update(MASTER_ADDR=args.ps_ip, MASTER_PORT=args.ps_port)
    dist.init_process_group(backend, rank=rank, world_size=size)
    # fn(rank, workers)


if __name__ == '__main__':
    # '--learners' is a dash-separated list of integer worker ids, e.g. '1-2-3-4'.
    workers = list(map(int, str(args.learners).split('-')))
    this_rank = args.this_rank
    # One extra rank on top of the listed learners.
    world_size = len(workers) + 1

    p = TorchProcess(target=init_processes, args=(this_rank, world_size))
    p.start()
    p.join()
backend='tcp'): os.environ['MASTER_ADDR'] = args.ps_ip os.environ['MASTER_PORT'] = args.ps_port dist.init_process_group(backend, rank=rank, world_size=size) fn(rank, model, train_pics, train_bsz) if __name__ == '__main__': # 随机数设置 manual_seed = random.randint(1, 10000) random.seed(manual_seed) torch.manual_seed(manual_seed) workers = [int(v) for v in str(args.learners).split('-')] model = alexnet(num_classes=10) train_pics = 50000 train_bsz = 64 train_bsz /= len(workers) train_bsz = int(train_bsz) world_size = len(str(args.learners).split('-')) + 1 this_rank = args.this_rank p = TorchProcess(target=init_processes, args=(this_rank, world_size, model, train_pics, train_bsz, run)) p.start() p.join()
def __init__(self, inputs):
    """Keep ``inputs`` and immediately start a daemon process running ``self.act``.

    :param inputs: stored unchanged as ``self.inputs``
    """
    self.inputs = inputs

    # The process handle is kept on the instance and started right away.
    worker = TorchProcess(target=self.act, daemon=True)
    self.process = worker
    worker.start()
for i in workers: # 取得部分train_data,体现数据并行 print('Start: {}, End: {}'.format(sp[i][0], sp[i][1])) train_data_sub = train_data[sp[i][0]:sp[i][1]].contiguous() train_data_list.append(train_data_sub) ntokens = len(corpus.dictionary) print("--------------------------", ntokens) models = [] for i in range(workers_num + 1): model = RNNModel(args.model, ntokens, ninp=10, nhid=10, nlayers=2, dropout=0.2, tie_weights=True) models.append(model) print(get_parameter_number(model)) save_path = str(args.save_path) save_path = save_path.rstrip('/') p = TorchProcess(target=init_processes, args=(workers, models, save_path, train_data_list, test_data, ntokens, train_batch_size, run)) p.start() p.join()