def launch_optimizer_workers(self, n_itr):
    """
    If multi-GPU optimization, launches an optimizer worker for each GPU
    and initializes ``torch.distributed``.
    """
    if self.world_size == 1:
        return
    offset = self.affinity.optimizer[0].get("master_cpus", [0])[0]
    port = find_port(offset=offset)
    affinities = self.affinity.optimizer
    runners = [AsyncOptWorker(
        rank=rank,
        world_size=self.world_size,
        algo=self.algo,
        agent=self.agent,
        n_itr=n_itr,
        affinity=affinities[rank],
        seed=self.seed + 100,
        ctrl=self.ctrl,
        port=port,
    ) for rank in range(1, len(affinities))]
    procs = [mp.Process(target=r.optimize, args=()) for r in runners]
    for p in procs:
        p.start()
    torch.distributed.init_process_group(
        backend="nccl",
        rank=0,
        world_size=self.world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
    self.optimizer_procs = procs
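# Worker-side counterpart to the launcher above: a minimal sketch (assumed
# names, not the actual AsyncOptWorker code) of what each spawned process
# must do before any collective call. Every rank joins the same process
# group, using the port chosen by the master and its own nonzero rank.
def _async_opt_worker_sketch(rank, world_size, port):
    torch.distributed.init_process_group(
        backend="nccl",
        rank=rank,  # ranks 1..world_size-1; the master process holds rank 0
        world_size=world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
    # ...optimizer loop runs here; gradients synchronize via NCCL all-reduce.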
def launch_workers(self):
    """
    As part of startup, fork a separate Python process for each additional
    GPU; the master process runs on the first GPU. Initialize
    ``torch.distributed`` so the ``DistributedDataParallel`` wrapper can
    work; this also makes ``torch.distributed`` available for other
    communication.
    """
    self.affinities = self.affinity
    self.affinity = self.affinities[0]
    self.world_size = world_size = len(self.affinities)
    self.rank = rank = 0
    self.par = par = self.build_par_objs(world_size)
    if self.seed is None:
        self.seed = make_seed()
    port = find_port(offset=self.affinity.get("master_cpus", [0])[0])
    backend = "gloo" if self.affinity.get("cuda_idx", None) is None else "nccl"
    workers_kwargs = [
        dict(
            algo=self.algo,
            agent=self.agent,
            sampler=self.sampler,
            n_steps=self.n_steps,
            seed=self.seed + 100 * rank,
            affinity=self.affinities[rank],
            log_interval_steps=self.log_interval_steps,
            rank=rank,
            world_size=world_size,
            port=port,
            backend=backend,
            par=par,
        )
        for rank in range(1, world_size)
    ]
    workers = [self.WorkerCls(**w_kwargs) for w_kwargs in workers_kwargs]
    # This launcher runs workers in evaluation mode (the sibling launchers
    # below target ``train`` instead).
    self.workers = [mp.Process(target=w.eval, args=()) for w in workers]
    for w in self.workers:
        w.start()
    torch.distributed.init_process_group(
        backend=backend,
        rank=rank,
        world_size=world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
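# All of these launchers choose the rendezvous port with ``find_port``. A
# minimal sketch of what it presumably does (the real utility may differ):
# probe ports upward from a base (29500 is torch.distributed's conventional
# default) plus a CPU-derived offset, so concurrent runs on one machine do
# not collide on the init address.
def _find_port_sketch(offset=0, base=29500, max_tries=100):
    import socket
    for port in range(base + offset, base + offset + max_tries):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("127.0.0.1", port))
                return port  # bind succeeded, so the port is free
            except OSError:
                continue  # port already in use; try the next one
    raise RuntimeError("No free port found for torch.distributed init.")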
def launch_workers(self):
    """
    As part of startup, fork a separate Python process for each additional
    GPU; the master process runs on the first GPU. Initialize
    ``torch.distributed`` so the ``DistributedDataParallel`` wrapper can
    work; this also makes ``torch.distributed`` available for other
    communication.
    """
    self.affinities = self.affinity
    self.affinity = self.affinities[0]
    self.world_size = world_size = len(self.affinities)
    self.rank = rank = 0
    self.par = par = self.build_par_objs(world_size)
    if self.seed is None:
        self.seed = make_seed()
    port = find_port(offset=self.affinity.get("master_cpus", [0])[0])
    backend = "gloo" if self.affinity.get("cuda_idx", None) is None else "nccl"
    workers_kwargs = [
        dict(
            algo=self.algo,
            agent=self.agent,
            sampler=self.sampler,
            n_steps=self.n_steps,
            seed=self.seed + 100 * rank,
            affinity=self.affinities[rank],
            log_interval_steps=self.log_interval_steps,
            rank=rank,
            world_size=world_size,
            port=port,
            backend=backend,
            par=par,
            log_dir=self.log_dir,
            pretrain=self.pretrain,
        )
        for rank in range(1, world_size)
    ]
    workers = [self.WorkerCls(**w_kwargs) for w_kwargs in workers_kwargs]
    self.workers = [mp.Process(target=w.train, args=()) for w in workers]
    for w in self.workers:
        w.start()
    torch.distributed.init_process_group(
        backend=backend,
        rank=rank,
        world_size=world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
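# Once ``init_process_group`` has returned in every process, models can be
# wrapped for synchronized gradients, as the docstring above describes. A
# minimal sketch with assumed names (the real wrapping happens inside the
# agent elsewhere in the codebase):
def _ddp_wrap_sketch(model, cuda_idx=None):
    from torch.nn.parallel import DistributedDataParallel as DDP
    if cuda_idx is not None:  # GPU case, matching the "nccl" backend above
        model = model.to(f"cuda:{cuda_idx}")
        return DDP(model, device_ids=[cuda_idx])
    return DDP(model)  # CPU case, matching the "gloo" backend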
def launch_workers(self):
    """
    As part of startup, fork a separate Python process for each additional
    GPU; the master process runs on the first GPU. Initialize
    ``torch.distributed`` so the ``DistributedDataParallel`` wrapper can
    work; this also makes ``torch.distributed`` available for other
    communication.
    """
    self.affinities = self.affinity
    self.affinity = self.affinities[0]
    self.world_size = world_size = len(self.affinities)
    self.rank = rank = 0
    self.par = par = self.build_par_objs(world_size)
    if self.seed is None:
        self.seed = make_seed()
    port = find_port(offset=self.affinity.get("master_cpus", [0])[0])
    backend = "gloo" if self.affinity.get("cuda_idx", None) is None else "nccl"
    workers_kwargs = [
        dict(
            algo=self.algo,
            agent=self.agent,
            sampler=self.sampler,
            n_steps=self.n_steps,
            seed=self.seed + 100 * rank,
            affinity=self.affinities[rank],
            log_interval_steps=self.log_interval_steps,
            rank=rank,
            world_size=world_size,
            port=port,
            backend=backend,
            par=par,
        )
        for rank in range(1, world_size)
    ]
    workers = [self.WorkerCls(**w_kwargs) for w_kwargs in workers_kwargs]
    self.workers = [mp.Process(target=w.train, args=()) for w in workers]
    for w in self.workers:
        w.start()
    torch.distributed.init_process_group(
        backend=backend,
        rank=rank,
        world_size=world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
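# Worked example of the per-rank seed spacing used by the launchers above,
# with hypothetical values: the master keeps rank 0 and affinities[0], and
# each spawned worker gets seed + 100 * rank so no two processes sample
# identical random streams.
def _per_rank_seeds_sketch(seed=42, world_size=3):
    return {rank: seed + 100 * rank for rank in range(1, world_size)}
    # -> {1: 142, 2: 242}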