Example #1
 def launch_optimizer_workers(self, n_itr):
     """
     If multi-GPU optimization, launches an optimizer worker for each GPU
     and initializes ``torch.distributed.``
     """
     if self.world_size == 1:
         return
     offset = self.affinity.optimizer[0].get("master_cpus", [0])[0]
     port = find_port(offset=offset)
     affinities = self.affinity.optimizer
     runners = [AsyncOptWorker(
         rank=rank,
         world_size=self.world_size,
         algo=self.algo,
         agent=self.agent,
         n_itr=n_itr,
         affinity=affinities[rank],
         seed=self.seed + 100,
         ctrl=self.ctrl,
         port=port,
     ) for rank in range(1, len(affinities))]
     procs = [mp.Process(target=r.optimize, args=()) for r in runners]
     for p in procs:
         p.start()
     torch.distributed.init_process_group(
         backend="nccl",
         rank=0,
         world_size=self.world_size,
         init_method=f"tcp://127.0.0.1:{port}",
     )
     self.optimizer_procs = procs
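The pattern above is a master-joins-last TCP rendezvous: the worker processes are spawned first, each is expected to call ``torch.distributed.init_process_group`` with its own rank on the same port, and the master joins as rank 0 once they are running. A minimal, self-contained sketch of that rendezvous (not rlpyt code; the gloo backend is used here so it runs on CPU):

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def _worker(rank, world_size, port):
    # Each spawned process joins the same TCP rendezvous under its own rank.
    dist.init_process_group(
        backend="gloo",
        rank=rank,
        world_size=world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
    t = torch.ones(1)
    dist.all_reduce(t)  # after this, every rank holds the value world_size
    dist.destroy_process_group()


def main(world_size=2, port=29500):
    ctx = mp.get_context("spawn")
    procs = [ctx.Process(target=_worker, args=(rank, world_size, port))
             for rank in range(1, world_size)]
    for p in procs:
        p.start()
    _worker(0, world_size, port)  # master joins last as rank 0, as above
    for p in procs:
        p.join()


if __name__ == "__main__":
    main()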
Example #2
File: sync_rl.py  Project: zren96/rlpyt
 def launch_workers(self):
     """
     As part of startup, fork a separate Python process for each additional
     GPU; the master process runs on the first GPU.  Initialize
     ``torch.distributed`` so the ``DistributedDataParallel`` wrapper can
     work--also makes ``torch.distributed`` avaiable for other
     communication.
     """
     self.affinities = self.affinity
     self.affinity = self.affinities[0]
     self.world_size = world_size = len(self.affinities)
     self.rank = rank = 0
     self.par = par = self.build_par_objs(world_size)
     if self.seed is None:
         self.seed = make_seed()
     port = find_port(offset=self.affinity.get("master_cpus",
                                               [0])[0])  # 29500
     backend = "gloo" if self.affinity.get("cuda_idx",
                                           None) is None else "nccl"
     workers_kwargs = [
         dict(
             algo=self.algo,
             agent=self.agent,
             sampler=self.sampler,
             n_steps=self.n_steps,
             seed=self.seed + 100 * rank,
             affinity=self.affinities[rank],
             log_interval_steps=self.log_interval_steps,
             rank=rank,
             world_size=world_size,
             port=port,
             backend=backend,
             par=par,
         ) for rank in range(1, world_size)
     ]
     workers = [self.WorkerCls(**w_kwargs) for w_kwargs in workers_kwargs]
     # This fork launches the workers with ``eval`` instead of the
     # upstream ``train`` entry point:
     self.workers = [mp.Process(target=w.eval, args=()) for w in workers]
     for w in self.workers:
         w.start()
     torch.distributed.init_process_group(
         backend=backend,
         rank=rank,
         world_size=world_size,
         init_method=f"tcp://128.112.35.85:{port}",
     )
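``find_port`` is a utility from the same codebase; judging by the call ``find_port(offset=self.affinity.get("master_cpus", [0])[0])`` and the ``# 29500`` note, I assume it derives a free TCP port from a base port plus a CPU-core offset, so that experiments pinned to different cores do not collide. A hypothetical stand-in (the name ``find_free_port`` and the probing logic are mine, not rlpyt's):

import socket


def find_free_port(offset=0, base=29500, max_tries=100):
    # Hypothetical stand-in for find_port(offset=...): start from a base port
    # shifted by the offset (e.g. the first pinned CPU core) and probe until
    # a port binds successfully.
    for candidate in range(base + offset, base + offset + max_tries):
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            try:
                s.bind(("127.0.0.1", candidate))
                return candidate
            except OSError:
                continue
    raise RuntimeError("no free port found")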
Example #3
 def launch_workers(self):
     """
     As part of startup, fork a separate Python process for each additional
     GPU; the master process runs on the first GPU.  Initialize
     ``torch.distributed`` so the ``DistributedDataParallel`` wrapper can
     work--also makes ``torch.distributed`` avaiable for other
     communication.
     """
     self.affinities = self.affinity
     self.affinity = self.affinities[0]
     self.world_size = world_size = len(self.affinities)
     self.rank = rank = 0
     self.par = par = self.build_par_objs(world_size)
     if self.seed is None:
         self.seed = make_seed()
     port = find_port(offset=self.affinity.get("master_cpus", [0])[0])
     backend = "gloo" if self.affinity.get("cuda_idx",
                                           None) is None else "nccl"
     workers_kwargs = [
         dict(
             algo=self.algo,
             agent=self.agent,
             sampler=self.sampler,
             n_steps=self.n_steps,
             seed=self.seed + 100 * rank,
             affinity=self.affinities[rank],
             log_interval_steps=self.log_interval_steps,
             rank=rank,
             world_size=world_size,
             port=port,
             backend=backend,
             par=par,
             log_dir=self.log_dir,
             pretrain=self.pretrain,
         ) for rank in range(1, world_size)
     ]
     workers = [self.WorkerCls(**w_kwargs) for w_kwargs in workers_kwargs]
     self.workers = [mp.Process(target=w.train, args=()) for w in workers]
     for w in self.workers:
         w.start()
     torch.distributed.init_process_group(
         backend=backend,
         rank=rank,
         world_size=world_size,
         init_method=f"tcp://127.0.0.1:{port}",
     )
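The docstring mentions ``DistributedDataParallel``, but the worker side of the setup is not shown here. A minimal sketch of what each spawned ``train`` process presumably does: join the same process group with the rank, port, and backend passed in ``workers_kwargs``, then wrap its model so gradients are averaged across ranks. The function name and toy model are mine, not rlpyt's; gloo is used so the snippet also runs without GPUs:

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def worker_train(rank, world_size, port, backend="gloo"):
    # Join the process group that the master initializes at the end of
    # launch_workers(), using the rank and port it handed to this worker.
    dist.init_process_group(
        backend=backend,
        rank=rank,
        world_size=world_size,
        init_method=f"tcp://127.0.0.1:{port}",
    )
    model = torch.nn.Linear(4, 2)
    ddp_model = DDP(model)  # gradients are all-reduced across ranks on backward()
    loss = ddp_model(torch.randn(8, 4)).sum()
    loss.backward()  # after this, gradients are identical on every rank
    dist.destroy_process_group()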
Example #4
 def launch_workers(self):
     """
     As part of startup, fork a separate Python process for each additional
     GPU; the master process runs on the first GPU.  Initialize
     ``torch.distributed`` so the ``DistributedDataParallel`` wrapper can
     work; it also makes ``torch.distributed`` available for other
     communication.
     """
     self.affinities = self.affinity
     self.affinity = self.affinities[0]
     self.world_size = world_size = len(self.affinities)
     self.rank = rank = 0
     self.par = par = self.build_par_objs(world_size)
     if self.seed is None:
         self.seed = make_seed()
     port = find_port(offset=self.affinity.get("master_cpus", [0])[0])
     backend = "gloo" if self.affinity.get("cuda_idx",
                                           None) is None else "nccl"
     workers_kwargs = [
         dict(
             algo=self.algo,
             agent=self.agent,
             sampler=self.sampler,
             n_steps=self.n_steps,
             seed=self.seed + 100 * rank,
             affinity=self.affinities[rank],
             log_interval_steps=self.log_interval_steps,
             rank=rank,
             world_size=world_size,
             port=port,
             backend=backend,
             par=par,
         ) for rank in range(1, world_size)
     ]
     workers = [self.WorkerCls(**w_kwargs) for w_kwargs in workers_kwargs]
     self.workers = [mp.Process(target=w.train, args=()) for w in workers]
     for w in self.workers:
         w.start()
     torch.distributed.init_process_group(
         backend=backend,
         rank=rank,
         world_size=world_size,
         init_method=f"tcp://127.0.0.1:{port}",
     )
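All four examples start the worker processes but none of them shows teardown. A hypothetical shutdown counterpart (the method name is mine, not from these sources) would join the processes stored in ``self.workers`` and release the process group:

import torch.distributed as dist


def shutdown_workers(self):
    # Wait for every worker process launched in launch_workers() to exit,
    # then release the process group created by init_process_group().
    for w in self.workers:
        w.join()
    if dist.is_initialized():
        dist.destroy_process_group()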