def test_rl_rpc(self):
    if self.rank == 0:
        # Rank 0 is the agent.
        rpc.init_rpc(
            name=worker_name(self.rank),
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )
        agent = Agent(self.world_size)
        run_agent(agent, n_steps=int(TOTAL_EPISODE_STEP / (self.world_size - 1)))
        # Ensure training was run. We don't really care about whether the task was learned,
        # since the purpose of the test is to check the API calls.
        self.assertGreater(agent.running_reward, 0.0)
    else:
        # Other ranks are observers that passively wait for instructions from the agent.
        rpc.init_rpc(
            name=worker_name(self.rank),
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )
    rpc.shutdown()
def run(rank, world_size):
    print(f'rank = {rank} world_size = {world_size}')
    env_dict = {key: os.environ[key] for key in key_list}
    print(env_dict)
    if rank == 0:
        rpc.init_rpc(f"rank{rank}", rank=rank, world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)
        coordinator = Coordinator(world_size=world_size, lr=1e-3)
        coord_rref = RRef(coordinator)
        coordinator.run_training_loop(1000, coord_rref)
        torch.save(coordinator.policy, open(f'plots/policy_nworkers{world_size-1}.pt', 'wb'))
        json.dump(coordinator.stats, open(f'plots/stats_nworkers{world_size-1}.json', 'w'))
    else:
        rpc.init_rpc(f"rank{rank}", rank=rank, world_size=world_size,
                     backend=rpc.BackendType.TENSORPIPE)
    rpc.shutdown()
def run_worker(rank, world_size) -> None:
    r"""
    This is the entry point for all processes. Rank 0 is the agent.
    All other ranks are observers.
    """
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "29500"
    if rank == 0:
        # rank0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        agent = Agent(world_size)
        for i_episode in count(1):
            n_steps = int(TOTAL_EPISODE_STEP / (config.world_size - 1))
            agent.run_episode(n_steps=n_steps)
            last_reward = agent.finish_episode()

            if i_episode % config.log_interval == 0:
                print(
                    f"Episode {i_episode}\tLast reward: {last_reward:.2f}\tAverage reward: {agent.running_reward:.2f}"
                )
            if agent.running_reward > agent.reward_threshold:
                print(f"Solved! Running reward is now {agent.running_reward}!")
                break
    else:
        # other ranks are the observers, passively waiting for instructions from the agent
        rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)

    rpc.shutdown()
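# A minimal launcher sketch for the agent/observer entry point above, assuming the
# usual single-machine setup from the PyTorch RPC tutorials. The world_size value and
# the use of torch.multiprocessing.spawn are illustrative, not part of the original snippet.
import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = 3  # hypothetical: 1 agent + 2 observers
    # spawn() starts one process per rank and passes the rank as the first argument.
    mp.spawn(run_worker, args=(world_size,), nprocs=world_size, join=True)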
def new_test_method(self, *arg, **kwargs):
    # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
    # in tests.
    import torch.distributed.rpc.api as api

    api._ignore_rref_leak = False
    self.worker_id = self.rank

    if faulty_messages:
        _build_faulty_backend_options(faulty_messages)

    if setup_rpc:
        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

    return_value = old_test_method(self, *arg, **kwargs)

    if setup_rpc:
        rpc.shutdown(graceful=clean_shutdown)

    return return_value
def bench_mpi(args):
    guess_rank = int(os.environ["OMPI_COMM_WORLD_RANK"])
    os.environ["UCX_NET_DEVICES"] = best_device_map[guess_rank]

    torch.distributed.init_process_group(backend="mpi")
    os.environ["MASTER_ADDR"] = args.host
    os.environ["MASTER_PORT"] = "10639"

    if args.socket_name:
        os.environ["GLOO_SOCKET_IFNAME"] = args.socket_name
        os.environ["TP_SOCKET_IFNAME"] = args.socket_name

    init_method = f"tcp://{os.environ['MASTER_ADDR']}:{os.environ['MASTER_PORT']}"

    rank = torch.distributed.get_rank()
    world_size = torch.distributed.get_world_size()
    torch.cuda.set_device(rank % torch.cuda.device_count())

    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method=init_method
        ),
    )

    initialize_model_parallel(1, world_size)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
def run_worker(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256, rpc_timeout=600)

    import psutil
    p = psutil.Process()

    if rank == 0:
        p.cpu_affinity([0])
        print(
            f"Child #{rank}: Set my affinity to 0, affinity now {p.cpu_affinity()}",
            flush=True)
        rpc.init_rpc("master", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        run_master()
    else:
        p.cpu_affinity([rank - 1])
        print(
            f"Child #{rank}: Set my affinity to {rank - 1}, affinity now {p.cpu_affinity()}",
            flush=True)
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)

    # block until all rpcs finish
    rpc.shutdown()
def basic_rpc():
    init_rpc()
    if torch.distributed.get_rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    model = [nn.Linear(10, 10), nn.ReLU()]
    pipe = PipeRPCWrapper(model, [1, 1],
                          input_device=torch.cuda.current_device(),
                          worker_map=get_worker_map())

    pipe.foreach_worker(register_optimizer, include_self=True)

    inputs = torch.rand(10).cuda()
    output = pipe(inputs)
    loss = output.mean()
    loss.backward()

    pipe.foreach_worker(step_optimizer, include_self=True)

    pipe.eval()

    rpc.shutdown()
    torch.distributed.barrier()
def run_benchmark(rank, model, data, config):
    world_size = config.trainer_count + config.ps_count + 1

    os.environ['MASTER_ADDR'] = config.master_addr
    os.environ['MASTER_PORT'] = config.master_port

    rpc_backend_options = TensorPipeRpcBackendOptions()
    rpc_backend_options.init_method = config.rpc_init_method

    if rank == world_size - 1:
        # master = [trainer_count + parameter_server_count, trainer_count + parameter_server_count]
        run_master(rank, model, data, config, rpc_backend_options)
    elif rank >= config.trainer_count:
        # parameter_servers = [trainer_count, trainer_count + parameter_server_count)
        rpc.init_rpc(get_name(rank, config),
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)
    else:
        # trainers = [0, trainer_count)
        trainer_config = config.trainer_config
        ps_config = config.ps_config
        if (USE_CUDA_RPC in trainer_config and
                trainer_config[USE_CUDA_RPC] and
                USE_CUDA_RPC in ps_config and
                ps_config[USE_CUDA_RPC] and
                config.ps_count > 0):
            ps_rank = get_parameter_server_rank(rank, config)
            ps_name = get_name(ps_rank, config)
            rpc_backend_options.set_device_map(ps_name, {rank: ps_rank})
        trainer_name = get_name(rank, config)
        rpc.init_rpc(trainer_name,
                     rank=rank,
                     world_size=world_size,
                     rpc_backend_options=rpc_backend_options)

    rpc.shutdown()
def run_worker(rank, world_size, n_episode, batch, print_log=True):
    r"""
    This is the entry point for all processes. Rank 0 is the agent.
    All other ranks are observers.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    if rank == 0:
        # rank0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        agent = Agent(world_size, batch)
        for i_episode in range(n_episode):
            last_reward, running_reward = agent.run_episode(n_steps=NUM_STEPS)

            if print_log:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                    i_episode, last_reward, running_reward))
    else:
        # other ranks are the observers, passively waiting for instructions from the agent
        rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)

    rpc.shutdown()
def rpc_worker(rank, world_size, init_file, func, *args):
    if torch_version() == (1, 8, 0):
        if torch.cuda.is_available():
            # Workaround for https://github.com/pytorch/pytorch/issues/53844
            options = rpc.TensorPipeRpcBackendOptions(
                init_method="file://" + init_file, _transports=["ibv", "uv"]
            )
        else:
            # Workaround for https://github.com/pytorch/pytorch/issues/54266
            options = rpc.TensorPipeRpcBackendOptions(
                init_method="file://" + init_file,
                _channels=["mpt_uv", "basic", "cuda_ipc", "cuda_gdr", "cuda_xth", "cuda_basic"],
            )
    else:
        options = rpc.TensorPipeRpcBackendOptions(init_method="file://" + init_file)

    rpc.init_rpc(
        "worker" + str(rank),
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.TENSORPIPE,
        rpc_backend_options=options,
    )

    if rank == 0:
        func(*args)

    rpc.shutdown()
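# Hedged usage sketch for rpc_worker above: the file-based init_method needs a path every
# process can reach, and a temporary file is one common way to provide it on a single
# machine. The worker count and the _dummy_task helper below are illustrative assumptions.
import tempfile
import torch.multiprocessing as mp

def _dummy_task(msg):
    # Runs only on rank 0 once the RPC group is up.
    print("rank 0 runs:", msg)

if __name__ == "__main__":
    world_size = 2
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        init_file = tmp.name  # empty file used as the rendezvous store
    mp.spawn(rpc_worker, args=(world_size, init_file, _dummy_task, "hello"),
             nprocs=world_size, join=True)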
def new_test_method(self, *arg, **kwargs):
    # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
    # in tests.
    import torch.distributed.rpc.api as api

    api._ignore_rref_leak = False
    self.worker_id = self.rank

    if setup_rpc:
        global _ALL_NODE_NAMES
        _ALL_NODE_NAMES = {
            "worker{}".format(rank) for rank in range(self.world_size)
        }

        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

    return_value = old_test_method(self, *arg, **kwargs)

    if setup_rpc:
        rpc.shutdown(graceful=clean_shutdown)

    return return_value
def run_rpc(rank):
    argv = {
        'world_size': int(2),
        'rank': int(0),
        'host': "worker0",
        'worker': "worker1",
        'epochs': int(5),
        'lr': float(1e-3),
        'root': 'data',
        'batch_size': int(32)
    }
    if rank == 0:
        print(argv)
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '29502'
        rpc.init_rpc(argv['host'], rank=rank, world_size=argv['world_size'])
        print('Start Run:', rank)
        run(argv)
        rpc.shutdown()
    else:
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '29502'
        rpc.init_rpc(argv['worker'], rank=rank, world_size=argv['world_size'])
        print('Start Run:', rank)
        rpc.shutdown()
def start(args):
    # num_worker_threads is a backend option rather than an init_rpc() argument,
    # so it is passed through rpc_backend_options. This assumes the default
    # TensorPipe backend.
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=8)
    rpc.init_rpc(
        args.name,
        rank=args.rank,
        world_size=args.world_size,
        rpc_backend_options=options,
    )
    rpc.shutdown()
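# A hypothetical command-line front end for start() above: the snippet only shows that it
# expects args.name, args.rank, and args.world_size, so this argparse wiring and the
# MASTER_ADDR/MASTER_PORT defaults are assumptions for illustration.
import argparse
import os

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Join an RPC group and wait for work.")
    parser.add_argument("--name", required=True, help="unique RPC worker name")
    parser.add_argument("--rank", type=int, required=True)
    parser.add_argument("--world_size", type=int, required=True)
    cli_args = parser.parse_args()

    # init_rpc reads the rendezvous address from the environment by default.
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    start(cli_args)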
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = '172.31.13.136'
    # os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'

    # options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=32)

    if rank == 0:
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            # rpc_backend_options=options
        )
        run_master(num_split)
    else:
        rpc.init_rpc(
            f"worker{rank}",
            rank=rank,
            world_size=world_size,
            # rpc_backend_options=options
        )
        print("slave init")

    # block until all rpcs finish
    rpc.shutdown()
def test_reinit(self):
    rpc.init_rpc(
        name="worker{}".format(self.rank),
        backend=self.rpc_backend,
        rank=self.rank,
        world_size=self.world_size,
        rpc_backend_options=self.rpc_backend_options,
    )

    # This is for the below `dist.barrier`.
    # For `RpcAgent` other than `ProcessGroupAgent`,
    # no `_default_pg` is initialized.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=self.world_size,
        )
    # Wait for all init to complete.
    dist.barrier()

    with self.assertRaisesRegex(RuntimeError, "is already initialized"):
        rpc.init_rpc(
            name="worker{}".format(self.rank),
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )
    rpc.shutdown()
def run_driver(rank, world_size, gpu_list, dataset, batch_size, lr, mom, lambd,
               max_epoch, client_epoch, model, seed, q, early_stop_round,
               early_stop_metric):
    exp_id = str(int(time.time()))
    print(f"Driver initializing RPC, rank {rank}, world size {world_size}")
    rpc.init_rpc(name="driver", rank=rank, world_size=world_size)
    print("Initialized driver")

    param_server_rref = rpc.remote(
        "parameter_server", get_parameter_server,
        args=(gpu_list[0], world_size - 1, dataset, batch_size, lr, mom, lambd,
              model, max_epoch, client_epoch, seed, exp_id, early_stop_round,
              early_stop_metric))

    for _rank in range(1, world_size - 1):
        print(f"Driver registering worker node {_rank}")
        worker_server_rref = rpc.remote(
            f"trainer_{_rank}", get_worker,
            args=(gpu_list[_rank], _rank, world_size - 1, dataset, model,
                  batch_size, lr, seed, exp_id))
        print(f"Driver binding worker {_rank} with param server")
        remote_method(ParameterServer.embedding_workers, param_server_rref,
                      worker_server_rref)
        remote_method(TrainerNet.embedding_param_server, worker_server_rref,
                      param_server_rref)

    fut = remote_method_async(ParameterServer.instruct_training, param_server_rref)
    fut.wait()

    final_accuracy = remote_method(ParameterServer.get_final_accuract, param_server_rref)
    q.put(final_accuracy)

    rpc.shutdown()
    print("RPC shutdown on Driver")
def test_shutdown(self):
    # Initialize RPC.
    rpc.init_rpc(
        name="worker%d" % self.rank,
        backend=self.rpc_backend,
        rank=self.rank,
        world_size=self.world_size,
        rpc_backend_options=self.rpc_backend_options,
    )
    n = self.rank + 1
    dst_rank = n % self.world_size
    ret = rpc.rpc_sync(
        "worker{}".format(dst_rank),
        torch.add,
        args=(torch.ones(n, n), torch.ones(n, n)),
    )
    self.assertEqual(ret, torch.ones(n, n) * 2)
    rpc.shutdown()

    with self.assertRaisesRegex(RuntimeError, "^RPC has not been initialized"):
        rpc.rpc_sync(
            "worker{}".format(dst_rank),
            torch.add,
            args=(torch.ones(n, n), torch.ones(n, n)),
        )

    # it's safe to call shutdown() multiple times
    rpc.shutdown()
def run_worker(rank, world_size, num_split):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256, rpc_timeout=600)

    import psutil
    p = psutil.Process()

    if rank == 0:
        p.cpu_affinity([rank])
        rpc.init_rpc(
            "master",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_master(num_split)
    else:
        p.cpu_affinity([rank])
        rpc.init_rpc(
            f"worker{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )

    # block until all rpcs finish
    rpc.shutdown()
def rpc_optimizer():
    init_rpc()
    if torch.distributed.get_rank() != 0:
        rpc.shutdown()
        torch.distributed.barrier()
        return

    def model_with_reuse():
        reused_1 = nn.Linear(10, 10)
        return [reused_1, nn.ReLU(), reused_1, nn.ReLU(), reused_1, nn.ReLU()]

    check_pipe_against_reference(
        [2, 2, 2],
        lambda: [
            nn.Linear(10, 10), nn.ReLU(),
            nn.Linear(10, 10), nn.ReLU(),
            nn.Linear(10, 10), nn.ReLU()
        ],
    )
    check_pipe_against_reference([2, 1, 1], model_with_reuse)

    rpc.shutdown()
    torch.distributed.barrier()
def test_local_shutdown_with_rpc(self):
    # test that we can start RPC, send RPCs, and then run local shutdown.
    rpc.init_rpc(
        name="worker%d" % self.rank,
        backend=rpc.backend_registry.BackendType[
            dist_utils.TEST_CONFIG.rpc_backend_name
        ],
        rank=self.rank,
        world_size=self.world_size,
        rpc_backend_options=self.rpc_backend_options,
    )
    n = self.rank + 1
    dst_rank = n % self.world_size
    rpc.rpc_sync(
        "worker{}".format(dst_rank),
        torch.add,
        args=(torch.ones(n, n), torch.ones(n, n)),
    )
    # A barrier is needed to ensure that all RPCs are processed.
    # Otherwise, some RPCs can timeout since the receiving end
    # has terminated.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=self.world_size,
        )
    dist.barrier()
    # pass in graceful=False to ensure that we don't wait for other workers.
    rpc.shutdown(graceful=False)
def test_init_pg_and_rpc_with_same_socket(self):
    addr = DEFAULT_HOSTNAME
    port = common.find_free_port()

    os.environ["MASTER_ADDR"] = addr
    os.environ["MASTER_PORT"] = str(port)

    # We internally use a multi-tenant TCP store. Both PG and RPC should successfully
    # initialize even when using the same socket address.

    dist.init_process_group(
        backend="gloo",
        init_method="env://",
        rank=0,
        world_size=1,
    )

    backend_opts = rpc.TensorPipeRpcBackendOptions(
        init_method=f"tcp://{addr}:{port}"
    )
    rpc.init_rpc(
        name="worker0",
        rank=0,
        world_size=1,
        rpc_backend_options=backend_opts,
    )

    rpc.shutdown()
def new_test_method(self, *arg, **kwargs):
    # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
    # in tests.
    import torch.distributed.rpc.api as api

    api._ignore_rref_leak = False
    self.worker_id = self.rank

    if (rpc.backend_registry.backend_registered("FAULTY_PROCESS_GROUP")
            and self.rpc_backend == rpc.backend_registry.BackendType.FAULTY_PROCESS_GROUP):
        _build_faulty_backend_options(self, faulty_messages, messages_to_delay)

    if (rpc.backend_registry.backend_registered("TENSORPIPE")
            and self.rpc_backend == rpc.backend_registry.BackendType.TENSORPIPE):
        TEST_CONFIG.rpc_backend_name = "TENSORPIPE"
        _build_tensorpipe_backend_options()

    if setup_rpc:
        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=self.rpc_backend_options,
        )

    return_value = old_test_method(self, *arg, **kwargs)

    if setup_rpc:
        rpc.shutdown(graceful=clean_shutdown)

    return return_value
def benchmark_multiprocess(rank, world_size, args):
    init_method_pgroup = "tcp://localhost:{}".format(MPI_PORT)
    # TODO(anj-s): Add regression benchmarks for nccl as well.
    torch.distributed.init_process_group(
        backend="gloo", rank=rank, world_size=world_size, init_method=init_method_pgroup
    )

    torch.cuda.set_device(rank % torch.cuda.device_count())
    # TODO(anj-s): Move to TensorPipeRpcBackendOptions.
    rpc.init_rpc(
        f"Test{rank}",
        rank=rank,
        world_size=world_size,
        backend=rpc.BackendType.PROCESS_GROUP,
        rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
            rpc_timeout=20, init_method="tcp://localhost:{}".format(RPC_PORT)
        ),
    )
    initialize_model_parallel(1, world_size)
    init_random_seed(0)

    run_mp_worker(args, world_size)

    rpc.shutdown()
    torch.distributed.destroy_process_group()
def run_worker(rank, world_size, num_split):
    # os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_ADDR'] = '172.10.0.2'
    # os.environ['MASTER_PORT'] = '29500'
    os.environ['MASTER_PORT'] = '12345'
    options = rpc.TensorPipeRpcBackendOptions(num_worker_threads=256)

    if rank == 0:
        print("Init master")
        rpc.init_rpc("master", rank=rank, world_size=world_size,
                     rpc_backend_options=options)
        print(rank)
        run_master(num_split)
    else:
        print("init worker rank ", rank)
        rpc.init_rpc(f"worker{rank}", rank=rank, world_size=world_size,
                     rpc_backend_options=options)

    """
    # source: https://pytorch.org/tutorials/intermediate/dist_tuto.html
    dist.init_process_group(
        init_method='tcp://10.1.1.20:23456', rank=args.rank, world_size=4)
    """

    # block until all rpcs finish
    rpc.shutdown()
def run(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    options = rpc.ProcessGroupRpcBackendOptions(
        num_send_recv_threads=16,
        rpc_timeout=0  # infinite timeout
    )
    if rank != 0:
        rpc.init_rpc(
            f"trainer{rank}",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        # trainer passively waiting for ps to kick off training iterations
    else:
        rpc.init_rpc(
            "ps",
            rank=rank,
            world_size=world_size,
            rpc_backend_options=options
        )
        run_ps([f"trainer{r}" for r in range(1, world_size)])

    # block until all rpcs finish
    rpc.shutdown()
def new_test_method(self, *arg, **kwargs):
    # Setting _ignore_rref_leak to make sure OwnerRRefs are properly deleted
    # in tests.
    import torch.distributed.rpc.api as api

    api._ignore_rref_leak = False
    self.worker_id = self.rank
    self.setup_fault_injection(faulty_messages, messages_to_delay)

    rpc_backend_options = self.rpc_backend_options

    if setup_rpc:
        if TEST_WITH_TSAN:
            # TSAN runs much slower.
            rpc_backend_options.rpc_timeout = rpc.constants.DEFAULT_RPC_TIMEOUT_SEC * 5
            rpc.constants.DEFAULT_SHUTDOWN_TIMEOUT = 60

        rpc.init_rpc(
            name="worker%d" % self.rank,
            backend=self.rpc_backend,
            rank=self.rank,
            world_size=self.world_size,
            rpc_backend_options=rpc_backend_options,
        )

    return_value = old_test_method(self, *arg, **kwargs)

    if setup_rpc:
        rpc.shutdown(graceful=clean_shutdown)

    return return_value
def run_worker(rank, world_size):
    r"""
    This is the entry point for all processes. Rank 0 is the agent.
    All other ranks are observers.
    """
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '29500'
    if rank == 0:
        # rank0 is the agent
        rpc.init_rpc(AGENT_NAME, rank=rank, world_size=world_size)

        agent = Agent(world_size)
        for i_episode in count(1):
            n_steps = int(TOTAL_EPISODE_STEP / (args.world_size - 1))
            # n_steps = int(TOTAL_EPISODE_STEP)
            agent.run_episode(n_steps=n_steps)
            last_reward = agent.finish_episode()

            if i_episode % args.log_interval == 0:
                print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                    i_episode, last_reward, agent.running_reward))
            if agent.running_reward > agent.reward_threshold:
                print("Solved! Running reward is now {}!".format(agent.running_reward))
                break
    else:
        # other ranks are the observers, passively waiting for instructions from the agent
        rpc.init_rpc(OBSERVER_NAME.format(rank), rank=rank, world_size=world_size)

    rpc.shutdown()
def _test_rref_leak(self, ignore_leak=False):
    rpc.init_rpc(
        name="worker{}".format(self.rank),
        backend=self.rpc_backend,
        rank=self.rank,
        world_size=self.world_size,
        rpc_backend_options=self.rpc_backend_options,
    )

    # This is for the below `dist.barrier`.
    # For `RpcAgent` other than `ProcessGroupAgent`,
    # no `_default_pg` is initialized.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=self.world_size,
        )
    # Wait for all init to complete.
    dist.barrier()

    rref = rpc.remote(
        "worker{}".format((self.rank + 1) % self.world_size),
        torch.add,
        args=(torch.ones(2, 2), 1),
    )

    if ignore_leak:
        import torch.distributed.rpc.api as api
        api._ignore_rref_leak = True

    rpc.shutdown()
def destroy_comms(self, destroy_rpc=True):
    # Wait for all ranks to reach here before starting shutdown.
    dist.barrier()

    if destroy_rpc:
        rpc.shutdown()
    dist.destroy_process_group()
def run(i):
    global rank
    rank = i
    if i == world_size - 1:
        time.sleep(4)
        print("Process {} delayed start.".format(i))
    rpc.init_rpc("Rank" + str(i),
                 rank=i,
                 world_size=world_size,
                 rpc_backend_options=rpc.ProcessGroupRpcBackendOptions(
                     init_method="env://",
                     rpc_timeout=rpc.timedelta(seconds=2),
                     num_send_recv_threads=4))
    if i == 0:
        time.sleep(2)
        print("Process 0 exit.")
        exit(-1)

    t = time.time()
    reqs = []
    for j in range(messages):
        for r in range(world_size):
            reqs.append(rpc.rpc_async("Rank{}".format(r), test, args=()))
        for req, idx in zip(reqs, range(world_size)):
            try:
                print("{} Received from {} : {}".format(rank, idx, req.wait()))
            except RuntimeError:
                print("An error occurred while {} receiving results from {}".format(rank, idx))
        reqs.clear()
    print(time.time() - t)
    rpc.shutdown(graceful=False)