def worker_0(queue, msg):
    """Join RPC as ``"worker"``, call ``echo`` on ``"worker1"`` and report the reply.

    Args:
        queue: Object with a ``put`` method; receives the value returned by
            the synchronous RPC.
        msg: Payload forwarded as the only positional argument to ``echo``.
    """
    init_rpc("worker", BackendType.PROCESS_GROUP)
    # The reply is handed to the queue before RPC is torn down.
    queue.put(rpc.rpc_sync("worker1", echo, args=(msg,)))
    rpc.shutdown()
def test_py_class_constructor(self):
    """Calling the ``MyClass`` constructor over RPC yields an object whose ``a`` equals the argument."""
    expected = self.rank + 1
    target = "worker{}".format(expected % self.world_size)
    instance = rpc.rpc_sync(target, MyClass, args=(expected,))
    self.assertEqual(instance.a, expected)
def test_py_function_exception(self):
    """Calling ``no_result`` remotely with an argument must surface an error mentioning ``TypeError``."""
    next_rank = (self.rank + 1) % self.world_size
    with self.assertRaisesRegex(Exception, "TypeError"):
        # The return value is irrelevant; the call itself is expected to raise.
        rpc.rpc_sync("worker{}".format(next_rank), no_result, args=(10,))
def rpc_master(msg):
    """Join RPC as ``"master"``, run ``_echo(msg)`` on ``"worker"`` and format the reply.

    Args:
        msg: Payload forwarded to ``_echo`` on the remote worker.

    Returns:
        The remote result rendered as ``"<result> from worker"``.
    """
    init_rpc("master", BackendType.TENSORPIPE)
    ret = rpc.rpc_sync("worker", _echo, args=(msg,))
    # RPC is shut down before the reply is formatted and returned.
    rpc.shutdown()
    return f"{ret} from worker"
def test_expected_src(self):
    """Send this rank to the next worker and expect the previous rank's value locally.

    NOTE(review): presumably ``set_value`` completes ``VALUE_FUTURE`` on the
    callee; that helper is defined outside this block.
    """
    world = self.world_size
    successor = "worker{}".format((self.rank + 1) % world)
    predecessor_rank = (self.rank - 1) % world

    rpc.rpc_sync(successor, set_value, args=(self.rank,))
    self.assertEqual(VALUE_FUTURE.result(), predecessor_rank)
def test_ddp_dist_autograd_local_vs_remote_gpu(self):
    """Check that a local CPU/GPU pipeline and its RemoteModule twin produce equal grads.

    Layers 1 and 3 exist twice (a local ``nn.Linear`` and a ``RemoteModule``
    on ``worker0/cpu`` sharing the same weight); layers 2 and 4 are
    DDP-wrapped GPU layers used by both pipelines.
    """
    rank = self.rank

    # Each trainer uses a different random seed. Otherwise, they are going
    # to have exactly the same initial model parameters, input, and
    # therefore grads. That means the grads will be the same before and
    # after DDP's all-reduce.
    torch.manual_seed(rank)
    dist.init_process_group(
        backend="gloo",
        init_method=INIT_METHOD_TEMPLATE.format(file_name=self.file_name),
        world_size=self.world_size,
        rank=rank,
    )

    def make_cpu_pair(in_features, out_features):
        # Remote module first, then the local one, which starts with the
        # same parameters as the remote.
        remote = RemoteModule(
            remote_device="worker0/cpu",
            module_cls=nn.Linear,
            args=(in_features, out_features, False),
        )
        local = nn.Linear(in_features, out_features, False)
        local.weight = remote.module_rref.to_here().weight
        return remote, local

    def make_ddp(in_features, out_features):
        layer = nn.Linear(in_features, out_features).cuda(rank)
        return layer, DistributedDataParallel(layer, device_ids=[rank])

    # Construction order is kept (1, 2, 3, 4) so RNG consumption is unchanged.
    remote_layer1, layer1 = make_cpu_pair(10, 7)
    layer2, ddp_layer2 = make_ddp(7, 5)
    remote_layer3, layer3 = make_cpu_pair(5, 3)
    layer4, ddp_layer4 = make_ddp(3, 1)

    inputs = torch.rand((10, 10))

    def run_pipeline(first_cpu, second_cpu):
        hidden = first_cpu(inputs).cuda(rank)
        hidden = ddp_layer2(hidden).cpu()
        hidden = second_cpu(hidden).cuda(rank)
        return ddp_layer4(hidden).sum()

    def fetch_remote_grads(remote_module, ctx_id):
        return rpc.rpc_sync(
            "worker0",
            DdpComparisonTest.get_remote_grads,
            args=(remote_module.module_rref, ctx_id),
        )

    # Run local case.
    loss = run_pipeline(layer1, layer3)
    loss.backward()

    # Run remote case.
    with dist_autograd.context() as context_id:
        loss = run_pipeline(remote_layer1, remote_layer3)
        dist_autograd.backward(context_id, [loss])
        grads_dict = dist_autograd.get_gradients(context_id)
        dist.barrier()

        self.assertEqual(
            layer1.weight.grad,
            fetch_remote_grads(remote_layer1, context_id),
        )
        self.assertEqual(layer2.weight.grad, grads_dict[layer2.weight])
        self.assertEqual(
            layer3.weight.grad,
            fetch_remote_grads(remote_layer3, context_id),
        )
        self.assertEqual(layer4.weight.grad, grads_dict[layer4.weight])
def test_async_function_wrong_return_type(self):
    """Calling ``async_wrong_type`` on the next worker must raise the "Expected Future" error."""
    next_rank = (self.rank + 1) % self.world_size
    with self.assertRaisesRegex(RuntimeError, "Expected Future but got Tensor"):
        rpc.rpc_sync(worker_name(next_rank), async_wrong_type)
def _test_graph(self, fn, exec_mode):
    """Run ``fn(t1, t2)`` on the next worker and verify the recorded autograd graphs.

    Args:
        fn: Callable executed remotely with two 3x3 tensors.
        exec_mode: ``ExecMode.RPC_SYNC`` or ``ExecMode.REMOTE``; any other
            value raises ``ValueError``.
    """
    dst = "worker{}".format((self.rank + 1) % self.world_size)

    # This is for the below `dist.barrier`.
    # For `RpcAgent` other than `ProcessGroupAgent`,
    # no `_default_pg` is initialized.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=self.world_size,
        )

    with dist_autograd.context() as context_id:
        t1 = torch.ones(3, 3, requires_grad=True)
        t2 = torch.zeros(3, 3, requires_grad=True)
        operands = (t1, t2)

        if exec_mode == ExecMode.RPC_SYNC:
            ret = rpc.rpc_sync(dst, fn, args=operands)
        elif exec_mode == ExecMode.REMOTE:
            ret = rpc.remote(dst, fn, args=operands).to_here().wait()
        else:
            raise ValueError("Unrecognized ExecMode {}".format(exec_mode))

        rpc.rpc_sync(dst, _set_rpc_done, args=(context_id, 1))

        # Verify graph for current context id.
        current_ctx = dist_autograd._current_context()
        self.assertEqual(context_id, current_ctx._context_id())
        sends = current_ctx._send_functions()
        self.assertEqual(1, len(sends))
        recvs = current_ctx._recv_functions()
        self.assertEqual(1, len(recvs))
        self._verify_graph_for_first_rpc_call(
            next(iter(sends.values())),
            next(iter(recvs.values())),
            t1,
            t2,
            ret,
        )

        # Wait for the prev rank to be done with rpc.
        self._check_rpc_done(1)

        # Verify graph for previous context id.
        prev_ctx = dist_autograd._retrieve_context(ctx_ids[1])
        prev_sends = prev_ctx._send_functions()
        self.assertEqual(1, len(prev_sends))
        self._verify_graph_for_rpc_call_exec(next(iter(prev_sends.values())))

        # this barrier is needed so one worker does not clean up their
        # autograd context before another worker tries to access it.
        dist.barrier()

    # autograd context should be cleaned up by now.
    with self.assertRaises(RuntimeError):
        dist_autograd._retrieve_context(context_id)

    # No autograd context available.
    with self.assertRaises(RuntimeError):
        dist_autograd._current_context()
def _test_graph_for_py_nested_call_itself(self, exec_mode):
    """Issue ``my_py_nested_call`` to the next worker and verify both resulting graphs.

    Args:
        exec_mode: ``ExecMode.RPC_SYNC`` or ``ExecMode.REMOTE``; any other
            value raises ``ValueError``.
    """
    world = self.world_size
    dst = "worker{}".format((self.rank + 1) % world)

    # This is for the below `dist.barrier`.
    # For `RpcAgent` other than `ProcessGroupAgent`,
    # no `_default_pg` is initialized.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=world,
        )

    with dist_autograd.context() as context_id:
        t1 = torch.ones(3, 3, requires_grad=True)
        t2 = torch.zeros(3, 3, requires_grad=True)
        nested_args = (t1, t2, (self.rank - 1 + world) % world, world, 0)

        if exec_mode == ExecMode.RPC_SYNC:
            ret = rpc.rpc_sync(dst, my_py_nested_call, args=nested_args)
        elif exec_mode == ExecMode.REMOTE:
            ret = rpc.remote(dst, my_py_nested_call, args=nested_args).to_here().wait()
        else:
            raise ValueError("Unrecognized ExecMode {}".format(exec_mode))

        rpc.rpc_sync(dst, _set_rpc_done, args=(context_id, 1))

        # For self.rank, it has 2 graphs to verify.
        # One is for current context id when this rank send first rpc
        # call and execute the torch.add() operator.
        # Another one is for prev context id when this rank make
        # nested call.
        current_ctx = dist_autograd._current_context()
        self.assertEqual(context_id, current_ctx._context_id())
        sends = current_ctx._send_functions()
        self.assertEqual(2, len(sends))
        recvs = current_ctx._recv_functions()
        self.assertEqual(2, len(recvs))

        send_list = list(sends.values())
        recv_list = list(recvs.values())
        self._verify_graph_for_first_rpc_call(send_list[0], recv_list[1], t1, t2, ret)
        self._verify_graph_for_rpc_call_exec(send_list[1])

        # Verify two pairs of send and recv functions for nested
        # call
        self._check_rpc_done(1)
        prev_ctx = dist_autograd._retrieve_context(ctx_ids[1])
        self._verify_graph_for_nested_rpc_call(prev_ctx)

        # this barrier is needed so one worker does not clean up their
        # autograd context before another worker tries to access it.
        dist.barrier()
def __init__(
    self,
    remote_device: str,
    module_cls: nn.Module,
    args: Tuple = None,
    kwargs: Dict[str, Any] = None,
    _module_interface_cls: Any = None,
):
    """
    A RemoteModule instance can only be created after RPC initialization.
    It creates a user-specified module on a specified remote node.
    It behaves like a regular ``nn.Module`` except that the ``forward`` method is
    executed on the remote node.
    It takes care of autograd recording to ensure the backward pass propagates
    gradients back to the corresponding remote module.

    The arguments of ``forward_async`` and ``forward`` are the same as
    the ``forward`` method of the module returned by the ``module_cls``.

    Apart from ``forward_async`` and ``forward``, no other methods are supported
    from nn.Module for now.

    Particularly, to create a hybrid model, typically the local modules should be
    created outside of remote modules, rather than as submodules of any remote
    module (by calling ``add_module``).
    Hybrid Example:
        >>> class HybridModel(nn.Module):
        >>>     def __init__(self):
        >>>         nn.Module.__init__(self)
        >>>         self.remote_embedding = RemoteModule(...)
        >>>         self.local_linear = nn.Linear(...)

    For example, if ``module_cls`` returns an instance of ``nn.Linear``,
    that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
    the generated ``RemoteModule`` will have 2 methods in signature of
    ``def forward(input: Tensor) -> Tensor:`` and
    ``def forward_async(input: Tensor) -> Future[Tensor]:``.

    .. note::
        If the remote module is placed on a cuda device,
        any input CPU tensors will be automatically moved to the same cuda device,
        and GPU tensors are returned over the wire according to the device map of
        the remote worker on TensorPipe RPC backend.

    Args:
        remote_device (str): Device on the destination worker where we'd like to
            place this module. The format should be "<workername>/<device>", where
            the device field can be parsed as torch.device type.
            E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0".
            In addition, the device field can be optional and the default value
            is "cpu".
        module_cls (nn.Module): For example,
            >>> class MyModule(nn.Module):
            >>>     def forward(input):
            >>>         return input + 1
            >>>
            >>> module_cls = MyModule
        args (Sequence, optional): args to be passed to ``module_cls``.
        kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
        _module_interface_cls (type, optional): The TorchScript interface type for
            the module to be created. The type object should be decorated by
            @torch.jit.interface. If not provided, the generated RemoteModule is
            not torchscript-able. Warning, this is an experimental API and
            susceptible to frequent changes.

    Returns:
        A remote module instance which wraps the :class:`~nn.Module` created by the
        user-provided ``module_cls``, it has a blocking ``forward`` method and an
        asynchronous ``forward_async`` method that returns a future of the
        ``forward`` call on the user-provided module on the remote side.

    Example::
        Run the following code in two different processes:

        >>> # On worker 0:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>> from torch import nn, Tensor
        >>> from torch.distributed.nn.api.remote_module import RemoteModule
        >>>
        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
        >>> remote_linear_module = RemoteModule(
        >>>     "worker1/cpu", nn.Linear, args=(20, 30),
        >>> )
        >>> input = torch.randn(128, 20)
        >>> ret_fut = remote_linear_module.forward_async(input)
        >>> ret = ret_fut.wait()
        >>> rpc.shutdown()

        >>> # On worker 1:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>>
        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
        >>> rpc.shutdown()
    """
    super().__init__()

    # Sanity checks.
    assert rpc._is_current_rpc_agent_set(
    ), "RemoteModule only works in RPC."

    # Default arguments preparation.
    args = args if args is not None else ()
    kwargs = kwargs if kwargs is not None else {}

    # Split "<workername>/<device>" into the destination worker and device.
    self.on, self.device = _parse_remote_device(remote_device)
    agent = rpc._get_current_rpc_agent()
    # If the device map of the remote worker is set,
    # then enable moving any input CPU tensors to the same cuda device.
    self.is_device_map_set = bool(
        agent._get_device_map(agent.get_worker_info(self.on)))
    # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
    # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
    # then any CPU tensors can still be moved to a cuda device to run forward,
    # but the output must be moved back to CPU before being sent over the wire.
    enable_moving_cpu_tensors_to_cuda = torch.device(
        self.device).type == "cuda"

    if _module_interface_cls is not None:
        # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
        self.is_scriptable = True

        # Instantiate template on remote side.
        # Issued asynchronously so it overlaps with the local instantiation below.
        fut = rpc.rpc_async(
            self.on,
            _instantiate_template,
            (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
        )

        # Instantiate template on local side.
        generated_module = (
            instantiator.instantiate_scriptable_remote_module_template(
                _module_interface_cls, enable_moving_cpu_tensors_to_cuda))
        generated_methods = generated_module._generated_methods

        # Create the module on the remote side.
        fut.wait()  # Ensure remote_module_cls is available on remote side.
    else:
        self.is_scriptable = False
        generated_methods = _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods

    # Create the module on the remote side.
    self.module_rref = rpc.rpc_sync(
        self.on,
        _create_module,
        (module_cls, args, kwargs, self.device, _module_interface_cls),
    )

    # Install generated methods.
    # Each generated function is exported for TorchScript and bound to this instance.
    for method in generated_methods:
        method_name = method.__name__
        method = torch.jit.export(method)
        setattr(self, method_name, types.MethodType(method, self))
def run_worker(ps_rref, data_dir, batch_size, num_epochs, worker, job_name):
    """Train on CIFAR-10, exchanging gradients and models with a parameter server.

    After every batch the gradients are sent to the parameter server via
    ``ParameterServer.update_and_fetch_model``, which returns the model to
    use next and a stop flag. Progress is announced over an info socket
    ("WORKER", metadata, "START", "END"); the function returns early when
    the reply to "START" is not "CONFIRM".

    Args:
        ps_rref: RRef to the parameter server.
        data_dir: Root directory handed to ``datasets.CIFAR10``.
        batch_size: Batch size for the ``DataLoader``.
        num_epochs: Maximum number of epochs to run.
        worker: Worker name; its last character is parsed as the rank.
        job_name: Job identifier used in log file names.
    """
    worker_rank = int(worker[-1])

    # Register with the info server.
    info_socketm = znet.SocketMsger.tcp_connect(DORKER0_IP, INFO_PORT)
    info_socketm.send("WORKER")
    info_socketm.send(f"1.0\n/home/ubuntu/measurement/logs/{job_name}_info{worker_rank}.log\n{job_name}")

    logger = Logger(job_name=job_name, file_dir=f"./measurement/logs/{job_name}_{worker}.log").logger

    # Data pipeline.
    augmentations = [
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ]
    train_dataset = datasets.CIFAR10(
        root=data_dir,
        train=True,
        download=True,
        transform=transforms.Compose(augmentations),
    )
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    device_id = 0
    device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
    name = rpc.get_worker_info().name

    ps_rref.rpc_sync().set_ps_launched_to_true()
    model = ps_rref.rpc_sync().get_model().to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    stop_flag = False

    info_socketm.send("START")
    if info_socketm.recv() != "CONFIRM":
        return

    cm_t1_end = time.time()
    started_at = time.time()
    for epoch in range(num_epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data = data.to(device)
            target = target.to(device)
            loss = criterion(model(data), target)
            loss.backward()

            # Computation time is measured from the end of the previous exchange.
            cm_t0_start = time.time()
            cp_t = 1000 * (cm_t0_start - cm_t1_end)
            logger.info("{:8s} | Epoch: {:3d} | Batch: {:3d} | Loss: {:6.2f} | Computation Time: {:7.2f} ms"
                        .format(name, (epoch + 1), (batch_idx + 1), loss.item(), cp_t))

            # The model is moved to CPU before its gradients are shipped.
            model, stop_flag = rpc.rpc_sync(
                to=ps_rref.owner(),
                func=ParameterServer.update_and_fetch_model,
                args=(
                    ps_rref,
                    [p.grad for p in model.cpu().parameters()],
                    name,
                    epoch,
                    batch_idx,
                    cm_t0_start,
                    cm_t1_end,
                ),
            )
            model.to(device)
            cm_t1_end = time.time()

            if stop_flag:
                break
        # Propagate the stop request out of the epoch loop as well.
        if stop_flag:
            break
    finished_at = time.time()

    info_socketm.send("END")
    logger.info("Time: {:.2f} seconds".format((finished_at - started_at)))
def _run_trainer(emb_rref_list, rank):
    r"""
    Train a ``HybridModel`` and time every epoch.

    The forward pass goes through the model built from ``emb_rref_list``;
    the backward pass and the optimizer step run inside a distributed
    autograd context, using a ``DistributedOptimizer`` over both the
    embedding parameters fetched from workers ``ps<i>`` (one per entry of
    ``emb_rref_list``) and the local model parameters.

    Returns:
        Tuple ``(rank, measurements, batch_size)`` where ``measurements``
        holds per-epoch wall-clock durations with the first
        ``WARMUP_CYCLES`` entries dropped, and ``batch_size`` is the number
        of targets processed in the last epoch.
    """
    # Setup the model.
    model = HybridModel(emb_rref_list, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters from all embedding tables for the current trainer.
    model_parameter_rrefs = []
    for ps_index, emb_rref in enumerate(emb_rref_list):
        remote_params = rpc.rpc_sync(
            "ps{}".format(ps_index),
            _retrieve_embedding_parameters,
            args=(emb_rref,),
        )
        model_parameter_rrefs.extend(remote_params)

    # model.parameters() only includes local parameters.
    model_parameter_rrefs.extend(RRef(param) for param in model.parameters())

    # Setup distributed optimizer
    opt = DistributedOptimizer(optim.SGD, model_parameter_rrefs, lr=0.05)

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            cursor = 0
            while cursor < num_indices:
                offsets.append(cursor)
                cursor += random.randint(1, 10)

            # One target per generated offset.
            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(len(offsets)).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    measurements = []
    # Include warm-up cycles during training
    for epoch in range(100 + WARMUP_CYCLES):
        start = time.time()
        batch_size = 0

        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            batch_size += len(target)

            with dist_autograd.context() as context_id:
                loss = criterion(model(indices, offsets), target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer. Gradients propagated all the way
                # to the parameter servers
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads

        measurements.append(time.time() - start)
        # print("Training done for epoch {}".format(epoch))

    # Throw away warm-up measurements
    return rank, measurements[WARMUP_CYCLES:], batch_size
def script_rpc_sync_call(
    dst_worker_name: str,
    args: Tuple[Tensor, Tensor],
    kwargs: Dict[str, Tensor],
):
    """Synchronously run ``two_args_two_kwargs(*args, **kwargs)`` on ``dst_worker_name`` and return its result."""
    return rpc.rpc_sync(dst_worker_name, two_args_two_kwargs, args, kwargs)
def test_rref_context_debug_info(self):
    """Track ``num_owner_rrefs`` in the RRef context debug info across three scenarios."""
    # This test checks local states that are modified by remote workers.
    # This means that we would need barrier before and after every check.
    # The barrier before the check makes sure that all previous states are
    # cleared globally, the barrier after ensures that no following states
    # change gets into the current check.
    if not dist.is_initialized():
        dist.init_process_group(
            backend="gloo",
            init_method=self.init_method,
            rank=self.rank,
            world_size=self.world_size,
        )

    from torch.distributed.rpc import _rref_context_get_debug_info

    def expect_owner_rrefs(count):
        info = _rref_context_get_debug_info()
        self.assertIn("num_owner_rrefs", info)
        self.assertEqual(count, int(info["num_owner_rrefs"]))

    # Check 1: local RRef does not update owners_ map
    #################################################

    rref1 = RRef(self.rank)

    # don't need a barrier here as local RRef is handled by this thread
    # RRef on local value is not added to context until shared across RPC
    expect_owner_rrefs(0)

    # barrier after the check 1
    dist.barrier()

    # Check 2: Sharing RRef as an arg should update owners_ map
    ###########################################################

    dst = "worker{}".format((self.rank + 1) % self.world_size)
    rpc.rpc_sync(dst, set_global_rref, args=(rref1,))

    # barrier before check 2
    dist.barrier()

    expect_owner_rrefs(1)

    # barrier after check 2
    dist.barrier()

    # clear states for check 2
    rpc.rpc_sync(dst, clear_global_rref)

    # Check 3: rpc.remote call should update owners_ map
    ####################################################
    rref2 = rpc.remote(dst, torch.add, args=(torch.ones(2, 2), 1))
    rref3 = rpc.remote(dst, torch.add, args=(torch.ones(2, 2), 1))
    rref2.to_here()
    rref3.to_here()

    # barrier before check 3
    dist.barrier()

    expect_owner_rrefs(2)

    # barrier after check 3
    dist.barrier()
def remote_method(method, rref, *args, **kwargs):
    """Invoke ``call_method`` on the owner of ``rref`` and return its result.

    ``call_method`` receives ``method`` and ``rref`` followed by the extra
    positional arguments; keyword arguments are forwarded unchanged.
    """
    call_args = [method, rref, *args]
    return rpc.rpc_sync(rref.owner(), call_method, args=call_args, kwargs=kwargs)
def my_nested_rref_add(dst, rref_t1, t2):
    """Forward ``(rref_t1, t2)`` to ``my_rref_add`` on ``dst`` and return the result."""
    forwarded = (rref_t1, t2)
    return rpc.rpc_sync(dst, my_rref_add, args=forwarded)
def __init__(
    self,
    on: str,
    module_cls: nn.Module,
    args: Tuple = None,
    kwargs: Dict[str, Any] = None,
    _module_interface_cls: Any = None,
):
    """
    A RemoteModule instance can only be created after RPC initialization.
    It creates a user-specified module on a specified remote node.
    It behaves like a regular ``nn.Module`` except that the ``forward`` method is
    executed on the remote node.
    It takes care of autograd recording to ensure the backward pass propagates
    gradients back to the corresponding remote module.

    The arguments of ``forward_async`` and ``forward`` are the same as
    the ``forward`` method of the module returned by the ``module_cls``.

    For example, if ``module_cls`` returns an instance of ``nn.Linear``,
    that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
    the generated ``RemoteModule`` will have 2 methods in signature of
    ``def forward(input: Tensor) -> Tensor:`` and
    ``def forward_async(input: Tensor) -> Future[Tensor]:``.

    Arguments:
        on (str or WorkerInfo): id or name of the destination worker.
        module_cls (nn.Module): For example,
            >>> class MyModule(nn.Module):
            >>>     def forward(input):
            >>>         return input + 1
            >>>
            >>> module_cls = MyModule
        args (Sequence, optional): args to be passed to ``module_cls``.
        kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
        _module_interface_cls (type, optional): The TorchScript interface type for
            the module to be created. The type object should be decorated by
            @torch.jit.interface. If not provided, the generated RemoteModule is
            not torchscript-able. Warning, this is an experimental API and
            susceptible to frequent changes.

    Returns:
        A remote module instance which wraps the :class:`~nn.Module` created by the
        user-provided ``module_cls``, it has a blocking ``forward`` method and an
        asynchronous ``forward_async`` method that returns a future of the
        ``forward`` call on the user-provided module on the remote side.

    Example::
        Run the following code in two different processes:

        >>> # On worker 0:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>> from torch import nn, Tensor
        >>> from torch.distributed.nn.api.remote_module import RemoteModule
        >>>
        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
        >>> remote_linear_module = RemoteModule(
        >>>     "worker1", nn.Linear, args=(20, 30),
        >>> )
        >>> input = torch.randn(128, 20)
        >>> ret_fut = remote_linear_module.forward_async(input)
        >>> ret = ret_fut.wait()
        >>> rpc.shutdown()

        >>> # On worker 1:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>>
        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
        >>> rpc.shutdown()
    """
    super().__init__()

    # Sanity checks.
    assert rpc._is_current_rpc_agent_set(), "RemoteModule only works in RPC."

    # Default arguments preparation.
    args = args if args is not None else ()
    kwargs = kwargs if kwargs is not None else {}

    if _module_interface_cls is not None:
        # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
        self.is_scriptable = True

        # Instantiate template on remote side.
        # Issued asynchronously so it overlaps with the local instantiation below.
        fut = rpc.rpc_async(on, _instantiate_template, (_module_interface_cls,))

        # Instantiate template on local side.
        generated_module = instantiator.instantiate_scriptable_remote_module_template(
            _module_interface_cls
        )
        generated_methods = generated_module._generated_methods

        # Create the module on the remote side.
        fut.wait()  # Ensure remote_module_cls is available on remote side.
    else:
        self.is_scriptable = False
        generated_methods = _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods

    # Create the module on the remote side.
    self.module_rref = rpc.rpc_sync(
        on,
        _create_module,
        (module_cls, args, kwargs, _module_interface_cls),
    )

    # Install generated methods.
    # Each generated function is exported for TorchScript and bound to this instance.
    for method in generated_methods:
        method_name = method.__name__
        method = torch.jit.export(method)
        setattr(self, method_name, types.MethodType(method, self))
def _remote_method(method, rref, args=[]): args = [method, rref] + list(args) return rpc_sync(rref.owner(), _call_method, args=args)
def _remote_method(method, rref, *args, **kwargs):
    """Invoke ``_call_method`` on the owner of ``rref`` and return its result.

    ``_call_method`` receives the tuple ``(method, rref, *args)`` as its
    positional arguments and ``kwargs`` as its keyword arguments.
    """
    args_tup = (method, rref, *args)
    owner = rref.owner()
    return rpc.rpc_sync(owner, _call_method, args=args_tup, kwargs=kwargs)
def _run_trainer(emb_rref, rank):
    r"""
    Train a ``HybridModel`` for 100 epochs.

    The backward pass and the optimizer step run inside a distributed
    autograd context, using a ``DistributedOptimizer`` over the embedding
    parameters fetched from worker ``"ps"`` plus the local model parameters.
    """
    # Setup the model.
    model = HybridModel(emb_rref, rank)

    # Retrieve all model parameters as rrefs for DistributedOptimizer.

    # Retrieve parameters for embedding table.
    model_parameter_rrefs = rpc.rpc_sync(
        "ps",
        _retrieve_embedding_parameters,
        args=(emb_rref,),
    )

    # model.parameters() only includes local parameters.
    model_parameter_rrefs.extend(RRef(param) for param in model.parameters())

    # Setup distributed optimizer
    opt = DistributedOptimizer(optim.SGD, model_parameter_rrefs, lr=0.05)

    criterion = torch.nn.CrossEntropyLoss()

    def get_next_batch(rank):
        for _ in range(10):
            num_indices = random.randint(20, 50)
            indices = torch.LongTensor(num_indices).random_(0, NUM_EMBEDDINGS)

            # Generate offsets.
            offsets = []
            cursor = 0
            while cursor < num_indices:
                offsets.append(cursor)
                cursor += random.randint(1, 10)

            # One target per generated offset.
            offsets_tensor = torch.LongTensor(offsets)
            target = torch.LongTensor(len(offsets)).random_(8).cuda(rank)
            yield indices, offsets_tensor, target

    # Train for 100 epochs
    for epoch in range(100):
        # create distributed autograd context
        for indices, offsets, target in get_next_batch(rank):
            with dist_autograd.context() as context_id:
                loss = criterion(model(indices, offsets), target)

                # Run distributed backward pass
                dist_autograd.backward(context_id, [loss])

                # Run distributed optimizer
                opt.step(context_id)

                # Not necessary to zero grads as each iteration creates a different
                # distributed autograd context which hosts different grads
        print("Training done for epoch {}".format(epoch))
def test_rref_list_mutate(self):
    """A list created via ``rpc.remote`` and mutated by ``rref_list_mutate`` must read back as 1..6."""
    next_rank = (self.rank + 1) % self.world_size
    dst = worker_name(next_rank)

    list_rref = rpc.remote(dst, list_create)
    rpc.rpc_sync(dst, rref_list_mutate, args=(list_rref,))

    fetched = list_rref.to_here()
    self.assertEqual(fetched, [1, 2, 3, 4, 5, 6])
def run_worker(ps_rref, data_dir, batch_size, num_epochs, worker, job_name):
    """Train on WikiText2, exchanging gradients and models with a parameter server.

    After every batch the gradients are sent to the parameter server via
    ``ParameterServer.update_and_fetch_model``, which returns the model to
    use next and a stop flag. Progress is announced over an info socket
    ("WORKER", metadata, "START", "END"); the function returns early when
    the reply to "START" is not "CONFIRM".

    Args:
        ps_rref: RRef to the parameter server.
        data_dir: Root directory handed to ``WikiText2``.
        batch_size: Batch size used by ``batchify`` and ``init_hidden``.
        num_epochs: Maximum number of epochs to run.
        worker: Worker name; its last character is parsed as the rank.
        job_name: Job identifier used in log file names.
    """
    worker_rank = int(worker[-1])

    # Register with the info server.
    info_socketm = znet.SocketMsger.tcp_connect(DORKER0_IP, INFO_PORT)
    info_socketm.send("WORKER")
    info_socketm.send(f"1.0\n/home/ubuntu/measurement/logs/{job_name}_info{worker_rank}.log\n{job_name}")

    logger = Logger(job_name=job_name, file_dir=f"./measurement/logs/{job_name}_{worker}.log").logger

    # Build the vocabulary from the training split.
    tokenizer = None  # placeholder replaced below; keeps creation order explicit
    train_iter = WikiText2(root=data_dir, split='train')
    tokenizer = get_tokenizer('basic_english')
    vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"])
    vocab.set_default_index(vocab["<unk>"])

    bptt = 35
    # Only the training split is used; the other two are unpacked and ignored.
    train_iter, _val_iter, _test_iter = WikiText2(root=data_dir)
    train_data = batchify(data_process(train_iter, vocab, tokenizer), batch_size)

    device_id = 0
    device = torch.device(f"cuda:{device_id}" if torch.cuda.is_available() else "cpu")
    name = rpc.get_worker_info().name

    ps_rref.rpc_sync().set_ps_launched_to_true()
    model = ps_rref.rpc_sync().get_model().to(device)
    criterion = nn.CrossEntropyLoss().to(device)

    stop_flag = False

    info_socketm.send("START")
    if info_socketm.recv() != "CONFIRM":
        return

    cm_t1_end = time.time()
    started_at = time.time()
    for epoch in range(num_epochs):
        hidden = model.init_hidden(batch_size)
        for batch_idx, offset in enumerate(range(0, train_data.size(0) - 1, bptt)):
            data, target = get_batch(train_data, offset, bptt)
            data = data.to(device)
            target = target.to(device)
            hidden = repackage_hidden(hidden)
            output, hidden = model(data, hidden)
            loss = criterion(output, target)
            loss.backward()

            # Computation time is measured from the end of the previous exchange.
            cm_t0_start = time.time()
            cp_t = 1000 * (cm_t0_start - cm_t1_end)
            logger.info(
                "{:8s} | Epoch: {:3d} | Batch: {:3d} | Loss: {:6.2f} | Computation Time: {:7.2f} ms"
                .format(name, (epoch + 1), (batch_idx + 1), loss.item(), cp_t))

            # The model is moved to CPU before its gradients are shipped.
            model, stop_flag = rpc.rpc_sync(
                to=ps_rref.owner(),
                func=ParameterServer.update_and_fetch_model,
                args=(
                    ps_rref,
                    [p.grad for p in model.cpu().parameters()],
                    name,
                    epoch,
                    batch_idx,
                    cm_t0_start,
                    cm_t1_end,
                ),
            )
            model.to(device)
            cm_t1_end = time.time()

            if stop_flag:
                break
        # Propagate the stop request out of the epoch loop as well.
        if stop_flag:
            break
    finished_at = time.time()

    info_socketm.send("END")
    logger.info("Time: {:.2f} seconds".format((finished_at - started_at)))
def nested_rpc(dst):
    """Run ``torch.add(torch.ones(2, 2), 1)`` on ``dst`` synchronously and return the result."""
    operands = (torch.ones(2, 2), 1)
    return rpc.rpc_sync(dst, torch.add, args=operands)
def __init__(
    self,
    remote_device: str,
    module_cls: Type[nn.Module],
    args: Tuple = None,
    kwargs: Dict[str, Any] = None,
    _module_interface_cls: Any = None,
):
    """
    A RemoteModule instance can only be created after RPC initialization.
    It creates a user-specified module on a specified remote node.
    It behaves like a regular ``nn.Module`` except that the ``forward`` method is
    executed on the remote node.
    It takes care of autograd recording to ensure the backward pass propagates
    gradients back to the corresponding remote module.
    It can be shared across processors using `RPC framework <https://pytorch.org/docs/stable/rpc.html>`__,
    without incurring any overheads of copying the actual module,
    which is equivalent to an :class:`~torch.distributed.rpc.RRef`
    pointing to the remote module.

    The arguments of ``forward_async`` and ``forward`` are the same as
    the ``forward`` method of the module returned by the ``module_cls``.

    Apart from ``forward_async`` and ``forward``, no other methods are supported
    from nn.Module for now.

    Particularly, to create a hybrid model, typically the local modules should be
    created outside of remote modules, rather than as submodules of any remote
    module (by calling ``add_module``).
    Hybrid Example:
        >>> class HybridModel(nn.Module):
        >>>     def __init__(self):
        >>>         nn.Module.__init__(self)
        >>>         self.remote_embedding = RemoteModule(...)
        >>>         self.local_linear = nn.Linear(...)

    For example, if ``module_cls`` returns an instance of ``nn.Linear``,
    that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
    the generated ``RemoteModule`` will have 2 methods in signature of
    ``def forward(input: Tensor) -> Tensor:`` and
    ``def forward_async(input: Tensor) -> Future[Tensor]:``.

    .. note::
        If the remote module is placed on a cuda device,
        any input CPU tensors will be automatically moved to the same cuda device,
        and GPU tensors are returned over the wire according to the device map of
        the remote worker on TensorPipe RPC backend.

    Args:
        remote_device (str): Device on the destination worker where we'd like to
            place this module. The device can be a local device or a remote device
            specified by one of the following remote formats:

                1. "rank:<rank>/<device>" (ex: "rank:0/cuda:0").
                2. "<worker_name>/<device>" (ex: "trainer0/cuda:0").

            In addition, the device field can be optional and the default value
            is "cpu".
        module_cls (nn.Module): For example,
            >>> class MyModule(nn.Module):
            >>>     def forward(input):
            >>>         return input + 1
            >>>
            >>> module_cls = MyModule
        args (Sequence, optional): args to be passed to ``module_cls``.
        kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
        _module_interface_cls (type, optional): The TorchScript interface type for
            the module to be created. The type object should be decorated by
            @torch.jit.interface. If not provided, the generated RemoteModule is
            not torchscript-able. Warning, this is an experimental API and
            susceptible to frequent changes.

    Returns:
        A remote module instance which wraps the :class:`~nn.Module` created by the
        user-provided ``module_cls``, it has a blocking ``forward`` method and an
        asynchronous ``forward_async`` method that returns a future of the
        ``forward`` call on the user-provided module on the remote side.

    Example::
        Run the following code in two different processes:

        >>> # On worker 0:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>> from torch import nn, Tensor
        >>> from torch.distributed.nn.api.remote_module import RemoteModule
        >>>
        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
        >>> remote_linear_module = RemoteModule(
        >>>     "worker1/cpu", nn.Linear, args=(20, 30),
        >>> )
        >>> input = torch.randn(128, 20)
        >>> ret_fut = remote_linear_module.forward_async(input)
        >>> ret = ret_fut.wait()
        >>> rpc.shutdown()

        >>> # On worker 1:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>>
        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
        >>> rpc.shutdown()
    """
    super().__init__()

    enable_moving_cpu_tensors_to_cuda = self._prepare_init(remote_device)

    # Default arguments preparation.
    args = args if args is not None else ()
    kwargs = kwargs if kwargs is not None else {}

    if _module_interface_cls is not None:
        # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
        self.is_scriptable = True

        # Instantiate template on remote side.
        # Issued exactly once and asynchronously, so the remote instantiation
        # overlaps with the local one below. (A second, identical rpc_async
        # used to be issued after the local instantiation, leaving this
        # future un-awaited and costing a redundant round trip.)
        fut = rpc.rpc_async(
            self.on,
            _instantiate_template,
            (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
        )

        # Instantiate template on local side.
        self._init_template(_module_interface_cls, enable_moving_cpu_tensors_to_cuda)

        # Create the module on the remote side.
        fut.wait()  # Ensure remote_module_cls is available on remote side.

        # TODO: We need to change this to rpc.remote, and make it async (see the else branch below).
        # For that we need to be able to apply _module_interface_cls to the RRef returned by rpc.remote
        # See https://github.com/pytorch/pytorch/issues/58098 for more context.
        self.module_rref = rpc.rpc_sync(
            self.on,
            _create_module_with_interface,
            (module_cls, args, kwargs, self.device, _module_interface_cls),
        )
    else:
        self.is_scriptable = False
        self.generated_methods = (
            _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods)
        # Create the module on the remote side.
        self.module_rref = rpc.remote(
            self.on,
            _create_module,
            (module_cls, args, kwargs, self.device),
        )

    self._install_generated_methods()
    self._check_attribute_picklability()
def test_py_built_in(self):
    """Run the builtin ``min`` over RPC and compare it with a local call.

    The destination is ``worker{(rank + 1) % world_size}`` and the three
    operands are derived from this worker's rank.
    """
    base = self.rank + 1
    operands = (base, base + 1, base + 2)
    dst_worker = "worker{}".format(base % self.world_size)

    remote_min = rpc.rpc_sync(dst_worker, min, args=operands)

    # The remote answer must match what the same builtin gives locally.
    self.assertEqual(remote_min, min(*operands))
def test_autograd_functions(self):
    """Check the send/recv autograd functions recorded around an RPC ``torch.add``.

    Inside a distributed autograd context this worker runs ``torch.add`` on
    ``worker{(rank + 1) % world_size}`` and then inspects two graphs:

    * its own context: one send function whose inputs are the two leaf
      tensors, one recv function matching ``ret.grad_fn``;
    * the context identified by ``prev_rank_context_id`` (a name defined
      outside this function): one send function fed by ``AddBackward0``,
      which in turn is fed twice by the same ``RecvRpcBackward``.

    After the ``with`` block exits, both looking up the context by id and
    asking for the current context are expected to raise ``RuntimeError``.
    """
    accumulate_grad_name = "torch::autograd::AccumulateGrad"
    recv_backward_name = "torch::distributed::autograd::RecvRpcBackward"

    dst_rank = (self.rank + 1) % self.world_size
    dst_worker = "worker{}".format(dst_rank)

    with dist_autograd.context() as context_id:
        lhs = torch.ones(3, 3, requires_grad=True)
        rhs = torch.zeros(3, 3, requires_grad=True)
        ret = rpc.rpc_sync(dst_worker, torch.add, args=(lhs, rhs))
        rpc.rpc_sync(dst_worker, _set_rpc_done, args=(context_id, ))

        # ---- Graph recorded for the RPC issued by this rank ----
        own_ctx = dist_autograd._current_context()
        self.assertEqual(context_id, own_ctx._context_id())
        own_sends = own_ctx._send_functions()
        self.assertEqual(1, len(own_sends))

        # The destination worker id must be recorded on this context.
        known_ids = own_ctx._known_worker_ids()
        self.assertEqual(len(known_ids), 1)
        self.assertEqual(dst_rank, known_ids[0])

        # The single send function is fed by the two leaf tensors.
        leaf_edges = list(own_sends.values())[0].next_functions
        self.assertEqual(2, len(leaf_edges))
        for (grad_fn, input_nr), leaf in zip(leaf_edges, (lhs, rhs)):
            self.assertEqual(accumulate_grad_name, grad_fn.name())
            self.assertEqual(leaf, grad_fn.variable)
            self.assertEqual(0, input_nr)

        # The recv function is the grad_fn of the RPC result.
        own_recvs = own_ctx._recv_functions()
        self.assertEqual(1, len(own_recvs))
        self.assertEqual(ret.grad_fn, list(own_recvs.values())[0])

        # ---- Graph recorded here on behalf of the previous rank ----
        # Poll until the previous rank's RPC is flagged as done.
        # NOTE(review): presumably `_set_rpc_done` sets this flag - confirm.
        while not prev_rank_rpc_done:
            time.sleep(0.1)

        prev_ctx = dist_autograd._retrieve_context(prev_rank_context_id)
        prev_sends = prev_ctx._send_functions()
        self.assertEqual(1, len(prev_sends))

        # The send function must be fed by AddBackward0 only.
        add_edges = list(prev_sends.values())[0].next_functions
        self.assertEqual(1, len(add_edges))
        add_backward_fn = add_edges[0][0]
        self.assertEqual("AddBackward0", add_backward_fn.name())

        # Both inputs of the add come from the same recv backward function.
        recv_edges = add_backward_fn.next_functions
        self.assertEqual(2, len(recv_edges))
        first_recv = recv_edges[0][0]
        second_recv = recv_edges[1][0]
        self.assertEqual(recv_backward_name, first_recv.name())
        self.assertEqual(recv_backward_name, second_recv.name())
        self.assertEqual(first_recv, second_recv)

    # The autograd context should be cleaned up by now.
    with self.assertRaises(RuntimeError):
        dist_autograd._retrieve_context(context_id)

    # No autograd context is available outside the `with` block.
    with self.assertRaises(RuntimeError):
        dist_autograd._current_context()
def test_py_no_return_result(self):
    """Call ``no_result`` over RPC and check it matches a local call.

    The destination is ``worker{(rank + 1) % world_size}``.
    """
    dst_worker = "worker{}".format((self.rank + 1) % self.world_size)
    remote_result = rpc.rpc_sync(dst_worker, no_result)

    # The local call happens after the RPC, as the reference for comparison.
    local_result = no_result()
    self.assertEqual(remote_result, local_result)
def _remote_method_direct(method, other_node: str, *args, **kwargs):
    """Synchronously invoke ``method`` on ``other_node`` through RPC.

    The callable and the destination name are prepended to the positional
    arguments, so the remote side executes
    ``method(method, other_node, *args, **kwargs)``.

    Args:
        method: Callable to run on the remote worker.
        other_node: Name of the destination worker.
        *args: Extra positional arguments forwarded after the two
            prepended values.
        **kwargs: Keyword arguments forwarded unchanged.

    Returns:
        Whatever ``rpc.rpc_sync`` returns for the call.
    """
    # rpc_sync documents `args` as a tuple, so build a tuple rather than a
    # list. A separate local name also avoids rebinding the *args parameter.
    call_args = (method, other_node, *args)

    # NOTE(review): an earlier, disabled variant dispatched through
    # `_call_method` with these same arguments. Calling `method` directly
    # means it receives itself as its first positional argument - confirm
    # that every callable passed here expects that.
    return rpc.rpc_sync(other_node, method, args=call_args, kwargs=kwargs)
def do_test_on_master(
    self,
    ddp_mode: DdpMode,
    simulate_uneven_inputs: bool,
    remote_em_rref: rpc.RRef,
    remote_net_rref: rpc.RRef,
):
    """Drive the DDP + RPC training test from the master process.

    Steps, in order:
      1. Create one remote ``Trainer`` per rank in ``TRAINER_RANKS``.
      2. For ``DdpMode.INSIDE`` / ``DdpMode.OUTSIDE``, call
         ``dist.new_group`` on this process as well.
      3. Run three rounds of ``Trainer.train_batch`` on all trainers and
         check the returned gradients.
      4. Ask each trainer to destroy its process group, then send the
         shutdown signal to every trainer and to the remote worker.

    Args:
        ddp_mode: DDP configuration forwarded to each ``Trainer``.
        simulate_uneven_inputs: When true, the first half of the trainers
            is flagged as having fewer inputs, and the "DDP grads are
            zero" check is skipped.
        remote_em_rref: RRef forwarded to each ``Trainer``.
        remote_net_rref: RRef forwarded to each ``Trainer``.
    """
    if simulate_uneven_inputs:
        gLogger.info(
            "Running DDP + RPC test with simulating uneven inputs across trainers."
        )

    trainer_rrefs = [
        rpc.remote(
            self.trainer_name(rank),
            Trainer,
            args=(remote_em_rref, remote_net_rref, ddp_mode, rank),
        )
        for rank in TRAINER_RANKS
    ]

    # new_group needs to be called on ranks.
    if ddp_mode in (DdpMode.INSIDE, DdpMode.OUTSIDE):
        dist.new_group(TRAINER_RANKS)

    training_examples = get_training_examples()
    # Trainers whose index is below this cutoff deplete inputs earlier
    # than the rest (only when uneven inputs are simulated).
    fewer_inputs_cutoff = len(trainer_rrefs) // 2

    for _ in range(3):
        # Launch one asynchronous training step per trainer.
        pending = [
            _remote_method_async(
                Trainer.train_batch,
                trainer_rref,
                training_examples[idx],
                simulate_uneven_inputs and idx < fewer_inputs_cutoff,
                simulate_uneven_inputs,
            )
            for idx, trainer_rref in enumerate(trainer_rrefs)
        ]

        for fut in pending:
            ddp_grads, non_ddp_grads = fut.wait()

            # When there are uneven inputs, it is not necessary that grads
            # cancel each other out, since some trainers contribute 0 grad.
            if not simulate_uneven_inputs:
                for grad in ddp_grads:
                    zero_grad_msg = (
                        "The grad for any ddp parameter should be zeros, because "
                        "the training examples' grads cancel each other. Received "
                        f"gradient {grad}"
                    )
                    self.assertEqual(
                        grad,
                        torch.zeros_like(grad),
                        msg=zero_grad_msg,
                    )

            for grad in non_ddp_grads:
                self.assertNotEqual(
                    grad,
                    torch.zeros_like(grad),
                    msg="The grad for any non-ddp parameter shouldn't be zeros",
                )

    # Destroy process groups, one trainer at a time.
    for trainer_rref in trainer_rrefs:
        _remote_method_async(Trainer.destroy_pg, trainer_rref).wait()

    # Send shutdown signals: trainers first, then the remote worker.
    for rank in TRAINER_RANKS:
        rpc.rpc_sync(self.trainer_name(rank), set_shutdown_signal, args=())
    rpc.rpc_sync(self.remote_worker_name(), set_shutdown_signal, args=())
def parameter_rrefs(self) -> List[rpc.RRef]:
    """Collect the parameter RRefs of every entry in ``self.rmodule``.

    For each entry, ``_parameter_rrefs`` is run synchronously on that
    entry's owner with the entry as its only argument. The per-entry
    results are concatenated, in iteration order, into one flat list.

    Returns:
        A flat list with the items returned by each remote call
        (presumably parameter RRefs - confirm against ``_parameter_rrefs``).
    """
    collected = []
    for layer_rref in self.rmodule:
        layer_params = rpc.rpc_sync(
            layer_rref.owner(),
            _parameter_rrefs,
            args=(layer_rref, ),
        )
        collected.extend(layer_params)
    return collected