def _parse_and_validate_remote_device(pg, remote_device):
    """
    Resolve ``remote_device`` into a validated ``(rank, device)`` pair.

    Args:
        pg: Process group whose world size bounds the accepted ranks.
        remote_device: Object exposing ``worker_name()``, ``rank()`` and
            ``device()`` accessors.

    Returns:
        ``(worker.id, device)`` when a worker name is given and matches one
        of the workers known to the current RPC agent, otherwise
        ``(rank, device)``.

    Raises:
        ValueError: If the rank lies outside ``[0, world_size)`` for ``pg``,
            or the worker name is not known to the current RPC agent.
        RuntimeError: If a worker name is given but no RPC agent is set.
    """
    name = remote_device.worker_name()
    rank = remote_device.rank()
    device = remote_device.device()

    # The rank range is only checked when this process is part of ``pg``.
    in_group = not distributed_c10d._rank_not_in_group(pg)
    if in_group and rank is not None:
        if not 0 <= rank < distributed_c10d.get_world_size(pg):
            raise ValueError(f'Invalid rank: {rank}')

    # Without a worker name there is nothing further to resolve.
    if name is None:
        return rank, device

    if not rpc._is_current_rpc_agent_set():
        raise RuntimeError(
            f'RPC framework needs to be initialized for using worker names: {name}'
        )

    # Translate the worker name into the id reported by the RPC agent.
    for info in rpc._get_current_rpc_agent().get_worker_infos():
        if info.name == name:
            return info.id, device

    raise ValueError(f'Invalid worker name: {name}')
def _prepare_init(self, remote_device_str: str) -> bool:
    """
    Prepare the initialization and report whether CPU tensors should be
    moved to CUDA devices automatically.

    Sets ``self.on`` (worker name if present, otherwise rank),
    ``self.device`` (device as a string) and ``self.is_device_map_set``.

    Args:
        remote_device_str: Remote device specification handed to
            ``_remote_device``.

    Returns:
        ``True`` if the parsed device is of type ``"cuda"``.
    """
    # Sanity check.
    assert rpc._is_current_rpc_agent_set(), "RemoteModule only works in RPC."

    remote_device = _remote_device(remote_device_str)
    # Prefer the worker name; fall back to the rank when no name is given.
    if remote_device.worker_name() is None:
        self.on = remote_device.rank()
    else:
        self.on = remote_device.worker_name()
    self.device = str(remote_device.device())

    agent = rpc._get_current_rpc_agent()
    # If the device map of the remote worker is set,
    # then enable moving any input CPU tensors to the same cuda device.
    worker_info = agent.get_worker_info(self.on)  # type: ignore[arg-type]
    device_map = agent._get_device_map(worker_info)
    self.is_device_map_set = bool(device_map)

    # The returned flag is less strict than ``is_device_map_set``: when it is
    # true but the device map is not set, CPU tensors can still be moved to a
    # cuda device to run forward, but the output must be moved back to CPU
    # before being sent over the wire.
    return torch.device(self.device).type == "cuda"
def _init_rpc(self):
    """
    Exchange sharded tensor ids and local shard RRefs over RPC.

    Marks RPC as initialized, gathers ``self._sharded_tensor_id`` from every
    RPC worker that belongs to ``self._process_group``, then asynchronously
    calls ``_register_remote_shards`` on every other rank of the group with
    that rank's gathered tensor id, RRefs to this rank's local shards and
    this worker's RPC rank. Finally waits for all calls and runs an RPC
    barrier across the participating workers.
    """
    self._rpc_initialized = True
    self._remote_shards = {}

    pg = self._process_group
    world_size = dist.get_world_size(pg)

    # RPC worker id -> worker name, as reported by the current RPC agent.
    rank_to_name = {
        info.id: info.name
        for info in rpc._get_current_rpc_agent().get_worker_infos()
    }

    # Translate every group-local rank to its global rank exactly once; for
    # the default group the two numberings coincide.
    if pg == distributed_c10d._get_default_group():
        global_ranks = list(range(world_size))
    else:
        global_ranks = [
            distributed_c10d._get_global_rank(pg, rank)
            for rank in range(world_size)
        ]

    # Gather all the sharded tensor ids.
    rpc_workers = {rank_to_name[global_rank] for global_rank in global_ranks}
    all_tensor_ids = rpc.api._all_gather(self._sharded_tensor_id, rpc_workers)

    # Share the local shards to the entire world.
    futs = []
    rpc_rank = rpc.get_worker_info().id
    own_rank = dist.get_rank(pg)
    local_shards = self.local_shards()
    if local_shards:
        for rank, global_rank in enumerate(global_ranks):
            # Skip self.
            if rank == own_rank:
                continue
            # A fresh set of RRefs is created for every destination.
            rrefs: List[rpc.RRef[Shard]] = [
                rpc.RRef(shard) for shard in local_shards
            ]
            futs.append(
                rpc.rpc_async(
                    global_rank,
                    _register_remote_shards,
                    args=(
                        all_tensor_ids[rank_to_name[global_rank]],
                        rrefs,
                        rpc_rank,
                    ),
                )
            )

    torch.futures.wait_all(futs)

    # Barrier for all RPCs to finish on all ranks.
    rpc.api._barrier(rpc_workers)
def __init__(self,
             name: str,
             rank: int = -1,
             world_size: int = None,
             init_method: str = "tcp://localhost:9100",
             rpc_timeout: float = 60,
             rpc_threads: int = 8):
    """
    Initialize RPC for this process and set up the bookkeeping tables.

    Args:
        name: A unique name to identify current process.
        rank: A unique rank of the current process. You do not need to
            specify it if you are using `torch.distributed.launch` or
            `torchelastic`
        world_size: Size of the distributed world. You do not need to
            specify it if you are using `torch.distributed.launch` or
            `torchelastic`
        init_method: Backend initialization method.
        rpc_timeout: Global rpc call timeout in seconds.
        rpc_threads: Rpc recv/send thread num.
    """
    self.world_size = world_size
    self.rank = rank
    self.name = name
    self.groups = {}
    self.group_create_signals = {}

    # ``name`` is registered as the RPC worker name of this process.
    backend_options = rpc.ProcessGroupRpcBackendOptions(
        init_method=init_method,
        num_send_recv_threads=rpc_threads,
        rpc_timeout=timedelta(seconds=rpc_timeout)
    )
    rpc.init_rpc(self.name,
                 rank=rank,
                 world_size=world_size,
                 rpc_backend_options=backend_options)

    # Worker id -> worker name, for every worker the RPC agent knows about.
    self.rank_name_map = {
        info.id: info.name
        for info in rpc._get_current_rpc_agent().get_worker_infos()
    }

    # Start role dispatching.
    self.started = True
    self.rpc_timeout = rpc_timeout

    # Lookup tables for paired values and registered services, guarded by a
    # lock; the worker whose id is 0 is recorded as the table manager.
    self.value_lut = {}
    self.service_lut = {}
    self.lut_lock = Lock()
    self.lut_manager = self.rank_name_map[0]
def _init_rpc(self):
    """
    Exchange sharded tensor ids and local shard RRefs over RPC.

    Checks that the default process group rank equals the RPC worker id,
    gathers ``self._sharded_tensor_id`` from all RPC workers, then
    asynchronously calls ``_register_remote_shards`` on every other rank of
    the default group with that rank's gathered tensor id, RRefs to this
    rank's local shards and this worker's RPC rank. Finally waits for all
    calls and uses an RPC all-gather as a barrier.

    Raises:
        ValueError: If the default process group rank and the RPC rank of
            this process differ.
    """
    # Validate PG and RPC ranks match.
    pg_rank = dist.get_rank()
    rpc_rank = rpc.get_worker_info().id
    if pg_rank != rpc_rank:
        raise ValueError(
            'Default ProcessGroup and RPC ranks must be '
            'the same for ShardedTensor, found process group rank: '
            f'{pg_rank} and RPC rank: {rpc_rank}')

    self._remote_shards = {}

    # RPC worker id -> worker name, as reported by the current RPC agent.
    rank_to_name = {
        info.id: info.name
        for info in rpc._get_current_rpc_agent().get_worker_infos()
    }

    # Gather all the sharded tensor ids.
    all_tensor_ids = rpc.api._all_gather(self._sharded_tensor_id)

    # Share the local shards to the entire world.
    futs = []
    local_shards = self.local_shards()
    if local_shards:
        for rank in range(dist.get_world_size()):
            # Skip self.
            if rank == pg_rank:
                continue
            # A fresh set of RRefs is created for every destination.
            rrefs: List[rpc.RRef[Shard]] = [
                rpc.RRef(shard) for shard in local_shards
            ]
            futs.append(
                rpc.rpc_async(
                    rank,
                    _register_remote_shards,
                    args=(all_tensor_ids[rank_to_name[rank]], rrefs, rpc_rank),
                )
            )

    torch.futures.wait_all(futs)

    # Barrier for all RPCs to finish on all ranks.
    rpc.api._all_gather(None)
def _parse_and_validate_remote_device(self, device):
    """
    Parse ``device`` and validate the worker part of the result.

    ``_parse_remote_device`` yields an ``(on, local_device)`` pair. An
    integer ``on`` is treated as a rank and checked against the world size
    of ``self._process_group``; a string ``on`` is treated as a worker name
    and translated to the matching RPC worker id.

    Returns:
        ``(worker_id_or_rank, local_device)``.

    Raises:
        ValueError: If the rank is out of range or the worker name is not
            known to the current RPC agent.
        RuntimeError: If a worker name is given but no RPC agent is set.
    """
    on, local_device = _parse_remote_device(device)  # type: ignore[arg-type]

    # Validate rank.
    if isinstance(on, int):
        if not 0 <= on < dist.get_world_size(self._process_group):
            raise ValueError(f'Invalid rank: {on}')

    # Anything that is not a worker name is returned as parsed.
    if not isinstance(on, str):
        return on, local_device

    if not rpc._is_current_rpc_agent_set():
        raise RuntimeError(f'RPC framework needs to be initialized for using worker names: {on}')

    # Translate the worker name into the id reported by the RPC agent.
    for info in rpc._get_current_rpc_agent().get_worker_infos():
        if info.name == on:
            return info.id, local_device

    raise ValueError(f'Invalid worker name: {on}')
def __init__(
    self,
    remote_device: str,
    module_cls: nn.Module,
    args: Tuple = None,
    kwargs: Dict[str, Any] = None,
    _module_interface_cls: Any = None,
):
    """
    A RemoteModule instance can only be created after RPC initialization.
    It creates a user-specified module on a specified remote node.
    It behaves like a regular ``nn.Module`` except that the ``forward`` method is
    executed on the remote node.
    It takes care of autograd recording to ensure the backward pass propagates
    gradients back to the corresponding remote module.
    It can be shared across processors using `RPC framework <https://pytorch.org/docs/stable/rpc.html>`__,
    without incurring any overheads of copying the actual module,
    which is equivalent to an :class:`~torch.distributed.rpc.RRef`
    pointing to the remote module.

    The arguments of ``forward_async`` and ``forward`` are the same as
    the ``forward`` method of the module returned by the ``module_cls``.

    Apart from ``forward_async`` and ``forward``, no other methods are supported from nn.Module for now.

    Particularly, to create a hybrid model, typically the local modules should be
    created outside of remote modules, rather than as submodules of any remote module (by calling ``add_module``).
    Hybrid Example:
            >>> class HybridModel(nn.Module):
            >>>     def __init__(self):
            >>>         nn.Module.__init__(self)
            >>>         self.remote_embedding = RemoteModule(...)
            >>>         self.local_linear = nn.Linear(...)

    For example, if ``module_cls`` returns an instance of ``nn.Linear``,
    that has ``forward`` method signature, ``def forward(input: Tensor) -> Tensor:``,
    the generated ``RemoteModule`` will have 2 methods in signature of
    ``def forward(input: Tensor) -> Tensor:`` and
    ``def forward_async(input: Tensor) -> Future[Tensor]:``.

    .. note::
        If the remote module is placed on a cuda device,
        any input CPU tensors will be automatically moved to the same cuda device,
        and GPU tensors are returned over the wire according to the device map of the remote worker on TensorPipe RPC backend.

    Args:
        remote_device (str): Device on the destination worker where we'd like to place this module.
            The format should be "<workername>/<device>", where the device field can be parsed as torch.device type.
            E.g., "trainer0/cpu", "trainer0", "ps0/cuda:0".
            In addition, the device field can be optional and the default value is "cpu".
        module_cls (nn.Module): For example,
            >>> class MyModule(nn.Module):
            >>>     def forward(input):
            >>>         return input + 1
            >>>
            >>> module_cls = MyModule
        args (Sequence, optional): args to be passed to ``module_cls``.
        kwargs (Dict, optional): kwargs to be passed to ``module_cls``.
        _module_interface_cls (type, optional): The TorchScript interface type for the module
            to be created. The type object should be decorated by @torch.jit.interface.
            If not provided, the generated RemoteModule is not torchscript-able.
            Warning, this is an experimental API and susceptible to frequent changes.

    Returns:
        A remote module instance which wraps the :class:`~nn.Module` created by the
        user-provided ``module_cls``, it has a blocking ``forward`` method and an
        asynchronous ``forward_async`` method that returns a future of the ``forward`` call
        on the user-provided module on the remote side.

    Example::
        Run the following code in two different processes:

        >>> # On worker 0:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>> from torch import nn, Tensor
        >>> from torch.distributed.nn.api.remote_module import RemoteModule
        >>>
        >>> rpc.init_rpc("worker0", rank=0, world_size=2)
        >>> remote_linear_module = RemoteModule(
        >>>     "worker1/cpu", nn.Linear, args=(20, 30),
        >>> )
        >>> input = torch.randn(128, 20)
        >>> ret_fut = remote_linear_module.forward_async(input)
        >>> ret = ret_fut.wait()
        >>> rpc.shutdown()

        >>> # On worker 1:
        >>> import torch
        >>> import torch.distributed.rpc as rpc
        >>>
        >>> rpc.init_rpc("worker1", rank=1, world_size=2)
        >>> rpc.shutdown()
    """
    super().__init__()

    # NOTE: if a new attribute is added to this class, also need to add it
    # to ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` for pickling/unpickling.

    # Sanity checks.
    assert rpc._is_current_rpc_agent_set(), "RemoteModule only works in RPC."

    # Default arguments preparation.
    if args is None:
        args = ()
    if kwargs is None:
        kwargs = {}

    self.on, self.device = _parse_remote_device(remote_device)
    agent = rpc._get_current_rpc_agent()
    # If the device map of the remote worker is set,
    # then enable moving any input CPU tensors to the same cuda device.
    remote_worker_info = agent.get_worker_info(self.on)
    self.is_device_map_set = bool(agent._get_device_map(remote_worker_info))
    # ``enable_moving_cpu_tensors_to_cuda`` is less strict than ``is_device_map_set``:
    # If ``enable_moving_cpu_tensors_to_cuda`` is true, but the device map is not set,
    # then any CPU tensors can still be moved to a cuda device to run forward,
    # but the output must be moved back to CPU before being sent over the wire.
    target_device_type = torch.device(self.device).type
    enable_moving_cpu_tensors_to_cuda = target_device_type == "cuda"

    if _module_interface_cls is None:
        # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
        self.is_scriptable = False
        self.generated_methods = (
            _NON_SCRIPTABLE_REMOTE_MODULE_MODULE._generated_methods)
        # Create the module on the remote side.
        self.module_rref = rpc.remote(
            self.on,
            _create_module,
            (module_cls, args, kwargs, self.device),
        )
    else:
        # Users rely on this field to know if this generated RemoteModule is TorchScript-able.
        self.is_scriptable = True

        # Instantiate template on remote side; this runs concurrently with
        # the local instantiation below.
        fut = rpc.rpc_async(
            self.on,
            _instantiate_template,
            (_module_interface_cls, enable_moving_cpu_tensors_to_cuda),
        )

        # Instantiate template on local side.
        generated_module = (
            instantiator.instantiate_scriptable_remote_module_template(
                _module_interface_cls, enable_moving_cpu_tensors_to_cuda))
        self.generated_methods = generated_module._generated_methods

        # Create the module on the remote side.
        fut.wait()  # Ensure remote_module_cls is available on remote side.

        # TODO: We need to change this to rpc.remote, and make it async (see the other branch above).
        # For that we need to be able to apply _module_interface_cls to the RRef returned by rpc.remote
        # See https://github.com/pytorch/pytorch/issues/58098 for more context.
        self.module_rref = rpc.rpc_sync(
            self.on,
            _create_module_with_interface,
            (module_cls, args, kwargs, self.device, _module_interface_cls),
        )

    # Install generated methods.
    for generated in self.generated_methods:
        bound_name = generated.__name__
        exported = torch.jit.export(generated)
        setattr(self, bound_name, types.MethodType(exported, self))

    # Sanity check: whether to be pickled must be explicitly defined for every attribute.
    for attr_name in self.__dict__:
        if attr_name in _REMOTE_MODULE_PICKLED_ATTRIBUTES:
            continue
        if attr_name in _REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING:
            continue
        raise AttributeError(
            f"Attribute {attr_name} must be either in ``_REMOTE_MODULE_PICKLED_ATTRIBUTES`` or "
            "``_REMOTE_MODULE_ATTRIBUTES_IGNORE_FOR_PICKLING``.")
def __init__(
    self,
    name: str,
    rank: int = -1,
    world_size: int = -1,
    init_dist: bool = True,
    init_rpc: bool = True,
    dist_backend: str = "gloo",
    dist_init_method: str = "tcp://localhost:9100",
    rpc_init_method: str = "tcp://localhost:9101",
    dist_timeout: float = 60,
    rpc_timeout: float = 60,
):
    """
    Optionally initialize the distributed package and RPC for this process,
    then set up the bookkeeping tables.

    Args:
        name: A unique name to identify current process.
        rank: A unique rank of the current process. You do not need to specify
            it if you are using `torch.distributed.launch` or `torchelastic`
        world_size: Size of the distributed world. You do not need to specify
            it if you are using `torch.distributed.launch` or `torchelastic`
        init_dist: Whether to call ``dist.init_process_group`` here.
        init_rpc: Whether to call ``rpc.init_rpc`` here.
        dist_backend: Backend passed to ``dist.init_process_group``.
        dist_init_method: Init method passed to ``dist.init_process_group``.
        rpc_init_method: Init method passed to the TensorPipe RPC backend
            options.
        dist_timeout: Distributed package timeout in seconds.
        rpc_timeout: Global rpc call timeout in seconds.
    """
    self.world_size = world_size
    self.rank = rank
    self.name = name
    self.groups = {}
    self.group_create_signals = {}

    if init_dist:
        dist.init_process_group(
            backend=dist_backend,
            init_method=dist_init_method,
            timeout=timedelta(seconds=dist_timeout),
            rank=rank,
            world_size=world_size,
        )

    if init_rpc:
        tensorpipe_options = rpc.TensorPipeRpcBackendOptions(
            init_method=rpc_init_method, rpc_timeout=rpc_timeout
        )
        rpc.init_rpc(
            self.name,
            rank=rank,
            world_size=world_size,
            backend=rpc.BackendType.TENSORPIPE,
            rpc_backend_options=tensorpipe_options,
        )

    # Worker id -> worker name, for every worker the RPC agent knows about.
    self.rank_name_map = {
        info.id: info.name
        for info in rpc._get_current_rpc_agent().get_worker_infos()
    }

    # Start role dispatching.
    self.started = True
    self.rpc_timeout = rpc_timeout

    # Lookup tables for paired values and registered services, guarded by a
    # lock; the worker whose id is 0 is recorded as the table manager.
    self.value_lut = {}
    self.service_lut = {}
    self.lut_lock = Lock()
    self.lut_manager = self.rank_name_map[0]