def run_agent(run_id, etcd_host, etcd_port, start_method, worker_fn, worker_args=()):
    """Build a worker spec on top of an etcd rendezvous and run a local agent.

    Args:
        run_id: identifier used as the path component of the rendezvous URL.
        etcd_host: host of the etcd endpoint.
        etcd_port: port of the etcd endpoint.
        start_method: passed to ``LocalElasticAgent`` as its second argument.
        worker_fn: stored in the ``WorkerSpec`` as ``fn``.
        worker_args: stored in the ``WorkerSpec`` as ``args``.
    """
    # min_workers and max_workers are both pinned to 2 in the URL.
    rendezvous_url = (
        f"etcd://{etcd_host}:{etcd_port}/{run_id}?min_workers=2&max_workers=2"
    )
    handler = dist.rendezvous(rendezvous_url)

    trainer_spec = WorkerSpec(
        role="test_trainer",
        local_world_size=1,
        fn=worker_fn,
        args=worker_args,
        rdzv_handler=handler,
        max_restarts=3,
        monitor_interval=1,
    )

    # The value returned by ``run()`` is intentionally not propagated.
    LocalElasticAgent(trainer_spec, start_method).run()
def _get_worker_spec(
    self,
    max_restarts=1,
    monitor_interval=1.0,
    role="test_trainer",
    local_world_size=8,
):
    """Return a ``WorkerSpec`` whose worker function is ``do_nothing``.

    A fresh run id is generated on every call, and the rendezvous is opened
    against ``self._etcd_server`` with ``min_workers`` and ``max_workers``
    both set to 1.

    Args:
        max_restarts: forwarded to ``WorkerSpec``.
        monitor_interval: forwarded to ``WorkerSpec``.
        role: forwarded to ``WorkerSpec``.
        local_world_size: forwarded to ``WorkerSpec``.
    """
    # Decimal rendering of a random UUID, so each call gets its own run id.
    run_id = str(uuid.uuid4().int)
    endpoint = self._etcd_server.get_endpoint()
    handler = dist.rendezvous(
        f"etcd://{endpoint}/{run_id}?min_workers=1&max_workers=1"
    )
    return WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=do_nothing,
        args=(),
        rdzv_handler=handler,
        max_restarts=max_restarts,
        monitor_interval=monitor_interval,
    )
def _get_worker_spec(
    self,
    fn,
    args=(),
    max_restarts=1,
    num_agents=1,
    monitor_interval=0.1,
    local_world_size=8,
):
    """Return a ``test_trainer`` ``WorkerSpec`` backed by an etcd rendezvous.

    Args:
        fn: worker function stored in the spec.
        args: arguments stored in the spec alongside ``fn``.
        max_restarts: forwarded to ``WorkerSpec``.
        num_agents: used as both ``min_workers`` and ``max_workers`` in the
            rendezvous URL.
        monitor_interval: forwarded to ``WorkerSpec``.
        local_world_size: forwarded to ``WorkerSpec``.
    """
    # Decimal rendering of a random UUID, so each call gets its own run id.
    run_id = str(uuid.uuid4().int)
    endpoint = self._etcd_server.get_endpoint()
    rendezvous_url = (
        f"etcd://{endpoint}/{run_id}"
        f"?min_workers={num_agents}&max_workers={num_agents}"
    )
    handler = dist.rendezvous(rendezvous_url)
    return WorkerSpec(
        role="test_trainer",
        local_world_size=local_world_size,
        fn=fn,
        args=args,
        rdzv_handler=handler,
        max_restarts=max_restarts,
        monitor_interval=monitor_interval,
    )
def _get_worker_spec(
    self,
    fn,
    args=(),
    max_restarts=1,
    num_agents=1,
    monitor_interval=0.1,
    local_world_size=8,
):
    """Return a ``test_trainer`` ``WorkerSpec`` backed by a zeus-adapter rendezvous.

    The rendezvous URL targets ``localhost`` on ``self._mock_zeus_port``.

    Args:
        fn: worker function stored in the spec.
        args: arguments stored in the spec alongside ``fn``.
        max_restarts: forwarded to ``WorkerSpec``.
        num_agents: used as both ``min_size`` and ``max_size`` in the
            rendezvous URL.
        monitor_interval: forwarded to ``WorkerSpec``.
        local_world_size: forwarded to ``WorkerSpec``.
    """
    # Decimal rendering of a random UUID, so each call gets its own run id.
    run_id = str(uuid.uuid4().int)
    rendezvous_url = (
        f"zeus-adapter://localhost:{self._mock_zeus_port}/{run_id}"
        f"?min_size={num_agents}&max_size={num_agents}"
    )
    handler = dist.rendezvous(rendezvous_url)
    return WorkerSpec(
        role="test_trainer",
        local_world_size=local_world_size,
        fn=fn,
        args=args,
        rdzv_handler=handler,
        max_restarts=max_restarts,
        monitor_interval=monitor_interval,
    )
def test_unknown_handler(self):
    """``c10d.rendezvous("invalid://")`` must raise a ``RuntimeError``.

    The error message is required to start with "No rendezvous handler".
    """
    expected_failure = self.assertRaisesRegex(
        RuntimeError, "^No rendezvous handler"
    )
    with expected_failure:
        c10d.rendezvous("invalid://")
def test_url_with_node_params(self):
    """A URL that already carries ``rank``/``world_size`` must be rejected.

    The same values are also passed positionally, and the call is expected
    to fail with an ``AssertionError`` mentioning "has node-specific
    arguments".
    """
    url_with_node_args = "file://foo?rank=12&world_size=16"
    expected_failure = self.assertRaisesRegex(
        AssertionError, "has node-specific arguments"
    )
    with expected_failure:
        dist.rendezvous(url_with_node_args, 12, 16)
def init_rpc(
    name,
    backend=None,
    rank=-1,
    world_size=None,
    rpc_backend_options=None,
):
    r"""
    Set up the RPC layer for the calling process.

    The local RPC agent and distributed autograd are initialized, after
    which the current process is immediately ready to send and receive RPCs.

    Args:
        name (str): a globally unique name of this node (e.g.,
            ``Trainer3``, ``ParameterServer2``, ``Master``, ``Worker1``).
            The name may only contain numbers, letters, underscores, colons
            and/or dashes, and must be shorter than 128 characters.
        backend (BackendType, optional): the type of RPC backend
            implementation. The supported value is
            ``BackendType.TENSORPIPE`` (the default). When omitted while
            ``rpc_backend_options`` is given, the backend is inferred from
            the type of the options. See :ref:`rpc-backends` for more
            information.
        rank (int): a globally unique id/rank of this node.
        world_size (int): the number of workers in the group.
        rpc_backend_options (RpcBackendOptions, optional): the options
            passed to the RpcAgent constructor. It must be an
            agent-specific subclass of
            :class:`~torch.distributed.rpc.RpcBackendOptions` and contains
            agent-specific initialization configurations. By default, for
            all agents, it sets the default timeout to 60 seconds and
            performs the rendezvous with an underlying process group
            initialized using ``init_method = "env://"``, meaning that
            environment variables ``MASTER_ADDR`` and ``MASTER_PORT`` need
            to be set properly. See :ref:`rpc-backends` for more
            information and to find which options are available.

    Raises:
        TypeError: if ``backend`` is not a ``BackendType`` member, if
            ``rpc_backend_options`` is not an ``RpcBackendOptions``
            instance, or if no backend can be inferred from the options.
    """
    global rendezvous_iterator, _init_counter

    torch._C._log_api_usage_once("torch.distributed.init_rpc")

    # Validate the two optional arguments up front.
    backend_given = backend is not None
    options_given = rpc_backend_options is not None

    if backend_given and not isinstance(backend, backend_registry.BackendType):
        raise TypeError("Argument backend must be a member of BackendType")

    if options_given and not isinstance(rpc_backend_options, RpcBackendOptions):
        raise TypeError(
            "Argument rpc_backend_options must be an instance of RpcBackendOptions"
        )

    if options_given and not backend_given:
        # No backend was named: pick the first one whose default-constructed
        # options type matches the options we were handed.
        for known_backend in BackendType:
            default_options = backend_registry.construct_rpc_backend_options(
                known_backend
            )
            if isinstance(rpc_backend_options, type(default_options)):
                backend = known_backend
                break
        else:
            raise TypeError(
                f"Could not infer backend for options {rpc_backend_options}"
            )
        # Ignore type error because mypy doesn't handle dynamically generated type objects (#4865)
        if backend != BackendType.TENSORPIPE:  # type: ignore[attr-defined]
            logger.warning(
                f"RPC was initialized with no explicit backend but with options "  # type: ignore[attr-defined]
                f"corresponding to {backend}, hence that backend will be used "
                f"instead of the default {BackendType.TENSORPIPE}. To silence this "
                f"warning pass `backend={backend}` explicitly."
            )

    # Fall back to the defaults for whatever is still unspecified.
    if backend is None:
        backend = BackendType.TENSORPIPE  # type: ignore[attr-defined]

    if rpc_backend_options is None:
        rpc_backend_options = backend_registry.construct_rpc_backend_options(
            backend
        )

    # Rendezvous.
    # The iterator is stored in a module-level global so that the rendezvous
    # state stays alive: it was sometimes destroyed before every process had
    # finished handshaking.
    rendezvous_iterator = dist.rendezvous(
        rpc_backend_options.init_method, rank=rank, world_size=world_size
    )
    base_store, _, _ = next(rendezvous_iterator)

    # The store uses the same timeout as RPC itself.
    rpc_timeout = timedelta(seconds=rpc_backend_options.rpc_timeout)
    base_store.set_timeout(rpc_timeout)

    # Wrap the store in a PrefixStore keyed by a per-process counter so that
    # multiple invocations can be told apart.
    with _init_counter_lock:
        store_prefix = "rpc_prefix_{}".format(_init_counter)
        prefixed_store = dist.PrefixStore(store_prefix, base_store)
        _init_counter += 1

    # Autograd must be initialized before RPC: _init_rpc_backend guarantees
    # that all processes sync via the store. Doing it the other way round
    # would open a race in which some nodes have initialized autograd and
    # others have not, so a node calling
    # torch.distributed.autograd.backward() could hit errors because its
    # peers are not ready yet.
    dist_autograd._init(rank)

    _set_profiler_node_id(rank)

    # Initialize RPC.
    _init_rpc_backend(
        backend, prefixed_store, name, rank, world_size, rpc_backend_options
    )