def run(args):
    """Launch the entrypoint described by ``args`` through ``elastic_launch``.

    When ``args.standalone`` is truthy, a local ``EtcdServer`` is started and
    ``args.rdzv_backend``, ``args.rdzv_endpoint`` and ``args.rdzv_id`` are
    overwritten so the launch rendezvouses against it; the chosen values are
    logged.

    Once the server has been started it is stopped on every exit path,
    including a failure while building the launch config from ``args``.

    Args:
        args: parsed launcher arguments; mutated in place in standalone mode.
    """
    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        # Built inside the try so that a bad argument set cannot leave the
        # standalone etcd server running.
        config, cmd = config_from_args(args)
        elastic_launch(
            config=config,
            entrypoint=cmd[0],
        )(*cmd[1:])
    finally:
        if etcd_server is not None:
            etcd_server.stop()
def elastic_launch_wrapper():
    """Run ``bin/test_script.py`` through ``elastic_launch``.

    ``elastic_launch`` is a class, so it is wrapped in a plain function here
    in order to make multiprocess return the correct exit code.
    """
    launch_config = self.get_test_launch_config(
        min_nodes, max_nodes, nproc_per_node, run_id
    )
    launcher = elastic_launch(launch_config, sys.executable)
    launcher(
        "-u",
        path("bin/test_script.py"),
        f"--touch_file_dir={self.test_dir}",
    )
def test_launch_elastic(self):
    """Launch the test script with a 1..2 node config and verify the workers ran."""
    nproc_per_node = 4
    launch_config = self.get_test_launch_config(1, 2, nproc_per_node)
    launcher = elastic_launch(launch_config, sys.executable)
    launcher(
        "-u",
        path("bin/test_script.py"),
        f"--touch_file_dir={self.test_dir}",
    )

    # The expected world size is taken to be the per-node process count.
    self.check_works_ran(nproc_per_node)
def test_launch_script_bash(self):
    """Launch the bash test script on a single node and verify the workers ran."""
    nnodes, nproc_per_node = 1, 4
    launch_config = self.get_test_launch_config(nnodes, nnodes, nproc_per_node)
    launcher = elastic_launch(launch_config, path("bin/test_script.sh"))
    # The script receives the test directory as its only argument.
    launcher(f"{self.test_dir}")

    self.check_works_ran(nnodes * nproc_per_node)
def test_launch_elastic_agent_raise_exception(self, record_mock, mock_agent_run):
    """Check that an exception raised by the agent is re-raised by the launcher.

    The mocked agent run is set to raise ``MockException``; the launch must
    surface that same exception type, and ``record_mock`` must have been
    called exactly once.
    """
    mock_agent_run.side_effect = MockException

    with self.assertRaises(MockException):
        launcher = elastic_launch(
            self.get_test_launch_config(1, 2, 4), sys.executable
        )
        launcher(
            "-u",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        )

    record_mock.assert_called_once()
def test_launch_script_python_local_rank_transfer(self):
    """Launch the Python test script on a single node and verify the workers ran."""
    nnodes, nproc_per_node = 1, 4
    launch_config = self.get_test_launch_config(nnodes, nnodes, nproc_per_node)
    launcher = elastic_launch(launch_config, sys.executable)
    launcher(
        "-u",
        path("bin/test_script.py"),
        f"--touch_file_dir={self.test_dir}",
    )

    # Make sure all the workers ran: each worker touches a file named after
    # its global rank.
    self.check_works_ran(nnodes * nproc_per_node)
def elastic_launch_wrapper(
    test_dir: str,
    rdzv_endpoint: str,
    min_nodes: int,
    max_nodes: int,
    nproc_per_node: int,
    run_id: str,
):
    """Run ``bin/test_script.py`` through ``elastic_launch``.

    ``elastic_launch`` is a class, so it is wrapped in a plain function here
    in order to make multiprocess return the correct exit code.

    Args:
        test_dir: directory passed to the script as ``--touch_file_dir``.
        rdzv_endpoint: forwarded to ``get_test_launch_config``.
        min_nodes: forwarded to ``get_test_launch_config``.
        max_nodes: forwarded to ``get_test_launch_config``.
        nproc_per_node: forwarded to ``get_test_launch_config``.
        run_id: forwarded to ``get_test_launch_config``.
    """
    launch_config = get_test_launch_config(
        rdzv_endpoint, min_nodes, max_nodes, nproc_per_node, run_id
    )
    launcher = elastic_launch(launch_config, sys.executable)
    launcher(
        "-u",
        path("bin/test_script.py"),
        f"--touch_file_dir={test_dir}",
    )
def test_launch_elastic_worker_raise_exception(self, record_mock):
    """Check that a failing worker program surfaces as ``ChildFailedError``.

    The test script is started with ``--fail``; the launcher must raise to
    indicate that the worker process failed, and ``record_mock`` must have
    been called exactly once.
    """
    nproc_per_node = 4

    with self.assertRaises(ChildFailedError):
        launcher = elastic_launch(
            self.get_test_launch_config(1, 2, nproc_per_node), sys.executable
        )
        launcher("-u", path("bin/test_script.py"), "--fail")

    record_mock.assert_called_once()
def test_launch_shutdown(self, agent_mock_cls):
    """Check that the rendezvous handler is shut down once after a launch.

    The agent class is mocked so that ``run`` reports
    ``WorkerState.SUCCEEDED``, and the rendezvous handler lookup is patched
    to hand back a mock whose ``shutdown`` call count is asserted.
    """
    fake_agent = Mock()
    fake_agent.run.return_value = RunResult(WorkerState.SUCCEEDED)
    agent_mock_cls.return_value = fake_agent

    fake_rdzv_handler = Mock()
    handler_lookup = "torch.distributed.elastic.rendezvous.registry.get_rendezvous_handler"
    with patch(handler_lookup) as param_mock:
        param_mock.return_value = fake_rdzv_handler

        launcher = elastic_launch(
            self.get_test_launch_config(1, 1, 4), sys.executable
        )
        launcher(
            "-u",
            path("bin/test_script.py"),
            f"--touch_file_dir={self.test_dir}",
        )

        fake_rdzv_handler.shutdown.assert_called_once()
def test_launch_dist_sum_with_static_rdzv(self):
    """Launch ``_dist_sum`` with the ``static`` rendezvous backend on one node.

    Every worker is expected to return ``sum(range(nproc_per_node))``.
    """
    nnodes, nproc_per_node = 1, 4

    # Read a port number off a temporary socket; the socket is closed again
    # before the launch so only the number is carried forward.
    with closing(get_socket_with_port()) as sock:
        master_port = sock.getsockname()[1]

    launch_config = get_test_launch_config(
        f"127.0.0.1:{master_port}",
        nnodes,
        nnodes,
        nproc_per_node,
        rdzv_backend="static",
        config={"rank": 0},
    )
    res = elastic_launch(launch_config, _dist_sum)()

    expected_res = [sum(range(nproc_per_node))] * nproc_per_node
    actual_res = sorted(res.values())
    self.assertEqual(expected_res, actual_res)
def run(args):
    """Launch the entrypoint described by ``args`` through ``elastic_launch``.

    When ``args.standalone`` is truthy, the rendezvous settings on ``args``
    are overwritten with the ``c10d`` backend, the ``localhost:29400``
    endpoint and a freshly generated run id, and the chosen values are logged.

    Args:
        args: parsed launcher arguments; mutated in place in standalone mode.
    """
    if args.standalone:
        args.rdzv_backend = "c10d"
        args.rdzv_endpoint = "localhost:29400"
        args.rdzv_id = str(uuid.uuid4())

        separator = "**************************************"
        log.info(
            f"\n{separator}\n"
            f"Rendezvous info:\n"
            f"--rdzv_backend={args.rdzv_backend} "
            f"--rdzv_endpoint={args.rdzv_endpoint} "
            f"--rdzv_id={args.rdzv_id}\n"
            f"{separator}\n"
        )

    config, entrypoint, entrypoint_args = config_from_args(args)
    launcher = elastic_launch(config=config, entrypoint=entrypoint)
    launcher(*entrypoint_args)
def launch(
    fn,
    n_gpu_per_machine,
    n_machine=1,
    machine_rank=0,
    dist_url=None,
    launch_config=None,
    args=(),
):
    """Run ``fn(*args)`` directly or fan it out to worker processes.

    The world size is ``n_machine * n_gpu_per_machine``. When it is at most
    one, ``fn(*args)`` is called in the current process. Otherwise
    ``OMP_NUM_THREADS`` is set to ``"1"`` unless already present in the
    environment, and then either:

    * ``launch_config`` is not ``None``: the job is handed to
      ``elastic_launch`` with ``elastic_worker`` as entrypoint, called with
      ``(fn, args)``; or
    * ``mp.spawn`` starts ``n_gpu_per_machine`` non-daemonic
      ``distributed_worker`` processes.

    Args:
        fn: callable to run.
        n_gpu_per_machine: number of processes to spawn on this machine.
        n_machine: number of machines taking part in the job.
        machine_rank: forwarded to ``distributed_worker``.
        dist_url: init URL forwarded to ``distributed_worker``. ``"auto"``
            picks a free port on 127.0.0.1 and is single-machine only.
        launch_config: optional config for ``elastic_launch``; when given,
            ``dist_url`` is not consulted.
        args: positional arguments for ``fn``.

    Raises:
        ValueError: for a multi-machine job when ``dist_url`` is ``"auto"``,
            is ``None``, or starts with ``file://``.
    """
    world_size = n_machine * n_gpu_per_machine
    if world_size <= 1:
        fn(*args)
        return

    os.environ.setdefault("OMP_NUM_THREADS", "1")

    if launch_config is not None:
        elastic_launch(config=launch_config, entrypoint=elastic_worker)(fn, args)
        return

    if dist_url == "auto":
        if n_machine != 1:
            raise ValueError(
                'dist_url="auto" not supported in multi-machine jobs')
        port = find_free_port()
        dist_url = f"tcp://127.0.0.1:{port}"

    if n_machine > 1:
        # Without this check the default dist_url=None fails below with an
        # opaque AttributeError on ``None.startswith``.
        if dist_url is None:
            raise ValueError(
                "dist_url must be provided in multi-machine jobs")
        if dist_url.startswith("file://"):
            raise ValueError(
                "file:// is not a reliable init method in multi-machine jobs. Prefer tcp://"
            )

    mp.spawn(
        distributed_worker,
        nprocs=n_gpu_per_machine,
        args=(fn, world_size, n_gpu_per_machine, machine_rank, dist_url, args),
        daemon=False,
    )
def test_launch_function(self):
    """Launch ``simple_rank_scale`` on one node and check the sorted results."""
    nnodes, nproc_per_node = 1, 4
    launch_config = self.get_test_launch_config(nnodes, nnodes, nproc_per_node)
    res = elastic_launch(launch_config, simple_rank_scale)()

    # Results are compared in sorted order against the four expected values.
    self.assertEqual([10, 11, 12, 13], sorted(res.values()))