def run_agent(run_id, etcd_host, etcd_port, start_method, worker_fn, worker_args=()):
    """Run ``worker_fn`` under a ``LocalElasticAgent`` using an etcd rendezvous.

    The rendezvous is fixed at exactly two nodes (``min_nodes == max_nodes == 2``)
    and the agent hosts a single local worker with up to three restarts.

    Args:
        run_id: Rendezvous run id.
        etcd_host: Host of the etcd endpoint.
        etcd_port: Port of the etcd endpoint.
        start_method: Start method handed to ``LocalElasticAgent``.
        worker_fn: Callable placed in the worker spec as ``fn``.
        worker_args: Positional arguments placed in the worker spec as ``args``.
    """
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=f"{etcd_host}:{etcd_port}",
            run_id=run_id,
            min_nodes=2,
            max_nodes=2,
        )
    )
    worker_spec = WorkerSpec(
        role="test_trainer",
        local_world_size=1,
        fn=worker_fn,
        args=worker_args,
        rdzv_handler=handler,
        max_restarts=3,
        monitor_interval=1,
    )
    # The run result is intentionally discarded; callers only need the side effects.
    LocalElasticAgent(worker_spec, start_method).run()
def test_run_bipolar_function(self):
    """Running ``_bipolar_function`` must raise, end FAILED and use up all restarts."""
    worker_spec = self._get_worker_spec(fn=_bipolar_function, max_restarts=2)
    elastic_agent = LocalElasticAgent(worker_spec, start_method="fork")

    # Callable form of assertRaises: equivalent to the context-manager form.
    self.assertRaises(Exception, elastic_agent.run)

    final_state = elastic_agent.get_worker_group().state
    self.assertEqual(WorkerState.FAILED, final_state)
    self.assertEqual(0, elastic_agent._remaining_restarts)
def main(args=None):
    """Launch the training command under a ``LocalElasticAgent``.

    Parses the launcher arguments, creates the rendezvous handler, assembles
    the worker command line and then blocks in ``LocalElasticAgent.run``.

    Args:
        args: Argument list forwarded to ``parse_args``. ``None`` is passed
            through unchanged (presumably meaning ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is not positive.
        ValueError: If ``--no_python`` is used without ``--use_env`` or
            together with ``--module``.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts > 0

    rdzv_handler = parameters.get_rendezvous(
        parameters.RendezvousParameters(
            args.rdzv_backend,
            args.rdzv_endpoint,
            args.rdzv_id,
            min_nodes,
            max_nodes,
            args.rdzv_conf,
        )
    )

    # Default to one OpenMP thread per worker only when the user has not set
    # the variable and more than one worker shares this node.
    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
    else:
        omp_num_threads = None

    # Assemble the worker command line.
    if args.no_python:
        if not args.use_env:
            raise ValueError("When using the '--no_python' flag,"
                             " you must also set the '--use_env' flag.")
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")
        cmd = []
    else:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    spec = WorkerSpec(
        role="default",
        local_world_size=args.nproc_per_node,
        fn=wrapper_fn,
        args=(omp_num_threads, args.use_env, cmd),
        rdzv_handler=rdzv_handler,
        max_restarts=args.max_restarts,
        monitor_interval=args.monitor_interval,
    )
    metrics.initialize_metrics()

    LocalElasticAgent(spec, start_method=args.start_method).run(spec.role)
def test_run_sad_function(self):
    """``_sad_function`` must fail the group with one exception per local worker."""
    worker_spec = self._get_worker_spec(fn=_sad_function, max_restarts=2)
    elastic_agent = LocalElasticAgent(worker_spec, start_method="fork")

    with self.assertRaises(WorkerGroupFailureException) as raised:
        elastic_agent.run()

    worker_excs = raised.exception.get_worker_exceptions()
    # Index by local rank, so a missing rank fails the test.
    for local_rank in range(worker_spec.local_world_size):
        self.assertTrue(isinstance(worker_excs[local_rank], Exception))

    final_state = elastic_agent.get_worker_group().state
    self.assertEqual(WorkerState.FAILED, final_state)
    self.assertEqual(0, elastic_agent._remaining_restarts)
def _run_agent(
    run_id,
    etcd_host,
    etcd_port,
    min_size,
    max_size,
    func_to_run,
    args,
    local_world_size=8,
    role="test_trainer",
    output_dict=None,
    agent_barrier_timeout=300,
):
    """Run ``func_to_run`` under a forked ``LocalElasticAgent`` with an etcd rendezvous.

    Args:
        run_id: Rendezvous run id, used as the path component of the etcd URL.
        etcd_host: Host of the etcd endpoint.
        etcd_port: Port of the etcd endpoint.
        min_size: Value of the ``min_workers`` URL parameter.
        max_size: Value of the ``max_workers`` URL parameter.
        func_to_run: Callable placed in the worker spec as ``fn``.
        args: Positional arguments placed in the worker spec as ``args``.
        local_world_size: Number of local workers.
        role: Role name stored in the worker spec.
        output_dict: Optional mapping; when given, ``(role, result)`` is
            stored under a freshly generated uuid-based key.
        agent_barrier_timeout: Passed to the agent as ``exit_barrier_timeout``.
    """
    rendezvous_url = (
        f"etcd://{etcd_host}:{etcd_port}/{run_id}"
        f"?min_workers={min_size}"
        f"&max_workers={max_size}"
    )
    worker_spec = WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=func_to_run,
        args=args,
        rdzv_handler=dist.rendezvous(rendezvous_url),
        max_restarts=2,
        monitor_interval=1,
    )
    elastic_agent = LocalElasticAgent(
        worker_spec,
        start_method="fork",
        exit_barrier_timeout=agent_barrier_timeout,
    )
    outcome = elastic_agent.run()

    if output_dict is None:
        return
    # A random key keeps results from several agents sharing one dict apart.
    output_dict[str(uuid.uuid4().int)] = (role, outcome)
def test_check_role_name(self):
    """Every worker must see the spec's role in its ``ROLE_NAME`` env variable.

    Runs ``_get_env_var("ROLE_NAME")`` in each worker and compares every
    returned value with ``spec.role``.
    """
    spec = self._get_worker_spec(fn=_get_env_var, args=("ROLE_NAME",))
    agent = LocalElasticAgent(spec, start_method="fork")
    group_result = agent.run()
    results = group_result.return_values
    for role_name in results.values():
        # ``assertEquals`` is a deprecated alias that was removed in
        # Python 3.12; ``assertEqual`` is the supported spelling.
        self.assertEqual(spec.role, role_name)
def _run_agent(run_id, etcd_host, etcd_port, min_size, max_size, wait=0):
    """Run ``_distributed_sum`` on eight forked local workers via an etcd rendezvous.

    Args:
        run_id: Rendezvous run id, used as the path component of the etcd URL.
        etcd_host: Host of the etcd endpoint.
        etcd_port: Port of the etcd endpoint.
        min_size: Value of the ``min_workers`` URL parameter.
        max_size: Value of the ``max_workers`` URL parameter.
        wait: Single positional argument passed to ``_distributed_sum``.
    """
    rendezvous_url = (
        f"etcd://{etcd_host}:{etcd_port}/{run_id}"
        f"?min_workers={min_size}"
        f"&max_workers={max_size}"
    )
    worker_spec = WorkerSpec(
        role="test_trainer",
        local_world_size=8,
        fn=_distributed_sum,
        args=(wait,),
        rdzv_handler=dist.rendezvous(rendezvous_url),
        max_restarts=2,
        monitor_interval=1,
    )
    LocalElasticAgent(worker_spec, start_method="fork").run()
def _test_failed_result_with_run_id(self):
    """Each failure's error file name must end with ``_<max_restarts>``."""
    max_restarts = 3
    expected_suffix = f"_{max_restarts}"

    worker_spec = self._get_worker_spec(fn=_sad_function, max_restarts=max_restarts)
    outcome = LocalElasticAgent(worker_spec, start_method="fork").run()

    for failed in outcome.failures.values():
        self.assertTrue(failed.error_file.endswith(expected_suffix))
def test_get_worker_return_values(self):
    """``_return_rank_times(2)`` must yield ``rank * 2`` for every local rank."""
    worker_spec = self._get_worker_spec(fn=_return_rank_times, args=(2,))
    returned = LocalElasticAgent(worker_spec, start_method="fork").run()

    world_size = worker_spec.local_world_size
    self.assertEqual(world_size, len(returned))
    for local_rank in range(world_size):
        self.assertEqual(local_rank * 2, returned[local_rank])
def test_run_check_run_id(self):
    """Workers must see the rendezvous run id in ``TORCHELASTIC_RUN_ID``."""

    def return_run_id():
        # Runs inside the worker process; reads the variable from its environment.
        return os.environ["TORCHELASTIC_RUN_ID"]

    worker_spec = self._get_worker_spec(fn=return_run_id, max_restarts=0)
    per_rank = LocalElasticAgent(worker_spec, start_method="fork").run()

    for local_rank in range(worker_spec.local_world_size):
        self.assertEqual(
            worker_spec.rdzv_handler.get_run_id(),
            per_rank[local_rank],
        )
def run_agent(
    run_id, etcd_host, etcd_port, start_method, worker_fn, worker_args=()
):
    """Run ``worker_fn`` under a ``LocalElasticAgent`` using an etcd URL rendezvous.

    The rendezvous URL pins ``min_workers`` and ``max_workers`` to 2, and the
    agent hosts a single local worker with up to three restarts.

    Args:
        run_id: Rendezvous run id, used as the path component of the etcd URL.
        etcd_host: Host of the etcd endpoint.
        etcd_port: Port of the etcd endpoint.
        start_method: Start method handed to ``LocalElasticAgent``.
        worker_fn: Callable placed in the worker spec as ``fn``.
        worker_args: Positional arguments placed in the worker spec as ``args``.
    """
    rendezvous_url = (
        f"etcd://{etcd_host}:{etcd_port}/{run_id}"
        f"?min_workers=2"
        f"&max_workers=2"
    )
    worker_spec = WorkerSpec(
        role="test_trainer",
        local_world_size=1,
        fn=worker_fn,
        args=worker_args,
        rdzv_handler=dist.rendezvous(rendezvous_url),
        max_restarts=3,
        monitor_interval=1,
    )
    LocalElasticAgent(worker_spec, start_method).run()
def _test_transient_bug(self, error_dir: str):
    """``_transient_bug`` must end SUCCEEDED leaving only the first error log per rank.

    Args:
        error_dir: Directory that holds one sub-directory per rank, in which
            ``error.log_0`` must exist and ``error.log_1`` must not.
    """
    max_restarts = 3
    worker_spec = self._get_worker_spec(fn=_transient_bug, max_restarts=max_restarts)
    outcome = LocalElasticAgent(worker_spec, start_method="fork").run()

    self.assertEqual(WorkerState.SUCCEEDED, outcome.state)

    rank_count = len(outcome.return_values)
    for rank in range(rank_count):
        rank_dir = os.path.join(error_dir, str(rank))
        first_log = os.path.join(rank_dir, "error.log_0")
        self.assertTrue(os.path.exists(first_log))
        second_log = os.path.join(rank_dir, "error.log_1")
        self.assertFalse(os.path.exists(second_log))
def test_run_segv_function(self):
    """Workers killed by SIGSEGV must surface as ``WorkerSignaledException``.

    The rendezvous handler is shut down whether or not the run behaves as
    expected; the assertions on the collected exceptions follow afterwards.
    """
    expected_failure = signal.SIGSEGV
    expected_error_index = 0
    worker_spec = self._get_worker_spec(
        fn=_fatal_signal_function,
        max_restarts=2,
        args=(expected_error_index, expected_failure),
    )

    try:
        elastic_agent = LocalElasticAgent(worker_spec, start_method="spawn")
        with self.assertRaises(WorkerGroupFailureException) as raised:
            elastic_agent.run()
    finally:
        worker_spec.rdzv_handler.shutdown()

    worker_excs = raised.exception.get_worker_exceptions()
    for local_rank in range(worker_spec.local_world_size):
        exc = worker_excs[local_rank]
        self.assertTrue(isinstance(exc, WorkerSignaledException))
        self.assertEqual(expected_failure.name, worker_excs[local_rank].signal_name)

    final_state = elastic_agent.get_worker_group().state
    self.assertEqual(WorkerState.FAILED, final_state)
    self.assertEqual(0, elastic_agent._remaining_restarts)
def _test_run_sad_function(self):
    """With no restarts, every rank must fail and leave an error file with the message."""
    worker_spec = self._get_worker_spec(fn=_sad_function, max_restarts=0)
    elastic_agent = LocalElasticAgent(worker_spec, start_method="fork")
    outcome = elastic_agent.run()

    failures = outcome.failures
    self.assertEqual(worker_spec.local_world_size, len(failures))

    # all ranks will have the same result
    for failed in failures.values():
        self.assertTrue(os.path.exists(failed.error_file))
        with open(failed.error_file, "r") as error_fp:
            contents = error_fp.read()
        # Newlines are stripped so the message matches even if it was wrapped.
        data = contents.replace("\n", "")
        self.assertTrue("RuntimeError: sad because i throw" in data)

    final_state = elastic_agent.get_worker_group().state
    self.assertEqual(WorkerState.FAILED, final_state)
    self.assertEqual(0, elastic_agent._remaining_restarts)
def _run_agent(
    run_id,
    etcd_host,
    etcd_port,
    min_size,
    max_size,
    func_to_run,
    args,
    local_world_size=8,
    role="test_trainer",
    output_dict=None,
    agent_barrier_timeout=300,
):
    """Run ``func_to_run`` under a forked ``LocalElasticAgent`` with an etcd rendezvous.

    Args:
        run_id: Rendezvous run id.
        etcd_host: Host of the etcd endpoint.
        etcd_port: Port of the etcd endpoint.
        min_size: Minimum number of nodes for the rendezvous.
        max_size: Maximum number of nodes for the rendezvous.
        func_to_run: Callable placed in the worker spec as ``fn``.
        args: Positional arguments placed in the worker spec as ``args``.
        local_world_size: Number of local workers.
        role: Role name stored in the worker spec.
        output_dict: Optional mapping; when given, ``(role, result)`` is
            stored under a freshly generated uuid-based key.
        agent_barrier_timeout: Passed to the agent as ``exit_barrier_timeout``.
    """
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=f"{etcd_host}:{etcd_port}",
            run_id=run_id,
            min_nodes=min_size,
            max_nodes=max_size,
        )
    )
    worker_spec = WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=func_to_run,
        args=args,
        rdzv_handler=handler,
        max_restarts=2,
        monitor_interval=1,
    )
    elastic_agent = LocalElasticAgent(
        worker_spec,
        start_method="fork",
        exit_barrier_timeout=agent_barrier_timeout,
    )
    outcome = elastic_agent.run()

    if output_dict is None:
        return
    # A random key keeps results from several agents sharing one dict apart.
    output_dict[str(uuid.uuid4().int)] = (role, outcome)
def test_barrier_failed(self, barrier_mock):
    """A barrier that raises must not make ``agent.run()`` raise.

    ``barrier_mock`` is configured to raise ``RuntimeError``; the run is
    expected to return normally and to have called the mock exactly once.
    """
    barrier_mock.side_effect = RuntimeError("test error")

    worker_spec = self._get_worker_spec(fn=_happy_function)
    elastic_agent = LocalElasticAgent(worker_spec, start_method="fork")
    elastic_agent.run()

    barrier_mock.assert_called_once()
def test_run_check_env_function(self):
    """Run ``_check_env_function`` on forked workers; passes if the run does not raise."""
    worker_spec = self._get_worker_spec(fn=_check_env_function, max_restarts=2)
    LocalElasticAgent(worker_spec, start_method="fork").run()
def test_run_distributed_sum(self):
    """Run ``_distributed_sum(0)`` on forked workers; passes if the run does not raise."""
    worker_spec = self._get_worker_spec(fn=_distributed_sum, args=(0,))
    LocalElasticAgent(worker_spec, start_method="fork").run()
def test_run_happy_function(self):
    """Run ``_happy_function`` on forked workers; passes if the run does not raise."""
    worker_spec = self._get_worker_spec(fn=_happy_function)
    LocalElasticAgent(worker_spec, start_method="fork").run()
def main(args=None):
    """Launch the training command under a ``LocalElasticAgent``.

    Parses the launcher arguments, optionally starts a standalone etcd
    server, assembles the worker command line and rendezvous handler, then
    blocks in ``LocalElasticAgent.run``.

    Args:
        args: Argument list forwarded to ``parse_args``. ``None`` is passed
            through unchanged (presumably meaning ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``--no_python`` and ``--module`` are combined.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    # Everything after the server has started runs under this try/finally so
    # that a failure during setup (bad flag combination, rendezvous handler
    # creation, ...) cannot leave the standalone etcd server running.
    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(
                f"\n**************************************\n"
                f"Rendezvous info:\n"
                f"--rdzv_backend={args.rdzv_backend} "
                f"--rdzv_endpoint={args.rdzv_endpoint} "
                f"--rdzv_id={args.rdzv_id}\n"
                f"**************************************\n"
            )

        nproc_per_node = determine_local_world_size(args.nproc_per_node)

        # Default to one OpenMP thread per worker only when the user has not
        # set the variable and more than one worker shares this node.
        omp_num_threads = None
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************"
            )

        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag"
                    " and the '--module' flag at the same time."
                )
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rdzv_conf(args.rdzv_conf),
        )
        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

        try:
            spec = WorkerSpec(
                role=args.role,
                local_world_size=nproc_per_node,
                fn=wrapper_fn,
                args=(omp_num_threads, cmd),
                rdzv_handler=rdzv_handler,
                max_restarts=args.max_restarts,
                monitor_interval=args.monitor_interval,
            )
            metrics.initialize_metrics()
            elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
            elastic_agent.run(spec.role)
        finally:
            # Shut the handler down before the server it may be talking to.
            rdzv_handler.shutdown()
    finally:
        if etcd_server is not None:
            etcd_server.stop()
def launch_agent(
    config: LaunchConfig,
    entrypoint: Union[Callable, str, None],
    args: List[Any],
) -> Dict[int, Any]:
    """Run ``entrypoint`` under a ``LocalElasticAgent`` built from ``config``.

    Args:
        config: Launch configuration. When ``config.run_id`` is falsy a new
            uuid-based id is generated and written back onto ``config``.
        entrypoint: Callable or command placed in the worker spec.
        args: Arguments for the entrypoint; converted to a tuple for the spec.

    Returns:
        ``result.return_values`` of the agent run when it did not fail.

    Raises:
        ChildFailedError: When the run result reports failure.
        Exception: Any other error is re-raised after a FAILED agent event
            (or a config-derived event if no agent exists yet) is recorded.
    """
    if not config.run_id:
        run_id = str(uuid.uuid4().int)
        logger.warning(f"config has no run_id, generate a new one: {run_id}")
        config.run_id = run_id

    entrypoint_name = _get_entrypoint_name(entrypoint, args)

    logger.info(
        f"Starting elastic_operator with launch configs:\n"
        f" entrypoint : {entrypoint_name}\n"
        f" min_nodes : {config.min_nodes}\n"
        f" max_nodes : {config.max_nodes}\n"
        f" nproc_per_node : {config.nproc_per_node}\n"
        f" run_id : {config.run_id}\n"
        f" rdzv_backend : {config.rdzv_backend}\n"
        f" rdzv_endpoint : {config.rdzv_endpoint}\n"
        f" rdzv_configs : {config.rdzv_configs}\n"
        f" max_restarts : {config.max_restarts}\n"
        f" monitor_interval : {config.monitor_interval}\n"
        f" log_dir : {config.log_dir}\n"
        f" metrics_cfg : {config.metrics_cfg}\n"
    )

    rdzv_handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend=config.rdzv_backend,
            endpoint=config.rdzv_endpoint,
            run_id=config.run_id,
            min_nodes=config.min_nodes,
            max_nodes=config.max_nodes,
            **config.rdzv_configs,
        )
    )

    # Stays None until the agent is constructed, so the error path can tell
    # whether an agent exists to report status from.
    agent = None
    try:
        spec = WorkerSpec(
            role=config.role,
            local_world_size=config.nproc_per_node,
            entrypoint=entrypoint,
            args=tuple(args),
            rdzv_handler=rdzv_handler,
            max_restarts=config.max_restarts,
            monitor_interval=config.monitor_interval,
            redirects=config.redirects,
            tee=config.tee,
        )

        if config.metrics_cfg:
            cfg = metrics.MetricsConfig(config.metrics_cfg)
        else:
            cfg = None
        metrics.initialize_metrics(cfg)

        agent = LocalElasticAgent(
            spec=spec, start_method=config.start_method, log_dir=config.log_dir
        )

        result = agent.run()
        events.record(agent.get_agent_status_event(WorkerState.SUCCEEDED))

        if not result.is_failed():
            return result.return_values

        # ChildFailedError is treated specially by @record
        # if the error files for the failed children exist
        # @record will copy the first error (root cause)
        # to the error file of the launcher process.
        raise ChildFailedError(
            name=entrypoint_name,
            failures=result.failures,
        )
    except ChildFailedError:
        # Propagate untouched: no FAILED agent event is recorded for it here.
        raise
    except Exception:
        if agent:
            events.record(agent.get_agent_status_event(WorkerState.FAILED))
        else:
            events.record(_construct_event(config))
        raise
    finally:
        rdzv_handler.shutdown()
def main(args=None):
    """Launch the training command under a ``LocalElasticAgent`` and report the outcome.

    Parses the launcher arguments, optionally starts a standalone etcd
    server, assembles the worker command line and rendezvous handler, runs
    the agent and records agent status events.

    Args:
        args: Argument list forwarded to ``parse_args``. ``None`` is passed
            through unchanged (presumably meaning ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``--no_python`` and ``--module`` are combined.
        ChildFailedError: When the run result reports failure.
        Exception: Any other error from the run phase is re-raised after an
            event has been recorded.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    elastic_agent = None

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    # Everything after the server has started runs under this try/finally so
    # that a failure during setup (bad flag combination, rendezvous handler
    # creation, ...) cannot leave the standalone etcd server running.
    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(f"\n**************************************\n"
                     f"Rendezvous info:\n"
                     f"--rdzv_backend={args.rdzv_backend} "
                     f"--rdzv_endpoint={args.rdzv_endpoint} "
                     f"--rdzv_id={args.rdzv_id}\n"
                     f"**************************************\n")

        nproc_per_node = determine_local_world_size(args.nproc_per_node)

        # Default to one OpenMP thread per worker only when the user has not
        # set the variable and more than one worker shares this node.
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************")
            # This env variable will be passed down to the subprocesses
            os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if args.module:
                raise ValueError("Don't use both the '--no_python' flag"
                                 " and the '--module' flag at the same time.")
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rendezvous_config(args.rdzv_conf),
        )
        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

        # The event-recording handlers deliberately cover only the run phase,
        # so setup errors above do not start emitting events.
        try:
            spec = WorkerSpec(
                role=args.role,
                local_world_size=nproc_per_node,
                # First element is the executable, the rest are its arguments.
                entrypoint=cmd[0],
                args=(*cmd[1:], ),
                rdzv_handler=rdzv_handler,
                max_restarts=args.max_restarts,
                monitor_interval=args.monitor_interval,
                redirects=Std.from_str(args.redirects),
                tee=Std.from_str(args.tee),
            )
            metrics.initialize_metrics()
            elastic_agent = LocalElasticAgent(spec=spec,
                                              start_method=args.start_method,
                                              log_dir=args.log_dir)
            run_result = elastic_agent.run(spec.role)
            events.record(
                elastic_agent.get_agent_status_event(WorkerState.SUCCEEDED))
            if run_result.is_failed():
                # ChildFailedError is treated specially by @record
                # if the error files for the failed children exist
                # @record will copy the first error (root cause)
                # to the error file of the launcher process
                raise ChildFailedError(
                    name=args.training_script,
                    failures=run_result.failures,
                )
        except ChildFailedError:
            # Propagate untouched: no FAILED agent event is recorded for it here.
            raise
        except Exception:
            if elastic_agent:
                events.record(
                    elastic_agent.get_agent_status_event(WorkerState.FAILED))
            else:
                events.record(_construct_event(args))
            raise
        finally:
            # Shut the handler down before the server it may be talking to.
            rdzv_handler.shutdown()
    finally:
        if etcd_server is not None:
            etcd_server.stop()
def main(args=None):
    """Launch the training command under a ``LocalElasticAgent``; exit non-zero on failure.

    Parses the launcher arguments, optionally starts a standalone etcd
    server, assembles the worker command line and rendezvous handler and runs
    the agent. When the group result reports failure, the failure with the
    lowest rank is processed and logged, and the process exits with the
    absolute value of that failure's exit code.

    Args:
        args: Argument list forwarded to ``parse_args``. ``None`` is passed
            through unchanged (presumably meaning ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``--no_python`` and ``--module`` are combined.
        SystemExit: When the worker group failed.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()

    # Set once the rendezvous handler exists; ``cleanup()`` is only called on
    # those paths because its requirements before that point are not visible
    # from this function.
    run_phase_entered = False

    # Everything after the server has started runs under this try/finally so
    # that a failure during setup (bad flag combination, rendezvous handler
    # creation, ...) cannot leave the standalone etcd server running.
    try:
        if etcd_server is not None:
            args.rdzv_backend = "etcd"
            args.rdzv_endpoint = etcd_server.get_endpoint()
            args.rdzv_id = str(uuid.uuid4())
            log.info(f"\n**************************************\n"
                     f"Rendezvous info:\n"
                     f"--rdzv_backend={args.rdzv_backend} "
                     f"--rdzv_endpoint={args.rdzv_endpoint} "
                     f"--rdzv_id={args.rdzv_id}\n"
                     f"**************************************\n")

        nproc_per_node = determine_local_world_size(args.nproc_per_node)

        # Default to one OpenMP thread per worker only when the user has not
        # set the variable and more than one worker shares this node.
        omp_num_threads = None
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************")
            # This env variable will be passed down to the subprocesses
            os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if args.module:
                raise ValueError("Don't use both the '--no_python' flag"
                                 " and the '--module' flag at the same time.")
        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rdzv_conf(args.rdzv_conf),
        )
        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

        run_phase_entered = True
        try:
            spec = WorkerSpec(
                role=args.role,
                local_world_size=nproc_per_node,
                cmd=cmd,
                rdzv_handler=rdzv_handler,
                max_restarts=args.max_restarts,
                monitor_interval=args.monitor_interval,
            )
            metrics.initialize_metrics()
            elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
            group_result = elastic_agent.run(spec.role)
            if group_result.is_failed():
                # Report the failure of the lowest failed rank.
                min_rank = min(group_result.failures.keys())
                failure = group_result.failures[min_rank]
                # Note: this line will raise an exception to indicate to the
                # scheduler process that something went wrong.
                # If any workers wrote the error file, it will be propagated
                # to the scheduler specific destination.
                process_failure(failure)
                msg = f""" *********************************************************************** \n ***********************USER CODE FAILED WITH ERROR****************** \n\n {get_failure_message(failure)} \n ******************************************************************** \n\n ******************************************************************** \n """
                log.warning(msg)
                # Expected (0-127), 0 - success, anything else - failure
                sys.exit(abs(failure.exit_code))
        finally:
            # Shut the handler down before the server it may be talking to.
            rdzv_handler.shutdown()
    finally:
        if etcd_server is not None:
            etcd_server.stop()
        if run_phase_entered:
            cleanup()