def __init__(
    self,
    c10d_backend,
    init_method,
    max_num_trainers,
    process_group_timeout=10000,
    coordinator_pg_timeout=600000,  # default 10 mins for coordinator pg timeout
):
    """Initialize coordinator state and obtain the rendezvous handler.

    Args:
        c10d_backend: Stored unchanged on ``self.c10d_backend``.
        init_method: Passed to ``dist.rendezvous``; the object returned
            must be a ``RendezvousHandler``.
        max_num_trainers: Stored unchanged on ``self.max_num_trainers``.
        process_group_timeout: Timeout in milliseconds (unit taken from
            the assertion message below).
        coordinator_pg_timeout: Timeout in milliseconds; must be strictly
            greater than ``process_group_timeout``.

    Raises:
        AssertionError: If ``dist.rendezvous`` does not return a
            ``RendezvousHandler``, or if ``coordinator_pg_timeout`` is not
            strictly greater than ``process_group_timeout``.
    """
    self.c10d_backend = c10d_backend
    self.init_method = init_method
    self.rendezvous = dist.rendezvous(init_method)
    assert isinstance(
        self.rendezvous, RendezvousHandler
    ), "CoordinatorP2P requires a torchelastic.rendezvous.RendezvousHandler"

    # The comparison is strict, so equal timeouts are rejected; the message
    # states exactly that instead of the former "larger than or equal to".
    assert coordinator_pg_timeout > process_group_timeout, (
        "coordinator_pg_timeout {} (ms) must be larger than "
        "process_group_timeout {} (ms)".format(
            coordinator_pg_timeout, process_group_timeout
        )
    )

    self.max_num_trainers = max_num_trainers
    self.process_group_timeout = process_group_timeout
    self.coordinator_pg_timeout = coordinator_pg_timeout

    # Initial bookkeeping values; they are only assigned here.
    self.rank = -1
    self.world_size = 0
    self.is_worker_straggler = False
    self.stop_training = False
    self.coordinator_process_group = None
    self.monitor_progress_step = 0

    # Identity of the current process, captured once at construction time.
    self.host_name = socket.gethostname()
    self.pid = os.getpid()
    self.event_logger = get_event_logger()

    metrics.initialize_metrics()
def main(args=None):
    """Parse launcher arguments and run a ``LocalElasticAgent``.

    The function validates the node range and restart budget, creates the
    rendezvous handler, assembles the worker command line, and hands a
    ``WorkerSpec`` (role ``"default"``) to ``LocalElasticAgent.run``.

    Args:
        args: Forwarded to ``parse_args``. When ``None``, ``parse_args``
            picks the fallback (presumably ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is not positive.
        ValueError: If ``--no_python`` is given without ``--use_env``, or
            together with ``--module``.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts > 0

    handler = parameters.get_rendezvous(
        parameters.RendezvousParameters(
            args.rdzv_backend,
            args.rdzv_endpoint,
            args.rdzv_id,
            min_nodes,
            max_nodes,
            args.rdzv_conf,
        )
    )

    # Only pick a default thread count when the user has not set one and
    # several processes share this node.
    threads_per_proc = None
    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        threads_per_proc = 1
        notice = (
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{threads_per_proc} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************"
        )
        print(notice)

    if args.no_python:
        # Running a non-Python executable: reject incompatible flags first.
        if not args.use_env:
            raise ValueError("When using the '--no_python' flag,"
                             " you must also set the '--use_env' flag.")
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")
        launch_cmd = []
    else:
        launch_cmd = [sys.executable, "-u"]
        if args.module:
            launch_cmd.append("-m")

    launch_cmd.append(args.training_script)
    launch_cmd.extend(args.training_script_args)

    spec = WorkerSpec(
        role="default",
        local_world_size=args.nproc_per_node,
        fn=wrapper_fn,
        args=(threads_per_proc, args.use_env, launch_cmd),
        rdzv_handler=handler,
        max_restarts=args.max_restarts,
        monitor_interval=args.monitor_interval,
    )

    metrics.initialize_metrics()

    agent = LocalElasticAgent(spec, start_method=args.start_method)
    agent.run(spec.role)
def __init__(self, c10d_backend, init_method, max_num_trainers, process_group_timeout=10000):
    """Store the configuration and resolve the rendezvous handler.

    Args:
        c10d_backend: Kept as ``self.c10d_backend``.
        init_method: Given to ``dist.rendezvous``; the returned object must
            be a ``RendezvousHandler``.
        max_num_trainers: Kept as ``self.max_num_trainers``.
        process_group_timeout: Kept as ``self.process_group_timeout``
            (default ``10000``).

    Raises:
        AssertionError: If ``dist.rendezvous`` returns anything other than
            a ``RendezvousHandler``.
    """
    self.c10d_backend = c10d_backend
    self.init_method = init_method

    handler = dist.rendezvous(init_method)
    self.rendezvous = handler
    assert isinstance(handler, RendezvousHandler), (
        "CoordinatorP2P requires a torchelastic.rendezvous.RendezvousHandler"
    )

    self.max_num_trainers = max_num_trainers
    self.process_group_timeout = process_group_timeout

    # Starting values for the coordinator's bookkeeping attributes.
    self.rank, self.world_size = -1, 0
    self.is_worker_straggler = self.stop_training = False
    self.coordinator_process_group = None
    self.monitor_progress_step = 0

    metrics.initialize_metrics()
def main(args=None):
    """Parse launcher arguments, run a ``LocalElasticAgent`` and report the outcome.

    Steps, in order: validate the node range and restart budget; in
    standalone mode start a local ``EtcdServer`` and point the rendezvous
    arguments at it; default ``OMP_NUM_THREADS`` to 1 when it is unset and
    more than one process runs per node; assemble the worker command;
    create the rendezvous handler; run the agent and record status events.

    Args:
        args: Forwarded to ``parse_args``. When ``None``, ``parse_args``
            picks the fallback (presumably ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``--no_python`` and ``--module`` are combined.
        ChildFailedError: If the agent's run result reports a failure.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    # Stays None until the agent is built so the error path below can tell
    # whether an agent-specific event is available.
    elastic_agent = None

    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(f"\n**************************************\n"
                 f"Rendezvous info:\n"
                 f"--rdzv_backend={args.rdzv_backend} "
                 f"--rdzv_endpoint={args.rdzv_endpoint} "
                 f"--rdzv_id={args.rdzv_id}\n"
                 f"**************************************\n")

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rendezvous_config(args.rdzv_conf),
    )
    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            # The first command element is the entrypoint; the rest are its
            # arguments.
            entrypoint=cmd[0],
            args=tuple(cmd[1:]),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
            redirects=Std.from_str(args.redirects),
            tee=Std.from_str(args.tee),
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(
            spec=spec, start_method=args.start_method, log_dir=args.log_dir
        )
        run_result = elastic_agent.run(spec.role)
        # Recorded as soon as run() returns, before the result is inspected.
        events.record(
            elastic_agent.get_agent_status_event(WorkerState.SUCCEEDED))
        if run_result.is_failed():
            # ChildFailedError is treated specially by @record
            # if the error files for the failed children exist
            # @record will copy the first error (root cause)
            # to the error file of the launcher process
            raise ChildFailedError(
                name=args.training_script,
                failures=run_result.failures,
            )
    except ChildFailedError:
        # Child failures propagate without an extra FAILED event.
        raise
    except Exception:
        if elastic_agent:
            events.record(
                elastic_agent.get_agent_status_event(WorkerState.FAILED))
        else:
            events.record(_construct_event(args))
        raise
    finally:
        # Nested so that a failing shutdown() cannot skip stopping the
        # standalone etcd server started above.
        try:
            rdzv_handler.shutdown()
        finally:
            if args.standalone:
                etcd_server.stop()
def main(args=None):
    """Parse launcher arguments and run a ``LocalElasticAgent`` around ``wrapper_fn``.

    Steps, in order: validate the node range and restart budget; in
    standalone mode start a local ``EtcdServer`` and point the rendezvous
    arguments at it; choose a default OMP thread count of 1 when
    ``OMP_NUM_THREADS`` is unset and more than one process runs per node;
    assemble the worker command; create the rendezvous handler; run the
    agent; always shut the handler (and the standalone server) down.

    Args:
        args: Forwarded to ``parse_args``. When ``None``, ``parse_args``
            picks the fallback (presumably ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``--no_python`` and ``--module`` are combined.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(
            f"\n**************************************\n"
            f"Rendezvous info:\n"
            f"--rdzv_backend={args.rdzv_backend} "
            f"--rdzv_endpoint={args.rdzv_endpoint} "
            f"--rdzv_id={args.rdzv_id}\n"
            f"**************************************\n"
        )

    nproc_per_node = determine_local_world_size(args.nproc_per_node)

    # None means "leave the thread count alone"; the value is handed to
    # wrapper_fn through the WorkerSpec args below.
    omp_num_threads = None
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************"
        )

    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError(
                "Don't use both the '--no_python' flag"
                " and the '--module' flag at the same time."
            )

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rdzv_conf(args.rdzv_conf),
    )
    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            fn=wrapper_fn,
            args=(omp_num_threads, cmd),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        elastic_agent.run(spec.role)
    finally:
        # Nested so that a failing shutdown() cannot skip stopping the
        # standalone etcd server started above.
        try:
            rdzv_handler.shutdown()
        finally:
            if args.standalone:
                etcd_server.stop()
def main(args=None):
    """Parse launcher arguments, run a ``LocalElasticAgent`` and exit non-zero on failure.

    Steps, in order: validate the node range and restart budget; in
    standalone mode start a local ``EtcdServer`` and point the rendezvous
    arguments at it; default ``OMP_NUM_THREADS`` to 1 when it is unset and
    more than one process runs per node; assemble the worker command;
    create the rendezvous handler; run the agent. When the group result is
    failed, the failure with the lowest key in ``group_result.failures`` is
    processed, logged and turned into the process exit code.

    Args:
        args: Forwarded to ``parse_args``. When ``None``, ``parse_args``
            picks the fallback (presumably ``sys.argv[1:]`` -- confirm
            against ``parse_args``).

    Raises:
        AssertionError: If the node range is not ``0 < min <= max`` or
            ``max_restarts`` is negative.
        ValueError: If ``--no_python`` and ``--module`` are combined.
        SystemExit: With ``abs(failure.exit_code)`` when the run failed
            (unless ``process_failure`` raises first).
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(f"\n**************************************\n"
                 f"Rendezvous info:\n"
                 f"--rdzv_backend={args.rdzv_backend} "
                 f"--rdzv_endpoint={args.rdzv_endpoint} "
                 f"--rdzv_id={args.rdzv_id}\n"
                 f"**************************************\n")

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    omp_num_threads = None
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        # NOTE(review): kept inside the branch so the variable is never set
        # to the string "None"; confirm against the original layout.
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rdzv_conf(args.rdzv_conf),
    )
    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            cmd=cmd,
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        group_result = elastic_agent.run(spec.role)
        if group_result.is_failed():
            # Report the failure stored under the smallest key.
            min_rank = min(group_result.failures.keys())
            failure = group_result.failures[min_rank]
            # Note: this line will raise an exception to indicate to the
            # scheduler process that something went wrong.
            # If any workers wrote the error file, it will be propagated
            # to the scheduler specific destination.
            process_failure(failure)
            msg = f""" *********************************************************************** \n ***********************USER CODE FAILED WITH ERROR****************** \n\n {get_failure_message(failure)} \n ******************************************************************** \n\n ******************************************************************** \n """
            log.warning(msg)
            # Expected (0-127), 0 - success, anything else - failure
            sys.exit(abs(failure.exit_code))
    finally:
        # Each cleanup step runs even if the previous one raises, so a
        # failing shutdown() cannot leak the standalone etcd server or skip
        # cleanup().
        try:
            rdzv_handler.shutdown()
        finally:
            try:
                if args.standalone:
                    etcd_server.stop()
            finally:
                # NOTE(review): cleanup() is treated as unconditional (not
                # limited to standalone mode) -- confirm against the
                # original layout.
                cleanup()