Exemplo n.º 1
0
        def run_agent(
            run_id, etcd_host, etcd_port, start_method, worker_fn, worker_args=()
        ):
            """Run ``worker_fn`` under a ``LocalElasticAgent`` using an etcd rendezvous.

            The rendezvous is pinned to exactly two nodes (``min_nodes`` and
            ``max_nodes`` are both 2) and the agent manages one local worker.

            Args:
                run_id: run id given to the rendezvous parameters.
                etcd_host: host part of the etcd endpoint.
                etcd_port: port part of the etcd endpoint.
                start_method: passed positionally to ``LocalElasticAgent``.
                worker_fn: callable handed to ``WorkerSpec`` as ``fn``.
                worker_args: arguments handed to ``WorkerSpec`` as ``args``.
            """
            endpoint = f"{etcd_host}:{etcd_port}"
            handler = rdzv_registry.get_rendezvous_handler(
                RendezvousParameters(
                    backend="etcd",
                    endpoint=endpoint,
                    run_id=run_id,
                    min_nodes=2,
                    max_nodes=2,
                )
            )

            worker_spec = WorkerSpec(
                role="test_trainer",
                local_world_size=1,
                fn=worker_fn,
                args=worker_args,
                rdzv_handler=handler,
                max_restarts=3,
                monitor_interval=1,
            )

            # The run result is intentionally discarded, as in the original.
            LocalElasticAgent(worker_spec, start_method).run()
Exemplo n.º 2
0
 def test_run_bipolar_function(self):
     """Expect ``agent.run()`` to raise and the group to end FAILED with no restarts left."""
     agent = LocalElasticAgent(
         self._get_worker_spec(fn=_bipolar_function, max_restarts=2),
         start_method="fork",
     )

     # Callable form of assertRaises; equivalent to the context-manager form.
     self.assertRaises(Exception, agent.run)

     self.assertEqual(WorkerState.FAILED, agent.get_worker_group().state)
     self.assertEqual(0, agent._remaining_restarts)
Exemplo n.º 3
0
def main(args=None):
    """Parse launcher arguments and run the training command under a local agent.

    Args:
        args: argument list handed to ``parse_args``. The original note says
            that, when omitted, it defaults to ``sys.argv[:1]`` -- presumably
            ``sys.argv[1:]`` was meant; TODO confirm against ``parse_args``.

    Raises:
        AssertionError: if the parsed node range violates ``0 < min <= max``
            or ``max_restarts`` is not strictly positive.
        ValueError: if ``--no_python`` is given without ``--use_env``, or
            together with ``--module``.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts > 0

    rdzv_handler = parameters.get_rendezvous(
        parameters.RendezvousParameters(
            args.rdzv_backend,
            args.rdzv_endpoint,
            args.rdzv_id,
            min_nodes,
            max_nodes,
            args.rdzv_conf,
        )
    )

    # Only pick a default thread count when the user has not set the variable
    # and more than one process per node is requested.
    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
    else:
        omp_num_threads = None

    # Assemble the worker command line.
    if args.no_python:
        if not args.use_env:
            raise ValueError("When using the '--no_python' flag,"
                             " you must also set the '--use_env' flag.")
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")
        launch_cmd = []
    else:
        launch_cmd = [sys.executable, "-u"]
        if args.module:
            launch_cmd.append("-m")

    launch_cmd.append(args.training_script)
    launch_cmd.extend(args.training_script_args)

    worker_spec = WorkerSpec(
        role="default",
        local_world_size=args.nproc_per_node,
        fn=wrapper_fn,
        args=(omp_num_threads, args.use_env, launch_cmd),
        rdzv_handler=rdzv_handler,
        max_restarts=args.max_restarts,
        monitor_interval=args.monitor_interval,
    )
    metrics.initialize_metrics()
    LocalElasticAgent(worker_spec, start_method=args.start_method).run(
        worker_spec.role
    )
Exemplo n.º 4
0
    def test_run_sad_function(self):
        """Check that a failing worker function surfaces as a group failure.

        ``agent.run()`` must raise ``WorkerGroupFailureException`` carrying an
        ``Exception`` for every local worker index. Afterwards the worker
        group must be FAILED and no restarts may remain.
        """
        spec = self._get_worker_spec(fn=_sad_function, max_restarts=2)
        agent = LocalElasticAgent(spec, start_method="fork")
        with self.assertRaises(WorkerGroupFailureException) as cm:
            agent.run()

        excs = cm.exception.get_worker_exceptions()
        for i in range(spec.local_world_size):
            # assertIsInstance reports the offending object and expected type
            # on failure, unlike assertTrue(isinstance(...)).
            self.assertIsInstance(excs[i], Exception)

        self.assertEqual(WorkerState.FAILED, agent.get_worker_group().state)
        self.assertEqual(0, agent._remaining_restarts)
Exemplo n.º 5
0
def _run_agent(
    run_id,
    etcd_host,
    etcd_port,
    min_size,
    max_size,
    func_to_run,
    args,
    local_world_size=8,
    role="test_trainer",
    output_dict=None,
    agent_barrier_timeout=300,
):
    """Run ``func_to_run`` under a fork-started ``LocalElasticAgent``.

    The rendezvous handler is obtained from ``dist.rendezvous`` with an
    ``etcd://`` URL built from the host, port, run id and worker bounds.

    Args:
        run_id: path component of the rendezvous URL.
        etcd_host: etcd host used in the URL.
        etcd_port: etcd port used in the URL.
        min_size: value of the ``min_workers`` query parameter.
        max_size: value of the ``max_workers`` query parameter.
        func_to_run: callable handed to ``WorkerSpec`` as ``fn``.
        args: arguments handed to ``WorkerSpec`` as ``args``.
        local_world_size: ``local_world_size`` of the worker spec.
        role: ``role`` of the worker spec.
        output_dict: optional mapping; when given, ``(role, result)`` is
            stored in it under a freshly generated UUID-based key.
        agent_barrier_timeout: forwarded as ``exit_barrier_timeout``.
    """
    rdzv_url = (
        f"etcd://{etcd_host}:{etcd_port}/{run_id}"
        f"?min_workers={min_size}"
        f"&max_workers={max_size}"
    )
    worker_spec = WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=func_to_run,
        args=args,
        rdzv_handler=dist.rendezvous(rdzv_url),
        max_restarts=2,
        monitor_interval=1,
    )

    outcome = LocalElasticAgent(
        worker_spec,
        start_method="fork",
        exit_barrier_timeout=agent_barrier_timeout,
    ).run()

    if output_dict is None:
        return
    entry_key = str(uuid.uuid4().int)
    output_dict[entry_key] = (role, outcome)
Exemplo n.º 6
0
 def test_check_role_name(self):
     """Check that every worker's returned ``ROLE_NAME`` value equals ``spec.role``."""
     spec = self._get_worker_spec(fn=_get_env_var, args=("ROLE_NAME",))
     agent = LocalElasticAgent(spec, start_method="fork")
     group_result = agent.run()
     results = group_result.return_values
     for role_name in results.values():
         # assertEquals is a deprecated alias that was removed in Python 3.12.
         self.assertEqual(spec.role, role_name)
Exemplo n.º 7
0
def _run_agent(run_id, etcd_host, etcd_port, min_size, max_size, wait=0):
    """Run ``_distributed_sum`` on 8 local workers under a fork-started agent.

    Args:
        run_id: path component of the ``etcd://`` rendezvous URL.
        etcd_host: etcd host used in the URL.
        etcd_port: etcd port used in the URL.
        min_size: value of the ``min_workers`` query parameter.
        max_size: value of the ``max_workers`` query parameter.
        wait: single argument passed through to ``_distributed_sum``.
    """
    rdzv_url = (
        f"etcd://{etcd_host}:{etcd_port}/{run_id}"
        f"?min_workers={min_size}"
        f"&max_workers={max_size}"
    )
    worker_spec = WorkerSpec(
        role="test_trainer",
        local_world_size=8,
        fn=_distributed_sum,
        args=(wait, ),
        rdzv_handler=dist.rendezvous(rdzv_url),
        max_restarts=2,
        monitor_interval=1,
    )

    LocalElasticAgent(worker_spec, start_method="fork").run()
Exemplo n.º 8
0
 def _test_failed_result_with_run_id(self):
     """Check that every failure's error file name ends with ``_<max_restarts>``."""
     max_restarts = 3
     expected_suffix = f"_{max_restarts}"

     agent = LocalElasticAgent(
         self._get_worker_spec(fn=_sad_function, max_restarts=max_restarts),
         start_method="fork",
     )
     failures = agent.run().failures

     for failure in failures.values():
         self.assertTrue(failure.error_file.endswith(expected_suffix))
Exemplo n.º 9
0
    def test_get_worker_return_values(self):
        """Check that the run yields one value per local worker, equal to index * 2."""
        multiplier = 2
        spec = self._get_worker_spec(fn=_return_rank_times, args=(multiplier,))
        ret_vals = LocalElasticAgent(spec, start_method="fork").run()

        world_size = spec.local_world_size
        self.assertEqual(world_size, len(ret_vals))
        for rank in range(world_size):
            self.assertEqual(rank * multiplier, ret_vals[rank])
Exemplo n.º 10
0
    def test_run_check_run_id(self):
        """Check that each worker sees the rendezvous run id in ``TORCHELASTIC_RUN_ID``."""

        def return_run_id():
            # Executed inside the worker; reads the variable from its environment.
            return os.environ["TORCHELASTIC_RUN_ID"]

        spec = self._get_worker_spec(fn=return_run_id, max_restarts=0)
        ret = LocalElasticAgent(spec, start_method="fork").run()

        for rank in range(spec.local_world_size):
            expected_run_id = spec.rdzv_handler.get_run_id()
            self.assertEqual(expected_run_id, ret[rank])
Exemplo n.º 11
0
        def run_agent(
            run_id, etcd_host, etcd_port, start_method, worker_fn, worker_args=()
        ):
            """Run ``worker_fn`` under a ``LocalElasticAgent`` with a two-worker etcd rendezvous.

            Args:
                run_id: path component of the ``etcd://`` rendezvous URL.
                etcd_host: etcd host used in the URL.
                etcd_port: etcd port used in the URL.
                start_method: passed positionally to ``LocalElasticAgent``.
                worker_fn: callable handed to ``WorkerSpec`` as ``fn``.
                worker_args: arguments handed to ``WorkerSpec`` as ``args``.
            """
            # min_workers and max_workers are both fixed at 2.
            rdzv_url = (
                f"etcd://{etcd_host}:{etcd_port}/{run_id}"
                f"?min_workers=2"
                f"&max_workers=2"
            )
            worker_spec = WorkerSpec(
                role="test_trainer",
                local_world_size=1,
                fn=worker_fn,
                args=worker_args,
                rdzv_handler=dist.rendezvous(rdzv_url),
                max_restarts=3,
                monitor_interval=1,
            )

            LocalElasticAgent(worker_spec, start_method).run()
Exemplo n.º 12
0
 def _test_transient_bug(self, error_dir: str):
     """Check that a transient failure ends in success with one error file per rank.

     After the run, ``<error_dir>/<rank>/error.log_0`` must exist and
     ``<error_dir>/<rank>/error.log_1`` must not, for every rank that
     produced a return value.

     Args:
         error_dir: directory that holds the per-rank error-file directories.
     """
     max_restarts = 3
     agent = LocalElasticAgent(
         self._get_worker_spec(fn=_transient_bug, max_restarts=max_restarts),
         start_method="fork",
     )
     run_result = agent.run()
     self.assertEqual(WorkerState.SUCCEEDED, run_result.state)

     rank_count = len(run_result.return_values)
     for rank in range(rank_count):
         rank_dir = os.path.join(error_dir, str(rank))
         first_error_file = os.path.join(rank_dir, "error.log_0")
         self.assertTrue(os.path.exists(first_error_file))
         second_error_file = os.path.join(rank_dir, "error.log_1")
         self.assertFalse(os.path.exists(second_error_file))
    def test_run_segv_function(self):
        """Check that a worker killed by SIGSEGV is reported as a signaled failure.

        ``agent.run()`` must raise ``WorkerGroupFailureException`` whose
        per-worker exceptions are ``WorkerSignaledException`` instances naming
        SIGSEGV. Afterwards the worker group must be FAILED and no restarts
        may remain. The rendezvous handler is shut down on every exit path.
        """
        expected_error_index = 0
        expected_failure = signal.SIGSEGV
        spec = self._get_worker_spec(
            fn=_fatal_signal_function,
            max_restarts=2,
            args=(expected_error_index, expected_failure),
        )
        try:
            agent = LocalElasticAgent(spec, start_method="spawn")
            with self.assertRaises(WorkerGroupFailureException) as cm:
                agent.run()
        finally:
            spec.rdzv_handler.shutdown()

        excs = cm.exception.get_worker_exceptions()
        for i in range(spec.local_world_size):
            # assertIsInstance reports the offending object and expected type
            # on failure, unlike assertTrue(isinstance(...)).
            self.assertIsInstance(excs[i], WorkerSignaledException)
            self.assertEqual(expected_failure.name, excs[i].signal_name)

        self.assertEqual(WorkerState.FAILED, agent.get_worker_group().state)
        self.assertEqual(0, agent._remaining_restarts)
Exemplo n.º 14
0
    def _test_run_sad_function(self):
        """Check that a failing worker function leaves an error file for each worker.

        With ``max_restarts=0`` the run must report one failure per local
        worker. Each failure's error file must exist and contain the
        ``RuntimeError`` message. Afterwards the worker group must be FAILED
        and no restarts may remain.
        """
        spec = self._get_worker_spec(fn=_sad_function, max_restarts=0)
        agent = LocalElasticAgent(spec, start_method="fork")
        group_results = agent.run()
        failed_results = group_results.failures
        self.assertEqual(spec.local_world_size, len(failed_results))
        # all ranks will have the same result
        for result in failed_results.values():
            self.assertTrue(os.path.exists(result.error_file))
            with open(result.error_file, "r") as f:
                # Newlines are stripped so the message matches even if wrapped.
                data = f.read().replace("\n", "")
                # assertIn shows both operands on failure, unlike assertTrue(x in y).
                self.assertIn("RuntimeError: sad because i throw", data)

        self.assertEqual(WorkerState.FAILED, agent.get_worker_group().state)
        self.assertEqual(0, agent._remaining_restarts)
Exemplo n.º 15
0
def _run_agent(
    run_id,
    etcd_host,
    etcd_port,
    min_size,
    max_size,
    func_to_run,
    args,
    local_world_size=8,
    role="test_trainer",
    output_dict=None,
    agent_barrier_timeout=300,
):
    """Run ``func_to_run`` under a fork-started ``LocalElasticAgent`` on an etcd rendezvous.

    Args:
        run_id: run id given to the rendezvous parameters.
        etcd_host: host part of the etcd endpoint.
        etcd_port: port part of the etcd endpoint.
        min_size: ``min_nodes`` of the rendezvous.
        max_size: ``max_nodes`` of the rendezvous.
        func_to_run: callable handed to ``WorkerSpec`` as ``fn``.
        args: arguments handed to ``WorkerSpec`` as ``args``.
        local_world_size: ``local_world_size`` of the worker spec.
        role: ``role`` of the worker spec.
        output_dict: optional mapping; when given, ``(role, result)`` is
            stored in it under a freshly generated UUID-based key.
        agent_barrier_timeout: forwarded as ``exit_barrier_timeout``.
    """
    endpoint = f"{etcd_host}:{etcd_port}"
    handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend="etcd",
            endpoint=endpoint,
            run_id=run_id,
            min_nodes=min_size,
            max_nodes=max_size,
        )
    )

    worker_spec = WorkerSpec(
        role=role,
        local_world_size=local_world_size,
        fn=func_to_run,
        args=args,
        rdzv_handler=handler,
        max_restarts=2,
        monitor_interval=1,
    )

    outcome = LocalElasticAgent(
        worker_spec,
        start_method="fork",
        exit_barrier_timeout=agent_barrier_timeout,
    ).run()

    if output_dict is None:
        return
    entry_key = str(uuid.uuid4().int)
    output_dict[entry_key] = (role, outcome)
Exemplo n.º 16
0
 def test_barrier_failed(self, barrier_mock):
     """Check that the barrier is called exactly once even though it raises.

     Args:
         barrier_mock: mock object; presumably injected by a patch decorator
             that is outside this view -- TODO confirm.
     """
     barrier_mock.side_effect = RuntimeError("test error")

     worker_spec = self._get_worker_spec(fn=_happy_function)
     LocalElasticAgent(worker_spec, start_method="fork").run()

     barrier_mock.assert_called_once()
Exemplo n.º 17
0
 def test_run_check_env_function(self):
     """Run ``_check_env_function`` under a fork-started agent; passes if ``run()`` does not raise."""
     worker_spec = self._get_worker_spec(fn=_check_env_function, max_restarts=2)
     LocalElasticAgent(worker_spec, start_method="fork").run()
Exemplo n.º 18
0
 def test_run_distributed_sum(self):
     """Run ``_distributed_sum`` with argument 0 under a fork-started agent; passes if ``run()`` does not raise."""
     worker_spec = self._get_worker_spec(fn=_distributed_sum, args=(0,))
     LocalElasticAgent(worker_spec, start_method="fork").run()
Exemplo n.º 19
0
 def test_run_happy_function(self):
     """Run ``_happy_function`` under a fork-started agent; passes if ``run()`` does not raise."""
     worker_spec = self._get_worker_spec(fn=_happy_function)
     LocalElasticAgent(worker_spec, start_method="fork").run()
Exemplo n.º 20
0
def main(args=None):
    """Parse launcher arguments and run the training command under a local agent.

    With ``--standalone`` a local ``EtcdServer`` is started and used as the
    rendezvous backend. That server is stopped on every exit path once it
    has been started, including when the agent run or argument validation
    raises.

    Args:
        args: argument list handed to ``parse_args``. The original note says
            that, when omitted, it defaults to ``sys.argv[:1]`` -- presumably
            ``sys.argv[1:]`` was meant; TODO confirm against ``parse_args``.

    Raises:
        AssertionError: if the parsed node range violates ``0 < min <= max``
            or ``max_restarts`` is negative.
        ValueError: if ``--no_python`` and ``--module`` are combined.
    """
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    etcd_server = None
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(
            f"\n**************************************\n"
            f"Rendezvous info:\n"
            f"--rdzv_backend={args.rdzv_backend} "
            f"--rdzv_endpoint={args.rdzv_endpoint} "
            f"--rdzv_id={args.rdzv_id}\n"
            f"**************************************\n"
        )

    # Everything after the server start runs inside this try, so the server
    # cannot be leaked by a failure further down.
    try:
        nproc_per_node = determine_local_world_size(args.nproc_per_node)
        omp_num_threads = None
        if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
            omp_num_threads = 1
            print(
                f"*****************************************\n"
                f"Setting OMP_NUM_THREADS environment variable for each process to be "
                f"{omp_num_threads} in default, to avoid your system being overloaded, "
                f"please further tune the variable for optimal performance in "
                f"your application as needed. \n"
                f"*****************************************"
            )

        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag"
                    " and the '--module' flag at the same time."
                )

        cmd.append(args.training_script)
        cmd.extend(args.training_script_args)

        rdzv_parameters = RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rdzv_conf(args.rdzv_conf),
        )

        rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

        try:
            spec = WorkerSpec(
                role=args.role,
                local_world_size=nproc_per_node,
                fn=wrapper_fn,
                args=(omp_num_threads, cmd),
                rdzv_handler=rdzv_handler,
                max_restarts=args.max_restarts,
                monitor_interval=args.monitor_interval,
            )
            metrics.initialize_metrics()
            elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
            elastic_agent.run(spec.role)
        finally:
            rdzv_handler.shutdown()
    finally:
        # Previously this ran only after a successful run, leaking the
        # standalone etcd server whenever the agent raised.
        if etcd_server is not None:
            etcd_server.stop()
Exemplo n.º 21
0
def launch_agent(
    config: LaunchConfig,
    entrypoint: Union[Callable, str, None],
    args: List[Any],
) -> Dict[int, Any]:
    """Run ``entrypoint`` under a ``LocalElasticAgent`` built from ``config``.

    A ``run_id`` is generated and written back to ``config`` when it has
    none. The rendezvous handler is always shut down before returning or
    raising.

    Args:
        config: launch configuration; ``config.run_id`` may be filled in here.
        entrypoint: handed to ``WorkerSpec`` as ``entrypoint``.
        args: entrypoint arguments; converted to a tuple for ``WorkerSpec``.

    Returns:
        ``result.return_values`` of the agent run when it did not fail.

    Raises:
        ChildFailedError: when the run result reports failure.
        Exception: any other error is re-raised after a status event is
            recorded.
    """
    if not config.run_id:
        generated_id = str(uuid.uuid4().int)
        logger.warning(f"config has no run_id, generate a new one: {generated_id}")
        config.run_id = generated_id

    entrypoint_name = _get_entrypoint_name(entrypoint, args)

    logger.info(
        f"Starting elastic_operator with launch configs:\n"
        f"  entrypoint       : {entrypoint_name}\n"
        f"  min_nodes        : {config.min_nodes}\n"
        f"  max_nodes        : {config.max_nodes}\n"
        f"  nproc_per_node   : {config.nproc_per_node}\n"
        f"  run_id           : {config.run_id}\n"
        f"  rdzv_backend     : {config.rdzv_backend}\n"
        f"  rdzv_endpoint    : {config.rdzv_endpoint}\n"
        f"  rdzv_configs     : {config.rdzv_configs}\n"
        f"  max_restarts     : {config.max_restarts}\n"
        f"  monitor_interval : {config.monitor_interval}\n"
        f"  log_dir          : {config.log_dir}\n"
        f"  metrics_cfg      : {config.metrics_cfg}\n"
    )

    rdzv_handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend=config.rdzv_backend,
            endpoint=config.rdzv_endpoint,
            run_id=config.run_id,
            min_nodes=config.min_nodes,
            max_nodes=config.max_nodes,
            **config.rdzv_configs,
        )
    )

    # Stays None until the agent is constructed, so the error handler can
    # tell which kind of event to record.
    agent = None
    try:
        worker_spec = WorkerSpec(
            role=config.role,
            local_world_size=config.nproc_per_node,
            entrypoint=entrypoint,
            args=tuple(args),
            rdzv_handler=rdzv_handler,
            max_restarts=config.max_restarts,
            monitor_interval=config.monitor_interval,
            redirects=config.redirects,
            tee=config.tee,
        )

        metrics_cfg = None
        if config.metrics_cfg:
            metrics_cfg = metrics.MetricsConfig(config.metrics_cfg)
        metrics.initialize_metrics(metrics_cfg)

        agent = LocalElasticAgent(
            spec=worker_spec,
            start_method=config.start_method,
            log_dir=config.log_dir,
        )

        result = agent.run()
        events.record(agent.get_agent_status_event(WorkerState.SUCCEEDED))
        if not result.is_failed():
            return result.return_values

        # ChildFailedError is treated specially by @record
        # if the error files for the failed children exist
        # @record will copy the first error (root cause)
        # to the error file of the launcher process.
        raise ChildFailedError(
            name=entrypoint_name,
            failures=result.failures,
        )
    except ChildFailedError:
        raise
    except Exception:
        failure_event = (
            agent.get_agent_status_event(WorkerState.FAILED)
            if agent
            else _construct_event(config)
        )
        events.record(failure_event)
        raise
    finally:
        rdzv_handler.shutdown()
Exemplo n.º 22
0
def main(args=None):
    """Parse launcher arguments and run the training command under a local agent.

    With ``--standalone`` a local ``EtcdServer`` is started and used as the
    rendezvous backend. The rendezvous handler is shut down, and the
    standalone server stopped, in the final cleanup.

    Args:
        args: argument list handed to ``parse_args``. The original note says
            that, when omitted, it defaults to ``sys.argv[:1]`` -- presumably
            ``sys.argv[1:]`` was meant; TODO confirm against ``parse_args``.

    Raises:
        AssertionError: if the parsed node range violates ``0 < min <= max``
            or ``max_restarts`` is negative.
        ValueError: if ``--no_python`` and ``--module`` are combined.
        ChildFailedError: when the run result reports failure.
    """
    args = parse_args(args)
    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    # Stays None until the agent is constructed, so the error handler can
    # tell which kind of event to record.
    elastic_agent = None

    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(f"\n**************************************\n"
                 f"Rendezvous info:\n"
                 f"--rdzv_backend={args.rdzv_backend} "
                 f"--rdzv_endpoint={args.rdzv_endpoint} "
                 f"--rdzv_id={args.rdzv_id}\n"
                 f"**************************************\n")

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    # Assemble the worker command line.
    if args.no_python:
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")
        cmd = []
    else:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    rdzv_handler = rdzv_registry.get_rendezvous_handler(
        RendezvousParameters(
            backend=args.rdzv_backend,
            endpoint=args.rdzv_endpoint,
            run_id=args.rdzv_id,
            min_nodes=min_nodes,
            max_nodes=max_nodes,
            **_parse_rendezvous_config(args.rdzv_conf),
        )
    )
    try:
        # First element is the program to run; the rest are its arguments.
        entrypoint, *entrypoint_args = cmd
        worker_spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            entrypoint=entrypoint,
            args=tuple(entrypoint_args),
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
            redirects=Std.from_str(args.redirects),
            tee=Std.from_str(args.tee),
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(
            spec=worker_spec,
            start_method=args.start_method,
            log_dir=args.log_dir,
        )
        run_result = elastic_agent.run(worker_spec.role)
        events.record(
            elastic_agent.get_agent_status_event(WorkerState.SUCCEEDED))
        if run_result.is_failed():
            # ChildFailedError is treated specially by @record
            # if the error files for the failed children exist
            # @record will copy the first error (root cause)
            # to the error file of the launcher process
            raise ChildFailedError(
                name=args.training_script,
                failures=run_result.failures,
            )
    except ChildFailedError:
        raise
    except Exception:
        failure_event = (
            elastic_agent.get_agent_status_event(WorkerState.FAILED)
            if elastic_agent
            else _construct_event(args)
        )
        events.record(failure_event)
        raise
    finally:
        rdzv_handler.shutdown()
        if args.standalone:
            etcd_server.stop()
Exemplo n.º 23
0
def main(args=None):
    """Parse launcher arguments and run the training command under a local agent.

    With ``--standalone`` a local ``EtcdServer`` is started and used as the
    rendezvous backend. When the run result reports failure, the failure
    stored under the smallest key of ``group_result.failures`` is processed
    and logged, and the process exits with the absolute value of its exit
    code. Cleanup (rendezvous shutdown, standalone server stop,
    ``cleanup()``) runs in a ``finally`` clause.

    Args:
        args: argument list handed to ``parse_args``.

    Raises:
        AssertionError: if the parsed node range violates ``0 < min <= max``
            or ``max_restarts`` is negative.
        ValueError: if ``--no_python`` and ``--module`` are combined.
        SystemExit: via ``sys.exit`` when the run result reports failure.
    """
    # NOTE(review): the original note said that, if ``args`` is not passed, it
    # defaults to ``sys.argv[:1]``; presumably ``sys.argv[1:]`` was meant --
    # confirm against ``parse_args``.
    args = parse_args(args)

    min_nodes, max_nodes = parse_min_max_nnodes(args.nnodes)
    assert 0 < min_nodes <= max_nodes
    assert args.max_restarts >= 0

    # Standalone mode: start a local etcd server and point the rendezvous
    # arguments at it, with a freshly generated rendezvous id.
    if args.standalone:
        etcd_server = EtcdServer()
        etcd_server.start()
        args.rdzv_backend = "etcd"
        args.rdzv_endpoint = etcd_server.get_endpoint()
        args.rdzv_id = str(uuid.uuid4())
        log.info(f"\n**************************************\n"
                 f"Rendezvous info:\n"
                 f"--rdzv_backend={args.rdzv_backend} "
                 f"--rdzv_endpoint={args.rdzv_endpoint} "
                 f"--rdzv_id={args.rdzv_id}\n"
                 f"**************************************\n")

    nproc_per_node = determine_local_world_size(args.nproc_per_node)
    omp_num_threads = None
    # Only set a default when the user has not set the variable and more than
    # one process per node is requested.
    if "OMP_NUM_THREADS" not in os.environ and nproc_per_node > 1:
        omp_num_threads = 1
        print(
            f"*****************************************\n"
            f"Setting OMP_NUM_THREADS environment variable for each process to be "
            f"{omp_num_threads} in default, to avoid your system being overloaded, "
            f"please further tune the variable for optimal performance in "
            f"your application as needed. \n"
            f"*****************************************")
        # This env variable will be passed down to the subprocesses
        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)

    # Assemble the worker command line: the interpreter (unless --no_python),
    # an optional "-m", then the training script and its arguments.
    with_python = not args.no_python
    cmd = []
    if with_python:
        cmd = [sys.executable, "-u"]
        if args.module:
            cmd.append("-m")
    else:
        if args.module:
            raise ValueError("Don't use both the '--no_python' flag"
                             " and the '--module' flag at the same time.")

    cmd.append(args.training_script)
    cmd.extend(args.training_script_args)

    rdzv_parameters = RendezvousParameters(
        backend=args.rdzv_backend,
        endpoint=args.rdzv_endpoint,
        run_id=args.rdzv_id,
        min_nodes=min_nodes,
        max_nodes=max_nodes,
        **_parse_rdzv_conf(args.rdzv_conf),
    )

    rdzv_handler = rdzv_registry.get_rendezvous_handler(rdzv_parameters)

    try:
        spec = WorkerSpec(
            role=args.role,
            local_world_size=nproc_per_node,
            cmd=cmd,
            rdzv_handler=rdzv_handler,
            max_restarts=args.max_restarts,
            monitor_interval=args.monitor_interval,
        )
        metrics.initialize_metrics()
        elastic_agent = LocalElasticAgent(spec, start_method=args.start_method)
        group_result = elastic_agent.run(spec.role)
        if group_result.is_failed():
            # Report the failure stored under the smallest key.
            min_rank = min(group_result.failures.keys())
            failure = group_result.failures[min_rank]
            # Note: this line will raise an exception to indicate to the
            # scheduler process that something went wrong.
            # If any workers wrote the error file, it will be propagated
            # to the scheduler specific destination.
            process_failure(failure)
            msg = f"""
*********************************************************************** \n
***********************USER CODE FAILED WITH ERROR****************** \n\n
{get_failure_message(failure)} \n
******************************************************************** \n\n
******************************************************************** \n
            """
            log.warning(msg)
            # Expected (0-127), 0 - success, anything else - failure
            sys.exit(abs(failure.exit_code))
    finally:
        # Runs on success, on exceptions and on the sys.exit above.
        rdzv_handler.shutdown()
        if args.standalone:
            etcd_server.stop()
        cleanup()