Exemplo n.º 1
0
    def _launch_horovodrun(self) -> subprocess.Popen:
        """Start the horovodrun command as a child process.

        The command line is produced by ``horovod.create_run_command`` from
        this object's GPU count, rendezvous IP addresses, environment context
        and worker-process env path.  The child inherits a copy of the current
        process environment with ``NCCL_DEBUG`` forced to ``"INFO"`` and
        ``DET_HOROVOD_GLOO_RENDEZVOUS_PORT`` set to the base gloo rendezvous
        port plus this trial's unique port offset.

        Returns:
            The ``subprocess.Popen`` handle of the launched command; the
            caller is responsible for waiting on it.
        """
        # Presumably raises when horovod is not enabled for this trial;
        # the exact behavior of check.true is defined outside this block.
        check.true(self.hvd_config.use)

        rank = self.rendezvous_info.get_rank()
        logging.debug(f"Starting training process on: {rank}.")

        # Keyword arguments are collected in the same order the call would
        # evaluate them.
        run_kwargs = {
            "num_gpus_per_machine": self.num_gpus,
            "ip_addresses": self.rendezvous_info.get_ip_addresses(),
            "env": self.env,
            "debug": self.env.experiment_config.debug_enabled(),
            "optional_args": self.env.experiment_config.horovod_optional_args(),
            "worker_process_env_path": self._worker_process_env_path,
        }
        launch_cmd = horovod.create_run_command(**run_kwargs)

        # Work on a copy so the parent's os.environ is left untouched.
        child_env = dict(os.environ)
        child_env["NCCL_DEBUG"] = "INFO"
        gloo_port = (constants.HOROVOD_GLOO_RENDEZVOUS_PORT +
                     self.env.det_trial_unique_port_offset)
        child_env["DET_HOROVOD_GLOO_RENDEZVOUS_PORT"] = str(gloo_port)

        return subprocess.Popen(launch_cmd, env=child_env)
Exemplo n.º 2
0
def test_create_run_command(debug: bool, auto_tune: bool,
                            tensor_fusion_threshold: int,
                            tensor_fusion_cycle_time: int) -> None:
    """Verify the argv produced by ``horovod.create_run_command``.

    The expected command is assembled for two hosts with eight GPUs each
    (hence ``-np 16``) and compared for exact equality with the generated one.

    Args:
        debug: When true, ``--verbose`` is expected in the command.
        auto_tune: When true, the autotune flags are expected; otherwise the
            explicit fusion-threshold / cycle-time flags are expected.
        tensor_fusion_threshold: Value expected after ``--fusion-threshold-mb``.
        tensor_fusion_cycle_time: Value expected after ``--cycle-time-ms``.
    """
    hosts = ["localhost", "128.140.2.4"]
    gpus_per_host = 8

    env = create_default_env_context({
        "optimizations": {
            "auto_tune_tensor_fusion": auto_tune,
            "tensor_fusion_threshold": tensor_fusion_threshold,
            "tensor_fusion_cycle_time": tensor_fusion_cycle_time,
        },
        "resources": {
            "slots_per_trial": 1,
            "native_parallel": False
        },
    })

    # Autotuning and explicit tensor-fusion settings are mutually exclusive.
    if auto_tune:
        fusion_flags = [
            "--autotune",
            "--autotune-log-file",
            str(constants.HOROVOD_AUTOTUNE_LOG_FILEPATH),
        ]
    else:
        fusion_flags = [
            "--fusion-threshold-mb",
            str(tensor_fusion_threshold),
            "--cycle-time-ms",
            str(tensor_fusion_cycle_time),
        ]
    verbose_flags = ["--verbose"] if debug else []

    expected_cmd = [
        "horovodrun",
        "-np",
        "16",
        "-p",
        str(constants.HOROVOD_SSH_PORT),
        "-H",
        "localhost:8,128.140.2.4:8",
        "--start-timeout",
        str(constants.HOROVOD_STARTUP_TIMEOUT_SECONDS),
        "--gloo-timeout-seconds",
        str(constants.HOROVOD_GLOO_TIMEOUT_SECONDS),
        *fusion_flags,
        "--cache-capacity",
        "1024",
        "--no-hierarchical-allreduce",
        "--no-hierarchical-allgather",
        *verbose_flags,
        "python3",
        "-m",
        "determined.exec.worker_process_wrapper",
        "env_path",
    ]

    actual_cmd = horovod.create_run_command(
        num_gpus_per_machine=gpus_per_host,
        ip_addresses=hosts,
        env=env,
        debug=debug,
        optional_args=[],
        worker_process_env_path=pathlib.Path("env_path"),
    )
    assert expected_cmd == actual_cmd
Exemplo n.º 3
0
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    """Launch ``script`` for a trial, wrapping it in horovodrun when needed.

    Behavior depends on the role of the current container:

    * With ``autohorovod`` set and a single container holding at most one
      slot, ``script`` is run directly with no horovod wrapping.
    * On non-chief containers (``container_rank > 0``) the container is
      registered with the master as a daemon resource and sshd is run.
    * On the chief, every peer's sshd is awaited and then
      ``pid_server + horovodrun + worker wrapper + script`` is launched.

    Signals are forwarded to the child process in every case.

    Args:
        hvd_args: Extra arguments for horovodrun; ``None``/empty means none.
        script: The argv of the training script to run.
        autohorovod: Skip horovod entirely for single-slot and zero-slot
            trials.

    Returns:
        The exit code of the launched child process.

    Raises:
        AssertionError: If not running on-cluster, if the task is not a
            ``"TRIAL"``, or if ``DET_RESOURCES_ID`` is unset when needed.
    """
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(
            info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=
            f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug)

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}.")
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # Chief machine waits for every worker's sshd to be available.  All machines should be pretty
    # close to in-step by now because all machines just finished synchronizing rendezvous info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id,
                                               len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    dtrain_args = experiment_config.get("data",
                                        {}).get("__det_dtrain_args", [])
    # Build a fresh list rather than extending dtrain_args with "+=": when the
    # key is present, that list is owned by the experiment config, and an
    # in-place extend would silently rewrite the trial's config.
    hvd_optional_args = [*dtrain_args, *hvd_args]
    if debug:
        hvd_optional_args.append("--mpi-args=-v --display-map")

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(
        f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.   When invoked the
    # SLURM_JOBID variable triggers integration with SLURM, however, we are
    # running in a singularity container and SLURM may or may not have
    # compatible configuration enabled.  We therefore clear the SLURM_JOBID variable
    # before invoking mpi so that mpirun will honor the args passed via horovod
    # run to it describing the hosts and process topology, otherwise mpi ends
    # up wanting to launch all -np# processes on the local causing an oversubscription
    # error ("There are not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)
    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd +
                         script)
    with det.util.forward_signals(p):
        return p.wait()