def _launch_horovodrun(self) -> subprocess.Popen:
    check.true(self.hvd_config.use)
    logging.debug(f"Starting training process on: {self.rendezvous_info.get_rank()}.")

    horovod_process_cmd = horovod.create_run_command(
        num_gpus_per_machine=self.num_gpus,
        ip_addresses=self.rendezvous_info.get_ip_addresses(),
        env=self.env,
        debug=self.env.experiment_config.debug_enabled(),
        optional_args=self.env.experiment_config.horovod_optional_args(),
        worker_process_env_path=self._worker_process_env_path,
    )
    subprocess_env = {
        **os.environ,
        "NCCL_DEBUG": "INFO",
        "DET_HOROVOD_GLOO_RENDEZVOUS_PORT": str(
            constants.HOROVOD_GLOO_RENDEZVOUS_PORT + self.env.det_trial_unique_port_offset
        ),
    }
    return subprocess.Popen(horovod_process_cmd, env=subprocess_env)
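
# For reference, a minimal standalone sketch of the env-merging pattern used above:
# the child inherits the parent's environment, with per-launch overrides (such as
# NCCL_DEBUG) taking precedence because later keys win in dict unpacking.  The
# name launch_with_env is illustrative only, not part of the Determined API.
import os
import subprocess
from typing import Dict, List


def launch_with_env(cmd: List[str], overrides: Dict[str, str]) -> subprocess.Popen:
    # os.environ is unpacked first, so any key repeated in `overrides` replaces
    # the inherited value for the child process only.
    env = {**os.environ, **overrides}
    return subprocess.Popen(cmd, env=env)


# Example: run a child with NCCL logging forced to INFO.
# p = launch_with_env(["horovodrun", "--help"], {"NCCL_DEBUG": "INFO"})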
def test_create_run_command(
    debug: bool,
    auto_tune: bool,
    tensor_fusion_threshold: int,
    tensor_fusion_cycle_time: int,
) -> None:
    ip_addresses = ["localhost", "128.140.2.4"]
    num_gpus_per_machine = 8
    optimizations = {
        "auto_tune_tensor_fusion": auto_tune,
        "tensor_fusion_threshold": tensor_fusion_threshold,
        "tensor_fusion_cycle_time": tensor_fusion_cycle_time,
    }
    experiment_config = {
        "optimizations": optimizations,
        "resources": {"slots_per_trial": 1, "native_parallel": False},
    }
    env = create_default_env_context(experiment_config)

    expected_horovod_run_cmd = [
        "horovodrun",
        "-np",
        "16",
        "-p",
        str(constants.HOROVOD_SSH_PORT),
        "-H",
        "localhost:8,128.140.2.4:8",
        "--start-timeout",
        str(constants.HOROVOD_STARTUP_TIMEOUT_SECONDS),
        "--gloo-timeout-seconds",
        str(constants.HOROVOD_GLOO_TIMEOUT_SECONDS),
    ]
    if auto_tune:
        expected_horovod_run_cmd.extend(
            ["--autotune", "--autotune-log-file", str(constants.HOROVOD_AUTOTUNE_LOG_FILEPATH)]
        )
    else:
        expected_horovod_run_cmd.extend(
            [
                "--fusion-threshold-mb",
                str(tensor_fusion_threshold),
                "--cycle-time-ms",
                str(tensor_fusion_cycle_time),
            ]
        )
    expected_horovod_run_cmd.extend(
        [
            "--cache-capacity",
            str(1024),
            "--no-hierarchical-allreduce",
            "--no-hierarchical-allgather",
        ]
    )
    if debug:
        expected_horovod_run_cmd.append("--verbose")
    expected_horovod_run_cmd.extend(
        ["python3", "-m", "determined.exec.worker_process_wrapper", "env_path"]
    )

    created_horovod_run_cmd = horovod.create_run_command(
        num_gpus_per_machine=num_gpus_per_machine,
        ip_addresses=ip_addresses,
        env=env,
        debug=debug,
        optional_args=[],
        worker_process_env_path=pathlib.Path("env_path"),
    )

    assert expected_horovod_run_cmd == created_horovod_run_cmd
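
# The test above takes four arguments and covers both the autotune branch and the
# explicit tensor-fusion branch.  A sketch (not necessarily the upstream decorators)
# of how pytest could supply those arguments: stacked parametrize decorators run the
# test over the full cross product (2 x 2 x 2 x 2 = 16 cases with the values below).
import pytest


@pytest.mark.parametrize("tensor_fusion_cycle_time", [1, 5])
@pytest.mark.parametrize("tensor_fusion_threshold", [64, 128])
@pytest.mark.parametrize("auto_tune", [True, False])
@pytest.mark.parametrize("debug", [True, False])
def test_create_run_command_sketch(
    debug: bool,
    auto_tune: bool,
    tensor_fusion_threshold: int,
    tensor_fusion_cycle_time: int,
) -> None:
    # Each combination yields one collected test case with concrete values bound
    # to the arguments; the real assertions would mirror test_create_run_command.
    assert isinstance(debug, bool) and isinstance(auto_tune, bool)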
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod is set, detect single-slot and zero-slot trials and run the
    # script directly, without horovodrun.
    if autohorovod and len(info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the
    # training code can function with a different launch layer in a different environment.
    # Inside Determined, the easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # The chief IP is set as an environment variable to support nested launch layers.
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all
        # non-daemon containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug
        )

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # The chief machine waits for every worker's sshd to be available.  All machines should
    # be pretty close to in-step by now because all machines just finished synchronizing
    # rendezvous info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local
    #   worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of
    # layers.  We can remove these layers when the upstream fix has been around for long
    # enough that we can reasonably require user images to have patched horovod
    # installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    hvd_optional_args = experiment_config.get("data", {}).get("__det_dtrain_args", [])
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.  When it is set, the SLURM_JOBID
    # variable triggers OpenMPI's integration with SLURM; however, we are running in a
    # singularity container and SLURM may or may not have compatible configuration enabled.
    # We therefore clear the SLURM_JOBID variable before invoking mpi so that mpirun honors
    # the args passed to it via horovodrun describing the hosts and process topology;
    # otherwise mpi ends up wanting to launch all -np# processes on the local node, causing
    # an oversubscription error ("There are not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)

    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script)
    with det.util.forward_signals(p):
        return p.wait()
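
# For reference, a minimal sketch of the sshd-readiness wait that main() performs via
# util.check_sshd.  This is an illustrative stand-in, not the actual Determined
# implementation: it polls a TCP connect to each worker until the shared deadline
# passes.  The name check_sshd_sketch is hypothetical.
import socket
import time


def check_sshd_sketch(peer_addr: str, deadline: float, port: int) -> None:
    while True:
        try:
            # A successful connect means sshd is accepting connections on the worker.
            with socket.create_connection((peer_addr, port), timeout=1):
                return
        except OSError:
            # Connection refused or timed out; retry until the deadline expires.
            if time.time() > deadline:
                raise TimeoutError(f"sshd on {peer_addr}:{port} never came up")
            time.sleep(0.1)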