def post_ready(master_url: str, cert: certs.Cert, allocation_id: str):
    """
    POST an empty JSON object to the ``ready`` endpoint of an allocation.

    Arguments:
        master_url: Address of the master the request is sent to.
        cert: Certificate passed through to ``api.post``.
        allocation_id: ID of the allocation, interpolated into the endpoint path.
    """
    ready_path = f"/api/v1/allocations/{allocation_id}/ready"
    api.post(master_url, ready_path, {}, cert=cert)
def trigger_preemption(signum: int, frame: types.FrameType) -> None:
    """
    Ask the master to preempt this allocation, but only from the chief container.

    The signature is that of a signal handler; neither argument is used.  Nothing happens when
    no cluster info is available or when the container rank is not 0.
    """
    info = det.get_cluster_info()
    # Only the chief container (rank 0) requests preemption; all others ignore the signal.
    if not info or info.container_rank != 0:
        return

    logging.debug(f"[rank={info.container_rank}] SIGTERM: Preemption imminent.")

    # Notify the master that this allocation needs to be preempted.
    preemption_path = f"/api/v1/allocations/{info.allocation_id}/signals/pending_preemption"
    api.post(info.master_url, preemption_path)
def set_priority(args: Namespace) -> None:
    """
    Set the priority of the task selected by ``args`` and report the outcome.

    On success a green confirmation is printed.  An ``api.errors.APIException`` is not
    propagated; it is reported in red as a skipped operation instead.
    """
    task_id = expand_uuid_prefixes(args)
    name = RemoteTaskName[args._command]

    try:
        api_full_path = "api/v1/{}/{}/set_priority".format(
            RemoteTaskNewAPIs[args._command], task_id
        )
        api.post(args.master, api_full_path, {"priority": args.priority})
    except api.errors.APIException as e:
        failure = "Skipping: {} ({})".format(e, type(e).__name__)
        print(colored(failure, "red"))
    else:
        success = "Set priority of {} {} to {}".format(name, task_id, args.priority)
        print(colored(success, "green"))
def post_trial_profiler_metrics_batches(
    master_url: str,
    batches: List[TrialProfilerMetricsBatch],
) -> None:
    """
    Send profiler metrics batches to the master so that they can be persisted.

    Each batch is serialized via its instance ``__dict__``.  Labels must contain only a subset
    of the keys: trial_id, name, gpu_uuid, agent_id and metric_type, where metric_type is one
    of PROFILER_METRIC_TYPE_SYSTEM or PROFILER_METRIC_TYPE_TIMING.
    """
    serialized_batches = [batch.__dict__ for batch in batches]
    request_body = {"batches": serialized_batches}
    api.post(master_url, "/api/v1/trials/profiler/metrics", body=request_body)
def kill(args: Namespace) -> None:
    """
    Kill every task selected by ``args``, one request per task.

    When a request fails with ``api.errors.APIException`` and ``args.force`` is set, the task
    is reported as skipped and processing continues.  Without ``args.force`` the remaining,
    not-yet-processed tasks are listed and the exception is re-raised.
    """
    task_ids = RemoteTaskGetIDsFunc[args._command](args)  # type: ignore
    task_name = RemoteTaskName[args._command]

    for position, task_id in enumerate(task_ids):
        try:
            kill_path = "api/v1/{}/{}/kill".format(RemoteTaskNewAPIs[args._command], task_id)
            api.post(args.master, kill_path)
            print(colored("Killed {} {}".format(task_name, task_id), "green"))
        except api.errors.APIException as e:
            if args.force:
                print(colored("Skipping: {} ({})".format(e, type(e).__name__), "red"))
                continue
            # Not forced: name everything we are leaving untouched, then give up.
            for ignored in task_ids[position + 1 :]:
                print("Cowardly not killing {}".format(ignored))
            raise e
def patch(args: argparse.Namespace) -> None:
    """
    Enable or disable agents and, when draining, report tasks still running on them.

    The target is either the single agent named by ``args.agent_id`` or, with ``args.all``,
    every agent returned by ``GET agents``.  If neither is given, an error is printed to
    stderr and the process exits with status 1.
    """
    # NOTE(review): `enabled` is not defined in this block; presumably it is bound by an
    # enclosing scope -- confirm.
    # NOTE(review): presumably check_false rejects passing both `--all` and an agent id -- confirm.
    check_false(args.all and args.agent_id)

    if not (args.all or args.agent_id):
        print("Error: must specify exactly one of `--all` or agent_id", file=sys.stderr)
        sys.exit(1)

    if args.agent_id:
        agent_ids = [args.agent_id]
    else:
        r = api.get(args.master, "agents")
        agent_ids = sorted(local_id(a) for a in r.json().keys())

    # Drain mode only applies when disabling.
    drain_mode = None if enabled else args.drain

    for agent_id in agent_ids:
        action = "enable" if enabled else "disable"
        path = f"api/v1/agents/{agent_id}/{action}"

        # A request body is sent only for a draining disable.
        payload = None
        if not enabled and drain_mode:
            payload = {
                "drain": drain_mode,
            }

        api.post(args.master, path, payload)
        status = "Disabled" if not enabled else "Enabled"
        print(f"{status} agent {agent_id}.", file=sys.stderr)

    # When draining, check if there're any tasks currently running on
    # these slots, and list them.
    if drain_mode:
        rsp = api.get(args.master, "tasks")
        # Keep only tasks with at least one resource whose agent_devices mention a target agent.
        tasks_data = {
            k: t
            for (k, t) in rsp.json().items()
            if any(a in agent_ids for r in t.get("resources", []) for a in r["agent_devices"])
        }

        # The human-readable summary line is suppressed for JSON/CSV output.
        if not (args.json or args.csv):
            if tasks_data:
                print("Tasks still in progress on draining nodes.")
            else:
                print("No tasks in progress on draining nodes.")

        cli_task.render_tasks(args, tasks_data)
def register_version(self, checkpoint_uuid: str) -> Checkpoint:
    """
    Register a checkpoint as a new version of this model.

    Returns the :class:`~determined.experimental.Checkpoint` built from the response, extended
    with the ``model_version`` and ``model_name`` reported by the master.

    Arguments:
        checkpoint_uuid: The UUID of the checkpoint to register.
    """
    versions_path = "/api/v1/models/{}/versions".format(self.name)
    response = api.post(self._master, versions_path, body={"checkpoint_uuid": checkpoint_uuid})

    model_version = response.json()["modelVersion"]
    checkpoint_json = {
        **model_version["checkpoint"],
        "model_version": model_version["version"],
        "model_name": model_version["model"]["name"],
    }
    return Checkpoint.from_json(checkpoint_json, self._master)
def launch_command(
    master: str,
    endpoint: str,
    config: Dict[str, Any],
    template: str,
    context_path: Optional[Path] = None,
    data: Optional[Dict[str, Any]] = None,
    preview: Optional[bool] = False,
) -> Any:
    """
    POST a launch request to ``endpoint`` on the master and return the decoded JSON response.

    Arguments:
        master: Address of the master.
        endpoint: API path the request is posted to.
        config: Configuration sent under the ``config`` key.
        template: Template name; sent as ``template_name`` only when truthy.
        context_path: Optional path read via ``context.read_context``; the resulting files are
            sent under ``files`` when there are any.
        data: Optional JSON-serializable dict; sent under ``data`` as base64-encoded JSON text.
        preview: Sent as ``preview`` only when truthy.
    """
    user_files = []  # type: List[Dict[str, Any]]
    if context_path:
        user_files, _ = context.read_context(context_path)

    body = {"config": config}  # type: Dict[str, Any]

    if template:
        body["template_name"] = template

    if user_files:
        body["files"] = user_files

    if data is not None:
        message_bytes = json.dumps(data).encode("utf-8")
        # b64encode returns bytes, which the standard json encoder cannot serialize; base64
        # output is pure ASCII, so decode it to the equivalent str before adding it to the body.
        body["data"] = base64.b64encode(message_bytes).decode("ascii")

    if preview:
        body["preview"] = preview

    return api.post(
        master,
        endpoint,
        body,
    ).json()
def cancel_experiment_v1(experiment_id: int) -> None:
    """
    Cancel an experiment through the v1 API and wait until it reports ``CANCELED``.

    The CLI certificate and CLI authentication globals are (re)initialized for the configured
    master before the request is sent.  A non-success HTTP status raises via
    ``raise_for_status``.
    """
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(),
        try_reauth=True,
    )

    response = api.post(
        conf.make_master_url(),
        "/api/v1/experiments/{}/cancel".format(experiment_id),
    )
    response.raise_for_status()

    wait_for_experiment_state(experiment_id, "CANCELED")
def preview_search(args: Namespace) -> None:
    """
    Preview the trials a searcher configuration would create and print a summary table.

    The experiment configuration is loaded from ``args.config_file`` (which is then closed) and
    must contain a ``searcher`` section; otherwise a message is printed and the process exits
    with status 1.  The master's ``searcher/preview`` response maps operation sequences to
    trial counts; each sequence is rendered as a run-length-encoded description.
    """
    experiment_config = safe_load_yaml_with_exceptions(args.config_file)
    args.config_file.close()

    if "searcher" not in experiment_config:
        print("Experiment configuration must have 'searcher' section")
        sys.exit(1)

    preview = api.post(args.master, "searcher/preview", json=experiment_config).json()

    def to_full_name(kind: str) -> str:
        """Translate one operation code into its human-readable description."""
        try:
            # The unitless searcher case, for masters newer than 0.17.6.
            length = int(kind)
        except ValueError:
            pass
        else:
            return f"train for {length}"

        # Unit-suffixed training lengths: the last character selects the unit.
        suffix_templates = {
            "R": "train {} records",
            "B": "train {} batch(es)",
            "E": "train {} epoch(s)",
        }
        template = suffix_templates.get(kind[-1])
        if template is not None:
            return template.format(kind[:-1])
        if kind == "V":
            return "validation"
        raise ValueError("unexpected kind: {}".format(kind))

    def render_sequence(sequence: List[str]) -> str:
        """Collapse consecutive identical operations into "<count> x <name>" entries."""
        if not sequence:
            return "N/A"

        runs = []  # each entry is a mutable [operation, repetitions] pair
        for op in sequence:
            if runs and runs[-1][0] == op:
                runs[-1][1] += 1
            else:
                runs.append([op, 1])
        return ", ".join("{} x {}".format(reps, to_full_name(op)) for op, reps in runs)

    results = preview["results"]
    headers = ["Trials", "Breakdown"]
    values = [
        (count, render_sequence(operations.split())) for operations, count in results.items()
    ]

    print(colored("Using search configuration:", "green"))
    yml = yaml.YAML()
    yml.indent(mapping=2, sequence=4, offset=2)
    yml.dump(experiment_config["searcher"], sys.stdout)
    print()

    total_trials = sum(results.values())
    print("This search will create a total of {} trial(s).".format(total_trials))
    print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
def log_out_user(parsed_args: Namespace) -> None:
    """
    Log the current CLI user out of the master and drop their stored token.

    Does nothing when no CLI authentication is set.  A 401 response from the logout endpoint
    is tolerated; any other ``api.errors.APIException`` is re-raised.
    """
    auth = authentication.cli_auth
    if auth is None:
        return

    try:
        api.post(
            parsed_args.master,
            "logout",
            headers={"Authorization": "Bearer {}".format(auth.get_session_token())},
            authenticated=False,
        )
    except api.errors.APIException as e:
        unauthorized = e.status_code == 401
        if not unauthorized:
            raise e

    authentication.TokenStore(parsed_args.master).drop_user(auth.get_session_user())
def add_metadata(self, metadata: Dict[str, Any]) -> None:
    """
    Merge user-defined metadata into the checkpoint's metadata.

    Keys already present in the checkpoint metadata are overwritten with the passed-in
    values.  When the checkpoint has a master set, the complete merged metadata is posted to
    it; ``metadata`` must therefore be a JSON-serializable dictionary.

    Arguments:
        metadata (dict): Dictionary of metadata to add to the checkpoint.
    """
    self.metadata.update(metadata)

    if not self._master:
        return

    metadata_path = "/api/v1/checkpoints/{}/metadata".format(self.uuid)
    api.post(self._master, metadata_path, body={"checkpoint": {"metadata": self.metadata}})
def remove_metadata(self, keys: List[str]) -> None:
    """
    Delete user-defined metadata entries from the checkpoint.

    Every top-level key listed in ``keys`` that exists in the checkpoint metadata is removed;
    keys that are not present are ignored.  When the checkpoint has a master set, the
    remaining metadata is posted to it.

    Arguments:
        keys (List[string]): Top-level keys to remove from the checkpoint metadata.
    """
    for stale_key in keys:
        self.metadata.pop(stale_key, None)

    if not self._master:
        return

    metadata_path = "/api/v1/checkpoints/{}/metadata".format(self.uuid)
    api.post(self._master, metadata_path, body={"checkpoint": {"metadata": self.metadata}})
def add_client(parsed_args: Namespace) -> None:
    """
    Create an OAuth2 client on the master and print its ID and secret.

    Raises ``EnterpriseOnlyError`` when the ``oauth2/clients`` endpoint is not found.
    """
    client_spec = {"domain": parsed_args.domain, "name": parsed_args.name}
    try:
        response = api.post(parsed_args.master, "oauth2/clients", body=client_spec)
        client = response.json()
    except NotFoundException:
        raise EnterpriseOnlyError("API not found: oauth2/clients")

    print("Client ID: {}".format(client["id"]))
    print("Client secret: {}".format(client["secret"]))
def register_version(args: Namespace) -> None:
    """
    Register checkpoint ``args.uuid`` as a new version of model ``args.name``.

    With ``args.json`` the raw API response is printed as indented JSON.  Otherwise the
    registration goes through the ``Determined`` client and the model and its new version are
    rendered.
    """
    if not args.json:
        model = Determined(args.master, None).get_model(args.name)
        checkpoint = model.register_version(args.uuid)
        render_model(model)
        print("\n")
        render_model_version(checkpoint)
        return

    versions_path = "/api/v1/models/{}/versions".format(args.name)
    resp = api.post(args.master, versions_path, body={"checkpoint_uuid": args.uuid})
    print(json.dumps(resp.json(), indent=2))
def do_login(master_address: str, auth: Authentication, username: str, password: str) -> str:
    """
    Log in to the master with a username and password and remember the resulting token.

    The token from the ``login`` response is stored in ``auth.token_store`` under ``username``
    and also returned.
    """
    credentials = {"username": username, "password": password}
    response = api.post(master_address, "login", body=credentials, authenticated=False)

    token = cast(str, response.json()["token"])
    auth.token_store.set_token(username, token)
    return token
def start_tensorboard(args: Namespace) -> None:
    """
    Launch a TensorBoard for the given experiments/trials and follow it until it is ready.

    At least one of ``args.trial_ids`` or ``args.experiment_ids`` is required; otherwise a
    message is printed and the process exits with status 1.  With ``args.detach`` only the new
    TensorBoard's ID is printed.  Otherwise events are streamed over a websocket until the
    service-ready event arrives, at which point the URL is printed (and, unless
    ``args.no_browser`` is set, ``api.browser_open`` is called).
    """
    if not (args.trial_ids or args.experiment_ids):
        print("Either experiment_ids or trial_ids must be specified.")
        sys.exit(1)

    config = parse_config(args.config_file, None, args.config, [])
    req_body = {
        "config": config,
        "trial_ids": args.trial_ids,
        "experiment_ids": args.experiment_ids,
    }

    # Attach context files only when a context was given.
    if args.context is not None:
        req_body["files"], _ = context.read_context(args.context, constants.MAX_CONTEXT_SIZE)

    resp = api.post(args.master, "api/v1/tensorboards", json=req_body).json()["tensorboard"]

    if args.detach:
        print(resp["id"])
        return

    url = "tensorboard/{}/events".format(resp["id"])
    with api.ws(args.master, url) as ws:
        for msg in ws:
            if msg["log_event"] is not None:
                # TensorBoard will print a url by default. The URL is incorrect since
                # TensorBoard is not aware of the master proxy address it is assigned.
                if "http" in msg["log_event"]:
                    continue

            if msg["service_ready_event"]:
                # `url` is reused here for the user-facing service address.
                if args.no_browser:
                    url = api.make_url(args.master, resp["serviceAddress"])
                else:
                    url = api.browser_open(args.master, resp["serviceAddress"])

                print(colored("TensorBoard is running at: {}".format(url), "green"))
                render_event_stream(msg)
                # Stop following events once the service is ready.
                break
            render_event_stream(msg)
def do_login(
    master_address: str,
    username: str,
    password: str,
    cert: Optional[certs.Cert] = None,
) -> str:
    """
    Log in to the master with a username and password and return the session token.

    Arguments:
        master_address: Address of the master.
        username: Name of the user to log in as.
        password: Password sent with the login request.
        cert: Optional certificate passed through to ``api.post``.
    """
    credentials = {"username": username, "password": password}
    response = api.post(
        master_address,
        "login",
        body=credentials,
        authenticated=False,
        cert=cert,
    )

    token = response.json()["token"]
    assert isinstance(token, str), "got invalid token response from server"
    return token
def create_model(
    self,
    name: str,
    description: Optional[str] = "",
    metadata: Optional[Dict[str, Any]] = None,
) -> Model:
    """
    Create a model in the model registry and return it.

    Arguments:
        name (string): The name of the model. This name must be unique.
        description (string, optional): A description of the model.
        metadata (dict, optional): Dictionary of metadata to add to the model.
    """
    model_spec = {"description": description, "metadata": metadata}
    response = api.post(
        self._session._master,
        "/api/v1/models/{}".format(name),
        body=model_spec,
    )

    model_json = response.json().get("model")
    return Model.from_json(model_json, self._session._master)
def launch_command(
    master: str,
    endpoint: str,
    config: Dict[str, Any],
    template: str,
    context_path: Optional[Path] = None,
    data: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    POST a launch request to ``endpoint`` on the master and return the decoded JSON response.

    The request body always carries ``config``, ``template``, ``user_files`` and ``data``.
    ``user_files`` is an empty list unless ``context_path`` is given, in which case it is read
    via ``context.read_context``.
    """
    files = []  # type: List[Dict[str, Any]]
    if context_path:
        files, _ = context.read_context(context_path)

    request_body = {
        "config": config,
        "template": template,
        "user_files": files,
        "data": data,
    }
    response = api.post(master, endpoint, body=request_body)
    return response.json()
def kill_experiment(args: Namespace) -> None:
    """Post a kill request for experiment ``args.experiment_id`` and print a confirmation."""
    kill_path = "experiments/{}/kill".format(args.experiment_id)
    api.post(args.master, kill_path)

    confirmation = "Killed experiment {}".format(args.experiment_id)
    print(confirmation)
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    """
    Launch ``script`` for a trial, under horovodrun on the chief or as an sshd worker otherwise.

    Arguments:
        hvd_args: Extra arguments appended to the horovodrun optional arguments; may be empty
            or None.
        script: Command line of the training script to run.
        autohorovod: When true, trials with a single container and at most one slot run
            ``script`` directly instead of going through horovodrun.

    Returns:
        The exit code of the process that was launched.
    """
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug
        )

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # Chief machine waits for every worker's sshd to be available.  All machines should be pretty
    # close to in-step by now because all machines just finished synchronizing rendezvous info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    # Copy the configured list before extending it: `+=` on the object returned by .get() would
    # otherwise append to the list stored inside the experiment config itself.
    hvd_optional_args = list(experiment_config.get("data", {}).get("__det_dtrain_args", []))
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.  When invoked the
    # SLURM_JOBID variable triggers integration with SLURM, however, we are
    # running in a singularity container and SLURM may or may not have
    # compatible configuration enabled.  We therefore clear the SLURM_JOBID variable
    # before invoking mpi so that mpirun will honor the args passed via horovod
    # run to it describing the hosts and process topology, otherwise mpi ends
    # up wanting to launch all -np# processes on the local causing an oversubscription
    # error ("There are not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)

    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script)
    with det.util.forward_signals(p):
        return p.wait()
def ship(self) -> None:
    """Post all buffered logs to the master's ``task-logs`` endpoint, then empty the buffer.

    Nothing is sent when the buffer is empty.
    """
    if not len(self.logs) > 0:
        return

    api.post(self.master_url, "task-logs", self.logs)
    self.logs = []
def main(script: List[str]) -> int:
    """
    Launch ``script`` for a trial under deepspeed on the chief, or run sshd on other ranks.

    Non-chief containers (``container_rank > 0``) register themselves as daemon resources and
    run sshd wrapped in a pid_server.  The chief builds the full deepspeed command; for
    multi-machine trials it also starts a local sshd and waits for sshd on every container
    before launching.

    Returns:
        The exit code of the process that was launched.
    """
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.
        # This is useful for deepspeed which does not have good error handling for remote processes
        # spun up by pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        return subprocess.Popen(pid_server_cmd + run_sshd_command).wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker

    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    # NOTE(review): `hostfile_path` is not defined in this block; presumably a module-level
    # constant -- confirm.
    master_address = create_hostlist_file(
        hostfile_path=pathlib.Path(hostfile_path),
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    # Single-machine trials launch directly; the sshd/pdsh setup below is multi-machine only.
    multi_machine = len(info.container_addrs) > 1
    if not multi_machine:
        return subprocess.Popen(full_cmd).wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()

    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # Chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # Chief machine waits for every worker's sshd to be available.  All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        return subprocess.Popen(full_cmd).wait()
    finally:
        # Always stop and reap the chief's own sshd, whether or not the launch succeeded.
        sshd_process.kill()
        sshd_process.wait()
def _kill(master_url: str, taskType: str, taskID: str) -> None:
    """Post a kill request for task ``taskID`` of kind ``taskType`` to the master."""
    api_segment = RemoteTaskNewAPIs[taskType]
    kill_path = "api/v1/{}/{}/kill".format(api_segment, taskID)
    api.post(master_url, kill_path)
def kill_trial(args: Namespace) -> None:
    """Post a kill request for trial ``args.trial_id`` and print a confirmation."""
    kill_path = "trials/{}/kill".format(args.trial_id)
    api.post(args.master, kill_path)

    confirmation = "Killed trial {}".format(args.trial_id)
    print(confirmation)
def create_user(parsed_args: Namespace) -> None:
    """
    Create an active user on the master.

    The username comes from ``parsed_args.username``; ``parsed_args.admin`` is coerced to a
    bool and decides whether the new user is an admin.
    """
    new_user = {
        "username": parsed_args.username,
        "admin": bool(parsed_args.admin),
        "active": True,
    }
    api.post(parsed_args.master, "users", body=new_user)