예제 #1
0
def post_ready(master_url: str, cert: certs.Cert, allocation_id: str):
    """POST an empty dict to the ``ready`` endpoint of an allocation.

    Arguments:
        master_url: Address of the master, passed to ``api.post`` as-is.
        cert: Certificate forwarded to ``api.post`` via its ``cert`` keyword.
        allocation_id: Allocation ID interpolated into the request path.
    """
    ready_path = f"/api/v1/allocations/{allocation_id}/ready"
    api.post(master_url, ready_path, {}, cert=cert)
예제 #2
0
def trigger_preemption(signum: int, frame: types.FrameType) -> None:
    """Ask the master for preemption, but only from the rank-0 container.

    The signature matches a signal handler; ``signum`` and ``frame`` are not
    used.  When no cluster info is available, or this container's rank is not
    0, nothing happens.
    """
    cluster = det.get_cluster_info()
    if not cluster or cluster.container_rank != 0:
        # Only the chief container requests preemption; everyone else ignores it.
        return

    logging.debug(
        f"[rank={cluster.container_rank}] SIGTERM: Preemption imminent.")

    # Notify the master that we need to be preempted.
    preemption_path = (
        f"/api/v1/allocations/{cluster.allocation_id}/signals/pending_preemption"
    )
    api.post(cluster.master_url, preemption_path)
예제 #3
0
def set_priority(args: Namespace) -> None:
    """Set the priority of a remote task and report the outcome.

    The task ID comes from ``expand_uuid_prefixes(args)`` and the endpoint
    segment from ``RemoteTaskNewAPIs[args._command]``.  An ``APIException``
    from the request is not re-raised; it is printed in red as "Skipping".
    """
    task_id = expand_uuid_prefixes(args)
    name = RemoteTaskName[args._command]

    try:
        endpoint = "api/v1/{}/{}/set_priority".format(
            RemoteTaskNewAPIs[args._command], task_id)
        api.post(args.master, endpoint, {"priority": args.priority})
    except api.errors.APIException as e:
        failure = "Skipping: {} ({})".format(e, type(e).__name__)
        print(colored(failure, "red"))
    else:
        success = "Set priority of {} {} to {}".format(
            name, task_id, args.priority)
        print(colored(success, "green"))
예제 #4
0
def post_trial_profiler_metrics_batches(
    master_url: str,
    batches: List[TrialProfilerMetricsBatch],
) -> None:
    """
    Send profiler metric batches to the master to be persisted.

    Each batch is serialized through its instance ``__dict__`` and the
    resulting list is posted under the ``batches`` key.  Labels must contain
    only a subset of the keys: trial_id, name, gpu_uuid, agent_id and
    metric_type, where metric_type is one of PROFILER_METRIC_TYPE_SYSTEM or
    PROFILER_METRIC_TYPE_TIMING.
    """
    serialized = [batch.__dict__ for batch in batches]
    payload = {"batches": serialized}
    api.post(master_url, "/api/v1/trials/profiler/metrics", body=payload)
예제 #5
0
def kill(args: Namespace) -> None:
    """Kill every task selected by ``args``, one request per task.

    On an ``APIException``: with ``args.force`` set, the failure is printed
    as "Skipping" and the loop moves on; otherwise every task not yet
    attempted is listed and the exception is re-raised.
    """
    ids = RemoteTaskGetIDsFunc[args._command](args)  # type: ignore
    name = RemoteTaskName[args._command]

    for position, task_id in enumerate(ids):
        try:
            endpoint = "api/v1/{}/{}/kill".format(
                RemoteTaskNewAPIs[args._command], task_id)
            api.post(args.master, endpoint)
            print(colored("Killed {} {}".format(name, task_id), "green"))
        except api.errors.APIException as e:
            if args.force:
                print(colored("Skipping: {} ({})".format(e, type(e).__name__), "red"))
                continue
            # Without --force, report what is being left alone and give up.
            for ignored in ids[position + 1:]:
                print("Cowardly not killing {}".format(ignored))
            raise e
예제 #6
0
    def patch(args: argparse.Namespace) -> None:
        """Enable or disable one agent, or every agent, on the master.

        The direction is decided by ``enabled``, which is not defined in this
        function and is read from the enclosing scope.  When disabling with a
        truthy ``args.drain``, the drain flag is sent with each request and
        the tasks whose resources reference the affected agents are rendered
        afterwards.
        """
        check_false(args.all and args.agent_id)

        if not args.all and not args.agent_id:
            print("Error: must specify exactly one of `--all` or agent_id",
                  file=sys.stderr)
            sys.exit(1)

        if args.agent_id:
            agent_ids = [args.agent_id]
        else:
            agents_resp = api.get(args.master, "agents")
            agent_ids = sorted(
                local_id(agent) for agent in agents_resp.json().keys())

        # Draining only applies when disabling, so it is forced off otherwise.
        drain_mode = None if enabled else args.drain
        action = "enable" if enabled else "disable"
        status = "Enabled" if enabled else "Disabled"

        for agent_id in agent_ids:
            # drain_mode is None whenever enabling, so its truthiness alone
            # decides whether a payload is sent.
            payload = {"drain": drain_mode} if drain_mode else None
            api.post(args.master, f"api/v1/agents/{agent_id}/{action}", payload)
            print(f"{status} agent {agent_id}.", file=sys.stderr)

        if not drain_mode:
            return

        # When draining, check if there are any tasks currently running on
        # these slots, and list them.
        tasks_resp = api.get(args.master, "tasks")
        tasks_data = {}
        for task_key, task in tasks_resp.json().items():
            on_drained_agent = any(
                device in agent_ids
                for resource in task.get("resources", [])
                for device in resource["agent_devices"]
            )
            if on_drained_agent:
                tasks_data[task_key] = task

        if not (args.json or args.csv):
            if tasks_data:
                print("Tasks still in progress on draining nodes.")
            else:
                print("No tasks in progress on draining nodes.")

        cli_task.render_tasks(args, tasks_data)
예제 #7
0
    def register_version(self, checkpoint_uuid: str) -> Checkpoint:
        """
        Registers a checkpoint as a new version of this model and returns the
        :class:`~determined.experimental.Checkpoint` built from the response.

        The returned checkpoint JSON is the response's checkpoint object with
        ``model_version`` and ``model_name`` added from the same response.

        Arguments:
            checkpoint_uuid: The UUID of the checkpoint to register.
        """
        versions_path = "/api/v1/models/{}/versions".format(self.name)
        request_body = {"checkpoint_uuid": checkpoint_uuid}
        data = api.post(self._master, versions_path, body=request_body).json()

        version_info = data["modelVersion"]
        checkpoint_json = {
            **version_info["checkpoint"],
            "model_version": version_info["version"],
            "model_name": version_info["model"]["name"],
        }
        return Checkpoint.from_json(checkpoint_json, self._master)
예제 #8
0
def launch_command(
    master: str,
    endpoint: str,
    config: Dict[str, Any],
    template: str,
    context_path: Optional[Path] = None,
    data: Optional[Dict[str, Any]] = None,
    preview: Optional[bool] = False,
) -> Any:
    """Build a launch request body and POST it to ``endpoint``.

    Arguments:
        master: Address of the master to contact.
        endpoint: API path the request is posted to.
        config: Sent under the ``config`` key.
        template: Sent under ``template_name`` when truthy.
        context_path: When given, its contents are read with
            ``context.read_context`` and sent under ``files`` if non-empty.
        data: When not ``None``, JSON-encoded, base64-encoded and sent under
            ``data`` as an ASCII string.
        preview: Sent under ``preview`` when truthy.

    Returns:
        The decoded JSON body of the response.
    """
    user_files = []  # type: List[Dict[str, Any]]
    if context_path:
        user_files, _ = context.read_context(context_path)

    body = {"config": config}  # type: Dict[str, Any]

    if template:
        body["template_name"] = template

    if user_files:
        body["files"] = user_files

    if data is not None:
        message_bytes = json.dumps(data).encode("utf-8")
        # b64encode returns bytes, which the standard json encoder cannot
        # serialize; base64 output is pure ASCII, so decode it to a str.
        body["data"] = base64.b64encode(message_bytes).decode("ascii")

    if preview:
        body["preview"] = preview

    return api.post(
        master,
        endpoint,
        body,
    ).json()
예제 #9
0
def cancel_experiment_v1(experiment_id: int) -> None:
    """Cancel an experiment and wait until it reaches the CANCELED state.

    Sets the module-level CLI cert and CLI auth before issuing the request,
    and raises through ``raise_for_status`` if the response indicates failure.
    """
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    cancel_path = "/api/v1/experiments/{}/cancel".format(experiment_id)
    response = api.post(conf.make_master_url(), cancel_path)
    response.raise_for_status()

    wait_for_experiment_state(experiment_id, "CANCELED")
예제 #10
0
def preview_search(args: Namespace) -> None:
    """Ask the master to preview a search and print the trial breakdown.

    Loads the experiment config from ``args.config_file`` (closing it
    afterwards), exits with status 1 if it has no ``searcher`` section, posts
    it to ``searcher/preview``, then prints the searcher config, the total
    trial count and a table with one row per distinct operation sequence.
    """
    experiment_config = safe_load_yaml_with_exceptions(args.config_file)
    args.config_file.close()

    if "searcher" not in experiment_config:
        print("Experiment configuration must have 'searcher' section")
        sys.exit(1)
    preview = api.post(args.master, "searcher/preview", json=experiment_config).json()

    # Trailing unit letter of an operation -> description template.
    unit_templates = {
        "R": "train {} records",
        "B": "train {} batch(es)",
        "E": "train {} epoch(s)",
    }

    def to_full_name(kind: str) -> str:
        """Translate one operation code into a human-readable phrase."""
        try:
            # The unitless searcher case, for masters newer than 0.17.6.
            length = int(kind)
            return f"train for {length}"
        except ValueError:
            pass
        template = unit_templates.get(kind[-1])
        if template is not None:
            return template.format(kind[:-1])
        if kind == "V":
            return "validation"
        raise ValueError("unexpected kind: {}".format(kind))

    def render_sequence(sequence: List[str]) -> str:
        """Collapse consecutive identical operations into "N x <name>" items."""
        if not sequence:
            return "N/A"
        runs = []  # each entry is a [kind, repeat_count] pair
        for kind in sequence:
            if runs and runs[-1][0] == kind:
                runs[-1][1] += 1
            else:
                runs.append([kind, 1])
        return ", ".join(
            "{} x {}".format(repeat, to_full_name(kind)) for kind, repeat in runs
        )

    headers = ["Trials", "Breakdown"]
    values = [
        (trial_count, render_sequence(operations.split()))
        for operations, trial_count in preview["results"].items()
    ]

    print(colored("Using search configuration:", "green"))
    yml = yaml.YAML()
    yml.indent(mapping=2, sequence=4, offset=2)
    yml.dump(experiment_config["searcher"], sys.stdout)
    print()
    total_trials = sum(preview["results"].values())
    print("This search will create a total of {} trial(s).".format(total_trials))
    print(tabulate.tabulate(values, headers, tablefmt="presto"), flush=False)
예제 #11
0
def log_out_user(parsed_args: Namespace) -> None:
    """Log the current CLI user out and drop their stored token.

    Does nothing when there is no CLI auth.  An ``APIException`` with status
    code 401 from the logout request is ignored; any other one is re-raised.
    """
    auth = authentication.cli_auth
    if auth is None:
        return

    try:
        bearer = "Bearer {}".format(auth.get_session_token())
        api.post(
            parsed_args.master,
            "logout",
            headers={"Authorization": bearer},
            authenticated=False,
        )
    except api.errors.APIException as e:
        if e.status_code != 401:
            raise e

    authentication.TokenStore(parsed_args.master).drop_user(auth.get_session_user())
예제 #12
0
    def add_metadata(self, metadata: Dict[str, Any]) -> None:
        """
        Merges user-defined metadata into the checkpoint. The ``metadata`` argument must
        be a JSON-serializable dictionary. Keys already present in the checkpoint metadata
        are overwritten with the passed-in values. When the checkpoint has a master set,
        the full updated metadata is then posted to it.

        Arguments:
            metadata (dict): Dictionary of metadata to add to the checkpoint.
        """
        self.metadata.update(metadata)

        if not self._master:
            return

        metadata_path = "/api/v1/checkpoints/{}/metadata".format(self.uuid)
        api.post(
            self._master,
            metadata_path,
            body={"checkpoint": {"metadata": self.metadata}},
        )
예제 #13
0
    def remove_metadata(self, keys: List[str]) -> None:
        """
        Deletes user-defined metadata from the checkpoint. Every top-level key listed in
        ``keys`` that is present is removed; keys that are absent are ignored. When the
        checkpoint has a master set, the remaining metadata is then posted to it.

        Arguments:
            keys (List[string]): Top-level keys to remove from the checkpoint metadata.
        """
        for key in keys:
            self.metadata.pop(key, None)

        if not self._master:
            return

        metadata_path = "/api/v1/checkpoints/{}/metadata".format(self.uuid)
        api.post(
            self._master,
            metadata_path,
            body={"checkpoint": {"metadata": self.metadata}},
        )
예제 #14
0
def add_client(parsed_args: Namespace) -> None:
    """Create an OAuth2 client and print its ID and secret.

    A ``NotFoundException`` from the request is converted into an
    ``EnterpriseOnlyError``.
    """
    request_body = {"domain": parsed_args.domain, "name": parsed_args.name}
    try:
        response = api.post(parsed_args.master, "oauth2/clients", body=request_body)
        client = response.json()
    except NotFoundException:
        raise EnterpriseOnlyError("API not found: oauth2/clients")

    print("Client ID:     {}".format(client["id"]))
    print("Client secret: {}".format(client["secret"]))
예제 #15
0
def register_version(args: Namespace) -> None:
    """Register a checkpoint as a new model version and display the result.

    With ``args.json`` the raw response is printed as indented JSON;
    otherwise the model and the new version are rendered.
    """
    if not args.json:
        model = Determined(args.master, None).get_model(args.name)
        checkpoint = model.register_version(args.uuid)
        render_model(model)
        print("\n")
        render_model_version(checkpoint)
        return

    versions_path = "/api/v1/models/{}/versions".format(args.name)
    resp = api.post(args.master, versions_path, body={"checkpoint_uuid": args.uuid})
    print(json.dumps(resp.json(), indent=2))
예제 #16
0
def do_login(master_address: str, auth: Authentication, username: str,
             password: str) -> str:
    """Log in with a username and password, store the token, and return it.

    The request is sent unauthenticated; the ``token`` field of the response
    is saved in ``auth.token_store`` under ``username``.
    """
    credentials = {"username": username, "password": password}
    response = api.post(
        master_address, "login", body=credentials, authenticated=False)

    token = cast(str, response.json()["token"])
    auth.token_store.set_token(username, token)
    return token
예제 #17
0
def start_tensorboard(args: Namespace) -> None:
    """Start a TensorBoard and follow its event stream until it is ready.

    Exits with status 1 unless trial IDs or experiment IDs are given.  With
    ``args.detach`` only the new TensorBoard's ID is printed.  Otherwise
    events are rendered as they arrive, and once the service is ready its URL
    is printed (and opened in a browser unless ``args.no_browser`` is set).
    """
    if not args.trial_ids and not args.experiment_ids:
        print("Either experiment_ids or trial_ids must be specified.")
        sys.exit(1)

    req_body = {
        "config": parse_config(args.config_file, None, args.config, []),
        "trial_ids": args.trial_ids,
        "experiment_ids": args.experiment_ids,
    }

    if args.context is not None:
        req_body["files"], _ = context.read_context(
            args.context, constants.MAX_CONTEXT_SIZE)

    created = api.post(args.master, "api/v1/tensorboards", json=req_body)
    tensorboard = created.json()["tensorboard"]

    if args.detach:
        print(tensorboard["id"])
        return

    events_path = "tensorboard/{}/events".format(tensorboard["id"])
    with api.ws(args.master, events_path) as ws:
        for msg in ws:
            log_event = msg["log_event"]
            # TensorBoard will print a url by default. The URL is incorrect since
            # TensorBoard is not aware of the master proxy address it is assigned.
            if log_event is not None and "http" in log_event:
                continue

            if not msg["service_ready_event"]:
                render_event_stream(msg)
                continue

            if args.no_browser:
                url = api.make_url(args.master, tensorboard["serviceAddress"])
            else:
                url = api.browser_open(args.master, tensorboard["serviceAddress"])

            print(colored("TensorBoard is running at: {}".format(url), "green"))
            render_event_stream(msg)
            break
예제 #18
0
def do_login(
    master_address: str,
    username: str,
    password: str,
    cert: Optional[certs.Cert] = None,
) -> str:
    """Log in with a username and password and return the session token.

    The request is sent unauthenticated, with ``cert`` forwarded to
    ``api.post``.  Asserts that the ``token`` field of the response is a
    string.
    """
    credentials = {"username": username, "password": password}
    response = api.post(
        master_address,
        "login",
        body=credentials,
        authenticated=False,
        cert=cert,
    )

    token = response.json()["token"]
    assert isinstance(token, str), "got invalid token response from server"
    return token
예제 #19
0
    def create_model(self,
                     name: str,
                     description: Optional[str] = "",
                     metadata: Optional[Dict[str, Any]] = None) -> Model:
        """
        Create a model in the model registry and return it.

        The model is built from the ``model`` field of the response.

        Arguments:
            name (string): The name of the model. This name must be unique.
            description (string, optional): A description of the model.
            metadata (dict, optional): Dictionary of metadata to add to the model.
        """
        master = self._session._master
        request_body = {"description": description, "metadata": metadata}
        response = api.post(
            master, "/api/v1/models/{}".format(name), body=request_body)

        return Model.from_json(response.json().get("model"), self._session._master)
예제 #20
0
def launch_command(
    master: str,
    endpoint: str,
    config: Dict[str, Any],
    template: str,
    context_path: Optional[Path] = None,
    data: Optional[Dict[str, Any]] = None,
) -> Any:
    """POST a launch request to ``endpoint`` and return the decoded JSON reply.

    The body always carries ``config``, ``template``, ``user_files`` and
    ``data``; ``user_files`` is an empty list unless ``context_path`` is
    given, in which case it is read with ``context.read_context``.
    """
    user_files = []  # type: List[Dict[str, Any]]
    if context_path:
        user_files, _ = context.read_context(context_path)

    request_body = {
        "config": config,
        "template": template,
        "user_files": user_files,
        "data": data,
    }
    response = api.post(master, endpoint, body=request_body)
    return response.json()
예제 #21
0
def kill_experiment(args: Namespace) -> None:
    """Post a kill request for ``args.experiment_id`` and print a confirmation."""
    experiment_id = args.experiment_id
    kill_path = "experiments/{}/kill".format(experiment_id)
    api.post(args.master, kill_path)
    print("Killed experiment {}".format(args.experiment_id))
예제 #22
0
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    """Launch a training script, under horovodrun when needed, for this container.

    Depending on the cluster info this does one of three things:

    - with ``autohorovod`` set, a single container and at most one slot, it
      runs ``script`` directly;
    - on containers with rank > 0 it marks the container's resources as a
      daemon on the master and runs the sshd worker command;
    - on the rank-0 container it waits for every peer's sshd and then runs
      ``script`` wrapped by the pid server, horovodrun and the worker wrapper.

    Arguments:
        hvd_args: Extra arguments appended to the horovodrun optional
            arguments; a falsy value is treated as an empty list.
        script: Command line of the training script.
        autohorovod: Enables the direct-launch shortcut described above.

    Returns:
        The value returned by ``wait()`` on the launched subprocess.
    """
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(
            info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=
            f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug)

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}.")
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # Chief machine waits for every worker's sshd to be available.  All machines should be pretty
    # close to in-step by now because all machines just finished synchronizing rendezvous info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id,
                                               len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    # Copy before extending: `+=` on the looked-up value would otherwise append
    # to the list stored inside the experiment config whenever
    # "__det_dtrain_args" is present.
    hvd_optional_args = list(
        experiment_config.get("data", {}).get("__det_dtrain_args", []))
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(
        f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.   When invoked the
    # SLURM_JOBID variable triggers integration with SLURM, however, we are
    # running in a singularity container and SLURM may or may not have
    # compatible configuration enabled.  We therefore clear the SLURM_JOBID variable
    # before invoking mpi so that mpirun will honor the args passed via horovod
    # run to it describing the hosts and process topology, otherwise mpi ends
    # up wanting to launch all -np# processes on the local causing an oversubscription
    # error ("There are not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)
    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd +
                         script)
    with det.util.forward_signals(p):
        return p.wait()
예제 #23
0
    def ship(self) -> None:
        """Post the buffered logs to the master's ``task-logs`` path, then clear them.

        Does nothing when the buffer is empty.
        """
        if not self.logs:
            return

        pending = self.logs
        api.post(self.master_url, "task-logs", pending)
        self.logs = []
예제 #24
0
def main(script: List[str]) -> int:
    """Launch a training script under the deepspeed launcher for this container.

    Containers with rank > 0 mark their resources as a daemon on the master
    and run sshd wrapped in a pid server.  The rank-0 container builds the
    full launch command and runs it; when there is more than one container it
    first writes the deepspeed environment file, sets ``PDSH_SSH_ARGS``,
    starts its own sshd and waits for sshd on every container address.

    Arguments:
        script: Command line of the training script, appended last to the
            launch command.

    Returns:
        The value returned by ``wait()`` on the launched subprocess.
    """
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.
        # This is useful for deepspeed which does not have good error handling for remote processes
        # spun up by pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        return subprocess.Popen(pid_server_cmd + run_sshd_command).wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker

    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    # NOTE(review): hostfile_path is not assigned anywhere in this function;
    # presumably it is a module-level constant -- confirm.
    master_address = create_hostlist_file(
        hostfile_path=pathlib.Path(hostfile_path),
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    # With a single container there are no peers to reach, so the command is
    # run directly without the env file, PDSH args or a local sshd.
    multi_machine = len(info.container_addrs) > 1
    if not multi_machine:
        return subprocess.Popen(full_cmd).wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()
    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # Chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # Chief machine waits for every worker's sshd to be available.  All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        return subprocess.Popen(full_cmd).wait()
    finally:
        # Always stop the chief's own sshd, whether the launch succeeded or raised.
        sshd_process.kill()
        sshd_process.wait()
예제 #25
0
def _kill(master_url: str, taskType: str, taskID: str) -> None:
    """Post a kill request for one task.

    ``taskType`` selects the endpoint segment through ``RemoteTaskNewAPIs``.
    """
    api_segment = RemoteTaskNewAPIs[taskType]
    kill_path = "api/v1/{}/{}/kill".format(api_segment, taskID)
    api.post(master_url, kill_path)
예제 #26
0
def kill_trial(args: Namespace) -> None:
    """Post a kill request for ``args.trial_id`` and print a confirmation."""
    kill_path = "trials/{}/kill".format(args.trial_id)
    api.post(args.master, kill_path)
    print("Killed trial {}".format(args.trial_id))
예제 #27
0
def create_user(parsed_args: Namespace) -> None:
    """Post a new, active user to the master's ``users`` path.

    ``parsed_args.admin`` is coerced to a bool before being sent.
    """
    new_user = {
        "username": parsed_args.username,
        "admin": bool(parsed_args.admin),
        "active": True,
    }
    api.post(parsed_args.master, "users", body=new_user)