def list_agents(args: argparse.Namespace) -> None:
    """Fetch all agents from the master and print them as JSON, CSV, or a table."""
    response = api.get(args.master, "agents")

    # One ordered row per agent, sorted by agent id for deterministic output.
    rows = []
    for agent_id, agent in sorted(response.json().items()):
        rows.append(
            OrderedDict(
                [
                    ("id", local_id(agent_id)),
                    ("registered_time", render.format_time(agent["registered_time"])),
                    ("num_slots", len(agent["slots"])),
                    ("num_containers", agent["num_containers"]),
                    ("resource_pool", agent["resource_pool"]),
                    ("label", agent["label"]),
                    ("addresses", ", ".join(agent["addresses"])),
                ]
            )
        )

    if args.json:
        print(json.dumps(rows, indent=4))
        return

    headers = [
        "Agent ID",
        "Registered Time",
        "Slots",
        "Containers",
        "Resource Pool",
        "Label",
        "Addresses",
    ]
    render.tabulate_or_csv(headers, [r.values() for r in rows], args.csv)
def list_tasks(args: Namespace) -> None:
    """List remote tasks of this command's type, filtered to the session user unless --all."""
    api_path = RemoteTaskNewAPIs[args._command]
    table_header = RemoteTaskListTableHeaders[args._command]

    # Without --all, restrict the listing to tasks owned by the current user.
    params = {}  # type: Dict[str, Any]
    if not args.all:
        params = {"users": [authentication.must_cli_auth().get_session_user()]}

    res = api.get(args.master, "api/v1/{}".format(api_path), params=params).json()[api_path]

    if args.quiet:
        # Quiet mode: bare ids only, one per line.
        for command in res:
            print(command["id"])
        return

    # Strip the redundant "STATE_" prefix from state names for display.
    for item in res:
        if item["state"].startswith("STATE_"):
            item["state"] = item["state"][6:]

    if getattr(args, "json", None):
        print(json.dumps(res, indent=4))
        return

    values = render.select_values(res, table_header)
    render.tabulate_or_csv(table_header, values, getattr(args, "csv", False))
def list_slots(args: argparse.Namespace) -> None:
    """Print every slot on every agent with the allocation occupying it, if any.

    Cross-references the master's "tasks" and "agents" endpoints so occupied
    slots can be labeled with their allocation id and task name.
    """
    task_res = api.get(args.master, "tasks")
    agent_res = api.get(args.master, "agents")
    agents = agent_res.json()
    allocations = task_res.json()

    # container id -> {name, allocation_id} for containers owned by a known allocation.
    c_names = {
        r["container_id"]: {"name": a["name"], "allocation_id": a["allocation_id"]}
        for a in allocations.values()
        for r in a["resources"]
        if r["container_id"]
    }

    def get_allocation_id(slot: Dict[str, Any]) -> str:
        """Allocation label for a slot: its allocation id, OCCUPIED, or FREE."""
        if not slot["container"]:
            return "FREE"
        # BUG FIX: a slot can be occupied by a container that is not part of any
        # known Determined allocation (e.g. a non-Determined workload); indexing
        # c_names directly raised KeyError in that case.
        entry = c_names.get(slot["container"]["id"])
        return entry["allocation_id"] if entry is not None else "OCCUPIED"

    def get_task_name(slot: Dict[str, Any]) -> str:
        """Task name for a slot, or a placeholder for free/unknown containers."""
        if not slot["container"]:
            return "None"
        entry = c_names.get(slot["container"]["id"])
        return entry["name"] if entry is not None else "Non-Determined Task"

    slots = [
        OrderedDict(
            [
                ("agent_id", local_id(agent_id)),
                ("resource_pool", agent["resource_pool"]),
                ("slot_id", local_id(slot_id)),
                ("enabled", slot["enabled"]),
                ("draining", slot.get("draining", False)),
                ("allocation_id", get_allocation_id(slot)),
                ("task_name", get_task_name(slot)),
                ("type", slot["device"]["type"]),
                ("device", slot["device"]["brand"]),
            ]
        )
        for agent_id, agent in sorted(agents.items())
        for slot_id, slot in sorted(agent["slots"].items())
    ]

    headers = [
        "Agent ID",
        "Resource Pool",
        "Slot ID",
        "Enabled",
        "Draining",
        "Allocation ID",
        "Task Name",
        "Type",
        "Device",
    ]

    if args.json:
        print(json.dumps(slots, indent=4))
        return

    values = [s.values() for s in slots]
    render.tabulate_or_csv(headers, values, args.csv)
def describe_trial(args: Namespace) -> None:
    """Show a trial's metadata and per-workload history as a table, CSV, or JSON."""
    if args.metrics:
        endpoint = "trials/{}/metrics".format(args.trial_id)
    else:
        endpoint = "trials/{}".format(args.trial_id)
    trial = api.get(args.master, endpoint).json()

    if args.json:
        print(json.dumps(trial, indent=4))
        return

    # Trial-level summary.
    trial_headers = ["Experiment ID", "State", "H-Params", "Start Time", "End Time"]
    trial_row = [
        trial["experiment_id"],
        trial["state"],
        json.dumps(trial["hparams"], indent=4),
        render.format_time(trial["start_time"]),
        render.format_time(trial["end_time"]),
    ]
    render.tabulate_or_csv(trial_headers, [trial_row], args.csv)

    # Per-workload details; a metrics column is appended only with --metrics.
    step_headers = [
        "# of Batches",
        "State",
        "Start Time",
        "End Time",
        "Checkpoint",
        "Checkpoint UUID",
        "Checkpoint Metadata",
        "Validation",
        "Validation Metrics",
    ]
    if args.metrics:
        step_headers.append("Workload Metrics")

    step_rows = []
    for s in trial["steps"]:
        row = [
            s["prior_batches_processed"] + s["num_batches"],
            s["state"],
            render.format_time(s["start_time"]),
            render.format_time(s["end_time"]),
            *format_checkpoint(s["checkpoint"]),
            *format_validation(s["validation"]),
        ]
        if args.metrics:
            row.append(json.dumps(s["metrics"], indent=4))
        step_rows.append(row)

    print()
    print("Workloads:")
    render.tabulate_or_csv(step_headers, step_rows, args.csv)
def ls(args: Namespace) -> None:
    """List jobs in a resource pool as YAML, JSON, or a table/CSV.

    The "Priority"/"Weight" column depends on whether the target pool uses the
    priority scheduler.
    """
    session = setup_session(args)
    # FIX: reuse the session created above instead of calling setup_session a
    # second time for the resource-pool lookup.
    pools = bindings.get_GetResourcePools(session)
    is_priority = check_is_priority(pools, args.resource_pool)

    response = bindings.get_GetJobs(
        session,
        resourcePool=args.resource_pool,
        pagination_limit=args.limit,
        pagination_offset=args.offset,
        orderBy=bindings.v1OrderBy.ORDER_BY_ASC
        if not args.reverse
        else bindings.v1OrderBy.ORDER_BY_DESC,
    )
    if args.yaml:
        print(yaml.safe_dump(response.to_json(), default_flow_style=False))
    elif args.json:
        print(json.dumps(response.to_json(), indent=4, default=str))
    else:
        headers = [
            "#",
            "ID",
            "Type",
            "Job Name",
            "Priority" if is_priority else "Weight",
            "Submitted",
            "Slots (acquired/needed)",
            "Status",
            "User",
        ]

        def computed_job_name(job: bindings.v1Job) -> str:
            # Experiments show their entity id alongside the job name.
            if job.type == bindings.determinedjobv1Type.TYPE_EXPERIMENT:
                return f"{job.name} ({job.entityId})"
            else:
                return job.name

        values = [
            [
                j.summary.jobsAhead
                if j.summary is not None and j.summary.jobsAhead > -1
                else "N/A",
                j.jobId,
                j.type.value,
                computed_job_name(j),
                j.priority if is_priority else j.weight,
                # Submission times arrive with fractional seconds; drop them and
                # mark the parsed naive datetime as UTC.
                pytz.utc.localize(
                    datetime.strptime(j.submissionTime.split(".")[0], "%Y-%m-%dT%H:%M:%S")
                ),
                f"{j.allocatedSlots}/{j.requestedSlots}",
                j.summary.state.value if j.summary is not None else "N/A",
                j.username,
            ]
            for j in response.jobs
        ]
        render.tabulate_or_csv(headers, values, as_csv=args.csv)
def list_clients(parsed_args: Namespace) -> None:
    """List registered OAuth2 clients (an enterprise-only master endpoint)."""
    try:
        clients = api.get(parsed_args.master, "oauth2/clients").json()
    except NotFoundException:
        # The endpoint only exists in enterprise builds of the master.
        raise EnterpriseOnlyError("API not found: oauth2/clients")

    headers = ["Name", "Client ID", "Domain"]
    keys = ["name", "id", "domain"]
    rows = [[str(client[k]) for k in keys] for client in clients]
    render.tabulate_or_csv(headers, rows, False)
def list_trials(args: Namespace) -> None:
    """Print every trial of an experiment with state, hyperparameters, and timing."""
    summary = api.get(args.master, "experiments/{}/summary".format(args.experiment_id)).json()

    headers = ["Trial ID", "State", "H-Params", "Start Time", "End Time", "# of Batches"]
    rows = []
    for t in summary["trials"]:
        rows.append(
            [
                t["id"],
                t["state"],
                json.dumps(t["hparams"], indent=4),
                render.format_time(t["start_time"]),
                render.format_time(t["end_time"]),
                t["total_batches_processed"],
            ]
        )
    render.tabulate_or_csv(headers, rows, args.csv)
def list_experiments(args: Namespace) -> None:
    """List experiments via the paginated bindings API.

    Without --all, only the caller's unarchived experiments are shown; with
    --all, an extra "Archived" column is added.
    """
    kwargs = {"limit": args.limit, "offset": args.offset}
    if not args.all:
        kwargs["archived"] = "false"
        kwargs["users"] = [authentication.must_cli_auth().get_session_user()]

    all_experiments: List[bindings.v1Experiment] = limit_offset_paginator(
        bindings.get_GetExperiments, "experiments", setup_session(args), **kwargs
    )

    headers = [
        "ID",
        "Owner",
        "Name",
        "Parent ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Resource Pool",
    ]
    if args.all:
        headers.append("Archived")

    def format_experiment(e: Any) -> List[Any]:
        # One table row per experiment; archived flag only appears with --all.
        row = [
            e.id,
            e.username,
            e.name,
            e.forkedFrom,
            e.state.value.replace("STATE_", ""),
            render.format_percent(e.progress),
            render.format_time(e.startTime),
            render.format_time(e.endTime),
            e.resourcePool,
        ]
        if args.all:
            row.append(e.archived)
        return row

    render.tabulate_or_csv(headers, [format_experiment(e) for e in all_experiments], args.csv)
def list_agents(args: argparse.Namespace) -> None:
    """List agents known to the master as a table, CSV, or JSON."""
    resp = bindings.get_GetAgents(setup_session(args))

    # Sort by agent id for stable output; optional fields render as "".
    rows = []
    for a in sorted(resp.agents or [], key=attrgetter("id")):
        rows.append(
            OrderedDict(
                [
                    ("id", local_id(a.id)),
                    ("version", a.version),
                    ("registered_time", render.format_time(a.registeredTime)),
                    ("num_slots", len(a.slots) if a.slots is not None else ""),
                    ("num_containers", len(a.containers) if a.containers is not None else ""),
                    (
                        "resource_pools",
                        ", ".join(a.resourcePools) if a.resourcePools is not None else "",
                    ),
                    ("enabled", a.enabled),
                    ("draining", a.draining),
                    ("label", a.label),
                    ("addresses", ", ".join(a.addresses) if a.addresses is not None else ""),
                ]
            )
        )

    if args.json:
        print(json.dumps(rows, indent=4))
        return

    headers = [
        "Agent ID",
        "Version",
        "Registered Time",
        "Slots",
        "Containers",
        "Resource Pool",
        "Enabled",
        "Draining",
        "Label",
        "Addresses",
    ]
    render.tabulate_or_csv(headers, [r.values() for r in rows], args.csv)
def render_tasks(args: Namespace, tasks: Dict[str, Dict[str, Any]]) -> None:
    """Render a mapping of task id -> task info as JSON or a table/CSV."""

    def agent_info(t: Dict[str, Any]) -> Union[str, List[str]]:
        # A task with no resources has not been assigned to any agent yet.
        resources = t.get("resources", [])
        if not resources:
            return "unassigned"
        assigned = [a for r in resources for a in r["agent_devices"]]
        if len(assigned) == 1:
            only_agent = assigned[0]  # type: str
            return only_agent
        return assigned

    if args.json:
        print(json.dumps(tasks, indent=4))
        return

    headers = [
        "Task ID",
        "Allocation ID",
        "Name",
        "Slots Needed",
        "Registered Time",
        "Agent",
        "Priority",
        "Resource Pool",
    ]
    # Chronological order by registration time for a stable listing.
    ordered = sorted(
        tasks.items(),
        key=lambda tup: (render.format_time(tup[1]["registered_time"]),),
    )
    values = []
    for _, task in ordered:
        values.append(
            [
                task["task_id"],
                task["allocation_id"],
                task["name"],
                task["slots_needed"],
                render.format_time(task["registered_time"]),
                agent_info(task),
                task["priority"] if task["scheduler_type"] == "priority" else "N/A",
                task["resource_pool"],
            ]
        )
    render.tabulate_or_csv(headers, values, args.csv)
def list_experiments(args: Namespace) -> None:
    """List experiments via the legacy REST endpoint.

    Shows only the session user's experiments unless --all, which also adds an
    "Archived" column.
    """
    if args.all:
        params = {"filter": "all"}
    else:
        params = {"user": api.Authentication.instance().get_session_user()}
    r = api.get(args.master, "experiments", params=params)

    headers = [
        "ID",
        "Owner",
        "Description",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Resource Pool",
    ]
    if args.all:
        headers.append("Archived")

    def format_experiment(e: Any) -> List[Any]:
        row = [
            e["id"],
            e["owner"]["username"],
            e["config"]["description"],
            e["state"],
            render.format_percent(e["progress"]),
            render.format_time(e["start_time"]),
            render.format_time(e["end_time"]),
            e["config"]["resources"].get("resource_pool"),
        ]
        if args.all:
            row.append(e["archived"])
        return row

    render.tabulate_or_csv(headers, [format_experiment(e) for e in r.json()], args.csv)
def list_tasks(args: Namespace) -> None:
    """List active tasks from the legacy tasks endpoint, sorted by registration time."""
    tasks = api.get(args.master, "tasks").json()

    def agent_info(t: Dict[str, Any]) -> Union[str, List[str]]:
        # No containers means the task has not been scheduled onto an agent.
        containers = t.get("containers", [])
        if not containers:
            return "unassigned"
        if len(containers) == 1:
            single_agent = containers[0]["agent"]  # type: str
            return single_agent
        return [c["agent"] for c in containers]

    headers = [
        "ID",
        "Name",
        "Slots Needed",
        "Registered Time",
        "Agent",
        "Priority",
        "Resource Pool",
    ]
    ordered = sorted(
        tasks.items(),
        key=lambda tup: (render.format_time(tup[1]["registered_time"]),),
    )
    values = [
        [
            task["id"],
            task["name"],
            task["slots_needed"],
            render.format_time(task["registered_time"]),
            agent_info(task),
            task["priority"] if task["scheduler_type"] == "priority" else "N/A",
            task["resource_pool"],
        ]
        for _, task in ordered
    ]
    render.tabulate_or_csv(headers, values, args.csv)
def list_trials(args: Namespace) -> None:
    """List all trials of an experiment via the paginated bindings API."""
    all_trials: List[bindings.trialv1Trial] = limit_offset_paginator(
        bindings.get_GetExperimentTrials,
        "trials",
        setup_session(args),
        experimentId=args.experiment_id,
        limit=args.limit,
        offset=args.offset,
    )

    headers = ["Trial ID", "State", "H-Params", "Start Time", "End Time", "# of Batches"]
    values = []
    for t in all_trials:
        values.append(
            [
                t.id,
                t.state.value.replace("STATE_", ""),
                json.dumps(t.hparams, indent=4),
                render.format_time(t.startTime),
                render.format_time(t.endTime),
                t.totalBatchesProcessed,
            ]
        )
    render.tabulate_or_csv(headers, values, args.csv)
def describe(args: Namespace) -> None:
    """Describe one or more experiments (comma-separated ids) via the legacy API.

    Prints three sections -- experiments, trials, and per-step workloads --
    either to stdout or, when --outdir is set, to CSV files in that directory.
    With --json the raw response documents are dumped instead.
    """
    docs = []
    for experiment_id in args.experiment_ids.split(","):
        if args.metrics:
            r = api.get(args.master, "experiments/{}/metrics/summary".format(experiment_id))
        else:
            r = api.get(args.master, "experiments/{}".format(experiment_id))
        docs.append(r.json())

    if args.json:
        print(json.dumps(docs, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [
        [
            doc["id"],
            doc["state"],
            render.format_percent(doc["progress"]),
            render.format_time(doc.get("start_time")),
            render.format_time(doc.get("end_time")),
            doc["config"].get("description"),
            doc["archived"],
            doc["config"]["resources"].get("resource_pool"),
            ", ".join(sorted(doc["config"].get("labels") or [])),
        ]
        for doc in docs
    ]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [
        [
            trial["id"],
            doc["id"],
            trial["state"],
            render.format_time(trial.get("start_time")),
            render.format_time(trial.get("end_time")),
            json.dumps(trial["hparams"], indent=4),
        ]
        for doc in docs
        for trial in doc["trials"]
    ]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all
        # provided experiments.
        t_metrics_names = sorted({n for doc in docs for n in scalar_training_metrics_names(doc)})
        t_metrics_headers = ["Training Metric: {}".format(name) for name in t_metrics_names]
        v_metrics_names = sorted({n for doc in docs for n in scalar_validation_metrics_names(doc)})
        v_metrics_headers = ["Validation Metric: {}".format(name) for name in v_metrics_names]
    else:
        # BUG FIX: the name lists must exist even without --metrics, because the
        # per-step loop below iterates t_metrics_names whenever a step carries
        # metrics; previously only the header lists were initialized, which
        # could raise NameError.
        t_metrics_names = []
        t_metrics_headers = []
        v_metrics_names = []
        v_metrics_headers = []

    headers = (
        ["Trial ID", "# of Batches", "State", "Start Time", "End Time"]
        + t_metrics_headers
        + [
            "Checkpoint State",
            "Checkpoint Start Time",
            "Checkpoint End Time",
            "Validation State",
            "Validation Start Time",
            "Validation End Time",
        ]
        + v_metrics_headers
    )

    values = []
    for doc in docs:
        for trial in doc["trials"]:
            for step in trial["steps"]:
                # Training metrics, in the same order as t_metrics_headers.
                t_metrics_fields = []
                if step.get("metrics"):
                    avg_metrics = step["metrics"]["avg_metrics"]
                    for name in t_metrics_names:
                        if name in avg_metrics:
                            t_metrics_fields.append(avg_metrics[name])
                        else:
                            t_metrics_fields.append(None)

                checkpoint = step.get("checkpoint")
                if checkpoint:
                    checkpoint_state = checkpoint["state"]
                    checkpoint_start_time = checkpoint.get("start_time")
                    checkpoint_end_time = checkpoint.get("end_time")
                else:
                    checkpoint_state = None
                    checkpoint_start_time = None
                    checkpoint_end_time = None

                validation = step.get("validation")
                if validation:
                    validation_state = validation["state"]
                    validation_start_time = validation.get("start_time")
                    validation_end_time = validation.get("end_time")
                else:
                    validation_state = None
                    validation_start_time = None
                    validation_end_time = None

                if args.metrics:
                    v_metrics_fields = [
                        api.metric.get_validation_metric(name, validation)
                        for name in v_metrics_names
                    ]
                else:
                    v_metrics_fields = []

                row = (
                    [
                        step["trial_id"],
                        step["total_batches"],
                        step["state"],
                        render.format_time(step.get("start_time")),
                        render.format_time(step.get("end_time")),
                    ]
                    + t_metrics_fields
                    + [
                        checkpoint_state,
                        render.format_time(checkpoint_start_time),
                        render.format_time(checkpoint_end_time),
                        validation_state,
                        render.format_time(validation_start_time),
                        render.format_time(validation_end_time),
                    ]
                    + v_metrics_fields
                )
                values.append(row)

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)
def describe(args: Namespace) -> None:
    """Describe one or more experiments (comma-separated ids) via the bindings API.

    Prints three sections -- experiment info, trials, and per-step workloads --
    to stdout, or to CSV files when --outdir is set. With --json the raw
    experiment responses are dumped instead.
    """
    session = setup_session(args)
    exps = []
    for experiment_id in args.experiment_ids.split(","):
        r = bindings.get_GetExperiment(session, experimentId=experiment_id)
        if args.json:
            # Keep the raw JSON form so the dump below mirrors the API response.
            exps.append(r.to_json())
        else:
            exps.append(r.experiment)
    if args.json:
        print(json.dumps(exps, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Name",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [[
        exp.id,
        exp.state.value.replace("STATE_", ""),
        render.format_percent(exp.progress),
        render.format_time(exp.startTime),
        render.format_time(exp.endTime),
        exp.name,
        exp.description,
        exp.archived,
        exp.resourcePool,
        ", ".join(sorted(exp.labels or [])),
    ] for exp in exps]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    trials_for_experiment: Dict[str, Sequence[bindings.trialv1Trial]] = {}
    for exp in exps:
        trials_for_experiment[exp.id] = bindings.get_GetExperimentTrials(
            session, experimentId=exp.id).trials
    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [[
        trial.id,
        exp.id,
        trial.state.value.replace("STATE_", ""),
        render.format_time(trial.startTime),
        render.format_time(trial.endTime),
        json.dumps(trial.hparams, indent=4),
    ] for exp in exps for trial in trials_for_experiment[exp.id]]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    t_metrics_headers: List[str] = []
    t_metrics_names: List[str] = []
    v_metrics_headers: List[str] = []
    v_metrics_names: List[str] = []
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all
        # provided experiments.
        for exp in exps:
            # NOTE(review): only the first trial's workloads are sampled for
            # metric names -- presumably all trials of an experiment report the
            # same metrics; confirm.
            sample_trial = trials_for_experiment[exp.id][0]
            sample_workloads = bindings.get_GetTrial(
                session, trialId=sample_trial.id).workloads
            t_metrics_names += scalar_training_metrics_names(sample_workloads)
            v_metrics_names += scalar_validation_metrics_names(
                sample_workloads)
        t_metrics_names = sorted(set(t_metrics_names))
        t_metrics_headers = [
            "Training Metric: {}".format(name) for name in t_metrics_names
        ]
        v_metrics_names = sorted(set(v_metrics_names))
        v_metrics_headers = [
            "Validation Metric: {}".format(name) for name in v_metrics_names
        ]
    headers = (["Trial ID", "# of Batches", "State", "Report Time"] +
               t_metrics_headers + [
                   "Checkpoint State",
                   "Checkpoint Report Time",
                   "Validation State",
                   "Validation Report Time",
               ] + v_metrics_headers)

    # Rows keyed by totalBatches so training/checkpoint/validation workloads at
    # the same batch count can be merged into a single row.
    wl_output: Dict[int, List[Any]] = {}
    for exp in exps:
        for trial in trials_for_experiment[exp.id]:
            workloads = bindings.get_GetTrial(session, trialId=trial.id).workloads
            for workload in workloads:
                # Training metrics for this workload, aligned with t_metrics_headers.
                t_metrics_fields = []
                wl_detail: Optional[
                    Union[bindings.v1MetricsWorkload, bindings.v1CheckpointWorkload]] = None
                if workload.training:
                    wl_detail = workload.training
                    for name in t_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            t_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            t_metrics_fields.append(None)
                else:
                    t_metrics_fields = [None for name in t_metrics_names]
                if workload.checkpoint:
                    wl_detail = workload.checkpoint
                if workload.checkpoint and wl_detail:
                    checkpoint_state = wl_detail.state.value
                    checkpoint_end_time = wl_detail.endTime
                else:
                    checkpoint_state = ""
                    checkpoint_end_time = None
                # Validation metrics, aligned with v_metrics_headers.
                v_metrics_fields = []
                if workload.validation:
                    wl_detail = workload.validation
                    validation_state = wl_detail.state.value
                    validation_end_time = wl_detail.endTime
                    for name in v_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            v_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            v_metrics_fields.append(None)
                else:
                    validation_state = ""
                    validation_end_time = None
                    v_metrics_fields = [None for name
                                        in v_metrics_names]
                if wl_detail:
                    if wl_detail.totalBatches in wl_output:
                        # condense training, checkpoints, validation workloads into one step-like
                        # row for compatibility with previous versions of describe
                        # Positional layout of merge_row: [0..3] fixed columns,
                        # [4..] training metrics, then the 4 checkpoint/validation
                        # columns, then validation metrics.
                        merge_row = wl_output[wl_detail.totalBatches]
                        merge_row[3] = max(
                            merge_row[3], render.format_time(wl_detail.endTime))
                        for idx, tfield in enumerate(t_metrics_fields):
                            if tfield and merge_row[4 + idx] is None:
                                merge_row[4 + idx] = tfield
                        start_checkpoint = 4 + len(t_metrics_fields)
                        if checkpoint_state:
                            merge_row[
                                start_checkpoint] = checkpoint_state.replace(
                                    "STATE_", "")
                            merge_row[start_checkpoint + 1] = render.format_time(
                                checkpoint_end_time)
                        if validation_end_time:
                            merge_row[start_checkpoint + 3] = render.format_time(
                                validation_end_time)
                        if validation_state:
                            merge_row[start_checkpoint + 2] = validation_state.replace(
                                "STATE_", "")
                        for idx, vfield in enumerate(v_metrics_fields):
                            if vfield and merge_row[start_checkpoint + idx + 4] is None:
                                merge_row[start_checkpoint + idx + 4] = vfield
                    else:
                        row = ([
                            trial.id,
                            wl_detail.totalBatches,
                            wl_detail.state.value.replace("STATE_", ""),
                            render.format_time(wl_detail.endTime),
                        ] + t_metrics_fields + [
                            checkpoint_state.replace("STATE_", ""),
                            render.format_time(checkpoint_end_time),
                            validation_state.replace("STATE_", ""),
                            render.format_time(validation_end_time),
                        ] + v_metrics_fields)
                        wl_output[wl_detail.totalBatches] = row

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    # Sort merged rows by batch count (column 1) for chronological display.
    values = sorted(wl_output.values(), key=lambda a: int(a[1]))
    render.tabulate_or_csv(headers, values, args.csv, outfile)
def list_slots(args: argparse.Namespace) -> None:
    """Show every slot on every agent along with the allocation occupying it, if any."""
    task_res = api.get(args.master, "tasks")
    agent_res = api.get(args.master, "agents")
    agents = agent_res.json()
    allocations = task_res.json()

    # container id -> {name, allocation_id} for containers owned by a known
    # Determined allocation.
    c_names = {
        r["container_id"]: {"name": a["name"], "allocation_id": a["allocation_id"]}
        for a in allocations.values()
        for r in a["resources"]
        if r["container_id"]
    }

    def get_task_name(containers: Dict[str, Any], slot: Dict[str, Any]) -> str:
        # Label the slot's occupant: a known allocation, a Determined system
        # container, an unknown (non-Determined) container, or nothing at all.
        if not slot["container"]:
            return "FREE"
        container_id = slot["container"]["id"]
        if slot["container"] and container_id in containers:
            return str(containers[container_id]["name"])
        if slot["container"] and (
            "determined-master-deployment" in container_id
            or "determined-db-deployment" in container_id
        ):
            return f"Determined System Task: {container_id}"
        return f"Non-Determined Task: {container_id}"

    def get_allocation_id(slot: Dict[str, Any]) -> str:
        # Known allocation id, OCCUPIED for an unrecognized container, FREE otherwise.
        if not slot["container"]:
            return "FREE"
        known = c_names.get(slot["container"]["id"])
        return known["allocation_id"] if known is not None else "OCCUPIED"

    slots = []
    for agent_id, agent in sorted(agents.items()):
        for slot_id, slot in sorted(agent["slots"].items()):
            slots.append(
                OrderedDict(
                    [
                        ("agent_id", local_id(agent_id)),
                        ("resource_pool", agent["resource_pool"]),
                        ("slot_id", local_id(slot_id)),
                        ("enabled", slot["enabled"]),
                        ("draining", slot.get("draining", False)),
                        ("allocation_id", get_allocation_id(slot)),
                        ("task_name", get_task_name(c_names, slot)),
                        ("type", slot["device"]["type"]),
                        ("device", slot["device"]["brand"]),
                    ]
                )
            )

    headers = [
        "Agent ID",
        "Resource Pool",
        "Slot ID",
        "Enabled",
        "Draining",
        "Allocation ID",
        "Task Name",
        "Type",
        "Device",
    ]

    if args.json:
        print(json.dumps(slots, indent=4))
        return

    render.tabulate_or_csv(headers, [s.values() for s in slots], args.csv)