def format_experiment(e: Any) -> List[Any]: result = [ e["id"], e["owner"]["username"], e["config"]["description"], e["state"], render.format_percent(e["progress"]), render.format_time(e["start_time"]), render.format_time(e["end_time"]), e["config"]["resources"].get("resource_pool"), ] if args.all: result.append(e["archived"]) return result
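# For reference, a hypothetical sketch of the JSON shape the legacy formatter
# indexes into. The keys mirror exactly the lookups above; the values are
# invented for illustration and real responses carry many more fields.
_example_legacy_experiment = {
    "id": 7,
    "owner": {"username": "alice"},
    "config": {
        "description": "mnist demo",
        "resources": {"resource_pool": "default"},
    },
    "state": "COMPLETED",
    "progress": 1.0,
    "start_time": "2021-01-01T00:00:00Z",
    "end_time": "2021-01-01T01:00:00Z",
    "archived": False,
}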
# Bindings-based implementation: formats a typed experiment object returned by
# the generated `bindings` client. Enum states arrive prefixed with "STATE_",
# so that prefix is stripped for display. As above, `args` comes from the
# enclosing scope.
def format_experiment(e: Any) -> List[Any]:
    result = [
        e.id,
        e.username,
        e.name,
        e.forkedFrom,
        e.state.value.replace("STATE_", ""),
        render.format_percent(e.progress),
        render.format_time(e.startTime),
        render.format_time(e.endTime),
        e.resourcePool,
    ]
    if args.all:
        result.append(e.archived)
    return result
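# A minimal usage sketch (a hypothetical helper, not part of the CLI): fetch
# one experiment through the same bindings call `describe` uses below and
# render it with `format_experiment`. Assumes `args` carries `all` and `csv`
# the way the real command handlers do.
def _show_experiment(args: Namespace, experiment_id: int) -> None:
    session = setup_session(args)
    exp = bindings.get_GetExperiment(session, experimentId=experiment_id).experiment
    headers = [
        "ID",
        "Owner",
        "Name",
        "Parent ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Resource Pool",
    ]
    if args.all:
        headers.append("Archived")
    render.tabulate_or_csv(headers, [format_experiment(exp)], args.csv, None)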
# Legacy implementation: reads experiment, trial, and step data from the raw
# REST endpoints.
def describe(args: Namespace) -> None:
    docs = []
    for experiment_id in args.experiment_ids.split(","):
        if args.metrics:
            r = api.get(args.master, "experiments/{}/metrics/summary".format(experiment_id))
        else:
            r = api.get(args.master, "experiments/{}".format(experiment_id))
        docs.append(r.json())

    if args.json:
        print(json.dumps(docs, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [
        [
            doc["id"],
            doc["state"],
            render.format_percent(doc["progress"]),
            render.format_time(doc.get("start_time")),
            render.format_time(doc.get("end_time")),
            doc["config"].get("description"),
            doc["archived"],
            doc["config"]["resources"].get("resource_pool"),
            ", ".join(sorted(doc["config"].get("labels") or [])),
        ]
        for doc in docs
    ]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [
        [
            trial["id"],
            doc["id"],
            trial["state"],
            render.format_time(trial.get("start_time")),
            render.format_time(trial.get("end_time")),
            json.dumps(trial["hparams"], indent=4),
        ]
        for doc in docs
        for trial in doc["trials"]
    ]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all
        # provided experiments.
        t_metrics_names = sorted({n for doc in docs for n in scalar_training_metrics_names(doc)})
        t_metrics_headers = ["Training Metric: {}".format(name) for name in t_metrics_names]
        v_metrics_names = sorted({n for doc in docs for n in scalar_validation_metrics_names(doc)})
        v_metrics_headers = ["Validation Metric: {}".format(name) for name in v_metrics_names]
    else:
        # Without --metrics there are no metric columns; define the name lists
        # too, since the step loop below iterates over them unconditionally.
        t_metrics_names = []
        t_metrics_headers = []
        v_metrics_names = []
        v_metrics_headers = []

    headers = (
        ["Trial ID", "# of Batches", "State", "Start Time", "End Time"]
        + t_metrics_headers
        + [
            "Checkpoint State",
            "Checkpoint Start Time",
            "Checkpoint End Time",
            "Validation State",
            "Validation Start Time",
            "Validation End Time",
        ]
        + v_metrics_headers
    )

    values = []
    for doc in docs:
        for trial in doc["trials"]:
            for step in trial["steps"]:
                if step.get("metrics"):
                    avg_metrics = step["metrics"]["avg_metrics"]
                    t_metrics_fields = [avg_metrics.get(name) for name in t_metrics_names]
                else:
                    t_metrics_fields = [None for name in t_metrics_names]

                checkpoint = step.get("checkpoint")
                if checkpoint:
                    checkpoint_state = checkpoint["state"]
                    checkpoint_start_time = checkpoint.get("start_time")
                    checkpoint_end_time = checkpoint.get("end_time")
                else:
                    checkpoint_state = None
                    checkpoint_start_time = None
                    checkpoint_end_time = None

                validation = step.get("validation")
                if validation:
                    validation_state = validation["state"]
                    validation_start_time = validation.get("start_time")
                    validation_end_time = validation.get("end_time")
                else:
                    validation_state = None
                    validation_start_time = None
                    validation_end_time = None

                if args.metrics:
                    v_metrics_fields = [
                        api.metric.get_validation_metric(name, validation)
                        for name in v_metrics_names
                    ]
                else:
                    v_metrics_fields = []

                row = (
                    [
                        step["trial_id"],
                        step["total_batches"],
                        step["state"],
                        render.format_time(step.get("start_time")),
                        render.format_time(step.get("end_time")),
                    ]
                    + t_metrics_fields
                    + [
                        checkpoint_state,
                        render.format_time(checkpoint_start_time),
                        render.format_time(checkpoint_end_time),
                        validation_state,
                        render.format_time(validation_start_time),
                        render.format_time(validation_end_time),
                    ]
                    + v_metrics_fields
                )
                values.append(row)

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)
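# Hedged sketches of the metric-name helpers the legacy `describe` relies on,
# assuming the trials -> steps -> metrics/validation JSON shape used above and
# that `Set` is imported from `typing`. The exact nesting of validation
# metrics ("validation_metrics") is an assumption; the real implementations
# may also define "scalar" differently.
def scalar_training_metrics_names(doc: Any) -> Set[str]:
    names: Set[str] = set()
    for trial in doc["trials"]:
        for step in trial["steps"]:
            avg_metrics = (step.get("metrics") or {}).get("avg_metrics") or {}
            names.update(
                name
                for name, value in avg_metrics.items()
                if isinstance(value, (int, float))
            )
    return names


def scalar_validation_metrics_names(doc: Any) -> Set[str]:
    names: Set[str] = set()
    for trial in doc["trials"]:
        for step in trial["steps"]:
            validation = step.get("validation") or {}
            metrics = (validation.get("metrics") or {}).get("validation_metrics") or {}
            names.update(
                name
                for name, value in metrics.items()
                if isinstance(value, (int, float))
            )
    return names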
# Bindings-based implementation: reads experiments, trials, and workloads
# through the generated bindings client and condenses per-workload reports
# into step-like rows.
def describe(args: Namespace) -> None:
    session = setup_session(args)
    exps = []
    for experiment_id in args.experiment_ids.split(","):
        r = bindings.get_GetExperiment(session, experimentId=int(experiment_id))
        if args.json:
            exps.append(r.to_json())
        else:
            exps.append(r.experiment)

    if args.json:
        print(json.dumps(exps, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Name",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [
        [
            exp.id,
            exp.state.value.replace("STATE_", ""),
            render.format_percent(exp.progress),
            render.format_time(exp.startTime),
            render.format_time(exp.endTime),
            exp.name,
            exp.description,
            exp.archived,
            exp.resourcePool,
            ", ".join(sorted(exp.labels or [])),
        ]
        for exp in exps
    ]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    trials_for_experiment: Dict[int, Sequence[bindings.trialv1Trial]] = {}
    for exp in exps:
        trials_for_experiment[exp.id] = bindings.get_GetExperimentTrials(
            session, experimentId=exp.id
        ).trials

    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [
        [
            trial.id,
            exp.id,
            trial.state.value.replace("STATE_", ""),
            render.format_time(trial.startTime),
            render.format_time(trial.endTime),
            json.dumps(trial.hparams, indent=4),
        ]
        for exp in exps
        for trial in trials_for_experiment[exp.id]
    ]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    t_metrics_headers: List[str] = []
    t_metrics_names: List[str] = []
    v_metrics_headers: List[str] = []
    v_metrics_names: List[str] = []
    if args.metrics:
        # Accumulate the scalar training and validation metric names, sampling
        # the first trial of each provided experiment.
        for exp in exps:
            if not trials_for_experiment[exp.id]:
                continue
            sample_trial = trials_for_experiment[exp.id][0]
            sample_workloads = bindings.get_GetTrial(session, trialId=sample_trial.id).workloads
            t_metrics_names += scalar_training_metrics_names(sample_workloads)
            v_metrics_names += scalar_validation_metrics_names(sample_workloads)
        t_metrics_names = sorted(set(t_metrics_names))
        t_metrics_headers = ["Training Metric: {}".format(name) for name in t_metrics_names]
        v_metrics_names = sorted(set(v_metrics_names))
        v_metrics_headers = ["Validation Metric: {}".format(name) for name in v_metrics_names]

    headers = (
        ["Trial ID", "# of Batches", "State", "Report Time"]
        + t_metrics_headers
        + [
            "Checkpoint State",
            "Checkpoint Report Time",
            "Validation State",
            "Validation Report Time",
        ]
        + v_metrics_headers
    )

    values = []
    for exp in exps:
        for trial in trials_for_experiment[exp.id]:
            workloads = bindings.get_GetTrial(session, trialId=trial.id).workloads
            # Condense training, checkpoint, and validation workloads that
            # report at the same batch count into one step-like row, keyed per
            # trial so distinct trials never merge, for compatibility with
            # previous versions of describe.
            wl_output: Dict[int, List[Any]] = {}
            for workload in workloads:
                wl_detail: Optional[
                    Union[bindings.v1MetricsWorkload, bindings.v1CheckpointWorkload]
                ] = None

                if workload.training:
                    wl_detail = workload.training
                    t_metrics_fields = [
                        wl_detail.metrics.get(name) if wl_detail.metrics else None
                        for name in t_metrics_names
                    ]
                else:
                    t_metrics_fields = [None for name in t_metrics_names]

                if workload.checkpoint:
                    wl_detail = workload.checkpoint
                    checkpoint_state = wl_detail.state.value
                    checkpoint_end_time = wl_detail.endTime
                else:
                    checkpoint_state = ""
                    checkpoint_end_time = None

                if workload.validation:
                    wl_detail = workload.validation
                    validation_state = wl_detail.state.value
                    validation_end_time = wl_detail.endTime
                    v_metrics_fields = [
                        wl_detail.metrics.get(name) if wl_detail.metrics else None
                        for name in v_metrics_names
                    ]
                else:
                    validation_state = ""
                    validation_end_time = None
                    v_metrics_fields = [None for name in v_metrics_names]

                if not wl_detail:
                    continue

                if wl_detail.totalBatches in wl_output:
                    merge_row = wl_output[wl_detail.totalBatches]
                    merge_row[3] = max(merge_row[3], render.format_time(wl_detail.endTime))
                    for idx, tfield in enumerate(t_metrics_fields):
                        if tfield and merge_row[4 + idx] is None:
                            merge_row[4 + idx] = tfield
                    start_checkpoint = 4 + len(t_metrics_fields)
                    if checkpoint_state:
                        merge_row[start_checkpoint] = checkpoint_state.replace("STATE_", "")
                        merge_row[start_checkpoint + 1] = render.format_time(checkpoint_end_time)
                    if validation_state:
                        merge_row[start_checkpoint + 2] = validation_state.replace("STATE_", "")
                    if validation_end_time:
                        merge_row[start_checkpoint + 3] = render.format_time(validation_end_time)
                    for idx, vfield in enumerate(v_metrics_fields):
                        if vfield and merge_row[start_checkpoint + idx + 4] is None:
                            merge_row[start_checkpoint + idx + 4] = vfield
                else:
                    row = (
                        [
                            trial.id,
                            wl_detail.totalBatches,
                            wl_detail.state.value.replace("STATE_", ""),
                            render.format_time(wl_detail.endTime),
                        ]
                        + t_metrics_fields
                        + [
                            checkpoint_state.replace("STATE_", ""),
                            render.format_time(checkpoint_end_time),
                            validation_state.replace("STATE_", ""),
                            render.format_time(validation_end_time),
                        ]
                        + v_metrics_fields
                    )
                    wl_output[wl_detail.totalBatches] = row
            values += sorted(wl_output.values(), key=lambda a: int(a[1]))

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)
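# Hedged sketches of the workload-based helpers the bindings `describe` calls
# above, assuming each workload container exposes optional training/validation
# members whose `metrics` attribute is a flat name -> value mapping (the same
# assumption the row-building loop makes) and that `Set` is imported from
# `typing`. The real implementations may differ.
def scalar_training_metrics_names(workloads: Sequence[Any]) -> Set[str]:
    names: Set[str] = set()
    for workload in workloads:
        if workload.training and workload.training.metrics:
            names.update(
                name
                for name, value in workload.training.metrics.items()
                if isinstance(value, (int, float))
            )
    return names


def scalar_validation_metrics_names(workloads: Sequence[Any]) -> Set[str]:
    names: Set[str] = set()
    for workload in workloads:
        if workload.validation and workload.validation.metrics:
            names.update(
                name
                for name, value in workload.validation.metrics.items()
                if isinstance(value, (int, float))
            )
    return names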