def experiment_logs(args: Namespace) -> None:
    sess = setup_session(args)
    trials = bindings.get_GetExperimentTrials(sess, experimentId=args.experiment_id).trials
    if len(trials) == 0:
        print(
            f"No trials found for experiment {args.experiment_id}. "
            "Try again after the experiment has a trial running."
        )
        return
    # Follow the trial with the lowest ID, i.e., the experiment's first trial.
    first_trial_id = sorted(t.id for t in trials)[0]
    api.pprint_trial_logs(
        args.master,
        first_trial_id,
        head=args.head,
        tail=args.tail,
        follow=args.follow,
        agent_ids=args.agent_ids,
        container_ids=args.container_ids,
        rank_ids=args.rank_ids,
        sources=args.sources,
        stdtypes=args.stdtypes,
        level_above=args.level,
        timestamp_before=args.timestamp_before,
        timestamp_after=args.timestamp_after,
    )
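# Minimal sketch (not part of the original module; assumes a reachable master
# and an authenticated session from setup_session): list an experiment's trial
# IDs in ascending order, the same ordering experiment_logs uses to pick the
# "first" trial.
def _example_list_trial_ids(args: Namespace) -> List[int]:
    sess = setup_session(args)
    trials = bindings.get_GetExperimentTrials(sess, experimentId=args.experiment_id).trials
    return sorted(t.id for t in trials)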
def experiment_trials(experiment_id: int) -> List[bindings.v1GetTrialResponse]:
    r1 = bindings.get_GetExperimentTrials(test_session(), experimentId=experiment_id)
    src_trials = r1.trials
    trials = []
    for trial in src_trials:
        # Each GetTrial response includes both the trial record and its workloads.
        r2 = bindings.get_GetTrial(test_session(), trialId=trial.id)
        trials.append(r2)
    return trials
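# Usage sketch (hypothetical helper, same assumptions as experiment_trials):
# summarize how many workloads each trial of an experiment has reported. The
# GetTrial response carries both the trial and its workloads, as noted above;
# the .trial / .workloads attribute names are taken from that response shape.
def _example_summarize_workloads(experiment_id: int) -> None:
    for resp in experiment_trials(experiment_id):
        print(f"trial {resp.trial.id}: {len(resp.workloads or [])} workloads")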
def experiment_first_trial(exp_id: int) -> int:
    session = test_session()
    trials = bindings.get_GetExperimentTrials(session, experimentId=exp_id).trials
    assert len(trials) > 0
    return trials[0].id
def follow_experiment_logs(master_url: str, exp_id: int) -> None:
    # Get the ID of this experiment's first trial (i.e., the one with the lowest ID).
    print("Waiting for first trial to begin...")
    sess = session.Session(master_url, None, None, None)
    while True:
        trials = bindings.get_GetExperimentTrials(sess, experimentId=exp_id).trials
        if len(trials) > 0:
            break
        time.sleep(0.1)

    first_trial_id = sorted(t.id for t in trials)[0]
    print(f"Following first trial with ID {first_trial_id}")
    logs.pprint_trial_logs(master_url, first_trial_id, follow=True)
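# The loop above polls every 100 ms until the first trial appears. A gentler
# variant (a sketch, not the original behavior) backs off exponentially to
# reduce load on a busy master; the helper name is hypothetical.
def _example_wait_for_first_trial(sess: session.Session, exp_id: int) -> int:
    delay = 0.1
    while True:
        trials = bindings.get_GetExperimentTrials(sess, experimentId=exp_id).trials
        if trials:
            return sorted(t.id for t in trials)[0]
        time.sleep(delay)
        delay = min(delay * 2, 5.0)  # cap the backoff at 5 seconds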
def describe(args: Namespace) -> None:
    session = setup_session(args)
    exps = []
    # args.experiment_ids is a comma-separated string; convert each ID to int
    # before passing it to the bindings.
    for experiment_id in args.experiment_ids.split(","):
        r = bindings.get_GetExperiment(session, experimentId=int(experiment_id))
        if args.json:
            exps.append(r.to_json())
        else:
            exps.append(r.experiment)

    if args.json:
        print(json.dumps(exps, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Name",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [
        [
            exp.id,
            exp.state.value.replace("STATE_", ""),
            render.format_percent(exp.progress),
            render.format_time(exp.startTime),
            render.format_time(exp.endTime),
            exp.name,
            exp.description,
            exp.archived,
            exp.resourcePool,
            ", ".join(sorted(exp.labels or [])),
        ]
        for exp in exps
    ]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information. The dict is keyed by the (integer)
    # experiment ID.
    trials_for_experiment: Dict[int, Sequence[bindings.trialv1Trial]] = {}
    for exp in exps:
        trials_for_experiment[exp.id] = bindings.get_GetExperimentTrials(
            session, experimentId=exp.id
        ).trials

    headers = ["Trial ID", "Experiment ID", "State", "Start Time", "End Time", "H-Params"]
    values = [
        [
            trial.id,
            exp.id,
            trial.state.value.replace("STATE_", ""),
            render.format_time(trial.startTime),
            render.format_time(trial.endTime),
            json.dumps(trial.hparams, indent=4),
        ]
        for exp in exps
        for trial in trials_for_experiment[exp.id]
    ]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    t_metrics_headers: List[str] = []
    t_metrics_names: List[str] = []
    v_metrics_headers: List[str] = []
    v_metrics_names: List[str] = []
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all
        # provided experiments.
        for exp in exps:
            sample_trial = trials_for_experiment[exp.id][0]
            sample_workloads = bindings.get_GetTrial(session, trialId=sample_trial.id).workloads
            t_metrics_names += scalar_training_metrics_names(sample_workloads)
            v_metrics_names += scalar_validation_metrics_names(sample_workloads)
        t_metrics_names = sorted(set(t_metrics_names))
        t_metrics_headers = ["Training Metric: {}".format(name) for name in t_metrics_names]
        v_metrics_names = sorted(set(v_metrics_names))
        v_metrics_headers = ["Validation Metric: {}".format(name) for name in v_metrics_names]

    headers = (
        ["Trial ID", "# of Batches", "State", "Report Time"]
        + t_metrics_headers
        + [
            "Checkpoint State",
            "Checkpoint Report Time",
            "Validation State",
            "Validation Report Time",
        ]
        + v_metrics_headers
    )

    wl_output: Dict[int, List[Any]] = {}
    for exp in exps:
        for trial in trials_for_experiment[exp.id]:
            workloads = bindings.get_GetTrial(session, trialId=trial.id).workloads
            for workload in workloads:
                t_metrics_fields = []
                wl_detail: Optional[
                    Union[bindings.v1MetricsWorkload, bindings.v1CheckpointWorkload]
                ] = None
                if workload.training:
                    wl_detail = workload.training
                    for name in t_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            t_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            t_metrics_fields.append(None)
                else:
                    t_metrics_fields = [None for _ in t_metrics_names]

                if workload.checkpoint:
                    wl_detail = workload.checkpoint

                if workload.checkpoint and wl_detail:
                    checkpoint_state = wl_detail.state.value
                    checkpoint_end_time = wl_detail.endTime
                else:
                    checkpoint_state = ""
                    checkpoint_end_time = None

                v_metrics_fields = []
                if workload.validation:
                    wl_detail = workload.validation
                    validation_state = wl_detail.state.value
                    validation_end_time = wl_detail.endTime
                    for name in v_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            v_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            v_metrics_fields.append(None)
                else:
                    validation_state = ""
                    validation_end_time = None
                    v_metrics_fields = [None for _ in v_metrics_names]

                if wl_detail:
                    if wl_detail.totalBatches in wl_output:
                        # Condense training, checkpoint, and validation workloads
                        # into one step-like row for compatibility with previous
                        # versions of describe.
                        merge_row = wl_output[wl_detail.totalBatches]
                        merge_row[3] = max(merge_row[3], render.format_time(wl_detail.endTime))
                        for idx, tfield in enumerate(t_metrics_fields):
                            if tfield and merge_row[4 + idx] is None:
                                merge_row[4 + idx] = tfield
                        start_checkpoint = 4 + len(t_metrics_fields)
                        if checkpoint_state:
                            merge_row[start_checkpoint] = checkpoint_state.replace("STATE_", "")
                            merge_row[start_checkpoint + 1] = render.format_time(
                                checkpoint_end_time
                            )
                        if validation_end_time:
                            merge_row[start_checkpoint + 3] = render.format_time(
                                validation_end_time
                            )
                        if validation_state:
                            merge_row[start_checkpoint + 2] = validation_state.replace(
                                "STATE_", ""
                            )
                        for idx, vfield in enumerate(v_metrics_fields):
                            if vfield and merge_row[start_checkpoint + idx + 4] is None:
                                merge_row[start_checkpoint + idx + 4] = vfield
                    else:
                        row = (
                            [
                                trial.id,
                                wl_detail.totalBatches,
                                wl_detail.state.value.replace("STATE_", ""),
                                render.format_time(wl_detail.endTime),
                            ]
                            + t_metrics_fields
                            + [
                                checkpoint_state.replace("STATE_", ""),
                                render.format_time(checkpoint_end_time),
                                validation_state.replace("STATE_", ""),
                                render.format_time(validation_end_time),
                            ]
                            + v_metrics_fields
                        )
                        wl_output[wl_detail.totalBatches] = row

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    # Sort rows by batch count (column 1).
    values = sorted(wl_output.values(), key=lambda a: int(a[1]))
    render.tabulate_or_csv(headers, values, args.csv, outfile)
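# Hypothetical invocation sketch: driving describe() programmatically with a
# hand-built Namespace. The attribute names mirror exactly what describe()
# reads; whatever setup_session() additionally requires (e.g. master address,
# credentials) is an assumption and would need to be supplied as well.
def _example_describe_as_json() -> None:
    args = Namespace(
        experiment_ids="12,13",  # comma-separated, as describe() expects
        json=True,               # emit JSON instead of rendered tables
        csv=False,
        outdir=None,
        metrics=False,
    )
    describe(args)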
def follow_test_experiment_logs(master_url: str, exp_id: int) -> None:
    def print_progress(active_stage: int, ended: bool) -> None:
        # There are four sequential stages of verification. Track the
        # current stage with an index into this list.
        stages = [
            "Scheduling task",
            "Testing training",
            "Testing validation",
            "Testing checkpointing",
        ]
        for idx, stage in enumerate(stages):
            if active_stage > idx:
                color = "green"
                checkbox = "✔"
            elif active_stage == idx:
                color = "red" if ended else "yellow"
                checkbox = "✗" if ended else " "
            else:
                color = "white"
                checkbox = " "
            print(colored(stage + (25 - len(stage)) * ".", color), end="")
            print(colored(" [" + checkbox + "]", color), end="")
            if idx == len(stages) - 1:
                print("\n" if ended else "\r", end="")
            else:
                print(", ", end="")

    sess = session.Session(master_url, None, None, None)
    while True:
        r = bindings.get_GetExperiment(sess, experimentId=exp_id).experiment
        trials = bindings.get_GetExperimentTrials(sess, experimentId=exp_id).trials

        # Wait for the experiment to start and initialize a trial.
        if len(trials) < 1:
            t = {}
        else:
            trial_id = trials[0].id
            t = api.get(master_url, f"trials/{trial_id}").json()

        # Update active_stage by examining the result from the master's
        # /api/v1/experiments/<experiment-id> endpoint.
        exp_state = r.state.value.replace("STATE_", "")
        if exp_state == constants.COMPLETED:
            active_stage = 4
        elif t.get("runner_state") == "checkpointing":
            active_stage = 3
        elif t.get("runner_state") == "validating":
            active_stage = 2
        elif t.get("runner_state") in ("UNSPECIFIED", "training"):
            active_stage = 1
        else:
            active_stage = 0

        # If the experiment is in a terminal state, output the appropriate
        # message and exit. Otherwise, sleep and repeat.
        if exp_state == constants.COMPLETED:
            print_progress(active_stage, ended=True)
            print(colored("Model definition test succeeded! 🎉", "green"))
            return
        elif exp_state == constants.CANCELED:
            print_progress(active_stage, ended=True)
            print(
                colored(
                    "Model definition test (ID: {}) canceled before "
                    "model test could complete. Please re-run the "
                    "command.".format(exp_id),
                    "yellow",
                )
            )
            sys.exit(1)
        elif exp_state == constants.ERROR:
            print_progress(active_stage, ended=True)
            trial_id = trials[0].id
            logs.pprint_trial_logs(master_url, trial_id)
            sys.exit(1)
        else:
            print_progress(active_stage, ended=False)
            time.sleep(0.2)
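# Usage sketch (hypothetical; the master URL and experiment ID are placeholders):
# watch a model definition test experiment until it reaches a terminal state.
# Note that, like the function above, this exits the process with status 1 on
# cancellation or error.
def _example_watch_test_run() -> None:
    follow_test_experiment_logs("http://localhost:8080", exp_id=99)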