def follow_experiment_logs(master_url: str, exp_id: int) -> None:
    """Block until the experiment's first trial exists, then stream its logs.

    The "first" trial is the one with the lowest trial ID. Streaming is done
    by invoking the `logs` handler with `follow=True`, exactly as the
    `logs --follow` CLI command would.
    """
    # Select only the lowest-ID trial belonging to this experiment.
    poll_query = api.GraphQLQuery(master_url)
    trial_sel = poll_query.op.trials(
        where=gql.trials_bool_exp(experiment_id=gql.Int_comparison_exp(_eq=exp_id)),
        order_by=[gql.trials_order_by(id=gql.order_by.asc)],
        limit=1,
    )
    trial_sel.id()

    print("Waiting for first trial to begin...")
    # Re-issue the same query until at least one trial shows up.
    while True:
        resp = poll_query.send()
        if resp.trials:
            break
        time.sleep(0.1)

    first_trial_id = resp.trials[0].id
    print("Following first trial with ID {}".format(first_trial_id))

    # Hand off to the `logs` command implementation in follow mode.
    follow_args = Namespace(
        trial_id=first_trial_id, follow=True, master=master_url, tail=None)
    logs(follow_args)
def list_trials(args: Namespace) -> None:
    """Render a table (or CSV) summarizing every trial of one experiment."""
    trials_query = api.GraphQLQuery(args.master)
    selection = trials_query.op.trials(
        order_by=[gql.trials_order_by(id=gql.order_by.asc)],
        where=gql.trials_bool_exp(
            experiment_id=gql.Int_comparison_exp(_eq=args.experiment_id)),
    )
    # Request exactly the fields rendered in the table below.
    for field_name in ("id", "state", "hparams", "start_time", "end_time"):
        getattr(selection, field_name)()
    selection.steps_aggregate().aggregate.count()

    resp = trials_query.send()

    headers = [
        "Trial ID", "State", "H-Params", "Start Time", "End Time", "# of Steps"
    ]
    values = [
        [
            trial.id,
            trial.state,
            json.dumps(trial.hparams, indent=4),
            render.format_time(trial.start_time),
            render.format_time(trial.end_time),
            trial.steps_aggregate.aggregate.count,
        ]
        for trial in resp.trials
    ]
    render.tabulate_or_csv(headers, values, args.csv)
def experiment_trials(experiment_id: int) -> List[gql.trials]:
    """Fetch an experiment's trials, including per-step checkpoint and
    validation details, ordered by ascending trial ID."""
    op = query()
    trial_sel = op.op.experiments_by_pk(id=experiment_id).trials(
        order_by=[gql.trials_order_by(id=gql.order_by.asc)])

    # Trial-level fields.
    trial_sel.id()
    trial_sel.state()
    trial_sel.warm_start_checkpoint_id()

    # Step-level fields, in step-ID order.
    step_sel = trial_sel.steps(
        order_by=[gql.steps_order_by(id=gql.order_by.asc)])
    step_sel.id()
    step_sel.state()

    # Nested checkpoint and validation objects for each step.
    step_sel.checkpoint.id()
    step_sel.checkpoint.state()
    step_sel.checkpoint.step_id()
    step_sel.checkpoint.uuid()
    step_sel.validation.metrics()
    step_sel.validation.state()

    resp = op.send()
    return cast(List[gql.trials], resp.experiments_by_pk.trials)
def get_experiment_durations(experiment_id: int, trial_idx: int) -> ExperimentDurations:
    """Compute wall-clock durations for one experiment and one of its trials.

    Args:
        experiment_id: ID of the experiment to query.
        trial_idx: positional index (not trial ID) into the experiment's
            trial list, ordered by ascending trial ID.

    Returns:
        An ExperimentDurations of (total experiment duration, and the selected
        trial's cumulative training, validation, and checkpoint durations).
    """
    q = query()
    exp = q.op.experiments_by_pk(id=experiment_id)
    exp.end_time()
    exp.start_time()
    steps = exp.trials(order_by=[gql.trials_order_by(
        id=gql.order_by.asc)]).steps()
    steps.end_time()
    steps.start_time()
    steps.checkpoint.end_time()
    steps.checkpoint.start_time()
    steps.validation.end_time()
    steps.validation.start_time()
    r = q.send()

    def duration_of(obj):
        # Elapsed time between an object's start_time and end_time strings.
        # NOTE(review): assumes both timestamps are present (i.e., the
        # object has finished) — parse() would fail on None; confirm callers
        # only use this on completed experiments.
        return (dateutil.parser.parse(obj.end_time) -
                dateutil.parser.parse(obj.start_time))

    # Bug fix: the selected fields live under `experiments_by_pk` in the
    # response, not at the top level (the sibling `experiment_trials` query
    # reads `r.experiments_by_pk.trials` for the identical selection shape).
    # Previously this read `r.end_time` / `r.trials`, which do not exist on
    # the top-level response object.
    experiment = r.experiments_by_pk

    experiment_duration = duration_of(experiment)
    training_duration = datetime.timedelta(seconds=0)
    validation_duration = datetime.timedelta(seconds=0)
    checkpoint_duration = datetime.timedelta(seconds=0)
    for step in experiment.trials[trial_idx].steps:
        training_duration += duration_of(step)
        if step.validation:
            validation_duration += duration_of(step.validation)
        if step.checkpoint:
            checkpoint_duration += duration_of(step.checkpoint)
    return ExperimentDurations(experiment_duration, training_duration,
                               validation_duration, checkpoint_duration)
def describe(args: Namespace) -> None:
    """Describe one or more experiments: summary, trials, and steps.

    Reads a comma-separated list of experiment IDs from
    `args.experiment_ids`, fetches each experiment with its trials and
    steps (optionally including metrics when `args.metrics` is set), and
    renders three tables — experiments, trials, steps — either to stdout,
    to CSV files under `args.outdir`, or as one JSON dump when `args.json`
    is set.
    """
    ids = [int(x) for x in args.experiment_ids.split(",")]

    # Build one query covering all requested experiments.
    q = api.GraphQLQuery(args.master)
    exps = q.op.experiments(where=gql.experiments_bool_exp(
        id=gql.Int_comparison_exp(_in=ids)))
    exps.archived()
    exps.config()
    exps.end_time()
    exps.id()
    exps.progress()
    exps.start_time()
    exps.state()
    trials = exps.trials(order_by=[gql.trials_order_by(id=gql.order_by.asc)])
    trials.end_time()
    trials.hparams()
    trials.id()
    trials.start_time()
    trials.state()
    steps = trials.steps(order_by=[gql.steps_order_by(id=gql.order_by.asc)])
    steps.end_time()
    steps.id()
    steps.start_time()
    steps.state()
    steps.trial_id()
    steps.checkpoint.end_time()
    steps.checkpoint.start_time()
    steps.checkpoint.state()
    steps.validation.end_time()
    steps.validation.start_time()
    steps.validation.state()
    if args.metrics:
        # Metrics are only selected on demand — the per-step `metrics`
        # attribute therefore only exists on the response when requested
        # (see the `hasattr` check in the steps loop below).
        steps.metrics(path="avg_metrics")
        steps.validation.metrics()
    resp = q.send()

    # Re-sort the experiment objects to match the original order.
    exps_by_id = {e.id: e for e in resp.experiments}
    experiments = [exps_by_id[id] for id in ids]

    if args.json:
        # JSON mode short-circuits the tabular output entirely.
        print(json.dumps(resp.__to_json_value__()["experiments"], indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID", "State", "Progress", "Start Time", "End Time",
        "Description", "Archived", "Labels",
    ]
    values = [[
        e.id,
        e.state,
        render.format_percent(e.progress),
        render.format_time(e.start_time),
        render.format_time(e.end_time),
        e.config.get("description"),
        e.archived,
        ", ".join(sorted(e.config.get("labels", []))),
    ] for e in experiments]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    headers = [
        "Trial ID", "Experiment ID", "State", "Start Time", "End Time",
        "H-Params"
    ]
    values = [[
        t.id,
        e.id,
        t.state,
        render.format_time(t.start_time),
        render.format_time(t.end_time),
        json.dumps(t.hparams, indent=4),
    ] for e in experiments for t in e.trials]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all
        # provided experiments. Sorting gives a stable column order across
        # runs; the same sorted name lists drive both the headers here and
        # the per-row field extraction below, keeping columns aligned.
        t_metrics_names = sorted(
            {n for e in experiments for n in scalar_training_metrics_names(e)})
        t_metrics_headers = [
            "Training Metric: {}".format(name) for name in t_metrics_names
        ]
        v_metrics_names = sorted({
            n for e in experiments for n in scalar_validation_metrics_names(e)
        })
        v_metrics_headers = [
            "Validation Metric: {}".format(name) for name in v_metrics_names
        ]
    else:
        t_metrics_headers = []
        v_metrics_headers = []
    headers = (["Trial ID", "Step ID", "State", "Start Time", "End Time"] +
               t_metrics_headers + [
                   "Checkpoint State",
                   "Checkpoint Start Time",
                   "Checkpoint End Time",
                   "Validation State",
                   "Validation Start Time",
                   "Validation End Time",
               ] + v_metrics_headers)
    values = []
    for e in experiments:
        for t in e.trials:
            for step in t.steps:
                t_metrics_fields = []
                # `metrics` was only selected when args.metrics is set, so
                # the attribute may be absent from the response object.
                if hasattr(step, "metrics"):
                    avg_metrics = step.metrics
                    # Emit one column per known training-metric name; None
                    # pads metrics this step didn't report, preserving
                    # column alignment with the headers.
                    for name in t_metrics_names:
                        if name in avg_metrics:
                            t_metrics_fields.append(avg_metrics[name])
                        else:
                            t_metrics_fields.append(None)
                # Checkpoint/validation are optional per step; substitute
                # None fields when missing so every row has the same width.
                checkpoint = step.checkpoint
                if checkpoint:
                    checkpoint_state = checkpoint.state
                    checkpoint_start_time = checkpoint.start_time
                    checkpoint_end_time = checkpoint.end_time
                else:
                    checkpoint_state = None
                    checkpoint_start_time = None
                    checkpoint_end_time = None
                validation = step.validation
                if validation:
                    validation_state = validation.state
                    validation_start_time = validation.start_time
                    validation_end_time = validation.end_time
                else:
                    validation_state = None
                    validation_start_time = None
                    validation_end_time = None
                if args.metrics:
                    # NOTE(review): `validation` may be None here; presumably
                    # get_validation_metric tolerates that — confirm.
                    v_metrics_fields = [
                        api.metric.get_validation_metric(name, validation)
                        for name in v_metrics_names
                    ]
                else:
                    v_metrics_fields = []
                row = ([
                    step.trial_id,
                    step.id,
                    step.state,
                    render.format_time(step.start_time),
                    render.format_time(step.end_time),
                ] + t_metrics_fields + [
                    checkpoint_state,
                    render.format_time(checkpoint_start_time),
                    render.format_time(checkpoint_end_time),
                    validation_state,
                    render.format_time(validation_start_time),
                    render.format_time(validation_end_time),
                ] + v_metrics_fields)
                values.append(row)
    if not args.outdir:
        outfile = None
        print("\nSteps:")
    else:
        outfile = args.outdir.joinpath("steps.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)