Example No. 1
def experiment_logs(args: Namespace) -> None:
    sess = setup_session(args)
    trials = bindings.get_GetExperimentTrials(
        sess, experimentId=args.experiment_id).trials
    if len(trials) == 0:
        print(f"No trials found for experiment {args.experiment_id}. "
              "Try again after the experiment has a trial running.")
        return
    # The experiment's first trial is the one with the lowest ID.
    first_trial_id = sorted(t.id for t in trials)[0]

    api.pprint_trial_logs(
        args.master,
        first_trial_id,
        head=args.head,
        tail=args.tail,
        follow=args.follow,
        agent_ids=args.agent_ids,
        container_ids=args.container_ids,
        rank_ids=args.rank_ids,
        sources=args.sources,
        stdtypes=args.stdtypes,
        level_above=args.level,
        timestamp_before=args.timestamp_before,
        timestamp_after=args.timestamp_after,
    )
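The handler above expects parsed CLI arguments. A minimal sketch of driving it directly, assuming a reachable master; the Namespace fields simply mirror the attributes read above and are illustrative, not part of the original:

from argparse import Namespace

# Hypothetical invocation; every value here is a placeholder.
args = Namespace(
    master="http://localhost:8080", experiment_id=1,
    head=None, tail=None, follow=False,
    agent_ids=None, container_ids=None, rank_ids=None,
    sources=None, stdtypes=None, level=None,
    timestamp_before=None, timestamp_after=None,
)
experiment_logs(args)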
Example No. 2
def experiment_trials(experiment_id: int) -> List[bindings.v1GetTrialResponse]:
    r1 = bindings.get_GetExperimentTrials(test_session(),
                                          experimentId=experiment_id)
    src_trials = r1.trials
    trials = []
    for trial in src_trials:
        r2 = bindings.get_GetTrial(test_session(), trialId=trial.id)
        trials.append(r2)  # each response includes the trial and its workloads
    return trials
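Each returned response pairs a trial with its workloads, so callers can walk both. A small usage sketch, assuming test_session() can reach a master and experiment 42 exists (the ID is illustrative):

# Illustrative: print each trial ID with its workload count.
for resp in experiment_trials(42):
    print(resp.trial.id, len(resp.workloads or []))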
Example No. 3
def experiment_first_trial(exp_id: int) -> int:
    session = test_session()
    trials = bindings.get_GetExperimentTrials(session,
                                              experimentId=exp_id).trials

    assert len(trials) > 0
    trial = trials[0]
    trial_id = trial.id
    return trial_id
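Unlike Examples 1 and 4, this helper takes trials[0] without sorting, so it relies on the server's response ordering. A sketch of a variant that picks the lowest trial ID explicitly (not part of the original):

def experiment_first_trial_by_id(exp_id: int) -> int:
    # Choose the lowest trial ID rather than relying on response order.
    trials = bindings.get_GetExperimentTrials(test_session(),
                                              experimentId=exp_id).trials
    assert len(trials) > 0
    return min(t.id for t in trials)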
Example No. 4
def follow_experiment_logs(master_url: str, exp_id: int) -> None:
    # Get the ID of this experiment's first trial (i.e., the one with the lowest ID).
    print("Waiting for first trial to begin...")
    sess = session.Session(master_url, None, None, None)
    while True:
        trials = bindings.get_GetExperimentTrials(sess,
                                                  experimentId=exp_id).trials
        if len(trials) > 0:
            break
        else:
            time.sleep(0.1)

    first_trial_id = sorted(t.id for t in trials)[0]
    print("Following first trial with ID {}".format(first_trial_id))
    logs.pprint_trial_logs(master_url, first_trial_id, follow=True)
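The wait loop above polls every 100 ms with no upper bound. A sketch of the same wait with a timeout, built only from the calls already shown (the timeout parameter is an addition, not part of the original helper):

def wait_for_first_trial(sess, exp_id: int, timeout: float = 60.0) -> int:
    # Poll until the experiment reports a trial, or give up after `timeout` seconds.
    deadline = time.time() + timeout
    while time.time() < deadline:
        trials = bindings.get_GetExperimentTrials(sess, experimentId=exp_id).trials
        if trials:
            return sorted(t.id for t in trials)[0]
        time.sleep(0.1)
    raise TimeoutError(f"experiment {exp_id} still has no trials after {timeout}s")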
Example No. 5
def describe(args: Namespace) -> None:
    session = setup_session(args)
    exps = []
    for experiment_id in args.experiment_ids.split(","):
        r = bindings.get_GetExperiment(session, experimentId=int(experiment_id))
        if args.json:
            exps.append(r.to_json())
        else:
            exps.append(r.experiment)

    if args.json:
        print(json.dumps(exps, indent=4))
        return

    # Display overall experiment information.
    headers = [
        "Experiment ID",
        "State",
        "Progress",
        "Start Time",
        "End Time",
        "Name",
        "Description",
        "Archived",
        "Resource Pool",
        "Labels",
    ]
    values = [[
        exp.id,
        exp.state.value.replace("STATE_", ""),
        render.format_percent(exp.progress),
        render.format_time(exp.startTime),
        render.format_time(exp.endTime),
        exp.name,
        exp.description,
        exp.archived,
        exp.resourcePool,
        ", ".join(sorted(exp.labels or [])),
    ] for exp in exps]
    if not args.outdir:
        outfile = None
        print("Experiment:")
    else:
        outfile = args.outdir.joinpath("experiments.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display trial-related information.
    trials_for_experiment: Dict[int, Sequence[bindings.trialv1Trial]] = {}
    for exp in exps:
        trials_for_experiment[exp.id] = bindings.get_GetExperimentTrials(
            session, experimentId=exp.id).trials

    headers = [
        "Trial ID", "Experiment ID", "State", "Start Time", "End Time",
        "H-Params"
    ]
    values = [[
        trial.id,
        exp.id,
        trial.state.value.replace("STATE_", ""),
        render.format_time(trial.startTime),
        render.format_time(trial.endTime),
        json.dumps(trial.hparams, indent=4),
    ] for exp in exps for trial in trials_for_experiment[exp.id]]
    if not args.outdir:
        outfile = None
        print("\nTrials:")
    else:
        outfile = args.outdir.joinpath("trials.csv")
    render.tabulate_or_csv(headers, values, args.csv, outfile)

    # Display step-related information.
    t_metrics_headers: List[str] = []
    t_metrics_names: List[str] = []
    v_metrics_headers: List[str] = []
    v_metrics_names: List[str] = []
    if args.metrics:
        # Accumulate the scalar training and validation metric names from all provided experiments.
        for exp in exps:
            if not trials_for_experiment[exp.id]:
                # Skip experiments that have no trials yet.
                continue
            sample_trial = trials_for_experiment[exp.id][0]
            sample_workloads = bindings.get_GetTrial(
                session, trialId=sample_trial.id).workloads
            t_metrics_names += scalar_training_metrics_names(sample_workloads)
            v_metrics_names += scalar_validation_metrics_names(
                sample_workloads)
        t_metrics_names = sorted(set(t_metrics_names))
        t_metrics_headers = [
            "Training Metric: {}".format(name) for name in t_metrics_names
        ]
        v_metrics_names = sorted(set(v_metrics_names))
        v_metrics_headers = [
            "Validation Metric: {}".format(name) for name in v_metrics_names
        ]

    headers = (["Trial ID", "# of Batches", "State", "Report Time"] +
               t_metrics_headers + [
                   "Checkpoint State",
                   "Checkpoint Report Time",
                   "Validation State",
                   "Validation Report Time",
               ] + v_metrics_headers)

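    # Rows are keyed by cumulative batch count (totalBatches) so that training,
    # checkpoint, and validation workloads reported at the same point can be
    # merged into a single step-like row below.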
    wl_output: Dict[int, List[Any]] = {}
    for exp in exps:
        for trial in trials_for_experiment[exp.id]:
            workloads = bindings.get_GetTrial(session,
                                              trialId=trial.id).workloads
            for workload in workloads:
                t_metrics_fields = []
                wl_detail: Optional[
                    Union[bindings.v1MetricsWorkload,
                          bindings.v1CheckpointWorkload]] = None
                if workload.training:
                    wl_detail = workload.training
                    for name in t_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            t_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            t_metrics_fields.append(None)
                else:
                    t_metrics_fields = [None] * len(t_metrics_names)

                if workload.checkpoint:
                    wl_detail = workload.checkpoint

                if workload.checkpoint and wl_detail:
                    checkpoint_state = wl_detail.state.value
                    checkpoint_end_time = wl_detail.endTime
                else:
                    checkpoint_state = ""
                    checkpoint_end_time = None

                v_metrics_fields = []
                if workload.validation:
                    wl_detail = workload.validation
                    validation_state = wl_detail.state.value
                    validation_end_time = wl_detail.endTime
                    for name in v_metrics_names:
                        if wl_detail.metrics and (name in wl_detail.metrics):
                            v_metrics_fields.append(wl_detail.metrics[name])
                        else:
                            v_metrics_fields.append(None)
                else:
                    validation_state = ""
                    validation_end_time = None
                    v_metrics_fields = [None] * len(v_metrics_names)

                if wl_detail:
                    if wl_detail.totalBatches in wl_output:
                        # condense training, checkpoints, validation workloads into one step-like
                        # row for compatibility with previous versions of describe
                        merge_row = wl_output[wl_detail.totalBatches]
                        merge_row[3] = max(
                            merge_row[3],
                            render.format_time(wl_detail.endTime))
                        for idx, tfield in enumerate(t_metrics_fields):
                            if tfield and merge_row[4 + idx] is None:
                                merge_row[4 + idx] = tfield
                        start_checkpoint = 4 + len(t_metrics_fields)
                        if checkpoint_state:
                            merge_row[
                                start_checkpoint] = checkpoint_state.replace(
                                    "STATE_", "")
                            merge_row[start_checkpoint +
                                      1] = render.format_time(
                                          checkpoint_end_time)
                        if validation_end_time:
                            merge_row[start_checkpoint +
                                      3] = render.format_time(
                                          validation_end_time)
                        if validation_state:
                            merge_row[start_checkpoint +
                                      2] = validation_state.replace(
                                          "STATE_", "")
                        for idx, vfield in enumerate(v_metrics_fields):
                            if vfield and merge_row[start_checkpoint + idx +
                                                    4] is None:
                                merge_row[start_checkpoint + idx + 4] = vfield
                    else:
                        row = ([
                            trial.id,
                            wl_detail.totalBatches,
                            wl_detail.state.value.replace("STATE_", ""),
                            render.format_time(wl_detail.endTime),
                        ] + t_metrics_fields + [
                            checkpoint_state.replace("STATE_", ""),
                            render.format_time(checkpoint_end_time),
                            validation_state.replace("STATE_", ""),
                            render.format_time(validation_end_time),
                        ] + v_metrics_fields)
                        wl_output[wl_detail.totalBatches] = row

    if not args.outdir:
        outfile = None
        print("\nWorkloads:")
    else:
        outfile = args.outdir.joinpath("workloads.csv")
    values = sorted(wl_output.values(), key=lambda a: int(a[1]))
    render.tabulate_or_csv(headers, values, args.csv, outfile)
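Design note: keying wl_output on totalBatches is what lets the separate training, checkpoint, and validation workload records reported at the same batch count collapse into one row, preserving the step-oriented layout of earlier versions of describe; the rows are then sorted by batch count before rendering.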
Example No. 6
def follow_test_experiment_logs(master_url: str, exp_id: int) -> None:
    def print_progress(active_stage: int, ended: bool) -> None:
        # There are four sequential stages of verification. Track the
        # current stage with an index into this list.
        stages = [
            "Scheduling task",
            "Testing training",
            "Testing validation",
            "Testing checkpointing",
        ]

        for idx, stage in enumerate(stages):
            if active_stage > idx:
                color = "green"
                checkbox = "✔"
            elif active_stage == idx:
                color = "red" if ended else "yellow"
                checkbox = "✗" if ended else " "
            else:
                color = "white"
                checkbox = " "
            print(colored(stage.ljust(25, "."), color), end="")
            print(colored(" [" + checkbox + "]", color), end="")

            if idx == len(stages) - 1:
                print("\n" if ended else "\r", end="")
            else:
                print(", ", end="")

    sess = session.Session(master_url, None, None, None)
    while True:
        r = bindings.get_GetExperiment(sess, experimentId=exp_id).experiment
        trials = bindings.get_GetExperimentTrials(sess,
                                                  experimentId=exp_id).trials

        # Wait for experiment to start and initialize a trial.
        if len(trials) < 1:
            t = {}
        else:
            trial_id = trials[0].id
            t = api.get(master_url, f"trials/{trial_id}").json()

        # Update the active_stage by examining the result from master
        # /api/v1/experiments/<experiment-id> endpoint.
        exp_state = r.state.value.replace("STATE_", "")
        if exp_state == constants.COMPLETED:
            active_stage = 4
        elif t.get("runner_state") == "checkpointing":
            active_stage = 3
        elif t.get("runner_state") == "validating":
            active_stage = 2
        elif t.get("runner_state") in ("UNSPECIFIED", "training"):
            active_stage = 1
        else:
            active_stage = 0

        # If the experiment is in a terminal state, output the appropriate
        # message and exit. Otherwise, sleep and repeat.
        if exp_state == constants.COMPLETED:
            print_progress(active_stage, ended=True)
            print(colored("Model definition test succeeded! 🎉", "green"))
            return
        elif exp_state == constants.CANCELED:
            print_progress(active_stage, ended=True)
            print(
                colored(
                    "Model definition test (ID: {}) canceled before "
                    "model test could complete. Please re-run the "
                    "command.".format(exp_id),
                    "yellow",
                ))
            sys.exit(1)
        elif exp_state == constants.ERROR:
            print_progress(active_stage, ended=True)
            # The experiment may error before any trial exists; only print
            # trial logs when there is one.
            if trials:
                logs.pprint_trial_logs(master_url, trials[0].id)
            sys.exit(1)
        else:
            print_progress(active_stage, ended=False)
            time.sleep(0.2)
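Note: print_progress redraws the four-stage checklist in place by ending the line with a carriage return ("\r") until the run reaches a terminal state, while the outer loop re-polls the master every 200 ms, so the status appears to update live in the terminal.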