Example #1
def _gather_experiments_training_curves(storage_dir,
                                        graph_key,
                                        curve_key,
                                        logger,
                                        x_metric,
                                        y_metric,
                                        x_data=None,
                                        y_data=None):

    # Initialize containers

    if x_data is None:
        x_data = OrderedDict()
    else:
        assert type(x_data) is OrderedDict

    if y_data is None:
        y_data = OrderedDict()
    else:
        assert type(y_data) is OrderedDict

    # Get all experiment directories

    all_experiments = DirectoryTree.get_all_experiments(
        storage_dir=storage_dir)

    for experiment_dir in all_experiments:

        # For that experiment, get all seed directories

        experiment_seeds = DirectoryTree.get_all_seeds(
            experiment_dir=experiment_dir)

        for i, seed_dir in enumerate(experiment_seeds):

            # Logs which seed directory is being processed

            logger.debug(f"{seed_dir}")

            # Loads training config

            config_dict = load_dict_from_json(str(seed_dir / "config.json"))

            # Keys can be any information stored in config.json
            # We also handle a few special cases (e.g. "experiment_num")

            keys = config_dict.copy()
            keys['experiment_num'] = seed_dir.parent.stem.strip('experiment')
            keys['storage_name'] = seed_dir.parents[1]

            outer_key = keys[graph_key]  # number of graphs to be made
            inner_key = keys[curve_key]  # number of curves per graph

            # Loads training data

            loaded_recorder = Recorder.init_from_pickle_file(
                filename=str(seed_dir / 'recorders' / 'train_recorder.pkl'))

            # Stores the data

            if outer_key not in y_data.keys():
                x_data[outer_key] = OrderedDict()
                y_data[outer_key] = OrderedDict()

            if inner_key not in y_data[outer_key].keys():
                x_data[outer_key][inner_key] = []
                y_data[outer_key][inner_key] = []

            # TODO: make sure that y_metric is a scalar metric, even for eval_return
            #  (and not 10 points for every eval_step). All metrics saved in the
            #  recorder should be scalars for every time point.
            x_data[outer_key][inner_key].append(loaded_recorder.tape[x_metric])
            y_data[outer_key][inner_key].append(loaded_recorder.tape[y_metric])

    return x_data, y_data
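
A minimal usage sketch for the function above (not part of the original module). The storage path, logger setup, and metric names ("episode", "eval_return") are assumptions; graph_key and curve_key must be keys available in each seed's config.json (or one of the special keys added above, e.g. "experiment_num").

import logging
from pathlib import Path

logger = logging.getLogger("gather_curves")
logging.basicConfig(level=logging.DEBUG)

# Hypothetical alfred-style storage directory containing experiment*/seed*/ subfolders
storage_dir = Path("storage/my_storage_dir")

x_data, y_data = _gather_experiments_training_curves(
    storage_dir=storage_dir,
    graph_key="task_name",   # one graph per task
    curve_key="alg_name",    # one curve per algorithm
    logger=logger,
    x_metric="episode",      # assumed to be recorded in train_recorder.pkl
    y_metric="eval_return")  # assumed to be recorded in train_recorder.pkl

# Result layout: {graph_key_value: {curve_key_value: [one list of values per seed]}}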
Example #2
def _compute_seed_scores(storage_dir, performance_metric,
                         performance_aggregation, group_key, bar_key,
                         re_run_if_exists, save_dir, logger, root_dir,
                         n_eval_runs):
    if (storage_dir / save_dir /
            f"{save_dir}_seed_scores.pkl").exists() and not re_run_if_exists:
        logger.info(
            f" SKIPPING {storage_dir} - {save_dir}_seed_scores.pkl already exists"
        )
        return

    else:
        logger.info(f"Benchmarking {storage_dir}...")

    assert group_key in [
        'task_name', 'storage_name', 'experiment_num', 'alg_name'
    ]
    assert bar_key in [
        'task_name', 'storage_name', 'experiment_num', 'alg_name'
    ]

    # Initialize container

    scores = OrderedDict()

    # Get all experiment directories

    all_experiments = DirectoryTree.get_all_experiments(
        storage_dir=storage_dir)

    for experiment_dir in all_experiments:

        # For that experiment, get all seed directories

        experiment_seeds = DirectoryTree.get_all_seeds(
            experiment_dir=experiment_dir)

        # Initialize container

        all_seeds_scores = []

        for i, seed_dir in enumerate(experiment_seeds):
            # Logs which seed directory is being processed

            logger.debug(f"{seed_dir}")

            # Loads training config

            config_dict = load_dict_from_json(str(seed_dir / "config.json"))

            # Selects how data will be identified

            keys = {
                "task_name": config_dict["task_name"],
                "storage_name": seed_dir.parents[1].name,
                "alg_name": config_dict["alg_name"],
                "experiment_num": seed_dir.parents[0].name.strip('experiment')
            }

            outer_key = keys[bar_key]
            inner_key = keys[group_key]

            # Evaluation phase

            if performance_metric == 'evaluation_runs':

                assert n_eval_runs is not None

                try:
                    from evaluate import evaluate, get_evaluation_args
                except ImportError as e:
                    raise ImportError(
                        f"{e}\nTo evaluate models based on --performance_metric='evaluation_runs', "
                        f"alfred.benchmark assumes that the working directory contains "
                        f"a file called evaluate.py defining two functions: "
                        f"\n\t1. a function evaluate() that returns a score for each evaluation run"
                        f"\n\t2. a function get_evaluation_args() that returns a Namespace of arguments for evaluate()"
                    )

                # Sets config for evaluation phase

                eval_config = get_evaluation_args(overwritten_args="")
                eval_config.storage_name = seed_dir.parents[1].name
                eval_config.experiment_num = int(
                    seed_dir.parents[0].name.strip("experiment"))
                eval_config.seed_num = int(seed_dir.name.strip("seed"))
                eval_config.render = False
                eval_config.n_episodes = n_eval_runs
                eval_config.root_dir = root_dir

                # Evaluates agent and stores the return

                performance_data = evaluate(eval_config)

            else:

                # Loads training data

                loaded_recorder = Recorder.init_from_pickle_file(
                    filename=str(seed_dir / 'recorders' /
                                 'train_recorder.pkl'))

                performance_data = loaded_recorder.tape[performance_metric]

            # Aggregation phase

            if performance_aggregation == 'min':
                score = np.min(performance_data)

            elif performance_aggregation == 'max':
                score = np.max(performance_data)

            elif performance_aggregation == 'avg':
                score = np.mean(performance_data)

            elif performance_aggregation == 'last':
                score = performance_data[-1]

            elif performance_aggregation == 'mean_on_last_20_percents':
                eighty_percent_index = int(0.8 * len(performance_data))
                score = np.mean(performance_data[eighty_percent_index:])
            else:
                raise NotImplementedError

            all_seeds_scores.append(score)

        # outer_key / inner_key are identical for all seeds of an experiment (same config),
        # so the values from the last seed iteration identify this experiment
        if outer_key not in scores.keys():
            scores[outer_key] = OrderedDict()

        scores[outer_key][inner_key] = np.stack(all_seeds_scores)

    os.makedirs(storage_dir / save_dir, exist_ok=True)

    with open(storage_dir / save_dir / f"{save_dir}_seed_scores.pkl",
              "wb") as f:
        pickle.dump(scores, f)

    scores_info = {
        'n_eval_runs': n_eval_runs,
        'performance_metric': performance_metric,
        'performance_aggregation': performance_aggregation
    }

    save_dict_to_json(scores_info,
                      filename=str(storage_dir / save_dir /
                                   f"{save_dir}_seed_scores_info.json"))
Example #3
def create_plot_arrays(
        from_file,
        storage_name,
        root_dir,
        remove_none,
        logger,
        plots_to_make=alfred.defaults.DEFAULT_PLOTS_ARRAYS_TO_MAKE):
    """
    Creates and and saves comparative figure containing a plot of total reward for each different experiment
    :param storage_dir: pathlib.Path object of the model directory containing the experiments to compare
    :param plots_to_make: list of strings indicating which comparative plots to make
    :return: None
    """
    # Select storage_dirs to run over

    storage_dirs = select_storage_dirs(from_file, storage_name, root_dir)

    for storage_dir in storage_dirs:

        # Get all experiment directories and sort them numerically

        sorted_experiments = DirectoryTree.get_all_experiments(storage_dir)

        all_seeds_dir = []
        for experiment in sorted_experiments:
            all_seeds_dir = all_seeds_dir + DirectoryTree.get_all_seeds(
                experiment)

        # Determines what type of search was done

        if (storage_dir / 'GRID_SEARCH').exists():
            search_type = 'grid'
        elif (storage_dir / 'RANDOM_SEARCH').exists():
            search_type = 'random'
        else:
            search_type = 'unknown'

        # Determines row and columns of subplots

        if search_type == 'grid':
            variations = load_dict_from_json(filename=str(storage_dir /
                                                          'variations.json'))

            # experiment_groups account for the fact that all the experiment_dir in a storage_dir may have been created
            # through several runs of prepare_schedule.py, and therefore, that several "groups" of experiments may exist
            experiment_groups = {key: {} for key in variations.keys()}
            for group_key, properties in experiment_groups.items():
                properties['variations'] = variations[group_key]

                properties['variations_lengths'] = {
                    k: len(properties['variations'][k])
                    for k in properties['variations'].keys()
                }

                # Deleting alg_name and task_name from variations (they do not vary within a single storage_dir)

                hyperparam_variations_lengths = deepcopy(
                    properties['variations_lengths'])
                del hyperparam_variations_lengths['alg_name']
                del hyperparam_variations_lengths['task_name']

                i_max = sorted(hyperparam_variations_lengths.values())[-1]
                j_max = int(
                    np.prod(
                        sorted(hyperparam_variations_lengths.values())[:-1]))

                if i_max < 4 and j_max == 1:
                    # If only one hyperparameter was varied over, we order plots on a line
                    j_max = i_max
                    i_max = 1
                    ax_array_dim = 1

                elif i_max >= 4 and j_max == 1:
                    # ... unless there are 4 or more variations, then we put them in a square-ish fashion
                    j_max = int(np.sqrt(i_max))
                    i_max = int(np.ceil(float(i_max) / float(j_max)))
                    ax_array_dim = 2

                else:
                    ax_array_dim = 2

                properties['ax_array_shape'] = (i_max, j_max)
                properties['ax_array_dim'] = ax_array_dim

        else:
            experiment_groups = {"all": {}}
            for group_key, properties in experiment_groups.items():
                i_max = len(sorted_experiments)  # each experiment is on a different row
                j_max = len(all_seeds_dir) // i_max  # each seed is on a different column

                if i_max == 1:
                    ax_array_dim = 1
                else:
                    ax_array_dim = 2

                properties['ax_array_shape'] = (i_max, j_max)
                properties['ax_array_dim'] = ax_array_dim

        for group_key, properties in experiment_groups.items():
            logger.debug(
                f"\n===========================\nPLOTS FOR EXPERIMENT GROUP: {group_key}"
            )
            i_max, j_max = properties['ax_array_shape']
            ax_array_dim = properties['ax_array_dim']

            first_exp = group_key.split('-')[0] if group_key != "all" else 0
            if first_exp != 0:
                for seed_idx, seed_dir in enumerate(all_seeds_dir):
                    if seed_dir.parent.stem.strip('experiment') == first_exp:
                        first_seed_idx = seed_idx
                        break
            else:
                first_seed_idx = 0

            for plot_to_make in plots_to_make:
                x_metric, y_metric, x_lim, y_lim = plot_to_make
                logger.debug(f'\n{y_metric} as a function of {x_metric}:')

                # Creates the subplots

                fig, ax_array = plt.subplots(i_max,
                                             j_max,
                                             figsize=(10 * j_max, 6 * i_max))

                for i in range(i_max):
                    for j in range(j_max):

                        if ax_array_dim == 1 and i_max == 1 and j_max == 1:
                            current_ax = ax_array
                        elif ax_array_dim == 1 and (i_max > 1 or j_max > 1):
                            current_ax = ax_array[j]
                        elif ax_array_dim == 2:
                            current_ax = ax_array[i, j]
                        else:
                            raise Exception(
                                'ax_array should not have more than two dimensions'
                            )

                        try:
                            seed_dir = all_seeds_dir[first_seed_idx +
                                                     (i * j_max + j)]
                            if group_key != 'all':
                                experiment_num = int(str(seed_dir.parent).split('experiment')[1])
                                group_first = int(group_key.split('-')[0])
                                group_last = int(group_key.split('-')[1])
                                if experiment_num < group_first or experiment_num > group_last:
                                    raise IndexError
                            logger.debug(str(seed_dir))
                        except IndexError as e:
                            logger.debug(
                                f'experiment{i * j_max + j} does not exist')
                            current_ax.text(0.2,
                                            0.2,
                                            "no experiment\n found",
                                            transform=current_ax.transAxes,
                                            fontsize=24,
                                            fontweight='bold',
                                            color='red')
                            continue

                        logger.debug(seed_dir)

                        # Writes unique hyperparameters on plot

                        config = load_config_from_json(
                            filename=str(seed_dir / 'config.json'))
                        config_unique_dict = load_dict_from_json(
                            filename=str(seed_dir / 'config_unique.json'))
                        validate_config_unique(config, config_unique_dict)

                        if search_type == 'grid':
                            sorted_keys = sorted(
                                config_unique_dict.keys(),
                                key=lambda item:
                                (properties['variations_lengths'][item], item),
                                reverse=True)

                        else:
                            sorted_keys = config_unique_dict

                        info_str = f'{seed_dir.parent.stem}\n' + '\n'.join([
                            f'{k} = {config_unique_dict[k]}'
                            for k in sorted_keys
                        ])
                        bbox_props = dict(facecolor='gray', alpha=0.1)
                        current_ax.text(0.05,
                                        0.95,
                                        info_str,
                                        transform=current_ax.transAxes,
                                        fontsize=12,
                                        verticalalignment='top',
                                        bbox=bbox_props)

                        # Skip cases of UNHATCHED or CRASHED experiments

                        if (seed_dir / 'UNHATCHED').exists():
                            logger.debug('UNHATCHED')
                            current_ax.text(0.2,
                                            0.2,
                                            "UNHATCHED",
                                            transform=current_ax.transAxes,
                                            fontsize=24,
                                            fontweight='bold',
                                            color='blue')
                            continue

                        if (seed_dir / 'CRASH.txt').exists():
                            logger.debug('CRASHED')
                            current_ax.text(0.2,
                                            0.2,
                                            "CRASHED",
                                            transform=current_ax.transAxes,
                                            fontsize=24,
                                            fontweight='bold',
                                            color='red')
                            continue

                        try:

                            # Loading the recorder

                            loaded_recorder = Recorder.init_from_pickle_file(
                                filename=str(seed_dir / 'recorders' /
                                             'train_recorder.pkl'))

                            # Checking if provided metrics are present in the recorder

                            if y_metric not in loaded_recorder.tape.keys():
                                logger.debug(
                                    f"'{y_metric}' was not recorded in train_recorder."
                                )
                                current_ax.text(0.2,
                                                0.2,
                                                "ABSENT METRIC",
                                                transform=current_ax.transAxes,
                                                fontsize=24,
                                                fontweight='bold',
                                                color='red')
                                continue

                            if x_metric is not None and x_metric not in loaded_recorder.tape.keys():
                                logger.debug(
                                    f"'{x_metric}' was not recorded in train_recorder."
                                )
                                current_ax.text(0.2,
                                                0.2,
                                                "ABSENT METRIC",
                                                transform=current_ax.transAxes,
                                                fontsize=24,
                                                fontweight='bold',
                                                color='red')
                                continue

                            # Removing None entries

                            if remove_none:
                                if x_metric is not None:
                                    loaded_recorder.tape[x_metric] = remove_nones(
                                        loaded_recorder.tape[x_metric])
                                loaded_recorder.tape[y_metric] = remove_nones(
                                    loaded_recorder.tape[y_metric])

                            # Plotting

                            try:

                                if x_metric is not None:
                                    plot_curves(
                                        current_ax,
                                        ys=[loaded_recorder.tape[y_metric]],
                                        xs=[loaded_recorder.tape[x_metric]],
                                        xlim=x_lim,
                                        ylim=y_lim,
                                        xlabel=x_metric,
                                        title=y_metric)
                                else:
                                    plot_curves(
                                        current_ax,
                                        ys=[loaded_recorder.tape[y_metric]],
                                        xlim=x_lim,
                                        ylim=y_lim,
                                        title=y_metric)

                            except Exception as e:
                                logger.debug(f'Plotting error: {e}')

                        except FileNotFoundError:
                            logger.debug('Training recorder not found')
                            current_ax.text(0.2,
                                            0.2,
                                            "'train_recorder'\nnot found",
                                            transform=current_ax.transAxes,
                                            fontsize=24,
                                            fontweight='bold',
                                            color='red')
                            continue

                plt.tight_layout()
                fig.savefig(
                    str(storage_dir /
                        f'{group_key}_comparative_{y_metric}_over_{x_metric}.png'
                        ))
                plt.close(fig)
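
An illustrative call, with assumptions flagged: storage_name, the metric names, and the logging setup are placeholders; each plots_to_make entry is an (x_metric, y_metric, x_lim, y_lim) tuple, as unpacked in the plotting loop above.

import logging

logger = logging.getLogger("plots")
logging.basicConfig(level=logging.DEBUG)

create_plot_arrays(
    from_file=None,                 # or a file listing storage directories (passed to select_storage_dirs)
    storage_name="my_storage_dir",  # hypothetical storage directory name
    root_dir=None,                  # passed through to select_storage_dirs
    remove_none=True,
    logger=logger,
    plots_to_make=[("episode", "eval_return", None, None)])  # (x_metric, y_metric, x_lim, y_lim)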