def _gather_experiments_training_curves(storage_dir, graph_key, curve_key, logger, x_metric, y_metric,
                                        x_data=None, y_data=None):

    # Initialize containers

    if x_data is None:
        x_data = OrderedDict()
    else:
        assert type(x_data) is OrderedDict

    if y_data is None:
        y_data = OrderedDict()
    else:
        assert type(y_data) is OrderedDict

    # Get all experiment directories

    all_experiments = DirectoryTree.get_all_experiments(storage_dir=storage_dir)

    for experiment_dir in all_experiments:

        # For that experiment, get all seed directories

        experiment_seeds = DirectoryTree.get_all_seeds(experiment_dir=experiment_dir)

        for i, seed_dir in enumerate(experiment_seeds):

            # Prints which seed directory is being treated

            logger.debug(f"{seed_dir}")

            # Loads training config

            config_dict = load_dict_from_json(str(seed_dir / "config.json"))

            # Keys can be any information stored in config.json
            # We also handle a few special cases (e.g. "experiment_num")

            keys = config_dict.copy()
            keys['experiment_num'] = seed_dir.parent.stem.strip('experiment')
            keys['storage_name'] = seed_dir.parents[1]

            outer_key = keys[graph_key]  # number of graphs to be made
            inner_key = keys[curve_key]  # number of curves per graph

            # Loads training data

            loaded_recorder = Recorder.init_from_pickle_file(
                filename=str(seed_dir / 'recorders' / 'train_recorder.pkl'))

            # Stores the data

            if outer_key not in y_data.keys():
                x_data[outer_key] = OrderedDict()
                y_data[outer_key] = OrderedDict()

            if inner_key not in y_data[outer_key].keys():
                x_data[outer_key][inner_key] = []
                y_data[outer_key][inner_key] = []

            x_data[outer_key][inner_key].append(loaded_recorder.tape[x_metric])
            y_data[outer_key][inner_key].append(loaded_recorder.tape[y_metric])
            # TODO: make sure that this is a scalar metric, even for eval_return (and not 10 points for every
            #  eval_step). All metrics saved in the recorder should be scalars for every time point.

    return x_data, y_data
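
# Illustrative call (hedged sketch): the values below are assumptions chosen for the example.
# Any key stored in config.json (plus the special 'experiment_num' and 'storage_name' keys added
# above) can serve as graph_key or curve_key, and x_metric / y_metric must be names recorded in
# train_recorder.pkl.
#
#     import logging
#     from pathlib import Path
#
#     x_data, y_data = _gather_experiments_training_curves(
#         storage_dir=Path("storage/my_storage_dir"),   # hypothetical storage_dir
#         graph_key="task_name",                        # one graph per task
#         curve_key="experiment_num",                   # one curve per experiment
#         logger=logging.getLogger(__name__),
#         x_metric="episode",                           # assumed recorded metrics
#         y_metric="return")
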
def _compute_seed_scores(storage_dir, performance_metric, performance_aggregation, group_key, bar_key,
                         re_run_if_exists, save_dir, logger, root_dir, n_eval_runs):

    if (storage_dir / save_dir / f"{save_dir}_seed_scores.pkl").exists() and not re_run_if_exists:
        logger.info(f" SKIPPING {storage_dir} - {save_dir}_seed_scores.pkl already exists")
        return

    else:
        logger.info(f"Benchmarking {storage_dir}...")

    assert group_key in ['task_name', 'storage_name', 'experiment_num', 'alg_name']
    assert bar_key in ['task_name', 'storage_name', 'experiment_num', 'alg_name']

    # Initialize container

    scores = OrderedDict()

    # Get all experiment directories

    all_experiments = DirectoryTree.get_all_experiments(storage_dir=storage_dir)

    for experiment_dir in all_experiments:

        # For that experiment, get all seed directories

        experiment_seeds = DirectoryTree.get_all_seeds(experiment_dir=experiment_dir)

        # Initialize container

        all_seeds_scores = []

        for i, seed_dir in enumerate(experiment_seeds):

            # Prints which seed directory is being treated

            logger.debug(f"{seed_dir}")

            # Loads training config

            config_dict = load_dict_from_json(str(seed_dir / "config.json"))

            # Selects how data will be identified

            keys = {
                "task_name": config_dict["task_name"],
                "storage_name": seed_dir.parents[1].name,
                "alg_name": config_dict["alg_name"],
                "experiment_num": seed_dir.parents[0].name.strip('experiment')
            }

            outer_key = keys[bar_key]
            inner_key = keys[group_key]

            # Evaluation phase

            if performance_metric == 'evaluation_runs':

                assert n_eval_runs is not None

                try:
                    from evaluate import evaluate, get_evaluation_args
                except ImportError as e:
                    raise ImportError(
                        f"{e}\nTo evaluate models based on --performance_metric='evaluation_runs', "
                        f"alfred.benchmark assumes that the working directory contains "
                        f"a file called evaluate.py defining two functions: "
                        f"\n\t1. a function evaluate() that returns a score for each evaluation run"
                        f"\n\t2. a function get_evaluation_args() that returns a Namespace of arguments for evaluate()")

                # Sets config for evaluation phase

                eval_config = get_evaluation_args(overwritten_args="")
                eval_config.storage_name = seed_dir.parents[1].name
                eval_config.experiment_num = int(seed_dir.parents[0].name.strip("experiment"))
                eval_config.seed_num = int(seed_dir.name.strip("seed"))
                eval_config.render = False
                eval_config.n_episodes = n_eval_runs
                eval_config.root_dir = root_dir

                # Evaluates agent and stores the return

                performance_data = evaluate(eval_config)

            else:

                # Loads training data

                loaded_recorder = Recorder.init_from_pickle_file(
                    filename=str(seed_dir / 'recorders' / 'train_recorder.pkl'))

                performance_data = loaded_recorder.tape[performance_metric]

            # Aggregation phase

            if performance_aggregation == 'min':
                score = np.min(performance_data)

            elif performance_aggregation == 'max':
                score = np.max(performance_data)

            elif performance_aggregation == 'avg':
                score = np.mean(performance_data)

            elif performance_aggregation == 'last':
                score = performance_data[-1]

            elif performance_aggregation == 'mean_on_last_20_percents':
                eighty_percent_index = int(0.8 * len(performance_data))
                score = np.mean(performance_data[eighty_percent_index:])

            else:
                raise NotImplementedError

            all_seeds_scores.append(score)

        if outer_key not in scores.keys():
            scores[outer_key] = OrderedDict()

        scores[outer_key][inner_key] = np.stack(all_seeds_scores)

    os.makedirs(storage_dir / save_dir, exist_ok=True)

    with open(storage_dir / save_dir / f"{save_dir}_seed_scores.pkl", "wb") as f:
        pickle.dump(scores, f)

    scores_info = {'n_eval_runs': n_eval_runs,
                   'performance_metric': performance_metric,
                   'performance_aggregation': performance_aggregation}

    save_dict_to_json(scores_info,
                      filename=str(storage_dir / save_dir / f"{save_dir}_seed_scores_info.json"))
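
# Illustrative call (hedged sketch): the storage path and metric name below are assumptions.
# With performance_metric='evaluation_runs', scores come from n_eval_runs fresh rollouts via
# the working directory's evaluate.py; with any other metric, the recorded values are read from
# train_recorder.pkl and reduced according to performance_aggregation
# ('min', 'max', 'avg', 'last' or 'mean_on_last_20_percents').
#
#     import logging
#     from pathlib import Path
#
#     _compute_seed_scores(storage_dir=Path("storage/my_storage_dir"),   # hypothetical storage_dir
#                          performance_metric="return",                  # assumed recorded metric
#                          performance_aggregation="mean_on_last_20_percents",
#                          group_key="experiment_num",
#                          bar_key="storage_name",
#                          re_run_if_exists=False,
#                          save_dir="benchmark",                         # hypothetical save_dir name
#                          logger=logging.getLogger(__name__),
#                          root_dir=None,
#                          n_eval_runs=None)
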
def create_plot_arrays(from_file, storage_name, root_dir, remove_none, logger,
                       plots_to_make=alfred.defaults.DEFAULT_PLOTS_ARRAYS_TO_MAKE):
    """
    Creates and saves comparative figures: for each selected storage_dir and each plot in plots_to_make,
    one figure containing a subplot of y_metric against x_metric for every experiment
    :param from_file: path to a file listing the storage_dirs to process
    :param storage_name: name of a single storage_dir to process (alternative to from_file)
    :param root_dir: root directory containing the storage_dirs
    :param remove_none: if True, None entries are removed from the recorded metrics before plotting
    :param logger: logger used to report progress
    :param plots_to_make: list of (x_metric, y_metric, x_lim, y_lim) tuples indicating which comparative plots to make
    :return: None
    """
    # Select storage_dirs to run over

    storage_dirs = select_storage_dirs(from_file, storage_name, root_dir)

    for storage_dir in storage_dirs:

        # Get all experiment directories and sorts them numerically

        sorted_experiments = DirectoryTree.get_all_experiments(storage_dir)

        all_seeds_dir = []
        for experiment in sorted_experiments:
            all_seeds_dir = all_seeds_dir + DirectoryTree.get_all_seeds(experiment)

        # Determines what type of search was done

        if (storage_dir / 'GRID_SEARCH').exists():
            search_type = 'grid'
        elif (storage_dir / 'RANDOM_SEARCH').exists():
            search_type = 'random'
        else:
            search_type = 'unknown'

        # Determines rows and columns of subplots

        if search_type == 'grid':
            variations = load_dict_from_json(filename=str(storage_dir / 'variations.json'))

            # experiment_groups account for the fact that the experiment_dirs in a storage_dir may have been
            # created through several runs of prepare_schedule.py, and that therefore several "groups" of
            # experiments may exist

            experiment_groups = {key: {} for key in variations.keys()}
            for group_key, properties in experiment_groups.items():
                properties['variations'] = variations[group_key]
                properties['variations_lengths'] = {k: len(properties['variations'][k])
                                                    for k in properties['variations'].keys()}

                # Deleting alg_name and task_name from variations (because different alg_name and task_name
                # are never contained in the same storage_dir)

                hyperparam_variations_lengths = deepcopy(properties['variations_lengths'])
                del hyperparam_variations_lengths['alg_name']
                del hyperparam_variations_lengths['task_name']

                i_max = sorted(hyperparam_variations_lengths.values())[-1]
                j_max = int(np.prod(sorted(hyperparam_variations_lengths.values())[:-1]))

                if i_max < 4 and j_max == 1:
                    # If only one hyperparameter was varied over, we order plots on a line
                    j_max = i_max
                    i_max = 1
                    ax_array_dim = 1

                elif i_max >= 4 and j_max == 1:
                    # ... unless there are 4 or more variations, then we put them in a square-ish fashion
                    j_max = int(np.sqrt(i_max))
                    i_max = int(np.ceil(float(i_max) / float(j_max)))
                    ax_array_dim = 2

                else:
                    ax_array_dim = 2

                properties['ax_array_shape'] = (i_max, j_max)
                properties['ax_array_dim'] = ax_array_dim

        else:
            experiment_groups = {"all": {}}
            for group_key, properties in experiment_groups.items():
                i_max = len(sorted_experiments)  # each experiment is on a different row
                j_max = len(all_seeds_dir) // i_max  # each seed is on a different column

                if i_max == 1:
                    ax_array_dim = 1
                else:
                    ax_array_dim = 2

                properties['ax_array_shape'] = (i_max, j_max)
                properties['ax_array_dim'] = ax_array_dim

        for group_key, properties in experiment_groups.items():
            logger.debug(f"\n===========================\nPLOTS FOR EXPERIMENT GROUP: {group_key}")
            i_max, j_max = properties['ax_array_shape']
            ax_array_dim = properties['ax_array_dim']

            first_exp = group_key.split('-')[0] if group_key != "all" else 0

            if first_exp != 0:
                for seed_idx, seed_dir in enumerate(all_seeds_dir):
                    if seed_dir.parent.stem.strip('experiment') == first_exp:
                        first_seed_idx = seed_idx
                        break

            else:
                first_seed_idx = 0

            for plot_to_make in plots_to_make:
                x_metric, y_metric, x_lim, y_lim = plot_to_make
                logger.debug(f'\n{y_metric} as a function of {x_metric}:')

                # Creates the subplots

                fig, ax_array = plt.subplots(i_max, j_max, figsize=(10 * j_max, 6 * i_max))

                for i in range(i_max):
                    for j in range(j_max):

                        if ax_array_dim == 1 and i_max == 1 and j_max == 1:
                            current_ax = ax_array
                        elif ax_array_dim == 1 and (i_max > 1 or j_max > 1):
                            current_ax = ax_array[j]
                        elif ax_array_dim == 2:
                            current_ax = ax_array[i, j]
                        else:
                            raise Exception('ax_array should not have more than two dimensions')

                        try:
                            seed_dir = all_seeds_dir[first_seed_idx + (i * j_max + j)]

                            if group_key != 'all' \
                                    and (int(str(seed_dir.parent).split('experiment')[1]) < int(group_key.split('-')[0])
                                         or int(str(seed_dir.parent).split('experiment')[1]) > int(group_key.split('-')[1])):
                                raise IndexError
                            logger.debug(str(seed_dir))

                        except IndexError:
                            logger.debug(f'experiment{i * j_max + j} does not exist')
                            current_ax.text(0.2, 0.2, "no experiment\n found", transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='red')
                            continue

                        logger.debug(seed_dir)

                        # Writes unique hyperparameters on plot

                        config = load_config_from_json(filename=str(seed_dir / 'config.json'))
                        config_unique_dict = load_dict_from_json(filename=str(seed_dir / 'config_unique.json'))
                        validate_config_unique(config, config_unique_dict)

                        if search_type == 'grid':
                            sorted_keys = sorted(config_unique_dict.keys(),
                                                 key=lambda item: (properties['variations_lengths'][item], item),
                                                 reverse=True)
                        else:
                            sorted_keys = config_unique_dict

                        info_str = f'{seed_dir.parent.stem}\n' + '\n'.join(
                            [f'{k} = {config_unique_dict[k]}' for k in sorted_keys])

                        bbox_props = dict(facecolor='gray', alpha=0.1)
                        current_ax.text(0.05, 0.95, info_str, transform=current_ax.transAxes, fontsize=12,
                                        verticalalignment='top', bbox=bbox_props)

                        # Skip cases of UNHATCHED or CRASHED experiments

                        if (seed_dir / 'UNHATCHED').exists():
                            logger.debug('UNHATCHED')
                            current_ax.text(0.2, 0.2, "UNHATCHED", transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='blue')
                            continue

                        if (seed_dir / 'CRASH.txt').exists():
                            logger.debug('CRASHED')
                            current_ax.text(0.2, 0.2, "CRASHED", transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='red')
                            continue

                        try:
                            # Loading the recorder

                            loaded_recorder = Recorder.init_from_pickle_file(
                                filename=str(seed_dir / 'recorders' / 'train_recorder.pkl'))

                            # Checking if provided metrics are present in the recorder

                            if y_metric not in loaded_recorder.tape.keys():
                                logger.debug(f"'{y_metric}' was not recorded in train_recorder.")
                                current_ax.text(0.2, 0.2, "ABSENT METRIC", transform=current_ax.transAxes,
                                                fontsize=24, fontweight='bold', color='red')
                                continue

                            if x_metric is not None and x_metric not in loaded_recorder.tape.keys():
                                logger.debug(f"'{x_metric}' was not recorded in train_recorder.")
                                current_ax.text(0.2, 0.2, "ABSENT METRIC", transform=current_ax.transAxes,
                                                fontsize=24, fontweight='bold', color='red')
                                continue

                            # Removing None entries

                            if remove_none:
                                loaded_recorder.tape[x_metric] = remove_nones(loaded_recorder.tape[x_metric])
                                loaded_recorder.tape[y_metric] = remove_nones(loaded_recorder.tape[y_metric])

                            # Plotting

                            try:
                                if x_metric is not None:
                                    plot_curves(current_ax,
                                                ys=[loaded_recorder.tape[y_metric]],
                                                xs=[loaded_recorder.tape[x_metric]],
                                                xlim=x_lim,
                                                ylim=y_lim,
                                                xlabel=x_metric,
                                                title=y_metric)
                                else:
                                    plot_curves(current_ax,
                                                ys=[loaded_recorder.tape[y_metric]],
                                                xlim=x_lim,
                                                ylim=y_lim,
                                                title=y_metric)

                            except Exception as e:
                                logger.debug(f'Plotting error: {e}')

                        except FileNotFoundError:
                            logger.debug('Training recorder not found')
                            current_ax.text(0.2, 0.2, "'train_recorder'\nnot found", transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='red')
                            continue

                plt.tight_layout()
                fig.savefig(str(storage_dir / f'{group_key}_comparative_{y_metric}_over_{x_metric}.png'))
                plt.close(fig)
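
# Illustrative call (hedged sketch): plots_to_make entries are unpacked as
# (x_metric, y_metric, x_lim, y_lim), so a custom list can be passed instead of
# alfred.defaults.DEFAULT_PLOTS_ARRAYS_TO_MAKE. The storage_name, metric names and
# other argument values below are assumptions for the example, not required defaults.
#
#     import logging
#
#     create_plot_arrays(from_file=None,
#                        storage_name="my_storage_dir",   # hypothetical storage_dir name
#                        root_dir=None,
#                        remove_none=True,
#                        logger=logging.getLogger(__name__),
#                        plots_to_make=[("episode", "return", None, None)])
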