def create_plots(self, train_recorder, save_dir):
    fig, axes = create_fig((3, 3))

    plot_curves(axes[0, 0],
                xs=[remove_nones(train_recorder.tape['update_i'])],
                ys=[remove_nones(train_recorder.tape['reconstruction_loss'])],
                xlabel='update_i', ylabel='reconstruction_loss')

    plot_curves(axes[0, 1],
                xs=[remove_nones(train_recorder.tape['update_i'])],
                ys=[remove_nones(train_recorder.tape['prior_loss'])],
                xlabel='update_i', ylabel='prior_loss')

    plot_curves(axes[0, 2],
                xs=[remove_nones(train_recorder.tape['update_i'])],
                ys=[remove_nones(train_recorder.tape['total_loss'])],
                xlabel='update_i', ylabel='total_loss')

    plot_curves(axes[1, 0],
                xs=[remove_nones(train_recorder.tape['update_i'])],
                ys=[remove_nones(train_recorder.tape['lr'])],
                xlabel='update_i', ylabel='lr')

    plt.tight_layout()
    fig.savefig(str(save_dir / 'graphs.png'))
    plt.close(fig)
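# The helpers used above come from alfred's plotting utilities; the sketch below is only an
# assumption about their behaviour, kept as a comment so it does not shadow the real imports.
# `remove_nones` is assumed to drop the None placeholders the Recorder keeps for steps where a
# metric was not logged, and `create_fig` is assumed to be a thin wrapper around plt.subplots.
#
# def remove_nones(values):
#     """Drop None entries so the remaining points can be plotted as a continuous curve."""
#     return [v for v in values if v is not None]
#
# def create_fig(axes_shape):
#     """Create an (n_rows, n_cols) grid of subplots and return (fig, axes)."""
#     return plt.subplots(*axes_shape, figsize=(8 * axes_shape[1], 4 * axes_shape[0]))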
def _compute_seed_scores(storage_dir, performance_metric, performance_aggregation, group_key, bar_key,
                         re_run_if_exists, save_dir, logger, root_dir, n_eval_runs):
    if (storage_dir / save_dir / f"{save_dir}_seed_scores.pkl").exists() and not re_run_if_exists:
        logger.info(f" SKIPPING {storage_dir} - {save_dir}_seed_scores.pkl already exists")
        return
    else:
        logger.info(f"Benchmarking {storage_dir}...")

    assert group_key in ['task_name', 'storage_name', 'experiment_num', 'alg_name']
    assert bar_key in ['task_name', 'storage_name', 'experiment_num', 'alg_name']

    # Initialize container

    scores = OrderedDict()

    # Get all experiment directories

    all_experiments = DirectoryTree.get_all_experiments(storage_dir=storage_dir)

    for experiment_dir in all_experiments:

        # For that experiment, get all seed directories

        experiment_seeds = DirectoryTree.get_all_seeds(experiment_dir=experiment_dir)

        # Initialize container

        all_seeds_scores = []

        for i, seed_dir in enumerate(experiment_seeds):

            # Prints which seed directory is being treated

            logger.debug(f"{seed_dir}")

            # Loads training config

            config_dict = load_dict_from_json(str(seed_dir / "config.json"))

            # Selects how data will be identified

            keys = {
                "task_name": config_dict["task_name"],
                "storage_name": seed_dir.parents[1].name,
                "alg_name": config_dict["alg_name"],
                "experiment_num": seed_dir.parents[0].name.strip('experiment')
            }

            outer_key = keys[bar_key]
            inner_key = keys[group_key]

            # Evaluation phase

            if performance_metric == 'evaluation_runs':

                assert n_eval_runs is not None

                try:
                    from evaluate import evaluate, get_evaluation_args
                except ImportError as e:
                    raise ImportError(
                        f"{e}\nTo evaluate models based on --performance_metric='evaluation_runs', "
                        f"alfred.benchmark assumes that the working directory contains "
                        f"a file called evaluate.py defining two functions:"
                        f"\n\t1. a function evaluate() that returns a score for each evaluation run"
                        f"\n\t2. a function get_evaluation_args() that returns a Namespace of arguments for evaluate()")

                # Sets config for evaluation phase

                eval_config = get_evaluation_args(overwritten_args="")
                eval_config.storage_name = seed_dir.parents[1].name
                eval_config.experiment_num = int(seed_dir.parents[0].name.strip("experiment"))
                eval_config.seed_num = int(seed_dir.name.strip("seed"))
                eval_config.render = False
                eval_config.n_episodes = n_eval_runs
                eval_config.root_dir = root_dir

                # Evaluates agent and stores the return

                performance_data = evaluate(eval_config)

            else:

                # Loads training data

                loaded_recorder = Recorder.init_from_pickle_file(
                    filename=str(seed_dir / 'recorders' / 'train_recorder.pkl'))

                performance_data = remove_nones(loaded_recorder.tape[performance_metric])

            # Aggregation phase

            if performance_aggregation == 'min':
                score = np.min(performance_data)

            elif performance_aggregation == 'max':
                score = np.max(performance_data)

            elif performance_aggregation == 'avg':
                score = np.mean(performance_data)

            elif performance_aggregation == 'last':
                score = performance_data[-1]

            elif performance_aggregation == 'mean_on_last_20_percents':
                eighty_percent_index = int(0.8 * len(performance_data))
                score = np.mean(performance_data[eighty_percent_index:])

            else:
                raise NotImplementedError

            all_seeds_scores.append(score)

        if outer_key not in scores.keys():
            scores[outer_key] = OrderedDict()

        scores[outer_key][inner_key] = np.stack(all_seeds_scores)

    os.makedirs(storage_dir / save_dir, exist_ok=True)

    with open(storage_dir / save_dir / f"{save_dir}_seed_scores.pkl", "wb") as f:
        pickle.dump(scores, f)

    scores_info = {'n_eval_runs': n_eval_runs,
                   'performance_metric': performance_metric,
                   'performance_aggregation': performance_aggregation}

    save_dict_to_json(scores_info,
                      filename=str(storage_dir / save_dir / f"{save_dir}_seed_scores_info.json"))
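# A minimal sketch of the evaluate.py contract described by the ImportError above. The argument
# names mirror the attributes that _compute_seed_scores sets on eval_config; the rollout logic
# (run_one_episode) is a hypothetical placeholder, not part of alfred.
#
# # evaluate.py
# from argparse import ArgumentParser
#
# def get_evaluation_args(overwritten_args=None):
#     parser = ArgumentParser()
#     parser.add_argument('--storage_name', type=str, default='')
#     parser.add_argument('--experiment_num', type=int, default=1)
#     parser.add_argument('--seed_num', type=int, default=1)
#     parser.add_argument('--n_episodes', type=int, default=10)
#     parser.add_argument('--render', action='store_true')
#     parser.add_argument('--root_dir', type=str, default=None)
#     return parser.parse_args(overwritten_args.split() if overwritten_args else [])
#
# def evaluate(config):
#     # Must return one score per evaluation run (e.g. a list of episode returns).
#     return [run_one_episode(config) for _ in range(config.n_episodes)]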
def create_plot_arrays(from_file,
                       storage_name,
                       root_dir,
                       remove_none,
                       logger,
                       plots_to_make=alfred.defaults.DEFAULT_PLOTS_ARRAYS_TO_MAKE):
    """
    Creates and saves a comparative figure containing a plot of total reward for each different experiment
    :param storage_name: name of the storage directory containing the experiments to compare
    :param plots_to_make: list of (x_metric, y_metric, x_lim, y_lim) tuples indicating which comparative plots to make
    :return: None
    """

    # Select storage_dirs to run over

    storage_dirs = select_storage_dirs(from_file, storage_name, root_dir)

    for storage_dir in storage_dirs:

        # Get all experiment directories and sort them numerically

        sorted_experiments = DirectoryTree.get_all_experiments(storage_dir)

        all_seeds_dir = []
        for experiment in sorted_experiments:
            all_seeds_dir = all_seeds_dir + DirectoryTree.get_all_seeds(experiment)

        # Determines what type of search was done

        if (storage_dir / 'GRID_SEARCH').exists():
            search_type = 'grid'
        elif (storage_dir / 'RANDOM_SEARCH').exists():
            search_type = 'random'
        else:
            search_type = 'unknown'

        # Determines rows and columns of subplots

        if search_type == 'grid':
            variations = load_dict_from_json(filename=str(storage_dir / 'variations.json'))

            # experiment_groups account for the fact that all the experiment_dir in a storage_dir may have been
            # created through several runs of prepare_schedule.py, and therefore, many "groups" of experiments
            # may have been created
            experiment_groups = {key: {} for key in variations.keys()}
            for group_key, properties in experiment_groups.items():
                properties['variations'] = variations[group_key]
                properties['variations_lengths'] = {k: len(properties['variations'][k])
                                                    for k in properties['variations'].keys()}

                # Deleting alg_name and task_name from variations (because they will not be contained in same storage_dir)

                hyperparam_variations_lengths = deepcopy(properties['variations_lengths'])
                del hyperparam_variations_lengths['alg_name']
                del hyperparam_variations_lengths['task_name']

                i_max = sorted(hyperparam_variations_lengths.values())[-1]
                j_max = int(np.prod(sorted(hyperparam_variations_lengths.values())[:-1]))

                if i_max < 4 and j_max == 1:
                    # If only one hyperparameter was varied over, we order plots on a line
                    j_max = i_max
                    i_max = 1
                    ax_array_dim = 1

                elif i_max >= 4 and j_max == 1:
                    # ... unless there are 4 or more variations, then we put them in a square-ish fashion
                    j_max = int(np.sqrt(i_max))
                    i_max = int(np.ceil(float(i_max) / float(j_max)))
                    ax_array_dim = 2

                else:
                    ax_array_dim = 2

                properties['ax_array_shape'] = (i_max, j_max)
                properties['ax_array_dim'] = ax_array_dim

        else:
            experiment_groups = {"all": {}}
            for group_key, properties in experiment_groups.items():
                i_max = len(sorted_experiments)  # each experiment is on a different row
                j_max = len(all_seeds_dir) // i_max  # each seed is on a different column

                if i_max == 1:
                    ax_array_dim = 1
                else:
                    ax_array_dim = 2

                properties['ax_array_shape'] = (i_max, j_max)
                properties['ax_array_dim'] = ax_array_dim

        for group_key, properties in experiment_groups.items():
            logger.debug(f"\n===========================\nPLOTS FOR EXPERIMENT GROUP: {group_key}")
            i_max, j_max = properties['ax_array_shape']
            ax_array_dim = properties['ax_array_dim']

            first_exp = group_key.split('-')[0] if group_key != "all" else 0
            if first_exp != 0:
                for seed_idx, seed_dir in enumerate(all_seeds_dir):
                    if seed_dir.parent.stem.strip('experiment') == first_exp:
                        first_seed_idx = seed_idx
                        break
            else:
                first_seed_idx = 0

            for plot_to_make in plots_to_make:
                x_metric, y_metric, x_lim, y_lim = plot_to_make
                logger.debug(f'\n{y_metric} as a function of {x_metric}:')

                # Creates the subplots

                fig, ax_array = plt.subplots(i_max, j_max, figsize=(10 * j_max, 6 * i_max))

                for i in range(i_max):
                    for j in range(j_max):

                        if ax_array_dim == 1 and i_max == 1 and j_max == 1:
                            current_ax = ax_array
                        elif ax_array_dim == 1 and (i_max > 1 or j_max > 1):
                            current_ax = ax_array[j]
                        elif ax_array_dim == 2:
                            current_ax = ax_array[i, j]
                        else:
                            raise Exception('ax_array should not have more than two dimensions')

                        try:
                            seed_dir = all_seeds_dir[first_seed_idx + (i * j_max + j)]

                            if group_key != 'all' \
                                    and (int(str(seed_dir.parent).split('experiment')[1]) < int(group_key.split('-')[0])
                                         or int(str(seed_dir.parent).split('experiment')[1]) > int(group_key.split('-')[1])):
                                raise IndexError
                            logger.debug(str(seed_dir))

                        except IndexError:
                            logger.debug(f'experiment{i * j_max + j} does not exist')
                            current_ax.text(0.2, 0.2, "no experiment\n found",
                                            transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='red')
                            continue

                        logger.debug(seed_dir)

                        # Writes unique hyperparameters on plot

                        config = load_config_from_json(filename=str(seed_dir / 'config.json'))
                        config_unique_dict = load_dict_from_json(filename=str(seed_dir / 'config_unique.json'))
                        validate_config_unique(config, config_unique_dict)

                        if search_type == 'grid':
                            sorted_keys = sorted(config_unique_dict.keys(),
                                                 key=lambda item: (properties['variations_lengths'][item], item),
                                                 reverse=True)
                        else:
                            sorted_keys = config_unique_dict

                        info_str = f'{seed_dir.parent.stem}\n' + '\n'.join(
                            [f'{k} = {config_unique_dict[k]}' for k in sorted_keys])

                        bbox_props = dict(facecolor='gray', alpha=0.1)
                        current_ax.text(0.05, 0.95, info_str,
                                        transform=current_ax.transAxes,
                                        fontsize=12, verticalalignment='top', bbox=bbox_props)

                        # Skip cases of UNHATCHED or CRASHED experiments

                        if (seed_dir / 'UNHATCHED').exists():
                            logger.debug('UNHATCHED')
                            current_ax.text(0.2, 0.2, "UNHATCHED",
                                            transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='blue')
                            continue

                        if (seed_dir / 'CRASH.txt').exists():
                            logger.debug('CRASHED')
                            current_ax.text(0.2, 0.2, "CRASHED",
                                            transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='red')
                            continue

                        try:
                            # Loading the recorder

                            loaded_recorder = Recorder.init_from_pickle_file(
                                filename=str(seed_dir / 'recorders' / 'train_recorder.pkl'))

                            # Checking if provided metrics are present in the recorder

                            if y_metric not in loaded_recorder.tape.keys():
                                logger.debug(f"'{y_metric}' was not recorded in train_recorder.")
                                current_ax.text(0.2, 0.2, "ABSENT METRIC",
                                                transform=current_ax.transAxes,
                                                fontsize=24, fontweight='bold', color='red')
                                continue

                            if x_metric is not None and x_metric not in loaded_recorder.tape.keys():
                                logger.debug(f"'{x_metric}' was not recorded in train_recorder.")
                                current_ax.text(0.2, 0.2, "ABSENT METRIC",
                                                transform=current_ax.transAxes,
                                                fontsize=24, fontweight='bold', color='red')
                                continue

                            # Removing None entries

                            if remove_none:
                                loaded_recorder.tape[x_metric] = remove_nones(loaded_recorder.tape[x_metric])
                                loaded_recorder.tape[y_metric] = remove_nones(loaded_recorder.tape[y_metric])

                            # Plotting

                            try:
                                if x_metric is not None:
                                    plot_curves(current_ax,
                                                ys=[loaded_recorder.tape[y_metric]],
                                                xs=[loaded_recorder.tape[x_metric]],
                                                xlim=x_lim, ylim=y_lim,
                                                xlabel=x_metric, title=y_metric)
                                else:
                                    plot_curves(current_ax,
                                                ys=[loaded_recorder.tape[y_metric]],
                                                xlim=x_lim, ylim=y_lim,
                                                title=y_metric)

                            except Exception as e:
                                logger.debug(f'Plotting error: {e}')

                        except FileNotFoundError:
                            logger.debug('Training recorder not found')
                            current_ax.text(0.2, 0.2, "'train_recorder'\nnot found",
                                            transform=current_ax.transAxes,
                                            fontsize=24, fontweight='bold', color='red')
                            continue

                plt.tight_layout()
                fig.savefig(str(storage_dir / f'{group_key}_comparative_{y_metric}_over_{x_metric}.png'))
                plt.close(fig)
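# A minimal usage sketch for create_plot_arrays, kept as a comment. The metric names below are
# illustrative assumptions and must match what train_recorder actually logged; each entry of
# plots_to_make is unpacked above as (x_metric, y_metric, x_lim, y_lim), and None limits leave
# matplotlib's autoscaling untouched.
#
# import logging
#
# example_plots_to_make = [
#     ('update_i', 'total_loss', None, None),
#     ('update_i', 'lr', None, None),
# ]
# create_plot_arrays(from_file=None, storage_name='my_storage_dir', root_dir=None,
#                    remove_none=True, logger=logging.getLogger(__name__),
#                    plots_to_make=example_plots_to_make)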
def _make_benchmark_learning_figure(x_data, y_data, x_metric, y_metric, y_error_bars, storage_dirs, save_dir,
                                    logger, n_labels=np.inf, visuals_file=None, additional_curves_file=None):
    # Initialize containers

    y_data_means = OrderedDict()
    y_data_err_up = OrderedDict()
    y_data_err_down = OrderedDict()
    long_labels = OrderedDict()
    titles = OrderedDict()
    x_axis_titles = OrderedDict()
    y_axis_titles = OrderedDict()
    labels = OrderedDict()
    colors = OrderedDict()
    markers = OrderedDict()

    for outer_key in y_data:
        y_data_means[outer_key] = OrderedDict()
        y_data_err_up[outer_key] = OrderedDict()
        y_data_err_down[outer_key] = OrderedDict()

    # Initialize figure

    n_graphs = len(y_data.keys())

    if n_graphs == 3:
        axes_shape = (1, 3)
    elif n_graphs > 1:
        i_max = int(np.ceil(np.sqrt(len(y_data.keys()))))
        axes_shape = (int(np.ceil(len(y_data.keys()) / i_max)), i_max)
    else:
        axes_shape = (1, 1)

    # Creates figure

    gs = gridspec.GridSpec(*axes_shape)
    fig = plt.figure(figsize=(8 * axes_shape[1], 4 * axes_shape[0]))

    # Remove nones

    for data in [x_data, y_data]:
        for outer_key in data.keys():
            for inner_key in data[outer_key].keys():
                for seed_i, seed_data in enumerate(data[outer_key][inner_key]):
                    data[outer_key][inner_key][seed_i] = remove_nones(seed_data)

    # Compute means and stds for all inner_key curves from raw data

    for i, outer_key in enumerate(y_data.keys()):
        for inner_key in y_data[outer_key].keys():
            if y_error_bars == "stderr":
                x_data[outer_key][inner_key] = x_data[outer_key][inner_key][0]  # assumes all x_data are the same
                y_data_means[outer_key][inner_key] = np.stack(y_data[outer_key][inner_key], axis=-1).mean(-1)
                y_data_err_up[outer_key][inner_key] = np.stack(y_data[outer_key][inner_key], axis=-1).std(-1) \
                                                      / len(y_data[outer_key][inner_key]) ** 0.5
                y_data_err_down = y_data_err_up

            elif y_error_bars == "bootstrapped_CI":
                x_data[outer_key][inner_key] = x_data[outer_key][inner_key][0]  # assumes all x_data are the same
                y_data_samples = np.stack(y_data[outer_key][inner_key],
                                          axis=-1)  # dim=0 is across time (n_time_steps, n_samples)
                mean, err_up, err_down = get_95_confidence_interval_of_sequence(list_of_samples=y_data_samples,
                                                                                method=y_error_bars)
                y_data_means[outer_key][inner_key] = mean
                y_data_err_up[outer_key][inner_key] = err_up
                y_data_err_down[outer_key][inner_key] = err_down

            elif y_error_bars == "None":
                y_data_means[outer_key][inner_key] = y_data[outer_key][inner_key]
                y_data_err_up[outer_key][inner_key] = None
                y_data_err_down[outer_key][inner_key] = None

                # Transpose list of lists (necessary for matplotlib to properly plot all curves in one call)
                # see: https://stackoverflow.com/questions/6473679/transpose-list-of-lists
                y_data_means[outer_key][inner_key] = list(map(list, zip(*y_data_means[outer_key][inner_key])))
                x_data[outer_key][inner_key] = list(map(list, zip(*x_data[outer_key][inner_key])))

            else:
                raise NotImplementedError

        long_labels[outer_key] = list(y_data_means[outer_key].keys())

        # Limits the number of labels to be displayed (only displays labels of n_labels best experiments)

        if n_labels < np.inf:
            mean_over_entire_curves = np.array([array.mean() for array in y_data_means[outer_key].values()])
            n_max_idxs = (-mean_over_entire_curves).argsort()[:n_labels]

            for k in range(len(long_labels[outer_key])):
                if k in n_max_idxs:
                    continue
                else:
                    long_labels[outer_key][k] = None

        # Selects right ax object

        if axes_shape == (1, 1):
            current_ax = fig.add_subplot(gs[0, 0])
        elif any(np.array(axes_shape) == 1):
            current_ax = fig.add_subplot(gs[0, i])
        else:
            current_ax = fig.add_subplot(gs[i // axes_shape[1], i % axes_shape[1]])

        # Collect algorithm names

        if all([type(long_label) is pathlib.PosixPath for long_label in long_labels[outer_key]]):
            algs = []
            for path in long_labels[outer_key]:
                _, _, alg, _, _ = DirectoryTree.extract_info_from_storage_name(path.name)
                algs.append(alg)

        # Loads visuals dictionaries

        if visuals_file is not None:
            visuals = load_dict_from_json(visuals_file)
        else:
            visuals = None

        # Loads additional curves file

        if additional_curves_file is not None:
            additional_curves = load_dict_from_json(additional_curves_file)
        else:
            additional_curves = None

        # Sets visuals

        if type(visuals) is dict and 'titles_dict' in visuals.keys():
            titles[outer_key] = visuals['titles_dict'][outer_key]
        else:
            titles[outer_key] = outer_key

        if type(visuals) is dict and 'axis_titles_dict' in visuals.keys():
            x_axis_titles[outer_key] = visuals['axis_titles_dict'][x_metric]
            y_axis_titles[outer_key] = visuals['axis_titles_dict'][y_metric]
        else:
            x_axis_titles[outer_key] = x_metric
            y_axis_titles[outer_key] = y_metric

        if type(visuals) is dict and 'labels_dict' in visuals.keys():
            labels[outer_key] = [visuals['labels_dict'][inner_key] for inner_key in y_data_means[outer_key].keys()]
        else:
            labels[outer_key] = long_labels[outer_key]

        if type(visuals) is dict and 'colors_dict' in visuals.keys():
            colors[outer_key] = [visuals['colors_dict'][inner_key] for inner_key in y_data_means[outer_key].keys()]
        else:
            colors[outer_key] = [None for _ in long_labels[outer_key]]

        if type(visuals) is dict and 'markers_dict' in visuals.keys():
            markers[outer_key] = [visuals['markers_dict'][inner_key] for inner_key in y_data_means[outer_key].keys()]
        else:
            markers[outer_key] = [None for _ in long_labels[outer_key]]

        logger.info(f"Graph for {outer_key}:\n\tlabels={labels}\n\tcolors={colors}\n\tmarkers={markers}")

        if additional_curves_file is not None:
            hlines = additional_curves['hlines'][outer_key]
            n_lines = len(hlines)
        else:
            hlines = None
            n_lines = 0

        # Plots the curves

        plot_curves(current_ax,
                    xs=list(x_data[outer_key].values()),
                    ys=list(y_data_means[outer_key].values()),
                    fill_up=list(y_data_err_up[outer_key].values()) if y_error_bars != "None" else None,
                    fill_down=list(y_data_err_down[outer_key].values()) if y_error_bars != "None" else None,
                    labels=labels[outer_key],
                    colors=colors[outer_key],
                    markers=markers[outer_key],
                    xlabel=x_axis_titles[outer_key],
                    ylabel=y_axis_titles[outer_key] if i == 0 else "",
                    title=titles[outer_key].upper(),
                    add_legend=True if i == (len(list(y_data.keys())) - 1) else False,
                    legend_outside=True,
                    legend_loc="upper right",
                    legend_pos=(0.95, -0.2),
                    legend_n_columns=len(list(y_data_means[outer_key].values())) + n_lines,
                    hlines=hlines,
                    tick_font_size=22,
                    axis_font_size=26,
                    legend_font_size=26,
                    title_font_size=28)

    plt.tight_layout()

    for storage_dir in storage_dirs:
        os.makedirs(storage_dir / save_dir, exist_ok=True)
        fig.savefig(storage_dir / save_dir / f'{save_dir}_learning.pdf', bbox_inches='tight')

    plt.close(fig)
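# A minimal sketch of the optional JSON files consumed above, kept as a comment. Only the
# top-level keys ('titles_dict', 'axis_titles_dict', 'labels_dict', 'colors_dict',
# 'markers_dict', 'hlines') come from the code; the example entries and the shape of each
# hlines value are illustrative assumptions. Titles are indexed by outer_key (the bar_key
# value), labels/colors/markers by inner_key (the group_key value), axis titles by metric name.
#
# # visuals.json
# # {
# #   "titles_dict": {"my_task": "My Task"},
# #   "axis_titles_dict": {"episode": "Episode", "eval_return": "Average return"},
# #   "labels_dict": {"my_alg": "My Algorithm"},
# #   "colors_dict": {"my_alg": "#1f77b4"},
# #   "markers_dict": {"my_alg": "o"}
# # }
#
# # additional_curves.json
# # {
# #   "hlines": {"my_task": {"random policy": 0.0}}
# # }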