def run(args):
    """
    Run fANOVA on every task of a meta-dataset and store the importances as csv.

    The ARFF file at ``args.dataset_path`` is loaded, converted to a data
    frame and integer encoded according to the config space of
    ``args.classifier``. For each distinct value in the ``task_id`` column a
    fANOVA evaluator is fitted, and for every combination of up to
    ``args.comb_size`` hyperparameters both the importance reported by fANOVA
    and the max-min difference of the average marginal are recorded.

    Parameters
    ----------
    args: object
        namespace-like object; the attributes read are ``dataset_path``,
        ``classifier``, ``measure``, ``n_trees``, ``output_directory``,
        ``comb_size`` and ``resolution``

    Raises
    ------
    ValueError
        if ``args.measure`` is not a column of the dataset, if the
        hyperparameters of the config space and of the dataset meta-data
        differ, or if a combination of more than two hyperparameters is
        requested
    """
    root = logging.getLogger()
    root.setLevel(logging.INFO)

    with open(args.dataset_path, 'r') as fp:
        arff_dataset = arff.load(fp)
    config_space = sklearnbot.config_spaces.get_config_space(args.classifier, None)
    data = openmlcontrib.meta.arff_to_dataframe(arff_dataset, config_space)
    data = openmlcontrib.meta.integer_encode_dataframe(data, config_space)
    meta_data = get_dataset_metadata(args.dataset_path)
    if args.measure not in data.columns.values:
        raise ValueError('Could not find measure in dataset: %s' % args.measure)

    # The config space is not modified below, so look these up only once.
    hyperparameter_names = config_space.get_hyperparameter_names()
    config_space_params = set(hyperparameter_names)
    dataset_params = set(meta_data['col_parameters'])
    if config_space_params != dataset_params:
        missing_cs = dataset_params - config_space_params
        missing_ds = config_space_params - dataset_params
        raise ValueError('ConfigSpace and hyperparameters of dataset do not '
                         'align. ConfigSpace misses: %s, dataset misses: %s' % (missing_cs, missing_ds))

    names_array = np.array(hyperparameter_names)
    indices = list(range(len(config_space.get_hyperparameters())))

    # Create the output directory up front: the csv is written even when the
    # dataset contains no tasks at all.
    os.makedirs(args.output_directory, exist_ok=True)

    task_ids = data['task_id'].unique()
    result = []
    for task_number, task_id in enumerate(task_ids, start=1):
        logging.info('Running fanova on task %d (%d/%d)', task_id, task_number, len(task_ids))
        data_task = data[data['task_id'] == task_id]

        evaluator = fanova.fanova.fANOVA(X=data_task[hyperparameter_names].values,
                                         Y=data_task[args.measure].values,
                                         config_space=config_space,
                                         n_trees=args.n_trees)

        vis = Visualizer(evaluator, config_space, args.output_directory, y_label='Predictive Accuracy')
        for comb_size in range(1, args.comb_size + 1):
            for combination in itertools.combinations(indices, comb_size):
                param_names = names_array[np.array(combination)]
                logging.info('-- Calculating marginal for %s', param_names)
                if comb_size > 2:
                    # Reject unsupported sizes before the costly importance computation.
                    raise ValueError('No support yet for higher dimensions than 2. Got: %d' % comb_size)
                importance = evaluator.quantify_importance(combination)[combination]
                if comb_size == 1:
                    visualizer_res = vis.generate_marginal(combination[0], args.resolution)
                    # visualizer returns mean, std and potentially grid
                    avg_marginal = np.array(visualizer_res[0])
                else:
                    visualizer_res = vis.generate_pairwise_marginal(combination, args.resolution)
                    # visualizer returns grid names and values
                    avg_marginal = np.array(visualizer_res[1])
                difference_max_min = avg_marginal.max() - avg_marginal.min()

                result.append({
                    'task_id': task_id,
                    'hyperparameter': ' / '.join(param_names),
                    'n_hyperparameters': len(param_names),
                    'importance_variance': importance['individual importance'],
                    'importance_max_min': difference_max_min,
                })

    df_result = pd.DataFrame(result)
    result_path = os.path.join(args.output_directory, 'fanova_%s_depth_%d.csv' % (args.classifier, args.comb_size))
    df_result.to_csv(result_path)
    logging.info('resulting csv: %s', result_path)
    logging.info('To plot, run <openml_pimp_root>/examples/plot/plot_fanova.py')
def plot_bokeh(self, plot_name=None, show_plot=False, plot_pairwise="most_important"):
    """
    Plot single and pairwise margins in bokeh-plot. Single margins are always plotted (not expensive), pairwise can
    be configured by argument.

    Parameters
    ----------
    plot_name: str
        path where to store the plot, None to not save it
    show_plot: bool
        whether or not to open plot in standard browser
    plot_pairwise: str
        choose from ["none", "most_important", "all"] where "most_important" relies on the fanova module to decide
        what that means

    Returns
    -------
    layout: bokeh.models.Column
        bokeh plot (can be used in a notebook or processed further, e.g. with bokeh's components)

    Raises
    ------
    ValueError
        if plot_pairwise is not one of the supported options
    """
    # Fail fast: reject an unsupported option before any marginal is computed.
    if plot_pairwise not in ("all", "most_important", "none"):
        raise ValueError("{} not a valid set of pairwise plots to generate...".format(plot_pairwise))

    vis = Visualizer(self.evaluator, self.cs, directory='.', y_label=self._get_label(self.scenario.run_obj))

    # Values shared by all plots, computed once instead of per iteration.
    is_runtime = self.scenario.run_obj == "runtime"
    y_label = "runtime [sec]" if is_runtime else "cost"
    if self.incumbents is None:
        incumbents = []
    elif isinstance(self.incumbents, list):
        incumbents = self.incumbents.copy()
    else:
        incumbents = [self.incumbents]

    ####################
    # Single marginals #
    ####################
    plots_single = []
    params = list(self.evaluated_parameter_importance.keys())
    pbar = tqdm(params, ascii=True, disable=not self.verbose)
    for param_name in pbar:
        # Try and except pairwise importances that are also saved in evaluated_parameter_importance...
        try:
            param = self.cs.get_hyperparameter(param_name)
        except KeyError as err:
            self.logger.debug(err, exc_info=True)
            continue

        pbar.set_description('Plotting fANOVA (in bokeh) for %s' % param_name)

        # Incumbent values for this parameter (skipping unset/inactive ones) are marked in the plot
        values = [c[param_name] for c in incumbents if param_name in c and c[param_name] is not None]

        if isinstance(param, (CategoricalHyperparameter, Constant)):
            # NOTE(review): for a Constant the labels are the string representation of the hyperparameter, so
            # labels.index(val) below is a substring search - confirm this is what bokeh_boxplot expects.
            labels = param.choices if isinstance(param, CategoricalHyperparameter) else str(param)
            mean, std = vis.generate_marginal(param_name)
            inc_indices = [labels.index(val) for val in values]

            p = bokeh_boxplot(labels, mean, std,
                              x_label=param.name,
                              y_label=y_label,
                              runtime=is_runtime,
                              inc_indices=inc_indices)
        else:
            mean, std, grid = vis.generate_marginal(param_name, 100)
            mean, std = np.asarray(mean), np.asarray(std)
            # A grid with non-constant spacing is also drawn on a log scale
            log_scale = param.log or (np.diff(grid).std() > 0.000001)
            # Mark each incumbent at the grid point closest to its value
            inc_indices = [(np.abs(np.asarray(grid) - val)).argmin() for val in values]

            p = bokeh_line_uncertainty(grid, mean, std, log_scale,
                                       x_label=param.name,
                                       y_label=y_label,
                                       inc_indices=inc_indices)

        plots_single.append(Panel(child=Row(p), title=param_name))

    ######################
    # Pairwise marginals #
    ######################
    if plot_pairwise == "all":
        combis = list(it.combinations(self.cs.get_hyperparameters(), 2))
    elif plot_pairwise == "most_important":
        most_important_ones = list(self.evaluated_parameter_importance.keys())[
                              :min(self.num_single, self.n_most_imp_pairs)]
        most_important_pairwise_marginals = vis.fanova.get_most_important_pairwise_marginals(
                                            params=most_important_ones)
        combis = [(self.cs.get_hyperparameter(name1), self.cs.get_hyperparameter(name2))
                  for name1, name2 in most_important_pairwise_marginals]
    else:
        # plot_pairwise == "none" (the option was validated on entry)
        combis = []

    plots_pairwise = []
    pbar = tqdm(combis, ascii=True, disable=not self.verbose)
    for p1, p2 in pbar:
        pbar.set_description('Plotting pairwise fANOVA (in bokeh) for %s & %s' % (p1.name, p2.name))
        first_is_cat = isinstance(p1, CategoricalHyperparameter)
        second_is_cat = isinstance(p2, CategoricalHyperparameter)

        # The marginal is requested the same way in all cases; working with pandas makes life easier
        axes, zz = vis.generate_pairwise_marginal((p1.name, p2.name), 20)
        data = pd.DataFrame(zz, index=axes[0], columns=axes[1])
        data.index.name, data.columns.name = p1.name, p2.name
        # Make categoricals strings
        if first_is_cat:
            data.index = data.index.astype(str)
        if second_is_cat:
            data.columns = data.columns.astype(str)

        # There are essentially three cases / different plots:
        if first_is_cat and second_is_cat:
            # First case: both categorical -> heatmap
            p = bokeh_heatmap_cat(data, p1.name, p2.name)
        elif first_is_cat or second_is_cat:
            # Second case: only one of them is categorical -> create multi-line-plot
            # We want categorical values be represented by columns:
            if second_is_cat:
                numeric, categorical = p1, p2
            else:
                numeric, categorical = p2, p1
                data = data.transpose()
            # Find y_min and y_max BEFORE resetting index (otherwise index max obscure the query)
            y_limits = (data.min().min(), data.max().max())
            x_limits = (numeric.lower, numeric.upper)
            # We want the index as a column (for plotting on x-axis)
            data = data.reset_index()
            p = bokeh_multiline(data, x_limits, y_limits,
                                numeric.name,
                                categorical.choices,
                                y_label=y_label,
                                z_label=categorical.name,
                                )
        else:
            # Third case: both continuous -> numerical heatmap
            p = bokeh_heatmap_num(data, p1.name, p2.name, p1.log, p2.log)

        plots_pairwise.append(Panel(child=Row(p), title=" & ".join([p1.name, p2.name])))

    # Putting both together
    tabs_single = Tabs(tabs=list(plots_single))
    if plots_pairwise:
        tabs_pairwise = Tabs(tabs=list(plots_pairwise))
        layout = Column(tabs_single, tabs_pairwise)
    else:
        layout = Column(tabs_single)

    # Save and show...
    save_and_show(plot_name, show_plot, layout)

    return layout