def test_inst_no_feat(self):
    '''Test that scenarios are handled correctly if no features are specified.'''
    scen = Scenario(self.scen_fn,
                    cmd_options={'run_obj': 'quality',
                                 'train_insts': self.train_insts,
                                 'test_insts': self.test_insts})
    self.assertTrue(scen.feature_array is None)
    self.assertEqual(len(scen.feature_dict), 0)
    scen.instance_specific = self.inst_specs
    validator = Validator(scen, self.trajectory, self.rng)
    # Add a few runs and check whether they are correctly processed
    old_configs = [entry["incumbent"] for entry in self.trajectory]
    old_rh = RunHistory()
    for config in old_configs[:int(len(old_configs) / 2)]:
        old_rh.add(config, 1, 1, StatusType.SUCCESS, instance_id='0', seed=127)
    rh = validator.validate_epm('all', 'train+test', 1, old_rh)
    self.assertEqual(len(old_rh.get_all_configs()), 4)
    self.assertEqual(len(rh.get_all_configs()), 10)
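# The assertions above count unique configurations, not runs. A minimal
# standalone sketch of that behavior (import paths follow the newer SMAC API
# used above and may differ between SMAC versions):
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter
from smac.runhistory.runhistory import RunHistory
from smac.tae import StatusType

cs = ConfigurationSpace()
cs.add_hyperparameter(UniformFloatHyperparameter('x', 0.0, 1.0))
config = cs.sample_configuration()

rh = RunHistory()
for seed in (1, 2, 3):
    # three runs of the same configuration on the same instance ...
    rh.add(config, cost=1.0, time=0.5, status=StatusType.SUCCESS,
           instance_id='0', seed=seed)
# ... still yield a single entry in get_all_configs()
assert len(rh.get_all_configs()) == 1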
def _get_initial_points(
    self,
    num_points: int,
    runhistory: RunHistory,
    additional_start_points: Optional[List[Tuple[float, Configuration]]],
) -> List[Configuration]:
    if runhistory.empty():
        init_points = self.config_space.sample_configuration(size=num_points)
    else:
        # initiate local search
        configs_previous_runs = runhistory.get_all_configs()

        # configurations with the highest previous EI
        configs_previous_runs_sorted = self._sort_configs_by_acq_value(configs_previous_runs)
        configs_previous_runs_sorted = [conf[1] for conf in configs_previous_runs_sorted[:num_points]]

        # configurations with the lowest predictive cost, check for None to make unit tests work
        if self.acquisition_function.model is not None:
            conf_array = convert_configurations_to_array(configs_previous_runs)
            costs = self.acquisition_function.model.predict_marginalized_over_instances(conf_array)[0]
            # From here:
            # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
            random = self.rng.rand(len(costs))
            # Last column is primary sort key!
            indices = np.lexsort((random.flatten(), costs.flatten()))
            # Cannot use zip here because the indices array cannot index the
            # rand_configs list, because the second is a pure python list
            configs_previous_runs_sorted_by_cost = [configs_previous_runs[ind] for ind in indices][:num_points]
        else:
            configs_previous_runs_sorted_by_cost = []

        if additional_start_points is not None:
            additional_start_points = [asp[1] for asp in additional_start_points[:num_points]]
        else:
            additional_start_points = []

        init_points = []
        init_points_as_set = set()  # type: Set[Configuration]
        for cand in itertools.chain(
            configs_previous_runs_sorted,
            configs_previous_runs_sorted_by_cost,
            additional_start_points,
        ):
            if cand not in init_points_as_set:
                init_points.append(cand)
                init_points_as_set.add(cand)

    return init_points
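# The np.lexsort call above is the randomized tie-breaking trick from the
# linked stackoverflow answer. A standalone numpy illustration:
import numpy as np

rng = np.random.RandomState(0)
costs = np.array([0.3, 0.1, 0.3, 0.2])
tie_breaker = rng.rand(len(costs))

# lexsort treats the LAST key as the primary sort key, so entries are ordered
# by cost first, and ties (the two 0.3 entries) are ordered randomly.
indices = np.lexsort((tie_breaker, costs))
assert indices[0] == 1 and indices[1] == 3  # costs 0.1 and 0.2 come first
assert set(indices[2:]) == {0, 2}           # the tied 0.3 entries come last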
def _next_challenger(self,
                     challengers: typing.Optional[typing.List[Configuration]],
                     chooser: typing.Optional[EPMChooser],
                     run_history: RunHistory,
                     repeat_configs: bool = True) -> typing.Optional[Configuration]:
    """Returns the next challenger to use in intensification.

    If challengers is None, the optimizer will be used to generate the next challenger.

    Parameters
    ----------
    challengers : typing.List[Configuration]
        promising configurations to evaluate next
    chooser : smac.optimizer.epm_configuration_chooser.EPMChooser
        a sampler that generates next configurations to use for racing
    run_history : smac.runhistory.runhistory.RunHistory
        stores all runs we ran so far
    repeat_configs : bool
        if False, an evaluated configuration will not be generated again

    Returns
    -------
    Configuration
        next challenger to use
    """
    start_time = time.time()

    used_configs = set(run_history.get_all_configs())

    if challengers:
        # iterate over challengers provided
        self.logger.debug("Using challengers provided")
        chall_gen = (c for c in challengers)  # type: _config_to_run_type
    elif chooser:
        # generating challengers on-the-fly if optimizer is given
        self.logger.debug("Generating new challenger from optimizer")
        chall_gen = chooser.choose_next()
    else:
        raise ValueError('No configurations/chooser provided. Cannot generate challenger!')

    self.logger.debug('Time to select next challenger: %.4f' % (time.time() - start_time))

    # select challenger from the generators
    assert chall_gen is not None
    for challenger in chall_gen:
        # repetitions allowed
        if repeat_configs:
            return challenger
        # otherwise, select only a unique challenger
        if challenger not in used_configs:
            return challenger

    self.logger.debug("No valid challenger was generated!")
    return None
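# The final loop above is a "first unseen candidate" filter over a lazy
# generator. Isolated as a toy helper (first_unseen is a hypothetical name,
# not part of SMAC):
def first_unseen(candidates, seen):
    """Return the first candidate not contained in 'seen', or None."""
    for cand in candidates:
        if cand not in seen:
            return cand
    return None

assert first_unseen(iter([1, 1, 2]), seen={1}) == 2
assert first_unseen(iter([1]), seen={1}) is None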
def reduce_runhistory(self, rh: RunHistory, max_configs: int, keep=None):
    """Reduce the number of configs to the desired maximum; by default,
    the configs with the fewest runs are dropped.

    Parameters
    ----------
    rh: RunHistory
        runhistory that is to be reduced
    max_configs: int
        if > -1, reduce runhistory to at most max_configs
    keep: List[Configuration]
        list of configs that should be kept for sure (e.g. default, incumbents)

    Returns
    -------
    rh: RunHistory
        reduced runhistory
    """
    configs = rh.get_all_configs()
    if max_configs <= 0 or max_configs > len(configs):  # keep all
        return rh

    runs = [(c, len(rh.get_runs_for_config(c, only_max_observed_budget=False)))
            for c in configs]
    if not keep:
        keep = []
    # keep the max_configs configurations with the most runs
    runs = sorted(runs, key=lambda x: x[1])[-max_configs:]
    keep = [r[0] for r in runs] + keep
    self.logger.info("Reducing number of configs from %d to %d, dropping the ones "
                     "with the fewest evaluations", len(configs), len(keep))

    new_rh = RunHistory()
    for k, v in list(rh.data.items()):
        c = rh.ids_config[k.config_id]
        if c in keep:
            new_rh.add(config=rh.ids_config[k.config_id],
                       cost=v.cost,
                       time=v.time,
                       status=v.status,
                       instance_id=k.instance_id,
                       seed=k.seed)
    return new_rh
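# The reduction rule above, isolated on toy data (plain tuples instead of the
# RunHistory API): sort ascending by run count, then keep the tail.
runs = [('a', 3), ('b', 10), ('c', 1)]  # (config, number of runs)
max_configs = 2
keep = [c for c, _ in sorted(runs, key=lambda x: x[1])[-max_configs:]]
assert keep == ['a', 'b']  # 'c' has the fewest runs and is dropped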
def test_json_origin(self):
    for origin in ['test_origin', None]:
        rh = RunHistory()
        cs = get_config_space()
        config1 = Configuration(cs, values={'a': 1, 'b': 2}, origin=origin)
        rh.add(config=config1, cost=10, time=20,
               status=StatusType.SUCCESS, instance_id=1, seed=1)

        path = 'test/test_files/test_json_origin.json'
        rh.save_json(path)
        _ = rh.load_json(path, cs)

        self.assertEqual(rh.get_all_configs()[0].origin, origin)
        os.remove(path)
def __init__(
    self,
    original_rh: RunHistory,
    validated_rh: RunHistory,
    validator: Validator,
    scenario: Scenario,
    default: Configuration,
    incumbent: Configuration,
    param_imp: Union[None, Dict[str, float]],
    params: Union[int, List[str]],
    n_configs: int,
    pc_sort_by: str,
    output_dir: str,
    cs: ConfigurationSpace,
    runtime: bool = False,
    max_runs_epm: int = 3000000,
):
    """This function prepares the data from a SMAC-related format (using
    runhistories and parameters) to a more general format (using a dataframe).
    The resulting dataframe is passed to the parallel_coordinates-routine.

    Parameters
    ----------
    original_rh: RunHistory
        runhistory that should contain only runs that were executed during search
    validated_rh: RunHistory
        runhistory that may contain as many runs as possible, also external runs.
        this runhistory will be used to build the EPM
    validator: Validator
        validator to be used to estimate costs for configurations
    scenario: Scenario
        scenario object to take instances from
    default, incumbent: Configuration
        default and incumbent, they will surely be displayed
    param_imp: Union[None, Dict[str, float]]
        if given, maps parameter-names to importance
    params: Union[int, List[str]]
        either directly the parameters to be displayed or the number of
        parameters (will try to choose the most important ones)
    n_configs: int
        number of configs to be plotted
    pc_sort_by: str
        defines the pimp-method by which to choose the plotted parameters
    max_runs_epm: int
        maximum number of runs to train the epm with. this should prevent MemoryErrors
    output_dir: str
        output directory for plots
    cs: ConfigurationSpace
        parameter configuration space to be visualized
    runtime: boolean
        runtime will be on logscale
    """
    self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)
    self.error = None
    self.default = default
    self.param_imp = param_imp
    self.cs = cs

    # Sorting by importance, if possible (choose first executed parameter-importance)
    self.method, self.importance = "", {}
    if pc_sort_by == 'all':
        self.logger.debug("Sorting by average importance")
        self.method = 'average'
        for m, i in self.param_imp.items():
            if i:
                for p, imp in i.items():
                    if p in self.importance:
                        self.importance[p].append(imp)
                    else:
                        self.importance[p] = [imp]
        self.importance = {k: sum(v) / len(v) for k, v in self.importance.items()}
    elif pc_sort_by in self.param_imp:
        self.method, self.importance = pc_sort_by, self.param_imp[pc_sort_by]
    else:
        self.logger.debug("%s not evaluated... choosing at random from: %s",
                          pc_sort_by, str(list(self.param_imp.keys())))
        for m, i in self.param_imp.items():
            if i:
                self.method, self.importance = m, i
                break

    self.hp_names = sorted([hp for hp in self.cs.get_hyperparameter_names()],
                           key=lambda x: self.importance.get(x, 0),
                           reverse=True)
    self.logger.debug("Sorted hp's by method '%s': %s", self.method, str(self.hp_names))

    # To be set
    self.plots = []

    # Define set of configurations (limiting to max and choosing most interesting ones)
    all_configs = original_rh.get_all_configs()
    # max_runs_epm is the maximum total number of runs considered for the epm,
    # which limits the maximum possible number of configs
    max_configs = int(max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs (for parallel coordinates)",
                          len(all_configs), max_configs, max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(c)))[-max_configs:]
    if default not in all_configs:
        all_configs = [default] + all_configs
    if incumbent not in all_configs:
        all_configs.append(incumbent)

    # Get costs for those configurations
    epm_rh = RunHistory(average_cost)
    epm_rh.update(validated_rh)
    if scenario.feature_dict:  # if instances are available
        epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                     runhistory=validated_rh))
    self.config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

    self.params = self.get_params(params)
    self.n_configs = n_configs

    self.pcp = ParallelCoordinatesPlotter(self.config_to_cost, output_dir, cs, runtime)
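# The 'average' sorting branch above, reduced to plain dicts with hypothetical
# importance values keyed by pimp-method name:
param_imp = {'fanova': {'x': 0.6, 'y': 0.2}, 'ablation': {'x': 0.4, 'y': 0.3}}
collected = {}
for method_scores in param_imp.values():
    for p, imp in method_scores.items():
        collected.setdefault(p, []).append(imp)
importance = {p: sum(v) / len(v) for p, v in collected.items()}
hp_names = sorted(importance, key=lambda p: importance[p], reverse=True)
assert importance == {'x': 0.5, 'y': 0.25} and hp_names == ['x', 'y']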
class SMAC4EPMOpimizer(AbstractOptimizer):

    def __init__(self, api_config, config_space, parallel_setting="LS"):
        super(SMAC4EPMOpimizer, self).__init__(api_config)
        self.cs = config_space
        self.num_hps = len(self.cs.get_hyperparameters())
        if parallel_setting not in ["CL_min", "CL_max", "CL_mean", "KB", "LS"]:
            raise ValueError("parallel_setting can only be one of the following: "
                             "CL_min, CL_max, CL_mean, KB, LS")
        self.parallel_setting = parallel_setting

        rng = np.random.RandomState(seed=0)
        scenario = Scenario({
            "run_obj": "quality",  # we optimize quality (alternative to runtime)
            "runcount-limit": 128,
            "cs": self.cs,  # configuration space
            "deterministic": True,
            "limit_resources": False,
        })
        self.stats = Stats(scenario)
        # traj = TrajLogger(output_dir=None, stats=self.stats)
        self.runhistory = RunHistory()

        r2e_def_kwargs = {
            "scenario": scenario,
            "num_params": self.num_hps,
            "success_states": [StatusType.SUCCESS, ],
            "impute_censored_data": False,
            "scale_perc": 5,
        }
        self.random_chooser = ChooserProb(rng=rng, prob=0.0)

        types, bounds = get_types(self.cs, instance_features=None)
        model_kwargs = {
            "configspace": self.cs,
            "types": types,
            "bounds": bounds,
            "seed": rng.randint(MAXINT),
        }
        models = []

        cov_amp = ConstantKernel(
            2.0,
            constant_value_bounds=(np.exp(-10), np.exp(2)),
            prior=LognormalPrior(mean=0.0, sigma=1.0, rng=rng),
        )
        cont_dims = np.array(np.where(np.array(types) == 0)[0], dtype=int)
        cat_dims = np.where(np.array(types) != 0)[0]

        if len(cont_dims) > 0:
            exp_kernel = Matern(
                np.ones([len(cont_dims)]),
                [(np.exp(-6.754111155189306), np.exp(0.0858637988771976))
                 for _ in range(len(cont_dims))],
                nu=2.5,
                operate_on=cont_dims,
            )
        if len(cat_dims) > 0:
            ham_kernel = HammingKernel(
                np.ones([len(cat_dims)]),
                [(np.exp(-6.754111155189306), np.exp(0.0858637988771976))
                 for _ in range(len(cat_dims))],
                operate_on=cat_dims,
            )
        assert len(cont_dims) + len(cat_dims) == len(scenario.cs.get_hyperparameters())

        noise_kernel = WhiteKernel(
            noise_level=1e-8,
            noise_level_bounds=(np.exp(-25), np.exp(2)),
            prior=HorseshoePrior(scale=0.1, rng=rng),
        )

        if len(cont_dims) > 0 and len(cat_dims) > 0:
            # both continuous and categorical dimensions
            kernel = cov_amp * (exp_kernel * ham_kernel) + noise_kernel
        elif len(cont_dims) > 0 and len(cat_dims) == 0:
            # only continuous
            kernel = cov_amp * exp_kernel + noise_kernel
        elif len(cont_dims) == 0 and len(cat_dims) > 0:
            # only categorical
            kernel = cov_amp * ham_kernel + noise_kernel
        else:
            raise ValueError()
        gp_kwargs = {"kernel": kernel}

        rf_kwargs = {}
        rf_kwargs["num_trees"] = model_kwargs.get("num_trees", 10)
        rf_kwargs["do_bootstrapping"] = model_kwargs.get("do_bootstrapping", True)
        rf_kwargs["ratio_features"] = model_kwargs.get("ratio_features", 1.0)
        rf_kwargs["min_samples_split"] = model_kwargs.get("min_samples_split", 2)
        rf_kwargs["min_samples_leaf"] = model_kwargs.get("min_samples_leaf", 1)
        rf_kwargs["log_y"] = model_kwargs.get("log_y", True)

        rf_log = RandomForestWithInstances(**model_kwargs, **rf_kwargs)
        rf_kwargs = copy.deepcopy(rf_kwargs)
        rf_kwargs["log_y"] = False
        rf_no_log = RandomForestWithInstances(**model_kwargs, **rf_kwargs)

        rh2epm_cost = RunHistory2EPM4Cost(**r2e_def_kwargs)
        rh2epm_log_cost = RunHistory2EPM4LogScaledCost(**r2e_def_kwargs)
        rh2epm_copula = RunHistory2EPM4GaussianCopulaCorrect(**r2e_def_kwargs)

        self.combinations = []
        # 2 models * 4 acquisition functions
        acq_funcs = [EI, PI, LogEI, LCB]
        acq_func_instances = []
        # acq_func_maximizer_instances = []
        n_sls_iterations = {
            1: 10, 2: 10, 3: 10, 4: 10, 5: 10, 6: 10, 7: 8, 8: 6,
        }.get(len(self.cs.get_hyperparameters()), 5)

        acq_func_maximizer_kwargs = {
            "config_space": self.cs,
"rng": rng, "max_steps": 5, "n_steps_plateau_walk": 5, "n_sls_iterations": n_sls_iterations, } self.idx_ei = 0 self.num_models = len(models) self.num_acq_funcs = len(acq_funcs) no_transform_gp = GaussianProcess(**copy.deepcopy(model_kwargs), **copy.deepcopy(gp_kwargs)) ei = EI(model=no_transform_gp) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((no_transform_gp, ei, ei_opt, rh2epm_cost)) pi = PI(model=no_transform_gp) acq_func_maximizer_kwargs["acquisition_function"] = pi pi_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((no_transform_gp, pi, pi_opt, rh2epm_cost)) lcb = LCB(model=no_transform_gp) acq_func_maximizer_kwargs["acquisition_function"] = lcb lcb_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((no_transform_gp, lcb, lcb_opt, rh2epm_cost)) gp = GaussianProcess(**copy.deepcopy(model_kwargs), **copy.deepcopy(gp_kwargs)) ei = EI(model=gp) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((gp, ei, ei_opt, rh2epm_copula)) gp = GaussianProcess(**copy.deepcopy(model_kwargs), **copy.deepcopy(gp_kwargs)) ei = LogEI(model=gp) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((gp, ei, ei_opt, rh2epm_log_cost)) ei = EI(model=rf_no_log) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((rf_no_log, ei, ei_opt, rh2epm_cost)) ei = LogEI(model=rf_log) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((rf_log, ei, ei_opt, rh2epm_log_cost)) ei = EI(model=rf_no_log) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((rf_no_log, ei, ei_opt, rh2epm_copula)) self.num_acq_instances = len(acq_func_instances) self.best_observation = np.inf self.next_evaluations = [] def suggest(self, n_suggestions: int = 1) -> typing.List[typing.Dict]: """Get a suggestion from the optimizer. Parameters ---------- n_suggestions : int Desired number of parallel suggestions in the output Returns ------- next_guess : list of dict List of `n_suggestions` suggestions to evaluate the objective function. Each suggestion is a dictionary where each key corresponds to a parameter being optimized. CHANGED: each suggestion is a tuple of suggestion and string info! 
""" all_previous_configs = self.runhistory.get_all_configs() num_points = len(all_previous_configs) # we will save our info info_list = [] if len(self.next_evaluations) < n_suggestions: n_new = n_suggestions - len(self.next_evaluations) # import time order = np.random.permutation(list(range(len(self.combinations)))) optimized_this_iter = set() while len(self.next_evaluations) < n_new: model, acq, acq_opt, rh2epm = self.combinations[order[len( self.next_evaluations)]] # start_time = time.time() info = "" if model.__class__ == RandomForestWithInstances: info += "RF" elif model.__class__ == GaussianProcess: info += "GP" else: raise ValueError(model.__class__.name) info += f" {acq.__class__.__name__}" if rh2epm.__class__ == RunHistory2EPM4Cost: info += " cost" elif rh2epm.__class__ == RunHistory2EPM4LogScaledCost: info += " log_cost" elif rh2epm.__class__ == RunHistory2EPM4GaussianCopulaCorrect: info += " copula" else: raise ValueError(rh2epm.__classs__.name__) # print(model.__class__.__name__, # acq.__class__.__name__, # rh2epm.__class__.__name__) X, y = rh2epm.transform(self.runhistory) # If all are not finite then we return nothing if np.all(~np.isfinite(y)): self.next_evaluations = [] return [] # Safeguard, just in case... if np.any(~np.isfinite(y)): y[~np.isfinite(y)] = np.max(y[np.isfinite(y)]) if (self.parallel_setting != "LS" and len(self.next_evaluations) != 0): x_inc = np.array([ next_config.get_array() for next_config in self.next_evaluations ]) if self.parallel_setting == "CL_min": y_inc = np.min(y) elif self.parallel_setting == "CL_max": y_inc = np.max(y) elif self.parallel_setting == "CL_mean": y_inc = np.mean(y) elif self.parallel_setting == "KB": if model in optimized_this_iter and isinstance( model, GaussianProcess): # Safe some time by re-using the optimized # hyperparameters from before model._train(X, y, do_optimize=False) else: model.train(X, y) optimized_this_iter.add(model) y_inc, var = model.predict_marginalized_over_instances( x_inc) y_inc = y_inc.flatten() else: raise ValueError( "parallel_setting can only be one of the " "following: CL_min, CL_max, CL_mean, KB, LS") if self.parallel_setting in ("CL_min", "CL_max", "CL_mean"): # NOQA y_inc = np.repeat(y_inc, len(self.next_evaluations)).reshape( (-1, 1)) else: y_inc = y_inc.reshape((-1, 1)) X = np.concatenate((X, x_inc)) y = np.concatenate((y, y_inc)) if (isinstance(model, GaussianProcess) and self.parallel_setting == "KB"): # Safe some time by re-using the optimized # hyperparameters from above model._train(X, y, do_optimize=False) else: model.train(X, y) # As the training data for each subsequent model # changes quite drastically (taking the max of all # observations can create really disconnected error # landscapes in the region of the optimum) we have # to re-optimize the hyperparameters here and cannot # add the model to the set of previously # optimized models. 
                        # optimized_this_iter.add(model)
                else:
                    model.train(X, y)
                    optimized_this_iter.add(model)

                predictions = model.predict_marginalized_over_instances(X)[0]
                best_index = np.argmin(predictions)
                best_observation = predictions[best_index]
                x_best_array = X[best_index]

                acq.update(
                    model=model,
                    eta=best_observation,
                    incumbent_array=x_best_array,
                    num_data=num_points,
                    X=X,
                )
                new_config_iterator = acq_opt.maximize(
                    runhistory=self.runhistory,
                    stats=self.stats,
                    num_points=10000,
                    random_configuration_chooser=self.random_chooser,
                )
                accept = False
                for next_config in new_config_iterator:
                    if (next_config in self.next_evaluations
                            or next_config in all_previous_configs):
                        continue
                    else:
                        accept = True
                        break
                if not accept:
                    # If we don't find anything within 100 random
                    # configurations, we re-run a configuration
                    for next_config in self.cs.sample_configuration(100):
                        if (next_config not in self.next_evaluations
                                or next_config in all_previous_configs):
                            break
                self.next_evaluations.append(next_config)
                info_list.append(info)

        next_guess = [{} for _ in range(n_suggestions)]
        while len(self.next_evaluations) < n_suggestions:
            self.next_evaluations.append(self.cs.sample_configuration())
            info_list.append("Random")
        for i in range(n_suggestions):
            eval_next = self.next_evaluations.pop(0)
            next_guess[i] = (eval_next.get_dictionary(), info_list[i])
        return next_guess

    def init_with_rh(self, rh, iteration):
        self.runhistory.empty()
        for rh_value in rh:
            configuration = Configuration(configuration_space=self.cs,
                                          values=rh_value[0])
            self.runhistory.add(config=configuration,
                                cost=rh_value[1],
                                time=0,
                                status=StatusType.SUCCESS)

    def observe(self, X, y):
        """Feed an observation back.

        Parameters
        ----------
        X : list of dict-like
            Places where the objective function has already been evaluated.
            Each suggestion is a dictionary where each key corresponds to a
            parameter being optimized.
        y : array-like, shape (n,)
            Corresponding values where the objective has been evaluated
        """
        for xx, yy in zip(X, y):
            configuration = Configuration(configuration_space=self.cs, values=xx)
            self.runhistory.add(config=configuration,
                                cost=yy,
                                time=0,
                                status=StatusType.SUCCESS)
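# A hypothetical driver loop for the optimizer above. In the real benchmark
# the harness supplies api_config and config_space; here both are minimal
# stand-ins, and objective is a toy function:
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

config_space = ConfigurationSpace()
config_space.add_hyperparameter(UniformFloatHyperparameter('x', -5.0, 5.0))
api_config = {'x': {'type': 'real', 'space': 'linear', 'range': (-5.0, 5.0)}}

def objective(params):
    return (params['x'] - 1.0) ** 2  # toy quadratic

opt = SMAC4EPMOpimizer(api_config, config_space, parallel_setting="KB")
for _ in range(8):
    suggestions = opt.suggest(n_suggestions=4)  # list of (param_dict, info)
    xs = [s[0] for s in suggestions]
    opt.observe(xs, [objective(x) for x in xs])  # feeds runs into the runhistory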
args, unknown = parser.parse_known_args()
logging.basicConfig(level=logging.INFO)
if unknown:
    logging.warning('Could not parse the following arguments:')
    logging.warning(str(unknown))

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Create RunHistory object as well as Scenario object
runhist = RunHistory(average_cost)
scenario = Scenario(args.scenario, cmd_args={'output_dir': ""})
cs = scenario.cs
runhist.load_json(args.data, cs)  # populate the runhistory with the validation data
configs = runhist.get_all_configs()

def_ = cs.get_default_configuration()
def_dict = def_.get_dictionary()

# Reorder the configs such that statistics about the default are gathered first
if def_ in configs and configs[0] != def_:
    idx = configs.index(def_)
    configs[0], configs[idx] = configs[idx], configs[0]

logging.info('Found %d configs' % len(configs))
logging.info('Cost per config:')
# For each config
for config in configs:
    # gather statistics such as
def _plot_parallel_coordinates(
    self,
    original_rh: RunHistory,
    validated_rh: RunHistory,
    validator: Validator,
    scenario: Scenario,
    default: Configuration,
    incumbent: Configuration,
    param_imp: Union[None, Dict[str, float]],
    output_dir: str,
    cs: ConfigurationSpace,
    runtime: bool = False,
):
    """
    Parameters
    ----------
    original_rh: RunHistory
        runhistory that should contain only runs that were executed during search
    validated_rh: RunHistory
        runhistory that may contain as many runs as possible, also external runs.
        this runhistory will be used to build the EPM
    validator: Validator
        validator to be used to estimate costs for configurations
    scenario: Scenario
        scenario object to take instances from
    default, incumbent: Configuration
        default and incumbent, they will surely be displayed
    param_imp: Union[None, Dict[str, float]]
        if given, maps parameter-names to importance
    output_dir: str
        output directory for plots
    cs: ConfigurationSpace
        parameter configuration space to be visualized
    runtime: boolean
        runtime will be on logscale
    """
    # Sorting parameters by importance, if possible (choose first executed parameter-importance)
    method, importance = "", {}
    if self.pc_sort_by == 'all':
        self.logger.debug("Sorting by average importance")
        method = 'average'
        for m, i in param_imp.items():
            if i:
                for p, imp in i.items():
                    if p in importance:
                        importance[p].append(imp)
                    else:
                        importance[p] = [imp]
        importance = {k: sum(v) / len(v) for k, v in importance.items()}
    elif self.pc_sort_by in param_imp:
        method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
    else:
        self.logger.debug("%s not evaluated... choosing at random from: %s",
                          self.pc_sort_by, str(list(param_imp.keys())))
        for m, i in param_imp.items():
            if i:
                method, importance = m, i
                self.logger.debug("Chose %s", method)
                break

    hp_names = sorted([hp for hp in cs.get_hyperparameter_names()],
                      key=lambda x: importance.get(x, 0),
                      reverse=True)
    self.logger.debug("Sorted hp's by method '%s': %s", method, str(hp_names))

    # To be set
    self.plots = []

    # Define set of configurations (limiting to max and choosing most interesting ones)
    all_configs = original_rh.get_all_configs()
    # max_runs_epm is the maximum total number of runs considered for the epm,
    # which limits the maximum possible number of configs
    max_runs_epm = self.max_runs_epm
    max_configs = int(max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs (for parallel coordinates)",
                          len(all_configs), max_configs, max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(c)))[-max_configs:]
    if default not in all_configs:
        all_configs = [default] + all_configs
    if incumbent not in all_configs:
        all_configs.append(incumbent)

    # Get costs for those configurations
    epm_rh = RunHistory(average_cost)
    epm_rh.update(validated_rh)
    if scenario.feature_dict:  # if instances are available
        epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                     runhistory=validated_rh))
    config_to_cost = {c: epm_rh.get_cost(c) for c in all_configs}

    pcp = ParallelCoordinatesPlotter(config_to_cost, output_dir, cs, runtime)
    try:
        plots = [pcp.plot_n_configs(self.n_configs,
                                    self.get_params(self.params, importance, hp_names))]
        self.logger.debug("Paths to plot(s): %s", str(plots))
        return {'figure': plots}
    except ValueError as err:
        self.logger.debug("Error: %s", str(err))
        return {'else': str(err)}
def _optimize(self, f, variables, X_init, Y_init, maxiter, maxeval, iter_callback):
    maxeval = get_maxeval_for_bo(maxeval, maxiter)
    iter_callback(X_init[0], Y_init[0], X_init, Y_init)

    # Get config space
    config_space = self.get_config_space(variables)
    # get scenario, runhistory and stats
    scenario = self.get_scenario(maxeval, config_space)
    runhistory = RunHistory()
    stats = Stats(scenario)
    # for acq function optimizer
    rnd_chooser = ChooserProb(rng=self._get_random_state(), prob=0.0)
    # get class to get valid train data from run history
    rh2epm = self.get_runhistory2epm(scenario)

    # we will add configs to run history by using the following function
    def add_to_runhistory(config, cost):
        runhistory.add(config=config, cost=cost, time=0, status=StatusType.SUCCESS)

    # create gp and other stuff
    model = self.get_model(config_space)
    acq_fun = self.get_acquisition_function(model)
    acq_fun_opt = self.get_acquisition_function_optimizer(config_space, acq_fun)

    # transform our X_init into valid configurations:
    # we create random valid configs and then fill them with our values
    X_init_configs = config_space.sample_configuration(len(X_init))
    for i, x in enumerate(X_init):
        for ind, (var, par) in enumerate(zip(variables, x)):
            if isinstance(variables[ind], ContinuousVariable):
                par = float(par)
            X_init_configs[i][var.name] = par

    # add our initial design to run history
    for x, y in zip(X_init_configs, Y_init):
        add_to_runhistory(x, y)

    # begin Bayesian optimization
    while self.run_info.result.n_eval < maxeval or \
            (maxiter is not None and self.run_info.result.n_iter < maxiter):
        total_t_start = time.time()

        X, y = rh2epm.transform(runhistory)
        # If no costs are finite then we return nothing
        if np.all(~np.isfinite(y)):
            return self.run_info.result
        # Safeguard, just in case...
        if np.any(~np.isfinite(y)):
            y[~np.isfinite(y)] = np.max(y[np.isfinite(y)])

        t_start = time.time()
        model.train(X, y)
        gp_train_time = time.time() - t_start

        t_start = time.time()
        predictions = model.predict_marginalized_over_instances(X)[0]
        best_index = np.argmin(predictions)
        best_observation = y[best_index]
        x_best_array = X[best_index]
        gp_predict_time = time.time() - t_start

        t_start = time.time()
        acq_fun.update(
            model=model,
            eta=best_observation,
            incumbent_array=x_best_array,
            num_data=len(X),
            X=X,
        )
        new_config_iterator = acq_fun_opt.maximize(
            runhistory=runhistory,
            stats=stats,
            num_points=10000,
            random_configuration_chooser=rnd_chooser,
        )
        accept = False
        for next_config in new_config_iterator:
            if next_config in runhistory.get_all_configs():
                continue
            else:
                accept = True
                break
        assert accept
        acq_opt_time = time.time() - t_start

        t_start = time.time()
        x = [next_config[var.name] for var in variables]
        cost = f(x)
        eval_time = time.time() - t_start

        add_to_runhistory(next_config, cost)

        total_iter_time = time.time() - total_t_start
        update_kwargs = {"gp_train_time": gp_train_time,
                         "gp_predict_time": gp_predict_time,
                         "acq_opt_time": acq_opt_time,
                         "eval_time": eval_time,
                         "iter_time": total_iter_time}
        iter_callback(x, cost, [x], [cost], **update_kwargs)

    return self.run_info.result
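# The non-finite safeguard above, isolated: inf/nan costs are imputed with the
# worst finite cost before model training.
import numpy as np

y = np.array([0.2, np.inf, 0.5, np.nan])
y[~np.isfinite(y)] = np.max(y[np.isfinite(y)])
assert np.allclose(y, [0.2, 0.5, 0.5, 0.5])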
def _get_initial_points(
    self,
    num_points: int,
    runhistory: RunHistory,
    additional_start_points: Optional[List[Tuple[float, Configuration]]],
) -> List[Configuration]:
    if runhistory.empty():
        init_points = self.config_space.sample_configuration(size=num_points)
    else:
        # initiate local search
        configs_previous_runs = runhistory.get_all_configs()

        # configurations with the highest previous EI
        configs_previous_runs_sorted = self._sort_configs_by_acq_value(configs_previous_runs)
        configs_previous_runs_sorted = [conf[1] for conf in configs_previous_runs_sorted[:num_points]]

        # configurations with the lowest predictive cost, check for None to make unit tests work
        if self.acquisition_function.model is not None:
            conf_array = convert_configurations_to_array(configs_previous_runs)
            costs = self.acquisition_function.model.predict_marginalized_over_instances(conf_array)[0]
            assert len(conf_array) == len(costs), (conf_array.shape, costs.shape)

            # In case of the predictive model returning the prediction for more than one objective
            # per configuration (for example multi-objective or EIPS) it is not immediately clear
            # how to sort according to the cost of a configuration. Therefore, we simply follow
            # the ParEGO approach and use a random scalarization.
            if len(costs.shape) == 2 and costs.shape[1] > 1:
                weights = np.array([self.rng.rand() for _ in range(costs.shape[1])])
                weights = weights / np.sum(weights)
                costs = costs @ weights

            # From here:
            # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
            random = self.rng.rand(len(costs))
            # Last column is primary sort key!
            indices = np.lexsort((random.flatten(), costs.flatten()))
            # Cannot use zip here because the indices array cannot index the
            # rand_configs list, because the second is a pure python list
            configs_previous_runs_sorted_by_cost = [configs_previous_runs[ind] for ind in indices][:num_points]
        else:
            configs_previous_runs_sorted_by_cost = []

        if additional_start_points is not None:
            additional_start_points = [asp[1] for asp in additional_start_points[:num_points]]
        else:
            additional_start_points = []

        init_points = []
        init_points_as_set = set()  # type: Set[Configuration]
        for cand in itertools.chain(
            configs_previous_runs_sorted,
            configs_previous_runs_sorted_by_cost,
            additional_start_points,
        ):
            if cand not in init_points_as_set:
                init_points.append(cand)
                init_points_as_set.add(cand)

    return init_points
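# The ParEGO-style random scalarization above, isolated: a random convex
# combination collapses an (n_configs, n_objectives) cost matrix into one
# scalar cost per configuration (toy data).
import numpy as np

rng = np.random.RandomState(42)
costs = rng.rand(5, 2)                # 5 configurations, 2 objectives
weights = rng.rand(costs.shape[1])
weights = weights / np.sum(weights)   # weights sum to 1
scalar_costs = costs @ weights        # shape (5,)
assert scalar_costs.shape == (5,)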
def _preprocess_budget(
    self,
    original_rh: RunHistory,
    validated_rh: RunHistory,
    validator: Validator,
    scenario: Scenario,
    default: Configuration,
    incumbent: Configuration,
    param_imp: Union[None, Dict[str, float]],
    output_dir: str,
    cs: ConfigurationSpace,
    runtime: bool = False,
):
    """Preprocess data and save in self.data to enable fast replots

    Parameters
    ----------
    original_rh: RunHistory
        runhistory that should contain only runs that were executed during search
    validated_rh: RunHistory
        runhistory that may contain as many runs as possible, also external runs.
        this runhistory will be used to build the EPM
    validator: Validator
        validator to be used to estimate costs for configurations
    scenario: Scenario
        scenario object to take instances from
    default, incumbent: Configuration
        default and incumbent, they will surely be displayed
    param_imp: Union[None, Dict[str, float]]
        if given, maps parameter-names to importance
    output_dir: str
        output directory for plots
    cs: ConfigurationSpace
        parameter configuration space to be visualized
    runtime: boolean
        runtime will be on logscale
    """
    # Sorting parameters by importance, if possible (choose first executed parameter-importance)
    method, importance = "", {}
    if self.pc_sort_by == 'all':
        self.logger.debug("Sorting by average importance")
        method = 'average'
        for m, i in param_imp.items():
            if i:
                for p, imp in i.items():
                    if p in importance:
                        importance[p].append(imp)
                    else:
                        importance[p] = [imp]
        importance = {k: sum(v) / len(v) for k, v in importance.items()}
    elif self.pc_sort_by in param_imp:
        method, importance = self.pc_sort_by, param_imp[self.pc_sort_by]
    else:
        self.logger.debug("%s not evaluated... choosing at random from: %s",
                          self.pc_sort_by, str(list(param_imp.keys())))
        for m, i in param_imp.items():
            if i:
                method, importance = m, i
                self.logger.debug("Chose %s", method)
                break

    hp_names = sorted([p for p in cs.get_hyperparameter_names()],
                      key=lambda x: importance.get(x, 0),
                      reverse=True)
    self.logger.debug("Sorted hyperparameters by method '%s': %s", method, str(hp_names))

    # Define set of configurations (limiting to max and choosing most interesting ones)
    all_configs = original_rh.get_all_configs()
    # max_runs_epm is the maximum total number of runs considered for the epm,
    # which limits the maximum possible number of configs
    max_configs = int(self.max_runs_epm / (len(scenario.train_insts) + len(scenario.test_insts)))
    if len(all_configs) > max_configs:
        self.logger.debug("Limiting number of configs to train epm from %d to %d (based on max runs %d) and "
                          "choosing the ones with the most runs (for parallel coordinates)",
                          len(all_configs), max_configs, self.max_runs_epm)
        all_configs = sorted(all_configs,
                             key=lambda c: len(original_rh.get_runs_for_config(
                                 c, only_max_observed_budget=False)))
        all_configs = all_configs[-max_configs:]
    if default not in all_configs:
        all_configs = [default] + all_configs
    if incumbent not in all_configs:
        all_configs.append(incumbent)

    # Get costs for those configurations
    epm_rh = RunHistory()
    epm_rh.update(validated_rh)
    if scenario.feature_dict:  # if instances are available
        epm_rh.update(timing(validator.validate_epm)(all_configs, 'train+test', 1,
                                                     runhistory=validated_rh))
    config_to_cost = OrderedDict({c: epm_rh.get_cost(c) for c in all_configs})

    data = OrderedDict()
    data['cost'] = list(config_to_cost.values())
    for hp in self.runscontainer.scenario.cs.get_hyperparameter_names():
        data[hp] = np.array([
            c[hp]  # if hp in c.get_dictionary() and not isinstance(c[hp], str) else np.nan
            for c in config_to_cost.keys()
        ])
    df = pd.DataFrame(data=data)
    return df
def _optimize(self, f, variables, X_init, Y_init, maxiter, maxeval, iter_callback):
    maxeval = get_maxeval_for_bo(maxeval, maxiter)

    # Create helper optimizers with usual SMAC to call the same functions
    help_opt = SMACBayesianOptimizer(kernel="Auto")
    help_opt_log = SMACBayesianOptimizer(kernel="Auto", acquisition_type="logEI")

    kernel_name1, message1 = choose_kernel_if_needed(
        optimizer=help_opt, variables=variables, X=X_init, Y=Y_init,
        kernels=self.kernels_to_choose)
    help_opt.kernel_name = kernel_name1
    kernel_name2, message2 = choose_kernel_if_needed(
        optimizer=help_opt_log, variables=variables, X=X_init, Y=Y_init,
        kernels=self.kernels_to_choose)
    help_opt_log.kernel_name = kernel_name2
    message = f"For usual Y:\n{message1}For log_transformed Y:\n{message2}"

    x_best = X_init[0]
    y_best = Y_init[0]
    iter_callback(x_best, y_best, X_init, Y_init, message=message)

    # Get config space
    config_space = help_opt.get_config_space(variables)
    # get scenario, runhistory and stats
    scenario = help_opt.get_scenario(maxeval, config_space)
    runhistory = RunHistory()
    stats = Stats(scenario)
    # for acq function optimizer
    rnd_chooser = ChooserProb(rng=help_opt._get_random_state(), prob=0.0)
    # get classes to get valid train data from run history
    rh2epm_no_transform = help_opt.get_runhistory2epm(scenario)
    rh2epm_log = help_opt_log.get_runhistory2epm(scenario)
    acq2rh2epm = {"PI": rh2epm_no_transform, "logEI": rh2epm_log}

    # we will add configs to run history by using the following function
    def add_to_runhistory(config, cost):
        runhistory.add(config=config, cost=cost, time=0, status=StatusType.SUCCESS)

    combinations = []
    for model_name, acq_name in [(kernel_name1, "PI"), (kernel_name2, "logEI")]:
        gp = self._create_gp_model(config_space, model_name)
        acq = self._create_acquisition_function(model=gp, acquisition_name=acq_name)
        acq_opt = help_opt.get_acquisition_function_optimizer(config_space, acq)
        mark = f"{model_name}_{acq_name}"
        combinations.append((mark, gp, acq, acq_opt, acq2rh2epm[acq_name]))

    # transform our X_init into valid configurations:
    # we create random valid configs and then fill them with our values
    X_init_configs = config_space.sample_configuration(len(X_init))
    for i, x in enumerate(X_init):
        for ind, (var, par) in enumerate(zip(variables, x)):
            if isinstance(variables[ind], ContinuousVariable):
                par = float(par)
            X_init_configs[i][var.name] = par

    # add our initial design to run history
    for x, y in zip(X_init_configs, Y_init):
        add_to_runhistory(x, y)

    # begin Bayesian optimization
    while self.run_info.result.n_eval < maxeval or \
            (maxiter is not None and self.run_info.result.n_iter * 4 < maxiter):
        do_gp_optim = self.do_gp_optimization()
        message = f"GP was optimized: {do_gp_optim}"
        total_t_start = time.time()
        gp_train_time = 0
        gp_predict_time = 0
        acq_opt_time = 0
        eval_time = 0
        iter_configs = []
        iter_X = []
        iter_y = []

        # pick one (kernel, acquisition) combination at random
        # random.shuffle(combinations)
        kernel_ind = np.random.choice([0, 1])
        mark, gp, acq, acq_opt, rh2epm = combinations[kernel_ind]
        # for mark, gp, acq, acq_opt, rh2epm in combinations:
        X, y = rh2epm.transform(runhistory)
        # If no costs are finite then we return nothing
        if np.all(~np.isfinite(y)):
            return self.run_info.result
        # Safeguard, just in case...
        if np.any(~np.isfinite(y)):
            y[~np.isfinite(y)] = np.max(y[np.isfinite(y)])

        t_start = time.time()
        gp.train(X, y, optimize=do_gp_optim)
        gp_train_time += time.time() - t_start

        t_start = time.time()
        # we do not care which model is used here
        predictions = gp.predict(X)[0]
        best_index = np.argmin(predictions)
        best_observation = y[best_index]
        x_best_array = X[best_index]
        gp_predict_time += time.time() - t_start

        t_start = time.time()
        acq.update(
            model=gp.gp_model,
            eta=best_observation,
            incumbent_array=x_best_array,
            num_data=len(X),
            X=X,
        )
        new_config_iterator = acq_opt.maximize(
            runhistory=runhistory,
            stats=stats,
            num_points=10000,
            random_configuration_chooser=rnd_chooser,
        )
        accept = False
        for next_config in new_config_iterator:
            if next_config in runhistory.get_all_configs():
                continue
            else:
                accept = True
                break
        assert accept
        acq_opt_time += time.time() - t_start

        t_eval = time.time()
        x = [next_config[var.name] for var in variables]
        cost = f(x)
        eval_time += time.time() - t_eval

        iter_configs.append((next_config, cost))
        x = WeightedMetaArray(x)
        x.metadata = mark
        iter_X.append(x)
        iter_y.append(cost)
        if cost < y_best:
            x_best = x
            y_best = cost

        for config, cost in iter_configs:
            add_to_runhistory(config, cost)

        total_iter_time = time.time() - total_t_start
        update_kwargs = {
            "gp_train_time": gp_train_time,
            "gp_predict_time": gp_predict_time,
            "acq_opt_time": acq_opt_time,
            "eval_time": eval_time,
            "iter_time": total_iter_time,
            "message": message,
        }
        iter_callback(x_best, y_best, iter_X, iter_y, **update_kwargs)

    return self.run_info.result
class CAVE(object):
    """Facade to handle analyzing, plotting and building the report-page."""

    def __init__(self,
                 folders: typing.List[str],
                 output: str,
                 ta_exec_dir: Union[str, None] = None,
                 missing_data_method: str = 'epm',
                 max_pimp_samples: int = -1,
                 fanova_pairwise=True):
        """
        Initialize the CAVE facade to handle analyzing, plotting and building
        the report-page easily. During initialization, the
        analysis-infrastructure is built and the data is validated, meaning
        that the overall best incumbent is found and default+incumbent are
        evaluated for all instances over all runs, by default using an EPM.

        The class holds two runhistories:
            self.original_rh  -> contains only runs from the actual data
            self.validated_rh -> contains original runs and epm-predictions
                                 for all incumbents

        The analyze()-method performs an analysis and outputs a report.html.

        Parameters
        ----------
        folders: List[str]
            paths to relevant SMAC runs
        output: str
            output directory for cave to write results (figures + report)
        ta_exec_dir: str
            execution directory for target algorithm (to find instance.txt, ...)
        missing_data_method: str
            one of [validation, epm]; how to estimate missing runs
        """
        self.logger = logging.getLogger("cave.cavefacade")
        self.logger.debug("Folders: %s", str(folders))
        self.ta_exec_dir = ta_exec_dir

        # Create output if necessary
        self.output = output
        self.logger.info("Saving results to %s", self.output)
        if not os.path.exists(output):
            self.logger.debug("Output-dir %s does not exist, creating", self.output)
            os.makedirs(output)
        if not os.path.exists(os.path.join(self.output, "debug")):
            os.makedirs(os.path.join(self.output, "debug"))
        # Log to file
        logger = logging.getLogger()
        handler = logging.FileHandler(os.path.join(self.output, "debug/debug.log"), "w")
        handler.setLevel(logging.DEBUG)
        logger.addHandler(handler)

        # The global runhistory combines all actual runs of the individual SMAC-runs.
        # We save the combined (unvalidated) runhistory to disk, so we can use it later on.
        # We keep the validated runhistory (with as many runs as possible) in
        # memory. The distinction is made to avoid using runs that are
        # only estimated using an EPM for further EPMs or to handle runs
        # validated on different hardware (depending on validation-method).
        self.original_rh = RunHistory(average_cost)
        self.validated_rh = RunHistory(average_cost)

        # Save all relevant SMAC-runs in a list
        self.runs = []
        for folder in folders:
            try:
                self.logger.debug("Collecting data from %s.", folder)
                self.runs.append(SMACrun(folder, ta_exec_dir))
            except Exception as err:
                self.logger.warning("Folder %s could not be loaded, failed "
                                    "with error message: %s", folder, err)
                continue
        if not len(self.runs):
            raise ValueError("None of the specified SMAC-folders could be loaded.")

        # Use scenario of first run for general purposes (expecting they are all the same anyway!)
        self.scenario = self.runs[0].solver.scenario

        # Update global runhistory with all available runhistories
        self.logger.debug("Update original rh with all available rhs!")
        runhistory_fns = [os.path.join(run.folder, "runhistory.json") for run in self.runs]
        for rh_file in runhistory_fns:
            self.original_rh.update_from_json(rh_file, self.scenario.cs)
        self.logger.debug('Combined number of Runhistory data points: %d. '
                          '# Configurations: %d. # Runhistories: %d',
                          len(self.original_rh.data),
                          len(self.original_rh.get_all_configs()),
                          len(runhistory_fns))
        self.original_rh.save_json(os.path.join(self.output, "combined_rh.json"))

        # Validator for a) validating with epm, b) plot over time
        # Initialize without trajectory
        self.validator = Validator(self.scenario, None, None)

        # Estimate missing costs for [def, inc1, inc2, ...]
        self.complete_data(method=missing_data_method)
        self.best_run = min(self.runs,
                            key=lambda run: self.validated_rh.get_cost(run.solver.incumbent))

        self.default = self.scenario.cs.get_default_configuration()
        self.incumbent = self.best_run.solver.incumbent
        self.logger.debug("Overall best run: %s, with incumbent: %s",
                          self.best_run.folder, self.incumbent)

        # The following variable determines whether a distinction is made
        # between train and test-instances (e.g. in plotting)
        self.train_test = bool(self.scenario.train_insts != [None]
                               and self.scenario.test_insts != [None])

        self.analyzer = Analyzer(self.original_rh, self.validated_rh,
                                 self.default, self.incumbent, self.train_test,
                                 self.scenario, self.validator, self.output,
                                 max_pimp_samples, fanova_pairwise)

        self.builder = HTMLBuilder(self.output, "CAVE")  # Builder for html-website
        self.website = OrderedDict([])

    def complete_data(self, method="epm"):
        """Complete missing data of runs to be analyzed, either using validation or EPM."""
        with changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
            self.logger.info("Completing data using %s.", method)

            path_for_validated_rhs = os.path.join(self.output, "validated_rhs")
            for run in self.runs:
                self.validator.traj = run.traj
                if method == "validation":
                    # TODO determine # repetitions
                    new_rh = self.validator.validate('def+inc', 'train+test', 1, -1,
                                                     runhistory=self.original_rh)
                elif method == "epm":
                    new_rh = self.validator.validate_epm('def+inc', 'train+test', 1,
                                                         runhistory=self.original_rh)
                else:
                    raise ValueError("Missing data method illegal (%s)" % method)
                self.validator.traj = None  # Avoid usage-mistakes

                self.validated_rh.update(new_rh)

    def analyze(self,
                performance=True,
                cdf=True,
                scatter=True,
                confviz=True,
                param_importance=['forward_selection', 'ablation', 'fanova'],
                feature_analysis=["box_violin", "correlation", "importance",
                                  "clustering", "feature_cdf"],
                parallel_coordinates=True,
                cost_over_time=True,
                algo_footprint=True):
        """Analyze the available data and build the HTML-webpage as a dict.
        Save the webpage in 'self.output/CAVE/report.html'.
        Analyzing is performed with the analyzer-instance that is initialized
        in the __init__.

        Parameters
        ----------
        performance: bool
            whether to calculate par10-values
        cdf: bool
            whether to plot cdf
        scatter: bool
            whether to plot scatter
        confviz: bool
            whether to perform configuration visualization
        param_importance: List[str]
            methods for parameter importance
        feature_analysis: List[str]
            methods for feature analysis
        parallel_coordinates: bool
            whether to plot parallel coordinates
        cost_over_time: bool
            whether to plot cost over time
        algo_footprint: bool
            whether to plot algorithm footprints
        """
        # Check arguments
        for p in param_importance:
            if p not in ['forward_selection', 'ablation', 'fanova', 'incneighbor']:
                raise ValueError("%s is not a valid option for parameter importance!" % p)
        for f in feature_analysis:
            if f not in ["box_violin", "correlation", "importance",
                         "clustering", "feature_cdf"]:
                raise ValueError("%s is not a valid option for feature analysis!" % f)

        # Start analysis
        overview = self.analyzer.create_overview_table(self.best_run.folder)
        self.website["Meta Data"] = {"table": overview}
        compare_config = self.analyzer.config_to_html(self.default, self.incumbent)
        self.website["Best configuration"] = {"table": compare_config}

        ########## PERFORMANCE ANALYSIS
        self.website["Performance Analysis"] = OrderedDict()

        if performance:
            performance_table = self.analyzer.create_performance_table(self.default, self.incumbent)
            self.website["Performance Analysis"]["Performance Table"] = {"table": performance_table}

        if cdf:
            cdf_path = self.analyzer.plot_cdf()
            self.website["Performance Analysis"][
                "empirical Cumulative Distribution Function (eCDF)"] = {"figure": cdf_path}

        if scatter and (self.scenario.train_insts != [None]):
            scatter_path = self.analyzer.plot_scatter()
            self.website["Performance Analysis"]["Scatterplot"] = {"figure": scatter_path}
        elif scatter:
            self.logger.info("Scatter plot desired, but no instances available.")

        # Build report before time-consuming analysis
        self.build_website()

        if algo_footprint and self.scenario.feature_dict:
            algorithms = {self.default: "default", self.incumbent: "incumbent"}
            # Add all available incumbents to test portfolio strategy
            # for r in self.runs:
            #     if not r.get_incumbent() in algorithms:
            #         algorithms[r.get_incumbent()] = str(self.runs.index(r))

            algo_footprint_plots = self.analyzer.plot_algorithm_footprint(algorithms)
            self.website["Performance Analysis"]["Algorithm Footprints"] = OrderedDict()
            for p in algo_footprint_plots:
                header = os.path.splitext(os.path.split(p)[1])[0]  # algo name
                self.website["Performance Analysis"]["Algorithm Footprints"][header] = {
                    "figure": p,
                    "tooltip": get_tooltip("Algorithm Footprints") + ": " + header,
                }

        self.build_website()

        ########### Configurator's behavior
        self.website["Configurator's behavior"] = OrderedDict()

        if confviz and self.scenario.feature_dict:
            if self.scenario.feature_array is None:
                self.scenario.feature_array = np.array([[]])
            # Sort runhistories and incs wrt cost
            incumbents = [r.solver.incumbent for r in self.runs]
            trajectories = [r.traj for r in self.runs]
            runhistories = [r.runhistory for r in self.runs]
            costs = [self.validated_rh.get_cost(i) for i in incumbents]
            costs, incumbents, runhistories, trajectories = (
                list(t) for t in zip(*sorted(zip(costs, incumbents, runhistories, trajectories),
                                             key=lambda x: x[0])))
            incumbents = list(map(lambda x: x['incumbent'], trajectories[0]))

            confviz_script = self.analyzer.plot_confviz(incumbents, runhistories)
            self.website["Configurator's behavior"]["Configurator Footprint"] = {
                "table": confviz_script}
        elif confviz:
            self.logger.info("Configuration visualization desired, but no "
                             "instance-features available.")

        self.build_website()

        if cost_over_time:
            cost_over_time_path = self.analyzer.plot_cost_over_time(self.best_run.traj, self.validator)
            self.website["Configurator's behavior"]["Cost over time"] = {"figure": cost_over_time_path}

        self.build_website()

        self.parameter_importance(ablation='ablation' in param_importance,
                                  fanova='fanova' in param_importance,
                                  forward_selection='forward_selection' in param_importance,
                                  incneighbor='incneighbor' in param_importance)

        self.build_website()

        if parallel_coordinates:
            # Should be after parameter importance, if performed.
            n_params = 6
            parallel_path = self.analyzer.plot_parallel_coordinates(n_params)
            self.website["Configurator's behavior"]["Parallel Coordinates"] = {"figure": parallel_path}

        self.build_website()

        if self.scenario.feature_dict:
            self.feature_analysis(box_violin='box_violin' in feature_analysis,
                                  correlation='correlation' in feature_analysis,
                                  clustering='clustering' in feature_analysis,
                                  importance='importance' in feature_analysis)
        else:
            self.logger.info('No feature analysis possible')

        self.logger.info("CAVE finished. Report is located in %s",
                         os.path.join(self.output, 'report.html'))
        self.build_website()

    def parameter_importance(self, ablation=False, fanova=False,
                             forward_selection=False, incneighbor=False):
        """Perform the specified parameter importance procedures."""
        # PARAMETER IMPORTANCE
        if ablation or forward_selection or fanova or incneighbor:
            self.website["Parameter Importance"] = OrderedDict()
        sum_ = 0

        if fanova:
            sum_ += 1
            table, plots, pair_plots = self.analyzer.fanova(self.incumbent)

            self.website["Parameter Importance"]["fANOVA"] = OrderedDict()
            self.website["Parameter Importance"]["fANOVA"]["Importance"] = {"table": table}

            # Insert plots (the received plots is a dict, mapping param -> path)
            self.website["Parameter Importance"]["fANOVA"]["Marginals"] = OrderedDict([])
            for param, plot in plots.items():
                self.website["Parameter Importance"]["fANOVA"]["Marginals"][param] = {"figure": plot}
            if pair_plots:
                self.website["Parameter Importance"]["fANOVA"]["PairwiseMarginals"] = OrderedDict([])
                for param, plot in pair_plots.items():
                    self.website["Parameter Importance"]["fANOVA"]["PairwiseMarginals"][param] = {"figure": plot}

        if ablation:
            sum_ += 1
            self.logger.info("Ablation...")
            self.analyzer.parameter_importance("ablation", self.incumbent, self.output)
            ablationpercentage_path = os.path.join(self.output, "ablationpercentage.png")
            ablationperformance_path = os.path.join(self.output, "ablationperformance.png")
            self.website["Parameter Importance"]["Ablation"] = {
                "figure": [ablationpercentage_path, ablationperformance_path]}

        if forward_selection:
            sum_ += 1
            self.logger.info("Forward Selection...")
            self.analyzer.parameter_importance("forward-selection", self.incumbent, self.output)
            f_s_barplot_path = os.path.join(self.output, "forward selection-barplot.png")
            f_s_chng_path = os.path.join(self.output, "forward selection-chng.png")
            self.website["Parameter Importance"]["Forward Selection"] = {
                "figure": [f_s_barplot_path, f_s_chng_path]}

        if incneighbor:
            sum_ += 1
            self.logger.info("Local EPM-predictions around incumbent...")
            plots = self.analyzer.local_epm_plots()
            self.website["Parameter Importance"]["Local Parameter Importance (LPI)"] = OrderedDict([])
            for param, plot in plots.items():
                self.website["Parameter Importance"]["Local Parameter Importance (LPI)"][param] = {"figure": plot}

        if sum_:
            of = os.path.join(self.output, 'pimp.tex')
            self.logger.info('Creating pimp latex table at %s' % of)
            self.analyzer.pimp.table_for_comparison(self.analyzer.evaluators, of, style='latex')

    def feature_analysis(self, box_violin=False, correlation=False,
                         clustering=False, importance=False):
        if not (box_violin or correlation or clustering or importance):
            self.logger.debug("No feature analysis.")
            return

        # FEATURE ANALYSIS (ASAPY)
        # TODO make the following line prettier
        # TODO feat-names from scenario?
        in_reader = InputReader()
        feat_fn = self.scenario.feature_fn

        if not self.scenario.feature_names:
            with changedir(self.ta_exec_dir if self.ta_exec_dir else '.'):
                if not feat_fn or not os.path.exists(feat_fn):
                    self.logger.warning("Feature Analysis needs a valid feature "
                                        "file! Either {} is not a valid "
                                        "filename or features are not saved in "
                                        "the scenario.".format(feat_fn))
                    self.logger.error("Skipping Feature Analysis.")
                    return
                else:
                    feat_names = in_reader.read_instance_features_file(self.scenario.feature_fn)[0]
        else:
            feat_names = copy.deepcopy(self.scenario.feature_names)

        self.website["Feature Analysis"] = OrderedDict([])

        # feature importance using forward selection
        if importance:
            self.website["Feature Analysis"]["Feature Importance"] = OrderedDict()
            imp, plots = self.analyzer.feature_importance()
            imp = DataFrame(data=list(imp.values()), index=list(imp.keys()), columns=["Error"])
            imp = imp.to_html()  # this is a table with the values in html
            self.website["Feature Analysis"]["Feature Importance"]["Table"] = {"table": imp}
            for p in plots:
                name = os.path.splitext(os.path.basename(p))[0]
                self.website["Feature Analysis"]["Feature Importance"][name] = {"figure": p}

        # box and violin plots
        if box_violin:
            name_plots = self.analyzer.feature_analysis('box_violin', feat_names)
            self.website["Feature Analysis"]["Violin and Box Plots"] = OrderedDict()
            for plot_tuple in name_plots:
                key = "%s" % (plot_tuple[0])
                self.website["Feature Analysis"]["Violin and Box Plots"][key] = {"figure": plot_tuple[1]}

        # correlation plot
        if correlation:
            correlation_plot = self.analyzer.feature_analysis('correlation', feat_names)
            if correlation_plot:
                self.website["Feature Analysis"]["Correlation"] = {"figure": correlation_plot}

        # cluster instances in feature space
        if clustering:
            cluster_plot = self.analyzer.feature_analysis('clustering', feat_names)
            self.website["Feature Analysis"]["Clustering"] = {"figure": cluster_plot}

        self.build_website()

    def build_website(self):
        self.builder.generate_html(self.website)