def __init__(
        self,
        runscontainer: RunsContainer,
        incumbent_trajectory: str = None,
):
    """Collect everything this analyzer needs from the runs-container.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides rng, scenario, output directory, budgets and the
        (aggregated) configurator runs
    incumbent_trajectory: str
        passed on unchanged to the parent constructor
    """
    super().__init__(
        runscontainer,
        incumbent_trajectory=incumbent_trajectory,
    )
    container = self.runscontainer

    self.rng = container.get_rng()
    self.scenario = container.scenario
    # Everything produced here ends up in a dedicated sub-directory
    self.output_dir = os.path.join(container.output_dir, "tensorboard")
    # Runhistory of the first run when aggregating over folders and budgets
    fully_aggregated = container.get_aggregated(False, False)
    self.rh = fully_aggregated[0].validated_runhistory

    # Run-specific / budget-specific infos: with more than one budget the
    # budgets are kept apart, otherwise the folders are kept apart
    several_budgets = len(container.get_budgets()) > 1
    self.runs = container.get_aggregated(keep_folders=not several_budgets,
                                         keep_budgets=several_budgets)

    self.formatted_budgets = format_budgets(container.get_budgets())

    # Will be set during execution: list with paths to '.png's
    self.plots = []
def __init__(
        self,
        runscontainer,
):
    """Compute one performance table per budget and store it in ``self.result``.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides rng, scenario, budgets and the runs aggregated per budget
    """
    super().__init__(runscontainer)
    self.rng = self.runscontainer.get_rng()
    self.scenario = self.runscontainer.scenario

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for cr in runs_per_budget:
        # Train and test instances combined, falsy entries dropped
        all_instances = cr.scenario.train_insts + cr.scenario.test_insts
        instances = list(filter(None, all_instances))
        table = self.get_performance_table(
            instances,
            cr.validated_runhistory,
            cr.default,
            cr.incumbent,
            cr.epm_runhistory,
            cr.scenario,
        )
        self.result[budget_labels[cr.budget]] = {'table': table}
def run(self):
    """Call ``self.plot`` once per budget and store the outcome in ``self.result``.

    The result is keyed by the formatted budget name (whitespace allowed);
    the output file of each run is ``pimp.tex`` inside the run's output
    directory.
    """
    budget_labels = format_budgets(self.runscontainer.get_budgets(),
                                   allow_whitespace=True)
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for cr in runs_per_budget:
        plot_kwargs = dict(
            pimp=cr.pimp,
            evaluators=list(cr.share_information['evaluators'].values()),
            cs=self.runscontainer.scenario.cs,
            out_fn=os.path.join(cr.output_dir, 'pimp.tex'),
        )
        outcome = self.plot(**plot_kwargs)
        self.result[budget_labels[cr.budget]] = outcome
def __init__(self, runscontainer):
    """Gather incumbents, budget names and EPM runhistories and create the table.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides the budgets and the runs aggregated per budget
    """
    super().__init__(runscontainer)

    # Runs aggregated per budget, ordered by ascending budget
    runs = list(runscontainer.get_aggregated(True, False))
    runs.sort(key=lambda x: x.budget)

    formatted = format_budgets(runscontainer.get_budgets(), allow_whitespace=True)
    budget_names = list(formatted.values())

    incumbents = []
    epm_rhs = []
    for r in runs:
        incumbents.append(r.incumbent)
        epm_rhs.append(r.epm_runhistory)

    self.create_table(incumbents, budget_names, epm_rhs)
def __init__(self, runscontainer):
    """Run the feature analysis for every budget and store it in ``self.result``.

    Reads ``share_information['feature_importance']`` of every run, so that
    entry has to be present before this analyzer is constructed.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides scenario, budgets and the runs aggregated per budget
    """
    super().__init__(runscontainer)
    check_for_features(runscontainer.scenario)

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for cr in runs_per_budget:
        analysis = self.feat_analysis(
            output_dir=cr.output_dir,
            scenario=cr.scenario,
            feat_names=cr.feature_names,
            feat_importance=cr.share_information['feature_importance'],
        )
        self.result[budget_labels[cr.budget]] = analysis
def __init__(
        self,
        runscontainer,
        pc_sort_by: str = None,
        params: Union[int, List[str]] = None,
        n_configs: int = None,
        max_runs_epm: int = None,
):
    """Read the plot options and create one parallel-coordinates plot per budget.

    The explicitly passed arguments are handed to the parent constructor;
    the effective values are then read back from ``self.options``.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides budgets and the runs aggregated per budget
    pc_sort_by: str
        defines the pimp-method by which to choose the plotted parameters
    params: Union[int, List[str]]
        either directly the parameters to be displayed or the number of
        parameters (the most important ones will be chosen)
    n_configs: int
        number of configs to be plotted
    max_runs_epm: int
        maximum number of runs to train the epm with. this should prevent
        MemoryErrors
    """
    super().__init__(
        runscontainer,
        pc_sort_by=pc_sort_by,
        params=params,
        n_configs=n_configs,
        max_runs_epm=max_runs_epm,
    )

    options = self.options
    self.params = options.getint('params')
    self.n_configs = options.getint('n_configs')
    self.max_runs_epm = options.getint('max_runs_epm')
    self.pc_sort_by = options['pc_sort_by']

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for cr in runs_per_budget:
        plot = self._plot_parallel_coordinates(
            original_rh=cr.original_runhistory,
            validated_rh=cr.validated_runhistory,
            validator=cr.validator,
            scenario=cr.scenario,
            default=cr.default,
            incumbent=cr.incumbent,
            param_imp=cr.share_information["parameter_importance"],
            output_dir=cr.output_dir,
            cs=cr.scenario.cs,
            runtime=(cr.scenario.run_obj == 'runtime'),
        )
        self.result[budget_labels[cr.budget]] = plot
def __init__(
        self,
        runscontainer: RunsContainer,
        incumbent_trajectory: str = None,
        average_over_runs: bool = None,
):
    """Prepare the cost-over-time analysis.

    Plot performance over time, using all trajectory entries where
    max_time = max(wallclock_limit, the highest recorded time)

    Parameters
    ----------
    runscontainer: RunsContainer
        provides rng, scenario, output directory, budgets, folders and runs
    incumbent_trajectory: str
        passed to the parent constructor; the effective value is read back
        from ``self.options['incumbent_trajectory']``
    average_over_runs: bool
        passed to the parent constructor; the effective value is read back
        from ``self.options`` as a boolean
    """
    super().__init__(
        runscontainer,
        incumbent_trajectory=incumbent_trajectory,
        average_over_runs=average_over_runs,
    )
    container = self.runscontainer

    self.rng = container.get_rng()
    self.output_fn = "cost_over_time.png"
    self.scenario = container.scenario
    self.output_dir = container.output_dir
    self.rh = container.get_aggregated(False, False)[0].validated_runhistory

    # One entry per run; None where a run carries no hpbandster-result
    bohb_results = []
    for cr in container.get_all_runs():
        bohb_results.append(cr.share_information.get('hpbandster_result', None))
    self.bohb_results = bohb_results

    # Run-specific / budget-specific infos: with more than one budget the
    # budgets are kept apart, otherwise the folders are kept apart
    several_budgets = len(container.get_budgets()) > 1
    self.runs = container.get_aggregated(keep_folders=not several_budgets,
                                         keep_budgets=several_budgets)

    self.block_epm = container.file_format == "BOHB"
    self.validator = container.get_aggregated(False, False)[0].validator
    self.average_over_runs = self.options.getboolean('average_over_runs')
    self.cot_inc_traj = self.options['incumbent_trajectory']

    n_folders = len(container.get_folders())
    self.logger.debug('Initialized CostOverTime with %d runs, output to "%s"',
                      n_folders, self.output_dir)

    if self.bohb_results:
        self.formatted_budgets = format_budgets(container.get_budgets())

    # Will be set during execution: list with paths to '.png's
    self.plots = []
def __init__(self, runscontainer):
    """Compute the feature importance per budget.

    The plots go to ``self.result`` (keyed by formatted budget); the
    importance values are attached to each run's ``share_information``.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides scenario, budgets and the runs aggregated per budget
    """
    super().__init__(runscontainer)
    check_for_features(runscontainer.scenario)

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for cr in runs_per_budget:
        importance, figures = self.feature_importance(pimp=cr.pimp,
                                                      output_dir=cr.output_dir)
        self.result[budget_labels[cr.budget]] = figures
        # Add to run so other analysis-methods can use the information
        cr.share_information['feature_importance'] = importance
def __init__(self, runscontainer):
    """Store the budget-wise runs (ascending budget) and the formatted budgets.

    Parameters
    ----------
    runscontainer: RunsContainer
        contains all important information about the configurator runs
    """
    super().__init__(runscontainer)

    def by_budget(run):
        return run.budget

    per_budget = self.runscontainer.get_aggregated(True, False)
    self.runs = sorted(per_budget, key=by_budget)

    budgets = self.runscontainer.get_budgets()
    # Kept as returned by format_budgets (not converted to a list here)
    self.budget_names = format_budgets(budgets, allow_whitespace=True)

    # To be set
    self.dataframe = None
def __init__(
        self,
        runscontainer,
):
    """Create one ECDF plot per budget and store it in ``self.result``.

    Plot the cumulated distribution functions for given configurations,
    plots will share y-axis and if desired x-axis. Saves plot to file.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides budgets and the runs aggregated per budget
    """
    super().__init__(runscontainer)

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for cr in runs_per_budget:
        scen = cr.scenario
        ecdf = self._plot_ecdf(
            cr.default,
            cr.incumbent,
            cr.epm_runhistory,
            scen.train_insts,
            scen.test_insts,
            scen.cutoff,
            cr.output_dir,
        )
        self.result[budget_labels[cr.budget]] = ecdf
def __init__(self, runscontainer):
    """Store the budget-wise runs and the list of formatted budget names.

    Parameters
    ----------
    runscontainer: RunsContainer
        contains all important information about the configurator runs
    """
    super().__init__(runscontainer)
    container = self.runscontainer

    self.runs = container.get_aggregated(keep_budgets=True, keep_folders=False)

    formatted = format_budgets(container.get_budgets(), allow_whitespace=True)
    self.budget_names = list(formatted.values())
    self.logger.debug("Budget names: %s", str(self.budget_names))

    # To be set
    self.dataframe = None
def parameter_importance(self, modus):
    """Run one parameter-importance method on every budget-wise run.

    For each run the outcome is stored in ``self.result`` under the
    formatted budget name. The evaluated importance and the evaluator are
    additionally stored in the run's ``share_information`` so that other
    analyzers can reuse them.

    Parameters
    ----------
    modus: str
        modus for parameter importance, from
        [forward-selection, ablation, fanova, lpi]
    """
    runs_by_budget = self.runscontainer.get_aggregated(True, False)
    formatted_budgets = format_budgets(self.runscontainer.get_budgets(),
                                       allow_whitespace=True)

    for run in runs_by_budget:
        self.logger.info("... parameter importance {} on {}".format(
            modus, run.get_identifier()))
        budget_name = formatted_budgets[run.budget]
        if budget_name not in self.result:
            self.result[budget_name] = OrderedDict()

        n_configs = len(run.original_runhistory.get_all_configs())
        n_params = len(run.scenario.cs.get_hyperparameters())
        if n_configs < n_params:
            # Fewer evaluated configs than parameters: skip the analysis and
            # report why. The placeholders have to be filled in explicitly,
            # otherwise the user sees literal '{}' in the report.
            self.result[budget_name] = {
                'else': "For this run there are only {} configs, "
                        "but {} parameters. No reliable parameter importance analysis "
                        "can be performed.".format(n_configs, n_params)
            }
            continue

        try:
            run.pimp.evaluate_scenario([modus], run.output_dir)
        except RuntimeError as e:
            err = "Encountered error '{}' for '{}' in '{}', (for fANOVA this can e.g. happen with too few data-points).".format(
                e, run.get_identifier(), modus)
            self.logger.info(err, exc_info=1)
            self.result[budget_name][modus + '_error'] = err
            continue

        individual_result = self.postprocess(run.pimp, run.output_dir)
        self.result[budget_name] = individual_result

        # Share with other analysis-methods
        run.share_information['parameter_importance'][
            modus] = run.pimp.evaluator.evaluated_parameter_importance
        run.share_information['evaluators'][modus] = run.pimp.evaluator
def __init__(self, runscontainer):
    """Create one scatterplot per budget and store it in ``self.result``.

    Creates a scatterplot of the two configurations (default and incumbent)
    on the given set of instances. Saves plot to file.

    Budgets and budget-wise runs are paired by position.

    Parameters
    ----------
    runscontainer: RunsContainer
        provides budgets and the runs aggregated per budget
    """
    super().__init__(runscontainer)

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    budgets = self.runscontainer.get_budgets()
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for b, cr in zip(budgets, runs_per_budget):
        scen = cr.scenario
        scatter = self._plot_scatter(
            default=cr.default,
            incumbent=cr.incumbent,
            rh=cr.epm_runhistory,
            train=scen.train_insts,
            test=scen.test_insts,
            run_obj=scen.run_obj,
            cutoff=scen.cutoff,
            output_dir=cr.output_dir,
        )
        self.result[budget_labels[b]] = scatter
def _preprocess(self):
    """Fill ``self.data`` with the preprocessed data of every budget.

    Budgets and budget-wise runs are paired by position; the data is keyed
    by the formatted budget name.

    Raises
    ------
    ValueError
        if ``self.data`` is already populated
    """
    # Guard: preprocessing twice is not supported
    if self.data:
        raise ValueError(
            "Data seems to be already initialized, undefined behaviour.")
    self.data = OrderedDict()

    budget_labels = format_budgets(self.runscontainer.get_budgets())
    budgets = self.runscontainer.get_budgets()
    runs_per_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                        keep_folders=False)
    for b, cr in zip(budgets, runs_per_budget):
        preprocessed = self._preprocess_budget(
            original_rh=cr.original_runhistory,
            validated_rh=cr.validated_runhistory,
            validator=cr.validator,
            scenario=cr.scenario,
            default=cr.default,
            incumbent=cr.incumbent,
            param_imp=cr.share_information["parameter_importance"],
            output_dir=cr.output_dir,
            cs=cr.scenario.cs,
            runtime=(cr.scenario.run_obj == 'runtime'),
        )
        self.data[budget_labels[b]] = preprocessed
def __init__(
        self,
        scenario,
        output_dir,
        rh: RunHistory,
        runs: List[ConfiguratorRun],
        block_epm: bool = False,
        bohb_results=None,
        average_over_runs: bool = True,
        output_fn: str = "performance_over_time.png",
        validator: Union[None, Validator] = None,
        cot_inc_traj='racing',
):
    """Store the settings for the cost-over-time plot.

    Plot performance over time, using all trajectory entries where
    max_time = max(wallclock_limit, the highest recorded time)

    Parameters
    ----------
    scenario: smac.scenario.scenario.Scenario
        scenario object with necessary information
    output_dir: str
        output-directory for smac-object
    rh: smac.runhistory.runhistory.RunHistory
        runhistory to use
    runs: List[ConfiguratorRun]
        list of configurator-runs
    block_epm: bool
        if block_epm, only use given runs to estimate cost
    bohb_results:
        if truthy, the budgets to format are read from
        ``bohb_results[0].HB_config['budgets']``
    average_over_runs: bool
        if True, average over plots. if False, all runs are treated
        individually with checkboxes
    output_fn: str
        path to output-png for this analysis
    validator: Validator or None
        if given, use this epm to estimate costs for the individual
        incumbents (EPM)
    cot_inc_traj: str
        from ['racing', 'minimum', 'prefer_higher_budget'], defines
        incumbent trajectory from hpbandster result
    """
    logger_name = '.'.join((self.__module__, self.__class__.__name__))
    self.logger = logging.getLogger(logger_name)

    # Data to analyze
    self.scenario = scenario
    self.output_dir = output_dir
    self.rh = rh
    self.runs = runs
    self.bohb_results = bohb_results

    # Plot / estimation settings
    self.block_epm = block_epm
    self.average_over_runs = average_over_runs
    self.output_fn = output_fn
    self.validator = validator
    self.cot_inc_traj = cot_inc_traj

    self.logger.debug('Initialized CostOverTime with %d runs, output to "%s"',
                      len(self.runs), self.output_dir)

    # TODO to be replaced by base restruct
    if self.bohb_results:
        first_result = self.bohb_results[0]
        self.formatted_budgets = format_budgets(first_result.HB_config['budgets'])

    # Will be set during execution: list with paths to '.png's
    self.plots = []
def hpbandster2smac(self, folder2result, cs_options, output_dir: str):
    """Reading hpbandster-result-object and creating RunHistory and trajectory...

    Treats each budget as an individual 'smac'-run, creates an
    output-directory with subdirectories for each budget.

    Parameters
    ----------
    folder2result: Dict(str : hpbandster.core.result.Result)
        folder mapping to bohb's result-objects
    cs_options: list[ConfigurationSpace]
        the configuration spaces. in the best case it's a single element, but
        for pcs-format we need to guess through a list of possible configspaces
    output_dir: str
        the output-dir to save the smac-runs to

    Returns
    -------
    folder2budgets: dict(dict(str) - str)
        maps each folder (from parallel execution) to a dict, which in turn
        maps all budgets of the specific parallel execution to their paths

    Raises
    ------
    ValueError
        if a configuration cannot be loaded with any of the configspaces in
        ``cs_options``
    """
    folder2budgets = OrderedDict()
    self.logger.debug("Loading with %d configspace alternative options...",
                      len(cs_options))
    self.logger.info(
        "Assuming BOHB treats target algorithms as deterministic (and does not re-evaluate)"
    )
    for folder, result in folder2result.items():
        folder2budgets[folder] = OrderedDict()
        self.logger.debug("Budgets for '%s': %s", folder,
                          str(result.HB_config['budgets']))

        ##########################
        # 1. Create runhistory   #
        ##########################
        id2config_mapping = result.get_id2config_mapping()
        skipped = {'None': 0, 'NaN': 0}
        budget2rh = OrderedDict()
        for run in result.get_all_runs():
            # Choose runhistory to add run to
            if run.budget not in budget2rh:
                budget2rh[run.budget] = RunHistory(average_cost)
            rh = budget2rh[run.budget]

            # Load config... try the first remaining configspace and drop it
            # when it fails, until one works or none is left
            config = None
            while config is None:
                if len(cs_options) == 0:
                    self.logger.debug("None of the alternatives worked...")
                    raise ValueError(
                        "Your configspace seems to be corrupt. If you use floats (or mix up ints, bools and strings) as categoricals, "
                        "please consider using the .json-format, as the .pcs-format cannot recover the type "
                        "of categoricals. Otherwise please report this to "
                        "https://github.com/automl/CAVE/issues (and attach the debug.log)"
                    )
                try:
                    config = self._get_config(run.config_id, id2config_mapping,
                                              cs_options[0])
                except ValueError:
                    # The count is passed as a logging argument: the former
                    # `"..." % len(cs_options) - 1` was parsed as
                    # `("..." % len(cs_options)) - 1` and raised a TypeError
                    # right here, so the alternatives were never tried.
                    self.logger.debug(
                        "Loading configuration failed... trying %d alternatives",
                        len(cs_options) - 1,
                        exc_info=1)
                    cs_options = cs_options[1:]  # remove the failing cs-version

            # Filter corrupted loss-values (ignore them)
            if run.loss is None:
                skipped['None'] += 1
                continue
            if np.isnan(run.loss):
                skipped['NaN'] += 1
                continue

            rh.add(config=config,
                   cost=run.loss,
                   time=run.time_stamps['finished'] - run.time_stamps['started'],
                   status=StatusType.SUCCESS,
                   seed=0,
                   additional_info={
                       'info': run.info,
                       'timestamps': run.time_stamps
                   })

        self.logger.debug(
            "Skipped %d None- and %d NaN-loss-values in BOHB-result",
            skipped['None'], skipped['NaN'])

        ##########################
        # 2. Create all else     #
        ##########################
        # Make budget-names readable [0.021311, 0.031211] to [0.02, 0.03]
        formatted_budgets = format_budgets(budget2rh.keys())
        for b, rh in budget2rh.items():
            output_path = os.path.join(output_dir, folder, formatted_budgets[b])
            folder2budgets[folder][b] = output_path

            scenario = Scenario({
                'run_obj': 'quality',
                'cs': cs_options[0],
                'output_dir': output_dir,
                # At the time of writing, BOHB is always treating ta's as deterministic
                'deterministic': True,
            })
            scenario.output_dir_for_this_run = output_path
            scenario.write()

            with open(os.path.join(output_path, 'configspace.json'), 'w') as fh:
                fh.write(pcs_json.write(cs_options[0]))

            rh.save_json(fn=os.path.join(output_path, 'runhistory.json'))

            self.get_trajectory(folder2result[folder], output_path, scenario,
                                rh, budget=b)

    return folder2budgets
def parameter_importance(self, modus):
    """Run one parameter-importance method on every budget-wise run.

    The per-budget outcome is stored in
    ``self.result['Importances Per Parameter']`` (which is reset on every
    call). Importance values and evaluators are also stored in each run's
    ``share_information``. If the whisker-quantiles plot is enabled, the
    importance is additionally evaluated for every (folder, budget)
    combination and stored in ``self.importance_per_budget``.

    Parameters
    ----------
    modus: str
        modus for parameter importance, from
        [forward-selection, ablation, fanova, lpi]
    """
    runs_by_budget = self.runscontainer.get_aggregated(keep_budgets=True,
                                                       keep_folders=False)
    formatted_budgets = format_budgets(self.runscontainer.get_budgets(),
                                       allow_whitespace=True)

    self.result['Importances Per Parameter'] = {}
    result = self.result['Importances Per Parameter']

    # Budget names and budget-wise runs are paired by position
    for budget, run in zip(formatted_budgets.values(), runs_by_budget):
        self.logger.info("... parameter importance {} on {}".format(
            modus, run.get_identifier()))
        if budget not in result:
            result[budget] = OrderedDict()

        n_configs = len(run.original_runhistory.get_all_configs())
        n_params = len(run.scenario.cs.get_hyperparameters())
        if n_configs < n_params:
            result[budget] = {
                'else': "For this run there are only {} configs, "
                        "but {} parameters. No reliable parameter importance analysis "
                        "can be performed.".format(n_configs, n_params)
            }
            continue

        try:
            run.pimp.evaluate_scenario([modus], run.output_dir)
        except RuntimeError as e:
            err = "Encountered error '{}' for '{}' in '{}', (for fANOVA this can e.g. happen with too few " \
                  "data-points).".format(e, run.get_identifier(), modus)
            self.logger.info(err, exc_info=1)
            result[budget][modus + '_error'] = err
            continue

        individual_result = self.postprocess(run.pimp, run.output_dir)
        result[budget] = individual_result

        # Interactive Plots
        if self.runscontainer.analyzing_options[
                'Parameter Importance'].getboolean('interactive_bokeh_plots'):
            try:
                result[budget]['Interactive Plots'] = {
                    'bokeh': components(
                        run.pimp.evaluator.plot_bokeh(show_plot=False))
                }
            except AttributeError as err:
                # Best effort: an AttributeError here only costs the
                # interactive plot
                self.logger.debug(err, exc_info=1)

        # Share with other analysis-methods
        run.share_information['parameter_importance'][
            modus] = run.pimp.evaluator.evaluated_parameter_importance
        run.share_information['evaluators'][modus] = run.pimp.evaluator

    if self.runscontainer.analyzing_options[
            'Parameter Importance'].getboolean('whisker_quantiles_plot'):
        if len(self.runscontainer.get_budgets()) <= 1 and len(
                self.runscontainer.get_folders()) <= 1:
            # A space was missing between the two string fragments
            # ("multiplebudgets")
            self.logger.info(
                "The Whisker-Quantiles Plot for Parameter Importance makes only sense with multiple "
                "budgets and/or folders, but not with only one budget and one folder."
            )
            # Switch the option off so it is not attempted again
            self.runscontainer.analyzing_options.set(
                'Parameter Importance', 'whisker_quantiles_plot', 'False')
            self.importance_per_budget = None
            return

        hyperparameters = self.runscontainer.scenario.cs.get_hyperparameter_names()

        # Generate data - for each parallel folder and each budget, perform an
        # importance-analysis: dict[budget][param_name][folder] -> float
        importance_per_budget = OrderedDict()
        for budget in self.runscontainer.get_budgets():
            importance_per_budget[budget] = {hp: {} for hp in hyperparameters}
            for folder in self.runscontainer.get_folders():
                cr = self.runscontainer.get_run(folder, budget)
                try:
                    importance = cr.pimp.evaluate_scenario(
                        [modus],
                        cr.output_dir,
                        plot_pyplot=False,
                        plot_bokeh=False)[0][modus]['imp']
                except RuntimeError as e:
                    # No importance available: every parameter becomes NaN below
                    importance = {}
                    err = "Encountered error '{}' for '{}' in '{}', (for fANOVA this can e.g. happen with too " \
                          "few data-points).".format(e, cr.get_identifier(), modus)
                    self.logger.debug(err, exc_info=1)
                    self.logger.error(err)
                self.logger.debug("Importance for folder %s: %s", folder,
                                  importance)
                for hp in hyperparameters:
                    importance_per_budget[budget][hp][folder] = importance.pop(
                        hp, np.nan)
        self.importance_per_budget = importance_per_budget
def hpbandster2smac(self,
                    folder2result,
                    cs: ConfigurationSpace,
                    backup_cs,
                    output_dir: str):
    """Reading hpbandster-result-object and creating RunHistory and trajectory...

    Treats each budget as an individual 'smac'-run, creates an
    output-directory with subdirectories for each budget. The runs of all
    folders are merged into one runhistory per budget.

    Parameters
    ----------
    folder2result: Dict(str : hpbandster.core.result.Result)
        folder mapping to bohb's result-objects
    cs: ConfigurationSpace
        the configuration space
    backup_cs: List[ConfigurationSpace]
        if loading a configuration fails, try configspaces from this list
        until succeed
    output_dir: str
        the output-dir to save the smac-runs to

    Returns
    -------
    budget2path: OrderedDict
        maps each budget to the path of its output-directory

    Raises
    ------
    ValueError
        if a configuration can neither be loaded with ``cs`` nor with any
        entry of ``backup_cs``
    """
    # Create runhistories (one per budget)
    budget2rh = OrderedDict()
    for folder, bohb_result in folder2result.items():
        budgets_repr = str(bohb_result.HB_config['budgets'])
        self.logger.debug("Budgets for '%s': %s" % (folder, budgets_repr))
        id2config_mapping = bohb_result.get_id2config_mapping()
        skipped = {'None': 0, 'NaN': 0}

        for bohb_run in bohb_result.get_all_runs():
            budget = bohb_run.budget
            if budget not in budget2rh:
                budget2rh[budget] = RunHistory(average_cost)
            rh = budget2rh[budget]

            # Load config...
            try:
                config = self._get_config(bohb_run.config_id, id2config_mapping, cs)
            except ValueError:
                self.logger.debug(
                    "Loading configuration failed... trying alternatives",
                    exc_info=1)
                for alternative in backup_cs:
                    try:
                        config = self._get_config(bohb_run.config_id,
                                                  id2config_mapping,
                                                  alternative)
                    except ValueError:
                        self.logger.debug("", exc_info=1)
                    else:
                        # From now on keep using the configspace that worked
                        cs = alternative
                        break
                else:
                    self.logger.debug("None of the alternatives worked...")
                    raise ValueError(
                        "Your configspace seems to be corrupt. If you use floats (or mix up ints, bools and strings) as categoricals, "
                        "please consider using the .json-format, as the .pcs-format cannot recover the type "
                        "of categoricals. Otherwise please report this to "
                        "https://github.com/automl/CAVE/issues (and attach the debug.log)"
                    )

            # Runs without a usable loss are counted and ignored
            loss = bohb_run.loss
            if loss is None:
                skipped['None'] += 1
                continue
            if np.isnan(loss):
                skipped['NaN'] += 1
                continue

            stamps = bohb_run.time_stamps
            rh.add(config=config,
                   cost=loss,
                   time=stamps['finished'] - stamps['started'],
                   status=StatusType.SUCCESS,
                   seed=0,
                   additional_info={
                       'info': bohb_run.info,
                       'timestamps': stamps
                   })

        self.logger.debug(
            "Skipped %d None- and %d NaN-loss-values in BOHB-result",
            skipped['None'], skipped['NaN'])

    # Write to disk
    budget2path = OrderedDict()  # paths to individual budgets
    self.logger.info(
        "Assuming BOHB treats target algorithms as deterministic (and does not re-evaluate)"
    )
    formatted_budgets = format_budgets(budget2rh.keys())
    for b, rh in budget2rh.items():
        output_path = os.path.join(output_dir, formatted_budgets[b])
        budget2path[b] = output_path

        scenario_options = {
            'run_obj': 'quality',
            'cs': cs,
            'output_dir': output_dir,
            # At the time of writing, BOHB is always treating ta's as deterministic
            'deterministic': True,
        }
        scenario = Scenario(scenario_options)
        scenario.output_dir_for_this_run = output_path
        scenario.write()

        configspace_path = os.path.join(output_path, 'configspace.json')
        with open(configspace_path, 'w') as fh:
            fh.write(pcs_json.write(cs))

        runhistory_path = os.path.join(output_path, 'runhistory.json')
        rh.save_json(fn=runhistory_path)

        self.get_trajectory(folder2result, output_path, scenario, rh, budget=b)

    return budget2path