def _save_results(self, rh: RunHistory, output_fn, backup_fn=None): """ Helper to save results to file Parameters ---------- rh: RunHistory runhistory to save output_fn: str if ends on '.json': filename to save history to else: directory to save runhistory to (filename is backup_fn) backup_fn: str if output_fn does not end on '.json', treat output_fn as dir and append backup_fn as filename (if output_fn ends on '.json', this argument is ignored) """ if output_fn == "": self.logger.info( "No output specified, validated runhistory not saved.") return # Check if a folder or a file is specified as output if not output_fn.endswith('.json'): output_dir = output_fn output_fn = os.path.join(output_dir, backup_fn) self.logger.debug("Output is \"%s\", changing to \"%s\"!", output_dir, output_fn) base = os.path.split(output_fn)[0] if not base == "" and not os.path.exists(base): self.logger.debug("Folder (\"%s\") doesn't exist, creating.", base) os.makedirs(base) rh.save_json(output_fn) self.logger.info("Saving validation-results in %s", output_fn)
def merge_foreign_data(scenario: Scenario, runhistory: RunHistory,
                       in_scenario_list: typing.List[Scenario],
                       in_runhistory_list: typing.List[RunHistory]):
    """Extend ``scenario`` and ``runhistory`` with runhistory data from other
    scenarios, assuming the same pcs and feature space but different instances.

    Parameters
    ----------
    scenario: Scenario
        original scenario -- feature dictionary will be extended
    runhistory: RunHistory
        original runhistory -- will be extended by further data points
    in_scenario_list: typing.List[Scenario]
        input scenarios
    in_runhistory_list: typing.List[RunHistory]
        list of runhistories wrt <in_scenario>

    Returns
    -------
    scenario: Scenario
    runhistory: RunHistory
    """
    # Validate compatibility of every foreign scenario, then merge its
    # instance features into the original scenario.
    for foreign_scen in in_scenario_list:
        if scenario.n_features != foreign_scen.n_features:
            raise ValueError(
                "Feature Space has to be the same for both scenarios (%d vs %d)."
                % (scenario.n_features, foreign_scen.n_features))
        if scenario.cs != foreign_scen.cs:
            raise ValueError("PCS of both scenarios have to be identical.")
        if scenario.cutoff != foreign_scen.cutoff:
            raise ValueError("Cutoffs of both scenarios have to be identical.")
        scenario.feature_dict.update(foreign_scen.feature_dict)

    # Pull all foreign run data into the original runhistory, tagged as
    # external data on different instances.
    for foreign_rh in in_runhistory_list:
        runhistory.update(foreign_rh,
                          origin=DataOrigin.EXTERNAL_DIFFERENT_INSTANCES)

    # Every run in the merged history must have instance features available.
    for run_key in runhistory.data:
        if scenario.feature_dict.get(run_key.instance_id) is None:
            raise ValueError(
                "Instance feature for \"%s\" was not found in scenario data."
                % (run_key.instance_id))

    runhistory.compute_all_costs(instances=scenario.train_insts)
    return scenario, runhistory
def merge_foreign_data_from_file(
        scenario: Scenario,
        runhistory: RunHistory,
        in_scenario_fn_list: typing.List[str],
        in_runhistory_fn_list: typing.List[str],
        cs: ConfigurationSpace,
        aggregate_func: typing.Callable = average_cost):
    """Extend ``scenario`` and ``runhistory`` with runhistory data loaded from
    files of other scenarios, assuming the same pcs and feature space but
    different instances.

    Parameters
    ----------
    scenario: Scenario
        original scenario -- feature dictionary will be extended
    runhistory: RunHistory
        original runhistory -- will be extended by further data points
    in_scenario_fn_list: typing.List[str]
        input scenario file names
    in_runhistory_fn_list: typing.List[str]
        list of filenames of runhistory dumps
    cs: ConfigurationSpace
        parameter configuration space to read runhistory from file
    aggregate_func: typing.Callable
        function to aggregate performance of a configuration across instances

    Returns
    -------
    scenario: Scenario
    runhistory: RunHistory
    """
    if not in_scenario_fn_list:
        raise ValueError(
            "To read warmstart data from previous runhistories, the corresponding scenarios are required. Use option --warmstart_scenario"
        )

    # Load every scenario file; suppress output-directory creation.
    loaded_scenarios = [
        Scenario(scenario=fn, cmd_options={"output_dir": ""})
        for fn in in_scenario_fn_list
    ]

    # Deserialize each runhistory dump against the given config space.
    loaded_histories = []
    for history_fn in in_runhistory_fn_list:
        history = RunHistory(aggregate_func, file_system=scenario.file_system)
        history.load_json(history_fn, cs)
        loaded_histories.append(history)

    return merge_foreign_data(scenario, runhistory,
                              in_scenario_list=loaded_scenarios,
                              in_runhistory_list=loaded_histories)
def _cost(config: Configuration, run_history: RunHistory,
          instance_seed_pairs=None):
    """Return array of all costs for the given config for further calculations.

    Parameters
    ----------
    config : Configuration
        Configuration to calculate objective for
    run_history : RunHistory
        RunHistory object from which the objective value is computed.
    instance_seed_pairs : list, optional (default=None)
        List of tuples of instance-seeds pairs. If None, the run_history is
        queried for all runs of the given configuration.

    Returns
    -------
    Costs: list
        Array of all costs
    """
    try:
        config_id = run_history.config_ids[config]
    except KeyError:
        # challenger was not running so far
        return []

    if instance_seed_pairs is None:
        instance_seed_pairs = run_history.get_runs_for_config(config)

    # One cost entry per recorded (instance, seed) pair.
    return [run_history.data[RunKey(config_id, inst, seed)].cost
            for inst, seed in instance_seed_pairs]
def _get_mean_costs(self, incs: typing.List[Configuration], new_rh: RunHistory): """ Compute mean cost per instance Parameters ---------- incs : typing.List[Configuration] incumbents determined by all parallel SMAC runs new_rh : RunHistory runhistory to determine mean performance Returns ------- List[float] means Dict(Config -> Dict(inst_id(str) -> float)) """ config_cost_per_inst = {} results = [] for incumbent in incs: cost_per_inst = new_rh.get_instance_costs_for_config( config=incumbent) config_cost_per_inst[incumbent] = cost_per_inst results.append(np.mean(list(cost_per_inst.values()))) return results, config_cost_per_inst
def read(run_history: RunHistory,
         output_dirs: typing.Union[str, typing.List[str]],
         configuration_space: ConfigurationSpace,
         logger: logging.Logger):
    """Update runhistory with run results from concurrent runs of pSMAC.

    Scans each output directory for files whose names match the runhistory
    (or validated-runhistory) pattern and merges their contents into
    ``run_history`` in place.

    Parameters
    ----------
    run_history : dsmac.runhistory.RunHistory
        RunHistory object to be updated with run information from runhistory
        objects stored in the output directory.
    output_dirs : typing.Union[str,typing.List[str]]
        List of SMAC output directories or Linux path expression (str) which
        will be casted into a list with file_system.glob(). This function will
        search the output directories for files matching the runhistory
        regular expression.
    configuration_space : ConfigSpace.ConfigurationSpace
        A ConfigurationSpace object to check if loaded configurations are valid.
    logger : logging.Logger
    """
    # Remember the starting size so we can report how many runs were added.
    numruns_in_runhistory = len(run_history.data)
    initial_numruns_in_runhistory = numruns_in_runhistory
    file_system = run_history.file_system
    if isinstance(output_dirs, str):
        # A string is treated as a glob expression; additionally descend
        # into "run_*" subdirectories if any exist beneath it.
        parsed_output_dirs = file_system.glob(output_dirs)
        if file_system.glob(run_history.file_system.join(output_dirs, "run_*")):
            parsed_output_dirs += file_system.glob(
                file_system.join(output_dirs, "run_*"))
    else:
        parsed_output_dirs = output_dirs
    for output_directory in parsed_output_dirs:
        for file_in_output_directory in file_system.listdir(output_directory):
            # Accept both plain and validated runhistory dumps.
            match = re.match(RUNHISTORY_RE, file_in_output_directory)
            valid_match = re.match(VALIDATEDRUNHISTORY_RE,
                                   file_in_output_directory)
            if match or valid_match:
                # NOTE(review): last_id is never used after this lookup;
                # also assumes output_directory is already a key of
                # PSMAC_VALUE.dir2id -- TODO confirm how dir2id is seeded.
                last_id = PSMAC_VALUE.dir2id[output_directory]
                runhistory_file = file_system.join(output_directory,
                                                   file_in_output_directory)
                # update_from_json returns the updated id-set, which is
                # stored back so already-seen runs are skipped next time.
                updated_id_set = run_history.update_from_json(
                    runhistory_file,
                    configuration_space,
                    id_set=PSMAC_VALUE.dir2id[output_directory],
                    file_system=run_history.file_system)
                PSMAC_VALUE.dir2id[output_directory] = updated_id_set
                # print(PSMAC_VALUE.dir2id)
                new_numruns_in_runhistory = len(run_history.data)
                difference = new_numruns_in_runhistory - numruns_in_runhistory
                logger.debug('Shared model mode: Loaded %d new runs from %s' %
                             (difference, runhistory_file))
                numruns_in_runhistory = new_numruns_in_runhistory
    # Summarize the total number of runs gained across all directories.
    difference = numruns_in_runhistory - initial_numruns_in_runhistory
    logger.info(
        'Shared model mode: Finished loading new runs, found %d new runs.' %
        difference)
def write(run_history: RunHistory, output_directory: str,
          logger: logging.Logger):
    """Write the runhistory to the output directory.

    Overwrites previously outputted runhistories.

    Parameters
    ----------
    run_history : ~dsmac.runhistory.runhistory.RunHistory
        RunHistory object to be saved.
    output_directory : str
        Directory the runhistory JSON file is written into.
    logger : logging.Logger
        Logger used to report the save location.
    """
    file_system = run_history.file_system
    output_filename = file_system.join(output_directory,
                                       RUNHISTORY_FILEPATTERN)
    # Fix: use the caller-provided logger; the previous code called the
    # root ``logging`` module directly, silently ignoring ``logger``.
    logger.debug("Saving runhistory to %s" % output_filename)
    run_history.save_json(output_filename, save_external=False)
def restore_state(self, scen, args_):
    """Read in files for state-restoration: runhistory, stats, trajectory."""
    # Derive all expected file locations inside the restore folder.
    restore_dir = args_.restore_state
    rh_path = os.path.join(restore_dir, "runhistory.json")
    stats_path = os.path.join(restore_dir, "stats.json")
    traj_path_aclib = os.path.join(restore_dir, "traj_aclib2.json")
    traj_path_old = os.path.join(restore_dir, "traj_old.csv")
    # NOTE(review): scen_path is computed but never used afterwards.
    scen_path = os.path.join(restore_dir, "scenario.txt")
    if not os.path.isdir(restore_dir):
        raise FileNotFoundError("Could not find folder from which to restore.")

    # Load runhistory and stats
    rh = RunHistory(aggregate_func=None)
    rh.load_json(rh_path, scen.cs)
    self.logger.debug("Restored runhistory from %s", rh_path)
    stats = Stats(scen)
    stats.load(stats_path)
    self.logger.debug("Restored stats from %s", stats_path)

    # Trajectories are returned as raw lines for the caller to parse.
    with open(traj_path_aclib, 'r') as fh:
        traj_list_aclib = fh.readlines()
    with open(traj_path_old, 'r') as fh:
        traj_list_old = fh.readlines()
    return rh, stats, traj_list_aclib, traj_list_old
def _adapt_cutoff(self, challenger: Configuration, incumbent: Configuration, run_history: RunHistory, inc_sum_cost: float): """Adaptive capping: Compute cutoff based on time so far used for incumbent and reduce cutoff for next run of challenger accordingly !Only applicable if self.run_obj_time !runs on incumbent should be superset of the runs performed for the challenger Parameters ---------- challenger : Configuration Configuration which challenges incumbent incumbent : Configuration Best configuration so far run_history : RunHistory Stores all runs we ran so far inc_sum_cost: float Sum of runtimes of all incumbent runs Returns ------- cutoff: int Adapted cutoff """ if not self.run_obj_time: return self.cutoff # cost used by challenger for going over all its runs # should be subset of runs of incumbent (not checked for efficiency # reasons) chall_inst_seeds = run_history.get_runs_for_config(challenger) chal_sum_cost = sum_cost(config=challenger, instance_seed_pairs=chall_inst_seeds, run_history=run_history) cutoff = min(self.cutoff, inc_sum_cost * self.adaptive_capping_slackfactor - chal_sum_cost ) return cutoff
def _add_inc_run(self, incumbent: Configuration, run_history: RunHistory):
    """Add new run for incumbent.

    Schedules at most one additional incumbent run on an instance that
    currently has the fewest incumbent runs, unless the incumbent already
    has maxR runs or no instance-seed pair is left.

    *Side effect:* adds runs to <run_history>

    Parameters
    ----------
    incumbent : Configuration
        best configuration so far
    run_history : RunHistory
        stores all runs we ran so far
    """
    inc_runs = run_history.get_runs_for_config(incumbent)
    # Line 3
    # First evaluate incumbent on a new instance
    if len(inc_runs) < self.maxR:
        while True:
            # Line 4
            # find all instances that have the most runs on the inc
            inc_runs = run_history.get_runs_for_config(incumbent)
            inc_inst = [s.instance for s in inc_runs]
            # Count runs per instance, most-run instances first.
            inc_inst = list(Counter(inc_inst).items())
            inc_inst.sort(key=lambda x: x[1], reverse=True)
            try:
                max_runs = inc_inst[0][1]
            except IndexError:
                # No runs recorded yet for the incumbent at all.
                self.logger.debug("No run for incumbent found")
                max_runs = 0
            # Instances that already have the maximal number of runs are
            # excluded; the rest are candidates for the next run.
            inc_inst = set([x[0] for x in inc_inst if x[1] == max_runs])
            available_insts = (self.instances - inc_inst)
            # if all instances were used n times, we can pick an instances
            # from the complete set again
            if not self.deterministic and not available_insts:
                available_insts = self.instances
            # Line 6 (Line 5 is further down...)
            # Deterministic algorithms always get seed 0; otherwise draw
            # a fresh random seed.
            if self.deterministic:
                next_seed = 0
            else:
                next_seed = self.rs.randint(low=0, high=MAXINT, size=1)[0]
            if available_insts:
                # Line 5 (here for easier code)
                next_instance = self.rs.choice(list(available_insts))
                # Line 7
                self.logger.debug("Add run of incumbent")
                status, cost, dur, res = self.tae_runner.start(
                    config=incumbent,
                    instance=next_instance,
                    seed=next_seed,
                    cutoff=self.cutoff,
                    instance_specific=self.instance_specifics.get(
                        next_instance, "0"))
                # Track target-algorithm time and number of executed runs.
                self._ta_time += dur
                self._num_run += 1
            else:
                self.logger.debug("No further instance-seed pairs for "
                                  "incumbent available.")
                break
            inc_runs = run_history.get_runs_for_config(incumbent)
            # Termination condition; after exactly one run, this checks
            # whether further runs are necessary due to minR
            if len(inc_runs) >= self.minR or len(inc_runs) >= self.maxR:
                break
def intensify(self, challengers: typing.List[Configuration],
              incumbent: Configuration,
              run_history: RunHistory,
              aggregate_func: typing.Callable,
              time_bound: float = float(MAXINT),
              log_traj: bool = True):
    """Running intensification to determine the incumbent configuration.

    *Side effect:* adds runs to run_history

    Implementation of Procedure 2 in Hutter et al. (2011).

    Parameters
    ----------
    challengers : typing.List[Configuration]
        promising configurations
    incumbent : Configuration
        best configuration so far
    run_history : RunHistory
        stores all runs we ran so far
    aggregate_func: typing.Callable
        aggregate error across instances
    time_bound : float, optional (default=2 ** 31 - 1)
        time in [sec] available to perform intensify
    log_traj: bool
        whether to log changes of incumbents in trajectory

    Returns
    -------
    incumbent: Configuration()
        current (maybe new) incumbent configuration
    inc_perf: float
        empirical performance of incumbent configuration
    """
    self.start_time = time.time()
    self._ta_time = 0
    if time_bound < self._min_time:
        raise ValueError("time_bound must be >= %f" % (self._min_time))
    self._num_run = 0
    self._chall_indx = 0
    # Line 1 + 2
    if isinstance(challengers, ChallengerList):
        challengers = challengers.challengers
    # With some probability promote a random challenger from the first L1
    # (or from positions L1..L2) slots to the front of the list.
    # NOTE(review): this uses the module-level numpy RNG (np.random), not
    # a seeded self.rs -- TODO confirm reproducibility is not required here.
    L1 = 10
    L2 = 10000
    r = np.random.uniform()
    if 0.05 < r < 0.2 and len(challengers) > L1:
        target = np.random.randint(1, L1)
        challengers[target], challengers[0] = challengers[0], challengers[
            target]
    if r < 0.05 and len(challengers) > L2:
        target = np.random.randint(L1, L2)
        challengers[target], challengers[0] = challengers[0], challengers[
            target]
    for challenger in challengers:
        # Skip challengers that could not be claimed via the shared DB
        # (presumably another parallel worker took them -- verify).
        if not run_history.db.appointment_config(challenger):
            continue
        if challenger == incumbent:
            self.logger.debug(
                "Challenger was the same as the current incumbent; Skipping challenger"
            )
            continue
        self.logger.debug("Intensify on %s", challenger)
        if hasattr(challenger, 'origin'):
            self.logger.debug("Configuration origin: %s", challenger.origin)
        try:
            # Lines 3-7
            self._add_inc_run(incumbent=incumbent, run_history=run_history)
            # Lines 8-17
            incumbent = self._race_challenger(
                challenger=challenger,
                incumbent=incumbent,
                run_history=run_history,
                aggregate_func=aggregate_func,
                log_traj=log_traj)
            # If the incumbent changed, optionally race it against a fixed
            # reference configuration (e.g. the default).
            if self.always_race_against and \
                    challenger == incumbent and \
                    self.always_race_against != challenger:
                self.logger.debug(
                    "Race against constant configuration after incumbent change."
                )
                incumbent = self._race_challenger(
                    challenger=self.always_race_against,
                    incumbent=incumbent,
                    run_history=run_history,
                    aggregate_func=aggregate_func,
                    log_traj=log_traj)
        except BudgetExhaustedException:
            # We return incumbent, SMBO stops due to its own budget checks
            inc_perf = run_history.get_cost(incumbent)
            self.logger.debug("Budget exhausted; Return incumbent")
            return incumbent, inc_perf
        # Termination checks: only after at least min_chall challengers.
        tm = time.time()
        if self._chall_indx >= self.min_chall:
            if self._num_run > self.run_limit:
                self.logger.debug(
                    "Maximum #runs for intensification reached")
                break
            if not self.use_ta_time_bound and tm - self.start_time - time_bound >= 0:
                self.logger.debug(
                    "Wallclock time limit for intensification reached ("
                    "used: %f sec, available: %f sec)" %
                    (tm - self.start_time, time_bound))
                break
            elif self._ta_time - time_bound >= 0:
                self.logger.debug(
                    "TA time limit for intensification reached ("
                    "used: %f sec, available: %f sec)" %
                    (self._ta_time, time_bound))
                break
    # output estimated performance of incumbent
    inc_runs = run_history.get_runs_for_config(incumbent)
    inc_perf = aggregate_func(incumbent, run_history, inc_runs)
    self.logger.info(
        "Updated estimated cost of incumbent on %d runs: %.4f" %
        (len(inc_runs), inc_perf))
    self.stats.update_average_configs_per_intensify(
        n_configs=self._chall_indx)
    return incumbent, inc_perf
def __init__(
        self,
        scenario: Scenario,
        tae_runner: Optional[Union[Type[ExecuteTARun], Callable]] = None,
        tae_runner_kwargs: Optional[dict] = None,
        runhistory: Optional[Union[Type[RunHistory], RunHistory]] = None,
        runhistory_kwargs: Optional[dict] = None,
        intensifier: Optional[Type[Intensifier]] = None,
        intensifier_kwargs: Optional[dict] = None,
        acquisition_function: Optional[
            Type[AbstractAcquisitionFunction]] = None,
        acquisition_function_kwargs: Optional[dict] = None,
        integrate_acquisition_function: bool = False,
        acquisition_function_optimizer: Optional[
            Type[AcquisitionFunctionMaximizer]] = None,
        acquisition_function_optimizer_kwargs: Optional[dict] = None,
        model: Optional[Type[AbstractEPM]] = None,
        model_kwargs: Optional[dict] = None,
        runhistory2epm: Optional[Type[AbstractRunHistory2EPM]] = None,
        runhistory2epm_kwargs: Optional[dict] = None,
        initial_design: Optional[Type[InitialDesign]] = None,
        initial_design_kwargs: Optional[dict] = None,
        initial_configurations: Optional[List[Configuration]] = None,
        stats: Optional[Stats] = None,
        restore_incumbent: Optional[Configuration] = None,
        rng: Optional[Union[np.random.RandomState, int]] = None,
        smbo_class: Optional[SMBO] = None,
        run_id: Optional[int] = None,
        random_configuration_chooser: Optional[
            Type[RandomConfigurationChooser]] = None,
        random_configuration_chooser_kwargs: Optional[dict] = None,
):
    """Constructor: wires up all SMBO components (runhistory, model,
    acquisition function and optimizer, TAE runner, intensifier, initial
    design, runhistory2epm) from the scenario and the given overrides,
    then instantiates the solver.

    Parameters
    ----------
    scenario : ~dsmac.scenario.scenario.Scenario
        Scenario object
    tae_runner : ~dsmac.tae.execute_ta_run.ExecuteTARun or callable
        Callable or implementation of
        :class:`~dsmac.tae.execute_ta_run.ExecuteTARun`. In case a
        callable is passed it will be wrapped by
        :class:`~dsmac.tae.execute_func.ExecuteTAFuncDict`.
        If not set, it will be initialized with the
        :class:`~dsmac.tae.execute_ta_run_old.ExecuteTARunOld`.
    tae_runner_kwargs: Optional[dict]
        arguments passed to constructor of '~tae_runner'
    runhistory : RunHistory
        runhistory to store all algorithm runs
    runhistory_kwargs : Optional[dict]
        arguments passed to constructor of runhistory.
        We strongly advise against changing the aggregation function,
        since it will break some code assumptions
    intensifier : Intensifier
        intensification object to issue a racing to decide the current
        incumbent
    intensifier_kwargs: Optional[dict]
        arguments passed to the constructor of '~intensifier'
    acquisition_function : ~dsmac.optimizer.acquisition.AbstractAcquisitionFunction
        Class or object that implements the
        :class:`~dsmac.optimizer.acquisition.AbstractAcquisitionFunction`.
        Will use :class:`~dsmac.optimizer.acquisition.EI` or
        :class:`~dsmac.optimizer.acquisition.LogEI` if not set.
        `~acquisition_function_kwargs` is passed to the class constructor.
    acquisition_function_kwargs : Optional[dict]
        dictionary to pass specific arguments to ~acquisition_function
    integrate_acquisition_function : bool, default=False
        Whether to integrate the acquisition function. Works only with
        models which can sample their hyperparameters
        (i.e. GaussianProcessMCMC).
    acquisition_function_optimizer : ~dsmac.optimizer.ei_optimization.AcquisitionFunctionMaximizer
        Object that implements the
        :class:`~dsmac.optimizer.ei_optimization.AcquisitionFunctionMaximizer`.
        Will use
        :class:`dsmac.optimizer.ei_optimization.InterleavedLocalAndRandomSearch`
        if not set.
    acquisition_function_optimizer_kwargs: Optional[dict]
        Arguments passed to constructor of '~acquisition_function_optimizer'
    model : AbstractEPM
        Model that implements train() and predict(). Will use a
        :class:`~dsmac.epm.rf_with_instances.RandomForestWithInstances`
        if not set.
    model_kwargs : Optional[dict]
        Arguments passed to constructor of '~model'
    runhistory2epm : ~dsmac.runhistory.runhistory2epm.RunHistory2EMP
        Object that implements the AbstractRunHistory2EPM. If None,
        will use :class:`~dsmac.runhistory.runhistory2epm.RunHistory2EPM4Cost`
        if objective is cost or
        :class:`~dsmac.runhistory.runhistory2epm.RunHistory2EPM4LogCost`
        if objective is runtime.
    runhistory2epm_kwargs: Optional[dict]
        Arguments passed to the constructor of '~runhistory2epm'
    initial_design : InitialDesign
        initial sampling design
    initial_design_kwargs: Optional[dict]
        arguments passed to constructor of `~initial_design'
    initial_configurations : List[Configuration]
        list of initial configurations for initial design --
        cannot be used together with initial_design
    stats : Stats
        optional stats object
    rng : np.random.RandomState
        Random number generator
    restore_incumbent : Configuration
        incumbent used if restoring to previous state
    smbo_class : ~dsmac.optimizer.smbo.SMBO
        Class implementing the SMBO interface which will be used to
        instantiate the optimizer class.
    run_id : int (optional)
        Run ID will be used as subfolder for output_dir. If no ``run_id``
        is given, a random ``run_id`` will be chosen.
    random_configuration_chooser : ~dsmac.optimizer.random_configuration_chooser.RandomConfigurationChooser
        How often to choose a random configuration during the
        intensification procedure.
    random_configuration_chooser_kwargs : Optional[dict]
        arguments of constructor for '~random_configuration_chooser'
    """
    self.logger = logging.getLogger(self.__module__ + "." +
                                    self.__class__.__name__)
    aggregate_func = average_cost
    self.scenario = scenario
    self.output_dir = ""
    if not restore_incumbent:
        # restore_incumbent is used by the CLI interface which provides a method for restoring a SMAC run given an
        # output directory. This is the default path.
        # initial random number generator
        # NOTE(review): get_rng is commented out on this path, so a
        # caller-supplied ``rng`` is used as-is and may still be None/int
        # when ``rng.randint`` is called further down -- TODO confirm
        # callers always pass a RandomState here.
        # run_id, rng = get_rng(rng=rng, run_id=run_id, logger=self.logger)
        # run_id=datetime.now().strftime("%Y%m%d%H%M%S%f")
        run_id = uuid1()
        # self.output_dir = create_output_directory(scenario, run_id)
        # fixme run_id
        self.output_dir = scenario.output_dir  # create_output_directory(scenario, run_id) # fixme run_id
    elif scenario.output_dir is not None:
        run_id, rng = get_rng(rng=rng, run_id=run_id, logger=self.logger)
        # output-directory is created in CLI when restoring from a
        # folder. calling the function again in the facade results in two
        # folders being created: run_X and run_X.OLD. if we are
        # restoring, the output-folder exists already and we omit creating it,
        # but set the self-output_dir to the dir.
        # necessary because we want to write traj to new output-dir in CLI.
        self.output_dir = scenario.output_dir_for_this_run
    # Deterministic quality scenarios without a tuner timeout need only one
    # evaluation per configuration; disable most of the intensification.
    if (scenario.deterministic is True
            and getattr(scenario, 'tuner_timeout', None) is None
            and scenario.run_obj == 'quality'):
        self.logger.info(
            'Optimizing a deterministic scenario for quality without a tuner timeout - will make '
            'SMAC deterministic and only evaluate one configuration per iteration!'
        )
        scenario.intensification_percentage = 1e-10
        scenario.min_chall = 1
    scenario.write()
    # initialize stats object
    if stats:
        self.stats = stats
    else:
        self.stats = Stats(scenario, file_system=scenario.file_system)
    # Runtime objectives are modelled in log space.
    if self.scenario.run_obj == "runtime" and not self.scenario.transform_y == "LOG":
        self.logger.warning(
            "Runtime as objective automatically activates log(y) transformation"
        )
        self.scenario.transform_y = "LOG"
    # initialize empty runhistory
    runhistory_def_kwargs = {'aggregate_func': aggregate_func}
    if runhistory_kwargs is not None:
        runhistory_def_kwargs.update(runhistory_kwargs)
    if runhistory is None:
        runhistory = RunHistory(**runhistory_def_kwargs,
                                file_system=scenario.file_system,
                                db_type=scenario.db_type,
                                db_args=scenario.db_args,
                                db_kwargs=scenario.db_kwargs)
    elif inspect.isclass(runhistory):
        runhistory = runhistory(**runhistory_def_kwargs)
    else:
        if runhistory.aggregate_func is None:
            runhistory.aggregate_func = aggregate_func
    # random configuration chooser: class, instance or default ChooserProb
    rand_conf_chooser_kwargs = {'rng': rng}
    if random_configuration_chooser_kwargs is not None:
        rand_conf_chooser_kwargs.update(
            random_configuration_chooser_kwargs)
    if random_configuration_chooser is None:
        if 'prob' not in rand_conf_chooser_kwargs:
            rand_conf_chooser_kwargs['prob'] = scenario.rand_prob
        random_configuration_chooser = ChooserProb(
            **rand_conf_chooser_kwargs)
    elif inspect.isclass(random_configuration_chooser):
        random_configuration_chooser = random_configuration_chooser(
            **rand_conf_chooser_kwargs)
    elif not isinstance(random_configuration_chooser,
                        RandomConfigurationChooser):
        raise ValueError(
            "random_configuration_chooser has to be"
            " a class or object of RandomConfigurationChooser")
    # reset random number generator in config space to draw different
    # random configurations with each seed given to SMAC
    scenario.cs.seed(rng.randint(MAXINT))
    # initial Trajectory Logger
    traj_logger = TrajLogger(output_dir=self.output_dir,
                             stats=self.stats,
                             file_system=scenario.file_system)
    # initial EPM
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    model_def_kwargs = {
        'types': types,
        'bounds': bounds,
        'instance_features': scenario.feature_array,
        'seed': rng.randint(MAXINT),
        'pca_components': scenario.PCA_DIM,
    }
    if model_kwargs is not None:
        model_def_kwargs.update(model_kwargs)
    if model is None:
        # Fill in random-forest defaults only where the caller did not
        # override them.
        for key, value in {
                'log_y': scenario.transform_y in ["LOG", "LOGS"],
                'num_trees': scenario.rf_num_trees,
                'do_bootstrapping': scenario.rf_do_bootstrapping,
                'ratio_features': scenario.rf_ratio_features,
                'min_samples_split': scenario.rf_min_samples_split,
                'min_samples_leaf': scenario.rf_min_samples_leaf,
                'max_depth': scenario.rf_max_depth,
        }.items():
            if key not in model_def_kwargs:
                model_def_kwargs[key] = value
        model_def_kwargs['configspace'] = self.scenario.cs
        model = RandomForestWithInstances(**model_def_kwargs)
    elif inspect.isclass(model):
        model_def_kwargs['configspace'] = self.scenario.cs
        model = model(**model_def_kwargs)
    else:
        raise TypeError("Model not recognized: %s" % (type(model)))
    # initial acquisition function
    acq_def_kwargs = {'model': model}
    if acquisition_function_kwargs is not None:
        acq_def_kwargs.update(acquisition_function_kwargs)
    if acquisition_function is None:
        # LogEI matches the log-transformed cost models.
        if scenario.transform_y in ["LOG", "LOGS"]:
            acquisition_function = LogEI(**acq_def_kwargs)
        else:
            acquisition_function = EI(**acq_def_kwargs)
    elif inspect.isclass(acquisition_function):
        acquisition_function = acquisition_function(**acq_def_kwargs)
    else:
        raise TypeError(
            "Argument acquisition_function must be None or an object implementing the "
            "AbstractAcquisitionFunction, not %s." %
            type(acquisition_function))
    if integrate_acquisition_function:
        acquisition_function = IntegratedAcquisitionFunction(
            acquisition_function=acquisition_function, **acq_def_kwargs)
    # initialize optimizer on acquisition function
    acq_func_opt_kwargs = {
        'acquisition_function': acquisition_function,
        'config_space': scenario.cs,
        'rng': rng,
    }
    if acquisition_function_optimizer_kwargs is not None:
        acq_func_opt_kwargs.update(acquisition_function_optimizer_kwargs)
    if acquisition_function_optimizer is None:
        for key, value in {
                'max_steps': scenario.sls_max_steps,
                'n_steps_plateau_walk': scenario.sls_n_steps_plateau_walk,
        }.items():
            if key not in acq_func_opt_kwargs:
                acq_func_opt_kwargs[key] = value
        acquisition_function_optimizer = InterleavedLocalAndRandomSearch(
            **acq_func_opt_kwargs)
    elif inspect.isclass(acquisition_function_optimizer):
        acquisition_function_optimizer = acquisition_function_optimizer(
            **acq_func_opt_kwargs)
    else:
        raise TypeError(
            "Argument acquisition_function_optimizer must be None or an object implementing the "
            "AcquisitionFunctionMaximizer, but is '%s'" %
            type(acquisition_function_optimizer))
    # initialize tae_runner
    # First case, if tae_runner is None, the target algorithm is a call
    # string in the scenario file
    tae_def_kwargs = {
        'stats': self.stats,
        'run_obj': scenario.run_obj,
        'runhistory': runhistory,
        'par_factor': scenario.par_factor,
        'cost_for_crash': scenario.cost_for_crash,
        'abort_on_first_run_crash': scenario.abort_on_first_run_crash
    }
    if tae_runner_kwargs is not None:
        tae_def_kwargs.update(tae_runner_kwargs)
    if 'ta' not in tae_def_kwargs:
        tae_def_kwargs['ta'] = scenario.ta
    if tae_runner is None:
        tae_def_kwargs['ta'] = scenario.ta
        tae_runner = ExecuteTARunOld(**tae_def_kwargs)
    elif inspect.isclass(tae_runner):
        tae_runner = tae_runner(**tae_def_kwargs)
    elif callable(tae_runner):
        # A plain callable is wrapped so it conforms to the TAE interface.
        tae_def_kwargs['ta'] = tae_runner
        tae_runner = ExecuteTAFuncDict(**tae_def_kwargs)
    else:
        raise TypeError(
            "Argument 'tae_runner' is %s, but must be "
            "either None, a callable or an object implementing "
            "ExecuteTaRun. Passing 'None' will result in the "
            "creation of target algorithm runner based on the "
            "call string in the scenario file." % type(tae_runner))
    # Check that overall objective and tae objective are the same
    if tae_runner.run_obj != scenario.run_obj:
        raise ValueError("Objective for the target algorithm runner and "
                         "the scenario must be the same, but are '%s' and "
                         "'%s'" % (tae_runner.run_obj, scenario.run_obj))
    # initialize intensification
    intensifier_def_kwargs = {
        'tae_runner': tae_runner,
        'stats': self.stats,
        'traj_logger': traj_logger,
        'rng': rng,
        'instances': scenario.train_insts,
        'cutoff': scenario.cutoff,
        'deterministic': scenario.deterministic,
        'run_obj_time': scenario.run_obj == "runtime",
        'always_race_against':
            scenario.cs.get_default_configuration()
            if scenario.always_race_default else None,
        'use_ta_time_bound': scenario.use_ta_time,
        'instance_specifics': scenario.instance_specific,
        'minR': scenario.minR,
        'maxR': scenario.maxR,
        'adaptive_capping_slackfactor':
            scenario.intens_adaptive_capping_slackfactor,
        'min_chall': scenario.intens_min_chall,
    }
    if hasattr(scenario,
               'filter_callback') and scenario.filter_callback is not None:
        print('update callback')
        intensifier_def_kwargs.update(
            {'filter_callback': scenario.filter_callback})
    if intensifier_kwargs is not None:
        intensifier_def_kwargs.update(intensifier_kwargs)
    if intensifier is None:
        intensifier = Intensifier(**intensifier_def_kwargs)
    elif inspect.isclass(intensifier):
        intensifier = intensifier(**intensifier_def_kwargs)
    else:
        raise TypeError(
            "Argument intensifier must be None or an object implementing the Intensifier, but is '%s'"
            % type(intensifier))
    # initial design
    if initial_design is not None and initial_configurations is not None:
        initial_design.initial_configurations = initial_configurations
        initial_configurations = None
    # NOTE(review): 'initial_configurations' below dereferences
    # initial_design unconditionally; this fails if initial_design is
    # None -- TODO confirm callers always pass an initial_design (or an
    # initial_design_kwargs override).
    init_design_def_kwargs = {
        'tae_runner': tae_runner,
        'scenario': scenario,
        'stats': self.stats,
        'traj_logger': traj_logger,
        'runhistory': runhistory,
        'rng': rng,
        'configs': initial_configurations,
        'intensifier': intensifier,
        'aggregate_func': aggregate_func,
        'n_configs_x_params': 0,
        'max_config_fracs': 0.0,
        'initial_configurations': initial_design.initial_configurations
    }
    if initial_design_kwargs is not None:
        init_design_def_kwargs.update(initial_design_kwargs)
    if initial_configurations is not None:
        initial_design = InitialDesign(**init_design_def_kwargs)
    elif initial_design is None:
        # Pick the initial design requested by the scenario.
        if scenario.initial_incumbent == "DEFAULT":
            init_design_def_kwargs['max_config_fracs'] = 0.0
            initial_design = DefaultConfiguration(**init_design_def_kwargs)
        elif scenario.initial_incumbent == "RANDOM":
            init_design_def_kwargs['max_config_fracs'] = 0.0
            initial_design = RandomConfigurations(**init_design_def_kwargs)
        elif scenario.initial_incumbent == "LHD":
            initial_design = LHDesign(**init_design_def_kwargs)
        elif scenario.initial_incumbent == "FACTORIAL":
            initial_design = FactorialInitialDesign(
                **init_design_def_kwargs)
        elif scenario.initial_incumbent == "SOBOL":
            initial_design = SobolDesign(**init_design_def_kwargs)
        else:
            raise ValueError("Don't know what kind of initial_incumbent "
                             "'%s' is" % scenario.initial_incumbent)
    elif inspect.isclass(initial_design):
        initial_design = initial_design(**init_design_def_kwargs)
    else:
        raise TypeError(
            "Argument initial_design must be None or an object implementing the InitialDesign, but is '%s'"
            % type(initial_design))
    # if we log the performance data,
    # the RFRImputator will already get
    # log transform data from the runhistory
    if scenario.transform_y in ["LOG", "LOGS"]:
        cutoff = np.log(np.nanmin([np.inf, np.float_(scenario.cutoff)]))
        threshold = cutoff + np.log(scenario.par_factor)
    else:
        cutoff = np.nanmin([np.inf, np.float_(scenario.cutoff)])
        threshold = cutoff * scenario.par_factor
    num_params = len(scenario.cs.get_hyperparameters())
    imputor = RFRImputator(rng=rng,
                           cutoff=cutoff,
                           threshold=threshold,
                           model=model,
                           change_threshold=0.01,
                           max_iter=2)
    # runhistory2epm: converts runhistory data into EPM training data
    r2e_def_kwargs = {
        'scenario': scenario,
        'num_params': num_params,
        'success_states': [
            StatusType.SUCCESS,
        ],
        'impute_censored_data': True,
        'impute_state': [
            StatusType.CAPPED,
        ],
        'imputor': imputor,
        'scale_perc': 5
    }
    if scenario.run_obj == 'quality':
        # Quality runs are never censored; crashed runs still carry a cost.
        r2e_def_kwargs.update({
            'success_states': [StatusType.SUCCESS, StatusType.CRASHED],
            'impute_censored_data': False,
            'impute_state': None,
        })
    if runhistory2epm_kwargs is not None:
        r2e_def_kwargs.update(runhistory2epm_kwargs)
    if runhistory2epm is None:
        if scenario.run_obj == 'runtime':
            runhistory2epm = RunHistory2EPM4LogCost(**r2e_def_kwargs)
        elif scenario.run_obj == 'quality':
            if scenario.transform_y == "NONE":
                runhistory2epm = RunHistory2EPM4Cost(**r2e_def_kwargs)
            elif scenario.transform_y == "LOG":
                runhistory2epm = RunHistory2EPM4LogCost(**r2e_def_kwargs)
            elif scenario.transform_y == "LOGS":
                runhistory2epm = RunHistory2EPM4LogScaledCost(
                    **r2e_def_kwargs)
            elif scenario.transform_y == "INVS":
                runhistory2epm = RunHistory2EPM4InvScaledCost(
                    **r2e_def_kwargs)
        else:
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % self.scenario.run_obj)
    elif inspect.isclass(runhistory2epm):
        runhistory2epm = runhistory2epm(**r2e_def_kwargs)
    else:
        raise TypeError(
            "Argument runhistory2epm must be None or an object implementing the RunHistory2EPM, but is '%s'"
            % type(runhistory2epm))
    # Bundle everything and hand it to the SMBO solver.
    smbo_args = {
        'scenario': scenario,
        'stats': self.stats,
        'initial_design': initial_design,
        'runhistory': runhistory,
        'runhistory2epm': runhistory2epm,
        'intensifier': intensifier,
        'aggregate_func': aggregate_func,
        'num_run': run_id,
        'model': model,
        'acq_optimizer': acquisition_function_optimizer,
        'acquisition_func': acquisition_function,
        'rng': rng,
        'restore_incumbent': restore_incumbent,
        'random_configuration_chooser': random_configuration_chooser
    }
    if smbo_class is None:
        self.solver = SMBO(**smbo_args)
    else:
        self.solver = smbo_class(**smbo_args)
def _get_runs(
        self,
        configs: Union[str, typing.List[Configuration]],
        insts: Union[str, typing.List[str]],
        repetitions: int = 1,
        runhistory: RunHistory = None,
) -> typing.Tuple[typing.List[_Run], RunHistory]:
    """
    Generate list of SMAC-TAE runs to be executed. This means combinations
    of configs with all instances on a certain number of seeds.

    side effect: Adds runs that don't need to be reevaluated to self.rh!
    NOTE(review): in the code below the reused runs are added to the
    returned ``new_rh``, not to ``self.rh`` — confirm whether the sentence
    above is stale.

    Parameters
    ----------
    configs: str or list<Configuration>
        string or directly a list of Configuration
        str from [def, inc, def+inc, wallclock_time, cpu_time, all]
        time evaluates at cpu- or wallclock-timesteps of:
        [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
        with max_time being the highest recorded time
    insts: str or list<str>
        what instances to use for validation, either from
        [train, test, train+test] or directly a list of instances
    repetitions: int
        number of seeds per instance/config-pair to be evaluated
    runhistory: RunHistory
        optional, try to reuse this runhistory and save some runs

    Returns
    -------
    runs: list<_Run>
        list with _Runs
        [_Run(config=CONFIG1,inst=INSTANCE1,seed=SEED1,inst_specs=INST_SPECIFICS1),
         _Run(config=CONFIG2,inst=INSTANCE2,seed=SEED2,inst_specs=INST_SPECIFICS2),
         ...]
    new_rh: RunHistory
        runhistory holding the runs that could be reused without re-execution
    """
    # Get relevant configurations and instances
    if isinstance(configs, str):
        configs = self._get_configs(configs)
    if isinstance(insts, str):
        insts = self._get_instances(insts)

    # If no instances are given, fix the instances to one "None" instance
    if not insts:
        insts = [None]

    # If algorithm is deterministic, fix repetitions to 1
    if self.scen.deterministic and repetitions != 1:
        self.logger.warning("Specified %d repetitions, but fixing to 1, "
                            "because algorithm is deterministic.", repetitions)
        repetitions = 1

    # Extract relevant information from given runhistory
    # (maps instance -> ordered list of (seed, configs-evaluated-on-that-seed))
    inst_seed_config = self._process_runhistory(configs, insts, runhistory)

    # Now create the actual run-list
    runs = []
    # Counter for runs without the need of recalculation
    runs_from_rh = 0
    # If we reuse runs, we want to return them as well
    new_rh = RunHistory(average_cost)

    for i in sorted(insts):
        for rep in range(repetitions):
            # First, find a seed and add all the data we can take from the
            # given runhistory to "our" validation runhistory.
            configs_evaluated = []
            if runhistory and i in inst_seed_config:
                # Choose seed based on most often evaluated inst-seed-pair
                # (pop(0) consumes the list, so each repetition gets a fresh seed)
                seed, configs_evaluated = inst_seed_config[i].pop(0)
                # Delete inst if all seeds are used
                if not inst_seed_config[i]:
                    inst_seed_config.pop(i)
                # Add runs to runhistory
                # (iterate over a copy: the list is mutated inside the loop)
                for c in configs_evaluated[:]:
                    runkey = RunKey(runhistory.config_ids[c], i, seed)
                    cost, time, status, additional_info = runhistory.data[runkey]
                    if status in [StatusType.CRASHED, StatusType.ABORT, StatusType.CAPPED]:
                        # Not properly executed target algorithm runs should be repeated
                        configs_evaluated.remove(c)
                        continue
                    new_rh.add(c, cost, time, status, instance_id=i,
                               seed=seed, additional_info=additional_info)
                    runs_from_rh += 1
            else:
                # If no runhistory or no entries for instance, get new seed
                seed = self.rng.randint(MAXINT)

            # We now have a seed and add all configs that are not already
            # evaluated on that seed to the runs-list. This way, we
            # guarantee the same inst-seed-pairs for all configs.
            for config in [c for c in configs if not c in configs_evaluated]:
                # Only use specifics if specific exists, else use string "0"
                specs = self.scen.instance_specific[i] if i and i in self.scen.instance_specific else "0"
                runs.append(_Run(config=config,
                                 inst=i,
                                 seed=seed,
                                 inst_specs=specs))

    self.logger.info("Collected %d runs from %d configurations on %d "
                     "instances with %d repetitions. Reusing %d runs from "
                     "given runhistory.", len(runs), len(configs),
                     len(insts), repetitions, runs_from_rh)

    return runs, new_rh
class Hydra(object):
    """
    Facade to use Hydra default mode.

    Hydra iteratively builds a portfolio of complementary configurations:
    each round runs several SMAC instances in parallel (via PSMAC), keeps
    the best incumbents, and feeds the portfolio's per-instance costs back
    as a cost oracle for the next round.

    Attributes
    ----------
    logger
    stats : Stats
        logs information about used resources
    solver : SMBO
        handles the actual algorithm calls
    rh : RunHistory
        List with information about previous runs
    portfolio : list
        List of all incumbents
    """

    def __init__(self,
                 scenario: Scenario,
                 n_iterations: int,
                 val_set: str = 'train',
                 incs_per_round: int = 1,
                 n_optimizers: int = 1,
                 rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
                 run_id: int = 1,
                 tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
                 tae_kwargs: typing.Union[dict, None] = None,
                 **kwargs):
        """
        Constructor.

        Parameters
        ----------
        scenario : ~dsmac.scenario.scenario.Scenario
            Scenario object
        n_iterations : int
            number of Hydra iterations
        val_set : str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X percent of the training set, X in (0, 100)
        incs_per_round : int
            Number of incumbents to keep per round
        n_optimizers : int
            Number of optimizers to run in parallel per round
        rng : int / np.random.RandomState
            The randomState/seed to pass to each dsmac run
        run_id : int
            run_id for this hydra run
        tae : ExecuteTARun
            Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
        tae_kwargs : Optional[dict]
            arguments passed to constructor of '~tae'
        """
        self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__)

        self.n_iterations = n_iterations
        self.scenario = scenario
        self.run_id, self.rng = get_rng(rng, run_id, self.logger)
        self.kwargs = kwargs
        self.output_dir = None
        self.top_dir = None
        self.solver = None
        self.portfolio = None
        self.rh = RunHistory(average_cost, file_system=scenario.file_system)
        self._tae = tae
        self._tae_kwargs = tae_kwargs

        # Clamp user-provided counts to sane minima (warn instead of failing).
        if incs_per_round <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1',
                                'incs_per_round', incs_per_round)
        self.incs_per_round = max(incs_per_round, 1)
        if n_optimizers <= 0:
            self.logger.warning('Invalid value in %s: %d. Setting to 1',
                                'n_optimizers', n_optimizers)
        self.n_optimizers = max(n_optimizers, 1)

        # Needs self.rng/self.scenario/self.logger, so keep after the above.
        self.val_set = self._get_validation_set(val_set)
        self.cost_per_inst = {}
        self.optimizer = None
        self.portfolio_cost = None

    def _get_validation_set(self, val_set: str, delete: bool = True) -> typing.List[str]:
        """
        Create small validation set for hydra to determine incumbent performance.

        Parameters
        ----------
        val_set : str
            Set to validate incumbent(s) on. [train, valX].
            train => whole training set,
            valX => X percent of the training set, X in (0, 100)
        delete : bool
            Flag to delete all validation instances from the training set

        Returns
        -------
        val : typing.List[str]
            List of instance-ids to validate on (None for val_set == 'none')
        """
        if val_set == 'none':
            return None
        if val_set == 'train':
            return self.scenario.train_insts
        elif val_set[:3] != 'val':
            self.logger.warning(
                'Can not determine validation set size. Using full training-set!')
            return self.scenario.train_insts
        else:
            size = int(val_set[3:]) / 100
            if size <= 0 or size >= 1:
                raise ValueError('X invalid in valX, should be between 0 and 1')
            insts = np.array(self.scenario.train_insts)
            # just to make sure this also works with the small example we have
            # to round up to 3
            size = max(np.floor(insts.shape[0] * size).astype(int), 3)
            # BUGFIX: sample from the seeded generator (self.rng) instead of
            # the global numpy RNG so the train/validation split is
            # reproducible for a given seed.
            ids = self.rng.choice(insts.shape[0], size, replace=False)
            val = insts[ids].tolist()
            if delete:
                self.scenario.train_insts = np.delete(insts, ids).tolist()
            return val

    def optimize(self) -> typing.List[Configuration]:
        """
        Optimizes the algorithm provided in scenario (given in constructor).

        Returns
        -------
        portfolio : typing.List[Configuration]
            Portfolio of found configurations
        """
        # Setup output directory
        self.portfolio = []
        portfolio_cost = np.inf
        if self.output_dir is None:
            self.top_dir = "hydra-output_%s" % (
                datetime.datetime.fromtimestamp(
                    time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f'))
            self.scenario.output_dir = os.path.join(
                self.top_dir, "psmac3-output_%s" % (
                    datetime.datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario,
                                                      run_id=self.run_id,
                                                      logger=self.logger)

        scen = copy.deepcopy(self.scenario)
        scen.output_dir_for_this_run = None
        scen.output_dir = None
        # parent process SMAC only used for validation purposes
        self.solver = SMAC4AC(scenario=scen, tae_runner=self._tae,
                              rng=self.rng, run_id=self.run_id, **self.kwargs)

        for i in range(self.n_iterations):
            self.logger.info("=" * 120)
            self.logger.info("Hydra Iteration: %d", (i + 1))

            if i == 0:
                # First round: optimize against the raw target algorithm.
                tae = self._tae
                tae_kwargs = self._tae_kwargs
            else:
                # Later rounds: race against the current portfolio via the
                # cost oracle collected so far.
                tae = ExecuteTARunHydra
                if self._tae_kwargs:
                    tae_kwargs = self._tae_kwargs
                else:
                    tae_kwargs = {}
                tae_kwargs['cost_oracle'] = self.cost_per_inst

            self.optimizer = PSMAC(
                scenario=self.scenario,
                run_id=self.run_id,
                rng=self.rng,
                tae=tae,
                tae_kwargs=tae_kwargs,
                shared_model=False,
                validate=True if self.val_set else False,
                n_optimizers=self.n_optimizers,
                val_set=self.val_set,
                n_incs=self.n_optimizers,  # return all configurations (unvalidated)
                **self.kwargs)
            self.optimizer.output_dir = self.output_dir
            incs = self.optimizer.optimize()
            cost_per_conf_v, val_ids, cost_per_conf_e, est_ids = \
                self.optimizer.get_best_incumbents_ids(incs)
            # Prefer validated rankings when a validation set exists.
            if self.val_set:
                to_keep_ids = val_ids[:self.incs_per_round]
            else:
                to_keep_ids = est_ids[:self.incs_per_round]
            config_cost_per_inst = {}
            incs = incs[to_keep_ids]
            self.logger.info('Kept incumbents')
            for inc in incs:
                self.logger.info(inc)
                config_cost_per_inst[inc] = cost_per_conf_v[inc] if self.val_set \
                    else cost_per_conf_e[inc]

            cur_portfolio_cost = self._update_portfolio(incs, config_cost_per_inst)
            if portfolio_cost <= cur_portfolio_cost:
                self.logger.info("No further progress (%f) --- terminate hydra",
                                 portfolio_cost)
                break
            else:
                portfolio_cost = cur_portfolio_cost
                self.logger.info("Current portfolio cost: %f", portfolio_cost)

            # Fresh output directory for the next PSMAC round.
            self.scenario.output_dir = os.path.join(
                self.top_dir, "psmac3-output_%s" % (
                    datetime.datetime.fromtimestamp(
                        time.time()).strftime('%Y-%m-%d_%H:%M:%S_%f')))
            self.output_dir = create_output_directory(self.scenario,
                                                      run_id=self.run_id,
                                                      logger=self.logger)

        # Merge all per-round runhistories and persist results.
        read(self.rh,
             os.path.join(self.top_dir, 'psmac3*', 'run_' + str(MAXINT)),
             self.scenario.cs, self.logger)
        self.rh.save_json(fn=os.path.join(self.top_dir,
                                          'all_validated_runs_runhistory.json'),
                          save_external=True)
        with open(os.path.join(self.top_dir, 'portfolio.pkl'), 'wb') as fh:
            pickle.dump(self.portfolio, fh)
        self.logger.info("~" * 120)
        self.logger.info('Resulting Portfolio:')
        for configuration in self.portfolio:
            self.logger.info(str(configuration))
        self.logger.info("~" * 120)

        return self.portfolio

    def _update_portfolio(self, incs: np.ndarray,
                          config_cost_per_inst: typing.Dict) -> float:
        """
        Validates all configurations (in incs) and determines which ones to
        add to the portfolio.

        Note: ``np.float`` was removed from NumPy (deprecated 1.20, removed
        1.24); the annotation now uses the builtin ``float``.

        Parameters
        ----------
        incs : np.ndarray
            List of Configurations
        config_cost_per_inst : typing.Dict
            Per-configuration mapping of instance -> cost

        Returns
        -------
        cur_cost : float
            The current cost of the portfolio
        """
        if self.val_set:  # we have validated data
            for kept in incs:
                if kept not in self.portfolio:
                    self.portfolio.append(kept)
                    cost_per_inst = config_cost_per_inst[kept]
                    if self.cost_per_inst:
                        if len(self.cost_per_inst) != len(cost_per_inst):
                            raise ValueError('Num validated Instances mismatch!')
                        else:
                            # Portfolio cost per instance is the best (min)
                            # cost any portfolio member achieves on it.
                            for key in cost_per_inst:
                                self.cost_per_inst[key] = min(
                                    self.cost_per_inst[key], cost_per_inst[key])
                    else:
                        self.cost_per_inst = cost_per_inst
            cur_cost = np.mean(list(self.cost_per_inst.values()))  # type: float
        else:  # No validated data. Set the mean to the approximated mean
            means = []  # can contain nans as not every instance was evaluated
            # thus we should use nanmean to approximate
            for kept in incs:
                means.append(
                    np.nanmean(list(
                        self.optimizer.rh.get_instance_costs_for_config(
                            kept).values())))
                self.portfolio.append(kept)
            if self.portfolio_cost:
                # Running mean over the grown portfolio.
                new_mean = self.portfolio_cost * (
                    len(self.portfolio) - len(incs)) / len(self.portfolio)
                new_mean += np.nansum(means)
            else:
                new_mean = np.mean(means)
            self.cost_per_inst = defaultdict(lambda: new_mean)
            cur_cost = new_mean

        self.portfolio_cost = cur_cost
        return cur_cost
def __init__(self,
             scenario: Scenario,
             n_iterations: int,
             val_set: str = 'train',
             incs_per_round: int = 1,
             n_optimizers: int = 1,
             rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
             run_id: int = 1,
             tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
             tae_kwargs: typing.Union[dict, None] = None,
             **kwargs):
    """
    Set up a Hydra run.

    Parameters
    ----------
    scenario : ~dsmac.scenario.scenario.Scenario
        Scenario object
    n_iterations : int
        number of Hydra iterations
    val_set : str
        Set to validate incumbent(s) on. [train, valX].
        train => whole training set,
        valX => train_set * 100/X where X in (0, 100)
    incs_per_round : int
        Number of incumbents to keep per round
    n_optimizers : int
        Number of optimizers to run in parallel per round
    rng : int / np.random.RandomState
        The randomState/seed to pass to each dsmac run
    run_id : int
        run_id for this hydra run
    tae : ExecuteTARun
        Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
    tae_kwargs : Optional[dict]
        arguments passed to constructor of '~tae'
    """
    self.logger = logging.getLogger("%s.%s" % (self.__module__,
                                               self.__class__.__name__))

    # Core run configuration.
    self.scenario = scenario
    self.n_iterations = n_iterations
    self.run_id, self.rng = get_rng(rng, run_id, self.logger)
    self.kwargs = kwargs
    self._tae = tae
    self._tae_kwargs = tae_kwargs

    # Nothing has been produced yet.
    self.output_dir = None
    self.top_dir = None
    self.solver = None
    self.portfolio = None
    self.rh = RunHistory(average_cost, file_system=scenario.file_system)

    # Clamp both counts to at least one, warning on invalid input.
    if incs_per_round <= 0:
        self.logger.warning('Invalid value in %s: %d. Setting to 1',
                            'incs_per_round', incs_per_round)
    self.incs_per_round = incs_per_round if incs_per_round > 0 else 1
    if n_optimizers <= 0:
        self.logger.warning('Invalid value in %s: %d. Setting to 1',
                            'n_optimizers', n_optimizers)
    self.n_optimizers = n_optimizers if n_optimizers > 0 else 1

    # Depends on self.rng / self.scenario / self.logger set above.
    self.val_set = self._get_validation_set(val_set)
    self.cost_per_inst = {}
    self.optimizer = None
    self.portfolio_cost = None
def __init__(self,
             scenario: Scenario,
             # TODO: once we drop python3.4 add type hint
             # typing.Union[ExecuteTARun, callable]
             tae_runner=None,
             runhistory: RunHistory = None,
             intensifier: Intensifier = None,
             acquisition_function: AbstractAcquisitionFunction = None,
             model: AbstractEPM = None,
             runhistory2epm: AbstractRunHistory2EPM = None,
             initial_design: InitialDesign = None,
             initial_configurations: typing.List[Configuration] = None,
             stats: Stats = None,
             rng: np.random.RandomState = None,
             run_id: int = 1):
    """Constructor.

    Wires up every EPILS component (stats, runhistory, EPM, acquisition
    function, local search, TAE runner, intensifier, initial design and
    runhistory-to-EPM converter), substituting documented defaults for any
    component passed as ``None``, and finally builds the EPILS_Solver.

    Parameters
    ----------
    scenario : Scenario
        Scenario object; drives every default below.
    tae_runner : ExecuteTARun or callable, optional
        ``None`` => build from the scenario call string; a plain callable
        is wrapped in ExecuteTAFuncDict.
    runhistory, intensifier, acquisition_function, model, runhistory2epm,
    initial_design, initial_configurations, stats : optional
        component overrides; defaults are created from ``scenario``.
    rng : np.random.RandomState, optional
        Source of randomness; also reseeds the configuration space.
    run_id : int
        Identifier used for the output directory.
    """
    self.logger = logging.getLogger(self.__module__ + "." +
                                    self.__class__.__name__)

    aggregate_func = average_cost
    self.runhistory = None
    self.trajectory = None

    # initialize stats object
    if stats:
        self.stats = stats
    else:
        self.stats = Stats(scenario, file_system=scenario.file_system)

    self.output_dir = create_output_directory(scenario, run_id)
    scenario.write()

    # initialize empty runhistory
    if runhistory is None:
        runhistory = RunHistory(aggregate_func=aggregate_func,
                                file_system=scenario.file_system)
    # inject aggr_func if necessary
    if runhistory.aggregate_func is None:
        runhistory.aggregate_func = aggregate_func

    # initial random number generator
    num_run, rng = self._get_rng(rng=rng)

    # reset random number generator in config space to draw different
    # random configurations with each seed given to SMAC
    scenario.cs.seed(rng.randint(MAXINT))

    # initial Trajectory Logger
    traj_logger = TrajLogger(output_dir=self.output_dir, stats=self.stats,
                             file_system=scenario.file_system)

    # initial EPM
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    if model is None:
        model = RandomForestWithInstances(
            configspace=scenario.cs,
            types=types,
            bounds=bounds,
            instance_features=scenario.feature_array,
            seed=rng.randint(MAXINT),
            pca_components=scenario.PCA_DIM,
            num_trees=scenario.rf_num_trees,
            do_bootstrapping=scenario.rf_do_bootstrapping,
            ratio_features=scenario.rf_ratio_features,
            min_samples_split=scenario.rf_min_samples_split,
            min_samples_leaf=scenario.rf_min_samples_leaf,
            max_depth=scenario.rf_max_depth,
        )

    # initial acquisition function: LogEI for runtime objectives, EI otherwise
    if acquisition_function is None:
        if scenario.run_obj == "runtime":
            acquisition_function = LogEI(model=model)
        else:
            acquisition_function = EI(model=model)
    # inject model if necessary
    if acquisition_function.model is None:
        acquisition_function.model = model

    # initialize optimizer on acquisition function
    local_search = LocalSearch(
        acquisition_function,
        scenario.cs,
        max_steps=scenario.sls_max_steps,
        n_steps_plateau_walk=scenario.sls_n_steps_plateau_walk)

    # initialize tae_runner
    # First case, if tae_runner is None, the target algorithm is a call
    # string in the scenario file
    if tae_runner is None:
        tae_runner = ExecuteTARunOld(ta=scenario.ta,
                                     stats=self.stats,
                                     run_obj=scenario.run_obj,
                                     runhistory=runhistory,
                                     par_factor=scenario.par_factor,
                                     cost_for_crash=scenario.cost_for_crash)
    # Second case, the tae_runner is a function to be optimized
    elif callable(tae_runner):
        tae_runner = ExecuteTAFuncDict(ta=tae_runner,
                                       stats=self.stats,
                                       run_obj=scenario.run_obj,
                                       memory_limit=scenario.memory_limit,
                                       runhistory=runhistory,
                                       par_factor=scenario.par_factor,
                                       cost_for_crash=scenario.cost_for_crash)
    # Third case, if it is an ExecuteTaRun we can simply use the
    # instance. Otherwise, the next check raises an exception
    elif not isinstance(tae_runner, ExecuteTARun):
        raise TypeError("Argument 'tae_runner' is %s, but must be "
                        "either a callable or an instance of "
                        "ExecuteTaRun. Passing 'None' will result in the "
                        "creation of target algorithm runner based on the "
                        "call string in the scenario file."
                        % type(tae_runner))

    # Check that overall objective and tae objective are the same
    if tae_runner.run_obj != scenario.run_obj:
        raise ValueError("Objective for the target algorithm runner and "
                         "the scenario must be the same, but are '%s' and "
                         "'%s'" % (tae_runner.run_obj, scenario.run_obj))

    # inject stats if necessary
    if tae_runner.stats is None:
        tae_runner.stats = self.stats
    # inject runhistory if necessary
    if tae_runner.runhistory is None:
        tae_runner.runhistory = runhistory
    # inject cost_for_crash
    if tae_runner.crash_cost != scenario.cost_for_crash:
        tae_runner.crash_cost = scenario.cost_for_crash

    # initialize intensification
    if intensifier is None:
        intensifier = Intensifier(
            tae_runner=tae_runner,
            stats=self.stats,
            traj_logger=traj_logger,
            rng=rng,
            instances=scenario.train_insts,
            cutoff=scenario.cutoff,
            deterministic=scenario.deterministic,
            run_obj_time=scenario.run_obj == "runtime",
            always_race_against=scenario.cs.get_default_configuration()
            if scenario.always_race_default else None,
            instance_specifics=scenario.instance_specific,
            minR=scenario.minR,
            maxR=scenario.maxR,
            adaptive_capping_slackfactor=scenario.intens_adaptive_capping_slackfactor,
            min_chall=scenario.intens_min_chall,
            distributer=scenario.distributer)
    # inject deps if necessary
    if intensifier.tae_runner is None:
        intensifier.tae_runner = tae_runner
    if intensifier.stats is None:
        intensifier.stats = self.stats
    if intensifier.traj_logger is None:
        intensifier.traj_logger = traj_logger

    # initial design
    if initial_design is not None and initial_configurations is not None:
        raise ValueError(
            "Either use initial_design or initial_configurations; but not both")

    if initial_configurations is not None:
        initial_design = InitialDesign(tae_runner=tae_runner,
                                       scenario=scenario,
                                       stats=self.stats,
                                       traj_logger=traj_logger,
                                       runhistory=runhistory,
                                       rng=rng,
                                       configs=initial_configurations,
                                       intensifier=intensifier,
                                       aggregate_func=aggregate_func)
    elif initial_design is None:
        if scenario.initial_incumbent == "DEFAULT":
            initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                  scenario=scenario,
                                                  stats=self.stats,
                                                  traj_logger=traj_logger,
                                                  runhistory=runhistory,
                                                  rng=rng,
                                                  intensifier=intensifier,
                                                  aggregate_func=aggregate_func,
                                                  max_config_fracs=0.0)
        elif scenario.initial_incumbent == "RANDOM":
            initial_design = RandomConfigurations(tae_runner=tae_runner,
                                                  scenario=scenario,
                                                  stats=self.stats,
                                                  traj_logger=traj_logger,
                                                  runhistory=runhistory,
                                                  rng=rng,
                                                  intensifier=intensifier,
                                                  aggregate_func=aggregate_func,
                                                  max_config_fracs=0.0)
        else:
            raise ValueError("Don't know what kind of initial_incumbent "
                             "'%s' is" % scenario.initial_incumbent)
    # inject deps if necessary
    if initial_design.tae_runner is None:
        initial_design.tae_runner = tae_runner
    if initial_design.scenario is None:
        initial_design.scenario = scenario
    if initial_design.stats is None:
        initial_design.stats = self.stats
    if initial_design.traj_logger is None:
        initial_design.traj_logger = traj_logger

    # initial conversion of runhistory into EPM data
    if runhistory2epm is None:
        num_params = len(scenario.cs.get_hyperparameters())
        if scenario.run_obj == "runtime":
            # if we log the performance data,
            # the RFRImputator will already get
            # log transform data from the runhistory
            cutoff = np.log(scenario.cutoff)
            threshold = np.log(scenario.cutoff * scenario.par_factor)
            imputor = RFRImputator(rng=rng,
                                   cutoff=cutoff,
                                   threshold=threshold,
                                   model=model,
                                   change_threshold=0.01,
                                   max_iter=2)
            runhistory2epm = RunHistory2EPM4LogCost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=True,
                impute_state=[StatusType.CAPPED, ],
                imputor=imputor)
        elif scenario.run_obj == 'quality':
            runhistory2epm = RunHistory2EPM4Cost(scenario=scenario,
                                                 num_params=num_params,
                                                 success_states=[StatusType.SUCCESS, ],
                                                 impute_censored_data=False,
                                                 impute_state=None)
        else:
            # BUGFIX: this constructor never assigns self.scenario, so the
            # previous message expression (self.scenario.run_obj) raised
            # AttributeError instead of the intended ValueError.
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % scenario.run_obj)

    # inject scenario if necessary:
    if runhistory2epm.scenario is None:
        runhistory2epm.scenario = scenario

    self.solver = EPILS_Solver(scenario=scenario,
                               stats=self.stats,
                               initial_design=initial_design,
                               runhistory=runhistory,
                               runhistory2epm=runhistory2epm,
                               intensifier=intensifier,
                               aggregate_func=aggregate_func,
                               num_run=num_run,
                               model=model,
                               acq_optimizer=local_search,
                               acquisition_func=acquisition_function,
                               rng=rng)
def main_cli(self, commandline_arguments: typing.List[str]=None):
    """Main function of SMAC for CLI interface.

    Parses command-line arguments, configures root logging, optionally
    restores/warm-starts state, dispatches to the facade selected by
    ``--mode``, and runs its ``optimize()``.

    Parameters
    ----------
    commandline_arguments : typing.List[str], optional
        Arguments to parse instead of ``sys.argv`` (forwarded to CMDReader).
    """
    self.logger.info("SMAC call: %s" % (" ".join(sys.argv)))

    cmd_reader = CMDReader()
    kwargs = {}
    if commandline_arguments:
        kwargs['commandline_arguments'] = commandline_arguments
    main_args_, smac_args_, scen_args_ = cmd_reader.read_cmd(**kwargs)

    # Configure root logging: terse format at INFO and above, verbose
    # (timestamped) format when debugging.
    root_logger = logging.getLogger()
    root_logger.setLevel(main_args_.verbose_level)
    logger_handler = logging.StreamHandler(
        stream=sys.stdout)
    if root_logger.level >= logging.INFO:
        formatter = logging.Formatter(
            "%(levelname)s:\t%(message)s")
    else:
        formatter = logging.Formatter(
            "%(asctime)s:%(levelname)s:%(name)s:%(message)s",
            "%Y-%m-%d %H:%M:%S")
    logger_handler.setFormatter(formatter)
    root_logger.addHandler(logger_handler)
    # remove default handler
    if len(root_logger.handlers) > 1:
        root_logger.removeHandler(root_logger.handlers[0])

    # Create defaults
    rh = None
    initial_configs = None
    stats = None
    incumbent = None

    # Create scenario-object
    scenario = {}
    scenario.update(vars(smac_args_))
    scenario.update(vars(scen_args_))
    scen = Scenario(scenario=scenario)

    # Restore state: reload runhistory, stats and trajectories from a
    # previous (interrupted) run before continuing optimization.
    if main_args_.restore_state:
        root_logger.debug("Restoring state from %s...", main_args_.restore_state)
        rh, stats, traj_list_aclib, traj_list_old = self.restore_state(scen, main_args_)

        scen.output_dir_for_this_run = create_output_directory(
            scen,
            main_args_.seed,
            root_logger,
        )
        scen.write()
        incumbent = self.restore_state_after_output_dir(scen, stats,
                                                        traj_list_aclib,
                                                        traj_list_old)

    # Warm-start: seed the runhistory with data from other scenarios/runs.
    if main_args_.warmstart_runhistory:
        aggregate_func = average_cost
        rh = RunHistory(aggregate_func=aggregate_func)

        scen, rh = merge_foreign_data_from_file(
            scenario=scen,
            runhistory=rh,
            in_scenario_fn_list=main_args_.warmstart_scenario,
            in_runhistory_fn_list=main_args_.warmstart_runhistory,
            cs=scen.cs,
            aggregate_func=aggregate_func)

    # Warm-start incumbents: default config plus the last incumbent of each
    # given trajectory file become initial configurations.
    if main_args_.warmstart_incumbent:
        initial_configs = [scen.cs.get_default_configuration()]
        for traj_fn in main_args_.warmstart_incumbent:
            trajectory = TrajLogger.read_traj_aclib_format(fn=traj_fn, cs=scen.cs)
            initial_configs.append(trajectory[-1]["incumbent"])

    # Dispatch on the requested facade.
    if main_args_.mode == "SMAC4AC":
        optimizer = SMAC4AC(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed)
    elif main_args_.mode == "SMAC4HPO":
        optimizer = SMAC4HPO(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed)
    elif main_args_.mode == "SMAC4BO":
        optimizer = SMAC4BO(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed)
    elif main_args_.mode == "ROAR":
        optimizer = ROAR(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            run_id=main_args_.seed)
    elif main_args_.mode == "EPILS":
        optimizer = EPILS(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            run_id=main_args_.seed)
    elif main_args_.mode == "Hydra":
        optimizer = Hydra(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            runhistory=rh,
            initial_configurations=initial_configs,
            stats=stats,
            restore_incumbent=incumbent,
            run_id=main_args_.seed,
            random_configuration_chooser=main_args_.random_configuration_chooser,
            n_iterations=main_args_.hydra_iterations,
            val_set=main_args_.hydra_validation,
            incs_per_round=main_args_.hydra_incumbents_per_round,
            n_optimizers=main_args_.hydra_n_optimizers)
    elif main_args_.mode == "PSMAC":
        # NOTE(review): PSMAC is not given runhistory/stats/incumbent here,
        # so restore/warm-start options are silently ignored in this mode —
        # confirm whether that is intended.
        optimizer = PSMAC(
            scenario=scen,
            rng=np.random.RandomState(main_args_.seed),
            run_id=main_args_.seed,
            shared_model=smac_args_.shared_model,
            validate=main_args_.psmac_validate,
            n_optimizers=main_args_.hydra_n_optimizers,
            n_incs=main_args_.hydra_incumbents_per_round,
        )
    try:
        optimizer.optimize()
    except (TAEAbortException, FirstRunCrashedException) as err:
        self.logger.error(err)
def _race_challenger(self, challenger: Configuration, incumbent: Configuration, run_history: RunHistory, aggregate_func: typing.Callable, log_traj: bool = True): """Aggressively race challenger against incumbent Parameters ---------- challenger : Configuration Configuration which challenges incumbent incumbent : Configuration Best configuration so far run_history : RunHistory Stores all runs we ran so far aggregate_func: typing.Callable Aggregate performance across instances log_traj: bool Whether to log changes of incumbents in trajectory Returns ------- new_incumbent: Configuration Either challenger or incumbent """ # at least one run of challenger # to increase chall_indx counter first_run = False # Line 8 N = max(1, self.minR) inc_inst_seeds = set(run_history.get_runs_for_config(incumbent)) # Line 9 while True: chall_inst_seeds = set(run_history.get_runs_for_config(challenger)) # Line 10 missing_runs = list(inc_inst_seeds - chall_inst_seeds) # Line 11 self.rs.shuffle(missing_runs) to_run = missing_runs[:min(N, len(missing_runs))] # Line 13 (Line 12 comes below...) 
missing_runs = missing_runs[min(N, len(missing_runs)):] # for adaptive capping # because of efficieny computed here inst_seed_pairs = list(inc_inst_seeds - set(missing_runs)) # cost used by incumbent for going over all runs in inst_seed_pairs inc_sum_cost = sum_cost(config=incumbent, instance_seed_pairs=inst_seed_pairs, run_history=run_history) if len(to_run) == 0: self.logger.debug("No further runs for challenger available") # Line 12 # Run challenger on all <config,seed> to run for instance, seed in to_run: cutoff = self._adapt_cutoff(challenger=challenger, incumbent=incumbent, run_history=run_history, inc_sum_cost=inc_sum_cost) if cutoff is not None and cutoff <= 0: # no time to validate challenger self.logger.debug("Stop challenger itensification due " "to adaptive capping.") # challenger performance is worse than incumbent return incumbent if not first_run: first_run = True self._chall_indx += 1 self.logger.debug("Add run of challenger") try: status, cost, dur, res = self.tae_runner.start( config=challenger, instance=instance, seed=seed, cutoff=cutoff, instance_specific=self.instance_specifics.get( instance, "0"), capped=(self.cutoff is not None) and (cutoff < self.cutoff)) self._num_run += 1 self._ta_time += dur except CappedRunException: return incumbent new_incumbent = self._compare_configs( incumbent=incumbent, challenger=challenger, run_history=run_history, aggregate_func=aggregate_func, log_traj=log_traj) if new_incumbent == incumbent: break elif new_incumbent == challenger: incumbent = challenger break else: # Line 17 # challenger is not worse, continue N = 2 * N return incumbent
from dsmac.runhistory.runhistory import RunHistoryDB
from dsmac.runhistory.utils import get_id_of_config
from dsmac.tae.execute_ta_run import StatusType

if __name__ == '__main__':
    # Manual smoke test for RunHistoryDB: load a pickled config space and a
    # saved runhistory, then exercise appointment/insert/fetch round-trips
    # against a local sqlite file ("test.db").
    # NOTE(review): the absolute paths below are machine-specific developer
    # fixtures; this script only runs on that machine as-is.
    from ConfigSpace import ConfigurationSpace
    import joblib
    from dsmac.optimizer.objective import average_cost
    from dsmac.runhistory.runhistory import RunHistory
    runhistory = RunHistory(average_cost,db_args="test.db")
    cs: ConfigurationSpace = joblib.load("/home/tqc/PycharmProjects/auto-pipeline/test/php.bz2")
    runhistory.load_json(
        "/home/tqc/PycharmProjects/auto-pipeline/test/test_runhistory/default_dataset_name/smac_output/runhistory.json",
        cs)
    all_configs = (runhistory.get_all_configs())
    config = all_configs[0]
    config_id = get_id_of_config(config)
    cost = runhistory.get_cost(config)
    # Start from a clean database, then claim the config and insert a run.
    db = RunHistoryDB(cs, runhistory, "test.db")
    db.delete_all()
    ans = db.appointment_config(config)
    print(ans)
    db.insert_runhistory(config, cost, 0.1, StatusType.SUCCESS)
    # A second connection inserts another run; the first then pulls the
    # new entries back out.
    db2 = RunHistoryDB(cs, runhistory, "test.db")
    db2.insert_runhistory(all_configs[1], runhistory.get_cost(all_configs[1]), 0.1,
                          StatusType.SUCCESS)
    db.fetch_new_runhistory()
def _compare_configs(self, incumbent: Configuration,
                     challenger: Configuration,
                     run_history: RunHistory,
                     aggregate_func: typing.Callable,
                     log_traj: bool = True):
    """
    Compare two configuration wrt the runhistory and return the one which
    performs better (or None if the decision is not safe).

    Decision strategy to return x as being better than y:
        1. x has at least as many runs as y
        2. x performs better than y on the intersection of runs on x and y

    Implicit assumption:
        Challenger was evaluated on the same instance-seed pairs as
        incumbent

    Parameters
    ----------
    incumbent: Configuration
        Current incumbent
    challenger: Configuration
        Challenger configuration
    run_history: RunHistory
        Stores all runs we ran so far
    aggregate_func: typing.Callable
        Aggregate performance across instances
    log_traj: bool
        Whether to log changes of incumbents in trajectory

    Returns
    -------
    None or better of the two configurations x,y
    """
    inc_runs = run_history.get_runs_for_config(incumbent)
    chall_runs = run_history.get_runs_for_config(challenger)
    to_compare_runs = set(inc_runs).intersection(chall_runs)

    # performance on challenger runs
    # (both aggregated over the shared instance-seed pairs only)
    chal_perf = aggregate_func(challenger, run_history, to_compare_runs)
    inc_perf = aggregate_func(incumbent, run_history, to_compare_runs)

    # Line 15
    # (costs: lower is better, so ">" means the challenger lost)
    if chal_perf > inc_perf and len(chall_runs) >= self.minR:
        # Incumbent beats challenger
        self.logger.debug("Incumbent (%.4f) is better than challenger "
                          "(%.4f) on %d runs." %
                          (inc_perf, chal_perf, len(chall_runs)))
        return incumbent

    # Line 16
    # challenger has covered every incumbent run => comparison is complete
    if not set(inc_runs) - set(chall_runs):

        # no plateau walks
        if chal_perf >= inc_perf:
            self.logger.debug("Incumbent (%.4f) is at least as good as the "
                              "challenger (%.4f) on %d runs." %
                              (inc_perf, chal_perf, len(chall_runs)))
            return incumbent

        # Challenger is better than incumbent
        # and has at least the same runs as inc
        # -> change incumbent
        n_samples = len(chall_runs)
        self.logger.info("Challenger (%.4f) is better than incumbent (%.4f)"
                         " on %d runs." % (chal_perf, inc_perf, n_samples))
        # Show changes in the configuration
        # (each entry is a (name, old_value, new_value) triple)
        params = sorted([(param, incumbent[param], challenger[param])
                         for param in challenger.keys()])
        self.logger.info("Changes in incumbent:")
        for param in params:
            if param[1] != param[2]:
                # "% (param)" formats the whole 3-tuple into the 3 specifiers
                self.logger.info(" %s : %r -> %r" % (param))
            else:
                self.logger.debug(" %s remains unchanged: %r" %
                                  (param[0], param[1]))
        if log_traj:
            self.stats.inc_changed += 1
            self.traj_logger.add_entry(train_perf=chal_perf,
                                       incumbent_id=self.stats.inc_changed,
                                       incumbent=challenger)
        return challenger

    # undecided
    return None
def __init__(self,
             scenario: Scenario,
             rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None,
             run_id: int = 1,
             tae: typing.Type[ExecuteTARun] = ExecuteTARunOld,
             tae_kwargs: typing.Union[dict, None] = None,
             shared_model: bool = True,
             validate: bool = True,
             n_optimizers: int = 2,
             val_set: typing.Union[typing.List[str], None] = None,
             n_incs: int = 1,
             **kwargs):
    """
    Set up a parallel-SMAC (PSMAC) run.

    Parameters
    ----------
    scenario : ~dsmac.scenario.scenario.Scenario
        Scenario object
    rng : int / np.random.RandomState
        The randomState/seed to pass to each dsmac run
    run_id : int
        run_id for this hydra run
    tae : ExecuteTARun
        Target Algorithm Runner (supports old and aclib format as well as AbstractTAFunc)
    tae_kwargs : Optional[dict]
        arguments passed to constructor of '~tae'
    shared_model : bool
        Flag to indicate whether information is shared between SMAC runs or not
    validate : bool / None
        Flag to indicate whether to validate the found configurations or
        to use the SMAC estimates; None => neither and return the full portfolio
    n_optimizers : int
        Number of optimizers to run in parallel per round
    val_set : typing.List[str]
        List of instance-ids to validate on
    n_incs : int
        Number of incumbents to return (n_incs <= 0 ==> all found configurations)
    """
    self.logger = logging.getLogger("%s.%s" % (self.__module__,
                                               self.__class__.__name__))

    # Core run configuration.
    self.scenario = scenario
    self.run_id, self.rng = get_rng(rng, run_id, logger=self.logger)
    self.kwargs = kwargs
    self.output_dir = None
    self.rh = RunHistory(average_cost, file_system=scenario.file_system)
    self._tae = tae
    self._tae_kwargs = tae_kwargs

    # A parallel portfolio needs at least two optimizers to be meaningful.
    if n_optimizers <= 1:
        self.logger.warning('Invalid value in %s: %d. Setting to 2',
                            'n_optimizers', n_optimizers)
    self.n_optimizers = 2 if n_optimizers < 2 else n_optimizers

    self.validate = validate
    self.shared_model = shared_model
    # Clamp the number of returned incumbents into [1, n_optimizers].
    self.n_incs = min(self.n_optimizers, max(1, n_incs))

    # Default the validation set to the full training set.
    self.val_set = scenario.train_insts if val_set is None else val_set