def test_challenger_list_callback(self, patch_sample, patch_ei, patch_impute):
    values = (10, 1, 9, 2, 8, 3, 7, 4, 6, 5)
    patch_sample.return_value = ConfigurationMock(1)
    patch_ei.return_value = np.array([[_] for _ in values], dtype=float)
    patch_impute.side_effect = lambda l: values
    cs = ConfigurationSpace()
    ei = EI(None)
    rs = RandomSearch(ei, cs)
    rs._maximize = unittest.mock.Mock()
    rs._maximize.return_value = [(0, 0)]

    rval = rs.maximize(
        runhistory=None,
        stats=None,
        num_points=10,
    )
    self.assertEqual(rs._maximize.call_count, 0)
    next(rval)
    self.assertEqual(rs._maximize.call_count, 1)

    random_configuration_chooser = unittest.mock.Mock()
    random_configuration_chooser.check.side_effect = [True, False, False, False]
    rs._maximize = unittest.mock.Mock()
    rs._maximize.return_value = [(0, 0), (1, 1)]

    rval = rs.maximize(
        runhistory=None,
        stats=None,
        num_points=10,
        random_configuration_chooser=random_configuration_chooser,
    )
    self.assertEqual(rs._maximize.call_count, 0)
    # The first configuration is chosen at random (see the
    # random_configuration_chooser mock).
    conf = next(rval)
    self.assertIsInstance(conf, ConfigurationMock)
    self.assertEqual(rs._maximize.call_count, 0)
    # The 2nd configuration triggers the call to the callback (see the
    # random_configuration_chooser mock).
    conf = next(rval)
    self.assertEqual(rs._maximize.call_count, 1)
    self.assertEqual(conf, 0)
    # The 3rd configuration doesn't trigger the callback any more.
    conf = next(rval)
    self.assertEqual(rs._maximize.call_count, 1)
    self.assertEqual(conf, 1)

    with self.assertRaises(StopIteration):
        next(rval)

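# Context for the test above: RandomSearch.maximize returns a lazily evaluated
# challenger iterator, so _maximize only runs once the first non-random
# configuration is requested. A minimal sketch of that interleaving pattern
# (the names below are illustrative, not SMAC's actual ChallengerList API):
def lazy_challengers(chooser, sample_random, run_maximize):
    maximized = None  # defer _maximize until a model-based challenger is needed
    i = 0
    while True:
        i += 1
        if chooser.check(i):
            yield sample_random()  # random configuration, no model involved
        else:
            if maximized is None:
                maximized = list(run_maximize())  # triggered exactly once
            if not maximized:
                return  # exhausted -> StopIteration, as asserted above
            acq_value, config = maximized.pop(0)  # (acq. value, config) pairs
            yield config
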
class EPMChooser(object):
    def __init__(self,
                 scenario: Scenario,
                 stats: Stats,
                 runhistory: RunHistory,
                 runhistory2epm: AbstractRunHistory2EPM,
                 model: RandomForestWithInstances,
                 acq_optimizer: AcquisitionFunctionMaximizer,
                 acquisition_func: AbstractAcquisitionFunction,
                 rng: np.random.RandomState,
                 restore_incumbent: Configuration = None,
                 random_configuration_chooser: RandomConfigurationChooser = ChooserNoCoolDown(2.0),
                 predict_x_best: bool = True,
                 min_samples_model: int = 1):
        """Interface to train the EPM and generate next configurations.

        Parameters
        ----------
        scenario: smac.scenario.scenario.Scenario
            Scenario object
        stats: smac.stats.stats.Stats
            statistics object with configuration budgets
        runhistory: smac.runhistory.runhistory.RunHistory
            runhistory with all runs so far
        runhistory2epm: smac.runhistory.runhistory2epm.AbstractRunHistory2EPM
            object that converts runhistory data into EPM data
        model: smac.epm.rf_with_instances.RandomForestWithInstances
            empirical performance model (right now, we support only
            RandomForestWithInstances)
        acq_optimizer: smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer
            Optimizer of acquisition function.
        acquisition_func: AbstractAcquisitionFunction
            acquisition function (i.e., infill criterion for acq_optimizer)
        rng: np.random.RandomState
            Random number generator
        restore_incumbent: Configuration
            incumbent to be used from the start. ONLY used to restore states.
        random_configuration_chooser:
            Chooser for random configuration -- one of
            * ChooserNoCoolDown(modulus)
            * ChooserLinearCoolDown(start_modulus, modulus_increment, end_modulus)
        predict_x_best: bool
            Choose x_best for computing the acquisition function via the model
            instead of via the observations.
        min_samples_model: int
            Minimum number of samples to build a model.
        """
        self.logger = logging.getLogger(
            self.__module__ + "." + self.__class__.__name__)
        self.incumbent = restore_incumbent
        self.scenario = scenario
        self.stats = stats
        self.runhistory = runhistory
        self.rh2EPM = runhistory2epm
        self.model = model
        self.acq_optimizer = acq_optimizer
        self.acquisition_func = acquisition_func
        self.rng = rng
        self.random_configuration_chooser = random_configuration_chooser

        self._random_search = RandomSearch(
            acquisition_func,
            self.scenario.cs,  # type: ignore[attr-defined] # noqa F821
            rng,
        )

        self.initial_design_configs = []  # type: typing.List[Configuration]
        self.predict_x_best = predict_x_best
        self.min_samples_model = min_samples_model
        self.currently_considered_budgets = [0.0]

    def _collect_data_to_train_model(self) -> typing.Tuple[np.ndarray, np.ndarray, np.ndarray]:
        # If we use a float value as a budget, we want to train the model only
        # on the highest budget.
        available_budgets = []
        for run_key in self.runhistory.data.keys():
            available_budgets.append(run_key.budget)

        # Sort available budgets from highest to lowest budget
        available_budgets = sorted(list(set(available_budgets)), reverse=True)

        # Get #points per budget and if there are enough samples, then build a model
        for b in available_budgets:
            X, Y = self.rh2EPM.transform(self.runhistory, budget_subset=[b])
            if X.shape[0] >= self.min_samples_model:
                self.currently_considered_budgets = [b]
                configs_array = self.rh2EPM.get_configurations(
                    self.runhistory,
                    budget_subset=self.currently_considered_budgets)
                return X, Y, configs_array

        return np.empty(shape=[0, 0]), np.empty(shape=[0]), np.empty(shape=[0, 0])

    def _get_evaluated_configs(self) -> typing.List[Configuration]:
        return self.runhistory.get_all_configs_per_budget(
            budget_subset=self.currently_considered_budgets)

    def choose_next(self, incumbent_value: float = None) -> typing.Iterator[Configuration]:
        """Choose next candidate solution with Bayesian optimization.

        The suggested configurations depend on the argument ``acq_optimizer``
        to the ``SMBO`` class.

        Parameters
        ----------
        incumbent_value: float
            Cost value of incumbent configuration (required for acquisition
            function); if not given, it will be inferred from the runhistory
            or predicted; if not given and the runhistory is empty, a
            ValueError is raised.

        Returns
        -------
        Iterator
        """
        self.logger.debug("Search for next configuration")
        X, Y, X_configurations = self._collect_data_to_train_model()

        if X.shape[0] == 0:
            # Only return a single point to avoid an overly high number of
            # random search iterations
            return self._random_search.maximize(
                runhistory=self.runhistory, stats=self.stats, num_points=1)

        self.model.train(X, Y)

        if incumbent_value is not None:
            best_observation = incumbent_value
            x_best_array = None  # type: typing.Optional[np.ndarray]
        else:
            if self.runhistory.empty():
                raise ValueError("Runhistory is empty and the cost value of "
                                 "the incumbent is unknown.")
            x_best_array, best_observation = self._get_x_best(
                self.predict_x_best, X_configurations)

        self.acquisition_func.update(
            model=self.model,
            eta=best_observation,
            incumbent_array=x_best_array,
            num_data=len(self._get_evaluated_configs()),
            X=X_configurations,
        )

        challengers = self.acq_optimizer.maximize(
            runhistory=self.runhistory,
            stats=self.stats,
            num_points=self.scenario.acq_opt_challengers,  # type: ignore[attr-defined] # noqa F821
            random_configuration_chooser=self.random_configuration_chooser)
        return challengers

    def _get_x_best(self, predict: bool, X: np.ndarray) -> typing.Tuple[np.ndarray, float]:
        """Get the array representation and cost of the "best" configuration.

        The definition of best varies depending on the argument ``predict``.
        If set to ``True``, this function will return the stats of the best
        configuration as predicted by the model, otherwise it will return the
        stats for the best observed configuration.

        Parameters
        ----------
        predict : bool
            Whether to use the predicted or observed best.

        Returns
        -------
        np.ndarray
            Array representation of the best configuration
        float
            Cost of the best configuration
        """
        if predict:
            costs = list(map(
                lambda x: (
                    self.model.predict_marginalized_over_instances(
                        x.reshape((1, -1)))[0][0][0],
                    x,
                ),
                X,
            ))
            costs = sorted(costs, key=lambda t: t[0])
            x_best_array = costs[0][1]
            best_observation = costs[0][0]
            # won't need log(y) if EPM was already trained on log(y)
        else:
            all_configs = self.runhistory.get_all_configs_per_budget(
                budget_subset=self.currently_considered_budgets)
            x_best = self.incumbent
            x_best_array = convert_configurations_to_array(all_configs)
            best_observation = self.runhistory.get_cost(x_best)
            best_observation_as_array = np.array(best_observation).reshape((1, 1))
            # It's unclear how to do this for inv scaling and potential future
            # scaling. This line should be changed if necessary.
            best_observation = self.rh2EPM.transform_response_values(
                best_observation_as_array)
            best_observation = best_observation[0][0]

        return x_best_array, best_observation

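# Illustration of the budget-selection rule in _collect_data_to_train_model:
# train on the highest budget that has at least min_samples_model observations.
# A self-contained sketch with toy data (names and numbers are ours, not SMAC's):
def _select_training_budget_sketch(samples_per_budget, min_samples_model=2):
    # samples_per_budget: dict mapping budget -> number of observations
    for b in sorted(samples_per_budget, reverse=True):  # highest budget first
        if samples_per_budget[b] >= min_samples_model:
            return b
    return None  # mirrors the empty-array fallback above

assert _select_training_budget_sketch({1.0: 6, 3.0: 5, 9.0: 1}) == 3.0
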
class SMBO(object):
    """Interface that contains the main Bayesian optimization loop

    Attributes
    ----------
    logger
    incumbent
    scenario
    config_space
    stats
    initial_design
    runhistory
    rh2EPM
    intensifier
    aggregate_func
    num_run
    model
    acq_optimizer
    acquisition_func
    rng
    """

    def __init__(self,
                 scenario: Scenario,
                 stats: Stats,
                 initial_design: InitialDesign,
                 runhistory: RunHistory,
                 runhistory2epm: AbstractRunHistory2EPM,
                 intensifier: Intensifier,
                 aggregate_func: callable,
                 num_run: int,
                 model: RandomForestWithInstances,
                 acq_optimizer: AcquisitionFunctionMaximizer,
                 acquisition_func: AbstractAcquisitionFunction,
                 rng: np.random.RandomState,
                 restore_incumbent: Configuration = None):
        """Interface that contains the main Bayesian optimization loop

        Parameters
        ----------
        scenario: smac.scenario.scenario.Scenario
            Scenario object
        stats: Stats
            statistics object with configuration budgets
        initial_design: InitialDesign
            initial sampling design
        runhistory: RunHistory
            runhistory with all runs so far
        runhistory2epm : AbstractRunHistory2EPM
            Object that implements the AbstractRunHistory2EPM to convert
            runhistory data into EPM data
        intensifier: Intensifier
            intensification of new challengers against incumbent configuration
            (probably with some kind of racing on the instances)
        aggregate_func: callable
            how to aggregate the runs in the runhistory to get the performance
            of a configuration
        num_run: int
            id of this run (used for pSMAC)
        model: RandomForestWithInstances
            empirical performance model (right now, we support only
            RandomForestWithInstances)
        acq_optimizer: AcquisitionFunctionMaximizer
            Optimizer of acquisition function.
        acquisition_func : AbstractAcquisitionFunction
            Object that implements the AbstractAcquisitionFunction (i.e.,
            infill criterion for acq_optimizer)
        restore_incumbent: Configuration
            incumbent to be used from the start. ONLY used to restore states.
        rng: np.random.RandomState
            Random number generator
        """
        self.logger = logging.getLogger(
            self.__module__ + "." + self.__class__.__name__)
        self.incumbent = restore_incumbent
        self.scenario = scenario
        self.config_space = scenario.cs
        self.stats = stats
        self.initial_design = initial_design
        self.runhistory = runhistory
        self.rh2EPM = runhistory2epm
        self.intensifier = intensifier
        self.aggregate_func = aggregate_func
        self.num_run = num_run
        self.model = model
        self.acq_optimizer = acq_optimizer
        self.acquisition_func = acquisition_func
        self.rng = rng

        self._random_search = RandomSearch(
            acquisition_func, self.config_space, rng
        )

    def start(self):
        """Starts the Bayesian Optimization loop.
        Detects whether the optimization is restored from a previous state.
        """
        self.stats.start_timing()

        # Initialization, depends on input
        if self.stats.ta_runs == 0 and self.incumbent is None:
            try:
                self.incumbent = self.initial_design.run()
            except FirstRunCrashedException as err:
                if self.scenario.abort_on_first_run_crash:
                    raise
        elif self.stats.ta_runs > 0 and self.incumbent is None:
            raise ValueError("According to stats there have been runs performed, "
                             "but the optimizer cannot detect an incumbent. Did "
                             "you set the incumbent (e.g. after restoring state)?")
        elif self.stats.ta_runs == 0 and self.incumbent is not None:
            raise ValueError("An incumbent is specified, but there are no runs "
                             "recorded in the Stats-object. If you're restoring "
                             "a state, please provide the Stats-object.")
        else:
            # Restoring state!
            self.logger.info("State Restored! Starting optimization with "
                             "incumbent %s", self.incumbent)
            self.logger.info("State restored with following budget:")
            self.stats.print_stats()

    def run(self):
        """Runs the Bayesian optimization loop

        Returns
        -------
        incumbent: np.array(1, H)
            The best found configuration
        """
        self.start()

        # Main BO loop
        while True:
            if self.scenario.shared_model:
                pSMAC.read(run_history=self.runhistory,
                           output_dirs=self.scenario.input_psmac_dirs,
                           configuration_space=self.config_space,
                           logger=self.logger)

            start_time = time.time()
            X, Y = self.rh2EPM.transform(self.runhistory)

            self.logger.debug("Search for next configuration")
            # get all found configurations sorted according to acq
            challengers = self.choose_next(X, Y)

            time_spent = time.time() - start_time
            time_left = self._get_timebound_for_intensification(time_spent)

            self.logger.debug("Intensify")
            self.incumbent, inc_perf = self.intensifier.intensify(
                challengers=challengers,
                incumbent=self.incumbent,
                run_history=self.runhistory,
                aggregate_func=self.aggregate_func,
                time_bound=max(self.intensifier._min_time, time_left))

            if self.scenario.shared_model:
                pSMAC.write(run_history=self.runhistory,
                            output_directory=self.scenario.output_dir_for_this_run)

            self.logger.debug(
                "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)" % (
                    self.stats.get_remaing_time_budget(),
                    self.stats.get_remaining_ta_budget(),
                    self.stats.get_remaining_ta_runs()))

            if self.stats.is_budget_exhausted():
                break

            self.stats.print_stats(debug_out=True)

        return self.incumbent

    def choose_next(self, X: np.ndarray, Y: np.ndarray,
                    incumbent_value: float = None):
        """Choose next candidate solution with Bayesian optimization.

        The suggested configurations depend on the argument ``acq_optimizer``
        to the ``SMBO`` class.

        Parameters
        ----------
        X : (N, D) numpy array
            Each row contains a configuration and one set of
            instance features.
        Y : (N, O) numpy array
            The function values for each configuration instance pair.
        incumbent_value: float
            Cost value of incumbent configuration (required for acquisition
            function); if not given, it will be inferred from the runhistory;
            if not given and the runhistory is empty, a ValueError is raised.

        Returns
        -------
        Iterable
        """
        if X.shape[0] == 0:
            # Only return a single point to avoid an overly high number of
            # random search iterations
            return self._random_search.maximize(
                runhistory=self.runhistory, stats=self.stats, num_points=1)

        self.model.train(X, Y)

        if incumbent_value is None:
            if self.runhistory.empty():
                raise ValueError("Runhistory is empty and the cost value of "
                                 "the incumbent is unknown.")
            incumbent_value = self.runhistory.get_cost(self.incumbent)

        self.acquisition_func.update(model=self.model, eta=incumbent_value)

        challengers = self.acq_optimizer.maximize(self.runhistory,
                                                  self.stats, 5000)
        return challengers

    def validate(self, config_mode='inc', instance_mode='train+test',
                 repetitions=1, use_epm=False, n_jobs=-1, backend='threading'):
        """Create validator-object and run validation, using
        scenario-information, runhistory from smbo and tae_runner from intensify

        Parameters
        ----------
        config_mode: str or list<Configuration>
            string or directly a list of Configuration
            str from [def, inc, def+inc, wallclock_time, cpu_time, all]
            time evaluates at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
            with max_time being the highest recorded time
        instance_mode: string
            what instances to use for validation, from
            [train, test, train+test]
        repetitions: int
            number of repetitions in nondeterministic algorithms
            (in deterministic will be fixed to 1)
        use_epm: bool
            whether to use an EPM instead of evaluating all runs with the TAE
        n_jobs: int
            number of parallel processes used by joblib

        Returns
        -------
        runhistory: RunHistory
            runhistory containing all specified runs
        """
        traj_fn = os.path.join(self.scenario.output_dir_for_this_run,
                               "traj_aclib2.json")
        trajectory = TrajLogger.read_traj_aclib_format(fn=traj_fn,
                                                       cs=self.scenario.cs)
        new_rh_path = os.path.join(self.scenario.output_dir_for_this_run,
                                   "validated_runhistory.json")

        validator = Validator(self.scenario, trajectory, self.rng)
        if use_epm:
            new_rh = validator.validate_epm(config_mode=config_mode,
                                            instance_mode=instance_mode,
                                            repetitions=repetitions,
                                            runhistory=self.runhistory,
                                            output=new_rh_path)
        else:
            new_rh = validator.validate(config_mode, instance_mode,
                                        repetitions, n_jobs, backend,
                                        self.runhistory,
                                        self.intensifier.tae_runner,
                                        new_rh_path)
        return new_rh

    def _get_timebound_for_intensification(self, time_spent):
        """Calculate time left for intensify from the time spent on
        choosing challengers using the fraction of time intended for
        intensification (which is specified in
        scenario.intensification_percentage).

        Parameters
        ----------
        time_spent : float

        Returns
        -------
        time_left : float
        """
        frac_intensify = self.scenario.intensification_percentage
        if frac_intensify <= 0 or frac_intensify >= 1:
            raise ValueError("The value for intensification_percentage-"
                             "option must lie in (0,1), instead: %.2f" %
                             frac_intensify)
        total_time = time_spent / (1 - frac_intensify)
        time_left = frac_intensify * total_time
        self.logger.debug("Total time: %.4f, time spent on choosing next "
                          "configurations: %.4f (%.2f), time left for "
                          "intensification: %.4f (%.2f)" %
                          (total_time, time_spent, (1 - frac_intensify),
                           time_left, frac_intensify))
        return time_left

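# Worked example of the arithmetic in _get_timebound_for_intensification:
# with intensification_percentage = 0.5 and 2.0s spent choosing challengers,
# total_time = 2.0 / (1 - 0.5) = 4.0s and time_left = 0.5 * 4.0 = 2.0s.
# A standalone sketch of the same formula (toy numbers, illustrative only):
def _timebound_sketch(time_spent: float, frac_intensify: float) -> float:
    assert 0.0 < frac_intensify < 1.0
    total_time = time_spent / (1 - frac_intensify)
    return frac_intensify * total_time

assert _timebound_sketch(2.0, 0.5) == 2.0
assert abs(_timebound_sketch(1.0, 0.9) - 9.0) < 1e-9  # mostly intensification
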
class SMBO(object):
    """Interface that contains the main Bayesian optimization loop

    Attributes
    ----------
    logger
    incumbent
    scenario
    config_space
    stats
    initial_design
    runhistory
    rh2EPM
    intensifier
    aggregate_func
    num_run
    model
    acq_optimizer
    acquisition_func
    rng
    random_configuration_chooser
    """

    def __init__(self,
                 scenario: Scenario,
                 stats: Stats,
                 initial_design: InitialDesign,
                 runhistory: RunHistory,
                 runhistory2epm: AbstractRunHistory2EPM,
                 intensifier: Intensifier,
                 aggregate_func: callable,
                 num_run: int,
                 model: RandomForestWithInstances,
                 acq_optimizer: AcquisitionFunctionMaximizer,
                 acquisition_func: AbstractAcquisitionFunction,
                 rng: np.random.RandomState,
                 restore_incumbent: Configuration = None,
                 random_configuration_chooser: typing.Union[
                     ChooserNoCoolDown, ChooserLinearCoolDown] = ChooserNoCoolDown(2.0),
                 predict_incumbent: bool = True):
        """Interface that contains the main Bayesian optimization loop

        Parameters
        ----------
        scenario: smac.scenario.scenario.Scenario
            Scenario object
        stats: Stats
            statistics object with configuration budgets
        initial_design: InitialDesign
            initial sampling design
        runhistory: RunHistory
            runhistory with all runs so far
        runhistory2epm : AbstractRunHistory2EPM
            Object that implements the AbstractRunHistory2EPM to convert
            runhistory data into EPM data
        intensifier: Intensifier
            intensification of new challengers against incumbent configuration
            (probably with some kind of racing on the instances)
        aggregate_func: callable
            how to aggregate the runs in the runhistory to get the performance
            of a configuration
        num_run: int
            id of this run (used for pSMAC)
        model: RandomForestWithInstances
            empirical performance model (right now, we support only
            RandomForestWithInstances)
        acq_optimizer: AcquisitionFunctionMaximizer
            Optimizer of acquisition function.
        acquisition_func : AbstractAcquisitionFunction
            Object that implements the AbstractAcquisitionFunction (i.e.,
            infill criterion for acq_optimizer)
        restore_incumbent: Configuration
            incumbent to be used from the start. ONLY used to restore states.
        rng: np.random.RandomState
            Random number generator
        random_configuration_chooser
            Chooser for random configuration -- one of
            * ChooserNoCoolDown(modulus)
            * ChooserLinearCoolDown(start_modulus, modulus_increment, end_modulus)
        predict_incumbent: bool
            Use predicted performance of incumbent instead of
            observed performance
        """
        self.logger = logging.getLogger(
            self.__module__ + "." + self.__class__.__name__)
        self.incumbent = restore_incumbent
        self.scenario = scenario
        self.config_space = scenario.cs
        self.stats = stats
        self.initial_design = initial_design
        self.runhistory = runhistory
        self.rh2EPM = runhistory2epm
        self.intensifier = intensifier
        self.aggregate_func = aggregate_func
        self.num_run = num_run
        self.model = model
        self.acq_optimizer = acq_optimizer
        self.acquisition_func = acquisition_func
        self.rng = rng
        self.random_configuration_chooser = random_configuration_chooser

        self._random_search = RandomSearch(acquisition_func,
                                           self.config_space, rng)

        self.predict_incumbent = predict_incumbent

    def start(self):
        """Starts the Bayesian Optimization loop.
        Detects whether the optimization is restored from a previous state.
        """
        self.stats.start_timing()

        # Initialization, depends on input
        if self.stats.ta_runs == 0 and self.incumbent is None:
            self.incumbent = self.initial_design.run()
        elif self.stats.ta_runs > 0 and self.incumbent is None:
            raise ValueError(
                "According to stats there have been runs performed, "
                "but the optimizer cannot detect an incumbent. Did "
                "you set the incumbent (e.g. after restoring state)?")
after restoring state)?") elif self.stats.ta_runs == 0 and self.incumbent is not None: raise ValueError( "An incumbent is specified, but there are no runs " "recorded in the Stats-object. If you're restoring " "a state, please provide the Stats-object.") else: # Restoring state! self.logger.info( "State Restored! Starting optimization with " "incumbent %s", self.incumbent) self.logger.info("State restored with following budget:") self.stats.print_stats() # To be on the safe side -> never return "None" as incumbent if not self.incumbent: self.incumbent = self.scenario.cs.get_default_configuration() def run(self): """Runs the Bayesian optimization loop Returns ---------- incumbent: np.array(1, H) The best found configuration """ self.start() # Main BO loop while True: if self.scenario.shared_model: pSMAC.read(run_history=self.runhistory, output_dirs=self.scenario.input_psmac_dirs, configuration_space=self.config_space, logger=self.logger) start_time = time.time() X, Y = self.rh2EPM.transform(self.runhistory) self.logger.debug("Search for next configuration") # get all found configurations sorted according to acq challengers = self.choose_next(X, Y) time_spent = time.time() - start_time time_left = self._get_timebound_for_intensification(time_spent) self.logger.debug("Intensify") self.incumbent, inc_perf = self.intensifier.intensify( challengers=challengers, incumbent=self.incumbent, run_history=self.runhistory, aggregate_func=self.aggregate_func, time_bound=max(self.intensifier._min_time, time_left)) if self.scenario.shared_model: pSMAC.write( run_history=self.runhistory, output_directory=self.scenario.output_dir_for_this_run, logger=self.logger) logging.debug( "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)" % (self.stats.get_remaing_time_budget(), self.stats.get_remaining_ta_budget(), self.stats.get_remaining_ta_runs())) if self.stats.is_budget_exhausted(): break self.stats.print_stats(debug_out=True) return self.incumbent def choose_next(self, X: np.ndarray, Y: np.ndarray, incumbent_value: float = None): """Choose next candidate solution with Bayesian optimization. The suggested configurations depend on the argument ``acq_optimizer`` to the ``SMBO`` class. Parameters ---------- X : (N, D) numpy array Each row contains a configuration and one set of instance features. Y : (N, O) numpy array The function values for each configuration instance pair. 
        incumbent_value: float
            Cost value of incumbent configuration (required for acquisition
            function); if not given, it will be inferred from the runhistory;
            if not given and the runhistory is empty, a ValueError is raised.

        Returns
        -------
        Iterable
        """
        if X.shape[0] == 0:
            # Only return a single point to avoid an overly high number of
            # random search iterations
            return self._random_search.maximize(
                runhistory=self.runhistory, stats=self.stats, num_points=1)

        self.model.train(X, Y)

        if incumbent_value is None:
            if self.runhistory.empty():
                raise ValueError("Runhistory is empty and the cost value of "
                                 "the incumbent is unknown.")
            incumbent_value = self._get_incumbent_value()

        self.acquisition_func.update(model=self.model, eta=incumbent_value,
                                     num_data=len(self.runhistory.data))

        challengers = self.acq_optimizer.maximize(
            runhistory=self.runhistory,
            stats=self.stats,
            num_points=self.scenario.acq_opt_challengers,
            random_configuration_chooser=self.random_configuration_chooser)
        return challengers

    def _get_incumbent_value(self):
        """Get the incumbent value either from the runhistory or from the best
        predicted performance on the configurations in the runhistory
        (depends on self.predict_incumbent).

        Returns
        -------
        float
        """
        if self.predict_incumbent:
            configs = convert_configurations_to_array(
                self.runhistory.get_all_configs())
            costs = list(map(
                lambda config: self.model.predict_marginalized_over_instances(
                    config.reshape((1, -1)))[0][0][0],
                configs,
            ))
            incumbent_value = np.min(costs)
            # won't need log(y) if EPM was already trained on log(y)
        else:
            if self.runhistory.empty():
                raise ValueError("Runhistory is empty and the cost value of "
                                 "the incumbent is unknown.")
            incumbent_value = self.runhistory.get_cost(self.incumbent)
            # It's unclear how to do this for inv scaling and potential future
            # scaling. This line should be changed if necessary.
            incumbent_value_as_array = np.array(incumbent_value).reshape((1, 1))
            incumbent_value = self.rh2EPM.transform_response_values(
                incumbent_value_as_array)
            incumbent_value = incumbent_value[0][0]

        return incumbent_value

    def validate(self, config_mode='inc', instance_mode='train+test',
                 repetitions=1, use_epm=False, n_jobs=-1, backend='threading'):
        """Create validator-object and run validation, using
        scenario-information, runhistory from smbo and tae_runner from intensify

        Parameters
        ----------
        config_mode: str or list<Configuration>
            string or directly a list of Configuration
            str from [def, inc, def+inc, wallclock_time, cpu_time, all]
            time evaluates at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
            with max_time being the highest recorded time
        instance_mode: string
            what instances to use for validation, from
            [train, test, train+test]
        repetitions: int
            number of repetitions in nondeterministic algorithms
            (in deterministic will be fixed to 1)
        use_epm: bool
            whether to use an EPM instead of evaluating all runs with the TAE
        n_jobs: int
            number of parallel processes used by joblib

        Returns
        -------
        runhistory: RunHistory
            runhistory containing all specified runs
        """
        if isinstance(config_mode, str):
            traj_fn = os.path.join(self.scenario.output_dir_for_this_run,
                                   "traj_aclib2.json")
            trajectory = TrajLogger.read_traj_aclib_format(fn=traj_fn,
                                                           cs=self.scenario.cs)
        else:
            trajectory = None
        if self.scenario.output_dir_for_this_run:
            new_rh_path = os.path.join(self.scenario.output_dir_for_this_run,
                                       "validated_runhistory.json")
        else:
            new_rh_path = None

        validator = Validator(self.scenario, trajectory, self.rng)
        if use_epm:
            new_rh = validator.validate_epm(config_mode=config_mode,
                                            instance_mode=instance_mode,
                                            repetitions=repetitions,
                                            runhistory=self.runhistory,
                                            output_fn=new_rh_path)
        else:
            new_rh = validator.validate(config_mode, instance_mode,
                                        repetitions, n_jobs, backend,
                                        self.runhistory,
                                        self.intensifier.tae_runner,
                                        output_fn=new_rh_path)
        return new_rh

    def _get_timebound_for_intensification(self, time_spent: float):
        """Calculate time left for intensify from the time spent on
        choosing challengers using the fraction of time intended for
        intensification (which is specified in
        scenario.intensification_percentage).

        Parameters
        ----------
        time_spent : float

        Returns
        -------
        time_left : float
        """
        frac_intensify = self.scenario.intensification_percentage
        if frac_intensify <= 0 or frac_intensify >= 1:
            raise ValueError("The value for intensification_percentage-"
                             "option must lie in (0,1), instead: %.2f" %
                             frac_intensify)
        total_time = time_spent / (1 - frac_intensify)
        time_left = frac_intensify * total_time
        self.logger.debug("Total time: %.4f, time spent on choosing next "
                          "configurations: %.4f (%.2f), time left for "
                          "intensification: %.4f (%.2f)" %
                          (total_time, time_spent, (1 - frac_intensify),
                           time_left, frac_intensify))
        return time_left

    def _component_builder(self, conf: typing.Union[Configuration, dict]) \
            -> typing.Tuple[AbstractAcquisitionFunction, AbstractEPM]:
        """Builds a new acquisition function object and EPM object and
        returns these.

        Parameters
        ----------
        conf: typing.Union[Configuration, dict]
            configuration specifying "model" and "acq_func"

        Returns
        -------
        typing.Tuple[AbstractAcquisitionFunction, AbstractEPM]
        """
        types, bounds = get_types(
            self.config_space, instance_features=self.scenario.feature_array)

        if conf["model"] == "RF":
            model = RandomForestWithInstances(
                configspace=self.config_space,
                types=types,
                bounds=bounds,
                instance_features=self.scenario.feature_array,
                seed=self.rng.randint(MAXINT),
                pca_components=conf.get("pca_dim", self.scenario.PCA_DIM),
                log_y=conf.get("log_y", self.scenario.transform_y in ["LOG", "LOGS"]),
                num_trees=conf.get("num_trees", self.scenario.rf_num_trees),
                do_bootstrapping=conf.get("do_bootstrapping",
                                          self.scenario.rf_do_bootstrapping),
                ratio_features=conf.get("ratio_features",
                                        self.scenario.rf_ratio_features),
                min_samples_split=conf.get("min_samples_split",
                                           self.scenario.rf_min_samples_split),
                min_samples_leaf=conf.get("min_samples_leaf",
                                          self.scenario.rf_min_samples_leaf),
                max_depth=conf.get("max_depth", self.scenario.rf_max_depth),
            )
        elif conf["model"] == "GP":
            from smac.epm.gp_kernels import ConstantKernel, HammingKernel, WhiteKernel, Matern

            cov_amp = ConstantKernel(
                2.0,
                constant_value_bounds=(np.exp(-10), np.exp(2)),
                prior=LognormalPrior(mean=0.0, sigma=1.0, rng=self.rng),
            )

            cont_dims = np.nonzero(types == 0)[0]
            cat_dims = np.nonzero(types != 0)[0]

            if len(cont_dims) > 0:
                exp_kernel = Matern(
                    np.ones([len(cont_dims)]),
                    [(np.exp(-10), np.exp(2)) for _ in range(len(cont_dims))],
                    nu=2.5,
                    operate_on=cont_dims,
                )

            if len(cat_dims) > 0:
                ham_kernel = HammingKernel(
                    np.ones([len(cat_dims)]),
                    [(np.exp(-10), np.exp(2)) for _ in range(len(cat_dims))],
                    operate_on=cat_dims,
                )

            noise_kernel = WhiteKernel(
                noise_level=1e-8,
                noise_level_bounds=(np.exp(-25), np.exp(2)),
                prior=HorseshoePrior(scale=0.1, rng=self.rng),
            )

            if len(cont_dims) > 0 and len(cat_dims) > 0:
                # both
                kernel = cov_amp * (exp_kernel * ham_kernel) + noise_kernel
            elif len(cont_dims) > 0 and len(cat_dims) == 0:
                # only continuous
                kernel = cov_amp * exp_kernel + noise_kernel
            elif len(cont_dims) == 0 and len(cat_dims) > 0:
                # only categorical
                kernel = cov_amp * ham_kernel + noise_kernel
            else:
                raise ValueError()
            n_mcmc_walkers = 3 * len(kernel.theta)
            if n_mcmc_walkers % 2 == 1:
                n_mcmc_walkers += 1

            model = GaussianProcessMCMC(
                self.config_space,
                types=types,
                bounds=bounds,
                kernel=kernel,
                n_mcmc_walkers=n_mcmc_walkers,
                chain_length=250,
                burnin_steps=250,
                normalize_y=True,
                seed=self.rng.randint(low=0, high=10000),
            )
        else:
            raise ValueError("Unknown model type %s" % conf["model"])

        if conf["acq_func"] == "EI":
            acq = EI(model=model, par=conf.get("par_ei", 0))
        elif conf["acq_func"] == "LCB":
            acq = LCB(model=model, par=conf.get("par_lcb", 0))
        elif conf["acq_func"] == "PI":
            acq = PI(model=model, par=conf.get("par_pi", 0))
        elif conf["acq_func"] == "LogEI":
            # par value should be in log-space
            acq = LogEI(model=model, par=conf.get("par_logei", 0))
        else:
            raise ValueError("Unknown acquisition function %s" % conf["acq_func"])

        return acq, model

    def _get_acm_cs(self):
        """Returns a configuration space designed for querying
        ~smac.optimizer.smbo._component_builder.

        Returns
        -------
        ConfigurationSpace
        """
        cs = ConfigurationSpace()
        cs.seed(self.rng.randint(0, 2**20))

        if 'gp' in smac.extras_installed:
            model = CategoricalHyperparameter("model", choices=("RF", "GP"))
        else:
            model = Constant("model", value="RF")

        num_trees = Constant("num_trees", value=10)
        bootstrap = CategoricalHyperparameter("do_bootstrapping",
                                              choices=(True, False),
                                              default_value=True)
        ratio_features = CategoricalHyperparameter(
            "ratio_features", choices=(3 / 6, 4 / 6, 5 / 6, 1), default_value=1)
        min_split = UniformIntegerHyperparameter("min_samples_to_split",
                                                 lower=1, upper=10,
                                                 default_value=2)
        min_leaves = UniformIntegerHyperparameter("min_samples_in_leaf",
                                                  lower=1, upper=10,
                                                  default_value=1)
        cs.add_hyperparameters([model, num_trees, bootstrap, ratio_features,
                                min_split, min_leaves])

        inc_num_trees = InCondition(num_trees, model, ["RF"])
        inc_bootstrap = InCondition(bootstrap, model, ["RF"])
        inc_ratio_features = InCondition(ratio_features, model, ["RF"])
        inc_min_split = InCondition(min_split, model, ["RF"])
        inc_min_leaves = InCondition(min_leaves, model, ["RF"])
        cs.add_conditions([inc_num_trees, inc_bootstrap, inc_ratio_features,
                           inc_min_split, inc_min_leaves])

        acq = CategoricalHyperparameter("acq_func",
                                        choices=("EI", "LCB", "PI", "LogEI"))
        par_ei = UniformFloatHyperparameter("par_ei", lower=-10, upper=10)
        par_pi = UniformFloatHyperparameter("par_pi", lower=-10, upper=10)
        par_logei = UniformFloatHyperparameter("par_logei", lower=0.001,
                                               upper=100, log=True)
        par_lcb = UniformFloatHyperparameter("par_lcb", lower=0.0001,
                                             upper=0.9999)
        cs.add_hyperparameters([acq, par_ei, par_pi, par_logei, par_lcb])

        inc_par_ei = InCondition(par_ei, acq, ["EI"])
        inc_par_pi = InCondition(par_pi, acq, ["PI"])
        inc_par_logei = InCondition(par_logei, acq, ["LogEI"])
        inc_par_lcb = InCondition(par_lcb, acq, ["LCB"])
        cs.add_conditions([inc_par_ei, inc_par_pi, inc_par_logei, inc_par_lcb])

        return cs

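# Hypothetical usage sketch for the two helpers above: sample one component
# configuration from the ACM space and rebuild the matching EPM/acquisition
# pair. Assumes a fully constructed SMBO instance; not part of the actual API.
def _acm_roundtrip_sketch(smbo):
    acm_cs = smbo._get_acm_cs()                     # hyper-hyperparameter space
    component_conf = acm_cs.sample_configuration()  # e.g. model=RF, acq_func=EI
    acq, model = smbo._component_builder(component_conf)
    smbo.model = model                              # swap in the new components
    smbo.acquisition_func = acq
    return acq, model
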
class SMBO(object):
    """Interface that contains the main Bayesian optimization loop

    Attributes
    ----------
    logger
    incumbent
    scenario
    config_space
    stats
    initial_design
    runhistory
    rh2EPM
    intensifier
    aggregate_func
    num_run
    model
    acq_optimizer
    acquisition_func
    rng
    """

    def __init__(
            self,
            scenario: Scenario,
            stats: Stats,
            initial_design: InitialDesign,
            runhistory: RunHistory,
            runhistory2epm: AbstractRunHistory2EPM,
            intensifier: Intensifier,
            aggregate_func: callable,
            num_run: int,
            model: AbstractEPM,
            acq_optimizer: AcquisitionFunctionMaximizer,
            acquisition_func: AbstractAcquisitionFunction,
            rng: np.random.RandomState,
            restore_incumbent: Configuration = None,
            # Forcibly adds the training and validation sets into SMBO
            hoag: AbstractHOAG = None,
            # Path to the parameter-server worker's script file
            # server: Server = None,
            bayesian_optimization: bool = False):
        """Interface that contains the main Bayesian optimization loop

        Parameters
        ----------
        scenario: smac.scenario.scenario.Scenario
            Scenario object
        stats: Stats
            statistics object with configuration budgets
        initial_design: InitialDesign
            initial sampling design
        runhistory: RunHistory
            runhistory with all runs so far
        runhistory2epm : AbstractRunHistory2EPM
            Object that implements the AbstractRunHistory2EPM to convert
            runhistory data into EPM data
        intensifier: Intensifier
            intensification of new challengers against incumbent configuration
            (probably with some kind of racing on the instances)
        aggregate_func: callable
            how to aggregate the runs in the runhistory to get the performance
            of a configuration
        num_run: int
            id of this run (used for pSMAC)
        model: AbstractEPM
            empirical performance model (any AbstractEPM subclass)
        acq_optimizer: AcquisitionFunctionMaximizer
            Optimizer of acquisition function.
        acquisition_func : AbstractAcquisitionFunction
            Object that implements the AbstractAcquisitionFunction (i.e.,
            infill criterion for acq_optimizer)
        restore_incumbent: Configuration
            incumbent to be used from the start. ONLY used to restore states.
        rng: np.random.RandomState
            Random number generator
        hoag: AbstractHOAG
            HOAG object whose fit/predict_gradient methods are used directly
            to provide gradients for the EPM
        bayesian_optimization: bool
            if True, propose the next configuration with an external
            BayesianOptimization object (GPR + EI) instead of the SMAC EPM
        """
        self.logger = logging.getLogger(
            self.__module__ + "." + self.__class__.__name__)
        self.incumbent = restore_incumbent
        self.scenario = scenario
        self.config_space = scenario.cs
        self.stats = stats
        self.initial_design = initial_design
        self.runhistory = runhistory
        self.rh2EPM = runhistory2epm
        self.intensifier = intensifier
        self.aggregate_func = aggregate_func
        self.num_run = num_run
        self.model = model
        self.acq_optimizer = acq_optimizer
        self.acquisition_func = acquisition_func
        self.rng = rng
        # The HOAG object; its fit, predict, etc. are used directly
        self.hoag = hoag
        # Holds the server-side process
        # self.server = server
        self.server = None
        self.bayesian_optimization = bayesian_optimization

        self._random_search = RandomSearch(acquisition_func,
                                           self.config_space, rng)

    def start(self):
        """Starts the Bayesian Optimization loop.
        Detects whether the optimization is restored from a previous state.
        """
        self.stats.start_timing()

        # Initialization, depends on input
        if self.stats.ta_runs == 0 and self.incumbent is None:
            try:
                if self.server is None:
                    self.incumbent = self.initial_design.run()
                else:
                    # Each worker produces its own first incumbent; the server
                    # then receives one of them.
                    self.incumbent, new_runhistory = self.server.pull()
                    self.runhistory.update(new_runhistory)
            except FirstRunCrashedException as err:
                if self.scenario.abort_on_first_run_crash:
                    raise
        elif self.stats.ta_runs > 0 and self.incumbent is None:
            raise ValueError(
                "According to stats there have been runs performed, "
                "but the optimizer cannot detect an incumbent. Did "
                "you set the incumbent (e.g. after restoring state)?")
after restoring state)?") elif self.stats.ta_runs == 0 and self.incumbent is not None: raise ValueError( "An incumbent is specified, but there are no runs " "recorded in the Stats-object. If you're restoring " "a state, please provide the Stats-object.") else: # Restoring state! self.logger.info( "State Restored! Starting optimization with " "incumbent %s", self.incumbent) self.logger.info("State restored with following budget:") self.stats.print_stats() def run(self): """Runs the Bayesian optimization loop Returns ---------- incumbent: np.array(1, H) The best found configuration """ self.start() # 设置一个counter counter = 0 # Main BO loop while True: # 打印每轮SMBO的最优结果(包括首轮SMBO 0) print('SMBO ' + str(counter) + ': ' + str(self.runhistory.get_cost(self.incumbent))) counter += 1 if self.scenario.shared_model: pSMAC.read(run_history=self.runhistory, output_dirs=self.scenario.input_psmac_dirs, configuration_space=self.config_space, logger=self.logger) start_time = time.time() X, Y = self.rh2EPM.transform(self.runhistory) self.logger.debug("Search for next configuration") # get all found configurations sorted according to acq challengers = self.choose_next(X, Y) time_spent = time.time() - start_time time_left = self._get_timebound_for_intensification(time_spent) self.logger.debug("Intensify") if self.server is None: self.incumbent, inc_perf = self.intensifier.intensify( challengers=challengers, incumbent=self.incumbent, run_history=self.runhistory, aggregate_func=self.aggregate_func, time_bound=max(self.intensifier._min_time, time_left)) else: # 从worker读取loss,加入history再运行新的challengers print(time_left) self.server.push(incumbent=self.incumbent, runhistory=self.runhistory, challengers=challengers.challengers, time_left=time_left) # 从worker读取runhistory,并merge到self.runhistory incumbent, new_runhistory = self.server.pull() self.runhistory.update(new_runhistory) # 更新了runhistory之后,应该找寻是否存在新的incumbent # 因为worker没有完整的 runhistory_old = self.runhistory.get_history_for_config( self.incumbent) runhistory_new = self.runhistory.get_history_for_config( incumbent) # 找寻cost最小值 lowest_cost_old = min([cost[0] for cost in runhistory_old]) lowest_cost_new = min([cost[0] for cost in runhistory_new]) if lowest_cost_new < lowest_cost_old: # 替换为新的incumbent self.incumbent = incumbent """可以考虑用这个函数 new_incumbent = self._compare_configs( incumbent=incumbent, challenger=challenger, run_history=run_history, aggregate_func=aggregate_func, log_traj=log_traj) """ if self.scenario.shared_model: pSMAC.write( run_history=self.runhistory, output_directory=self.scenario.output_dir_for_this_run) logging.debug( "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)" % (self.stats.get_remaing_time_budget(), self.stats.get_remaining_ta_budget(), self.stats.get_remaining_ta_runs())) if self.stats.is_budget_exhausted(): break self.stats.print_stats(debug_out=True) return self.incumbent def choose_next(self, X: np.ndarray, Y: np.ndarray, incumbent_value: float = None): """Choose next candidate solution with Bayesian optimization. The suggested configurations depend on the argument ``acq_optimizer`` to the ``SMBO`` class. Parameters ---------- X : (N, D) numpy array Each row contains a configuration and one set of instance features. Y : (N, O) numpy array The function values for each configuration instance pair. 
        incumbent_value: float
            Cost value of incumbent configuration (required for acquisition
            function); if not given, it will be inferred from the runhistory;
            if not given and the runhistory is empty, a ValueError is raised.

        Returns
        -------
        Iterable
        """
        if X.shape[0] == 0:
            # Only return a single point to avoid an overly high number of
            # random search iterations
            return self._random_search.maximize(
                runhistory=self.runhistory, stats=self.stats, num_points=1)

        # Remove completely identical rows
        X, Y = remove_same_values(X, Y)
        self.logger.debug("Training data shape: %s", X.shape)

        # If a HOAG object was specified, use it to compute gradients
        if self.hoag is not None:
            # Initialize the gradient array
            gradient = np.zeros(X.shape)
            # For each row of X, compute the corresponding gradient (there is
            # a lot of redundant computation here)
            for i in range(X.shape[0]):
                self.hoag.fit(X[i, :])
                gradient[i, :] = self.hoag.predict_gradient()
            X = X.flatten()
            ind = np.argsort(X)
            gradient = gradient.flatten()[ind].reshape(-1, 1)
            X = X[ind].reshape(-1, 1)
            Y = Y.flatten()[ind].reshape(-1, 1)
            self.model.train(X, Y, gradient=gradient)
        elif self.bayesian_optimization:
            # Parameters used by the GPR
            gp_params = {"alpha": 1e-5, "n_restarts_optimizer": 2}
            # Read the hyperparameter ranges from the configuration space
            pbounds = {}
            for key in self.scenario.cs._hyperparameters.keys():
                # Only handle float-typed hyperparameters
                hyperparameter = self.scenario.cs._hyperparameters[key]
                if isinstance(hyperparameter.default_value, float):
                    pbounds[key] = (hyperparameter.lower, hyperparameter.upper)
            # Initialize the BayesianOptimization object
            bo = BayesianOptimization(X, Y, pbounds=pbounds, verbose=False)
            # Predict the next point at which EI is maximal
            newX = bo.maximize(acq="ei", **gp_params)
            # Convert the hyperparameter vectors back into Configurations
            challengers = [Configuration(self.scenario.cs, x) for x in newX]
            return challengers
        else:
            self.model.train(X, Y)

        # Debugging output for X and Y:
        # print("X: ", X.flatten())
        # print("Y: ", Y.flatten())
        # print("Y_pred: ", self.model.predict(X))
        # if self.hoag is not None:
        #     print("G: ", gradient)

        if incumbent_value is None:
            if self.runhistory.empty():
                raise ValueError("Runhistory is empty and the cost value of "
                                 "the incumbent is unknown.")
            incumbent_value = self.runhistory.get_cost(self.incumbent)

        self.acquisition_func.update(model=self.model, eta=incumbent_value)

        challengers = self.acq_optimizer.maximize(
            # Originally 5000; reduced to 500 for speed
            self.runhistory, self.stats, 500)
        return challengers

    def validate(self, config_mode='inc', instance_mode='train+test',
                 repetitions=1, use_epm=False, n_jobs=-1, backend='threading'):
        """Create validator-object and run validation, using
        scenario-information, runhistory from smbo and tae_runner from intensify

        Parameters
        ----------
        config_mode: str or list<Configuration>
            string or directly a list of Configuration
            str from [def, inc, def+inc, wallclock_time, cpu_time, all]
            time evaluates at cpu- or wallclock-timesteps of:
            [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
            with max_time being the highest recorded time
        instance_mode: string
            what instances to use for validation, from
            [train, test, train+test]
        repetitions: int
            number of repetitions in nondeterministic algorithms
            (in deterministic will be fixed to 1)
        use_epm: bool
            whether to use an EPM instead of evaluating all runs with the TAE
        n_jobs: int
            number of parallel processes used by joblib

        Returns
        -------
        runhistory: RunHistory
            runhistory containing all specified runs
        """
        traj_fn = os.path.join(self.scenario.output_dir_for_this_run,
                               "traj_aclib2.json")
        trajectory = TrajLogger.read_traj_aclib_format(fn=traj_fn,
                                                       cs=self.scenario.cs)
        new_rh_path = os.path.join(self.scenario.output_dir_for_this_run,
                                   "validated_runhistory.json")

        validator = Validator(self.scenario, trajectory, self.rng)
        if use_epm:
            new_rh = validator.validate_epm(config_mode=config_mode,
                                            instance_mode=instance_mode,
                                            repetitions=repetitions,
                                            runhistory=self.runhistory,
                                            output=new_rh_path)
        else:
            new_rh = validator.validate(config_mode, instance_mode,
                                        repetitions, n_jobs, backend,
                                        self.runhistory,
                                        self.intensifier.tae_runner,
                                        output=new_rh_path)
        return new_rh

    def _get_timebound_for_intensification(self, time_spent):
        """Calculate time left for intensify from the time spent on
        choosing challengers using the fraction of time intended for
        intensification (which is specified in
        scenario.intensification_percentage).

        Parameters
        ----------
        time_spent : float

        Returns
        -------
        time_left : float
        """
        frac_intensify = self.scenario.intensification_percentage
        if frac_intensify <= 0 or frac_intensify >= 1:
            raise ValueError("The value for intensification_percentage-"
                             "option must lie in (0,1), instead: %.2f" %
                             frac_intensify)
        total_time = time_spent / (1 - frac_intensify)
        time_left = frac_intensify * total_time
        self.logger.debug("Total time: %.4f, time spent on choosing next "
                          "configurations: %.4f (%.2f), time left for "
                          "intensification: %.4f (%.2f)" %
                          (total_time, time_spent, (1 - frac_intensify),
                           time_left, frac_intensify))
        return time_left

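# remove_same_values(X, Y) is called in choose_next above but is not defined in
# this file. A plausible minimal implementation (our assumption: it drops
# duplicate configuration rows, keeping one representative per unique row):
import numpy as np  # already imported as np in this module; repeated for self-containedness

def remove_same_values(X: np.ndarray, Y: np.ndarray):
    _, unique_idx = np.unique(X, axis=0, return_index=True)
    unique_idx = np.sort(unique_idx)  # keep the original row ordering
    return X[unique_idx], Y[unique_idx]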