def model(self, model_short_name='urfi'):
    """Build the empirical performance model and store it on ``self._model``.

    Parameters
    ----------
    model_short_name : str
        ``'rfi'`` for a plain ``RandomForestWithInstances``; ``'urfi'`` for
        the unlogged variants (which one depends on ``self._preprocessed``).

    Raises
    ------
    ValueError
        If ``model_short_name`` is neither ``'urfi'`` nor ``'rfi'``.
    """
    # Guard clause: reject unknown model names before touching any state.
    if model_short_name not in ['urfi', 'rfi']:
        raise ValueError(
            'Specified model %s does not exist or not supported!' %
            model_short_name)

    if model_short_name == 'rfi':
        self.types, self.bounds = get_types(self.scenario.cs,
                                            self.scenario.feature_array)
        self._model = RandomForestWithInstances(
            self.types, self.bounds,
            instance_features=self.scenario.feature_array,
            seed=12345)
    elif not self._preprocessed:
        # 'urfi' on raw data: keep instance features and censoring bounds.
        self.types, self.bounds = get_types(self.scenario.cs,
                                            self.scenario.feature_array)
        self._model = UnloggedEPARXrfi(
            self.types, self.bounds,
            instance_features=self.scenario.feature_array,
            seed=12345, cutoff=self.cutoff, threshold=self.threshold)
    else:
        # 'urfi' on preprocessed data: no instance features at all.
        self.types, self.bounds = get_types(self.scenario.cs, None)
        self._model = Unloggedrfwi(self.types, self.bounds,
                                   instance_features=None, seed=12345)

    # Enable out-of-bag error computation on the underlying random forest.
    self._model.rf_opts.compute_oob_error = True
def optimize(scenario, run, forest=False, seed=8, ratio=0.8):
    """Run SMAC on ``scenario`` and return the cost of the final incumbent.

    Parameters
    ----------
    scenario : Scenario
        Scenario to optimize; its ``output_dir`` is overwritten with a
        timestamped log directory under ``./logs``.
    run : callable or ExecuteTARun
        Target algorithm handed to SMAC as ``tae_runner``.
    forest : bool
        If True, maximize the acquisition function with ``ForestSearch``;
        otherwise use the default interleaved local/random search.
    seed : int
        Seed for the random forest model and SMAC's RNG.
    ratio : float
        Ratio parameter forwarded to ``ForestSearch``.

    Returns
    -------
    The cost (second element of the TAE run result) of re-running the
    incumbent once.
    """
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    rfr = RandomForestWithInstances(types=types, bounds=bounds,
                                    instance_features=scenario.feature_array,
                                    seed=seed)
    ei = EI(model=rfr)
    if forest:
        optimizer = ForestSearch(ei, scenario.cs, ratio=ratio)
    else:
        optimizer = InterleavedLocalAndRandomSearch(ei, scenario.cs)
    # NOTE: '%lf' is valid printf-style formatting in Python; the 'l'
    # length modifier is accepted and ignored.
    scenario.output_dir = "%s_%s_%d_%lf" % (
        "./logs/run_", "forest_" if forest else "random_", seed, time.time())
    smac = SMAC(
        scenario=scenario,
        rng=np.random.RandomState(seed),
        model=rfr,
        acquisition_function=ei,
        acquisition_function_optimizer=optimizer,
        tae_runner=run,
    )
    # BUG FIX: the return value of smac.optimize() used to be bound to
    # `incumbent` and then unconditionally overwritten in the `finally`
    # block (dead store). Read the incumbent from the solver instead; if
    # optimize() raises, the exception still propagates after `finally`.
    try:
        smac.optimize()
    finally:
        incumbent = smac.solver.incumbent
    return smac.get_tae_runner().run(incumbent, 1)[1]
def setUp(self):
    """Build the shared fixture: a three-parameter configuration space, a
    RunHistory filled with 500 randomly generated runs (some timeouts,
    some censored), a scenario stub, and an untrained RF model."""
    logging.basicConfig(level=logging.DEBUG)

    self.cs = ConfigurationSpace()
    self.cs.add_hyperparameter(CategoricalHyperparameter(
        name="cat_a_b", choices=["a", "b"], default_value="a"))
    self.cs.add_hyperparameter(UniformFloatHyperparameter(
        name="float_0_1", lower=0, upper=1, default_value=0.5))
    # NOTE(review): name says 0_100 but the bounds are [-10, 10] — the name
    # is kept as-is because tests elsewhere may reference it; confirm.
    self.cs.add_hyperparameter(UniformIntegerHyperparameter(
        name='integer_0_100', lower=-10, upper=10, default_value=0))

    self.rh = runhistory.RunHistory(aggregate_func=average_cost)
    rs = numpy.random.RandomState(1)
    to_count = 0  # runs that hit the cutoff (runtime == 40)
    cn_count = 0  # censored runs: timed-out below the cutoff
    for _ in range(500):
        config, seed, runtime, status, instance_id = generate_config(
            cs=self.cs, rs=rs)
        if runtime == 40:
            to_count += 1
        if runtime < 40 and status == StatusType.TIMEOUT:
            cn_count += 1
        self.rh.add(config=config, cost=runtime, time=runtime,
                    status=status, instance_id=instance_id,
                    seed=seed, additional_info=None)
    print("%d TIMEOUTs, %d censored" % (to_count, cn_count))

    self.scen = Scen()
    self.scen.run_obj = "runtime"
    self.scen.overall_obj = "par10"
    self.scen.cutoff = 40

    types, bounds = get_types(self.cs, None)
    self.model = RandomForestWithInstances(types=types, bounds=bounds,
                                           instance_features=None,
                                           seed=1234567980)
def testRandomImputation(self):
    """Sanity check for the RFR imputator on random data.

    For growing problem sizes, builds a random regression problem with a
    censored fraction, imputes the censored targets, and asserts that every
    imputed value exceeds the (halved) censored observation and is finite.
    """
    rs = numpy.random.RandomState(1)

    for i in range(0, 150, 15):
        # First random imputation sanity check
        num_samples = max(1, i * 10)
        num_feat = max(1, i)
        num_censored = int(num_samples * 0.1)
        X = rs.rand(num_samples, num_feat)
        y = numpy.sin(X[:, 0:1])
        cutoff = max(y) * 0.9
        y[y > cutoff] = cutoff

        # We have some cen data
        cen_X = X[:num_censored, :]
        cen_y = y[:num_censored]
        uncen_X = X[num_censored:, :]
        uncen_y = y[num_censored:]

        # cen_y is a view into y, so this also halves the first
        # num_censored entries of y in place — presumably to make the
        # censored values strict lower bounds; TODO confirm intent.
        cen_y /= 2

        cs = ConfigurationSpace()
        # BUG FIX: this loop previously reused `i` as its loop variable,
        # shadowing the outer loop variable of the same name.
        for feat_idx in range(num_feat):
            cs.add_hyperparameter(
                UniformFloatHyperparameter(name="a_%d" % feat_idx,
                                           lower=0, upper=1,
                                           default_value=0.5))

        types, bounds = get_types(cs, None)
        print(types)
        print(bounds)
        print('#' * 120)
        print(cen_X)
        print(uncen_X)
        print('~' * 120)
        self.model = RandomForestWithInstances(types=types, bounds=bounds,
                                               instance_features=None,
                                               seed=1234567980)
        imputor = rfr_imputator.RFRImputator(rng=rs, cutoff=cutoff,
                                             threshold=cutoff * 10,
                                             change_threshold=0.01,
                                             max_iter=5,
                                             model=self.model)
        imp_y = imputor.impute(censored_X=cen_X, censored_y=cen_y,
                               uncensored_X=uncen_X, uncensored_y=uncen_y)
        if imp_y is None:
            continue
        for idx in range(cen_y.shape[0]):
            self.assertGreater(imp_y[idx], cen_y[idx])
        self.assertTrue(numpy.isfinite(imp_y).all())
def test_init_EIPS_as_arguments(self):
    """A user-supplied EIPS model, acquisition function, and
    runhistory2epm object must be handed to the solver untouched,
    regardless of the run objective."""
    for run_objective in ('runtime', 'quality'):
        self.scenario.run_obj = run_objective
        types, bounds = get_types(self.scenario.cs, None)
        epm = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types, bounds)
        acquisition = EIPS(epm)
        rh2epm = RunHistory2EPM4EIPS(self.scenario, 2)
        solver = SMAC(self.scenario,
                      model=epm,
                      acquisition_function=acquisition,
                      runhistory2epm=rh2epm).solver
        self.assertIs(epm, solver.model)
        self.assertIs(acquisition, solver.acquisition_func)
        self.assertIs(rh2epm, solver.rh2EPM)
def setUp(self):
    """Common fixture: a two-parameter config space with three concrete
    configurations, an empty RunHistory, types/bounds, and a scenario."""
    unittest.TestCase.setUp(self)
    self.rh = runhistory.RunHistory(aggregate_func=average_cost)
    self.cs = get_config_space()
    self.config1 = Configuration(self.cs, values={'a': 0, 'b': 100})
    self.config2 = Configuration(self.cs, values={'a': 100, 'b': 0})
    self.config3 = Configuration(self.cs, values={'a': 100, 'b': 100})
    self.types, self.bounds = get_types(self.cs, None)
    # BUG FIX: self.scen was previously constructed twice; the first
    # Scenario (without 'output_dir') was immediately overwritten and only
    # wasted work, so it has been removed.
    self.scen = Scenario({"cutoff_time": 20, 'cs': self.cs,
                          'output_dir': ''})
def test_with_ordinal(self):
    """get_types must produce correct bounds for categorical, ordinal,
    float, and integer hyperparameters, and a model trained on repeated
    data must reproduce the targets closely."""
    cs = smac.configspace.ConfigurationSpace()
    # Unused return values of add_hyperparameter dropped (they were bound
    # to throwaway variables, with `b` reassigned three times).
    cs.add_hyperparameter(
        CategoricalHyperparameter('a', [0, 1], default_value=0))
    cs.add_hyperparameter(
        OrdinalHyperparameter('b', [0, 1], default_value=1))
    cs.add_hyperparameter(
        UniformFloatHyperparameter('c', lower=0., upper=1.,
                                   default_value=1))
    cs.add_hyperparameter(
        UniformIntegerHyperparameter('d', lower=0, upper=10,
                                     default_value=1))
    cs.seed(1)

    feat_array = np.array([0, 0, 0]).reshape(1, -1)
    types, bounds = get_types(cs, feat_array)
    model = RandomForestWithInstances(types=types, bounds=bounds,
                                      instance_features=feat_array,
                                      seed=1, ratio_features=1.0,
                                      pca_components=9)

    # Categorical: number of choices, no upper bound.
    self.assertEqual(bounds[0][0], 2)
    self.assertTrue(bounds[0][1] is np.nan)
    # Ordinal: first/last index.
    self.assertEqual(bounds[1][0], 0)
    self.assertEqual(bounds[1][1], 1)
    # Float and integer: normalized to [0, 1].
    self.assertEqual(bounds[2][0], 0.)
    self.assertEqual(bounds[2][1], 1.)
    self.assertEqual(bounds[3][0], 0.)
    self.assertEqual(bounds[3][1], 1.)

    X = np.array(
        [[0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0.],
         [0., 1., 0., 9., 0., 0., 0.],
         [0., 1., 1., 4., 0., 0., 0.]], dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    X_train = np.vstack((X, X, X, X, X, X, X, X, X, X))
    y_train = np.vstack((y, y, y, y, y, y, y, y, y, y))

    model.train(X_train, y_train.reshape((-1, 1)))
    mean, _ = model.predict(X)
    for idx, m in enumerate(mean):
        # BUG FIX: the third positional argument of assertAlmostEqual is
        # `places` and must be an int (round() raises TypeError on a float
        # ndigits). 0.05 was intended as an absolute tolerance, so pass it
        # as `delta`.
        self.assertAlmostEqual(y[idx], m, delta=0.05)
def get_eips_object_callback(
        scenario_dict,
        seed,
        ta,
        backend,
        metalearning_configurations,
        runhistory,
):
    """Build a SMAC facade configured for the EIPS acquisition function.

    Wires together a Scenario (with pSMAC input dirs from the backend), a
    RunHistory2EPM4EIPS converter, and an uncorrelated multi-objective RF
    model over cost and runtime; `seed` doubles as SMAC's rng and run_id.
    """
    scenario_dict['input_psmac_dirs'] = backend.get_smac_output_glob()
    scenario = Scenario(scenario_dict)

    # MEMOUT/TIMEOUT/CRASHED runs are treated as successes here, i.e. their
    # recorded costs are fed to the EPM as-is.
    rh2EPM = RunHistory2EPM4EIPS(
        num_params=len(scenario.cs.get_hyperparameters()),
        scenario=scenario,
        success_states=[
            StatusType.SUCCESS,
            StatusType.MEMOUT,
            StatusType.TIMEOUT,
            StatusType.CRASHED,
        ],
        impute_censored_data=False,
        impute_state=None,
    )

    types, bounds = get_types(scenario.cs, scenario.feature_array)
    model = UncorrelatedMultiObjectiveRandomForestWithInstances(
        ['cost', 'runtime'],
        types=types,
        bounds=bounds,
        instance_features=scenario.feature_array,
        rf_kwargs={'seed': 1},
    )

    return SMAC(
        runhistory=runhistory,
        scenario=scenario,
        rng=seed,
        tae_runner=ta,
        runhistory2epm=rh2EPM,
        model=model,
        acquisition_function=EIPS(model),
        run_id=seed,
    )
def model(self, model_short_name='urfi'):
    """Construct the EPM and store it on ``self._model``.

    Parameters
    ----------
    model_short_name : str
        ``'rfi'`` for ``RandomForestWithInstances``, ``'urfi'`` for
        ``UnloggedRandomForestWithInstances``.

    Raises
    ------
    ValueError
        For any other name (note: ``self.types``/``self.bounds`` are
        already set by then — they are computed before validation).
    """
    # types/bounds are computed and stored on self before the name check,
    # so they are updated even when an invalid name makes us raise below.
    self.types, self.bounds = get_types(self.scenario.cs,
                                        self.scenario.feature_array)

    if model_short_name == 'rfi':
        self._model = RandomForestWithInstances(
            self.types, self.bounds,
            instance_features=self.scenario.feature_array,
            seed=self.rng.randint(99999))
    elif model_short_name == 'urfi':
        self._model = UnloggedRandomForestWithInstances(
            self.types, self.bounds, self.scenario.feature_array,
            seed=self.rng.randint(99999),
            cutoff=self.cutoff, threshold=self.threshold)
    else:
        raise ValueError(
            'Specified model %s does not exist or not supported!' %
            model_short_name)

    # Enable out-of-bag error computation on the underlying random forest.
    self._model.rf_opts.compute_oob_error = True
def _get_mean_var_time(self, validator, traj, pred, rh):
    """Collect cost mean/variance and wallclock timestamps along a trajectory.

    If ``pred`` is True, costs for each trajectory incumbent are predicted
    with an EPM (the validator's, or a freshly trained one from ``rh``) and
    the variance is zeroed out on purpose (see comment below). Otherwise,
    observed costs are read directly from ``rh``; entries without any
    observed cost are dropped from ``time`` as well.

    Returns
    -------
    mean, var : np.array of shape (n, 1)
    time : list of wallclock timestamps (same length as mean/var)
    """
    # TODO kinda important: docstrings, what is this function doing?
    validator.traj = traj  # set trajectory
    # NOTE: this local `time` shadows the `time` module within the function.
    time, configs = [], []

    if pred:
        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])
            # self.logger.debug('Time: %d Runs: %d', time[-1],
            #                   len(rh.get_runs_for_config(configs[-1])))

        self.logger.debug(
            "Using %d samples (%d distinct) from trajectory.",
            len(time), len(set(configs)))

        if validator.epm:  # not log as validator epm is trained on cost, not log cost
            epm = validator.epm
        else:
            self.logger.debug(
                "No EPM passed! Training new one from runhistory.")
            # Train random forest and transform training data (from given rh)
            # Not using validator because we want to plot uncertainties
            rh2epm = RunHistory2EPM4Cost(num_params=len(
                self.scenario.cs.get_hyperparameters()),
                scenario=self.scenario)
            X, y = rh2epm.transform(rh)
            self.logger.debug(
                "Training model with data of shape X: %s, y:%s",
                str(X.shape), str(y.shape))
            types, bounds = get_types(self.scenario.cs,
                                      self.scenario.feature_array)
            epm = RandomForestWithInstances(
                types=types,
                bounds=bounds,
                instance_features=self.scenario.feature_array,
                # seed=self.rng.randint(MAXINT),
                ratio_features=1.0)
            epm.train(X, y)

        config_array = convert_configurations_to_array(configs)
        mean, var = epm.predict_marginalized_over_instances(config_array)
        # Deliberately discard the model's predictive variance:
        var = np.zeros(mean.shape)
        # We don't want to show the uncertainty of the model but uncertainty over multiple optimizer runs
        # This variance is computed in an outer loop.
    else:
        mean, var = [], []
        for entry in traj:
            time.append(entry["wallclock_time"])
            configs.append(entry["incumbent"])
            costs = _cost(configs[-1], rh,
                          rh.get_runs_for_config(configs[-1]))
            # self.logger.debug(len(costs), time[-1])
            if not costs:
                # No observed cost for this incumbent: drop its timestamp
                # so time/mean/var stay aligned.
                time.pop()
            else:
                mean.append(np.mean(costs))
                var.append(0)  # No variance over instances
        mean, var = np.array(mean).reshape(-1, 1), np.array(var).reshape(
            -1, 1)

    return mean, var, time
def run_smbo(self):
    """Run the SMBO phase of auto-sklearn.

    Steps: (1) optionally compute meta-features and collect metalearning
    configuration suggestions, (2) build the SMAC Scenario and target
    algorithm executor, (3) build a SMAC/ROAR object with an EI or EIPS
    model, (4) evaluate initial design and metalearning suggestions, then
    (5) loop choose_next/intensify until the budget is exhausted.

    Returns
    -------
    (runhistory, trajectory) of the finished SMAC run.
    """
    self.watcher.start_task('SMBO')

    # == first things first: load the datamanager
    self.reset_data_manager()

    # == Initialize non-SMBO stuff
    # first create a scenario
    seed = self.seed
    self.config_space.seed(seed)
    num_params = len(self.config_space.get_hyperparameters())
    # allocate a run history
    num_run = self.start_num_run
    instance_id = self.dataset_name + SENTINEL

    # Initialize some SMAC dependencies
    runhistory = RunHistory(aggregate_func=average_cost)
    # meta_runhistory = RunHistory(aggregate_func=average_cost)
    # meta_runs_dataset_indices = {}

    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.num_metalearning_cfgs > 0:
        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                              TASK_TYPES_TO_STRING[meta_task],
                              'sparse' if self.datamanager.info['is_sparse']
                              else 'dense'))
            self.metadata_directory = metadata_directory

        self.logger.info('Metadata directory: %s', self.metadata_directory)
        meta_base = MetaBase(self.config_space, self.metadata_directory)

        # A quarter of the total budget goes to meta-feature calculation.
        metafeature_calculation_time_limit = int(
            self.total_walltime_limit / 4)
        metafeature_calculation_start_time = time.time()
        meta_features = self._calculate_metafeatures_with_limits(
            metafeature_calculation_time_limit)
        metafeature_calculation_end_time = time.time()
        metafeature_calculation_time_limit = \
            metafeature_calculation_time_limit - (
                metafeature_calculation_end_time -
                metafeature_calculation_start_time)

        if metafeature_calculation_time_limit < 1:
            self.logger.warning(
                'Time limit for metafeature calculation less '
                'than 1 seconds (%f). Skipping calculation '
                'of metafeatures for encoded dataset.',
                metafeature_calculation_time_limit)
            meta_features_encoded = None
        else:
            with warnings.catch_warnings():
                warnings.showwarning = self._send_warnings_to_log
                self.datamanager.perform1HotEncoding()
            meta_features_encoded = \
                self._calculate_metafeatures_encoded_with_limits(
                    metafeature_calculation_time_limit)

        # In case there is a problem calculating the encoded meta-features
        if meta_features is None:
            if meta_features_encoded is not None:
                meta_features = meta_features_encoded
        else:
            if meta_features_encoded is not None:
                meta_features.metafeature_values.update(
                    meta_features_encoded.metafeature_values)

        if meta_features is not None:
            meta_base.add_dataset(instance_id, meta_features)
            # Do mean imputation of the meta-features - should be done
            # specific for each prediction model!
            all_metafeatures = meta_base.get_metafeatures(
                features=list(meta_features.keys()))
            all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

            with warnings.catch_warnings():
                warnings.showwarning = self._send_warnings_to_log
                metalearning_configurations = \
                    self.collect_metalearning_suggestions(meta_base)
            if metalearning_configurations is None:
                metalearning_configurations = []
            self.reset_data_manager()

            self.logger.info('%s', meta_features)

            # Convert meta-features into a dictionary because the scenario
            # expects a dictionary
            meta_features_dict = {}
            for dataset, series in all_metafeatures.iterrows():
                meta_features_dict[dataset] = series.values
            meta_features_list = []
            for meta_feature_name in all_metafeatures.columns:
                meta_features_list.append(
                    meta_features[meta_feature_name].value)
            meta_features_list = np.array(meta_features_list).reshape(
                (1, -1))
            self.logger.info(list(meta_features_dict.keys()))

            # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
            # meta_runs_index = 0
            # try:
            #     meta_durations = meta_base.get_all_runs('runtime')
            #     read_runtime_data = True
            # except KeyError:
            #     read_runtime_data = False
            #     self.logger.critical('Cannot read runtime data.')
            #     if self.acquisition_function == 'EIPS':
            #         self.logger.critical('Reverting to acquisition function EI!')
            #         self.acquisition_function = 'EI'
            # for meta_dataset in meta_runs.index:
            #     meta_dataset_start_index = meta_runs_index
            #     for meta_configuration in meta_runs.columns:
            #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
            #             try:
            #                 config = meta_base.get_configuration_from_algorithm_index(
            #                     meta_configuration)
            #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
            #                 if read_runtime_data:
            #                     runtime = meta_durations.loc[meta_dataset,
            #                                                  meta_configuration]
            #                 else:
            #                     runtime = 1
            #                 # TODO read out other status types!
            #                 meta_runhistory.add(config, cost, runtime,
            #                                     StatusType.SUCCESS,
            #                                     instance_id=meta_dataset)
            #                 meta_runs_index += 1
            #             except:
            #                 # TODO maybe add warning
            #                 pass
            #
            #     meta_runs_dataset_indices[meta_dataset] = (
            #         meta_dataset_start_index, meta_runs_index)
    else:
        meta_features = None

    if meta_features is None:
        # Without meta-features EIPS cannot be used; fall back to EI.
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    if self.resampling_strategy in [
            'partial-cv', 'partial-cv-iterative-fit']:
        num_folds = self.resampling_strategy_args['folds']
        instances = [[fold_number] for fold_number in range(num_folds)]
    else:
        instances = None

    startup_time = self.watcher.wall_elapsed(self.dataset_name)
    total_walltime_limit = self.total_walltime_limit - startup_time - 5
    scenario_dict = {'cs': self.config_space,
                     'cutoff-time': self.func_eval_time_limit,
                     'memory-limit': self.memory_limit,
                     'wallclock-limit': total_walltime_limit,
                     # 'instances': [[name] for name in meta_features_dict],
                     'output-dir': self.backend.temporary_directory,
                     'shared-model': self.shared_mode,
                     'run-obj': 'quality',
                     'deterministic': 'true',
                     'instances': instances}

    if self.configuration_mode == 'RANDOM':
        scenario_dict['minR'] = len(
            instances) if instances is not None else 1
        scenario_dict['initial_incumbent'] = 'RANDOM'

    self.scenario = Scenario(scenario_dict)

    # TODO rebuild target algorithm to be it's own target algorithm
    # evaluator, which takes into account that a run can be killed prior
    # to the model being fully fitted; thus putting intermediate results
    # into a queue and querying them once the time is over
    exclude = dict()
    include = dict()
    if self.include_preprocessors is not None and \
            self.exclude_preprocessors is not None:
        raise ValueError('Cannot specify include_preprocessors and '
                         'exclude_preprocessors.')
    elif self.include_preprocessors is not None:
        include['preprocessor'] = self.include_preprocessors
    elif self.exclude_preprocessors is not None:
        exclude['preprocessor'] = self.exclude_preprocessors
    # BUG FIX: this conflict check previously compared
    # self.include_estimators against self.exclude_PREPROCESSORS
    # (copy-paste error), so specifying both include_ and
    # exclude_estimators was silently accepted.
    if self.include_estimators is not None and \
            self.exclude_estimators is not None:
        raise ValueError('Cannot specify include_estimators and '
                         'exclude_estimators.')
    elif self.include_estimators is not None:
        if self.task in CLASSIFICATION_TASKS:
            include['classifier'] = self.include_estimators
        elif self.task in REGRESSION_TASKS:
            include['regressor'] = self.include_estimators
        else:
            raise ValueError(self.task)
    elif self.exclude_estimators is not None:
        if self.task in CLASSIFICATION_TASKS:
            exclude['classifier'] = self.exclude_estimators
        elif self.task in REGRESSION_TASKS:
            exclude['regressor'] = self.exclude_estimators
        else:
            raise ValueError(self.task)

    ta = ExecuteTaFuncWithQueue(backend=self.backend,
                                autosklearn_seed=seed,
                                resampling_strategy=self.resampling_strategy,
                                initial_num_run=num_run,
                                logger=self.logger,
                                include=include,
                                exclude=exclude,
                                memory_limit=self.memory_limit,
                                disable_file_output=self.disable_file_output,
                                **self.resampling_strategy_args)

    types = get_types(self.config_space, self.scenario.feature_array)

    # TODO extract generation of SMAC object into it's own function for
    # testing
    if self.acquisition_function == 'EI':
        model = RandomForestWithInstances(
            types,
            # instance_features=meta_features_list,
            seed=1, num_trees=10)
        rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=[
                                         StatusType.SUCCESS,
                                         StatusType.MEMOUT,
                                         StatusType.TIMEOUT],
                                     impute_censored_data=False,
                                     impute_state=None)
        _smac_arguments = dict(scenario=self.scenario,
                               model=model,
                               rng=seed,
                               runhistory2epm=rh2EPM,
                               tae_runner=ta,
                               runhistory=runhistory)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=[
                                         StatusType.SUCCESS,
                                         StatusType.MEMOUT,
                                         StatusType.TIMEOUT],
                                     impute_censored_data=False,
                                     impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types, num_trees=10,
            instance_features=meta_features_list, seed=1)
        acquisition_function = EIPS(model)
        _smac_arguments = dict(scenario=self.scenario,
                               model=model,
                               rng=seed,
                               tae_runner=ta,
                               runhistory2epm=rh2EPM,
                               runhistory=runhistory,
                               acquisition_function=acquisition_function)
    else:
        raise ValueError('Unknown acquisition function value %s!'
                         % self.acquisition_function)

    if self.configuration_mode == 'SMAC':
        smac = SMAC(**_smac_arguments)
    elif self.configuration_mode in ['ROAR', 'RANDOM']:
        # ROAR does not take a model or a runhistory-to-EPM converter.
        for not_in_roar in ['runhistory2epm', 'model']:
            if not_in_roar in _smac_arguments:
                del _smac_arguments[not_in_roar]
        smac = ROAR(**_smac_arguments)
    else:
        raise ValueError(self.configuration_mode)

    # Build a runtime model
    # runtime_rf = RandomForestWithInstances(types,
    #                                        instance_features=meta_features_list,
    #                                        seed=1, num_trees=10)
    # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
    #                                      scenario=self.scenario,
    #                                      success_states=None,
    #                                      impute_censored_data=False,
    #                                      impute_state=None)
    # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
    # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
    # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
    # # Transform Y_meta on a per-dataset base
    # for meta_dataset in meta_runs_dataset_indices:
    #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
    #     end_index += 1  # Python indexing
    #     Y_meta[start_index:end_index, 0]\
    #         [Y_meta[start_index:end_index, 0] > 2.0] = 2.0
    #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
    #     Y_meta[start_index:end_index, 0] = 1 - (
    #         (1. - Y_meta[start_index:end_index, 0]) /
    #         (1. - dataset_minimum))
    #     Y_meta[start_index:end_index, 0]\
    #         [Y_meta[start_index:end_index, 0] > 2] = 2

    smac.solver.stats.start_timing()
    # == first, evaluate all metelearning and default configurations
    smac.solver.incumbent = smac.solver.initial_design.run()

    for challenger in metalearning_configurations:
        smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
            challengers=[challenger],
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=self.total_walltime_limit)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    # == after metalearning run SMAC loop
    while True:
        if smac.solver.scenario.shared_model:
            pSMAC.read(run_history=smac.solver.runhistory,
                       output_directory=self.scenario.output_dir,
                       configuration_space=self.config_space,
                       logger=self.logger)

        choose_next_start_time = time.time()
        try:
            challengers = self.choose_next(smac)
        except Exception as e:
            # Best effort: fall back to a random configuration instead of
            # aborting the whole optimization.
            self.logger.error(e)
            self.logger.error("Error in getting next configurations "
                              "with SMAC. Using random configuration!")
            next_config = self.config_space.sample_configuration()
            challengers = [next_config]
        time_for_choose_next = time.time() - choose_next_start_time
        self.logger.info('Used %g seconds to find next '
                         'configurations' % (time_for_choose_next))
        time_for_choose_next = max(time_for_choose_next, 1.0)

        smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
            challengers=challengers,
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=time_for_choose_next)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    self.runhistory = smac.solver.runhistory
    self.trajectory = smac.solver.intensifier.traj_logger.trajectory

    return self.runhistory, self.trajectory
def convert_data_for_epm(scenario: Scenario, runhistory: RunHistory,
                         logger=None):
    """Convert runhistory data into the matrix format expected by an EPM.

    Parameters
    ----------
    scenario: Scenario
        smac.scenario.scenario.Scenario object
    runhistory: RunHistory
        smac.runhistory.runhistory.RunHistory object with all necessary data

    Returns
    -------
    X: np.array
        configuration-by-feature matrix for all observed samples
    Y: np.array
        matrix with all observations
    types: np.array
        types of the X columns -- necessary to train our RF implementation
    """
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    model = RandomForestWithInstances(types, bounds)
    num_params = len(scenario.cs.get_hyperparameters())

    if scenario.run_obj == "runtime":
        # if we log the performance data, the RFRImputator will already get
        # log transform data from the runhistory; transform the imputation
        # bounds accordingly.
        imputor = RFRImputator(rng=np.random.RandomState(42),
                               cutoff=np.log10(scenario.cutoff),
                               threshold=np.log10(scenario.cutoff *
                                                  scenario.par_factor),
                               model=model,
                               change_threshold=0.01,
                               max_iter=10)
        # TODO: Adapt runhistory2EPM object based on scenario
        rh2epm = RunHistory2EPM4LogCost(scenario=scenario,
                                        num_params=num_params,
                                        success_states=[
                                            StatusType.SUCCESS, ],
                                        impute_censored_data=True,
                                        impute_state=[
                                            StatusType.TIMEOUT, ],
                                        imputor=imputor)
    else:
        rh2epm = RunHistory2EPM4Cost(scenario=scenario,
                                     num_params=num_params,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)

    X, Y = rh2epm.transform(runhistory)
    return X, Y, types
def __init__( self, scenario: Scenario, tae_runner: typing.Optional[typing.Union[ExecuteTARun, typing.Callable]] = None, runhistory: typing.Optional[RunHistory] = None, intensifier: typing.Optional[Intensifier] = None, acquisition_function: typing. Optional[AbstractAcquisitionFunction] = None, acquisition_function_optimizer: typing. Optional[AcquisitionFunctionMaximizer] = None, model: typing.Optional[AbstractEPM] = None, runhistory2epm: typing.Optional[AbstractRunHistory2EPM] = None, initial_design: typing.Optional[InitialDesign] = None, initial_configurations: typing.Optional[ typing.List[Configuration]] = None, stats: typing.Optional[Stats] = None, restore_incumbent: typing.Optional[Configuration] = None, rng: typing.Optional[typing.Union[np.random.RandomState, int]] = None, smbo_class: typing.Optional[SMBO] = None, run_id: typing.Optional[int] = None, random_configuration_chooser: typing. Optional[RandomConfigurationChooser] = None): """ Constructor Parameters ---------- scenario : ~smac.scenario.scenario.Scenario Scenario object tae_runner : ~smac.tae.execute_ta_run.ExecuteTARun or callable Callable or implementation of :class:`~smac.tae.execute_ta_run.ExecuteTARun`. In case a callable is passed it will be wrapped by :class:`~smac.tae.execute_func.ExecuteTAFuncDict`. If not set, it will be initialized with the :class:`~smac.tae.execute_ta_run_old.ExecuteTARunOld`. runhistory : RunHistory runhistory to store all algorithm runs intensifier : Intensifier intensification object to issue a racing to decide the current incumbent acquisition_function : ~smac.optimizer.acquisition.AbstractAcquisitionFunction Object that implements the :class:`~smac.optimizer.acquisition.AbstractAcquisitionFunction`. Will use :class:`~smac.optimizer.acquisition.EI` if not set. acquisition_function_optimizer : ~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer Object that implements the :class:`~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer`. 
Will use :class:`smac.optimizer.ei_optimization.InterleavedLocalAndRandomSearch` if not set. model : AbstractEPM Model that implements train() and predict(). Will use a :class:`~smac.epm.rf_with_instances.RandomForestWithInstances` if not set. runhistory2epm : ~smac.runhistory.runhistory2epm.RunHistory2EMP Object that implements the AbstractRunHistory2EPM. If None, will use :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4Cost` if objective is cost or :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4LogCost` if objective is runtime. initial_design : InitialDesign initial sampling design initial_configurations : typing.List[Configuration] list of initial configurations for initial design -- cannot be used together with initial_design stats : Stats optional stats object rng : np.random.RandomState Random number generator restore_incumbent : Configuration incumbent used if restoring to previous state smbo_class : ~smac.optimizer.smbo.SMBO Class implementing the SMBO interface which will be used to instantiate the optimizer class. run_id : int (optional) Run ID will be used as subfolder for output_dir. If no ``run_id`` is given, a random ``run_id`` will be chosen. random_configuration_chooser : ~smac.optimizer.random_configuration_chooser.RandomConfigurationChooser How often to choose a random configuration during the intensification procedure. """ self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__) aggregate_func = average_cost self.scenario = scenario self.output_dir = "" if not restore_incumbent: # restore_incumbent is used by the CLI interface which provides a method for restoring a SMAC run given an # output directory. This is the default path. 
# initial random number generator run_id, rng = get_rng(rng=rng, run_id=run_id, logger=self.logger) self.output_dir = create_output_directory(scenario, run_id) elif scenario.output_dir is not None: run_id, rng = get_rng(rng=rng, run_id=run_id, logger=self.logger) # output-directory is created in CLI when restoring from a # folder. calling the function again in the facade results in two # folders being created: run_X and run_X.OLD. if we are # restoring, the output-folder exists already and we omit creating it, # but set the self-output_dir to the dir. # necessary because we want to write traj to new output-dir in CLI. self.output_dir = scenario.output_dir_for_this_run if (scenario.deterministic is True and getattr(scenario, 'tuner_timeout', None) is None and scenario.run_obj == 'quality'): self.logger.info('Optimizing a deterministic scenario for ' 'quality without a tuner timeout - will make ' 'SMAC deterministic!') scenario.intensification_percentage = 1e-10 scenario.write() # initialize stats object if stats: self.stats = stats else: self.stats = Stats(scenario) if self.scenario.run_obj == "runtime" and not self.scenario.transform_y == "LOG": self.logger.warn( "Runtime as objective automatically activates log(y) transformation" ) self.scenario.transform_y = "LOG" # initialize empty runhistory if runhistory is None: runhistory = RunHistory(aggregate_func=aggregate_func) # inject aggr_func if necessary if runhistory.aggregate_func is None: runhistory.aggregate_func = aggregate_func if not random_configuration_chooser: random_configuration_chooser = ChooserProb(prob=scenario.rand_prob, rng=rng) # reset random number generator in config space to draw different # random configurations with each seed given to SMAC scenario.cs.seed(rng.randint(MAXINT)) # initial Trajectory Logger traj_logger = TrajLogger(output_dir=self.output_dir, stats=self.stats) # initial EPM types, bounds = get_types(scenario.cs, scenario.feature_array) if model is None: model = 
RandomForestWithInstances( types=types, bounds=bounds, instance_features=scenario.feature_array, seed=rng.randint(MAXINT), pca_components=scenario.PCA_DIM, log_y=scenario.transform_y in ["LOG", "LOGS"], num_trees=scenario.rf_num_trees, do_bootstrapping=scenario.rf_do_bootstrapping, ratio_features=scenario.rf_ratio_features, min_samples_split=scenario.rf_min_samples_split, min_samples_leaf=scenario.rf_min_samples_leaf, max_depth=scenario.rf_max_depth) # initial acquisition function if acquisition_function is None: if scenario.transform_y in ["LOG", "LOGS"]: acquisition_function = LogEI(model=model) else: acquisition_function = EI(model=model) # inject model if necessary if acquisition_function.model is None: acquisition_function.model = model # initialize optimizer on acquisition function if acquisition_function_optimizer is None: acquisition_function_optimizer = InterleavedLocalAndRandomSearch( acquisition_function=acquisition_function, config_space=scenario.cs, rng=np.random.RandomState(seed=rng.randint(MAXINT)), max_steps=scenario.sls_max_steps, n_steps_plateau_walk=scenario.sls_n_steps_plateau_walk) elif not isinstance( acquisition_function_optimizer, AcquisitionFunctionMaximizer, ): raise ValueError( "Argument 'acquisition_function_optimizer' must be of type" "'AcquisitionFunctionMaximizer', but is '%s'" % type(acquisition_function_optimizer)) # initialize tae_runner # First case, if tae_runner is None, the target algorithm is a call # string in the scenario file if tae_runner is None: tae_runner = ExecuteTARunOld( ta=scenario.ta, stats=self.stats, run_obj=scenario.run_obj, runhistory=runhistory, par_factor=scenario.par_factor, cost_for_crash=scenario.cost_for_crash, abort_on_first_run_crash=scenario.abort_on_first_run_crash) # Second case, the tae_runner is a function to be optimized elif callable(tae_runner): tae_runner = ExecuteTAFuncDict( ta=tae_runner, stats=self.stats, run_obj=scenario.run_obj, memory_limit=scenario.memory_limit, runhistory=runhistory, 
par_factor=scenario.par_factor, cost_for_crash=scenario.cost_for_crash, abort_on_first_run_crash=scenario.abort_on_first_run_crash) # Third case, if it is an ExecuteTaRun we can simply use the # instance. Otherwise, the next check raises an exception elif not isinstance(tae_runner, ExecuteTARun): raise TypeError("Argument 'tae_runner' is %s, but must be " "either a callable or an instance of " "ExecuteTaRun. Passing 'None' will result in the " "creation of target algorithm runner based on the " "call string in the scenario file." % type(tae_runner)) # Check that overall objective and tae objective are the same if tae_runner.run_obj != scenario.run_obj: raise ValueError("Objective for the target algorithm runner and " "the scenario must be the same, but are '%s' and " "'%s'" % (tae_runner.run_obj, scenario.run_obj)) # inject stats if necessary if tae_runner.stats is None: tae_runner.stats = self.stats # inject runhistory if necessary if tae_runner.runhistory is None: tae_runner.runhistory = runhistory # inject cost_for_crash if tae_runner.crash_cost != scenario.cost_for_crash: tae_runner.crash_cost = scenario.cost_for_crash # initialize intensification if intensifier is None: intensifier = Intensifier( tae_runner=tae_runner, stats=self.stats, traj_logger=traj_logger, rng=rng, instances=scenario.train_insts, cutoff=scenario.cutoff, deterministic=scenario.deterministic, run_obj_time=scenario.run_obj == "runtime", always_race_against=scenario.cs.get_default_configuration() if scenario.always_race_default else None, use_ta_time_bound=scenario.use_ta_time, instance_specifics=scenario.instance_specific, minR=scenario.minR, maxR=scenario.maxR, adaptive_capping_slackfactor=scenario. 
intens_adaptive_capping_slackfactor, min_chall=scenario.intens_min_chall) # inject deps if necessary if intensifier.tae_runner is None: intensifier.tae_runner = tae_runner if intensifier.stats is None: intensifier.stats = self.stats if intensifier.traj_logger is None: intensifier.traj_logger = traj_logger # initial design if initial_design is not None and initial_configurations is not None: raise ValueError( "Either use initial_design or initial_configurations; but not both" ) if initial_configurations is not None: initial_design = MultiConfigInitialDesign( tae_runner=tae_runner, scenario=scenario, stats=self.stats, traj_logger=traj_logger, runhistory=runhistory, rng=rng, configs=initial_configurations, intensifier=intensifier, aggregate_func=aggregate_func) elif initial_design is None: if scenario.initial_incumbent == "DEFAULT": initial_design = DefaultConfiguration(tae_runner=tae_runner, scenario=scenario, stats=self.stats, traj_logger=traj_logger, rng=rng) elif scenario.initial_incumbent == "RANDOM": initial_design = RandomConfiguration(tae_runner=tae_runner, scenario=scenario, stats=self.stats, traj_logger=traj_logger, rng=rng) elif scenario.initial_incumbent == "LHD": initial_design = LHDesign(runhistory=runhistory, intensifier=intensifier, aggregate_func=aggregate_func, tae_runner=tae_runner, scenario=scenario, stats=self.stats, traj_logger=traj_logger, rng=rng) elif scenario.initial_incumbent == "FACTORIAL": initial_design = FactorialInitialDesign( runhistory=runhistory, intensifier=intensifier, aggregate_func=aggregate_func, tae_runner=tae_runner, scenario=scenario, stats=self.stats, traj_logger=traj_logger, rng=rng) elif scenario.initial_incumbent == "SOBOL": initial_design = SobolDesign(runhistory=runhistory, intensifier=intensifier, aggregate_func=aggregate_func, tae_runner=tae_runner, scenario=scenario, stats=self.stats, traj_logger=traj_logger, rng=rng) else: raise ValueError("Don't know what kind of initial_incumbent " "'%s' is" % 
scenario.initial_incumbent) # inject deps if necessary if initial_design.tae_runner is None: initial_design.tae_runner = tae_runner if initial_design.scenario is None: initial_design.scenario = scenario if initial_design.stats is None: initial_design.stats = self.stats if initial_design.traj_logger is None: initial_design.traj_logger = traj_logger # initial conversion of runhistory into EPM data if runhistory2epm is None: num_params = len(scenario.cs.get_hyperparameters()) if scenario.run_obj == 'runtime': # if we log the performance data, # the RFRImputator will already get # log transform data from the runhistory cutoff = np.log(scenario.cutoff) threshold = np.log(scenario.cutoff * scenario.par_factor) imputor = RFRImputator(rng=rng, cutoff=cutoff, threshold=threshold, model=model, change_threshold=0.01, max_iter=2) runhistory2epm = RunHistory2EPM4LogCost( scenario=scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, ], impute_censored_data=True, impute_state=[ StatusType.CAPPED, ], imputor=imputor) elif scenario.run_obj == 'quality': if scenario.transform_y == "NONE": runhistory2epm = RunHistory2EPM4Cost( scenario=scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, StatusType.CRASHED ], impute_censored_data=False, impute_state=None) elif scenario.transform_y == "LOG": runhistory2epm = RunHistory2EPM4LogCost( scenario=scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, StatusType.CRASHED ], impute_censored_data=False, impute_state=None) elif scenario.transform_y == "LOGS": runhistory2epm = RunHistory2EPM4LogScaledCost( scenario=scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, StatusType.CRASHED ], impute_censored_data=False, impute_state=None) elif scenario.transform_y == "INVS": runhistory2epm = RunHistory2EPM4InvScaledCost( scenario=scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, StatusType.CRASHED ], impute_censored_data=False, impute_state=None) else: raise 
ValueError('Unknown run objective: %s. Should be either ' 'quality or runtime.' % self.scenario.run_obj) # inject scenario if necessary: if runhistory2epm.scenario is None: runhistory2epm.scenario = scenario smbo_args = { 'scenario': scenario, 'stats': self.stats, 'initial_design': initial_design, 'runhistory': runhistory, 'runhistory2epm': runhistory2epm, 'intensifier': intensifier, 'aggregate_func': aggregate_func, 'num_run': run_id, 'model': model, 'acq_optimizer': acquisition_function_optimizer, 'acquisition_func': acquisition_function, 'rng': rng, 'restore_incumbent': restore_incumbent, 'random_configuration_chooser': random_configuration_chooser } if smbo_class is None: self.solver = SMBO(**smbo_args) else: self.solver = smbo_class(**smbo_args)
def __init__(
        self,
        scenario: Scenario,
        # TODO: once we drop python3.4 add type hint
        # typing.Union[ExecuteTARun, callable]
        tae_runner=None,
        runhistory: RunHistory = None,
        intensifier: Intensifier = None,
        acquisition_function: AbstractAcquisitionFunction = None,
        model: AbstractEPM = None,
        runhistory2epm: AbstractRunHistory2EPM = None,
        initial_design: InitialDesign = None,
        initial_configurations: typing.List[Configuration] = None,
        stats: Stats = None,
        rng: np.random.RandomState = None):
    '''
    Facade to use SMAC default mode

    Builds all missing SMBO components (model, acquisition function,
    optimizer, target algorithm runner, intensifier, initial design and
    runhistory-to-EPM converter) from the scenario and stores the
    assembled optimizer in ``self.solver``.

    Parameters
    ----------
    scenario: smac.scenario.scenario.Scenario
        Scenario object
    tae_runner: ExecuteTARun or callable
        Callable or implementation of :class:`ExecuteTaRun`. In case a
        callable is passed it will be wrapped by tae.ExecuteTaFunc().
        If not set, tae_runner will be initialized with the
        tae.ExecuteTARunOld()
    runhistory: RunHistory
        runhistory to store all algorithm runs
    intensifier: Intensifier
        intensification object to issue a racing to decide the current
        incumbent
    acquisition_function : AcquisitionFunction
        Object that implements the AbstractAcquisitionFunction. Will use
        EI if not set.
    model : AbstractEPM
        Model that implements train() and predict(). Will use a
        RandomForest if not set.
    runhistory2epm : RunHistory2EMP
        Object that implements the AbstractRunHistory2EPM. If None,
        will use RunHistory2EPM4Cost if objective is cost or
        RunHistory2EPM4LogCost if objective is runtime.
    initial_design: InitialDesign
        initial sampling design
    initial_configurations: typing.List[Configuration]
        list of initial configurations for initial design -- cannot be
        used together with initial_design
    stats: Stats
        optional stats object
    rng: np.random.RandomState
        Random number generator

    Raises
    ------
    TypeError
        If ``tae_runner`` is neither None, a callable, nor an
        ExecuteTARun instance.
    ValueError
        If objectives of runner and scenario disagree, if both
        initial_design and initial_configurations are given, or if the
        scenario's run objective / initial incumbent is unknown.
    '''
    self.logger = logging.getLogger("SMAC")
    aggregate_func = average_cost

    # initialize stats object
    if stats:
        self.stats = stats
    else:
        self.stats = Stats(scenario)

    # initialize empty runhistory
    if runhistory is None:
        runhistory = RunHistory(aggregate_func=aggregate_func)

    # initial random number generator
    num_run, rng = self._get_rng(rng=rng)

    # reset random number generator in config space to draw different
    # random configurations with each seed given to SMAC
    scenario.cs.seed(rng.randint(MAXINT))

    # initial Trajectory Logger
    traj_logger = TrajLogger(output_dir=scenario.output_dir,
                             stats=self.stats)

    # initial EPM
    types = get_types(scenario.cs, scenario.feature_array)
    if model is None:
        model = RandomForestWithInstances(
            types=types,
            instance_features=scenario.feature_array,
            seed=rng.randint(MAXINT))

    # initial acquisition function
    if acquisition_function is None:
        acquisition_function = EI(model=model)

    # initialize optimizer on acquisition function
    local_search = LocalSearch(acquisition_function, scenario.cs)

    # initialize tae_runner
    # First case, if tae_runner is None, the target algorithm is a call
    # string in the scenario file
    if tae_runner is None:
        tae_runner = ExecuteTARunOld(ta=scenario.ta,
                                     stats=self.stats,
                                     run_obj=scenario.run_obj,
                                     runhistory=runhistory,
                                     par_factor=scenario.par_factor)
    # Second case, the tae_runner is a function to be optimized
    elif callable(tae_runner):
        tae_runner = ExecuteTAFuncDict(ta=tae_runner,
                                       stats=self.stats,
                                       run_obj=scenario.run_obj,
                                       memory_limit=scenario.memory_limit,
                                       runhistory=runhistory,
                                       par_factor=scenario.par_factor)
    # Third case, if it is an ExecuteTaRun we can simply use the
    # instance. Otherwise, the next check raises an exception
    elif not isinstance(tae_runner, ExecuteTARun):
        raise TypeError("Argument 'tae_runner' is %s, but must be "
                        "either a callable or an instance of "
                        "ExecuteTaRun. Passing 'None' will result in the "
                        "creation of target algorithm runner based on the "
                        "call string in the scenario file."
                        % type(tae_runner))

    # Check that overall objective and tae objective are the same
    if tae_runner.run_obj != scenario.run_obj:
        raise ValueError("Objective for the target algorithm runner and "
                         "the scenario must be the same, but are '%s' and "
                         "'%s'" % (tae_runner.run_obj, scenario.run_obj))

    # inject stats if necessary
    if tae_runner.stats is None:
        tae_runner.stats = self.stats
    # inject runhistory if necessary
    if tae_runner.runhistory is None:
        tae_runner.runhistory = runhistory

    # initial intensification
    if intensifier is None:
        intensifier = Intensifier(
            tae_runner=tae_runner,
            stats=self.stats,
            traj_logger=traj_logger,
            rng=rng,
            instances=scenario.train_insts,
            cutoff=scenario.cutoff,
            deterministic=scenario.deterministic,
            run_obj_time=scenario.run_obj == "runtime",
            instance_specifics=scenario.instance_specific,
            minR=scenario.minR,
            maxR=scenario.maxR)

    # initial design
    if initial_design is not None and initial_configurations is not None:
        raise ValueError(
            "Either use initial_design or initial_configurations; but not both")

    if initial_configurations is not None:
        initial_design = MultiConfigInitialDesign(
            tae_runner=tae_runner,
            scenario=scenario,
            stats=self.stats,
            traj_logger=traj_logger,
            runhistory=runhistory,
            rng=rng,
            configs=initial_configurations,
            intensifier=intensifier,
            aggregate_func=aggregate_func)
    elif initial_design is None:
        if scenario.initial_incumbent == "DEFAULT":
            initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                  scenario=scenario,
                                                  stats=self.stats,
                                                  traj_logger=traj_logger,
                                                  rng=rng)
        elif scenario.initial_incumbent == "RANDOM":
            initial_design = RandomConfiguration(tae_runner=tae_runner,
                                                 scenario=scenario,
                                                 stats=self.stats,
                                                 traj_logger=traj_logger,
                                                 rng=rng)
        else:
            raise ValueError("Don't know what kind of initial_incumbent "
                             "'%s' is" % scenario.initial_incumbent)

    # initial conversion of runhistory into EPM data
    if runhistory2epm is None:
        num_params = len(scenario.cs.get_hyperparameters())
        if scenario.run_obj == "runtime":
            # if we log the performance data,
            # the RFRImputator will already get
            # log transform data from the runhistory
            # NOTE(review): np.log10 here vs np.log in other facades of
            # this file -- confirm which base the runhistory uses.
            cutoff = np.log10(scenario.cutoff)
            threshold = np.log10(scenario.cutoff * scenario.par_factor)
            # NOTE(review): keyword 'rs' differs from 'rng' used by every
            # other RFRImputator call in this file -- verify against the
            # RFRImputator signature of the SMAC version in use.
            imputor = RFRImputator(rs=rng,
                                   cutoff=cutoff,
                                   threshold=threshold,
                                   model=model,
                                   change_threshold=0.01,
                                   max_iter=2)
            runhistory2epm = RunHistory2EPM4LogCost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=True,
                impute_state=[StatusType.TIMEOUT, ],
                imputor=imputor)
        elif scenario.run_obj == 'quality':
            runhistory2epm = RunHistory2EPM4Cost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=False,
                impute_state=None)
        else:
            # BUGFIX: was 'self.scenario.run_obj' -- 'self.scenario' is
            # never assigned in this constructor, so the intended error
            # was masked by an AttributeError. Use the local 'scenario'.
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % scenario.run_obj)

    self.solver = SMBO(scenario=scenario,
                       stats=self.stats,
                       initial_design=initial_design,
                       runhistory=runhistory,
                       runhistory2epm=runhistory2epm,
                       intensifier=intensifier,
                       aggregate_func=aggregate_func,
                       num_run=num_run,
                       model=model,
                       acq_optimizer=local_search,
                       acquisition_func=acquisition_function,
                       rng=rng)
def __init__(self,
             scenario: Scenario,
             tae_runner: typing.Union[ExecuteTARun, typing.Callable] = None,
             runhistory: RunHistory = None,
             intensifier: Intensifier = None,
             acquisition_function: AbstractAcquisitionFunction = None,
             acquisition_function_optimizer: AcquisitionFunctionMaximizer = None,
             model: AbstractEPM = None,
             runhistory2epm: AbstractRunHistory2EPM = None,
             initial_design: InitialDesign = None,
             initial_configurations: typing.List[Configuration] = None,
             stats: Stats = None,
             restore_incumbent: Configuration = None,
             rng: typing.Union[np.random.RandomState, int] = None,
             smbo_class: SMBO = None,
             run_id: int = 1):
    """Constructor

    Builds every SMBO component that was not supplied, injects missing
    cross-references between components, and stores the assembled
    optimizer in ``self.solver``.

    Parameters
    ----------
    scenario : ~smac.scenario.scenario.Scenario
        Scenario object
    tae_runner : ~smac.tae.execute_ta_run.ExecuteTARun or callable
        Callable or implementation of
        :class:`~smac.tae.execute_ta_run.ExecuteTARun`. In case a
        callable is passed it will be wrapped by
        :class:`~smac.tae.execute_func.ExecuteTAFuncDict`.
        If not set, it will be initialized with the
        :class:`~smac.tae.execute_ta_run_old.ExecuteTARunOld`.
    runhistory : RunHistory
        runhistory to store all algorithm runs
    intensifier : Intensifier
        intensification object to issue a racing to decide the current
        incumbent
    acquisition_function : ~smac.optimizer.acquisition.AbstractAcquisitionFunction
        Object that implements the
        :class:`~smac.optimizer.acquisition.AbstractAcquisitionFunction`.
        Will use :class:`~smac.optimizer.acquisition.EI` if not set.
    acquisition_function_optimizer : ~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer
        Object that implements the
        :class:`~smac.optimizer.ei_optimization.AcquisitionFunctionMaximizer`.
        Will use
        :class:`smac.optimizer.ei_optimization.InterleavedLocalAndRandomSearch`
        if not set.
    model : AbstractEPM
        Model that implements train() and predict(). Will use a
        :class:`~smac.epm.rf_with_instances.RandomForestWithInstances`
        if not set.
    runhistory2epm : ~smac.runhistory.runhistory2epm.RunHistory2EMP
        Object that implements the AbstractRunHistory2EPM. If None,
        will use
        :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4Cost`
        if objective is cost or
        :class:`~smac.runhistory.runhistory2epm.RunHistory2EPM4LogCost`
        if objective is runtime.
    initial_design : InitialDesign
        initial sampling design
    initial_configurations : typing.List[Configuration]
        list of initial configurations for initial design -- cannot be
        used together with initial_design
    stats : Stats
        optional stats object
    rng : np.random.RandomState
        Random number generator
    restore_incumbent : Configuration
        incumbent used if restoring to previous state
    smbo_class : ~smac.optimizer.smbo.SMBO
        Class implementing the SMBO interface which will be used to
        instantiate the optimizer class.
    run_id: int, (default: 1)
        Run ID will be used as subfolder for output_dir.
    """
    self.logger = logging.getLogger(
        self.__module__ + "." + self.__class__.__name__)
    aggregate_func = average_cost
    self.output_dir = create_output_directory(scenario, run_id)
    scenario.write()

    # initialize stats object
    if stats:
        self.stats = stats
    else:
        self.stats = Stats(scenario)

    # initialize empty runhistory
    if runhistory is None:
        runhistory = RunHistory(aggregate_func=aggregate_func)
    # inject aggr_func if necessary
    if runhistory.aggregate_func is None:
        runhistory.aggregate_func = aggregate_func

    # initial random number generator
    num_run, rng = self._get_rng(rng=rng)

    # reset random number generator in config space to draw different
    # random configurations with each seed given to SMAC
    scenario.cs.seed(rng.randint(MAXINT))

    # initial Trajectory Logger
    traj_logger = TrajLogger(output_dir=self.output_dir, stats=self.stats)

    # initial EPM
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    if model is None:
        model = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            instance_features=scenario.feature_array,
            seed=rng.randint(MAXINT),
            pca_components=scenario.PCA_DIM)

    # initial acquisition function
    if acquisition_function is None:
        if scenario.run_obj == "runtime":
            acquisition_function = LogEI(model=model)
        else:
            acquisition_function = EI(model=model)
    # inject model if necessary
    if acquisition_function.model is None:
        acquisition_function.model = model

    # initialize optimizer on acquisition function
    if acquisition_function_optimizer is None:
        acquisition_function_optimizer = InterleavedLocalAndRandomSearch(
            acquisition_function,
            scenario.cs,
            np.random.RandomState(seed=rng.randint(MAXINT)))
    elif not isinstance(acquisition_function_optimizer,
                        AcquisitionFunctionMaximizer):
        # BUGFIX: the two string fragments used to concatenate without a
        # separating space ("...of type'Acquisition...").
        raise ValueError(
            "Argument 'acquisition_function_optimizer' must be of type "
            "'AcquisitionFunctionMaximizer', but is '%s'" %
            type(acquisition_function_optimizer))

    # initialize tae_runner
    # First case, if tae_runner is None, the target algorithm is a call
    # string in the scenario file
    if tae_runner is None:
        tae_runner = ExecuteTARunOld(
            ta=scenario.ta,
            stats=self.stats,
            run_obj=scenario.run_obj,
            runhistory=runhistory,
            par_factor=scenario.par_factor,
            cost_for_crash=scenario.cost_for_crash)
    # Second case, the tae_runner is a function to be optimized
    elif callable(tae_runner):
        tae_runner = ExecuteTAFuncDict(
            ta=tae_runner,
            stats=self.stats,
            run_obj=scenario.run_obj,
            memory_limit=scenario.memory_limit,
            runhistory=runhistory,
            par_factor=scenario.par_factor,
            cost_for_crash=scenario.cost_for_crash)
    # Third case, if it is an ExecuteTaRun we can simply use the
    # instance. Otherwise, the next check raises an exception
    elif not isinstance(tae_runner, ExecuteTARun):
        raise TypeError("Argument 'tae_runner' is %s, but must be "
                        "either a callable or an instance of "
                        "ExecuteTaRun. Passing 'None' will result in the "
                        "creation of target algorithm runner based on the "
                        "call string in the scenario file."
                        % type(tae_runner))

    # Check that overall objective and tae objective are the same
    if tae_runner.run_obj != scenario.run_obj:
        raise ValueError("Objective for the target algorithm runner and "
                         "the scenario must be the same, but are '%s' and "
                         "'%s'" % (tae_runner.run_obj, scenario.run_obj))

    # inject stats if necessary
    if tae_runner.stats is None:
        tae_runner.stats = self.stats
    # inject runhistory if necessary
    if tae_runner.runhistory is None:
        tae_runner.runhistory = runhistory
    # inject cost_for_crash
    if tae_runner.crash_cost != scenario.cost_for_crash:
        tae_runner.crash_cost = scenario.cost_for_crash

    # initialize intensification
    if intensifier is None:
        intensifier = Intensifier(
            tae_runner=tae_runner,
            stats=self.stats,
            traj_logger=traj_logger,
            rng=rng,
            instances=scenario.train_insts,
            cutoff=scenario.cutoff,
            deterministic=scenario.deterministic,
            run_obj_time=scenario.run_obj == "runtime",
            always_race_against=scenario.cs.get_default_configuration()
            if scenario.always_race_default else None,
            instance_specifics=scenario.instance_specific,
            minR=scenario.minR,
            maxR=scenario.maxR)
    # inject deps if necessary
    if intensifier.tae_runner is None:
        intensifier.tae_runner = tae_runner
    if intensifier.stats is None:
        intensifier.stats = self.stats
    if intensifier.traj_logger is None:
        intensifier.traj_logger = traj_logger

    # initial design
    if initial_design is not None and initial_configurations is not None:
        raise ValueError(
            "Either use initial_design or initial_configurations; but not both")

    if initial_configurations is not None:
        initial_design = MultiConfigInitialDesign(
            tae_runner=tae_runner,
            scenario=scenario,
            stats=self.stats,
            traj_logger=traj_logger,
            runhistory=runhistory,
            rng=rng,
            configs=initial_configurations,
            intensifier=intensifier,
            aggregate_func=aggregate_func)
    elif initial_design is None:
        if scenario.initial_incumbent == "DEFAULT":
            initial_design = DefaultConfiguration(tae_runner=tae_runner,
                                                  scenario=scenario,
                                                  stats=self.stats,
                                                  traj_logger=traj_logger,
                                                  rng=rng)
        elif scenario.initial_incumbent == "RANDOM":
            initial_design = RandomConfiguration(tae_runner=tae_runner,
                                                 scenario=scenario,
                                                 stats=self.stats,
                                                 traj_logger=traj_logger,
                                                 rng=rng)
        else:
            raise ValueError("Don't know what kind of initial_incumbent "
                             "'%s' is" % scenario.initial_incumbent)
    # inject deps if necessary
    if initial_design.tae_runner is None:
        initial_design.tae_runner = tae_runner
    if initial_design.scenario is None:
        initial_design.scenario = scenario
    if initial_design.stats is None:
        initial_design.stats = self.stats
    if initial_design.traj_logger is None:
        initial_design.traj_logger = traj_logger

    # initial conversion of runhistory into EPM data
    if runhistory2epm is None:
        num_params = len(scenario.cs.get_hyperparameters())
        if scenario.run_obj == "runtime":
            # if we log the performance data,
            # the RFRImputator will already get
            # log transform data from the runhistory
            cutoff = np.log10(scenario.cutoff)
            threshold = np.log10(scenario.cutoff * scenario.par_factor)
            imputor = RFRImputator(rng=rng,
                                   cutoff=cutoff,
                                   threshold=threshold,
                                   model=model,
                                   change_threshold=0.01,
                                   max_iter=2)
            runhistory2epm = RunHistory2EPM4LogCost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=True,
                impute_state=[StatusType.CAPPED, ],
                imputor=imputor)
        elif scenario.run_obj == 'quality':
            runhistory2epm = RunHistory2EPM4Cost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, StatusType.CRASHED],
                impute_censored_data=False,
                impute_state=None)
        else:
            # BUGFIX: was 'self.scenario.run_obj' -- 'self.scenario' is
            # never assigned in this constructor, so the intended error
            # was masked by an AttributeError. Use the local 'scenario'.
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % scenario.run_obj)

    # inject scenario if necessary:
    if runhistory2epm.scenario is None:
        runhistory2epm.scenario = scenario

    smbo_args = {
        'scenario': scenario,
        'stats': self.stats,
        'initial_design': initial_design,
        'runhistory': runhistory,
        'runhistory2epm': runhistory2epm,
        'intensifier': intensifier,
        'aggregate_func': aggregate_func,
        'num_run': num_run,
        'model': model,
        'acq_optimizer': acquisition_function_optimizer,
        'acquisition_func': acquisition_function,
        'rng': rng,
        'restore_incumbent': restore_incumbent,
    }

    if smbo_class is None:
        self.solver = SMBO(**smbo_args)
    else:
        self.solver = smbo_class(**smbo_args)
def validate_epm(self,
                 config_mode: Union[str, typing.List[Configuration]] = 'def',
                 instance_mode: Union[str, typing.List[str]] = 'test',
                 repetitions: int = 1,
                 runhistory: RunHistory = None,
                 output_fn="",
                 reuse_epm=True,
                 ) -> RunHistory:
    """
    Use EPM to predict costs/runtimes for unknown config/inst-pairs.

    side effect: if output is specified, saves runhistory to specified
    output directory.

    Parameters
    ----------
    output_fn: str
        path to runhistory to be saved. if the suffix is not '.json',
        will be interpreted as directory and filename will be
        'validated_runhistory_EPM.json'
    config_mode: str or list<Configuration>
        string or directly a list of Configuration, string from
        [def, inc, def+inc, wallclock_time, cpu_time, all].
        time evaluates at cpu- or wallclock-timesteps of:
        [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
        with max_time being the highest recorded time
    instance_mode: str or list<str>
        what instances to use for validation, either from
        [train, test, train+test] or directly a list of instances
    repetitions: int
        number of repetitions in nondeterministic algorithms
    runhistory: RunHistory
        optional, RunHistory-object to reuse runs
    reuse_epm: bool
        if true (and if `self.epm`), reuse epm to validate runs

    Returns
    -------
    runhistory: RunHistory
        runhistory with predicted runs
    """
    # Need either a trained EPM to reuse or a runhistory to train on.
    if not isinstance(runhistory, RunHistory) and (self.epm is None or
                                                   reuse_epm is False):
        raise ValueError(
            "No runhistory specified for validating with EPM!")
    elif reuse_epm is False or self.epm is None:
        # Create RandomForest
        types, bounds = get_types(self.scen.cs, self.scen.feature_array)
        self.epm = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            instance_features=self.scen.feature_array,
            seed=self.rng.randint(MAXINT),
            ratio_features=1.0)
        # Use imputor if objective is runtime
        imputor = None
        impute_state = None
        impute_censored_data = False
        if self.scen.run_obj == 'runtime':
            # Runs capped at the cutoff are censored; impute them up to
            # the PAR-factor threshold.
            threshold = self.scen.cutoff * self.scen.par_factor
            imputor = RFRImputator(rng=self.rng,
                                   cutoff=self.scen.cutoff,
                                   threshold=threshold,
                                   model=self.epm)
            impute_censored_data = True
            impute_state = [StatusType.CAPPED]
        # Transform training data (from given rh)
        rh2epm = RunHistory2EPM4Cost(
            num_params=len(self.scen.cs.get_hyperparameters()),
            scenario=self.scen, rng=self.rng,
            impute_censored_data=impute_censored_data,
            imputor=imputor,
            impute_state=impute_state)
        X, y = rh2epm.transform(runhistory)
        self.logger.debug("Training model with data of shape X: %s, y:%s",
                          str(X.shape), str(y.shape))
        # Train random forest
        self.epm.train(X, y)

    # Predict desired runs
    runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                  runhistory)

    # Model input is the configuration vector, optionally concatenated
    # with the instance's feature vector.
    feature_array_size = len(self.scen.cs.get_hyperparameters())
    if self.scen.feature_array is not None:
        feature_array_size += self.scen.feature_array.shape[1]

    X_pred = np.empty((len(runs), feature_array_size))
    for idx, run in enumerate(runs):
        if self.scen.feature_array is not None and run.inst is not None:
            X_pred[idx] = np.hstack([
                convert_configurations_to_array([run.config])[0],
                self.scen.feature_dict[run.inst]])
        else:
            # No instance features available: configuration vector only.
            # NOTE(review): when feature_array exists but run.inst is
            # None, the row is shorter than feature_array_size and the
            # tail stays uninitialized -- presumably that combination
            # cannot occur here; confirm against _get_runs.
            X_pred[idx] = convert_configurations_to_array([run.config])[0]
    self.logger.debug("Predicting desired %d runs, data has shape %s",
                      len(runs), str(X_pred.shape))

    y_pred = self.epm.predict(X_pred)

    # Add runs to runhistory
    # y_pred[0] -- presumably the predicted means (predict appears to
    # return (mean, variance)); verify against the EPM's predict().
    for run, pred in zip(runs, y_pred[0]):
        rh_epm.add(config=run.config,
                   cost=float(pred),
                   time=float(pred),
                   status=StatusType.SUCCESS,
                   instance_id=run.inst,
                   seed=-1,
                   additional_info={"additional_info": "ESTIMATED USING EPM!"})

    if output_fn:
        self._save_results(rh_epm, output_fn,
                           backup_fn="validated_runhistory_EPM.json")
    return rh_epm
def build_pc_smbo(self, tae_runner, stats, scenario, runhistory,
                  aggregate_func, acq_func_name, model_target_names,
                  logging_directory,
                  double_intensification=False,
                  constant_pipeline_steps=None,
                  variable_pipeline_steps=None,
                  cached_pipeline_steps=None,
                  seed=None,
                  intensification_instances=None,
                  num_marginalized_configurations_by_random_search=20,
                  num_configs_for_marginalization=40,
                  random_splitting_number=5,
                  random_splitting_enabled=False):
    """Assemble a pipeline-configuration SMBO optimizer.

    Builds intensifier, EPM, acquisition function (selected by
    ``acq_func_name``), runhistory converter, configuration-selection
    strategy and initial design, then wires them into a PCSMBO (or
    PCSMBOSigmoidRandomSearch for 'pc-roar-sigmoid-rs') instance.

    Parameters (selected)
    ---------------------
    acq_func_name : str
        One of: ei, pc-ei, m-ei, pc-m-ei, eips, pc-eips, m-eips,
        pc-m-eips, pceips, pc-m-pceips, roar, pc-roar-mrs,
        pc-roar-sigmoid-rs.
    model_target_names : list
        >1 entries -> multi-objective RF; 1 -> single-objective RF;
        0 -> random EPM.
    constant_pipeline_steps / variable_pipeline_steps /
    cached_pipeline_steps :
        Pipeline-step groupings forwarded to the PC wrappers; the
        caching variants ('pceips', 'pc-m-pceips') require all three.

    Returns
    -------
    smbo : PCSMBO or PCSMBOSigmoidRandomSearch

    Raises
    ------
    ValueError
        If ``acq_func_name`` is unknown or the required pipeline-step
        arguments are missing for caching acquisition functions.
    """
    # Build intensifier
    rng = np.random.RandomState(seed)
    traj_logger = TrajLogger(logging_directory, stats)
    intensifier = Intensifier(tae_runner=tae_runner,
                              stats=stats,
                              traj_logger=traj_logger,
                              rng=rng,
                              cutoff=scenario.cutoff,
                              deterministic=scenario.deterministic,
                              run_obj_time=scenario.run_obj == "runtime",
                              run_limit=scenario.ta_run_limit,
                              instances=intensification_instances,
                              maxR=len(intensification_instances))

    # Build model
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    if len(model_target_names) > 1:
        # model_target_names = ['cost','time']
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            target_names=model_target_names,
            bounds=bounds,
            types=types)
    elif len(model_target_names) == 1:
        model = RandomForestWithInstances(types=types, bounds=bounds)
    else:
        # No targets: fall back to a random model (ROAR-style search).
        model = RandomEPM(rng=rng)

    # Build acquisition function, runhistory2epm and local search
    num_params = len(scenario.cs.get_hyperparameters())
    if acq_func_name in ["ei", "pc-ei"]:
        acquisition_func = EI(model)
        acq_func_wrapper = PCAquisitionFunctionWrapper(
            acquisition_func=acquisition_func,
            config_space=scenario.cs,
            runhistory=runhistory,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
        runhistory2epm = RunHistory2EPM4Cost(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                   config_space=scenario.cs)
        select_configuration = SelectConfigurations(
            scenario=scenario,
            stats=stats,
            runhistory=runhistory,
            model=model,
            acq_optimizer=local_search,
            acquisition_func=acq_func_wrapper,
            rng=rng,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
    elif acq_func_name in ["m-ei", "pc-m-ei"]:
        #acquisition_func = MEI(model)
        acquisition_func = EI(model)
        acq_func_wrapper = PCAquisitionFunctionWrapper(
            acquisition_func=acquisition_func,
            config_space=scenario.cs,
            runhistory=runhistory,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
        runhistory2epm = RunHistory2EPM4Cost(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                   config_space=scenario.cs)
        # TODO: num_configs_for_marginalization
        select_configuration = SelectConfigurationsWithMarginalization(
            scenario=scenario,
            stats=stats,
            runhistory=runhistory,
            model=model,
            acq_optimizer=local_search,
            acquisition_func=acq_func_wrapper,
            rng=rng,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            num_marginalized_configurations_by_random_search=
            num_marginalized_configurations_by_random_search,
            num_configs_for_marginalization=num_configs_for_marginalization)
    elif acq_func_name in ['eips', 'pc-eips']:
        acquisition_func = EIPS(model)
        acq_func_wrapper = PCAquisitionFunctionWrapper(
            acquisition_func=acquisition_func,
            config_space=scenario.cs,
            runhistory=runhistory,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
        runhistory2epm = RunHistory2EPM4EIPS(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                   config_space=scenario.cs)
        select_configuration = SelectConfigurations(
            scenario=scenario,
            stats=stats,
            runhistory=runhistory,
            model=model,
            acq_optimizer=local_search,
            acquisition_func=acq_func_wrapper,
            rng=rng,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
    elif acq_func_name in ["m-eips", "pc-m-eips"]:
        acquisition_func = EIPS(model)
        acq_func_wrapper = PCAquisitionFunctionWrapper(
            acquisition_func=acquisition_func,
            config_space=scenario.cs,
            runhistory=runhistory,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
        runhistory2epm = RunHistory2EPM4EIPS(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                   config_space=scenario.cs)
        # TODO: num_configs_for_marginalization
        select_configuration = SelectConfigurationsWithMarginalization(
            scenario=scenario,
            stats=stats,
            runhistory=runhistory,
            model=model,
            acq_optimizer=local_search,
            acquisition_func=acq_func_wrapper,
            rng=rng,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            num_marginalized_configurations_by_random_search=
            num_marginalized_configurations_by_random_search,
            num_configs_for_marginalization=num_configs_for_marginalization)
    elif acq_func_name == 'pceips':
        acquisition_func = PCEIPS(model)
        acq_func_wrapper = PCAquisitionFunctionWrapperWithCachingReduction(
            acquisition_func=acquisition_func,
            config_space=scenario.cs,
            runhistory=runhistory,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            cached_pipeline_steps=cached_pipeline_steps)
        runhistory2epm = RunHistory2EPM4EIPS(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                   config_space=scenario.cs)
        # IDIOM FIX: compare to None with 'is', not '==' (PEP 8).
        if constant_pipeline_steps is None or variable_pipeline_steps is None \
                or cached_pipeline_steps is None:
            raise ValueError(
                "Constant_pipeline_steps and variable pipeline steps "
                "should not be none when using PCEIPS")
        select_configuration = SelectConfigurations(
            scenario=scenario,
            stats=stats,
            runhistory=runhistory,
            model=model,
            acq_optimizer=local_search,
            acquisition_func=acq_func_wrapper,
            rng=rng,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps)
    elif acq_func_name == 'pc-m-pceips':
        acquisition_func = PCEIPS(model)
        acq_func_wrapper = PCAquisitionFunctionWrapperWithCachingReduction(
            acquisition_func=acquisition_func,
            config_space=scenario.cs,
            runhistory=runhistory,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            cached_pipeline_steps=cached_pipeline_steps)
        runhistory2epm = RunHistory2EPM4EIPS(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                                   config_space=scenario.cs)
        # IDIOM FIX: compare to None with 'is', not '==' (PEP 8).
        if constant_pipeline_steps is None or variable_pipeline_steps is None \
                or cached_pipeline_steps is None:
            raise ValueError(
                "Constant_pipeline_steps and variable pipeline steps "
                "should not be none when using PCEIPS")
        select_configuration = SelectConfigurationsWithMarginalization(
            scenario=scenario,
            stats=stats,
            runhistory=runhistory,
            model=model,
            acq_optimizer=local_search,
            acquisition_func=acq_func_wrapper,
            rng=rng,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            num_marginalized_configurations_by_random_search=
            num_marginalized_configurations_by_random_search,
            num_configs_for_marginalization=num_configs_for_marginalization)
    elif acq_func_name == "roar":
        runhistory2epm = RunHistory2EPM4Cost(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        select_configuration = SelectConfigurationsRandom(scenario=scenario)
    elif acq_func_name == "pc-roar-mrs":
        runhistory2epm = RunHistory2EPM4Cost(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        select_configuration = SelectConfigurationsMRS(
            scenario=scenario,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            splitting_number=random_splitting_number,
            random_splitting_enabled=random_splitting_enabled)
    elif acq_func_name == "pc-roar-sigmoid-rs":
        runhistory2epm = RunHistory2EPM4Cost(
            scenario, num_params, success_states=[StatusType.SUCCESS])
        select_configuration = SelectConfigurationsSigmoidRS(
            scenario=scenario,
            constant_pipeline_steps=constant_pipeline_steps,
            variable_pipeline_steps=variable_pipeline_steps,
            fraction=random_splitting_number)
    else:
        # Not a valid acquisition function
        raise ValueError("The provided acquisition function is not valid")

    # Build initial design: two fully-populated random configurations.
    initial_configs = scenario.cs.sample_configuration(size=2)
    for config in initial_configs:
        config._populate_values()
    initial_design = MultiConfigInitialDesign(
        tae_runner=tae_runner,
        scenario=scenario,
        stats=stats,
        traj_logger=traj_logger,
        runhistory=runhistory,
        rng=rng,
        configs=initial_configs,
        intensifier=intensifier,
        aggregate_func=aggregate_func)

    # run id
    num_run = rng.randint(1234567980)

    # Build pc_smbo
    if acq_func_name not in ['pc-roar-sigmoid-rs']:
        smbo = PCSMBO(scenario=scenario,
                      stats=stats,
                      initial_design=initial_design,
                      runhistory=runhistory,
                      runhistory2epm=runhistory2epm,
                      intensifier=intensifier,
                      aggregate_func=aggregate_func,
                      num_run=num_run,
                      model=model,
                      rng=rng,
                      select_configuration=select_configuration,
                      double_intensification=double_intensification)
    else:
        smbo = PCSMBOSigmoidRandomSearch(
            scenario=scenario,
            stats=stats,
            initial_design=initial_design,
            runhistory=runhistory,
            runhistory2epm=runhistory2epm,
            intensifier=intensifier,
            aggregate_func=aggregate_func,
            num_run=num_run,
            model=model,
            rng=rng,
            select_configuration=select_configuration)

    return smbo
def run_smbo(self, max_iters=1000):
    """Run the SMBO (Bayesian optimization) loop for this AutoML instance.

    Workflow: load data, compute (meta-)features, seed the run with
    metalearning suggestions, build a SMAC object for the configured
    acquisition function ('EI' or 'EIPS'), then alternate
    choose-next / intensify until the budget is exhausted.

    Parameters
    ----------
    max_iters : int
        Unused in the current implementation (kept for interface
        compatibility).

    Returns
    -------
    RunHistory
        The run history populated during optimization.
    """
    # NOTE(review): `evaluator` is declared global but never assigned or
    # read in this method — presumably a leftover; confirm before removal.
    global evaluator

    self.watcher.start_task('SMBO')

    # == first things first: load the datamanager
    self.reset_data_manager()

    # == Initialize non-SMBO stuff
    # first create a scenario
    seed = self.seed
    self.config_space.seed(seed)
    num_params = len(self.config_space.get_hyperparameters())
    # allocate a run history
    num_run = self.start_num_run
    # SENTINEL is appended so the current dataset cannot collide with a
    # metadata dataset of the same name in the meta-base.
    instance_id = self.dataset_name + SENTINEL

    # Initialize some SMAC dependencies
    runhistory = RunHistory(aggregate_func=average_cost)
    # meta_runhistory = RunHistory(aggregate_func=average_cost)
    # meta_runs_dataset_indices = {}

    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.metadata_directory is None:
        metalearning_directory = os.path.dirname(
            autosklearn.metalearning.__file__)
        # There is no multilabel data in OpenML
        if self.task == MULTILABEL_CLASSIFICATION:
            meta_task = BINARY_CLASSIFICATION
        else:
            meta_task = self.task
        metadata_directory = os.path.join(
            metalearning_directory, 'files',
            '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                          TASK_TYPES_TO_STRING[meta_task],
                          'sparse' if self.datamanager.info['is_sparse']
                          else 'dense'))
        self.metadata_directory = metadata_directory

    self.logger.info('Metadata directory: %s', self.metadata_directory)
    meta_base = MetaBase(self.config_space, self.metadata_directory)

    # A quarter of the total budget is reserved for metafeature
    # calculation; the encoded-dataset pass gets whatever remains.
    metafeature_calculation_time_limit = int(
        self.total_walltime_limit / 4)
    metafeature_calculation_start_time = time.time()
    meta_features = self._calculate_metafeatures_with_limits(
        metafeature_calculation_time_limit)
    metafeature_calculation_end_time = time.time()
    metafeature_calculation_time_limit = \
        metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

    if metafeature_calculation_time_limit < 1:
        self.logger.warning('Time limit for metafeature calculation less '
                            'than 1 seconds (%f). Skipping calculation '
                            'of metafeatures for encoded dataset.',
                            metafeature_calculation_time_limit)
        meta_features_encoded = None
    else:
        with warnings.catch_warnings():
            warnings.showwarning = self._send_warnings_to_log
            self.datamanager.perform1HotEncoding()
        meta_features_encoded = \
            self._calculate_metafeatures_encoded_with_limits(
                metafeature_calculation_time_limit)

    # In case there is a problem calculating the encoded meta-features
    if meta_features is None:
        if meta_features_encoded is not None:
            meta_features = meta_features_encoded
    else:
        if meta_features_encoded is not None:
            meta_features.metafeature_values.update(
                meta_features_encoded.metafeature_values)

    if meta_features is not None:
        meta_base.add_dataset(instance_id, meta_features)
        # Do mean imputation of the meta-features - should be done specific
        # for each prediction model!
        all_metafeatures = meta_base.get_metafeatures(
            features=list(meta_features.keys()))
        all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

        with warnings.catch_warnings():
            warnings.showwarning = self._send_warnings_to_log
            metalearning_configurations = \
                self.collect_metalearning_suggestions(meta_base)
        if metalearning_configurations is None:
            metalearning_configurations = []
        self.reset_data_manager()

        self.logger.info('%s', meta_features)
        # Convert meta-features into a dictionary because the scenario
        # expects a dictionary
        meta_features_dict = {}
        for dataset, series in all_metafeatures.iterrows():
            meta_features_dict[dataset] = series.values
        meta_features_list = []
        for meta_feature_name in all_metafeatures.columns:
            meta_features_list.append(
                meta_features[meta_feature_name].value)
        meta_features_list = np.array(meta_features_list).reshape((1, -1))
        self.logger.info(list(meta_features_dict.keys()))
        # (removed: large block of commented-out prototype code that
        # replayed meta-runs into a meta run history)
    else:
        # No meta-features available: EIPS needs instance features, so
        # fall back to plain EI.
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    self.scenario = Scenario({'cs': self.config_space,
                              'cutoff-time': self.func_eval_time_limit,
                              'memory-limit': self.memory_limit,
                              'wallclock-limit': self.total_walltime_limit,
                              #'instances': [[name] for name in meta_features_dict],
                              'output-dir':
                                  self.backend.temporary_directory,
                              'shared-model': self.shared_mode,
                              'run-obj': 'quality',
                              'deterministic': 'true'})

    # TODO rebuild target algorithm to be it's own target algorithm
    # evaluator, which takes into account that a run can be killed prior
    # to the model being fully fitted; thus putting intermediate results
    # into a queue and querying them once the time is over
    ta = ExecuteTaFuncWithQueue(backend=self.backend,
                                autosklearn_seed=seed,
                                resampling_strategy=self.resampling_strategy,
                                initial_num_run=num_run,
                                logger=self.logger,
                                **self.resampling_strategy_args)

    # NOTE(review): single return value here (old get_types signature);
    # other code in this file unpacks `types, bounds` — confirm the SMAC
    # version in use.
    types = get_types(self.config_space, self.scenario.feature_array)

    # TODO extract generation of SMAC object into it's own function for
    # testing
    if self.acquisition_function == 'EI':
        model = RandomForestWithInstances(types,
                                          #instance_features=meta_features_list,
                                          seed=1, num_trees=10)
        smac = SMAC(scenario=self.scenario,
                    model=model,
                    rng=seed,
                    tae_runner=ta,
                    runhistory=runhistory)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types, num_trees=10,
            instance_features=meta_features_list, seed=1)
        acquisition_function = EIPS(model)
        smac = SMAC(scenario=self.scenario,
                    tae_runner=ta,
                    acquisition_function=acquisition_function,
                    model=model,
                    runhistory2epm=rh2EPM,
                    rng=seed,
                    runhistory=runhistory)
    else:
        raise ValueError('Unknown acquisition function value %s!' %
                         self.acquisition_function)

    # NOTE(review): start_timing() and initial_design.run() are each
    # invoked twice (here and again below) — the initial configurations
    # are therefore evaluated twice; looks like a merge artifact, confirm
    # before removing either pair.
    smac.solver.stats.start_timing()
    smac.solver.incumbent = smac.solver.initial_design.run()

    # (removed: large block of commented-out prototype code training a
    # runtime model from the meta run history)

    smac.solver.stats.start_timing()

    # == first, evaluate all metelearning and default configurations
    smac.solver.incumbent = smac.solver.initial_design.run()

    for challenger in metalearning_configurations:
        smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
            challengers=[challenger],
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=self.total_walltime_limit)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    # == after metalearning run SMAC loop
    while True:
        if smac.solver.scenario.shared_model:
            pSMAC.read(run_history=smac.solver.runhistory,
                       output_directory=self.scenario.output_dir,
                       configuration_space=self.config_space,
                       logger=self.logger)

        choose_next_start_time = time.time()
        try:
            challengers = self.choose_next(smac)
        except Exception as e:
            # Best-effort fallback: never abort the loop because the
            # surrogate failed; continue with a random configuration.
            self.logger.error(e)
            self.logger.error("Error in getting next configurations "
                              "with SMAC. Using random configuration!")
            next_config = self.config_space.sample_configuration()
            challengers = [next_config]
        time_for_choose_next = time.time() - choose_next_start_time
        self.logger.info('Used %g seconds to find next '
                         'configurations' % (time_for_choose_next))

        # Bound intensification by the time just spent choosing, so the
        # two phases stay roughly balanced.
        smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
            challengers=challengers,
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=time_for_choose_next)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    self.runhistory = smac.solver.runhistory
    return runhistory
def __init__(self, model_type='gp_mcmc', **kwargs):
    """Facade for SMAC with a Gaussian-process surrogate model.

    Builds a GP (or GP-MCMC) model over the configuration space when no
    model is supplied, then delegates to the parent facade. See
    ~smac.facade.smac_facade for documentation of **kwargs.

    Parameters
    ----------
    model_type : str
        Either 'gp' (point-estimate hyperparameters) or 'gp_mcmc'
        (hyperparameters marginalized via MCMC).

    Raises
    ------
    ValueError
        If `model_type` is not one of 'gp' / 'gp_mcmc'.
    NotImplementedError
        If the scenario defines instance features (GPs here are
        instance-free).
    """
    scenario = kwargs['scenario']

    # GP initial designs must be space-filling; force SOBOL otherwise.
    if scenario.initial_incumbent not in ['LHD', 'FACTORIAL', 'SOBOL']:
        scenario.initial_incumbent = 'SOBOL'

    # BUGFIX: was `scenario.transform_y is 'NONE'` — identity comparison
    # with a string literal is interning-dependent and raises a
    # SyntaxWarning on modern CPython; equality is the intended check.
    if scenario.transform_y == 'NONE':
        scenario.transform_y = "LOGS"

    if kwargs.get('model') is None:
        _, rng = get_rng(rng=kwargs.get("rng", None),
                         run_id=kwargs.get("run_id", None),
                         logger=None)

        cov_amp = 2
        types, bounds = get_types(kwargs['scenario'].cs,
                                  instance_features=None)
        n_dims = len(types)

        initial_ls = np.ones([n_dims])
        exp_kernel = george.kernels.Matern52Kernel(initial_ls,
                                                   ndim=n_dims)
        kernel = cov_amp * exp_kernel

        prior = DefaultPrior(len(kernel) + 1, rng=rng)

        # emcee requires an even number of walkers
        n_hypers = 3 * len(kernel)
        if n_hypers % 2 == 1:
            n_hypers += 1

        if model_type == "gp":
            model = GaussianProcess(
                types=types,
                bounds=bounds,
                kernel=kernel,
                prior=prior,
                rng=rng,
                normalize_output=True,
                normalize_input=True,
            )
        elif model_type == "gp_mcmc":
            model = GaussianProcessMCMC(
                types=types,
                bounds=bounds,
                kernel=kernel,
                prior=prior,
                n_hypers=n_hypers,
                chain_length=200,
                burnin_steps=100,
                normalize_input=True,
                normalize_output=True,
                rng=rng,
            )
        else:
            # Previously an unknown model_type fell through and crashed
            # later with UnboundLocalError; fail fast instead.
            raise ValueError("Unknown model_type '%s'; expected 'gp' or "
                             "'gp_mcmc'." % model_type)
        kwargs['model'] = model

    super().__init__(**kwargs)

    if self.solver.scenario.n_features > 0:
        raise NotImplementedError("BOGP cannot handle instances")

    self.logger.info(self.__class__)

    self.solver.random_configuration_chooser.prob = 0.0
    # only 1 configuration per SMBO iteration
    self.solver.scenario.intensification_percentage = 1e-10
    self.solver.intensifier.min_chall = 1
    # better improve acqusition function optimization
    # 1. increase number of sls iterations
    self.solver.acq_optimizer.n_sls_iterations = 100
    # 2. more randomly sampled configurations
    self.solver.scenario.acq_opt_challengers = 1000
    # activate predict incumbent
    self.solver.predict_incumbent = True
def plot_cost_over_time(self, rh, traj, output="performance_over_time.png",
                        validator=None):
    """Plot incumbent performance over wallclock time.

    Uses all trajectory entries; costs are predicted by the validator's
    EPM if present, otherwise a new RandomForestWithInstances is trained
    on the given runhistory so that uncertainties can be plotted.

    Parameters
    ----------
    rh: RunHistory
        runhistory to use
    traj: List
        trajectory to take times/incumbents from
    output: str
        path to output-png
    validator: Validator
        validator whose (optionally pre-trained) EPM is used for cost
        prediction; its `traj` attribute is overwritten here
    """
    self.logger.debug("Estimating costs over time for best run.")
    validator.traj = traj  # set trajectory

    # BUGFIX: the local list was named `time`, shadowing the `time`
    # module for the rest of the function; renamed to `times`.
    times, configs = [], []
    for entry in traj:
        times.append(entry["wallclock_time"])
        configs.append(entry["incumbent"])
    self.logger.debug("Using %d samples (%d distinct) from trajectory.",
                      len(times), len(set(configs)))

    if validator.epm:  # not log as validator epm is trained on cost, not log cost
        epm = validator.epm
    else:
        self.logger.debug(
            "No EPM passed! Training new one from runhistory.")
        # Train random forest and transform training data (from given rh)
        # Not using validator because we want to plot uncertainties
        rh2epm = RunHistory2EPM4Cost(num_params=len(
            self.scenario.cs.get_hyperparameters()),
            scenario=self.scenario)
        X, y = rh2epm.transform(rh)
        self.logger.debug("Training model with data of shape X: %s, y:%s",
                          str(X.shape), str(y.shape))
        types, bounds = get_types(self.scenario.cs,
                                  self.scenario.feature_array)
        epm = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            instance_features=self.scenario.feature_array,
            #seed=self.rng.randint(MAXINT),
            ratio_features=1.0)
        epm.train(X, y)

    # predict performance for all configurations in trajectory;
    # predictions are marginalized over the training instances
    config_array = convert_configurations_to_array(configs)
    mean, var = epm.predict_marginalized_over_instances(config_array)

    mean = mean[:, 0]
    var = var[:, 0]
    uncertainty_upper = mean + np.sqrt(var)
    uncertainty_lower = mean - np.sqrt(var)
    if self.scenario.run_obj == 'runtime':
        # We have to clip at 0 as we want to put y on the logscale
        uncertainty_lower[uncertainty_lower < 0] = 0
        uncertainty_upper[uncertainty_upper < 0] = 0

    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_ylabel('performance')
    ax.set_xlabel('time [sec]')
    ax.plot(times, mean, 'r-', label="estimated performance")
    ax.fill_between(times, uncertainty_upper, uncertainty_lower,
                    alpha=0.8, label="standard deviation")
    ax.set_xscale("log", nonposx='clip')
    if self.scenario.run_obj == 'runtime':
        ax.set_yscale('log')

    # start after 1% of the configuration budget
    ax.set_xlim(min(times) + (max(times) - min(times)) * 0.01, max(times))

    ax.legend()
    plt.tight_layout()
    fig.savefig(output)
    plt.close(fig)
def run_experiment():
    """End-to-end smoke test of configuration selection with
    marginalization: build a pipeline config space, seed a run history
    with sampled configurations, and print the first challenger."""
    # Assemble the six-stage pipeline and its configuration space.
    space = PipelineSpace()
    space.add_pipeline_steps([
        OneHotEncodingStep(),
        ImputationStep(),
        RescalingStep(),
        BalancingStep(),
        PreprocessingStep(),
        ClassificationStep(),
    ])
    config_space = ConfigSpaceBuilder(space).build_config_space()

    runhistory = PCRunHistory(average_cost)

    scenario = Scenario({
        'cs': config_space,
        'run_obj': "quality",
        'runcount_limit': 100,
        'wallclock_limit': 100,
        'memory_limit': 100,
        'cutoff_time': 100,
        'deterministic': "true",
    })

    # Build stats
    stats = Stats(scenario, output_dir=None, stamp="")

    types, bounds = get_types(scenario.cs, scenario.feature_array)
    model = RandomForestWithInstances(types=types, bounds=bounds)

    constant_pipeline_steps = [
        "one_hot_encoder", "imputation", "rescaling",
        "balancing", "feature_preprocessor",
    ]
    variable_pipeline_steps = ["classifier"]
    rng = np.random.RandomState()
    num_params = len(scenario.cs.get_hyperparameters())

    # Acquisition function wrapped for pipeline-aware marginalization.
    acquisition_func = EI(model)
    acq_func_wrapper = PCAquisitionFunctionWrapper(
        acquisition_func=acquisition_func,
        config_space=scenario.cs,
        runhistory=runhistory,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps)
    runhistory2epm = RunHistory2EPM4Cost(
        scenario, num_params, success_states=[StatusType.SUCCESS])
    local_search = LocalSearch(acquisition_function=acq_func_wrapper,
                               config_space=scenario.cs)
    select_configuration = SelectConfigurationsWithMarginalization(
        scenario=scenario,
        stats=stats,
        runhistory=runhistory,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acq_func_wrapper,
        rng=rng,
        constant_pipeline_steps=constant_pipeline_steps,
        variable_pipeline_steps=variable_pipeline_steps,
        num_marginalized_configurations_by_random_search=40,
        num_configs_for_marginalization=200)

    # sample configurations to fill runhistory
    sample_configs = config_space.sample_configuration(size=10)
    for cfg in sample_configs:
        runhistory.add(cfg, 1, 1, StatusType.SUCCESS)

    # test select_configurations procedure
    X, Y = runhistory2epm.transform(runhistory)
    challengers = select_configuration.run(
        X, Y,
        sample_configs[0],
        num_configurations_by_random_search_sorted=100,
        num_configurations_by_local_search=10,
        random_leaf_size=1)

    print(challengers[0])
def __init__(
        self,
        scenario: Scenario,
        # TODO: once we drop python3.4 add type hint
        # typing.Union[ExecuteTARun, callable]
        tae_runner=None,
        runhistory: RunHistory = None,
        intensifier: Intensifier = None,
        acquisition_function: AbstractAcquisitionFunction = None,
        model: AbstractEPM = None,
        runhistory2epm: AbstractRunHistory2EPM = None,
        initial_design: InitialDesign = None,
        initial_configurations: typing.List[Configuration] = None,
        stats: Stats = None,
        rng: np.random.RandomState = None,
        run_id: int = 1):
    """Construct the EPILS facade.

    Wires together all SMBO components (stats, runhistory, EPM,
    acquisition function, local search, target-algorithm runner,
    intensifier, initial design, runhistory-to-EPM converter), creating
    defaults for any that are not supplied, and builds the
    EPILS_Solver.

    Raises
    ------
    TypeError
        If `tae_runner` is neither None, callable, nor an ExecuteTARun.
    ValueError
        On objective mismatch, conflicting initial-design arguments,
        unknown `initial_incumbent`, or unknown run objective.
    """
    self.logger = logging.getLogger(
        self.__module__ + "." + self.__class__.__name__)

    aggregate_func = average_cost

    self.runhistory = None
    self.trajectory = None

    # initialize stats object
    if stats:
        self.stats = stats
    else:
        self.stats = Stats(scenario)

    self.output_dir = create_output_directory(scenario, run_id)
    scenario.write()

    # initialize empty runhistory
    if runhistory is None:
        runhistory = RunHistory(aggregate_func=aggregate_func)
    # inject aggr_func if necessary
    if runhistory.aggregate_func is None:
        runhistory.aggregate_func = aggregate_func

    # initial random number generator
    num_run, rng = self._get_rng(rng=rng)

    # reset random number generator in config space to draw different
    # random configurations with each seed given to SMAC
    scenario.cs.seed(rng.randint(MAXINT))

    # initial Trajectory Logger
    traj_logger = TrajLogger(output_dir=self.output_dir,
                             stats=self.stats)

    # initial EPM: default is a random forest configured from the
    # scenario's rf_* settings
    types, bounds = get_types(scenario.cs, scenario.feature_array)
    if model is None:
        model = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            instance_features=scenario.feature_array,
            seed=rng.randint(MAXINT),
            pca_components=scenario.PCA_DIM,
            num_trees=scenario.rf_num_trees,
            do_bootstrapping=scenario.rf_do_bootstrapping,
            ratio_features=scenario.rf_ratio_features,
            min_samples_split=scenario.rf_min_samples_split,
            min_samples_leaf=scenario.rf_min_samples_leaf,
            max_depth=scenario.rf_max_depth)

    # initial acquisition function: LogEI for runtime (log-transformed
    # costs), plain EI otherwise
    if acquisition_function is None:
        if scenario.run_obj == "runtime":
            acquisition_function = LogEI(model=model)
        else:
            acquisition_function = EI(model=model)
    # inject model if necessary
    if acquisition_function.model is None:
        acquisition_function.model = model

    # initialize optimizer on acquisition function
    local_search = LocalSearch(
        acquisition_function,
        scenario.cs,
        max_steps=scenario.sls_max_steps,
        n_steps_plateau_walk=scenario.sls_n_steps_plateau_walk)

    # initialize tae_runner
    # First case, if tae_runner is None, the target algorithm is a call
    # string in the scenario file
    if tae_runner is None:
        tae_runner = ExecuteTARunOld(
            ta=scenario.ta,
            stats=self.stats,
            run_obj=scenario.run_obj,
            runhistory=runhistory,
            par_factor=scenario.par_factor,
            cost_for_crash=scenario.cost_for_crash)
    # Second case, the tae_runner is a function to be optimized
    elif callable(tae_runner):
        tae_runner = ExecuteTAFuncDict(
            ta=tae_runner,
            stats=self.stats,
            run_obj=scenario.run_obj,
            memory_limit=scenario.memory_limit,
            runhistory=runhistory,
            par_factor=scenario.par_factor,
            cost_for_crash=scenario.cost_for_crash)
    # Third case, if it is an ExecuteTaRun we can simply use the
    # instance. Otherwise, the next check raises an exception
    elif not isinstance(tae_runner, ExecuteTARun):
        raise TypeError("Argument 'tae_runner' is %s, but must be "
                        "either a callable or an instance of "
                        "ExecuteTaRun. Passing 'None' will result in the "
                        "creation of target algorithm runner based on the "
                        "call string in the scenario file."
                        % type(tae_runner))

    # Check that overall objective and tae objective are the same
    if tae_runner.run_obj != scenario.run_obj:
        raise ValueError("Objective for the target algorithm runner and "
                         "the scenario must be the same, but are '%s' and "
                         "'%s'" % (tae_runner.run_obj, scenario.run_obj))

    # inject stats if necessary
    if tae_runner.stats is None:
        tae_runner.stats = self.stats
    # inject runhistory if necessary
    if tae_runner.runhistory is None:
        tae_runner.runhistory = runhistory
    # inject cost_for_crash
    if tae_runner.crash_cost != scenario.cost_for_crash:
        tae_runner.crash_cost = scenario.cost_for_crash

    # initialize intensification
    if intensifier is None:
        intensifier = Intensifier(
            tae_runner=tae_runner,
            stats=self.stats,
            traj_logger=traj_logger,
            rng=rng,
            instances=scenario.train_insts,
            cutoff=scenario.cutoff,
            deterministic=scenario.deterministic,
            run_obj_time=scenario.run_obj == "runtime",
            always_race_against=scenario.cs.get_default_configuration()
            if scenario.always_race_default else None,
            instance_specifics=scenario.instance_specific,
            minR=scenario.minR,
            maxR=scenario.maxR,
            adaptive_capping_slackfactor=(
                scenario.intens_adaptive_capping_slackfactor),
            min_chall=scenario.intens_min_chall)
    # inject deps if necessary
    if intensifier.tae_runner is None:
        intensifier.tae_runner = tae_runner
    if intensifier.stats is None:
        intensifier.stats = self.stats
    if intensifier.traj_logger is None:
        intensifier.traj_logger = traj_logger

    # initial design
    if initial_design is not None and initial_configurations is not None:
        raise ValueError(
            "Either use initial_design or initial_configurations; but not both")

    if initial_configurations is not None:
        initial_design = MultiConfigInitialDesign(
            tae_runner=tae_runner,
            scenario=scenario,
            stats=self.stats,
            traj_logger=traj_logger,
            runhistory=runhistory,
            rng=rng,
            configs=initial_configurations,
            intensifier=intensifier,
            aggregate_func=aggregate_func)
    elif initial_design is None:
        if scenario.initial_incumbent == "DEFAULT":
            initial_design = DefaultConfiguration(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng)
        elif scenario.initial_incumbent == "RANDOM":
            initial_design = RandomConfiguration(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng)
        else:
            raise ValueError("Don't know what kind of initial_incumbent "
                             "'%s' is" % scenario.initial_incumbent)
    # inject deps if necessary
    if initial_design.tae_runner is None:
        initial_design.tae_runner = tae_runner
    if initial_design.scenario is None:
        initial_design.scenario = scenario
    if initial_design.stats is None:
        initial_design.stats = self.stats
    if initial_design.traj_logger is None:
        initial_design.traj_logger = traj_logger

    # initial conversion of runhistory into EPM data
    if runhistory2epm is None:
        num_params = len(scenario.cs.get_hyperparameters())
        if scenario.run_obj == "runtime":
            # if we log the performance data,
            # the RFRImputator will already get
            # log transform data from the runhistory
            cutoff = np.log(scenario.cutoff)
            threshold = np.log(scenario.cutoff * scenario.par_factor)

            imputor = RFRImputator(rng=rng,
                                   cutoff=cutoff,
                                   threshold=threshold,
                                   model=model,
                                   change_threshold=0.01,
                                   max_iter=2)

            runhistory2epm = RunHistory2EPM4LogCost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=True,
                impute_state=[StatusType.CAPPED, ],
                imputor=imputor)

        elif scenario.run_obj == 'quality':
            runhistory2epm = RunHistory2EPM4Cost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=False,
                impute_state=None)

        else:
            # BUGFIX: previously referenced self.scenario.run_obj, but
            # self.scenario is never set in this constructor — that
            # raised AttributeError instead of the intended ValueError.
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % scenario.run_obj)

    # inject scenario if necessary:
    if runhistory2epm.scenario is None:
        runhistory2epm.scenario = scenario

    self.solver = EPILS_Solver(
        scenario=scenario,
        stats=self.stats,
        initial_design=initial_design,
        runhistory=runhistory,
        runhistory2epm=runhistory2epm,
        intensifier=intensifier,
        aggregate_func=aggregate_func,
        num_run=num_run,
        model=model,
        acq_optimizer=local_search,
        acquisition_func=acquisition_function,
        rng=rng)
def _component_builder(self, conf: typing.Union[Configuration, dict]) \
        -> typing.Tuple[AbstractAcquisitionFunction, AbstractEPM]:
    """ builds new Acquisition function object
        and EPM object and returns these

        Parameters
        ----------
        conf: typing.Union[Configuration, dict]
            configuration specifying "model" and "acq_func"

        Returns
        -------
        typing.Tuple[AbstractAcquisitionFunction, AbstractEPM]

        Raises
        ------
        ValueError
            if conf["model"] or conf["acq_func"] is not supported
    """
    types, bounds = get_types(
        self.config_space, instance_features=self.scenario.feature_array)

    if conf["model"] == "RF":
        # Random-forest EPM; per-component hyperparameters fall back to
        # the scenario defaults.
        model = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            instance_features=self.scenario.feature_array,
            seed=self.rng.randint(MAXINT),
            pca_components=conf.get("pca_dim", self.scenario.PCA_DIM),
            log_y=conf.get("log_y",
                           self.scenario.transform_y in ["LOG", "LOGS"]),
            num_trees=conf.get("num_trees", self.scenario.rf_num_trees),
            do_bootstrapping=conf.get("do_bootstrapping",
                                      self.scenario.rf_do_bootstrapping),
            ratio_features=conf.get("ratio_features",
                                    self.scenario.rf_ratio_features),
            min_samples_split=conf.get("min_samples_split",
                                       self.scenario.rf_min_samples_split),
            min_samples_leaf=conf.get("min_samples_leaf",
                                      self.scenario.rf_min_samples_leaf),
            max_depth=conf.get("max_depth", self.scenario.rf_max_depth))
    elif conf["model"] == "GP":
        cov_amp = 2
        n_dims = len(types)

        initial_ls = np.ones([n_dims])
        exp_kernel = george.kernels.Matern52Kernel(initial_ls,
                                                   ndim=n_dims)
        kernel = cov_amp * exp_kernel

        prior = DefaultPrior(len(kernel) + 1, rng=self.rng)

        # emcee requires an even number of walkers
        n_hypers = 3 * len(kernel)
        if n_hypers % 2 == 1:
            n_hypers += 1

        model = GaussianProcessMCMC(
            types=types,
            bounds=bounds,
            kernel=kernel,
            prior=prior,
            n_hypers=n_hypers,
            chain_length=200,
            burnin_steps=100,
            normalize_input=True,
            normalize_output=True,
            rng=self.rng,
        )
    else:
        # BUGFIX: an unsupported model previously fell through and the
        # later `return acq, model` raised UnboundLocalError.
        raise ValueError("Unknown model: %s" % conf["model"])

    if conf["acq_func"] == "EI":
        acq = EI(model=model, par=conf.get("par_ei", 0))
    elif conf["acq_func"] == "LCB":
        acq = LCB(model=model, par=conf.get("par_lcb", 0))
    elif conf["acq_func"] == "PI":
        acq = PI(model=model, par=conf.get("par_pi", 0))
    elif conf["acq_func"] == "LogEI":
        # par value should be in log-space
        acq = LogEI(model=model, par=conf.get("par_logei", 0))
    else:
        # BUGFIX: same fall-through as above for unknown acq_func.
        raise ValueError("Unknown acquisition function: %s"
                         % conf["acq_func"])

    return acq, model
def create_optimizer(self):
    """Assemble an SMBO optimizer from this object's scenario, stats,
    runhistory, and private evaluator, and store it on `self.smbo`."""
    from smac.epm.rf_with_instances import RandomForestWithInstances
    from smac.initial_design.default_configuration_design import DefaultConfiguration
    from smac.intensification.intensification import Intensifier
    from smac.optimizer.smbo import SMBO
    from smac.optimizer.acquisition import EI
    from smac.optimizer.ei_optimization import InterleavedLocalAndRandomSearch
    from smac.optimizer.objective import average_cost
    from smac.runhistory.runhistory2epm import RunHistory2EPM4Cost
    from smac.tae.execute_ta_run import StatusType
    from smac.utils.constants import MAXINT
    from smac.utils.util_funcs import get_types

    evaluator = self._priv_evaluator

    # Surrogate model over the (config x instance-feature) space.
    types, bounds = get_types(self.scenario.cs, self.scenario.feature_array)
    model = RandomForestWithInstances(
        types=types,
        bounds=bounds,
        seed=self.rng.randint(MAXINT),
        instance_features=self.scenario.feature_array,
        pca_components=self.scenario.PCA_DIM)
    acq_func = EI(model=model)

    # Converts the runhistory into EPM training data; crashed runs are
    # treated as successes here (their cost still informs the model).
    runhistory2epm = RunHistory2EPM4Cost(
        scenario=self.scenario,
        num_params=len(self.param_space),
        success_states=[StatusType.SUCCESS, StatusType.CRASHED],
        impute_censored_data=False,
        impute_state=None)

    race_default = (self.scenario.cs.get_default_configuration()
                    if self.scenario.always_race_default else None)
    intensifier = Intensifier(
        tae_runner=evaluator,
        stats=self.stats,
        traj_logger=self.traj_logger,
        rng=self.rng,
        instances=self.scenario.train_insts,
        cutoff=self.scenario.cutoff,
        deterministic=self.scenario.deterministic,
        run_obj_time=self.scenario.run_obj == "runtime",
        always_race_against=race_default,
        instance_specifics=self.scenario.instance_specific,
        minR=self.scenario.minR,
        maxR=self.scenario.maxR)

    initial_design = DefaultConfiguration(
        tae_runner=evaluator,
        scenario=self.scenario,
        stats=self.stats,
        traj_logger=self.traj_logger,
        rng=self.rng)

    acq_optimizer = InterleavedLocalAndRandomSearch(
        acq_func,
        self.scenario.cs,
        np.random.RandomState(seed=self.rng.randint(MAXINT)))

    self.smbo = SMBO(
        scenario=self.scenario,
        stats=self.stats,
        initial_design=initial_design,
        runhistory=self.runhistory,
        runhistory2epm=runhistory2epm,
        intensifier=intensifier,
        aggregate_func=average_cost,
        num_run=self.seed,
        model=model,
        acq_optimizer=acq_optimizer,
        acquisition_func=acq_func,
        rng=self.rng,
        restore_incumbent=None)
def run(self):
    """
    Implementation of the forward selection loop.
    Uses SMACs EPM (RF) wrt the feature space to minimize the OOB error.

    Starting from the parameter columns only, greedily adds the instance
    feature whose inclusion yields the lowest out-of-bag error, stopping
    when adding nothing is best or the error no longer improves.

    Returns
    -------
    feature_importance: OrderedDict
        dict_keys (first key -> most important) -> OOB error
    """
    parameters = [p.name for p in self.scenario.cs.get_hyperparameters()]
    self.logger.debug("Parameters: %s", parameters)

    rh2epm = RunHistory2EPM4Cost(scenario=self.scenario,
                                 num_params=len(parameters),
                                 success_states=[
                                     StatusType.SUCCESS,
                                     StatusType.CAPPED,
                                     StatusType.CRASHED
                                 ],
                                 impute_censored_data=False,
                                 impute_state=None)
    X, y = rh2epm.transform(self.rh)

    # reduce sample size to speedup computation
    # NOTE(review): uses the global numpy RNG, so subsampling is not
    # reproducible across calls — confirm whether a seeded RNG is wanted.
    if X.shape[0] > self.MAX_SAMPLES:
        idx = np.random.choice(X.shape[0],
                               size=self.MAX_SAMPLES,
                               replace=False)
        X = X[idx, :]
        y = y[idx]

    self.logger.debug(
        "Shape of X: %s, of y: %s, #parameters: %s, #feats: %s",
        X.shape, y.shape, len(parameters),
        len(self.scenario.feature_names))
    names = copy.deepcopy(self.scenario.feature_names)
    self.logger.debug("Features: %s", names)

    # `used` holds column indices currently in the model: the parameter
    # columns (always kept) plus accepted feature columns. Feature ids
    # start at len(parameters) — see the enumerate offset below.
    used = list(range(0, len(parameters)))
    feat_ids = {f: i for i, f in enumerate(names, len(used))}
    ids_feat = {i: f for f, i in feat_ids.items()}
    self.logger.debug("Used: %s", used)
    evaluated_feature_importance = OrderedDict()

    types, bounds = get_types(self.scenario.cs,
                              self.scenario.feature_array)

    last_error = np.inf

    for _round in range(self.to_evaluate):  # Main Loop
        errors = []
        # Try each remaining candidate feature in isolation: append its
        # column, refit, record OOB error, then pop it back off.
        for f in names:
            i = feat_ids[f]
            self.logger.debug('Evaluating %s', f)
            used.append(i)
            self.logger.debug(
                'Used features: %s',
                str([ids_feat[j] for j in used[len(parameters):]]))

            start = time.time()
            self._refit_model(types[sorted(used)], bounds,
                              X[:, sorted(used)],
                              y)  # refit the model every round
            errors.append(self.model.rf.out_of_bag_error())
            used.pop()
            self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                              (time.time() - start, errors[-1]))
        else:
            # NOTE: the inner for-loop contains no break, so this
            # for-else branch always runs — it appends the baseline
            # "add nothing" error as the last entry of `errors`.
            self.logger.debug('Evaluating None')
            start = time.time()
            self._refit_model(types[sorted(used)], bounds,
                              X[:, sorted(used)],
                              y)  # refit the model every round
            errors.append(self.model.rf.out_of_bag_error())
            self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                              (time.time() - start, errors[-1]))

        if _round == 0:
            evaluated_feature_importance['None'] = errors[-1]

        best_idx = np.argmin(errors)
        lowest_error = errors[best_idx]

        # Last entry is the "add nothing" baseline (see for-else above).
        if best_idx == len(errors) - 1:
            self.logger.info('Best thing to do is add nothing')
            best_feature = 'None'
            # evaluated_feature_importance[best_feature] = lowest_error
            break
        elif lowest_error >= last_error:
            # No improvement over the previous round: stop.
            break
        else:
            last_error = lowest_error
            # Accept the best feature: remove it from the candidate
            # pool and keep its column in `used` from now on.
            best_feature = names.pop(best_idx)
            used.append(feat_ids[best_feature])
            self.logger.debug('%s: %.4f' % (best_feature, lowest_error))

        evaluated_feature_importance[best_feature] = lowest_error
        self.logger.debug(evaluated_feature_importance)

    self.evaluated_feature_importance = evaluated_feature_importance
    return evaluated_feature_importance