def test_add_and_pickle(self):
    '''simply adding some rundata to runhistory, then pickle it'''
    rh = RunHistory()
    cs = get_config_space()
    config = Configuration(cs, values={'a': 1, 'b': 2})

    self.assertTrue(rh.empty())

    rh.add(config=config, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=None,
           seed=None, additional_info=None)
    rh.add(config=config, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=1,
           seed=12354, additional_info={"start_time": 10})

    self.assertFalse(rh.empty())

    tmpfile = tempfile.NamedTemporaryFile(mode='wb', delete=False)
    pickle.dump(rh, tmpfile, -1)
    name = tmpfile.name
    tmpfile.close()

    with open(name, 'rb') as fh:
        loaded_rh = pickle.load(fh)

    self.assertEqual(loaded_rh.data, rh.data)
def test_add_and_pickle(self):
    """Simply adding some rundata to runhistory, then pickle it."""
    rh = RunHistory()
    cs = get_config_space()
    config = Configuration(cs, values={"a": 1, "b": 2})

    self.assertTrue(rh.empty())

    rh.add(
        config=config,
        cost=[10, 20],
        time=20,
        status=StatusType.SUCCESS,
        instance_id=None,
        seed=None,
        starttime=100,
        endtime=120,
        additional_info=None,
    )

    rh.add(
        config=config,
        cost=[4.5, 5.5],
        time=20,
        status=StatusType.SUCCESS,
        instance_id=1,
        seed=12354,
        starttime=10,
        endtime=30,
        additional_info={"start_time": 10},
    )

    rh.add(
        config=config,
        cost=["4.8", "5.8"],
        time=20,
        status=StatusType.SUCCESS,
        instance_id=1,
        seed=12354,
        starttime=10,
        endtime=30,
        additional_info={"start_time": 10},
    )

    self.assertFalse(rh.empty())

    tmpfile = tempfile.NamedTemporaryFile(mode="wb", delete=False)
    pickle.dump(rh, tmpfile, -1)
    name = tmpfile.name
    tmpfile.close()

    with open(name, "rb") as fh:
        loaded_rh = pickle.load(fh)

    self.assertEqual(loaded_rh.data, rh.data)
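# Hedged helper sketch (not part of the test suite above): the pickle
# round-trip the two tests rely on, factored out for illustration only.
# Protocol -1 asks pickle for its highest available protocol; the tests then
# compare the reloaded object to the original via its .data attribute.
import pickle
import tempfile


def roundtrip_runhistory(rh):
    """Dump ``rh`` to a temporary file and return the reloaded copy."""
    with tempfile.NamedTemporaryFile(mode="wb", delete=False) as tmpfile:
        pickle.dump(rh, tmpfile, -1)  # -1 selects the highest pickle protocol
        name = tmpfile.name
    with open(name, "rb") as fh:
        return pickle.load(fh)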
def test_illegal_input(self):
    rh = RunHistory()
    cs = get_config_space()
    config = Configuration(cs, values={"a": 1, "b": 2})

    self.assertTrue(rh.empty())

    with pytest.raises(ValueError):
        rh.add(
            config=config,
            cost=[4.5, 5.5, 6.5],
            time=20,
            status=StatusType.SUCCESS,
            instance_id=1,
            seed=12354,
            starttime=10,
            endtime=30,
            additional_info={"start_time": 10},
        )

        rh.add(
            config=config,
            cost=[2.5, 5.5],
            time=20,
            status=StatusType.SUCCESS,
            instance_id=1,
            seed=12354,
            starttime=10,
            endtime=30,
            additional_info={"start_time": 10},
        )
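# Minimal sketch of the check test_illegal_input exercises. This is an
# assumption for illustration, not the actual SMAC implementation: the sketch
# fixes the number of objectives on the first add() and rejects later cost
# vectors of a different length with a ValueError.
class _RunHistoryLengthCheckSketch:
    def __init__(self):
        self._n_objectives = None
        self.data = {}

    def add(self, config, cost, **run_info):
        cost = list(cost) if isinstance(cost, (list, tuple)) else [cost]
        if self._n_objectives is None:
            self._n_objectives = len(cost)
        elif len(cost) != self._n_objectives:
            raise ValueError(
                "Cost has %d values, but %d objectives were registered before."
                % (len(cost), self._n_objectives)
            )
        key = (id(config), run_info.get("instance_id"), run_info.get("seed"))
        self.data[key] = cost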
def _get_initial_points(
    self,
    num_points: int,
    runhistory: RunHistory,
    additional_start_points: Optional[List[Tuple[float, Configuration]]],
) -> List[Configuration]:
    if runhistory.empty():
        init_points = self.config_space.sample_configuration(size=num_points)
    else:
        # initiate local search
        configs_previous_runs = runhistory.get_all_configs()

        # configurations with the highest previous EI
        configs_previous_runs_sorted = self._sort_configs_by_acq_value(
            configs_previous_runs)
        configs_previous_runs_sorted = [
            conf[1] for conf in configs_previous_runs_sorted[:num_points]
        ]

        # configurations with the lowest predictive cost,
        # check for None to make unit tests work
        if self.acquisition_function.model is not None:
            conf_array = convert_configurations_to_array(configs_previous_runs)
            costs = self.acquisition_function.model.predict_marginalized_over_instances(
                conf_array)[0]
            # From here:
            # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
            random = self.rng.rand(len(costs))
            # Last column is primary sort key!
            indices = np.lexsort((random.flatten(), costs.flatten()))
            # Cannot use zip here because the indices array cannot index the
            # rand_configs list, because the second is a pure python list
            configs_previous_runs_sorted_by_cost = [
                configs_previous_runs[ind] for ind in indices
            ][:num_points]
        else:
            configs_previous_runs_sorted_by_cost = []

        if additional_start_points is not None:
            additional_start_points = [
                asp[1] for asp in additional_start_points[:num_points]
            ]
        else:
            additional_start_points = []

        init_points = []
        init_points_as_set = set()  # type: Set[Configuration]
        for cand in itertools.chain(
                configs_previous_runs_sorted,
                configs_previous_runs_sorted_by_cost,
                additional_start_points,
        ):
            if cand not in init_points_as_set:
                init_points.append(cand)
                init_points_as_set.add(cand)

    return init_points
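# Self-contained illustration of the np.lexsort trick used in
# _get_initial_points above: np.lexsort treats the *last* key as the primary
# sort key, so costs drive the ordering while a vector of uniform random
# numbers breaks ties between equal costs. The example values are made up.
import numpy as np

rng = np.random.RandomState(1)
costs = np.array([0.3, 0.1, 0.3, 0.2])
random_tiebreak = rng.rand(len(costs))
indices = np.lexsort((random_tiebreak, costs))  # last key is primary
print(indices)  # 1 and 3 come first; the two 0.3 entries follow in random order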
def test_add(self):
    '''simply adding some rundata to runhistory'''
    rh = RunHistory()
    cs = get_config_space()
    config = Configuration(cs, values={'a': 1, 'b': 2})

    self.assertTrue(rh.empty())

    rh.add(config=config, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=None,
           seed=None, additional_info=None)
    rh.add(config=config, cost=10, time=20,
           status=StatusType.SUCCESS, instance_id=1,
           seed=12354, additional_info={"start_time": 10})

    self.assertFalse(rh.empty())
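# Toy sketch of how a run history might key the two add() calls in test_add
# above. The key layout is an assumption for illustration, not SMAC's actual
# RunKey structure: runs are stored per (configuration, instance, seed), so
# adding the same configuration with a different instance/seed pair creates a
# second entry instead of overwriting the first.
runs = {}


def add_run(config_id, instance_id, seed, cost):
    runs[(config_id, instance_id, seed)] = cost


add_run(1, None, None, 10)
add_run(1, 1, 12354, 10)
assert len(runs) == 2  # same config, different instance/seed -> two entries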
def run_smbo(self, max_iters=1000): global evaluator # == first things first: load the datamanager self.reset_data_manager() # == Initialize SMBO stuff # first create a scenario seed = self.seed # TODO num_params = len(self.config_space.get_hyperparameters()) # allocate a run history run_history = RunHistory() meta_runhistory = RunHistory() meta_runs_dataset_indices = {} num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL # == Train on subset # before doing anything, let us run the default_cfg # on a subset of the available data to ensure that # we at least have some models # we will try three different ratios of decreasing magnitude # in the hope that at least on the last one we will be able # to get a model n_data = self.datamanager.data['X_train'].shape[0] subset_ratio = 10000. / n_data if subset_ratio >= 0.5: subset_ratio = 0.33 subset_ratios = [subset_ratio, subset_ratio * 0.10] else: subset_ratios = [subset_ratio, 500. / n_data] self.logger.info("Training default configurations on a subset of " "%d/%d data points." % (int(n_data * subset_ratio), n_data)) # the time limit for these function evaluations is rigorously # set to only 1/2 of a full function evaluation subset_time_limit = max(5, int(self.func_eval_time_limit / 2)) # the configs we want to run on the data subset are: # 1) the default configs # 2) a set of configs we selected for training on a subset subset_configs = [self.config_space.get_default_configuration()] \ + self.collect_additional_subset_defaults() subset_config_succesful = [False] * len(subset_configs) for subset_config_id, next_config in enumerate(subset_configs): for i, ratio in enumerate(subset_ratios): self.reset_data_manager() n_data_subsample = int(n_data * ratio) # run the config, but throw away the result afterwards # since this cfg was evaluated only on a subset # and we don't want to confuse SMAC self.logger.info( "Starting to evaluate %d on SUBSET " "with size %d and time limit %ds.", num_run, n_data_subsample, subset_time_limit) self.logger.info(next_config) _info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, subset_time_limit, n_data_subsample) (duration, result, _, additional_run_info, status) = _info self.logger.info( "Finished evaluating %d. configuration on SUBSET. 
" "Duration %f; loss %f; status %s; additional run " "info: %s ", num_run, duration, result, str(status), additional_run_info) num_run += 1 if i < len(subset_ratios) - 1: if status != StatusType.SUCCESS: # Do not increase num_run here, because we will try # the same configuration with less data self.logger.info( "A CONFIG did not finish " " for subset ratio %f -> going smaller", ratio) continue else: self.logger.info( "Finished SUBSET training sucessfully " "with ratio %f", ratio) subset_config_succesful[subset_config_id] = True break else: if status != StatusType.SUCCESS: self.logger.info( "A CONFIG did not finish " " for subset ratio %f.", ratio) continue else: self.logger.info( "Finished SUBSET training sucessfully " "with ratio %f", ratio) subset_config_succesful[subset_config_id] = True break # Use the first non-failing configuration from the subsets as the new # default configuration -> this guards us against the random forest # failing on large, sparse datasets default_cfg = None for subset_config_id, next_config in enumerate(subset_configs): if subset_config_succesful[subset_config_id]: default_cfg = next_config break if default_cfg is None: default_cfg = self.config_space.get_default_configuration() # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again # and add the suggestions from metalearning behind it if self.metadata_directory is None: metalearning_directory = os.path.dirname( autosklearn.metalearning.__file__) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( metalearning_directory, 'files', '%s_%s_%s' % (METRIC_TO_STRING[self.metric], TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) self.metadata_directory = metadata_directory self.logger.info('Metadata directory: %s', self.metadata_directory) meta_base = MetaBase(self.config_space, self.metadata_directory) metafeature_calculation_time_limit = int(self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( metafeature_calculation_time_limit) metafeature_calculation_end_time = time.time() metafeature_calculation_time_limit = \ metafeature_calculation_time_limit - ( metafeature_calculation_end_time - metafeature_calculation_start_time) if metafeature_calculation_time_limit < 1: self.logger.warning( 'Time limit for metafeature calculation less ' 'than 1 seconds (%f). Skipping calculation ' 'of metafeatures for encoded dataset.', metafeature_calculation_time_limit) meta_features_encoded = None else: self.datamanager.perform1HotEncoding() meta_features_encoded = \ self._calculate_metafeatures_encoded_with_limits( metafeature_calculation_time_limit) # In case there is a problem calculating the encoded meta-features if meta_features is None: if meta_features_encoded is not None: meta_features = meta_features_encoded else: if meta_features_encoded is not None: meta_features.metafeature_values.update( meta_features_encoded.metafeature_values) if meta_features is not None: meta_base.add_dataset(instance_id, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
all_metafeatures = meta_base.get_metafeatures( features=list(meta_features.keys())) all_metafeatures.fillna(all_metafeatures.mean(), inplace=True) metalearning_configurations = self.collect_metalearning_suggestions( meta_base) if metalearning_configurations is None: metalearning_configurations = [] self.reset_data_manager() self.logger.info('%s', meta_features) # Convert meta-features into a dictionary because the scenario # expects a dictionary meta_features_dict = {} for dataset, series in all_metafeatures.iterrows(): meta_features_dict[dataset] = series.values meta_features_list = [] for meta_feature_name in all_metafeatures.columns: meta_features_list.append( meta_features[meta_feature_name].value) meta_features_list = np.array(meta_features_list).reshape((1, -1)) self.logger.info(list(meta_features_dict.keys())) meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) meta_runs_index = 0 try: meta_durations = meta_base.get_all_runs('runtime') read_runtime_data = True except KeyError: read_runtime_data = False self.logger.critical('Cannot read runtime data.') if self.acquisition_function == 'EIPS': self.logger.critical( 'Reverting to acquisition function EI!') self.acquisition_function = 'EI' for meta_dataset in meta_runs.index: meta_dataset_start_index = meta_runs_index for meta_configuration in meta_runs.columns: if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): try: config = meta_base.get_configuration_from_algorithm_index( meta_configuration) cost = meta_runs.loc[meta_dataset, meta_configuration] if read_runtime_data: runtime = meta_durations.loc[ meta_dataset, meta_configuration] else: runtime = 1 # TODO read out other status types! meta_runhistory.add(config, cost, runtime, StatusType.SUCCESS, instance_id=meta_dataset) meta_runs_index += 1 except: # TODO maybe add warning pass meta_runs_dataset_indices[meta_dataset] = ( meta_dataset_start_index, meta_runs_index) else: if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' meta_features_list = [] meta_features_dict = {} metalearning_configurations = [] self.scenario = AutoMLScenario(self.config_space, self.total_walltime_limit, self.func_eval_time_limit, meta_features_dict, self.tmp_dir, self.shared_mode) types = get_types(self.config_space, self.scenario.feature_array) if self.acquisition_function == 'EI': rh2EPM = RunHistory2EPM4Cost(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = RandomForestWithInstances( types, instance_features=meta_features_list, seed=1, num_trees=10) smac = SMBO(self.scenario, model=model, rng=seed) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = UncorrelatedMultiObjectiveRandomForestWithInstances( ['cost', 'runtime'], types, num_trees=10, instance_features=meta_features_list, seed=1) acquisition_function = EIPS(model) smac = SMBO(self.scenario, acquisition_function=acquisition_function, model=model, runhistory2epm=rh2EPM, rng=seed) else: raise ValueError('Unknown acquisition function value %s!' 
% self.acquisition_function) # Build a runtime model # runtime_rf = RandomForestWithInstances(types, # instance_features=meta_features_list, # seed=1, num_trees=10) # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, # scenario=self.scenario, # success_states=None, # impute_censored_data=False, # impute_state=None) # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory) # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten()) X_meta, Y_meta = rh2EPM.transform(meta_runhistory) # Transform Y_meta on a per-dataset base for meta_dataset in meta_runs_dataset_indices: start_index, end_index = meta_runs_dataset_indices[meta_dataset] end_index += 1 # Python indexing Y_meta[start_index:end_index, 0]\ [Y_meta[start_index:end_index, 0] >2.0] = 2.0 dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) Y_meta[start_index:end_index, 0] = 1 - ((1. - Y_meta[start_index:end_index, 0]) / (1. - dataset_minimum)) Y_meta[start_index:end_index, 0]\ [Y_meta[start_index:end_index, 0] > 2] = 2 # == first, evaluate all metelearning and default configurations for i, next_config in enumerate( ([default_cfg] + metalearning_configurations)): # Do not evaluate default configurations more than once if i >= len([default_cfg]) and next_config in [default_cfg]: continue config_name = 'meta-learning' if i >= len([default_cfg]) \ else 'default' self.logger.info( "Starting to evaluate %d. configuration " "(%s configuration) with time limit %ds.", num_run, config_name, self.func_eval_time_limit) self.logger.info(next_config) self.reset_data_manager() info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, self.func_eval_time_limit) (duration, result, _, additional_run_info, status) = info run_history.add(config=next_config, cost=result, time=duration, status=status, instance_id=instance_id, seed=seed) run_history.update_cost(next_config, result) self.logger.info( "Finished evaluating %d. configuration. " "Duration %f; loss %f; status %s; additional run " "info: %s ", num_run, duration, result, str(status), additional_run_info) num_run += 1 if smac.incumbent is None: smac.incumbent = next_config elif result < run_history.get_cost(smac.incumbent): smac.incumbent = next_config if self.scenario.shared_model: pSMAC.write(run_history=run_history, output_directory=self.scenario.output_dir, num_run=self.seed) # == after metalearning run SMAC loop smac.runhistory = run_history smac_iter = 0 finished = False while not finished: if self.scenario.shared_model: pSMAC.read(run_history=run_history, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) next_configs = [] time_for_choose_next = -1 try: X_cfg, Y_cfg = rh2EPM.transform(run_history) if not run_history.empty(): # Update costs by normalization dataset_minimum = np.min(Y_cfg[:, 0]) Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / (1. - dataset_minimum)) Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 if len(X_meta) > 0 and len(X_cfg) > 0: pass #X_cfg = np.concatenate((X_meta, X_cfg)) #Y_cfg = np.concatenate((Y_meta, Y_cfg)) elif len(X_meta) > 0: X_cfg = X_meta.copy() Y_cfg = Y_meta.copy() elif len(X_cfg) > 0: X_cfg = X_cfg.copy() Y_cfg = Y_cfg.copy() else: raise ValueError( 'No training data for SMAC random forest!') self.logger.info('Using %d training points for SMAC.' 
% X_cfg.shape[0]) choose_next_start_time = time.time() next_configs_tmp = smac.choose_next( X_cfg, Y_cfg, num_interleaved_random=110, num_configurations_by_local_search=10, num_configurations_by_random_search_sorted=100) time_for_choose_next = time.time() - choose_next_start_time self.logger.info('Used %g seconds to find next ' 'configurations' % (time_for_choose_next)) next_configs.extend(next_configs_tmp) # TODO put Exception here! except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. Using random configuration!") next_config = self.config_space.sample_configuration() next_configs.append(next_config) models_fitted_this_iteration = 0 start_time_this_iteration = time.time() for next_config in next_configs: x_runtime = impute_inactive_values(next_config) x_runtime = impute_inactive_values(x_runtime).get_array() # predicted_runtime = runtime_rf.predict_marginalized_over_instances( # x_runtime.reshape((1, -1))) # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1 self.logger.info( "Starting to evaluate %d. configuration (from " "SMAC) with time limit %ds.", num_run, self.func_eval_time_limit) self.logger.info(next_config) self.reset_data_manager() info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, self.func_eval_time_limit) (duration, result, _, additional_run_info, status) = info run_history.add(config=next_config, cost=result, time=duration, status=status, instance_id=instance_id, seed=seed) run_history.update_cost(next_config, result) #self.logger.info('Predicted runtime %g, true runtime %g', # predicted_runtime, duration) # TODO add unittest to make sure everything works fine and # this does not get outdated! if smac.incumbent is None: smac.incumbent = next_config elif result < run_history.get_cost(smac.incumbent): smac.incumbent = next_config self.logger.info( "Finished evaluating %d. configuration. " "Duration: %f; loss: %f; status %s; additional " "run info: %s ", num_run, duration, result, str(status), additional_run_info) smac_iter += 1 num_run += 1 models_fitted_this_iteration += 1 time_used_this_iteration = time.time( ) - start_time_this_iteration if models_fitted_this_iteration >= 2 and \ time_for_choose_next > 0 and \ time_used_this_iteration > time_for_choose_next: break elif time_for_choose_next <= 0 and \ models_fitted_this_iteration >= 1: break elif models_fitted_this_iteration >= 50: break if max_iters is not None: finished = (smac_iter < max_iters) if self.scenario.shared_model: pSMAC.write(run_history=run_history, output_directory=self.scenario.output_dir, num_run=self.seed)
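# Stand-alone illustration of the per-dataset loss rescaling applied to
# Y_meta (and later to Y_cfg) inside run_smbo above: losses are clipped at 2,
# then mapped via y' = 1 - (1 - y) / (1 - y_min) so that the best loss of a
# dataset becomes 0 while a loss of 1 stays 1, and finally clipped at 2
# again. The numbers below are made up.
import numpy as np

y = np.array([0.10, 0.25, 0.60, 3.00])
y = np.minimum(y, 2.0)                        # first clipping step
dataset_minimum = np.min(y)
y = 1 - (1.0 - y) / (1.0 - dataset_minimum)   # best value maps to 0
y = np.minimum(y, 2.0)                        # second clipping step
print(y)  # [0.         0.16666667 0.55555556 2.        ]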
class SMAC4EPMOpimizer(AbstractOptimizer): def __init__(self, api_config, config_space, parallel_setting="LS"): super(SMAC4EPMOpimizer, self).__init__(api_config) self.cs = config_space self.num_hps = len(self.cs.get_hyperparameters()) if parallel_setting not in ["CL_min", "CL_max", "CL_mean", "KB", "LS"]: raise ValueError( "parallel_setting can only be one of the following: " "CL_min, CL_max, CL_mean, KB, LS") self.parallel_setting = parallel_setting rng = np.random.RandomState(seed=0) scenario = Scenario({ "run_obj": "quality", # we optimize quality (alt. to runtime) "runcount-limit": 128, "cs": self.cs, # configuration space "deterministic": True, "limit_resources": False, }) self.stats = Stats(scenario) # traj = TrajLogger(output_dir=None, stats=self.stats) self.runhistory = RunHistory() r2e_def_kwargs = { "scenario": scenario, "num_params": self.num_hps, "success_states": [ StatusType.SUCCESS, ], "impute_censored_data": False, "scale_perc": 5, } self.random_chooser = ChooserProb(rng=rng, prob=0.0) types, bounds = get_types(self.cs, instance_features=None) model_kwargs = { "configspace": self.cs, "types": types, "bounds": bounds, "seed": rng.randint(MAXINT), } models = [] cov_amp = ConstantKernel( 2.0, constant_value_bounds=(np.exp(-10), np.exp(2)), prior=LognormalPrior(mean=0.0, sigma=1.0, rng=rng), ) cont_dims = np.array(np.where(np.array(types) == 0)[0], dtype=np.int) cat_dims = np.where(np.array(types) != 0)[0] if len(cont_dims) > 0: exp_kernel = Matern( np.ones([len(cont_dims)]), [(np.exp(-6.754111155189306), np.exp(0.0858637988771976)) for _ in range(len(cont_dims))], nu=2.5, operate_on=cont_dims, ) if len(cat_dims) > 0: ham_kernel = HammingKernel( np.ones([len(cat_dims)]), [(np.exp(-6.754111155189306), np.exp(0.0858637988771976)) for _ in range(len(cat_dims))], operate_on=cat_dims, ) assert len(cont_dims) + len(cat_dims) == len( scenario.cs.get_hyperparameters()) noise_kernel = WhiteKernel( noise_level=1e-8, noise_level_bounds=(np.exp(-25), np.exp(2)), prior=HorseshoePrior(scale=0.1, rng=rng), ) if len(cont_dims) > 0 and len(cat_dims) > 0: # both kernel = cov_amp * (exp_kernel * ham_kernel) + noise_kernel elif len(cont_dims) > 0 and len(cat_dims) == 0: # only cont kernel = cov_amp * exp_kernel + noise_kernel elif len(cont_dims) == 0 and len(cat_dims) > 0: # only cont kernel = cov_amp * ham_kernel + noise_kernel else: raise ValueError() gp_kwargs = {"kernel": kernel} rf_kwargs = {} rf_kwargs["num_trees"] = model_kwargs.get("num_trees", 10) rf_kwargs["do_bootstrapping"] = model_kwargs.get( "do_bootstrapping", True) rf_kwargs["ratio_features"] = model_kwargs.get("ratio_features", 1.0) rf_kwargs["min_samples_split"] = model_kwargs.get( "min_samples_split", 2) rf_kwargs["min_samples_leaf"] = model_kwargs.get("min_samples_leaf", 1) rf_kwargs["log_y"] = model_kwargs.get("log_y", True) rf_log = RandomForestWithInstances(**model_kwargs, **rf_kwargs) rf_kwargs = copy.deepcopy(rf_kwargs) rf_kwargs["log_y"] = False rf_no_log = RandomForestWithInstances(**model_kwargs, **rf_kwargs) rh2epm_cost = RunHistory2EPM4Cost(**r2e_def_kwargs) rh2epm_log_cost = RunHistory2EPM4LogScaledCost(**r2e_def_kwargs) rh2epm_copula = RunHistory2EPM4GaussianCopulaCorrect(**r2e_def_kwargs) self.combinations = [] # 2 models * 4 acquisition functions acq_funcs = [EI, PI, LogEI, LCB] acq_func_instances = [] # acq_func_maximizer_instances = [] n_sls_iterations = { 1: 10, 2: 10, 3: 10, 4: 10, 5: 10, 6: 10, 7: 8, 8: 6, }.get(len(self.cs.get_hyperparameters()), 5) acq_func_maximizer_kwargs = { "config_space": self.cs, 
"rng": rng, "max_steps": 5, "n_steps_plateau_walk": 5, "n_sls_iterations": n_sls_iterations, } self.idx_ei = 0 self.num_models = len(models) self.num_acq_funcs = len(acq_funcs) no_transform_gp = GaussianProcess(**copy.deepcopy(model_kwargs), **copy.deepcopy(gp_kwargs)) ei = EI(model=no_transform_gp) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((no_transform_gp, ei, ei_opt, rh2epm_cost)) pi = PI(model=no_transform_gp) acq_func_maximizer_kwargs["acquisition_function"] = pi pi_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((no_transform_gp, pi, pi_opt, rh2epm_cost)) lcb = LCB(model=no_transform_gp) acq_func_maximizer_kwargs["acquisition_function"] = lcb lcb_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((no_transform_gp, lcb, lcb_opt, rh2epm_cost)) gp = GaussianProcess(**copy.deepcopy(model_kwargs), **copy.deepcopy(gp_kwargs)) ei = EI(model=gp) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((gp, ei, ei_opt, rh2epm_copula)) gp = GaussianProcess(**copy.deepcopy(model_kwargs), **copy.deepcopy(gp_kwargs)) ei = LogEI(model=gp) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((gp, ei, ei_opt, rh2epm_log_cost)) ei = EI(model=rf_no_log) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((rf_no_log, ei, ei_opt, rh2epm_cost)) ei = LogEI(model=rf_log) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((rf_log, ei, ei_opt, rh2epm_log_cost)) ei = EI(model=rf_no_log) acq_func_maximizer_kwargs["acquisition_function"] = ei ei_opt = LocalAndSortedRandomSearch(**acq_func_maximizer_kwargs) self.combinations.append((rf_no_log, ei, ei_opt, rh2epm_copula)) self.num_acq_instances = len(acq_func_instances) self.best_observation = np.inf self.next_evaluations = [] def suggest(self, n_suggestions: int = 1) -> typing.List[typing.Dict]: """Get a suggestion from the optimizer. Parameters ---------- n_suggestions : int Desired number of parallel suggestions in the output Returns ------- next_guess : list of dict List of `n_suggestions` suggestions to evaluate the objective function. Each suggestion is a dictionary where each key corresponds to a parameter being optimized. CHANGED: each suggestion is a tuple of suggestion and string info! 
""" all_previous_configs = self.runhistory.get_all_configs() num_points = len(all_previous_configs) # we will save our info info_list = [] if len(self.next_evaluations) < n_suggestions: n_new = n_suggestions - len(self.next_evaluations) # import time order = np.random.permutation(list(range(len(self.combinations)))) optimized_this_iter = set() while len(self.next_evaluations) < n_new: model, acq, acq_opt, rh2epm = self.combinations[order[len( self.next_evaluations)]] # start_time = time.time() info = "" if model.__class__ == RandomForestWithInstances: info += "RF" elif model.__class__ == GaussianProcess: info += "GP" else: raise ValueError(model.__class__.name) info += f" {acq.__class__.__name__}" if rh2epm.__class__ == RunHistory2EPM4Cost: info += " cost" elif rh2epm.__class__ == RunHistory2EPM4LogScaledCost: info += " log_cost" elif rh2epm.__class__ == RunHistory2EPM4GaussianCopulaCorrect: info += " copula" else: raise ValueError(rh2epm.__classs__.name__) # print(model.__class__.__name__, # acq.__class__.__name__, # rh2epm.__class__.__name__) X, y = rh2epm.transform(self.runhistory) # If all are not finite then we return nothing if np.all(~np.isfinite(y)): self.next_evaluations = [] return [] # Safeguard, just in case... if np.any(~np.isfinite(y)): y[~np.isfinite(y)] = np.max(y[np.isfinite(y)]) if (self.parallel_setting != "LS" and len(self.next_evaluations) != 0): x_inc = np.array([ next_config.get_array() for next_config in self.next_evaluations ]) if self.parallel_setting == "CL_min": y_inc = np.min(y) elif self.parallel_setting == "CL_max": y_inc = np.max(y) elif self.parallel_setting == "CL_mean": y_inc = np.mean(y) elif self.parallel_setting == "KB": if model in optimized_this_iter and isinstance( model, GaussianProcess): # Safe some time by re-using the optimized # hyperparameters from before model._train(X, y, do_optimize=False) else: model.train(X, y) optimized_this_iter.add(model) y_inc, var = model.predict_marginalized_over_instances( x_inc) y_inc = y_inc.flatten() else: raise ValueError( "parallel_setting can only be one of the " "following: CL_min, CL_max, CL_mean, KB, LS") if self.parallel_setting in ("CL_min", "CL_max", "CL_mean"): # NOQA y_inc = np.repeat(y_inc, len(self.next_evaluations)).reshape( (-1, 1)) else: y_inc = y_inc.reshape((-1, 1)) X = np.concatenate((X, x_inc)) y = np.concatenate((y, y_inc)) if (isinstance(model, GaussianProcess) and self.parallel_setting == "KB"): # Safe some time by re-using the optimized # hyperparameters from above model._train(X, y, do_optimize=False) else: model.train(X, y) # As the training data for each subsequent model # changes quite drastically (taking the max of all # observations can create really disconnected error # landscapes in the region of the optimum) we have # to re-optimize the hyperparameters here and cannot # add the model to the set of previously # optimized models. 
# optimized_this_iter.add(model) else: model.train(X, y) optimized_this_iter.add(model) predictions = model.predict_marginalized_over_instances(X)[0] best_index = np.argmin(predictions) best_observation = predictions[best_index] x_best_array = X[best_index] acq.update( model=model, eta=best_observation, incumbent_array=x_best_array, num_data=num_points, X=X, ) new_config_iterator = acq_opt.maximize( runhistory=self.runhistory, stats=self.stats, num_points=10000, random_configuration_chooser=self.random_chooser, ) accept = False for next_config in new_config_iterator: if (next_config in self.next_evaluations or next_config in all_previous_configs): continue else: accept = True break if not accept: # If we don't find anything within 100 random # configurations, we re-run a configuration for next_config in self.cs.sample_configuration(100): if (next_config not in self.next_evaluations or next_config in all_previous_configs): break self.next_evaluations.append(next_config) info_list.append(info) # print(time.time() - start_time) next_guess = [{} for _ in range(n_suggestions)] while len(self.next_evaluations) < len(range(n_suggestions)): self.next_evaluations.append(self.cs.sample_configuration()) info_list.append("Random") for i in range(n_suggestions): eval_next = self.next_evaluations.pop(0) next_guess[i] = (eval_next.get_dictionary(), info_list[i]) return next_guess def init_with_rh(self, rh, iteration): self.runhistory.empty() for rh_value in rh: configuration = Configuration(configuration_space=self.cs, values=rh_value[0]) self.runhistory.add( config=configuration, cost=rh_value[1], time=0, status=StatusType.SUCCESS, ) def observe(self, X, y): """Feed an observation back. Parameters ---------- X : list of dict-like Places where the objective function has already been evaluated. Each suggestion is a dictionary 使用where each key corresponds to a parameter being optimized. y : array-like, shape (n,) Corresponding values where objective has been evaluated """ for xx, yy in zip(X, y): configuration = Configuration(configuration_space=self.cs, values=xx) self.runhistory.add(config=configuration, cost=yy, time=0, status=StatusType.SUCCESS)
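# Hedged usage sketch of the ask/tell loop around SMAC4EPMOpimizer defined
# above. `optimizer` and `objective` are assumed to be supplied by the
# surrounding benchmark harness; only the suggest()/observe() call pattern is
# taken from the class itself (each suggestion is a (param_dict, info) tuple).
def run_ask_tell_loop(optimizer, objective, n_iterations=16, n_suggestions=8):
    for _ in range(n_iterations):
        suggestions = optimizer.suggest(n_suggestions=n_suggestions)
        if not suggestions:
            break  # suggest() can return an empty list, e.g. without finite observations
        params = [param_dict for param_dict, _info in suggestions]
        losses = [objective(p) for p in params]
        optimizer.observe(params, losses)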
def _get_initial_points(
    self,
    num_points: int,
    runhistory: RunHistory,
    additional_start_points: Optional[List[Tuple[float, Configuration]]],
) -> List[Configuration]:
    if runhistory.empty():
        init_points = self.config_space.sample_configuration(size=num_points)
    else:
        # initiate local search
        configs_previous_runs = runhistory.get_all_configs()

        # configurations with the highest previous EI
        configs_previous_runs_sorted = self._sort_configs_by_acq_value(
            configs_previous_runs)
        configs_previous_runs_sorted = [
            conf[1] for conf in configs_previous_runs_sorted[:num_points]
        ]

        # configurations with the lowest predictive cost,
        # check for None to make unit tests work
        if self.acquisition_function.model is not None:
            conf_array = convert_configurations_to_array(configs_previous_runs)
            costs = self.acquisition_function.model.predict_marginalized_over_instances(
                conf_array)[0]
            assert len(conf_array) == len(costs), (conf_array.shape, costs.shape)

            # In case of the predictive model returning the prediction for more than one
            # objective per configuration (for example multi-objective or EIPS) it is not
            # immediately clear how to sort according to the cost of a configuration.
            # Therefore, we simply follow the ParEGO approach and use a random scalarization.
            if len(costs.shape) == 2 and costs.shape[1] > 1:
                weights = np.array([self.rng.rand() for _ in range(costs.shape[1])])
                weights = weights / np.sum(weights)
                costs = costs @ weights

            # From here:
            # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values
            random = self.rng.rand(len(costs))
            # Last column is primary sort key!
            indices = np.lexsort((random.flatten(), costs.flatten()))
            # Cannot use zip here because the indices array cannot index the
            # rand_configs list, because the second is a pure python list
            configs_previous_runs_sorted_by_cost = [
                configs_previous_runs[ind] for ind in indices
            ][:num_points]
        else:
            configs_previous_runs_sorted_by_cost = []

        if additional_start_points is not None:
            additional_start_points = [
                asp[1] for asp in additional_start_points[:num_points]
            ]
        else:
            additional_start_points = []

        init_points = []
        init_points_as_set = set()  # type: Set[Configuration]
        for cand in itertools.chain(
                configs_previous_runs_sorted,
                configs_previous_runs_sorted_by_cost,
                additional_start_points,
        ):
            if cand not in init_points_as_set:
                init_points.append(cand)
                init_points_as_set.add(cand)

    return init_points
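# Small demonstration of the ParEGO-style random scalarization used above
# when the model predicts several objectives per configuration: draw a random
# weight vector, normalize it to sum to one, and collapse the cost matrix to
# a single column via a weighted sum. The cost values are made up.
import numpy as np

rng = np.random.RandomState(0)
costs = np.array([[0.2, 10.0],
                  [0.4, 2.0],
                  [0.3, 5.0]])           # shape (n_configs, n_objectives)
weights = rng.rand(costs.shape[1])
weights = weights / np.sum(weights)      # convex combination of objectives
scalarized = costs @ weights             # shape (n_configs,)
print(scalarized)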
class SMBO(BaseSolver): def __init__(self, scenario, tae_runner=None, acquisition_function=None, model=None, runhistory2epm=None, stats=None, rng=None): ''' Interface that contains the main Bayesian optimization loop Parameters ---------- scenario: smac.scenario.scenario.Scenario Scenario object tae_runner: object object that implements the following method to call the target algorithm (or any other arbitrary function): run(self, config) If not set, it will be initialized with the tae.ExecuteTARunOld() acquisition_function : AcquisitionFunction Object that implements the AbstractAcquisitionFunction. Will use EI if not set. model : object Model that implements train() and predict(). Will use a RandomForest if not set. runhistory2epm : RunHistory2EMP Object that implements the AbstractRunHistory2EPM. If None, will use RunHistory2EPM4Cost if objective is cost or RunHistory2EPM4LogCost if objective is runtime. stats: Stats optional stats object rng: numpy.random.RandomState Random number generator ''' if stats: self.stats = stats else: self.stats = Stats(scenario) self.runhistory = RunHistory() self.logger = logging.getLogger("smbo") if rng is None: self.num_run = np.random.randint(1234567980) self.rng = np.random.RandomState(seed=self.num_run) elif isinstance(rng, int): self.num_run = rng self.rng = np.random.RandomState(seed=rng) elif isinstance(rng, np.random.RandomState): self.num_run = rng.randint(1234567980) self.rng = rng else: raise TypeError('Unknown type %s for argument rng. Only accepts ' 'None, int or np.random.RandomState' % str(type(rng))) self.scenario = scenario self.config_space = scenario.cs self.traj_logger = TrajLogger(output_dir=self.scenario.output_dir, stats=self.stats) self.types = get_types(self.config_space, scenario.feature_array) if model is None: self.model = RandomForestWithInstances( self.types, scenario.feature_array, seed=self.rng.randint(1234567980)) else: self.model = model if acquisition_function is None: self.acquisition_func = EI(self.model) else: self.acquisition_func = acquisition_function self.local_search = LocalSearch(self.acquisition_func, self.config_space) self.incumbent = None if tae_runner is None: self.executor = ExecuteTARunOld(ta=scenario.ta, stats=self.stats, run_obj=scenario.run_obj, par_factor=scenario.par_factor) else: self.executor = tae_runner self.inten = Intensifier( executor=self.executor, stats=self.stats, traj_logger=self.traj_logger, instances=self.scenario.train_insts, cutoff=self.scenario.cutoff, deterministic=self.scenario.deterministic, run_obj_time=self.scenario.run_obj == "runtime", instance_specifics=self.scenario.instance_specific) num_params = len(self.config_space.get_hyperparameters()) self.objective = average_cost if self.scenario.run_obj == "runtime": if runhistory2epm is None: # if we log the performance data, # the RFRImputator will already get # log transform data from the runhistory cutoff = np.log10(self.scenario.cutoff) threshold = np.log10(self.scenario.cutoff * self.scenario.par_factor) imputor = RFRImputator(cs=self.config_space, rs=self.rng, cutoff=cutoff, threshold=threshold, model=self.model, change_threshold=0.01, max_iter=10) self.rh2EPM = RunHistory2EPM4LogCost(scenario=self.scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, ], impute_censored_data=True, impute_state=[ StatusType.TIMEOUT, ], imputor=imputor) else: self.rh2EPM = runhistory2epm elif self.scenario.run_obj == 'quality': if runhistory2epm is None: self.rh2EPM = RunHistory2EPM4Cost\ (scenario=self.scenario, 
num_params=num_params, success_states=[StatusType.SUCCESS, ], impute_censored_data=False, impute_state=None) else: self.rh2EPM = runhistory2epm else: raise ValueError('Unknown run objective: %s. Should be either ' 'quality or runtime.' % self.scenario.run_obj) def run_initial_design(self): ''' runs algorithm runs for a initial design; default implementation: running the default configuration on a random instance-seed pair Side effect: adds runs to self.runhistory Returns ------- incumbent: Configuration() initial incumbent configuration ''' default_conf = self.config_space.get_default_configuration() self.incumbent = default_conf # add this incumbent right away to have an entry to time point 0 self.traj_logger.add_entry(train_perf=2**31, incumbent_id=1, incumbent=self.incumbent) rand_inst_id = self.rng.randint(0, len(self.scenario.train_insts)) # ignore instance specific values rand_inst = self.scenario.train_insts[rand_inst_id] if self.scenario.deterministic: initial_seed = 0 else: initial_seed = random.randint(0, MAXINT) status, cost, runtime, additional_info = self.executor.start( default_conf, instance=rand_inst, cutoff=self.scenario.cutoff, seed=initial_seed, instance_specific=self.scenario.instance_specific.get( rand_inst, "0")) if status in [StatusType.CRASHED or StatusType.ABORT]: self.logger.critical("First run crashed -- Abort") sys.exit(1) self.runhistory.add(config=default_conf, cost=cost, time=runtime, status=status, instance_id=rand_inst, seed=initial_seed, additional_info=additional_info) defaul_inst_seeds = set( self.runhistory.get_runs_for_config(default_conf)) default_perf = self.objective(default_conf, self.runhistory, defaul_inst_seeds) self.runhistory.update_cost(default_conf, default_perf) self.stats.inc_changed += 1 # first incumbent self.traj_logger.add_entry(train_perf=default_perf, incumbent_id=self.stats.inc_changed, incumbent=self.incumbent) return default_conf def run(self, max_iters=10): ''' Runs the Bayesian optimization loop for max_iters iterations Parameters ---------- max_iters: int The maximum number of iterations Returns ---------- incumbent: np.array(1, H) The best found configuration ''' self.stats.start_timing() #self.runhistory = RunHisory() self.incumbent = self.run_initial_design() # Main BO loop iteration = 1 while True: if self.scenario.shared_model: pSMAC.read(run_history=self.runhistory, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) start_time = time.time() X, Y = self.rh2EPM.transform(self.runhistory) self.logger.debug("Search for next configuration") # get all found configurations sorted according to acq challengers = self.choose_next(X, Y) time_spend = time.time() - start_time logging.debug( "Time spend to choose next configurations: %.2f sec" % (time_spend)) self.logger.debug("Intensify") self.incumbent, inc_perf = self.inten.intensify( challengers=challengers, incumbent=self.incumbent, run_history=self.runhistory, objective=self.objective, time_bound=max(0.01, time_spend)) # TODO: Write run history into database if self.scenario.shared_model: pSMAC.write(run_history=self.runhistory, output_directory=self.scenario.output_dir, num_run=self.num_run) if iteration == max_iters: break iteration += 1 logging.debug( "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)" % (self.stats.get_remaing_time_budget(), self.stats.get_remaining_ta_budget(), self.stats.get_remaining_ta_runs())) if self.stats.is_budget_exhausted(): break self.stats.print_stats(debug_out=True) return 
self.incumbent def choose_next(self, X, Y, num_interleaved_random=1010, num_configurations_by_random_search_sorted=1000, num_configurations_by_local_search=10): """Choose next candidate solution with Bayesian optimization. Parameters ---------- X : (N, D) numpy array Each row contains a configuration and one set of instance features. Y : (N, O) numpy array The function values for each configuration instance pair. Returns ------- list List of 2020 suggested configurations to evaluate. """ self.model.train(X, Y) if self.runhistory.empty(): incumbent_value = 0.0 elif self.incumbent is None: # TODO try to calculate an incumbent from the runhistory! incumbent_value = 0.0 else: incumbent_value = self.runhistory.get_cost(self.incumbent) self.acquisition_func.update(model=self.model, eta=incumbent_value) # Remove dummy acquisition function value next_configs_by_random_search = [ x[1] for x in self._get_next_by_random_search( num_points=num_interleaved_random) ] # Get configurations sorted by EI next_configs_by_random_search_sorted = \ self._get_next_by_random_search( num_configurations_by_random_search_sorted, _sorted=True) next_configs_by_local_search = \ self._get_next_by_local_search(num_configurations_by_local_search) next_configs_by_acq_value = next_configs_by_random_search_sorted + \ next_configs_by_local_search next_configs_by_acq_value.sort(reverse=True, key=lambda x: x[0]) self.logger.debug( "First 10 acq func values of selected configurations: %s" % (str([_[0] for _ in next_configs_by_acq_value[:10]]))) next_configs_by_acq_value = [_[1] for _ in next_configs_by_acq_value] challengers = list( itertools.chain(*zip(next_configs_by_acq_value, next_configs_by_random_search))) return challengers def _get_next_by_random_search(self, num_points=1000, _sorted=False): """Get candidate solutions via local search. Parameters ---------- num_points : int, optional (default=10) Number of local searches and returned values. _sorted : bool, optional (default=True) Whether to sort the candidate solutions by acquisition function value. Returns ------- list : (acquisition value, Candidate solutions) """ rand_configs = self.config_space.sample_configuration(size=num_points) if _sorted: imputed_rand_configs = map(ConfigSpace.util.impute_inactive_values, rand_configs) imputed_rand_configs = [ x.get_array() for x in imputed_rand_configs ] imputed_rand_configs = np.array(imputed_rand_configs, dtype=np.float64) acq_values = self.acquisition_func(imputed_rand_configs) # From here # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values random = self.rng.rand(len(acq_values)) # Last column is primary sort key! indices = np.lexsort((random.flatten(), acq_values.flatten())) for i in range(len(rand_configs)): rand_configs[i].origin = 'Random Search (sorted)' # Cannot use zip here because the indices array cannot index the # rand_configs list, because the second is a pure python list return [(acq_values[ind][0], rand_configs[ind]) for ind in indices[::-1]] else: for i in range(len(rand_configs)): rand_configs[i].origin = 'Random Search' return [(0, rand_configs[i]) for i in range(len(rand_configs))] def _get_next_by_local_search(self, num_points=10): """Get candidate solutions via local search. In case acquisition function values tie, these will be broken randomly. Parameters ---------- num_points : int, optional (default=10) Number of local searches and returned values. 
Returns ------- list : (acquisition value, Candidate solutions), ordered by their acquisition function value """ configs_acq = [] # Start N local search from different random start points for i in range(num_points): if i == 0 and self.incumbent is not None: start_point = self.incumbent else: start_point = self.config_space.sample_configuration() configuration, acq_val = self.local_search.maximize(start_point) configuration.origin = 'Local Search' configs_acq.append((acq_val[0][0], configuration)) # shuffle for random tie-break random.shuffle(configs_acq, self.rng.rand) # sort according to acq value # and return n best configurations configs_acq.sort(reverse=True, key=lambda x: x[0]) return configs_acq
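# Stand-alone illustration of the challenger interleaving built at the end of
# choose_next() above: configurations sorted by acquisition value are
# interleaved one-by-one with purely random configurations via
# itertools.chain(*zip(...)). Note that zip stops at the shorter of the two
# lists. The placeholder strings stand in for Configuration objects.
import itertools

by_acq_value = ['acq_1', 'acq_2', 'acq_3']
by_random = ['rand_1', 'rand_2', 'rand_3']
challengers = list(itertools.chain(*zip(by_acq_value, by_random)))
print(challengers)  # ['acq_1', 'rand_1', 'acq_2', 'rand_2', 'acq_3', 'rand_3']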