def setUp(self):
    self.cwd = os.getcwd()
    data_dir = os.path.dirname(__file__)
    data_dir = os.path.join(data_dir, 'test_meta_base_data')
    os.chdir(data_dir)

    cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
        .get_hyperparameter_search_space()

    self.base = MetaBase(cs, data_dir)
def get_autosklearn_metalearning(X_train, y_train, cat, metric,
                                 num_initial_configurations):
    task_id = "new_task"
    is_sparse = scipy.sparse.issparse(X_train)
    dataset_properties = {
        'signed': True,
        'multiclass': False if len(np.unique(y_train)) == 2 else True,
        'task': 1 if len(np.unique(y_train)) == 2 else 2,
        'sparse': is_sparse,
        'is_sparse': is_sparse,
        'target_type': 'classification',
        'multilabel': False
    }
    config_space = pipeline.get_configuration_space(
        dataset_properties, None, None, None, None)

    metalearning_dir = os.path.join(
        os.path.dirname(metalearning.__file__), "files",
        "balanced_accuracy_{0}.classification_{1}".format(
            "multiclass" if dataset_properties["multiclass"] else "binary",
            "sparse" if dataset_properties["sparse"] else "dense"))

    metabase = MetaBase(config_space, metalearning_dir)

    meta_features = None
    try:
        rvals, sparse = perform_one_hot_encoding(
            dataset_properties["sparse"],
            [c in ['categorical'] for c in cat],
            [X_train])
        meta_features = _calculate_metafeatures_encoded__(
            task_id, rvals[0], y_train)
        X_train = rvals
    except:
        meta_features = _calculate_metafeatures__(
            cat, MULTICLASS_CLASSIFICATION, task_id, X_train, y_train)

    if meta_features is None:
        raise Exception("Error calculating metafeatures")

    metabase.add_dataset(task_id, meta_features)
    configs, list_nn = suggest_via_metalearning_(
        metabase, task_id, metric,
        2 if dataset_properties["multiclass"] else 1,
        False, num_initial_configurations)
    return configs, list_nn
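# --- Usage sketch (assumption-laden, not part of the snippet above): how the
# --- helper might be called. The toy data, the column types, the
# --- 'balanced_accuracy' metric string and the number of warm-start
# --- configurations are made up purely for illustration.
import numpy as np

X = np.random.rand(100, 4)
y = np.random.randint(0, 2, size=100)
column_types = ['numerical', 'numerical', 'categorical', 'numerical']

configs, neighbours = get_autosklearn_metalearning(
    X, y, column_types,
    metric='balanced_accuracy',
    num_initial_configurations=25)

for config in configs:
    print(config)      # suggested Configuration objects, best-first
print(neighbours)      # identifiers of the nearest meta-datasets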
def setUp(self):
    self.cwd = os.getcwd()
    data_dir = os.path.dirname(__file__)
    data_dir = os.path.join(data_dir, 'test_meta_base_data')
    os.chdir(data_dir)

    cs = ParamSklearn.classification.ParamSklearnClassifier\
        .get_hyperparameter_search_space()

    self.base = MetaBase(cs, data_dir)
def __init__(self, dataset_name, configuration_space, aslib_directory,
             distance='l1', seed=None, use_features='',
             distance_kwargs=None, subset='all'):
    """Metalearning optimizer.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset

    configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

    aslib_directory : str
        Path to an aslib directory

    distance : str, "l1" or "l2" or "random"
        Distance function to be used by the kNearestDatasets algorithm.

    seed

    use_features

    distance_kwargs

    subset
    """
    self.dataset_name = dataset_name
    self.configuration_space = configuration_space
    self.aslib_dir = aslib_directory
    self.distance = distance
    self.seed = seed
    self.use_features = use_features
    self.distance_kwargs = distance_kwargs
    self.subset = subset
    self.kND = None  # For caching, makes things faster...

    self.meta_base = MetaBase(configuration_space, self.aslib_dir)
    self.logger = logging.getLogger(__name__)
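# --- Usage sketch (hedged): instantiating the optimizer whose constructor is
# --- shown above and asking it for configurations from the nearest
# --- neighbouring datasets. The dataset name '233' and the aslib directory
# --- path are placeholders; metalearning_suggest_all() is the method shown
# --- further down in this section.
import logging

from autosklearn.pipeline import classification

logging.basicConfig(level=logging.INFO)

config_space = classification.SimpleClassificationPipeline()\
    .get_hyperparameter_search_space()

# MetaLearningOptimizer as defined in the surrounding snippets; the exact
# import path differs between auto-sklearn versions.
optimizer = MetaLearningOptimizer(
    dataset_name='233',
    configuration_space=config_space,
    aslib_directory='/path/to/metalearning/files',
    distance='l1',
    seed=1)

suggestions = optimizer.metalearning_suggest_all()
print(suggestions[:5])   # best configurations of the closest datasets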
def setUp(self):
    self.cwd = os.getcwd()
    data_dir = os.path.dirname(__file__)
    data_dir = os.path.join(data_dir, 'test_meta_base_data')
    os.chdir(data_dir)

    cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
        .get_hyperparameter_search_space()

    self.base = MetaBase(cs, data_dir)
def setUp(self):
    self.cwd = os.getcwd()
    data_dir = os.path.dirname(__file__)
    data_dir = os.path.join(data_dir, 'test_meta_base_data')
    os.chdir(data_dir)

    self.cs = autosklearn.pipeline.classification\
        .SimpleClassificationPipeline().get_hyperparameter_search_space()

    meta_base = MetaBase(self.cs, data_dir)
    self.meta_optimizer = metalearner.MetaLearningOptimizer(
        '233', self.cs, meta_base)
class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((134, 24), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('38_acc')
        # TODO update this ASAP
        self.assertEqual(24, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_as_pandas(self):
        mf = self.base.get_metafeatures('38_acc')
        self.assertTrue(np.isfinite(mf).all())
        self.assertEqual(type(mf), pd.Series)
        self.assertEqual(mf.name, u'38_acc')
        self.assertEqual(mf.loc['NumberOfInstances'], 2527.0)

    def test_get_all_metafeatures_as_pandas(self):
        mf = self.base.get_all_metafeatures()
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual((140, 46), mf.shape)
class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
            .get_hyperparameter_search_space()

        self.logger = logging.getLogger()
        self.base = MetaBase(cs, data_dir, logger=self.logger)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((125, 125), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('233')
        # TODO update this ASAP
        self.assertEqual(125, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_single_dataset(self):
        mf = self.base.get_metafeatures('233')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.name, '233')
        self.assertEqual(mf.loc['NumberOfInstances'], 2142.0)

    def test_get_metafeatures_single_feature(self):
        mf = self.base.get_metafeatures(features='NumberOfInstances')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.shape, (132, ))

    def test_get_metafeatures_single_dataset_and_single_feature(self):
        mf = self.base.get_metafeatures('233', features='NumberOfInstances')
        self.assertEqual(mf.shape, ())

    def test_get_metafeatures_multiple_datasets(self):
        mf = self.base.get_metafeatures(['233', '236'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (2, 46))

    def test_get_metafeatures_multiple_features(self):
        mf = self.base.get_metafeatures(
            features=['NumberOfInstances', 'NumberOfClasses'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (132, 2))
# args['metalearning_directory'])
with open(args["task_files_list"]) as fh:
    task_files_list = fh.readlines()
with open(args["experiments_list"]) as fh:
    experiments_list = fh.readlines()

if 'keep_configurations' in args:
    keep_configurations = args['keep_configurations']
    keep_configurations = keep_configurations.split(',')
    keep_configurations = tuple(
        [tuple(kc.split('=')) for kc in keep_configurations])
else:
    keep_configurations = None

meta_base = MetaBase(task_files_list, experiments_list, keep_configurations)
metafeatures = meta_base.get_all_train_metafeatures_as_pandas()
runs = meta_base.get_all_runs()

# This can print the best hyperparameters of every dataset
# for dataset in runs:
#     print dataset, sorted(runs[dataset], key=lambda t: t.result)[0]

rf = LearnedDistanceRF(**params)
X, Y = rf._create_dataset(metafeatures, runs)

import cPickle
with open("test.pkl", "w") as fh:
    cPickle.dump((X, Y, metafeatures), fh, -1)

print("Metafeatures", metafeatures.shape)
def run_smbo(self): self.watcher.start_task('SMBO') # == first things first: load the datamanager self.reset_data_manager() # == Initialize non-SMBO stuff # first create a scenario seed = self.seed self.config_space.seed(seed) num_params = len(self.config_space.get_hyperparameters()) # allocate a run history num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL # Initialize some SMAC dependencies runhistory = RunHistory(aggregate_func=average_cost) # meta_runhistory = RunHistory(aggregate_func=average_cost) # meta_runs_dataset_indices = {} # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again # and add the suggestions from metalearning behind it if self.num_metalearning_cfgs > 0: if self.metadata_directory is None: metalearning_directory = os.path.dirname( autosklearn.metalearning.__file__) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( metalearning_directory, 'files', '%s_%s_%s' % (METRIC_TO_STRING[self.metric], TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) self.metadata_directory = metadata_directory self.logger.info('Metadata directory: %s', self.metadata_directory) meta_base = MetaBase(self.config_space, self.metadata_directory) metafeature_calculation_time_limit = int( self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( metafeature_calculation_time_limit) metafeature_calculation_end_time = time.time() metafeature_calculation_time_limit = \ metafeature_calculation_time_limit - ( metafeature_calculation_end_time - metafeature_calculation_start_time) if metafeature_calculation_time_limit < 1: self.logger.warning( 'Time limit for metafeature calculation less ' 'than 1 seconds (%f). Skipping calculation ' 'of metafeatures for encoded dataset.', metafeature_calculation_time_limit) meta_features_encoded = None else: with warnings.catch_warnings(): warnings.showwarning = self._send_warnings_to_log self.datamanager.perform1HotEncoding() meta_features_encoded = \ self._calculate_metafeatures_encoded_with_limits( metafeature_calculation_time_limit) # In case there is a problem calculating the encoded meta-features if meta_features is None: if meta_features_encoded is not None: meta_features = meta_features_encoded else: if meta_features_encoded is not None: meta_features.metafeature_values.update( meta_features_encoded.metafeature_values) if meta_features is not None: meta_base.add_dataset(instance_id, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
all_metafeatures = meta_base.get_metafeatures( features=list(meta_features.keys())) all_metafeatures.fillna(all_metafeatures.mean(), inplace=True) with warnings.catch_warnings(): warnings.showwarning = self._send_warnings_to_log metalearning_configurations = self.collect_metalearning_suggestions( meta_base) if metalearning_configurations is None: metalearning_configurations = [] self.reset_data_manager() self.logger.info('%s', meta_features) # Convert meta-features into a dictionary because the scenario # expects a dictionary meta_features_dict = {} for dataset, series in all_metafeatures.iterrows(): meta_features_dict[dataset] = series.values meta_features_list = [] for meta_feature_name in all_metafeatures.columns: meta_features_list.append( meta_features[meta_feature_name].value) meta_features_list = np.array(meta_features_list).reshape( (1, -1)) self.logger.info(list(meta_features_dict.keys())) # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) # meta_runs_index = 0 # try: # meta_durations = meta_base.get_all_runs('runtime') # read_runtime_data = True # except KeyError: # read_runtime_data = False # self.logger.critical('Cannot read runtime data.') # if self.acquisition_function == 'EIPS': # self.logger.critical('Reverting to acquisition function EI!') # self.acquisition_function = 'EI' # for meta_dataset in meta_runs.index: # meta_dataset_start_index = meta_runs_index # for meta_configuration in meta_runs.columns: # if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): # try: # config = meta_base.get_configuration_from_algorithm_index( # meta_configuration) # cost = meta_runs.loc[meta_dataset, meta_configuration] # if read_runtime_data: # runtime = meta_durations.loc[meta_dataset, # meta_configuration] # else: # runtime = 1 # # TODO read out other status types! 
# meta_runhistory.add(config, cost, runtime, # StatusType.SUCCESS, # instance_id=meta_dataset) # meta_runs_index += 1 # except: # # TODO maybe add warning # pass # # meta_runs_dataset_indices[meta_dataset] = ( # meta_dataset_start_index, meta_runs_index) else: meta_features = None if meta_features is None: if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' meta_features_list = [] meta_features_dict = {} metalearning_configurations = [] if self.resampling_strategy in [ 'partial-cv', 'partial-cv-iterative-fit' ]: num_folds = self.resampling_strategy_args['folds'] instances = [[fold_number] for fold_number in range(num_folds)] else: instances = None startup_time = self.watcher.wall_elapsed(self.dataset_name) total_walltime_limit = self.total_walltime_limit - startup_time - 5 scenario_dict = { 'cs': self.config_space, 'cutoff-time': self.func_eval_time_limit, 'memory-limit': self.memory_limit, 'wallclock-limit': total_walltime_limit, # 'instances': [[name] for name in meta_features_dict], 'output-dir': self.backend.temporary_directory, 'shared-model': self.shared_mode, 'run-obj': 'quality', 'deterministic': 'true', 'instances': instances } if self.configuration_mode == 'RANDOM': scenario_dict['minR'] = len( instances) if instances is not None else 1 scenario_dict['initial_incumbent'] = 'RANDOM' self.scenario = Scenario(scenario_dict) # TODO rebuild target algorithm to be it's own target algorithm # evaluator, which takes into account that a run can be killed prior # to the model being fully fitted; thus putting intermediate results # into a queue and querying them once the time is over exclude = dict() include = dict() if self.include_preprocessors is not None and \ self.exclude_preprocessors is not None: raise ValueError('Cannot specify include_preprocessors and ' 'exclude_preprocessors.') elif self.include_preprocessors is not None: include['preprocessor'] = self.include_preprocessors elif self.exclude_preprocessors is not None: exclude['preprocessor'] = self.exclude_preprocessors if self.include_estimators is not None and \ self.exclude_preprocessors is not None: raise ValueError('Cannot specify include_estimators and ' 'exclude_estimators.') elif self.include_estimators is not None: if self.task in CLASSIFICATION_TASKS: include['classifier'] = self.include_estimators elif self.task in REGRESSION_TASKS: include['regressor'] = self.include_estimators else: raise ValueError(self.task) elif self.exclude_estimators is not None: if self.task in CLASSIFICATION_TASKS: exclude['classifier'] = self.exclude_estimators elif self.task in REGRESSION_TASKS: exclude['regressor'] = self.exclude_estimators else: raise ValueError(self.task) ta = ExecuteTaFuncWithQueue( backend=self.backend, autosklearn_seed=seed, resampling_strategy=self.resampling_strategy, initial_num_run=num_run, logger=self.logger, include=include, exclude=exclude, memory_limit=self.memory_limit, disable_file_output=self.disable_file_output, **self.resampling_strategy_args) types = get_types(self.config_space, self.scenario.feature_array) # TODO extract generation of SMAC object into it's own function for # testing if self.acquisition_function == 'EI': model = RandomForestWithInstances( types, #instance_features=meta_features_list, seed=1, num_trees=10) rh2EPM = RunHistory2EPM4Cost(num_params=num_params, scenario=self.scenario, success_states=[ StatusType.SUCCESS, StatusType.MEMOUT, StatusType.TIMEOUT ], impute_censored_data=False, impute_state=None) 
_smac_arguments = dict(scenario=self.scenario, model=model, rng=seed, runhistory2epm=rh2EPM, tae_runner=ta, runhistory=runhistory) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, success_states=[ StatusType.SUCCESS, StatusType.MEMOUT, StatusType.TIMEOUT ], impute_censored_data=False, impute_state=None) model = UncorrelatedMultiObjectiveRandomForestWithInstances( ['cost', 'runtime'], types, num_trees=10, instance_features=meta_features_list, seed=1) acquisition_function = EIPS(model) _smac_arguments = dict(scenario=self.scenario, model=model, rng=seed, tae_runner=ta, runhistory2epm=rh2EPM, runhistory=runhistory, acquisition_function=acquisition_function) else: raise ValueError('Unknown acquisition function value %s!' % self.acquisition_function) if self.configuration_mode == 'SMAC': smac = SMAC(**_smac_arguments) elif self.configuration_mode in ['ROAR', 'RANDOM']: for not_in_roar in ['runhistory2epm', 'model']: if not_in_roar in _smac_arguments: del _smac_arguments[not_in_roar] smac = ROAR(**_smac_arguments) else: raise ValueError(self.configuration_mode) # Build a runtime model # runtime_rf = RandomForestWithInstances(types, # instance_features=meta_features_list, # seed=1, num_trees=10) # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, # scenario=self.scenario, # success_states=None, # impute_censored_data=False, # impute_state=None) # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory) # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten()) # X_meta, Y_meta = rh2EPM.transform(meta_runhistory) # # Transform Y_meta on a per-dataset base # for meta_dataset in meta_runs_dataset_indices: # start_index, end_index = meta_runs_dataset_indices[meta_dataset] # end_index += 1 # Python indexing # Y_meta[start_index:end_index, 0]\ # [Y_meta[start_index:end_index, 0] >2.0] = 2.0 # dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) # Y_meta[start_index:end_index, 0] = 1 - ( # (1. - Y_meta[start_index:end_index, 0]) / # (1. - dataset_minimum)) # Y_meta[start_index:end_index, 0]\ # [Y_meta[start_index:end_index, 0] > 2] = 2 smac.solver.stats.start_timing() # == first, evaluate all metelearning and default configurations smac.solver.incumbent = smac.solver.initial_design.run() for challenger in metalearning_configurations: smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( challengers=[challenger], incumbent=smac.solver.incumbent, run_history=smac.solver.runhistory, aggregate_func=smac.solver.aggregate_func, time_bound=self.total_walltime_limit) if smac.solver.scenario.shared_model: pSMAC.write(run_history=smac.solver.runhistory, output_directory=smac.solver.scenario.output_dir, num_run=self.seed) if smac.solver.stats.is_budget_exhausted(): break # == after metalearning run SMAC loop while True: if smac.solver.scenario.shared_model: pSMAC.read(run_history=smac.solver.runhistory, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) choose_next_start_time = time.time() try: challengers = self.choose_next(smac) except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. 
Using random configuration!") next_config = self.config_space.sample_configuration() challengers = [next_config] time_for_choose_next = time.time() - choose_next_start_time self.logger.info('Used %g seconds to find next ' 'configurations' % (time_for_choose_next)) time_for_choose_next = max(time_for_choose_next, 1.0) smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( challengers=challengers, incumbent=smac.solver.incumbent, run_history=smac.solver.runhistory, aggregate_func=smac.solver.aggregate_func, time_bound=time_for_choose_next) if smac.solver.scenario.shared_model: pSMAC.write(run_history=smac.solver.runhistory, output_directory=smac.solver.scenario.output_dir, num_run=self.seed) if smac.solver.stats.is_budget_exhausted(): break self.runhistory = smac.solver.runhistory self.trajectory = smac.solver.intensifier.traj_logger.trajectory return self.runhistory, self.trajectory
class MetaLearningOptimizer(object):
    def __init__(self, dataset_name, configuration_space, aslib_directory,
                 distance='l1', seed=None, use_features='',
                 distance_kwargs=None, subset='all'):
        """Metalearning optimizer.

        Parameters
        ----------
        dataset_name : str
            Name of the dataset

        configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace

        aslib_directory : str
            Path to an aslib directory

        distance : str, "l1" or "l2" or "random"
            Distance function to be used by the kNearestDatasets algorithm.

        seed

        use_features

        distance_kwargs

        subset
        """
        self.dataset_name = dataset_name
        self.configuration_space = configuration_space
        self.aslib_dir = aslib_directory
        self.distance = distance
        self.seed = seed
        self.use_features = use_features
        self.distance_kwargs = distance_kwargs
        self.subset = subset
        self.kND = None  # For caching, makes things faster...

        self.meta_base = MetaBase(configuration_space, self.aslib_dir)
        self.logger = logging.getLogger(__name__)

    def perform_sequential_optimization(self, target_algorithm=test_function,
                                        time_budget=None,
                                        evaluation_budget=None):
        raise NotImplementedError("Right now this is not implemented due to "
                                  "timing issues.")
        time_taken = 0
        num_evaluations = 0
        history = []

        self.logger.info("Taking distance measure %s" % self.distance)
        while True:
            if time_budget is not None and time_taken >= time_budget:
                self.logger.info("Reached time budget. Exiting optimization.")
                break
            if evaluation_budget is not None and \
                    num_evaluations >= evaluation_budget:
                self.logger.info("Reached maximum number of evaluations. "
                                 "Exiting optimization.")
                break

            params = self.metalearning_suggest(history)

            fixed_params = OrderedDict()
            # Hack to remove all trailing - from the params which are
            # accidentally in the experiment pickle of the current HPOlib
            # version
            for key in params:
                if key[0] == "-":
                    fixed_params[key[1:]] = params[key]
                else:
                    fixed_params[key] = params[key]

            self.logger.info("%d/%d, parameters: %s" %
                             (num_evaluations, evaluation_budget,
                              str(fixed_params)))
            result = target_algorithm(fixed_params)
            history.append(Run(params, result))
            num_evaluations += 1

        return min([run.result for run in history])

    def metalearning_suggest_all(self, exclude_double_configurations=True):
        """Return a list of the best hyperparameters of neighboring datasets"""
        # TODO check if _learn was called before!
        neighbors = self._learn(exclude_double_configurations)
        hp_list = []
        for neighbor in neighbors:
            try:
                configuration = \
                    self.meta_base.get_configuration_from_algorithm_index(
                        neighbor[2])
                self.logger.info("%s %s %s" %
                                 (neighbor[0], neighbor[1], configuration))
            except KeyError:
                self.logger.warning("Configuration %s not found" % neighbor[2])
                continue

            hp_list.append(configuration)
        return hp_list

    def metalearning_suggest(self, history):
        """Suggest the next most promising hyperparameters which were not
        yet evaluated"""
        # TODO test the object in the history!
        neighbors = self._learn()

        # Iterate over all datasets which are sorted ascending by distance
        history_with_indices = []
        for run in history:
            history_with_indices.append(
                self.meta_base.get_algorithm_index_from_configuration(run))

        for idx, neighbor in enumerate(neighbors):
            already_evaluated = False
            # Check if that dataset was already evaluated
            for run in history_with_indices:
                # If so, return to the outer loop
                if neighbor[2] == run:
                    already_evaluated = True
                    break

            if not already_evaluated:
                self.logger.info(
                    "Nearest dataset with hyperparameters of best value "
                    "not evaluated yet is %s with a distance of %f" %
                    (neighbor[0], neighbor[1]))
                return self.meta_base.get_configuration_from_algorithm_index(
                    neighbor[2])
        raise StopIteration("No more values available.")

    def _learn(self, exclude_double_configurations=True):
        dataset_metafeatures, all_other_metafeatures = \
            self._get_metafeatures()

        # Remove metafeatures which could not be calculated for the target
        # dataset
        keep = []
        for idx in dataset_metafeatures.index:
            if np.isfinite(dataset_metafeatures.loc[idx]):
                keep.append(idx)

        dataset_metafeatures = dataset_metafeatures.loc[keep]
        all_other_metafeatures = all_other_metafeatures.loc[:, keep]

        # Do mean imputation of all other metafeatures
        all_other_metafeatures = all_other_metafeatures.fillna(
            all_other_metafeatures.mean())

        if self.kND is None:
            # In case that we learn our distance function, get_value the
            # parameters for the random forest
            if self.distance_kwargs:
                rf_params = ast.literal_eval(self.distance_kwargs)
            else:
                rf_params = None

            # To keep the distance the same in every iteration, we create a
            # new random state
            random_state = sklearn.utils.check_random_state(self.seed)
            kND = KNearestDatasets(metric=self.distance,
                                   random_state=random_state,
                                   metric_params=rf_params)

            runs = dict()
            # TODO move this code to the metabase
            for task_id in all_other_metafeatures.index:
                try:
                    runs[task_id] = self.meta_base.get_runs(task_id)
                except KeyError:
                    # TODO should I really except this?
                    self.logger.warning("Could not find runs for instance %s"
                                        % task_id)
                    runs[task_id] = pd.Series([], name=task_id)
            runs = pd.DataFrame(runs)

            kND.fit(all_other_metafeatures, runs)
            self.kND = kND

        return self.kND.kBestSuggestions(
            dataset_metafeatures, k=-1,
            exclude_double_configurations=exclude_double_configurations)

    def _get_metafeatures(self):
        """This is inside an extra function for testing purpose"""
        # Load the task
        self.logger.info("Going to use the metafeature subset: %s",
                         self.subset)
        all_metafeatures = self.meta_base.get_all_metafeatures()
        self.logger.info(" ".join(all_metafeatures.columns))

        # TODO: buggy and hacky, replace with a list separated by commas
        if self.use_features and \
                (type(self.use_features) != str or self.use_features != ''):
            # ogger.warn("Going to keep the following features %s",
            #            str(self.use_features))
            if type(self.use_features) == str:
                use_features = self.use_features.split(",")
            elif type(self.use_features) in (list, np.ndarray):
                use_features = self.use_features
            else:
                raise NotImplementedError(type(self.use_features))

            if len(use_features) == 0:
                self.logger.info("You just tried to remove all "
                                 "metafeatures...")
            else:
                keep = [col for col in all_metafeatures.columns
                        if col in use_features]
                if len(use_features) == 0:
                    self.logger.info("You just tried to remove all "
                                     "metafeatures...")
                else:
                    all_metafeatures = all_metafeatures.loc[:, keep]
                    self.logger.info("Going to keep the following "
                                     "metafeatures:")
                    self.logger.info(str(keep))

        return self._split_metafeature_array(self.dataset_name,
                                             all_metafeatures)

    def _split_metafeature_array(self, dataset_name, metafeatures):
        """Split the metafeature array into dataset metafeatures and all
        other.

        This is inside an extra function for testing purpose.
        """
        dataset_metafeatures = metafeatures.loc[dataset_name].copy()
        metafeatures = metafeatures[metafeatures.index != dataset_name]
        return dataset_metafeatures, metafeatures

    def read_task_list(self, fh):
        dataset_filenames = list()
        for line in fh:
            line = line.replace("\n", "")
            if line:
                dataset_filenames.append(line)
            else:
                raise ValueError("Blank lines in the task list are not "
                                 "supported.")
        return dataset_filenames

    def read_experiments_list(self, fh):
        experiments_list = list()
        for line in fh.readlines():
            experiments_list.append(line.split())
        return experiments_list
        'sparse'))

configuration_space = get_configuration_space(
    {'metric': metric,
     'task': task,
     'is_sparse': False},
    include_preprocessors=['no_preprocessing'])
print(metadata_directory)

dataset_name = 'diabetes'
X_train, Y_train, X_test, Y_test = get_dataset(dataset_name)
print(X_train)

meta_base = MetaBase(configuration_space, metadata_directory)
print(meta_base)

dataset_metafeatures = meta_base.get_metafeatures('1030_a_metric', None)
print(dataset_metafeatures)

all_other_datasets = meta_base.get_all_dataset_names()
print(all_other_datasets)

res = suggest_via_metalearning(meta_base, '198_a_metric', metric, task,
                               False, 1)
print(res)
print(type(res))
def run_smbo(self, max_iters=1000): global evaluator self.watcher.start_task('SMBO') # == first things first: load the datamanager self.reset_data_manager() # == Initialize non-SMBO stuff # first create a scenario seed = self.seed self.config_space.seed(seed) num_params = len(self.config_space.get_hyperparameters()) # allocate a run history num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL # Initialize some SMAC dependencies runhistory = RunHistory(aggregate_func=average_cost) # meta_runhistory = RunHistory(aggregate_func=average_cost) # meta_runs_dataset_indices = {} # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again # and add the suggestions from metalearning behind it if self.metadata_directory is None: metalearning_directory = os.path.dirname( autosklearn.metalearning.__file__) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( metalearning_directory, 'files', '%s_%s_%s' % (METRIC_TO_STRING[self.metric], TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) self.metadata_directory = metadata_directory self.logger.info('Metadata directory: %s', self.metadata_directory) meta_base = MetaBase(self.config_space, self.metadata_directory) metafeature_calculation_time_limit = int( self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( metafeature_calculation_time_limit) metafeature_calculation_end_time = time.time() metafeature_calculation_time_limit = \ metafeature_calculation_time_limit - ( metafeature_calculation_end_time - metafeature_calculation_start_time) if metafeature_calculation_time_limit < 1: self.logger.warning('Time limit for metafeature calculation less ' 'than 1 seconds (%f). Skipping calculation ' 'of metafeatures for encoded dataset.', metafeature_calculation_time_limit) meta_features_encoded = None else: with warnings.catch_warnings(): warnings.showwarning = self._send_warnings_to_log self.datamanager.perform1HotEncoding() meta_features_encoded = \ self._calculate_metafeatures_encoded_with_limits( metafeature_calculation_time_limit) # In case there is a problem calculating the encoded meta-features if meta_features is None: if meta_features_encoded is not None: meta_features = meta_features_encoded else: if meta_features_encoded is not None: meta_features.metafeature_values.update( meta_features_encoded.metafeature_values) if meta_features is not None: meta_base.add_dataset(instance_id, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
all_metafeatures = meta_base.get_metafeatures( features=list(meta_features.keys())) all_metafeatures.fillna(all_metafeatures.mean(), inplace=True) with warnings.catch_warnings(): warnings.showwarning = self._send_warnings_to_log metalearning_configurations = self.collect_metalearning_suggestions( meta_base) if metalearning_configurations is None: metalearning_configurations = [] self.reset_data_manager() self.logger.info('%s', meta_features) # Convert meta-features into a dictionary because the scenario # expects a dictionary meta_features_dict = {} for dataset, series in all_metafeatures.iterrows(): meta_features_dict[dataset] = series.values meta_features_list = [] for meta_feature_name in all_metafeatures.columns: meta_features_list.append(meta_features[meta_feature_name].value) meta_features_list = np.array(meta_features_list).reshape((1, -1)) self.logger.info(list(meta_features_dict.keys())) #meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) #meta_runs_index = 0 #try: # meta_durations = meta_base.get_all_runs('runtime') # read_runtime_data = True #except KeyError: # read_runtime_data = False # self.logger.critical('Cannot read runtime data.') # if self.acquisition_function == 'EIPS': # self.logger.critical('Reverting to acquisition function EI!') # self.acquisition_function = 'EI' # for meta_dataset in meta_runs.index: # meta_dataset_start_index = meta_runs_index # for meta_configuration in meta_runs.columns: # if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): # try: # config = meta_base.get_configuration_from_algorithm_index( # meta_configuration) # cost = meta_runs.loc[meta_dataset, meta_configuration] # if read_runtime_data: # runtime = meta_durations.loc[meta_dataset, # meta_configuration] # else: # runtime = 1 # # TODO read out other status types! 
# meta_runhistory.add(config, cost, runtime, # StatusType.SUCCESS, # instance_id=meta_dataset) # meta_runs_index += 1 # except: # # TODO maybe add warning # pass # # meta_runs_dataset_indices[meta_dataset] = ( # meta_dataset_start_index, meta_runs_index) else: if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' meta_features_list = [] meta_features_dict = {} metalearning_configurations = [] self.scenario = Scenario({'cs': self.config_space, 'cutoff-time': self.func_eval_time_limit, 'memory-limit': self.memory_limit, 'wallclock-limit': self.total_walltime_limit, #'instances': [[name] for name in meta_features_dict], 'output-dir': self.backend.temporary_directory, 'shared-model': self.shared_mode, 'run-obj': 'quality', 'deterministic': 'true'}) # TODO rebuild target algorithm to be it's own target algorithm # evaluator, which takes into account that a run can be killed prior # to the model being fully fitted; thus putting intermediate results # into a queue and querying them once the time is over ta = ExecuteTaFuncWithQueue(backend=self.backend, autosklearn_seed=seed, resampling_strategy=self.resampling_strategy, initial_num_run=num_run, logger=self.logger, **self.resampling_strategy_args) types = get_types(self.config_space, self.scenario.feature_array) # TODO extract generation of SMAC object into it's own function for # testing if self.acquisition_function == 'EI': model = RandomForestWithInstances(types, #instance_features=meta_features_list, seed=1, num_trees=10) smac = SMAC(scenario=self.scenario, model=model, rng=seed, tae_runner=ta, runhistory=runhistory) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = UncorrelatedMultiObjectiveRandomForestWithInstances( ['cost', 'runtime'], types, num_trees = 10, instance_features=meta_features_list, seed=1) acquisition_function = EIPS(model) smac = SMAC(scenario=self.scenario, tae_runner=ta, acquisition_function=acquisition_function, model=model, runhistory2epm=rh2EPM, rng=seed, runhistory=runhistory) else: raise ValueError('Unknown acquisition function value %s!' % self.acquisition_function) smac.solver.stats.start_timing() smac.solver.incumbent = smac.solver.initial_design.run() # Build a runtime model # runtime_rf = RandomForestWithInstances(types, # instance_features=meta_features_list, # seed=1, num_trees=10) # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, # scenario=self.scenario, # success_states=None, # impute_censored_data=False, # impute_state=None) # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory) # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten()) # X_meta, Y_meta = rh2EPM.transform(meta_runhistory) # # Transform Y_meta on a per-dataset base # for meta_dataset in meta_runs_dataset_indices: # start_index, end_index = meta_runs_dataset_indices[meta_dataset] # end_index += 1 # Python indexing # Y_meta[start_index:end_index, 0]\ # [Y_meta[start_index:end_index, 0] >2.0] = 2.0 # dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) # Y_meta[start_index:end_index, 0] = 1 - ( # (1. - Y_meta[start_index:end_index, 0]) / # (1. 
- dataset_minimum)) # Y_meta[start_index:end_index, 0]\ # [Y_meta[start_index:end_index, 0] > 2] = 2 smac.solver.stats.start_timing() # == first, evaluate all metelearning and default configurations smac.solver.incumbent = smac.solver.initial_design.run() for challenger in metalearning_configurations: smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( challengers=[challenger], incumbent=smac.solver.incumbent, run_history=smac.solver.runhistory, aggregate_func=smac.solver.aggregate_func, time_bound=self.total_walltime_limit) if smac.solver.scenario.shared_model: pSMAC.write(run_history=smac.solver.runhistory, output_directory=smac.solver.scenario.output_dir, num_run=self.seed) if smac.solver.stats.is_budget_exhausted(): break # == after metalearning run SMAC loop while True: if smac.solver.scenario.shared_model: pSMAC.read(run_history=smac.solver.runhistory, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) choose_next_start_time = time.time() try: challengers = self.choose_next(smac) except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. Using random configuration!") next_config = self.config_space.sample_configuration() challengers = [next_config] time_for_choose_next = time.time() - choose_next_start_time self.logger.info('Used %g seconds to find next ' 'configurations' % (time_for_choose_next)) smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify( challengers=challengers, incumbent=smac.solver.incumbent, run_history=smac.solver.runhistory, aggregate_func=smac.solver.aggregate_func, time_bound=time_for_choose_next) if smac.solver.scenario.shared_model: pSMAC.write(run_history=smac.solver.runhistory, output_directory=smac.solver.scenario.output_dir, num_run=self.seed) if smac.solver.stats.is_budget_exhausted(): break self.runhistory = smac.solver.runhistory return runhistory
def get_metalearning_suggestions(self):
    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.num_metalearning_cfgs > 0:
        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (self.metric,
                              TASK_TYPES_TO_STRING[meta_task],
                              'sparse' if self.datamanager.info['is_sparse']
                              else 'dense'))
            self.metadata_directory = metadata_directory

        if os.path.exists(self.metadata_directory):
            self.logger.info('Metadata directory: %s',
                             self.metadata_directory)
            meta_base = MetaBase(self.config_space, self.metadata_directory)

            try:
                meta_base.remove_dataset(self.dataset_name)
            except:
                pass

            metafeature_calculation_time_limit = int(
                self.total_walltime_limit / 4)
            metafeature_calculation_start_time = time.time()
            meta_features = self._calculate_metafeatures_with_limits(
                metafeature_calculation_time_limit)
            metafeature_calculation_end_time = time.time()
            metafeature_calculation_time_limit = \
                metafeature_calculation_time_limit - (
                    metafeature_calculation_end_time -
                    metafeature_calculation_start_time)

            if metafeature_calculation_time_limit < 1:
                self.logger.warning(
                    'Time limit for metafeature calculation less '
                    'than 1 seconds (%f). Skipping calculation '
                    'of metafeatures for encoded dataset.',
                    metafeature_calculation_time_limit)
                meta_features_encoded = None
            else:
                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    self.datamanager.perform1HotEncoding()
                meta_features_encoded = \
                    self._calculate_metafeatures_encoded_with_limits(
                        metafeature_calculation_time_limit)

            # In case there is a problem calculating the encoded meta-features
            if meta_features is None:
                if meta_features_encoded is not None:
                    meta_features = meta_features_encoded
            else:
                if meta_features_encoded is not None:
                    meta_features.metafeature_values.update(
                        meta_features_encoded.metafeature_values)

            if meta_features is not None:
                meta_base.add_dataset(self.dataset_name, meta_features)

                # Do mean imputation of the meta-features - should be done
                # specific for each prediction model!
                all_metafeatures = meta_base.get_metafeatures(
                    features=list(meta_features.keys()))
                all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    metalearning_configurations = \
                        self.collect_metalearning_suggestions(meta_base)
                if metalearning_configurations is None:
                    metalearning_configurations = []
                self.reset_data_manager()

                self.logger.info('%s', meta_features)

                # Convert meta-features into a dictionary because the
                # scenario expects a dictionary
                meta_features_dict = {}
                for dataset, series in all_metafeatures.iterrows():
                    meta_features_dict[dataset] = series.values
                meta_features_list = []
                for meta_feature_name in all_metafeatures.columns:
                    meta_features_list.append(
                        meta_features[meta_feature_name].value)
                meta_features_list = np.array(meta_features_list).reshape(
                    (1, -1))
                self.logger.info(list(meta_features_dict.keys()))
        else:
            meta_features = None
            self.logger.warning('Could not find meta-data directory %s' %
                                metadata_directory)
    else:
        meta_features = None

    if meta_features is None:
        meta_features_list = []
        metalearning_configurations = []
    return metalearning_configurations
class MetaBaseTest(unittest.TestCase):
    _multiprocess_can_split_ = True

    def setUp(self):
        self.cwd = os.getcwd()
        data_dir = os.path.dirname(__file__)
        data_dir = os.path.join(data_dir, 'test_meta_base_data')
        os.chdir(data_dir)

        cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
            .get_hyperparameter_search_space()

        self.base = MetaBase(cs, data_dir)

    def tearDown(self):
        os.chdir(self.cwd)

    def test_get_all_runs(self):
        runs = self.base.get_all_runs()
        self.assertIsInstance(runs, pd.DataFrame)
        # TODO update this ASAP
        self.assertEqual((134, 24), runs.shape)

    def test_get_runs(self):
        runs = self.base.get_runs('38_acc')
        # TODO update this ASAP
        self.assertEqual(24, len(runs))
        self.assertIsInstance(runs, pd.Series)

    def test_get_metafeatures_single_dataset(self):
        mf = self.base.get_metafeatures('38_acc')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.name, u'38_acc')
        self.assertEqual(mf.loc['NumberOfInstances'], 2527.0)

    def test_get_metafeatures_single_feature(self):
        mf = self.base.get_metafeatures(features='NumberOfInstances')
        self.assertIsInstance(mf, pd.Series)
        self.assertEqual(mf.shape, (140, ))

    def test_get_metafeatures_single_dataset_and_single_feature(self):
        mf = self.base.get_metafeatures('38_acc',
                                        features='NumberOfInstances')
        self.assertEqual(mf.shape, ())

    def test_get_metafeatures_multiple_datasets(self):
        mf = self.base.get_metafeatures(['38_acc', '24_acc'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (2, 46))

    def test_get_metafeatures_multiple_features(self):
        mf = self.base.get_metafeatures(
            features=['NumberOfInstances', 'NumberOfClasses'])
        self.assertIsInstance(mf, pd.DataFrame)
        self.assertEqual(mf.shape, (140, 2))

    def test_remove_dataset(self):
        name = "1000_acc"
        for key in self.base.algorithm_runs:
            self.assertIn(name, self.base.algorithm_runs[key].index)
        self.assertIn(name, self.base.metafeatures.index)
        metafeatures_shape = self.base.metafeatures.shape

        self.base.remove_dataset(name)

        for key in self.base.algorithm_runs:
            self.assertNotIn(name, self.base.algorithm_runs[key].index)
        self.assertNotIn(name, self.base.metafeatures.index)
        # Check that only one thing was removed
        self.assertEqual(self.base.metafeatures.shape,
                         (metafeatures_shape[0] - 1, metafeatures_shape[1]))
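# --- Usage sketch (hedged): the MetaBase calls exercised by the tests above,
# --- used outside the test harness. The metadata path is a placeholder and
# --- the import path reflects auto-sklearn's layout at the time this code
# --- was written; both may differ between versions.
import os

import autosklearn.pipeline.classification
from autosklearn.metalearning.metalearning.meta_base import MetaBase

cs = autosklearn.pipeline.classification.SimpleClassificationPipeline()\
    .get_hyperparameter_search_space()
base = MetaBase(cs, os.path.join('path', 'to', 'test_meta_base_data'))

runs = base.get_runs('38_acc')        # pd.Series: cost per configuration index
mf = base.get_metafeatures('38_acc')  # pd.Series of meta-feature values
print(mf.loc['NumberOfInstances'], runs.min())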
def run_smbo(self, max_iters=1000): global evaluator # == first things first: load the datamanager self.reset_data_manager() # == Initialize SMBO stuff # first create a scenario seed = self.seed # TODO num_params = len(self.config_space.get_hyperparameters()) # allocate a run history run_history = RunHistory() meta_runhistory = RunHistory() meta_runs_dataset_indices = {} num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL # == Train on subset # before doing anything, let us run the default_cfg # on a subset of the available data to ensure that # we at least have some models # we will try three different ratios of decreasing magnitude # in the hope that at least on the last one we will be able # to get a model n_data = self.datamanager.data['X_train'].shape[0] subset_ratio = 10000. / n_data if subset_ratio >= 0.5: subset_ratio = 0.33 subset_ratios = [subset_ratio, subset_ratio * 0.10] else: subset_ratios = [subset_ratio, 500. / n_data] self.logger.info("Training default configurations on a subset of " "%d/%d data points." % (int(n_data * subset_ratio), n_data)) # the time limit for these function evaluations is rigorously # set to only 1/2 of a full function evaluation subset_time_limit = max(5, int(self.func_eval_time_limit / 2)) # the configs we want to run on the data subset are: # 1) the default configs # 2) a set of configs we selected for training on a subset subset_configs = [self.config_space.get_default_configuration()] \ + self.collect_additional_subset_defaults() subset_config_succesful = [False] * len(subset_configs) for subset_config_id, next_config in enumerate(subset_configs): for i, ratio in enumerate(subset_ratios): self.reset_data_manager() n_data_subsample = int(n_data * ratio) # run the config, but throw away the result afterwards # since this cfg was evaluated only on a subset # and we don't want to confuse SMAC self.logger.info( "Starting to evaluate %d on SUBSET " "with size %d and time limit %ds.", num_run, n_data_subsample, subset_time_limit) self.logger.info(next_config) _info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, subset_time_limit, n_data_subsample) (duration, result, _, additional_run_info, status) = _info self.logger.info( "Finished evaluating %d. configuration on SUBSET. 
" "Duration %f; loss %f; status %s; additional run " "info: %s ", num_run, duration, result, str(status), additional_run_info) num_run += 1 if i < len(subset_ratios) - 1: if status != StatusType.SUCCESS: # Do not increase num_run here, because we will try # the same configuration with less data self.logger.info( "A CONFIG did not finish " " for subset ratio %f -> going smaller", ratio) continue else: self.logger.info( "Finished SUBSET training sucessfully " "with ratio %f", ratio) subset_config_succesful[subset_config_id] = True break else: if status != StatusType.SUCCESS: self.logger.info( "A CONFIG did not finish " " for subset ratio %f.", ratio) continue else: self.logger.info( "Finished SUBSET training sucessfully " "with ratio %f", ratio) subset_config_succesful[subset_config_id] = True break # Use the first non-failing configuration from the subsets as the new # default configuration -> this guards us against the random forest # failing on large, sparse datasets default_cfg = None for subset_config_id, next_config in enumerate(subset_configs): if subset_config_succesful[subset_config_id]: default_cfg = next_config break if default_cfg is None: default_cfg = self.config_space.get_default_configuration() # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again # and add the suggestions from metalearning behind it if self.metadata_directory is None: metalearning_directory = os.path.dirname( autosklearn.metalearning.__file__) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( metalearning_directory, 'files', '%s_%s_%s' % (METRIC_TO_STRING[self.metric], TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) self.metadata_directory = metadata_directory self.logger.info('Metadata directory: %s', self.metadata_directory) meta_base = MetaBase(self.config_space, self.metadata_directory) metafeature_calculation_time_limit = int(self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( metafeature_calculation_time_limit) metafeature_calculation_end_time = time.time() metafeature_calculation_time_limit = \ metafeature_calculation_time_limit - ( metafeature_calculation_end_time - metafeature_calculation_start_time) if metafeature_calculation_time_limit < 1: self.logger.warning( 'Time limit for metafeature calculation less ' 'than 1 seconds (%f). Skipping calculation ' 'of metafeatures for encoded dataset.', metafeature_calculation_time_limit) meta_features_encoded = None else: self.datamanager.perform1HotEncoding() meta_features_encoded = \ self._calculate_metafeatures_encoded_with_limits( metafeature_calculation_time_limit) # In case there is a problem calculating the encoded meta-features if meta_features is None: if meta_features_encoded is not None: meta_features = meta_features_encoded else: if meta_features_encoded is not None: meta_features.metafeature_values.update( meta_features_encoded.metafeature_values) if meta_features is not None: meta_base.add_dataset(instance_id, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
all_metafeatures = meta_base.get_metafeatures( features=list(meta_features.keys())) all_metafeatures.fillna(all_metafeatures.mean(), inplace=True) metalearning_configurations = self.collect_metalearning_suggestions( meta_base) if metalearning_configurations is None: metalearning_configurations = [] self.reset_data_manager() self.logger.info('%s', meta_features) # Convert meta-features into a dictionary because the scenario # expects a dictionary meta_features_dict = {} for dataset, series in all_metafeatures.iterrows(): meta_features_dict[dataset] = series.values meta_features_list = [] for meta_feature_name in all_metafeatures.columns: meta_features_list.append( meta_features[meta_feature_name].value) meta_features_list = np.array(meta_features_list).reshape((1, -1)) self.logger.info(list(meta_features_dict.keys())) meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) meta_runs_index = 0 try: meta_durations = meta_base.get_all_runs('runtime') read_runtime_data = True except KeyError: read_runtime_data = False self.logger.critical('Cannot read runtime data.') if self.acquisition_function == 'EIPS': self.logger.critical( 'Reverting to acquisition function EI!') self.acquisition_function = 'EI' for meta_dataset in meta_runs.index: meta_dataset_start_index = meta_runs_index for meta_configuration in meta_runs.columns: if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): try: config = meta_base.get_configuration_from_algorithm_index( meta_configuration) cost = meta_runs.loc[meta_dataset, meta_configuration] if read_runtime_data: runtime = meta_durations.loc[ meta_dataset, meta_configuration] else: runtime = 1 # TODO read out other status types! meta_runhistory.add(config, cost, runtime, StatusType.SUCCESS, instance_id=meta_dataset) meta_runs_index += 1 except: # TODO maybe add warning pass meta_runs_dataset_indices[meta_dataset] = ( meta_dataset_start_index, meta_runs_index) else: if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' meta_features_list = [] meta_features_dict = {} metalearning_configurations = [] self.scenario = AutoMLScenario(self.config_space, self.total_walltime_limit, self.func_eval_time_limit, meta_features_dict, self.tmp_dir, self.shared_mode) types = get_types(self.config_space, self.scenario.feature_array) if self.acquisition_function == 'EI': rh2EPM = RunHistory2EPM4Cost(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = RandomForestWithInstances( types, instance_features=meta_features_list, seed=1, num_trees=10) smac = SMBO(self.scenario, model=model, rng=seed) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = UncorrelatedMultiObjectiveRandomForestWithInstances( ['cost', 'runtime'], types, num_trees=10, instance_features=meta_features_list, seed=1) acquisition_function = EIPS(model) smac = SMBO(self.scenario, acquisition_function=acquisition_function, model=model, runhistory2epm=rh2EPM, rng=seed) else: raise ValueError('Unknown acquisition function value %s!' 
% self.acquisition_function) # Build a runtime model # runtime_rf = RandomForestWithInstances(types, # instance_features=meta_features_list, # seed=1, num_trees=10) # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, # scenario=self.scenario, # success_states=None, # impute_censored_data=False, # impute_state=None) # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory) # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten()) X_meta, Y_meta = rh2EPM.transform(meta_runhistory) # Transform Y_meta on a per-dataset base for meta_dataset in meta_runs_dataset_indices: start_index, end_index = meta_runs_dataset_indices[meta_dataset] end_index += 1 # Python indexing Y_meta[start_index:end_index, 0]\ [Y_meta[start_index:end_index, 0] >2.0] = 2.0 dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) Y_meta[start_index:end_index, 0] = 1 - ((1. - Y_meta[start_index:end_index, 0]) / (1. - dataset_minimum)) Y_meta[start_index:end_index, 0]\ [Y_meta[start_index:end_index, 0] > 2] = 2 # == first, evaluate all metelearning and default configurations for i, next_config in enumerate( ([default_cfg] + metalearning_configurations)): # Do not evaluate default configurations more than once if i >= len([default_cfg]) and next_config in [default_cfg]: continue config_name = 'meta-learning' if i >= len([default_cfg]) \ else 'default' self.logger.info( "Starting to evaluate %d. configuration " "(%s configuration) with time limit %ds.", num_run, config_name, self.func_eval_time_limit) self.logger.info(next_config) self.reset_data_manager() info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, self.func_eval_time_limit) (duration, result, _, additional_run_info, status) = info run_history.add(config=next_config, cost=result, time=duration, status=status, instance_id=instance_id, seed=seed) run_history.update_cost(next_config, result) self.logger.info( "Finished evaluating %d. configuration. " "Duration %f; loss %f; status %s; additional run " "info: %s ", num_run, duration, result, str(status), additional_run_info) num_run += 1 if smac.incumbent is None: smac.incumbent = next_config elif result < run_history.get_cost(smac.incumbent): smac.incumbent = next_config if self.scenario.shared_model: pSMAC.write(run_history=run_history, output_directory=self.scenario.output_dir, num_run=self.seed) # == after metalearning run SMAC loop smac.runhistory = run_history smac_iter = 0 finished = False while not finished: if self.scenario.shared_model: pSMAC.read(run_history=run_history, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) next_configs = [] time_for_choose_next = -1 try: X_cfg, Y_cfg = rh2EPM.transform(run_history) if not run_history.empty(): # Update costs by normalization dataset_minimum = np.min(Y_cfg[:, 0]) Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / (1. - dataset_minimum)) Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 if len(X_meta) > 0 and len(X_cfg) > 0: pass #X_cfg = np.concatenate((X_meta, X_cfg)) #Y_cfg = np.concatenate((Y_meta, Y_cfg)) elif len(X_meta) > 0: X_cfg = X_meta.copy() Y_cfg = Y_meta.copy() elif len(X_cfg) > 0: X_cfg = X_cfg.copy() Y_cfg = Y_cfg.copy() else: raise ValueError( 'No training data for SMAC random forest!') self.logger.info('Using %d training points for SMAC.' 
% X_cfg.shape[0]) choose_next_start_time = time.time() next_configs_tmp = smac.choose_next( X_cfg, Y_cfg, num_interleaved_random=110, num_configurations_by_local_search=10, num_configurations_by_random_search_sorted=100) time_for_choose_next = time.time() - choose_next_start_time self.logger.info('Used %g seconds to find next ' 'configurations' % (time_for_choose_next)) next_configs.extend(next_configs_tmp) # TODO put Exception here! except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. Using random configuration!") next_config = self.config_space.sample_configuration() next_configs.append(next_config) models_fitted_this_iteration = 0 start_time_this_iteration = time.time() for next_config in next_configs: x_runtime = impute_inactive_values(next_config) x_runtime = impute_inactive_values(x_runtime).get_array() # predicted_runtime = runtime_rf.predict_marginalized_over_instances( # x_runtime.reshape((1, -1))) # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1 self.logger.info( "Starting to evaluate %d. configuration (from " "SMAC) with time limit %ds.", num_run, self.func_eval_time_limit) self.logger.info(next_config) self.reset_data_manager() info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, self.func_eval_time_limit) (duration, result, _, additional_run_info, status) = info run_history.add(config=next_config, cost=result, time=duration, status=status, instance_id=instance_id, seed=seed) run_history.update_cost(next_config, result) #self.logger.info('Predicted runtime %g, true runtime %g', # predicted_runtime, duration) # TODO add unittest to make sure everything works fine and # this does not get outdated! if smac.incumbent is None: smac.incumbent = next_config elif result < run_history.get_cost(smac.incumbent): smac.incumbent = next_config self.logger.info( "Finished evaluating %d. configuration. " "Duration: %f; loss: %f; status %s; additional " "run info: %s ", num_run, duration, result, str(status), additional_run_info) smac_iter += 1 num_run += 1 models_fitted_this_iteration += 1 time_used_this_iteration = time.time( ) - start_time_this_iteration if models_fitted_this_iteration >= 2 and \ time_for_choose_next > 0 and \ time_used_this_iteration > time_for_choose_next: break elif time_for_choose_next <= 0 and \ models_fitted_this_iteration >= 1: break elif models_fitted_this_iteration >= 50: break if max_iters is not None: finished = (smac_iter < max_iters) if self.scenario.shared_model: pSMAC.write(run_history=run_history, output_directory=self.scenario.output_dir, num_run=self.seed)
class MetaLearningOptimizer(object): def __init__(self, dataset_name, configuration_space, aslib_directory, distance='l1', seed=None, use_features='', distance_kwargs=None, subset='all'): """Metalearning optimizer. Parameters ---------- dataset_name : str Name of the dataset configuration_space : HPOlibConfigSpace.configuration_space.ConfigurationSpace datasets_file : str Path to an aslib directory distance : str, "l1" or "l2" or "random" Distance function to be used by the kNearestDatasets algorithm. seed use_features metric_kwargs subset """ self.dataset_name = dataset_name self.configuration_space = configuration_space self.aslib_dir = aslib_directory self.distance = distance self.seed = seed self.use_features = use_features self.distance_kwargs = distance_kwargs self.subset = subset self.kND = None # For caching, makes things faster... self.meta_base = MetaBase(configuration_space, self.aslib_dir) self.logger = logging.getLogger(__name__) def perform_sequential_optimization(self, target_algorithm=test_function, time_budget=None, evaluation_budget=None): raise NotImplementedError("Right now this is not implemented due to " "timing issues.") time_taken = 0 num_evaluations = 0 history = [] self.logger.info("Taking distance measure %s" % self.distance) while True: if time_budget is not None and time_taken >= time_budget: self.logger.info("Reached time budget. Exiting optimization.") break if evaluation_budget is not None and \ num_evaluations >= evaluation_budget: self.logger.info("Reached maximum number of evaluations. Exiting " "optimization.") break params = self.metalearning_suggest(history) fixed_params = OrderedDict() # Hack to remove all trailing - from the params which are # accidently in the experiment pickle of the current HPOlib version for key in params: if key[0] == "-": fixed_params[key[1:]] = params[key] else: fixed_params[key] = params[key] self.logger.info("%d/%d, parameters: %s" % (num_evaluations, evaluation_budget, str(fixed_params))) result = target_algorithm(fixed_params) history.append(Run(params, result)) num_evaluations += 1 return min([run.result for run in history]) def metalearning_suggest_all(self, exclude_double_configurations=True): """Return a list of the best hyperparameters of neighboring datasets""" # TODO check if _learn was called before! neighbors = self._learn(exclude_double_configurations) hp_list = [] for neighbor in neighbors: try: configuration = \ self.meta_base.get_configuration_from_algorithm_index( neighbor[2]) self.logger.info("%s %s %s" % (neighbor[0], neighbor[1], configuration)) except (KeyError): self.logger.warning("Configuration %s not found" % neighbor[2]) continue hp_list.append(configuration) return hp_list def metalearning_suggest(self, history): """Suggest the next most promising hyperparameters which were not yet evaluated""" # TODO test the object in the history! 
neighbors = self._learn() # Iterate over all datasets which are sorted ascending by distance history_with_indices = [] for run in history: history_with_indices.append(\ self.meta_base.get_algorithm_index_from_configuration(run)) for idx, neighbor in enumerate(neighbors): already_evaluated = False # Check if that dataset was already evaluated for run in history_with_indices: # If so, return to the outer loop if neighbor[2] == run: already_evaluated = True break if not already_evaluated: self.logger.info("Nearest dataset with hyperparameters of best value " "not evaluated yet is %s with a distance of %f" % (neighbor[0], neighbor[1])) return self.meta_base.get_configuration_from_algorithm_index( neighbor[2]) raise StopIteration("No more values available.") def _learn(self, exclude_double_configurations=True): dataset_metafeatures, all_other_metafeatures = self._get_metafeatures() # Remove metafeatures which could not be calculated for the target # dataset keep = [] for idx in dataset_metafeatures.index: if np.isfinite(dataset_metafeatures.loc[idx]): keep.append(idx) dataset_metafeatures = dataset_metafeatures.loc[keep] all_other_metafeatures = all_other_metafeatures.loc[:,keep] # Do mean imputation of all other metafeatures all_other_metafeatures = all_other_metafeatures.fillna( all_other_metafeatures.mean()) if self.kND is None: # In case we learn our distance function, get the parameters for # the random forest if self.distance_kwargs: rf_params = ast.literal_eval(self.distance_kwargs) else: rf_params = None # To keep the distance the same in every iteration, we create a new # random state random_state = sklearn.utils.check_random_state(self.seed) kND = KNearestDatasets(metric=self.distance, random_state=random_state, metric_params=rf_params) runs = dict() # TODO move this code to the metabase for task_id in all_other_metafeatures.index: try: runs[task_id] = self.meta_base.get_runs(task_id) except KeyError: # TODO should I really except this? 
self.logger.warning("Could not find runs for instance %s" % task_id) runs[task_id] = pd.Series([], name=task_id) runs = pd.DataFrame(runs) kND.fit(all_other_metafeatures, runs) self.kND = kND return self.kND.kBestSuggestions(dataset_metafeatures, k=-1, exclude_double_configurations=exclude_double_configurations) def _get_metafeatures(self): """This is inside an extra function for testing purpose""" # Load the task self.logger.info("Going to use the metafeature subset: %s", self.subset) all_metafeatures = self.meta_base.get_all_metafeatures() self.logger.info(" ".join(all_metafeatures.columns)) # TODO: buggy and hacky, replace with a list seperated by commas if self.use_features and \ (type(self.use_features) != str or self.use_features != ''): #ogger.warn("Going to keep the following features %s", # str(self.use_features)) if type(self.use_features) == str: use_features = self.use_features.split(",") elif type(self.use_features) in (list, np.ndarray): use_features = self.use_features else: raise NotImplementedError(type(self.use_features)) if len(use_features) == 0: self.logger.info("You just tried to remove all metafeatures...") else: keep = [col for col in all_metafeatures.columns if col in use_features] if len(use_features) == 0: self.logger.info("You just tried to remove all metafeatures...") else: all_metafeatures = all_metafeatures.loc[:,keep] self.logger.info("Going to keep the following metafeatures:") self.logger.info(str(keep)) return self._split_metafeature_array(self.dataset_name, all_metafeatures) def _split_metafeature_array(self, dataset_name, metafeatures): """Split the metafeature array into dataset metafeatures and all other. This is inside an extra function for testing purpose. """ dataset_metafeatures = metafeatures.loc[dataset_name].copy() metafeatures = metafeatures[metafeatures.index != dataset_name] return dataset_metafeatures, metafeatures def read_task_list(self, fh): dataset_filenames = list() for line in fh: line = line.replace("\n", "") if line: dataset_filenames.append(line) else: raise ValueError("Blank lines in the task list are not " "supported.") return dataset_filenames def read_experiments_list(self, fh): experiments_list = list() for line in fh.readlines(): experiments_list.append(line.split()) return experiments_list
def get_metalearning_suggestions(self): # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again # and add the suggestions from metalearning behind it if self.num_metalearning_cfgs > 0: # If metadata directory is None, use default if self.metadata_directory is None: metalearning_directory = os.path.dirname( autosklearn.metalearning.__file__) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( metalearning_directory, 'files', '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) self.metadata_directory = metadata_directory # If metadata directory is specified by user, # then verify that it exists. else: if not os.path.exists(self.metadata_directory): raise ValueError('The specified metadata directory \'%s\' ' 'does not exist!' % self.metadata_directory) else: # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( self.metadata_directory, '%s_%s_%s' % (self.metric, TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) # Check that the metadata directory has the correct # subdirectory needed for this dataset. if os.path.basename(metadata_directory) not in \ os.listdir(self.metadata_directory): raise ValueError('The specified metadata directory ' '\'%s\' does not have the correct ' 'subdirectory \'%s\'' % (self.metadata_directory, os.path.basename(metadata_directory)) ) self.metadata_directory = metadata_directory if os.path.exists(self.metadata_directory): self.logger.info('Metadata directory: %s', self.metadata_directory) meta_base = MetaBase(self.config_space, self.metadata_directory) metafeature_calculation_time_limit = int( self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( metafeature_calculation_time_limit) metafeature_calculation_end_time = time.time() metafeature_calculation_time_limit = \ metafeature_calculation_time_limit - ( metafeature_calculation_end_time - metafeature_calculation_start_time) if metafeature_calculation_time_limit < 1: self.logger.warning( 'Time limit for metafeature calculation less ' 'than 1 seconds (%f). Skipping calculation ' 'of metafeatures for encoded dataset.', metafeature_calculation_time_limit) meta_features_encoded = None else: with warnings.catch_warnings(): warnings.showwarning = self._send_warnings_to_log self.datamanager.perform1HotEncoding() meta_features_encoded = \ self._calculate_metafeatures_encoded_with_limits( metafeature_calculation_time_limit) # In case there is a problem calculating the encoded meta-features if meta_features is None: if meta_features_encoded is not None: meta_features = meta_features_encoded else: if meta_features_encoded is not None: meta_features.metafeature_values.update( meta_features_encoded.metafeature_values) if meta_features is not None: meta_base.add_dataset(self.dataset_name, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
all_metafeatures = meta_base.get_metafeatures( features=list(meta_features.keys())) all_metafeatures.fillna(all_metafeatures.mean(), inplace=True) with warnings.catch_warnings(): warnings.showwarning = self._send_warnings_to_log metalearning_configurations = self.collect_metalearning_suggestions( meta_base) if metalearning_configurations is None: metalearning_configurations = [] self.reset_data_manager() self.logger.info('%s', meta_features) # Convert meta-features into a dictionary because the scenario # expects a dictionary meta_features_dict = {} for dataset, series in all_metafeatures.iterrows(): meta_features_dict[dataset] = series.values meta_features_list = [] for meta_feature_name in all_metafeatures.columns: meta_features_list.append( meta_features[meta_feature_name].value) meta_features_list = np.array(meta_features_list).reshape( (1, -1)) self.logger.info(list(meta_features_dict.keys())) else: meta_features = None self.logger.warning('Could not find meta-data directory %s' % metadata_directory) else: meta_features = None if meta_features is None: meta_features_list = [] metalearning_configurations = [] return metalearning_configurations
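# The metadata-directory lookup at the top of get_metalearning_suggestions
# reduces to a naming convention for the bundled meta-data. The sketch below
# isolates that convention; the helper name is hypothetical, and importing the
# constants from autosklearn.constants is an assumption based on how they are
# used above.
from autosklearn.constants import (BINARY_CLASSIFICATION,
                                   MULTILABEL_CLASSIFICATION,
                                   TASK_TYPES_TO_STRING)

def metadata_subdirectory(metric, task, is_sparse):
    """Name of the '<metric>_<task>_<sparse|dense>' sub-directory that is
    looked up under autosklearn/metalearning/files (hypothetical helper)."""
    # There is no multilabel meta-data from OpenML, so fall back to binary.
    meta_task = BINARY_CLASSIFICATION if task == MULTILABEL_CLASSIFICATION else task
    return '%s_%s_%s' % (metric,
                         TASK_TYPES_TO_STRING[meta_task],
                         'sparse' if is_sparse else 'dense')

# e.g. metadata_subdirectory('balanced_accuracy', BINARY_CLASSIFICATION, False)
# yields 'balanced_accuracy_binary.classification_dense', matching the
# directory layout referenced elsewhere in this document.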
def run_smbo(self, max_iters=1000): global evaluator # == first things first: load the datamanager self.reset_data_manager() # == Initialize SMBO stuff # first create a scenario seed = self.seed # TODO num_params = len(self.config_space.get_hyperparameters()) # allocate a run history run_history = RunHistory() meta_runhistory = RunHistory() meta_runs_dataset_indices = {} num_run = self.start_num_run instance_id = self.dataset_name + SENTINEL # == Train on subset # before doing anything, let us run the default_cfg # on a subset of the available data to ensure that # we at least have some models # we will try three different ratios of decreasing magnitude # in the hope that at least on the last one we will be able # to get a model n_data = self.datamanager.data['X_train'].shape[0] subset_ratio = 10000. / n_data if subset_ratio >= 0.5: subset_ratio = 0.33 subset_ratios = [subset_ratio, subset_ratio * 0.10] else: subset_ratios = [subset_ratio, 500. / n_data] self.logger.info("Training default configurations on a subset of " "%d/%d data points." % (int(n_data * subset_ratio), n_data)) # the time limit for these function evaluations is rigorously # set to only 1/2 of a full function evaluation subset_time_limit = max(5, int(self.func_eval_time_limit / 2)) # the configs we want to run on the data subset are: # 1) the default configs # 2) a set of configs we selected for training on a subset subset_configs = [self.config_space.get_default_configuration()] \ + self.collect_additional_subset_defaults() subset_config_succesful = [False] * len(subset_configs) for subset_config_id, next_config in enumerate(subset_configs): for i, ratio in enumerate(subset_ratios): self.reset_data_manager() n_data_subsample = int(n_data * ratio) # run the config, but throw away the result afterwards # since this cfg was evaluated only on a subset # and we don't want to confuse SMAC self.logger.info("Starting to evaluate %d on SUBSET " "with size %d and time limit %ds.", num_run, n_data_subsample, subset_time_limit) self.logger.info(next_config) _info = eval_with_limits( self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, subset_time_limit, n_data_subsample) (duration, result, _, additional_run_info, status) = _info self.logger.info("Finished evaluating %d. configuration on SUBSET. 
" "Duration %f; loss %f; status %s; additional run " "info: %s ", num_run, duration, result, str(status), additional_run_info) num_run += 1 if i < len(subset_ratios) - 1: if status != StatusType.SUCCESS: # Do not increase num_run here, because we will try # the same configuration with less data self.logger.info("A CONFIG did not finish " " for subset ratio %f -> going smaller", ratio) continue else: self.logger.info("Finished SUBSET training sucessfully " "with ratio %f", ratio) subset_config_succesful[subset_config_id] = True break else: if status != StatusType.SUCCESS: self.logger.info("A CONFIG did not finish " " for subset ratio %f.", ratio) continue else: self.logger.info("Finished SUBSET training sucessfully " "with ratio %f", ratio) subset_config_succesful[subset_config_id] = True break # Use the first non-failing configuration from the subsets as the new # default configuration -> this guards us against the random forest # failing on large, sparse datasets default_cfg = None for subset_config_id, next_config in enumerate(subset_configs): if subset_config_succesful[subset_config_id]: default_cfg = next_config break if default_cfg is None: default_cfg = self.config_space.get_default_configuration() # == METALEARNING suggestions # we start by evaluating the defaults on the full dataset again # and add the suggestions from metalearning behind it if self.metadata_directory is None: metalearning_directory = os.path.dirname( autosklearn.metalearning.__file__) # There is no multilabel data in OpenML if self.task == MULTILABEL_CLASSIFICATION: meta_task = BINARY_CLASSIFICATION else: meta_task = self.task metadata_directory = os.path.join( metalearning_directory, 'files', '%s_%s_%s' % (METRIC_TO_STRING[self.metric], TASK_TYPES_TO_STRING[meta_task], 'sparse' if self.datamanager.info['is_sparse'] else 'dense')) self.metadata_directory = metadata_directory self.logger.info('Metadata directory: %s', self.metadata_directory) meta_base = MetaBase(self.config_space, self.metadata_directory) metafeature_calculation_time_limit = int( self.total_walltime_limit / 4) metafeature_calculation_start_time = time.time() meta_features = self._calculate_metafeatures_with_limits( metafeature_calculation_time_limit) metafeature_calculation_end_time = time.time() metafeature_calculation_time_limit = \ metafeature_calculation_time_limit - ( metafeature_calculation_end_time - metafeature_calculation_start_time) if metafeature_calculation_time_limit < 1: self.logger.warning('Time limit for metafeature calculation less ' 'than 1 seconds (%f). Skipping calculation ' 'of metafeatures for encoded dataset.', metafeature_calculation_time_limit) meta_features_encoded = None else: self.datamanager.perform1HotEncoding() meta_features_encoded = \ self._calculate_metafeatures_encoded_with_limits( metafeature_calculation_time_limit) # In case there is a problem calculating the encoded meta-features if meta_features is None: if meta_features_encoded is not None: meta_features = meta_features_encoded else: if meta_features_encoded is not None: meta_features.metafeature_values.update( meta_features_encoded.metafeature_values) if meta_features is not None: meta_base.add_dataset(instance_id, meta_features) # Do mean imputation of the meta-features - should be done specific # for each prediction model! 
all_metafeatures = meta_base.get_metafeatures( features=list(meta_features.keys())) all_metafeatures.fillna(all_metafeatures.mean(), inplace=True) metalearning_configurations = self.collect_metalearning_suggestions( meta_base) if metalearning_configurations is None: metalearning_configurations = [] self.reset_data_manager() self.logger.info('%s', meta_features) # Convert meta-features into a dictionary because the scenario # expects a dictionary meta_features_dict = {} for dataset, series in all_metafeatures.iterrows(): meta_features_dict[dataset] = series.values meta_features_list = [] for meta_feature_name in all_metafeatures.columns: meta_features_list.append(meta_features[meta_feature_name].value) meta_features_list = np.array(meta_features_list).reshape((1, -1)) self.logger.info(list(meta_features_dict.keys())) meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric]) meta_runs_index = 0 try: meta_durations = meta_base.get_all_runs('runtime') read_runtime_data = True except KeyError: read_runtime_data = False self.logger.critical('Cannot read runtime data.') if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' for meta_dataset in meta_runs.index: meta_dataset_start_index = meta_runs_index for meta_configuration in meta_runs.columns: if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]): try: config = meta_base.get_configuration_from_algorithm_index( meta_configuration) cost = meta_runs.loc[meta_dataset, meta_configuration] if read_runtime_data: runtime = meta_durations.loc[meta_dataset, meta_configuration] else: runtime = 1 # TODO read out other status types! meta_runhistory.add(config, cost, runtime, StatusType.SUCCESS, instance_id=meta_dataset) meta_runs_index += 1 except: # TODO maybe add warning pass meta_runs_dataset_indices[meta_dataset] = ( meta_dataset_start_index, meta_runs_index) else: if self.acquisition_function == 'EIPS': self.logger.critical('Reverting to acquisition function EI!') self.acquisition_function = 'EI' meta_features_list = [] meta_features_dict = {} metalearning_configurations = [] self.scenario = AutoMLScenario(self.config_space, self.total_walltime_limit, self.func_eval_time_limit, meta_features_dict, self.tmp_dir, self.shared_mode) types = get_types(self.config_space, self.scenario.feature_array) if self.acquisition_function == 'EI': rh2EPM = RunHistory2EPM4Cost(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = RandomForestWithInstances(types, instance_features=meta_features_list, seed=1, num_trees=10) smac = SMBO(self.scenario, model=model, rng=seed) elif self.acquisition_function == 'EIPS': rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, scenario=self.scenario, success_states=None, impute_censored_data=False, impute_state=None) model = UncorrelatedMultiObjectiveRandomForestWithInstances( ['cost', 'runtime'], types, num_trees = 10, instance_features=meta_features_list, seed=1) acquisition_function = EIPS(model) smac = SMBO(self.scenario, acquisition_function=acquisition_function, model=model, runhistory2epm=rh2EPM, rng=seed) else: raise ValueError('Unknown acquisition function value %s!' 
% self.acquisition_function) # Build a runtime model # runtime_rf = RandomForestWithInstances(types, # instance_features=meta_features_list, # seed=1, num_trees=10) # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params, # scenario=self.scenario, # success_states=None, # impute_censored_data=False, # impute_state=None) # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory) # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten()) X_meta, Y_meta = rh2EPM.transform(meta_runhistory) # Transform Y_meta on a per-dataset base for meta_dataset in meta_runs_dataset_indices: start_index, end_index = meta_runs_dataset_indices[meta_dataset] end_index += 1 # Python indexing Y_meta[start_index:end_index, 0]\ [Y_meta[start_index:end_index, 0] >2.0] = 2.0 dataset_minimum = np.min(Y_meta[start_index:end_index, 0]) Y_meta[start_index:end_index, 0] = 1 - ( (1. - Y_meta[start_index:end_index, 0]) / (1. - dataset_minimum)) Y_meta[start_index:end_index, 0]\ [Y_meta[start_index:end_index, 0] > 2] = 2 # == first, evaluate all metelearning and default configurations for i, next_config in enumerate(([default_cfg] + metalearning_configurations)): # Do not evaluate default configurations more than once if i >= len([default_cfg]) and next_config in [default_cfg]: continue config_name = 'meta-learning' if i >= len([default_cfg]) \ else 'default' self.logger.info("Starting to evaluate %d. configuration " "(%s configuration) with time limit %ds.", num_run, config_name, self.func_eval_time_limit) self.logger.info(next_config) self.reset_data_manager() info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, self.func_eval_time_limit) (duration, result, _, additional_run_info, status) = info run_history.add(config=next_config, cost=result, time=duration , status=status, instance_id=instance_id, seed=seed) run_history.update_cost(next_config, result) self.logger.info("Finished evaluating %d. configuration. " "Duration %f; loss %f; status %s; additional run " "info: %s ", num_run, duration, result, str(status), additional_run_info) num_run += 1 if smac.incumbent is None: smac.incumbent = next_config elif result < run_history.get_cost(smac.incumbent): smac.incumbent = next_config if self.scenario.shared_model: pSMAC.write(run_history=run_history, output_directory=self.scenario.output_dir, num_run=self.seed) # == after metalearning run SMAC loop smac.runhistory = run_history smac_iter = 0 finished = False while not finished: if self.scenario.shared_model: pSMAC.read(run_history=run_history, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) next_configs = [] time_for_choose_next = -1 try: X_cfg, Y_cfg = rh2EPM.transform(run_history) if not run_history.empty(): # Update costs by normalization dataset_minimum = np.min(Y_cfg[:, 0]) Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) / (1. - dataset_minimum)) Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2 if len(X_meta) > 0 and len(X_cfg) > 0: pass #X_cfg = np.concatenate((X_meta, X_cfg)) #Y_cfg = np.concatenate((Y_meta, Y_cfg)) elif len(X_meta) > 0: X_cfg = X_meta.copy() Y_cfg = Y_meta.copy() elif len(X_cfg) > 0: X_cfg = X_cfg.copy() Y_cfg = Y_cfg.copy() else: raise ValueError('No training data for SMAC random forest!') self.logger.info('Using %d training points for SMAC.' 
% X_cfg.shape[0]) choose_next_start_time = time.time() next_configs_tmp = smac.choose_next(X_cfg, Y_cfg, num_interleaved_random=110, num_configurations_by_local_search=10, num_configurations_by_random_search_sorted=100) time_for_choose_next = time.time() - choose_next_start_time self.logger.info('Used %g seconds to find next ' 'configurations' % (time_for_choose_next)) next_configs.extend(next_configs_tmp) # TODO put Exception here! except Exception as e: self.logger.error(e) self.logger.error("Error in getting next configurations " "with SMAC. Using random configuration!") next_config = self.config_space.sample_configuration() next_configs.append(next_config) models_fitted_this_iteration = 0 start_time_this_iteration = time.time() for next_config in next_configs: x_runtime = impute_inactive_values(next_config) x_runtime = impute_inactive_values(x_runtime).get_array() # predicted_runtime = runtime_rf.predict_marginalized_over_instances( # x_runtime.reshape((1, -1))) # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1 self.logger.info("Starting to evaluate %d. configuration (from " "SMAC) with time limit %ds.", num_run, self.func_eval_time_limit) self.logger.info(next_config) self.reset_data_manager() info = eval_with_limits(self.datamanager, self.tmp_dir, next_config, seed, num_run, self.resampling_strategy, self.resampling_strategy_args, self.memory_limit, self.func_eval_time_limit) (duration, result, _, additional_run_info, status) = info run_history.add(config=next_config, cost=result, time=duration , status=status, instance_id=instance_id, seed=seed) run_history.update_cost(next_config, result) #self.logger.info('Predicted runtime %g, true runtime %g', # predicted_runtime, duration) # TODO add unittest to make sure everything works fine and # this does not get outdated! if smac.incumbent is None: smac.incumbent = next_config elif result < run_history.get_cost(smac.incumbent): smac.incumbent = next_config self.logger.info("Finished evaluating %d. configuration. " "Duration: %f; loss: %f; status %s; additional " "run info: %s ", num_run, duration, result, str(status), additional_run_info) smac_iter += 1 num_run += 1 models_fitted_this_iteration += 1 time_used_this_iteration = time.time() - start_time_this_iteration if models_fitted_this_iteration >= 2 and \ time_for_choose_next > 0 and \ time_used_this_iteration > time_for_choose_next: break elif time_for_choose_next <= 0 and \ models_fitted_this_iteration >= 1: break elif models_fitted_this_iteration >= 50: break if max_iters is not None: finished = (smac_iter < max_iters) if self.scenario.shared_model: pSMAC.write(run_history=run_history, output_directory=self.scenario.output_dir, num_run=self.seed)
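# The inner loop over next_configs above is budgeted by three break
# conditions: once a choose-next time is known, at least two models are
# fitted and evaluation continues until it has cost at least as much wall
# time as choosing did; without such a timing, a single model per iteration
# suffices; and at most 50 models are fitted per iteration. A minimal sketch
# of that policy with a hypothetical helper name:
import time

def should_stop_fitting(models_fitted, start_time, time_for_choose_next,
                        max_models_per_iteration=50):
    """Mirror of the break conditions at the bottom of the SMAC loop above."""
    time_used = time.time() - start_time
    if (models_fitted >= 2 and time_for_choose_next > 0
            and time_used > time_for_choose_next):
        return True
    if time_for_choose_next <= 0 and models_fitted >= 1:
        return True
    if models_fitted >= max_models_per_iteration:
        return True
    return False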