def tpe_objective_function(config):
    metric = get_metric('bal_acc')
    _, estimator = get_estimator(config)
    X_train, y_train = train_data.data
    X_test, y_test = test_data.data
    estimator.fit(X_train, y_train)
    # Return the negated score: the optimizer minimizes, while balanced accuracy is maximized.
    return -metric(estimator, X_test, y_test)

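# --- Hedged illustration (not part of the original script) ---
# A minimal, self-contained sketch of the same "negated scorer" pattern using
# plain scikit-learn. get_estimator, train_data and test_data from the snippet
# above are replaced by hypothetical stand-ins; only the objective shape is shown.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import get_scorer
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(*load_iris(return_X_y=True), random_state=1)
bal_acc_scorer = get_scorer('balanced_accuracy')  # callable(estimator, X, y) -> score

def toy_objective(n_estimators):
    # Lower return value corresponds to better held-out balanced accuracy.
    clf = RandomForestClassifier(n_estimators=n_estimators, random_state=1).fit(X_tr, y_tr)
    return -bal_acc_scorer(clf, X_te, y_te)

print(toy_objective(50))
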
def evaluate_ml_algorithm(dataset, algo, obj_metric, seed=1, task_type=None):
    print('EVALUATE-%s-%s-%s' % (dataset, algo, obj_metric))
    train_data = load_data(dataset, task_type=task_type, datanode_returned=True)
    print(set(train_data.data[1]))
    metric = get_metric(obj_metric)

    cs = _classifiers[algo].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    hpo_evaluator = ClassificationEvaluator(default_hpo_config,
                                            scorer=metric,
                                            data_node=train_data,
                                            name='hpo',
                                            resampling_strategy='holdout',
                                            seed=seed)
    hpo_optimizer = SMACOptimizer(evaluator=hpo_evaluator,
                                  config_space=cs,
                                  per_run_time_limit=600,
                                  per_run_mem_limit=5120,
                                  output_dir='./logs',
                                  trials_per_iter=args.iter)
    hpo_optimizer.iterate()

    hpo_eval_dict = dict()
    for key, value in hpo_optimizer.eval_dict.items():
        hpo_eval_dict[key[1]] = value

    save_path = save_dir + '%s-%s-%s-hpo.pkl' % (dataset, algo, obj_metric)
    with open(save_path, 'wb') as f:
        pickle.dump(hpo_eval_dict, f)

def __init__(self, time_limit=300,
             dataset_name='default_name',
             amount_of_resource=None,
             task_type=None,
             metric='bal_acc',
             include_algorithms=None,
             ensemble_method='ensemble_selection',
             enable_meta_algorithm_selection=True,
             enable_fe=True,
             per_run_time_limit=150,
             ensemble_size=50,
             evaluation='holdout',
             output_dir="logs",
             logging_config=None,
             random_state=1,
             n_jobs=1):
    self.metric_id = metric
    self.metric = get_metric(self.metric_id)
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.seed = random_state
    self.per_run_time_limit = per_run_time_limit
    self.output_dir = output_dir
    self.logging_config = logging_config
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    self.logger = self._get_logger(self.dataset_name)

    self.evaluation_type = evaluation
    self.amount_of_resource = amount_of_resource
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
    self.enable_fe = enable_fe
    self.task_type = task_type
    self.n_jobs = n_jobs
    self.solver = None

    if include_algorithms is not None:
        self.include_algorithms = include_algorithms
    else:
        if task_type in CLS_TASKS:
            if task_type in [IMG_CLS, TEXT_CLS]:
                raise ValueError('Please use the AutoDL module instead of AutoML.')
            else:
                self.include_algorithms = list(classification_algorithms)
        elif task_type in REG_TASKS:
            self.include_algorithms = list(regression_algorithms)
        else:
            raise ValueError("Unknown task type %s" % task_type)

    if ensemble_method is not None and ensemble_method not in ensemble_list:
        raise ValueError("%s is not supported for ensemble!" % ensemble_method)

def __init__(self, estimator, metric, task_type, evaluation_strategy, **evaluation_params):
    self.estimator = estimator
    if task_type not in TASK_TYPES:
        raise ValueError('Unsupported task type: %s' % task_type)
    self.metric = get_metric(metric)
    self.evaluation_strategy = evaluation_strategy
    self.evaluation_params = evaluation_params
    if self.evaluation_strategy == 'holdout':
        if 'train_size' not in self.evaluation_params:
            # The original snippet is truncated here with a bare lookup, which would
            # raise KeyError; an assumed default split ratio is assigned instead.
            self.evaluation_params['train_size'] = 0.8

def evaluate_ml_algorithm(dataset, algo, run_id, obj_metric, total_resource=20, seed=1, task_type=None):
    print('EVALUATE-%s-%s-%s: run_id=%d' % (dataset, algo, obj_metric, run_id))
    train_data, test_data = load_train_test_data(dataset, task_type=task_type)
    if task_type in CLS_TASKS:
        task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    print(set(train_data.data[1]))

    metric = get_metric(obj_metric)
    bandit = SecondLayerBandit(task_type, algo, train_data, metric,
                               per_run_time_limit=300,
                               seed=seed,
                               eval_type='holdout',
                               fe_algo='bo',
                               total_resource=total_resource)
    bandit.optimize_fixed_pipeline()

    val_score = bandit.incumbent_perf
    best_config = bandit.inc['hpo']

    fe_optimizer = bandit.optimizer['fe']
    fe_optimizer.fetch_nodes(10)
    best_data_node = fe_optimizer.incumbent
    test_data_node = fe_optimizer.apply(test_data, best_data_node)

    estimator = fetch_predict_estimator(task_type, best_config,
                                        best_data_node.data[0],
                                        best_data_node.data[1],
                                        weight_balance=best_data_node.enable_balance,
                                        data_balance=best_data_node.data_balance)
    score = metric(estimator, test_data_node.data[0], test_data_node.data[1]) * metric._sign
    print('Test score', score)

    save_path = save_dir + '%s-%s-%s-%d-%d.pkl' % (dataset, algo, obj_metric, run_id, total_resource)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, algo, score, val_score, task_type], f)

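# --- Hedged note (not part of the original script) ---
# The multiplication by `metric._sign` above follows the scikit-learn scorer
# convention: loss-type metrics are negated internally so that "greater is
# better" always holds, and the stored sign (a private attribute) recovers the
# raw metric value. Illustration with plain scikit-learn scorers:
from sklearn.metrics import get_scorer

print(get_scorer('accuracy')._sign)      # 1: already greater-is-better
print(get_scorer('neg_log_loss')._sign)  # -1: log-loss is negated by the scorer
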
def __init__(self, task_type=CLASSIFICATION,
             optimizer_type='eval_base',
             metric='acc',
             trans_set=None,
             time_budget=None,
             maximum_evaluation_num=None,
             time_limit_per_trans=600,
             mem_limit_per_trans=1024,
             fe_enabled=True,
             evaluator=None,
             debug=False,
             seed=1,
             tmp_directory='logs',
             logging_config=None,
             model_id=None,
             task_id='Default'):
    self.fe_enabled = fe_enabled
    self.trans_set = trans_set
    self.maximum_evaluation_num = maximum_evaluation_num
    self.time_budget = time_budget
    self.time_limit_per_trans = time_limit_per_trans
    self.mem_limit_per_trans = mem_limit_per_trans
    self.optimizer_type = optimizer_type
    self.evaluator = evaluator
    self.optimizer = None
    self.metric = get_metric(metric)
    self.task_type = task_type
    self.task_id = task_id
    self.model_id = model_id
    self._seed = seed
    self.tmp_directory = tmp_directory
    self.logging_config = logging_config
    self._logger = self._get_logger(task_id)

    # Set up backend.
    if not os.path.exists(self.tmp_directory):
        os.makedirs(self.tmp_directory)

    # For data preprocessing.
    self.uninformative_columns, self.uninformative_idx = list(), list()
    self.variance_selector = None
    self.onehot_encoder = None
    self.label_encoder = None

def evaluate(dataset):
    train_data, test_data = load_train_test_data(dataset, test_size=0.3, task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    default_hpo_config = cs.get_default_configuration()
    metric = get_metric('bal_acc')

    fe_cs = get_task_hyperparameter_space(0, algo_name)
    default_fe_config = fe_cs.get_default_configuration()

    evaluator = ClassificationEvaluator(default_hpo_config, default_fe_config, algo_name,
                                        data_node=train_data,
                                        scorer=metric,
                                        name='hpo',
                                        resampling_strategy='holdout',
                                        output_dir='./data/exp_sys',
                                        seed=1)

    from solnml.components.optimizers.tlbo_optimizer import TlboOptimizer
    optimizer = TlboOptimizer(evaluator, cs, time_limit=300, name='hpo')
    optimizer.run()

def evaluate(mode, dataset, run_id, metric):
    print(mode, dataset, run_id, metric)
    metric = get_metric(metric)
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                           name='fe', resampling_strategy='holdout',
                                           seed=1)
    hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                            data_node=train_data, name='hpo',
                                            resampling_strategy='holdout',
                                            seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)

    def objective_function(config):
        if benchmark == 'fe':
            return fe_optimizer.evaluate_function(config)
        else:
            return hpo_evaluator(config)

    if mode == 'bo':
        bo = BO(objective_function, config_space, max_runs=max_runs, surrogate_model='prob_rf')
        bo.run()
        print('BO result')
        print(bo.get_incumbent())
        perf = bo.history_container.incumbent_value
        runs = [bo.configurations, bo.perfs]
    elif mode == 'lite_bo':
        from litebo.facade.bo_facade import BayesianOptimization
        bo = BayesianOptimization(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('BO result')
        print(bo.get_incumbent())
        perf = bo.history_container.incumbent_value
        runs = [bo.configurations, bo.perfs]
    elif mode.startswith('tlbo'):
        _, gp_fusion = mode.split('_')
        meta_feature_vec = metafeature_dict[dataset]
        past_datasets = test_datasets.copy()
        if dataset in past_datasets:
            past_datasets.remove(dataset)
        past_history = load_runhistory(past_datasets)
        gp_models = [gp_models_dict[dataset_name] for dataset_name in past_datasets]
        tlbo = TLBO(objective_function, config_space, past_history,
                    gp_models=gp_models,
                    dataset_metafeature=meta_feature_vec,
                    max_runs=max_runs,
                    gp_fusion=gp_fusion)
        tlbo.run()
        print('TLBO result')
        print(tlbo.get_incumbent())
        runs = [tlbo.configurations, tlbo.perfs]
        perf = tlbo.history_container.incumbent_value
    else:
        raise ValueError('Invalid mode.')

    file_saved = '%s_%s_%s_result_%d_%d_%s.pkl' % (mode, algo_name, dataset, max_runs, run_id, benchmark)
    with open(data_dir + file_saved, 'wb') as f:
        pk.dump([perf, runs], f)

    # Tail of get_pipeline_config_space; the enclosing def is not shown in this excerpt.
    cs.add_conditions(aug_space.get_conditions())

    for estimator_id in algorithm_candidates:
        sub_cs = get_model_config_space(estimator_id, include_estimator=False, include_aug=False)
        parent_hyperparameter = {'parent': estimator_choice, 'value': estimator_id}
        cs.add_configuration_space(estimator_id, sub_cs, parent_hyperparameter=parent_hyperparameter)
    return cs


cs = get_pipeline_config_space(['resnet34', 'mobilenet'])
dataset = 'cifar10'
data_dir = 'data/img_datasets/%s/' % dataset
image_data = ImageDataset(data_path=data_dir, train_val_split=True)
hpo_evaluator = DLEvaluator(cs.get_default_configuration(),
                            IMG_CLS,
                            scorer=get_metric('acc'),
                            dataset=image_data,
                            device='cuda',
                            image_size=32,
                            seed=1)
hpo_evaluator(cs.get_default_configuration())

def __init__(self, task_type, trial_num,
             classifier_ids: List[str], data: DataNode,
             metric='acc',
             ensemble_method='ensemble_selection',
             ensemble_size=10,
             per_run_time_limit=300,
             output_dir=None,
             dataset_name='default_dataset',
             eval_type='holdout',
             share_feature=False,
             inner_opt_algorithm='rb',
             fe_algo='bo',
             time_limit=None,
             n_jobs=1,
             seed=1):
    """
    :param classifier_ids: subset of
        {'adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees', 'gaussian_nb',
         'gradient_boosting', 'k_nearest_neighbors', 'lda', 'liblinear_svc', 'libsvm_svc',
         'multinomial_nb', 'passive_aggressive', 'qda', 'random_forest', 'sgd'}
    """
    self.timestamp = time.time()
    self.task_type = task_type
    self.metric = get_metric(metric)
    self.original_data = data.copy_()
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.trial_num = trial_num
    self.n_jobs = n_jobs
    self.alpha = 4
    self.B = 0.01
    self.seed = seed
    self.shared_mode = share_feature
    self.output_dir = output_dir
    np.random.seed(self.seed)

    # Best configuration.
    self.optimal_algo_id = None
    self.nbest_algo_ids = None
    self.best_lower_bounds = None
    self.es = None

    # Set up backend.
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Bandit settings.
    self.incumbent_perf = -1.
    self.arms = classifier_ids
    self.include_algorithms = classifier_ids
    self.rewards = dict()
    self.sub_bandits = dict()
    self.evaluation_cost = dict()
    self.fe_datanodes = dict()
    self.eval_type = eval_type
    self.fe_algo = fe_algo
    self.inner_opt_algorithm = inner_opt_algorithm

    for arm in self.arms:
        self.rewards[arm] = list()
        self.evaluation_cost[arm] = list()
        self.fe_datanodes[arm] = list()
        self.sub_bandits[arm] = SecondLayerBandit(
            self.task_type, arm, self.original_data,
            metric=self.metric,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            share_fe=self.shared_mode,
            seed=self.seed,
            eval_type=eval_type,
            dataset_id=dataset_name,
            n_jobs=self.n_jobs,
            fe_algo=fe_algo,
            mth=inner_opt_algorithm,
        )

    self.action_sequence = list()
    self.final_rewards = list()
    self.start_time = time.time()
    self.time_records = list()

def __init__(self, task_type, trial_num,
             classifier_ids: List[str], data: DataNode,
             include_preprocessors=None,
             time_limit=None,
             metric='acc',
             ensemble_method='ensemble_selection',
             ensemble_size=50,
             per_run_time_limit=300,
             output_dir="logs",
             dataset_name='default_dataset',
             eval_type='holdout',
             inner_opt_algorithm='fixed',
             enable_fe=True,
             fe_algo='bo',
             n_jobs=1,
             seed=1):
    """
    :param classifier_ids: subset of
        {'adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees', 'gaussian_nb',
         'gradient_boosting', 'k_nearest_neighbors', 'lda', 'liblinear_svc', 'libsvm_svc',
         'multinomial_nb', 'passive_aggressive', 'qda', 'random_forest', 'sgd'}
    """
    self.timestamp = time.time()
    self.task_type = task_type
    self.include_preprocessors = include_preprocessors
    self.metric = get_metric(metric)
    self.original_data = data.copy_()
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.trial_num = trial_num
    self.n_jobs = n_jobs
    self.alpha = 4
    self.seed = seed
    self.output_dir = output_dir
    self.early_stop_flag = False
    # np.random.seed(self.seed)

    # Best configuration.
    self.optimal_algo_id = None
    self.nbest_algo_ids = None
    self.best_lower_bounds = None
    self.es = None

    # Set up backend.
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Bandit settings.
    self.incumbent_perf = -float("INF")
    self.arms = classifier_ids
    self.include_algorithms = classifier_ids
    self.rewards = dict()
    self.sub_bandits = dict()
    self.evaluation_cost = dict()
    self.eval_type = eval_type
    self.enable_fe = enable_fe
    self.fe_algo = fe_algo
    self.inner_opt_algorithm = inner_opt_algorithm

    if not (self.time_limit is None) ^ (self.trial_num is None):
        raise ValueError('Please set one of time_limit or trial_num.')

    # Record the execution cost for each arm.
    self.arm_cost_stats = dict()
    for _arm in self.arms:
        self.arm_cost_stats[_arm] = list()

    for arm in self.arms:
        self.rewards[arm] = list()
        self.evaluation_cost[arm] = list()
        self.sub_bandits[arm] = SecondLayerBandit(
            self.task_type, arm, self.original_data,
            include_preprocessors=self.include_preprocessors,
            metric=self.metric,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            seed=self.seed,
            eval_type=eval_type,
            dataset_id=dataset_name,
            n_jobs=self.n_jobs,
            fe_algo=fe_algo,
            mth=self.inner_opt_algorithm,
            timestamp=self.timestamp)

    self.action_sequence = list()
    self.final_rewards = list()
    self.start_time = time.time()
    self.time_records = list()

def evaluate(mode, dataset, run_id, metric):
    print(mode, dataset, run_id, metric)
    metric = get_metric(metric)
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                           name='fe', resampling_strategy='holdout',
                                           seed=1)
    hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                            data_node=train_data, name='hpo',
                                            resampling_strategy='holdout',
                                            seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)

    def objective_function(config):
        if benchmark == 'fe':
            return fe_optimizer.evaluate_function(config)
        else:
            return hpo_evaluator(config)

    meta_feature_vec = metafeature_dict[dataset]
    past_datasets = test_datasets.copy()
    if dataset in past_datasets:
        past_datasets.remove(dataset)
    past_history = load_runhistory(past_datasets)

    tlbo = TLBO_AF(objective_function, config_space, past_history,
                   dataset_metafeature=meta_feature_vec,
                   max_runs=max_runs,
                   acq_method='taff2')
    tlbo.run()
    print('TLBO result')
    print(tlbo.get_incumbent())
    runs = [tlbo.configurations, tlbo.perfs]
    perf = tlbo.history_container.incumbent_value

    file_saved = '%s_%s_result_%d_%d_%s.pkl' % (mode, dataset, max_runs, run_id, benchmark)
    with open(data_dir + file_saved, 'wb') as f:
        pk.dump([perf, runs], f)

def __init__(self, time_limit=300,
             trial_num=None,
             dataset_name='default_name',
             task_type=IMG_CLS,
             metric='acc',
             include_algorithms=None,
             ensemble_method='ensemble_selection',
             ensemble_size=50,
             max_epoch=150,
             config_file_path=None,
             evaluation='holdout',
             logging_config=None,
             output_dir="logs/",
             random_state=1,
             n_jobs=1):
    from solnml.components.models.img_classification import _classifiers as _img_estimators, _addons as _img_addons
    from solnml.components.models.text_classification import _classifiers as _text_estimators, \
        _addons as _text_addons
    from solnml.components.models.object_detection import _classifiers as _od_estimators, _addons as _od_addons

    self.metric_id = metric
    self.metric = get_metric(self.metric_id)
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.termination_time = time.time() + self.time_limit
    self.trial_num = trial_num
    self.seed = random_state
    self.output_dir = output_dir
    if not os.path.exists(self.output_dir):
        os.makedirs(self.output_dir)
    self.logging_config = logging_config
    self.logger = self._get_logger(self.dataset_name)

    self.evaluation_type = evaluation
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.n_jobs = n_jobs
    self.config_file_path = config_file_path
    self.update_cs = dict()

    if include_algorithms is not None:
        self.include_algorithms = include_algorithms
    else:
        if task_type == IMG_CLS:
            self.include_algorithms = list(_img_estimators.keys())
        elif task_type == TEXT_CLS:
            self.include_algorithms = list(_text_estimators.keys())
        elif task_type == OBJECT_DET:
            self.include_algorithms = list(_od_estimators.keys())
        else:
            raise ValueError("Unknown task type %s" % task_type)

    if task_type == IMG_CLS:
        self._estimators = _img_estimators
        self._addons = _img_addons
    elif task_type == TEXT_CLS:
        self._estimators = _text_estimators
        self._addons = _text_addons
    elif task_type == OBJECT_DET:
        self._estimators = _od_estimators
        self._addons = _od_addons
    else:
        raise ValueError("Unknown task type %s" % task_type)

    if ensemble_method is not None and ensemble_method not in ensemble_list:
        raise ValueError("%s is not supported for ensemble!" % ensemble_method)

    self.es = None
    self.solvers = dict()
    self.evaluators = dict()
    # Single model.
    self.best_algo_id = None
    self.best_algo_config = None
    # Ensemble models.
    self.candidate_algo_ids = None
    self.device = 'cuda'

    # Neural architecture selection.
    self.nas_evaluator = None
    self.eval_hist_configs = dict()
    self.eval_hist_perfs = dict()

    self.max_epoch = max_epoch
    self.image_size = None

def __init__(self, node_list, node_index,
             task_type, timestamp,
             fe_config_space: ConfigurationSpace,
             cash_config_space: ConfigurationSpace,
             data: DataNode,
             fixed_config=None,
             trial_num=0,
             time_limit=None,
             metric='acc',
             ensemble_method='ensemble_selection',
             ensemble_size=50,
             per_run_time_limit=300,
             output_dir="logs",
             dataset_name='default_dataset',
             eval_type='holdout',
             resampling_params=None,
             n_jobs=1,
             seed=1):
    # Tree setting
    self.node_list = node_list
    self.node_index = node_index

    # Set up backend.
    self.dataset_name = dataset_name
    self.trial_num = trial_num
    self.time_limit = time_limit
    self.per_run_time_limit = per_run_time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Basic settings.
    self.eval_type = eval_type
    self.resampling_params = resampling_params
    self.task_type = task_type
    self.timestamp = timestamp
    self.fe_config_space = fe_config_space
    self.cash_config_space = cash_config_space
    self.fixed_config = fixed_config
    self.original_data = data.copy_()
    self.metric = get_metric(metric)
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.n_jobs = n_jobs
    self.seed = seed
    self.output_dir = output_dir

    self.early_stop_flag = False
    self.timeout_flag = False
    self.incumbent_perf = -float("INF")
    self.incumbent = None
    self.eval_dict = dict()

    if self.task_type in CLS_TASKS:
        self.if_imbal = is_imbalanced_dataset(self.original_data)
    else:
        self.if_imbal = False

    self.es = None

import argparse

from solnml.components.utils.constants import CLASSIFICATION, REGRESSION
from solnml.datasets.utils import load_train_test_data
from solnml.components.metrics.metric import get_metric
from solnml.components.evaluators.base_evaluator import fetch_predict_estimator
from solnml.components.evaluators.cls_evaluator import ClassificationEvaluator
from solnml.components.evaluators.rgs_evaluator import RegressionEvaluator

parser = argparse.ArgumentParser()
parser.add_argument('--datasets', type=str, default='diabetes')
parser.add_argument('--metrics', type=str, default='acc')
parser.add_argument('--task', type=str, choices=['reg', 'cls'], default='cls')
parser.add_argument('--output_dir', type=str, default='./data/fe_hpo_results')
args = parser.parse_args()

dataset_list = args.datasets.split(',')
metric = get_metric(args.metrics)
algorithms = ['lightgbm', 'random_forest', 'libsvm_svc', 'extra_trees',
              'liblinear_svc', 'k_nearest_neighbors', 'logistic_regression',
              'gradient_boosting', 'adaboost']

task = args.task
if task == 'cls':
    from solnml.components.models.classification import _classifiers
    _estimators = _classifiers
else:
    from solnml.components.models.regression import _regressors
    _estimators = _regressors

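# --- Hedged usage note (not part of the original script) ---
# Given the flags defined above, the script would typically be launched as, e.g.:
#   python this_script.py --datasets diabetes,credit --metrics acc --task cls
# (the script name is a placeholder; --datasets accepts a comma-separated list
#  because of the split(',') call above).
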
def evaluate(mth, dataset, run_id):
    print(mth, dataset, run_id)
    train_data, test_data = load_train_test_data(dataset, test_size=0.3, task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()
    metric = get_metric('bal_acc')

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                           name='fe', resampling_strategy='holdout',
                                           seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=MULTICLASS_CLS,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)
    config_space = fe_optimizer.hyperparameter_space

    def objective_function(config):
        return fe_optimizer.evaluate_function(config)

    if mth == 'gp_bo':
        bo = BO(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('new BO result')
        print(bo.get_incumbent())
        perf_bo = bo.history_container.incumbent_value
    elif mth == 'lite_bo':
        from litebo.facade.bo_facade import BayesianOptimization
        bo = BayesianOptimization(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('lite BO result')
        print(bo.get_incumbent())
        perf_bo = bo.history_container.incumbent_value
    elif mth == 'smac':
        from smac.scenario.scenario import Scenario
        from smac.facade.smac_facade import SMAC

        # Scenario object
        scenario = Scenario({"run_obj": "quality",
                             "runcount-limit": max_runs,
                             "cs": config_space,
                             "deterministic": "true"})
        smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
                    tae_runner=objective_function)
        incumbent = smac.optimize()
        perf_bo = objective_function(incumbent)
        print('SMAC BO result')
        print(perf_bo)
    else:
        raise ValueError('Invalid method.')

    return perf_bo