import os
import sys
import time
import pickle
import traceback

import numpy as np

# NOTE: project-specific names (load_train_test_data, FirstLayerBandit, make_scorer,
# balanced_accuracy, is_unbalanced_dataset, AlgorithmAdvisor, RankNetAdvisor, DataNode,
# get_metric, setup_logger, get_logger, and the task/algorithm constants) are assumed
# to be imported from solnml at module level, and the benchmark globals hmab_flag,
# opt_algo, per_run_time_limit and project_dir to be defined elsewhere in each script.


def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))
    _start_time = time.time()

    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    # Rebalance the training set with SMOTE when the class distribution is skewed.
    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')
    bandit.optimize()
    time_taken = time.time() - _start_time
    model_desc = [bandit.nbest_algo_ids, bandit.optimal_algo_id,
                  bandit.final_rewards, bandit.action_sequence]

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # Refit on the full training set and evaluate the ensemble as well.
    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [dataset, validation_accuracy, test_accuracy,
            test_accuracy_with_ens, time_taken, model_desc]
    print(model_desc)
    print(data)

    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    # Variant with meta-learning based algorithm recommendation: the meta-learner is
    # trained with the target dataset held out (leave-one-out), then recommends a
    # subset of candidate algorithms for the bandit to explore.
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))
    exclude_datasets = ['gina_prior2', 'pc2', 'abalone', 'wind', 'waveform-5000(2)',
                        'page-blocks(1)', 'winequality_white', 'pollen']
    alad = AlgorithmAdvisor(task_type=MULTICLASS_CLS, n_algorithm=9,
                            metric='bal_acc', exclude_datasets=exclude_datasets)
    n_algo = 5
    assert dataset in exclude_datasets
    meta_infos = alad.fit_meta_learner()
    # Sanity check: the target dataset must not leak into the meta-learner's training data.
    assert dataset not in meta_infos
    model_candidates = alad.fetch_algorithm_set(dataset)
    include_models = list()
    print(model_candidates)
    for algo in model_candidates:
        if algo in algorithms and len(include_models) < n_algo:
            include_models.append(algo)
    print('After algorithm recommendation', include_models)

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type, trial_num, include_models, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')
    bandit.optimize()
    time_taken = time.time() - _start_time
    model_desc = [bandit.nbest_algo_ids, bandit.optimal_algo_id,
                  bandit.final_rewards, bandit.action_sequence]

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [dataset, validation_accuracy, test_accuracy,
            test_accuracy_with_ens, time_taken, model_desc]
    print(model_desc)
    print(data)

    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    # Variant that records the per-iteration experimental output of the feature
    # engineering (fe) and hyperparameter optimization (hpo) sub-bandits.
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))
    _start_time = time.time()

    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='partial')
    # while time.time() - _start_time < time_limit:
    #     bandit.sub_bandits['random_forest'].optimizer['fe'].iterate()
    #     print(bandit.sub_bandits['random_forest'].optimizer['hpo'].exp_output)
    bandit.optimize()
    fe_exp_output = bandit.sub_bandits['random_forest'].optimizer['fe'].exp_output
    hpo_exp_output = bandit.sub_bandits['random_forest'].optimizer['hpo'].exp_output

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    # Note: the raw start timestamp is stored here, not the elapsed time.
    data = [dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens,
            fe_exp_output, hpo_exp_output, _start_time]
    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)

    # Clean up the per-run log files.
    del_path = './logs/'
    for name in os.listdir(del_path):
        file_path = os.path.join(del_path, name)
        if os.path.isfile(file_path):
            os.remove(file_path)
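# A minimal driver sketch (not part of the original scripts): the evaluate_hmab
# variants above depend on module-level globals. The names and values below
# (hmab_flag, opt_algo, per_run_time_limit, project_dir, the algorithm list and
# the dataset) are illustrative assumptions, not taken from the source.
if __name__ == '__main__':
    hmab_flag = 'hmab'                 # assumed experiment tag
    opt_algo = 'alter_hpo'             # assumed inner optimization strategy id
    per_run_time_limit = 150           # assumed per-evaluation budget in seconds
    project_dir = './exp_results/'
    os.makedirs(project_dir, exist_ok=True)

    algorithms = ['random_forest', 'libsvm_svc', 'extra_trees']
    for run_id in range(3):
        evaluate_hmab(algorithms, dataset='pc2', run_id=run_id,
                      trial_num=50, seed=1, time_limit=1200)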
class AutoML(object):
    def __init__(self, time_limit=300,
                 dataset_name='default_name',
                 amount_of_resource=None,
                 task_type=None,
                 metric='bal_acc',
                 include_algorithms=None,
                 ensemble_method='ensemble_selection',
                 enable_meta_algorithm_selection=True,
                 per_run_time_limit=150,
                 ensemble_size=50,
                 evaluation='holdout',
                 output_dir="logs",
                 logging_config=None,
                 random_state=1,
                 n_jobs=1):
        self.metric_id = metric
        self.metric = get_metric(self.metric_id)
        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.seed = random_state
        self.per_run_time_limit = per_run_time_limit
        self.output_dir = output_dir
        self.logging_config = logging_config
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)
        self.logger = self._get_logger(self.dataset_name)

        self.evaluation_type = evaluation
        self.amount_of_resource = amount_of_resource
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
        self.task_type = task_type
        self.n_jobs = n_jobs
        self.solver = None

        if include_algorithms is not None:
            self.include_algorithms = include_algorithms
        else:
            if task_type in CLS_TASKS:
                self.include_algorithms = list(classification_algorithms)
            elif task_type in REG_TASKS:
                self.include_algorithms = list(regression_algorithms)
            else:
                raise ValueError("Unknown task type %s" % task_type)
        if ensemble_method is not None and ensemble_method not in ensemble_list:
            raise ValueError("%s is not supported for ensemble!" % ensemble_method)

    def _get_logger(self, name):
        logger_name = 'SolnML-%s(%d)' % (name, self.seed)
        setup_logger(
            os.path.join(self.output_dir, '%s.log' % str(logger_name)),
            self.logging_config,
        )
        return get_logger(logger_name)

    def fit(self, train_data: DataNode, dataset_id=None):
        """
        This function includes the following two procedures:
            1. tune each algorithm's hyperparameters;
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        if self.enable_meta_algorithm_selection:
            try:
                alad = AlgorithmAdvisor(task_type=self.task_type,
                                        n_algorithm=9,
                                        metric=self.metric_id)
                n_algo = 5
                model_candidates = alad.fetch_algorithm_set(train_data, dataset_id=dataset_id)
                include_models = list()
                for algo in model_candidates:
                    if algo in self.include_algorithms and len(include_models) < n_algo:
                        include_models.append(algo)
                self.include_algorithms = include_models
                self.logger.info('Executing meta-learning based algorithm recommendation!')
                self.logger.info('Algorithms recommended: %s' % ','.join(self.include_algorithms))
            except Exception as e:
                self.logger.error("Meta-learning failed: %s" % str(e))

        # Check whether this dataset is balanced or not.
        if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data):
            # self.include_algorithms = imb_classication_algorithms
            self.logger.info('Input dataset is imbalanced!')
            train_data = DataBalancer().operate(train_data)
        if self.amount_of_resource is None:
            trial_num = len(self.include_algorithms) * 30
        else:
            trial_num = self.amount_of_resource

        self.solver = FirstLayerBandit(self.task_type, trial_num,
                                       self.include_algorithms, train_data,
                                       per_run_time_limit=self.per_run_time_limit,
                                       dataset_name=self.dataset_name,
                                       ensemble_method=self.ensemble_method,
                                       ensemble_size=self.ensemble_size,
                                       inner_opt_algorithm='fixed',
                                       metric=self.metric,
                                       fe_algo='bo',
                                       seed=self.seed,
                                       time_limit=self.time_limit,
                                       eval_type=self.evaluation_type,
                                       output_dir=self.output_dir)
        self.solver.optimize()

    def refit(self):
        self.solver.refit()

    def predict_proba(self, test_data: DataNode):
        return self.solver.predict_proba(test_data)

    def predict(self, test_data: DataNode):
        return self.solver.predict(test_data)

    def score(self, test_data: DataNode, metric_func=None):
        if metric_func is None:
            metric_func = self.metric
        return metric_func(self, test_data, test_data.data[1])

    def get_ens_model_info(self):
        if self.ensemble_method is not None:
            return self.solver.es.get_ens_model_info()
        else:
            return None
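# A minimal usage sketch for the AutoML facade above (illustrative only). It
# assumes load_train_test_data and MULTICLASS_CLS are importable as in the
# benchmark scripts; the dataset name 'pc2' is a placeholder.
train_data, test_data = load_train_test_data('pc2', task_type=MULTICLASS_CLS)
automl = AutoML(time_limit=600,
                dataset_name='pc2',
                task_type=MULTICLASS_CLS,
                metric='bal_acc',
                ensemble_method='ensemble_selection',
                evaluation='holdout')
automl.fit(train_data)
predictions = automl.predict(test_data)
print('test score: %.4f' % automl.score(test_data))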
class AutoML(object):
    def __init__(self, time_limit=300,
                 dataset_name='default_name',
                 amount_of_resource=None,
                 task_type=None,
                 metric='bal_acc',
                 include_algorithms=None,
                 include_preprocessors=None,
                 ensemble_method='ensemble_selection',
                 enable_meta_algorithm_selection=True,
                 enable_fe=True,
                 per_run_time_limit=150,
                 ensemble_size=50,
                 evaluation='holdout',
                 output_dir="logs",
                 logging_config=None,
                 random_state=1,
                 n_jobs=1):
        self.metric_id = metric
        self.metric = get_metric(self.metric_id)
        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.seed = random_state
        self.per_run_time_limit = per_run_time_limit
        self.output_dir = output_dir
        self.logging_config = logging_config
        self.logger = self._get_logger(self.dataset_name)

        self.evaluation_type = evaluation
        self.include_preprocessors = include_preprocessors
        self.amount_of_resource = amount_of_resource
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.enable_meta_algorithm_selection = enable_meta_algorithm_selection
        self.enable_fe = enable_fe
        self.task_type = task_type
        self.n_jobs = n_jobs
        self.solver = None

        # Disable meta-learning when a custom preprocessor set is specified.
        if self.include_preprocessors is not None:
            self.enable_meta_algorithm_selection = False

        if include_algorithms is not None:
            self.include_algorithms = include_algorithms
        else:
            if task_type in CLS_TASKS:
                if task_type in [IMG_CLS, TEXT_CLS]:
                    raise ValueError('Please use the AutoDL module instead of AutoML.')
                else:
                    self.include_algorithms = list(classification_algorithms)
            elif task_type in RGS_TASKS:
                self.include_algorithms = list(regression_algorithms)
            else:
                raise ValueError("Unknown task type %s" % task_type)
        if ensemble_method is not None and ensemble_method not in ensemble_list:
            raise ValueError("%s is not supported for ensemble!" % ensemble_method)

    def _get_logger(self, name):
        logger_name = 'SolnML-%s(%d)' % (name, self.seed)
        setup_logger(
            os.path.join(self.output_dir, '%s.log' % str(logger_name)),
            self.logging_config,
        )
        return get_logger(logger_name)

    def fit(self, train_data: DataNode, **kwargs):
        """
        This function includes the following two procedures:
            1. tune each algorithm's hyperparameters;
            2. engineer each algorithm's features automatically.
        :param train_data:
        :return:
        """
        # Check whether this dataset is balanced or not.
        # if self.task_type in CLS_TASKS and is_imbalanced_dataset(train_data):
        #     self.logger.info('Input dataset is imbalanced!')
        #     train_data = DataBalancer().operate(train_data)
        dataset_id = kwargs.get('dataset_id', None)
        inner_opt_algorithm = kwargs.get('opt_strategy', 'alter_hpo')
        self.logger.info('Optimization algorithm in 2nd bandit: %s' % inner_opt_algorithm)

        if self.enable_meta_algorithm_selection:
            try:
                n_algo_recommended = 5
                meta_datasets = kwargs.get('meta_datasets', None)
                self.logger.info('Executing Meta-Learning based Algorithm Recommendation.')
                alad = RankNetAdvisor(task_type=self.task_type,
                                      n_algorithm=n_algo_recommended,
                                      metric=self.metric_id)
                alad.fit()
                model_candidates = alad.fetch_algorithm_set(dataset_id, datanode=train_data)
                include_models = list()
                for algo in model_candidates:
                    if algo in self.include_algorithms and len(include_models) < n_algo_recommended:
                        include_models.append(algo)
                # if 'logistic_regression' in include_models:
                #     include_models.remove('logistic_regression')
                # if 'adaboost' not in include_models:
                #     include_models.append('adaboost')
                # include_models = ['extra_trees', 'adaboost', 'liblinear_svc',
                #                   'random_forest', 'libsvm_svc', 'lightgbm']
                self.include_algorithms = include_models
                self.logger.info('Final Algorithms Recommended: [%s]' % ','.join(self.include_algorithms))
            except Exception as e:
                self.logger.error('Meta-Learning based Algorithm Recommendation FAILED: %s.' % str(e))
                traceback.print_exc(file=sys.stdout)

        self.solver = FirstLayerBandit(self.task_type, self.amount_of_resource,
                                       self.include_algorithms, train_data,
                                       include_preprocessors=self.include_preprocessors,
                                       per_run_time_limit=self.per_run_time_limit,
                                       dataset_name=self.dataset_name,
                                       ensemble_method=self.ensemble_method,
                                       ensemble_size=self.ensemble_size,
                                       inner_opt_algorithm=inner_opt_algorithm,
                                       metric=self.metric,
                                       enable_fe=self.enable_fe,
                                       fe_algo='bo',
                                       seed=self.seed,
                                       time_limit=self.time_limit,
                                       eval_type=self.evaluation_type,
                                       output_dir=self.output_dir)
        self.solver.optimize()

    def refit(self):
        self.solver.refit()

    def predict_proba(self, test_data: DataNode):
        return self.solver.predict_proba(test_data)

    def predict(self, test_data: DataNode):
        return self.solver.predict(test_data)

    def score(self, test_data: DataNode, metric_func=None):
        if metric_func is None:
            metric_func = self.metric
        return metric_func(self, test_data, test_data.data[1])

    def get_ens_model_info(self):
        if self.ensemble_method is not None:
            return self.solver.es.get_ens_model_info()
        else:
            return None

    def get_val_stats(self):
        return self.solver.get_stats()
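# A usage sketch for the revised AutoML class (illustrative only). fit() reads
# two optional kwargs: dataset_id, which keys the RankNet advisor's lookup, and
# opt_strategy, which selects the second-layer bandit's optimization algorithm
# ('alter_hpo' is the default in fit()). The dataset name 'kc1' is a placeholder.
train_data, test_data = load_train_test_data('kc1', task_type=MULTICLASS_CLS)
automl = AutoML(time_limit=600,
                dataset_name='kc1',
                task_type=MULTICLASS_CLS,
                amount_of_resource=100,
                include_preprocessors=None,   # keep None so meta-learning stays enabled
                enable_fe=True)
automl.fit(train_data, dataset_id='kc1', opt_strategy='alter_hpo')
proba = automl.predict_proba(test_data)
print(automl.get_val_stats())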