def softmax_method(data):
    """Do model selection with softmax method

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    """
    log = get_logger('softmax', 'log/sf/softmax.log', level=DEBUG)
    optimizations = _get_optimizations()
    model_selection = SoftMaxSelection(optimizations)

    log.info('Begin fitting on {}'.format(data.name))
    train_x, train_y = data.train_data()

    start = time.time()
    best_optimization = model_selection.fit(train_x, train_y, temperature=0.5, budget=BUDGET)
    elapsed = time.time() - start
    log.info('Fitting on {} is over, spend {}s'.format(data.name, elapsed))

    csv_file = 'log/sf/sf_{}.csv'.format(data.name)
    pkl_file = 'log/sf/sf_{}.pkl'.format(data.name)
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv_file, pkl_file, log)

def ucb_or_random_method(data, method):
    """Do model selection with the traditional UCB method or random selection

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    method: str
        model selection method, either 'ucb' or 'random'
    """
    log = get_logger(method, 'log/{}/{}.log'.format(method, method), level=DEBUG)
    optimizations = _get_optimizations()
    model_selection = BanditModelSelection(optimizations, method)

    log.info('Begin fitting on {}'.format(data.name))
    train_x, train_y = data.train_data()

    start = time.time()
    best_optimization = model_selection.fit(train_x, train_y, budget=BUDGET)
    log.info('Fitting on {} is done! Spend {}s'.format(data.name, time.time() - start))

    csv_file = 'log/{}/{}_{}.csv'.format(method, method, data.name)
    pkl_file = 'log/{}/{}_{}.pkl'.format(method, method, data.name)
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv_file, pkl_file, log)

def eg_method(data):
    """Do model selection with epsilon-greedy method

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    """
    log = get_logger('epsilon-greedy', 'log/eg/epsilon-greedy.log', level=DEBUG)
    optimizations = _get_optimizations()
    model_selection = EpsilonGreedySelection(optimizations)

    log.info('Begin fitting on {}'.format(data.name))
    train_x, train_y = data.train_data()

    start = time.time()
    best_optimization = model_selection.fit(train_x, train_y, budget=BUDGET)
    elapsed = time.time() - start
    log.info('Fitting on {} is over, spend {}s'.format(data.name, elapsed))

    csv_file = 'log/eg/eg_{}.csv'.format(data.name)
    pkl_file = 'log/eg/eg_{}.pkl'.format(data.name)
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv_file, pkl_file, log)

def single_arm_method(data, model_gen, budget=BUDGET):
    """Spend the whole budget on a single model (single-arm baseline)

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    model_gen: framework.base.ModelGenerator
        generator for the target model
    budget: int
        number of optimization steps
    """
    model_name = type(model_gen).__name__
    optimization = RacosOptimization(model_gen, model_name)
    train_x, train_y = data.train_data()

    logger = get_logger('single_arm', 'log/single/single_arm.log')
    logger.info(f'Begin to fit {data.name} using {model_name}')

    for i in range(budget):
        logger.info(f'Process: {i + 1}/{budget}')
        optimization.run_one_step(train_x, train_y)

    logger.info(f'Fitting on {data.name} using model {model_name} is over')
    optimization.instances.to_csv(f'log/single/single_{data.name}_{model_name}.csv')

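# Hypothetical usage sketch (not part of the original file): running the
# single-arm baseline once per candidate model, using the module-level
# `model_generators` list referenced by ground_truth_method below.
def run_all_single_arms(data):
    for model_gen in model_generators:
        single_arm_method(data, model_gen)
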
def run_extreme_bandit(data):
    """Do model selection with the Extreme Hunter (extreme bandit) method"""
    log = get_logger('extreme bandit', 'log/exb/exb.log', level=DEBUG)
    optimizations = _get_optimizations()
    model_selection = ExtremeHunter(optimizations)

    log.info('Begin fitting on {}'.format(data.name))
    train_x, train_y = data.train_data()
    best_optimization = model_selection.fit(train_x, train_y, budget=50)
    log.info('Fitting on {} is over'.format(data.name))

    csv_file = 'log/exh/exh_{}.csv'.format(data.name)
    # no pickle output for this method, so pass an empty pkl path
    # (matching the calling convention used by new_erucb_method)
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv_file, '', log)

def proposed_method(data, theta, gamma, beta, show_selection_detail=False):
    """Do model selection with proposed method

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    theta: float
        hyper-parameter passed to BanditModelSelection
    gamma: float
        hyper-parameter passed to BanditModelSelection
    beta: float
        hyper-parameter passed to BanditModelSelection
    show_selection_detail: bool
        if True, dump the per-round parameter-change records to csv
    """
    log_name = 'proposed-{}-{}'.format(theta, gamma)
    log = get_logger(log_name, 'log/proposed-new/' + log_name + '.log', level=DEBUG)
    optimizations = _get_optimizations()
    model_selection = BanditModelSelection(optimizations, 'new', theta=theta, gamma=gamma, beta=beta)

    log.info('Begin fitting on {}'.format(data.name))
    train_x, train_y = data.train_data()

    start = time.time()
    best_optimization = model_selection.fit(train_x, train_y, budget=BUDGET)

    # write parameter change information
    if show_selection_detail:
        with open('log/ps_{}_{}_{}.csv'.format(theta, gamma, data.name), 'a') as f:
            count = len(model_generators)
            for record in model_selection.param_change_info:
                f.write('t = {}'.format(count))
                record.to_csv(f, mode='a')
                f.write('\n\n')
                count += 1

    log.info('Fitting on {} is over, spend {}s'.format(data.name, time.time() - start))

    csv_file = 'log/proposed-new/proposed_{}_{}_{}_{}.csv'.format(theta, gamma, beta, data.name)
    pkl_file = 'log/proposed-new/proposed_{}_{}_{}_{}.pkl'.format(theta, gamma, beta, data.name)
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv_file, pkl_file, log)

def ground_truth_lab():
    """Search for the ground truth (best) model of every data set"""
    log = get_logger('gt', 'log/gt.log', level=INFO)
    for data in ALL_DATA:
        start = time.time()
        log.info('Start finding ground truth model for data set {}'.format(data.name))

        result = []
        for generator in model_generators:
            result.append(find_ground_truth(data, generator))

        df_result = pd.DataFrame(data=result, columns=['name', 'max', 'mean', 'std', 'best_model', 'time'])
        df_result = df_result.set_index(df_result['name'])
        best_model = df_result['max'].idxmax()

        # save to csv
        with open('log/gt_{}.csv'.format(data.name), 'a') as f:
            f.write('best is {}\n'.format(best_model))
            df_result.to_csv(f, mode='a')

        elapsed = time.time() - start
        log.info('g-test --- Fitting on {} is over, spend {}s'.format(data.name, elapsed))

def auto_sk_method(data, time_left):
    """Run auto-sklearn on one data set; return its best validation and test scores"""
    classifier = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_left,
        per_run_time_limit=300,
        exclude_estimators=exclude_estimators)
    logger = get_logger('log/auto_sk/auto_sk_{}'.format(data.name),
                        'log/auto_sk/auto_sk_{}.log'.format(data.name))

    train_x, train_y = data.train_data()
    logger.info('Start fitting on {}'.format(data.name))
    start = time.time()
    classifier.fit(train_x, train_y)

    # get best validation score
    idx_best_run = np.argmax(classifier.cv_results_['mean_test_score'])
    best_score = classifier.cv_results_['mean_test_score'][idx_best_run]

    # calculate test v
    test_x, test_y = data.test_data()
    y_hat = classifier.predict(test_x)
    test_v = accuracy_score(test_y, y_hat)

    # show result information
    logger.info('Fitting on {} is done, spend {}s'.format(data.name, time.time() - start))
    logger.info('Sprint statistics\n{}'.format(classifier.sprint_statistics()))
    logger.info('Test V is {}'.format(test_v))
    # logger.info('Show model:\n{}'.format(classifier.show_models()))

    # save cv results
    cv_result = pd.DataFrame.from_dict(classifier.cv_results_)
    cv_result.to_csv('log/auto_sk/auto_sk_cv_result_on_{}.csv'.format(data.name))
    cv_result.to_pickle('log/auto_sk/auto_sk_cv_result_on_{}.pkl'.format(data.name))

    return data.name, best_score, test_v

def auto_sk_lab(start, end):
    """Run auto-sklearn on the data sets indexed from `start` to `end`"""
    logger = get_logger('auto-sklearn-{}-{}'.format(start, end),
                        'log/auto_sk/auto-sk-{}-{}.log'.format(start, end))

    result = []
    data_sets = data_loader.data_for_auto_sklearn()[start:end]
    for (data, time_left) in data_sets:
        logger.info('Start fitting {}'.format(data.name))
        start_time = time.time()  # do not shadow the `start` index used in the file names below
        method_result = auto_sk_method(data, time_left)
        result.append(method_result)
        logger.info('Fitting on {} is over, spend {}s\n'
                    'result:\n'
                    '{}'.format(data.name, time.time() - start_time, method_result))

    df_result = pd.DataFrame(data=result, columns=['data set', 'best v', 'test v'])
    df_result.to_csv('log/auto_sk/auto-sk-{}to{}.csv'.format(start, end))
    df_result.to_pickle('log/auto_sk/auto-sk-{}to{}.pkl'.format(start, end))

def new_erucb_method(data, b=B):
    """Do model selection with the new ER-UCB method"""
    log_name = 'new-erucb'
    log = get_logger(log_name, 'log/proposed-new/' + log_name + '.log', level=DEBUG)
    model_selection = _get_model_selection(b)

    log.info('Begin fitting on {}'.format(data.name))
    train_x, train_y = data.train_data()

    start = time.time()
    best_optimization = model_selection.fit(train_x, train_y, budget=BUDGET)
    elapsed = time.time() - start
    log.info('Fitting on {} ends, spend {}s'.format(data.name, elapsed))

    for (prefix, param_info) in model_selection.param_change_info:
        assert isinstance(param_info, pd.DataFrame)
        with open('log/proposed-new/erucb-process-{}-{}.csv'.format(data.name, b), mode='a') as f:
            f.write(prefix)
            param_info.to_csv(f, mode='a')

    csv = 'log/proposed-new/new_erucb_{}_{}.csv'.format(data.name, b)
    return _get_test_result(best_optimization, data, model_selection.statistics(), csv, '', log)

def find_ground_truth(data, model_generator, budget=BUDGET):
    """Find the ground truth model for each data set

    Parameters
    ----------
    data: utils.data_loader.DataSet
        training data
    model_generator: framework.base.ModelGenerator
        generator for the target model
    budget: int
        number of samples

    Returns
    -------
    evaluation_result: (str, float, float, float, object, float)
        model name, best evaluation result, mean, standard deviation,
        the best model instance, and the elapsed time in seconds
    """
    train_x, train_y = data.train_data()
    model_name = type(model_generator).__name__

    start = time.time()
    log = get_logger('gt.model', '', level=INFO)
    log.info('{} --- {} start fitting'.format(data.name, model_name))

    # begin sampling
    result = random_search(model_generator, train_x, train_y, search_times=budget)

    best_result_index = result['Accuracy'].idxmax()
    best_result_params = result['Raw Parameters'][best_result_index]
    best_model = model_generator.generate_model(best_result_params)

    elapsed = time.time() - start
    log.info('{} --- {} end running, spend {}s'.format(data.name, model_name, elapsed))

    acc_column = result['Accuracy']
    return model_name, acc_column.max(), acc_column.mean(), acc_column.std(), best_model, elapsed

def ground_truth_method(data):
    """Split the budget evenly among all models and random-search each of them"""
    logger = get_logger('gt', 'log/ground/ground_truth.log', level=INFO)
    result = []
    budget_for_single_model = int(BUDGET / len(model_generators))

    logger.info('Begin fitting on {}'.format(data.name))
    start = time.time()
    for model_generator in model_generators:
        result.append(find_ground_truth(data, model_generator, budget_for_single_model))
    logger.info('Fitting on {} is over, spend {}s'.format(data.name, time.time() - start))

    df_result = pd.DataFrame(data=result, columns=['model', 'best v', 'mean', 'std', 'best model', 'time'])
    df_result.to_csv('log/ground/ground_{}.csv'.format(data.name))

    # get test v
    best_model_index = df_result['best v'].idxmax()
    best_model = df_result['best model'][best_model_index]
    test_v = _evaluate_test_v(data, best_model)
    logger.info('Test v of {} is {}'.format(data.name, test_v))

    return data.name, df_result['best v'].max(), type(best_model).__name__, test_v

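# ----------------------------------------------------------------------------
# Hypothetical driver sketch (not part of the original file): one way to chain
# the entry points above over every data set. The proposed_method
# hyper-parameter values below are illustrative placeholders, not the settings
# used in the original experiments.
def run_all_methods():
    results = []
    for data in ALL_DATA:
        results.append(softmax_method(data))
        results.append(eg_method(data))
        results.append(ucb_or_random_method(data, 'ucb'))
        results.append(ucb_or_random_method(data, 'random'))
        results.append(proposed_method(data, theta=0.1, gamma=0.1, beta=1.0))
    return results
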
import logging
import random
import signal
import time

import pandas as pd

import framework.base as base
from utils.logging_ import get_logger

# --------------------------------------------------------
# define a logger
log = get_logger('random_search', 'random_search.log', level=logging.INFO)


def timeout_handler(signum, frame):
    raise TimeoutError("Timeout!")


signal.signal(signal.SIGALRM, timeout_handler)


def random_search(model_generator, train_x, train_y, search_times=100):
    evaluator = base.ModelEvaluator(model_generator, train_x, train_y)
    model_name = type(model_generator).__name__

    raw_parameter_list = []
    actual_parameter_list = []
    accuracy_list = []
    time_list = []
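
    # NOTE: the original body was truncated at this point; what follows is a
    # reconstruction sketch. `sample_parameters()` on ModelGenerator and
    # `evaluate()` on ModelEvaluator are assumed method names (not confirmed by
    # this file), and the 600s timeout is an illustrative value. The returned
    # column names ('Accuracy', 'Raw Parameters') are the ones that
    # find_ground_truth() reads back.
    for i in range(search_times):
        raw_params = model_generator.sample_parameters()  # assumed API
        start = time.time()
        signal.alarm(600)  # raise TimeoutError via the handler above if an evaluation hangs
        try:
            accuracy, actual_params = evaluator.evaluate(raw_params)  # assumed API
        except TimeoutError:
            log.warning('{} --- evaluation {}/{} timed out, skipped'.format(model_name, i + 1, search_times))
            continue
        finally:
            signal.alarm(0)  # cancel any pending alarm

        raw_parameter_list.append(raw_params)
        actual_parameter_list.append(actual_params)
        accuracy_list.append(accuracy)
        time_list.append(time.time() - start)

    # assemble the search trace into the DataFrame consumed by find_ground_truth()
    return pd.DataFrame(data={'Raw Parameters': raw_parameter_list,
                              'Actual Parameters': actual_parameter_list,
                              'Accuracy': accuracy_list,
                              'Time': time_list})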