import numpy as np

# NOTE: ContinuousSpace, OrdinalSpace, NominalSpace, RandomForest, BO,
# ParallelBO and MIES used below come from the MIP-EGO / bayes-optim library;
# the exact import paths depend on the installed version.


def test_mix_space():
    dim_r = 2  # dimension of the real values

    def obj_fun(x):
        x_r = np.array([x['continuous_%d' % i] for i in range(dim_r)])
        x_i = x['ordinal']
        x_d = x['nominal']
        _ = 0 if x_d == 'OK' else 1
        return np.sum(x_r ** 2) + abs(x_i - 10) / 123. + _ * 2

    search_space = ContinuousSpace([-5, 5], var_name='continuous') * dim_r + \
        OrdinalSpace([5, 15], var_name='ordinal') + \
        NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='nominal')

    model = RandomForest(levels=search_space.levels)
    opt = ParallelBO(
        search_space=search_space,
        obj_fun=obj_fun,
        model=model,
        max_FEs=6,
        DoE_size=3,                 # the initial DoE size
        eval_type='dict',
        acquisition_fun='MGFI',
        acquisition_par={'t': 2},
        n_job=3,                    # number of processes
        n_point=3,                  # number of candidate solutions proposed in each iteration
        verbose=True                # turn this off if you prefer no output
    )
    xopt, fopt, stop_dict = opt.run()

    print('xopt: {}'.format(xopt))
    print('fopt: {}'.format(fopt))
    print('stop criteria: {}'.format(stop_dict))
def test_sampling():
    C = ContinuousSpace([-5, 5]) * 3
    I = OrdinalSpace([[-100, 100], [-5, 5]], 'heihei')
    N = NominalSpace([['OK', 'A', 'B', 'C', 'D', 'E', 'A']] * 2, ['x', 'y'])

    S = N + I + C
    S.sampling(5, method='LHS')
    S.sampling(5, method='uniform')
def test_BO_constraints():
    # obj_func and g (the inequality constraint, feasible when g(x) <= 0)
    # are assumed to be defined elsewhere; a sketch is given after this test.
    search_space = OrdinalSpace([1, 10], var_name='mu') + \
        OrdinalSpace([1, 10], var_name='lambda') + \
        ContinuousSpace([0, 1], var_name='pc') + \
        ContinuousSpace([0.005, 0.5], var_name='p')

    model = RandomForest(levels=search_space.levels)
    xopt, _, __ = BO(
        search_space=search_space,
        obj_fun=obj_func,
        ineq_fun=g,
        model=model,
        max_FEs=30,
        DoE_size=3,
        eval_type='dict',
        acquisition_fun='MGFI',
        acquisition_par={'t': 2},
        n_job=1,
        n_point=1,
        verbose=True
    ).run()

    assert isinstance(xopt, dict)
    assert all(np.array(g(xopt)) <= 0)
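# A minimal sketch of what `obj_func` and `g` could look like for the
# constrained test above. The original definitions are not part of this
# excerpt, so the bodies below are illustrative assumptions only: any
# dict-based objective over (mu, lambda, pc, p) and any constraint that is
# feasible when its return value is <= 0 would work with `ineq_fun`.
def obj_func(x):
    # hypothetical objective: simply sum the four hyperparameters
    return x['mu'] + x['lambda'] + x['pc'] + x['p']


def g(x):
    # hypothetical inequality constraint: feasible iff x['pc'] <= 0.5
    return x['pc'] - 0.5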
def test_warm_data_with_RF():
    # obj_fun (list-based, matching eval_type='list') is assumed to be
    # defined elsewhere; a sketch is given after this test.
    space = ContinuousSpace([-10, 10]) * 2 + \
        OrdinalSpace([5, 15]) + \
        NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'])

    X = space.sampling(10)
    y = [obj_fun(x) for x in X]

    model = RandomForest(levels=space.levels)
    opt = BO(
        search_space=space,
        obj_fun=obj_fun,
        model=model,
        minimize=True,
        eval_type='list',
        max_FEs=10,
        verbose=True,
        acquisition_fun='EI',
        warm_data=(X, y)
    )
    opt.run()
    assert opt.data.shape[0] == 20
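# A minimal sketch of an `obj_fun` compatible with the warm-data test above.
# With eval_type='list' each candidate is a plain list ordered as the space
# was built: two reals, one integer, one categorical. The body below is an
# illustrative assumption, not the original definition.
def obj_fun(x):
    return x[0] ** 2 + x[1] ** 2 + abs(x[2] - 10) + (0 if x[3] == 'OK' else 1)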
def test_ProductSpace():
    C = ContinuousSpace([-5, 5], precision=1) * 3   # product of the same space
    I = OrdinalSpace([[-100, 100], [-5, 5]], 'heihei')
    N = NominalSpace([['OK', 'A', 'B', 'C', 'D', 'E', 'A']] * 2, ['x', 'y'])

    space = C + C + C
    print(space.sampling(2))

    # Cartesian product of heterogeneous spaces
    space = C + I + N
    print(space.sampling(10))
    print(space.bounds)
    print(space.var_name)
    print(space.var_type)

    print((C * 2).var_name)
    print((N * 3).sampling(2))

    C = ContinuousSpace([[0, 1]] * 2, var_name='weight')
    print(C.var_name)
def test_MIES():
    def obj_fun(X):
        v = []
        for x in X:
            x_r = np.array([x['real_%d' % i] for i in range(2)])
            x_i = np.array([x['int_%d' % i] for i in range(2)])
            x_d = np.array([x['cat_%d' % i] for i in range(2)])
            v.append(
                np.sum(x_r ** 2) +
                np.sum(np.abs(x_i - 10) / 123.) +
                np.sum(x_d == 'OK')
            )
        return v

    search_space = ContinuousSpace([-5, 5], var_name='real') * 2 + \
        OrdinalSpace([5, 15], var_name='int') * 2 + \
        NominalSpace(
            ['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='cat'
        ) * 2

    opt = MIES(
        search_space=search_space,
        obj_func=obj_fun,
        max_eval=100,
        eval_type='dict',
        verbose=True   # turn this off if you prefer no output
    )
    xopt, fopt, stop_dict = opt.optimize()

    print('xopt: {}'.format(xopt))
    print('fopt: {}'.format(fopt))
    print('stop criteria: {}'.format(stop_dict))


# def test_multi_acquisition():
#     dim_r = 2  # dimension of the real values
#
#     def obj_fun(x):
#         x_r = np.array([x['continuous_%d' % i] for i in range(dim_r)])
#         x_i = x['ordinal']
#         x_d = x['nominal']
#         _ = 0 if x_d == 'OK' else 1
#         return np.sum(x_r ** 2) + abs(x_i - 10) / 123. + _ * 2
#
#     search_space = ContinuousSpace([-5, 5], var_name='continuous') * dim_r + \
#         OrdinalSpace([5, 15], var_name='ordinal') + \
#         NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='nominal')
#
#     model = RandomForest(levels=search_space.levels)
#     opt = MultiAcquisitionBO(
#         search_space=search_space,
#         obj_fun=obj_fun,
#         model=model,
#         max_FEs=8,
#         DoE_size=4,      # the initial DoE size
#         eval_type='dict',
#         n_job=4,         # number of processes
#         n_point=4,       # number of candidate solutions proposed in each iteration
#         verbose=True     # turn this off if you prefer no output
#     )
#     xopt, fopt, stop_dict = opt.run()
#     print('xopt: {}'.format(xopt))
#     print('fopt: {}'.format(fopt))
#     print('stop criteria: {}'.format(stop_dict))
# Continuous variables can be specified as follows:
# a 2-D variable in [-5, 5]^2
# for 2 variables, the naming scheme is continuous_0, continuous_1
C = ContinuousSpace([-5, 5], var_name='continuous') * dim_r

# Equivalently, you can also use
# C = ContinuousSpace([[-5, 5]] * dim_r)
# The general usage is:
# ContinuousSpace([[lb_1, ub_1], [lb_2, ub_2], ..., [lb_n, ub_n]])

# Integer (ordinal) variables can be specified as follows:
# the domain of integer variables can be given as with continuous ones
# var_name is optional
I = OrdinalSpace([5, 15], var_name='ordinal')

# Discrete (nominal) variables can be specified as follows:
# no lb, ub... a list of categories instead
N = NominalSpace(['OK', 'A', 'B', 'C', 'D', 'E', 'F', 'G'], var_name='nominal')

# The whole search space can then be constructed:
search_space = C + I + N

# Bayesian optimization also uses a surrogate model.
# For mixed variable types, a random forest is typically used.
model = RandomForest(levels=search_space.levels)

# obj_fun is the mixed-integer objective defined as in test_mix_space above;
# the remaining arguments mirror that example.
opt = ParallelBO(
    search_space=search_space,
    obj_fun=obj_fun,
    model=model,
    max_FEs=6,
    DoE_size=3,                 # the initial DoE size
    eval_type='dict',
    acquisition_fun='MGFI',
    acquisition_par={'t': 2},
    n_job=3,                    # number of processes
    n_point=3,                  # number of candidate solutions proposed in each iteration
    verbose=True                # turn this off if you prefer no output
)
xopt, fopt, stop_dict = opt.run()
import time
import pickle as pkl

import pandas as pd
from joblib import dump
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

# BO, OrdinalSpace, NominalSpace and RandomForest are the library classes used
# above; boruta_feature_selection is a project helper assumed to be defined elsewhere.


def modeling(train, targets, to_optimize, **kwargs):
    """
    Train a model and perform hyperparameter optimization with Bayesian
    Optimization. Currently only Random Forests are supported.
    TODO: make the hyperparameter optimization and train-set handling more interactive.

    :param train: training set (pandas DataFrame)
    :param targets: targets (labels) (np.array)
    :param to_optimize: whether to perform hyperparameter optimization (bool)
    :key cv: number of CV folds for hyperparameter optimization (default 10)
    :key to_drop: features to drop from learning, such as unit numbers, cycles, etc. (list of column names)
    :key DoE_size: initial design-of-experiment size for the BO hyperparameter optimization (default 200)
    :key max_FEs: maximum number of function evaluations for the BO hyperparameter optimization (default 20)
    :key features_list: list of features to use
    :return: trained model and list of used features
    """
    start = time.time()
    features_list = kwargs.get('features_list', None)
    to_drop = kwargs.get('to_drop', None)
    cv = kwargs.get('cv', 10)
    DoE_size = kwargs.get('DoE_size', 200)
    max_FEs = kwargs.get('max_FEs', 20)

    train_set = train.copy()
    if to_drop:
        print(f'The following features will not be used in training: {to_drop}')
        train_set.drop(to_drop, axis=1, inplace=True)

    if features_list:
        print('Features selected by user')
        train_set = train_set[features_list]
        train_set = train_set.values
    else:
        print('Feature Selection (this will take a while...)')
        train_set, features_list = boruta_feature_selection(train_set, targets)

    with open('./features_list.pkl', 'wb') as f:
        pkl.dump(features_list, f)

    df_columns = ['acc', 'max_depth', 'n_estimators', 'bootstrap',
                  'max_features', 'min_samples_leaf', 'min_samples_split']
    df_eval = pd.DataFrame(columns=df_columns)

    # Hyperparameter optimization: the objective function returns the mean CV accuracy
    def obj_func(x):
        # logger.info('Started internal cross-validation')
        nonlocal df_eval
        performance_ = []
        skf = StratifiedKFold(n_splits=cv, random_state=np.random, shuffle=True)
        for train_set_index, test_index in tqdm(skf.split(train_set, targets), 'Optimizing HO'):
            X_train_set, X_test = train_set[train_set_index], train_set[test_index]
            y_train_set, y_test = targets[train_set_index], targets[test_index]
            rf_ = RandomForestClassifier(n_estimators=int(x[1]),
                                         max_depth=int(x[0]),
                                         bootstrap=x[2],
                                         max_features=x[3],
                                         min_samples_leaf=x[4],
                                         min_samples_split=x[5],
                                         n_jobs=-1)
            rf_.fit(X_train_set, y_train_set)
            predictions_ = rf_.predict(X_test)
            performance_.append(accuracy_score(y_test, predictions_))
        val = np.mean(performance_)
        df_eval_tmp = pd.DataFrame([[val, x[0], x[1], x[2], x[3], x[4], x[5]]],
                                   columns=df_columns)
        df_eval = pd.concat([df_eval, df_eval_tmp], ignore_index=True)
        return val

    # Definition of the hyperparameter search space
    max_depth = OrdinalSpace([2, 100])
    n_estimators = OrdinalSpace([1, 1000])
    min_samples_leaf = OrdinalSpace([1, 10])
    min_samples_split = OrdinalSpace([2, 20])
    bootstrap = NominalSpace(['True', 'False'])
    max_features = NominalSpace(['auto', 'sqrt', 'log2'])
    search_space = max_depth + n_estimators + bootstrap + max_features + \
        min_samples_leaf + min_samples_split

    model = RandomForest(levels=search_space.levels)
    opt = BO(search_space=search_space,
             obj_fun=obj_func,
             model=model,
             max_FEs=max_FEs,
             DoE_size=DoE_size,
             n_point=1,
             n_job=1,
             minimize=False,
             verbose=False)

    if to_optimize:
        print(f'Hyperparameter optimization with {cv}-fold CV and {max_FEs} function evaluations')
        opt.run()
        best_params_ = df_eval[df_columns[1:]][df_eval['acc'] == df_eval['acc'].max()][:1].to_dict('records')

    # Training using the best parameters
    if to_optimize:
        rf = RandomForestClassifier(n_jobs=-1, **best_params_[0])
    else:
        rf = RandomForestClassifier(n_jobs=-1)
    rf.fit(train_set, targets)
    dump(rf, './rf_model.joblib')
    end = time.time()
    print(f'----Duration of training is {(end - start) / 60} minutes')
    return rf, features_list
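# A hypothetical usage sketch of `modeling` (synthetic data via sklearn's
# make_classification; the variable names, sizes and kwargs below are
# illustrative assumptions, not taken from the original project):
#
# from sklearn.datasets import make_classification
#
# X, y = make_classification(n_samples=500, n_features=10, random_state=0)
# train_df = pd.DataFrame(X, columns=[f'f{i}' for i in range(10)])
# rf_model, used_features = modeling(
#     train_df, y, to_optimize=True,
#     features_list=list(train_df.columns),  # skip the Boruta step
#     cv=3, DoE_size=5, max_FEs=5,
# )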