def SVC(self):
    """Tune an SVC with Bayesian search, log test metrics, return a refit model.

    The search runs on ``self.Xtr`` / ``self.ytr``; the best estimator is
    evaluated on ``self.Xte`` through ``self.calculateMetrics`` and the
    outcome is handed to ``logClassifier``.

    Returns:
        An ``SVC`` built from the best parameters found (with
        ``probability=True``) and fitted on the training data.
    """
    search = BayesSearchCV(
        SVC(random_state=0),
        search_spaceSVC,
        n_iter=N_ITER,
        cv=CV,
        scoring=scoringMetrics,
        return_train_score=False,
    )

    def on_step(optim_result):
        # Progress callback: returning True tells the optimizer to stop.
        best = search.best_score_
        print("Score: SVC: ", best * 100)
        if best != 1:
            return None
        print('Max Score Achieved')
        return True

    search.fit(self.Xtr, self.ytr, callback=on_step)

    predictions = search.best_estimator_.predict(self.Xte)
    metrics = self.calculateMetrics(predictions)
    logClassifier(
        SVC(),
        self.classes,
        *(metrics[i] for i in range(7)),
        search.best_params_,
        scoringMetrics,
    )

    # The returned model is a fresh estimator: only the tuned parameters and
    # probability=True are applied.
    final_model = SVC(**search.best_params_, probability=True)
    return final_model.fit(self.Xtr, self.ytr)
class SklearnGeneralModel(ModelBase):
    """Wrapper around a scikit-learn estimator class with optional Bayesian search.

    ``model`` is passed in as a class (it is called in ``build_model``); when
    ``searchCV`` is true the estimator is wrapped in ``BayesSearchCV``.
    """

    def __init__(self, is_normalize, model, searchCV=False):
        """Store the configuration; nothing is instantiated until ``build_model``.

        Args:
            is_normalize: When truthy, inputs are passed through a
                ``Normalizer`` in ``train`` and ``predict``.
            model: Estimator class (callable returning an estimator).
            searchCV: When truthy, ``build_model`` wraps the estimator in
                ``BayesSearchCV``.
        """
        self.is_normalize = is_normalize
        self.model = model
        self.searchCV = searchCV

    def build_model(self, config_args=None):
        """Instantiate the estimator (or the search wrapper) from ``config_args``.

        Without search, ``config_args`` are the estimator's constructor
        arguments; with search they are forwarded to ``BayesSearchCV``.
        """
        if config_args is None:
            config_args = {}
        if not self.searchCV:
            self.model = self.model(**config_args)
        else:
            self.model = BayesSearchCV(estimator=self.model(), **config_args)

    def train(self, x, y):
        """Fit the model on ``x`` / ``y``, normalizing ``x`` first if configured."""
        if self.is_normalize:
            self.scaler = Normalizer()
            x = self.scaler.fit_transform(x)
        with warnings.catch_warnings():
            # UserWarnings raised during fitting are silenced on purpose.
            warnings.simplefilter('ignore', UserWarning)
            self.model.fit(x, y)

    def predict(self, x):
        """Return predictions for ``x``, reusing the scaler fitted in ``train``."""
        if self.is_normalize:
            x = self.scaler.transform(x)
        return self.model.predict(x)

    def feature_based_metrics(self, columns=None, index=None):
        """Return normalized feature importances as a one-row-per-index frame.

        Works both for a search wrapper (uses its ``best_estimator_``) and for
        a plain fitted estimator; previously the plain case raised
        ``AttributeError`` because ``best_estimator_`` does not exist there.

        Args:
            columns: Labels for the features (index of the frame before the
                transpose).
            index: Column labels before the transpose.

        Returns:
            A transposed ``pandas.DataFrame`` of importances that sum to 1.
        """
        fitted = getattr(self.model, 'best_estimator_', self.model)
        feature_importance = fitted.feature_importances_
        feature_importance = feature_importance / np.sum(feature_importance)
        return pd.DataFrame(feature_importance, index=columns, columns=index).T
def LR(self):
    """Tune a LogisticRegression with Bayesian search and return a refit model.

    The search runs on ``self.Xtr`` / ``self.ytr``; the best estimator is
    scored on ``self.Xte`` via ``self.calculateMetrics`` and the result is
    passed to ``logClassifier``.

    Returns:
        A ``LogisticRegression`` built from the best parameters found and
        fitted on the training data.
    """
    search = BayesSearchCV(
        LogisticRegression(max_iter=100, random_state=0),
        search_spaceLR,
        cv=CV,
        n_iter=N_ITER,
        scoring=scoringMetrics,
        return_train_score=False,
    )

    def on_step(optim_result):
        # Progress callback: returning True tells the optimizer to stop.
        best = search.best_score_
        print("Score: LR: ", best * 100)
        if best != 1:
            return None
        print('Max Score Achieved')
        return True

    search.fit(self.Xtr, self.ytr, callback=on_step)

    predictions = search.best_estimator_.predict(self.Xte)
    metrics = self.calculateMetrics(predictions)
    logClassifier(
        LogisticRegression(),
        self.classes,
        *(metrics[i] for i in range(7)),
        search.best_params_,
        scoringMetrics,
    )

    # Fresh estimator configured only with the tuned parameters.
    final_model = LogisticRegression(**search.best_params_)
    return final_model.fit(self.Xtr, self.ytr)
def test_searchcv_callback():
    """Check that BayesSearchCV invokes the callback and honours its stop signal."""
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)

    opt = BayesSearchCV(
        DecisionTreeClassifier(),
        {
            'max_depth': [3],  # additional test for single dimension
            'min_samples_split': Real(0.1, 0.9),
        },
        n_iter=5
    )
    # A one-element list so the nested callback can mutate the counter.
    total_iterations = [0]

    def callback(opt_result):
        # this simply counts iterations
        total_iterations[0] += 1

        # break the optimization loop at some point
        if total_iterations[0] > 2:
            return True  # True == stop optimization

        return False

    opt.fit(X, y, callback=callback)

    # The callback asked to stop on its third call, so only 3 of 5 ran.
    assert total_iterations[0] == 3

    # test whether final model was fit
    opt.score(X, y)
def testSVMParams(self,pipe):
    """Search C / gamma / kernel for an SVM pipeline and return the best parameters.

    The data in ``self.df`` / ``self.classes`` is split 75/25; the search is
    fitted on the training part and scored once on the held-out part.

    Args:
        pipe: Pipeline whose SVM step is named ``clf`` (classification) or
            ``reg`` (regression; parameters are addressed through
            ``reg__estimator__`` when ``self.classes`` is a DataFrame).

    Returns:
        The ``best_params_`` mapping of the fitted search.

    Raises:
        ValueError: If ``self.problem`` is neither ``'classification'`` nor
            ``'regression'`` (previously this surfaced later as an
            UnboundLocalError on ``grid``).
    """
    print("{} - CALCULATING BEST PARAMETERS... \n".format(datetime.datetime.now()))
    X_train, X_test, y_train, y_test = train_test_split(
        self.df, self.classes.values,
        train_size=0.75, test_size=.25, random_state=0)

    listaC = [0.001, 0.01, 0.1, 1, 10, 100]
    listaGamma = [0.001, 0.01, 0.1, 1, 10, 100]
    listaKernels = ['rbf', 'linear', 'poly', 'sigmoid']

    # Pick the parameter-name prefix and the metric for the problem type.
    if self.problem == 'classification':
        prefix, scoring = 'clf__', 'accuracy'
    elif self.problem == 'regression':
        if isinstance(self.classes, pd.DataFrame):
            prefix = 'reg__estimator__'
        else:
            prefix = 'reg__'
        scoring = 'r2'
    else:
        raise ValueError(
            "unsupported problem type: {!r}".format(self.problem))

    paramsGrid = {
        prefix + 'C': listaC,
        prefix + 'gamma': listaGamma,
        prefix + 'kernel': listaKernels,
    }
    grid = BayesSearchCV(pipe, paramsGrid, scoring=scoring, n_iter=9)

    print("{} - FITTING DATA... \n".format(datetime.datetime.now()))
    grid.fit(X_train, y_train)
    print("{} - BEST RESULTS - {}".format(datetime.datetime.now(), grid.best_score_))
    print("{} - TEST RESULTS: {}".format(datetime.datetime.now(), grid.score(X_test, y_test)))

    return grid.best_params_
def BayesSearchCV_optimisation(data):
    """Run a Bayesian hyper-parameter search for a binary LGBMClassifier.

    NOTE(review): ``data`` is not used; the search is fitted on the
    module-level ``X_train`` / ``y_train`` and reports progress through the
    module-level ``status_print`` callback. Confirm whether ``data`` was meant
    to supply the training set.

    Returns:
        The fitted ``BayesSearchCV`` instance (previously the result was
        discarded and the function returned ``None``).
    """
    search_spaces = {
        'learning_rate': (0.01, 1.0, 'log-uniform'),
        # The original literal listed 'min_child_weight' twice, (0, 10) and
        # then (0, 5); the later entry silently won. Only the effective range
        # is kept.
        'min_child_weight': (0, 5),
        'max_depth': (0, 50),
        'max_delta_step': (0, 20),
        'subsample': (0.01, 1.0, 'uniform'),
        'colsample_bytree': (0.01, 1.0, 'uniform'),
        'colsample_bylevel': (0.01, 1.0, 'uniform'),
        'reg_lambda': (1e-9, 1000, 'log-uniform'),
        'reg_alpha': (1e-9, 1.0, 'log-uniform'),
        'gamma': (1e-9, 0.5, 'log-uniform'),
        'n_estimators': (50, 100),
        'scale_pos_weight': (1e-6, 500, 'log-uniform'),
    }

    estimator = LGBMClassifier(objective='binary', metric='auc')

    opt = BayesSearchCV(
        estimator,
        search_spaces,
        n_iter=100,
        random_state=1234,
        verbose=0,
    )

    opt.fit(X_train, y_train, callback=status_print)
    return opt
def search(self, dataset, hyperparameter_space, n_iter, cv, random_seed, scorer, verbose=False):
    '''
    Run a Bayesian hyperparameter search with a random forest on a dataset.

    A regressor is used when ``dataset.task`` is ``Task.REGRESSION``,
    a classifier otherwise.

    :input dataset: a Dataset object (provides ``task``, ``X_train``, ``y_train``)
    :input hyperparameter_space: a dictionary mapping hyperparameter names to
        their skopt search spaces
    :input n_iter: the number of iterations of the bayesian search
    :input cv: the cross validation setting forwarded to the search
    :input random_seed: int, seed used for both the forest and the search
    :input scorer: str, the name of the scorer
    :input verbose: bool, print the state of the search
    :return: the fitted skopt.searchcv.BayesSearchCV object
    '''
    is_regression = dataset.task == Task.REGRESSION
    forest_cls = RandomForestRegressor if is_regression else RandomForestClassifier
    forest = forest_cls(n_jobs=-1, random_state=random_seed)

    tuner = BayesSearchCV(
        forest,
        hyperparameter_space,
        n_iter=n_iter,
        cv=cv,
        n_jobs=-1,
        random_state=random_seed,
        scoring=scorer,
        verbose=verbose,
    )
    tuner.fit(dataset.X_train, dataset.y_train)
    return tuner
class BayesSearch:
    """Run a ``BayesSearchCV`` tuning session and export its results to disk."""

    def __init__(self, model, search_spaces, n_iter, export_path):
        """Configure the tuner.

        Args:
            model: Estimator to tune.
            search_spaces: Search space definition forwarded to BayesSearchCV.
            n_iter: Number of search iterations.
            export_path: Path prefix used for every exported file.
        """
        self.export_path = export_path
        self.bayes_cv_tuner = BayesSearchCV(
            model,
            search_spaces,
            cv=5,
            n_jobs=-1,
            n_iter=n_iter,
            verbose=0,
            refit=True,
            random_state=RANDOM_SEED,
        )

    def fit(self, X, y):
        """Fit the tuner on ``X`` / ``y``, printing progress, then export results."""
        self.bayes_cv_tuner.fit(X, y, callback=self.print_status)
        self.export_results()

    def export_results(self):
        """Write the CV results (CSV), best parameters (JSON) and the tuner (pickle)."""
        pd.DataFrame(self.bayes_cv_tuner.cv_results_).to_csv(
            f"{self.export_path}_cv_results.csv")
        pd.Series(self.bayes_cv_tuner.best_params_).to_json(
            f"{self.export_path}_best_params.json")
        dump(self.bayes_cv_tuner, f"{self.export_path}_bayes_search.pkl")

    def print_status(self, optim_results):
        """Callback for the search: print the model count and the current best.

        The model count is read from this instance's tuner; the original code
        referenced an undefined global ``opt`` here.
        """
        print(f"""
        Model #{len(self.bayes_cv_tuner.cv_results_['params'])}
        Best: {self.bayes_cv_tuner.best_score_}
        Best params: {self.bayes_cv_tuner.best_params_}
        """)
def main(feature_set_key):
    """Tune a CatBoostClassifier on one feature set and persist the outcome.

    Loads the data through ``read_feature_data``, runs a Bayesian search
    scored with the Matthews correlation coefficient, prints the best
    parameters, then writes them to ``best_params.json`` and dumps the whole
    search object to ``optimizer.joblib``.
    """
    x_train, y_train, x_test = read_feature_data(feature_set=feature_set_key)

    classifier = CatBoostClassifier(
        thread_count=2,
        loss_function='Logloss',
        od_type='Iter',
        verbose=True,
    )
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    mcc_scorer = make_scorer(matthews_corrcoef)

    search_spaces = {
        'iterations': Integer(10, 1000),
        'depth': Integer(1, 8),
        'learning_rate': Real(0.01, 1.0, 'log-uniform'),
        'random_strength': Real(1e-9, 10, 'log-uniform'),
        'bagging_temperature': Real(0.0, 1.0),
        'border_count': Integer(1, 255),
        'l2_leaf_reg': Integer(2, 30),
        'scale_pos_weight': Real(0.01, 1.0, 'uniform'),
    }

    tuner = BayesSearchCV(
        clf := classifier,
        search_spaces,
        scoring=mcc_scorer,
        cv=folds,
        n_iter=1,
        n_jobs=1,
        return_train_score=False,
        refit=True,
        optimizer_kwargs={'base_estimator': 'GP'},
        random_state=42,
    ) if False else BayesSearchCV(
        classifier,
        search_spaces,
        scoring=mcc_scorer,
        cv=folds,
        n_iter=1,
        n_jobs=1,
        return_train_score=False,
        refit=True,
        optimizer_kwargs={'base_estimator': 'GP'},
        random_state=42,
    )
    tuner.fit(x_train, y_train)

    best = tuner.best_params_
    print(json.dumps(best, indent=4))
    with open('best_params.json', 'w') as outfile:
        json.dump(best, outfile)
    dump(tuner, 'optimizer.joblib')
def test_LogisticSGLCV_BayesSearchCV():
    """LogisticSGLCV (bayes strategy) must agree with an explicit BayesSearchCV.

    Both searches share the CV splitter, random state and iteration count, so
    the selected ``l1_ratio`` and ``alpha`` are expected to be equal.
    """
    X, y, groups = make_group_classification(random_state=42)
    folds = StratifiedKFold(3)
    n_bayes_iter = 10

    l1_ratios = np.linspace(0, 1, 3)
    alphas = np.logspace(-4, 4, 3)

    # Built-in tuning path.
    builtin_cv = LogisticSGLCV(
        alphas=alphas,
        l1_ratio=l1_ratios,
        groups=groups,
        cv=folds,
        tuning_strategy="bayes",
        n_bayes_iter=10,
        random_state=42,
    )
    builtin_cv.fit(X, y)

    # Reference path: the same bounds handed to BayesSearchCV directly.
    reference_space = {
        "alpha": (np.min(alphas), np.max(alphas), "log-uniform"),
        "l1_ratio": (np.min(l1_ratios), np.max(l1_ratios), "uniform"),
    }
    reference = BayesSearchCV(
        LogisticSGL(groups=groups),
        reference_space,
        cv=folds,
        random_state=42,
        n_iter=10,
    )
    reference.fit(X, y)

    assert len(builtin_cv.alphas_) == n_bayes_iter
    assert reference.best_params_["l1_ratio"] == builtin_cv.l1_ratio_
    assert reference.best_params_["alpha"] == builtin_cv.alpha_
def run_optimization_test():
    """Run BayesSearchCV over TemplateClassifier and print found vs. target values.

    The data passed to ``fit`` is a zero-filled placeholder and the single CV
    split uses the full slice for both train and test. Found parameters are
    printed next to the module-level ``targets`` sequence, matched by position.
    """
    N_iter = 100

    search_space = {
        "deltaEta": Real(0.0, 4.0, prior="uniform"),
        "deltaPhi": Real(0.0, 4.0, prior="uniform"),
        "maxNRegions": Integer(2, 100),
        "maxNVertices": Integer(1, 5),
        "nSigmaZBeamSpot": Real(0.0, 30.0, prior="uniform"),
        "nSigmaZVertex": Real(-1.0, 1.0, prior="uniform"),
        "originRadius": Real(0.0, 1.0, prior="uniform"),
        "ptMin": Real(0.0, 2.0, prior="uniform"),
        "zErrorBeamSpot": Real(0.0, 1.0, prior="uniform"),
        "zErrorVetex": Real(0.0, 1.0, prior="uniform"),
    }

    opt = BayesSearchCV(
        TemplateClassifier(),
        search_space,
        n_iter=N_iter,
        cv=[(slice(None), slice(None))],
        verbose=1,
    )
    opt.fit(np.zeros((100, 1)), np.zeros((100)))

    print("After {} iterations:".format(N_iter))
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(0.0, 0.0))
    print("Final params:")

    found = opt.best_estimator_.get_params()
    for position, param_name in enumerate(found):
        found_value = found[param_name]
        print("{0}:\t{1:2.2f} vs {2:2.2f}".format(
            param_name, found_value, targets[position]))
def train_val(classifier, name, partition, trial, params, n_iter, X_train, y_train):
    """Fit a BayesSearchCV (f1 scoring) with plateau-based early stopping.

    Relies on the module-level names ``kf``, ``log``, ``stop_after_iters``
    and ``early_stop_tol``.

    Returns:
        The fitted ``BayesSearchCV`` instance.
    """
    bscv = BayesSearchCV(
        classifier,
        params,
        n_iter=n_iter,
        cv=kf,
        scoring='f1',
        return_train_score=True,
        n_jobs=3,
    )
    log(f'Making {bscv.total_iterations} iterations on {name}_{trial+1} ({partition})')

    # One-element list so the closure can update the count in place; it is
    # also interpolated as a list in the log messages.
    step_counter = [0]
    best_history = []

    def on_step(optim_result):
        step_counter[0] += 1
        best_history.append(bscv.best_score_)
        log(f'{name}{step_counter}[{trial+1}] current best score: {bscv.best_score_:.4f}')

        if step_counter[0] < stop_after_iters:
            return None

        if bscv.best_score_ != 1.0:
            # Compare the mean of the latest window against the window
            # shifted back by one step; continue unless the gain is below
            # the tolerance.
            latest = np.mean(best_history[-stop_after_iters:])
            shifted = np.mean(best_history[-(stop_after_iters + 1):-1])
            if not (latest - shifted) < early_stop_tol:
                return None

        log(f'{name}{step_counter}[{trial+1}] stopped early')
        return True

    bscv.fit(X_train, y_train, callback=on_step)
    return bscv
def test_searchcv_callback():
    """Check that BayesSearchCV invokes the callback and honours its stop signal."""
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)

    opt = BayesSearchCV(
        DecisionTreeClassifier(),
        {
            'max_depth': [3],  # additional test for single dimension
            'min_samples_split': Real(0.1, 0.9),
        },
        n_iter=5)
    # A one-element list so the nested callback can mutate the counter.
    total_iterations = [0]

    def callback(opt_result):
        # this simply counts iterations
        total_iterations[0] += 1

        # break the optimization loop at some point
        if total_iterations[0] > 2:
            return True  # True == stop optimization

        return False

    opt.fit(X, y, callback=callback)

    # The callback asked to stop on its third call, so only 3 of 5 ran.
    assert total_iterations[0] == 3

    # test whether final model was fit
    opt.score(X, y)
def _fit_svc(n_jobs=1, n_points=1, cv=None):
    """
    Fit BayesSearchCV with an SVC on a larger synthetic classification task
    and check that the refit model scores above 0.9 on that same data.
    """
    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_redundant=0,
        n_informative=18,
        random_state=1,
        n_clusters_per_class=1,
    )

    svc_space = {
        'C': Real(1e-3, 1e+3, prior='log-uniform'),
        'gamma': Real(1e-3, 1e+1, prior='log-uniform'),
        'degree': Integer(1, 3),
    }
    search = BayesSearchCV(
        SVC(),
        svc_space,
        n_jobs=n_jobs,
        n_iter=11,
        n_points=n_points,
        cv=cv,
        random_state=42,
    )
    search.fit(X, y)

    assert search.score(X, y) > 0.9
def tune_param(model, pipes, param_grid, refit, data, target, cv=5, n_iter=6):
    '''
    Tune one pipeline's parameters with Bayesian search.

    Every key of ``param_grid`` is prefixed with ``"<model>__"`` so that it
    addresses the parameters of the pipeline stored under ``pipes[model]``.
    Returns the fitted search together with its ``cv_results_`` as a DataFrame.
    '''
    prefixed_space = {
        '__'.join((model, key)): space for key, space in param_grid.items()
    }

    tuner = BayesSearchCV(
        pipes[model],
        prefixed_space,
        scoring="neg_mean_absolute_error",
        n_iter=n_iter,
        refit=refit,
        n_jobs=-1,
        verbose=True,
        cv=cv,
    )
    tuner.fit(data, target)

    print('best score: ' + str(tuner.best_score_))
    print('best params: ' + str(tuner.best_params_))

    return tuner, pd.DataFrame(tuner.cv_results_)
def bo_RandomForestRegressor(X, y):
    """Tune a RandomForestRegressor with Bayesian optimisation and report it.

    Prints the best MSE, the elapsed time and the best parameters, saves the
    fitted search via ``save_model_object``, and returns
    ``(mse_as_str, elapsed_timedelta, best_params)``.
    """
    # Hyperparameter configuration space.
    search_space = {
        'n_estimators': Integer(10, 100),
        "max_features": Integer(1, 13),
        'max_depth': Integer(5, 50),
        "min_samples_split": Integer(2, 11),
        "min_samples_leaf": Integer(1, 11),
        "criterion": ['mse', 'mae']
    }

    started = datetime.datetime.now()

    forest = RandomForestRegressor(random_state=0)
    # 20 iterations; raise n_iter if more time is available.
    tuner = BayesSearchCV(
        forest,
        search_space,
        cv=3,
        n_iter=20,
        scoring='neg_mean_squared_error',
    )
    tuner.fit(X, y)

    # The scorer is negated MSE, so flip the sign for reporting.
    print("RandomForestRegressor MSE score:" + str(-tuner.best_score_))

    finished = datetime.datetime.now()
    process_time_rf = finished - started
    print("程序执行时间(秒):{}".format(process_time_rf))
    print("最佳超参数值集合:", tuner.best_params_)

    save_model_object(tuner, 'BO-GP', 'RandomForestRegressor', 'RandomForestRegressor')
    return str(-tuner.best_score_), process_time_rf, tuner.best_params_
def bo_ANN(X, y):
    """Tune the Keras ANN regressor with Bayesian optimisation and report it.

    Prints the best MSE, the elapsed time and the best parameters, builds a
    new (unfitted here) ``ANN`` from the best parameters, saves it via
    ``save_model_object`` and returns
    ``(mse_as_str, elapsed_timedelta, best_params)``.
    """
    search_space = {
        'activation': ['relu', 'tanh'],
        'loss': ['mse'],
        'batch_size': [32, 64, 128],
        'neurons': Integer(256, 1024),
        'epochs': [20, 30, 50, 60]
    }

    started = datetime.datetime.now()

    # `verbose` is read from module scope.
    regressor = KerasRegressor(build_fn=ANN, verbose=verbose)
    tuner = BayesSearchCV(
        regressor,
        search_space,
        cv=3,
        n_iter=10,
        scoring='neg_mean_squared_error',
    )
    tuner.fit(X, y)

    # The scorer is negated MSE, so flip the sign for reporting.
    print("ANN MSE score:" + str(-tuner.best_score_))

    finished = datetime.datetime.now()
    process_time_ann = finished - started
    print("程序执行时间(秒):{}".format(process_time_ann))
    print("最佳超参数值集合:", tuner.best_params_)

    best_network = ANN(**tuner.best_params_)
    save_model_object(best_network, 'BO-GP', 'ANN', 'ANN')

    return str(-tuner.best_score_), process_time_ann, tuner.best_params_
def tune_parameter(X, y, clf, params):
    """Tune ``clf`` with Bayesian search (f1) and append the result to a CSV log.

    The search stops early through ``DeltaXStopper(0.000001)``. The best
    parameters and score are printed and appended to ``kuaishou_stats.csv``.

    Returns:
        The fitted ``BayesSearchCV`` instance.
    """
    tuner = BayesSearchCV(
        estimator=clf,
        search_spaces=params,
        scoring="f1",
        n_iter=100,
        optimizer_kwargs={"base_estimator": "GP"},
        verbose=2,
        n_jobs=-1,
        cv=4,
        refit=True,
        random_state=1234,
    )
    tuner.fit(X, y, callback=DeltaXStopper(0.000001))

    best_params = tuner.best_params_
    best_score = tuner.best_score_
    print(best_params)
    print(best_score)

    timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M")
    with open("kuaishou_stats.csv", 'a', newline='') as stats_file:
        rows = csv.writer(stats_file)
        rows.writerow(["the best params for svm: "])
        rows.writerows([key, value] for key, value in best_params.items())
        rows.writerow(["the best score for svm: ", best_score, timestamp])

    return tuner
def test_searchcv_rank():
    """
    Test whether the ``rank_test_score`` / ``rank_train_score`` entries of
    ``cv_results_`` match the ranks computed from the mean scores
    (higher score == better rank, ties share the minimum rank).
    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    random_state = 42

    opt = BayesSearchCV(
        SVC(random_state=random_state),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_iter=11, random_state=random_state, return_train_score=True
    )

    opt.fit(X_train, y_train)
    results = opt.cv_results_

    # Scores are negated so that the highest score receives rank 1.
    test_rank = np.asarray(rankdata(-np.array(results["mean_test_score"]),
                                    method='min'), dtype=np.int32)
    train_rank = np.asarray(rankdata(-np.array(results["mean_train_score"]),
                                     method='min'), dtype=np.int32)

    assert_array_equal(np.array(results['rank_test_score']), test_rank)
    assert_array_equal(np.array(results['rank_train_score']), train_rank)
def fit(self, X, y):
    """Tune hyperparameters with BayesSearchCV, then fit this regressor.

    ``self.model`` is searched over ``self.intervals`` using a shuffled
    5-fold split. The best parameters are applied by re-running the parent
    initializer (always with ``shrinking=False``), after which the parent
    ``fit`` is called on the full data.

    Returns:
        ``self``.
    """
    folds = KFold(n_splits=5, shuffle=True, random_state=81)
    tuner = BayesSearchCV(
        self.model,
        self.intervals,
        n_iter=5,
        n_jobs=-1,
        cv=folds,
        verbose=0,
        random_state=82,
    )
    tuner.fit(X, y)

    best = tuner.best_params_
    # Re-initialise the parent estimator with the tuned parameters.
    super(BayesSVR, self).__init__(**best, shrinking=False)

    # Fit the regressor itself on the full data.
    super(BayesSVR, self).fit(X, y)
    return self
def perform_bayes_search(
    estimator, X_train, X_val, y_train, y_val, param_grid, scoring=None
):
    """Run a 20-iteration Bayesian search and return the refit best estimator.

    The validation data is forwarded to the estimator's ``fit`` as
    ``eval_set``: a tuple for CatBoost classifiers, a list of one
    ``[X, y]`` pair for every other estimator.
    """
    is_catboost = isinstance(estimator, cb.core.CatBoostClassifier)
    eval_set = (X_val, y_val) if is_catboost else [[X_val, y_val]]

    search_kwargs = dict(
        estimator=estimator,
        search_spaces=param_grid,
        scoring=scoring,
        cv=2,
        n_iter=20,
        n_jobs=1,
        refit=True,
        return_train_score=False,
        optimizer_kwargs={"base_estimator": "GP"},
        random_state=13,
        fit_params={"eval_set": eval_set},
    )
    tuner = BayesSearchCV(**search_kwargs)
    tuner.fit(X_train, y_train)

    return tuner.best_estimator_
def test_searchcv_reproducibility():
    """
    Test whether results of BayesSearchCV can be reproduced with a fixed
    random state.
    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=0)

    random_state = 42

    opt = BayesSearchCV(SVC(random_state=random_state), {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'degree': Integer(1, 8),
        'kernel': Categorical(['linear', 'poly', 'rbf']),
    },
                        n_iter=11,
                        random_state=random_state)

    opt.fit(X_train, y_train)
    best_est = opt.best_estimator_

    # A clone carries the same configuration (including random_state), so a
    # second fit must select the same hyperparameters.
    opt2 = clone(opt).fit(X_train, y_train)
    best_est2 = opt2.best_estimator_

    assert best_est.C == best_est2.C
    assert best_est.gamma == best_est2.gamma
    assert best_est.degree == best_est2.degree
    assert best_est.kernel == best_est2.kernel
def test_searchcv_reproducibility():
    """
    Test whether results of BayesSearchCV can be reproduced with a fixed
    random state.
    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    random_state = 42

    opt = BayesSearchCV(
        SVC(random_state=random_state),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_iter=11, random_state=random_state
    )

    opt.fit(X_train, y_train)
    best_est = opt.best_estimator_

    # A clone carries the same configuration (including random_state), so a
    # second fit must select the same hyperparameters.
    opt2 = clone(opt).fit(X_train, y_train)
    best_est2 = opt2.best_estimator_

    assert best_est.C == best_est2.C
    assert best_est.gamma == best_est2.gamma
    assert best_est.degree == best_est2.degree
    assert best_est.kernel == best_est2.kernel
def test_search_cv_internal_parameter_types():
    """Parameters handed to the estimator must be native float / int / str."""

    class TypeCheckEstimator(BaseEstimator):
        """Dummy estimator whose ``fit`` asserts native Python parameter types."""

        def __init__(self, float_param=0.0, int_param=0, str_param=""):
            self.float_param = float_param
            self.int_param = int_param
            self.str_param = str_param

        def fit(self, X, y):
            assert isinstance(self.float_param, float)
            assert isinstance(self.int_param, int)
            assert isinstance(self.str_param, str)
            return self

        def score(self, X, y):
            # Constant score: only the parameter types matter here.
            return 0.0

    # Regression check: this setup used to fail.
    X, y = make_classification(n_samples=10, n_features=4)

    spaces = {
        'float_param': [0.0, 1.0],
        'int_param': [0, 10],
        'str_param': ["one", "two", "three"],
    }
    search = BayesSearchCV(
        estimator=TypeCheckEstimator(),
        search_spaces=spaces,
        n_iter=11,
    )
    search.fit(X, y)
def bayesian_optimization(model, space, scorer, x_train, y_train, x_test, y_test, n_iter=256, cv=4, n_jobs=None):
    """Run a Bayesian search and print the best parameters and scores.

    The search object and the step counter are published as the module
    globals ``opt`` and ``counter`` before fitting; the module-level
    ``on_step`` callback is passed to ``fit``. When ``n_jobs`` is None it
    defaults to ``cv``.
    """
    global counter, opt

    workers = cv if n_jobs is None else n_jobs

    opt = BayesSearchCV(
        model,
        space,
        scoring=scorer,
        n_iter=n_iter,
        cv=cv,
        verbose=10,
        n_jobs=workers,
    )
    counter = 0

    opt.fit(x_train, y_train, callback=on_step)

    print(opt.best_params_)
    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(x_test, y_test))
def search_parameters(self, x: DataFrame, y: NpArray, parameters: dict, n_folds_validation: int, model: Any, score_type: str) -> tuple:
    """Run a Bayesian search for ``model`` and return ``(best_params, best_score)``.

    ``parameters`` is the search space, ``n_folds_validation`` is forwarded
    as ``cv`` and ``score_type`` as ``scoring``.
    """
    tuner = BayesSearchCV(
        estimator=model,
        search_spaces=parameters,
        cv=n_folds_validation,
        verbose=10,
        scoring=score_type,
    )
    tuner.fit(x, y)
    return tuner.best_params_, tuner.best_score_
def fit(self, df: pd.DataFrame):
    """Fit the LGBM regressor, optionally tuning hyperparameters first.

    Rows with missing features are dropped and the remainder is shuffled
    with a fixed seed. Hyperparameters come from, in priority order: a
    Bayesian search (``self.optimize_hyperparams``), ``self.hyperparams``,
    or ``self.DEFAULT_HYPERPARAMS``. The fitted model is stored in
    ``self.estimator_``.

    Returns:
        ``self``.
    """
    features = self._to_feature_df(df, True)
    features = features.dropna().sample(frac=1, random_state=42)

    X = self._get_X(features)
    y = self._get_y(features)

    if self.optimize_hyperparams:

        def scorer(estimator, X, y):
            # Negative MAE on predictions clipped to [0, 1].
            clipped = np.clip(np.squeeze(estimator.predict(X)), 0.0, 1.0)
            return -mean_absolute_error(y, clipped)

        print('IouEstimator: optimizing hyperparams with Bayesian Optimization')

        space = {
            'num_leaves': Integer(2, 128, prior='log-uniform', base=2),
            'min_child_samples': Integer(2, 512, prior='log-uniform', base=2),
            'max_bin': Integer(2, 8192, prior='log-uniform', base=2),
        }
        opt = BayesSearchCV(
            LGBMRegressor(),
            space,
            n_iter=60,
            optimizer_kwargs={
                'n_initial_points': 20,
                'base_estimator': 'GP',
            },
            scoring=scorer,
            cv=3,
            refit=False,
            random_state=42,
            return_train_score=True,
        )
        opt.fit(X, y)

        print(f'Found hyperparams {opt.best_params_}')
        best_train = opt.cv_results_['mean_train_score'][opt.best_index_]
        print(f"Train score: {best_train}")
        print(f'Test score: {opt.best_score_}')

        chosen = opt.best_params_
    elif self.hyperparams is not None:
        print(f'IOUEstimator: using using hyperparams {self.hyperparams}')
        chosen = self.hyperparams
    else:
        print(f'IOUEstimator: using default hyperparams {self.DEFAULT_HYPERPARAMS}')
        chosen = self.DEFAULT_HYPERPARAMS

    self.estimator_ = LGBMRegressor(**chosen).fit(X, y)
    return self
def test_searchcv_sklearn_compatibility():
    """
    Test whether the BayesSearchCV is compatible with base sklearn methods
    such as clone, set_params, get_params.
    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        train_size=0.75,
                                                        random_state=0)

    # used to try different model classes
    pipe = Pipeline([('model', SVC())])

    # single categorical value of 'model' parameter sets the model class
    lin_search = {
        'model': Categorical([LinearSVC()]),
        'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    }

    dtc_search = {
        'model': Categorical([DecisionTreeClassifier()]),
        'model__max_depth': Integer(1, 32),
        'model__min_samples_split': Real(1e-3, 1.0, prior='log-uniform'),
    }

    svc_search = {
        'model': Categorical([SVC()]),
        'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
        'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'model__degree': Integer(1, 8),
        'model__kernel': Categorical(['linear', 'poly', 'rbf']),
    }

    opt = BayesSearchCV(pipe, [(lin_search, 1), svc_search], n_iter=2)

    # The clone is taken before set_params, so it keeps the original spaces.
    opt_clone = clone(opt)

    params, params_clone = opt.get_params(), opt_clone.get_params()
    assert params.keys() == params_clone.keys()

    for param, param_clone in zip(params.items(), params_clone.items()):
        assert param[0] == param_clone[0]
        assert isinstance(param[1], type(param_clone[1]))

    opt.set_params(search_spaces=[(dtc_search, 1)])

    opt.fit(X_train, y_train)
    opt_clone.fit(X_train, y_train)

    total_evaluations = len(opt.cv_results_['mean_test_score'])
    total_evaluations_clone = len(opt_clone.cv_results_['mean_test_score'])

    # test if expected number of subspaces is explored:
    # opt now has one subspace with 1 iteration; the clone still has
    # lin_search (1 iteration) plus svc_search (n_iter=2).
    assert total_evaluations == 1
    assert total_evaluations_clone == 1 + 2
def test_searchcv_runs(surrogate, n_jobs):
    """
    Test whether the cross validation search wrapper around sklearn
    models runs properly with available surrogates and with single
    or multiple workers.

    Parameters
    ----------

    * `surrogate` [str or None]:
        A class of the scikit-optimize surrogate used. None means
        to use default surrogate.

    * `n_jobs` [int]:
        Number of parallel processes to use for computations.

    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    # None search space is only supported when only `step` function is used
    assert_raises(ValueError, BayesSearchCV(SVC(), None).fit, (X, y))

    # check if invalid dimensions are raising errors
    with pytest.raises(ValueError):
        BayesSearchCV(SVC(), {'C': '1 ... 100.0'})

    with pytest.raises(TypeError):
        BayesSearchCV(SVC(), ['C', (1.0, 1)])

    # forward the surrogate to the optimizer only when one was requested
    if surrogate is not None:
        optimizer_kwargs = {'base_estimator': surrogate}
    else:
        optimizer_kwargs = None

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_jobs=n_jobs, n_iter=11,
        optimizer_kwargs=optimizer_kwargs
    )

    opt.fit(X_train, y_train)

    # this normally does not hold only if something is wrong
    # with the optimization procedure as such
    assert_greater(opt.score(X_test, y_test), 0.9)
def main():
    """Tune a GradientBoostingClassifier with Bayesian search and report scores.

    Reads ``../train_dataset.csv`` and ``../test_dataset.csv``; column 0 is
    used as the label and columns 2 onward as the features. Progress is
    printed after every search step, followed by the validation and test
    scores.
    """
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:], df_train.iloc[:, 0]
    X_test, y_test = df_test.iloc[:, 2:], df_test.iloc[:, 0]

    opt = BayesSearchCV(
        estimator=GradientBoostingClassifier(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/gradient_boosting.py
        search_spaces={
            'learning_rate': Real(0.01, 1, 'log-uniform'),
            'n_estimators': Integer(50, 2000),
            'subsample': Real(0.01, 1.0, 'uniform'),
            'max_depth': Integer(1, 10),
            'max_features': Real(0.1, 1.0, 'uniform'),
            'min_samples_split': Integer(2, 20),
            'min_samples_leaf': Integer(1, 20),
            'criterion': ['friedman_mse', 'mse', 'mae']
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True,
        random_state=42)

    def status_print(_):
        """Status callback during bayesian hyperparameter search"""
        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        # Convert numpy scalars to native Python values for json.dumps.
        # Values that are already native (str, float, int, bool) have no
        # ``.item()`` and are kept as-is; the previous code called ``.item()``
        # on every non-str/float value and crashed on native ints.
        best_params = {
            k: v.item() if hasattr(v, 'item') else v
            for k, v in opt.best_params_.items()
        }

        # Render each parameter as ``name=<json value>``.
        param_list = [
            '{}={}'.format(k, json.dumps(v)) for k, v in best_params.items()
        ]

        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')

        param = opt.estimator.__class__.__name__ + \
            '(' + ', '.join(param_list) + ')'

        # Get current parameters and the best parameters
        # NOTE(review): the label says roc_auc but no `scoring` is passed to
        # BayesSearchCV, so the value is the estimator's default score.
        print('Model #{}\nBest roc_auc: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
def main():
    """Tune an SVC with Bayesian search and report validation/test scores.

    Reads ``../train_dataset.csv`` and ``../test_dataset.csv``; column 0 is
    used as the label and columns 2 onward as the features (both converted to
    arrays). Progress is printed after every search step.
    """
    df_train = pd.read_csv('../train_dataset.csv')
    df_test = pd.read_csv('../test_dataset.csv')

    X_train, y_train = df_train.iloc[:, 2:].values, df_train.iloc[:, 0].values
    X_test, y_test = df_test.iloc[:, 2:].values, df_test.iloc[:, 0].values

    # log-uniform: understand as search over p = exp(x) by varying x
    opt = BayesSearchCV(
        estimator=SVC(),
        # ref: https://github.com/automl/auto-sklearn/blob/master/autosklearn/pipeline/components/classification/libsvm_svc.py
        search_spaces={
            'C': Real(1e-6, 1e+6, 'log-uniform'),
            'gamma': Real(3.0517578125e-05, 8, 'log-uniform'),
            'kernel': ['rbf', 'poly', 'sigmoid'],  # categorical parameter
            'decision_function_shape': ['ovo', 'ovr'],
            'degree': Integer(2, 5),
            'coef0': Real(-1, 1, 'uniform'),
            'tol': Real(1e-5, 1e-1, 'log-uniform')
        },
        cv=StratifiedKFold(n_splits=10, shuffle=True),
        n_jobs=3,
        n_iter=100,
        verbose=0,
        refit=True)

    def status_print(_):
        """Status callback during bayesian hyperparameter search"""
        # Get all the models tested so far in DataFrame format
        all_models = pd.DataFrame(opt.cv_results_)

        # Convert numpy scalars to native Python values for json.dumps.
        # Values that are already native (str, float, int, bool) have no
        # ``.item()`` and are kept as-is; the previous code called ``.item()``
        # on every non-str/float value and crashed on native ints.
        best_params = {
            k: v.item() if hasattr(v, 'item') else v
            for k, v in opt.best_params_.items()
        }

        # Render each parameter as ``name=<json value>``.
        param_list = [
            '{}={}'.format(k, json.dumps(v)) for k, v in best_params.items()
        ]

        if hasattr(opt.estimator, 'verbose'):
            param_list.append('verbose=True')

        param = opt.estimator.__class__.__name__ + \
            '(' + ', '.join(param_list) + ')'

        # Get current parameters and the best parameters
        # NOTE(review): the label says roc_auc but no `scoring` is passed to
        # BayesSearchCV, so the value is the estimator's default score.
        print('Model #{}\nBest roc_auc: {}\nBest params: {}\n'.format(
            len(all_models), np.round(opt.best_score_, 4), param))

    opt.fit(X_train, y_train, callback=status_print)

    print("val. score: %s" % opt.best_score_)
    print("test score: %s" % opt.score(X_test, y_test))
def get_model(self, X, y):
    """Return a BayesSearchCV over QDA's ``reg_param`` fitted on ``X`` / ``y``.

    A single search iteration is run with 3-fold cross validation.
    """
    tuner = BayesSearchCV(
        QuadraticDiscriminantAnalysis(),
        {'reg_param': Real(0, 1)},
        random_state=0,
        n_iter=1,
        cv=3,
        n_jobs=-1,
    )
    tuner.fit(X, y)
    return tuner
def test_searchcv_runs(surrogate, n_jobs, n_points, cv=None):
    """
    Test whether the cross validation search wrapper around sklearn
    models runs properly with available surrogates and with single
    or multiple workers and different number of parameter settings
    to ask from the optimizer in parallel.

    Parameters
    ----------

    * `surrogate` [str or None]:
        A class of the scikit-optimize surrogate used. None means
        to use default surrogate.

    * `n_jobs` [int]:
        Number of parallel processes to use for computations.

    * `n_points` [int]:
        Forwarded to BayesSearchCV as ``n_points``.

    * `cv` [optional]:
        Forwarded to BayesSearchCV as ``cv``.

    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    # forward the surrogate to the optimizer only when one was requested
    if surrogate is not None:
        optimizer_kwargs = {'base_estimator': surrogate}
    else:
        optimizer_kwargs = None

    opt = BayesSearchCV(
        SVC(),
        {
            'C': Real(1e-6, 1e+6, prior='log-uniform'),
            'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
            'degree': Integer(1, 8),
            'kernel': Categorical(['linear', 'poly', 'rbf']),
        },
        n_jobs=n_jobs, n_iter=11, n_points=n_points, cv=cv,
        optimizer_kwargs=optimizer_kwargs
    )

    opt.fit(X_train, y_train)

    # this normally does not hold only if something is wrong
    # with the optimization procedure as such
    assert_greater(opt.score(X_test, y_test), 0.9)
def test_searchcv_runs_multiple_subspaces():
    """
    Test whether the BayesSearchCV runs without exceptions when
    multiple subspaces are given.
    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    # used to try different model classes
    pipe = Pipeline([
        ('model', SVC())
    ])

    # single categorical value of 'model' parameter sets the model class
    lin_search = {
        'model': Categorical([LinearSVC()]),
        'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    }

    dtc_search = {
        'model': Categorical([DecisionTreeClassifier()]),
        'model__max_depth': Integer(1, 32),
        'model__min_samples_split': Real(1e-3, 1.0, prior='log-uniform'),
    }

    svc_search = {
        'model': Categorical([SVC()]),
        'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
        'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'model__degree': Integer(1, 8),
        'model__kernel': Categorical(['linear', 'poly', 'rbf']),
    }

    # Subspaces given as (space, n) use n iterations; a bare space falls
    # back to n_iter.
    opt = BayesSearchCV(
        pipe,
        [(lin_search, 1), (dtc_search, 1), svc_search],
        n_iter=2
    )

    opt.fit(X_train, y_train)

    # test if all subspaces are explored
    total_evaluations = len(opt.cv_results_['mean_test_score'])
    assert total_evaluations == 1+1+2, "Not all spaces were explored!"
def _fit_svc(n_jobs=1, n_points=1, cv=None):
    """
    Fit BayesSearchCV with an SVC on a larger synthetic classification task
    and check that the refit model scores above 0.9 on that same data.
    """
    X, y = make_classification(
        n_samples=1000,
        n_features=20,
        n_redundant=0,
        n_informative=18,
        random_state=1,
        n_clusters_per_class=1,
    )

    svc_space = {
        'C': Real(1e-3, 1e+3, prior='log-uniform'),
        'gamma': Real(1e-3, 1e+1, prior='log-uniform'),
        'degree': Integer(1, 3),
    }
    search = BayesSearchCV(
        SVC(),
        svc_space,
        n_jobs=n_jobs,
        n_iter=11,
        n_points=n_points,
        cv=cv,
        random_state=42,
    )
    search.fit(X, y)

    assert_greater(search.score(X, y), 0.9)
def test_search_cv_internal_parameter_types():
    """Parameters handed to the estimator must be native float / int / str."""

    class TypeCheckEstimator(BaseEstimator):
        """Dummy estimator whose ``fit`` asserts native Python parameter types."""

        def __init__(self, float_param=0.0, int_param=0, str_param=""):
            self.float_param = float_param
            self.int_param = int_param
            self.str_param = str_param

        def fit(self, X, y):
            assert isinstance(self.float_param, float)
            assert isinstance(self.int_param, int)
            assert isinstance(self.str_param, str)
            return self

        def score(self, X, y):
            # Constant score: only the parameter types matter here.
            return 0.0

    # Regression check: this setup used to fail.
    X, y = make_classification(n_samples=10, n_features=4)

    spaces = {
        'float_param': [0.0, 1.0],
        'int_param': [0, 10],
        'str_param': ["one", "two", "three"],
    }
    search = BayesSearchCV(
        estimator=TypeCheckEstimator(),
        search_spaces=spaces,
        n_iter=11
    )
    search.fit(X, y)
def test_searchcv_sklearn_compatibility():
    """
    Test whether the BayesSearchCV is compatible with base sklearn methods
    such as clone, set_params, get_params.
    """
    # return_X_y must be passed by keyword: it is keyword-only in current
    # scikit-learn releases.
    X, y = load_iris(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=0
    )

    # used to try different model classes
    pipe = Pipeline([
        ('model', SVC())
    ])

    # single categorical value of 'model' parameter sets the model class
    lin_search = {
        'model': Categorical([LinearSVC()]),
        'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    }

    dtc_search = {
        'model': Categorical([DecisionTreeClassifier()]),
        'model__max_depth': Integer(1, 32),
        'model__min_samples_split': Real(1e-3, 1.0, prior='log-uniform'),
    }

    svc_search = {
        'model': Categorical([SVC()]),
        'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
        'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'model__degree': Integer(1, 8),
        'model__kernel': Categorical(['linear', 'poly', 'rbf']),
    }

    opt = BayesSearchCV(
        pipe,
        [(lin_search, 1), svc_search],
        n_iter=2
    )

    # The clone is taken before set_params, so it keeps the original spaces.
    opt_clone = clone(opt)

    params, params_clone = opt.get_params(), opt_clone.get_params()
    assert params.keys() == params_clone.keys()

    for param, param_clone in zip(params.items(), params_clone.items()):
        assert param[0] == param_clone[0]
        assert isinstance(param[1], type(param_clone[1]))

    opt.set_params(search_spaces=[(dtc_search, 1)])

    opt.fit(X_train, y_train)
    opt_clone.fit(X_train, y_train)

    total_evaluations = len(opt.cv_results_['mean_test_score'])
    total_evaluations_clone = len(opt_clone.cv_results_['mean_test_score'])

    # test if expected number of subspaces is explored:
    # opt now has one subspace with 1 iteration; the clone still has
    # lin_search (1 iteration) plus svc_search (n_iter=2).
    assert total_evaluations == 1
    assert total_evaluations_clone == 1+2