def evalModel(data, labels):
    loss = make_scorer(get_rmsle, greater_is_better=False)
    seed1 = 42
    clf = xgb.XGBRegressor(seed=seed1, silent=True)
    param_dist = {
        "learning_rate": sp_uniform(0.01, 0.1),
        "n_estimators": sp_randint(50, 500),
        "max_depth": sp_randint(2, 6),
        "subsample": sp_uniform(0.5, 0.4),
        "max_delta_step": sp_uniform(1, 2),
        "min_child_weight": sp_uniform(1, 6),
        "colsample_bytree": sp_uniform(0.8, 0.2)
    }
    n_iter_search = 60
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       cv=5, scoring=loss,
                                       n_iter=n_iter_search, n_jobs=-1,
                                       pre_dispatch='n_jobs', verbose=2)
    random_search.fit(data, labels)  # fit was missing; report() needs a fitted search
    report(random_search.grid_scores_, n_top=5)
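The sp_uniform calls above follow SciPy's loc/scale convention: sp_uniform(loc, scale) samples from [loc, loc + scale], not from (low, high). A minimal, self-contained check of that convention (the variable names here are illustrative, not from the snippet above):

# Minimal sketch: scipy.stats.uniform uses (loc, scale) -> samples in [loc, loc + scale].
from scipy.stats import uniform as sp_uniform

lr_dist = sp_uniform(0.01, 0.1)        # learning_rate drawn from [0.01, 0.11]
subsample_dist = sp_uniform(0.5, 0.4)  # subsample drawn from [0.5, 0.9]
samples = lr_dist.rvs(size=5, random_state=42)
assert all(0.01 <= s <= 0.11 for s in samples)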
def fit(self, x_train, y_train):
    self.processing_steps = [StandardScaler()]
    svr = SVR(kernel='rbf', gamma=0.1)
    # http://www.csie.ntu.edu.tw/~cjlin/papers/guide/guide.pdf
    # C = [2**i for i in np.arange(start=-5, stop=16, step=2)]
    # gamma = [2**i for i in np.arange(start=-15, stop=4, step=2)]
    # https://stats.stackexchange.com/questions/43943/
    # which-search-range-for-determining-svm-optimal-c-
    # and-gamma-parameters
    # The grid candidates below are kept for reference only; the search uses the
    # continuous sp_uniform distributions instead.
    C = [2**i for i in [-3, -2, -1, 0, 1, 2, 3, 4, 5]]
    gamma = [2**i for i in [-5, -4, -3, -2, -1, 0, 1, 2, 3]]
    params = {"C": sp_uniform(0.125, 32), "gamma": sp_uniform(0.03125, 8)}
    params.update(self.kwargs)
    reg = RandomizedSearchCV(estimator=svr, param_distributions=params,
                             n_iter=10, scoring=self.score['function'],
                             cv=3, iid=True)
    clf = MultiOutputRegressor(reg)
    self._update_pipeline_and_fit(x_train, y_train, [clf])
def params_optimize(x_train, y_train):
    x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.10, stratify=y_train)
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'auc',
        "eval_set": [(x_test, y_test)],
        'eval_names': ['valid'],
        # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
        'verbose': 100,
        'categorical_feature': 'auto'
    }
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.9),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }
    hp_points_to_test = 200
    clf = lgb.LGBMClassifier(max_depth=-1, silent=True, metric='None',
                             n_jobs=4, n_estimators=10000)
    # Note: the assignment below overrides the classifier defined above;
    # only the second LGBMClassifier is actually searched.
    clf = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.9497036,
        subsample=0.8715623,
        max_depth=8,
        reg_alpha=0.041545473,
        reg_lambda=0.0735294,
        min_split_gain=0.0222415,
        min_child_weight=39.3259775,
        silent=-1,
        verbose=-1,
    )
    gs = RandomizedSearchCV(estimator=clf, param_distributions=param_test,
                            n_iter=hp_points_to_test, scoring='roc_auc',
                            cv=5, refit=True, verbose=True)
    gs.fit(x_train, y_train, **fit_params)
    print('Best score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_))
def train_light_gbm_regressor(X, y, cv, n_params, test_size=.2, n_jobs=-1):
    LGBM_params = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }
    Xt, Xv, yt, yv = train_test_split(X, y, test_size=test_size)
    param_list = list(ParameterSampler(LGBM_params, n_iter=n_params))
    param_scores = []
    int_skf = KFold(n_splits=cv)
    for p in range(n_params):
        best_scs = []
        for train_i, test_i in int_skf.split(Xt, yt):
            Xt_train, yt_train = Xt[train_i], yt[train_i]
            Xt_test, yt_test = Xt[test_i], yt[test_i]
            model = LGBMRegressor(n_jobs=n_jobs, silent=True,
                                  n_estimators=5000, **param_list[p])
            model.fit(Xt_train, yt_train, eval_set=(Xt_test, yt_test),
                      verbose=False, early_stopping_rounds=300)
            best_sc = model.best_score_['valid_0']['l2']
            best_scs.append(best_sc)
        param_scores.append(np.mean(best_scs))
    bp_ind = np.argmin(param_scores)
    model = LGBMRegressor(n_jobs=n_jobs, silent=True, n_estimators=5000,
                          **param_list[bp_ind])
    model.fit(Xt, yt, eval_set=(Xv, yv), verbose=False,
              early_stopping_rounds=500)
    return model
def hyperparameter_search(train_x, train_y):
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'multiclass',
        "eval_set": [(train_x, train_y)],
        'eval_names': ['valid'],
        'verbose': 100,
        'categorical_feature': [
            'max_dist_mode', 'min_dist_mode', 'max_price_mode',
            'min_price_mode', 'max_eta_mode', 'min_eta_mode',
            'first_mode', 'weekday', 'hour'
        ]
    }
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }
    # This parameter defines the number of HP points to be tested
    n_HP_points_to_test = 100
    import lightgbm as lgb
    from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
    # n_estimators is set to a "large value"; the actual number of trees built
    # depends on early stopping, and 5000 is only the absolute maximum.
    clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True,
                             n_jobs=4, n_estimators=5000)
    # Note: 'f1' scoring is binary-only in scikit-learn; for a multiclass target
    # use e.g. 'f1_macro' or 'f1_weighted'.
    gs = RandomizedSearchCV(estimator=clf, param_distributions=param_test,
                            n_iter=n_HP_points_to_test, scoring='f1',
                            cv=3, refit=True, random_state=314, verbose=True)
    gs.fit(train_x, train_y, **fit_params)
    print('Best score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_))
def random_search():
    from time import time
    from scipy.stats import uniform as sp_uniform, randint as sp_randint
    # Note: sklearn.grid_search and sklearn.cross_validation are the pre-0.18
    # module names; in current scikit-learn both live in sklearn.model_selection.
    from sklearn.grid_search import RandomizedSearchCV
    from sklearn.cross_validation import ShuffleSplit
    crimes = np.load(DATA_FILE)
    # features_train = crimes['features_train']
    all_labels = sorted(list(set(np.unique(crimes['labels_train'])) |
                             set(np.unique(crimes['labels_val']))))
    batch_size = 64
    labels_train = create_labels(crimes['labels_train'], all_labels)
    labels_vals = create_labels(crimes['labels_val'], all_labels)
    labels_full = create_labels(crimes['labels'], all_labels)
    param_dist = {'layers': sp_randint(1, 3),
                  "hidden_units": [64, 128, 256],
                  'input_dropout': sp_uniform(0, 0.5),
                  "hidden_dropout": sp_uniform(0, 0.75),
                  "learning_rate": sp_uniform(0.01, 0.1),
                  "weight_decay": sp_uniform(0, 0.01)
                  }
    model = NeuralNetworkClassifier(n_classes=len(all_labels),
                                    batch_size=batch_size,
                                    valid_set=(crimes['features_val'], labels_vals))
    n_iter_search = 40
    np.random.seed(42)
    random_searcher = RandomizedSearchCV(model, param_distributions=param_dist,
                                         scoring=None, n_iter=n_iter_search,
                                         random_state=42, error_score=100,
                                         verbose=5,
                                         cv=ShuffleSplit(n=crimes['features_train'].shape[0],
                                                         n_iter=1, test_size=0))
    start = time()
    random_searcher.fit(crimes['features_train'], labels_train.ravel())
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_searcher.grid_scores_)
    loss_train = log_loss(labels_train,
                          random_searcher.predict_proba(crimes['features_train']))
    loss_val = log_loss(labels_vals,
                        random_searcher.predict_proba(crimes['features_val']))
    loss_all = log_loss(labels_full,
                        random_searcher.predict_proba(crimes['features']))
    print('loss_all: ', loss_all)
    print('loss_train: ', loss_train)
    print('loss_val: ', loss_val)
    return loss_val
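create_labels is called above but not defined in any of these snippets. A minimal sketch of such a helper, assuming it maps raw label values to integer indices via the shared, sorted all_labels list (the column shape is an assumption, chosen to match the .ravel() call above):

# Hypothetical create_labels helper: maps raw labels to integer indices
# according to the shared all_labels ordering, as a column vector.
import numpy as np

def create_labels(raw_labels, all_labels):
    index = {label: i for i, label in enumerate(all_labels)}
    return np.array([index[label] for label in raw_labels]).reshape(-1, 1)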
def fit(self, x_train, y_train):
    self.processing_steps = [StandardScaler()]
    ann = MLPRegressor()
    params = {
        'hidden_layer_sizes': sp_randint(20, 150),
        'alpha': sp_uniform(0, 100),
        'max_iter': sp_randint(100, 2000),
        'solver': ['lbfgs'],
        # 'identity', 'logistic', 'tanh', 'relu'
        'activation': ['relu']
    }
    if 'hidden_layer_sizes' in self.kwargs:
        self.kwargs['hidden_layer_sizes'] = self.parsefunction(
            self.kwargs['hidden_layer_sizes'])
    params.update(self.kwargs)
    clf = RandomizedSearchCV(estimator=ann, param_distributions=params,
                             n_iter=10, scoring=self.score['function'],
                             cv=3, iid=True)
    self._update_pipeline_and_fit(x_train, y_train, [clf])
def get_random_grid_CV_params():
    """Define the Random Grid Search parameters for each model."""
    logit_params = {"C": sp_expon(loc=0.001, scale=1),
                    "fit_intercept": [True, False],
                    "intercept_scaling": sp_randint(1, 5),
                    "warm_start": [False, True]
                    }
    # Note: sp_randint(1, 50) can draw 1, which recent scikit-learn versions
    # reject for min_samples_split (it must be >= 2).
    rf_params = {"min_samples_split": sp_randint(1, 50),
                 "min_samples_leaf": sp_randint(1, 50),
                 "criterion": ["gini", "entropy"],
                 "class_weight": ['balanced', 'balanced_subsample']
                 }
    ada_dt_params = {"learning_rate": sp_expon(loc=0.001, scale=1.5),
                     "algorithm": ['SAMME.R', 'SAMME']
                     }
    gbc_params = {"learning_rate": sp_expon(loc=0.001, scale=0.5),
                  "subsample": sp_uniform(loc=0.2, scale=0.8),
                  "max_features": [None, 'auto'],
                  "max_depth": sp_randint(2, 6),
                  }
    svc_params = {"C": sp_expon(loc=0.001, scale=2),
                  "kernel": ['rbf', 'poly'],
                  "degree": sp_randint(2, 10),
                  "coef0": [0, 1, 2],
                  "shrinking": [True, False]
                  }
    rnd_CV_param_distributions = {'Logistic': logit_params,
                                  'RandomForest': rf_params,
                                  'AdaBoost_DT': ada_dt_params,
                                  'GBC': gbc_params,
                                  'SVC': svc_params
                                  }
    return rnd_CV_param_distributions
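A minimal sketch of how one of these per-model distribution dicts could be consumed, assuming the function above (and its scipy imports) is in scope; the estimator and toy data here are illustrative, not part of the original snippet:

# Illustrative only: wire the 'Logistic' distribution dict to a LogisticRegression search.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

X, y = make_classification(n_samples=200, random_state=0)
param_dists = get_random_grid_CV_params()
search = RandomizedSearchCV(LogisticRegression(max_iter=1000),
                            param_distributions=param_dists['Logistic'],
                            n_iter=20, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)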
def get_param_distribution_for_model(model_str, iter_count):
    pdist = {}
    if model_str in ['ET', 'RF']:
        pdist['n_estimators'] = sp_randint(100, 500)
        pdist['max_features'] = [
            0.15, 0.2, 0.25, 0.3, 0.35, 0.40, 0.45, 0.50, 0.55, 0.60,
            0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95, 1
        ]  # sp_uniform(0.5, 1)
        pdist['min_samples_split'] = sp_randint(1, 15)
        pdist['min_samples_leaf'] = sp_randint(1, 15)
        pdist['bootstrap'] = [True, False]
    elif model_str == 'GP':
        # Fails because GP will accept either single values or array-like values,
        # and it seems RandomizedSearchCV etc. get confused
        # (as they must, given no other information).
        # corr_methods = ['absolute_exponential', 'squared_exponential',
        #                 'generalized_exponential', 'cubic', 'linear']
        pdist['corr'] = [
            'absolute_exponential', 'squared_exponential', 'cubic', 'linear'
        ]
        # theta0_range = sp_uniform(0.1, 0.9)
        # thetaL_range = sp_uniform(1e-5, 3e-1)
        # thetaU_range = sp_uniform(7e-1, 1)
        # random_start_range = sp_randint(1, 3)
        '''
        pdist = []
        for i in range(iter_count):
            trial_dict = {}
            trial_dict['corr'] = [random.choice(corr_methods)]
            #trial_dict['theta0'] = [theta0_range.rvs()]
            #trial_dict['thetaL'] = [[thetaL_range.rvs()]]
            #trial_dict['thetaU'] = [[thetaU_range.rvs()]]
            #trial_dict['random_start'] = [random_start_range.rvs()]
            pdist.append(trial_dict)
        '''
    elif model_str == 'KNN':
        pdist['weights'] = ['uniform', 'distance']
        pdist['metric'] = ['euclidean', 'manhattan', 'chebyshev']
        pdist['n_neighbors'] = sp_randint(2, 50)
    elif model_str == 'SVR':
        pdist['kernel'] = ['rbf', 'sigmoid', 'poly']
        pdist['degree'] = sp_randint(2, 6)
        pdist['gamma'] = sp_uniform(1e-2, 1)
        pdist['coef0'] = sp_uniform(0, 1)
        pdist['epsilon'] = sp_uniform(1e-2, 3e-1)
    return pdist
def get_param_dist(self, X):
    num_rows = X.shape[0]
    num_features = X[self.inputs].shape[1]
    param_dist = {
        'rank': sp_randint(1, num_features),
        'batch_size': sp_randint(1, num_rows),
        'lr': sp_uniform(loc=0.001, scale=0.01),
    }
    return param_dist
def hpsearch_lgb(x_tr, y_tr, x_va, y_va):
    n_HP_points_to_test = 100
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform
    param_test = {
        'num_leaves': sp_randint(6, 50),
        'min_child_samples': sp_randint(100, 500),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
        'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
    }
    reg = lgb.LGBMRegressor(max_depth=-1, random_state=314, silent=True,
                            metric='None', n_jobs=4, n_estimators=5000)
    gs = RandomizedSearchCV(estimator=reg, param_distributions=param_test,
                            n_iter=n_HP_points_to_test,
                            scoring='neg_root_mean_squared_error',
                            cv=3, refit=True, random_state=314, verbose=True)
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": 'rmse',
        "eval_set": [(x_va, y_va)],
        'eval_names': ['valid'],
        # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
        'verbose': 100
        # 'categorical_feature': 'auto'
    }
    gs.fit(x_tr, y_tr, **fit_params)
    print('Best score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_))
def method_lgbm(random_state):
    params = {
        'learning_rate': 0.01,
        'n_jobs': 10,
        'n_estimators': 3000,
        'random_state': random_state,
        'verbose': -1,
        'device': 'cpu',
        'subsample': 0.5,
        'feature_fraction': 0.01,
        'lambda_l2': 0.1,
        'max_depth': 1,
        'min_data_in_leaf': 20
    }
    return lgb.LGBMClassifier(**params), {
        'learning_rate': sp_uniform(loc=0.001, scale=0.03),
        'subsample': sp_uniform(loc=0.5, scale=0.3),
        'max_depth': [1, 3, 7],
        'min_data_in_leaf': [1, 3, 7, 10, 20],
    }
def turning_lgb(self, X_train, y_train, X_val, y_val):
    """
    Apply randomized search to tune lgb's parameters on the validation dataset.

    Args:
        X_train: DataFrame: train set
        y_train: Series: train set response
        X_val: DataFrame: validation set
        y_val: Series: validation set response
    Returns:
        the tuned parameters for lgb
    """
    param_test = {
        'min_child_samples': sp_randint(10, 100),
        'subsample': sp_uniform(loc=0.2, scale=0.8),
        'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1],
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6)
    }
    # This parameter defines the number of HP points to be tested
    n_HP_points_to_test = 300
    clf = lgb.LGBMClassifier(is_unbalance=True)
    gs = RandomizedSearchCV(estimator=clf, param_distributions=param_test,
                            n_iter=n_HP_points_to_test, scoring='f1',
                            cv=3, refit=True, verbose=False)
    fit_params = {
        "early_stopping_rounds": 30,
        "eval_metric": ['logloss'],
        "eval_set": [(X_val, y_val)],
        'eval_names': ['valid']
    }
    gs.fit(X_train, y_train, **fit_params)
    print('Best f1_score reached: {} with params: {} '.format(
        gs.best_score_, gs.best_params_))
    return gs.best_params_
def __init__(self, low, high, step_name, variable_name):
    """Random variable uniformly distributed between `low` and `high`.

    Inputs
    ------
    low, high : float
    step_name, variable_name : str
        The name of the step in the sklearn pipeline and the name of
        the variable.
    """
    super().__init__(step_name, variable_name, sp_uniform(low, high - low))
    self.low = min(low, high)
    self.high = max(low, high)
def rand_distribution_hr(self):
    # LDA and GaussianNB don't need a hyperparameter search
    param_dist = {}
    if self.method == 'SVC':
        param_dist.update({'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                           'C': sp_randint(1, 50),
                           'degree': sp_randint(1, 10),
                           'coef0': sp_randint(1, 50),
                           'gamma': sp_randint(1, 20)})
    elif self.method == 'MultinomialNB':
        # (min_value, max_value - min_value)
        param_dist.update({'alpha': sp_uniform(0, 5)})
    elif self.method == 'RF':
        param_dist.update({'n_estimators': sp_randint(10, 100),
                           'max_features': ['log2', 'sqrt', 1.0]})
    elif self.method == 'KNN':
        param_dist.update({'n_neighbors': sp_randint(5, 30),
                           'weights': ['uniform', 'distance'],
                           'leaf_size': sp_randint(10, 50)})
    elif self.method == 'MLP':
        param_dist.update({'hidden_layer_sizes': [(30, 30), (40, 40), (50, 50),
                                                  (50, 30), (50, 20)],
                           'activation': ['identity', 'logistic', 'tanh', 'relu'],
                           'alpha': sp_uniform(1e-5, 1e-1)})
    return param_dist
def fit_lgb(X, y, lgb_path):
    # Define the search space
    param_test = {'num_leaves': sp_randint(6, 50),
                  'min_child_samples': sp_randint(100, 500),
                  'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
                  'subsample': sp_uniform(loc=0.2, scale=0.8),
                  'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
                  'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
                  'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}
    n_HP_points_to_test = 100
    # Build the base classifier
    clf = lgb.LGBMClassifier(max_depth=-1, random_state=314, silent=True,
                             metric='auc', objective='binary',
                             n_jobs=-1, n_estimators=5000)
    # Perform the randomized search
    grid_search = RandomizedSearchCV(estimator=clf, param_distributions=param_test,
                                     n_iter=n_HP_points_to_test, scoring='roc_auc',
                                     cv=3, refit=True, random_state=314, verbose=True)
    grid_search.fit(X, y)
    # Keep the best model
    model = grid_search.best_estimator_
def tune_grad_boost_regressor(X, y, k_fold, n_iter_search):
    model_all = []
    r2_all = []
    r2_mean_all = []
    # Gradient Boosting Regressor
    # loss = 'ls'  # ls, huber, lad
    regressor = GradientBoostingRegressor(loss='ls')
    # specify parameters and distributions to sample from
    param_dist = {
        "learning_rate": sp_uniform(0, 1),
        "n_estimators": sp_randint(40, 1500),
        "max_depth": sp_randint(2, 6),
        "max_features": sp_randint(1, 11),
        "min_samples_split": sp_randint(2, 11),
        "min_samples_leaf": sp_randint(1, 11)
    }
    # run randomized search
    random_search = RandomizedSearchCV(estimator=regressor,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search, fit_params=None,
                                       n_jobs=1, iid=True, refit=True,
                                       cv=k_fold, verbose=1,
                                       pre_dispatch='2*n_jobs', random_state=None,
                                       error_score='raise', return_train_score=True)
    start = time()
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidates"
          " parameter settings." % ((time() - start), n_iter_search))
    # report results from the search
    report(random_search.cv_results_)
    model = random_search.best_estimator_
    print("The best score in the search process is %f"
          % random_search.best_score_)
    return model
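report() is called in several snippets but never defined here. A minimal sketch of such a helper, assuming it ranks entries of cv_results_ by mean test score (the name and output format are assumptions, modeled on the common scikit-learn example):

# Hypothetical report() helper: prints the top-ranked candidates from cv_results_.
import numpy as np

def report(cv_results, n_top=3):
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(cv_results['rank_test_score'] == rank):
            print("Model rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                cv_results['mean_test_score'][idx],
                cv_results['std_test_score'][idx]))
            print("Parameters: {0}\n".format(cv_results['params'][idx]))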
def tune(self, X_train, Y_train):
    # define grid of possible hyper parameter values
    # sp_norm is assumed to be scipy.stats.norm, imported as
    # `from scipy.stats import norm as sp_norm` (loc=mean, scale=std).
    param_dist = {
        'nodes': sp_randint(5, 50),
        'eta': sp_norm(.05, .1),
        'lmbda': sp_uniform(0, 1),
        'patience': sp_randint(5, 15)
    }
    self.net.set_params(verbose=False)
    random_search = RandomizedSearchCV(self.net,
                                       scoring='neg_mean_squared_error',
                                       param_distributions=param_dist,
                                       cv=5, n_jobs=8)
    random_search.fit(X_train, Y_train)
    self.params = random_search.best_params_
    self.net = Network(self.NF,
                       nodes=self.params['nodes'],
                       eta=self.params['eta'],
                       lmbda=self.params['lmbda'],
                       patience=self.params['patience'])
    return self.params
# Set up decay learning rate
def learning_rate_power(current_round):
    base_learning_rate = 0.19000424246380565
    min_learning_rate = 0.01
    lr = base_learning_rate * np.power(0.995, current_round)
    return max(lr, min_learning_rate)

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

tune_params = {
    'n_estimators': [200, 500, 1000, 2500, 5000],
    'max_depth': sp_randint(4, 12),
    'colsample_bytree': sp_uniform(loc=0.8, scale=0.15),
    'min_child_samples': sp_randint(60, 120),
    'subsample': sp_uniform(loc=0.75, scale=0.25),
    'reg_lambda': [1e-3, 1e-2, 1e-1, 1]
}
fit_params = {
    'early_stopping_rounds': 40,
    'eval_metric': 'accuracy',
    'eval_set': [(X_train, y_train), (X_val, y_val)],
    'verbose': 20,
    'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_power)]
}
lgb_clf = lgb.LGBMClassifier(n_jobs=4, objective='binary', random_state=1)
gs = RandomizedSearchCV(estimator=lgb_clf,
import numpy as np
import pickle
import xgboost as xgb
from scipy.stats import randint as sp_randint, uniform as sp_uniform
# Note: the StratifiedKFold(y, n_folds=...) signature below is the old
# pre-0.18 scikit-learn API (sklearn.cross_validation / sklearn.grid_search).
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import RandomizedSearchCV

n_iter = 2000
k_fold = 10
file_dat = np.load("train_test_v2.npz")
X_train = file_dat["X_train"]
Y_train = file_dat["y_train"]
X_test = file_dat["X_test"]
cv = StratifiedKFold(Y_train, n_folds=k_fold, shuffle=True)
# initialize the classifier
GB = xgb.XGBClassifier()
param_grid = {'max_depth': sp_randint(1, 100),
              'learning_rate': sp_uniform(loc=0e0, scale=1e0),
              'objective': ['binary:logistic'],
              'nthread': [8],
              'missing': [np.nan],
              'reg_alpha': [0.01, 0.017782794, 0.031622777, 0.056234133,
                            0.1, 0.17782794, 0.31622777, 0.56234133, 1., 1.77827941,
                            3.16227766, 5.62341325, 10.,
                            17.7827941, 31.6227766, 56.2341325, 100.],
              'colsample_bytree': sp_uniform(loc=0.2e0, scale=0.8e0),
              'subsample': np.arange(0.6, 1.0, step=0.05),
              'n_estimators': sp_randint(200, 800)
              }
search_GB = RandomizedSearchCV(GB, param_grid, scoring='accuracy',
                               n_iter=n_iter, cv=cv, verbose=True)
search_GB.fit(X_train, Y_train)
# starting parameters
param_test = {
    # =============================================================================
    # 'num_leaves': sp_randint(100, 1000),
    # 'max_depth': sp_randint(1, 10),
    # 'min_data_in_leaf': sp_randint(1, 100),
    # =============================================================================
    # =============================================================================
    # 'min_child_samples': sp_randint(100, 1000),
    # 'min_child_weight': sp_uniform(loc=0, scale=1.0),  # [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    # 'subsample': sp_uniform(loc=0.2, scale=0.8),
    # 'colsample_bytree': sp_uniform(loc=0.4, scale=0.6)
    # =============================================================================
    'bagging_freq': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'bagging_fraction': sp_uniform(loc=0.0, scale=1.0),
    'reg_alpha': sp_uniform(loc=0.0, scale=1.0),   # [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': sp_uniform(loc=0.0, scale=1.0)   # [0, 1e-1, 1, 5, 10, 20, 50, 100]
}
n_hyper_parameter_points_to_test = 100
# n_estimators is set to a "large value"; the actual number of trees built depends
# on early stopping, and 50000 is only the absolute maximum.
lgb_reg = lgb.LGBMRegressor(random_state=14,
                            silent=True,
                            metric='gamma',
                            boosting='gbdt',
                            n_jobs=1,
                            n_estimators=50000,
                            bagging_seed=1,
                            max_depth=4,
                            min_data_in_leaf=2,
                            num_leaves=321,
                            colsample_bytree=0.5155872558124424,
                            min_child_samples=815,
                            min_child_weight=0.5122614044196322,
                            subsample=0.5555279793433687
                            )
gs = RandomizedSearchCV(estimator=lgb_reg,
                        param_distributions=param_test,
                        n_iter=n_hyper_parameter_points_to_test,
                        scoring='neg_mean_absolute_error',
                        cv=5, refit=True, random_state=14, verbose=True, n_jobs=6)
from evaluate import model_selection_pipeline, generate_challenge_run
# from sklearn.utils.estimator_checks import check_estimator
# check_estimator(RandomForest)
from scipy.stats import randint as sp_randint, uniform as sp_uniform

dataset = '../data/pl_trusted_size1_noclc_scaled_pca.csv'
init_param = dict(n_estimators=250, max_depth=3, bootstrap=False)
param_grid = {
    'n_estimators': sp_randint(50, 500),
    'criterion': ['gini', 'entropy'],
    'max_depth': sp_randint(2, 15),
    'min_samples_split': sp_randint(2, 20),
    'min_samples_leaf': sp_randint(1, 20),
    'max_features': sp_uniform(0.2, 0.8),  # range [0.2, 1.]
    'bootstrap': [False, True]
}
results_file = 'experiments/random_forest_model.txt'
model_file = 'experiments/random_forest_model.pkl'
model_selection_pipeline(dataset, RandomForestModel, param_grid,
                         results_file=results_file, model_file=model_file)

# -----------------------------------------------------------------------
# Random search results on subsample: (random search of 20)
    # 'callbacks': [lgb.reset_parameter(learning_rate=learning_rate_010_decay_power_099)],
    'verbose': 500,
    'categorical_feature': 'auto',
    },
}

from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform

SEARCH_PARAMS = {
    '_': {
        'min_child_samples': sp_randint(2, 30),
        # 'num_leaves': [50, 100, 150, 200, 300, 500],
        # 'subsample': [0.2, 0.4, 0.6, 0.8, 0.9, 1],
        # 'learning_rate': sp_uniform(loc=0.001, scale=0.020),
        'subsample': sp_uniform(loc=0.3, scale=0.7),
        'colsample_bytree': sp_uniform(loc=0.3, scale=0.7),
        'reg_alpha': sp_uniform(loc=0.0, scale=0.4),
        'reg_lambda': sp_uniform(loc=0.0, scale=0.4),
    },
}

# train, test, structures, contributions = t4_load_data(INPUT_DIR)
#
# train, test = t4_criskiev_features(train, test, structures)
#
# structures = t4_merge_yukawa(INPUT_DIR, structures)
#
# structures = t4_crane_features(structures)
#
# train, test = t4_merge_structures(train, test, structures)
print("Parameters: {0}".format(score.parameters)) print("") print("Starting RandomizedSearchCV") n_features = X_train.shape[1] N_FOLDS = 10 model = xgb.XGBRegressor() # specify parameters and distributions to sample from param_dist = {"objective": ["reg:linear"], # "booster" : ["gbtree"], # "eta": [0.1, 0.3, 0.5, 0.7], "max_depth": sp_randint(10, 30), "subsample": sp_uniform(0.1, 0.9), "colsample_bytree": sp_uniform(0.1, 1.0), "silent": [1], "seed": [42] } # run randomized search n_iter_search = 30 folds = cv.KFold(n=len(y_train), n_folds=N_FOLDS, shuffle=True, random_state=42) random_search = RandomizedSearchCV(model, param_distributions=param_dist, n_iter=n_iter_search, cv=folds, n_jobs=-1, scoring=utils.rmspe_scorer, iid=True,
x = np.load("/home/arjun/PycharmProjects/ML_proj/scripts/dataset1.npy") y = np.load("/home/arjun/PycharmProjects/ML_proj/scripts/outcome1.npy") X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=1) n_iter = 5 k_fold = 5 cv = StratifiedKFold(n_splits=k_fold, shuffle=True) GB = xgb.XGBClassifier() param_grid = {'max_depth': sp_randint(1, 90), 'learning_rate': sp_uniform(loc=0e0, scale=1e0), 'objective': ['multi:softprob'], 'nthread': [8], 'missing': [np.nan], 'reg_alpha': [0.01, 0.017782794, 0.031622777, 0.056234133, \ 0.1, 0.17782794, 0.31622777, 0.56234133, 1., 1.77827941, \ 3.16227766, 5.62341325, 10., \ 17.7827941, 31.6227766, 56.2341325, 100.], 'colsample_bytree': sp_uniform(loc=0.2e0, scale=0.8e0), 'subsample': sp_uniform(loc=0.2e0, scale=0.8e0), 'n_estimators': sp_randint(50, 200)} search_GB = RandomizedSearchCV(GB,param_grid,\ n_iter=n_iter,cv=cv,verbose=True).fit(X_train,y_train) print(search_GB.cv_results_) print(' ', search_GB.best_score_)
param_dist = {"max_depth": sp_randint(5, 50), "max_features": [0.1, 0.01, 0.001, 'auto', 'log2'], "min_samples_split": sp_randint(1, 11), "min_samples_leaf": sp_randint(1, 11), "bootstrap": [True, False] } rf1 = model_param_search( rf1, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='rf1') rf2 = model_param_search( rf2, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='rf2') ext1 = model_param_search( rf1, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='ext1') ext2 = model_param_search( rf2, x_train, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='ext2') xgb_estimator_fit(xgb1, x_train_xgb, y_train, 'mlogloss', useTrainCV=True, cv_folds=n_cv, early_stopping_rounds=50) param_dist = {"max_depth": sp_randint(10, 40), "min_child_weight": sp_randint(1, 20), "subsample": sp_uniform(0, 1), "colsample_bytree": sp_uniform(0, 1), "gamma": [i/10.0 for i in range(0, 5)] } xgb1 = model_param_search( xgb1, x_train_xgb, y_train, param_dist, scoring, n_iter_search, n_cv, verbose, model_id='xgb1') xgb_estimator_fit(xgb1, x_train_xgb, y_train, 'mlogloss', useTrainCV=True, cv_folds=n_cv, early_stopping_rounds=50)
def uniform(a, b):
    loc = a
    scale = b - a
    return sp_uniform(loc, scale)
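This tiny wrapper converts an (a, b) interval into SciPy's (loc, scale) parameterization. A quick illustrative check, assuming sp_uniform is scipy.stats.uniform as in the other snippets (variable names here are mine):

# Illustrative check of the (a, b) -> (loc, scale) conversion above.
dist = uniform(2.0, 5.0)             # equivalent to sp_uniform(loc=2.0, scale=3.0)
draws = dist.rvs(size=1000, random_state=0)
assert draws.min() >= 2.0 and draws.max() <= 5.0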
from scipy.stats import uniform as sp_uniform

# param_test = {'num_leaves': sp_randint(6, 50),
#               'min_child_samples': sp_randint(100, 500),
#               'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
#               'subsample': sp_uniform(loc=0.2, scale=0.8),
#               'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
#               'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
#               'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

# Custom
param_test = {'num_leaves': sp_randint(10, 20),
              'min_child_samples': sp_randint(250, 450),
              'min_child_weight': [1e-2, 1e-1, 1, 1e1],
              'subsample': [0.8],
              "max_depth": [8, 12],
              'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
              'reg_alpha': [0, 1, 2, 5],
              'reg_lambda': [0, 1, 5]}

# Note: the assignment below overrides the custom search space above.
param_test = {'colsample_bytree': [0.95],
              'max_depth': [10],
              'min_child_samples': [429],
              'min_child_weight': [1],
              'num_leaves': [12],
              'reg_alpha': [5],
              'reg_lambda': [5],
              'subsample': [0.8, 0.85]}

# This parameter defines the number of HP points to be tested
n_HP_points_to_test = 2

import lightgbm as lgb
def test_opt_lightgbm(self):
    df_train = dsutils.load_adult().head(1000)
    y = df_train.pop(14).values
    X = df_train

    cols = X.columns
    num_cols = X._get_numeric_data().columns
    cat_cols = list(set(cols) - set(num_cols))
    le = LabelEncoder()
    for c in cat_cols:
        X[c] = le.fit_transform(X[c])

    clf = LGBMClassifier(n_estimators=10,
                         boosting_type='gbdt',
                         categorical_feature=cat_cols,
                         num_leaves=31)
    fit_params = {'eval_metric': 'roc_auc'}

    # randomized_search
    param_distributions = {
        # 'iterations': sp_randint(10, 1000),
        'max_depth': [1, 3, 5],  # sp_randint(1, 5),
        'learning_rate': sp_uniform(0.01, 1.0),
    }
    best_params1 = BatchTrainer.randomized_search(clf, param_distributions, X, y,
                                                  fit_params=fit_params,
                                                  scoring='roc_auc', n_jobs=1, cv=5)

    # grid_search
    param_grid = {
        # 'iterations': [10, 30],
        'max_depth': [1, 3, 5],  # sp_randint(1, 5),
        'learning_rate': [0.01, 0.05, 0.1],
    }
    best_params2 = BatchTrainer.grid_search(clf, param_grid, X, y,
                                            fit_params=fit_params,
                                            scoring='roc_auc', n_jobs=1, cv=5)

    # bayes_search
    search_spaces = {
        'max_depth': Integer(1, 5),
        'learning_rate': Real(0.02, 0.6, 'log-uniform'),
    }
    best_params3 = BatchTrainer.bayes_search(clf, search_spaces, X, y,
                                             fit_params=fit_params,
                                             scoring='roc_auc', n_jobs=1, cv=5,
                                             n_iter=10)

    assert best_params1['max_depth'] > 0
    assert best_params2['max_depth'] > 0
    assert best_params3['max_depth'] > 0
def test_opt_catboost(self):
    df_train = dsutils.load_adult().head(1000)
    y = df_train.pop(14).values
    X = df_train

    cols = X.columns
    num_cols = X._get_numeric_data().columns
    cat_cols = list(set(cols) - set(num_cols))

    clf = CatBoostClassifier(thread_count=4,
                             loss_function='Logloss',
                             cat_features=cat_cols,
                             od_type='Iter',
                             nan_mode='Min',
                             iterations=1,
                             eval_metric='AUC',
                             metric_period=50,
                             verbose=False)
    fit_params = {'early_stopping_rounds': 10}

    # randomized_search
    param_distributions = {
        # 'iterations': sp_randint(10, 1000),
        'depth': [1, 3, 5],  # sp_randint(1, 5),
        'learning_rate': sp_uniform(0.01, 1.0),
    }
    best_params1 = BatchTrainer.randomized_search(clf, param_distributions, X, y,
                                                  fit_params=fit_params,
                                                  scoring='roc_auc', n_jobs=1, cv=5)

    # grid_search
    param_grid = {
        # 'iterations': [10, 30],
        'depth': [1, 3, 5],  # sp_randint(1, 5),
        'learning_rate': [0.01, 0.05, 0.1],
    }
    best_params2 = BatchTrainer.grid_search(clf, param_grid, X, y,
                                            fit_params=fit_params,
                                            scoring='roc_auc', n_jobs=1, cv=5)

    # bayes_search
    search_spaces = {
        'depth': Integer(1, 5),
        'learning_rate': Real(0.02, 0.6, 'log-uniform'),
    }
    best_params3 = BatchTrainer.bayes_search(clf, search_spaces, X, y,
                                             fit_params=fit_params,
                                             scoring='roc_auc', n_jobs=1, cv=5,
                                             n_iter=10)

    assert best_params1['depth'] > 0
    assert best_params2['depth'] > 0
    assert best_params3['depth'] > 0
print("BEST CV SCORE: " + str(gs_results.best_score_)) # Predict (after fitting GridSearchCV is an estimator with best parameters) y_pred = gs.predict(X_test) # Score score = r2_score(y_test, y_pred) print("R2 SCORE ON TEST DATA: {}".format(score)) #============================================================================== # Random Search CV #============================================================================== hyper_space = {'n_estimators': sp_randint(1000, 2500), 'max_depth': [4, 5, 8, -1], 'num_leaves': [15, 31, 63, 127], 'subsample': sp_uniform(0.6, 0.4), 'colsample_bytree': sp_uniform(0.6, 0.4)} # Random Search CV rs = RandomizedSearchCV(est, hyper_space, n_iter=60, scoring='r2', cv=4, verbose=1, random_state=2018) rs_results = rs.fit(X_train, y_train) print("BEST PARAMETERS: " + str(rs_results.best_params_)) print("BEST CV SCORE: " + str(rs_results.best_score_)) # Predict (after fitting RandomizedSearchCV is an estimator with best parameters) y_pred = rs.predict(X_test) # Score score = r2_score(y_test, y_pred) print("R2 SCORE ON TEST DATA: {}".format(score))
for bs_ind in range(N_bs):
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        random_state=bs_ind)
    scaler = StandardScaler()
    scaler.fit(x_train)
    x_train_stand = scaler.transform(x_train)
    x_test_stand = scaler.transform(x_test)

    # Note: StandardScaler expects 2-D input, so this assumes y is shaped (n_samples, 1).
    scaler = StandardScaler()
    scaler.fit(y_train)
    y_train_stand = scaler.transform(y_train)
    y_test_stand = scaler.transform(y_test)

    SGD = SGDRegressor(random_state=bs_ind)
    param_dist1 = {'penalty': ['l1', 'l2'],
                   'alpha': sp_uniform(1e-5, 10.0)}
    sgd_lr = RandomizedSearchCV(SGD, param_dist1, n_iter=200, n_jobs=-1, cv=5,
                                random_state=25,
                                scoring='neg_mean_squared_error')
    sgd = sgd_lr.fit(x_train_stand, y_train_stand)
    y_pred = sgd.predict(x_test_stand)
    MSE_vec_sgd[bs_ind] = mean_squared_error(y_test_stand, y_pred)
    print('MSE for test set', bs_ind, ' is', MSE_vec_sgd[bs_ind])

mse_min = 0.3
for i, mse in enumerate(MSE_vec_sgd):
def hyperparameter_tuning(self) -> None:
    """
    Performs a hyperparameter tuning search (either grid search or randomised
    search) on the defined parameters and saves the results in a CSV file for
    further analysis.
    Note: only designed to work with MLP (determined based on initial evaluations).
    :return: None.
    """
    # Determine scoring metric to use based on dataset.
    scoring = str()
    if config.dataset == "binary":
        scoring = "f1"
    elif config.dataset == "multi":
        scoring = "f1_weighted"

    parameters = dict()
    search_alg_str = str()

    # Initialise Grid Search.
    if config.is_grid_search:
        print("Hyperparameter tuning technique chosen: GRID SEARCH")
        if config.dataset == "binary":
            parameters = {
                "hidden_layer_sizes": [(98,), (98, 98), (114,), (114, 114)],
                "learning_rate_init": [0.001, 0.03, 0.04, 0.1],
                "alpha": [0.0001, 0.26, 0.96]
            }
            print(parameters)
        elif config.dataset == "multi":
            parameters = {
                "hidden_layer_sizes": [(68,), (68, 68), (100,), (100, 100)],
                "learning_rate_init": [0.001, 0.01, 0.1],
                "momentum": [0.1, 0.9],
                "alpha": [0.0001, 0.1, 0.9]
            }
        searchCV = GridSearchCV(
            param_grid=parameters,
            estimator=self.clf,
            cv=self.folds,
            scoring=scoring
        )
        search_alg_str = "gs"

    # Initialise Randomised Search.
    elif config.is_randomised_search:
        print("Hyperparameter tuning technique chosen: RANDOMISED SEARCH")
        parameters = {
            'hidden_layer_sizes': (sp_randint(1, 150)),
            'learning_rate_init': sp_uniform(0.001, 1),
            'momentum': sp_uniform(0.1, 0.9),
            'alpha': sp_uniform(0.0001, 1)
        }
        searchCV = RandomizedSearchCV(
            param_distributions=parameters,
            estimator=self.clf,
            n_iter=100,
            cv=self.folds,
            scoring=scoring
        )
        search_alg_str = "rs"

    # Run the search and save results in a CSV file.
    gs_results = searchCV.fit(self.X, self.y)
    gs_results_df = pd.DataFrame(gs_results.cv_results_)
    gs_results_df.to_csv("../results/grid_search/{}_{}_{}.csv".format(
        config.dataset, config.model, search_alg_str))

    # Print the best model found by the hyperparameter tuning algorithm for the MLP
    # and save the model in a Pickle file.
    final_model = gs_results.best_estimator_
    print("\nBest model hyperparameters found by randomised search algorithm:")
    print(final_model)
    print("Score: {}".format(gs_results.best_score_))
    save_model(final_model, config.dataset,
               "{}_{}_{}_best_estimator".format(config.dataset, config.model,
                                                search_alg_str))
fit_params = {
    "early_stopping_rounds": 100,
    "eval_metric": 'auc',
    "eval_set": [(X, y)],
    'eval_names': ['valid'],
    'verbose': 0,
    'categorical_feature': 'auto'
}
param_test = {
    'learning_rate': [0.01, 0.02, 0.03, 0.04, 0.05, 0.08, 0.1, 0.2, 0.3, 0.4],
    'n_estimators': [100, 200, 300, 400, 500, 600, 800, 1000, 1500, 2000],
    'num_leaves': sp_randint(6, 50),
    'min_child_samples': sp_randint(100, 500),
    'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
    'subsample': sp_uniform(loc=0.2, scale=0.8),
    'max_depth': [-1, 1, 2, 3, 4, 5, 6, 7],
    'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
    'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
    'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
}
# number of combinations
n_iter = 300
# initialize lgbm and launch the search
lgbm_clf = lgbm.LGBMClassifier(random_state=random_state, silent=True,
                               metric='None', n_jobs=4)
grid_search = RandomizedSearchCV(estimator=lgbm_clf,
Y = Y[rnd_idx]

# split the data into train and test sets
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_data, Y, test_size=0.15,
                                                    random_state=42)

############### Classification with random search ##############
from scipy.stats import uniform as sp_uniform

# Create the parameter grid for the RBF kernel; we have to set C and gamma
C_dist = sp_uniform(scale=10)
gamma_dist = sp_uniform(scale=1)
parameters = {'kernel': ['rbf'],
              'C': C_dist,
              'gamma': gamma_dist
              }

from sklearn.model_selection import RandomizedSearchCV
n_iter_search = 8
svm_clsf = svm.SVC()
rnd_clsf = RandomizedSearchCV(estimator=svm_clsf,
                              param_distributions=parameters,
                              n_iter=n_iter_search,
                              cv=3, n_jobs=1,
                              verbose=2)