def test_tune_best_score_reproducibility(self) -> None:
    """Two tuner runs with the same ``optuna_seed`` must yield the same best score."""
    # ``load_boston`` was removed in scikit-learn 1.2; use the California
    # housing dataset instead (same regression setup, mirrors the sibling test).
    california = sklearn.datasets.fetch_california_housing()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        california.data, california.target, random_state=0
    )
    train = lgb.Dataset(X_trainval, y_trainval)
    # ``deterministic`` + ``force_col_wise`` pin LightGBM's internal
    # non-determinism so that identical seeds really give identical scores.
    params = {
        "objective": "regression",
        "metric": "rmse",
        "random_seed": 0,
        "deterministic": True,
        "force_col_wise": True,
        "verbosity": -1,
    }

    tuner_first_try = lgb.LightGBMTunerCV(
        params,
        train,
        early_stopping_rounds=3,
        folds=KFold(n_splits=3),
        optuna_seed=10,
    )
    tuner_first_try.run()
    best_score_first_try = tuner_first_try.best_score

    tuner_second_try = lgb.LightGBMTunerCV(
        params,
        train,
        early_stopping_rounds=3,
        folds=KFold(n_splits=3),
        optuna_seed=10,
    )
    tuner_second_try.run()
    best_score_second_try = tuner_second_try.best_score

    assert best_score_second_try == best_score_first_try
def tune(self, X, y):
    """Run LightGBM CV hyperparameter tuning on (X, y) and return the best params."""
    dataset = lgb_opt.Dataset(X, label=y)
    # Fixed binary-classification setup; only the tree hyperparameters are tuned.
    base_params = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "boosting_type": "gbdt",
    }
    cv_tuner = lgb_opt.LightGBMTunerCV(
        base_params,
        dataset,
        verbose_eval=100,
        early_stopping_rounds=100,
        folds=KFold(n_splits=3),
    )
    cv_tuner.run()

    # Report the outcome of the search.
    print("Best score:", cv_tuner.best_score)
    tuned = cv_tuner.best_params
    print("Best params:", tuned)
    print("  Params: ")
    for name, val in tuned.items():
        print("    {}: {}".format(name, val))
    return tuned
def test_tune_best_score_reproducibility(self) -> None:
    """Running the tuner twice with a fixed ``optuna_seed`` gives identical best scores."""
    california = sklearn.datasets.fetch_california_housing()
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        california.data, california.target, random_state=0
    )
    train = lgb.Dataset(X_trainval, y_trainval)
    # Deterministic LightGBM settings so the seed fully controls the outcome.
    params = {
        "objective": "regression",
        "metric": "rmse",
        "random_seed": 0,
        "deterministic": True,
        "force_col_wise": True,
        "verbosity": -1,
    }

    # Perform two independent, identically-seeded tuning runs.
    observed_scores = []
    for _ in range(2):
        tuner = lgb.LightGBMTunerCV(
            params,
            train,
            early_stopping_rounds=3,
            folds=KFold(n_splits=3),
            optuna_seed=10,
        )
        tuner.run()
        observed_scores.append(tuner.best_score)

    assert observed_scores[1] == observed_scores[0]
def _single_train(features, targets, params):
    """Tune LightGBM via 3-fold CV for one target column.

    Returns a ``(best_params, best_score)`` tuple from the finished tuner.
    """
    dataset = lgb.Dataset(features, targets)
    cv_tuner = lgb.LightGBMTunerCV(
        params,
        dataset,
        verbose_eval=100,
        early_stopping_rounds=100,
        folds=KFold(n_splits=3),
    )
    cv_tuner.run()
    return cv_tuner.best_params, cv_tuner.best_score
# Split features from the target; model log1p(revenue) to tame its heavy tail.
data = train.drop(['revenue'], axis=1)
target = train.revenue
logtarget = np.log1p(target)
dtrain = lgb.Dataset(data, label=logtarget)

# Base regression setup; LightGBMTunerCV searches the remaining hyperparameters
# with 10-fold CV and keeps the CV boosters so the best one can be retrieved.
params = {
    "objective": "rmse",
    "metric": "rmse",
    "verbosity": -1,
    "boosting_type": "gbdt",
}
tuner = lgb.LightGBMTunerCV(
    params,
    dtrain,
    verbose_eval=100,
    early_stopping_rounds=100,
    folds=KFold(n_splits=10),
    return_cvbooster=True,
)
tuner.run()

# Report the tuning outcome.
print("Best score:", tuner.best_score)
best_params = tuner.best_params
print("Best params:", best_params)
print("  Params: ")
for name, val in best_params.items():
    print("    {}: {}".format(name, val))

# Optionally persist the best booster:
# model = tuner.get_best_booster()
# model.save_model('lgbm_model.txt')
def training(train, test, validation_size, estimator, target_variable, drop_list,
             target_type, cv_folds, scoring_cv, cv=True, final=False, hypertuning=False):
    """Fit (and optionally hyper-tune) a model, report metrics, and return it.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Training / held-out data, both containing ``target_variable``.
    validation_size : float
        Fraction used for the LightGBM early-stopping validation split.
    estimator : str
        One of ``"log_sk"``, ``"lin_reg"``, ``"gb"``, ``"rf"``, ``"lgbm"``.
    target_variable : str
        Name of the target column.
    drop_list : list
        Columns dropped from the feature matrix (must include the target).
    target_type : str
        ``"bin"`` for classification, ``"con"`` for regression.
    cv_folds, scoring_cv
        Fold count and scoring string forwarded to scikit-learn.
    cv : bool
        Run cross-validation for the non-tuned estimators.
    final : bool
        If True, refit on train+test combined and skip held-out evaluation.
    hypertuning : bool
        If True, run a grid search (or Optuna tuner for lgbm) instead of
        fixed hyperparameters.

    Returns
    -------
    The fitted estimator (``fitted_model``).

    Fixes vs. previous revision: undefined ``prediction`` -> ``y_pred`` in the
    binary test report; ``LinearRegression`` imported and its invalid
    ``max_iter`` argument removed; dead ``if __name__ == "__main__"`` guard
    removed from the lgbm tuning branch; ``DataFrame.append`` (removed in
    pandas 2.0) replaced with ``pd.concat``; duplicate/broken imports cleaned.
    """
    # Function-local imports kept from the original (exact duplicates removed;
    # the shadowed `from datetime import datetime` and `plot_roc_curve`,
    # removed in scikit-learn 1.2 and unused here, were dropped).
    import matplotlib.pyplot as plt
    import pandas as pd
    import lightgbm as lgbm
    import training
    import os
    import sklearn
    import numpy as np
    import seaborn as sns
    import re
    import math
    import datetime
    import statsmodels.api as sm
    from scipy import stats
    from sklearn.feature_selection import SelectFromModel
    from sklearn.model_selection import (train_test_split, cross_val_score,
                                         validation_curve, RandomizedSearchCV,
                                         GridSearchCV, KFold)
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import GradientBoostingRegressor
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn import ensemble
    from sklearn.linear_model import LogisticRegression, LinearRegression
    from sklearn.preprocessing import OneHotEncoder
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import KBinsDiscretizer
    from sklearn.metrics import mean_squared_log_error
    from sklearn.metrics import make_scorer
    from sklearn.metrics import (confusion_matrix, accuracy_score, recall_score,
                                 roc_curve, roc_auc_score, mean_squared_error)
    import xgboost
    import shap
    from catboost import CatBoostClassifier
    from catboost import CatBoostRegressor
    import optuna.integration.lightgbm as lgb
    from optuna.integration import _lightgbm_tuner as tuner
    from optuna.integration._lightgbm_tuner import LightGBMTuner
    from optuna.integration._lightgbm_tuner import LightGBMTunerCV

    # NOTE(review): `score_func` must be defined at module level — confirm.
    rmsle_scorer = make_scorer(score_func)

    # Split the feature matrix from the target for both partitions.
    train_y = train[target_variable]
    train_x = train.drop(columns=drop_list)
    test_y = test[target_variable]
    test_x = test.drop(columns=drop_list)
    column_names = list(train_x.columns)

    if final == True:
        # Final refit uses all available data (DataFrame.append was removed
        # in pandas 2.0, hence pd.concat).
        train_x = pd.concat([train_x, test_x])
        train_y = pd.concat([train_y, test_y])

    if target_type == "bin":
        if estimator == "log_sk":
            model = LogisticRegression(max_iter=1000)
            log_sk = model.fit(train_x, train_y)
            fitted_model = log_sk
        if estimator == "gb" and hypertuning == False:
            model = ensemble.GradientBoostingClassifier(learning_rate=0.1,
                                                        max_depth=3,
                                                        n_estimators=100)
            gb = model.fit(train_x, train_y)
            fitted_model = gb
        if estimator == "gb" and hypertuning == True:
            param_grid = {
                'n_estimators': [100, 200, 400],
                'max_depth': [3, 5, 7],
                'learning_rate': [0.1, 0.05, 0.025, 0.01, 0.001, 0.005],
                'random_state': [42]
            }
            gb = ensemble.GradientBoostingClassifier()
            gb_grid = GridSearchCV(gb, param_grid, cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting classifier = ',
                  gb_grid.best_params_)
            gb = gb_grid.best_estimator_
            fitted_model = gb
        if estimator == "rf" and hypertuning == False:
            model = ensemble.RandomForestClassifier(max_depth=80, max_features=5,
                                                    min_samples_leaf=3,
                                                    min_samples_split=12,
                                                    n_estimators=100)
            rf = model.fit(train_x, train_y)
            fitted_model = rf
        if estimator == "rf" and hypertuning == True:
            param_grid = {
                'bootstrap': [True],
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [3]
            }
            rf = RandomForestClassifier()
            rf_grid = GridSearchCV(rf, param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest classifier = ',
                  rf_grid.best_params_)
            rf = rf_grid.best_estimator_
            fitted_model = rf
        if cv and hypertuning == False:
            # CV only applies to the fixed-hyperparameter models (`model` is
            # only bound on those paths).
            cross_val_accuracy = cross_val_score(estimator=model, X=train_x,
                                                 y=train_y, cv=cv_folds,
                                                 scoring=scoring_cv)
            print(f'The average cross validation accuracy of the model is {round(cross_val_accuracy.mean(), 2)}')
            print(cross_val_accuracy)

    if target_type == "con":
        if estimator == "lgbm" and hypertuning == False:
            # Carve out a validation split for early stopping.
            train_x, valid_x, train_y, valid_y = train_test_split(
                train_x, train_y, test_size=validation_size, shuffle=True,
                random_state=42)
            train_data = lgb.Dataset(train_x, label=train_y)
            valid_data = lgb.Dataset(valid_x, label=valid_y)
            model = lgbm.LGBMRegressor(random_state=42, n_estimators=1000)
            lgbm_model = model.fit(train_x, train_y,
                                   eval_set=[(valid_x, valid_y)],
                                   eval_metric=scoring_cv, verbose=-1)
            fitted_model = lgbm_model
        if estimator == "lin_reg" and hypertuning == False:
            # Fixed: LinearRegression has no `max_iter` parameter (and was
            # previously not imported at all).
            model = LinearRegression()
            lin_reg = model.fit(train_x, train_y)
            fitted_model = lin_reg
        if estimator == "gb" and hypertuning == False:
            model = ensemble.GradientBoostingRegressor(learning_rate=0.001,
                                                       max_depth=5,
                                                       n_estimators=100)
            gb = model.fit(train_x, train_y)
            fitted_model = gb
        if estimator == "rf" and hypertuning == False:
            model = ensemble.RandomForestRegressor(max_depth=30, max_features=5,
                                                   min_samples_leaf=3,
                                                   min_samples_split=8,
                                                   n_estimators=500, n_jobs=-1)
            rf = model.fit(train_x, train_y)
            fitted_model = rf
        if estimator == "gb" and hypertuning == True:
            # Previously found: {'learning_rate': 0.001, 'max_depth': 3,
            # 'n_estimators': 100, 'random_state': 42}
            param_grid = {
                'n_estimators': [100, 500, 1000],
                'max_features': ["auto", "sqrt", "log2", 0.6, 0.8],
                'min_samples_leaf': [30, 50, 70],
                'min_samples_split': [10, 20, 500, 100],
                'max_depth': [10, 15, 20, 25],
                'learning_rate': [0.1, 0.01, 0.001]
            }
            gb = ensemble.GradientBoostingRegressor()
            gb_grid = GridSearchCV(gb, param_grid, cv=cv_folds, scoring=scoring_cv)
            gb_grid.fit(train_x, train_y)
            print('Optimal parameters for gradient boosting regressor = ',
                  gb_grid.best_params_)
            gb = gb_grid.best_estimator_
            fitted_model = gb
        if estimator == "lgbm" and hypertuning == True:
            # Fixed: an `if __name__ == "__main__":` guard here made this
            # branch dead whenever the module was imported.
            dtrain = lgb.Dataset(train_x, label=train_y)
            params = {
                "objective": "regression",
                "metric": "rmse",
                "verbosity": -1,
                "boosting_type": "gbdt",
            }
            tuner = lgb.LightGBMTunerCV(
                params, dtrain, verbose_eval=100, early_stopping_rounds=100,
                folds=KFold(n_splits=5)
            )
            tuner.run()
            print("Best score:", tuner.best_score)
            best_params = tuner.best_params
            print("Best params:", best_params)
            print(" Params: ")
            for key, value in best_params.items():
                print(" {}: {}".format(key, value))
            # NOTE(review): this branch only reports tuned parameters — it
            # never sets `fitted_model`, so downstream importance/prediction
            # code cannot run for estimator=="lgbm" with hypertuning=True.
        if estimator == "rf" and hypertuning == True:
            # Previous search results kept for reference:
            # {'bootstrap': True, 'max_depth': 80, 'max_features': 2, 'min_samples_leaf': 5, 'min_samples_split': 12, 'n_estimators': 100, 'n_jobs': 1}
            # max_depth= 80, max_features= 5, min_samples_leaf= 3, min_samples_split= 8, n_estimators= 300, n_jobs= 1
            # {'bootstrap': True, 'max_depth': 100, 'max_features': 5, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 500, 'n_jobs': 4}
            param_grid = {
                'max_depth': [10, 20, 30],
                'max_features': [2, 3, 5],
                'min_samples_leaf': [3, 5, 10],
                'min_samples_split': [8, 12],
                'n_estimators': [100, 300, 500],
                'n_jobs': [4]
            }
            rf = RandomForestRegressor()
            rf_grid = GridSearchCV(rf, param_grid, cv=cv_folds, scoring=scoring_cv)
            rf_grid.fit(train_x, train_y)
            print('Optimal parameters for random forest regressor = ',
                  rf_grid.best_params_)
            rf = rf_grid.best_estimator_
            fitted_model = rf
        if cv and hypertuning == False:
            cross_val_rmse = cross_val_score(estimator=model, X=train_x,
                                             y=train_y, cv=cv_folds,
                                             scoring=scoring_cv)
            print(f'The average cross validation rmsle of the model is {-1*round(cross_val_rmse.mean(), 2)}')
            print(cross_val_rmse)

    # Feature-importance reporting for the tree-based models.
    if estimator == "gb" or estimator == "rf" or estimator == "lgbm":
        list_all_Features = train_x.columns.tolist()
        fi_df = pd.DataFrame({"Feature": list_all_Features,
                              "Importance": fitted_model.feature_importances_}
                             ).sort_values(by="Importance", ascending=False)
        fi_selected = fi_df[:15]
        important_feature_list = fi_selected["Feature"].tolist()
        if estimator == "gb":
            fi_selected.to_excel(r'fi_selected.xlsx')
            fig = plt.figure(figsize=(20, 10))
            feat_importances = pd.Series(fitted_model.feature_importances_,
                                         index=list_all_Features)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title("Feature Importance from Gradient Boosting")
            plt.savefig('Feature Importance from Gradient Boosting.png',
                        bbox_inches="tight")
        if estimator == "rf":
            fi_selected.to_excel(r'fi_selected.xlsx')
            fig = plt.figure(figsize=(20, 20))
            feat_importances = pd.Series(fitted_model.feature_importances_,
                                         index=list_all_Features)
            feat_importances.nlargest(30).plot(kind='barh', color="green")
            plt.title("Feature Importance from Random Forest")
            plt.savefig('Feature Importance from Random Forest.png',
                        bbox_inches="tight")
        if estimator == "lgbm":
            # SHAP diagnostics; `valid_x` exists only on the non-tuned lgbm
            # path (see NOTE above about the tuning path).
            fi_selected.to_excel(r'fi_selected.xlsx')
            feat_importances = pd.Series(fitted_model.feature_importances_,
                                         index=list_all_Features)
            explainer = shap.TreeExplainer(fitted_model)
            shap_values = explainer.shap_values(valid_x)
            shap.initjs()
            force_plot = shap.force_plot(explainer.expected_value,
                                         shap_values[0, :], valid_x.iloc[0, :])
            shap.save_html("index_force_plot.htm", force_plot)
            force_plot_all = shap.force_plot(explainer.expected_value,
                                             shap_values, valid_x)
            shap.save_html("index_force_plot_all.htm", force_plot_all)
            plt.figure(figsize=(10, 20))
            shap.summary_plot(shap_values, valid_x, show=False)
            plt.savefig('summary_plot.png', bbox_inches="tight")
            top_features = feat_importances.nlargest(10)
            top_features = top_features.reset_index()
            top_features = top_features['index'].to_list()
            for i in top_features:
                plt.figure(figsize=(20, 20))
                shap.dependence_plot(i, shap_values, valid_x, show=False)
                plt.savefig(f"dep_plot_{i}.png", bbox_inches="tight")

    # Held-out evaluation (skipped when refitting on everything).
    if final == False and target_type == "con":
        yhat = fitted_model.predict(test_x).astype(float)
        y_pred = list(yhat.astype(float))
        y_true = list(test_y)
        print(np.sqrt(mean_squared_error(y_true, y_pred)))
    if final == False and target_type == "bin":
        yhat = fitted_model.predict(test_x)
        y_pred = list(map(round, yhat))
        cm = confusion_matrix(test_y, y_pred)
        print("Confusion Matrix : \n", cm)
        # Fixed: `prediction` was undefined here — use `y_pred`.
        print('Test accuracy = ', accuracy_score(test_y, y_pred))
        print('Test recall = ', recall_score(test_y, y_pred))

    return fitted_model
""" import sklearn.datasets from sklearn.model_selection import KFold import optuna.integration.lightgbm as lgb if __name__ == "__main__": data, target = sklearn.datasets.load_breast_cancer(return_X_y=True) dtrain = lgb.Dataset(data, label=target) params = { "objective": "binary", "metric": "binary_logloss", "verbosity": -1, "boosting_type": "gbdt", } tuner = lgb.LightGBMTunerCV( params, dtrain, verbose_eval=100, early_stopping_rounds=100, folds=KFold(n_splits=3) ) tuner.run() print("Best score:", tuner.best_score) best_params = tuner.best_params print("Best params:", best_params) print(" Params: ") for key, value in best_params.items(): print(" {}: {}".format(key, value))
from sklearn.datasets import make_classification
from sklearn.model_selection import StratifiedKFold

import optuna.integration.lightgbm as lgb

# Synthetic binary task: 10k samples, 100 features, fixed seed for repeatability.
X, y = make_classification(10 ** 4, 100, shift=0.3, random_state=666)
dtrain = lgb.Dataset(X, label=y)

# AUC-optimizing binary setup; tree hyperparameters are left to the tuner.
params = {
    "objective": "binary",
    "boosting_type": "gbdt",
    "metric": "auc",
    "n_jobs": 16,
    "verbosity": -1,
}

# Stratified 5-fold CV tuning, progress bar suppressed.
tuner = lgb.LightGBMTunerCV(
    params,
    dtrain,
    verbose_eval=100,
    early_stopping_rounds=100,
    folds=StratifiedKFold(5),
    show_progress_bar=False,
)
tuner.run()

# Notebook-style inspection of the result.
tuner.best_score
tuner.best_params