from xgboost import XGBRegressor


class XGBWrapper_regr(object):
    """A wrapper for the xgboost model so that we have a single API for various models."""

    def __init__(self):
        self.model = XGBRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None,
            X_holdout=None, y_holdout=None, params=None):
        self.model = self.model.set_params(**params)

        eval_set = [(X_train, y_train)]
        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_metric='rmse',
                       verbose=params['verbose'],
                       early_stopping_rounds=params['early_stopping_rounds'])

        # Keep the last value of each metric per evaluation set.
        scores = self.model.evals_result()
        self.best_score_ = {k: {m: m_v[-1] for m, m_v in v.items()}
                            for k, v in scores.items()}
        # self.best_score_ = {k: {m: n if m != 'cappa' else -n for m, n in v.items()}
        #                     for k, v in self.best_score_.items()}

        self.feature_importances_ = self.model.feature_importances_

    def predict(self, X_test):
        return self.model.predict(X_test)
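# Minimal usage sketch for XGBWrapper_regr (illustrative, not part of the original
# source). It assumes an xgboost version whose fit() still accepts eval_metric and
# early_stopping_rounds (pre-2.0); the synthetic data and parameter values below
# are placeholders. The params dict must carry 'verbose' and 'early_stopping_rounds',
# because fit() reads both keys and forwards the whole dict to set_params().
import numpy as np

rng = np.random.RandomState(0)
X_demo, y_demo = rng.rand(200, 5), rng.rand(200)
X_tr, X_va, y_tr, y_va = X_demo[:150], X_demo[150:], y_demo[:150], y_demo[150:]

wrapper = XGBWrapper_regr()
demo_params = {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.05,
               'verbose': False, 'early_stopping_rounds': 20}
wrapper.fit(X_tr, y_tr, X_valid=X_va, y_valid=y_va, params=demo_params)
print(wrapper.best_score_)     # last value of each metric for every eval set
preds = wrapper.predict(X_va)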
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor


def XGBoost_test(X_train, X_test, Y_train, Y_test, max_depth=4, n_estimators=100,
                 learning_rate=0.1, min_child_weight=1, n_jobs=4,
                 verbose_eval=False, verbose=True, plot_predict=True,
                 return_y_pred=False, return_mse=False, Y_scaler=None):
    XGBoost = XGBRegressor(max_depth=max_depth, n_estimators=n_estimators,
                           learning_rate=learning_rate,
                           min_child_weight=min_child_weight, n_jobs=n_jobs)
    if verbose:
        print(XGBoost, '\n')

    XGBoost.fit(X_train, Y_train, verbose=verbose_eval,
                eval_set=[(X_train, Y_train), (X_test, Y_test)],
                eval_metric='rmse')

    Y_preds = XGBoost.predict(X_test)
    test_MSE = mean_squared_error(Y_test, Y_preds)
    if verbose:
        print('\nTest MSE', test_MSE)

    # Plot train/test MSE per boosting round (squared RMSE from evals_result).
    train_mse = pd.Series(np.square(XGBoost.evals_result()['validation_0']['rmse']),
                          name='train')
    train_mse.index.name = 'n_estimators'
    train_mse.plot(title='MSE', legend=True)
    test_mse = pd.Series(np.square(XGBoost.evals_result()['validation_1']['rmse']),
                         name='test')
    test_mse.index.name = 'n_estimators'
    test_mse.plot(title='MSE', legend=True)

    if plot_predict:
        if Y_scaler is not None:
            df = pd.concat([
                pd.Series(Y_scaler.inverse_transform(
                    Y_test.copy().values.reshape(-1, 1)).reshape(-1, ),
                    index=Y_test.index),
                pd.Series(Y_scaler.inverse_transform(
                    np.array(Y_preds).reshape(-1, 1)).reshape(-1, ),
                    index=Y_test.index)
            ], axis=1)
        else:
            df = pd.concat([Y_test, pd.Series(Y_preds, index=Y_test.index)], axis=1)
        df.columns = ['test', 'pred']
        df.plot(figsize=(10, 4), title='MSE:{}'.format(round(test_MSE, 4)))

    if return_y_pred and return_mse:
        return pd.Series(Y_preds, index=Y_test.index), test_MSE
    if return_y_pred:
        return pd.Series(Y_preds, index=Y_test.index)
    if return_mse:
        return test_MSE
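# Illustrative call of XGBoost_test (not part of the original source). The
# synthetic frame and series below are placeholders; Y_train/Y_test must be
# pandas Series because the function reuses Y_test.index for the prediction series.
import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
X_all = pd.DataFrame(rng.rand(300, 4), columns=list('abcd'))
y_all = pd.Series(rng.rand(300), name='target')
X_tr, X_te, y_tr, y_te = X_all[:240], X_all[240:], y_all[:240], y_all[240:]

y_pred, mse = XGBoost_test(X_tr, X_te, y_tr, y_te,
                           n_estimators=200, learning_rate=0.05,
                           verbose=False, return_y_pred=True, return_mse=True)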
import gc
import pickle

from xgboost import XGBRegressor


def create_model(self, X_train, y_train, X_valid, y_valid, model_name, verbose=0):
    """Create an xgboost model and fit it to the data."""
    print('## Model {} is being fitted..'.format(model_name))

    # Create the regressor:
    xg_reg_model = XGBRegressor(objective=self.objective,
                                subsample=self.subsample,
                                colsample_bytree=self.colsample_bytree,
                                learning_rate=self.learning_rate,
                                max_depth=self.max_depth,
                                reg_alpha=self.reg_alpha,
                                n_estimators=self.n_estimators,
                                n_jobs=self.n_jobs,
                                random_state=self.random_state,
                                verbosity=self.verbosity)

    # Create the evaluation set:
    eval_set = [(X_train, y_train), (X_valid, y_valid)]
    verbose_flag = self.verbosity == 1

    # Fit the regressor:
    if not self.early_stopping_flag:
        xg_reg_model = xg_reg_model.fit(X_train, y_train, eval_metric='rmse',
                                        eval_set=eval_set, verbose=verbose_flag)
    else:
        xg_reg_model = xg_reg_model.fit(X_train, y_train,
                                        early_stopping_rounds=self.early_stopping_rounds,
                                        eval_metric='rmse', eval_set=eval_set,
                                        verbose=verbose_flag)

    history = xg_reg_model.evals_result()
    print('Fitting Model done!')

    # Assign/"store" model, history and model name:
    if not self.retraining_memory_save_mode:
        self.prediction_model = xg_reg_model
        self.model_name = model_name
        self.training_history = history

    # Store the model on disk in the temp folder:
    if self.retraining_memory_save_mode:
        Save_PATH = ('/media/vincent/harddrive/ML-Projects_all/NY_Cab_Project/NY_Cab_Data/results/xg_boost_Models/'
                     'Temp_Boosting_Models/')
        final_model_name = model_name + '.pickle.dat'
        file_to_save = Save_PATH + final_model_name
        # Save model on disk:
        pickle.dump(xg_reg_model, open(file_to_save, "wb"))
        # Delete model to release memory:
        xg_reg_model._Booster.__del__()
        del xg_reg_model
        gc.collect()

    return history
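# Loading a model that create_model() pickled to the temp folder is the mirror
# image of the dump above. The helper name and the save_path argument are
# illustrative (they mimic the '<model_name>.pickle.dat' pattern used there),
# not part of the original source.
import pickle


def load_boosting_model(save_path, model_name):
    """Load a previously pickled xgboost model saved as '<model_name>.pickle.dat'."""
    with open(save_path + model_name + '.pickle.dat', 'rb') as fh:
        return pickle.load(fh)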
import pickle

import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor


def predictive_model_aug(training_method, X_train0, y_train0, X_train, X_val, X_test,
                         y_train, y_val, y_test, keys0):
    train_real = y_train
    val_real = y_val
    test_real = y_test
    keys0 = keys0[0:-1]

    if training_method == 'RF':
        # Grid search kept for reference (commented out):
        """param_grid = {'n_estimators': [10, 50, 100, 150, 200, 400, 600],
                         'min_samples_split': [2, 4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 3, 5],
                         'bootstrap': [True],
                         'max_samples': [50, 100, 200, 500, 1000]}"""
        """grid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, cv=5)
        grid.fit(X_train0, y_train0)
        print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
        par_opt = grid.best_params_
        print("Best parameters: ", grid.best_params_)
        print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test)))"""
        """Best cross-validation accuracy: 0.62
        'bootstrap': True, 'max_features': 'auto', 'max_samples': 1000,
        'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600
        Test set accuracy: 0.58"""

        rf_estimator = RandomForestRegressor(bootstrap=True, max_features='log2',
                                             max_depth=8, min_samples_split=2,
                                             max_samples=2800, n_estimators=600,
                                             min_samples_leaf=2, verbose=0,
                                             random_state=42, n_jobs=6)
        rf_estimator.fit(X_train, y_train)
        train_pred_nor = rf_estimator.predict(X_train)
        val_pred_nor = rf_estimator.predict(X_val)
        test_pred_nor = rf_estimator.predict(X_test)

        # Feature importance analysis
        feature_imp = rf_estimator.feature_importances_
        print(sorted(feature_imp))
        indices4 = np.argsort(feature_imp)
        keys1 = keys0[indices4]
        print('keys1 is', keys1)

        filename = './final models/' + training_method + '_finalized_model_aug.sav'
        model = rf_estimator
        pickle.dump(model, open(filename, 'wb'))

        # Calculate the prediction for training data
        train_pred = train_pred_nor
        val_pred = val_pred_nor
        test_pred = test_pred_nor

    if training_method == 'xgboost':
        # Grid search kept for reference (commented out):
        """param_grid = {'n_estimators': [150, 200, 250],
                         'learning_rate': [0.01, 0.05, 0.1, 0.2],
                         'max_depth': [11, 13, 15],
                         'colsample_bytree': [0.6, 0.7, 0.8],
                         'reg_alpha': [3, 5, 7]}
        grid = GridSearchCV(XGBRegressor(), param_grid=param_grid, cv=5)
        grid.fit(X_train0, y_train0)
        print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
        par_opt = grid.best_params_
        print("Best parameters: ", grid.best_params_)
        print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test)))"""
        """Best cross-validation accuracy: 0.38
        Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.05,
                          'max_depth': 13, 'n_estimators': 250, 'reg_alpha': 3}
        Test set accuracy: 0.51"""

        # rf_estimator = XGBRegressor(learning_rate=0.05, max_depth=13, colsample_bytree=0.7,
        #                             n_estimators=250, reg_alpha=3, random_state=0, n_jobs=5)
        xgb_estimator = XGBRegressor(learning_rate=0.03, max_depth=8, n_estimators=500,
                                     random_state=42, n_jobs=5)
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_estimator.fit(X_train, y_train, eval_metric="mae",
                          eval_set=eval_set, verbose=True)
        results = xgb_estimator.evals_result()
        # Refit with early stopping on the validation set:
        xgb_estimator.fit(X_train, y_train, eval_metric="mae", early_stopping_rounds=10,
                          eval_set=eval_set, verbose=True)
        train_pred_nor = xgb_estimator.predict(X_train)
        val_pred_nor = xgb_estimator.predict(X_val)
        test_pred_nor = xgb_estimator.predict(X_test)

        # Feature importance analysis
        feature_imp = xgb_estimator.feature_importances_
        print(sorted(feature_imp))
        indices1 = np.argsort(feature_imp)
        keys1 = keys0[indices1]  # fixed: was keys0[indices4], which is undefined in this branch
        print('keys1 is', keys1)

        filename = './final models/' + training_method + '_finalized_model.sav'
        model = xgb_estimator
        pickle.dump(model, open(filename, 'wb'))

        # Calculate the prediction for training data
        train_pred = train_pred_nor
        val_pred = val_pred_nor
        test_pred = test_pred_nor

    r2_train, err_train, corr_train = evaluate(train_real, train_pred)
    print(training_method + ' Train R2 score = {:0.2f}'.format(r2_train))
    print(training_method + ' Train avg. error = {:0.3f}'.format(err_train))
    print(training_method + ' Train correlation = {:0.2f}'.format(corr_train))

    r2_train, err_train, corr_train = evaluate(val_real, val_pred)
    print(training_method + ' Validation R2 score = {:0.2f}'.format(r2_train))
    print(training_method + ' Validation avg. error = {:0.3f}'.format(err_train))
    print(training_method + ' Validation correlation = {:0.2f}'.format(corr_train))

    r2_train, err_train, corr_train = evaluate(test_real, test_pred)
    print(training_method + ' Test R2 score = {:0.2f}'.format(r2_train))
    print(training_method + ' Test avg. error = {:0.3f}'.format(err_train))
    print(training_method + ' Test correlation = {:0.2f}'.format(corr_train))

    # Find the data with a bad prediction
    thd = 5.0
    test_set_bad = []
    test_pred_good = []
    test_real_good = []
    for ns in range(len(test_pred)):
        diff = test_pred[ns] - test_real[ns]
        if abs(diff) >= thd:
            data_tmp = X_test[ns, :]
            test_set_bad.append(data_tmp)
            # print(data_tmp[n_feature], data_tmp[0], data_tmp[3])
        else:
            test_pred_good.append(test_pred[ns])
            test_real_good.append(test_real[ns])
    test_set_bad = np.asarray(test_set_bad)
    print(test_set_bad.shape)
    print(len(test_set_bad))

    r2_train, err_train, corr_train = evaluate(test_real_good, test_pred_good)
    print(training_method + ' Test R2 score for good test set= {:0.2f}'.format(r2_train))
    print(training_method + ' Test avg. error for good test set= {:0.3f}'.format(err_train))
    print(training_method + ' Test correlation for good test set= {:0.2f}'.format(corr_train))

    font = {'size': 18}
    plt.rc('font', **font)

    plt.figure(1)
    plt.scatter(train_real, train_pred, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    plt.figure(2)
    plt.scatter(val_real, val_pred, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    plt.figure(3)
    plt.scatter(test_real, test_pred, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    plt.figure(4)
    plt.scatter(test_real_good, test_pred_good, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    plt.show()
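# The bad-prediction filter above can also be written without the explicit Python
# loop. This is an equivalent vectorized sketch (illustrative, not part of the
# original source), assuming X_test is a NumPy array and test_pred / test_real
# are 1-D arrays aligned with its rows; the threshold default mirrors thd = 5.0.
import numpy as np


def split_by_abs_error(X_test, test_real, test_pred, thd=5.0):
    """Return (rows with |error| >= thd, good predictions, good targets)."""
    test_pred = np.asarray(test_pred)
    test_real = np.asarray(test_real)
    bad_mask = np.abs(test_pred - test_real) >= thd
    return X_test[bad_mask], test_pred[~bad_mask], test_real[~bad_mask]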
def predictive_model(training_method, X_train0, y_train0, X_train, X_val, X_test,
                     y_train, y_val, y_test, keys0):
    train_real = y_train
    val_real = y_val
    test_real = y_test
    keys0 = keys0[0:-1]

    if training_method == 'RF':
        # Grid search kept for reference (commented out):
        """param_grid = {'n_estimators': [10, 50, 100, 150, 200, 400, 600],
                         'min_samples_split': [2, 4, 6, 8, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 3, 5],
                         'bootstrap': [True],
                         'max_samples': [50, 100, 200, 500, 1000]}"""
        """grid = GridSearchCV(RandomForestRegressor(), param_grid=param_grid, cv=5)
        grid.fit(X_train0, y_train0)
        print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
        par_opt = grid.best_params_
        print("Best parameters: ", grid.best_params_)
        print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test)))"""
        """Best cross-validation accuracy: 0.62
        'bootstrap': True, 'max_features': 'auto', 'max_samples': 1000,
        'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 600
        Test set accuracy: 0.58"""

        rf_estimator = RandomForestRegressor(bootstrap=True, max_features='log2',
                                             max_depth=8, min_samples_split=2,
                                             max_samples=2500, n_estimators=500,
                                             min_samples_leaf=2, verbose=0,
                                             random_state=42, n_jobs=6)
        rf_estimator.fit(X_train, y_train)
        train_pred_nor = rf_estimator.predict(X_train)
        val_pred_nor = rf_estimator.predict(X_val)
        test_pred_nor = rf_estimator.predict(X_test)

        # Feature importance analysis
        feature_imp = rf_estimator.feature_importances_
        print(sorted(feature_imp))
        indices4 = np.argsort(feature_imp)
        keys1 = keys0[indices4]
        print('keys1 is', keys1)

        filename = './final models/' + training_method + '_finalized_model.sav'
        model = rf_estimator
        pickle.dump(model, open(filename, 'wb'))

        # Calculate the prediction for training data
        train_pred = train_pred_nor
        val_pred = val_pred_nor
        test_pred = test_pred_nor

    if training_method == 'xgboost':
        # Grid search kept for reference (commented out):
        """param_grid = {'n_estimators': [150, 200, 250],
                         'learning_rate': [0.01, 0.05, 0.1, 0.2],
                         'max_depth': [11, 13, 15],
                         'colsample_bytree': [0.6, 0.7, 0.8],
                         'reg_alpha': [3, 5, 7]}
        grid = GridSearchCV(XGBRegressor(), param_grid=param_grid, cv=5)
        grid.fit(X_train0, y_train0)
        print("Best cross-validation accuracy: {:.2f}".format(grid.best_score_))
        par_opt = grid.best_params_
        print("Best parameters: ", grid.best_params_)
        print("Test set accuracy: {:.2f}".format(grid.score(X_test, y_test)))"""
        """Best cross-validation accuracy: 0.38
        Best parameters: {'colsample_bytree': 0.7, 'learning_rate': 0.05,
                          'max_depth': 13, 'n_estimators': 250, 'reg_alpha': 3}
        Test set accuracy: 0.51"""

        # rf_estimator = XGBRegressor(learning_rate=0.05, max_depth=13, colsample_bytree=0.7,
        #                             n_estimators=250, reg_alpha=3, random_state=0, n_jobs=5)
        xgb_estimator = XGBRegressor(learning_rate=0.03, max_depth=8, n_estimators=500,
                                     random_state=42, n_jobs=5)
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_estimator.fit(X_train, y_train, eval_metric="mae",
                          eval_set=eval_set, verbose=True)
        results = xgb_estimator.evals_result()
        # Refit with early stopping on the validation set:
        xgb_estimator.fit(X_train, y_train, eval_metric="mae", early_stopping_rounds=10,
                          eval_set=eval_set, verbose=True)
        train_pred_nor = xgb_estimator.predict(X_train)
        val_pred_nor = xgb_estimator.predict(X_val)
        test_pred_nor = xgb_estimator.predict(X_test)

        # Feature importance analysis
        feature_imp = xgb_estimator.feature_importances_
        print(sorted(feature_imp))
        indices1 = np.argsort(feature_imp)
        keys1 = keys0[indices1]  # fixed: was keys0[indices4], which is undefined in this branch
        print('keys1 is', keys1)

        filename = './final models/' + training_method + '_finalized_model.sav'
        model = xgb_estimator
        pickle.dump(model, open(filename, 'wb'))

        # Calculate the prediction for training data
        train_pred = train_pred_nor
        val_pred = val_pred_nor
        test_pred = test_pred_nor

    r2_train, err_train, corr_train = evaluate(train_real, train_pred)
    print(training_method + ' Train R2 score = {:0.2f}'.format(r2_train))
    print(training_method + ' Train avg. error = {:0.3f}'.format(err_train))
    print(training_method + ' Train correlation = {:0.2f}'.format(corr_train))

    r2_train, err_train, corr_train = evaluate(val_real, val_pred)
    print(training_method + ' Validation R2 score = {:0.2f}'.format(r2_train))
    print(training_method + ' Validation avg. error = {:0.3f}'.format(err_train))
    print(training_method + ' Validation correlation = {:0.2f}'.format(corr_train))

    r2_train, err_train, corr_train = evaluate(test_real, test_pred)
    print(training_method + ' Test R2 score = {:0.2f}'.format(r2_train))
    print(training_method + ' Test avg. error = {:0.3f}'.format(err_train))
    print(training_method + ' Test correlation = {:0.2f}'.format(corr_train))

    # Plot training curve
    # epochs = len(results['validation_0']['mae'])
    # x_axis = range(0, epochs)

    # Feature importance analysis
    keys1 = ['Psychiatric disorders', 'B12 deficiency', 'Hand', 'Hypertension',
             'Depression (Other)', 'Hypercholesterolemia', 'Angioplasty',
             'Cardiovascular (others)', 'Marriage state', 'Packs per day',
             'Incontinence (urinary)', 'Residence type', 'Family history', 'Gender',
             'Smoking years', 'Depression (2 years)', 'Incontinence (bowel)',
             'Living situation', 'APOE', 'Education', 'BMI', 'GDS', 'Age']

    font = {'size': 18}
    plt.rc('font', **font)

    plt.figure(1)
    plt.title('Feature Importances')
    # fixed: the ranges below used len(indices4), which is undefined in the xgboost branch
    plt.barh(range(len(feature_imp)), sorted(feature_imp), color='b', align='center')
    plt.yticks(range(len(feature_imp)), [keys1[i] for i in range(len(feature_imp))])
    plt.xlabel('Relative Importance')
    plt.show()

    plt.figure(2)
    plt.scatter(train_real, train_pred, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    plt.figure(3)
    plt.scatter(val_real, val_pred, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    plt.figure(4)
    plt.scatter(test_real, test_pred, color='black')
    plt.xlabel("CDR")
    plt.ylabel("CDR (Predicted)")
    plt.xlim(-2, 20)
    plt.ylim(-2, 20)

    """plt.figure(5)
    plt.plot(x_axis, results['validation_0']['mae'], label='Train', color='black')
    plt.plot(x_axis, results['validation_1']['mae'], label='Validation', color='r')
    plt.legend()
    plt.xlabel('Number of estimator')
    plt.ylabel('Mean absolute error')
    #plt.title('XGBoost Log Loss')"""

    plt.show()
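# Illustrative driver for predictive_model / predictive_model_aug (not part of the
# original source). It assumes the train/validation/test splits and an evaluate()
# helper (returning R2, average error and correlation) are defined elsewhere in the
# project; keys0 is the array of column names with the target as its last entry,
# matching the keys0[0:-1] slice inside both functions.
import os

os.makedirs('./final models', exist_ok=True)   # both functions pickle models here
# predictive_model('RF', X_train0, y_train0,
#                  X_train, X_val, X_test,
#                  y_train, y_val, y_test, keys0)
# predictive_model_aug('xgboost', X_train0, y_train0,
#                      X_train, X_val, X_test,
#                      y_train, y_val, y_test, keys0)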