class XGBWrapper_regr(object):
    """
    A wrapper for xgboost model so that we will have a single api for various models.

    After ``fit`` has run, the instance exposes:

    * ``best_score_`` -- ``{eval_set_name: {metric_name: value}}`` where the
      names are the keys returned by the model's ``evals_result()`` and each
      value is the LAST entry of that metric's history.
    * ``feature_importances_`` -- taken from the underlying model.
    """

    def __init__(self):
        self.model = XGBRegressor()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None, X_holdout=None, y_holdout=None, params=None):
        """
        Configure the underlying regressor with ``params`` and fit it.

        :param X_train: training features
        :param y_train: training targets
        :param X_valid: optional validation features; appended to the
            evaluation sets together with ``y_valid`` when not None
        :param y_valid: optional validation targets
        :param X_holdout: optional hold-out features; appended to the
            evaluation sets together with ``y_holdout`` when not None
        :param y_holdout: optional hold-out targets
        :param params: optional dict passed as a whole to ``set_params``.
            The keys ``'verbose'`` and ``'early_stopping_rounds'``, when
            present, are additionally forwarded to the model's ``fit``.
            ``None`` is treated as an empty dict.
        """
        # The signature defaults ``params`` to None; unpacking None with ``**``
        # (or indexing it) would raise, so normalise it first.
        params = {} if params is None else params

        self.model = self.model.set_params(**params)

        eval_set = [(X_train, y_train)]
        if X_valid is not None:
            eval_set.append((X_valid, y_valid))
        if X_holdout is not None:
            eval_set.append((X_holdout, y_holdout))

        # Forward the fit-time options only when the caller supplied them, so
        # the model's own defaults apply otherwise instead of a KeyError.
        fit_options = {key: params[key]
                       for key in ('verbose', 'early_stopping_rounds')
                       if key in params}

        self.model.fit(X=X_train, y=y_train,
                       eval_set=eval_set, eval_metric='rmse',
                       **fit_options)

        scores = self.model.evals_result()
        # NOTE(review): this records the value at the final recorded round for
        # every metric; with early stopping that is presumably not the best
        # round -- confirm whether callers expect the best iteration instead.
        self.best_score_ = {
            set_name: {metric: history[-1] for metric, history in metrics.items()}
            for set_name, metrics in scores.items()
        }

        self.feature_importances_ = self.model.feature_importances_

    def predict(self, X_test):
        """Return the underlying model's predictions for ``X_test``."""
        return self.model.predict(X_test)
# Example #2
# 0
def XGBoost_test(X_train, X_test, Y_train, Y_test,
                 max_depth=4, n_estimators=100, learning_rate=0.1,
                 min_child_weight=1, n_jobs=4,
                 verbose_eval=False, verbose=True,
                 plot_predict=True, return_y_pred=False, return_mse=False,
                 Y_scaler=None):
    """Fit an XGBRegressor, score it on the test split and optionally plot.

    The regressor is fitted on ``(X_train, Y_train)`` with both the training
    and the test split registered as evaluation sets (metric ``'rmse'``).

    Args:
        X_train, Y_train: training features / targets.
        X_test, Y_test: test features / targets. ``Y_test`` must expose
            ``.index`` whenever a plot or the predictions are requested.
        max_depth, n_estimators, learning_rate, min_child_weight, n_jobs:
            passed straight to ``XGBRegressor``.
        verbose_eval: forwarded as ``verbose`` to ``fit``.
        verbose: when truthy, print the model and the test MSE and plot the
            squared per-round RMSE of both evaluation sets.
        plot_predict: when truthy, plot test targets against predictions.
        return_y_pred: request the predictions (as a Series on
            ``Y_test.index``) as return value.
        return_mse: request the test MSE as return value.
        Y_scaler: optional object with ``inverse_transform``; when given, the
            plotted targets and predictions are inverse-transformed first.

    Returns:
        ``(predictions, mse)`` when both return flags are set, only the
        predictions or only the MSE when one is set, otherwise ``None``.
    """
    regressor = XGBRegressor(max_depth=max_depth,
                             n_estimators=n_estimators,
                             learning_rate=learning_rate,
                             min_child_weight=min_child_weight,
                             n_jobs=n_jobs)

    if verbose:
        print(regressor, '\n')

    evaluation_sets = [(X_train, Y_train), (X_test, Y_test)]
    regressor.fit(X_train, Y_train,
                  verbose=verbose_eval,
                  eval_set=evaluation_sets,
                  eval_metric='rmse')
    Y_preds = regressor.predict(X_test)
    test_MSE = mean_squared_error(Y_preds, Y_test)

    if verbose:
        print('\nTest MSE', test_MSE)

        # One MSE-per-round curve for each evaluation set, train first.
        for result_key, curve_name in (('validation_0', 'train'),
                                       ('validation_1', 'test')):
            rmse_history = regressor.evals_result()[result_key]['rmse']
            curve = pd.Series(np.square(rmse_history), name=curve_name)
            curve.index.name = 'n_estimators'
            curve.plot(title='MSE', legend=True)

    if plot_predict:
        if Y_scaler is None:
            columns = [Y_test, pd.Series(Y_preds, index=Y_test.index)]
        else:
            # Bring both series back to the original target scale.
            actual = Y_scaler.inverse_transform(
                Y_test.copy().values.reshape(-1, 1)).reshape(-1, )
            predicted = Y_scaler.inverse_transform(
                np.array(Y_preds).reshape(-1, 1)).reshape(-1, )
            columns = [pd.Series(actual, index=Y_test.index),
                       pd.Series(predicted, index=Y_test.index)]
        df = pd.concat(columns, axis=1)
        df.columns = ['test', 'pred']
        df.plot(figsize=(10, 4), title='MSE:{}'.format(round(test_MSE, 4)))

    if (return_y_pred) & (return_mse):
        return pd.Series(Y_preds, index=Y_test.index), test_MSE
    if return_y_pred:
        return pd.Series(Y_preds, index=Y_test.index)
    if return_mse:
        return test_MSE
    def create_model(self, X_train, y_train, X_valid, y_valid, model_name, verbose=0):
        """
        Create an XGBoost regressor, fit it to the data and store the results.

        The regressor is configured from attributes of ``self`` (``objective``,
        ``subsample``, ``colsample_bytree``, ``learning_rate``, ``max_depth``,
        ``reg_alpha``, ``n_estimators``, ``n_jobs``, ``random_state`` and
        ``verbosity``). The training and the validation data are both passed
        as evaluation sets with ``'rmse'`` as metric.

        Side effects on ``self``:

        * ``model_name`` and ``training_history`` are always set.
        * ``prediction_model`` is set to the fitted model only when
          ``retraining_memory_save_mode`` equals ``False``.
        * When ``retraining_memory_save_mode`` equals ``True`` the fitted
          model is pickled to ``<model_name>.pickle.dat`` inside the
          hard-coded temp folder and then released from memory.

        :param X_train: training features
        :param y_train: training targets
        :param X_valid: validation features
        :param y_valid: validation targets
        :param model_name: name stored on ``self`` and used for the file name
        :param verbose: not used by this method; per-round logging is switched
            on when ``self.verbosity == 1``
        :return: the history returned by the fitted model's ``evals_result()``
        """

        print('## Model {} is fitted..'.format(model_name))

        # create model with Wrapper:
        xg_reg_model = XGBRegressor(objective=self.objective, subsample=self.subsample,
                                    colsample_bytree=self.colsample_bytree,
                                    learning_rate=self.learning_rate, max_depth=self.max_depth,
                                    reg_alpha=self.reg_alpha, n_estimators=self.n_estimators,
                                    n_jobs=self.n_jobs, random_state=self.random_state,
                                    verbosity=self.verbosity)

        # create evaluation set:
        eval_set = [(X_train, y_train), (X_valid, y_valid)]

        verbose_flag = bool(self.verbosity == 1)

        # fit regressor:
        fit_kwargs = {'eval_metric': 'rmse', 'eval_set': eval_set, 'verbose': verbose_flag}
        # The flag is compared with ``==`` (rather than ``not``) so that, as
        # before, only a value equal to False disables early stopping.
        if self.early_stopping_flag == False:
            xg_reg_model = xg_reg_model.fit(X_train, y_train, **fit_kwargs)
        else:
            xg_reg_model = xg_reg_model.fit(X_train, y_train,
                                            early_stopping_rounds=self.early_stopping_rounds,
                                            **fit_kwargs)

        history = xg_reg_model.evals_result()

        print('Fitting Model done!')

        # assign/"store" model, history & model_name:
        if self.retraining_memory_save_mode == False:
            self.prediction_model = xg_reg_model

        self.model_name = model_name
        self.training_history = history

        # store model on disk in temp-folder:
        if self.retraining_memory_save_mode == True:
            # NOTE(review): machine-specific absolute path; it must exist on
            # the host running this code.
            Save_PATH = ('/media/vincent/harddrive/ML-Projects_all/NY_Cab_Project/NY_Cab_Data/results/xg_boost_Models/'
                         'Temp_Boosting_Models/')

            final_model_name = model_name + '.pickle.dat'
            file_to_save = Save_PATH + final_model_name
            # save model on disk; the context manager guarantees the file is
            # flushed and closed even if pickling fails:
            with open(file_to_save, "wb") as model_file:
                pickle.dump(xg_reg_model, model_file)

            # delete model to release memory. NOTE(review): this calls the
            # finalizer of the private ``_Booster`` attribute by hand,
            # presumably to free its memory eagerly -- confirm it is still
            # needed with the xgboost version in use.
            xg_reg_model._Booster.__del__()
            del xg_reg_model
            gc.collect()

        return history
# Example #4
# 0
def predictive_model_aug(training_method, X_train0, y_train0, X_train, X_val, X_test, y_train, y_val, y_test, keys0):
    """Train a regressor, report its scores, pickle it and draw scatter plots.

    Steps, in order:

    1. Fit a ``RandomForestRegressor`` (``'RF'``) or an ``XGBRegressor``
       (``'xgboost'``) on ``(X_train, y_train)``.
    2. Predict on the train, validation and test features.
    3. Print the sorted feature importances and ``keys0`` (without its last
       entry) reordered by ascending importance.
    4. Pickle the fitted model under ``./final models/``.
    5. Print the three values returned by ``evaluate`` (labelled R2 score,
       avg. error and correlation) for each split.
    6. Split the test predictions into "bad" (absolute error >= 5.0) and
       "good" ones, print the shape/count of the bad rows and evaluate the
       good ones.
    7. Show scatter plots of real vs. predicted values (figures 1-4).

    Args:
        training_method: ``'RF'`` or ``'xgboost'``.
        X_train0, y_train0: not used by the active code; they were the input
            of an earlier grid search that is kept only as a comment.
        X_train, X_val, X_test: feature matrices. ``X_test`` is indexed as
            ``X_test[row, :]``, so it must support 2-D indexing.
        y_train, y_val, y_test: targets of the three splits.
        keys0: feature names plus one trailing entry that is dropped; must
            support indexing with an integer array.

    Raises:
        ValueError: if ``training_method`` is neither ``'RF'`` nor
            ``'xgboost'``.
    """
    if training_method not in ('RF', 'xgboost'):
        # Fail early with a clear message instead of a NameError further down.
        raise ValueError(
            "Unsupported training_method {!r}; expected 'RF' or 'xgboost'".format(training_method))

    train_real = y_train
    val_real = y_val
    test_real = y_test
    keys0 = keys0[0:-1]

    if training_method == 'RF':
        # An earlier GridSearchCV(RandomForestRegressor(), cv=5) on
        # (X_train0, y_train0) recorded: best cross-validation score 0.62,
        # test score 0.58 with bootstrap=True, max_features='auto',
        # max_samples=1000, min_samples_leaf=1, min_samples_split=2,
        # n_estimators=600. The values below are the ones actually used.
        model = RandomForestRegressor(bootstrap=True, max_features='log2', max_depth=8, min_samples_split=2,
                                      max_samples=2800, n_estimators=600, min_samples_leaf=2, verbose=0,
                                      random_state=42, n_jobs=6)
        model.fit(X_train, y_train)
        filename = './final models/' + training_method + '_finalized_model_aug.sav'
    else:
        # for illustration purposes only, don't use this code!
        # An earlier GridSearchCV(XGBRegressor(), cv=5) on (X_train0, y_train0)
        # recorded: best cross-validation score 0.38, test score 0.51 with
        # colsample_bytree=0.7, learning_rate=0.05, max_depth=13,
        # n_estimators=250, reg_alpha=3. The values below are the ones used.
        model = XGBRegressor(learning_rate=0.03, max_depth=8,
                             n_estimators=500, random_state=42, n_jobs=5)
        eval_set = [(X_train, y_train), (X_val, y_val)]
        # NOTE(review): this first full-length fit appears to exist only to
        # capture ``results``, which is not used further in this function; the
        # model is fitted again with early stopping right after.
        model.fit(X_train, y_train, eval_metric="mae", eval_set=eval_set, verbose=True)
        results = model.evals_result()
        model.fit(X_train, y_train, eval_metric="mae", early_stopping_rounds=10, eval_set=eval_set,
                  verbose=True)
        # NOTE(review): unlike the RF branch this file name has no "_aug"
        # suffix -- confirm that this is intended.
        filename = './final models/' + training_method + '_finalized_model.sav'

    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    # Feature importance analysis. The sort order is computed from the model
    # that was just fitted (the xgboost branch used to reference an index
    # array that only the RF branch defined).
    feature_imp = model.feature_importances_
    print(sorted(feature_imp))
    importance_order = np.argsort(feature_imp)
    keys1 = keys0[importance_order]
    print('keys1 is', keys1)

    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    for split_name, real, pred in (('Train', train_real, train_pred),
                                   ('Validation', val_real, val_pred),
                                   ('Test', test_real, test_pred)):
        r2, err, corr = evaluate(real, pred)
        print(training_method + ' ' + split_name + ' R2 score = {:0.2f}'.format(r2))
        print(training_method + ' ' + split_name + ' avg. error = {:0.3f}'.format(err))
        print(training_method + ' ' + split_name + ' correlation = {:0.2f}'.format(corr))

    # Find the data with a bad prediction
    thd = 5.0
    test_set_bad = []
    test_pred_good = []
    test_real_good = []
    for ns, (pred_value, real_value) in enumerate(zip(test_pred, test_real)):
        if abs(pred_value - real_value) >= thd:
            test_set_bad.append(X_test[ns, :])
        else:
            test_pred_good.append(pred_value)
            test_real_good.append(real_value)
    test_set_bad = np.asarray(test_set_bad)
    print(test_set_bad.shape)
    print(len(test_set_bad))

    r2, err, corr = evaluate(test_real_good, test_pred_good)
    print(training_method + ' Test R2 score for good test set= {:0.2f}'.format(r2))
    print(training_method + ' Test avg. error for good test set= {:0.3f}'.format(err))
    print(training_method + ' Test correlation for good test set= {:0.2f}'.format(corr))

    font = {'size': 18}
    plt.rc('font', **font)

    # Figures 1-4: train, validation, test and "good" test predictions.
    scatter_inputs = ((train_real, train_pred),
                      (val_real, val_pred),
                      (test_real, test_pred),
                      (test_real_good, test_pred_good))
    for figure_number, (real, pred) in enumerate(scatter_inputs, start=1):
        plt.figure(figure_number)
        plt.scatter(real, pred, color='black')
        plt.xlabel("CDR")
        plt.ylabel("CDR (Predicted)")
        plt.xlim(-2, 20)
        plt.ylim(-2, 20)

    plt.show()
# Example #5
# 0
def predictive_model(training_method, X_train0, y_train0, X_train, X_val, X_test, y_train, y_val, y_test, keys0):
    """Train a regressor, report its scores, pickle it and draw plots.

    Steps, in order:

    1. Fit a ``RandomForestRegressor`` (``'RF'``) or an ``XGBRegressor``
       (``'xgboost'``) on ``(X_train, y_train)``.
    2. Predict on the train, validation and test features.
    3. Print the sorted feature importances and ``keys0`` (without its last
       entry) reordered by ascending importance.
    4. Pickle the fitted model to
       ``./final models/<training_method>_finalized_model.sav``.
    5. Print the three values returned by ``evaluate`` (labelled R2 score,
       avg. error and correlation) for each split.
    6. Show a horizontal bar chart of the sorted feature importances
       (figure 1), then scatter plots of real vs. predicted values for the
       train, validation and test splits (figures 2-4).

    Args:
        training_method: ``'RF'`` or ``'xgboost'``.
        X_train0, y_train0: not used by the active code; they were the input
            of an earlier grid search that is kept only as a comment.
        X_train, X_val, X_test: feature matrices of the three splits.
        y_train, y_val, y_test: targets of the three splits.
        keys0: feature names plus one trailing entry that is dropped; must
            support indexing with an integer array.

    Raises:
        ValueError: if ``training_method`` is neither ``'RF'`` nor
            ``'xgboost'``.
    """
    if training_method not in ('RF', 'xgboost'):
        # Fail early with a clear message instead of a NameError further down.
        raise ValueError(
            "Unsupported training_method {!r}; expected 'RF' or 'xgboost'".format(training_method))

    train_real = y_train
    val_real = y_val
    test_real = y_test
    keys0 = keys0[0:-1]

    if training_method == 'RF':
        # An earlier GridSearchCV(RandomForestRegressor(), cv=5) on
        # (X_train0, y_train0) recorded: best cross-validation score 0.62,
        # test score 0.58 with bootstrap=True, max_features='auto',
        # max_samples=1000, min_samples_leaf=1, min_samples_split=2,
        # n_estimators=600. The values below are the ones actually used.
        model = RandomForestRegressor(bootstrap=True, max_features='log2', max_depth=8, min_samples_split=2,
                                      max_samples=2500, n_estimators=500, min_samples_leaf=2, verbose=0,
                                      random_state=42, n_jobs=6)
        model.fit(X_train, y_train)
    else:
        # for illustration purposes only, don't use this code!
        # An earlier GridSearchCV(XGBRegressor(), cv=5) on (X_train0, y_train0)
        # recorded: best cross-validation score 0.38, test score 0.51 with
        # colsample_bytree=0.7, learning_rate=0.05, max_depth=13,
        # n_estimators=250, reg_alpha=3. The values below are the ones used.
        model = XGBRegressor(learning_rate=0.03, max_depth=8,
                             n_estimators=500, random_state=42, n_jobs=5)
        eval_set = [(X_train, y_train), (X_val, y_val)]
        # NOTE(review): this first full-length fit appears to exist only to
        # capture ``results`` (per-round "mae" history) for a training-curve
        # plot that is currently disabled; the model is fitted again with
        # early stopping right after.
        model.fit(X_train, y_train, eval_metric="mae", eval_set=eval_set, verbose=True)
        results = model.evals_result()
        model.fit(X_train, y_train, eval_metric="mae", early_stopping_rounds=10, eval_set=eval_set,
                  verbose=True)

    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)
    test_pred = model.predict(X_test)

    # Feature importance analysis. The sort order is computed from the model
    # that was just fitted (the xgboost branch used to reference an index
    # array that only the RF branch defined).
    feature_imp = model.feature_importances_
    print(sorted(feature_imp))
    importance_order = np.argsort(feature_imp)
    keys1 = keys0[importance_order]
    print('keys1 is', keys1)

    filename = './final models/' + training_method + '_finalized_model.sav'
    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    for split_name, real, pred in (('Train', train_real, train_pred),
                                   ('Validation', val_real, val_pred),
                                   ('Test', test_real, test_pred)):
        r2, err, corr = evaluate(real, pred)
        print(training_method + ' ' + split_name + ' R2 score = {:0.2f}'.format(r2))
        print(training_method + ' ' + split_name + ' avg. error = {:0.3f}'.format(err))
        print(training_method + ' ' + split_name + ' correlation = {:0.2f}'.format(corr))

    # Hand-written tick labels for the importance chart.
    # NOTE(review): these are not derived from ``keys1`` computed above;
    # presumably they list the features in ascending-importance order of an
    # earlier run -- confirm they still match the fitted model.
    display_names = ['Psychiatric disorders', 'B12 deficiency', 'Hand', 'Hypertension', 'Depression (Other)',
                     'Hypercholesterolemia', 'Angioplasty', 'Cardiovascular (others)', 'Marriage state',
                     'Packs per day', 'Incontinence (urinary)',
                     'Residence type', 'Family history', 'Gender', 'Smoking years', 'Depression (2 years)',
                     'Incontinence (bowel)',
                     'Living situation', 'APOE', 'Education', 'BMI', 'GDS', 'Age']

    font = {'size': 18}
    plt.rc('font', **font)

    bar_positions = range(len(importance_order))
    plt.figure(1)
    plt.title('Feature Importances')
    plt.barh(bar_positions, sorted(feature_imp), color='b', align='center')
    plt.yticks(bar_positions, [display_names[i] for i in bar_positions])
    plt.xlabel('Relative Importance')
    plt.show()

    # Figures 2-4: train, validation and test predictions.
    scatter_inputs = ((train_real, train_pred),
                      (val_real, val_pred),
                      (test_real, test_pred))
    for figure_number, (real, pred) in enumerate(scatter_inputs, start=2):
        plt.figure(figure_number)
        plt.scatter(real, pred, color='black')
        plt.xlabel("CDR")
        plt.ylabel("CDR (Predicted)")
        plt.xlim(-2, 20)
        plt.ylim(-2, 20)

    plt.show()