Example #1
def adaboost_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum,
                     learn_options):
    '''
    Gradient-boosted regressor (GradientBoostingRegressor) from scikit-learn.
    '''

    if learn_options['adaboost_version'] == 'python':
        if not learn_options['adaboost_CV']:
            clf = en.GradientBoostingRegressor(
                loss=learn_options['adaboost_loss'],
                learning_rate=learn_options['adaboost_learning_rate'],
                n_estimators=learn_options['adaboost_n_estimators'],
                alpha=learn_options['adaboost_alpha'],
                subsample=1.0,
                min_samples_split=2,
                min_samples_leaf=1,
                max_depth=learn_options['adaboost_max_depth'],
                init=None,
                random_state=None,
                max_features=None,
                verbose=0,
                max_leaf_nodes=None,
                warm_start=False)

            clf.fit(X[train], y[train].flatten())
            y_pred = clf.predict(X[test])[:, None]
        else:
            print "Adaboost with GridSearch"
            from sklearn.model_selection import GridSearchCV
            param_grid = {
                'learning_rate': [0.1, 0.05, 0.01],
                'max_depth': [4, 5, 6, 7],
                'min_samples_leaf': [5, 7, 10, 12, 15],
                'max_features': [1.0, 0.5, 0.3, 0.1]
            }

            label_encoder = sklearn.preprocessing.LabelEncoder()
            label_encoder.fit(y_all['Target gene'].values[train])
            gene_classes = label_encoder.transform(
                y_all['Target gene'].values[train])
            n_folds = len(np.unique(gene_classes))
            # sklearn.model_selection.StratifiedKFold takes n_splits and yields the
            # folds from split(); stratify on the gene classes, not the regression target.
            skf = sklearn.model_selection.StratifiedKFold(n_splits=n_folds,
                                                          shuffle=True)
            cv = list(skf.split(X[train], gene_classes))

            est = en.GradientBoostingRegressor(
                loss=learn_options['adaboost_loss'],
                n_estimators=learn_options['adaboost_n_estimators'])
            clf = GridSearchCV(est,
                               param_grid,
                               n_jobs=20,
                               verbose=1,
                               cv=cv,
                               scoring=spearman_scoring,
                               iid=False).fit(X[train], y[train].flatten())
            print(clf.best_params_)
            y_pred = clf.predict(X[test])[:, None]
    else:
        raise NotImplementedError

    return y_pred, clf
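# Note: `spearman_scoring` is referenced in the GridSearchCV call above but not
# defined in this snippet. A minimal sketch of one way to provide it, assuming it
# is simply meant to rank-correlate predictions with targets (illustration only):
from scipy.stats import spearmanr

def spearman_scoring(estimator, X, y):
    # GridSearchCV accepts a callable scorer with this (estimator, X, y) signature.
    return spearmanr(estimator.predict(X), y)[0]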
def main(model='mlp', num_epochs=500, dataset='reuters', folder="", exp_start_time=None):
    # Load the dataset
    print("Loading data...")
      
    if dataset== 'boston':

        from sklearn import cross_validation
        from sklearn import preprocessing
        from sklearn import datasets
        #from sklearn.utils import shuffle
        boston = datasets.load_boston()
        X, y = boston.data.astype('float32'), boston.target.astype('float32')
        #X, y = shuffle(boston.data, boston.target, random_state=13)
        scaler = preprocessing.StandardScaler()
        X = scaler.fit_transform(X)
        
        X_train, X_test, y_train, y_test = cross_validation.train_test_split(
                X, y, test_size=0.1, random_state=42)
        
        #X_train = scaler.fit_transform(X_train)
        X_val = X_train.copy()
        y_val = y_train.copy()
        print("validation is just a copy of X_train, so results will be similar but with no drop out")
        
        
        from sklearn import ensemble
        from sklearn.metrics import mean_squared_error
        params = {'n_estimators': 150, 'max_depth': 4, 'min_samples_split': 2,
                  'learning_rate': 0.01, 'loss': 'ls'}
        clf = ensemble.GradientBoostingRegressor(**params)

        clf.fit(X_train, y_train)   
        mse_train = mean_squared_error(y_train, clf.predict(X_train))
        mse_test = mean_squared_error(y_test, clf.predict(X_test))
        print("GRAD BOOST MSE train: %.4f" % mse_train)
        print("GRAD BOOST MSE test: %.4f" % mse_test)
        
        input_var = T.fmatrix('inputs')
        target_var = T.fvector('targets')
        input_shape= (None, X_train.shape[1])
        output_shape  = 1
        batch_num = 128
        regress= True
        batch_norm =False
        print(output_shape)
        
        network = build_network_model(model, input_var, input_shape, output_shape,
                                      batch_norm=batch_norm, regress=regress)
        print(network.output_shape)
        train_fn, eval_fn, LR_params = build_functions(network, input_var, 
                                                       target_var, 
                                                       regress=regress)
    elif dataset.startswith('reuters'):
        X_train, y_train, X_val, y_val, X_test, y_test =load_dataset_reuters('../datasets/reuters/')
        print ("Train: ",X_train.shape, "Val: ", X_val.shape, "Test: ",X_test.shape)
        input_var = T.fmatrix('inputs')
        target_var = T.fmatrix('targets')
        input_shape= (None, X_train.shape[1])
        output_shape  = y_train.shape[1]
        regress = False
        batch_norm = True
        
#        from sklearn import ensemble
#        from sklearn.metrics import accuracy_score
#        params = {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 2}
#        clf = ensemble.RandomForestClassifier(**params)
#
#        clf.fit(X_train, y_train)   
#        mse_train = accuracy_score(y_train, clf.predict(X_train))
#        mse_test = accuracy_score(y_test, clf.predict(X_test))
#        print("RANDOM  train: %2.4f" % mse_train)
#        print("RANDOM  test: %2.4f" % mse_test)
#        
#        from sklearn import neural_network
#        clf = neural_network.MLPClassifier(hidden_layer_sizes=(150,), 
#                                           activation='relu', 
#                                           solver='sgd', batch_size=16, 
#                                           learning_rate='constant', 
#                                           learning_rate_init=0.001, 
#                                           max_iter=250, 
#                                           early_stopping=True,
#                                           shuffle=True)
#        clf.fit(X_train, y_train)   
#        mse_train = accuracy_score(y_train, clf.predict(X_train))
#        mse_test = accuracy_score(y_test, clf.predict(X_test))
#        print("KNN  train: %2.4f" % mse_train)
#        print("KNN  test: %2.4f" % mse_test)
#        import pdb
#        pdb.set_trace()
        batch_num = 16
        network = build_network_model(model, input_var, input_shape, output_shape,
                                      batch_norm=batch_norm, regress=regress)
        print(network.output_shape)
        train_fn, eval_fn, LR_params = build_functions(network, input_var, 
                                                       target_var, regress=regress)
    
    # Prepare Theano variables for inputs and targets
    print("input shape:", input_shape)
    
    val_acc_list =[]
    tst_acc_list =[] 
    val_err_list = []
    trn_err_list = []
    
    print("Model", model)
    if model.startswith("mlp:"):
        lr_all = 5e-4
    else:
        lr_all = 1e-4 #reuters best 1e-4 for focused doesnt change 5e-5
        
    lr_all_decay = .9
    lr_mu = 0.001 
    lr_mu_decay = 0.9
    lr_si = 0.001
    lr_si_decay = 0.9
    lr_fw = 0.001
    lr_fw_decay = .9
    decay_epoch = 30
    print_int = 10
    if dataset=='boston':
        lr_all = 0.005
        lr_all_decay = .9
        lr_mu = 0.001
        lr_mu_decay = 0.9
        lr_si = 0.001
        lr_si_decay = 0.9
        lr_fw = 0.005
        lr_fw_decay = .9
        decay_epoch = 1000
        print_int = 1000
    
    set_params_value(LR_params,[lr_all,lr_mu,lr_si,lr_fw])
    
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
      
        if (epoch>1 and epoch%decay_epoch==1):
            #lr_all = 0.001
            #lr_fw = 0.001
            lr_all = lr_all * lr_all_decay
            lr_mu = lr_mu * lr_mu_decay
            lr_si = lr_si * lr_si_decay
            lr_fw = lr_fw * lr_fw_decay
            
            set_params_value(LR_params,[lr_all,lr_mu,lr_si,lr_fw])
        
        for batch in iterate_minibatches(X_train, y_train, batch_num, shuffle=True):
            inputs, targets = batch

            train_err += train_fn(inputs, targets)
            train_batches += 1
        
        trn_err_list.append(train_err/train_batches)
        
        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_batches = 0
        
        for batch in iterate_minibatches(X_val, y_val, y_val.shape[0], shuffle=False):
            inputs, targets = batch
            err, acc = eval_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_batches += 1
            
            
        train_err = train_err / train_batches
        val_err = val_err / val_batches
        val_acc = val_acc / val_batches * 100
        val_err_list.append(val_err)
        # Then we print the results for this epoch:
        if (epoch%print_int==0):
            print("Model {} Epoch {} of {} took {:.3f}s".format(model, epoch + 1, num_epochs, time.time() - start_time))
            print("  training loss:\t\t{:2.6f}".format(train_err))
            print("  validation loss:\t\t{:2.6f}".format(val_err))
            if not regress:
                print("  validation accuracy:\t\t{:2.4f} %".format(val_acc))
            else:
                val_acc = 1-val_err
                
            print_param_stats(network)
            #debug_focus_vars(network)
        
        
        
        if np.isnan(train_err):
            print("Train error NAN")
            break
        tst_err, tst_acc = eval_fn(X_test, y_test)
        if not regress:
            tst_acc_list.append(tst_acc * 100) # to pick the tst error at best val accuracy.
            val_acc_list.append(val_acc)
        else:
            tst_acc_list.append(tst_err) # to pick the tst error at best val accuracy. 
            val_acc_list.append(val_err)
    # After training, we compute and print the test error:
    
    val_ac_np = np.asarray(val_acc_list)
    if regress:
        best_val = np.argmin(val_ac_np)
    else:
        best_val = np.argmax(val_ac_np)
    if np.isnan(train_err):
        return
    tst_err_fin, tst_acc_fin = eval_fn(X_test, y_test)
    print("\nFinal results:")
    print("  test loss:\t\t\t{:.6f}".format(tst_err_fin))
    print("  test accuracy:\t\t{:.4f} %".format(tst_acc_fin))
    
    print("\nTest result at best val epoch: ", best_val)
    print("  test accuracy:\t\t{:.4f} %".format(tst_acc_list[best_val]))
    
    best_test_early_stop = tst_acc_list[best_val]
    from datetime import datetime
    now = datetime.now()
    timestr = now.strftime("%Y%m%d-%H%M%S")
    print("_result_change")
    print(start_time, timestr)
    filename= str(folder+dataset+"_result_"+model+"_"+exp_start_time+"_"+timestr)
    
    np.savez(filename,(trn_err_list, val_err_list, val_acc_list, tst_err_fin, 
                       tst_acc_fin*100, tst_acc_list, best_test_early_stop))

    
    # save model and code 
    filename= str(folder+dataset+"_model_"+model+"_"+timestr)
    fixed_params = lasagne.layers.get_all_params(network, trainable=False)
    fixed_params =[t.name for t in fixed_params]
    trn_params = lasagne.layers.get_all_params(network, trainable=True)
    trn_params =[t.name for t in trn_params]
    fixed_param_values = lasagne.layers.get_all_param_values(network, trainable=False)
    trn_param_values = lasagne.layers.get_all_param_values(network, trainable=True)
    
    np.savez(filename, trn_params, trn_param_values, fixed_params, fixed_param_values)




    
        
    plt_figures = False
    if plt_figures:
        import matplotlib.pyplot as plt
        plt.plot(trn_err_list)
        plt.plot(val_err_list)
        plt.ylim([0, 0.25])
        plt.title("Train and Validation Error")
        plt.legend(("Train","Validate"))
        plt.show()
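# A minimal sketch of how main() might be driven as a script; the folder path and
# timestamp format are illustrative assumptions (`time` is already imported above):
if __name__ == '__main__':
    main(model='mlp', num_epochs=500, dataset='boston',
         folder='./results/', exp_start_time=time.strftime("%Y%m%d-%H%M%S"))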
Example #3
print "\n\n\n mean err ", s.mean()
#==============================================================================
# GBoost
#==============================================================================

# Fit regression model
params = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 2,  # must be at least 2
    'learning_rate': 0.01,
    'loss': 'ls',
    'verbose': 1
}
clf = ensemble.GradientBoostingRegressor(**params)
i = 0
clf.fit(X_train, Y_train[:, i])
mse = mean_squared_error(Y_valid[:, i], clf.predict(X_valid))
print("MSE: %.4f" % mse)

#==============================================================================
# grid searching
#==============================================================================
#param_grid ={"kernel": ('linear', 'poly', 'rbf', 'sigmoid', 'precomputed')}
#
#params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 1,
#          'learning_rate': 0.01, 'loss': 'ls'}
#
#grid_search = GridSearchCV(sup_vec, param_grid=param_grid)
#start = time()
def eval_bootstrap(df, features, md):
    X = df[features].values
    y = df[LABEL_COLUMN_NAME].values

    aa = []
    bb = []
    cc = []
    dd = []
    for i in range(1, 5):
        a = []
        b = []
        c = []
        d = []
        cv = KFold(n_splits=N_FOLDS, shuffle=True, random_state=i)
        for (train, val) in cv.split(X, y):
            if md == 1:
                regressor = ensemble.GradientBoostingRegressor(
                    n_estimators=30,
                    max_depth=4,
                    min_samples_split=2,
                    learning_rate=0.1,
                    loss='ls',
                    random_state=RANDOM_STATE)
            elif md == 2:
                regressor = ensemble.RandomForestRegressor(
                    n_estimators=30,
                    max_depth=10,
                    min_samples_split=4,
                    random_state=RANDOM_STATE)
            elif md == 3:
                regressor = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=.1)
            elif md == 4:
                regressor = MLPRegressor(hidden_layer_sizes=(
                    20,
                    30,
                    30,
                    5,
                ),
                                         batch_size=10,
                                         activation='relu',
                                         random_state=RANDOM_STATE)
            elif md == 5:
                regressor = LinearRegression()
            elif md == 6:
                regressor = Lasso(alpha=0.1, random_state=RANDOM_STATE)

            regressor = regressor.fit(X[train], y[train])
            pred = regressor.predict(X[val])

            rmse = np.sqrt(np.mean((pred - y[val])**2))
            mae = mean_absolute_error(y[val], pred)
            r2 = r2_score(y[val], pred)  # r2_score expects (y_true, y_pred)

            a.append(rmse)
            b.append(mae)
            c.append(r2)

        aa.append(np.mean(a))
        bb.append(np.mean(b))
        cc.append(np.mean(c))
    return np.mean(aa), np.mean(bb), np.mean(cc)
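# A minimal sketch of how eval_bootstrap() might be driven for each model id.
# `df`, `FEATURES` and the name mapping below are hypothetical placeholders:
MODEL_NAMES = {1: 'GradientBoosting', 2: 'RandomForest', 3: 'SVR',
               4: 'MLP', 5: 'LinearRegression', 6: 'Lasso'}
for md, name in MODEL_NAMES.items():
    rmse, mae, r2 = eval_bootstrap(df, FEATURES, md)
    print("%s: RMSE=%.3f  MAE=%.3f  R2=%.3f" % (name, rmse, mae, r2))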
Example #5
model_random_forest_regressor = ensemble.RandomForestRegressor(n_estimators=20)  # use 20 decision trees
ModelList.append([model_random_forest_regressor, 'Random Forest Regression'])


# 6. AdaBoost regression
from sklearn import ensemble

model_adaboost_regressor = ensemble.AdaBoostRegressor(n_estimators=50)  # use 50 estimators here
ModelList.append([model_adaboost_regressor, 'AdaBoost Regression'])


# 7. GBRT (gradient-boosted regression tree)
from sklearn import ensemble

model_gradient_boosting_regressor = ensemble.GradientBoostingRegressor(n_estimators=100)  # use 100 trees here
ModelList.append([model_gradient_boosting_regressor, 'GBRT Regression'])


# 8. Bagging regression
from sklearn import ensemble

model_bagging_regressor = ensemble.BaggingRegressor()
ModelList.append([model_bagging_regressor, 'Bagging Regression'])


# 9. ExtraTree (extremely randomized tree) regression
from sklearn.tree import ExtraTreeRegressor

model_extra_tree_regressor = ExtraTreeRegressor()
ModelList.append([model_extra_tree_regressor, 'ExtraTree Regression'])
    # data = boston.data
    # data = data / data.max(axis=0)

    x_train, x_test, y_train, y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        test_size=0.2)

    median = np.median(x_train, axis=0)
    mad = np.median(np.abs(x_train - median), axis=0)
    mad[mad == 0] = 1e-19

    x_train /= mad
    x_test /= mad

    model = ensemble.GradientBoostingRegressor()
    model.fit(x_train, y_train)

    score = r2_score(y_test, model.predict(x_test))
    print("R^2: %.4f" % score)

    categorical_features = np.argwhere(
        np.array(
            [len(set(boston.data[:, x]))
             for x in range(boston.data.shape[1])]) <= 10).flatten()
    explainer = lime.lime_tabular.LimeTabularExplainer(
        x_train,
        feature_names=boston.feature_names,
        class_names=['price'],
        categorical_features=categorical_features,
        mode='regression')
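    # The explainer above is built but never queried; a minimal follow-up sketch
    # explaining one test-set prediction (the row index here is an arbitrary choice):
    exp = explainer.explain_instance(x_test[0], model.predict, num_features=5)
    print(exp.as_list())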
    "model": tree.DecisionTreeRegressor(random_state=0)} )
models.append( {"name": "1.10. ExtraTreeRegressor", \
    "model": tree.ExtraTreeRegressor(random_state=0)} )

## 1.11. Ensemble methods
# averaging methods
models.append( {"name": "1.11.1. Bagging meta-estimator", \
    "model": ensemble.BaggingRegressor(neighbors.KNeighborsRegressor())} )
models.append( {"name": "1.11.2.1. Random Forests", \
    "model": ensemble.RandomForestRegressor()} )
models.append( {"name": "1.11.2.2. Extremely Randomized Trees", \
    "model": ensemble.ExtraTreesRegressor()} )
models.append( {"name": "1.11.3. AdaBoost", \
    "model": ensemble.AdaBoostRegressor()} )
models.append( {"name": "1.11.4. Gradient Tree Boosting", \
    "model": ensemble.GradientBoostingRegressor()} )

## 1.12. Multiclass and multilabel algorithms
# not regression

## 1.13. Feature selection
# not about estimator

## 1.14. Semi-Supervised
# all samples have price data, so doesn't apply

## 1.15. Isotonic regression
# ValueError("X should be a 1d array")
#models.append( {"name": "1.14. Semi-Supervised", \
#				"model": IsotonicRegression()} )
Example #8
df = pd.read_csv("weather_calls.csv")

y = df['calls'].values

del df["calls"]

X = df.values

for i in range(0,10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)

    # Fit regression model
    model = ensemble.GradientBoostingRegressor(
        n_estimators=1000,
        learning_rate=0.029,
        max_depth=3,
        min_samples_leaf=3,
        max_features=0.2,
        loss='huber',
        random_state=i
    )
    model.fit(X_train, y_train)

    err = mean_absolute_error(y_train, model.predict(X_train))
    print("Error (train",i,"):", err)

    err = mean_absolute_error(y_test, model.predict(X_test))
    print("Error (test",i,"):", err)

Example #9
plot_data_3d_regression(tsne_data, y_train)

knn_estimator = neighbors.KNeighborsRegressor()
knn_grid = {'n_neighbors': list(range(3, 20))}
grid_search_plot_one_parameter_curves(knn_estimator,
                                      knn_grid,
                                      X_train1,
                                      y_trans,
                                      scoring=scoring)
knn_model = get_best_model(knn_estimator,
                           knn_grid,
                           X_train1,
                           y_trans,
                           scoring=scoring)

gb_estimator = ensemble.GradientBoostingRegressor()
gb_grid = {
    'n_estimators': list(range(100, 501, 200)),
    'learning_rate': [0.1, 1.0],
    'max_depth': [1, 3, 5]
}
gb_model = get_best_model(gb_estimator,
                          gb_grid,
                          X_train1,
                          y_trans,
                          scoring=scoring)

X_test = house3[house_train.shape[0]:]
X_test1 = select_features(rf_selector, X_test)

house_test['SalePrice'] = np.expm1(gb_model.predict(X_test1))
y = df['sale_price'].values  # expected output to predict

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Fit regression model
model = ensemble.GradientBoostingRegressor(
    n_estimators=1000,    # how many decision trees to build; more usually improves accuracy but increases run time
    learning_rate=0.1,    # how much each additional tree influences the overall prediction; lower rates need a larger n_estimators
    max_depth=6,          # how many layers deep each individual decision tree can be
    min_samples_leaf=9,   # how many samples must fall in a leaf before the tree may base a decision on it
    max_features=0.1,     # fraction of features randomly considered each time a branch is created
    loss='huber'          # how the model's error is calculated; huber is robust to outliers in the data set
)

# Train the model by calling scikit-learn's fit function on the training set
model.fit(X_train, y_train)

# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')

# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
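# The model persisted above with joblib.dump can be reloaded in a separate
# program; a minimal sketch (the feature matrix passed to predict() must have
# the same column layout as the training data):
import joblib   # older scripts imported this as `from sklearn.externals import joblib`

model = joblib.load('trained_house_classifier_model.pkl')
predicted_prices = model.predict(X_test)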
Example #11
from sklearn.model_selection import KFold
#import plot_learning_curve

csv_file = 'listStoreValue-long.csv'
test = np.array(pd.read_csv(csv_file))
X = test[:, 0:6]
y = test[:, -1]
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'criterion': 'mse'
}
gradient_boosting_regressor_model = ensemble.GradientBoostingRegressor(
    **params)
crossvalidation = KFold(n_splits=10, shuffle=True, random_state=7)  # random_state only takes effect with shuffle=True
model = gradient_boosting_regressor_model.fit(X, y)
scores = cross_val_score(model,
                         X,
                         y,
                         scoring='neg_mean_squared_error',
                         cv=crossvalidation)

#y_pred = gradient_boosting_regressor_model.predict(X_test)
#RMSE = np.sqrt(mean_squared_error(y_test,y_pred))

RMSE = np.sqrt(-scores.mean())
print(RMSE)
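# A minimal follow-up sketch: besides the aggregate RMSE above, the per-fold
# errors can be inspected directly (each score is a negative MSE, one per fold):
per_fold_rmse = np.sqrt(-scores)
print(per_fold_rmse)
print("std of per-fold RMSE:", per_fold_rmse.std())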

#plt.figure(figsize=(12,6))
Example #12
def gradientBoost():
    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'ls'}
    return ensemble.GradientBoostingRegressor(**params)
Example #13
    def GradientBoostingRegressorTest(self):
        x_train, y_train, x_test, y_test = Elution().get_data()
        gbrt = ensemble.GradientBoostingRegressor(n_estimators=100)
        Elution().try_different_method(gbrt, x_train, y_train, x_test, y_test)
Example #14
def boosting(parameter):
    # defaults correspond to loss, learning_rate and n_estimators
    defaults = ['ls', 0.1, 100]
    split = pad(parameter, defaults)
    return ensemble.GradientBoostingRegressor(loss=split[0],
                                              learning_rate=split[1],
                                              n_estimators=split[2])
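# `pad` is referenced above but not shown; a minimal sketch of one plausible
# implementation, assuming it right-fills a short parameter list with the
# corresponding defaults (hypothetical, for illustration only):
def pad(parameter, defaults):
    parameter = list(parameter)
    return parameter + defaults[len(parameter):]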
Example #15
trainFrame = dataclean.cleanDataset(dataclean.loadTrainData())
trainData = dataclean.convertPandasDataFrameToNumpyArray(trainFrame)

testFrame = dataclean.cleanDataset(dataclean.loadTestData(), True)
testData = dataclean.convertPandasDataFrameToNumpyArray(testFrame)

trainX = trainData[:, 1:]
trainY = trainData[:, 0]

testX = testData[:, 1:]
"""
Cross Validation
"""
crossvalidationTree = ensemble.GradientBoostingRegressor(n_estimators=400,
                                                         learning_rate=0.01,
                                                         max_depth=6,
                                                         random_state=1,
                                                         presort=True)
cvCount = 10
crossvalidation = Metrics.crossValidationScore(
    ensemble.GradientBoostingRegressor(random_state=1),
    trainX,
    trainY,
    cvCount=cvCount)

xTrain, xTest, yTrain, yTest = Metrics.traintestSplit(trainX,
                                                      trainY,
                                                      randomState=1)
"""
#{'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.01
Example #16
print("arr2_hi_E_n.shape: ", arr2_hi_E_n.shape)
#--- select 70% of sample for training and 30% for testing:
offset = int(arr2_hi_E_n.shape[0] * 0.7)
arr2_hi_E_train = arr2_hi_E_n[:offset]              # train sample
arr3_hi_E_train = arr3_hi_E[:offset].reshape(-1)
arr2_hi_E_test = arr2_hi_E_n[offset:]               # test sample
arr3_hi_E_test = arr3_hi_E[offset:].reshape(-1)

print("train shape: ", arr2_hi_E_train.shape, " label: ",
      arr3_hi_E_train.shape)
print("test shape: ", arr2_hi_E_test.shape, " label: ", arr3_hi_E_test.shape)

print("training BDTG...")
net_hi_E = ensemble.GradientBoostingRegressor(**params)
net_hi_E.fit(arr2_hi_E_train, arr3_hi_E_train)
net_hi_E

mse = mean_squared_error(arr3_hi_E_test, net_hi_E.predict(arr2_hi_E_test))
print("MSE: %.4f" % mse)
print("events at training & test samples: ", len(arr_hi_E0))
print("events at train sample: ", len(arr2_hi_E_train))
print("events at test sample: ", len(arr2_hi_E_test))

test_score = np.zeros((params['n_estimators'], ), dtype=np.float64)

for i, y_pred in enumerate(net_hi_E.staged_predict(arr2_hi_E_test)):
    test_score[i] = net_hi_E.loss_(arr3_hi_E_test, y_pred)

#    fig,ax=plt.subplots(ncols=1, sharey=True)
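# A minimal sketch of visualising the staged test error computed above
# (assumes matplotlib is available; `params` and `test_score` as defined earlier):
import matplotlib.pyplot as plt

plt.plot(np.arange(params['n_estimators']) + 1, test_score, label='test deviance')
plt.xlabel('Boosting iterations')
plt.ylabel('Deviance')
plt.legend()
plt.show()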
# Create the X and y arrays
X = features_df.values
y = df['sale_price'].values

# Split the data set in a training set (70%) and a test set (30%)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# Fit regression model
model = ensemble.GradientBoostingRegressor(n_estimators=1000,
                                           learning_rate=0.1,
                                           max_depth=6,
                                           min_samples_leaf=9,
                                           max_features=0.1,
                                           loss='huber',
                                           random_state=0
                                           #default parameters
                                           )
model.fit(X_train, y_train)

# Save the trained model to a file so we can use it in other programs
joblib.dump(model, 'trained_house_classifier_model.pkl')

# Find the error rate on the training set
mse = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.4f" % mse)

# Find the error rate on the test set
mse = mean_absolute_error(y_test, model.predict(X_test))
Example #18
def adaboost_on_fold(feature_sets,
                     train,
                     test,
                     y,
                     y_all,
                     X,
                     dim,
                     dimsum,
                     learn_options,
                     classification=False):
    '''
    Gradient-boosted regressor/classifier from scikit-learn.
    '''

    if learn_options['adaboost_version'] == 'python':
        if not learn_options['adaboost_CV']:
            if not classification:
                clf = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    alpha=learn_options['adaboost_alpha'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    random_state=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False)
            else:
                clf = en.GradientBoostingClassifier(
                    learning_rate=learn_options['adaboost_learning_rate'],
                    n_estimators=learn_options['adaboost_n_estimators'],
                    subsample=1.0,
                    min_samples_split=2,
                    min_samples_leaf=1,
                    max_depth=learn_options['adaboost_max_depth'],
                    init=None,
                    random_state=None,
                    max_features=None,
                    verbose=0,
                    max_leaf_nodes=None,
                    warm_start=False)

            clf.fit(X[train], y[train].flatten())
            y_pred = clf.predict(X[test])[:, None]
        else:  # optimize the parameters of the boosted algorithm

            if learn_options["algorithm_hyperparam_search"] == "bo":
                print

                from hyperopt import hp, fmin, tpe, rand

                def adaboost_scoring_bo(params):
                    # label_encoder = sklearn.preprocessing.LabelEncoder()
                    # label_encoder.fit(y_all['Target gene'].values[train])
                    # gene_classes = label_encoder.transform(y_all['Target gene'].values[train])
                    # n_folds = len(np.unique(gene_classes))
                    cv = sklearn.cross_validation.KFold(
                        y_all['Target gene'].values[train].shape[0],
                        n_folds=20,
                        shuffle=True)
                    est = en.GradientBoostingRegressor(
                        n_estimators=1000,
                        learning_rate=params['learning_rate'],
                        max_depth=params['max_depth'],
                        min_samples_leaf=params['min_samples_leaf'],
                        max_features=params['max_features'])
                    scorer = cross_val_score(est,
                                             X[train],
                                             y[train].flatten(),
                                             cv=cv,
                                             n_jobs=20)
                    # fmin minimizes its objective, so negate the CV score
                    return -np.median(scorer)

                space = {
                    'learning_rate': hp.uniform('learning_rate', 0.001, 0.1),
                    'max_depth': hp.quniform('max_depth', 1, 8, 1),
                    'min_samples_leaf': hp.quniform('min_samples_leaf', 3, 20,
                                                    1),
                    'max_features': hp.uniform('max_features', 0.05, 1.0)
                }

                best = fmin(adaboost_scoring_bo,
                            space,
                            algo=tpe.suggest,
                            max_evals=50,
                            verbose=1)
                print best
                clf = en.GradientBoostingRegressor(
                    n_estimators=learn_options['adaboost_n_estimators'],
                    learning_rate=best['learning_rate'],
                    max_depth=best['max_depth'],
                    min_samples_leaf=best['min_samples_leaf'],
                    max_features=best['max_features'])

                clf.fit(X[train], y[train].flatten())
            elif learn_options["algorithm_hyperparam_search"] == "grid":
                assert not classification, "need to tweak code below to do classification, as above"
                n_jobs = 20

                print "Adaboost with GridSearch"
                from sklearn.grid_search import GridSearchCV
                #param_grid = {'learning_rate': [0.1, 0.05, 0.01],
                #              'max_depth': [4, 5, 6, 7],
                #              'min_samples_leaf': [5, 7, 10, 12, 15],
                #              'max_features': [1.0, 0.5, 0.3, 0.1]}
                param_grid = {
                    'learning_rate': [0.1, 0.01],
                    'max_depth': [4, 7],
                    'min_samples_leaf': [5, 15],
                    'max_features': [1.0, 0.1]
                }

                label_encoder = sklearn.preprocessing.LabelEncoder()
                label_encoder.fit(y_all['Target gene'].values[train])
                gene_classes = label_encoder.transform(
                    y_all['Target gene'].values[train])
                n_folds = len(np.unique(gene_classes))
                cv = sklearn.cross_validation.StratifiedKFold(gene_classes,
                                                              n_folds=n_folds,
                                                              shuffle=True)

                est = en.GradientBoostingRegressor(
                    loss=learn_options['adaboost_loss'],
                    n_estimators=learn_options['adaboost_n_estimators'])
                clf = GridSearchCV(est,
                                   param_grid,
                                   n_jobs=n_jobs,
                                   verbose=1,
                                   cv=cv,
                                   scoring=spearman_scoring,
                                   iid=False).fit(X[train], y[train].flatten())
                print clf.best_params_
            else:
                raise Exception(
                    "if using adaboost_CV then need to specify grid (grid search) or bo (bayesian optimization)"
                )

            y_pred = clf.predict(X[test])[:, None]
    else:
        raise NotImplementedError

    return y_pred, clf
Example #19
    def double_chain_testing(self,
                             chain_steps,
                             column_to_overwrite_1,
                             column_to_overwrite_2,
                             model_2_inputs,
                             model_2_output,
                             result_indexes=[],
                             mape=False,
                             filename_2=""):

        tmp_list = []
        chain_predictions = []
        mse_list = []
        mape_list = []
        predictions_result = []
        if filename_2:
            model_2 = load(filename_2)
        else:
            model_2 = ensemble.GradientBoostingRegressor(
                n_estimators=self.num_of_trees,
                max_depth=8,
                max_features="auto",
                verbose=0,
                max_leaf_nodes=32)
            model_2.fit(X=self.train_frame[model_2_inputs],
                        y=self.train_frame[model_2_output])

        for starting_index in tqdm(range(len(self.test_frame) - chain_steps)):

            tmp_test_frame = self.test_frame.copy()
            tmp_test_frame["AvgP"] = tmp_test_frame["AvgP"].astype(np.float)
            for i in range(starting_index, starting_index + chain_steps):

                tmp_list.append(tmp_test_frame.loc[i, self.inputs])
                result = self.model.predict(tmp_list)
                tmp_list.clear()
                tmp_test_frame.at[i + 1, column_to_overwrite_1] = result[0]
                chain_predictions.append(result[0])

                tmp_list.append(tmp_test_frame.loc[i, model_2_inputs])
                result = model_2.predict(tmp_list)
                tmp_list.clear()
                tmp_test_frame.at[i + 1, column_to_overwrite_2] = result[0]

            mse = mean_squared_error(
                self.test_frame.loc[starting_index:(starting_index +
                                                    chain_steps - 1),
                                    self.output], np.array(chain_predictions))

            mse_list.append(mse)
            if mape:
                mape = np.mean(
                    np.abs((np.array(
                        list(self.test_frame.loc[starting_index:(
                            starting_index + chain_steps - 1), self.output])) -
                            np.array(chain_predictions)) / (np.array(
                                list(self.test_frame.loc[starting_index:(
                                    starting_index + chain_steps -
                                    1), self.output]))))) * 100
                mape_list.append(mape)

            if (starting_index in result_indexes) or (-1 in result_indexes):
                predictions_result.append(chain_predictions.copy())
            chain_predictions.clear()

        mse_result = np.sqrt(np.mean(mse_list))
        print("MSE: " + str(mse_result))
        if mape:
            mape_result = np.mean(mape_list)
            print("MAPE: " + str(mape_result))
            if len(predictions_result) > 0:
                return mse_result, mape_result, predictions_result
            else:
                return mse_result, mape_result
        else:
            if len(predictions_result) > 0:
                return mse_result, predictions_result
            else:
                return mse_result
data_type_nan = data_new[data_new['type'].isnull()]
data_type_no_nan = data_new.type.fillna(method='ffill')
data_new['type'] = data_type_no_nan

data_params_nan = data_new[data_new['params'].isnull()]
data_params_no_nan = data_new.params.fillna(data_new.params.median())
data_new['params'] = data_params_no_nan

#We have full dataset without any null values or outliers
#Define the different models to see which will fit best :

linreg = LinearRegression()
logreg = LogisticRegression()
gbr = ensemble.GradientBoostingRegressor(n_estimators=50,
                                         max_depth=8,
                                         min_samples_split=2,
                                         learning_rate=0.1,
                                         loss='ls')
tree_clf = tree.DecisionTreeClassifier(criterion='entropy')

labels = data_new['duration']
version_1 = data_new.drop(['duration', 'Unnamed: 0'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(version_1,
                                                    labels,
                                                    test_size=0.10,
                                                    random_state=2)

#Linear Regression
linreg.fit(x_train, y_train)
lin_reg_score = linreg.score(x_test, y_test)
lin_reg_predict = linreg.predict(x_test)
Example #21
#removing rows with empty values
df.dropna(axis = 0, how = 'any', thresh = None, subset = None, inplace = True)

# apply one hot encoding
df = pd.get_dummies(df, columns = ["Suburb", "CouncilArea", "Type"])

# assign X (independent variables) and y (dependent variable)
X = df.drop('Price', axis = 1)
y = df["Price"]

# now split dataset - training data and test data - 70% train 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, shuffle = True)

# Select an algorithm and configure hyperparameters
model = ensemble.GradientBoostingRegressor(n_estimators = 150, learning_rate = 0.1, max_depth = 30, min_samples_split = 4, min_samples_leaf = 6, max_features = 0.6, loss = "huber")
"""
 we have selected the gradient boosting
 - n_estimators = nr of decision trees
 - learning_reate = rate at which additional trees influence the prediction.
 - max_depth max number of layers for each decision trees
 - min_samples_split = min samples to execute a binary split
 - min_samples_leaf = 
 - max_features =
 - loss = how the model loss is calculated
"""
print("Start training the model")
# train the prediction model
model.fit(X_train, y_train) 
print("Model trained")
Example #22
# Imputing and scaling data
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
scaler = MinMaxScaler()
X = imp.fit_transform(X)
X = scaler.fit_transform(X)

# splitting data into x_train, x_test, y_train, y_test
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=42)

#GradientBoostingRegressor
print 'GradientBoostingRegressor'
pca = decomposition.PCA()
reg_GB = ensemble.GradientBoostingRegressor()
pipe = Pipeline(steps=[('pca', pca), ('GradientBoostingRegressor', reg_GB)])
n_components = [23]  # 21
n_estimators = [300]  #150
learning_rate = [0.05]  #0.01
max_depth = [4]  #6
min_samples_leaf = [200]  #100

estimator = GridSearchCV(
    pipe,
    param_grid=dict(
        pca__n_components=n_components,
        GradientBoostingRegressor__n_estimators=n_estimators,
        GradientBoostingRegressor__learning_rate=learning_rate,
        GradientBoostingRegressor__max_depth=max_depth,
        GradientBoostingRegressor__min_samples_leaf=min_samples_leaf))
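# The grid-search object above is only constructed; a minimal sketch of fitting
# it on the split created earlier and reporting what it selected:
estimator.fit(x_train, y_train)
print("Best parameters: %s" % estimator.best_params_)
print("Best CV score: %.4f" % estimator.best_score_)
print("Test R^2: %.4f" % estimator.score(x_test, y_test))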
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import ensemble


main_data=pd.read_csv("kc_house_data.csv")
converting_dates = [1 if values == 2014 else 0 for values in main_data.date]
main_data['date']= converting_dates
x_set=main_data.drop(['id','price'],axis=1)
y_set=main_data['price']
x_trainingSet,x_testingSet,y_trainingSet,y_testingSet = train_test_split(x_set,y_set,test_size=0.15,random_state=2)


regression=LinearRegression()
regression.fit(x_trainingSet,y_trainingSet)
result_reg=regression.score(x_testingSet,y_testingSet)
print("Accuracy of Linear Regression Model in percentage : ",result_reg*100)

classification= ensemble.GradientBoostingRegressor(n_estimators=400,max_depth=5,min_samples_split = 2,learning_rate=0.7,loss='ls')

classification.fit(x_trainingSet,y_trainingSet)
result_GBreg=classification.score(x_testingSet,y_testingSet)
print("Accuracy of Linear Regression with Gradient Booster in percentage : ",result_GBreg*100)
Example #24
ndn['categories'] = pd.cut(ndn['Total Cholesterol( mg/dL)'], bins, labels=group_names)
x1 = ndn['categories'].astype(int)
y1=x1.fillna(x1.mean())
y1=pd.DataFrame(y1)
for i, col in enumerate(y1.columns.tolist(), 1):
    y1.loc[:, col] *= i
y1 = y1.sum(axis=1)
parameters = {
              'n_estimators': 500, 
              'max_depth': 3,
              'learning_rate': 0.02, 
              'loss': 'ls'
             }
from sklearn import ensemble
from sklearn import metrics
classifier = ensemble.GradientBoostingRegressor(**parameters)

classifier.fit(X_train, Y_train)
predictions = classifier.predict(X_test)
mse = metrics.mean_squared_error(Y_test, predictions)
print('Mean Square Error: {:.3f}'.format(mse))
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(Y_test, predictions) 
plt.figure(figsize=(16, 12))

plt.scatter(range(predictions.shape[0]), predictions, label='predictions', c='#348ABD', alpha=0.4)
plt.scatter(range(Y_test.shape[0]), Y_test, label='actual values', c='#A60628', alpha=0.4)
plt.ylim([Y_test.min(), predictions.max()])
plt.xlim([0, predictions.shape[0]])
plt.legend();
test_score = [classifier.loss_(Y_test, Y_pred) for Y_pred in classifier.staged_decision_function(X_test)]
Example #25
### Shuffling train sets
train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state = 5)

### Splitting
x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)

'''
Elastic Net
'''
ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)


'''
Gradient Boosting
'''
GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
                                               min_samples_leaf=15, min_samples_split=10, loss='huber')

# Retraining models
GB_model = GBest.fit(train_features, train_labels)
ENST_model = ENSTest.fit(train_features_st, train_labels)

## Getting our SalePrice estimation
Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st))) / 2
Final_labels_train = (np.exp(ENST_model.predict(train_features_st)) + np.exp(GB_model.predict(train_features))) / 2

get_score(Final_labels_train, np.exp(train_labels))
## Saving to CSV
#pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv('submission12.csv', index =False)
Example #26
Player_Career_Data_Training_x = np.array(Player_Career_Data_Training[Features_Selected_From_Correlation])
Player_Career_Data_Training_x = Player_Career_Data_Training_x.copy(order='C')
Player_Career_Data_Training_y = np.array(Player_Career_Data_Training[Prediction_Column])
Player_Career_Data_Training_y = Player_Career_Data_Training_y.copy(order='C')

# Holdout set related to 4 players

Player_Career_Data_Testing_x = np.array(Player_Career_Data_Testing[Features_Selected_From_Correlation])
Player_Career_Data_Testing_x = Player_Career_Data_Testing_x.copy(order = 'C')
Player_Career_Data_Testing_y = np.array(Player_Career_Data_Testing[Prediction_Column])
Player_Career_Data_Testing_y = Player_Career_Data_Testing_y.copy(order = 'C')

# Global Feature Importance based on gradient boosted regression

GBR_Parameters = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2, 'learning_rate': 0.001, 'loss': 'ls'}
GBR_Model = ensemble.GradientBoostingRegressor(**GBR_Parameters)
GBR_Model.fit(Player_Career_Data_Training_x, Player_Career_Data_Training_y.ravel())
sorted_indices = np.argsort(GBR_Model.feature_importances_)[::-1]
for index in sorted_indices:
    print(f"{Numerical_Features_Model_Validation[index]}: {GBR_Model.feature_importances_[index]}")

# Plotting Global feature importance extracted from Gradient Boosted Regressor
    
Feature_Importances = GBR_Model.feature_importances_
sorted_idx = np.argsort(Feature_Importances)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(8,14))
plt.barh(pos, Feature_Importances[sorted_idx], align='center')
plt.yticks(pos, Player_Numerical_Features.columns[sorted_idx],rotation=40, ha="right")
plt.xlabel('Relative Importance')
plt.ylabel('Feature Names')
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

# #############################################################################
# Fit regression model
original_params = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 2,
    'learning_rate': 0.01,
    'loss': 'ls'
}

clf = ensemble.GradientBoostingRegressor(**original_params)

clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)

# #############################################################################
# gbm regularization - 1) subsampling
#                   - 2) shrinkage
#                   - 3) early stopping
for label, color, setting in [('No shrinkage', 'orange', {
        'learning_rate': 1.0,
        'subsample': 1.0
}), ('subsample=0.5', 'blue', {
        'learning_rate': 1.0,
        'subsample': 0.5
Example #28
    #     ),
    #     (
    #         'MLPRegressor',
    #         make_pipeline(MLPRegressor(hidden_layer_sizes=(128,), max_iter=10000))
    #     ),
    #     (
    #         'MLPRegressor(100, 100)',
    #         make_pipeline(MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=100000))
    #     ),
    (
        'RandomForestRegressor',
        make_pipeline(ensemble.RandomForestRegressor())
    ),
    (
        'ensemble_GradientBoostingRegressor',
        make_pipeline(ensemble.GradientBoostingRegressor(**ensemble_params))
    ),
    (
        'XGBRegressor',
        make_pipeline(xgb.XGBRegressor(objective="reg:linear", random_state=42))
    ),
    (
        'PolynomialFeatures-2-RandomForestRegressor',
        make_pipeline(PolynomialFeatures(2), ensemble.RandomForestRegressor())
    ),
    (
        'PolynomialFeatures-2-ensemble_GradientBoostingRegressor',
        make_pipeline(PolynomialFeatures(2), ensemble.GradientBoostingRegressor(**ensemble_params))
    ),

    (
train.fillna(0, inplace=True)

Y = train.loc[:, "SalePrice"]
X = train.drop(columns=["SalePrice"])

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X = scaler.fit_transform(X)
#Y=scaler.fit_transform(Y.values.reshape(-1,1))

from sklearn import ensemble
from sklearn.model_selection import cross_val_score

#%% GradientBoosted
model = ensemble.GradientBoostingRegressor(n_estimators=110, criterion="mse")

scores = cross_val_score(model, X, Y, cv=10)

print("*GradientBoosted - R2:")
print(scores.mean())
print("*GradientBoosted - Desvio Padrão:")
print(scores.std())

#%% RandomForest
model = ensemble.RandomForestRegressor(n_estimators=110)

scores = cross_val_score(model, X, Y, cv=10)

print("\nRandomForest - R2:")
print(scores.mean())
Example #30
def DT_main_seq(start, stop, testGroup, segmentName):
    print('\n----------Start-----------\n')
    #    (n_estimators,
    #     max_depth,
    #     min_samples_split,
    #     learning_rate,
    #     loss,
    #     start,
    #     stop,
    #     testGroup,
    #     segmentName) = parsingInit()
    n_estimators = 1000
    max_depth = 2
    min_samples_split = 2
    learning_rate = 0.01
    loss = 'ls'

    flowRates_Train = np.array([i for i in range(start, stop + 10, 10)])

    flowRates_Test = np.array(
        [i for i in range(testGroup, testGroup + 10, 10)])

    flowRates_reTrain = np.append(flowRates_Train, flowRates_Test)

    #The 160 flow rate data is corrupted!!
    #TODO: recollect the data
    flowRates_Train = np.delete(flowRates_Train,
                                np.where(flowRates_Train == 160))
    flowRates_Test = np.delete(flowRates_Test, np.where(flowRates_Test == 160))
    flowRates_reTrain = np.delete(flowRates_reTrain,
                                  np.where(flowRates_reTrain == 160))

    print('Train: ', flowRates_Train)
    print('Test: ', flowRates_Test)
    print('reTrain: ', flowRates_reTrain)

    print('1. Extracting Data... ')
    #Train Data
    X_Train, y_thic_Train, y_flow_Train = getXData(KPI_fileName, objectName,
                                                   segment_Numbers,
                                                   flowRates_Train,
                                                   segmentName, features)
    featureNames = X_Train.columns

    #Test Data
    X_Test, y_thic_Test, y_flow_Test = getXData(KPI_fileName, objectName,
                                                segment_Numbers,
                                                flowRates_Test, segmentName,
                                                features)

    #ReTrain Data
    X_reTrain, y_thic_reTrain, y_flow_reTrain = getXData(
        KPI_fileName, objectName, segment_Numbers, flowRates_reTrain,
        segmentName, features)

    #%% Preprocessing Data converting to float32 and removing NaN
    print('2. Preprocessing Data...')
    imp1 = Imputer(missing_values='NaN', strategy='mean', axis=0)
    #    imp2 = Imputer(missing_values=0, strategy='mean', axis=0)

    X_Train, y_thic_Train = preProcess(X_Train, y_thic_Train)
    X_Train = imp1.fit_transform(X_Train)

    X_Test, y_thic_Test = preProcess(X_Test, y_thic_Test)
    X_Test = imp1.fit_transform(X_Test)

    X_reTrain, y_thic_reTrain = preProcess(X_reTrain, y_thic_reTrain)
    X_reTrain = imp1.fit_transform(X_reTrain)

    #%%
    if not os.path.exists(destinationFolder):
        os.makedirs(destinationFolder)

    paramsGBR = {
        'n_estimators': n_estimators,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'learning_rate': learning_rate,
        'loss': loss
    }

    model = ensemble.GradientBoostingRegressor(**paramsGBR)

    clf_Tr = clone(model)

    #%%
    print('3. Building Model with all the Samples...')
    X_Train, y_thic_Train = shuffle(X_Train, y_thic_Train)

    print('\t Shape Train: ', X_Train.shape)
    print('\t DataType Train: ', X_Train.dtype)

    print('\t Shape Train: ', y_thic_Train.shape)
    print('\t DataType Train: ', y_thic_Train.dtype)

    min_max_scaler_Train_X = preprocessing.MinMaxScaler().fit(X_Train)
    scaler_Train_X = preprocessing.StandardScaler().fit(X_Train)

    X_Tr = scaler_Train_X.transform(X_Train)
    X_Tr = min_max_scaler_Train_X.transform(X_Tr)

    clf_Tr = model.fit(X_Tr, y_thic_Train)

    #%%
    print('4. Results for Training:')
    y_pred1 = clf_Tr.predict(X_Tr)
    featureImportance(clf_Tr, featureNames,
                      str(testGroup) + '_initialRankings_' + segmentName)

    mse_Test = mean_squared_error(y_thic_Train, y_pred1)
    mae_Test = mean_absolute_error(y_thic_Train, y_pred1)
    medae_Test = median_absolute_error(y_thic_Train, y_pred1)
    r2_Test = r2_score(y_thic_Train, y_pred1)
    exvs_Test = explained_variance_score(y_thic_Train, y_pred1)

    print('\t Mean Squared Error      :', mse_Test)
    print('\t Mean Absolute Error     :', mae_Test)
    print('\t Median Absolute Error   :', medae_Test)
    print('\t R2 Score                :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    #%%
    print('\n5. Processing emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Test, y_thic_Test = shuffle(X_Test, y_thic_Test)

    print('\t Shape Test: ', X_Test.shape)
    print('\t DataType Test: ', X_Test.dtype)

    print('\t Shape y Test: ', y_thic_Test.shape)
    print('\t DataType y Test: ', y_thic_Test.dtype)

    print('6. Transforming emissions Signals for Group ', flowRates_Test,
          ' ...')
    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)

    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_te: ', X_Te.dtype)

    print('7. Predicting KPI for Signals for Group ', flowRates_Test, ' ...')
    y_pred_Te = clf_Tr.predict(X_Te)

    print('8. Results for Predicting KPI for Signals for Group ',
          flowRates_Test, ' ...')
    mse_Test = mean_squared_error(y_thic_Test, y_pred_Te)
    mae_Test = mean_absolute_error(y_thic_Test, y_pred_Te)
    medae_Test = median_absolute_error(y_thic_Test, y_pred_Te)
    r2_Test = r2_score(y_thic_Test, y_pred_Te)
    exvs_Test = explained_variance_score(y_thic_Test, y_pred_Te)

    print('\t Mean Squared Error      :', mse_Test)
    print('\t Mean Absolute Error     :', mae_Test)
    print('\t Median Absolute Error   :', medae_Test)
    print('\t R2 Score                :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    fileNamecsv = destinationFolder + '/FeatureRanking_' + str(
        testGroup) + '_' + segmentName + '.csv'
    print('9. Saving Results', fileNamecsv, ' ...')
    np.savetxt(
        fileNamecsv, [[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]],
        delimiter=',',
        header=
        'Mean Squared Error, Mean Absolute Error, Median Absolute Error, R2 Score, Explained Variance Score',
        comments='')

    print('10. Retraining the Model with new emission Signal...')
    X_reTrain, y_thic_reTrain = shuffle(X_reTrain, y_thic_reTrain)

    print('\t Shape reTrain: ', y_thic_reTrain.shape)
    print('\t DataType reTrain: ', y_thic_reTrain.dtype)

    print('\t Shape y reTrain: ', y_thic_Test.shape)
    print('\t DataType y reTrain: ', y_thic_Test.dtype)

    min_max_scaler_Train_X2 = preprocessing.MinMaxScaler().fit(X_reTrain)
    scaler_Train_X2 = preprocessing.StandardScaler().fit(X_reTrain)

    X_reTr = scaler_Train_X2.transform(X_reTrain)
    X_reTr = min_max_scaler_Train_X2.transform(X_reTr)

    print('\t Shape X_reTr: ', X_reTr.shape)
    print('\t DataType X_reTr: ', X_reTr.dtype)

    X_Te = scaler_Train_X.transform(X_Test)
    X_Te = min_max_scaler_Train_X.transform(X_Te)

    print('\t Shape X_Te: ', X_Te.shape)
    print('\t DataType X_Te: ', X_Te.dtype)

    clf_reTr = model.fit(X_reTr, y_thic_reTrain)
    print('11. New Results with emission signals Incorporated:')
    y_pred_Te = clf_reTr.predict(X_Te)
    mse_Test = mean_squared_error(y_thic_Test, y_pred_Te)
    mae_Test = mean_absolute_error(y_thic_Test, y_pred_Te)
    medae_Test = median_absolute_error(y_thic_Test, y_pred_Te)
    r2_Test = r2_score(y_thic_Test, y_pred_Te)
    exvs_Test = explained_variance_score(y_thic_Test, y_pred_Te)

    print('\t Mean Squared Error      :', mse_Test)
    print('\t Mean Absolute Error     :', mae_Test)
    print('\t Median Absolute Error   :', medae_Test)
    print('\t R2 Score                :', r2_Test)
    print('\t Explained Variance Score:', exvs_Test)

    print('12. Saving the new Results', fileNamecsv, ' ...')
    f = open(fileNamecsv, 'a')
    df = pd.DataFrame([[mse_Test, mae_Test, medae_Test, r2_Test, exvs_Test]])
    df.to_csv(f, index=False, header=False)
    f.close()
    featureImportance(clf_reTr, featureNames,
                      str(testGroup) + '_reTrainedRankings_' + segmentName)

    print('-----------:Finished!:--------------- \n')