Example #1
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold  # import KFold
from sklearn.metrics import mean_absolute_error, r2_score
import RegscorePy

df = pd.read_csv('training_dataset.csv')
data = df.values
x = data[:, 0:51]  # all rows, no label
y = data[:, 51]  # all rows of the labeled column

# ('Best parameters:', {'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 1000, 'max_features': 'sqrt',
#  'min_samples_split': 2, 'max_depth': None})
model = RandomForestRegressor(n_estimators=1000,
                              random_state=42,
                              bootstrap=False,
                              min_samples_leaf=1,
                              max_features=9,
                              min_samples_split=2,
                              max_depth=None)

print(df.columns.tolist())

# Labels are the values we want to predict
labels = np.array(df['Sum_NOK'])
# Remove the labels from the features
# axis 1 refers to the columns
df = df.drop('Sum_NOK', axis=1)
# Saving feature names for later use
feature_list = list(df.columns)
# Convert to numpy array
df = np.array(df)
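# Note: the "Best parameters" comment above implies a hyperparameter search was run
# beforehand. A minimal sketch of how such a search might look with GridSearchCV;
# the grid values here are illustrative assumptions, not the original search space.
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [500, 1000],
    'max_depth': [None, 10, 20],
    'max_features': ['sqrt', 9],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'bootstrap': [True, False],
}
search = GridSearchCV(RandomForestRegressor(random_state=42),
                      param_grid,
                      cv=5,
                      scoring='neg_mean_absolute_error',
                      n_jobs=-1)
search.fit(x, y)
print('Best parameters:', search.best_params_)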
Example #2
def do_estimation(models, train, test):
  mae_gaps, mae_amts = [], []
  for m in models:
    m.fit(train.iloc[:, [2, 3, 4, 5]], train['Target'])
    preds = m.predict(test.iloc[:, [2, 3, 4, 5]])
    mae_amt = mean_absolute_error(preds, test['Target'])
    mae_amts.append(mae_amt)

    m.fit(train.iloc[:, [1, 3, 4, 5]], train['DaysSinceLast'])
    preds = m.predict(test.iloc[:, [1, 3, 4, 5]])
    mae_gap = mean_absolute_error(preds, test['DaysSinceLast'])
    mae_gaps.append(mae_gap)
  # benchmark: use the previously observed value as the prediction
  mae_gaps.append(mean_absolute_error(test['DaysSinceLast2'], test['DaysSinceLast']))
  mae_amts.append(mean_absolute_error(test['LastPayment'], test['Target']))
  return (mae_gaps, mae_amts)
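# Note: get_data() is not defined in this excerpt. A hypothetical stand-in, assuming it
# returns train/test DataFrames containing the columns referenced above ('Target',
# 'DaysSinceLast', 'DaysSinceLast2', 'LastPayment') plus the numeric feature columns:
def get_data(path='payments.csv', test_frac=0.3):
    import pandas as pd
    df = pd.read_csv(path)  # hypothetical file name
    n_test = int(len(df) * test_frac)
    return df.iloc[:-n_test], df.iloc[-n_test:]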


import numpy as np
import matplotlib.pyplot as pl
from sklearn.linear_model import Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error

models = [Ridge(alpha=0.1), GradientBoostingRegressor(), RandomForestRegressor()]
model_names = ['Ridge Regression','Gradient Boosted Tree','Random Forest', 'Benchmark']

train,test = get_data()

mae_gaps,mae_amts = do_estimation(models,train,test)

fig1 = pl.figure('Payment Amount MAE')
ax1 = pl.subplot(111)
ax1.bar(range(len(model_names)),mae_amts,width=0.5)
ax1.set_xticks(np.arange(len(model_names))+0.25)
ax1.set_xticklabels(model_names)
ax1.set_title('Payment Amount MAE')
fig1.savefig('MAE_Payment_Amount.png')
  
# In[791]:


predictors=['Item_MRP','Outlet_Type_0','Outlet_5','Years_of_operation']
alg4=DecisionTreeRegressor(max_depth=8,min_samples_leaf=150)
modelfit(alg4,traindf,testdf,predictors,target,IDcol,'alg4.csv')
coef4=pd.Series(alg4.feature_importances_,predictors).sort_values(ascending=False)
coef4.plot(kind='bar',title='Feature importances')


# In[797]:


from sklearn.ensemble import RandomForestRegressor
predictors = [x for x in traindf.columns if x not in [target]+IDcol]
alg5=RandomForestRegressor(n_estimators=200,max_depth=5,min_samples_leaf=100,n_jobs=4)
modelfit(alg5, traindf, testdf, predictors, target, IDcol, 'alg5.csv')
coef5 = pd.Series(alg5.feature_importances_, predictors).sort_values(ascending=False)
coef5.plot(kind='bar', title='Feature Importances')
 


# In[799]:


predictors = [x for x in traindf.columns if x not in [target]+IDcol]
alg6 = RandomForestRegressor(n_estimators=400,max_depth=6, min_samples_leaf=100,n_jobs=4)
modelfit(alg6, traindf, testdf, predictors, target, IDcol, 'alg6.csv')
coef6 = pd.Series(alg6.feature_importances_, predictors).sort_values(ascending=False)
coef6.plot(kind='bar', title='Feature Importances')
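# Note: modelfit() is used in the cells above but not shown. A minimal sketch of what
# such a helper could look like, assuming it fits the estimator, reports a cross-validated
# RMSE, and writes the test-set predictions to the given CSV (column handling is an assumption):
from sklearn.model_selection import cross_val_score

def modelfit(alg, dtrain, dtest, predictors, target, IDcol, filename):
    alg.fit(dtrain[predictors], dtrain[target])
    cv_score = cross_val_score(alg, dtrain[predictors], dtrain[target],
                               cv=5, scoring='neg_mean_squared_error')
    rmse = (-cv_score.mean()) ** 0.5
    print('CV RMSE: %.4f' % rmse)
    submission = dtest[IDcol].copy()
    submission[target] = alg.predict(dtest[predictors])
    submission.to_csv(filename, index=False)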
Example #4
    'n_estimators': [300, 350, 400, 450],
    'learning_rate': [0.5, 1, 2, 4, 6]
}
clf = GridSearchCV(model, para_dict, cv=4, scoring='r2')
clf.fit(X, y1)
clf.best_params_

model = AdaBoostRegressor(n_estimators=350, learning_rate=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('MSE of Adaboost: ', mean_squared_error(y_test, y_pred))
print('Cross validation score (cv=4) of Adaboost:',
      cross_val_score(model, X, y1, cv=4).mean())

#Random Forest
model = RandomForestRegressor(max_depth=3)
para_dict = {'n_estimators': [20, 50, 80]}
clf = GridSearchCV(model, para_dict, cv=4, scoring='r2')
clf.fit(X, y1)
clf.best_params_
clf.best_score_

model = RandomForestRegressor(n_estimators=50, max_depth=3)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print('MSE of Random Forest: ', mean_squared_error(y_test, y_pred))
print('Cross validation score (cv=4) of Random Forest:',
      cross_val_score(model, X, y1, cv=4).mean())

#SVR
para_dict = {
Example #5
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
X, y = make_regression(n_features=4,
                       n_informative=2,
                       random_state=0,
                       shuffle=False)
regr = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
regr.fit(X, y)
print(regr.feature_importances_)
print(regr.predict([[0, 0, 0, 0]]))
Example #6
def reconstructRF():
    """
    run KFOLD method for random forest regression 
    """
    #import packages
    import os
    import numpy as np
    import pandas as pd
    #from sklearn import metrics
    #from scipy import stats
    #import seaborn as sns
    #import matplotlib.pyplot as plt
    #from sklearn.model_selection import KFold
    from datetime import datetime
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import StandardScaler

    #defining directories
    dir_in = "/lustre/fs0/home/mtadesse/merraAllLagged"
    dir_out = "/lustre/fs0/home/mtadesse/rfReconstruction"
    surge_path = "/lustre/fs0/home/mtadesse/05_dmax_surge_georef"

    # #load KFOLD result csv file
    # os.chdir('F:\\06_eraint_results\\sonstig')
    # kf_dat = pd.read_csv('eraint_randForest_kfold.csv')
    # #edit the tg names to be usable later on
    # editName = lambda x: x.split('.csv')[0]
    # kf_dat['tg'] = pd.DataFrame(list(map(editName, kf_dat['tg'])), columns= ['tg'])

    #cd to the lagged predictors directory
    os.chdir(dir_in)

    x = 462
    y = 463

    #looping through
    for tg in range(x, y):

        os.chdir(dir_in)

        tg_name = os.listdir()[tg]
        print(tg, tg_name)

        #load predictor
        pred = pd.read_csv(tg_name)
        pred.drop('Unnamed: 0', axis=1, inplace=True)

        #add squared and cubed wind terms (as in WPI model)
        pickTerms = lambda x: x.startswith('wnd')
        wndTerms = pred.columns[list(map(pickTerms, pred.columns))]
        wnd_sqr = pred[wndTerms]**2
        wnd_cbd = pred[wndTerms]**3
        pred = pd.concat([pred, wnd_sqr, wnd_cbd], axis=1)

        #standardize predictor data
        dat = pred.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat([pred['date'], dat_standardized], axis=1)

        #load surge data
        os.chdir(surge_path)
        surge = pd.read_csv(tg_name)
        surge.drop('Unnamed: 0', axis=1, inplace=True)

        #remove duplicated surge rows
        surge.drop(surge[surge['ymd'].duplicated()].index,
                   axis=0,
                   inplace=True)
        surge.reset_index(inplace=True)
        surge.drop('index', axis=1, inplace=True)

        #adjust surge time format to match that of pred
        time_str = lambda x: str(datetime.strptime(x, '%Y-%m-%d'))
        surge_time = pd.DataFrame(list(map(time_str, surge['ymd'])),
                                  columns=['date'])
        time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        surge_new = pd.concat([surge_time, surge[['surge', 'lon', 'lat']]],
                              axis=1)

        #merge predictors and surge to find common time frame
        pred_surge = pd.merge(pred_standardized,
                              surge_new.iloc[:, :2],
                              on='date',
                              how='right')
        pred_surge.sort_values(by='date', inplace=True)

        #find rows that have nans and remove them
        row_nan = pred_surge[pred_surge.isna().any(axis=1)]
        pred_surge.drop(row_nan.index, axis=0, inplace=True)
        pred_surge.reset_index(inplace=True)
        pred_surge.drop('index', axis=1, inplace=True)

        #in case pred and surge don't overlap
        if pred_surge.shape[0] == 0:
            print('-' * 80)
            print("Predictors and Surge don't overlap")
            print('-' * 80)
            continue


        pred_surge['date'] = pd.DataFrame(list(map(time_stamp, \
                                                   pred_surge['date'])), \
                                          columns = ['date'])

        #prepare data for training/testing
        X = pred_surge.iloc[:, 1:-1]
        y = pd.DataFrame(pred_surge['surge'])
        y = y.reset_index()
        y.drop(['index'], axis=1, inplace=True)

        #apply PCA
        #get the number of PCs used during validation
        # pc_num = kf_dat.loc[kf_dat['tg'] == tg_name]['num_95pcs']
        pca = PCA(0.95)
        pca.fit(X)
        X_pca = pca.transform(X)

        {  # #apply 10 fold cross validation
            # kf = KFold(n_splits=10, random_state=29)

            # metric_corr = []; metric_rmse = []; #combo = pd.DataFrame(columns = ['pred', 'obs'])
            # for train_index, test_index in kf.split(X):
            #     X_train, X_test = X_pca[train_index], X_pca[test_index]
            #     y_train, y_test = y['surge'][train_index], y['surge'][test_index]

            #     #train regression model
            #     rf = RandomForestRegressor(n_estimator = 50, min_samples_leaf = 1)
            #     lm.fit(X_train, y_train)

            #     #predictions
            #     predictions = lm.predict(X_test)
            #     # pred_obs = pd.concat([pd.DataFrame(np.array(predictions)), \
            #     #                       pd.DataFrame(np.array(y_test))], \
            #     #                      axis = 1)
            #     # pred_obs.columns = ['pred', 'obs']
            #     # combo = pd.concat([combo, pred_obs], axis = 0)

            #     #evaluation matrix - check p value
            #     if stats.pearsonr(y_test, predictions)[1] >= 0.05:
            #         print("insignificant correlation!")
            #         continue
            #     else:
            #         #print(stats.pearsonr(y_test, predictions))
            #         metric_corr.append(stats.pearsonr(y_test, predictions)[0])
            #         #print(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
            #         metric_rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))

            # #number of years used to train/test model
            # num_years = np.ceil((pred_surge['date'][pred_surge.shape[0]-1] -\
            #                       pred_surge['date'][0]).days/365)
        }

        longitude = surge['lon'][0]
        latitude = surge['lat'][0]
        num_pc = X_pca.shape[1]  #number of principal components
        # corr = np.mean(metric_corr)
        # rmse = np.mean(metric_rmse)

        # print('num_year = ', num_years, ' num_pc = ', num_pc ,'avg_corr = ',\
        #       np.mean(metric_corr), ' -  avg_rmse (m) = ', \
        #       np.mean(metric_rmse), '\n')

        #%%
        #surge reconstruction
        pred_for_recon = pred[~pred.isna().any(axis=1)]
        pred_for_recon = pred_for_recon.reset_index().drop('index', axis=1)

        #standardize predictor data
        dat = pred_for_recon.iloc[:, 1:]
        scaler = StandardScaler()
        print(scaler.fit(dat))
        dat_standardized = pd.DataFrame(scaler.transform(dat), \
                                        columns = dat.columns)
        pred_standardized = pd.concat(
            [pred_for_recon['date'], dat_standardized], axis=1)

        X_recon = pred_standardized.iloc[:, 1:]

        #apply PCA
        pca = PCA(num_pc)  #use the same number of PCs used for training
        pca.fit(X_recon)
        X_pca_recon = pca.transform(X_recon)

        #%%
        #model preparation
        #defining the rf model with number of trees and minimum leaves
        rf = RandomForestRegressor(n_estimators=50, min_samples_leaf=1, \
                                   random_state = 29)
        rf.fit(X_pca, y)

        #get prediction interval
        def pred_ints(model, X_pca_recon, percentile=95):
            """
            function to construct prediction interval
            taking into account the result of each 
            regression tree
            """
            err_down = []
            err_up = []
            preds = []

            for pred in model.estimators_:
                preds.append(pred.predict(X_pca_recon))
            preds = np.vstack(preds).T
            err_down = np.percentile(preds, (100 - percentile)/2., axis = 1, \
                                     keepdims = True)
            err_up = np.percentile(preds, 100 - (100 - percentile)/2., axis =1, \
                                   keepdims = True)

            return err_down.reshape(-1), err_up.reshape(-1)

        #compute 95% prediction intervals
        err_down, err_up = pred_ints(rf, X_pca_recon, percentile=95)
        #reconstructed surge goes here
        truth = rf.predict(X_pca_recon)

        correct = 0.
        for i, val in enumerate(truth):
            if err_down[i] <= val <= err_up[i]:
                correct += 1
        print(correct * 100 / len(truth), '\n')

        #final dataframe
        final_dat = pd.concat([pred_standardized['date'], \
                               pd.DataFrame([truth, err_down, err_up]).T], axis = 1)
        final_dat['lon'] = longitude
        final_dat['lat'] = latitude
        final_dat.columns = ['date', 'surge_reconstructed', 'pred_int_lower',\
                             'pred_int_upper', 'lon', 'lat']

        {  #plot - optional
            # time_stamp = lambda x: (datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
            # final_dat['date'] = pd.DataFrame(list(map(time_stamp, final_dat['date'])), columns = ['date'])
            # surge['date'] = pd.DataFrame(list(map(time_stamp, surge['date'])), columns = ['date'])
            # sns.set_context('notebook', font_scale = 2)
            # plt.figure()
            # plt.plot(final_dat['date'], final_dat['mean'], color = 'green')
            # plt.scatter(surge['date'], surge['surge'], color = 'blue')
            #prediction intervals
            # plt.plot(final_dat['date'], final_dat['obs_ci_lower'], color = 'red',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['obs_ci_upper'], color = 'red',  linestyle = "--", lw = 0.8)
            #confidence intervals
            # plt.plot(final_dat['date'], final_dat['mean_ci_upper'], color = 'black',  linestyle = "--", lw = 0.8)
            # plt.plot(final_dat['date'], final_dat['mean_ci_lower'], color = 'black',  linestyle = "--", lw = 0.8)
        }

        #save df as cs - in case of interruption
        os.chdir(dir_out)
        final_dat.to_csv(tg_name)

        #cd to dir_in
        os.chdir(dir_in)
Example #7
def rf_r_test(n=10):
    X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
    X_train, X_test = X[:200], X[200:]
    y_train, y_test = y[:200], y[200:]
    ens = EnsembleRegressor([
        RandomForestRegressor(n_estimators=1, max_depth=None,
                              min_samples_split=2,  # must be >= 2 in current scikit-learn
                              random_state=i) for i in range(n)
    ]).fit(X_train, y_train)
    return RMSE(X_test, y_test, ens)
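# Note: EnsembleRegressor and RMSE come from elsewhere in this project and are not shown.
# A hypothetical stand-in for the RMSE helper, matching the call signature above:
import numpy as np

def RMSE(X, y, estimator):
    return np.sqrt(np.mean((estimator.predict(X) - y) ** 2))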
Example #8
X_train, X_test, y_train, y_test = train_test_split(X_opt,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)
'''#Feature scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
sc_Y = StandardScaler()
y_train = sc_Y.fit_transform(y_train.reshape(-1,1))'''

# Fitting Random Forest Regression to the dataset
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_opt, y)

#applying k_fold cross validation
from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=10)
accuracies.mean()
accuracies.std()

# Predicting the Test set results
y_train_pred = regressor.predict(X_train)
y_pred = regressor.predict(X_test)
#y_pred = sc_Y.inverse_transform(y_pred)

#accuracy measurement
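# Note: the snippet ends at the accuracy-measurement step. A minimal sketch of how the
# predictions above might be scored; the choice of metrics here is an assumption.
from sklearn.metrics import mean_absolute_error, r2_score

print('train R2 :', r2_score(y_train, y_train_pred))
print('test  R2 :', r2_score(y_test, y_pred))
print('test  MAE:', mean_absolute_error(y_test, y_pred))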
Example #9
                                                     test_size=0.25,
                                                     random_state=42)

# Create a position map
position_map = {}
positions = X_observed.position.unique()
for i in range(len(positions)):
    position_map[i] = positions[i]

# Declare the position feature importance map
position_feature_importance_map = {}

# Create a list of models to compare and select the best model to use for imputing price with values of 0.
models = [
    KNeighborsRegressor(n_neighbors=knr_n_neighbours),
    RandomForestRegressor(n_estimators=rf_xgb_n_estimators),
    XGBRegressor(n_estimators=rf_xgb_n_estimators, max_depth=7)
]

# Declare the subset models map
sub_models_map = {}

# Get all the imputations predicted by each regressor.
all_imputations = []
reordered_y_train = []
reordered_y_test = []
for i in range(len(models)):
    model_imputations = []
    for position in positions:
        # Create subsets by using position as price varies by the player's playing position.
        sub_X_observed_train = X_observed_train[X_observed_train.position ==
Example #10
from sklearn import datasets
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict


diabetes = datasets.load_diabetes()
X = diabetes.data[:150]
y = diabetes.target[:150]



model1 = LinearRegression()
model2 = SVR(gamma = 'auto')
model3 = DecisionTreeRegressor()
model4 = RandomForestRegressor(n_estimators = 20)



models = [model1 , model2 , model3 , model4]

x=0
for m in models:
    x+=1
    
    for n in range(2,5):
        print('result of model number : ' , x ,' for cv value ',n,' is \n' , cross_val_predict(m, X, y, cv=n))  
        print('-----------------------------------')
    print('=====================================')
    print('=====================================')
Example #11
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score

#Example
_models = [
    RandomForestRegressor(n_estimators=200,criterion='mse',max_depth=20,random_state=42),
    DecisionTreeRegressor(criterion='mse',max_depth=11,random_state=42),
    GradientBoostingRegressor(n_estimators=200,max_depth=12)
]
def plot_different_models(models):
    learning_mods = pd.DataFrame()
    temp = {}
    for model in models:
        print(model)
        m = str(model)
        temp['Model'] = m[:m.index('(')]
        model.fit(X_train, y_train)
        temp['R2_Price'] = r2_score(y_test, model.predict(X_test))
        print('score on training', model.score(X_train, y_train))
        print('r2 score', r2_score(y_test, model.predict(X_test)))
        learning_mods = learning_mods.append([temp])
    learning_mods.set_index('Model', inplace=True)

    fig, axes = plt.subplots(ncols=1, figsize=(10, 4))
    learning_mods.R2_Price.plot(ax=axes, kind='bar', title='R2_Price')
    plt.show()
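# Note: plot_different_models() is defined above but never called in this excerpt;
# presumably it would be invoked with the _models list defined earlier:
plot_different_models(_models)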

Example #12
plt.plot(range(len(test_y)), test_y, 'r', label='DTTrue Data')
plt.plot(range(len(predictDT)), predictDT, 'b', label='DTPredict Data')
plt.legend()

# Visualization (scatter plot)
plt.subplot(122)
plt.scatter(test_y, predictDT)
plt.plot([test_y.min(), test_y.max()], [test_y.min(), test_y.max()], 'k--')
plt.xlabel('DTTrue')
plt.ylabel('DTPredict')
plt.show()

#########################################################
# Implement random forest regression
from sklearn.ensemble import RandomForestRegressor
randomForest = RandomForestRegressor()
randomForest.fit(train_x, train_y)
# Predict
predictRF = randomForest.predict(test_x)
# print("Prediction results")
# print(predictRF)
# print("Ground truth")
# print(test_y)

# Evaluate the results
MSE = metrics.mean_squared_error(predictRF, test_y)
RMSE = np.sqrt(metrics.mean_squared_error(predictRF, test_y))
print("RandomForestRegressor model MSE: %.5f" % MSE)
print("RandomForestRegressor model RMSE: %.5f\n" % RMSE)

plt.figure(figsize=(15, 5))
x_test_rf1 = x_test.loc[:, pred_cols_rf1]
y_test_rf1 = y_test.loc[:, 'fulfill_duration']

# check out max depths for one that doesn't overfit
train_scores = []
test_scores = []
train_rmse = []
test_rmse = []

max_depths = list(range(1, 11))
max_depths = max_depths + list(range(12, 32, 2))

for dpth in max_depths:
    print(f'Calculating results for max depth of {dpth}')
    mdl_rf1 = RandomForestRegressor(n_estimators=20,
                                    max_depth=dpth,
                                    random_state=RANDOM_SEED)
    mdl_rf1.fit(x_train_rf1, y_train_rf1)

    train_scores.append(mdl_rf1.score(x_train_rf1, y_train_rf1))
    test_scores.append(mdl_rf1.score(x_test_rf1, y_test_rf1))

    train_rmse.append(
        np.sqrt(mean_squared_error(y_train_rf1, mdl_rf1.predict(x_train_rf1))))
    test_rmse.append(
        np.sqrt(mean_squared_error(y_test_rf1, mdl_rf1.predict(x_test_rf1))))

from matplotlib.legend_handler import HandlerLine2D

# plot the train/test RMSEs: expect train RMSE to keep falling while test RMSE levels off once the trees overfit
line1, = plt.plot(max_depths, train_rmse, 'b', label='Training Data RMSE')
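# Note: the plotting code is cut off after line1. A minimal sketch of how the train/test
# RMSE curves might be completed, assuming the lists built in the loop above:
line2, = plt.plot(max_depths, test_rmse, 'r', label='Test Data RMSE')
plt.legend(handler_map={line1: HandlerLine2D(numpoints=2)})
plt.xlabel('max_depth')
plt.ylabel('RMSE')
plt.show()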
    test_data = pd.merge(test_data,
                         new_series[[
                             'shop_id', 'item_id', 'date_block_num',
                             'item_cnt_prev' + str(i)
                         ]],
                         how='left')
    test_data['item_cnt_prev' + str(i)] = series_agg['item_cnt_prev' +
                                                     str(i)].fillna(0)


def rmse(y, y_hat):
    return np.sqrt(np.mean((y_hat - y)**2))


#without any parameter
model = RandomForestRegressor()
model.fit(train_features, train_targets)
res = model.predict(test_data)
res_train = model.predict(train_features)

train_error = rmse(res_train, train_targets)
print(train_error)
#Train error= 0.754
#Test error= 4.825

#with maxdepth = 15
model_2 = RandomForestRegressor(max_depth=15)
model_2.fit(train_features, train_targets)
res_2 = model_2.predict(test_data)
res_2_train = model_2.predict(train_features)
labels_iq = labels_iq.tail(300)



### 4. Execute the regressor and make predictions

## San Juan
data_features_test_sj = data_features_test.loc[data_features_test['city'] == 'sj']

# Parametrization
n_estimators = 50
max_depth = None
max_features = len(features_selected_sj)

# Random Forest regressor
regressor_sj = RandomForestRegressor(n_estimators=n_estimators,
                                     max_depth=max_depth,
                                     max_features=max_features,
                                     criterion='mae',
                                     random_state=0)
regressor_sj.fit(features_sj, labels_sj)

# Prediction
pred_sj = [int(round(x)) for x in regressor_sj.predict(data_features_test_sj[features_selected_sj])]
data_features_test_sj = data_features_test_sj.assign(total_cases = pred_sj)


## Iquitos
data_features_test_iq = data_features_test.loc[data_features_test['city'] == 'iq']

# Normalization of the data
max_abs_scaler = preprocessing.MaxAbsScaler()
data_features_test_iq_norm = max_abs_scaler.fit_transform(data_features_test_iq[features_selected_iq])
features_iq_norm = max_abs_scaler.fit_transform(features_iq)
    df = pd.concat([X_split, connectomes], axis=1)
    return df, y_split


df, y_train = load_combine_data(X_train, merged_data, dmri)
X_train_post_hoc = df

df_test, y_test = load_combine_data(X_test, merged_data, dmri)
X_test_post_hoc = df_test

df = df.drop(columns=['eid', '20016-2.0'], axis=1)
df_test = df_test.drop(columns=['eid', '20016-2.0'], axis=1)

estimator = RandomForestRegressor(n_estimators=250,
                                  criterion='mse',
                                  n_jobs=-1,
                                  verbose=1,
                                  random_state=0)

pipeline = Pipeline([('imputation',
                      make_union(SimpleImputer(strategy="median"),
                                 MissingIndicator())),
                     ('estimator', estimator)])

cv = ShuffleSplit(n_splits=100, test_size=0.1, random_state=0)

param_grid = {
    'estimator__max_depth': [5, 10, 20, 40, None],
    'estimator__max_features': [1, 5, 'log2', 'sqrt', 'auto', None]
}
grid_search = GridSearchCV(pipeline,
Example #17
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor,
                              ExtraTreesClassifier)
from sklearn.linear_model import (BayesianRidge, RidgeClassifier, SGDRegressor,
                                  SGDClassifier, LinearRegression,
                                  LogisticRegression, Lasso, ElasticNet)

regression_options = {
    'MLPRegressor': {
        'model':
        MLPRegressor(learning_rate='adaptive',
                     max_iter=500,
                     learning_rate_init=.005),
        'name':
        'MLP NN'
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(n_estimators=20, max_features=2),
        'name': 'Random Forest'
    },
    'BayesianRidge': {
        'model': BayesianRidge(),
        'name': 'Bayesian Ridge'
    },
    'Lasso': {
        'model': Lasso(),
        'name': 'Lasso Regressor'
    },
    'GradientBoostingRegressor': {
        'model': GradientBoostingRegressor(max_features=2),
        'name': 'Gradient Boost'
    },
    'ElasticNet': {
Example #18
dtr = tree.DecisionTreeRegressor(max_depth=2)
dtr.fit(hoseing["data"][:, [6, 7]], hoseing["target"])

dot_data = \
    tree.export_graphviz(
        dtr,
        out_file = None,
        feature_names=hoseing["feature_names"][6:8],
        filled = True,
        impurity = False,
        rounded = True
    )
import os
os.environ["PATH"] += os.pathsep + 'D:/Program Files (x86)/Graphviz2.38/bin/'
import pydotplus
graph = pydotplus.graph_from_dot_data(dot_data)
graph.get_nodes()[7].set_fillcolor("#FFF2DD")
graph.write_png("./res.png")

from sklearn.model_selection import train_test_split  # split into training and test sets

data_train,data_test,target_train,target_test = \
    train_test_split(hoseing["data"],hoseing["target"],test_size=0.1,random_state = 42)
dtr = tree.DecisionTreeRegressor(random_state=42)
dtr.fit(data_train, target_train)
print(dtr.score(data_test, target_test))

from sklearn.ensemble import RandomForestRegressor  # let the library use its default parameters
rfr = RandomForestRegressor(random_state=42)
rfr.fit(data_train, target_train)
print(rfr.score(data_test, target_test))
Example #19
from sklearn.preprocessing import MinMaxScaler

sc_x = MinMaxScaler()
X_train = sc_x.fit_transform(x_train)
X_test = sc_x.transform(x_test)

sc_y = MinMaxScaler()
Y_train = sc_y.fit_transform(y_train)
Y_test  = sc_y.transform(y_test)


import keras
from keras.models import Sequential
from keras.layers import Dense,Dropout

model = Sequential()
model.add(Dense(units = 10, activation = 'relu', input_shape=(8,)))
model.add(Dense(units = 500, activation = 'relu'))
model.add(Dense(units = 300, activation = 'relu'))
model.add(Dense(units = 1, activation = 'sigmoid'))
model.compile(optimizer = 'adam', loss = 'mean_squared_error',metrics=['accuracy'])

model.fit(X_train,Y_train, batch_size = 5, epochs = 1000)
 
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators = 300,random_state=0)
model.fit(X_train,Y_train)

dataset1 = pd.read_csv('maths - Copy.csv')
z = dataset1.iloc[1:,:].values
y_pred = model.predict(sc_x.transform(z))
Y_pred = sc_y.inverse_transform(y_pred)
Y_pred
## label scaling
MaxPrice = max(Prices)
Prices = Prices / MaxPrice

xtrain, xtest, ytrain, ytest = train_test_split(TrainVector,
                                                Prices,
                                                test_size=0.05,
                                                random_state=42)

from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
gbr.fit(xtrain, ytrain)

from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
rfr.fit(xtrain, ytrain)


def AccuracyPlotter(trueLabels, predictedLabels):
    size = len(trueLabels)
    x_y = [0.00, 0.001, 0.002]

    plt.scatter(trueLabels, predictedLabels)
    #plt.plot(x_y)
    plt.show()


AccuracyPlotter(ytest, rfr.predict(xtest))

Example #21
The main hyperparameters to tune for a CART-based random forest are:
1. the number of trees
2. the maximum tree depth
3. the minimum samples per internal node and per leaf node
4. the number of features

In addition, the error function used during tuning is the mean squared error, with 5-fold cross-validation.
'''
X, y = trainData[numFeatures2], trainData['rec_rate']
'''
Grid-search parameters
'''
param_test1 = {'n_estimators': range(10, 80, 5)}  # try one value every 5 from 10 to 80
gsearch1 = GridSearchCV(estimator=RandomForestRegressor(min_samples_split=50,
                                                        min_samples_leaf=10,
                                                        max_depth=8,
                                                        max_features='sqrt',
                                                        random_state=10),
                        param_grid=param_test1,
                        scoring='neg_mean_squared_error',
                        cv=5)
gsearch1.fit(X, y)
print(gsearch1.best_params_, gsearch1.best_score_)
best_n_estimators = gsearch1.best_params_['n_estimators']  # estimated best number of trees

param_test2 = {
    'max_depth': range(3, 21),
    'min_samples_split': range(10, 100, 10)
}
gsearch2 = GridSearchCV(estimator=RandomForestRegressor(
    n_estimators=best_n_estimators,
Example #22
################################################## Random Forest Regressor #####################################################

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

param_grid = { 
            "n_estimators"      : [100, 200, 300], # default=100
#             "max_features"      : ["auto", "sqrt", "log2"], #default=auto
#             "min_samples_split" : [2,4,8], #default=2
#             "bootstrap": [True, False], #default=True
            }

RFR = RandomForestRegressor()
RFR_cv = GridSearchCV(RFR, param_grid, cv=5, scoring="neg_mean_squared_error")
RFR_cv.fit(X_train, Y_train)
print(RFR_cv.best_score_ , RFR_cv.best_params_)
    
# Feature Importance
feat_labels = X.columns.values
importances = RFR_cv.best_estimator_.feature_importances_
indices = np.argsort(importances)

rf_importance = pd.DataFrame()
rf_importance["features"] = feat_labels
rf_importance["importances"] = importances
rf_importance = rf_importance.sort_values(["importances"], ascending=0)

plt.title('RF Feature Importance')
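# Note: the feature-importance plot is cut off after the title call. A minimal sketch of
# how the sorted importances might be drawn as a horizontal bar chart:
plt.barh(range(len(indices)), importances[indices], align='center')
plt.yticks(range(len(indices)), feat_labels[indices])
plt.xlabel('Relative Importance')
plt.show()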
]

x = dataframe.iloc[:, :-1].values
y = dataframe[['MEDV']].values

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=1)

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

forest = RandomForestRegressor(n_estimators=1000,
                               criterion='mse',
                               random_state=1,
                               n_jobs=-1)

forest.fit(x_train, y_train)
y_train_pred = forest.predict(x_train)[:, np.newaxis]
y_test_pred = forest.predict(x_test)[:, np.newaxis]

print('Training set MSE:', mean_squared_error(y_train, y_train_pred))
print('Test set MSE:', mean_squared_error(y_test, y_test_pred))

print('Training set R^2:', r2_score(y_train, y_train_pred))
print('Test set R^2:', r2_score(y_test, y_test_pred))

plt.scatter(y_train_pred,
            y_train_pred - y_train,
            color='black',
Example #24
from sklearn.ensemble import AdaBoostRegressor
#from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
# svm regressor
from sklearn.svm import SVR
print("Done ...")

# list model name list
print("\n*** Init Models Lists ***")
lModels = []
lModels.append(("LinearRegression  ", LinearRegression()))
lModels.append(("RidgeRegression   ", Ridge(alpha=10)))
lModels.append(("LassoRegression   ", Lasso(alpha=1)))
lModels.append(("ElasticNet        ", ElasticNet(alpha=1)))
lModels.append(("Random Forest     ", RandomForestRegressor(random_state=707)))
lModels.append(("SVM Regressor     ", SVR(C=1.0, epsilon=0.2)))
lModels.append(("DecTree Regressor ", DecisionTreeRegressor(random_state=707)))
lModels.append(("GradientBoostingRegressor  ",
                GradientBoostingRegressor(random_state=707)))

lModels.append(
    ("AdaBoostRegressor ", AdaBoostRegressor(random_state=707,
                                             n_estimators=100)))
for vModel in lModels:
    print(vModel)
print("Done ...")

################################
# Regression - Cross Validation
###############################
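# Note: the cross-validation code that follows this banner is not included in the excerpt.
# A minimal sketch of how each entry in lModels might be evaluated, assuming the feature
# matrix X and target y are already prepared upstream:
from sklearn.model_selection import cross_val_score

print("\n*** Cross Validation ***")
for vModelName, oModelObj in lModels:
    scores = cross_val_score(oModelObj, X, y, cv=5, scoring='r2')
    print(vModelName, "R2: %.4f (+/- %.4f)" % (scores.mean(), scores.std()))
print("Done ...")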
Example #25
def randomforest(data):
    """
    implememt RandomForest and report graphical representation of useful
    result
    :param data: modified and clean dataset for implementing RandomForest
    :param run: determine if the user is going to run this program
    If run is true, this method runs and produce the desired output.
    """
    # create dummy variables for categorical variables
    data = pd.get_dummies(data)
    # convert and get the label in Numpy array as required to implement
    # randomforest
    labels = np.array(data["kills"])

    # get features except for labels and unrelated ones
    features = data.drop(["kills", "player_slot", "match_id"], axis=1)

    # store the column names of features
    feature_list = list(features.columns)
    # convert features in Numpy array as required to implement randomforest
    features = np.array(features)

    # split the data into testing and training groups for once
    train_features, test_features, train_labels, test_labels = \
        train_test_split(features, labels, test_size=0.2)

    # The baseline predictions are the historical averages
    baseline_preds = data["kills"].mean()

    # get absolute mean of baseline error
    baseline_errors = round(np.mean(abs(baseline_preds - test_labels)), 2)

    # create list to store values and are used to have an output dataframe
    # in CSV
    num_tree = [1, 2, 3, 5, 10, 30, 60, 100]
    mse_list = []
    train_accuracy = []
    test_accuracy = []
    mean_absolute_error_list = []

    # create RandomForest with different number of estimators
    for tree in num_tree:
        # build a random forest with the current number of trees
        rf_model = RandomForestRegressor(n_estimators=tree)

        # train the model using randomforest
        rf_model.fit(train_features, train_labels)

        # get predictions of kills from the model created
        predictions = rf_model.predict(test_features)

        # store MSE, train accuracy, test_accuracy to the lists
        mse_list.append(mean_squared_error(test_labels, predictions))
        train_accuracy.append(rf_model.score(train_features, train_labels))
        test_accuracy.append(rf_model.score(test_features, test_labels))

        # get the absolute errors of the prediction and store to list
        errors = abs(predictions - test_labels)
        mean_absolute_errors = round(np.mean(errors), 3)
        mean_absolute_error_list.append(mean_absolute_errors)

    # store the relevant values from RandomForests into dataframe
    tree_estimator_data = pd.DataFrame(
        data={
            "n_estimator": num_tree,
            "MSE": mse_list,
            "train_accuracy": train_accuracy,
            "test_accuracy": test_accuracy,
            "mean_absolute_error": mean_absolute_error_list
        })

    # add baseline error into dataframe to compare with the mean
    # absolute errors
    tree_estimator_data["Baseline Error"] = baseline_errors

    # output data collected from RandomForests into dataframe for
    # easy access
    tree_estimator_data.to_csv("user_files/csv_files/randomforest_trees.csv",
                               index=False)

    # store the feature importance in series with indexes indicating the
    # name of features
    feature_imp = pd.Series(rf_model.feature_importances_,
                            index=feature_list).sort_values(ascending=False)
    # keep the top 10 most important features
    top_feature_imp = feature_imp.iloc[0:10]

    # plot the feature-importance graph as a horizontal bar chart
    num_feature = np.arange(len(top_feature_imp.index))
    performance = np.array(list(top_feature_imp))

    fig, ax = plt.subplots()
    ax.barh(num_feature, performance, align="center")
    ax.set_yticks(num_feature)
    ax.set_yticklabels(top_feature_imp.index)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel("Feature Importance")
    ax.set_title("Important Features in Predicting Number of Kills in Dota2")
    fig.savefig("user_files/image_files/Important_Features.png",
                bbox_inches="tight")

    # plot prediction vs actual kill graph
    # create a dataframe with predictions of and actual data of # of kills
    predictions_vs_actual = pd.DataFrame(data={
        "prediction": predictions,
        "label": test_labels
    })

    sns.relplot(x="label", y="prediction", data=predictions_vs_actual)
    x = np.linspace(data["kills"].min(), data["kills"].max(), 100)
    y = x
    plt.plot(x, y, "-r", label="45-degree line")
    plt.xlabel("Actual Number of Kills")
    plt.ylabel("Predicted Number of Kills")
    plt.title("Actual data vs. Prediction on Number of Kills")
    plt.savefig("user_files/image_files/prediction_actual_kills.png",
                bbox_inches="tight")

    # plot tree number vs test accuracy graph
    sns.relplot(x="n_estimator",
                y="test_accuracy",
                kind="line",
                data=tree_estimator_data)
    plt.xlabel("Number of Trees in RandomForest")
    plt.ylabel("Test Accuracy")
    plt.title("Test Accuracy vs. Number of Estimators in RandomForest")
    plt.savefig("user_files/image_files/tree_test_accuracy.png",
                bbox_inches="tight")

    # plot tree number vs MSE graph
    sns.relplot(x="n_estimator",
                y="MSE",
                kind="line",
                data=tree_estimator_data)
    plt.xlabel("Number of Trees in RandomForest")
    plt.ylabel("Mean Squared Error")
    plt.title("MSE vs. Number of Estimators in RandomForest")
    plt.savefig("user_files/image_files/MSE.png", bbox_inches="tight")

    # plot error difference vs number of estimators graph
    tree_estimator_data["dif_errors"] = (
        tree_estimator_data["Baseline Error"] -
        tree_estimator_data["mean_absolute_error"])
    sns.relplot(x="n_estimator",
                y="dif_errors",
                kind="line",
                data=tree_estimator_data)
    plt.xlabel("Number of Trees in RandomForest")
    plt.ylabel("Error Difference")
    plt.title(" Error Difference vs. Number of Estimators in RandomForest")
    plt.savefig("user_files/image_files/error_diff.png", bbox_inches="tight")
Example #26
def rf_tuning(n_estimators=[10, 11, 1],
              k=5,
              train_data_path='../data/training_data.csv',
              save_model=False,
              tracking_uri="http://0.0.0.0:5000"):

    # Log the parameters with mlflow
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.set_tag("k", k)

    # Set random seed for reproducibility
    np.random.seed(RANDOM_SEED)
    random.seed(RANDOM_SEED)

    # Get data shuffled and split into training and test sets
    mdr = MiningDataReader(path=train_data_path)
    (variable_names, X_train, X_test, y_train,
     y_test) = mdr.get_splitted_data()

    pipeline = Pipeline(steps=[('scaling', StandardScaler(
    )), ('regression', RandomForestRegressor(random_state=RANDOM_SEED))])

    ### TRAINING ###
    ################

    # Generate grid search for hyperparam tuning
    hyperparams = {}
    hyperparams['regression__n_estimators'] = np.arange(
        n_estimators[0], n_estimators[1], n_estimators[2])

    print("Training started...\n")

    # Create an instance of Random Forest Regressor and fit the data for the grid parameters using all processors
    modelCV = GridSearchCV(estimator=pipeline,
                           param_grid=hyperparams,
                           cv=k,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

    with ProgressBar():
        modelCV.fit(X_train, y_train)

    # Iterate over the results storing training error for each hyperparameter combination
    results = modelCV.cv_results_
    param_list, training_err_list, training_dev_list = [], [], []
    for i in range(len(results['params'])):
        param = results['params'][i]
        score = (-1) * results['mean_test_score'][i]  # NEGATIVE MSE
        std = results['std_test_score'][i]
        param_list.append(param)
        training_err_list.append(score)
        training_dev_list.append(std)

    print(
        f"\nBest parameter set found for the training set:\n{modelCV.best_params_}"
    )

    # Store the index of the best combination
    best_index = param_list.index(modelCV.best_params_)

    # Get the best values for hyperparams
    best_nestimators = modelCV.best_params_['regression__n_estimators']

    print("\nTraining finished. Evaluating model...\n")

    ### EVALUATION ###
    ##################

    # Criteria is the number of trees
    criteria = 'n_estimators'
    mlflow.set_tag("criteria", criteria)
    param_values = range(n_estimators[0], n_estimators[1], n_estimators[2])

    # Predict test data varying the criteria param and evaluate the models
    training_err_by_criteria, training_dev_by_criteria, test_err_list = [], [], []
    rmse_score, mae_score, r2_score = -1, -1, -1
    feature_names, feature_importances = [], []
    for param_value in tqdm(param_values):
        model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    RandomForestRegressor(n_estimators=param_value,
                                          random_state=RANDOM_SEED,
                                          n_jobs=-1))])
        param = {'regression__n_estimators': param_value}

        # Fit model and evaluate results
        model.fit(X_train, y_train)
        prediction = model.predict(X_test)
        index = param_list.index(param)
        training_err = training_err_list[index]
        training_dev = training_dev_list[index]
        (training_mse, test_mse, rmse, mae,
         r2) = get_test_metrics(training_err, y_test, prediction)
        # Store metrics
        training_err_by_criteria.append(training_mse)
        training_dev_by_criteria.append(training_dev)
        test_err_list.append(test_mse)
        # Set additional metrics for the best combination
        if index == best_index:
            rmse_score = rmse
            mae_score = mae
            r2_score = r2

    # Generate the plots
    empty_img_folder()
    plot_errors(criteria, param_values, training_err_by_criteria,
                training_dev_by_criteria, test_err_list)

    # Once hyperparameters are selected, train and save the best model
    if save_model:
        print(
            "\nEvaluation finished. Training final model with train + test data with the best hyperparameters..."
        )
        final_model = Pipeline(
            steps=[('scaler', StandardScaler()),
                   ('regression',
                    RandomForestRegressor(n_estimators=param_list[best_index]
                                          ['regression__n_estimators'],
                                          n_jobs=-1))])

        # Train the best model with all the data (training + test)
        full_X = np.vstack((X_train, X_test))
        full_y = np.concatenate((y_train, y_test))
        final_model.fit(full_X, full_y)

        # Get a barplot with feature importances
        feature_importances = final_model.named_steps[
            'regression'].feature_importances_
        plot_feature_importances(feature_importances, variable_names)

        # Log plots and model with mlflow
        mlflow.log_artifacts('./img')
        mlflow.sklearn.log_model(final_model, 'model')

    # Log results with mlflow
    mlflow.log_metric("train_mse", training_err_list[best_index])
    mlflow.log_metric("test_mse", min(test_err_list))
    mlflow.log_metric("rmse", rmse_score)
    mlflow.log_metric("mae", mae_score)
    mlflow.log_metric("r2", r2_score)
    mlflow.set_tag("best_params", param_list[best_index])

    # Output the results
    print(f'''
-----------------------------------------------------------------------------------------------------------------------
RESULTS
-----------------------------------------------------------------------------------------------------------------------
Best params: {param_list[best_index]}
Training MSE: {training_err_list[best_index]}
Test MSE: {min(test_err_list)}
RMSE: {rmse_score}
MAE: {mae_score}
R2: {r2_score}
-----------------------------------------------------------------------------------------------------------------------
''')
Example #27
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-74.90881962449828
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=KNeighborsRegressor(n_neighbors=47, p=1, weights="uniform")),
    RandomForestRegressor(bootstrap=True,
                          max_features=0.25,
                          min_samples_leaf=16,
                          min_samples_split=4,
                          n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def project_check_data():

    sn_temp_mid = read_temp_mid_sn()
    # TODO: First dataset
    df_temp_first = pd.read_csv(
        'data/office_1_temperature_supply_points_data_2020-03-05_2020-03-19.csv'
    )
    df_temp_first = modify_df(df_temp_first, 'temp')
    df_temp_first = df_temp_first[df_temp_first['serialNumber'] == sn_temp_mid]

    df_target_temp_first = pd.read_csv(
        'data/office_1_targetTemperature_supply_points_data_2020-03-05_2020-03-19.csv'
    )
    df_target_temp_first = modify_df(df_target_temp_first, 'target_temp')

    df_valve_first = pd.read_csv(
        'data/office_1_valveLevel_supply_points_data_2020-03-05_2020-03-19.csv'
    )
    df_valve_first = modify_df(df_valve_first, 'valve')

    # TODO: Second Dataset
    df_temp_second = pd.read_csv(
        'data/office_1_temperature_supply_points_data_2020-10-13_2020-11-02.csv'
    )
    df_temp_second = modify_df(df_temp_second, 'temp')
    df_temp_second = df_temp_second[df_temp_second['serialNumber'] ==
                                    sn_temp_mid]

    df_target_temp_second = pd.read_csv(
        'data/office_1_targetTemperature_supply_points_data_2020-10-13_2020-11-01.csv'
    )
    df_target_temp_second = modify_df(df_target_temp_second, 'target_temp')

    df_valve_second = pd.read_csv(
        'data/office_1_valveLevel_supply_points_data_2020-10-13_2020-11-01.csv'
    )
    df_valve_second = modify_df(df_valve_second, 'valve')

    # TODO: CONCAT FIRST DATASET
    df_combined_first = pd.concat(
        [df_temp_first, df_target_temp_first, df_valve_first])
    df_combined_first = df_combined_first.resample(
        pd.Timedelta(minutes=3), label='right').mean().fillna(method='ffill')

    df_combined_first['valve_last'] = df_combined_first['valve'].shift(
        1, fill_value=40)
    df_combined_first['valve_gt'] = df_combined_first['valve'].shift(
        -1, fill_value=0)
    df_combined_first['diff_temp'] = df_combined_first[
        'target_temp'] - df_combined_first['temp']

    # TODO: CONCAT SECOND DATASET
    df_combined_second = pd.concat(
        [df_temp_second, df_target_temp_second, df_valve_second])
    df_combined_second = df_combined_second.resample(
        pd.Timedelta(minutes=3), label='right').mean().fillna(method='ffill')

    df_combined_second['valve_last'] = df_combined_second['valve'].shift(
        1, fill_value=30)
    df_combined_second['valve_gt'] = df_combined_second['valve'].shift(
        -1, fill_value=98.00)
    df_combined_second['diff_temp'] = df_combined_second[
        'target_temp'] - df_combined_second['temp']

    df_combined_first = df_combined_first[1:-1]
    df_combined_second = df_combined_second[1:-1]
    df_combined = pd.concat([df_combined_first, df_combined_second])

    df_train = df_combined

    X_train = df_train[['valve', 'temp', 'diff_temp', 'valve_last']].to_numpy()
    y_train = df_train['valve_gt'].to_numpy()

    mask = (df_combined.index > '2020-10-29')
    df_test = df_combined.loc[mask]
    X_test = df_test[['valve', 'temp', 'diff_temp', 'valve_last']].to_numpy()

    # model = RandomForestRegressor(criterion='mae')#, min_samples_split=40, random_state=42) # 0.337767500434254
    model = RandomForestRegressor(criterion='mae')  # 0.337767500434254

    model.fit(X_train, y_train)
    valve_file = 'valve_model.p'
    pickle.dump(model, open(valve_file, 'wb'))
    y_predicted = model.predict(X_test)

    y_test = df_test['valve_gt'].to_numpy()
    y_last = df_test['valve_last'].to_numpy()
    print(f'mae base: {metrics.mean_absolute_error(y_test, y_last)}')
    print(f'mae model: {metrics.mean_absolute_error(y_test, y_predicted)}')
def run_stacked(data, stacked_keys, repeat_idx, drop_na):
    out_scores = pd.DataFrame()
    out_predictions = data.copy()
    for key, sel in stacked_keys.items():
        this_data = data[sel]
        if drop_na == 'local':
            mask = this_data.dropna().index
        elif drop_na == 'global':
            mask = data.dropna().index
        else:
            mask = this_data.index
        X = this_data.loc[mask].values
        y = data['age'].loc[mask].values
        fold_idx = data.loc[mask]['fold_idx'].values

        if drop_na is False:
            # encode missing values so the trees can learn from them.
            X_left = X.copy()
            X_left[this_data.isna().values] = -1000
            X_right = X.copy()
            X_right[this_data.isna().values] = 1000
            assert np.sum(np.isnan(X_left)) == 0
            assert np.sum(np.isnan(X_right)) == 0
            assert np.min(X_left) == -1000
            assert np.max(X_right) == 1000
            X = np.concatenate([X_left, X_right], axis=1)

        for column in sel:
            score = get_mae(data.loc[mask], column)
            if column not in out_scores:
                out_scores[column] = score
            elif out_scores[column].mean() < np.mean(score):
                out_scores[column] = score

        unstacked = out_scores[sel].values
        idx = unstacked.mean(axis=0).argmin()
        unstacked_mean = unstacked[:, idx].mean()
        unstacked_std = unstacked[:, idx].std()
        print(f'{key} | best unstacked MAE: {unstacked_mean} '
              f'(+/- {unstacked_std})')

        print('n =', len(X))

        param_grid = {'max_depth': [4, 6, 8, None]}
        if X.shape[1] > 10:
            param_grid['max_features'] = (['log2', 'sqrt', None])

        reg = GridSearchCV(RandomForestRegressor(n_estimators=1000,
                                                 random_state=42),
                           param_grid=param_grid,
                           scoring='neg_mean_absolute_error',
                           iid=False,
                           cv=5)
        if DEBUG:
            reg = RandomForestRegressor(n_estimators=1000,
                                        max_features='log2',
                                        max_depth=6,
                                        random_state=42)

        cv = LeaveOneGroupOut()
        out_cv = Parallel(n_jobs=1)(
            delayed(fit_predict_score)(
                estimator=reg,
                X=X,
                y=y,
                train=train,
                test=test,
                test_index=this_data.loc[mask].index[test])
            for train, test in cv.split(X, y, fold_idx))

        out_cv = zip(*out_cv)
        predictions = next(out_cv)
        out_predictions[f'stacked_{key}'] = np.nan
        for pred in predictions:
            assert np.all(out_predictions.loc[pred.index]['age'] == pred['y'])
            out_predictions.loc[pred.index,
                                f'stacked_{key}'] = pred['prediction'].values
        scores = np.array(next(out_cv))
        print(f'{key} | MAE : %0.3f (+/- %0.3f)' %
              (np.mean(scores), np.std(scores)))

        out_scores[key] = scores
    out_scores['repeat_idx'] = repeat_idx
    out_predictions['repeat_idx'] = repeat_idx
    return out_scores, out_predictions
Example #30
clf2 = xgb.XGBRegressor(objective='reg:linear',
                        colsample_bytree=0.3,
                        learning_rate=0.7,
                        max_depth=8,
                        alpha=20,
                        n_estimators=10,
                        verbose=False)

clf1 = RandomForestRegressor(n_estimators=50,
                             criterion='mse',
                             max_depth=None,
                             min_samples_split=3,
                             min_samples_leaf=15,
                             min_weight_fraction_leaf=0.0,
                             max_features='auto',
                             max_leaf_nodes=None,
                             min_impurity_decrease=0.0,
                             min_impurity_split=None,
                             bootstrap=False,
                             oob_score=False,
                             n_jobs=5,
                             random_state=None,
                             verbose=5,
                             warm_start=False)

clf3 = RandomForestClassifier(n_estimators='warn',
                              criterion='gini',
                              max_depth=None,
                              min_samples_split=2,
                              min_samples_leaf=2,
                              min_weight_fraction_leaf=0.0,
                              max_features='auto',