Example #1
def build_models(df_train_x, df_train_y, df_validation_x, df_validation_y, seed):
    # prepare the model

    model = ExtraTreesRegressor(random_state=seed, n_estimators=100)
    model.fit(df_train_x, df_train_y)
    # transform the validation dataset
    predictions = model.predict(df_validation_x)
    #print(predictions)
    #print(df_test_y)
    print(mean_squared_error(df_validation_y, predictions))
    print("Accuracy --> ", model.score(df_validation_x, df_validation_y) * 100)

    # prepare the model

    model_rf = RandomForestRegressor(random_state=seed, n_estimators=100)
    model_rf.fit(df_train_x, df_train_y)
    # transform the validation dataset
    predictions_rf = model_rf.predict(df_validation_x)
    print(mean_squared_error(df_validation_y, predictions_rf))
    print("Accuracy --> ", model.score(df_validation_x, df_validation_y) * 100)

    params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
              'learning_rate': 0.01, 'loss': 'squared_error'}  # 'ls' was renamed to 'squared_error' in scikit-learn 1.0
    model_gb = ensemble.GradientBoostingRegressor(**params)
    model_gb.fit(df_train_x, df_train_y)
    # transform the validation dataset
    predictions_gb = model_gb.predict(df_validation_x)
    print(mean_squared_error(df_validation_y, predictions_gb))
    print("Accuracy --> ", model.score(df_validation_x, df_validation_y) * 100)

    return [model, model_rf, model_gb]
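
A minimal driver for build_models might look like the sketch below; the imports, file name, target column, and split here are illustrative assumptions, not part of the original example.

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import pandas as pd

df = pd.read_csv("data.csv")                      # hypothetical input file
X, y = df.drop(columns=["target"]), df["target"]  # hypothetical target column
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
models = build_models(X_tr, y_tr, X_val, y_val, seed=42)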
Example #2
def SelectFromETR(X, X_train, X_test, y_train, y_test):

    from sklearn.ensemble import ExtraTreesRegressor
    import numpy as np

    etr = ExtraTreesRegressor(n_estimators=200, n_jobs=2)
    etr.fit(X_train, y_train)
    print "R^2 on training set: %f" % etr.score(X_train, y_train)
    print "R^2 on test set: %f" % etr.score(X_test, y_test)

    importances = etr.feature_importances_
    indices = np.where(importances >= 0.009)[0]
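    # Note: sklearn.feature_selection.SelectFromModel offers the same importance-threshold
    # selection without manual index bookkeeping.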
    X = X.iloc[:, list(indices)]
    return X
Example #3
def et_regressor(x_trn: pd.DataFrame, y_trn: np.ndarray, x_val: pd.DataFrame,
                 y_val: np.ndarray) -> tuple:
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()
    model = ExtraTreesRegressor(n_estimators=400,
                                min_samples_leaf=3,
                                n_jobs=-1,
                                random_state=7)
    _ = model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)

    return model, training_score, validation_score
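
Called on an existing split, et_regressor returns the fitted model together with both scores; a usage sketch (x_trn/x_val as DataFrames and y_trn/y_val as numpy arrays, per the signature, are assumed to exist):

model, train_r2, valid_r2 = et_regressor(x_trn, y_trn, x_val, y_val)
print("train R^2 = {:.3f}, validation R^2 = {:.3f}".format(train_r2, valid_r2))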
Example #4
    def ExtraTreesregressor(self, data):
        train, validacion = data
        x_tr, y_tr = train
        x_val, y_val = validacion
        #print("El set de train tiene {} filas y {} columnas".format(x_tr.shape[0],x_tr.shape[1]))
        #print("El set de validacion tiene {} filas y {} columnas".format(x_val.shape[0],x_val.shape[1]))

        print('Start training ExtraTreesRegressor...')
        start_time = self.timer()

        extr = ExtraTreesRegressor(n_estimators=100)
        extr.fit(x_tr, y_tr)
        print("The R2 is: {}".format(extr.score(x_tr, y_tr)))
        #		print("The alpha choose by CV is:{}".format(krrl.alpha_))
        self.timer(start_time)

        print("Making prediction on validation data")
        y_val = np.expm1(y_val)
        y_val_pred = np.expm1(extr.predict(x_val))
        mae = mean_absolute_error(y_val, y_val_pred)
        print("El mean absolute error de es {}".format(mae))

        print('Saving model into a pickle')
        os.makedirs('pickles', exist_ok=True)

        with open('pickles/extr.pkl', 'wb') as f:
            pickle.dump(extr, f)

        print('Making prediction and saving into a csv')
        y_test = extr.predict(self.x_test)

        return y_test
Example #5
def etr_search(X_train, X_test, y_train, y_test):

    print "R^2 scores calculated on test set:"
    n_jobs = 2
    n = 600
    cv = 0
    max_features = 1.0  # all features; the old alias 'auto' was deprecated in scikit-learn 1.1
    max_score = {}
    max_sc = 0
    for depth in [9]:
        for split in range(30, 100, 10):
            for leaf in range(15, 50, 5):
                start = time.time()
                # tuned_parameters = [{'n_estimators': [200, 500, 1000],
                #                      'max_features': ['auto', 'log2'],
                #                      'min_samples_leaf': [1, 10, 50]}]
                params = {'n_estimators': n, 'max_features': max_features, 'max_depth': depth,
                          'min_samples_split': split, 'min_samples_leaf': leaf, 'n_jobs': n_jobs}
                model = ExtraTreesRegressor(n_estimators=n, n_jobs=n_jobs, max_features=max_features,
                                            max_depth=depth, min_samples_split=split, min_samples_leaf=leaf)
                model.fit(X_train, y_train)
                sc = model.score(X_test, y_test)
                max_score[sc] = params
                if sc > max_sc:
                    max_sc = sc
                end = time.time()

                print "%0.8f for %r    [X_train.shape=%s, cv=%s]  %0.2f min" % \
                (sc, params, str(X_train.shape), cv, (end-start)/60)
    print "The best model is:\n%0.8f for %r    [X_train.shape=%s, cv=%s]" % \
          (max_sc, max_score[max_sc], str(X_train.shape), cv)
Example #6
def dtrees(X_fit, y_fit, X_eval, y_eval, features, dt_file):
    #DTrees
    dtree = tree.DecisionTreeRegressor().fit(X_fit, y_fit)
    accuracy = dtree.score(X_eval, y_eval)
    dt_file.write(f'Single Dtree: {accuracy}\n')

    for feature, imp in zip(features, dtree.feature_importances_):
        dt_file.write("\tFeature %s: %s\n" % (feature, imp))

    pickle.dump(dtree, open('dtree.p', 'wb'))

    #Random Forest Trees
    rf_dtree = RandomForestRegressor(n_estimators=8).fit(X_fit, y_fit)
    accuracy = rf_dtree.score(X_eval, y_eval)
    dt_file.write(f'Random Forest Dtrees: {accuracy}\n')

    #Extremely Randomized Trees
    extra_rf_dtree = ExtraTreesRegressor(n_estimators=8).fit(X_fit, y_fit)
    accuracy = extra_rf_dtree.score(X_eval, y_eval)
    dt_file.write(f'Extremely Randomized Dtrees: {accuracy}\n')

    #Gradient Boosting Trees
    gb_tree = GradientBoostingRegressor(n_estimators=50,
                                        learning_rate=1.0,
                                        max_depth=2,
                                        random_state=0).fit(X_fit, y_fit)
    accuracy = gb_tree.score(X_eval, y_eval)
    dt_file.write(f'Gradient Boosting Dtrees: {accuracy}')
Example #7
def ExtraTreesPredictor(X_train, y_train, X_test, y_test):
    extra_tree = ExtraTreesRegressor(n_estimators=200, random_state=1234)
    extra_tree.fit(X_train, y_train)
    extratree_score = extra_tree.score(X_test, y_test)
    extratree_pred = extra_tree.predict(X_test)
    extratreeRMSE = sqrt(mean_squared_error(y_test, extratree_pred))
    print("Root mean squared error: %.2f" % extratreeRMSE)
    print('R-squared extra trees: %.2f' % r2_score(y_test, extratree_pred))

    features = X_test.columns
    importances = extra_tree.feature_importances_
    indices = np.argsort(importances)

    plt.title('Feature Importances')
    plt.barh(range(len(indices)),
             importances[indices],
             color='b',
             align='center')
    plt.yticks(range(len(indices)), features[indices])
    plt.xlabel('Relative Importance')
    plt.show()
    plt.scatter(y_test, extratree_pred)
    plt.xlabel('Measured')
    plt.ylabel('Predicted')
    plt.title('Extra Trees Predicted vs Actual')
    plt.show()
    chart_regression(extratree_pred, y_test, 'ExtraTrees Predictor')
    return extratree_score, extratreeRMSE
Example #8
def extratrees():
    regressor = ExtraTreesRegressor(n_estimators=50).fit(X_train, y_train)
    y_predictions = regressor.predict(X_test)
    print("Feature importances for ExtraTrees", regressor.feature_importances_)
    return (regressor.score(X_test, y_test),
            sqrt(mean_squared_error(y_test, y_predictions)))
Example #9
def ET():
    global x1, x2, y1, y2, dict1
    model = ExtraTreesRegressor()
    name = "Extra Trees Regressor"
    model.fit(x1, y1)
    y_pred = model.predict(x2)
    error = mean_squared_error(y2, y_pred)
    score = model.score(x2, y2)
    dict1[name] = score
    plotgraph(y_pred, name, error, score)
Example #10
def classfication_ETR(X_train, y_train, X_test, y_test, ss_y, boston):
    # Train an ExtraTreesRegressor and predict on the test data; the result is stored in etr_y_predict.
    etr = ExtraTreesRegressor()
    # https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected
    etr.fit(X_train, y_train.ravel())
    etr_y_predict = etr.predict(X_test)
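    # ss_y is assumed to be the StandardScaler fitted on the target, so inverse_transform
    # reports the errors below in the target's original units.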
    # Evaluate the default extremely randomized forest on the test set with R-squared, MSE and MAE.
    print('R-squared value of ExtraTreesRegressor: ', etr.score(X_test, y_test))
    print('The mean squared error of ExtraTreesRegressor: ', mean_squared_error(
        ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
    print('The mean absolute error of ExtraTreesRegressor: ', mean_absolute_error(
        ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
    # Use the trained extremely randomized forest to report each feature's contribution to the target.
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
Example #11
def dummie_columns_extra_trees(train, test):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format(
        "Extremely Randomized Trees Regression using all but remarks")
    predicting_columns = list(train._get_numeric_data().columns.values)
    predicting_columns.remove("LISTPRICE")
    predicting_columns.remove("SOLDPRICE")
    rf = ExtraTreesRegressor(n_estimators=300, n_jobs=-1)
    rf.fit(train[predicting_columns], train["SOLDPRICE"])
    score = rf.score(test[predicting_columns], test["SOLDPRICE"])
    predictions = rf.predict(test[predicting_columns])
    sample_predictions(test, predictions)
    print "Accuracy: {}\n".format(score)
    return score, predictions
Example #13
def simple_extremely_random_trees(data_train_x, data_test_x, data_train_y, data_test_y):
    from sklearn.ensemble import ExtraTreesRegressor
    print "-- {} --".format("Extremely Randomized Trees Regression using all but remarks")
    rf = ExtraTreesRegressor(
        n_estimators=300,
        n_jobs=-1
    )
    rf.fit(data_train_x, data_train_y)
    sample_predictions(rf.predict(data_test_x), data_test_y)
    score = rf.score(data_test_x, data_test_y)
    cross_validated_scores = cross_val_score(
        rf, data_test_x, data_test_y, cv=5)
    print "MSE Accuracy: {}".format(score)
    print "MSE Across 5 Folds: {}".format(cross_validated_scores)
    print "95%% Confidence Interval: %0.3f (+/- %0.3f)\n" % (cross_validated_scores.mean(), cross_validated_scores.std() * 1.96)
Example #14
def trainRegressorsAndSave(computeScore=False):
    for db in dbs:
        if (not os.path.exists("clfs/" + db)):
            clf = ExtraTreesRegressor(n_estimators=500, random_state=1, n_jobs=-1)
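            # saveTrainedClassifier is assumed to fit clf on the training data for db and persist it under clfs/ (loaded later via joblib).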
            saveTrainedClassifier(db, clf)
        elif (computeScore):
            clf = joblib.load("clfs/" + db)

        if (computeScore):
            print("Loading test data...")
            loaded = loadDB(db + ".csv")
            X_test = loaded[:, 0:-1]
            y_test = loaded[:, -1]

            print("Normalized score is {}".format(clf.score(X_test, y_test)))
            X_test = y_test = 0
Example #16
def extra_trees(X, y, n_est):
    '''
    INPUT: DataFrame with features (X), target variable (y), number of estimators (n_est)
    OUTPUT: score of the ExtraTrees model and a feature-importance dict
    '''
    # standardize the features before splitting so train and test share the same scaling
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    X = (X - mean) / std

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    ext = ExtraTreesRegressor(n_estimators=n_est)
    ext.fit(X_train, y_train)

    scores = ext.score(X_test, y_test)

    return 'ExtraTrees Score: ' + str(scores), dict(zip(X.columns, ext.feature_importances_))
Example #17
def ExtraTrees(Xtrain, Ytrain, Xtest, Ytest):
    """
	Apply the extra trees regressor
	"""
    from sklearn.ensemble import ExtraTreesRegressor
    print('\nExtra trees regressor:')

    clf = ExtraTreesRegressor(n_estimators=100, n_jobs=-1).fit(Xtrain, Ytrain)
    print('Training R^2: {0}'.format(clf.score(Xtrain, Ytrain)))

    #find the training error
    prediction = clf.predict(Xtrain)
    Etrain = error(prediction, Ytrain)
    print('Training error: {0}'.format(Etrain))

    #find the test error
    prediction = clf.predict(Xtest)
    Etest = error(prediction, Ytest)
    print('Test error: {0}'.format(Etest))
Example #18
def Ensemble_test():
    rfr = RandomForestRegressor()
    rfr.fit(X_train, y_train.ravel())
    rfr_y_predict = rfr.predict(X_test)

    etr = ExtraTreesRegressor()
    etr.fit(X_train, y_train.ravel())
    etr_y_predict = etr.predict(X_test)

    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, y_train.ravel())
    gbr_y_predict = gbr.predict(X_test)

    print("对普通随机森林使用R-squared评价标准:{}".format(rfr.score(X_test, y_test)))
    print("对普通随机森林使用MAE评价标准:{}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test),
                            ss_y.inverse_transform(rfr_y_predict))))
    print("对普通随机森林使用MSE评价标准:{}".format(
        mean_squared_error(ss_y.inverse_transform(y_test),
                           ss_y.inverse_transform(rfr_y_predict))))
    print("\n")
    print("对极端回归森林使用R-squared评价标准:{}".format(etr.score(X_test, y_test)))
    print("对极端回归森林使用MAE评价标准:{}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test),
                            ss_y.inverse_transform(etr_y_predict))))
    print("对极端回归森林使用MSE评价标准:{}".format(
        mean_squared_error(ss_y.inverse_transform(y_test),
                           ss_y.inverse_transform(etr_y_predict))))
    print("极端回归森林模型中每种特征对预测目标的贡献度:")
    print(
        np.sort(list(zip(etr.feature_importances_, boston.feature_names)),
                axis=0))
    print("\n")
    print("对梯度提升回归树使用R-squared评价标准:{}".format(gbr.score(X_test, y_test)))
    print("对梯度提升回归树使用MAE评价标准:{}".format(
        mean_absolute_error(ss_y.inverse_transform(y_test),
                            ss_y.inverse_transform(gbr_y_predict))))
    print("对梯度提升回归树使用MSE评价标准:{}".format(
        mean_squared_error(ss_y.inverse_transform(y_test),
                           ss_y.inverse_transform(gbr_y_predict))))
Example #19
def TreeRegressor():
    # Random forests, extremely randomized trees, and gradient boosting regressors
    from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
    boston = load_boston()

    X_train, X_test, Y_train, Y_test = train_test_split(boston.data,
                                                        boston.target,
                                                        test_size=0.25,
                                                        random_state=33)

    # ss_X = StandardScaler()
    # ss_Y = StandardScaler()
    #
    # X_train = ss_X.fit_transform(X_train)
    # X_test = ss_X.transform(X_test)
    # Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
    # Y_test = ss_Y.transform(Y_test.reshape(-1, 1))

    svr = RandomForestRegressor()
    svr.fit(X_train, Y_train)
    y_predict = svr.predict(X_test)
    print('the value of default measurement of RandomForestRegressor is',
          svr.score(X_test, Y_test))

    etr = ExtraTreesRegressor()
    etr.fit(X_train, Y_train)
    y_predict = etr.predict(X_test)
    print('the value of default measurement of ExtraTreesRegressor is',
          etr.score(X_test, Y_test))

    svr = GradientBoostingRegressor()
    svr.fit(X_train, Y_train)
    y_predict = svr.predict(X_test)
    print('the value of default measurement of GradientBoostingRegressor is',
          svr.score(X_test, Y_test))

    import numpy as np
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
Example #20
# Defining the clean dataset
train_data = harmonize_data(train)
test_data  = harmonize_data(test)

# Feature engineering
train_data["FamilySize"] = train_data["SibSp"]+train_data["Parch"]+1

test_data["FamilySize"] = test_data["SibSp"]+test_data["Parch"]+1

# Defining predictor
predictors = ["Sex", "Age", "Pclass", "FamilySize"]

#Applying method
max_score = 0
best_n = 0
for n in range(23,24):
    dtc_scr = 0.
    dtc = ExtraTreesRegressor(max_depth=n)
    # score each held-out fold (assumes sklearn.model_selection.KFold)
    for tr_idx, te_idx in KFold(n_splits=10, shuffle=True).split(train_data):
        dtc.fit(train_data.iloc[tr_idx][predictors], train_data.iloc[tr_idx]["Survived"])
        dtc_scr += dtc.score(train_data.iloc[te_idx][predictors], train_data.iloc[te_idx]["Survived"]) / 10
    if dtc_scr > max_score:
        max_score = dtc_scr
        best_n = n

print(best_n, max_score)
dtc = ExtraTreesRegressor(max_depth=best_n)

# Creating submission
create_submission(dtc, train_data, test_data, predictors, "dtcsurvivors.csv")
Example #21
print('R-squared value of uniform-weighted RandomForestRegressor is',
      rfr.score(X_test, y_test))
print(
    'The mean squared error of uniform-weighted RandomForestRegressor is',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(rfr_y_predict)))
print(
    'The mean absolute error of uniform-weighted RandomForestRegressor is',
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(rfr_y_predict)))

print()

print('R-squared value of uniform-weighted ExtraTreesRegressor is',
      etr.score(X_test, y_test))
print(
    'The mean squared error of uniform-weighted ExtraTreesRegressor is',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(etr_y_predict)))
print(
    'The mean absolute error of uniform-weighted ExtraTreesRegressor is',
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(etr_y_predict)))
# feature importance
print(
    np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))

print()

print('R-squared value of uniform-weighted GradientBoostingRegressor is',
      gbr.score(X_test, y_test))
Example #22
test_pd3['label'] = y_test
print(compute_ks(test_pd3[['label', 'predict']]))
print(clf3.feature_importances_)
# Top Ten
feature_importance = clf3.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

indices = np.argsort(feature_importance)[-10:]
plt.barh(np.arange(10), feature_importance[indices], color='dodgerblue', alpha=.4)
plt.yticks(np.arange(10) + 0.25, np.array(X.columns)[indices])
_ = plt.xlabel('Relative importance'), plt.title('Top Ten Important Variables')

# XTR
clf4 = ExtraTreesRegressor(n_jobs=-1, max_depth=10,random_state=0)
clf4.fit(x_train, y_train)
print(clf4.score(x_test, y_test))
test_pd4 = pd.DataFrame()
test_pd4['predict'] = clf4.predict(x_test)
test_pd4['label'] = y_test
print(compute_ks(test_pd4[['label', 'predict']]))
print(clf4.feature_importances_)
# Top Ten
feature_importance = clf4.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

indices = np.argsort(feature_importance)[-10:]
plt.barh(np.arange(10), feature_importance[indices], color='dodgerblue', alpha=.4)
plt.yticks(np.arange(10) + 0.25, np.array(X.columns)[indices])
_ = plt.xlabel('Relative importance'), plt.title('Top Ten Important Variables')

Example #23
with open('model.txt', 'wt') as f:
  print(xfr, file=f)
with open('estimators_.txt', 'wt') as f:
  #f.write(xfr.estimators_)
  print(xfr.estimators_, file=f)
with open('feature_importances_.txt', 'wt') as f:
  print(xfr.feature_importances_, file=f)
#with open('oob_score_.txt','wt') as f:
#  print(xfr.oob_score_, file=f)
#with open('oob_prediction_.txt','wt') as f:
#  print(xfr.oob_prediction_, file=f)

predict_loc_regres = xfr.predict(data_test)
if 'target_test' in locals():
  score = xfr.score(data_test, target_test)
  gn = normalized_weighted_gini(target_test, predict_loc_regres, data_test.var11)
end = time.clock()

#outdf = pd.DataFrame([data_test.loc[:,'id']])
if 'target_test' in locals():
  target_test.columns = ['true_target']
  outdf = pd.concat([data_test.loc[:, 'id'].astype(int), pd.DataFrame(predict_loc_regres, columns=['target']), target_test], axis=1)
else:
  outdf = pd.concat([data_test.loc[:, 'id'].astype(int), pd.DataFrame(predict_loc_regres, columns=['target'])], axis=1)

out_filename = (os.path.splitext(os.path.basename(sys.argv[1]))[0] + "_predict.csv")
outdf.to_csv(out_filename, index=0)
if 'target_test' in locals():
  print(out_filename, score, gn)
else:
  print(out_filename)
Example #24
#Extra Trees Regression

from sklearn.ensemble import ExtraTreesRegressor
extra_tree = ExtraTreesRegressor(n_estimators=200, random_state=1234)


# In[73]:


extra_tree.fit(X_train, y_train)


# In[74]:


extratree_score = extra_tree.score(X_test, y_test)
extratree_score


# In[75]:


extratree_score = extra_tree.score(X_train, y_train)
extratree_score


# In[76]:


extratree_pred = extra_tree.predict(X_test)
Example #25
X = scaler.transform(X)

timeit("Standardizing the data")
'''

from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.neighbors import KNeighborsRegressor

clf = ExtraTreesRegressor(n_estimators=10)
#clf = KNeighborsRegressor()

clf.fit(X_train, Y_train)

timeit("Training")

print "Validation score: " + str(clf.score(X_test, Y_test))

timeit("Validation")

#score = 0.
#wrong = []
#for i, item in enumerate(X_test):
#	if unconvert(clf.predict(item)[0]) == unconvert(Y_test[i]):
#		score += 1
#	else:
#		wrong.append((unconvert(clf.predict(item)[0]),unconvert(Y_test[i])))
#score /= len(X_test)
#print "Manual validation score: " + str(score)

#timeit("Manual validation")
Example #26
MSEValue = mean_squared_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Squared Error Value is : ', MSEValue)

MdAEValue = median_absolute_error(y_test, y_pred)
print('Median Absolute Error Value is : ', MdAEValue)

print("-------------------------------------")

#apply ExtraTreesRegressor

regressor = ExtraTreesRegressor(n_estimators=200)
regressor.fit(X_train, y_train)

#Calculating Details
print('ExtraTreesRegressorModel Train Score is : ',
      regressor.score(X_train, y_train))
print('ExtraTreesRegressorModel Test Score is : ',
      regressor.score(X_test, y_test))
print('----------------------------------------------------')

#prediction and evaluation

y_pred = regressor.predict(X_test)

print('Predicted Value for ExtraTreesRegressor is : ', y_pred[:5])
print('Actual Value for ExtraTreesRegressor is : ', y_test[:5])

#Calculating Mean Absolute Error

MAEValue = mean_absolute_error(y_test, y_pred, multioutput='uniform_average')
print('Mean Absolute Error Value is : ', MAEValue)
Example #27

re = ExtraTreesRegressor(n_estimators=10, criterion='absolute_error', random_state=0)  # 'mae' was renamed to 'absolute_error' in scikit-learn 1.0
re.fit(X_train,y_train)


# In[17]:


y_pred = re.predict(X_test)


# In[18]:


re.score(X_test, y_test)


# In[ ]:


# plt.figure(figsize=(10, 6))
# #plt.plot(X_test, f(X_test), "b")
# plt.scatter(X_train, y_train, c="b", s=20)
# plt.plot(X_test, regr.predict(X_test), "r", lw=2)
# plt.xlim([-5, 5])


# In[53]:

Example #28
class mixmodels:
    def __init__(self, nest=10):
        self.nest = nest

    def fit(self, data_train, target):
        self.target_train = target
        self.catcol = data_train.filter(like='var').columns.tolist()
        #start_gbr_tr = time.clock()
        self.gbr = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr.fit(data_train,self.target_train)
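        # Note: calling .transform(X, threshold=...) on a fitted ensemble relies on the
        # feature-selection API of old scikit-learn (removed in 0.19);
        # sklearn.feature_selection.SelectFromModel is the modern equivalent.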
        self.transformed_train_gbr = self.gbr.transform(data_train, threshold="0.35*mean")
        self.gbr_tr_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr_tr_fit.fit(self.transformed_train_gbr, self.target_train)
        #end_gbr_tr = time.clock()
        #print >> log, "time_gbr_tr = ", end_gbr_tr-start_gbr_tr

        #start_xfr_tr = time.clock()
        self.xfr = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr.fit(data_train, self.target_train)
        self.transformed_train_xfr = self.xfr.transform(data_train, threshold="0.35*mean")
        self.xfr_tr_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr_tr_fit.fit(self.transformed_train_xfr, self.target_train)
        #end_xfr_tr = time.clock()
        #print >> log, "time_xfr_tr = ", end_xfr_tr-start_xfr_tr

        #start_gbr_cat = time.clock()
        self.gbr_cat_fit = GradientBoostingRegressor(n_estimators=self.nest, max_depth=7)
        self.gbr_cat_fit.fit(data_train[self.catcol], self.target_train)
        #end_gbr_cat = time.clock()
        #print >> log, "time_gbr_cat = ", end_gbr_cat-start_gbr_cat

        #start_xfr_cat = time.clock()
        self.xfr_cat_fit = ExtraTreesRegressor(n_estimators=self.nest, max_depth=7)
        self.xfr_cat_fit.fit(data_train[self.catcol], self.target_train)
        #end_xfr_cat = time.clock()
        #print >> log, "time_xfr_cat = ", end_xfr_cat-start_xfr_cat
        return self

    def predict(self, data_test):
        mix_test_list = []

        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        mix_test_list += [pd.Series(self.gbr_tr_fit.predict(transformed_test_gbr))]

        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        mix_test_list += [pd.Series(self.xfr_tr_fit.predict(transformed_test_xfr))]

        mix_test_list += [pd.Series(self.gbr_cat_fit.predict(data_test[self.catcol]))]

        mix_test_list += [pd.Series(self.xfr_cat_fit.predict(data_test[self.catcol]))]

        mix_test = pd.concat(mix_test_list, axis=1)

        mix_ave = mix_test.mean(axis=1)
        mix_ave.name = 'target'

        return mix_ave
    def score(self, data_test, target_test):
        total_score = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        total_score += [self.gbr_tr_fit.score(transformed_test_gbr, target_test)]
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        total_score += [self.xfr_tr_fit.score(transformed_test_xfr, target_test)]
        total_score += [self.gbr_cat_fit.score(data_test[self.catcol], target_test)]
        total_score += [self.xfr_cat_fit.score(data_test[self.catcol], target_test)]
        return sum(total_score) / float(len(total_score))

    def gini(self, data_test, target_test):
        weight = data_test.var11
        gns = []
        transformed_test_gbr = self.gbr.transform(data_test, threshold="0.35*mean")
        gns += [normalized_weighted_gini(target_test.tolist(), self.gbr_tr_fit.predict(transformed_test_gbr).tolist(), weight.tolist())]
        transformed_test_xfr = self.xfr.transform(data_test, threshold="0.35*mean")
        gns += [normalized_weighted_gini(target_test.tolist(), self.xfr_tr_fit.predict(transformed_test_xfr).tolist(), weight.tolist())]
        gns += [normalized_weighted_gini(target_test.tolist(), self.gbr_cat_fit.predict(data_test[self.catcol]).tolist(), weight.tolist())]
        gns += [normalized_weighted_gini(target_test.tolist(), self.xfr_cat_fit.predict(data_test[self.catcol]).tolist(), weight.tolist())]
        return sum(gns) / float(len(gns))
Example #29
    score = gbr.score(X_test, Y_test)
    print('Problem 2 part 4 Test score : {}'.format(score))
    
    
    
    etr = ExtraTreesRegressor(n_estimators=100, max_depth=8, min_samples_leaf=2)
    
    etr.fit(X_train, Y_train)
    
    Y_etr = etr.predict(X_test)    
    score = r2_score(Y_test.values, Y_etr)
    print('Problem 2 part 5a Test score : {}'.format(score))
    

    score = etr.score(X_test, Y_test)
    print('Problem 2 part 5b Test score : {}'.format(score))
    



if(runProblem3):
    from keras.models import Sequential
    from keras.layers.core import Activation, Dense, Dropout
    from keras.callbacks import EarlyStopping


    #X = dataset[['Feature_5', 'Feature_7','Ret_MinusTwo', 'Ret_MinusOne']+['Ret_{}'.format(i) for i in range(2,121)]]
    #Y = dataset['Ret_MinusZero']

    #X['Feature_5'] = (X['Feature_5'] - np.mean(X['Feature_5']))/np.std(X['Feature_5'])
Example #30
# Evaluate the default random regression forest on the test set with R-squared, MSE and MAE
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print(
    'The mean squared error of RandomForestRegressor:',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(rfr_y_predict)))
print(
    'The mean absolute error of RandomForestRegressor:',
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(rfr_y_predict)))
print(
    '---------------------------------------divider---------------------------------------'
)

# Evaluate the default extremely randomized forest on the test set with R-squared, MSE and MAE
print('R-squared value of ExtraTreesRegressor:', etr.score(X_test, y_test))
print(
    'The mean squared error of ExtraTreesRegressor:',
    mean_squared_error(ss_y.inverse_transform(y_test),
                       ss_y.inverse_transform(etr_y_predict)))
print(
    'The mean absolute error of ExtraTreesRegressor:',
    mean_absolute_error(ss_y.inverse_transform(y_test),
                        ss_y.inverse_transform(etr_y_predict)))
print(
    '---------------------------------------divider---------------------------------------'
)

# Use the trained extremely randomized forest to report each feature's contribution to the target
print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
print(
Example #31
# Train a GradientBoostingRegressor and predict on the test data; the result is stored in gbr_y_predict.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error
# Evaluate the default random regression forest on the test set with R-squared, MSE and MAE.
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print('The mean squared error of RandomForestRegressor:',
      mean_squared_error(y_test, rfr_y_predict))
print('The mean absolute error of RandomForestRegressor:',
      mean_absolute_error(y_test, rfr_y_predict))

# Evaluate the default extremely randomized forest on the test set with R-squared, MSE and MAE.
print('R-squared value of ExtraTreesRegressor:', etr.score(X_test, y_test))
print('The mean squared error of ExtraTreesRegressor:',
      mean_squared_error(y_test, etr_y_predict))
print('The mean absolute error of ExtraTreesRegressor:',
      mean_absolute_error(y_test, etr_y_predict))

# Use the trained extremely randomized forest to report each feature's contribution to the target.
feature_importance = zip(etr.feature_importances_, boston.feature_names)
print(np.sort(list(feature_importance), axis=0))
# Evaluate the default gradient boosting trees on the test set with R-squared, MSE and MAE.
print('R-squared value of GradientBoostingRegressor:',
      gbr.score(X_test, y_test))
print('The mean squared error of GradientBoostingRegressor:',
      mean_squared_error(y_test, gbr_y_predict))
print('The mean absolute error of GradientBoostingRegressor:',
      mean_absolute_error(y_test, gbr_y_predict))
Example #32
rfr = RandomForestRegressor(n_estimators=300,
                            max_depth=14,
                            max_features="log2")
rfr.fit(X_train, y_train)

ext = ExtraTreesRegressor(n_estimators=300, max_depth=14, max_features="log2")
ext.fit(X_train, y_train)

dtr = DecisionTreeRegressor(max_depth=14, max_features="log2")
dtr.fit(X_train, y_train)

print("Random Forest Model")
print("Train Score {}".format(rfr.score(X_train, y_train)))
print("Test Score {}".format(rfr.score(X_test, y_test)))
print("\n")
print("ExtraTreesRegressor Model")
print("Train Score {}".format(ext.score(X_train, y_train)))
print("Test Score {}".format(ext.score(X_test, y_test)))
print("\n")
print("DecisonTree Model")
print("Train Score {}".format(dtr.score(X_train, y_train)))
print("Test Score {}".format(dtr.score(X_test, y_test)))

print("\n")
print("\n")

print("Random Forest Model")
print("SMAPE Score {}".format(
    symmetric_mean_absolute_percentage_error(y_test, rfr.predict(X_test))))
print("ExtraTreesRegressor Model")
print("SMAPE Score {}".format(
    symmetric_mean_absolute_percentage_error(y_test, ext.predict(X_test))))
Example #33
etr_y_predict = etr.predict(X_test)

# Train a GradientBoostingRegressor and predict on the test data; the result is stored in gbr_y_predict.
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)

from sklearn.metrics import mean_absolute_error,mean_squared_error
# Evaluate the default random regression forest on the test set with R-squared, MSE and MAE.
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print('The mean squared error of RandomForestRegressor:', mean_squared_error(y_test, rfr_y_predict))
print('The mean absolute error of RandomForestRegressor:', mean_absolute_error(y_test, rfr_y_predict))


# Evaluate the default extremely randomized forest on the test set with R-squared, MSE and MAE.
print('R-squared value of ExtraTreesRegressor:', etr.score(X_test, y_test))
print('The mean squared error of ExtraTreesRegressor:', mean_squared_error(y_test, etr_y_predict))
print('The mean absolute error of ExtraTreesRegressor:', mean_absolute_error(y_test, etr_y_predict))

# Use the trained extremely randomized forest to report each feature's contribution to the target.
feature_importance = zip(etr.feature_importances_, boston.feature_names)
print(np.sort(list(feature_importance), axis=0))
# Evaluate the default gradient boosting trees on the test set with R-squared, MSE and MAE.
print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, y_test))
print('The mean squared error of GradientBoostingRegressor:', mean_squared_error(y_test, gbr_y_predict))
print('The mean absolute error of GradientBoostingRegressor:', mean_absolute_error(y_test, gbr_y_predict))


# Many industry practitioners who build commercial analytics systems favor ensemble models,
# and often use their performance as the baseline when comparing newly designed models.
Example #34
import pandas as pd
conc = pd.read_csv('concrete.csv')
conc = conc[conc.columns[1:10]]
y = conc['strength']
X = conc.drop(['strength'], axis=1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=54,
                                                    shuffle=True)
from sklearn.ensemble import ExtraTreesRegressor
clf = ExtraTreesRegressor(n_estimators=382,
                          max_depth=None,
                          min_samples_split=2,
                          random_state=7)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
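# Note: impurity-based feature_importances_ can overstate high-cardinality features;
# permutation importance is a common cross-check.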
dic = dict(zip(X.columns, clf.feature_importances_))
for item in sorted(dic.items(), key=lambda x: x[1], reverse=True):
    print(item[0], round(item[1], 4))
Example #35
                    forest = ExtraTreesRegressor(n_estimators=nestt,
                                                 max_features=maxff,
                                                 max_depth=maxdd,
                                                 min_samples_leaf=minss,
                                                 n_jobs=processorsIn,
                                                 random_state=int(random.random()*200))
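                    # Note: seeding random_state from random.random() makes each run non-reproducible;
                    # a fixed integer seed would give repeatable results.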
                    forest.fit(x_train, y_train)

                    importances = forest.feature_importances_
                    indices = np.argsort(importances)[::-1]
                    #featureNames
                    feat_df = pd.DataFrame({'FeatureNm':x_train.columns})
                    feat_df['Importance'] = forest.feature_importances_
                    feat_df = pd.DataFrame(feat_df.sort_values(by='Importance', ascending=False))
                    print(feat_df[:20])
                    scoress = forest.score(x_test, y_test)
                    shtname = 'F' +str(indx) + 'e' +str(nestt) + '_' +str(maxff)+ '_' +str(maxdd) + '_' +str(minss) + '_' + str(len(x_train)) + '_' + str(int(scoress*1000))
                    print('Runparams:' + str(shtname))
                    print('Number of observations in set:' + str(len(x_train)))
                    print("Score: ", int(scoress*1000))

                    scores_list.append([indx, nestt, maxff, maxdd, minss, len(x_train), int(scoress*1000), mean_squared_error(y_test, forest.predict(x_test)), sample])
                    print('----------------------')
                    print('----------------------')
                    # Convert the dataframe to an XlsxWriter Excel object.
                    feat_df.to_excel(writer, sheet_name=shtname)

                    indx += 1
    score_df = pd.DataFrame(scores_list, columns=['index', 'nest', 'maxfeatures', 'maxdepth', 'minss', 'obs', 'scores', 'mse', 'tableName'])
    score_df.to_excel(writer, sheet_name='overview')
    # Close the Pandas Excel writer and output the Excel file.
Example #36
# In[14]:

# Evaluate the default random regression forest on the test set with the R-squared, MSE and MAE metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print('R-squared value of RandomForestRegressor:', rfr.score(X_test, y_test))
print('The mean squared error of RandomForestRegressor:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))
print('The mean absolute error of RandomForestRegressor:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))


# ### ExtraTreesRegressor (ETR)

# In[51]:

# Evaluate the default extremely randomized forest on the test set with the R-squared, MSE and MAE metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print('R-squared value of ExtraTreesRegressor:', etr.score(X_test, y_test))
print('The mean squared error of ExtraTreesRegressor:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
print('The mean absolute error of ExtraTreesRegressor:', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))

# Use the trained extremely randomized forest to report each feature's contribution to the prediction target
sorted(zip(etr.feature_importances_, boston.feature_names))


# ### GradientBoostingRegressor (GBR)

# In[52]:

# Evaluate the default gradient boosting regression trees on the test set with the R-squared, MSE and MAE metrics
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
print('R-squared value of GradientBoostingRegressor:', gbr.score(X_test, y_test))
print('The mean squared error of GradientBoostingRegressor:', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(gbr_y_predict)))
Example #37
### Random forest regression model ###
# Use RandomForestRegressor's built-in scoring method and print the result
print('Random forest regression model score: ', rfr.score(X_test, y_test))
# Evaluate the model on the same test set with the R-squared, MSE and MAE metrics
# Import r2_score, mean_squared_error and mean_absolute_error from sklearn.metrics for regression evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Use r2_score and print the result
print('Coefficient of determination R-squared: ', r2_score(y_test, rfr_y_predict))
# Use mean_squared_error and print the result
print('Mean Squared Error: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))
# Use mean_absolute_error and print the result
print('Mean Absolute Error: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))

### Extremely randomized forest regression model ###
# Use ExtraTreesRegressor's built-in scoring method and print the result
print('Extremely randomized forest regression model score: ', etr.score(X_test, y_test))
# Evaluate the model on the same test set with the R-squared, MSE and MAE metrics
# Import r2_score, mean_squared_error and mean_absolute_error from sklearn.metrics for regression evaluation
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Use r2_score and print the result
print('Coefficient of determination R-squared: ', r2_score(y_test, etr_y_predict))
# Use mean_squared_error and print the result
print('Mean Squared Error: ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
# Use mean_absolute_error and print the result
print('Mean Absolute Error: ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))

### Gradient boosting regression trees model ###
# Use GradientBoostingRegressor's built-in scoring method and print the result
print('Gradient boosting regression trees model score: ', gbr.score(X_test, y_test))
# Evaluate the model on the same test set with the R-squared, MSE and MAE metrics
# Import r2_score, mean_squared_error and mean_absolute_error from sklearn.metrics for regression evaluation
Example #38
reg8.fit(X_train, y_train)
reg1.fit(X_train, y_train)
reg2.fit(X_train, y_train)
reg3.fit(X_train, y_train)
ereg.fit(X_train, y_train)
reg4.fit(X_train, y_train)
reg5.fit(X_train, y_train)
reg6.fit(X_train, y_train)
# reg7.fit(X_train, y_train)
print("GradientBoostingRegressor:", reg1.score(X_test, y_test))
print("RandomForestRegressor:", reg2.score(X_test, y_test))
print("LinearRegression:", reg3.score(X_test, y_test))
print("VotingRegressor:", ereg.score(X_test, y_test))
print("AdaBoostRegressor:", reg4.score(X_test, y_test))
print("BaggingRegressor:", reg5.score(X_test, y_test))
print("ExtraTreesRegressor:", reg6.score(X_test, y_test))
# print("StackingRegressor:", reg7.score(X_test, y_test))
print("XGBRegressor:", reg8.score(X_test, y_test))

XGBpredictions = reg8.predict(X_test)
MAE = mean_absolute_error(y_test, XGBpredictions)
print('XGBoost validation MAE = ', MAE)
xx = []
# try:
#     file = open('regression.csv', 'w', newline='')
#     file_w = csv.writer(file)
# except Exception:
#     print('regression.csv open failed')
#     exit()
# names = ['test', 'prediction']
# file_w.writerow(names)