Example No. 1
    predictions = np.zeros_like(valid_salaries)
    for cur_class_id in range(num_classes + 1):
        predictions_part, idx = predict(cur_class_id)
        if idx is not None:
            predictions[idx] = predictions_part
            print "Part MAE: ", metric(valid_salaries[idx], predictions_part)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(ExtraTreesRegressor(), name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
#oob_predictions = classifier.oob_prediction_
#mae_oob = mean_absolute_error(salaries, oob_predictions)
#print "MAE OOB: ", mae_oob
#classifier1 = ExtraTreesRegressor(n_estimators=n_trees,
#verbose=1,
#n_jobs=3,
#oob_score=False,
#min_samples_split=min_samples_split,
#random_state=3465343)
#scores = cross_val_score(classifier1, features, salaries, cv=3, score_func=metric, verbose=1)
#print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#mae_cv = "%0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
#dio.save_model(classifier, name, mae_cv=mae_cv, parameters=param)
Example No. 2
def createExtraTree():
    clf = ExtraTreesRegressor(n_estimators=50)
    return clf
Example No. 3
X.head()

# In[34]:

y.head()

# In[35]:

X.shape, y.shape

# In[36]:

# now let's look at the feature importances
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()

# In[37]:

model.fit(X, y)

# In[38]:

print(model.feature_importances_)

# In[39]:

# let's plot the feature importances for better visualization
feat_importance = pd.Series(model.feature_importances_, index=X.columns)
feat_importance
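
# The comment above promises a plot; a minimal sketch (assuming matplotlib is
# installed and X is a DataFrame with named columns) could be:
import matplotlib.pyplot as plt

feat_importance.nlargest(10).plot(kind='barh')  # ten most important features
plt.xlabel("importance")
plt.show()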
Example No. 4
!pip3 install catboost

from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor
from catboost import CatBoostRegressor

kf = KFold(shuffle=True, random_state=19)

trees = {
    'linear': LinearRegression(),
    'randomfor': RandomForestRegressor(random_state=19), 
    'gradientb': GradientBoostingRegressor(random_state=19), 
    'xgb': XGBRegressor(random_state=19), 
    'xgbrf': XGBRFRegressor(random_state=19), 
    'catboost': CatBoostRegressor(random_state=19, silent=True),
    'DecisionTr': DecisionTreeRegressor(random_state=19),
    'extratre': ExtraTreesRegressor(random_state=19),
    
}

scores = []
rmse=[]
mse=[]
mae=[]
for train_index, test_index in kf.split(X):
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

from sklearn import metrics 
regression_model = LinearRegression()
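
# A hedged sketch (not in the original snippet) of how the `trees` dict and the
# KFold splitter above might be combined to compare the regressors; the MAE
# metric choice and the numpy import are assumptions.
import numpy as np

for model_name, model in trees.items():
    fold_mae = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)
        fold_mae.append(metrics.mean_absolute_error(y_test, model.predict(X_test)))
    print(model_name, np.mean(fold_mae))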
Example No. 5
x_t1 = np.zeros(hours)
u_T = np.empty((0, hours))
u_t = np.zeros(hours)
r_T = np.empty((0, hours))
r_t = np.zeros(hours)
q_T = np.empty((0, hours))
q_t = np.zeros(hours)
rew = []
r_cum = 0

trees = []
model = []
for i in range(hours):
    inputs = pd.DataFrame({'X': [x_t[i]], 'U': [u_t[i]], 't': [i + 7]})
    outputs = q_t[i]
    model = ExtraTreesRegressor(n_estimators=50)
    model.fit(inputs, [outputs])
    trees.append(model)

"Building the set of four tuples x_t, u_t, r_t, x_t+1"
for d in range(days):
    x_k = 0
    r_cum = 0
    for i, t in enumerate(t_time):
        x_t[i] = x_k
        model = trees[i]
        if np.random.random() > epsilon:
            a = []
            for u in U:
                a.append(
                    model.predict(
# Ensemble

ensembles = []
ensembles.append(('ScalesAB',
                  Pipeline([('Scaler', StandardScaler()),
                            ('AB', AdaBoostRegressor())])))
ensembles.append(('ScalesGBM',
                  Pipeline([('Scaler', StandardScaler()),
                            ('GBM', GradientBoostingRegressor())])))
ensembles.append(('ScalesRF',
                  Pipeline([('Scaler', StandardScaler()),
                            ('RF', RandomForestRegressor())])))
ensembles.append(('ScalesET',
                  Pipeline([('Scaler', StandardScaler()),
                            ('ET', ExtraTreesRegressor())])))

results = []
names = []
for name, model in ensembles:
    kfold = KFold(n_splits=num_folds, random_state=seed)
    cv_results = cross_val_score(model,
                                 X_train,
                                 Y_train,
                                 cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
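
# A minimal follow-up sketch (assuming matplotlib is available): visualize the
# cross-validation scores collected above.
import matplotlib.pyplot as plt

plt.boxplot(results, labels=names)
plt.title('Ensemble algorithm comparison')
plt.show()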
Example No. 7
mae_rf = metrics.mean_absolute_error(y_test, forest_test_pred)

#%%
from sklearn.model_selection import cross_val_score

forest_scores = cross_val_score(regr,
                                x,
                                y,
                                cv=5,
                                scoring='neg_mean_absolute_error')
print(f'Mean cross-validated MAE for RF: {-np.mean(forest_scores)}')

#%%
extra = ExtraTreesRegressor(n_estimators=300,
                            criterion='mae',
                            random_state=1,
                            n_jobs=-1)
regr = TransformedTargetRegressor(regressor=extra,
                                  func=func,
                                  inverse_func=inverse_func)
regr.fit(x_train, y_train)
extra_train_pred = regr.predict(x_train)
extra_test_pred = regr.predict(x_test)

print('MAE train data: %.3f, MAE test data: %.3f' %
      (metrics.mean_absolute_error(y_train, extra_train_pred),
       metrics.mean_absolute_error(y_test, extra_test_pred)))

mae_extra = metrics.mean_absolute_error(y_test, extra_test_pred)

#%%
Example No. 8
import pandas as pd
import sklearn
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv('deploy_df')
df.drop('Unnamed: 0', axis=1, inplace=True)
print(df.head())
x = df.drop('rate', axis=1)
y = df['rate']
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=.3,
                                                    random_state=10)

#Building a Model

from sklearn.ensemble import ExtraTreesRegressor
ET_Model = ExtraTreesRegressor(n_estimators=120)
ET_Model.fit(x_train, y_train)

y_predict = ET_Model.predict(x_test)
print(y_predict)

# Let's save the model using pickle
import pickle
pickle.dump(ET_Model, open('model.pkl', 'wb'))
model = pickle.load(open('model.pkl', 'rb'))
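
# Hedged sanity check (an addition, not part of the original): the reloaded model
# should reproduce ET_Model's predictions on the held-out split.
from sklearn.metrics import r2_score
print(r2_score(y_test, model.predict(x_test)))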
Example No. 9
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import Matern
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'], random_state=123)

# Average CV score on the training set was: 0.9721928061326809
exported_pipeline = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        SelectFromModel(estimator=ExtraTreesRegressor(
            max_features=0.6000000000000001, n_estimators=100),
                        threshold=0.2)),
    GaussianProcessRegressor(kernel=Matern(length_scale=3.7, nu=1.5),
                             n_restarts_optimizer=20,
                             normalize_y=False))
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 123)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
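
# Hedged follow-up: evaluate the exported pipeline on the held-out split
# (the r2_score import is an addition).
from sklearn.metrics import r2_score
print(r2_score(testing_target, results))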
Example No. 10
    support_regressor.score(X_test, y_test)))

dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
print("Coefficient of determination R^2 <-- on train set: {}".format(
    dtr.score(X_train, y_train)))
print("Coefficient of determination R^2 <-- on test set: {}".format(
    dtr.score(X_test, y_test)))

indiana_jones = Lasso(alpha=1.0)
indiana_jones.fit(X_train, y_train)
print("Coefficient of determination R^2 <-- on train set : {}".format(
    indiana_jones.score(X_train, y_train)))
print("Coefficient of determination R^2 <-- on test set: {}".format(
    indiana_jones.score(X_test, y_test)))

etr = ExtraTreesRegressor(n_estimators=300)
etr.fit(X_train, y_train)

print(etr.feature_importances_)
indices = np.argsort(etr.feature_importances_)[::-1]

plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w')
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]),
        etr.feature_importances_[indices],
        color="r",
        align="center")
plt.xticks(range(X_train.shape[1]), indices)
plt.show()
Example No. 11
xtrain, ytrain, xtest, x_cv, y_cv = np.array(train)[:, :20], np.array(
    train)[:, 20], np.array(test)[:, :20], np.array(data_10)[:, :20], np.array(
        data_10)[:, 20],
print('train', np.array(train).shape)
print(xtrain[1])
print('xtrain', xtrain.shape)
print('ytrain', ytrain.shape)
print('test', xtest.shape)
estimators = 100
#sup_vec = svm.SVC(C=11000, verbose = 2, probability=True)
#sup_vec = RandomForestRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)
#sup_vec = ExtraTreesRegressor(n_estimators=estimators, verbose=2, n_jobs=-1, max_leaf_nodes=100)

#sup_vec =  AdaBoostRegressor(RandomForestRegressor(n_estimators=100, verbose=2, n_jobs = -1),n_estimators=100)
sup_vec = AdaBoostRegressor(ExtraTreesRegressor(n_estimators=100,
                                                verbose=2,
                                                n_jobs=-1),
                            n_estimators=160,
                            loss='exponential')

#sup_vec =  AdaBoostRegressor(DecisionTreeRegressor(max_depth=10),n_estimators=300)

#dt_stump = DecisionTreeClassifier(max_depth=4, min_samples_leaf=1)
#dt_stump.fit(xtrain, ytrain)
#dt_stump_err = 1.0 - dt_stump.score(xtrain, ytrain)
#n_estimators = 400
# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
#learning_rate = 1.

#sup_vec = AdaBoostClassifier(
#    base_estimator=dt_stump,
Example No. 12
# ExtraTreesRegressor: rather than trying every split on every variable, try a few random splits on a few variables, thus reducing correlation between the individual trees
########################################

from BlueBookForBullDozers.utilities import proc_df, split_vals, print_score, draw_tree, plt, set_rf_samples

# read the dataframe from the saved feather file
df_raw = pd.read_feather("tmp/raw")

# create the instance of the model we want
# if we remove max_depth, #leaves = #rows in the data set, i.e. highly overfit trees
# m = RandomForestRegressor(n_jobs=-1, n_estimators=1, max_depth=3, bootstrap=False)
# min_samples_leaf=3: stop splitting when the number of samples in a leaf becomes <= 3
# max_features=0.5: only use half of the columns at every split
m = ExtraTreesRegressor(n_jobs=-1,
                        n_estimators=40,
                        min_samples_leaf=3,
                        max_features=0.5,
                        oob_score=False)

# replace categorical variables by their numeric codes, handle missing values and separate out SalePrice
df, y, _ = proc_df(df_raw, y_fld="SalePrice")

# create validation set and the final training set
n_validation = 12000
n_training = len(df) - n_validation
X_training, X_validation = split_vals(df, n_training)
y_training, y_validation = split_vals(y, n_training)

# rather than limiting the training data to the first 30k rows or any contiguous subset, sample a random 20k rows.
# This way, given enough trees, the forest still gets access to the entire data set.
set_rf_samples(20000)
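
# A hedged next step (not shown in the original fragment): fit the extra-trees
# model on the training split and compare R^2 on the training and validation sets.
m.fit(X_training, y_training)
print(m.score(X_training, y_training), m.score(X_validation, y_validation))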
Example No. 13

""" 5:  t-SNE """
tsne = manifold.TSNE(n_components=3)
model_tsne = tsne.fit(d7)
model_tsne.metric

data_tsne = tsne.fit_transform(d7)
data_pca_fa_ica_tsne = tsne.fit_transform(data_pca_fa_ica)

"""------------------------------------------------------------------------ """
"""Step 15: Regression Modelling """
"""------------------------------------------------------------------------ """

"""Regression Models """
forest = ExtraTreesRegressor(n_estimators=250, random_state=0)
rforest = RandomForestRegressor()
params = {'n_estimators': 10, 'max_depth': 10, 'min_samples_split': 10,
          'learning_rate': 0.1, 'loss': 'ls'}
clf = ensemble.GradientBoostingRegressor(**params)


""" 1: ML No Dimensionality Reduction """
"""------------------------------------------------------------------------ """
X = d7.copy(deep=True)
y=target
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
Example No. 14
score = clf.score(X_test, Y_test)

print(score)

y_pred = clf.predict(X_test)

names = [
    "Decision Tree Regressor", "MLP Regressor", "Random Forest Regressor",
    "AdaBoost", "Bagging Regressor", "Extra Trees Regressor"
]

classifiers = [
    DecisionTreeRegressor(max_depth=5, max_features=1),
    MLPRegressor(alpha=1, max_iter=200, power_t=0.9, batch_size=50),
    RandomForestRegressor(max_depth=5, max_features=1, n_estimators=10),
    AdaBoostRegressor(n_estimators=10),
    BaggingRegressor(max_features=1, n_estimators=10, base_estimator=clf),
    ExtraTreesRegressor(max_depth=5)
]

for name, clf in zip(names, classifiers):
    clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    y_pred = clf.predict(X_test)
    print(name + ": " + str(score))
    msle = mean_squared_log_error(Y_test, y_pred)
    print('MSLE: %.4f' % msle)
    # print(confusion_matrix(Y_test,y_pred,labels=None))
    # print(cohen_kappa_score(Y_test,y_pred, labels=None))
    # print(classification_report(Y_test,y_pred,labels=None))
                                     n_jobs=20,
                                     random_state=2017,
                                     max_features="auto",
                                     verbose=1)
adaboost = AdaBoostRegressor(n_estimators=30,
                             random_state=2017,
                             learning_rate=0.01)
gbdt = GradientBoostingRegressor(learning_rate=0.04,
                                 n_estimators=100,
                                 subsample=0.8,
                                 random_state=2017,
                                 max_depth=5,
                                 verbose=1)
extratree = ExtraTreesRegressor(n_estimators=600,
                                max_depth=8,
                                max_features="auto",
                                n_jobs=20,
                                random_state=2017,
                                verbose=1)
lr_reg = LinearRegression(n_jobs=-1)

#############################################################################################################################################
# parameters : regression ###################################################################################################################
#############################################################################################################################################

if __name__ == '__main__':
    for i in range(1, folds + 1):
        train_xgboost_regression(trainingSet, testingSet, feature_names, i,
                                 nbags, xgbModelName, xgbParameters,
                                 num_rounds)
    fulltrain_xgboost_regression(trainingSet, testingSet, feature_names, nbags,
                                 xgbModelName, xgbParameters, num_rounds)
X_train, X_test, y_train, y_test = train_test_split(dat1, target, test_size = 0.2, random_state=42)
y_train = y_train.values.ravel()

models = []
models.append(('SVR', SVR()))
models.append(('KNN', KNeighborsRegressor()))
models.append(('DT', DecisionTreeRegressor()))
models.append(('RF', RandomForestRegressor()))
models.append(('l', Lasso()))
models.append(('EN', ElasticNet()))
models.append(('R', Ridge()))
models.append(('BR', BayesianRidge()))
models.append(('GBR', GradientBoostingRegressor()))
models.append(('AB', AdaBoostRegressor()))
models.append(('ET', ExtraTreesRegressor()))
models.append(('BgR', BaggingRegressor()))

scoring = 'neg_mean_squared_error'

results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=42)
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

pipeline = make_pipeline(preprocessing.StandardScaler(), GradientBoostingRegressor(random_state=42))
Example No. 17
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE',
                        sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=42)

imputer = Imputer(strategy="median")
imputer.fit(training_features)
training_features = imputer.transform(training_features)
testing_features = imputer.transform(testing_features)

# Score on the training set was: -0.04520857750956616
exported_pipeline = make_pipeline(
    VarianceThreshold(threshold=0.15000000000000002),
    ExtraTreesRegressor(bootstrap=False,
                        max_features=0.6500000000000001,
                        min_samples_leaf=11,
                        min_samples_split=8,
                        n_estimators=100))

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
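
# Hedged follow-up: report the held-out mean squared error of the exported
# pipeline (the metric import is an addition).
from sklearn.metrics import mean_squared_error
print(mean_squared_error(testing_target, results))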
Example No. 18
    def feature_selection(self, X, y, method):
        """
        purpose:    select features
        input:  X: training data
                y: label
                method: method to use
        return:
        """
        X_indices = np.arange(X.shape[-1])

        score = []

        # Removing features with low variance

        # correlation coefficient
        # SelectKBest(lambda X,Y: np.array(map(lambda x: pearsonr(x, Y), X.T)).T, k=2).fit_transform(data, target)

        # mutual information
        # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(data, target)

        # Univariate feature selection (for classification)
        if method == 'chi-squared':
            skb = SelectKBest(chi2)
            skb.fit_transform(X, y)
            score = skb.scores_

        # Univariate feature selection (for regression)
        if method == 'f_regression':
            skb = SelectKBest(f_regression)
            skb.fit_transform(X, y)
            score = skb.scores_

        # L1-based feature selection (for classification)
        if method == 'LinearSVC':
            lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
            sfm = SelectFromModel(lsvc, prefit=True)
            X_new = sfm.transform(X)

        # L1-based feature selection (for regression)
        elif method == 'LassoCV':
            lasso = LassoCV().fit(X, y)
            score = lasso.coef_
            sfm = SelectFromModel(lasso, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classification)
        elif method == 'ExtraTreesClassifier':
            clf = ExtraTreesClassifier()
            clf = clf.fit(X, y)
            print(clf.feature_importances_)
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'ExtraTreesRegressor':
            clf = ExtraTreesRegressor()
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for classifier)
        elif method == 'GradientBoostingClassifier':
            clf = GradientBoostingClassifier(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Tree-based feature selection (for regression)
        elif method == 'GradientBoostingRegressor':
            clf = GradientBoostingRegressor(learning_rate=0.01)
            clf = clf.fit(X, y)
            score = clf.feature_importances_
            sfm = SelectFromModel(clf, threshold=0.25, prefit=True)
            X_new = sfm.transform(X)

        # Print the feature ranking
        indices = np.argsort(score)[::-1]
        print("Feature ranking:")
        for f in X_indices:
            print("feature %d: %s  (%f)" %
                  (indices[f], self.columns[indices[f]], score[indices[f]]))

        #draw plot
        plt.figure()
        # plt.bar(indices, score, width=0.2, color='r')
        plt.barh(indices, score, height=0.2, color='r')
        plt.title(method)
        plt.xlabel("score")
        plt.ylabel("feature")
        plt.grid(axis='x')
        plt.show()

        pass
Example No. 19
n_faces = 5
rng = check_random_state(4)
face_ids = rng.randint(test.shape[0], size=(n_faces, ))
test = test[face_ids, :]

n_pixels = data.shape[1]
X_train = train[:, :int(np.ceil(0.5 * n_pixels))]  # Upper half of the faces
y_train = train[:, int(np.floor(0.5 * n_pixels)):]  # Lower half of the faces

X_test = test[:, :int(np.ceil(0.5 * n_pixels))]
y_test = test[:, int(np.floor(0.5 * n_pixels)):]

# Fit estimators
ESTIMATORS = {
    "Extra trees":
    ExtraTreesRegressor(n_estimators=10, max_features=32, random_state=0),
    "K-nn":
    KNeighborsRegressor(),
    "Linear regression":
    LinearRegression(),
    "Ridge":
    RidgeCV(),
    "Lasso":
    Lasso(),

    #   "ElasticNet_0.5": ElasticNet(alpha=100000, l1_ratio=0.001),

    #    "ElasticNet_0.1" : ElasticNet(alpha=0.0001, l1_ratio=0.01),
}

y_test_predict = dict()
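
# Completing the loop in the style of the scikit-learn face-completion example
# (a sketch consistent with the ESTIMATORS dict above): fit on the upper halves
# and predict the lower halves of the faces.
for est_name, estimator in ESTIMATORS.items():
    estimator.fit(X_train, y_train)
    y_test_predict[est_name] = estimator.predict(X_test)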
Example No. 20
#         dft = dft.drop(columns = i)
#         dfv = dfv.drop(columns = i)
#         dfkaggle2 = dfkaggle2.drop(columns = i)
        
        l.append(i)
        
encoder = ce.CatBoostEncoder(cols=l, return_df=1, drop_invariant=1, handle_missing=False, sigma=None, a=2)
encoder.fit(X=dft, y=dft['Total_Yearly_Income_EUR'])
dft = encoder.transform(dft)
dfv = encoder.transform(dfv)
dfkaggle2 = encoder.transform(dfkaggle2)
            
        
imp = IterativeImputer(max_iter=3,
                       estimator=ExtraTreesRegressor(),
                       n_nearest_features=5)

dfc = dft.copy()
dft = dft.drop(columns = 'Total_Yearly_Income_EUR')

dfvc = dfv.copy()
dfv = dfv.drop(columns = 'Total_Yearly_Income_EUR')

dfkc = dfkaggle2.copy()
dfkaggle2 = dfkaggle2.drop(columns = 'Total_Yearly_Income_EUR')

dfcolumns = dft.columns
imp.fit(dft)
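
# Hedged sketch of the intended next step: apply the fitted imputer and restore
# the pandas column labels (an assumption about how the snippet continues).
dft = pd.DataFrame(imp.transform(dft), columns=dfcolumns)
dfv = pd.DataFrame(imp.transform(dfv), columns=dfv.columns)
dfkaggle2 = pd.DataFrame(imp.transform(dfkaggle2), columns=dfkaggle2.columns)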
Example No. 21
boston = load_boston()
X, y = boston.data, boston.target

# Make train/test split
# As usual in machine learning task we have X_train, y_train, and X_test
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=0)

# print(X_train.shape)

# First-layer model
models_1 = [
    ExtraTreesRegressor(random_state=0,
                        n_jobs=1,
                        n_estimators=100,
                        max_depth=3),
    RandomForestRegressor(random_state=0,
                          n_jobs=1,
                          n_estimators=100,
                          max_depth=3),
    XGBRegressor(random_state=0,
                 n_jobs=1,
                 learning_rate=0.1,
                 n_estimators=100,
                 max_depth=3)
]
# Second-layer model
models_2 = [
    GradientBoostingRegressor(random_state=0),
    SVR(),
Example No. 22
def featureSelection(X, y, method='lasso', select=500):

    import time
    import numpy as np

    t0 = time.time()
    
    # sparse (15 seconds)
    if method == 'lasso':
        from sklearn import linear_model
        
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(X, y)
        XSelected = X[:, lasso.coef_ != 0]
        indices = np.where(lasso.coef_ != 0)[0]
        if len(indices) > select:
            indices = np.argsort(-lasso.coef_)[:select]
    
    # non-sparse (157 seconds)
    if method == 'rf':
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.datasets import load_iris
        from sklearn.feature_selection import SelectFromModel
        
        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True,
                                max_features = select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())
    
    # non-sparse (8.5 seconds)
    if method == 'svm':
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        
        SVMReg = SVR(kernel = 'linear',
                     gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True, 
                                max_features = select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())
    
    # wrapper model (preset number of features) (1000 seconds / 5000 seconds)
    if method == 'hsiclasso':
        from pyHSICLasso import HSICLasso
        
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X,y)
        hsic_lasso.regression(select)
        XSelected = X[:,hsic_lasso.get_index()]
        indices = hsic_lasso.get_index()

    # dimensionality reduction
        # PCA
        # MDS
        # PLS
        # DWT
        
#    f = h5py.File('selected/' + str(select) + '/X_' + method + '.hdf5', "w")
#    f.create_dataset('X', data=XSelected)
#    f.create_dataset('indices', data=indices)
#    f.close()

    # return indices
    np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', indices)
    
    # np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', XSelected)

    print("--- %s seconds ---" % (time.time() - t0))
Example No. 23
Xtrain_scaled = robustscaler.transform(Xtrain)
Xtest_scaled = robustscaler.transform(Xtest)
Xvalidate_scaled = robustscaler.transform(Xvalidate)

# Transform pandas objects into numpy arrays (no need to do it if you are scaling
# the results)
Ytrain = Ytrain.values
Ytest = Ytest.values
Yvalidate = Yvalidate.values

# Best pipeline recommended by TPOT
exported_pipeline = make_pipeline(
      StackingEstimator(estimator=LinearRegression()),
      StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=True,
                                                      max_features=0.9000000000000001,
                                                      min_samples_leaf=3,
                                                      min_samples_split=10,
                                                      n_estimators=100,
                                                      random_state=42)),
      ExtraTreesRegressor(bootstrap=False,
                          max_features=0.55,
                          min_samples_leaf=5,
                          min_samples_split=17,
                          n_estimators=100,
                          random_state=42)
                                 )

exported_pipeline.fit(Xtrain_scaled, Ytrain)
print('Print MAE - test')
y_predicted = exported_pipeline.predict(Xtest_scaled)
mae = mean_absolute_error(Ytest, y_predicted)
print(mae)
Example No. 24
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(n_jobs=n_jobs,
                                          n_estimators=n_estimators,
                                          random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(n_jobs=n_jobs,
                                        n_estimators=n_estimators,
                                        random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(n_estimators=n_estimators,
                                              random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='linear')
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')

    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {
            **parameters['ensemble'],
            **parameters['bootstrap'],
            **parameters['criterion']
        }
        estimator = RandomForestClassifier(n_jobs=n_jobs,
                                           n_estimators=n_estimators,
                                           random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {
            **parameters['ensemble'],
            **parameters['bootstrap'],
            **parameters['criterion']
        }
        estimator = ExtraTreesClassifier(n_jobs=n_jobs,
                                         n_estimators=n_estimators,
                                         random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(n_estimators=n_estimators,
                                               random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state)
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')

    return param_dist, estimator
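
# Hypothetical usage sketch: feed the returned estimator and parameter grid into a
# randomized search (the RandomizedSearchCV import, the training data, and the
# `parameters` dict this helper relies on are outside the fragment shown here).
from sklearn.model_selection import RandomizedSearchCV

param_dist, est = _select_estimator('ExtraTreesRegressor', n_jobs=-1, n_estimators=200)
search = RandomizedSearchCV(est, param_distributions=param_dist, n_iter=10, cv=3)
# search.fit(X_train, y_train)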
Example No. 25
#ranked_features=[]
#for f in range(X.shape[1]):
#    ranked_features.append(features[indices[f]])
#    print("%d. %s (%f)" % (f + 1, features[indices[f]], importances[indices[f]]))

# Plot the feature importances of the forest
#plt.figure()
#plt.title("Feature importances")
#plt.bar(range(X.shape[1]), importances[indices],
#       color="r", yerr=std[indices], align="center")
#plt.xticks(range(X.shape[1]),ranked_features,fontsize=6.5)
#plt.xlim([-1, X.shape[1]])
#plt.savefig('random_forrest_feature_importance.png')

for score in scores:
    forest = GridSearchCV(ExtraTreesRegressor(random_state=1),
                          tuned_parameters,
                          verbose=10,
                          cv=4,
                          n_jobs=4,
                          scoring='%s' % score)

    forest.fit(X_train, y_train)
    model = forest.fit(X_train, y_train)
    model_train = model.predict(X_train)
    model_test = model.predict(X_test)
    r2_score_train = r2_score(y_train, model_train)
    mse_score_train = mean_squared_error(y_train, model_train)
    rmse_score_train = np.sqrt(mse_score_train)
    r2_score_test = r2_score(y_test, model_test)
    mse_score_test = mean_squared_error(y_test, model_test)
Example No. 26
    def __init__(self):
        self.etr_model = ExtraTreesRegressor(n_estimators=200,
                                             random_state=2019)
            df_2 = pd.read_csv("/home/mldm/covid_BRTT/dataset_finali/" +
                               dataset,
                               parse_dates=["Data"],
                               infer_datetime_format=True)
            rnd_state = rnd
            ################################################################################
            train_2 = df_2.tail(int(len(df_2) * 0.7))
            test_2 = df_2.drop(train_2.index)

            train_X_2 = train_2[predictor_columns]
            train_y_2 = train_2[feature]

            test_X_2 = test_2[predictor_columns]
            test_y_2 = test_2[feature]
            ################################################################################
            extra_tree_regressor = ExtraTreesRegressor(bootstrap=False,
                                                       random_state=rnd_state)
            ################################################################################
            imp = SimpleImputer(missing_values=np.nan, strategy="mean")
            imp = imp.fit(train_X_2)
            ################################################################################
            grid_regressor_2 = GridSearchCV(extra_tree_regressor,
                                            param_grd,
                                            n_jobs=-1,
                                            verbose=0)
            ################################################################################
            grid_regressor_2.fit(imp.transform(train_X_2), train_y_2)
            ################################################################################
            best_regressor_2 = grid_regressor_2.best_estimator_

            imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
            imputer = imputer.fit(test_X_2)
Example No. 28
train = train.drop('Item_Outlet_Sales', axis=1)

features = train.columns
target = 'Item_Outlet_Sales'

X_train, X_test = train, test

# Starting with different supervised learning algorithms, let us check which one gives the best results
print('=====\nTrain against different learning algorithms and pick one that produces the best result\n=====')
model_factory = [
    RandomForestRegressor(),
    # XGBRegressor(nthread=1),
    # MLPRegressor(),
    Ridge(),
    BayesianRidge(),
    ExtraTreesRegressor(),
    ElasticNet(),
    KNeighborsRegressor(),
    GradientBoostingRegressor()
]

for model in model_factory:
    model.seed = 42
    num_folds = 3

    scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=num_folds,
        scoring='neg_mean_squared_error'
Example No. 29
def main():
    boston = loadData()
    X = boston.data
    Y = boston.target
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=0.25, random_state=33)
    print('The max target value is', np.max(boston.target))
    print('The min target value is ', np.min(boston.target))
    print('The average target value is ', np.mean(boston.target))

    # Standardize the data
    ss_X = StandardScaler()
    ss_Y = StandardScaler()
    X_train = ss_X.fit_transform(X_train)
    X_test = ss_X.transform(X_test)
    Y_train = ss_Y.fit_transform(Y_train.reshape(-1, 1))
    Y_test = ss_Y.transform(Y_test.reshape(-1, 1))

    # Import and train a linear regression model
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    lr_Y_predict = lr.predict(X_test)

    # Import and train an SGDRegressor model
    sgdr = SGDRegressor()
    sgdr.fit(X_train, Y_train.ravel())
    sgdr_Y_predict = sgdr.predict(X_test)

    # Evaluate and compare model performance; the estimators' built-in score is equivalent to r2_score
    print('-----------------------------------------------------------------------')
    print('The value of default measurement of LinearRegression is ',
          lr.score(X_test, Y_test))
    print('The value of R-squared of LinearRegression is ',
          r2_score(Y_test, lr_Y_predict))
    print('The mean squared error of LinearRegression is ', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
    print('The mean absolute error of LinearRegression is ', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(lr_Y_predict)))
    print('-----------------------------------------------------------------------')
    print('The value of default measurement of SGDRegressor is ',
          sgdr.score(X_test, Y_test))
    print('The value of R-squared of SGDRegressor is ',
          r2_score(Y_test, sgdr_Y_predict))
    print('The mean squared error of SGDRegressor is ', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(sgdr_Y_predict)))
    print('The mean absolute error of SGDRegressor is ', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(sgdr_Y_predict)))

    # SVM regression
    # SVR with a linear kernel
    linear_svr = SVR(kernel='linear')
    linear_svr.fit(X_train, Y_train.ravel())
    linear_svr_y_predict = linear_svr.predict(X_test)

    # SVR with a polynomial kernel
    poly_svr = SVR(kernel='poly')
    poly_svr.fit(X_train, Y_train.ravel())
    poly_svr_y_predict = poly_svr.predict(X_test)

    # SVR with an RBF kernel
    rbf_svr = SVR(kernel="rbf")
    rbf_svr.fit(X_train, Y_train.ravel())
    rbf_svr_y_predict = rbf_svr.predict(X_test)

    # Evaluate the SVR models with the three kernels
    print('-----------------------------------------------------------------------')
    print('R-square value of linear SVR is:', linear_svr.score(X_test, Y_test))
    print('The MSE of linear SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(linear_svr_y_predict)))
    print('The MAE of linear SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(linear_svr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of poly SVR is:', poly_svr.score(X_test, Y_test))
    print('The MSE of poly SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(poly_svr_y_predict)))
    print('The MAE of poly SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(poly_svr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of rbf SVR is:', rbf_svr.score(X_test, Y_test))
    print('The MSE of rbf SVR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rbf_svr_y_predict)))
    print('The MAE of rbf SVR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rbf_svr_y_predict)))

    # Two k-nearest-neighbors models
    # Prediction by uniform averaging
    uni_knr = KNeighborsRegressor(weights='uniform')
    uni_knr.fit(X_train, Y_train.ravel())
    uni_knr_y_predict = uni_knr.predict(X_test)

    # Prediction weighted by distance
    dis_knr = KNeighborsRegressor(weights='distance')
    dis_knr.fit(X_train, Y_train.ravel())
    dis_knr_y_predict = dis_knr.predict(X_test)

    # Evaluate the two k-nearest-neighbors models
    print('-----------------------------------------------------------------------')
    print('R-square value of uniform-weighted KNR is:',
          uni_knr.score(X_test, Y_test))
    print('The MSE of uniform-weighted KNR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(uni_knr_y_predict)))
    print('The MAE of uniform-weighted KNR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(uni_knr_y_predict)))
    print('-----------------------------------------------------------------------')
    print('R-square value of distance-weighted KNR is:',
          dis_knr.score(X_test, Y_test))
    print('The MSE of distance-weighted KNR is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dis_knr_y_predict)))
    print('The MAE of distance-weighted KNR is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dis_knr_y_predict)))

    # A regression tree model
    dtr = DecisionTreeRegressor()
    dtr.fit(X_train, Y_train.ravel())
    dtr_y_predict = dtr.predict(X_test)

    # Evaluate the regression tree
    print('-----------------------------------------------------------------------')
    print('R-square value of DecisionTreeRegressor is:', dtr.score(X_test, Y_test))
    print('The MSE of DecisionTreeRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dtr_y_predict)))
    print('The MAE of DecisionTreeRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(dtr_y_predict)))

    # Train three ensemble models
    rfr = RandomForestRegressor()
    rfr.fit(X_train, Y_train.ravel())
    rfr_y_predict = rfr.predict(X_test)

    etr = ExtraTreesRegressor()
    etr.fit(X_train, Y_train.ravel())
    etg_y_predict = etr.predict(X_test)

    gbr = GradientBoostingRegressor()
    gbr.fit(X_train, Y_train.ravel())
    gbr_y_predict = gbr.predict(X_test)

    # Evaluate the three ensemble models
    print('-----------------------------------------------------------------------')
    print('R-square value of RandomForestRegressor is:', rfr.score(X_test, Y_test))
    print('The MSE of RandomForestRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_y_predict)))
    print('The MAE of RandomForestRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(rfr_y_predict)))

    print('-----------------------------------------------------------------------')
    print('R-square value of ExtraTreesRegressor is:', etr.score(X_test, Y_test))
    print('The MSE of ExtraTreesRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etg_y_predict)))
    print('The MAE of ExtraTreesRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(etg_y_predict)))
    print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))
    print('-----------------------------------------------------------------------')
    print('R-square value of GradientBoostingRegressor is:',
          gbr.score(X_test, Y_test))
    print('The MSE of GradientBoostingRegressor is:', mean_squared_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(gbr_y_predict)))
    print('The MAE of GradientBoostingRegressor is:', mean_absolute_error(
        ss_Y.inverse_transform(Y_test), ss_Y.inverse_transform(gbr_y_predict)))
Example No. 30
train = pd.read_hdf('train.h5')
train = train[col]
d_mean = train.median(axis=0)

train = o.train[col]
n = train.isnull().sum(axis=1)
for c in train.columns:
    train[c + '_nan_'] = pd.isnull(train[c])
    d_mean[c + '_nan_'] = 0
train = train.fillna(d_mean)
train['znull'] = n
n = []
print "start train for Trees"
rfr = ExtraTreesRegressor(n_estimators=100,
                          max_depth=4,
                          n_jobs=-1,
                          random_state=17,
                          verbose=0)
model1 = rfr.fit(train, o.train['y'])
# clf_gbdt = GradientBoostingRegressor(n_estimators=15, learning_rate=0.5,max_depth=6, random_state=0)
# model1 = clf_gbdt.fit(train, o.train['y'])
#model1 = rfr.fit(train, o.train['y'])
#model_xgb = runXGB(train,o.train['y'])
print "finised train"

#https://www.kaggle.com/bguberfain/two-sigma-financial-modeling/univariate-model-with-clip/run/482189
low_y_cut = -0.06
high_y_cut = 0.06
y_is_above_cut = (o.train.y > high_y_cut)
y_is_below_cut = (o.train.y < low_y_cut)
y_is_within_cut = (~y_is_above_cut & ~y_is_below_cut)
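
# Hedged sketch following the linked Kaggle kernel: refit only on targets inside the
# clip range and clip later predictions to the same interval (an addition, not part
# of the original fragment; `test_features` is a placeholder for the prediction input).
model1 = rfr.fit(train.loc[y_is_within_cut], o.train.loc[y_is_within_cut, 'y'])
# predictions = model1.predict(test_features).clip(low_y_cut, high_y_cut)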