Example #1
X_train, X_test, y_train, y_test = train_test_split(train_X_reduced,
                                                    train_y,
                                                    test_size=0.20,
                                                    random_state=42)

#########################################################################################################

model_lasso = Lasso(alpha=0.000507, random_state=1)
model_ridge = Ridge(alpha=10.0)
model_svr = SVR(C=15, epsilon=0.009, gamma=0.0004, kernel='rbf')
model_ENet = ElasticNet(alpha=0.0005,
                        l1_ratio=.9,
                        random_state=3,
                        max_iter=10000)
model_KRR = KernelRidge(alpha=0.5, kernel='polynomial', degree=2, coef0=2.5)
model_byr = BayesianRidge()
model_rforest = RandomForestRegressor(n_estimators=210)

model_lsvr = LinearSVR()
model_sgd = SGDRegressor()
model_extra = ExtraTreesRegressor()

model_xgb = XGBRegressor(colsample_bytree=0.4603,
                         gamma=0.0468,
                         learning_rate=0.05,
                         max_depth=4,
                         min_child_weight=1.7817,
                         n_estimators=3000,
                         reg_alpha=0.4640,
                         reg_lambda=0.88,
                         subsample=0.5213)
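A minimal sketch of the imports these model definitions assume plus a quick sanity check; the estimator names follow scikit-learn/xgboost, and the cross-validation call below is illustrative rather than part of the original script:

# Assumed imports for the model zoo above (a sketch, not the original file's header)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet, BayesianRidge, SGDRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor

# Quick check of one base model on the split created above
rmse = (-cross_val_score(model_byr, X_train, y_train,
                         scoring='neg_mean_squared_error', cv=5)) ** 0.5
print('BayesianRidge CV RMSE: {:.4f}'.format(rmse.mean()))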
Example #2
def Ridge_Regression():
    model = BayesianRidge(compute_score=True)
    return model
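A hedged usage sketch for the factory above: fit the returned model on placeholder data and read the quantities that compute_score=True exposes (the arrays here are assumptions, not part of the original):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(50, 4)                                   # placeholder features
y = X @ np.array([1.0, 0.5, 0.0, -2.0]) + 0.1 * rng.randn(50)

model = Ridge_Regression()
model.fit(X, y)
print(model.coef_)                                     # posterior mean of the weights
print(model.scores_)                                   # log marginal likelihood per iteration
y_mean, y_std = model.predict(X, return_std=True)      # predictive mean and std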
Example #3
X = np.random.randn(n_samples, size**2)
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
cv = KFold(n_splits=2)  # cross-validation generator for model selection
ridge = BayesianRidge()
cachedir = tempfile.mkdtemp()
mem = Memory(location=cachedir, verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = FeatureAgglomeration(n_clusters=10,
                            connectivity=A,
                            memory=mem,
                            linkage='ward')
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator_.steps[-1][1].coef_
coef_ = clf.best_estimator_.steps[0][1].inverse_transform(coef_)
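A small hedged follow-up, assuming the fit above succeeded: read the parcel count the grid search picked and map the coefficients back onto the 2D grid.

print(clf.best_params_)                  # e.g. {'ward__n_clusters': 20}
coef_image = coef_.reshape(size, size)   # coefficient image after inverse_transform
print(coef_image.shape)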
Example #4
 elif alg.name == 'LinearRegression':
     if NVIDIA_RAPIDS_ENABLED:
         from cuml.linear_model import LinearRegression
         model = LinearRegression(**alg.input_variables.__dict__)
     else:
         from sklearn.linear_model import LinearRegression
         model = LinearRegression(**alg.input_variables.__dict__)
 elif alg.name == 'SupportVectorRegression':
     if NVIDIA_RAPIDS_ENABLED:
         from cuml.svm import SVR
     else:
         from sklearn.svm import SVR
     model = SVR(**alg.input_variables.__dict__)
 elif alg.name == 'BayesianRidgeRegression':
     from sklearn.linear_model import BayesianRidge
     model = BayesianRidge(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'AdaBoost' and alg.type == 'regression':
     from sklearn.ensemble import AdaBoostRegressor
     model = AdaBoostRegressor(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'GradientBoosting' and alg.type == 'regression':
     from sklearn.ensemble import GradientBoostingRegressor
     model = GradientBoostingRegressor(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'RandomForest' and alg.type == 'regression':
     from sklearn.ensemble import RandomForestRegressor
     model = RandomForestRegressor(**alg.input_variables.__dict__)
     warn_not_gpu_support(alg)
 elif alg.name == 'XGBoost' and alg.type == 'regression':
     from xgboost.sklearn import XGBRegressor
     model = XGBRegressor(**alg.input_variables.__dict__)
Example #5
# Model ensembling
# Stack the lgb and xgb out-of-fold results
print('stacking...')
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10  # 10 folds = 5 splits x 2 repeats

res_stack = mean_squared_error(target.values, oof_stack)

print('lgb:{:<8.8f}, xgb:{:<8.8f}, stack:{:<8.8f}'.format(res_lgb, res_xgb, res_stack))

# Save the submission
sub_df = pd.read_csv(pre_root_path + '/jinnan_round1_submit_20181227.csv', header=None)
sub_df[1] = predictions
# sub_df[1] = sub_df[1].apply(lambda x:round(x, 3))  # this would overwrite the values read from the file
sub_df.to_csv(result_path + '/jinnan_round1_submit_20181227_1.csv', index=0, header=0)  # save as a new file, without index or header
print('save done!')
Example #6
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d - 1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d - 1]
        ordered_idx_round_2 = ordered_idx[d - 1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == 2 * (d - 1)


@pytest.mark.parametrize(
    "predictor",
    [DummyRegressor(), BayesianRidge(),
     ARDRegression()])
def test_mice_predictors(predictor):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = MICEImputer(missing_values=0,
                          n_imputations=1,
                          n_burn_in=1,
                          predictor=predictor,
                          random_state=rng)
    imputer.fit_transform(X)
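MICEImputer here comes from a pre-release scikit-learn branch and never shipped under that name; as a hedged pointer, the released equivalent is sklearn.impute.IterativeImputer, which also defaults to a BayesianRidge estimator:

# Roughly equivalent released API (parameter mapping is approximate)
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

imputer = IterativeImputer(estimator=BayesianRidge(), missing_values=0,
                           max_iter=1, random_state=0)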
Example #7
def cross_val(model, data=train):
    # the opening of this helper was truncated in the source; a minimal reconstruction
    return cross_val_score(
        model, data.values, y_train, scoring='neg_mean_squared_error', cv=kf)


def print_score(model, name, data=train):
    score = cross_val(model, data)
    print('  {}: {:.5f} {:.5f}'.format(name, score.mean(), score.std()))


def print_mse(y, pred, name):
    mse = mean_squared_error(y, pred)
    print('  {}: {:.8f}'.format(name, mse))


lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.00055))
ridge = make_pipeline(RobustScaler(), Ridge(alpha=25, tol=0.00001))
bayesian_ridge = make_pipeline(RobustScaler(), BayesianRidge())
elastic_net = make_pipeline(RobustScaler(),
                            ElasticNet(alpha=0.00055, l1_ratio=0.7))
svr = make_pipeline(RobustScaler(), SVR(C=10, epsilon=0.001, shrinking=False))

print('\nTesting different regression algorithms, scores:')
print_score(lasso, 'Lasso')
print_score(ridge, 'Ridge Regression')
print_score(bayesian_ridge, 'Bayesian Ridge Regression')
print_score(elastic_net, 'Elastic Net')
print_score(svr, 'Support Vector Regressor')

# fit train data to all models, predict train and test, print mean_squared_error for trainings data
lasso.fit(train, y_train)
lasso_train_pred = lasso.predict(train)
lasso_pred = lasso.predict(test)
Example #8
    def predict_ahead(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Make a single forecast with a Bayesian Ridge Regression model

        Parameters
        ----------
        df : pandas DataFrame
            the training (streamed) data to model

        Returns
        -------
        predictions : pandas DataFrame
            the forecast -> (1 row, W columns) where W is the forecast_window
        """
        # preprocess the data for supervised machine learning
        X, Y, X_new = self.preprocessing(df, binary=False)

        if self._counter >= self.train_frequency or self._model is None:
            object.__setattr__(self, "_counter", 0)

            # set up a machine learning pipeline
            model = MultiOutputRegressor(BayesianRidge(), n_jobs=N_JOBS)
            pipeline = Pipeline(
                [
                    ("var", VarianceThreshold()),
                    # ('poly', PolynomialFeatures(2)),  # longer run time, potentially more accurate
                    # ('var2', VarianceThreshold()),  # use this if 'poly' is used
                    # ('shape', QuantileTransformer(output_distribution="normal")),  # make input variables normally distributed
                    ("scale", MinMaxScaler()),
                    ("model", model),
                ]
            )

            if self.tune_model:
                # set up cross validation for time series
                tscv = TimeSeriesSplit(n_splits=3)
                folds = tscv.get_n_splits(X)

                # set up the tuner
                parameters = {
                    "model__estimator__n_iter": [300],
                    "model__estimator__tol": [1e-3],
                    "model__estimator__alpha_1": [1e-2, 1e-6, 1e-10],
                    "model__estimator__lambda_1": [1e-2, 1e-6, 1e-10],
                    "model__estimator__alpha_2": [1e-2, 1e-6, 1e-10],
                    "model__estimator__lambda_2": [1e-2, 1e-6, 1e-10],
                }
                grid = RandomizedSearchCV(
                    pipeline,
                    parameters,
                    n_iter=16,
                    cv=folds,
                    random_state=0,
                    n_jobs=1,
                )

                object.__setattr__(
                    self,
                    "_model",
                    grid.fit(X, Y).best_estimator_,  # search for the best model
                )
            else:
                object.__setattr__(
                    self, "_model", pipeline.fit(X, Y)  # train the model
                )

        predictions = self._model.predict(X_new)  # forecast
        predictions = pd.DataFrame(predictions)
        object.__setattr__(self, "_counter", self._counter + 1)
        return predictions
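The double-underscore keys in the tuning grid above follow scikit-learn's nested parameter naming: "model" is the pipeline step and "estimator" is the BayesianRidge wrapped by MultiOutputRegressor. A hedged sketch of how those names resolve, outside the class:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import BayesianRidge

pipe = Pipeline([
    ("var", VarianceThreshold()),
    ("scale", MinMaxScaler()),
    ("model", MultiOutputRegressor(BayesianRidge())),
])
# 'model__estimator__alpha_1' reaches BayesianRidge(alpha_1=...) inside the wrapper
pipe.set_params(model__estimator__alpha_1=1e-6)
print("model__estimator__alpha_1" in pipe.get_params())  # True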
Example #9
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    store_pkl(pipeline, name)
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name)


if "Auto" in datasets:
    build_auto(
        AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                                min_samples_leaf=5),
                          random_state=13,
                          n_estimators=17), "AdaBoostAuto")
    build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
    build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
    build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
               "DecisionTreeAuto",
               compact=False)
    build_auto(
        BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                               min_samples_leaf=5),
                         random_state=13,
                         n_estimators=3,
                         max_features=0.5), "DecisionTreeEnsembleAuto")
    build_auto(DummyRegressor(strategy="median"), "DummyAuto")
    build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
    build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
               "ExtraTreesAuto")
    build_auto(GradientBoostingRegressor(random_state=13, init=None),
               "GradientBoostingAuto")
Example #10
def fit_bridge(X, y):
    from sklearn.linear_model import BayesianRidge
    br = BayesianRidge()
    br.fit(X, y)
    return br
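A hedged usage sketch for the helper above, with small synthetic arrays standing in for the caller's data:

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(80, 3)                               # assumed placeholder features
y_demo = X_demo @ np.array([2.0, -1.0, 0.5]) + 0.05 * rng.randn(80)

br = fit_bridge(X_demo, y_demo)
print(br.coef_, br.alpha_, br.lambda_)                  # weights and learned precisions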
Example #11
for column in x1:
    if column not in X:
        X[column] = 0

X = X.sort_index(axis=1)
x1 = x1.sort_index(axis=1)
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                y,
                                                test_size=0.33,
                                                random_state=0)

# Fitting Bayesian Ridge Regression to the training set
from sklearn.linear_model import BayesianRidge
regressor = BayesianRidge()
fitResult = regressor.fit(Xtrain, Ytrain)
YPredTest = regressor.predict(Xtest)

print('Intercept: \n', regressor.intercept_)
print('Coefficients: \n', regressor.coef_)

df2.head()
# Predicting the Test set results
y_pred = regressor.predict(x1)
#print(y_pred)

df2['Income'] = y_pred
#print(df2)
df2.describe()
Example #12
rs.get_n_splits(X)
X_trainset = None
y_trainset = None
X_testset = None
y_testset = None

for train_index, test_index in rs.split(X, y):
    X_trainset, X_testset = X[train_index], X[test_index]
    y_trainset, y_testset = y[train_index], y[test_index]

# ## Model training
from sklearn.linear_model import BayesianRidge, HuberRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

regression_model = BayesianRidge()
regression_model.fit(X_trainset, y_trainset)
#
# bagging = BaggingRegressor(BayesianRidge(),n_estimators=10)
# bagging = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1,max_depth=3, random_state=0, loss='ls')
# bagging.fit(X_trainset, y_trainset)
# regression_model = bagging
# joblib.dump(regression_model, "reg_0003-001.m")

## Predict on the test set
X_testset = X
y_testset = y
lines = ""
# regression_model = joblib.load("reg_0003-001.m")
result = regression_model.predict(X_testset)
mse = 0.0
Example #13

####### Our Model for Comparison
na_mask_train = ~X_train.loc[X_train_odds.index].isna().T.any()
X_train_odds_comp = X_train.loc[X_train_odds.index].dropna()
# X_train_odds_comp = X_train_odds_comp.fillna(X_train_odds_comp.mean())
na_mask_val = ~X_val.loc[X_val_odds.index].isna().T.any()
X_val_odds_comp = X_val.loc[X_val_odds.index].dropna()
# X_val_odds_comp = X_val_odds_comp.fillna(X_val_odds_comp.mean())
X_train_odds = X_train_odds[na_mask_train]
X_val_odds = X_val_odds[na_mask_val]
y_train_odds =  y_train_odds[na_mask_train]
y_val_odds = y_val_odds[na_mask_val]


lm = BayesianRidge().fit(X_train_odds.median(axis=1).values.reshape(-1,1), y_train_odds)
predictions = lm.predict(X_val_odds.median(axis=1).values.reshape(-1,1))
print(mean_squared_error(y_val_odds, predictions))
lm.score(X_val_odds.median(axis=1).values.reshape(-1,1), y_val_odds)
# X_train_odds_comp_tot = pd.concat([X_train.loc[X_train_odds.index], X_train_odds], axis=1)
# X_val_odds_comp_tot = pd.concat([X_val.loc[X_val_odds.index], X_val_odds], axis=1)

####### Scale data select features
standardscaler = StandardScaler()
X_trainscaled_odds_comp = standardscaler.fit_transform(X_train_odds_comp[featurestouse])
X_valscaled_odds_comp = standardscaler.transform(X_val_odds_comp[featurestouse])

# standardscaler = StandardScaler()
# X_trainscaled_odds_comp_tot = standardscaler.fit_transform(X_train_odds_comp_tot[featurestouse])
# X_valscaled_odds_comp_tot = standardscaler.transform(X_val_odds_comp_tot[featurestouse])
Example #14
 },
 {
     'name': 'Lasso',
     'model': Lasso()
 },
 {
     'name': 'ElasticNet',
     'model': ElasticNet()
 },
 {
     'name': 'LassoLarsDefault',
     'model': LassoLars()
 },
 {
     'name': 'BayesianRidgeDefault',
     'model': BayesianRidge()
 },
 {
     'name': 'ARDRegressionDefault',
     'model': ARDRegression(fit_intercept=True)
 },
 {
     'name': 'ARDRegression',
     'model': ARDRegression(fit_intercept=True, threshold_lambda=10000)
 },
 {
     'name':
     'ARDRegressionOptim1',
     'model':
     ARDRegression(fit_intercept=True,
                   n_iter=100,
Example #15
def ml_regression(x_train,
                  y_train,
                  x_test,
                  y_test,
                  cross_validation=False,
                  show=False):
    """
    Build, train, and test the data set with classical machine learning regression models.
    If cross_validation=True an additional training with cross validation will be performed.
    """
    from time import time
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import BayesianRidge
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor

    # from sklearn.model_selection import KFold
    # from sklearn.base import clone

    regressors = (LinearRegression(), BayesianRidge(), DecisionTreeRegressor(),
                  KNeighborsRegressor(n_neighbors=10), AdaBoostRegressor(),
                  RandomForestRegressor(100))

    names = [
        "Linear", "Bayesian Ridge", "Decision Tree", "KNeighbors", "AdaBoost",
        "Random Forest"
    ]

    col = ['Time (s)', 'Test loss', 'Test R2 score']
    results = pd.DataFrame(columns=col)

    for idx, clf in enumerate(regressors):

        name = names[idx]
        # clf_cv = clone(clf)

        print(name)

        t0 = time()
        # Fitting the model without cross validation
        clf.fit(x_train, y_train)
        train_time = np.around(time() - t0, 1)
        y_pred = clf.predict(x_test)

        loss, r2 = regression_scores(y_test, y_pred, show=show)

        if cross_validation:
            warnings.warn('Cross-validation removed')

            # k_fold = KFold(n_splits=10)
            # t0 = time()
            # # Fitting the model with cross validation
            # for id_train, id_test in k_fold.split(x_train):
            #     # print(y_train[id_train, 0].shape)
            #     clf_cv.fit(x_train[id_train], y_train[id_train, 0]) # TODO enhance
            # train_time_cv = time() - t0

            # y_pred_cv = clf_cv.predict(x_test)
            # r2_cv = r2_score(y_test, y_pred_cv[:,1])

            # print("Test R2-Score CV:\t {:.3f}".format(r2_cv))
            # print( "Training Time CV: \t {:.1f} ms".format(train_time_cv * 1000))

        results = pd.concat(
            [results,
             pd.DataFrame([[train_time, loss, r2]], columns=col, index=[name])])

        if show:
            print("-" * 20)
            print("Training Time:  \t {:.1f} s".format(train_time))
            print("Test loss:  \t\t {:.4f}".format(loss))
            print("Test R2-score:  \t {:.3f}\n".format(r2))

    return results.sort_values('Test loss').round(2)
Example #16
df['Prediction'] = df_close.shift(-forecast_out) #  label column with data shifted 30 units up

# print(df.tail())

X = np.array(df.drop(['Prediction'], axis=1))
X = preprocessing.scale(X)



X_forecast = X[-forecast_out:] # set X_forecast equal to last 30
X = X[:-forecast_out] # remove last 30 from X


y = np.array(df['Prediction'])
y = y[:-forecast_out]



X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3)  # sklearn.cross_validation was removed; use model_selection

# Training
clf = BayesianRidge()
clf.fit(X_train,y_train)
# Testing
confidence = clf.score(X_test, y_test)
print("confidence: ", confidence)


forecast_prediction = clf.predict(X_forecast)
print(forecast_prediction)
Example #17
# Randomly pick 10 features to carry signal
relevant_features = np.random.randint(0, n_features, 10)
# Draw the corresponding initial weights from the prior distribution
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_))

# Generate noise with precision alpha = 50
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)

# Generate the target values
y = np.dot(X, w) + noise

###############################################################################
# Fit the data with Bayesian Ridge regression
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

# Fit the data with ordinary least squares (OLS)
ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot and compare the results of the two methods
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
plt.plot(w, 'g-', label="Ground truth")
plt.plot(ols.coef_, 'r--', label="OLS estimate")
plt.xlabel("Features")
plt.ylabel("Values of the weights")
Example #18
def dict_method_reg():
    """many reg method."""
    dict_method = {}
    # 1st part
    """4KNR"""
    me4 = neighbors.KNeighborsRegressor(n_neighbors=5,
                                        weights='uniform',
                                        algorithm='auto',
                                        leaf_size=30,
                                        p=2,
                                        metric='minkowski')
    cv4 = 5
    scoring4 = 'r2'
    param_grid4 = [{
        'n_neighbors': [3, 4, 5, 6, 7],
        "weights": ['uniform', "distance"],
        "leaf_size": [10, 20, 30]
    }]
    dict_method.update({"KNR-set": [me4, cv4, scoring4, param_grid4]})
    """1SVR"""
    me1 = SVR(kernel='rbf',
              gamma='auto',
              degree=3,
              tol=1e-3,
              epsilon=0.1,
              shrinking=False,
              max_iter=2000)
    cv1 = 5
    scoring1 = 'r2'
    param_grid1 = [{
        'C': [10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01],
        'kernel': ker
    }]
    dict_method.update({"SVR-set": [me1, cv1, scoring1, param_grid1]})
    """5kernelridge"""
    me5 = kernel_ridge.KernelRidge(alpha=1,
                                   gamma="scale",
                                   degree=3,
                                   coef0=1,
                                   kernel_params=None)
    cv5 = 5
    scoring5 = 'r2'
    param_grid5 = [{
        'alpha': [100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01, 0.001, 1e-4, 1e-5],
        'kernel':
        ker
    }]
    dict_method.update({'KRR-set': [me5, cv5, scoring5, param_grid5]})
    """6GPR"""
    me6 = gaussian_process.GaussianProcessRegressor(kernel=kernel,
                                                    alpha=1e-10,
                                                    optimizer='fmin_l_bfgs_b',
                                                    n_restarts_optimizer=0,
                                                    normalize_y=False,
                                                    copy_X_train=True,
                                                    random_state=0)
    cv6 = 5
    scoring6 = 'r2'
    param_grid6 = [{'alpha': [1e-3, 1e-2], 'kernel': ker}]
    dict_method.update({"GPR-set": [me6, cv6, scoring6, param_grid6]})

    # 2nd part
    """6RFR"""
    me7 = RandomForestRegressor(n_estimators=500,
                                max_depth=None,
                                min_samples_split=2,
                                min_samples_leaf=1,
                                min_weight_fraction_leaf=0.0,
                                max_leaf_nodes=None,
                                min_impurity_decrease=0.0,
                                min_impurity_split=None,
                                bootstrap=True,
                                oob_score=False,
                                random_state=None,
                                verbose=0,
                                warm_start=False)
    cv7 = 5
    scoring7 = 'r2'
    param_grid7 = [{
        'max_depth': [4, 5, 6, 7],
    }]
    dict_method.update({"RFR-em": [me7, cv7, scoring7, param_grid7]})
    """7GBR"""
    me8 = GradientBoostingRegressor(
        loss='ls',
        learning_rate=0.1,
        n_estimators=100,
        subsample=1.0,
        criterion='friedman_mse',
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.,
        max_depth=3,
        min_impurity_decrease=0.,
        min_impurity_split=None,
        init=None,
        random_state=None,
        max_features=None,
        alpha=0.9,
        verbose=0,
        max_leaf_nodes=None,
        warm_start=False,
    )
    cv8 = 5
    scoring8 = 'r2'
    param_grid8 = [{
        'max_depth': [3, 4, 5, 6],
        'min_samples_split': [2, 3],
        'learning_rate': [0.1, 0.05]
    }]
    dict_method.update({'GBR-em': [me8, cv8, scoring8, param_grid8]})

    "AdaBR"
    dt3 = DecisionTreeRegressor(criterion="mse",
                                splitter="best",
                                max_features=None,
                                max_depth=7,
                                min_samples_split=4)
    me9 = AdaBoostRegressor(dt3,
                            n_estimators=200,
                            learning_rate=0.05,
                            random_state=0)
    cv9 = 5
    scoring9 = 'explained_variance'
    param_grid9 = [{"base_estimator": [dt3]}]
    dict_method.update({"AdaBR-em": [me9, cv9, scoring9, param_grid9]})
    '''DTR'''
    me10 = DecisionTreeRegressor(
        criterion="mse",
        splitter="best",
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        min_weight_fraction_leaf=0.,
        max_features=None,
        random_state=0,
        max_leaf_nodes=None,
        min_impurity_decrease=0.,
        min_impurity_split=None,
    )
    cv10 = 5
    scoring10 = 'r2'
    param_grid10 = [{
        'max_depth': [2, 3, 4, 5, 6, 7, 8],
        "min_samples_split": [2, 3, 4],
        "min_samples_leaf": [1, 2]
    }]
    dict_method.update({'DTR-em': [me10, cv10, scoring10, param_grid10]})

    'ElasticNet'
    me11 = ElasticNet(alpha=1.0,
                      l1_ratio=0.7,
                      fit_intercept=True,
                      normalize=False,
                      precompute=False,
                      max_iter=1000,
                      copy_X=True,
                      tol=0.0001,
                      warm_start=False,
                      positive=False,
                      random_state=None)

    cv11 = 5
    scoring11 = 'r2'
    param_grid11 = [{
        'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
        'l1_ratio': [0.3, 0.5, 0.8]
    }]
    dict_method.update({"EN-L1": [me11, cv11, scoring11, param_grid11]})

    'Lasso'
    me12 = Lasso(
        alpha=1.0,
        fit_intercept=True,
        normalize=True,
        precompute=False,
        copy_X=True,
        max_iter=3000,
        tol=0.001,
        warm_start=False,
        positive=False,
        random_state=None,
    )

    cv12 = 5
    scoring12 = 'r2'
    param_grid12 = [
        {
            'alpha': [
                0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 10, 100,
                1000
            ],
            "tol": [0.001, 0.01, 0.1]
        },
    ]
    dict_method.update({"LASSO-L1": [me12, cv12, scoring12, param_grid12]})
    """2BayesianRidge"""
    me2 = BayesianRidge(alpha_1=1e-06,
                        alpha_2=1e-06,
                        compute_score=False,
                        copy_X=True,
                        fit_intercept=True,
                        lambda_1=1e-06,
                        lambda_2=1e-06,
                        n_iter=300,
                        normalize=False,
                        tol=0.01,
                        verbose=False)
    cv2 = 5
    scoring2 = 'r2'
    param_grid2 = [{
        'alpha_1': [1e-07, 1e-06, 1e-05],
        'alpha_2': [1e-07, 1e-06, 1e-05]
    }]
    dict_method.update({'BRR-L1': [me2, cv2, scoring2, param_grid2]})
    """3SGDRL2"""
    me3 = SGDRegressor(alpha=0.0001,
                       average=False,
                       epsilon=0.1,
                       eta0=0.01,
                       fit_intercept=True,
                       l1_ratio=0.15,
                       learning_rate='invscaling',
                       loss='squared_loss',
                       max_iter=1000,
                       penalty='l2',
                       power_t=0.25,
                       random_state=0,
                       shuffle=True,
                       tol=0.01,
                       verbose=0,
                       warm_start=False)
    cv3 = 5
    scoring3 = 'r2'
    param_grid3 = [{
        'alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
        'loss': ['squared_loss', "huber"],
        "penalty": ["l1", "l2"]
    }]
    dict_method.update({'SGDR-L1': [me3, cv3, scoring3, param_grid3]})
    """PassiveAggressiveRegressor"""
    me14 = PassiveAggressiveRegressor(C=1.0,
                                      fit_intercept=True,
                                      max_iter=1000,
                                      tol=0.001,
                                      early_stopping=False,
                                      validation_fraction=0.1,
                                      n_iter_no_change=5,
                                      shuffle=True,
                                      verbose=0,
                                      loss='epsilon_insensitive',
                                      epsilon=0.1,
                                      random_state=None,
                                      warm_start=False,
                                      average=False)
    cv14 = 5
    scoring14 = 'r2'
    param_grid14 = [{
        'C': [1.0e8, 1.0e6, 10000, 100, 50, 10, 5, 2.5, 1, 0.5, 0.1, 0.01]
    }]
    dict_method.update({'PAR-L1': [me14, cv14, scoring14, param_grid14]})

    return dict_method
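A hedged sketch of how the [model, cv, scoring, param_grid] entries above could be consumed, assuming it sits in the same module as dict_method_reg and that the module-level names ker (kernel list) and kernel (GPR kernel) are defined there; the data is a placeholder:

import numpy as np
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X_demo = rng.randn(60, 5)
y_demo = X_demo @ rng.randn(5) + 0.1 * rng.randn(60)

me, cv, scoring, param_grid = dict_method_reg()['BRR-L1']
search = GridSearchCV(me, param_grid, cv=cv, scoring=scoring)
search.fit(X_demo, y_demo)
print(search.best_params_, search.best_score_)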
Example #19
    def fit_transform(self, X, y=None):
        """Fits the imputer on X and return the transformed X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Input data, where "n_samples" is the number of samples and
            "n_features" is the number of features.

        y : ignored.

        Returns
        -------
        Xt : array-like, shape (n_samples, n_features)
             The imputed input data.
        """
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))

        if self.predictor is None:
            from sklearn.linear_model import BayesianRidge
            self._predictor = BayesianRidge()
        else:
            self._predictor = clone(self.predictor)

        self._min_value = np.nan if self.min_value is None else self.min_value
        self._max_value = np.nan if self.max_value is None else self.max_value

        self.initial_imputer_ = None
        X, X_filled, mask_missing_values = self._initial_imputation(X)

        # edge case: in case the user specifies 0 for n_imputations,
        # then there is no need to do burn in and the result should be
        # just the initial imputation (before clipping)
        if self.n_imputations < 1:
            return X_filled

        X_filled = np.clip(X_filled, self._min_value, self._max_value)

        # order in which to impute
        # note this is probably too slow for large feature data (d > 100000)
        # and a better way would be good.
        # see: https://goo.gl/KyCNwj and subsequent comments
        ordered_idx = self._get_ordered_idx(mask_missing_values)

        abs_corr_mat = self._get_abs_corr_mat(X_filled)

        # impute data
        n_rounds = self.n_burn_in + self.n_imputations
        n_samples, n_features = X_filled.shape
        Xt = np.zeros((n_samples, n_features), dtype=X.dtype)
        self.imputation_sequence_ = []
        if self.verbose > 0:
            print("[MICE] Completing matrix with shape %s" % (X.shape, ))
        start_t = time()
        for i_rnd in range(n_rounds):
            if self.imputation_order == 'random':
                ordered_idx = self._get_ordered_idx(mask_missing_values)

            for feat_idx in ordered_idx:
                neighbor_feat_idx = self._get_neighbor_feat_idx(
                    n_features, feat_idx, abs_corr_mat)
                X_filled, predictor = self._impute_one_feature(
                    X_filled,
                    mask_missing_values,
                    feat_idx,
                    neighbor_feat_idx,
                    predictor=None,
                    fit_mode=True)
                predictor_triplet = MICETriplet(feat_idx, neighbor_feat_idx,
                                                predictor)
                self.imputation_sequence_.append(predictor_triplet)

            if i_rnd >= self.n_burn_in:
                Xt += X_filled
            if self.verbose > 0:
                print('[MICE] Ending imputation round '
                      '%d/%d, elapsed time %0.2f' %
                      (i_rnd + 1, n_rounds, time() - start_t))

        Xt /= self.n_imputations
        Xt[~mask_missing_values] = X[~mask_missing_values]
        return Xt
Example #20
ytrain_est[:, 3] = knn.predict(Xtrain)
yval_est[:, 3] = knn.predict(Xval)

svmnorm = SVR(tol=tol, gamma='auto')
svmnorm = svmnorm.fit(Xtrain_norm, ytrain)
predictions[:, 4] = svmnorm.predict(Xtest_norm)
ytrain_est[:, 4] = svmnorm.predict(Xtrain_norm)
yval_est[:, 4] = svmnorm.predict(Xval_norm)

svmlnorm = LinearSVR(max_iter=5000)
svmlnorm = svmlnorm.fit(Xtrain_norm, ytrain)
predictions[:, 5] = svmlnorm.predict(Xtest_norm)
ytrain_est[:, 5] = svmlnorm.predict(Xtrain_norm)
yval_est[:, 5] = svmlnorm.predict(Xval_norm)

gnb = BayesianRidge()
gnb = gnb.fit(Xtrain_norm, ytrain)
predictions[:, 6] = gnb.predict(Xtest_norm)
ytrain_est[:, 6] = gnb.predict(Xtrain_norm)
yval_est[:, 6] = gnb.predict(Xval_norm)

hr = HuberRegressor()
hr = hr.fit(Xtrain_norm, ytrain)
predictions[:, 7] = hr.predict(Xtest_norm)
ytrain_est[:, 7] = hr.predict(Xtrain_norm)
yval_est[:, 7] = hr.predict(Xval_norm)

# eval
d_train = xgb.DMatrix(data=Xtrain_norm,
                      label=ytrain,
                      feature_names=Xtrain.columns)
Example #21
def bayesian_ridge_regression():

    # #############################################################################
    # Generating simulated data with Gaussian weights
    np.random.seed(0)
    n_samples, n_features = 100, 100
    X = np.random.randn(n_samples, n_features)  # Create Gaussian data
    # Create weights with a precision lambda_ of 4.
    lambda_ = 4.
    w = np.zeros(n_features)
    # Only keep 10 weights of interest
    relevant_features = np.random.randint(0, n_features, 10)
    for i in relevant_features:
        w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
    # Create noise with a precision alpha of 50.
    alpha_ = 50.
    noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
    # Create the target
    y = np.dot(X, w) + noise

    # #############################################################################
    # Fit the Bayesian Ridge Regression and an OLS for comparison
    clf = BayesianRidge(compute_score=True)
    clf.fit(X, y)

    ols = LinearRegression()
    ols.fit(X, y)

    # #############################################################################
    # Plot true weights, estimated weights, histogram of the weights, and
    # predictions with standard deviations
    # lw = 2
    # plt.figure(figsize=(6, 5))
    # plt.title("Weights of the model")
    # plt.plot(clf.coef_, color='lightgreen', linewidth=lw,
    #         label="Bayesian Ridge estimate")
    # plt.plot(w, color='gold', linewidth=lw, label="Ground truth")
    # plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
    # plt.xlabel("Features")
    # plt.ylabel("Values of the weights")
    # plt.legend(loc="best", prop=dict(size=12))

    # plt.figure(figsize=(6, 5))
    # plt.title("Histogram of the weights")
    # plt.hist(clf.coef_, bins=n_features, color='gold', log=True,
    #         edgecolor='black')
    # plt.scatter(clf.coef_[relevant_features], np.full(len(relevant_features), 5.),
    #             color='navy', label="Relevant features")
    # plt.ylabel("Features")
    # plt.xlabel("Values of the weights")
    # plt.legend(loc="upper left")

    # plt.figure(figsize=(6, 5))
    # plt.title("Marginal log-likelihood")
    # plt.plot(clf.scores_, color='navy', linewidth=lw)
    # plt.ylabel("Score")
    # plt.xlabel("Iterations")

    # Plotting some predictions for polynomial regression

    def f(x, noise_amount):
        y = np.sqrt(x) * np.sin(x)
        noise = np.random.normal(0, 1, len(x))
        return y + noise_amount * noise

    degree = 10
    X = np.linspace(0, 10, 100)
    y = f(X, noise_amount=0.1)
    clf_poly = BayesianRidge()
    clf_poly.fit(np.vander(X, degree), y)

    X_plot = np.linspace(0, 11, 25)
    y_plot = f(X_plot, noise_amount=0)
    y_mean, y_std = clf_poly.predict(np.vander(X_plot, degree),
                                     return_std=True)
Example #22
    def train(self):
        X_test, y_test, act_test, X_cnn_test, X_lstm_test = self.load_data()
        if X_test.shape[0] > 0 and len(
                self.methods) > 1 and self.istrained == False:
            if self.model_type in {'pv', 'wind'}:
                if self.resampling == True:
                    pred_resample, y_resample, results = self.resampling_for_combine(
                        X_test, y_test, act_test, X_cnn_test, X_lstm_test)
                else:
                    pred_resample, y_resample, results = self.without_resampling(
                        X_test, y_test, act_test, X_cnn_test, X_lstm_test)
            elif self.model_type in {'load'}:
                if self.resampling == True:
                    pred_resample, y_resample, results = self.resampling_for_combine(
                        X_test, y_test, act_test, X_cnn_test, X_lstm_test)
                else:
                    pred_resample, y_resample, results = self.without_resampling(
                        X_test, y_test, act_test, X_cnn_test, X_lstm_test)
            elif self.model_type in {'fa'}:
                if self.resampling == True:
                    pred_resample, y_resample, results = self.resampling_for_combine(
                        X_test, y_test, act_test, X_cnn_test, X_lstm_test)
                else:
                    pred_resample, y_resample, results = self.without_resampling(
                        X_test, y_test, act_test, X_cnn_test, X_lstm_test)

            self.best_methods = results.nsmallest(4, 'mae').index.tolist()
            results = results.loc[self.best_methods]
            results['diff'] = results['mae'] - results['mae'].iloc[0]
            best_of_best = results.iloc[np.where(
                results['diff'] <= 0.02)].index.tolist()
            if len(best_of_best) == 1:
                best_of_best.extend(
                    [best_of_best[0], best_of_best[0], self.best_methods[1]])
            elif len(best_of_best) == 2:
                best_of_best.extend([best_of_best[0], best_of_best[0]])
            elif len(best_of_best) == 3:
                best_of_best.append(best_of_best[0])

            self.best_methods = best_of_best
            X_pred = np.array([])
            for method in sorted(self.best_methods):
                if X_pred.shape[0] == 0:
                    X_pred = pred_resample[method]
                else:
                    X_pred = np.hstack((X_pred, pred_resample[method]))
            X_pred[np.where(X_pred < 0)] = 0
            X_pred, y_resample = shuffle(X_pred, y_resample)
            self.weight_size = len(self.best_methods)
            self.model = dict()
            for combine_method in self.combine_methods:
                if combine_method == 'rls':
                    self.logger.info('RLS training')
                    self.logger.info('/n')
                    self.model[combine_method] = dict()
                    w = self.rls_fit(X_pred, y_resample)

                    self.model[combine_method]['w'] = w

                elif combine_method == 'bcp':
                    self.logger.info('BCP training')
                    self.logger.info('/n')
                    self.model[combine_method] = dict()
                    w = self.bcp_fit(X_pred, y_resample)
                    self.model[combine_method]['w'] = w

                elif combine_method == 'mlp':
                    self.logger.info('MLP training')
                    self.logger.info('/n')
                    cvs = []
                    for _ in range(3):
                        X_train1, X_test1, y_train1, y_test1 = train_test_split(
                            X_pred, y_resample, test_size=0.15)
                        X_train, X_val, y_train, y_val = train_test_split(
                            X_train1, y_train1, test_size=0.15)
                        cvs.append(
                            [X_train, y_train, X_val, y_val, X_test1, y_test1])
                    mlp_model = sklearn_model(
                        self.static_data,
                        self.model_dir,
                        self.rated,
                        'mlp',
                        self.n_jobs,
                        is_combine=True,
                        path_group=self.static_data['path_group'])
                    self.model[combine_method] = mlp_model.train(cvs)

                elif combine_method == 'bayesian_ridge':
                    self.logger.info('bayesian_ridge training')
                    self.logger.info('/n')
                    self.model[combine_method] = BayesianRidge()
                    self.model[combine_method].fit(X_pred, y_resample)

                elif combine_method == 'elastic_net':
                    self.logger.info('elastic_net training')
                    self.logger.info('/n')
                    self.model[combine_method] = ElasticNetCV(cv=5)
                    self.model[combine_method].fit(X_pred, y_resample)
                elif combine_method == 'ridge':
                    self.logger.info('ridge training')
                    self.logger.info('/n')
                    self.model[combine_method] = RidgeCV(cv=5)
                    self.model[combine_method].fit(X_pred, y_resample)
            self.logger.info('End of combine models training')
        else:
            self.combine_methods = ['average']
        self.istrained = True
        self.save(self.model_dir)

        return 'Done'

Example #23
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y_time,
                                                    test_size=0.3,
                                                    random_state=0)

Regressor = {
    'Random Forest Regressor':
    RandomForestRegressor(n_estimators=200),
    'Gradient Boosting Regressor':
    GradientBoostingRegressor(n_estimators=500),
    'ExtraTrees Regressor':
    ExtraTreesRegressor(n_estimators=500, min_samples_split=5),
    'Bayesian Ridge':
    BayesianRidge(),
    'Elastic Net CV':
    ElasticNetCV()
}

for name, clf in Regressor.items():
    print(name)
    clf.fit(X_train, y_train)

    print('acc', clf.score(X_test, y_test))
    #print('new_acc',get_acc(y_test,clf.predict(X_test),10))

#         print(f'R2: {r2_score(y_test, clf.predict(X_test)):.2f}')
#         print(f'MAE: {mean_absolute_error(y_test, clf.predict(X_test)):.2f}')
#         print(f'MSE: {mean_squared_error(y_test, clf.predict(X_test)):.2f}')
Example #24
#!/usr/bin/env python

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import BayesianRidge
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np

boston = datasets.load_boston()
X, Y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, Y_train = X[:offset], Y[:offset]
X_test, Y_test = X[offset:], Y[offset:]

regressor = BayesianRidge(compute_score=True)
regressor.fit(X_train, Y_train)
score = regressor.score(X_test, Y_test)
print(score)
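BayesianRidge also exposes a predictive standard deviation; a small hedged follow-up to the fit above:

# Predictive mean and per-sample standard deviation on the held-out rows
y_mean, y_std = regressor.predict(X_test, return_std=True)
print(y_mean[:5], Y_test[:5])
print(y_std[:5])    # uncertainty from the Bayesian posterior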
Example #25
    def regressionfunctions(X_temp, Y_temp, which_regs):

        tunedpars_lr = model_pars_name_dic["tunedpars_lr"]

        reg_ref_dic = {
            "omp": OrthogonalMatchingPursuit(),
            "muelnet": MultiTaskElasticNet(),
            "elnet": ElasticNet(),
            "rfr": RandomForestRegressor(random_state=0),
            "mlp": MLPRegressor(learning_rate="adaptive", random_state=0),
            "br": BayesianRidge(),
            "ard": ARDRegression(),
            "svr": SVR(),
            "nusvr": NuSVR()
        }

        tunedpars_dic = {
            "rfr": model_pars_name_dic["tunedpars_rfr"],
            "mlp": model_pars_name_dic["tunedpars_mlp"],
            "br": model_pars_name_dic["tunedpars_br"],
            "ard": model_pars_name_dic["tunedpars_ard"],
            "svr": model_pars_name_dic["tunedpars_svr"],
            "nusvr": model_pars_name_dic["tunedpars_nusvr"],
            "elnet": model_pars_name_dic["tunedpars_elnet"],
            "muelnet": model_pars_name_dic["tunedpars_muelnet"],
            "omp": model_pars_name_dic["tunedpars_omp"]
        }
        models_output_dic = dict()
        model_dic = dict()
        for key in which_regs.keys():
            if which_regs[key] == True:
                model_dic[key] = [tunedpars_dic[key], reg_ref_dic[key]]

        cv1 = KFold(n_splits=cv, shuffle=True, random_state=1)

        #LinearRegression
        reg = GridSearchCV(LinearRegression(),
                           tunedpars_lr,
                           cv=cv1,
                           n_jobs=-1,
                           return_train_score=True)
        reg.fit(X_temp, Y_temp.ravel())
        #rsquared=reg.best_score_
        rsquared = reg.score(X_temp, Y_temp)
        best_score_all = adj_r_sqrd(reg.score(X_temp, Y_temp))
        best_estimator_all = reg
        models_output_dic["lr"] = {
            "model": reg,
            "mod_score": rsquared,
            "predicted_Y": reg.predict(X_temp)
        }

        for ttm in model_dic.items():
            tunedpars = ttm[1][0]
            models = ttm[1][1]
            mod_name = ttm[0]
            reg = GridSearchCV(models,
                               tunedpars,
                               cv=cv1,
                               n_jobs=-1,
                               return_train_score=True)
            try:
                reg.fit(X_temp, Y_temp.ravel())
                mod_score = reg.score(X_temp, Y_temp)
                models_output_dic[mod_name] = {
                    "model": reg,
                    "mod_score": mod_score,
                    "predicted_Y": reg.predict(X_temp)
                }
                if adj_r_sqrd(mod_score) > best_score_all:
                    #rsquared=reg.best_score_
                    rsquared = mod_score
                    best_score_all = adj_r_sqrd(rsquared)
                    best_estimator_all = reg

            except:
                print("####In except: ", mod_name)
                pass

        #to transfer score from cv score to normal:
        #rsquared=best_estimator_all.score(X_temp, Y_temp)
        #best_score_all=adj_r_sqrd(rsquared)
        #################################################################################

        ####################################################################
        return best_score_all, best_estimator_all, rsquared, models_output_dic
Example #26
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor

# Store the algorithms into a dictionary
reg_all = {
    'Linear Regression': LinearRegression(),
    'Support Vector Machine': SVR(),
    'Bayesian Ridge': BayesianRidge(),
    'Lasso': Lasso(),
    'K Neighbors Regression': KNeighborsRegressor(n_neighbors=2)
}


# Read Data
def read_file(filename, n_fold_input=5):
    data = pd.read_csv(filename, sep='\t', header=None)
    data_x = data.iloc[:, :-1]
    data_y = data.iloc[:, -1]
    n_fold = n_fold_input  # number of folds
    # Split data by KFold
    kf = KFold(n_splits=n_fold)

    return data_x, data_y, kf
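A hedged sketch of the evaluation loop that the dictionary and read_file above appear to set up (the filename is an assumption):

data_x, data_y, kf = read_file('data.tsv', n_fold_input=5)   # assumed filename

for name, reg in reg_all.items():
    fold_mse = []
    for train_idx, test_idx in kf.split(data_x):
        reg.fit(data_x.iloc[train_idx], data_y.iloc[train_idx])
        pred = reg.predict(data_x.iloc[test_idx])
        fold_mse.append(mean_squared_error(data_y.iloc[test_idx], pred))
    print('{}: mean MSE = {:.4f}'.format(name, np.mean(fold_mse)))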
Example #27
def BayesianRidge_Model():
    x_train, y_train, x_test,_ = load_data()
    clf = BayesianRidge()
    test_score = np.sqrt(-cross_val_score(clf, x_train, y_train, cv=10, scoring='neg_mean_squared_error'))
    print(np.mean(test_score))
Example #28
      mean_squared_error(y_test, model.predict(X_test)))
logging.info("Linear Regression | MSE: " +
             str(mean_squared_error(y_test, model.predict(X_test))))

##### Decision Tree Regression #####
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X_train, y_train)

print("Decision Tree Regression | Accuracy Score:",
      model.score(X_test, y_test))
logging.info("Decision Tree Regression | Accuracy Score: " +
             str(model.score(X_test, y_test)))
print("Decision Tree Regression | MSE:",
      mean_squared_error(y_test, model.predict(X_test)))
logging.info("Decision Tree Regression | MSE: " +
             str(mean_squared_error(y_test, model.predict(X_test))))

##### Bayesian Ridge #####
from sklearn.linear_model import BayesianRidge
model = BayesianRidge()
model.fit(X_train, y_train)

print("Bayesian Ridge | Accuracy Score:", model.score(X_test, y_test))
logging.info("Bayesian Ridge | Accuracy Score: " +
             str(model.score(X_test, y_test)))
print("Bayesian Ridge | MSE:", mean_squared_error(y_test,
                                                  model.predict(X_test)))
logging.info("Bayesian Ridge | MSE: " +
             str(mean_squared_error(y_test, model.predict(X_test))))
Example #29
pca = PCA(n_components=410)
X_scaled=pca.fit_transform(X_scaled)
test_X_scaled = pca.transform(test_X_scaled)
print(X_scaled.shape, test_X_scaled.shape)

'''modeling & evaluation'''
#34
# define cross validation strategy
def rmse_cv(model,X,y):
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5))
    return rmse

#35
# We choose 13 models and use 5-fold cross-validation to evaluate them.
models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),
          ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(),XGBRegressor()]

#36
names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]
for name, model in zip(names, models):
    score = rmse_cv(model, X_scaled, y_log)
    print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))

#37
# Next we do some hyperparameter tuning. First define a grid-search helper.
class grid():
    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
Example #30
def task2(data):

	df = data

	dfreg = df.loc[:,['Adj Close','Volume']]
	dfreg['HL_PCT'] = (df['High'] - df['Low']) / df['Close'] * 100.0
	dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

	# Drop missing value
	dfreg.fillna(value=-99999, inplace=True)
	# We want to separate 1 percent of the data to forecast
	forecast_out = int(math.ceil(0.01 * len(dfreg)))
	# Separating the label here, we want to predict the AdjClose
	forecast_col = 'Adj Close'
	dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
	X = np.array(dfreg.drop(['label'], axis=1))
	# Scale the X so that everyone can have the same distribution for linear regression
	X = preprocessing.scale(X)
	# Finally, separate the late X (to forecast) from the early X (used for model generation and evaluation)
	X_lately = X[-forecast_out:]
	X = X[:-forecast_out]
	# Separate label and identify it as y
	y = np.array(dfreg['label'])
	y = y[:-forecast_out]
	
	#Split data
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

	##################
	##################
	##################


	# Linear regression
	clfreg = LinearRegression(n_jobs=-1)
	# 1 - First save the models to local device in models folder
	# filename = 'models/clfreg_model.sav'
	# pickle.dump(clfreg, open(filename, 'wb'))

	# 2 - load the models from disk onces first instruction is done once.
	# clfreg = pickle.load(open(filename, 'rb'))
	clfreg.fit(X_train, y_train)


	# Quadratic Regression 2
	clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
	#Save model to a pickle
	# filename1 = 'models/clfpoly2_model.sav'
	# pickle.dump(clfpoly2, open(filename1, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfpoly2 = pickle.load(open(filename1, 'rb'))
	clfpoly2.fit(X_train, y_train)


	# Quadratic Regression 3
	clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
	#Save model to a pickle
	# filename2 = 'models/clfpoly3_model.sav'
	# pickle.dump(clfpoly3, open(filename2, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfpoly3 = pickle.load(open(filename2, 'rb'))
	clfpoly3.fit(X_train, y_train)


	# KNN Regression
	clfknn = KNeighborsRegressor(n_neighbors=2)
	#Save model to a pickle
	# filename3 = 'models/clfknn_model.sav'
	# pickle.dump(clfknn, open(filename3, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfknn = pickle.load(open(filename3, 'rb'))
	clfknn.fit(X_train, y_train)


	# Lasso Regression
	clflas = Lasso()
	#Save model to a pickle
	# filename4 = 'models/clflas_model.sav'
	# pickle.dump(clflas, open(filename4, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clflas = pickle.load(open(filename4, 'rb'))
	clflas.fit(X_train, y_train)


	# Multitask Lasso Regression
	# clfmtl = MultiTaskLasso(alpha=1.)
	# clfmtl.fit(X_train, y_train).coef_


	# Bayesian Ridge Regression
	clfbyr = BayesianRidge()
	clfbyr.fit(X_train, y_train)
	#Save model to a pickle
	# filename5 = 'models/clfbyr_model.sav'
	# pickle.dump(clfbyr, open(filename5, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfbyr = pickle.load(open(filename5, 'rb'))


	# Lasso LARS Regression
	clflar = LassoLars(alpha=.1)
	clflar.fit(X_train, y_train)
	#Save model to a pickle
	# filename6 = 'models/clflar_model.sav'
	# pickle.dump(clflar, open(filename6, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clflar = pickle.load(open(filename6, 'rb'))


	# Orthogonal Matching Pursuit Regression
	clfomp = OrthogonalMatchingPursuit(n_nonzero_coefs=2)
	clfomp.fit(X_train, y_train)
	#Save model to a pickle
	# filename7 = 'models/clfomp_model.sav'
	# pickle.dump(clfomp, open(filename7, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfomp = pickle.load(open(filename7, 'rb'))


	# Automatic Relevance Determination Regression
	clfard = ARDRegression(compute_score=True)
	clfard.fit(X_train, y_train)
	#Save model to a pickle
	# filename8 = 'models/clfard_model.sav'
	# pickle.dump(clfard, open(filename8, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfard = pickle.load(open(filename8, 'rb'))


	# Logistic Regression
	# clflgr = linear_model.LogisticRegression(penalty='l1', solver='saga', tol=1e-6, max_iter=int(1e6), warm_start=True)
	# coefs_ = []
	# for c in cs:
	#   clflgr.set_params(C=c)
	#   clflgr.fit(X_train, y_train)
	#   coefs_.append(clflgr.coef_.ravel().copy())


	#SGD Regression
	clfsgd = SGDRegressor(random_state=0, max_iter=1000, tol=1e-3)
	clfsgd.fit(X_train, y_train)
	#Save model to a pickle
	# filename9 = 'models/clfsgd_model.sav'
	# pickle.dump(clfsgd, open(filename9, 'wb'))
	
	# 2 - load the models from disk onces first instruction is done once.
	# clfsgd = pickle.load(open(filename9, 'rb'))


	##################
	##################
	##################


	#Create confidence scores
	confidencereg = clfreg.score(X_test, y_test)
	confidencepoly2 = clfpoly2.score(X_test,y_test)
	confidencepoly3 = clfpoly3.score(X_test,y_test)
	confidenceknn = clfknn.score(X_test, y_test)
	confidencelas = clflas.score(X_test, y_test)
	# confidencemtl = clfmtl.score(X_test, y_test)
	confidencebyr = clfbyr.score(X_test, y_test)
	confidencelar = clflar.score(X_test, y_test)
	confidenceomp = clfomp.score(X_test, y_test)
	confidenceard = clfard.score(X_test, y_test)
	confidencesgd = clfsgd.score(X_test, y_test)

	# results
	print('The linear regression confidence is:',confidencereg*100)
	print('The quadratic regression 2 confidence is:',confidencepoly2*100)
	print('The quadratic regression 3 confidence is:',confidencepoly3*100)
	print('The knn regression confidence is:',confidenceknn*100)
	print('The lasso regression confidence is:',confidencelas*100)
	# print('The lasso regression confidence is:',confidencemtl*100)
	print('The Bayesian Ridge regression confidence is:',confidencebyr*100)
	print('The Lasso LARS regression confidence is:',confidencelar*100)
	print('The OMP regression confidence is:',confidenceomp*100)
	print('The ARD regression confidence is:',confidenceard*100)
	print('The SGD regression confidence is:',confidencesgd*100)

	#Create new columns
	forecast_reg = clfreg.predict(X_lately)
	forecast_pol2 = clfpoly2.predict(X_lately)
	forecast_pol3 = clfpoly3.predict(X_lately)
	forecast_knn = clfknn.predict(X_lately)
	forecast_las = clflas.predict(X_lately)
	forecast_byr = clfbyr.predict(X_lately)
	forecast_lar = clflar.predict(X_lately)
	forecast_omp = clfomp.predict(X_lately)
	forecast_ard = clfard.predict(X_lately)
	forecast_sgd = clfsgd.predict(X_lately)

	#Process all new columns data
	dfreg['Forecast_reg'] = np.nan

	last_date = dfreg.iloc[-1].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)

	for i in forecast_reg:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns))]
	    dfreg['Forecast_reg'].loc[next_date] = i
	    
	dfreg['Forecast_pol2'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_pol2:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_pol2'].loc[next_date] = i

	dfreg['Forecast_pol3'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_pol3:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_pol3'].loc[next_date] = i
	    
	dfreg['Forecast_knn'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_knn:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_knn'].loc[next_date] = i
	        
	dfreg['Forecast_las'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_las:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_las'].loc[next_date] = i
	    
	dfreg['Forecast_byr'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_byr:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_byr'].loc[next_date] = i
	    
	dfreg['Forecast_lar'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_lar:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_lar'].loc[next_date] = i
	    
	dfreg['Forecast_omp'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_omp:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_omp'].loc[next_date] = i
	    
	dfreg['Forecast_ard'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_ard:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_ard'].loc[next_date] = i
	    
	dfreg['Forecast_sgd'] = np.nan

	last_date = dfreg.iloc[-26].name
	last_unix = last_date
	next_unix = last_unix + datetime.timedelta(days=1)
	    
	for i in forecast_sgd:
	    next_date = next_unix
	    next_unix += datetime.timedelta(days=1)
	    dfreg['Forecast_sgd'].loc[next_date] = i

	return dfreg.index.format(formatter=lambda x: x.strftime('%Y-%m-%d')), dfreg['Adj Close'].to_list(), dfreg['Forecast_reg'].to_list(), dfreg['Forecast_pol2'].to_list(), dfreg['Forecast_pol3'].to_list(), dfreg['Forecast_knn'].to_list(), dfreg['Forecast_las'].to_list(), dfreg['Forecast_byr'].to_list(), dfreg['Forecast_lar'].to_list(), dfreg['Forecast_omp'].to_list(), dfreg['Forecast_ard'].to_list(), dfreg['Forecast_sgd'].to_list()