Example #1
def stacking(X, y, k_cv):
    res = []
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.001)),
                  ('svr', SVR(C=2000, gamma=0.001)),
                  ("enet",
                   ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators,
                            n_jobs=15,
                            final_estimator=LinearRegression())
    kfold = KFold(n_splits=k_cv, shuffle=True, random_state=0)
    for trainval_index, test_index in kfold.split(X):
        X_trainval, y_trainval = X[trainval_index, :], y[trainval_index]
        X_test, y_test = X[test_index, :], y[test_index]
        reg.fit(X_trainval, y_trainval)
        # reg.score returns R^2; its square root is what gets reported as "accuracy"
        print((reg.score(X_trainval, y_trainval))**0.5)
        test_pre = reg.predict(X_test)
        print("accuracy: ", (r_2(y_test, test_pre))**0.5)
        res.append(r_2(y_test, test_pre)**0.5)
    print("mean accuracy: ", np.array(res).mean())
Example #2
 def reg_ensemble_1(self):
     """
     Regressors Ensemble
     :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     # el, el_pred = self.elastic_net_regr()
     # dt, dt_pred = self.decis_tree_regr()
     # knr, knr_pred = self.kneighbors_regr()
     # gbr, gbr_pred = self.gradient_boost_regr()
     estimators = [
         # ("str", dt),
         # ("eln", el),
         ("lasso", lasso),
         # ("knr", knr),
         # ("gbr", gbr),
         ("lr", lr),
         ("rf", rf)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #3
def lvl2_xgb_randomsearch(rawdf, results_dir, pp_choice, param_dir, passthrough, final_pp_choice=None):
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_store = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    with open(param_dir, 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                     model_results.items()}
    model_object = {k: model_object[k].set_params(**{kk.split('__')[1]: vv for kk, vv in v.loc[0, 'params'].items()})
                    for k, v in model_results.items()}

    preprocess_pipeline = pp_selector(pp_choice)

    lvl1_pipeline = [
        (model_name,
         Pipeline([
             ('preprocess', preprocess_pipeline),
             (model_name, model_object[model_name])
         ])
         )
        for model_name in model_store]
    final_estimator_params = {'final_estimator__final_est__n_estimators': scipy.stats.randint(150, 1000),
                              'final_estimator__final_est__learning_rate': scipy.stats.uniform(0.01, 0.59),
                              'final_estimator__final_est__subsample': scipy.stats.uniform(0.3, 0.6),
                              'final_estimator__final_est__max_depth': scipy.stats.randint(1, 16),
                              'final_estimator__final_est__colsample_bytree': scipy.stats.uniform(0.5, 0.4),
                              'final_estimator__final_est__min_child_weight': [1, 2, 3, 4],
                              'final_estimator__final_est__gamma': scipy.stats.expon(scale=0.05),
                              }
    if passthrough:
        final_est = Pipeline([
            ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                    preprocess_pipeline=pp_selector(final_pp_choice),
                                                    no_of_lvl1=len(lvl1_pipeline))),
            ('debugger', DebuggerTransformer(info='final')),
            ('final_est', XGBRegressor())
        ])
    else:
        final_est = XGBRegressor()

    est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)
    est = RandomizedSearchCV(est,
                             param_distributions=final_estimator_params,
                             cv=5,
                             n_iter=100,
                             scoring=make_scorer(rmsle, greater_is_better=False),
                             verbose=1,
                             n_jobs=-1)

    est.fit(x_train, y_train)
    score = {'lvl2_xgb': est.cv_results_}
    results_dir = create_results_directory(results_dir)
    with open(f'{results_dir}/results_store.pkl', 'wb') as f:
        pickle.dump(score, f)
Example #4
def test_stacking_regressor_diabetes(cv, final_estimator, predict_params,
                                     passthrough):
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=final_estimator,
                            cv=cv,
                            passthrough=passthrough)
    reg.fit(X_train, y_train)
    result = reg.predict(X_test, **predict_params)
    expected_result_length = 2 if predict_params else 1
    if predict_params:
        assert len(result) == expected_result_length

    X_trans = reg.transform(X_test)
    expected_column_count = 12 if passthrough else 2
    assert X_trans.shape[1] == expected_column_count
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])

    reg.set_params(lr='drop')
    reg.fit(X_train, y_train)
    reg.predict(X_test)

    X_trans = reg.transform(X_test)
    expected_column_count_drop = 11 if passthrough else 1
    assert X_trans.shape[1] == expected_column_count_drop
    if passthrough:
        assert_allclose(X_test, X_trans[:, -10:])
Example #5
def main():
    data = pd.read_csv('dataset/complete.csv')
    data.drop("CountryCode", axis=1, inplace=True)
    data.drop("RegionName", axis=1, inplace=True)
    data.drop("RegionCode", axis=1, inplace=True)
    data.drop("M1_Wildcard", axis=1, inplace=True)

    # Remove flag and index columns (data.iteritems() was removed in pandas 2.0,
    # and dropping columns while iterating over them is fragile)
    drop_cols = [col for col in data.columns
                 if "flag" in col.lower() or "index" in col.lower()]
    data.drop(columns=drop_cols, inplace=True)

    # remove any rows that contain 'nan'
    data.dropna(axis=0, how='any', inplace=True)

    # change datatype of Date from int to DateTime64
    date_series = pd.to_datetime(data['Date'].astype(str), format='%Y-%m-%d')
    data['Date'] = date_series.map(dt.datetime.toordinal)
    # encoding country name
    data = pd.get_dummies(data, columns=['CountryName'],
                          prefix=['CountryName'])

    # drop the one-hot country columns again, so the model trains without them
    country_cols = [col for col in data.columns if "countryname" in col.lower()]
    data.drop(columns=country_cols, inplace=True)
    print(data.info())

    # separate feature and label
    data_feature = data.drop(['ConfirmedCases', 'new_cases', 'ConfirmedDeaths'], axis=1, inplace=False)
    data_label_total_cases = data.loc[:, 'ConfirmedCases']
    data_label_total_deaths = data.loc[:, 'ConfirmedDeaths']
    data_label_cases_perDay = data.loc[:, 'new_cases']

    scaler = RobustScaler()
    features = scaler.fit_transform(data_feature)

    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        data_label_cases_perDay,
                                                        test_size=0.25,
                                                        random_state=42)

    estimators = [
        ('rfr', RandomForestRegressor(random_state=42, n_estimators=50)),
        ('gbr', GradientBoostingRegressor(random_state=42)),
        ('lsvr', LinearSVR(random_state=42, max_iter=1000)),
        ('etr', ExtraTreesRegressor(random_state=42, criterion='absolute_error', n_estimators=50))  # 'mae' was renamed in scikit-learn 1.0
    ]

    model = StackingRegressor(
        estimators=estimators,
        final_estimator=ExtraTreesRegressor(random_state=42, n_estimators=50)
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print("MAE: " + str(mae))
Example #6
def stacking_regressor(estimators, final_estimator, data, labels, args=None):
    """
    Stacking: reduce bias by combining multiple models (regression).
    """
    from sklearn.ensemble import StackingRegressor
    args = args or {}  # avoid a mutable default argument
    reg = StackingRegressor(estimators=estimators, final_estimator=final_estimator, **args)
    reg.fit(data, labels)
    return reg
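A hypothetical call to this helper; the estimators, final estimator, and the extra keyword arguments passed through args are all illustrative:

from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV

X, y = make_regression(n_samples=300, n_features=20, random_state=0)
reg = stacking_regressor(
    estimators=[('lr', LinearRegression()),
                ('rf', RandomForestRegressor(n_estimators=50, random_state=0))],
    final_estimator=RidgeCV(),
    data=X,
    labels=y,
    args={'cv': 5, 'n_jobs': -1},  # forwarded to StackingRegressor
)
print(reg.score(X, y))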
Example #7
def Stacked_Ensemble(x_train, x_test, y_train, y_test):

    # Path to save model
    path_to_model = os.path.join("model", "StackedEnsemble.sav")

    # define the base models
    level0 = list()
    level0.append(('lr', LinearRegression()))
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    level0.append(('adaboost', AdaBoostRegressor()))
    # level0.append(('bayes', ))

    # Classifier
    # level0.append(('lr', LogisticRegression()))
    # level0.append(('knn', KNeighborsClassifier()))
    # level0.append(('cart', DecisionTreeClassifier()))
    # level0.append(('svm', SVC()))
    # level0.append(('bayes', GaussianNB()))

    # define meta learner model
    level1 = LinearRegression()

    # Classifier
    # level1 = LogisticRegression()

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    # model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

    model.fit(x_train, y_train)

    # Predicting
    y_pred = model.predict(x_test)

    # Printing the training results (classification-style metrics on the rounded regression predictions)
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n",
          classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n",
          accuracy_score(y_test, y_pred.round()))

    # Saving the Model
    if not os.path.exists(os.path.dirname(path_to_model)):
        try:
            os.makedirs(os.path.dirname(path_to_model))
        except OSError:  # guard against a race condition when creating the directory
            print("Could not create model directory")

    pickle.dump(model, open(path_to_model, 'wb'))

    return y_test, y_pred
Example #8
def stacking_qtlmas(X_trainval, y_trainval, X_test, y_test):
    estimators = [('krr', KernelRidge(kernel="cosine", alpha=0.005)),
                  ('svr', SVR(C=2500, gamma=0.001)),
                  ("enet",
                   ElasticNet(alpha=0.00001, l1_ratio=0.0005, max_iter=10000))]
    reg = StackingRegressor(estimators=estimators,
                            n_jobs=15,
                            final_estimator=LinearRegression())

    reg.fit(X_trainval, y_trainval)
    print((reg.score(X_trainval, y_trainval))**0.5)
    test_pre = reg.predict(X_test)
    return test_pre
Example #9
def train(prop, k_fold=5, test_size=0.2):
    # 0.settings
    set_seed(GLOBAL_SEED)
    cv = k_fold  # cross-validation generator
    if cv == 1:
        cv = LeaveOneOut()

    # 1. base learners
    knn = KNeighborsRegressor(leaf_size=3, n_neighbors=2, p=1, weights='distance')
    svr = GridSearchCV(SVR(), param_grid={"C": np.logspace(0, 2, 4), "gamma": np.logspace(-2, 2, 7)}, n_jobs=-1)
    ridge = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    mlp = MLPRegressor(hidden_layer_sizes=(50, 100, 50), max_iter=700)
    rf = RandomForestRegressor()
    gbdt = GradientBoostingRegressor()
    # 2. meta model (final estimator)
    meta_model = RidgeCV(alphas=(0.1, 1.0, 10.0, 100.0))
    # 3. stacking model
    stacking_model = StackingRegressor(
        estimators=[('KNN', knn), ('SVR', svr), ('Ridge', ridge), ('MLP', mlp), ('RF', rf), ('GBDT', gbdt)],
        final_estimator=meta_model,
        n_jobs=-1, cv=cv  # cross-validation
    )

    # 4.load data
    x, y = loadXY(config.data_load_path[prop])
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_size, shuffle=True)

    # 5. train the model (the stacking model performs its own internal cross-validation)
    stacking_model.fit(x_train, y_train)
    # val-scores
    result = cross_validate(stacking_model, x_train, y_train, scoring=['neg_mean_absolute_error','neg_mean_squared_error','r2'], cv=cv)
    mae_val = result['test_neg_mean_absolute_error'].mean()
    mse_val = result['test_neg_mean_squared_error'].mean()
    r2_val = result['test_r2'].mean()
    # test-score
    pred = stacking_model.predict(x_test)
    mae_test = sklearn.metrics.mean_absolute_error(y_test, pred)
    mse_test = sklearn.metrics.mean_squared_error(y_test, pred)
    r2_test = sklearn.metrics.r2_score(y_test, pred)
    # show
    print("验证集: MAE:%f, MSE:%f, R2:%f\n"
          "测试集: MAE:%f, MSE:%f, R2:%f"
          % (mae_val, mse_val, r2_val,
             mae_test, mse_test, r2_test))

    # 6. save the model
    month_once_save_name = time.strftime('%Y-%m.pkl', time.localtime())
    save_path = os.path.join(config.model_save_path[prop], month_once_save_name)
    file_util.save_model(stacking_model, save_path)
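file_util is project-specific and not shown; assuming its save_model simply pickles the estimator, the saved model could be reloaded like this:

import pickle

# Hypothetical reload, assuming file_util.save_model wrote a plain pickle.
with open(save_path, 'rb') as f:
    stacking_model = pickle.load(f)
print(stacking_model.predict(x_test[:5]))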
Example #10
def test_stacking_regressor_sparse_passthrough(fmt):
    # Check passthrough behavior on a sparse X matrix
    X_train, X_test, y_train, _ = train_test_split(
        sparse.coo_matrix(scale(X_diabetes)).asformat(fmt),
        y_diabetes, random_state=42
    )
    estimators = [('lr', LinearRegression()), ('svr', LinearSVR())]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    clf = StackingRegressor(
        estimators=estimators, final_estimator=rf, cv=5, passthrough=True
    )
    clf.fit(X_train, y_train)
    X_trans = clf.transform(X_test)
    assert_allclose_dense_sparse(X_test, X_trans[:, -10:])
    assert sparse.issparse(X_trans)
    assert X_test.format == X_trans.format
Example #11
def test_stacking_regression():
    from sklearn.model_selection import train_test_split
    from sklearn.datasets import load_diabetes
    from sklearn.linear_model import RidgeCV
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.ensemble import StackingRegressor

    X, y = load_diabetes(return_X_y=True)
    estimators = [('gbm',
                   xgb.sklearn.XGBRegressor(objective='reg:squarederror')),
                  ('lr', RidgeCV())]
    reg = StackingRegressor(estimators=estimators,
                            final_estimator=RandomForestRegressor(
                                n_estimators=10, random_state=42))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    reg.fit(X_train, y_train).score(X_test, y_test)
Example #12
    def fit(self, X, y, random_state=None):
        """
        Train ENOLS on the given training set.

        Parameters
        ----------
        X: an input array of shape (n_sample, n_features)
        y: an array of shape (n_sample,) containing the target values for the input examples

        Return
        ------
        self: the fitted model
        """

        # use random instead of np.random to sample random numbers below
        random = check_random_state(random_state)

        estimators = [('lr', LinearRegression())]

        if isinstance(self.sample_size, int):
            self.sample_size = 'reservoir_sampling'

        # add all the trained OLS models to this list
        self.estimators_lr, self.estimators_TSR, self.estimators_enols = [], [], []
        for _ in range(self.n_estimators):
            samples = sample_without_replacement(n_population=random.choice([50, 100]),
                                                 n_samples=random.choice([10, 20]),
                                                 random_state=random_state, method=self.sample_size)

            X_train = [X[idx] for idx in samples]
            y_train = [y[idx] for idx in samples]

            reg = LinearRegression()
            reg.fit(np.array(X_train), np.array(y_train))

            tsr = TheilSenRegressor()
            tsr.fit(np.array(X_train), np.array(y_train))

            enol = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())
            enol.fit(np.array(X_train), np.array(y_train))

            self.estimators_lr.append(reg), self.estimators_TSR.append(tsr), self.estimators_enols.append(enol)

        return self
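Only fit is shown; a possible companion predict that averages the fitted stacking members, an assumed aggregation rule rather than anything the original defines:

    def predict(self, X):
        """Average the predictions of the fitted stacking members.
        (Hypothetical: the original class does not show predict.)"""
        preds = np.array([est.predict(X) for est in self.estimators_enols])
        return preds.mean(axis=0)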
Example #13
 def reg_ensemble_4(self):
     """
     Regressors Ensemble
      :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     estimators = [
         ("lr", lr),
         ("rf", rf),
         ("lasso", lasso)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             cv=200,
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #14
def test_stacking_regressor_drop_estimator():
    # prescale the data to avoid convergence warning without using a pipeline
    # for later assert
    X_train, X_test, y_train, _ = train_test_split(scale(X_diabetes),
                                                   y_diabetes,
                                                   random_state=42)
    estimators = [('lr', 'drop'), ('svr', LinearSVR(random_state=0))]
    rf = RandomForestRegressor(n_estimators=10, random_state=42)
    reg = StackingRegressor(estimators=[('svr', LinearSVR(random_state=0))],
                            final_estimator=rf,
                            cv=5)
    reg_drop = StackingRegressor(estimators=estimators,
                                 final_estimator=rf,
                                 cv=5)

    reg.fit(X_train, y_train)
    reg_drop.fit(X_train, y_train)
    assert_allclose(reg.predict(X_test), reg_drop.predict(X_test))
    assert_allclose(reg.transform(X_test), reg_drop.transform(X_test))
Example #15
def Stacked_Ensemble(x_train, x_test, y_train, y_test):

    # define the base models
    level0 = list()
    level0.append(('lr', LinearRegression()))
    level0.append(('knn', KNeighborsRegressor()))
    level0.append(('cart', DecisionTreeRegressor()))
    level0.append(('svm', SVR()))
    level0.append(('adaboost', AdaBoostRegressor()))
    # level0.append(('bayes', ))

    # Classifier
    # level0.append(('lr', LogisticRegression()))
    # level0.append(('knn', KNeighborsClassifier()))
    # level0.append(('cart', DecisionTreeClassifier()))
    # level0.append(('svm', SVC()))
    # level0.append(('bayes', GaussianNB()))

    # define meta learner model
    level1 = LinearRegression()

    # Classifier
    # level1 = LogisticRegression()

    # define the stacking ensemble
    model = StackingRegressor(estimators=level0, final_estimator=level1, cv=5)
    # model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)

    model.fit(x_train, y_train)

    # Predicting
    y_pred = model.predict(x_test)

    # Printing the training results (classification-style metrics on the rounded regression predictions)
    print("\n\n(Stacked Ensemble) Confusion Matrix: \n",
          confusion_matrix(y_true=y_test, y_pred=y_pred.round()))
    print("(Stacked Ensemble) Report: \n",
          classification_report(y_test, y_pred.round()))
    print("(Stacked Ensemble) Accuracy: \n",
          accuracy_score(y_test, y_pred.round()))

    return y_test, y_pred
Example #16
def init_stacking(train_scaled, test_scaled, target, test_id):
    if not os.path.isfile('Data/pickles/models/pancake_stack'):

        estimators = [
            # trimmed to the non-default parameters; the removed ones ('mse',
            # min_impurity_split, etc.) were defaults dumped by repr() and some
            # no longer exist in current scikit-learn
            ('rfr', RandomForestRegressor(max_depth=5, min_samples_leaf=4,
                                          n_estimators=700, oob_score=True,
                                          verbose=3)),

            ('xgboost', XGBRegressor(learning_rate=0.08, max_depth=3, n_estimators=500, n_jobs=-1,
                                     reg_alpha=0.001, reg_lambda=1, verbosity=2)),

            ('svr', SVR(C=5, cache_size=200, coef0=0.0, degree=1, epsilon=0.01, gamma='auto',
                        kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=3)),

            ('lgbm', LGBMRegressor(boosting_type='gbdt', lambda_l1=0,
                                   lambda_l2=0.1, learning_rate=0.1,
                                   max_depth=0, num_leaves=10))
        ]

        stack = StackingRegressor(estimators=estimators, final_estimator=LassoCV(cv=5), verbose=3)

        stack.fit(train_scaled, target)

        with open('Data/pickles/models/pancake_stack', 'wb') as file:
            pickle.dump(stack, file)

    else:
        with open('Data/pickles/models/pancake_stack', 'rb') as file:
            stack = pickle.load(file)

    y_pred = stack.predict(test_scaled)

    y_pred = np.exp(y_pred)

    submission_df = pd.DataFrame(y_pred, index=test_id, columns=['SalePrice'])

    submission_df.to_csv('Data/Submission/S6.csv')
Example #17
 def reg_ensemble_5(self):
     """
     Regressors Ensemble
      :return: ensemble prediction
     """
     param = {'final_estimator__max_features': [1, 5],
              'final_estimator__n_jobs': [1, -1, 5]}
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     estimators = [
         ("lr", lr),
         ("rf", rf)
     ]
     # tss = TimeSeriesSplit(n_splits=2, test_size=10)
     tss = TimeSeriesSplit(gap=20, max_train_size=None, n_splits=10, test_size=None)
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             cv=tss,
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
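The param grid at the top of this method is defined but never used; a sketch of how it could drive a grid search over the stacker's final estimator, which is only a guess at the intended use (reg, param, and tss are the objects built inside the method, and the training data stands in for self.x_train/self.y_train):

from sklearn.model_selection import GridSearchCV

# Hypothetical continuation: tune the final RandomForestRegressor with the
# otherwise-unused `param` grid, reusing the same TimeSeriesSplit.
search = GridSearchCV(reg, param_grid=param, cv=tss, n_jobs=-1)
search.fit(x_train, y_train)
print(search.best_params_)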
Example #18
def lvl2_generate_prediction(rawdf, x_test, results_dir, lvl1_results_dir, type_, pp_choice,
                             passthrough=False, final_pp_choice=None):
    x_train = rawdf.iloc[:, :-1]
    y_train = rawdf.iloc[:, -1]
    model_names = ['rf', 'et', 'xgb']
    model_object = {
        'xgb': XGBRegressor(),
        'rf': RandomForestRegressor(),
        'et': ExtraTreesRegressor()
    }

    with open(f'{lvl1_results_dir}/results_store.pkl', 'rb') as f:
        model_results = pickle.load(f)
    model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                     model_results.items()}

    lvl1_pipeline = [
        (model_name, Pipeline([
            ('preprocess', pp_selector(pp_choice)),
            (model_name, model_object[model_name])
        ]).set_params(**model_results[model_name].loc[0, 'params']))
        for model_name in model_names]

    if type_ == 'lvl2_ridgecv':
        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=RidgeCV(), passthrough=False)
    elif type_ == 'lvl2_xgb':
        if passthrough:
            final_est = Pipeline([
                ('final_preprocess', final_est_pipeline(feature_names=x_train.columns.tolist(),
                                                        preprocess_pipeline=pp_selector(final_pp_choice),
                                                        no_of_lvl1=len(lvl1_pipeline))),
                ('debugger', DebuggerTransformer(info='final')),
                ('final_est', XGBRegressor())
            ])
        else:
            final_est = XGBRegressor()

        est = StackingRegressor(estimators=lvl1_pipeline, final_estimator=final_est, passthrough=passthrough)

        with open(f'{results_dir}/results_store.pkl', 'rb') as f:
            model_results = pickle.load(f)
        model_results = {k: pd.DataFrame(v).sort_values('mean_test_score', ascending=False) for k, v in
                         model_results.items()}
        #est.set_params(
        #    **{f'final_estimator__{k}': v for k, v in model_results['lvl2ptvs_xgb'].loc[0, 'params'].items()})
        est.set_params(**model_results['lvl2ptvs_xgb'].loc[0, 'params'])

    prediction = est.fit(x_train, y_train).predict(x_test)
    sub = pd.DataFrame()
    sub['Id'] = x_test['Id']
    sub['SalePrice'] = prediction
    sub.to_csv(f'{results_dir}/{type_}_pp{pp_choice}_predictions.csv', index=False)
Example #19
 def reg_ensemble_2(self):
     """
     Regressors Ensemble
      :return: ensemble prediction
     """
     lr, lr_pred = self.linear_regr()
     rf, rf_pred = self.random_forest_regr()
     lasso, lasso_pred = self.lasso_regr()
     # lor = LogisticRegression()  # unused leftover
     # el, el_pred = self.elastic_net_regr()
     estimators = [
         # ("eln", el),
         ("lasso", lasso),
         ("lr", lr),
         ("rf", rf)
     ]
     reg = StackingRegressor(estimators=estimators,
                             final_estimator=RandomForestRegressor(),
                             cv=5, #10
                             n_jobs=-1)
     reg.fit(self.x_train, self.y_train)
     return reg.predict(self.x_test)
Example #20
    def test_stacking_regression(self):
        self._init_ray()

        from sklearn.model_selection import train_test_split
        from sklearn.datasets import load_diabetes
        from sklearn.linear_model import RidgeCV
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.ensemble import StackingRegressor

        X, y = load_diabetes(return_X_y=True)
        estimators = [
            ("gbm", RayXGBRegressor(objective="reg:squarederror")),
            ("lr", RidgeCV()),
        ]
        reg = StackingRegressor(
            estimators=estimators,
            final_estimator=RandomForestRegressor(
                n_estimators=10, random_state=42),
        )

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, random_state=42)
        reg.fit(X_train, y_train).score(X_test, y_test)
Example #21
# Step 1: Load data
X, y = load_boston(return_X_y=True)

# Step 2: Split data
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=40)

# Step 3: Train
regression = StackingRegressor(estimators=[
    ('knn',
     KNeighborsRegressor(n_neighbors=4,
                         weights='distance',
                         leaf_size=1,
                         metric='manhattan')),
    ('dt', GradientBoostingRegressor(max_depth=3, n_estimators=220))
],
                               final_estimator=Ridge(random_state=40),
                               cv=5,
                               n_jobs=-1)
regression.fit(X_train, y_train)
score_train = regression.score(X_train, y_train)
score_test = regression.score(X_test, y_test)
pred_train = regression.predict(X_train)
pred_test = regression.predict(X_test)
rmse_train = np.sqrt(metrics.mean_squared_error(pred_train, y_train))
rmse_test = np.sqrt(metrics.mean_squared_error(pred_test, y_test))
print('RMSE:{:.2f}/{:.2f}'.format(rmse_train, rmse_test))
print('R2Score:{:.2f}/{:.2f}'.format(score_train, score_test))
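Note that load_boston was removed in scikit-learn 1.2; on newer versions the same pipeline can be exercised by swapping in another dataset, e.g.:

# Drop-in replacement for Step 1 on scikit-learn >= 1.2; the rest of the
# example (split, fit, RMSE/R2 reporting) is unchanged.
from sklearn.datasets import fetch_california_housing

X, y = fetch_california_housing(return_X_y=True)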
Example #22
df['IsNew'] = df.YearBuilt.apply(lambda x: 1 if x > 2000 else 0)

df['IsOld'] = df.YearBuilt.apply(lambda x: 1 if x < 1946 else 0)

df.drop('MiscFeature', axis=1, inplace=True)

# ------------------------------- #

df['Age'] = df['YrSold'] - df['YearBuilt']

df['BsmtTotalBathRooms'] = df['BsmtFullBath'] + df['BsmtHalfBath']

df['AbvGradeTotalBathRooms'] = df['FullBath'] + df['HalfBath']

df['Total Rooms'] = df['BedroomAbvGr'] + df['BsmtFullBath'] + df['BsmtHalfBath'] + df['FullBath'] + df['HalfBath'] \
                    + df['TotRmsAbvGrd'] + df['KitchenAbvGr']

stack.fit(X, y)
# scale the test frame once, reusing the scaler fitted on the training data
# (the original scaled twice with fit_transform, which refits on test data)
test = scale.transform(df[Importances.nlargest(int(best_col)).index])

pred = stack.predict(test)

sub['SalePrice'] = pred
sub.to_csv('submission_2.csv', index=False)
# the Kaggle score is around 0.3400 RMSE

# ------- Plot best cols ------- #
plt.figure(figsize=(20, 15))
Importances.nlargest(int(best_col)).plot(kind='barh')
plt.show()
Example #23
params_XGB = {
    'reg_alpha': 0.001,
    'eta': 0.03,
    'reg_lambda': 0.001,
    'max_depth': 4,
    'n_estimators': 1000,
    'colsample_bytree': 0.6,
    'subsample': 0.6
}
XGB_reg.set_params(**params_XGB)

lr_lasso = Lasso(max_iter=10000, alpha=0.0002)

lr_ridge = Ridge(max_iter=10000, alpha=1.298710621242485)

# Create the stacked model
estimators = [('lasso', lr_lasso), ('xgb', XGB_reg), ('ridge', lr_ridge)]

reg = StackingRegressor(estimators=estimators)
reg.fit(X_train, y_train)

# Create the submission
submission_creator(reg, '_RidgeXGBLassoStack')

# Create the averaged (voting) model
vot = VotingRegressor(estimators=estimators)

# Create the submission
vot.fit(X_train, y_train)
submission_creator(vot, '_RidgeXGBLassoAverage')
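submission_creator is never shown in the snippet; a plausible sketch, assuming a Kaggle-style Id/SalePrice submission and hypothetical globals X_test and test_ids:

import pandas as pd

def submission_creator(model, suffix):
    # Hypothetical helper: predict on the held-out features and write an
    # Id/SalePrice CSV. X_test and test_ids are assumed globals.
    preds = model.predict(X_test)
    sub = pd.DataFrame({'Id': test_ids, 'SalePrice': preds})
    sub.to_csv(f'submission{suffix}.csv', index=False)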
Example #24
dt = DecisionTreeRegressor(random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20),
                   activation='relu',
                   solver='lbfgs',
                   alpha=0.0001,
                   verbose=False,
                   max_iter=400)
rf = RandomForestRegressor(n_jobs=-1,
                           max_depth=25,
                           n_estimators=900,
                           random_state=0)
# adaknn = AdaBoostRegressor(base_estimator=knn, random_state=0, n_estimators=9)
bagdt = BaggingRegressor(base_estimator=dt, n_estimators=300, random_state=0)
# rf.fit(X_train,y_train)
# pred=rf.predict(X_test)
# -------------------- Stacking voting -----------------------------
stacking = StackingRegressor(estimators=[('bagdt', bagdt), ("mlp", mlp),
                                         ("randomForest", rf)],
                             n_jobs=-1)
stacking.fit(X, y)
y_pred_stacking = stacking.predict(df_test)
print(y_pred_stacking)

# ------------------ Predict the registered ones -------------------------
# knn = KNeighborsRegressor(n_jobs=-1, n_neighbors=2, weights='distance', p=1)
dt = DecisionTreeRegressor(random_state=0)
mlp = MLPRegressor(hidden_layer_sizes=(100, 60, 40, 20),
                   activation='relu',
                   solver='lbfgs',
                   alpha=0.0001,
                   verbose=False,
                   max_iter=400)
rf = RandomForestRegressor(n_jobs=-1,
                           max_depth=25,
                           n_estimators=900,
Example #25
train_prct = 0.8
n_train = int(round(X.shape[0] * train_prct))

## Models
knn = KNeighborsRegressor(n_neighbors=5)
svm = SVR()
rf = RandomForestRegressor(n_estimators=100, criterion='squared_error', random_state=0)  # 'mse' was renamed in scikit-learn 1.0
decision_tree = DecisionTreeRegressor(max_depth=3, max_features=2)
bayesian_ridge = BayesianRidge()

base_models = [("KNN", knn), ("SVM", svm), ("DecisionTree", decision_tree),
               ("RandomForest", rf)]

## Fit
stacked_learner = StackingRegressor(base_models, cv=N_FOLDS)
stacked_learner = stacked_learner.fit(X[:n_train], Y[:n_train])
y_pred_test = stacked_learner.predict(X[n_train:])
residuals_stacked = Y[n_train:] - y_pred_test
residuals_stacked_train = Y[:n_train] - stacked_learner.predict(X[:n_train])

adaboost = AdaBoostRegressor(n_estimators=100, loss="square", random_state=0)
adaboost = adaboost.fit(X[:n_train], Y[:n_train])
y_pred_test = adaboost.predict(X[n_train:])
residuals_adaboost = Y[n_train:] - y_pred_test
residuals_adaboost_train = Y[:n_train] - adaboost.predict(X[:n_train])

## Predict on entire dataset
y_pred = stacked_learner.predict(X)
df = pd.DataFrame.from_dict({
    "state": data.state,
    "population": data.population,
Example #26
                           min_samples_leaf=2,
                           max_features='sqrt',
                           max_depth=5,
                           oob_score=True)),
]

stack = StackingRegressor(estimators=estimators,
                          final_estimator=RandomForestRegressor(
                              n_estimators=1400,
                              min_samples_split=2,
                              min_samples_leaf=2,
                              max_features='sqrt',
                              max_depth=5,
                              oob_score=True))

stack.fit(Xtrainv, ytrainv)
stack_train_pred = stack.predict(Xtrainv)
stack_val_pred = stack.predict(Xtestv)
stack_test_pred = stack.predict(Xtest)

stack_train_mse = mean_squared_error(ytrainv, stack_train_pred)
stack_val_mse = mean_squared_error(ytestv, stack_val_pred)
stack_test_mse = mean_squared_error(ytest, stack_test_pred)

print("RMSE using StackRegressor:\t{}\t{}\t{}\n".format(
    np.sqrt(stack_train_mse), np.sqrt(stack_val_mse), np.sqrt(stack_test_mse)))

df_rf = pd.DataFrame({'Actual': ytest, 'Predicted': stack_test_pred})
fig1 = pp.figure(figsize=(8, 6))
df_rf.head(n=300).plot()
pp.legend()
Example #27
# The final_estimator aggregates the predictions of the individual models to make the final prediction.
"""
    Used to push performance to its limit.
    Can cause overfitting (especially with small datasets).
    Takes a long time to run.
"""
from sklearn.ensemble import StackingRegressor

stack_models = [
    ('elasticnet', poly_pipeline),
    ('randomforest', rfr),
    ('gbr', gbr),
    ('lgbm', lgbm),
]

stack_reg = StackingRegressor(stack_models, final_estimator=xgb, n_jobs=-1)
stack_reg.fit(x_train, y_train)
stack_pred = stack_reg.predict(x_test)
mse_eval('Stacking Ensemble', stack_pred, y_test)

## Weighted Blending
"""
Compute the final output by multiplying each model's predictions by a weight.
    Adjust the per-model weights to produce the final output.
    The weights should sum to 1.0.
"""

final_outputs = {
    'elasticnet': poly_pred,
    'randomforest': rfr_pred,
    'gbr': gbr_pred,
    'xgb': xgb_pred,
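The snippet is cut off here; a sketch of the weighted sum the comment above describes, with illustrative weights that sum to 1.0:

# Hypothetical completion of the truncated block: blend the stored
# predictions with fixed weights (the values here are illustrative only).
final_prediction = (final_outputs['elasticnet'] * 0.25
                    + final_outputs['randomforest'] * 0.25
                    + final_outputs['gbr'] * 0.25
                    + final_outputs['xgb'] * 0.25)
mse_eval('Weighted Blending', final_prediction, y_test)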
Example #28
class AnalyticalModel:
    scorer_list = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

    def __init__(self, data: pd.DataFrame, target: str, training_split=0.8, one_out=False, model_config=None,
                 random_seed=42, cv_folds=10, cv_reps=20):
        self.target = target
        self.data = data
        self.attribute_data = self.data.drop(target, axis=1)
        self.target_data = self.data[[target]]
        self.attributes = self.attribute_data.columns
        self.one_out = one_out
        self.training_split = training_split
        self.random_seed = random_seed
        self.cv_folds = cv_folds
        self.cv_reps = cv_reps
        if one_out:
            self.x_train, self.x_test, self.y_train, self.y_test = (
                self.attribute_data,
                self.attribute_data,
                self.target_data,
                self.target_data
            )
        else:
            self.x_train, self.x_test, self.y_train, self.y_test = sk.model_selection.train_test_split(
                self.attribute_data,
                self.target_data,
                test_size=(1. - self.training_split),
                random_state=self.random_seed
            )
        # self.train_n = int(self.data.shape[0] * self.training_split) if not self.one_out else int(self.data.shape[0])
        self.train_n = int(self.x_train.shape[0])
        self.model = None
        self.model_configs = model_config
        self.results = None
        self.score = None
        self.confusion = None
        self.coef = None
        self.r2 = None
        self.r2_adjusted = None
        self.mse = None
        self.rmse = None
        self.anderson = None
        self.anderson_p = None
        self.residuals = None
        self.predictions = None
        self.aic = None
        self.aaic = None
        self.bic = None
        self.eval = None
        self.build_model()

    def build_mlr(self, params):
        """
        Build, fit and predict with a multiple linear regression model.
        :param params:
        :return:
        """
        self.model = make_pipeline(
            sk.preprocessing.StandardScaler(),
            sk.linear_model.LinearRegression(**params)
        )
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.score = self.model.score(self.x_test, self.y_test)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        y_test_f = self.y_test.to_numpy().flatten()
        res = (y_test_f - self.predictions)
        self.residuals = res

    def build_linear_svr(self, params):
        """
        Build, fit and predict with a Linear Support Vector Regressor
        :param params:
        :return:
        """
        self.model = make_pipeline(
            sk.preprocessing.StandardScaler(),
            sk.svm.LinearSVR(random_state=self.random_seed, tol=1e-4, max_iter=5000, C=1, **params)
        )
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        y_test_f = self.y_test.to_numpy().flatten()
        res = (y_test_f - self.predictions)
        self.residuals = res

    def build_gbr(self, params):
        """
        Build, fit and predict with a Gradient Boost Regressor
        :param params:
        :return:
        """
        self.model = make_pipeline(
            sk.preprocessing.StandardScaler(),
            GradientBoostingRegressor(random_state=self.random_seed, **params)
        )
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        y_test_f = self.y_test.to_numpy().flatten()
        res = (y_test_f - self.predictions)
        self.residuals = res

    def build_elastic_net(self, params):
        """
        Build, fit and predict with an Elastic Net CV

        :param params:
        :return:
        """
        self.model = make_pipeline(
            sk.preprocessing.StandardScaler(),
            sk.linear_model.ElasticNetCV(**params)
        )
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        y_test_f = self.y_test.to_numpy().flatten()
        res = (y_test_f - self.predictions)
        self.residuals = res

    def build_rfr(self, params):
        """
        Build, fit and predict with a Random Forest Regressor
        :param params:
        :return:
        """
        self.model = make_pipeline(
            sk.preprocessing.StandardScaler(),
            RandomForestRegressor(random_state=self.random_seed, **params)
        )
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        y_test_f = self.y_test.to_numpy().flatten()
        res = (y_test_f - self.predictions)
        self.residuals = res

    def build_svr(self, params):
        """
        Build, fit and predict with a Support Vector Regressor
        :param params:
        :return:
        """
        self.model = make_pipeline(
            sk.preprocessing.StandardScaler(),
            sk.svm.SVR(kernel='rbf', tol=1e-4, max_iter=5000, C=1, **params)
        )
        y = self.y_train.to_numpy().flatten()
        self.results = self.model.fit(self.x_train, y)
        self.predictions = self.results.predict(self.x_test)
        self.coef = None
        y_test_f = self.y_test.to_numpy().flatten()
        res = (y_test_f - self.predictions)
        self.residuals = res

    def build_stacker(self, train_x, train_y, test_x, test_y, params):
        """
        Build, fit and predict with a stacking regressor ensemble.
        :param train_x:
        :param train_y:
        :param test_x:
        :param test_y:
        :param params:
        :return:
        """
        # n_train_x = sk.preprocessing.scale(train_x, axis=1)
        if "estimators" in params.keys():
            estimators = []
            for e in params["estimators"]:
                # example estimator would be 'linear_model.RidgeCV', where the group and type must match the scikit-learn model
                sm = e.split(".")
                estimator = (sm[1], getattr(getattr(sk, sm[0]), sm[1]))
                estimators.append(estimator)
        else:
            estimators = [
                ('lr', sk.linear_model.LinearRegression()),
                # ('svr', sk.svm.LinearSVR(random_state=42)),
                ('enet', sk.linear_model.ElasticNetCV()),
                ('ridge', sk.linear_model.RidgeCV())
            ]
        self.model = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(random_state=42),  passthrough=False, n_jobs=-1)
        self.results = self.model.fit(train_x, train_y)
        pred_y = self.results.predict(test_x)
        self.predictions = pred_y
        test_y = test_y.to_numpy().flatten()
        self.coef = None
        res = test_y - pred_y
        self.residuals = res

    def build_model(self, weights=None):
        test_n = self.train_n if not self.one_out else 0
        test_m = self.data.shape[0]
        train_data = self.data[0:self.train_n]
        test_data = self.data[test_n:test_m]
        train_x = train_data[self.attributes]
        train_y = train_data[[self.target]]
        test_x = test_data[self.attributes]
        test_y = test_data[[self.target]]
        weights = np.ones(train_x.shape[0]) if weights is None else weights

        y = train_y.to_numpy().flatten()
        x = train_x.to_numpy()
        model_configs = {} if self.model_configs is None else self.model_configs
        if "type" in model_configs.keys():
            params = model_configs["params"] if "params" in model_configs.keys() else {}
            if model_configs["type"] == "MLR":
                self.build_mlr(params)
            elif model_configs["type"] == "LinearSVR":
                self.build_linear_svr(params)
            elif model_configs["type"] == "GBR":
                self.build_gbr(params)
            elif model_configs["type"] == "RFR":
                self.build_rfr(params)
            elif model_configs["type"] == "SVR":
                self.build_svr(params)
            elif model_configs["type"] == "ElasticNetCV":
                self.build_elastic_net(params)
            # elif model_configs["type"] == "Stacker":
            #     self.build_stacker(params)
            else:
                self.build_mlr(params)
        else:
            model_configs["type"] = "MLR"
            self.model_configs = model_configs
            self.build_mlr({})

        n = float(self.data.shape[0])
        p = float(self.data.shape[1] - 1.)
        sse = np.sum(np.power(self.residuals, 2))
        sst = np.sum(np.power(test_y - np.mean(test_y), 2))
        self.r2 = ((sst - sse) / sst).round(4)
        self.r2_adjusted = (self.r2 - (1. - self.r2) * 2. / (n - 3.)).round(4)
        self.rmse = (np.sqrt(sse / (n - p - 1.))).round(4)
        self.mse = (np.power(self.rmse, 2)).round(4)
        self.aic = (n * np.log(sse / n) + (2. * p) + n + 2.).round(4)
        self.aaic = (self.aic + (2. * (p + 1.) * (p + 2.))/(n - p - 2.)).round(4)
        self.bic = ((n * np.log(sse/n)) + (p * np.log(n))).round(4)
        self.results.aic = self.aic
        self.results.bic = self.bic

        self.anderson = scipy.stats.anderson(self.residuals)
        self.anderson_pvalue(replicate=True)

    def anderson_pvalue(self, replicate=True):
        ad = self.anderson.statistic
        if replicate:
            if ad < 2:
                p = 1. - np.exp(-1.2337141/ad) / np.sqrt(ad) * (2.00012+(.247105-(.0649821-(.0347962-(.011672-.00168691*ad)*ad)*ad)*ad)*ad)
            else:
                p = 1. - np.exp(-1.*np.exp(1.0776-(2.30695-(.43424-(.082433-(.008056 -.0003146*ad)*ad)*ad)*ad)*ad))
        else:
            # https://www.spcforexcel.com/knowledge/basic-statistics/anderson-darling-test-for-normality
            ad = ad * (1. + (.75/50.) + 2.25/(50.**2))
            if ad >= 0.6:
                p = 1. - np.exp(1.2937 - 5.709*ad + 0.0186*(ad**2))
            elif 0.34 < ad < 0.6:
                p = 1. - np.exp(0.9177 - 4.279*ad - 1.38*(ad**2))
            elif 0.2 < ad < 0.34:
                p = 1.0 - np.exp(-8.318 + 42.796*ad - 59.938*(ad**2))
            else:
                p = 1.0 - np.exp(-13.436 + 101.14*ad - 223.73*(ad**2))
        self.anderson_p = p

    def evaluate_VIF(self, threshold=5.0):
        valid = True
        subset = self.data[list(self.attributes)]
        if len(self.attributes) > 1:
            for i in range(0, len(self.attributes)):
                subset_data = subset.drop(self.attributes[i], axis=1)
                mod = sm.OLS(subset[self.attributes[i]], sm.add_constant(subset_data))
                res = mod.fit()
                vif2 = 1. / (1. - res.rsquared)
                if vif2 > threshold:
                    valid = False
                    break
        if valid:
            return True
        else:
            return False

    def evaluate(self, use="rmse", ad=True, check_VIF=False, exclude=True):
        use = use.lower()
        self.eval = use
        if use == "r2":
            metric = abs(self.r2) - 1.0
        elif use == "r2a":
            metric = abs(self.results.rsquared_adj) - 1.0
        elif use == "rmse":
            metric = self.rmse
        elif use == "press":
            r = smo.OLSInfluence(self.results)
            metric = r.ess_press
        elif use == "aic":
            metric = self.aic
        elif use == "caic":
            k = self.data.shape[1] - 1
            n = self.results.nobs
            metric = self.aic + ((2*(k*k) + 2*k)/(n - k - 1))
        elif use == "bic":
            metric = self.bic
        else:
            metric = self.mse
        if ad:
            if self.anderson_p < 0.05:
                if exclude:
                    metric = float("inf")
                else:
                    metric = 10000
        if check_VIF:
            if not self.evaluate_VIF():
                if exclude:
                    metric = float("inf")
                else:
                    metric = 10000      # Allows for model to still be on the list but will let better models get added.
        return metric

    def plot_results(self):
        test_data = self.data[self.train_n:] if not self.one_out else self.data[0:self.train_n]
        test_y = test_data[[self.target]]
        pred_y = self.predictions

        plt.subplot(2, 1, 1)
        plt.title("{} Model Results ({}: {}) \n Attributes: {}".format(
            self.model_configs["type"], self.eval, getattr(self, self.eval), ", ".join(list(self.attributes)))
        )
        plot_x = np.arange(0, self.residuals.shape[0])
        plt.scatter(plot_x, test_y, color='gray', linewidth=1)
        plt.scatter(plot_x, pred_y, color='red', linewidth=1)
        plt.ylabel("Prediction/Actual")
        plt.axhline(y=np.mean(pred_y), linewidth=0.5, color='black')
        red_patch = mpatches.Patch(color='red', label='Prediction')
        gray_patch = mpatches.Patch(color='gray', label='Actual')
        plt.legend(handles=[gray_patch, red_patch])

        plt.subplot(2, 1, 2)
        plt.scatter(pred_y, self.residuals, facecolors='none', edgecolors='blue')
        plt.axhline(linewidth=0.5, color='black')
        plt.ylabel("Fitted vs Residuals")
        plt.show()

    def print_summary(self):
        test_data = self.data[self.train_n:] if not self.one_out else self.data[0:self.train_n]
        test_y = test_data[[self.target]]
        pred_y = self.predictions
        max_error = skm.max_error(test_y, pred_y)
        mean_absolute_error = skm.mean_absolute_error(test_y, pred_y)
        median_absolute_error = skm.median_absolute_error(test_y, pred_y)
        print("\n----------------- Model Summary ----------------")
        print("Type: {}\t\tEvaluation Criteria: {}".format(self.model_configs["type"], self.eval).expandtabs(15))
        print("Response: {}\t\tAttributes: {}".format(self.target, ", ".join(list(self.attributes))).expandtabs(15))
        print("Total Data Records: {}\t\tTraining Data Split: {}".format(self.data.shape[0], self.training_split).expandtabs(15))
        print("Total Training Records: {}\t\tTotal Testing Records: {}".format(self.train_n, test_data.shape[0]).expandtabs(15))
        print("R Squared: {}\t\tMean Squared Error: {}\t\tRoot Mean Squared Error: {}".format(round(self.r2,4), round(self.mse,4), round(self.rmse,4)).expandtabs(15))
        print("Max Error: {}\t\tMean Absolute Error: {}\t\tMedian Absolute Error: {}".format(
            round(max_error, 4), round(mean_absolute_error, 4), round(median_absolute_error,4)).expandtabs(15))

    def print_summary2(self):
        print(self.results.summary())
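The "Stacker" branch of build_model is commented out, so build_stacker is never reached; a hypothetical direct call, using the 'module.Class' estimator strings that method parses (the dataframe df and the target column name are assumptions):

# Hypothetical usage of AnalyticalModel.build_stacker.
model = AnalyticalModel(df, target='response', model_config={"type": "MLR"})
model.build_stacker(model.x_train, model.y_train, model.x_test, model.y_test,
                    params={"estimators": ["linear_model.RidgeCV",
                                           "linear_model.ElasticNetCV"]})
print(model.residuals[:5])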
Example #29
def test_stacking_regressor_error(y, params, type_err, msg_err):
    with pytest.raises(type_err, match=msg_err):
        reg = StackingRegressor(**params, cv=3)
        reg.fit(scale(X_diabetes),
                y,
                sample_weight=np.ones(X_diabetes.shape[0]))
Example #30
X = vectorizedData[:, :-1]
Y = vectorizedData[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

baseModels = [('ridgeRegressor', linear_model.Ridge(alpha=0.01)),
              ('randomForestRegressor',
               RandomForestRegressor(max_depth=10,
                                     random_state=0,
                                     n_estimators=15,
                                     max_features=0.5)),
              ('supportVectorRegressor', svm.SVR(C=10, epsilon=0.5))]
stackedRegressor = StackingRegressor(estimators=baseModels)
stackedRegressor.fit(X_train, Y_train)
trainingError = np.mean((stackedRegressor.predict(X_train) - Y_train)**2)
print("Training Error: %.6f" % trainingError)
Y_predict_unscaled = stackedRegressor.predict(X_test)
testingError = np.mean((Y_predict_unscaled - Y_test)**2)
print("Testing Error: %.6f" % testingError)
meanScore = np.mean(imdbScores)
standDeviation = np.std(imdbScores)
Y_predict = Y_predict_unscaled * standDeviation + meanScore
errorsAllowed = [
    0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7,
    0.75, 0.8, 0.85, 0.9, 0.95, 1.0, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35,
    1.4, 1.45, 1.5
]
predictionAccuracyList = []
for errorAllowed in errorsAllowed: