Example #1
def build_lasso():
    train_df, test_df = load_data()
    combined_df = pd.concat((train_df.loc[:, 'MSSubClass':'SaleCondition'],
                             test_df.loc[:, 'MSSubClass':'SaleCondition']))

    # feature engineering
    config_categorical_features(combined_df)
    # combined_df = extract_common_features(combined_df)
    log_transform_features(combined_df)
    combined_df = normalize_numerical_features(combined_df)
    combined_df = one_hot_encoding(combined_df)
    missing_value_fill(combined_df)

    X_train = combined_df.iloc[:train_df.shape[0]]
    X_test = combined_df.iloc[train_df.shape[0]:]
    y = np.log1p(train_df["SalePrice"])

    # models
    lasso_model = Lasso(alpha=0.0005, max_iter=1000)

    lgb_params = {
        'lambda_l2': 0,
        'learning_rate': 0.05,
        'min_child_samples': 4,
        'n_estimators': 500,
        'num_leaves': 10
    }
    lgb_model = lgb.LGBMRegressor(**lgb_params)

    xgb_params = {'max_depth': 2, 'n_estimators': 360}
    xgb_model = xgb.XGBRegressor(**xgb_params)

    rf_params = {
        'max_depth': 50,
        'max_features': None,
        'min_samples_leaf': 4,
        'n_estimators': 50
    }
    rf_model = RandomForestRegressor(**rf_params)

    meta_model = rf_model
    model = StackingCVRegressor(regressors=(lasso_model, lgb_model, xgb_model,
                                            rf_model),
                                meta_regressor=meta_model,
                                use_features_in_secondary=True)

    model.fit(np.array(X_train), np.array(y))
    print(
        "cross_validation_rmse:",
        np.mean(
            np.sqrt(-cross_val_score(model,
                                     np.array(X_train),
                                     np.array(y),
                                     cv=3,
                                     scoring="neg_mean_squared_error"))))

    # model prediction
    stack_preds = np.expm1(model.predict(np.array(X_test)))
    solution = pd.DataFrame({"id": test_df.Id, "SalePrice": stack_preds})
    solution.to_csv("./house_price/submission_stack_v1.csv", index=False)
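The full pipeline above cannot run without its data loaders and feature-engineering helpers. As a minimal, self-contained sketch of the same stacking pattern (the synthetic data and hyperparameter values here are illustrative assumptions, not taken from the example):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge
from mlxtend.regressor import StackingCVRegressor

# illustrative synthetic data standing in for the engineered house-price features
X, y = make_regression(n_samples=200, n_features=10, noise=5.0, random_state=0)

stack = StackingCVRegressor(
    regressors=(Lasso(alpha=0.0005, max_iter=1000), Ridge(alpha=1.0)),
    meta_regressor=RandomForestRegressor(n_estimators=50, random_state=0),
    use_features_in_secondary=True)  # the meta-model also sees the raw features

stack.fit(X, y)
preds = stack.predict(X)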
Example #2
def test_gridsearch_numerate_regr():
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, ridge, ridge],
                                meta_regressor=svr_rbf,
                                random_state=42)

    params = {
        'ridge-1__alpha': [0.01, 1.0],
        'ridge-2__alpha': [0.01, 1.0],
        'svr__C': [0.01, 1.0],
        'meta_regressor__C': [0.01, 1.0],
        'use_features_in_secondary': [True, False]
    }

    grid = GridSearchCV(estimator=stack,
                        param_grid=params,
                        cv=5,
                        refit=True,
                        verbose=0)
    grid = grid.fit(X1, y)
    got = round(grid.best_score_, 1)
    assert 0.1 <= got <= 0.2, '%f is wrong' % got
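The grid keys above show how mlxtend names duplicate base estimators (ridge-1, ridge-2) and exposes the meta-model as meta_regressor. When unsure of the exact names, they can be listed from the estimator itself; a minimal sketch reusing the stack defined above:

# print every tunable parameter name exposed by the stacked estimator;
# duplicated base regressors appear with -1/-2 suffixes
for name in sorted(stack.get_params().keys()):
    print(name)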
Example #3
def test_gridsearch_replace_mix():
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    lr = LinearRegression()
    lasso = Lasso(random_state=1)
    stack = StackingCVRegressor(regressors=[svr_lin, lasso, ridge],
                                meta_regressor=svr_rbf,
                                shuffle=False)

    params = {
        'regressors': [[svr_lin, lr]],
        'linearregression': [None, lasso, ridge],
        'svr__kernel': ['poly']
    }

    grid = GridSearchCV(estimator=stack,
                        param_grid=params,
                        cv=KFold(5, shuffle=True, random_state=42),
                        refit=True,
                        verbose=0)
    grid = grid.fit(X1, y)

    got1 = round(grid.best_score_, 2)
    got2 = len(grid.best_params_['regressors'])
    got3 = grid.best_params_['regressors'][0].kernel

    assert got1 == 0.73, got1
    assert got2 == 2, got2
    assert got3 == 'poly', got3
Example #4
def build_StackingModelCV(posBias=False, simple=False, cvRun=5):

    if simple:
        adaModel = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=100)
        etrModel = ExtraTreesRegressor(n_estimators=100)
        xgbModel = XGBRegressor(n_estimators=100)
        rfgModel = RandomForestRegressor(n_estimators=100)
        gbrModel = GradientBoostingRegressor(n_estimators=100)
        adaETRModel = AdaBoostRegressor(ExtraTreeRegressor(), n_estimators=100)
        lgbmModel = LGBMRegressor(learning_rate=0.8, n_estimators=100)
    else:
        adaModel = AdaBoostRegressor(DecisionTreeRegressor(**adaDTR_dtrParams),
                                     **adaDTR_adaParams8)
        etrModel = ExtraTreesRegressor(**etrParams)
        xgbModel = XGBRegressor(**xgbParams)
        rfgModel = RandomForestRegressor(**rfgParams)
        gbrModel = GradientBoostingRegressor(**gbrParams)
        adaETRModel = AdaBoostRegressor(ExtraTreeRegressor(**etParams),
                                        **adaParams)
        lgbmModel = LGBMRegressor(**lgbmParams)

    if posBias:
        metaModel = ElasticNet(alpha=0, positive=True)
    else:
        metaModel = LinearRegression()

    stregr = StackingCVRegressor(regressors=[adaModel, etrModel, xgbModel,
                                             rfgModel, gbrModel, adaETRModel,
                                             lgbmModel],
                                 meta_regressor=metaModel,
                                 cv=cvRun)

    return stregr
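A possible call site for this factory; fit_X and fit_y are hypothetical placeholders, and the *Params dictionaries must already be in scope if simple=False:

# sketch: build the simple variant and fit it (fit_X, fit_y are placeholders)
stacker = build_StackingModelCV(posBias=True, simple=True, cvRun=5)
stacker.fit(fit_X, fit_y)
preds = stacker.predict(fit_X)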
def test_sparse_matrix_inputs():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                random_state=42)

    # dense
    stack.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stack.predict(X1) - y)**2)
    assert round(got, 2) == mse, got

    # sparse
    stack.fit(sparse.csr_matrix(X1), y)

    if Version(sklearn_version) < Version("0.22"):
        expected_value = 0.20
    else:
        expected_value = 0.21

    got = np.mean((stack.predict(sparse.csr_matrix(X1)) - y)**2)
    assert round(got, 2) == expected_value, got
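As the test exercises, StackingCVRegressor accepts SciPy CSR input directly. A self-contained sketch with synthetic data (all names here are illustrative):

from scipy import sparse
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression, Ridge
from mlxtend.regressor import StackingCVRegressor

X_dense, y_s = make_regression(n_samples=100, n_features=5, random_state=0)
X_sparse = sparse.csr_matrix(X_dense)

model = StackingCVRegressor(regressors=[LinearRegression()],
                            meta_regressor=Ridge(random_state=1))
model.fit(X_sparse, y_s).predict(X_sparse)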
Example #6
    def regression_models(self):
        """Regression models"""
        # Support Vector Regression (SVR) is used for continuous targets
        # (tune the parameters below for different results)
        self.svr = make_pipeline(
            RobustScaler(), SVR(gamma=0.1, C=20, epsilon=0.008)
        )  # a small gamma defines a Gaussian kernel with a large variance

        # Light GBM
        self.lightgbm = LGBMRegressor(objective='regression',
                                      num_leaves=4,
                                      learning_rate=0.01,
                                      n_estimators=5000,
                                      max_bin=200,
                                      bagging_fraction=0.75,
                                      bagging_freq=5,
                                      bagging_seed=7,
                                      feature_fraction=0.2,
                                      feature_fraction_seed=7,
                                      verbose=-1)
        # XGBoost
        self.xgboost = XGBRegressor(learning_rate=0.01,
                                    n_estimators=3460,
                                    max_depth=3,
                                    min_child_weight=0,
                                    gamma=0,
                                    subsample=0.7,
                                    colsample_bytree=0.7,
                                    objective='reg:linear',  # deprecated alias for 'reg:squarederror'
                                    nthread=-1,
                                    scale_pos_weight=1,
                                    seed=27,
                                    reg_alpha=0.00006)
        # Ensemble learning: stack multiple regressors under a meta-regressor
        self.stack_reg = StackingCVRegressor(
            regressors=(self.ridge, self.lasso, self.elastic_net, self.svr,
                        self.lightgbm, self.xgboost),
            meta_regressor=self.xgboost,
            use_features_in_secondary=True)
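The snippet assumes self.ridge, self.lasso and self.elastic_net are defined elsewhere in the class. One plausible shape for those members, with hyperparameters that are pure assumptions:

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV

# hypothetical definitions for the members the snippet assumes,
# placed inside the same class alongside regression_models()
self.ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=(0.1, 1.0, 10.0)))
self.lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=10000))
self.elastic_net = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=10000))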
Example #7
def main():

    x_train, x_val, y_train, y_val, x_test = get_dataset(0.1,
                                                         "xg",
                                                         is_duan=False)

    x_train_2, x_val_2, y_train, y_val, x_test_2 = get_dataset(0.1,
                                                               "rf",
                                                               is_duan=False)

    lr_regressor = get_best_lr_model(None)
    lr_regressor, y_train_pred_1 = train_model(x_train, y_train, lr_regressor)

    xg_regressor = get_best_xg_model(None)
    xg_regressor, y_train_pred_2 = train_model(x_train, y_train, xg_regressor)

    # gb_regressor is fitted from the same XGBoost model search as xg_regressor
    gb_regressor = get_best_xg_model(None)
    gb_regressor, y_train_pred_3 = train_model(x_train, y_train, gb_regressor)

    rf_regressor = get_best_rf_model(None)
    rf_regressor, y_train_pred_4 = train_model(x_train_2, y_train,
                                               rf_regressor)

    stack_gen = StackingCVRegressor(regressors=(xg_regressor, lr_regressor,
                                                gb_regressor, rf_regressor),
                                    meta_regressor=xg_regressor,
                                    use_features_in_secondary=True)

    stack_gen, y_train_pred_5 = train_model(x_train, y_train, stack_gen)

    eval_model(x_val, y_val, lr_regressor)
    eval_model(x_val, y_val, xg_regressor)
    eval_model(x_val, y_val, gb_regressor)
    eval_model(x_val_2, y_val, rf_regressor)
    eval_model(x_val, y_val, stack_gen)

    expected_exp_residual, residual_vector = get_mean_exp_residual(
        y_train_pred_5, y_train)

    print(expected_exp_residual)

    def blended_predictions(x1, x2):
        return ((0.35 * lr_regressor.predict(x1)) +
                (0.1 * gb_regressor.predict(x1)) +
                (0.1 * xg_regressor.predict(x1)) +
                (0.05 * rf_regressor.predict(x2)) +
                (0.1 * stack_gen.predict(np.array(x1))))

    y_test = blended_predictions(x_test, x_test_2)

    print(y_test)

    y_test = tf.math.exp(y_test)
    y_test = tf.math.multiply(y_test, expected_exp_residual)
    y_test = y_test.numpy()
    print(y_test)
    y_test = pd.Series(y_test)
    y_test.index = pd.RangeIndex(start=1461, stop=2920, step=1)
    y_test.to_csv("./data/submission.csv", sep=",")
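The blend weights in blended_predictions sum to 0.7 rather than 1.0; whether that is intentional is unclear from the source. A self-contained sketch of the weighted-blend idea with explicit normalization (the arrays are placeholders):

import numpy as np

def blend(predictions, weights):
    # weighted average of model predictions; weights are normalized to sum to 1
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()
    return sum(w * p for w, p in zip(weights, predictions))

# usage with placeholder prediction arrays
p1, p2 = np.array([1.0, 2.0]), np.array([3.0, 4.0])
print(blend([p1, p2], weights=[0.7, 0.3]))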
Example #8
    def stack(self, ridge, lasso, elasticnet, gbr, xgboost, lightgbm):
        stackgen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                                   gbr, xgboost, lightgbm),
                                       meta_regressor=xgboost,
                                       use_features_in_secondary=True)

        stackgen.fit(self.X, self.y)
        return stackgen
def test_clone():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    clone(stregr)
Example #10
def test_unsupported_meta_regressor():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    lasso = Lasso()
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=lasso)
    # Lasso accepts sample_weight on scikit-learn >= 0.23, so this fit succeeds there
    stack.fit(X1, y, sample_weight=w).predict(X1)
Example #11
def test_unsupported_regressor():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    lasso = Lasso(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge, lasso],
                                meta_regressor=svr_rbf)
    stack.fit(X1, y, sample_weight=w).predict(X1)
def test_predict_meta_features():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    test_meta_features = stregr.predict_meta_features(X_test)
    assert test_meta_features.shape[0] == X_test.shape[0]
def AlgoSCR(df_train, df_trainY, m1, m2, m3, m4, m5):
    model = StackingCVRegressor(regressors=(m1, m2, m3, m4, m5),
                                meta_regressor=m1,
                                use_features_in_secondary=True)
    rmsle_cv(model, df_train, df_trainY)
    model.fit(df_train, df_trainY)
    result = model.predict(df_train)
    print("rms value of same set: ",
          np.around(sqrt(mean_squared_error(df_trainY, result)), decimals=7))
    return model
def test_multivariate():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf)
    stack.fit(X2, y).predict(X2)
    mse = 0.19
    got = np.mean((stack.predict(X2) - y) ** 2)
    assert round(got, 2) == mse, '%f != %f' % (round(got, 2), mse)
Example #15
def stack_model(X, y, test):
    elastic = load("../models/elastic_net_cv.joblib")
    gbr = load("../models/gbr.joblib")
    lasso = load("../models/lassocv.joblib")
    lgdm = load("../models/lgdm.joblib")
    ridge = load("../models/ridge.joblib")
    xgboost = load("../models/xgboost.joblib")
    stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elastic, gbr, xgboost, lgdm),
                                    meta_regressor=xgboost, use_features_in_secondary=True, n_jobs=-1)
    stack_gen.fit(np.array(X), np.array(y))
    dump(stack_gen, '../models/result.joblib')
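Once dumped, the fitted stack can be restored in another process and used for prediction; a minimal sketch (X_new is a hypothetical feature matrix):

import numpy as np
from joblib import load

stack_gen = load('../models/result.joblib')
preds = stack_gen.predict(np.array(X_new))  # X_new is a placeholder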
def test_different_models():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf)
    stack.fit(X1, y).predict(X1)
    mse = 0.21
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse
def test_train_meta_features_():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)
    stregr.fit(X_train, y_train)
    train_meta_features = stregr.train_meta_features_
    assert train_meta_features.shape[0] == X_train.shape[0]
Example #18
def test_regressor_gridsearch():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr], meta_regressor=svr_rbf)

    params = {'regressors': [[ridge, lr], [lr, ridge, lr]]}

    grid = GridSearchCV(estimator=stregr, param_grid=params, cv=5, refit=True)
    grid.fit(X1, y)

    assert len(grid.best_params_['regressors']) == 3
def test_unsupported_meta_regressor():
    # meta regressor with no support for
    # sample_weight should raise error
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    knn = KNeighborsRegressor()
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=knn)

    with pytest.raises(TypeError):
        stack.fit(X1, y, sample_weight=w).predict(X1)
Example #20
def test_multivariate_class():
    lr = LinearRegression()
    ridge = Ridge(random_state=1)
    meta = LinearRegression(normalize=True)  # 'normalize' requires scikit-learn < 1.2
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=meta,
                                 multi_output=True,
                                 random_state=0)
    stregr.fit(X2, y2).predict(X2)
    mse = 0.13
    got = np.mean((stregr.predict(X2) - y2) ** 2.)
    assert round(got, 2) == mse, got
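LinearRegression(normalize=True) only runs on older scikit-learn: the normalize argument was deprecated in 1.0 and removed in 1.2. A sketch of the commonly recommended replacement (a scaler in a pipeline; not an exact numerical match for the old normalization):

from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# modern stand-in for LinearRegression(normalize=True)
meta = make_pipeline(StandardScaler(), LinearRegression())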
def test_unsupported_regressor():
    # including regressor that does not support
    # sample_weight should raise error
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    knn = KNeighborsRegressor()
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge, knn],
                                meta_regressor=svr_rbf)
    with pytest.raises(TypeError):
        stack.fit(X1, y, sample_weight=w).predict(X1)
def test_use_features_in_secondary():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                cv=3,
                                use_features_in_secondary=True)
    stack.fit(X1, y).predict(X1)
    mse = 0.2
    got = np.mean((stack.predict(X1) - y) ** 2)
    assert round(got, 2) == mse, '%f != %f' % (round(got, 2), mse)
def test_not_fitted_predict():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)

    expect = ("Estimator not fitted, "
              "call `fit` before exploiting the model.")

    assert_raises(NotFittedError, expect, stregr.predict, X_train)
def test_weight_ones():
    # sample_weight=None and sample_weight=ones should give
    # the same result, provided that the randomness of the
    # models is controlled
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                cv=KFold(5, shuffle=True, random_state=5))
    pred1 = stack.fit(X1, y).predict(X1)
    pred2 = stack.fit(X1, y, sample_weight=np.ones(40)).predict(X1)
    assert np.max(np.abs(pred1 - pred2)) < 1e-3
def test_internals():
    lr = LinearRegression()
    regressors = [lr, lr, lr, lr, lr]
    cv = 10
    stack = StackingCVRegressor(regressors=regressors,
                                meta_regressor=lr,
                                cv=cv)
    stack.fit(X3, y2)
    assert stack.predict(X3).mean() == y2.mean()
    assert stack.meta_regr_.intercept_ == 0.0
    assert stack.meta_regr_.coef_[0] == 0.0
    assert stack.meta_regr_.coef_[1] == 0.0
    assert stack.meta_regr_.coef_[2] == 0.0
    assert len(stack.regr_) == len(regressors)
Example #26
def test_get_params():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[ridge, lr],
                                 meta_regressor=svr_rbf)

    got = sorted(list({s.split('__')[0] for s in stregr.get_params().keys()}))
    expect = [
        'cv', 'linearregression', 'meta-svr', 'meta_regressor', 'refit',
        'regressors', 'ridge', 'shuffle', 'store_train_meta_features',
        'use_features_in_secondary'
    ]
    assert got == expect, got
def test_sample_weight():
    lr = LinearRegression()
    svr_lin = SVR(kernel='linear', gamma='auto')
    ridge = Ridge(random_state=1)
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    stack = StackingCVRegressor(regressors=[svr_lin, lr, ridge],
                                meta_regressor=svr_rbf,
                                cv=KFold(4, shuffle=True, random_state=7))
    pred1 = stack.fit(X1, y, sample_weight=w).predict(X1)
    mse = 0.21  # 0.20770
    got = np.mean((stack.predict(X1) - y)**2)
    assert round(got, 2) == mse, "Expected %.2f, but got %.5f" % (mse, got)
    pred2 = stack.fit(X1, y).predict(X1)
    maxdiff = np.max(np.abs(pred1 - pred2))
    assert maxdiff > 1e-3, "max diff is %.4f" % maxdiff
def test_not_fitted_predict():
    lr = LinearRegression()
    svr_rbf = SVR(kernel='rbf', gamma='auto')
    ridge = Ridge(random_state=1)
    stregr = StackingCVRegressor(regressors=[lr, ridge],
                                 meta_regressor=svr_rbf,
                                 store_train_meta_features=True)
    X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.3)

    expect = ("This StackingCVRegressor instance is not fitted yet. Call "
              "'fit' with appropriate arguments before using this method.")

    assert_raises(NotFittedError, expect, stregr.predict, X_train)

    assert_raises(NotFittedError, expect, stregr.predict_meta_features,
                  X_train)
Example #29
def linear_stacking_reg(X, y):
    RANDOM_SEED = 0
    reg1, dict1 = SVM_reg()
    reg2, dict2 = sklearn_GBT_reg()
    reg3, dict3 = xgb_reg()
    reg4, dict4 = ridge_reg()
    reg5, dict5 = lasso_reg()
    reg6, dict6 = Lars_reg()
    reg7, dict7 = KNeighborsRegressor_reg()
    reg8, dict8 = sklearn_RF_reg()
    reg9, dict9 = KernelRidge_reg()
    reg10, dict10 = BayesianRidge_reg()
    params = {}
    my_dict_list = [
        dict1, dict2, dict3, dict4, dict5, dict6, dict7, dict8, dict9, dict10
    ]
    name_list = [
        "svr", "gradientboostingregressor", "xgbregressor", "ridge", "lasso",
        "lars", "kneighborsregressor", "randomforestregressor", "kernelridge",
        "meta_regressor"
    ]
    # collect each base model's search space; index 9 is the meta-regressor,
    # whose own parameters are skipped
    for i, current_dict in enumerate(my_dict_list):
        if i == 9:
            continue
        for key in current_dict:
            params['%s__%s' % (name_list[i], key)] = current_dict[key]
    linear_stacker = StackingCVRegressor(regressors=(reg1, reg2, reg3, reg4,
                                                     reg5, reg6, reg7, reg8,
                                                     reg9),
                                         meta_regressor=reg10,
                                         use_features_in_secondary=True)

    grid = RandomizedSearchCV(linear_stacker,
                              params,
                              n_iter=10,
                              n_jobs=10,
                              scoring="neg_mean_absolute_error",
                              cv=3,
                              verbose=1)

    grid.fit(X.values, y)

    print("Best: %f using %s" % (grid.best_score_, grid.best_params_))
    best_model = grid.best_estimator_

    return best_model
Example #30
    def __init__(self,
                 lasso_target_params=LASSO_TARGET_PARAMS,
                 ridge_target_params=RIDGE_TARGET_PARAMS,
                 lasso_constraints_params=LASSO_CONSTRAINTS_PARAMS,
                 constraint_columns=CONSTRAINT_COLUMNS,
                 target_column=TARGET_COLUMN):

        self.target_estimator = StackingCVRegressor(
            regressors=(Lasso(**lasso_target_params),
                        Ridge(**ridge_target_params)),
            meta_regressor=Ridge(alpha=0.01))
        self.constraints_estimator = Lasso(**lasso_constraints_params)

        self.constraint_columns = constraint_columns
        self.target_column = target_column

        self.test_columns = None