Exemplo n.º 1
0
def test_huber_equals_lr_for_high_epsilon():
    # Check that an unpenalized HuberRegressor (alpha=0.0) with a very large
    # epsilon gives nearly the same fit as LinearRegression.
    X, y = make_regression_with_outliers()

    ols = LinearRegression(fit_intercept=True)
    ols.fit(X, y)

    robust = HuberRegressor(epsilon=1e3, alpha=0.0, fit_intercept=True)
    robust.fit(X, y)

    # Coefficients are compared to 3 decimals, the intercept to 2.
    assert_almost_equal(robust.coef_, ols.coef_, 3)
    assert_almost_equal(robust.intercept_, ols.intercept_, 2)
Exemplo n.º 2
0
def test_huber_warm_start():
    # Fit once, remember the coefficients, then refit the same estimator
    # with warm_start=True on the same data.
    X, y = make_regression_with_outliers()
    model = HuberRegressor(
        alpha=1.0,
        fit_intercept=True,
        max_iter=10000,
        tol=1e-1,
        warm_start=True,
    )
    model.fit(X, y)
    first_fit_coef = model.coef_.copy()
    model.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(model.coef_, first_fit_coef, 1)

    # The warm-started refit is expected to report zero iterations.
    assert model.n_iter_ == 0
Exemplo n.º 3
0
def test_huber_warm_start():
    # Fit once, remember the coefficients, then refit the same estimator
    # with warm_start=True on the same data.
    X, y = make_regression_with_outliers()
    model = HuberRegressor(
        alpha=1.0,
        fit_intercept=True,
        max_iter=10000,
        tol=1e-1,
        warm_start=True,
    )
    model.fit(X, y)
    first_fit_coef = model.coef_.copy()
    model.fit(X, y)

    # SciPy performs the tol check after doing the coef updates, so
    # these would be almost same but not equal.
    assert_array_almost_equal(model.coef_, first_fit_coef, 1)

    # No n_iter_ in old SciPy (<=0.9)
    # And as said above, the first iteration seems to be run anyway.
    n_iter = model.n_iter_
    if n_iter is not None:
        assert_equal(1, n_iter)
Exemplo n.º 4
0
def test_huber_and_sgd_same_results():
    # HuberRegressor and SGDRegressor(loss="huber") should converge to the
    # same coefficients when given the same parameters.
    X, y = make_regression_with_outliers(n_samples=10, n_features=2)

    # A first fit finds the scale parameter. X and y are then divided by it
    # so that the second fit optimizes the scale to 1.0.
    huber = HuberRegressor(
        epsilon=1.35, alpha=0.0, fit_intercept=False, max_iter=100)
    huber.fit(X, y)
    fitted_scale = huber.scale_
    X_scale, y_scale = X / fitted_scale, y / fitted_scale
    huber.fit(X_scale, y_scale)
    assert_almost_equal(huber.scale_, 1.0, 3)

    sgd = SGDRegressor(
        loss="huber",
        epsilon=1.35,
        alpha=0.0,
        fit_intercept=False,
        max_iter=10000,
        tol=None,
        shuffle=True,
        random_state=0,
    )
    sgd.fit(X_scale, y_scale)
    assert_array_almost_equal(huber.coef_, sgd.coef_, 1)
Exemplo n.º 5
0
def test_huber_better_r2_score():
    # Test that huber scores better than ridge on the non-outliers and
    # worse than ridge on the outliers.
    X, y = make_regression_with_outliers()

    huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=100)
    huber.fit(X, y)

    # Samples whose absolute residual is below epsilon * scale_ are treated
    # as non-outliers.
    residuals = np.dot(X, huber.coef_) + huber.intercept_ - y
    threshold = huber.epsilon * huber.scale_
    mask = np.abs(residuals) < threshold
    X_in, y_in = X[mask], y[mask]
    X_out, y_out = X[~mask], y[~mask]

    huber_score = huber.score(X_in, y_in)
    huber_outlier_score = huber.score(X_out, y_out)

    # The Ridge regressor should be influenced by the outliers and hence
    # give a worse score on the non-outliers as compared to the huber
    # regressor.
    ridge = Ridge(fit_intercept=True, alpha=0.01)
    ridge.fit(X, y)
    ridge_score = ridge.score(X_in, y_in)
    ridge_outlier_score = ridge.score(X_out, y_out)
    assert_greater(huber_score, ridge_score)

    # The huber model should also fit poorly on the outliers.
    assert_greater(ridge_outlier_score, huber_outlier_score)
Exemplo n.º 6
0
def test_huber_sparse():
    # Fitting on a CSR copy of X should give the same coefficients as
    # fitting on the dense X.
    X, y = make_regression_with_outliers()

    dense_model = HuberRegressor(fit_intercept=True, alpha=0.1)
    dense_model.fit(X, y)

    sparse_model = HuberRegressor(fit_intercept=True, alpha=0.1)
    sparse_model.fit(sparse.csr_matrix(X), y)

    assert_array_almost_equal(sparse_model.coef_, dense_model.coef_)
Exemplo n.º 7
0
    def get_outliers_by_huber(self, table, column_indexes):
        '''
        Get outliers using Huber regression.

        The last entry of ``column_indexes`` selects the target column; all
        preceding entries select the feature columns. Both are cast to float
        before a default ``HuberRegressor`` is fitted.

        According to the original author's experiments, Huber outperforms
        RANSAC (100% precision and recall) but does not scale well when the
        number of samples is very large.

        Returns a tuple ``(outliers, confidences)``: ``outliers`` is the list
        of row positions flagged in ``outliers_`` by the fitted model, and
        ``confidences`` holds one value per outlier, obtained by min-max
        scaling the absolute residuals of the outliers into [0.9, 0.99].
        When no row is flagged, ``outliers`` is an empty list and
        ``confidences`` an empty array.
        '''
        X = table[:, column_indexes[:-1]].astype(float)
        X = utils.enforce_columns(X)
        y = table[:, column_indexes[-1]].astype(float)

        # NOTE: X and y are deliberately not min-max scaled before fitting;
        # preprocessing made Huber fail on some datasets in the original
        # author's experiments.
        model_huber = HuberRegressor()
        model_huber.fit(X, y)

        outliers = [
            idx for idx, flagged in enumerate(model_huber.outliers_) if flagged
        ]

        residuals = abs(model_huber.predict(X) - y)

        # minmax_scale rejects an empty array, so return early when the model
        # flagged nothing instead of raising.
        if not outliers:
            return (outliers, residuals[:0])

        confidences = preprocessing.minmax_scale(residuals[outliers]) * 0.09 + 0.9

        return (outliers, confidences)
Exemplo n.º 8
0
def test_huber_scaling_invariant():
    # The outlier mask should not change when y alone, or both X and y,
    # are multiplied by 2.
    X, y = make_regression_with_outliers()
    model = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100)

    model.fit(X, y)
    reference_mask = model.outliers_
    # Not every sample may be flagged as an outlier.
    assert_false(np.all(reference_mask))

    model.fit(X, 2. * y)
    mask_scaled_y = model.outliers_
    assert_array_equal(mask_scaled_y, reference_mask)

    model.fit(2. * X, 2. * y)
    mask_scaled_both = model.outliers_
    assert_array_equal(mask_scaled_both, reference_mask)
Exemplo n.º 9
0
def test_huber_scaling_invariant():
    """Test that outliers filtering is scaling independent."""
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=False, alpha=0.0, max_iter=100,
                           epsilon=1.35)

    # Reference mask from the unscaled problem.
    huber.fit(X, y)
    n_outliers_mask_1 = huber.outliers_

    # Scale the target only.
    huber.fit(X, 2. * y)
    n_outliers_mask_2 = huber.outliers_

    # Scale both the features and the target.
    huber.fit(2. * X, 2. * y)
    n_outliers_mask_3 = huber.outliers_

    assert_array_equal(n_outliers_mask_2, n_outliers_mask_1)
    assert_array_equal(n_outliers_mask_3, n_outliers_mask_1)
Exemplo n.º 10
0
def test_huber_sample_weights():
    # Test the sample_weight implementation in HuberRegressor.
    X, y = make_regression_with_outliers()
    huber = HuberRegressor(fit_intercept=True, alpha=0.1)
    huber.fit(X, y)
    expected_coef = huber.coef_
    expected_intercept = huber.intercept_

    # Unit weights must reproduce the unweighted fit.
    unit_weights = np.ones(y.shape[0])
    huber.fit(X, y, sample_weight=unit_weights)
    assert_array_almost_equal(huber.coef_, expected_coef)
    assert_array_almost_equal(huber.intercept_, expected_intercept)

    # Integer weights must match fitting on data where the corresponding
    # rows are physically repeated (row 1 twice more, row 3 once more).
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    extra_rows = np.vstack((X[1], X[1], X[3]))
    X_new = np.vstack((X, extra_rows))
    y_new = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
    huber.fit(X_new, y_new)
    expected_coef = huber.coef_
    expected_intercept = huber.intercept_

    huber.fit(X, y, sample_weight=[1, 3, 1, 2, 1])
    assert_array_almost_equal(huber.coef_, expected_coef, 3)
    assert_array_almost_equal(huber.intercept_, expected_intercept, 3)

    # Test sparse implementation with sample weights.
    huber_sparse = HuberRegressor(fit_intercept=True, alpha=0.1)
    huber_sparse.fit(sparse.csr_matrix(X), y, sample_weight=[1, 3, 1, 2, 1])
    assert_array_almost_equal(huber_sparse.coef_, expected_coef, 3)
Exemplo n.º 11
0
    Class1 = RANSACRegressor(random_state=42)
    Class1.fit(X_train, y_train)
    Class1_predictions = Class1.predict(X_test)
    Class1_accuracy = accuracy_score(y_true, Class1_predictions, normalize=True, sample_weight=None)

    Class2 = TheilSenRegressor(random_state=42)
    Class2.fit(X_train, y_train)
    Class2_predictions = Class1.predict(X_test)
    Class2_accuracy = accuracy_score(y_true, Class2_predictions, normalize=True, sample_weight=None)

    Class3 = LinearRegression()
    Class3.fit(X_train, y_train)
    Class3_predictions = Class3.predict(X_test)
    Class3_accuracy = accuracy_score(y_true, Class3_predictions, normalize=True, sample_weight=None)

    Class4 = HuberRegressor(alpha=0.0, epsilon=epsilon)
    Class4.fit(X_train, y_train)
    Class4_predictions = Class4.predict(X_test)
    Class4_accuracy = accuracy_score(y_true, Class4_predictions, normalize=True, sample_weight=None)

​
#Print different accuracies
    print("First Accuracy: ", Class1_accuracy)
    print("Second Accuracy: ", Class2_accuracy)
    print("Third Accuracy: ", Class3_accuracy)
    print("Fourth Accuracy: ", Class4_accuracy)

​
    return
​
​
Exemplo n.º 12
0
def trainCV(X, y, random, splits):
    """Cross-validate LightGBM, Ridge and Huber models and collect their
    out-of-fold predictions.

    For every fold of an unshuffled ``KFold(n_splits=splits)`` the three
    models are trained on the training part and evaluated on the validation
    part. Predictions and targets are passed through ``np.expm1`` before
    being scored with ``rmsle_func``. Per-fold scores and their mean and
    standard deviation are printed.

    Parameters
    ----------
    X, y : indexable by integer index arrays
        Features and targets.
    random : unused
        Kept for interface compatibility; the function never reads it.
    splits : int
        Number of folds.

    Returns
    -------
    list
        ``[y_lgbm, y_ridge, y_huber]``, each an array of length ``len(y)``
        holding the out-of-fold predictions. Entries of folds skipped
        because ``is_stop()`` returned true stay at zero.
    """
    kf = KFold(n_splits=splits)

    l_lgbm = []
    l_ridge = []
    l_huber = []

    y_lgbm = np.zeros(len(y))
    y_ridge = np.zeros(len(y))
    y_huber = np.zeros(len(y))

    for nFold, (train_index, valid_index) in enumerate(kf.split(X)):
        # Allow an external stop request to abort the remaining folds.
        if is_stop():
            break

        print ("FOLD# " + str(nFold))

        train_X = X[train_index]
        train_y = y[train_index]

        valid_X = X[valid_index]
        valid_y = y[valid_index]

        price_valid_real = np.expm1(valid_y)

        # --- LightGBM ---
        d_train = lgb.Dataset(train_X, label=train_y)
        d_valid = lgb.Dataset(valid_X, label=valid_y)

        watchlist = [d_train, d_valid]

        params = { 'learning_rate': 0.01, 'application': 'regression', 'num_leaves': 311, 'verbosity': -1, 'metric': 'RMSE', 'data_random_seed': 1,
                        'bagging_fraction': 0.6, 'bagging_freq': 0, 'nthread': 4, 'max_bin': 255 }

        model_lgbm = lgb.train(params, train_set=d_train, num_boost_round=810, valid_sets=watchlist, verbose_eval=50, early_stopping_rounds=400)

        preds_lgbm = model_lgbm.predict(valid_X)
        y_lgbm[valid_index] = preds_lgbm

        o_lgbm = rmsle_func(np.expm1(preds_lgbm), price_valid_real)
        print ("LGBM RMSLE: " + str(o_lgbm))
        l_lgbm.append(o_lgbm)

        # --- Ridge ---
        model_ridge = Ridge(alpha=.05, copy_X=True, fit_intercept=True, max_iter=50, normalize=False, random_state=101, solver='auto', tol=0.001)
        model_ridge.fit(train_X, train_y)

        preds_ridge = model_ridge.predict(valid_X)
        y_ridge[valid_index] = preds_ridge

        o_ridge = rmsle_func(np.expm1(preds_ridge), price_valid_real)
        print ("RIDGE RMSLE: " + str(o_ridge))
        l_ridge.append(o_ridge)

        # --- Huber ---
        model_huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=58, epsilon=363)
        model_huber.fit(train_X, train_y)

        preds_huber = model_huber.predict(valid_X)
        y_huber[valid_index] = preds_huber

        o_huber = rmsle_func(np.expm1(preds_huber), price_valid_real)
        print ("HUBER RMSLE: " + str(o_huber))
        l_huber.append(o_huber)

    a_lgbm = np.array(l_lgbm)
    a_ridge = np.array(l_ridge)
    a_huber = np.array(l_huber)

    print ("LGBM  RMSLE = " + str (a_lgbm.mean()) + " +/- " + str(a_lgbm.std()))
    print ("RIDGE RMSLE = " + str (a_ridge.mean()) + " +/- " + str(a_ridge.std()))
    print ("HUBER RMSLE = " + str (a_huber.mean()) + " +/- " + str(a_huber.std()))

    return [y_lgbm, y_ridge, y_huber]
Exemplo n.º 13
0
# stacking: use the first-level predictions as features (one column per
# base model) for a HuberRegressor meta-model.
train_stack = np.vstack([oof_lgb, oof_lgb1, oof_xgb, oof_cat]).T
test_stack = np.vstack(
    [predictions_lgb, predictions_lgb1, predictions_xgb, predictions_cat]).T
folds_stack = StratifiedKFold(n_splits=10, shuffle=True, random_state=8888)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(
        folds_stack.split(train_stack, y_train)):
    print("fold :", fold_ + 1)
    trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
    val_data, val_y = train_stack[val_idx], y_train[val_idx]

    stacking = HuberRegressor(epsilon=1.03, alpha=1e-5)
    stacking.fit(trn_data, trn_y)

    # Out-of-fold prediction for this fold's validation rows.
    oof_stack[val_idx] = stacking.predict(val_data)
    # The test prediction is the average over all folds.
    predictions += stacking.predict(test_stack) / folds_stack.n_splits

# Both reported scores derive from the same out-of-fold MAE.
stacking_mae = mean_absolute_error(oof_stack, y_train)
print("stacking MAE score: {:<8.8f}".format(stacking_mae))
print("stacking CV score: {:<8.8f}".format(1 / (stacking_mae + 1)))

print(predictions_lgb.mean(), predictions_lgb1.mean(), y_train.mean(),
      predictions.mean())

# Store the rounded predictions as integers and write the result file.
result['score'] = predictions
result['score'] = round(result['score']).map(int)
result.to_csv('../result/stacking.csv', index=None)
Exemplo n.º 14
0
def test_huber_bool():
    # Fitting on a boolean feature matrix must not crash.
    X, y = make_regression(
        n_samples=200, n_features=2, noise=4.0, random_state=0)
    bool_features = X > 0
    model = HuberRegressor()
    model.fit(bool_features, y)
Exemplo n.º 15
0
def forecaster(returns, ff, loss='MSE'):
    """Rolling 120-row factor regression with one-step-ahead forecasts.

    ``ff`` is merged with ``returns`` on the index, and the ``'RF'`` column
    is subtracted from the first column of ``returns``. For every row ``j``
    from 120 onwards, a model is fitted on the previous 120 rows, using the
    columns ``'Mkt.Rf', 'HML', 'Mom', 'RMW', 'CMA'`` as regressors, and then
    used to predict row ``j``.

    Parameters
    ----------
    returns : DataFrame
        Its first column is the series to forecast.
    ff : DataFrame
        Must contain the regressor columns and ``'RF'``.
    loss : str
        ``'MSE'`` (LinearRegression), ``'Ridge'``, ``'Lasso'``, ``'Hub'``
        (HuberRegressor), or ``'LAD'`` / ``'1Q'`` / ``'3Q'`` (QuantReg at
        q = 0.5 / 0.25 / 0.75). Any other value falls back to
        LinearRegression.

    Returns
    -------
    tuple
        ``(name, output, factorLoadings, varianceOfErrors)`` where ``name``
        is the forecast column name and the three lists hold, per window,
        the prediction, the fitted coefficients and the variance of the
        in-sample residuals. Windows whose target contains a null value
        contribute ``nan``, ``np.zeros((1, 5))`` and ``nan``.
    """
    sklearn_models = {
        'MSE': LinearRegression,
        'Ridge': Ridge,
        'Lasso': Lasso,
        'Hub': HuberRegressor,
    }
    quantiles = {'LAD': 0.5, '1Q': 0.25, '3Q': 0.75}
    # Losses whose sklearn prediction is indexed as a 1-D array.
    flat_prediction_losses = ('Lasso', 'Hub')
    window = 120

    output = []
    factorLoadings = []
    varianceOfErrors = []
    df = ff.merge(returns, left_index=True, right_index=True)
    name = returns.columns.tolist()[0]
    df[name] = df[name] - df['RF']
    regressors = ['Mkt.Rf', 'HML', 'Mom', 'RMW', 'CMA']

    for j in range(window, len(df.index)):
        trainData = df.iloc[(j - window):j, :]
        trainX = trainData[regressors]
        trainY = trainData[[name]]

        # A window with a missing target cannot be fitted; emit placeholders.
        if trainY.isnull().values.any():
            output.append(np.nan)
            factorLoadings.append(np.zeros((1, 5)))
            varianceOfErrors.append(np.nan)
            continue

        testData = pd.DataFrame(df.iloc[j, :]).T
        testX = testData[regressors]

        if loss in quantiles:
            # Quantile regression; no sklearn model is fitted on this path.
            model = QuantReg(endog=trainY, exog=trainX)
            res = model.fit(q=quantiles[loss])
            loadings = np.array(res.params)
            fitted = model.predict(res.params, exog=trainX)
            prediction = model.predict(res.params, exog=testX)
            forecast = prediction[0]
        else:
            model = sklearn_models.get(loss, LinearRegression)()
            model.fit(trainX, trainY)
            loadings = model.coef_
            fitted = model.predict(trainX)
            prediction = model.predict(testX)
            if loss in flat_prediction_losses:
                forecast = prediction[0]
            else:
                forecast = prediction[0][0]

        factorLoadings.append(loadings)

        # Flatten both sides before subtracting: an (n, 1) target minus an
        # (n,) prediction would broadcast to an (n, n) matrix and yield the
        # variance of all pairwise differences instead of the residuals.
        residuals = np.ravel(trainY) - np.ravel(fitted)
        varianceOfErrors.append(float(np.var(residuals)))

        output.append(forecast)

    return (name, output, factorLoadings, varianceOfErrors)
Exemplo n.º 16
0
def regress(X_train, y_train):
    """Grid-search a list of regressors and yield each fitted search.

    ``X_train`` and ``y_train`` are joined into one DataFrame. For every
    entry of ``classifiers``, a fraction of the rows is sampled (using the
    module-level ``random_state``), the ``"loan_status"`` column is split
    off as the target, and a ``GridSearchCV`` over that estimator's entry in
    ``params_dict`` is fitted. The module-level ``verbose``, ``folds`` and
    ``workers`` configure the search.

    Yields
    ------
    tuple
        ``(grid, name)``: the fitted ``GridSearchCV`` and the estimator's
        name, which is also its key in ``params_dict``.
    """
    # comment out any classifier that should not be used
    # Each entry is (estimator, key into params_dict, fraction of rows used).
    classifiers = [
        (SGDRegressor(), "SGDRegressor", 1 * global_data_scale),
        (LinearRegression(), "LinearRegression", 1 * global_data_scale),
        (Ridge(), "Ridge", 1 * global_data_scale),
        (Lasso(), "Lasso", 1 * global_data_scale),
        (ElasticNet(), "ElasticNet", 1 * global_data_scale),
        (Lars(), "Lars", 1 * global_data_scale),
        (OrthogonalMatchingPursuit(), "OrthogonalMatchingPursuit", 1 * global_data_scale),
        (BayesianRidge(), "BayesianRidge", 1 * global_data_scale),
        (ARDRegression(), "ARDRegression", 1 * global_data_scale),
        ### NOTE the scoring might be different of PassiveAggressiveRegressor
        (PassiveAggressiveRegressor(), "PassiveAggressiveRegressor", 1 * global_data_scale),
        ### NOTE the scoring might be different of RANSACRegressor
        (RANSACRegressor(), "RANSACRegressor", 1 * global_data_scale),
        (TheilSenRegressor(), "TheilSenRegressor", 1 * global_data_scale),
        (HuberRegressor(), "HuberRegressor", 1 * global_data_scale),
        (DecisionTreeRegressor(), "DecisionTreeRegressor", 1 * global_data_scale),
        (GaussianProcessRegressor(), "GaussianProcessRegressor", 1 * global_data_scale),
        (MLPRegressor(), "MLPRegressor", 1 * global_data_scale),
        (KNeighborsRegressor(), "KNeighborsRegressor", 1 * global_data_scale),
        (RadiusNeighborsRegressor(), "RadiusNeighborsRegressor", 1 * global_data_scale),
        (SVR(), "SVR", 1 * global_data_scale),
        (NuSVR(), "NuSVR", 1 * global_data_scale),
        (LinearSVR(), "LinearSVR", 1 * global_data_scale),
        # The name must match the params_dict key exactly; the former
        # "KernalRidge" spelling raised KeyError at lookup time.
        (KernelRidge(), "KernelRidge", 1 * global_data_scale),
        (IsotonicRegression(), "IsotonicRegression", 1 * global_data_scale)
    ]

    # set the list of the values that should be used in grid search
    params_dict = {
        "SGDRegressor": {
            "penalty": ["l2", "l1"],
            "alpha": [.001, .0001, .00001],
            "l1_ratio": [.15, .2, .25],
            "fit_intercept": [True, False],
            "max_iter": [1000],
            "shuffle": [True, False],
            "epsilon": [.05, .1, .2],
            "learning_rate": ["constant", "optimal", "invscaling", "adaptive"],
            "eta0": [.005, .01, .02],
            "power_t": [.2, .25, .3]
        },
        "LinearRegression": {
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "Ridge": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "tol": [.01, .001, .0001],
            "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"]
        },
        "Lasso": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "positive": [True, False],
            "precompute": [True, False]
        },
        "ElasticNet": {
            "alpha": [.8, 1., 1.2],
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "positive": [True, False],
            "selection": ["cyclic", "random"]
        },
        "Lars": {
            "fit_intercept": [True, False],
            "normalize": [True, False],
            "precompute": [True, False],
            "n_nonzero_coefs": [np.inf]
        },
        "OrthogonalMatchingPursuit": {
            "n_nonzero_coefs": [np.inf, None],
            "precompute": [True, False],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "BayesianRidge": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "ARDRegression": {
            "tol": [.01, .001, .0001],
            "alpha_1": [1e-5, 1e-6, 1e-7],
            "alpha_2": [1e-5, 1e-6, 1e-7],
            "lambda_1": [1e-5, 1e-6, 1e-7],
            "lambda_2": [1e-5, 1e-6, 1e-7],
            "threshold_lambda": [1000, 10000, 100000],
            "fit_intercept": [True, False],
            "normalize": [True, False]
        },
        "PassiveAggressiveRegressor": {
            "C": [.8, 1., 1.2 ],
            "tol": [1e-2, 1e-3, 1e-4],
            "n_iter_no_change": [3, 5, 8],
            "shuffle": [True, False],
            "average": [True, False]
        },
        "RANSACRegressor": {
            "base_estimator": [LinearRegression()]
        },
        "TheilSenRegressor": {
            "max_subpopulation": [1e3, 1e4, 1e5],
            "tol": [1e-2, 1e-3, 1e-4]
        },
        "HuberRegressor": {
            "epsilon": [1.1, 1.35,  1.5],
            "alpha": [1e-3, 1e-4, 1e-5],
            "warm_start": [True, False],
            "fit_intercept": [True, False],
            # This key was an empty string, which is not a valid estimator
            # parameter; the values are tolerance settings.
            "tol": [1e-4, 1e-5, 1e-6]
        },
        "DecisionTreeRegressor": {
            "criterion": ["mse", "friedman_mse", "mae"],
            "splitter": ["best", "random"],
            "min_samples_split": [2, 3],
            "min_samples_leaf": [1, 2],
            "min_weight_fraction_leaf": [.0],
            "max_features": ["auto", "sqrt", "log2"],
            "min_impurity_split": [1e-6, 1e-7, 1e-8]
        },
        "GaussianProcessRegressor": {
            "alpha": [1e-8, 1e-10, 1e-12],
            "optimizer": ["fmin_l_bfgs_b"],
            "normalize_y": [True, False]
        },
        "MLPRegressor": {
            "hidden_layer_sizes": [(100,)],
            "activation": ["identity", "logistic", "tanh", "relu"],
            "solver": ["lbfgs", "sgd", "adam"],
            "alpha": [1e-3, 1e-4, 1e-5],
            # "learning_rate": ["constant", "invscaling", "adaptive"],
            # "learning_rate_init": [1e-2, 1e-3, 1e-4],
            # "power_t": [.3, .5, .8],
            # "shuffle": [True, False],
            # "tol": [1e-3, 1e-4, 1e-5],
            # "momentum": [.8, .9, .99],
            # "beta_1": [.8, .9, .99],
            # "beta_2": [.999],
            # "epsilon": [1e-7, 1e-8, 1e-9],
            # "n_iter_no_change": [10],
            # "max_fun": [15000]
        },
        "KNeighborsRegressor": {
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "RadiusNeighborsRegressor": {
            "radius": [.8, 1, 1.2],
            "n_neighbors": [20, 10, 5, 3],
            "weights": ["uniform", "distance"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "leaf_size": [20, 30, 40],
            "p": [1, 2]
        },
        "SVR": {
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "tol": [1e-2, 1e-3, 1e-4],
            # NOTE(review): `.1` here (and in NuSVR / LinearSVR) may be a typo
            # for `1.`, given the [.8, 1., 1.2] pattern used elsewhere -- confirm.
            "C": [.8, .1, 1.2],
            "epsilon": [.08, .1, .12],
            "shrinking": [True, False],
            "max_iter": [-1]
        },
        "NuSVR": {
            "nu": [.2, .5, .8],
            "C": [.8, .1, 1.2],
            "kernel": ["poly", "rbf", "sigmoid"],
            "degree": [2, 3, 5],
            "gamma": ["scale", "auto"],
            "coef0": [.0],
            "shrinking": [True, False],
            "tol": [1e-2, 1e-3, 1e-4],
            "max_iter": [-1]
        },
        "LinearSVR": {
            "epsilon": [.0],
            "tol": [1e-3, 1e-4, 1e-5],
            "C": [.8, .1, 1.2],
            "fit_intercept": [True, False],
            "dual": [True, False],
            "intercept_scaling": [.8, 1., 1.2]
        },
        "KernelRidge": {
            "coef0": [.8, 1, 1.2],
            "degree": [2, 3, 5],
        },
        "IsotonicRegression": {
            "increasing": [True, False],
        }
    }

    # The joined frame does not depend on the estimator, so build it once.
    full = pd.DataFrame(X_train).join(pd.DataFrame(y_train))

    for model, name, frac in classifiers:
        loan_data = full.sample(frac=frac, random_state=random_state)
        X = loan_data.drop("loan_status", axis=1)
        y = loan_data["loan_status"]
        grid = GridSearchCV(model, params_dict[name], verbose=verbose, cv=folds, n_jobs=workers)
        grid.fit(X, y)
        yield grid, name
Exemplo n.º 17
0
 def __init__(self, **hyperparams):
     # Remember the keyword arguments and build the wrapped `Op` from them.
     self._hyperparams = hyperparams
     self._wrapped_model = Op(**hyperparams)
def Huber_regressor(features, labels):
    """Fit a default HuberRegressor and pass the labels together with the
    in-sample predictions to ``AsGraph``."""
    from sklearn.linear_model import HuberRegressor

    estimator = HuberRegressor()
    estimator.fit(features, labels)
    in_sample_pred = estimator.predict(features)
    AsGraph(labels, in_sample_pred)
Exemplo n.º 19
0
def runModel(data, config, retrain, runGPU, runNN, frequency, pre_dir):
    container = {}

    save_model = partial(_save_model, pre_dir=pre_dir)
    save_year_res = partial(_save_year_res, pre_dir=pre_dir)

    if runNN:
        nn_valid_r2 = []
        nn_oos_r2 = []

    bcktst_df = data[['Y']].copy()
    if frequency == 'M':
        date_range = pd.date_range('20131231', '20200831', freq='M')
    elif frequency == 'Q':
        date_range = pd.date_range('20131231', '20200630', freq='Q')
    elif frequency == 'Y':
        date_range = pd.date_range('20131231', '20181231', freq='Y')
    else:
        raise NotImplementedError()

    for year in tqdm(date_range):
        year = datetime.datetime.strftime(year, "%Y-%m")

        p_t = ['1900-01', str(year)]  # period of training
        # p_t = [sub_months(year, 48), str(year)]  # period of training
        if frequency == 'M':
            p_v = [add_months(year, 1),
                   add_months(year, 3)]  # period of valiation
            p_test = [add_months(year, 4), add_months(year, 4)]
        elif frequency == 'Q':
            p_v = [add_months(year, 1),
                   add_months(year, 3)]  # period of valiation
            p_test = [add_months(year, 4), add_months(year, 6)]
        elif frequency == 'Y':
            p_v = [add_months(year, 1),
                   add_months(year, 12)]  # period of valiation
            p_test = [add_months(year, 13), add_months(year, 24)]

        _Xt, _yt = split(
            data.loc(axis=0)[:, p_t[0]:p_t[1]].sample(frac=1, random_state=0))
        _Xv, _yv = split(
            data.loc(axis=0)[:, p_v[0]:p_v[1]].sample(frac=1, random_state=0))

        test_df = data.loc(axis=0)[:, p_test[0]:p_test[1]]
        _Xtest, _ytest = split(test_df)

        #OLS
        if config['runOLS3']:
            model_name = "OLS3" + f" {frequency}"
            data_ols3 = data[[
                'Factor46_mom12m', 'Factor07_beta', 'Factor51_mve',
                'Factor09_bm', 'Y'
            ]]

            _Xt, _yt = split(data_ols3.loc(axis=0)[:, p_t[0]:p_t[1]])
            _Xv, _yv = split(data_ols3.loc(axis=0)[:, p_v[0]:p_v[1]])
            _Xtest, _ytest = split(
                data_ols3.loc(axis=0)[:, p_test[0]:p_test[1]])

            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = LinearRegression().fit(Xt, yt.reshape(-1, ))

        elif config['runOLS3+H']:
            model_name = "OLS3+H" + f" {frequency}"
            data_ols3 = data[[
                'Factor46_mom12m', 'Factor07_beta', 'Factor51_mve',
                'Factor09_bm', 'Y'
            ]]

            _Xt, _yt = split(data_ols3.loc(axis=0)[:, p_t[0]:p_t[1]])
            _Xv, _yv = split(data_ols3.loc(axis=0)[:, p_v[0]:p_v[1]])
            _Xtest, _ytest = split(
                data_ols3.loc(axis=0)[:, p_test[0]:p_test[1]])

            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = HuberRegressor(epsilon=3).fit(Xt, yt.reshape(-1, ))
        elif config['runOLS5']:
            model_name = "OLS5" + f" {frequency}"
            data_ols5 = data[[
                'Factor46_mom12m', 'Factor07_beta', 'Factor51_mve',
                'Factor09_bm', 'Factor76_roeq', 'Factor05_agr', 'Y'
            ]]

            _Xt, _yt = split(data_ols5.loc(axis=0)[:, p_t[0]:p_t[1]])
            _Xv, _yv = split(data_ols5.loc(axis=0)[:, p_v[0]:p_v[1]])
            _Xtest, _ytest = split(
                data_ols5.loc(axis=0)[:, p_test[0]:p_test[1]])

            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = LinearRegression().fit(Xt, yt.reshape(-1, ))

        elif config['runOLS5+H']:
            model_name = "OLS5+H" + f" {frequency}"
            data_ols5 = data[[
                'Factor46_mom12m', 'Factor07_beta', 'Factor51_mve',
                'Factor09_bm', 'Factor76_roeq', 'Factor05_agr', 'Y'
            ]]

            _Xt, _yt = split(data_ols5.loc(axis=0)[:, p_t[0]:p_t[1]])
            _Xv, _yv = split(data_ols5.loc(axis=0)[:, p_v[0]:p_v[1]])
            _Xtest, _ytest = split(
                data_ols5.loc(axis=0)[:, p_test[0]:p_test[1]])

            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = HuberRegressor(epsilon=3).fit(Xt, yt.reshape(-1, ))

        elif config['runOLS']:
            model_name = "OLS" + f" {frequency}"
            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = LinearRegression(n_jobs=-1).fit(Xt, yt.reshape(-1, ))
            save_model(model_name, year, model_fit)

        elif config['runOLSH']:  # OLS + H
            model_name = "OLSH" + f" {frequency}"
            Xt = np.vstack((_Xt, _Xv))
            yt = np.vstack((_yt, _yv))
            Xtest, ytest = _Xtest, _ytest
            model_fit = HuberRegressor().fit(Xt, yt.reshape(-1, ))

        elif config['runENET']:
            from sklearn.linear_model import ElasticNet
            model_name = "ENET" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest

            lambda_ = [0.1, 0.01, 0.001, 0.0001]
            params = [{'lambda': i} for i in lambda_]

            out_cv = []
            for p in tqdm(params):
                model_fit = ElasticNet(alpha=p['lambda'],
                                       l1_ratio=0.5,
                                       random_state=0)
                model_fit.fit(Xt, yt.reshape(-1, ))

                yv_hat = model_fit.predict(Xv).reshape(-1, 1)
                perfor = cal_r2(yv, yv_hat)
                out_cv.append(perfor)
                # print('params: ' + str(p) + '. CV r2-validation:' + str(perfor))
                logger.info('params: ' + str(p) + '. CV r2-validation:' +
                            str(perfor))
            # tic = time.time()
            # print(f"{model} train time: ", tic - tis)
            best_p = params[np.argmax(out_cv)]
            print("best p", best_p)
            logger.info(f"{model_name} {year} {params} best hyperparamer ",
                        best_p)
            model_fit = ElasticNet(alpha=best_p['lambda'],
                                   l1_ratio=0.5,
                                   random_state=0)
            model_fit.fit(Xt, yt)
            ytest_hat = model_fit.predict(Xtest).reshape(-1, 1)
            best_perfor = cal_r2(ytest, ytest_hat)
            print(f"{model_name} oss r2:", best_perfor)
            save_model(model_name, year, model_fit)

        elif config['runPLS']:
            from sklearn.cross_decomposition import PLSRegression
            model_name = "PLS" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest

            maxk = min(30, Xt.shape[1])
            ks = np.arange(1, maxk, 2)
            params = [{'k': i} for i in ks]

            out_cv = []
            for p in tqdm(params):
                pls = PLSRegression(n_components=p['k'])
                model_fit = pls.fit(Xt, yt)

                yv_hat = model_fit.predict(Xv)
                perfor = cal_r2(yv, yv_hat)
                out_cv.append(perfor)
                print('params: ' + str(p) + '. CV r2-validation:' +
                      "{0:.3%}".format(perfor))
                logging.info('params: ' + str(p) + '. CV r2-validation:' +
                             "{0:.3%}".format(perfor))

            best_p = params[np.argmax(out_cv)]
            print("best hyper-parameter", best_p)

            pls = PLSRegression(n_components=best_p['k'])
            model_fit = pls.fit(Xt, yt)

            ytest_hat = model_fit.predict(Xtest)
            best_perfor = cal_r2(ytest, ytest_hat)
            print(f"{model_name} oss r2 in {year}:", best_perfor)

        elif config['runPCR']:
            model_name = "PCR" + f" {frequency}"
            pca_name = "PCA" + f" {frequency}"
            # mtrain = np.mean(_yt)
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest

            # # prepare for PCR running
            # XTX = np.dot(Xt.T, Xt)  # X=xtrain.'*xtrain;
            # _pca_val, _pca_vec = np.linalg.eig(XTX)  # X*pca_vec = pca_vec*pca_val
            # idx = _pca_val.argsort()[::-1]
            # pca_val = _pca_val[idx]
            # pca_vec = _pca_vec[:, idx]

            # p1 = pca_vec[:, :maxk-5]  # 选出最大的30个
            # Z = np.dot(Xt, p1)

            # hyper-parameter
            maxk = min(30, Xt.shape[1])
            ks = np.arange(1, maxk, 2)
            params = [{'k': i} for i in ks]

            out_cv = []
            for p in tqdm(params):
                # xx = Z[:, :p['k']]
                # b = np.linalg.inv(xx.T@xx) @ (xx.T@yt)  # b = (inv(xx.'*xx)*xx.') * Y;
                # bf = p1[:, :p['k']]@b  #b = p1(:, 1: j)*b;
                #
                # yv_hat = Xv@bf + mtrain  # yhatbig1 = xtest * b + mtrain;
                pca = PCA(n_components=p['k'])
                X_reduced = pca.fit_transform(Xt)
                model_fit = LinearRegression()
                model_fit = model_fit.fit(X_reduced, yt)

                xv_r = pca.transform(Xv)
                yv_hat = model_fit.predict(xv_r)
                perfor = cal_r2(yv, yv_hat)
                out_cv.append(perfor)
                print('params: ' + str(p) + '. CV r2-validation:' +
                      "{0:.3%}".format(perfor))
                logging.info('params: ' + str(p) + '. CV r2-validation:' +
                             "{0:.3%}".format(perfor))

            best_p = params[np.argmax(out_cv)]
            print("best hyper-parameter", best_p)
            # xx = Z[:, :best_p['k']]
            # b = np.linalg.inv(xx.T @ xx) @ (xx.T @ yt)
            # bf = p1[:, :best_p['k']] @ b
            # ytest_hat = (Xtest @ bf + mtrain).reshape(-1, 1)
            pca = PCA(n_components=best_p['k'])
            Xt = pca.fit_transform(Xt)
            model_fit = LinearRegression()
            model_fit = model_fit.fit(Xt, yt)

            Xtest = pca.transform(Xtest)
            ytest_hat = model_fit.predict(Xtest)
            best_perfor = cal_r2(ytest, ytest_hat)
            print(f"{model_name} oss r2 in {year}:", best_perfor)
            save_model(pca_name, year, pca)
            save_model(model_name, year, model_fit)

        elif runNN:
            import tensorflow as tf
            import tensorflow.keras as keras
            from keras.models import Sequential
            from keras.layers import Dense, LeakyReLU, BatchNormalization, Dropout
            from strategy_func import genNNmodel, _loss_fn

            if config["runNN1"]:
                i = 1
            elif config["runNN2"]:
                i = 2
            elif config["runNN3"]:
                i = 3
            elif config["runNN4"]:
                i = 4
            elif config["runNN5"]:
                i = 5
            elif config["runNN6"]:
                i = 6

            model_name = f"NN{i}" + f" {frequency}"

            nn_is_preds = []
            nn_valid_preds = []
            nn_oos_preds = []

            model_cntn = []
            for model_num in range(5):
                model_pt = gen_model_pt(model_name,
                                        year,
                                        pre_dir,
                                        runNN=True,
                                        model_num=model_num)

                _Xt, _yt = split(
                    data.loc(axis=0)[:, p_t[0]:p_t[1]].sample(
                        frac=1, random_state=model_num))
                _Xv, _yv = split(
                    data.loc(axis=0)[:, p_v[0]:p_v[1]].sample(
                        frac=1, random_state=model_num + 1))

                Xt, yt = _Xt, _yt
                Xv, yv = _Xv, _yv
                Xtest, ytest = _Xtest, _ytest

                if retrain:
                    model_fit = train_NN_model(Xt, yt, Xv, yv, model_pt,
                                               model_num, i, runGPU)
                else:
                    model_fit = load_NN_model(Xt, yt, Xv, yv, model_pt,
                                              model_num, i, runGPU)

                model_cntn.append(model_fit)

                # is_predictions = model_fit.predict(Xt)
                valid_pred = model_fit.predict(Xv)
                oos_pred = model_fit.predict(Xtest)
                # r2is = cal_r2(yt, is_predictions)
                r2valid = cal_r2(yv, valid_pred)
                r2oos = cal_r2(ytest, oos_pred)
                # nr2oos = cal_normal_r2(ytest, predictions)

                # print(f"model{model_num} train r2", "{0:.3%}".format(r2is))
                print(f"model{model_num} valid r2", "{0:.3%}".format(r2valid))
                print(f"model{model_num} test r2", "{0:.3%}".format(r2oos))

                # nn_is_preds.append(is_predictions)
                nn_valid_r2.append(r2valid)
                nn_oos_r2.append(r2oos)
                # if r2valid < 0.11273255028948781:
                #   nn_oos_preds.append(oos_pred)
                nn_valid_preds.append(valid_pred)
                nn_oos_preds.append(oos_pred)
        elif config['runRF']:
            logger.info(year)
            model_name = "RF" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            if not retrain:
                model_fit = tree_model_fast(model_name,
                                            year,
                                            pre_dir,
                                            Xt,
                                            yt,
                                            Xv,
                                            yv,
                                            runRF=True,
                                            runGBRT=False,
                                            runGBRT2=False)
            else:
                model_fit = tree_model(Xt,
                                       yt,
                                       Xv,
                                       yv,
                                       runRF=True,
                                       runGBRT=False,
                                       runGBRT2=False)
                save_model(model_name, year, model_fit)
        elif config['runGBRT']:
            model_name = "GBRT+H" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            if not retrain:
                model_fit = tree_model_fast(model_name,
                                            year,
                                            pre_dir,
                                            Xt,
                                            yt,
                                            Xv,
                                            yv,
                                            runRF=False,
                                            runGBRT=True,
                                            runGBRT2=False)
            else:
                model_fit = tree_model(Xt,
                                       yt,
                                       Xv,
                                       yv,
                                       runRF=False,
                                       runGBRT=True,
                                       runGBRT2=False)
                # Don't use pickle or joblib as that may introduces dependencies on xgboost version.
                # The canonical way to save and restore models is by load_model and save_model.
                model_pt = gen_model_pt(model_name, year, pre_dir)
                model_fit.save_model(model_pt)
        elif config['runGBRT2']:
            model_name = "GBRT+l2" + f" {frequency}"
            Xt, yt = _Xt, _yt
            Xv, yv = _Xv, _yv
            Xtest, ytest = _Xtest, _ytest
            if not retrain:
                model_fit = tree_model_fast(model_name,
                                            year,
                                            pre_dir,
                                            Xt,
                                            yt,
                                            Xv,
                                            yv,
                                            runRF=False,
                                            runGBRT=False,
                                            runGBRT2=True)
            else:
                model_fit = tree_model(Xt,
                                       yt,
                                       Xv,
                                       yv,
                                       runRF=False,
                                       runGBRT=False,
                                       runGBRT2=True)
            # Don't use pickle or joblib as that may introduces dependencies on xgboost version.
            # The canonical way to save and restore models is by load_model and save_model.
            model_pt = gen_model_pt(model_name, year, pre_dir)
            model_fit.save_model(model_pt)

        # predict and save
        if runNN:
            # yt_hat = np.mean(np.concatenate(nn_is_preds, axis=1), axis=1).reshape(-1, 1)
            yv_hat = np.mean(np.concatenate(nn_valid_preds, axis=1),
                             axis=1).reshape(-1, 1)
            ytest_hat = np.mean(np.concatenate(nn_oos_preds, axis=1),
                                axis=1).reshape(-1, 1)
            print(f"mean r2 in {year}among models",
                  "{0:.3%}".format(np.mean(nn_oos_r2)))

            save_arrays(container, model_name, year, yv_hat, savekey='yv_hat')
            save_arrays(container, model_name, year, yv, savekey='yv')

            save_arrays(container,
                        model_name,
                        year,
                        ytest_hat,
                        savekey='ytest_hat')
            save_arrays(container, model_name, year, ytest, savekey='ytest')

            bcktst_df.loc[test_df.index, "predict"] = ytest_hat

            save_year_res(model_name, year, cal_r2(yv, yv_hat),
                          cal_r2(ytest, ytest_hat))
        else:
            yt_hat = model_fit.predict(Xt).reshape(-1, 1)
            ytest_hat = model_fit.predict(Xtest).reshape(-1, 1)

            save_arrays(container, model_name, year, yt_hat, savekey='yt_hat')
            save_arrays(container, model_name, year, yt, savekey='yt')
            save_arrays(container,
                        model_name,
                        year,
                        ytest_hat,
                        savekey='ytest_hat')
            save_arrays(container, model_name, year, ytest, savekey='ytest')

            bcktst_df.loc[test_df.index, "predict"] = ytest_hat

            save_year_res(model_name, year, cal_r2(yt, yt_hat),
                          cal_r2(ytest, ytest_hat))

    if runNN:
        model_dir = model_pt.parent
        return model_name, bcktst_df, container, nn_valid_r2, nn_oos_r2, model_dir
    else:
        return model_name, bcktst_df, container
Exemplo n.º 20
0
def train1(X, y, random, is_output):
    """Train LightGBM, Ridge and Huber models and print their RMSLE scores.

    The data is split twice with ``train_test_split``: 10% is first set aside
    as a hold-out set, then 10% of the remainder becomes the validation set.
    Predictions and targets are passed through ``np.expm1`` before scoring,
    so the targets are presumably ``log1p``-transformed prices (TODO confirm
    against the caller).

    Args:
        X: Feature matrix accepted by ``train_test_split``.
        y: Targets aligned with ``X``. The squared-error computation below
            calls ``.values`` on an expression built from the validation
            targets, so they must support that (as a pandas Series does).
        random: Seed forwarded as ``random_state`` to both splits.
        is_output: When truthy, LightGBM logs every 35 rounds instead of 50,
            ``error_dist`` is called on the squared log errors and one line
            per validation row is printed, largest error first.

    Returns:
        The RMSLE of the LightGBM model on the validation split.
    """
    idx = list(range(len(y)))

    # First split: keep 10% aside as a hold-out set.
    (process_X, holdout_X, process_y, holdout_y,
     process_idx, holdout_idx) = train_test_split(
        X, y, idx, test_size=0.1, random_state=random)

    # Second split: 10% of the remaining data is used for validation.
    (train_X, valid_X, train_y, valid_y,
     train_idx, valid_idx) = train_test_split(
        process_X, process_y, process_idx, test_size=0.1, random_state=random)

    d_train = lgb.Dataset(train_X, label=train_y)
    d_valid = lgb.Dataset(valid_X, label=valid_y)
    watchlist = [d_train, d_valid]

    params = {'learning_rate': 0.03, 'application': 'regression', 'num_leaves': 31, 'verbosity': -1, 'metric': 'RMSE', 'data_random_seed': 1,
              'bagging_fraction': 0.6, 'bagging_freq': 0, 'nthread': 4, 'max_bin': 255}

    # Evaluation results are logged more often when detailed output is requested.
    eval_out = 35 if is_output else 50

    model_lgbm = lgb.train(params, train_set=d_train, num_boost_round=6310,
                           valid_sets=watchlist, verbose_eval=eval_out,
                           early_stopping_rounds=400)

    # --- LightGBM: validation score ---
    preds_lgbm = model_lgbm.predict(valid_X)
    price_lgbm_pred = np.expm1(preds_lgbm)
    price_valid_real = np.expm1(valid_y)

    o_lgbm = rmsle_func(price_lgbm_pred, price_valid_real)
    print("LGBM RMSLE: " + str(o_lgbm))

    # --- LightGBM: hold-out score ---
    preds_hold_out_lgbm = model_lgbm.predict(holdout_X)
    price_hold_out_lgbm = np.expm1(preds_hold_out_lgbm)
    price_hold_out_real = np.expm1(holdout_y)

    o_lgbm_holdout = rmsle_func(price_hold_out_lgbm, price_hold_out_real)
    print("LGBM HOLDOUT RMSLE: " + str(o_lgbm_holdout))

    # --- Ridge: validation score ---
    model_ridge = Ridge(solver="lsqr", fit_intercept=False)
    model_ridge.fit(train_X, train_y)

    preds_ridge = model_ridge.predict(valid_X)
    price_ridge_pred = np.expm1(preds_ridge)

    o_ridge = rmsle_func(price_ridge_pred, price_valid_real)
    print("RIDGE RMSLE: " + str(o_ridge))

    # --- Huber: validation score ---
    model_huber = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=80, epsilon=363)
    model_huber.fit(train_X, train_y)

    preds_huber = model_huber.predict(valid_X)
    price_huber_pred = np.expm1(preds_huber)

    o_huber = rmsle_func(price_huber_pred, price_valid_real)
    print("HUBER RMSLE: " + str(o_huber))

    # Squared log error of the LightGBM prediction for every validation row.
    y2 = np.power(np.log1p(price_lgbm_pred) - np.log1p(price_valid_real), 2)
    y2 = y2.values

    if is_output:
        error_dist(y2, 0.1)

    # Validation positions ordered from the largest squared error to the smallest.
    worst_first = (-y2).argsort()

    # Todo: Display a set of predictions, one for each run model.

    if is_output:
        for position in worst_first:
            s = get_by_validation_sequence(valid_idx, price_lgbm_pred, price_ridge_pred, price_huber_pred, position)
            print(s)

    # The previous code returned `o`, a name that is never assigned, so every
    # call ended in a NameError. The LightGBM validation score is returned
    # instead. NOTE(review): confirm this is the metric callers expect.
    return o_lgbm
#importing the library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import classification_report, confusion_matrix

# Load the training and test sets.
# NOTE(review): absolute, machine-specific paths; the script only runs where
# these two files exist.
train = pd.read_csv("C:/Users/HP/Desktop/train (1).csv")
test = pd.read_csv("C:/Users/HP/Desktop/test (2).csv")
# Drop every row that contains a missing value.
train = train.dropna()
test = test.dropna()
# The value of head() is discarded here; it is only displayed when the line
# is evaluated interactively (e.g. in a notebook cell).
train.head()

# Features are all columns except the last one; the target is the column at
# position 1.
# NOTE(review): the target column is excluded from the features only when the
# CSV has exactly two columns -- confirm the file layout.
X_train = np.array(train.iloc[:, :-1].values)
y_train = np.array(train.iloc[:, 1].values)
X_test = np.array(test.iloc[:, :-1].values)
y_test = np.array(test.iloc[:, 1].values)

# Fit a Huber regressor with default settings on the training data.
from sklearn.linear_model import HuberRegressor
model = HuberRegressor()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# NOTE(review): despite the name, score() of a scikit-learn regressor is
# documented to return R^2, not a classification accuracy.
accuracy = model.score(X_test, y_test)
# Plot the fitted values against the training features.
plt.plot(X_train, model.predict(X_train), color='y')
plt.show()
print(accuracy)
Exemplo n.º 22
0
from sklearn.metrics import mean_squared_error

# Candidate regressors, each paired with the label printed next to its score.
# The AdaBoost and GradientBoosting entries are regressors, so their labels
# say "Regressor" rather than "Classifier".
models = [['DecisionTree :', DecisionTreeRegressor()],
          ['Linear Regression :', LinearRegression()],
          ['RandomForest :', RandomForestRegressor()],
          ['KNeighbours :', KNeighborsRegressor(n_neighbors=2)],
          ['SVM :', SVR()],
          ['AdaBoostRegressor :', AdaBoostRegressor()],
          ['GradientBoostingRegressor: ', GradientBoostingRegressor()],
          ['Xgboost: ', XGBRegressor()],
          ['CatBoost: ', CatBoostRegressor(logging_level='Silent')],
          ['Lasso: ', Lasso()],
          ['Ridge: ', Ridge()],
          ['BayesianRidge: ', BayesianRidge()],
          ['ElasticNet: ', ElasticNet()],
          ['HuberRegressor: ', HuberRegressor()]]

print("Results...")


# Fit every candidate on the training split and print its RMSE on the test
# split.
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name, np.sqrt(mean_squared_error(y_test, predictions)))


# Something as simple as Linear Regression performs best in this case, which shows that more complicated models don't always give better results. There are situations where simple models are much better suited.

# **Generate Feature Importances**
Exemplo n.º 23
0
	mpg = DataFrame(pipeline.predict(auto_X, **predict_params), columns = ["mpg"])
	store_csv(mpg, name)

# Run build_auto once per regressor when the "Auto" dataset is selected.
# Each call passes an estimator instance and a string name (presumably the
# name the result is stored under -- confirm against build_auto). A few calls
# also forward extra keyword options (compact, flat).
if "Auto" in datasets:
	build_auto(AdaBoostRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), random_state = 13, n_estimators = 17), "AdaBoostAuto")
	build_auto(ARDRegression(normalize = True), "BayesianARDAuto")
	build_auto(BayesianRidge(normalize = True), "BayesianRidgeAuto")
	build_auto(DecisionTreeRegressor(min_samples_leaf = 2, random_state = 13), "DecisionTreeAuto", compact = False)
	build_auto(BaggingRegressor(DecisionTreeRegressor(min_samples_leaf = 5, random_state = 13), n_estimators = 3, max_features = 0.5, random_state = 13), "DecisionTreeEnsembleAuto")
	build_auto(DummyRegressor(strategy = "median"), "DummyAuto")
	build_auto(ElasticNetCV(cv = 3, random_state = 13), "ElasticNetAuto")
	build_auto(ExtraTreesRegressor(n_estimators = 10, min_samples_leaf = 5, random_state = 13), "ExtraTreesAuto")
	build_auto(GBDTLMRegressor(RandomForestRegressor(n_estimators = 7, max_depth = 6, random_state = 13), LinearRegression()), "GBDTLMAuto")
	build_auto(GBDTLMRegressor(XGBRFRegressor(n_estimators = 17, max_depth = 6, random_state = 13), ElasticNet(random_state = 13)), "XGBRFLMAuto")
	build_auto(GradientBoostingRegressor(init = None, random_state = 13), "GradientBoostingAuto")
	build_auto(HuberRegressor(), "HuberAuto")
	build_auto(LarsCV(cv = 3), "LarsAuto")
	build_auto(LassoCV(cv = 3, random_state = 13), "LassoAuto")
	build_auto(LassoLarsCV(cv = 3), "LassoLarsAuto")
	build_auto(LinearRegression(), "LinearRegressionAuto")
	build_auto(BaggingRegressor(LinearRegression(), max_features = 0.75, random_state = 13), "LinearRegressionEnsembleAuto")
	build_auto(OrthogonalMatchingPursuitCV(cv = 3), "OMPAuto")
	build_auto(RandomForestRegressor(n_estimators = 10, min_samples_leaf = 3, random_state = 13), "RandomForestAuto", flat = True)
	build_auto(RidgeCV(), "RidgeAuto")
	build_auto(StackingRegressor([("ridge", Ridge(random_state = 13)), ("lasso", Lasso(random_state = 13))], final_estimator = GradientBoostingRegressor(n_estimators = 7, random_state = 13)), "StackingEnsembleAuto")
	build_auto(TheilSenRegressor(n_subsamples = 31, random_state = 13), "TheilSenAuto")
	build_auto(VotingRegressor([("dt", DecisionTreeRegressor(random_state = 13)), ("knn", KNeighborsRegressor()), ("lr", LinearRegression())], weights = [3, 1, 2]), "VotingEnsembleAuto")
	build_auto(XGBRFRegressor(n_estimators = 31, max_depth = 6, random_state = 13), "XGBRFAuto")

# Same guard as above, kept as its own section: a decision tree wrapped in
# TransformedTargetRegressor.
if "Auto" in datasets:
	build_auto(TransformedTargetRegressor(DecisionTreeRegressor(random_state = 13)), "TransformedDecisionTreeAuto")
Exemplo n.º 24
0
def test_huber_max_iter():
    """A fit capped at a single iteration must report ``n_iter_ == max_iter``."""
    features, targets = make_regression_with_outliers()
    regressor = HuberRegressor(max_iter=1)
    regressor.fit(features, targets)
    iterations_run = regressor.n_iter_
    assert iterations_run == regressor.max_iter
Exemplo n.º 25
0
# Base learners. lgb_params and rf_params are defined outside this section.
lgb_model = LGBMRegressor(**lgb_params)

rf_model = RandomForestRegressor(**rf_params)

et_model = ExtraTreesRegressor()

# SVR model; SVMs are too slow on data sets with more than 10000 samples
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.05)

# DecisionTree model
dt_model = DecisionTreeRegressor()

# AdaBoost model
ada_model = AdaBoostRegressor()

# Stack the base models with a HuberRegressor as the stacker, using 7 splits.
# NOTE(review): lgb_model, svr_model and dt_model are created above but are
# not passed in base_models here -- confirm this is intentional.
stack = Ensemble(n_splits=7,
                 stacker=HuberRegressor(),
                 base_models=(nn, cb_model, gbr_model, rf_model, xgb_model,
                              et_model, ada_model))

# Despite its name, y_test holds the stacked predictions for x_test.
y_test = stack.fit_predict(x_train, y_train, x_test)

from datetime import datetime
print("submit...")
pre = y_test
sub = pd.read_csv('sample_submission.csv')
# Write the same prediction vector into every column except 'ParcelId'.
for c in sub.columns[sub.columns != 'ParcelId']:
    sub[c] = pre
# The output file is named after the current local time, e.g. 20240131_09_05.csv.
submit_file = '{}.csv'.format(datetime.now().strftime('%Y%m%d_%H_%M'))
sub.to_csv(submit_file, index=False, float_format='%.4f')
Exemplo n.º 26
0
def test_huber_sample_weights():
    """Check how HuberRegressor handles ``sample_weight``.

    Three properties are verified: unit weights reproduce the unweighted
    fit, integer weights match a fit on data in which the corresponding rows
    are physically repeated, and a sparse CSR input yields the same weighted
    coefficients as the dense one.
    """
    X, y = make_regression_with_outliers()
    estimator = HuberRegressor(fit_intercept=True)
    estimator.fit(X, y)
    expected_coef = estimator.coef_
    expected_intercept = estimator.intercept_

    # Every comparison below is made on values divided by a common scale, so
    # the number of decimals checked by assert_array_almost_equal does not
    # depend much on the magnitude of the coefficients (and hence on the
    # scale of the data or the regularization parameter).
    scale = max(np.mean(np.abs(expected_coef)),
                np.mean(np.abs(expected_intercept)))

    # Unit weights must give the same solution as no weights at all.
    unit_weights = np.ones(y.shape[0])
    estimator.fit(X, y, sample_weight=unit_weights)
    assert_array_almost_equal(estimator.coef_ / scale, expected_coef / scale)
    assert_array_almost_equal(estimator.intercept_ / scale,
                              expected_intercept / scale)

    # Reference fit: row 1 appears three times and row 3 twice.
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    extra_rows = np.vstack((X[1], X[1], X[3]))
    X_repeated = np.vstack((X, extra_rows))
    y_repeated = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
    estimator.fit(X_repeated, y_repeated)
    expected_coef = estimator.coef_
    expected_intercept = estimator.intercept_

    # Weighted fit on the original rows with the matching integer weights.
    sample_weight = np.ones(X.shape[0])
    sample_weight[1] = 3
    sample_weight[3] = 2
    estimator.fit(X, y, sample_weight=sample_weight)

    assert_array_almost_equal(estimator.coef_ / scale, expected_coef / scale)
    assert_array_almost_equal(estimator.intercept_ / scale,
                              expected_intercept / scale)

    # Sparse input must give the same weighted coefficients.
    sparse_estimator = HuberRegressor(fit_intercept=True)
    sparse_estimator.fit(sparse.csr_matrix(X), y, sample_weight=sample_weight)
    assert_array_almost_equal(sparse_estimator.coef_ / scale,
                              expected_coef / scale)
Exemplo n.º 27
0
other data points, e.g., due to measurement errors.""" 

from sklearn import linear_model
from sklearn.linear_model import HuberRegressor

# NOTE(review): the earlier comments spoke of 100 data points, but m is 10.
m = 10                            # first argument to GetFeaturesLabels; presumably the number of data points (TODO confirm)
max_r = 10                        # maximum number of features used 

X,y = GetFeaturesLabels(m,max_r)  # load features X and labels y for the settings above

# Despite the "linreg" prefix, both vectors record results of HuberRegressor fits.
linreg_time = np.zeros(max_r)     # execution time in ms of HuberRegressor.fit() for each r
linreg_error = np.zeros(max_r)    # training mean squared error of the fitted model for each r


# Fit on the first r+1 feature columns, timing each fit and recording the
# mean squared error on the same data the model was trained on.
for r in range(max_r):
    reg_hub = HuberRegressor(fit_intercept=False) 
    start_time = time.time()
    reg_hub = reg_hub.fit(X[:,:(r+1)], y)
    end_time = (time.time() - start_time)*1000  # elapsed time, seconds -> milliseconds
    linreg_time[r] = end_time
    pred = reg_hub.predict(X[:,:(r+1)])
    linreg_error[r] = mean_squared_error(y, pred)

# x axis: number of features used (1 .. max_r).
plot_x = np.linspace(1, max_r, max_r, endpoint=True)
# Left panel: training error; right panel: fit time.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
axes[0].plot(plot_x, linreg_error, label='MSE', color='red')
axes[1].plot(plot_x, linreg_time, label='time', color='green')
axes[0].set_xlabel('features')
axes[0].set_ylabel('empirical error')
axes[1].set_xlabel('features')
axes[1].set_ylabel('Time (ms)')
Exemplo n.º 28
0
desig = np.array(df["designation"].tolist())

# One column per input feature; the arrays are defined outside this section.
features = np.column_stack((BV, BR, BI, VR, VI, RI, totCounts, randomFeature))

# Hold out 10% of the rows for testing. No random_state is given, so the
# split is not pinned by this call.
features_train, features_test, temp_train, temp_test = train_test_split(
    features, temps, test_size=0.1)

# Display names, listed in the same order as the estimators below.
names = [
    "Random Forest", "Ada Boost", "Huber", "Linear Regression", "K Neighbours",
    "RANSAC", "TheilSen", "Gaussian Process", "SVR"
]

# Despite the variable name, every entry is a regressor.
classifiers = [
    RandomForestRegressor(),
    AdaBoostRegressor(),
    HuberRegressor(),
    LinearRegression(),
    KNeighborsRegressor(),
    RANSACRegressor(),
    TheilSenRegressor(),
    GaussianProcessRegressor(),
    SVR(kernel='rbf', gamma=0.1)
]

# 3x3 grid with shared axes: one panel for each of the nine regressors.
fig, axes = plt.subplots(3, 3, sharex=True, sharey=True)
fig.suptitle('Regressor Comparison', y=1.03, fontsize=18)
fig.text(0.5, -0.02, 'Actual Temperature / K', ha='center')
fig.text(-0.01,
         0.5,
         'Predicted Temperature / K',
         va='center',
Exemplo n.º 29
0
def test_huber_sample_weights():
    """Verify the ``sample_weight`` support of HuberRegressor.

    Unit weights must match the unweighted fit, integer weights must match
    a fit on data with the weighted rows repeated, and sparse CSR input must
    agree with the dense weighted coefficients.
    """
    X, y = make_regression_with_outliers()
    model = HuberRegressor()
    model.fit(X, y)
    reference_coef = model.coef_
    reference_intercept = model.intercept_

    # A common divisor for all comparisons, so that the decimal precision
    # used by assert_array_almost_equal is largely independent of the
    # amplitude of the coefficients, i.e. of the data scale and of the
    # regularization parameter.
    scale = max(np.mean(np.abs(reference_coef)),
                np.mean(np.abs(reference_intercept)))

    def check_rescaled(actual, expected):
        # Compare both values after dividing them by the common scale.
        assert_array_almost_equal(actual / scale, expected / scale)

    # Weights that are all one change nothing.
    model.fit(X, y, sample_weight=np.ones(y.shape[0]))
    check_rescaled(model.coef_, reference_coef)
    check_rescaled(model.intercept_, reference_intercept)

    # Reference: a fit on data where row 1 occurs three times and row 3 twice.
    X, y = make_regression_with_outliers(n_samples=5, n_features=20)
    duplicated_rows = np.vstack((X[1], X[1], X[3]))
    X_new = np.vstack((X, duplicated_rows))
    y_new = np.concatenate((y, [y[1]], [y[1]], [y[3]]))
    model.fit(X_new, y_new)
    reference_coef = model.coef_
    reference_intercept = model.intercept_

    # The same multiplicities expressed as sample weights.
    sample_weight = np.ones(X.shape[0])
    sample_weight[1] = 3
    sample_weight[3] = 2
    model.fit(X, y, sample_weight=sample_weight)

    check_rescaled(model.coef_, reference_coef)
    check_rescaled(model.intercept_, reference_intercept)

    # The sparse code path must honour the weights as well.
    X_csr = sparse.csr_matrix(X)
    sparse_model = HuberRegressor()
    sparse_model.fit(X_csr, y, sample_weight=sample_weight)
    check_rescaled(sparse_model.coef_, reference_coef)
Exemplo n.º 30
0
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


# (label, estimator) pairs; labels run from 'model1' to 'model23' in the
# order the estimators are listed. Every estimator uses its default settings.
aml_basic_regressors = [
    (f'model{position}', estimator)
    for position, estimator in enumerate(
        (
            LinearRegression(),
            Lasso(),
            Ridge(),
            ElasticNet(),
            Lars(),
            LassoLars(),
            OrthogonalMatchingPursuit(),
            BayesianRidge(),
            ARDRegression(),
            PassiveAggressiveRegressor(),
            RANSACRegressor(),
            TheilSenRegressor(),
            HuberRegressor(),
            KernelRidge(),
            SVR(),
            KNeighborsRegressor(),
            DecisionTreeRegressor(),
            RandomForestRegressor(),
            ExtraTreesRegressor(),
            AdaBoostRegressor(),
            GradientBoostingRegressor(),
            MLPRegressor(),
            XGBRegressor(),
        ),
        start=1,
    )
]
)  #train the algorithm on training data and predict using the testing data
# Predictions of the already fitted `ransac` model on the test set.
y_predransac = ransac.predict(X_test)
# NOTE(review): if `ransac` is a scikit-learn RANSACRegressor, the fitted
# coefficients and intercept live on `ransac.estimator_`, not on the
# meta-estimator itself -- confirm that the next two lines run.
# zip() pairs each coefficient with an item from iterating X (presumably the
# column names of a DataFrame -- verify).
print('Betas: ', list(zip(ransac.coef_, X)))
print('Beta0: %.2f' % ransac.intercept_)  # intercept

# 5.1.5.2 Theil-Sen regression
ts = TheilSenRegressor()
pred_ts = ts.fit(X_train, y_train).predict(
    X_test
)  # fit on the training data and predict on the test data
# Second predict() call on the same fitted model and the same X_test as pred_ts.
y_predts = ts.predict(X_test)
print('Betas: ', list(zip(ts.coef_, X)))
print('Beta0: %.2f' % ts.intercept_)  # intercept

# 5.1.5.3 Huber regression
# alpha=0.0 is passed explicitly for the regularization parameter.
huber = HuberRegressor(alpha=0.0)
pred_huber = huber.fit(X_train, y_train).predict(
    X_test
)  # fit on the training data and predict on the test data
# Second predict() call on the same fitted model and the same X_test as pred_huber.
y_predhuber = huber.predict(X_test)
print('Betas: ', list(zip(huber.coef_, X)))
print('Beta0: %.2f' % huber.intercept_)  # intercept
"""# Regression Model selection
After calculating different regression models it is necessary to compare models and evaluate which is the best given the database.
- MAE
- MSE
- RMSE
- R²
- Adjusted R²
"""
Exemplo n.º 32
0
def _fit_score_stage(prefix, cols, model_names, pack, train, test, df_scale,
                     y, seed, folder_preds, folder_models):
    """Train one first-level "score" ensemble and persist its outputs.

    The feature subset ``cols`` is prepared with ``scale_select_data``, every
    model in ``pack`` is trained on that same prepared frame through
    ``TrendsModelSklearn``, and three artefacts are written:

    * ``<folder_preds><prefix>_score_seed<seed>.npy``      -- ``blend_oof()`` result
    * ``<folder_preds><prefix>_score_test_seed<seed>.npy`` -- ``predict()`` result
    * the models, via ``zoo.save_models(names, folder=folder_models)``

    :param prefix: short stage tag ('fnc', 'agg', ...); used in file/model names
    :param cols: feature columns handed to ``scale_select_data``
    :param model_names: one base name per estimator in ``pack``
    :param pack: list of unfitted estimators
    :param train, test, df_scale: frames forwarded to ``scale_select_data``
    :param y: training target forwarded to ``zoo.fit``
    :param seed: seed forwarded to ``TrendsModelSklearn`` and used in names
    :param folder_preds: directory prefix (with trailing slash) for .npy files
    :param folder_models: directory passed to ``save_models``
    """
    print('Creating {} score...'.format(prefix.upper()))

    train_for_score, test_for_score = scale_select_data(
        train, test, df_scale, cols)

    names = ['{}_{}_seed{}'.format(name, prefix, seed) for name in model_names]

    # every model of the stage sees the same prepared dataset
    n_models = len(pack)
    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_for_score] * n_models, y)
    score_blend = zoo.blend_oof()
    pred = zoo.predict([test_for_score] * n_models, names)

    # save oof, pred, models
    np.save(folder_preds + '{}_score_seed{}.npy'.format(prefix, seed),
            score_blend)
    np.save(folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed),
            pred)
    zoo.save_models(names, folder=folder_models)


def run(seed):
    """Run the full ``domain2_var1`` training/prediction pipeline for one seed.

    Steps, in order:

    1. Load bias dictionaries, raw tables and engineered feature tables from
       ``./data`` and merge everything on ``Id``.
    2. Split rows into train (``Id`` present in ``train_scores.csv``) and test.
    3. Add the stored bias offsets to the test features; rows whose ``Id`` is
       in the site-2 id list get the ``*_site`` offsets, the rest the plain ones.
    4. Train five first-level "score" ensembles (fnc, agg, pca, im, dl) and
       save their outputs under ``./predicts/domain2_var1/scores/`` and
       ``./models/domain2_var1/scores/``.
    5. Add those scores as features, train the final five-model ensemble and
       write models, OOFs and the blended test prediction CSV under the
       ``.../domain2_var1/stack/`` folders.

    :param seed: seed passed to ``TrendsModelSklearn`` and embedded in every
        output file name.
    :return: None; all results are written to disk.
    """
    # create folders for scores models and preds
    folder_models = './models/domain2_var1/scores/'
    os.makedirs(folder_models, exist_ok=True)

    folder_preds = './predicts/domain2_var1/scores/'
    os.makedirs(folder_preds, exist_ok=True)

    print('Loading data...')

    # load biases
    ic_bias = read_pickle('./data/biases/ic_biases.pickle')
    ic_bias_site = read_pickle('./data/biases/ic_biases_site.pickle')
    fnc_bias = read_pickle('./data/biases/fnc_biases.pickle')
    fnc_bias_site = read_pickle('./data/biases/fnc_biases_site.pickle')
    pca_bias = read_pickle('./data/biases/200pca_biases.pickle')
    pca_bias_site = read_pickle('./data/biases/200pca_biases_site.pickle')

    # load classifier output: extra ids to be treated as site 2
    extra_site = pd.DataFrame()
    extra_site['Id'] = np.load('./predicts/classifier/site2_test_new_9735.npy')

    # load competition data
    ids_df = pd.read_csv('./data/raw/reveal_ID_site2.csv')
    fnc_df = pd.read_csv('./data/raw/fnc.csv')
    loading_df = pd.read_csv('./data/raw/loading.csv')
    labels_df = pd.read_csv('./data/raw/train_scores.csv')

    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
    ids_df = pd.concat([ids_df, extra_site])
    print('Detected Site2 ids count: ', ids_df['Id'].nunique())

    # load created features
    agg_df = pd.read_csv('./data/features/agg_feats.csv')
    im_df = pd.read_csv('./data/features/im_feats.csv')
    dl_df = pd.read_csv('./data/features/dl_feats.csv')

    # the PCA features are stored in six files; only the first keeps 'Id'
    pca_parts = [
        pd.read_csv('./data/features/200pca_feats/200pca_3d_k0.csv')
    ]
    for i in range(1, 6):
        part = pd.read_csv(
            './data/features/200pca_feats/200pca_3d_k{}.csv'.format(i))
        del part['Id']
        pca_parts.append(part)
    pca_df = pd.concat(pca_parts, axis=1)

    # feature-column groups (first column of every table is skipped)
    ic_cols = list(loading_df.columns[1:])
    fnc_cols = list(fnc_df.columns[1:])
    agg_cols = list(agg_df.columns[1:])
    im_cols = list(im_df.columns[1:])
    pca_cols = list(pca_df.columns[1:])
    dl_cols = list(dl_df.columns[1:])
    pca0_cols = [c for c in pca_cols if 'k0' in c]

    # merge data
    df = fnc_df.merge(loading_df, on='Id')
    for extra in (agg_df, im_df, pca_df, dl_df, labels_df):
        df = df.merge(extra, how='left', on='Id')

    del loading_df, fnc_df, agg_df, im_df, pca_df, dl_df, pca_parts, extra
    gc.collect()

    # split train and test; explicit copies so the in-place edits below act
    # on independent frames rather than on slices of `df`
    is_train = df['Id'].isin(labels_df['Id'])
    train = df[is_train].copy()
    test = df[~is_train].copy()
    y = train['domain2_var1'].copy().reset_index(drop=True)
    d21_index = list(train['domain2_var1'].dropna().index)

    # apply biases: site-2 rows get the *_site offset, all others the plain one
    is_site2 = test['Id'].isin(ids_df['Id'])
    for bias, bias_site in ((ic_bias, ic_bias_site),
                            (fnc_bias, fnc_bias_site),
                            (pca_bias, pca_bias_site)):
        for c in bias_site.keys():
            test.loc[~is_site2, c] += bias[c]
            test.loc[is_site2, c] += bias_site[c]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # I. Create fnc score
    _fit_score_stage(
        'fnc', fnc_cols, ['ENet', 'BRidge'],
        [
            ElasticNet(alpha=0.05, l1_ratio=0.5, random_state=0),
            BayesianRidge()
        ],
        train, test, df_scale, y, seed, folder_preds, folder_models)

    # II. Create agg score
    _fit_score_stage(
        'agg', agg_cols, ['RGF', 'ENet', 'Huber'],
        [
            RGFRegressor(max_leaf=1000,
                         reg_depth=5,
                         min_samples_leaf=100,
                         normalize=True),
            ElasticNet(alpha=0.05, l1_ratio=0.3, random_state=0),
            HuberRegressor(epsilon=2.5, alpha=1)
        ],
        train, test, df_scale, y, seed, folder_preds, folder_models)

    # III.-V. Create pca, im and dl scores (same model trio for each)
    for prefix, cols in (('pca', pca_cols), ('im', im_cols),
                         ('dl', dl_cols)):
        _fit_score_stage(
            prefix, cols, ['ENet', 'BRidge', 'OMP'],
            [
                ElasticNet(alpha=0.2, l1_ratio=0.2, random_state=0),
                BayesianRidge(),
                OrthogonalMatchingPursuit()
            ],
            train, test, df_scale, y, seed, folder_preds, folder_models)

    # VI. Training and predicting procedure
    print('Training has started...')

    # add scores
    for prefix in ['fnc', 'agg', 'im', 'pca', 'dl']:
        train.loc[d21_index, prefix + '_score'] = np.load(
            folder_preds + '{}_score_seed{}.npy'.format(prefix, seed))
        test.loc[:, prefix + '_score'] = np.load(
            folder_preds + '{}_score_test_seed{}.npy'.format(prefix, seed))
    score_cols = [c for c in train.columns if c.endswith('_score')]

    # save df for scaling
    df_scale = pd.concat([train, test], axis=0)

    # create different datasets
    # linear
    linear_cols = sorted(
        list(set(ic_cols + fnc_cols + pca0_cols) - set(['IC_20'])))
    train_linear, test_linear = scale_select_data(train, test, df_scale,
                                                  linear_cols)

    # kernel
    kernel_cols = sorted(list(set(ic_cols + pca0_cols) - set(['IC_20'])))
    train_kernel, test_kernel = scale_select_data(train=train,
                                                  test=test,
                                                  df_scale=df_scale,
                                                  cols=kernel_cols,
                                                  scale_factor=0.2,
                                                  scale_cols=pca0_cols,
                                                  sc=StandardScaler())

    # score
    sc_cols = sorted(list(set(ic_cols + score_cols) - set(['IC_20'])))
    train_sc, test_sc = scale_select_data(train, test, df_scale, sc_cols)

    # learning process on different datasets
    names = ['GP', 'SVM1', 'SVM2', 'Lasso', 'BgR']
    names = [name + '_seed{}'.format(seed) for name in names]
    pack = [
        GaussianProcessRegressor(DotProduct(), random_state=0),
        NuSVR(C=3, kernel='rbf'),
        NuSVR(C=3, kernel='rbf'),
        Lasso(alpha=0.1, random_state=0),
        BaggingRegressor(Ridge(alpha=1),
                         n_estimators=100,
                         max_samples=0.2,
                         max_features=0.2,
                         random_state=0)
    ]

    zoo = TrendsModelSklearn(pack, seed=seed)
    zoo.fit([train_sc] * 2 + [train_kernel] + [train_linear] * 2, y)
    # result unused here; the call is kept because the original pipeline
    # always blends before predicting/saving (side effects not visible here)
    zoo.blend_oof()
    preds = zoo.predict([test_sc] * 2 + [test_kernel] + [test_linear] * 2,
                        names,
                        is_blend=True)

    # rewrite folders for models and preds
    folder_models = './models/domain2_var1/stack/'
    os.makedirs(folder_models, exist_ok=True)

    folder_preds = './predicts/domain2_var1/stack/'
    os.makedirs(folder_preds, exist_ok=True)

    print('Saving models to', folder_models)
    print('Saving predictions to', folder_preds)

    # save oofs and models
    zoo.save_oofs(names, folder=folder_preds)
    zoo.save_models(names, folder=folder_models)

    # stacking predictions
    print('Stacking predictions...')
    stack_path = folder_preds + 'domain2_var1_stack_seed{}.csv'.format(seed)
    d21_prediction = pd.DataFrame()
    d21_prediction['Id'] = test['Id'].values
    d21_prediction['pred'] = preds
    d21_prediction.to_csv(stack_path, index=False)
    print('domain2_var1 seed pred is saved as', stack_path)
def preprocess(data,
               fps=100.,
               old_fps=60,
               filter=None,
               verbosity=0,
               fps_threshold=.1):
    """
    Normalize calcium traces and spike trains.

    This function does three things:

        1. Remove a linear trend fitted with a Huber (robust) regression.
        2. Normalize the range of the calcium trace by the 5th and 80th percentile.
        3. Change the sampling rate of the calcium trace and spike train.

    If C{filter} is set, the first step is replaced by estimating and removing a
    baseline using a percentile filter (40 seconds seems like a good value for
    the percentile filter).

    The input is deep-copied; the caller's list is not modified. Each returned
    recording has a flattened C{'calcium'} array. Recordings that carry spike
    information (C{'spikes'} and/or C{'spike_times'}) additionally get flattened
    C{'spikes'} / C{'spike_times'} arrays and an integer C{'spike_count'};
    recordings without any spike information are returned without those keys
    instead of raising C{KeyError}.

    @type  data: list
    @param data: list of dictionaries containing calcium/fluorescence traces;
        each needs the keys C{'calcium'} and C{'fps'}

    @type  fps: float
    @param fps: desired sampling rate of signals

    @type  old_fps: int
    @param old_fps: not used by this function; kept for backward compatibility

    @type  filter: float/None
    @param filter: percentile filter length in seconds

    @type  verbosity: int
    @param verbosity: if positive, print messages indicating progress

    @type  fps_threshold: float
    @param fps_threshold: only resample if sampling rate differs more than this

    @rtype: list
    @return: list of preprocessed recordings
    """

    # fixed seed so the random sub-bin jitter added to spike times below is
    # reproducible (assumes `seed`/`rand` come from numpy.random -- confirm)
    seed(42)

    data = deepcopy(data)

    for k in range(len(data)):
        rec = data[k]  # same dict object; edits below update data[k] in place

        if verbosity > 0:
            print('Preprocessing calcium trace {0}...'.format(k))

        rec['fps'] = float(rec['fps'])

        if filter is None:
            # remove any linear trend: fit calcium against the sample index
            # with a Huber regression and subtract the fitted line
            X_temp = arange(0, len(rec['calcium'])).reshape(-1, 1)
            model = HuberRegressor()
            model.fit(X_temp, rec['calcium'])
            trend = model.predict(X_temp)
            rec['calcium'] = rec['calcium'] - trend

        else:
            # remove a running 5th-percentile baseline instead
            rec['calcium'] = rec['calcium'] - percentile_filter(
                rec['calcium'],
                window_length=int(rec['fps'] * filter),
                perc=5)

        # normalize dispersion
        calcium05 = percentile(rec['calcium'], 5)
        calcium80 = percentile(rec['calcium'], 80)

        if calcium80 - calcium05 > 0.:
            rec['calcium'] = ((rec['calcium'] - calcium05) /
                              float(calcium80 - calcium05)).reshape(
                                  (len(rec['calcium']), ))

        # compute spike times if binned spikes are given
        if 'spikes' in rec and 'spike_times' not in rec:
            spikes = asarray(rec['spikes'].ravel(), dtype='uint16')

            # compute spike times in milliseconds; a bin holding n spikes
            # contributes n times, each at a random position inside the bin
            spike_times = where(spikes > 0)[0]
            spike_times = repeat(spike_times, spikes[spike_times])
            spike_times = (spike_times +
                           rand(*spike_times.shape)) * (1000. / rec['fps'])

            rec['spike_times'] = sort(spike_times).reshape(1, -1)

        # normalize sampling rate
        if fps is not None and fps > 0. and abs(rec['fps'] -
                                                fps) > fps_threshold:
            # number of samples after update of sampling rate
            num_samples = int(
                float(rec['calcium'].size) * fps / rec['fps'] + .5)

            if num_samples != rec['calcium'].size:
                # factor by which number of samples will actually be changed
                factor = num_samples / float(rec['calcium'].size)

                # resample calcium signal
                rec['calcium'] = resample(rec['calcium'].ravel(),
                                          num_samples).reshape(1, -1)
                rec['fps'] = rec['fps'] * factor
        else:
            # don't change sampling rate
            num_samples = rec['calcium'].size

        # compute binned spike trains if missing or of the wrong length
        if 'spike_times' in rec and ('spikes' not in rec
                                     or num_samples != rec['spikes'].size):
            # spike times in bins
            spike_times = asarray(rec['spike_times'] * (rec['fps'] / 1000.),
                                  dtype=int).ravel()
            spike_times = spike_times[spike_times < num_samples]
            spike_times = spike_times[spike_times >= 0]

            # create binned spike train
            rec['spikes'] = zeros([1, num_samples], dtype='uint16')
            for t in spike_times:
                rec['spikes'][0, t] += 1

        # flatten everything to 1-D arrays
        rec['calcium'] = rec['calcium'].reshape(-1, )

        # At this point 'spikes' exists whenever any spike information was
        # supplied ('spike_times' likewise, see above).  Recordings with
        # calcium only used to crash on the spike count; skip them instead.
        if 'spikes' in rec:
            rec['spike_times'] = rec['spike_times'].reshape(-1, )
            rec['spikes'] = rec['spikes'].reshape(-1, )
            rec['spike_count'] = int(sum(rec['spikes']))

    return data
Exemplo n.º 34
0
def test_huber_max_iter():
    """With ``max_iter=1`` the reported ``n_iter_`` must equal ``max_iter``."""
    features, target = make_regression_with_outliers()
    # fit() returns the estimator itself, so construction and fitting chain
    model = HuberRegressor(max_iter=1).fit(features, target)
    assert model.n_iter_ == model.max_iter
Exemplo n.º 35
0
                        ):  #para percorrer por todas as pastas da pasta

    os.chdir(folder)
    name_folder = folder.split("/")[6]
    train_data = np.array(pd.read_csv('train_data.csv', sep=';'))
    test_data = np.array(pd.read_csv('test_data.csv', sep=';'))
    train_labels = np.array(pd.read_csv('train_labels.csv', sep=';'))
    test_labels = np.array(pd.read_csv('test_labels.csv', sep=';'))

    inicio = time.time()

    # importar o modelo de regressão linear
    from sklearn.linear_model import HuberRegressor

    # treinar o modelo no conjunto de dados
    regression = HuberRegressor().fit(train_data, train_labels)

    # prever
    predictions_labels = regression.predict(test_data)

    fim = time.time()
    df_time = pd.DataFrame({'Execution Time:': [fim - inicio]})

    output_path = os.path.join('/home/isadorasalles/Documents/Regressao/huber',
                               'time_' + name_folder)
    df_time.to_csv(output_path, sep=';')

    from sklearn import metrics

    df_metrics = pd.DataFrame({
        'Mean Absolute Error':
Exemplo n.º 36
0
import os
import pandas as pd
from sklearn.model_selection import cross_val_score
import random
import math
import numpy as np
from sklearn import metrics

data_train = pd.read_csv("train_dataset.csv")
data_test = pd.read_csv("test_dataset.csv")

# Feature columns: every training column except the target
# ('death_infection_rate') and the other columns excluded below.
non_feature_columns = ('death_infection_rate', 'country', 'num',
                       'sqrt-factor', 'ICU/thousand')
feature = [
    column for column in data_train.columns
    if column not in non_feature_columns
]
train_feature = data_train[feature]
train_target = data_train['death_infection_rate']
test_feature = data_test[feature]

# Fit a Huber regressor on the training data and predict the test rows.
LiR = HuberRegressor()
LiR.fit(train_feature, train_target)
predictions_LiR = LiR.predict(test_feature)
print(LiR.coef_)

print(LiR.intercept_)

# Append the predictions to the test table as an extra column
# (axis=1 concatenates column-wise; axis=0 would concatenate row-wise).
result1 = [pd.DataFrame(data_test), pd.DataFrame(predictions_LiR)]
result1_new = pd.concat(result1, axis=1)
result1_new.to_csv('CDR.csv', index=False)
Exemplo n.º 37
0
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

if __name__ == "__main__":
    # Compare three regressors on the './data/felicidad_corrupt.csv' dataset:
    # fit each on a 70/30 train/test split and report its test MSE and score.
    dataset = pd.read_csv('./data/felicidad_corrupt.csv')
    print(dataset.head(5))

    X = dataset.drop(['country', 'score'], axis=1)
    y = dataset[['score']]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Author's note (translated from Spanish): the default epsilon is 1.35
    # and it is best left unchanged, because this value gives better results
    # for 95% of the data.
    estimadores = {
        'SVR' : SVR(gamma='auto', C=1.0, epsilon=0.1),
        'RANSAC': RANSACRegressor(),
        'HBER': HuberRegressor(epsilon=1.35)
    }

    # The evaluation loop lives inside the __main__ guard: it depends on
    # names that are only defined here, so running it at import time would
    # raise NameError.
    for name, estimador in estimadores.items():
        estimador.fit(X_train, y_train)
        predictions = estimador.predict(X_test)
        print("="*64)
        print(name)
        print("MSE: ", mean_squared_error(y_test, predictions))
        print("Score: ", estimador.score(X_test, y_test))
Exemplo n.º 38
0
class TDPRegressor:
    """Scale features, fit an OLS or Huber regression and plot the results.

    Typical call order: construct, ``transform()``, ``fit()``, ``predict()``,
    then any of the ``Plot*`` methods. Figures and exported models are
    written below ``fig/final_v5/<tag>_<model>/``.

    Attributes set by the constructor: ``tag``, ``outdir``, ``model``, ``X``,
    ``y``, ``scaler`` and, for ``model`` in ``('ols', 'huber')``, ``regr``.
    ``X_scaled`` is set by ``transform()``; ``yhat`` by ``predict()``;
    ``frac_ame``, ``frac_err`` and ``R2`` by ``CalcErrorMetric()``.

    The plotting methods index ``X[:, 0]``, so ``X`` must be a 2-D array there.
    """

    def __init__(self, features=None, target=None, model='ols', tag='train'):
        """Create the output directory, fit the scaler and pick the regressor.

        :param features: feature matrix; ``None`` is treated as an empty list
        :param target: target values; ``None`` is treated as an empty list
        :param model: ``'ols'`` (LinearRegression) or ``'huber'``
            (HuberRegressor). For any other value no regressor is created;
            one can be supplied later through ``import_model``.
        :param tag: label combined with ``model`` to name the output directory
        """
        import os

        self.tag = tag + '_' + model
        self.outdir = 'fig/final_v5/' + self.tag
        self.model = model

        # Create the directory without going through a shell (the path is
        # built from caller-supplied text); an existing directory is fine.
        try:
            os.makedirs(self.outdir)
        except OSError:
            if not os.path.isdir(self.outdir):
                raise

        # setup analysis (fresh lists instead of shared mutable defaults)
        self.X = [] if features is None else features
        self.y = [] if target is None else target

        # Scale
        self.scaler = StandardScaler(with_mean=True, with_std=True).fit(self.X)

        if model == 'ols':
            self.regr = skl_lm.LinearRegression()
        elif model == 'huber':
            self.regr = HuberRegressor(fit_intercept=True,
                                       alpha=0.0,
                                       max_iter=100,
                                       epsilon=1.35)
        print(self)

    def __repr__(self):
        """Return the tag and the zero-padded (3 digits) number of entries."""
        return "Regression " + self.tag + " --- %.3d entries" % len(self.X)

    def Add(self, b):
        """Append the rows of another regressor ``b`` to this one.

        Both objects must already have ``X_scaled`` and ``yhat`` (i.e.
        ``transform()`` and ``predict()`` have been called on them).
        """
        self.X = np.append(self.X, b.X, axis=0)
        self.y = np.append(self.y, b.y, axis=0)
        self.X_scaled = np.append(self.X_scaled, b.X_scaled, axis=0)
        self.yhat = np.append(self.yhat, b.yhat, axis=0)

    def transform(self):
        """Compute ``X_scaled`` by applying the fitted scaler to ``X``."""
        self.X_scaled = self.scaler.transform(self.X)

    def fit(self):
        """Fit the regressor on ``X_scaled``/``y`` and print its parameters."""
        self.regr.fit(self.X_scaled, self.y)
        print(self.regr.intercept_)
        print(self.regr.coef_)

    def predict(self):
        """Store predictions in ``yhat``; compute metrics if targets exist."""
        self.yhat = self.regr.predict(self.X_scaled)
        if len(self.y) > 0:
            self.CalcErrorMetric()

    def CalcErrorMetric(self):
        """Print residual statistics and store ``frac_ame``/``frac_err``/``R2``."""
        y = self.y
        lin_rmse = np.sqrt(mean_squared_error(y, self.yhat))
        lin_ame = mean_absolute_error(y, self.yhat)
        lin_mad = mad(y - self.yhat)
        ymean = np.mean(y)
        # NOTE(review): both attributes hold the same ratio (mean absolute
        # error / mean target) -- confirm whether frac_err was meant to differ.
        self.frac_ame = lin_ame / ymean
        self.frac_err = lin_ame / ymean
        self.R2 = r2_score(self.y, self.yhat)
        # Single-argument print calls give the same output on Python 2 and 3.
        print(' '.join(
            str(part)
            for part in ('residual standard error (rse):', lin_rmse,
                         'residual mean_absolute_error:', lin_ame,
                         'residual mad', lin_mad, lin_ame, '<y>: ', ymean)))
        print('ratio (err):  %s' % (self.frac_err, ))
        print('R^2 score:  %s' % (self.R2, ))

    def PlotInputs(self, xmin=0.5, xmax=5000, xc='linear'):
        """Scatter the first feature against the target; save ``x_vs_t.png``.

        :param xmin, xmax: x-axis limits
        :param xc: scale name applied to both the x and the y axis
        """
        X = self.X
        y = self.y

        # vars to fit
        fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(7, 7))
        axes.scatter(X[:, 0], y, color='red', marker='o', alpha=0.2)
        axes.set_xlabel('x', fontsize='xx-large')
        axes.set_ylabel('Time (min)', fontsize='xx-large')
        plt.xscale(xc)
        plt.yscale(xc)
        plt.xlim(xmin, xmax)
        # positional limits: the ymin/ymax keywords no longer exist in
        # current Matplotlib releases
        plt.ylim(10, 100000)
        plt.savefig(self.outdir + '/x_vs_t.png')

    def PlotPerformanceSingle(self, xmin=1, xmax=5000, xc='linear'):
        """Overlay data and predictions; save ``data_model_vs_x.png``.

        :param xmin, xmax: x-axis limits
        :param xc: accepted for signature compatibility; both axes are
            always drawn on a linear scale
        """
        X = self.X
        y = self.y
        yhat = self.yhat

        fig, axarr = plt.subplots(nrows=1,
                                  ncols=1,
                                  figsize=(7, 7),
                                  sharex=False)
        axarr.scatter(X[:, 0],
                      y,
                      color='red',
                      marker='o',
                      alpha=0.2,
                      label='data')
        axarr.scatter(X[:, 0], yhat, color='blue', marker='s', alpha=0.5, s=5)
        axarr.set_xlabel('Volume (cm^3)', fontsize=20)
        axarr.set_ylabel('Time (min)', fontsize=20)
        axarr.yaxis.set_tick_params(labelsize=20)
        axarr.set_xscale('linear')
        axarr.set_yscale('linear')
        axarr.set_xlim(xmin, xmax)
        axarr.set_ylim(10, 15000)
        fig.savefig(self.outdir + '/data_model_vs_x.png')

    def PlotPerformance(self, xmin=1, xmax=5000, xc='linear', plotLeg=True):
        """Save a data-vs-model plot and a residual plot.

        Targets and predictions are divided by 60 before plotting and the
        axis is labelled in hours. Requires ``predict()`` to have run with
        targets present, because the legend uses ``frac_ame`` and ``R2``.

        :param xmin, xmax: x-axis limits of both figures
        :param xc: x-axis scale of the residual figure only
        :param plotLeg: draw the legend on the residual figure
        """
        X = self.X
        y = self.y / 60
        yhat = self.yhat / 60

        # figure 1: data and model against the first feature
        fig, axarr = plt.subplots(nrows=1,
                                  ncols=1,
                                  figsize=(7, 7),
                                  sharex=False)
        axarr.scatter(X[:, 0],
                      y,
                      color='red',
                      marker='o',
                      alpha=0.2,
                      label='data')
        axarr.scatter(X[:, 0], yhat, color='blue', marker='s', alpha=0.5, s=5)
        axarr.set_xscale('linear')
        axarr.set_yscale('linear')
        axarr.set_xlim(xmin, xmax)
        axarr.set_ylim(0, 250)
        axarr.xaxis.set_tick_params(labelsize=20)
        axarr.yaxis.set_tick_params(labelsize=20)
        axarr.set_xlabel('Volume (cm^3)', fontsize=20)
        axarr.set_ylabel('Build Time (hours)', fontsize=20)
        fig.savefig(self.outdir + '/data_model_data_vs_x.png')

        # figure 2: residuals
        fig, axarr = plt.subplots(nrows=1,
                                  ncols=1,
                                  figsize=(7, 7),
                                  sharex=False)
        axarr.scatter(X[:, 0], y - yhat, color='red', alpha=0.2)
        axarr.set_xscale(xc)
        axarr.set_yscale('linear')
        axarr.set_xlim(xmin, xmax)
        axarr.set_ylim(-50, 50)
        axarr.xaxis.set_tick_params(labelsize=20)
        axarr.yaxis.set_tick_params(labelsize=20)
        axarr.set_xlabel('Volume (cm^3)', fontsize=20)
        axarr.set_ylabel('Build Time (hours)', fontsize=20)

        # zero line; its label carries the summary statistics for the legend
        axarr.scatter(X[:, 0].T,
                      X[:, 0].T * 0,
                      color='blue',
                      marker='s',
                      alpha=0.5,
                      s=5,
                      label=self.tag + '\nFrac Error = ' +
                      "%.3f" % self.frac_ame + '\n' + r'$R^2$ = ' +
                      "%.3f" % self.R2)

        if plotLeg:
            axarr.legend(loc='lower left', framealpha=0, fontsize=16)

        fig.savefig(self.outdir + '/data_model_residual_vs_x.png')

    def export_model(self):
        """Dump ``[scaler, regr]`` to ``<outdir>/<model>.pkl`` with joblib."""
        # Recent scikit-learn no longer bundles joblib; fall back to the
        # bundled copy only when the standalone package is unavailable.
        try:
            import joblib
        except ImportError:
            from sklearn.externals import joblib
        joblib.dump([self.scaler, self.regr],
                    self.outdir + '/' + self.model + '.pkl')

    def import_model(self, scaler, regr):
        """Replace the scaler and regressor with already-fitted ones."""
        self.scaler = scaler
        self.regr = regr
        print('scaler: %s' % (scaler.mean_, ))
        print('regr coefficients: %s %s' % (regr.intercept_, regr.coef_))
Exemplo n.º 39
0
    def __init__(self):
        """Build the pool of base estimators used by this object.

        Sets three attributes:

        * ``base_models`` -- list of ``[label, estimator]`` pairs
        * ``LR``          -- the very same LogisticRegression instance that is
          stored under the label ``'LR1'`` (and used inside the voting
          ensemble)
        * ``svc``         -- a separate SVC instance
        """
        seed = 8240

        # Small factories for the estimator families that repeat with only a
        # few arguments changing.
        def make_sgd(penalty):
            return SGDClassifier(alpha=5e-05,
                                 average=False,
                                 class_weight='balanced',
                                 loss='log',
                                 n_iter=30,
                                 penalty=penalty,
                                 n_jobs=-1,
                                 random_state=seed)

        def make_logreg(C, **extra):
            return LogisticRegression(C=C,
                                      n_jobs=-1,
                                      max_iter=100,
                                      class_weight='balanced',
                                      random_state=seed,
                                      **extra)

        def make_pac(C, loss, n_iter, **extra):
            return PassiveAggressiveClassifier(C=C,
                                               loss=loss,
                                               n_iter=n_iter,
                                               n_jobs=-1,
                                               **extra)

        # These five instances are shared: each appears both on its own in
        # the pool and as a member of the hard-voting ensemble.
        sgd_l2 = make_sgd('l2')
        mnb_01 = MultinomialNB(alpha=0.1)
        lsvc_01 = LinearSVC(C=0.1, random_state=seed)
        logreg_10 = make_logreg(1.0)
        bnb_01 = BernoulliNB(alpha=0.1)
        voter = VotingClassifier(estimators=[('sgd', sgd_l2), ('mb', mnb_01),
                                             ('bb', lsvc_01),
                                             ('lf', logreg_10),
                                             ('bnb', bnb_01)],
                                 voting='hard')

        # NOTE(review): some labels do not match the estimator type they are
        # attached to (e.g. 'rc3' holds a PassiveAggressiveClassifier, 'lr5'
        # and 'lr6' hold Lasso); they are kept exactly as they were.
        self.base_models = [
            ['sgd', sgd_l2],
            ['nb', mnb_01],
            ['lsvc1', lsvc_01],
            ['LR1', logreg_10],
            ['bb', bnb_01],
            ['vote', voter],
            ['sgdl1', make_sgd('l1')],
            ['lsvc2', LinearSVC(C=0.9, random_state=seed)],
            ['LR2', make_logreg(0.5)],
            ['nb2', MultinomialNB(alpha=0.9)],
            ['bb2', BernoulliNB(alpha=0.9)],
            ['LR3', make_logreg(0.2, penalty='l1')],
            ['LR4', make_logreg(0.8, penalty='l1')],
            ['rc1', RidgeClassifier(alpha=8)],
            ['pac1', make_pac(0.01, 'squared_hinge', 20)],
            ['rc2', RidgeClassifier(alpha=2)],
            ['pac2', make_pac(0.5, 'squared_hinge', 30)],
            ['lsvc3', LinearSVC(C=0.5, random_state=seed)],
            ['nb3', MultinomialNB(alpha=0.5)],
            ['bb3', BernoulliNB(alpha=0.5)],
            ['lr5', Lasso(alpha=0.1, max_iter=20, random_state=seed)],
            ['lr6', Lasso(alpha=0.9, max_iter=30, random_state=seed)],
            ['rc3', make_pac(0.1, 'hinge', 30, random_state=seed)],
            ['pac3', make_pac(0.9, 'hinge', 30, random_state=seed)],
            ['hub', HuberRegressor(max_iter=30)],
        ]
        #####################################
        self.LR = logreg_10
        self.svc = SVC(C=1, random_state=seed, cache_size=1000)
Exemplo n.º 40
0
def _ellip_smooth(R, E, deg):
    """Return a smoothed version of ``E`` from a robust polynomial fit in log10(R).

    A pipeline of ``PolynomialFeatures(deg)`` followed by
    ``HuberRegressor(epsilon=2.)`` is fitted with ``log10(R)`` (as a single
    feature column) as input and ``_inv_x_to_eps(E)`` as target. The fitted
    model is then evaluated at the same inputs and its predictions are mapped
    back through ``_x_to_eps``.

    Parameters
    ----------
    R : array-like
        Values whose base-10 logarithm is the regressor.
    E : array-like
        Values to smooth; passed through ``_inv_x_to_eps`` before fitting.
    deg : int
        Degree handed to ``PolynomialFeatures``.

    Returns
    -------
    The result of ``_x_to_eps`` applied to the model predictions.

    Notes
    -----
    NOTE(review): ``_inv_x_to_eps`` / ``_x_to_eps`` are defined elsewhere;
    presumably they are inverse transforms of each other -- confirm.
    """
    smoother = make_pipeline(
        PolynomialFeatures(deg),
        HuberRegressor(epsilon=2.),
    )
    # One (n_samples, 1) design column, shared by fit and predict.
    log_r = np.log10(R).reshape(-1, 1)
    smoother.fit(log_r, _inv_x_to_eps(E))
    fitted = smoother.predict(log_r)
    return _x_to_eps(fitted)
Exemplo n.º 41
0
 "kr":
 SklearnWrapper(KernelRidge(), accept_singleton=True),
 "rf":
 SklearnWrapper(RandomForestRegressor(), accept_singleton=True),
 "gb":
 SklearnWrapper(MultiOutputRegressor(GradientBoostingRegressor()),
                accept_singleton=True),
 "lr":
 SklearnWrapper(Pipeline([("poly", PolynomialFeatures(2)),
                          ("regressor",
                           MultiOutputRegressor(LinearRegression()))]),
                accept_singleton=True),
 "hr":
 SklearnWrapper(Pipeline([("poly", PolynomialFeatures(2)),
                          ("regressor",
                           MultiOutputRegressor(HuberRegressor()))]),
                accept_singleton=True),
 "ran":
 SklearnWrapper(Pipeline([("poly", PolynomialFeatures(2)),
                          ("regressor",
                           MultiOutputRegressor(RANSACRegressor()))]),
                accept_singleton=True),
 "gpr":
 SklearnWrapper(MultiOutputRegressor(GaussianProcessRegressor()),
                accept_singleton=True),
 "wei":
 SklearnWrapper(MultiOutputRegressor(WeightedCurver(maxfev=100000)),
                accept_singleton=True),
 "sum":
 SklearnWrapper(MultiOutputRegressor(
     SummedCurver(maxfev=2000, method="dogbox")),
Exemplo n.º 42
0
# Draw the y-values of the four outliers; they are offset just below.
y_outliers = rng.normal(0, 2.0, size=4)

# Shift the first two outliers by (max + mean/4) in X and (min - mean/4) in y,
# and the last two the other way round, so high X is paired with low y and
# low X with high y.
X_outliers[:2, :] += X.max() + X.mean() / 4.0
X_outliers[2:, :] += X.min() - X.mean() / 4.0
y_outliers[:2] += y.min() - y.mean() / 4.0
y_outliers[2:] += y.max() + y.mean() / 4.0

# Append the outliers to the data set and show all points.
X = np.vstack([X, X_outliers])
y = np.concatenate([y, y_outliers])
plt.plot(X, y, 'b.')

# Fit the Huber regressor for a series of epsilon values, one line style each.
colors = ['r-', 'b-', 'y-', 'm-']
epsilon_values = [1.35, 1.5, 1.75, 1.9]

# Abscissa on which every fitted line is drawn.
x = np.linspace(X.min(), X.max(), 7)

for line_style, epsilon in zip(colors, epsilon_values):
    huber = HuberRegressor(epsilon=epsilon, alpha=0.0, max_iter=100,
                           fit_intercept=True)
    huber.fit(X, y)
    # Evaluate the fitted line coef * x + intercept on the plotting grid.
    plt.plot(x, huber.coef_ * x + huber.intercept_, line_style,
             label="huber loss, %s" % epsilon)

# Fit a ridge regressor on the same data for comparison.
# NOTE(review): `normalize` is not accepted by recent scikit-learn releases
# -- confirm the targeted version before upgrading.
ridge = Ridge(fit_intercept=True, alpha=0.0, random_state=0, normalize=True)
ridge.fit(X, y)
coef_ridge = ridge.coef_
plt.plot(x, coef_ridge * x + ridge.intercept_, 'g-',
         label="ridge regression")

# Decorate the figure.
plt.title("Comparison of HuberRegressor vs Ridge")
plt.xlabel("X")
plt.ylabel("y")
plt.legend(loc=0)
Exemplo n.º 43
0
# doesn't appear to be any trend with the year.

# Box plot of the sale price grouped by the year of sale.
fig, ax = plt.subplots()
price_and_year = train[['SalePrice', 'YrSold']]
price_and_year.boxplot(column='SalePrice', by='YrSold', ax=ax)
ax.set_xlabel('Year sold')
ax.set_ylabel('Price ($)')
# Blank out the figure-level title (presumably the one pandas adds for
# grouped box plots).
fig.suptitle("")
plt.show()

# %% From a human perspective, the living space looks like the strongest
# indicator of price, so let's see whether a basic fit can be made. We use
# HuberRegressor because it is more robust to outliers.

fig, ax = plt.subplots()
ax.scatter(train.GrLivArea, train.SalePrice, alpha=0.2, label='Real data')

# scikit-learn estimators take X as a 2-D (n_samples, n_features) array, so
# the single feature is reshaped into one column. Build it once and reuse it
# for fitting, predicting and plotting.
living_area = train.GrLivArea.values.reshape(-1, 1)

clf = HuberRegressor()
# The target is passed as a 1-D array: a column-vector y makes scikit-learn
# emit a DataConversionWarning before flattening it.
clf.fit(living_area, train.SalePrice.values)

# Predict once and reuse the result for the plotted line.
salePredictGrLivArea = clf.predict(living_area)
ax.plot(living_area, salePredictGrLivArea, 'black', label='Linear fit')
plt.xlabel('Living area')
plt.ylabel('Price ($)')
plt.legend()
plt.show()

# %% Let's look at correlations in the dataset (at least between numeric values)
# We only care about correlations with sale price, so let's visualise that.
# It turns out a number of variables have a large positive correlation