示例#1
0
def test_regularized_options():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec - 1, xmat)
    result1 = model1.fit_regularized(alpha=1., L1_wt=0.5)
    model2 = OLS(yvec, xmat, offset=1)
    result2 = model2.fit_regularized(alpha=1., L1_wt=0.5,
                                     start_params=np.zeros(5))
    assert_allclose(result1.params, result2.params)
示例#2
0
def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat)
    result2 = model2.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    assert_allclose(result1.params, result2.params)
    assert_allclose(result1.bse, result2.bse)
示例#3
0
def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat)
    result2 = model2.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    assert_allclose(result1.params, result2.params)
    assert_allclose(result1.bse, result2.bse)
def test_regularized_options():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)
    model1 = OLS(yvec - 1, xmat)
    result1 = model1.fit_regularized(alpha=1., L1_wt=0.5)
    model2 = OLS(yvec, xmat, offset=1)
    result2 = model2.fit_regularized(alpha=1., L1_wt=0.5,
                                     start_params=np.zeros(5))
    assert_allclose(result1.params, result2.params)
def test_repeat_partition():

    # tests that if we use identical partitions the average is the same
    # as the estimate for the full data

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    def _rep_data_gen(endog, exog, partitions):
        """partitions data"""

        n_exog = exog.shape[0]
        n_part = np.ceil(n_exog / partitions)

        ii = 0
        while ii < n_exog:
            yield endog, exog
            ii += int(n_part)

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
示例#6
0
    def test_regularized_weights(self):

        np.random.seed(1432)
        exog1 = np.random.normal(size=(100, 3))
        endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
        exog2 = np.random.normal(size=(100, 3))
        endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

        exog_a = np.vstack((exog1, exog1, exog2))
        endog_a = np.concatenate((endog1, endog1, endog2))

        # Should be equivalent to exog_a, endog_a.
        exog_b = np.vstack((exog1, exog2))
        endog_b = np.concatenate((endog1, endog2))
        wgts = np.ones(200)
        wgts[0:100] = 2
        sigma = np.diag(1 / wgts)

        for L1_wt in 0, 0.5, 1:
            for alpha in 0, 1:
                mod1 = OLS(endog_a, exog_a)
                rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod2 = WLS(endog_b, exog_b, weights=wgts)
                rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod3 = GLS(endog_b, exog_b, sigma=sigma)
                rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
                assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
    def test_regularized_weights(self):

        np.random.seed(1432)
        exog1 = np.random.normal(size=(100, 3))
        endog1 = exog1[:, 0] + exog1[:, 1] + np.random.normal(size=100)
        exog2 = np.random.normal(size=(100, 3))
        endog2 = exog2[:, 0] + exog2[:, 1] + np.random.normal(size=100)

        exog_a = np.vstack((exog1, exog1, exog2))
        endog_a = np.concatenate((endog1, endog1, endog2))

        # Should be equivalent to exog_a, endog_a.
        exog_b = np.vstack((exog1, exog2))
        endog_b = np.concatenate((endog1, endog2))
        wgts = np.ones(200)
        wgts[0:100] = 2
        sigma = np.diag(1/wgts)

        for L1_wt in 0, 0.5, 1:
            for alpha in 0, 1:
                mod1 = OLS(endog_a, exog_a)
                rslt1 = mod1.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod2 = WLS(endog_b, exog_b, weights=wgts)
                rslt2 = mod2.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                mod3 = GLS(endog_b, exog_b, sigma=sigma)
                rslt3 = mod3.fit_regularized(L1_wt=L1_wt, alpha=alpha)

                assert_almost_equal(rslt1.params, rslt2.params, decimal=3)
                assert_almost_equal(rslt1.params, rslt3.params, decimal=3)
    def test_regularized(self):

        import os
        from . import glmnet_r_results

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        data = np.loadtxt(os.path.join(cur_dir, "results", "lasso_data.csv"),
                          delimiter=",")

        tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]

        for test in tests:

            vec = getattr(glmnet_r_results, test)

            n = vec[0]
            p = vec[1]
            L1_wt = float(vec[2])
            lam = float(vec[3])
            params = vec[4:].astype(np.float64)

            endog = data[0:int(n), 0]
            exog = data[0:int(n), 1:(int(p)+1)]

            endog = endog - endog.mean()
            endog /= endog.std(ddof=1)
            exog = exog - exog.mean(0)
            exog /= exog.std(0, ddof=1)

            mod = OLS(endog, exog)
            rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
            assert_almost_equal(rslt.params, params, decimal=3)

            # Smoke test for summary
            smry = rslt.summary()
def test_repeat_partition():

    # tests that if we use identical partitions the average is the same
    # as the estimate for the full data

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    def _rep_data_gen(endog, exog, partitions):
        """partitions data"""

        n_exog = exog.shape[0]
        n_part = np.ceil(n_exog / partitions)

        ii = 0
        while ii < n_exog:
            yield endog, exog
            ii += int(n_part)

    nv_mod = DistributedModel(m,
                              estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_rep_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
示例#10
0
    def test_empty_model(self):

       np.random.seed(742)
       n = 100
       endog = np.random.normal(size=n)
       exog = np.random.normal(size=(n, 3))

       model = OLS(endog, exog)
       result = model.fit_regularized(alpha=1000)

       assert_equal(result.params, 0.)
示例#11
0
    def test_empty_model(self):

        np.random.seed(742)
        n = 100
        endog = np.random.normal(size=n)
        exog = np.random.normal(size=(n, 3))

        model = OLS(endog, exog)
        result = model.fit_regularized(alpha=1000)

        assert_equal(result.params, 0.)
示例#12
0
def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    # covariates 0 and 2 matter
    yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat[:, [0, 2]])
    result2 = model2.fit()
    ii = [0, 2]
    assert_allclose(result1.params[ii], result2.params)
    assert_allclose(result1.bse[ii], result2.bse)
示例#13
0
def test_regularized_refit():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    # covariates 0 and 2 matter
    yvec = xmat[:, 0] + xmat[:, 2] + np.random.normal(size=n)
    model1 = OLS(yvec, xmat)
    result1 = model1.fit_regularized(alpha=2., L1_wt=0.5, refit=True)
    model2 = OLS(yvec, xmat[:, [0, 2]])
    result2 = model2.fit()
    ii = [0, 2]
    assert_allclose(result1.params[ii], result2.params)
    assert_allclose(result1.bse[ii], result2.bse)
示例#14
0
def test_ridge():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    v = np.ones(p)
    v[0] = 0

    for a in (0, 1, 10):
        for alpha in (a, a*np.ones(p), a*v):
            model1 = OLS(yvec, xmat)
            result1 = model1._fit_ridge(alpha=alpha)
            model2 = OLS(yvec, xmat)
            result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
            assert_allclose(result1.params, result2.params)
            model3 = OLS(yvec, xmat)
            result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10)
            assert_allclose(result1.params, result3.params)

    fv1 = result1.fittedvalues
    fv2 = np.dot(xmat, result1.params)
    assert_allclose(fv1, fv2)
示例#15
0
def test_ridge():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    v = np.ones(p)
    v[0] = 0

    for a in (0, 1, 10):
        for alpha in (a, a * np.ones(p), a * v):
            model1 = OLS(yvec, xmat)
            result1 = model1._fit_ridge(alpha=alpha)
            model2 = OLS(yvec, xmat)
            result2 = model2.fit_regularized(alpha=alpha, L1_wt=0)
            assert_allclose(result1.params, result2.params)
            model3 = OLS(yvec, xmat)
            result3 = model3.fit_regularized(alpha=alpha, L1_wt=1e-10)
            assert_allclose(result1.params, result3.params)

    fv1 = result1.fittedvalues
    fv2 = np.dot(xmat, result1.params)
    assert_allclose(fv1, fv2)
示例#16
0
def test_ridge():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    for alpha in [1., np.ones(p), 10, 10*np.ones(p)]:
        model1 = OLS(yvec, xmat)
        result1 = model1._fit_ridge(alpha=1.)
        model2 = OLS(yvec, xmat)
        result2 = model2.fit_regularized(alpha=1., L1_wt=0)
        assert_allclose(result1.params, result2.params)

    fv1 = result1.fittedvalues
    fv2 = np.dot(xmat, result1.params)
    assert_allclose(fv1, fv2)
示例#17
0
def test_ridge():
    n = 100
    p = 5
    np.random.seed(3132)
    xmat = np.random.normal(size=(n, p))
    yvec = xmat.sum(1) + np.random.normal(size=n)

    for alpha in [1., np.ones(p), 10, 10*np.ones(p)]:
        model1 = OLS(yvec, xmat)
        result1 = model1._fit_ridge(alpha=1.)
        model2 = OLS(yvec, xmat)
        result2 = model2.fit_regularized(alpha=1., L1_wt=0)
        assert_allclose(result1.params, result2.params)

    fv1 = result1.fittedvalues
    fv2 = np.dot(xmat, result1.params)
    assert_allclose(fv1, fv2)
def _calc_nodewise_row(exog, idx, alpha):
    """calculates the nodewise_row values for the idxth variable, used to
    estimate approx_inv_cov.

    Parameters
    ----------
    exog : array-like
        The weighted design matrix for the current partition.
    idx : scalar
        Index of the current variable.
    alpha : scalar or array-like
        The penalty weight.  If a scalar, the same penalty weight
        applies to all variables in the model.  If a vector, it
        must have the same length as `params`, and contains a
        penalty weight for each coefficient.

    Returns
    -------
    An array-like object of length p-1

    Notes
    -----

    nodewise_row_i = arg min 1/(2n) ||exog_i - exog_-i gamma||_2^2
                             + alpha ||gamma||_1
    """

    p = exog.shape[1]
    # handle array alphas
    if not np.isscalar(alpha):
        alpha = alpha[ind]

    ind = list(range(p))
    ind.pop(idx)

    tmod = OLS(exog[:, idx], exog[:, ind])

    nodewise_row = tmod.fit_regularized(alpha=alpha).params

    return nodewise_row
def _calc_nodewise_row(exog, idx, alpha):
    """calculates the nodewise_row values for the idxth variable, used to
    estimate approx_inv_cov.

    Parameters
    ----------
    exog : array-like
        The weighted design matrix for the current partition.
    idx : scalar
        Index of the current variable.
    alpha : scalar or array-like
        The penalty weight.  If a scalar, the same penalty weight
        applies to all variables in the model.  If a vector, it
        must have the same length as `params`, and contains a
        penalty weight for each coefficient.

    Returns
    -------
    An array-like object of length p-1

    Notes
    -----

    nodewise_row_i = arg min 1/(2n) ||exog_i - exog_-i gamma||_2^2
                             + alpha ||gamma||_1
    """

    p = exog.shape[1]
    # handle array alphas
    if not np.isscalar(alpha):
        alpha = alpha[ind]

    ind = list(range(p))
    ind.pop(idx)

    tmod = OLS(exog[:, idx], exog[:, ind])

    nodewise_row = tmod.fit_regularized(alpha=alpha).params

    return nodewise_row
def test_single_partition():

    # tests that the results make sense if we have a single partition

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    # test regularized OLS v. naive
    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    nv_mod = DistributedModel(m,
                              estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit(alpha=0)

    assert_allclose(fitOLSdb.params, fitOLS.params)
    assert_allclose(fitOLSnv.params, fitOLS.params)

    # test regularized
    nv_mod = DistributedModel(m,
                              estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
def test_non_zero_params():

    # tests that the thresholding does not cause any issues

    np.random.seed(435265)
    N = 200
    p = 10
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m, join_kwds={"threshold": 0.13})
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    nz_params_db = 1 * (fitOLSdb.params != 0)
    nz_params_ols = 1 * (fitOLS.params != 0)

    assert_allclose(nz_params_db, nz_params_ols)
def test_non_zero_params():

    # tests that the thresholding does not cause any issues

    np.random.seed(435265)
    N = 200
    p = 10
    m = 5

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    db_mod = DistributedModel(m, join_kwds={"threshold": 0.13})
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})
    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    nz_params_db = 1 * (fitOLSdb.params != 0)
    nz_params_ols = 1 * (fitOLS.params != 0)

    assert_allclose(nz_params_db, nz_params_ols)
def test_single_partition():

    # tests that the results make sense if we have a single partition

    np.random.seed(435265)
    N = 200
    p = 10
    m = 1

    beta = np.random.normal(size=p)
    beta = beta * np.random.randint(0, 2, p)
    X = np.random.normal(size=(N, p))
    y = X.dot(beta) + np.random.normal(size=N)

    # test regularized OLS v. naive
    db_mod = DistributedModel(m)
    fitOLSdb = db_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit(alpha=0)

    assert_allclose(fitOLSdb.params, fitOLS.params)
    assert_allclose(fitOLSnv.params, fitOLS.params)

    # test regularized
    nv_mod = DistributedModel(m, estimation_method=_est_regularized_naive,
                              join_method=_join_naive)
    fitOLSnv = nv_mod.fit(_data_gen(y, X, m), fit_kwds={"alpha": 0.1})

    ols_mod = OLS(y, X)
    fitOLS = ols_mod.fit_regularized(alpha=0.1)

    assert_allclose(fitOLSnv.params, fitOLS.params)
示例#24
0
ts_b = np.round(ts_b, 3)
p_values = np.round(p_values, 3)
params = np.round(params, 4)

myDF3 = pd.DataFrame()
myDF3["Coefficients"], myDF3["Standard Errors"], myDF3["t values"], myDF3[
    "Probabilites"] = [params, sd_b, ts_b, p_values]
print(myDF3)

diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target
#
X2 = sm.add_constant(X)
est = OLS(y, X2)
est2 = est.fit_regularized()
print(est2.summary())

###### Ridge
from sklearn.linear_model import Ridge

# X, y = make_regression(n_features=2, random_state=0)
X = enb_feature
y = enb_target
regr = Ridge(alpha=5.0)
regr.fit(X, y)
print(regr.coef_)
print(regr.intercept_)
# a= [[4,5],[89,76]]
# np.array(a)
# print(regr.predict(a))
示例#25
-1
    def test_regularized(self):

        import os
        from . import glmnet_r_results

        cur_dir = os.path.dirname(os.path.abspath(__file__))
        data = np.loadtxt(os.path.join(cur_dir, "results", "lasso_data.csv"),
                          delimiter=",")

        tests = [x for x in dir(glmnet_r_results) if x.startswith("rslt_")]

        for test in tests:

            vec = getattr(glmnet_r_results, test)

            n = vec[0]
            p = vec[1]
            L1_wt = float(vec[2])
            lam = float(vec[3])
            params = vec[4:].astype(np.float64)

            endog = data[0:n, 0]
            exog = data[0:n, 1:(p + 1)]

            endog = endog - endog.mean()
            endog /= endog.std(ddof=1)
            exog = exog - exog.mean(0)
            exog /= exog.std(0, ddof=1)

            mod = OLS(endog, exog)
            rslt = mod.fit_regularized(L1_wt=L1_wt, alpha=lam)
            assert_almost_equal(rslt.params, params, decimal=3)

            # Smoke test for summary
            smry = rslt.summary()