def test_skinny_fat():
    """Check that the "fat" and "skinny" sqrt-LASSO solvers agree.

    The comparison runs twice: once on ``instance()`` with its default
    arguments and once on ``instance(p=50)``.  Each solver is given the
    constant weight vector ``np.ones(p) * choose_lambda(X)`` and the first
    element of each solver's return value must match to ``rtol=1e-3``.
    """
    solvers = (solve_sqrt_lasso_fat, solve_sqrt_lasso_skinny)

    for instance_kwargs in ({}, {'p': 50}):
        X, Y = instance(**instance_kwargs)[:2]
        _, p = X.shape
        lam = choose_lambda(X)

        # Both objectives are constructed, but their values are not
        # inspected by this test.
        sqlasso_objective(X, Y)
        sqlasso_objective_skinny(X, Y)

        # Fat solver first, then skinny; each gets its own weight array.
        fat_soln, skinny_soln = [
            solve(X,
                  Y,
                  weights=np.ones(p) * lam,
                  solve_args={'min_its': 500})[0]
            for solve in solvers
        ]

        np.testing.assert_allclose(fat_soln, skinny_soln, rtol=1.e-3)
示例#2
0
def test_sqrt_lasso_sandwich_pvals(n=200,
                                   p=50,
                                   s=10,
                                   sigma=10,
                                   rho=0.3,
                                   signal=6.,
                                   use_lasso_sd=False):
    """Fit a sqrt-LASSO with ``covariance='sandwich'`` on noisy data.

    Extra noise whose scale depends on the last three columns of ``X`` is
    added to ``y`` in place before fitting.  Features 10 and 11 get zero
    weight; all others get ``choose_lambda(X)``.

    ``use_lasso_sd`` is accepted but not used in this function body.

    Returns
    -------
    ``(pvals, is_true_active)`` taken from the two-sided summary when every
    index in ``true_active`` is in the fitted active set; ``None`` otherwise.
    """
    X, y, beta, true_active, sigma, _ = instance(
        n=n, p=p, s=s, sigma=sigma, rho=rho, signal=signal)

    def _column_scaled_noise(column, offset):
        # Gaussian draw scaled by sigma and by (|X[:, column]| + offset)^2.
        return sigma * np.random.standard_normal(n) * (
            np.fabs(X[:, column]) + offset)**2

    extra_noise = _column_scaled_noise(-1, 0.5)
    extra_noise += _column_scaled_noise(-2, 0.2)
    extra_noise += _column_scaled_noise(-3, 0.5)
    y += extra_noise

    feature_weights = np.ones(p) * choose_lambda(X)
    feature_weights[10:12] = 0

    fitted = lasso.sqrt_lasso(X, y, feature_weights, covariance='sandwich')
    fitted.fit()

    if not set(true_active).issubset(fitted.active):
        return None

    table = fitted.summary('twosided')
    return table['pval'], [v in true_active for v in table['variable']]
示例#3
0
def test_logistic_pvals(n=500,
                        p=200,
                        s=3,
                        sigma=2,
                        rho=0.3,
                        snr=7.):
    """Fit a logistic LASSO with a column of ones prepended to the design.

    The response is ``y > 0``.  The first column gets weight 0 and each of
    the remaining ``p`` columns gets weight 1.2.

    Returns
    -------
    ``(pvals, is_true_active)`` from the one-sided summary when the computed
    ``true_active`` indices are all in the fitted active set; ``None``
    otherwise.
    """
    X, y, beta, true_active, sigma = instance(
        n=n, p=p, s=s, sigma=sigma, rho=rho, snr=snr)

    z = y > 0
    X = np.hstack([np.ones((n, 1)), X])

    # Shift the indices by one for the added column and prepend index 0.
    shifted = np.array(true_active)
    shifted += 1
    shifted = [0] + list(shifted)
    # NOTE(review): np.nonzero returns the *positions* of the nonzero
    # entries of ``shifted``, not the entries themselves -- confirm this is
    # the intended definition of ``true_active``.
    true_active = np.nonzero(shifted)[0]

    L = lasso.logistic(X, z, [0] + [1.2] * p)
    L.fit()
    S = L.summary('onesided')

    if not set(true_active).issubset(L.active):
        return None
    return S['pval'], [v in true_active for v in S['variable']]
示例#4
0
def test_sqrt_lasso_pvals(n=100,
                          p=200,
                          s=7,
                          sigma=5,
                          rho=0.3,
                          snr=7.):
    """Fit a sqrt-LASSO whose first three features are unpenalized.

    The penalty level is 0.7 times a simulated value: the mean over 1000
    Gaussian draws of ``max |X.T z|``, divided by ``sqrt(n)``.

    Returns
    -------
    ``(pvals, is_true_active)`` from the two-sided summary when every index
    in ``true_active`` is in the fitted active set; ``None`` otherwise.
    """
    X, y, beta, true_active, sigma = instance(
        n=n, p=p, s=s, sigma=sigma, rho=rho, snr=snr)

    gaussian_draws = np.random.standard_normal((n, 1000))
    sup_norms = np.fabs(np.dot(X.T, gaussian_draws)).max(0)
    lam_theor = np.mean(sup_norms) / np.sqrt(n)

    # Constructed but not used further in this test.
    rr.identity_quadratic(0.01, 0, np.ones(p), 0)

    weights_with_zeros = 0.7 * lam_theor * np.ones(p)
    weights_with_zeros[:3] = 0.

    # Smoke check: construction with covariance='parametric'.
    lasso.sqrt_lasso(X, y, weights_with_zeros, covariance='parametric')

    L = lasso.sqrt_lasso(X, y, weights_with_zeros)
    L.fit()

    if not set(true_active).issubset(L.active):
        return None

    # The one-sided summary is computed only to exercise that code path.
    L.summary('onesided')
    S = L.summary('twosided')
    return S['pval'], [v in true_active for v in S['variable']]
示例#5
0
def test_gaussian_sandwich_pvals(n=200,
                                 p=50,
                                 s=10,
                                 sigma=10,
                                 rho=0.3,
                                 snr=6.,
                                 use_lasso_sd=False):
    """Compare parametric and sandwich covariance estimators for the LASSO.

    Extra noise whose scale depends on the last three columns of ``X`` is
    added to ``y`` in place.  Features 10 and 11 are unpenalized; the other
    weights are ``3 * sigma``.

    Parameters
    ----------
    use_lasso_sd : bool
        When true, a preliminary LASSO fit supplies the dispersion passed to
        the parametric estimator; otherwise ``dispersion=None`` is used.

    Returns
    -------
    ``(P_P, P_S, is_true_active)`` when every index in ``true_active`` is in
    the active set of the parametric fit, where ``P_P`` / ``P_S`` are the
    two-sided p-values of variables not in ``true_active`` under the
    parametric / sandwich estimator.  ``None`` otherwise.
    """
    X, y, beta, true_active, sigma = instance(
        n=n, p=p, s=s, sigma=sigma, rho=rho, snr=snr)

    def _column_scaled_noise(column, offset):
        # Gaussian draw scaled by sigma and by (|X[:, column]| + offset)^2.
        return (sigma * np.random.standard_normal(n)
                * (np.fabs(X[:, column]) + offset)**2)

    def _null_pvalues(summary):
        # P-values of the reported variables that are not truly active.
        return [pval
                for pval, var in zip(summary['pval'], summary['variable'])
                if var not in true_active]

    heteroscedastic_error = _column_scaled_noise(-1, 0.5)
    heteroscedastic_error += _column_scaled_noise(-2, 0.2)
    heteroscedastic_error += _column_scaled_noise(-3, 0.5)
    y += heteroscedastic_error

    # Two different estimators of variance share one loss.
    loss = rr.glm.gaussian(X, y)
    sandwich = glm_sandwich_estimator(loss, B=5000)

    # Make sure things work with some unpenalized columns.
    feature_weights = np.ones(p) * 3 * sigma
    feature_weights[10:12] = 0

    if use_lasso_sd:
        # Use the RSS from a preliminary LASSO fit.
        prelim = lasso.gaussian(X, y, feature_weights)
        prelim.fit()
        residual = y - X.dot(prelim.lasso_solution)
        sigma_hat = np.linalg.norm(residual)**2 / (n - len(prelim.active))
        # NOTE(review): sigma_hat is already RSS / df and is squared again
        # here -- confirm this is the intended dispersion.
        dispersion = sigma_hat**2
    else:
        dispersion = None
    parametric = glm_parametric_estimator(loss, dispersion=dispersion)

    L_P = lasso.gaussian(X, y, feature_weights,
                         covariance_estimator=parametric)
    L_P.fit()

    if not set(true_active).issubset(L_P.active):
        return None

    P_P = _null_pvalues(L_P.summary('twosided'))

    L_S = lasso.gaussian(X, y, feature_weights,
                         covariance_estimator=sandwich)
    L_S.fit()

    S = L_S.summary('twosided')
    P_S = _null_pvalues(S)

    return P_P, P_S, [v in true_active for v in S['variable']]
示例#6
0
def test_intervals(n=100, p=20, s=5):
    """Smoke test: fit a Gaussian LASSO and exercise its interval code.

    Nothing is asserted; the test passes if no call raises.
    """
    X, y, beta, true_active, sigma, _ = instance(n=n, p=p, s=s)

    model = lasso.gaussian(X, y, 4., sigma=sigma)
    model.fit()

    # Touch the attributes, in this order, so that access errors surface.
    for attribute in ('soln', 'constraints'):
        getattr(model, attribute)

    model.summary(compute_intervals=True)
    nominal_intervals(model)
示例#7
0
def test_gaussian_pvals(n=100, p=500, s=7, sigma=5, rho=0.3, signal=8.):
    """Fit a Gaussian LASSO with penalty 20 and return two-sided p-values.

    The model is fit twice; the second call passes the first fit's
    ``lasso_solution`` back into ``fit``.

    Returns
    -------
    ``(pvals, is_true_active)`` when every index in ``true_active`` is in
    the fitted active set; ``None`` otherwise.
    """
    X, y, beta, true_active, sigma, _ = instance(
        n=n, p=p, s=s, sigma=sigma, rho=rho, signal=signal)

    model = lasso.gaussian(X, y, 20., sigma=sigma)
    model.fit()
    model.fit(model.lasso_solution)

    if not set(true_active).issubset(model.active):
        return None

    # The one-sided summary is computed only to exercise that code path.
    model.summary('onesided')
    table = model.summary('twosided')
    return table['pval'], [v in true_active for v in table['variable']]
示例#8
0
def test_adding_quadratic_lasso():
    """Check the active-set gradient condition with an added quadratic.

    For each of two ``rr.identity_quadratic`` terms (first argument 0.01,
    then 0.0, each with a random linear term), a Gaussian LASSO with penalty
    20 is fit and the test asserts that, on the active set,
    ``gradient * sign(solution)`` is close to ``-20``.
    """
    X, y, beta, true_active, sigma = instance(n=300, p=200)

    for coef in (0.01, 0.0):
        linear_term = np.random.standard_normal(X.shape[1])
        quad = rr.identity_quadratic(coef, 0, linear_term, 0)

        model = lasso.gaussian(X, y, 20, quadratic=quad)
        soln = model.fit(solve_args={'min_its': 500, 'tol': 1.e-12})

        support = model.active
        # Gradient of the squared-error term plus that of the quadratic,
        # both restricted to the active coordinates.
        smooth_grad = X[:, support].T.dot(X.dot(soln) - y)
        grad = smooth_grad + quad.objective(soln, 'grad')[support]

        np.testing.assert_allclose(grad * np.sign(soln[support]), -20)
def test_gaussian(n=100, p=20):
    """Fit a Gaussian LASSO and print debiased-LASSO inference output.

    The first three features are unpenalized; the remaining weights are 1.1
    times the mean over 1000 Gaussian draws of ``max |X.T z|``.  Nothing is
    asserted: the result of ``debiased_lasso_inference`` and the true
    ``beta`` are printed.
    """
    X, y, beta = instance(n=n, p=p, sigma=1.)[:3]

    gaussian_draws = np.random.standard_normal((n, 1000))
    lam_theor = np.mean(np.fabs(np.dot(X.T, gaussian_draws)).max(0))

    weights = 1.1 * lam_theor * np.ones(p)
    weights[:3] = 0.

    model = lasso.gaussian(X, y, weights, sigma=1.)
    model.ignore_inactive_constraints = True
    model.fit()

    third_argument = np.sqrt(2 * np.log(p) / n)
    print(debiased_lasso_inference(model, model.active, third_argument))
    print(beta)
示例#10
0
def test_data_carving_poisson(n=500,
                              p=300,
                              s=5,
                              sigma=5,
                              rho=0.3,
                              signal=12.,
                              split_frac=0.8,
                              lam_frac=1.2,
                              ndraw=8000,
                              burnin=2000,
                              df=np.inf,
                              compute_intervals=True,
                              use_full_cov=True,
                              return_only_screening=True):
    """Run Poisson data carving and data splitting on a null response.

    A column of ones is prepended to the design and the response is replaced
    by Poisson(10) draws, so the only "true" active index is 0.  Column 0 is
    unpenalized; the other weights are 3.

    ``lam_frac``, ``compute_intervals``, ``use_full_cov`` and
    ``return_only_screening`` are accepted but not used in this function
    body.

    Returns
    -------
    ``(carve, split, active)`` when index 0 is in ``DC.active``: the carving
    and splitting hypothesis-test results for each selected variable, and a
    boolean mask of length ``p + 1`` that is true at index 0.  ``None``
    otherwise.
    """
    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal,
                                                 df=df)
    X = np.hstack([np.ones((n, 1)), X])
    y = np.random.poisson(10, size=y.shape)
    true_active = [0]

    # Random subset of the rows used for the first (selection) stage.
    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]

    lam_theor = 3. * np.ones(p + 1)
    lam_theor[0] = 0.
    DC = data_carving.poisson(X,
                              y,
                              feature_weights=lam_theor,
                              stage_one=stage_one)

    DC.fit()

    # Data splitting is only attempted when fewer variables were selected
    # than there are held-out rows.
    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.poisson(X,
                                    y,
                                    feature_weights=lam_theor,
                                    stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for data splitting second stage')
        print(DC.active)
        data_split = False

    print(DC.active)
    if not set(true_active).issubset(DC.active):
        return None

    carve = []
    split = []
    for var in DC.active:
        carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
        if data_split:
            split.append(DS.hypothesis_test(var))
        else:
            # Uniform draw used as a placeholder when splitting was skipped.
            split.append(np.random.sample())

    # ``np.bool`` was removed from NumPy; the builtin is the same type.
    active = np.zeros(p + 1, bool)
    active[true_active] = 1
    return carve, split, active
示例#11
0
def test_data_carving_sqrt_lasso(n=200,
                                 p=200,
                                 s=7,
                                 sigma=5,
                                 rho=0.3,
                                 signal=7.,
                                 split_frac=0.9,
                                 lam_frac=1.2,
                                 ndraw=8000,
                                 burnin=2000,
                                 df=np.inf,
                                 compute_intervals=True,
                                 return_only_screening=True):
    """Run sqrt-LASSO data carving and data splitting.

    The penalty is ``lam_frac`` times the mean over 5000 Gaussian draws of
    ``max |X[stage_one].T z|``, divided by ``sqrt(n1)``, where ``n1`` is the
    number of first-stage rows.

    ``compute_intervals`` and ``return_only_screening`` are accepted but not
    used in this function body.

    Returns
    -------
    ``(carve, split, active)`` when every index in ``true_active`` is in
    ``DC.active``: the carving and splitting hypothesis-test results for
    each selected variable, and a boolean mask of length ``p`` that is true
    at the ``true_active`` indices.  ``None`` otherwise.
    """
    X, y, beta, true_active, sigma, _ = instance(n=n,
                                                 p=p,
                                                 s=s,
                                                 sigma=sigma,
                                                 rho=rho,
                                                 signal=signal,
                                                 df=df)

    # Random subset of the rows used for the first (selection) stage.
    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]
    n1 = len(stage_one)

    lam_theor = lam_frac * np.mean(
        np.fabs(np.dot(X[stage_one].T, np.random.standard_normal(
            (n1, 5000)))).max(0)) / np.sqrt(n1)
    DC = data_carving.sqrt_lasso(X,
                                 y,
                                 feature_weights=lam_theor,
                                 stage_one=stage_one)

    DC.fit()

    # Data splitting is only attempted when fewer variables were selected
    # than there are held-out rows.
    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.sqrt_lasso(X,
                                       y,
                                       feature_weights=lam_theor,
                                       stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for second stage data splitting')
        print(DC.active)
        data_split = False

    if not set(true_active).issubset(DC.active):
        return None

    carve = []
    split = []
    for var in DC.active:
        carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
        if data_split:
            split.append(DS.hypothesis_test(var))
        else:
            # Uniform draw used as a placeholder when splitting was skipped.
            split.append(np.random.sample())

    # ``np.bool`` was removed from NumPy; the builtin is the same type.
    active = np.zeros(p, bool)
    active[true_active] = 1
    return carve, split, active
示例#12
0
    selected : []
        Sequence of selected variables.

    active_set : set
        Set of active variables.

    Returns
    -------

    idx : int
        Completion index.

    >>> selected = [1,3,2,4,6,7,8,23,11,5]
    >>> active = [1,4,8]
    >>> completion_index(selected, active)
    6
    """
    active_set = set(active_set)
    for i in range(len(selected)):
        if active_set.issubset(selected[:i]):
            return i - 1
    return len(selected) - 1


if __name__ == "__main__":
    from selection.tests.instance import gaussian_instance as instance

    # Other calls in this file that pass ``signal=`` unpack six values from
    # ``instance``; slicing keeps this script working whether five or six
    # values are returned.
    X, y, beta, active, sigma = instance(n=100, p=40, signal=0, rho=0.3)[:5]
    R, FS = compute_pvalues(y, X, sigma=sigma, maxstep=20)
    print(R)
    print(completion_index(R['variable_selected'], active))
    selected : []
        Sequence of selected variables.

    active_set : set
        Set of active variables.

    Returns
    -------

    idx : int
        Completion index.

    >>> selected = [1,3,2,4,6,7,8,23,11,5]
    >>> active = [1,4,8]
    >>> completion_index(selected, active)
    6
    """
    active_set = set(active_set)
    for i in range(len(selected)):
        if active_set.issubset(selected[:i]):
            return i-1
    return len(selected)-1

if __name__ == "__main__":
    from selection.tests.instance import gaussian_instance as instance

    # Other calls in this file that pass ``signal=`` unpack six values from
    # ``instance``; slicing keeps this script working whether five or six
    # values are returned.
    X, y, beta, active, sigma = instance(n=100, p=40, signal=0, rho=0.3)[:5]
    R, FS = compute_pvalues(y, X, sigma=sigma, maxstep=20)
    print(R)
    print(completion_index(R['variable_selected'], active))
示例#14
0
def test_data_carving_logistic(n=700,
                               p=300,
                               s=5,
                               sigma=5,
                               rho=0.05,
                               snr=4.,
                               split_frac=0.8,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               compute_intervals=True,
                               use_full_cov=False,
                               return_only_screening=True):
    """Run logistic data carving and data splitting.

    A binary response is drawn with success probability
    ``exp(mu) / (1 + exp(mu))`` where ``mu = X.dot(beta)``, and a column of
    ones is prepended to the design.  Column 0 is unpenalized; the other
    weights are 1.0.

    ``compute_intervals``, ``use_full_cov`` and ``return_only_screening``
    are accepted but not used in this function body.

    Returns
    -------
    ``(carve, split, active)`` when the computed ``true_active`` indices are
    all in ``DC.active``: the carving and splitting hypothesis-test results
    for each selected variable, and a boolean mask shaped like
    ``DC.active``.  ``None`` otherwise.
    """
    X, y, beta, true_active, sigma = instance(n=n,
                                              p=p,
                                              s=s,
                                              sigma=sigma,
                                              rho=rho,
                                              snr=snr,
                                              df=df)

    mu = X.dot(beta)
    prob = np.exp(mu) / (1 + np.exp(mu))

    X = np.hstack([np.ones((n, 1)), X])
    z = np.random.binomial(1, prob)

    # Shift the indices by one for the added column and prepend index 0.
    active = np.array(true_active)
    active += 1
    active = [0] + list(active)
    # NOTE(review): np.nonzero returns the *positions* of the nonzero
    # entries of ``active``, not the entries themselves -- confirm this is
    # the intended definition of ``true_active``.
    true_active = np.nonzero(active)[0]

    # Random subset of the rows used for the first (selection) stage.
    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]

    lam_theor = 1.0 * np.ones(p + 1)
    lam_theor[0] = 0.
    DC = data_carving.logistic(X, z, feature_weights=lam_theor,
                               stage_one=stage_one)

    DC.fit()

    # Data splitting is only attempted when fewer variables were selected
    # than there are held-out rows.
    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.logistic(X, z, feature_weights=lam_theor,
                                     stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for data splitting second stage')
        print(DC.active)
        data_split = False

    if not set(true_active).issubset(DC.active):
        # The original fell through to ``return return_value``, a name that
        # is never defined in this function; return None like the sibling
        # data-carving tests do when screening fails.
        return None

    carve = []
    split = []
    for var in DC.active:
        carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
        if data_split:
            split.append(DS.hypothesis_test(var))
        else:
            # Uniform draw used as a placeholder when splitting was skipped.
            split.append(np.random.sample())

    # ``np.bool`` was removed from NumPy; the builtin is the same type.
    # NOTE(review): the mask has the length of DC.active but is indexed by
    # ``true_active`` -- confirm the two index spaces are meant to coincide.
    active = np.zeros_like(DC.active, bool)
    active[true_active] = 1
    return carve, split, active
示例#15
0
def test_data_carving_gaussian(n=200,
                               p=200,
                               s=7,
                               sigma=5,
                               rho=0.3,
                               snr=7.,
                               split_frac=0.8,
                               lam_frac=2.,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               compute_intervals=True,
                               use_full_cov=True,
                               return_only_screening=True):
    """Run Gaussian data carving and data splitting.

    The penalty is ``lam_frac * sigma`` times the mean over 5000 Gaussian
    draws of ``max |X.T z|``.

    Parameters
    ----------
    use_full_cov : bool
        Passed to the final ``DS.fit`` call, after both settings have been
        exercised once.

    ``compute_intervals`` and ``return_only_screening`` are accepted but not
    used in this function body.

    Returns
    -------
    ``(carve, split, active)`` when every index in ``true_active`` is in
    ``DC.active``: the carving and splitting hypothesis-test results for
    each selected variable, and a boolean mask shaped like ``DC.active``.
    ``None`` otherwise.
    """
    X, y, beta, true_active, sigma = instance(n=n,
                                              p=p,
                                              s=s,
                                              sigma=sigma,
                                              rho=rho,
                                              snr=snr,
                                              df=df)

    # Random subset of the rows used for the first (selection) stage.
    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]

    lam_theor = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 5000)))).max(0)) * sigma
    DC = data_carving.gaussian(X, y, feature_weights=lam_theor,
                               sigma=sigma,
                               stage_one=stage_one)
    DC.fit()

    # Data splitting is only attempted when fewer variables were selected
    # than there are held-out rows.
    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.gaussian(X, y, feature_weights=lam_theor,
                                     sigma=sigma,
                                     stage_one=stage_one)
        # Exercise both covariance settings before the requested one.
        DS.fit(use_full_cov=True)
        DS.fit(use_full_cov=False)
        DS.fit(use_full_cov=use_full_cov)
        data_split = True
    else:
        print('not enough data for second stage data splitting')
        print(DC.active)
        data_split = False

    if not set(true_active).issubset(DC.active):
        return None

    carve = []
    split = []
    for var in DC.active:
        carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
        if data_split:
            split.append(DS.hypothesis_test(var))
        else:
            # Appropriate p-value if data splitting can't estimate the
            # second stage.
            split.append(np.random.sample())

    # ``np.bool`` was removed from NumPy; the builtin is the same type.
    # NOTE(review): the mask has the length of DC.active but is indexed by
    # ``true_active`` -- confirm the two index spaces are meant to coincide.
    active = np.zeros_like(DC.active, bool)
    active[true_active] = 1
    return carve, split, active