Example no. 1
def test_group_lasso_weightedl1_lagrange():
    n, p = 100, 50

    X = np.random.standard_normal((n, p))
    Y = np.random.standard_normal(n)

    loss = rr.glm.gaussian(X, Y)
    weights = np.ones(p)
    weights[-2:] = np.inf
    weights[:2] = 0
    weight_dict = dict([(i, w) for i, w in enumerate(weights)])
    pen1 = rr.weighted_l1norm(weights, lagrange=0.5 * np.sqrt(n))
    pen2 = rr.group_lasso(np.arange(p),
                          weights=weight_dict,
                          lagrange=0.5 * np.sqrt(n))

    problem1 = rr.simple_problem(loss, pen1)
    problem2 = rr.simple_problem(loss, pen2)

    beta1 = problem1.solve(tol=1.e-14, min_its=500)
    beta2 = problem2.solve(tol=1e-14, min_its=500)

    npt.assert_allclose(beta1, beta2)

    bound_val = pen1.seminorm(beta1, lagrange=1)
    bound1 = rr.weighted_l1norm(weights, bound=bound_val)
    bound2 = rr.group_lasso(np.arange(p), weights=weight_dict, bound=bound_val)
    problem3 = rr.simple_problem(loss, bound1)
    problem4 = rr.simple_problem(loss, bound2)

    beta3 = problem3.solve(tol=1.e-14, min_its=500)
    beta4 = problem4.solve(tol=1.e-14, min_its=500)

    npt.assert_allclose(beta3, beta4)
    npt.assert_allclose(beta3, beta1)
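A minimal prox-level sketch of the same equivalence (an illustration, not part of the test suite; it assumes numpy as np, regreg.api as rr, and that weighted_l1norm exposes lagrange_prox the same way the group_lasso atom does in Examples no. 16 and 18):

import numpy as np
import regreg.api as rr

p = 10
weights = np.linspace(0.5, 2., p)
pen_l1 = rr.weighted_l1norm(weights, lagrange=1.)
pen_gl = rr.group_lasso(np.arange(p), weights=dict(enumerate(weights)), lagrange=1.)

Z = 3 * np.random.standard_normal(p)
# with singleton groups, the group-lasso prox reduces to weighted soft-thresholding
np.testing.assert_allclose(pen_l1.lagrange_prox(Z), pen_gl.lagrange_prox(Z))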
Example no. 2
def test_multiple_queries_individual_coeff(ndraw=10000, burnin=2000):
    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=5)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p)*lam
    W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]

    pvalues = []
    true_beta = beta[active_union]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        for j in range(nactive):

            subset = np.zeros(p, bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_union * ~subset,
                                                         mv,
                                                         subset=subset,
                                                         reference=np.zeros((1,)))
            test_stat = lambda x: np.atleast_1d(x)

            pval = target_sampler.hypothesis_test(test_stat,
                                                  np.atleast_1d(target_observed-true_beta[j]),
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        active_var = np.zeros_like(pvalues, bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
Example no. 3
def test_parametric_covariance(ndraw=10000, burnin=2000):
    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p, ), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, signal=12)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est1, M_est2])
    mv.solve()

    target = M_est1.selection_variable['variables'].copy()
    if target[-1] or M_est2.selection_variable['variables'][-1]:
        return None
    if target[-2] or M_est2.selection_variable['variables'][-2]:
        return None

    # we should check they are different sizes
    target[-2:] = 1

    if set(nonzero).issubset(np.nonzero(target)[0]):

        form_covariances = glm_parametric_covariance(loss)
        mv.setup_sampler(form_covariances)

        target_observed = restricted_Mest(loss, target)
        linear_func = np.zeros((2, target_observed.shape[0]))
        linear_func[0, -1] = 1.  # we know this one is null
        linear_func[1, -2] = 1.  # also null

        target_observed = linear_func.dot(target_observed)
        target_sampler = mv.setup_target((target, linear_func),
                                         target_observed,
                                         parametric=True)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(target_observed),
                                              alternative='greater',
                                              ndraw=ndraw,
                                              burnin=burnin)

        return [pval], [False]
Example no. 4
def data_splitting_screening(frac=0.5, snr=15, s=5, n=200, p=20, rho=0.1):

    count = 0

    while True:
        count += 1
        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)

        n2 = int(frac * n)
        X = X[:n2]
        y = y[:n2]

        nonzero = np.where(beta)[0]
        lam_frac = 1.

        loss = rr.glm.logistic(X, y)
        epsilon = 1. / np.sqrt(n2)

        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n2, 10000)))).max(0))
        W = np.ones(p) * lam
        penalty = rr.group_lasso(np.arange(p),
                                 weights=dict(zip(np.arange(p), W)),
                                 lagrange=1.)

        problem = rr.simple_problem(loss, penalty)
        quadratic = rr.identity_quadratic(epsilon, 0, 0, 0)

        soln = problem.solve(quadratic)
        active_set = np.nonzero(soln != 0)[0]
        if set(nonzero).issubset(active_set):
            return count
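A hedged usage sketch for the helper above (assuming numpy as np and data_splitting_screening are available as defined here): the function returns the number of independent draws until the split lasso screens the true support, so the reciprocal of the average count gives a rough estimate of the screening probability.

import numpy as np

counts = [data_splitting_screening(frac=0.5) for _ in range(20)]
print("estimated screening probability:", 1. / np.mean(counts))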
Example no. 5
    def __init__(
        self,
        loglike,
        groups,
        weights,
        ridge_term,
        randomizer,
        use_lasso=True,  # should lasso solver be used where applicable - defaults to True
        perturb=None):

        _check_groups(groups)  # make sure groups looks sensible

        # log likelihood : quadratic loss
        self.loglike = loglike
        self.nfeature = self.loglike.shape[0]

        # ridge parameter
        self.ridge_term = ridge_term

        # group lasso penalty (from regreg)
        # use regular lasso penalty if all groups are size 1
        if use_lasso and groups.size == np.unique(groups).size:
            # need to provide weights as an np.array rather than a dictionary
            weights_np = np.array([w[1] for w in sorted(weights.items())])
            self.penalty = rr.weighted_l1norm(weights=weights_np, lagrange=1.)
        else:
            self.penalty = rr.group_lasso(groups, weights=weights, lagrange=1.)

        # store groups as an attribute since the plain lasso penalty does not carry them
        self.groups = groups

        self._initial_omega = perturb

        # gaussian randomization
        self.randomizer = randomizer
Example no. 6
def randomization_screening(scale=1., snr=15, s=5, n=200, p=20, rho=0.1):

    count = 0

    randomizer = randomization.laplace((p, ), scale=scale)

    while True:
        count += 1
        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)

        nonzero = np.where(beta)[0]
        lam_frac = 1.

        loss = rr.glm.logistic(X, y)
        epsilon = 1. / np.sqrt(n)

        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))
        W = np.ones(p) * lam
        penalty = rr.group_lasso(np.arange(p),
                                 weights=dict(zip(np.arange(p), W)),
                                 lagrange=1.)

        M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
        M_est.solve()

        active_set = np.nonzero(M_est.initial_soln != 0)[0]
        if set(nonzero).issubset(active_set):
            return count
Example no. 7
def selection_nonrandomized(X, y, sigma=None, method="theoretical"):
    n, p = X.shape
    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    lam_frac = 1.
    if sigma is None:
        sigma = 1.
    if method == "theoretical":
        lam = 1. * sigma * lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 10000)))).max(0))

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # initial solution

    problem = rr.simple_problem(loss, penalty)
    random_term = rr.identity_quadratic(epsilon, 0, 0, 0)

    solve_args = {'tol': 1.e-10, 'min_its': 100, 'max_its': 500}
    initial_soln = problem.solve(random_term, **solve_args)
    active = (initial_soln != 0)
    if np.sum(active) == 0:
        return None
    initial_grad = loss.smooth_objective(initial_soln, mode='grad')
    betaE = initial_soln[active]
    subgradient = -(initial_grad + epsilon * initial_soln)
    cube = subgradient[~active] / lam
    return lam, epsilon, active, betaE, cube, initial_soln
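A short usage sketch (illustrative only; assumes numpy as np and regreg.api as rr are imported as in the snippet above). The unpacking follows the return statement of selection_nonrandomized, and the None check mirrors its empty-active-set branch.

import numpy as np

n, p = 100, 20
X = np.random.standard_normal((n, p))
y = X[:, :3].dot(np.array([3., 2., 1.5])) + np.random.standard_normal(n)

result = selection_nonrandomized(X, y)
if result is not None:
    lam, epsilon, active, betaE, cube, initial_soln = result
    print("lambda:", lam, "active set:", np.nonzero(active)[0])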
Example no. 8
def test_reconstruction(s=3,
                        n=200,
                        p=50,
                        snr=7,
                        rho=0.1,
                        split_frac=0.8,
                        lam_frac=0.7,
                        ndraw=100,
                        burnin=200,
                        bootstrap=True,
                        solve_args={
                            'min_its': 50,
                            'tol': 1.e-10
                        },
                        reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    M_est.selection_variable['variables'] = M_est.selection_variable[
        'variables']
    nactive = np.sum(M_est.selection_variable['variables'])

    if nactive == 0:
        return None

    if set(nonzero).issubset(
            np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        target_sampler, target_observed = glm_target(
            loss, M_est.selection_variable['variables'], mv)

        target_sample = target_sampler.sample(ndraw=ndraw,
                                              burnin=burnin,
                                              keep_opt=True)

        reconstruction = target_sampler.reconstruction_map(target_sample)
        logdens = target_sampler.log_randomization_density(target_sample)
        return logdens.shape
Example no. 9
def test_multiple_queries_individual_coeff_small(ndraw=10000,
                                                 burnin=2000, 
                                                 bootstrap=True):
    s, n, p = 3, 100, 20

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=20.)

    nonzero = np.where(beta)[0]
    lam_frac = 3.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p)*lam
    W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)

    # randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est])
    mv.solve()

    active_vars = M_est.selection_variable['variables'] 

    nactive = np.sum(active_vars)
    active_set = np.nonzero(active_vars)[0]

    pvalues = []
    true_beta = beta[active_vars]

    print(nonzero, active_set)
    if set(nonzero).issubset(active_set):

        for j in range(nactive):

            print(j)
            subset = np.zeros(p, bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_vars,
                                                         mv,
                                                         subset=subset,
                                                         bootstrap=bootstrap,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: x 

            pval = target_sampler.hypothesis_test(test_stat,
                                                  target_observed,
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)
        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
Example no. 10
def test_approximate_mle(n=100,
                         p=10,
                         s=3,
                         snr=5,
                         rho=0.1,
                         lam_frac = 1.,
                         loss='gaussian',
                         randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)
    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])

    true_support = np.asarray([i for i in range(p) if i < s])

    nactive = np.sum(active)

    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]

    print("true coefficients", true_vec)

    if set(true_support).issubset(active_set):

        mle_active = np.zeros(nactive)

        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]

        print("mle for target", mle_active)
Example no. 11
def test_selection():
    n = 500
    p = 100
    s = 0
    signal = 0.

    np.random.seed(3)  # ensures different y
    X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   sigma=1.,
                                                   rho=0,
                                                   signal=signal)
    lam = 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal(
            (n, 2000)))).max(0)) * sigma

    n, p = X.shape

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomizer = randomization.isotropic_gaussian((p, ), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer, 'gaussian',
                               'parametric')
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma**2

    generative_mean = np.zeros(p)
    generative_mean[:nactive] = M_est.initial_soln[active]
    sel_split = selection_probability_random_lasso(M_est, generative_mean)
    min = sel_split.minimize2(nstep=200)
    print(min[0], min[1])

    test_point = np.append(M_est.observed_score_state,
                           np.abs(M_est.initial_soln[M_est._overall]))
    print("value of likelihood",
          sel_split.likelihood_loss.smooth_objective(test_point, mode="func"))

    inv_cov = np.linalg.inv(M_est.score_cov)
    lik = (M_est.observed_score_state -
           generative_mean).T.dot(inv_cov).dot(M_est.observed_score_state -
                                               generative_mean) / 2.
    print("value of likelihood check", lik)
    grad = inv_cov.dot(M_est.observed_score_state - generative_mean)
    print("grad at likelihood loss", grad)
Example no. 12
def test_path_group_lasso():
    '''
    Compare the solution paths of three different parameterizations
    of the same group lasso problem.
    '''
    n = 100
    X = np.random.standard_normal((n, 10))
    U = np.random.standard_normal((n, 2))
    Y = np.random.standard_normal(100)
    betaX = np.array([3, 4, 5, 0, 0] + [0] * 5)
    betaU = np.array([10, -5])
    Y += (np.dot(X, betaX) + np.dot(U, betaU)) * 5

    Xn = rr.normalize(np.hstack([np.ones((100, 1)), X]),
                      inplace=True,
                      center=True,
                      scale=True,
                      intercept_column=0).normalized_array()
    lasso = rr.lasso.squared_error(Xn[:, 1:],
                                   Y,
                                   penalty_structure=[0] * 7 + [1] * 3,
                                   nstep=10)

    sol = lasso.main(inner_tol=1.e-12, verbose=True)
    beta = np.array(sol['beta'].todense())

    sols = []
    sols_sep = []
    for l in sol['lagrange']:
        loss = rr.squared_error(Xn, Y, coef=1. / n)
        penalty = rr.group_lasso([rr.UNPENALIZED] + [0] * 7 + [1] * 3,
                                 l)  # matrix contains an intercept...
        problem = rr.simple_problem(loss, penalty)
        sols.append(problem.solve(tol=1.e-12).copy())

        sep = rr.separable((11, ), [
            rr.l2norm((7, ),
                      np.sqrt(7) * l),
            rr.l2norm((3, ),
                      np.sqrt(3) * l)
        ], [np.arange(1, 8), np.arange(8, 11)])
        sep_problem = rr.simple_problem(loss, sep)
        sols_sep.append(sep_problem.solve(tol=1.e-12).copy())

    sols = np.array(sols).T
    sols_sep = np.array(sols_sep).T

    nt.assert_true(
        np.linalg.norm(beta - sols) / (1 + np.linalg.norm(beta)) <= 1.e-4)
    nt.assert_true(
        np.linalg.norm(beta - sols_sep) / (1 + np.linalg.norm(beta)) <= 1.e-4)
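The matching factors np.sqrt(7) and np.sqrt(3) in the separable reformulation indicate that rr.group_lasso, when no explicit weights are given, weights each group by the square root of its size; at each value of the Lagrange parameter $\lambda$ the penalty being solved is therefore

$$\lambda\left(\sqrt{7}\,\lVert\beta_{G_1}\rVert_2 + \sqrt{3}\,\lVert\beta_{G_2}\rVert_2\right),$$

where $G_1$ and $G_2$ are the penalized blocks of sizes 7 and 3 and the intercept column is left unpenalized.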
Example no. 13
def test_multiple_views():
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=0.5)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_views([M_est1, M_est2])
    mv.solve()

    active = M_est1.overall + M_est2.overall

    if set(nonzero).issubset(np.nonzero(active)[0]):

        active_set = np.nonzero(active)[0]
        inactive_selected = I = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] not in nonzero
        ]

        boot_target, target_observed = pairs_bootstrap_glm(loss, active)
        inactive_target = lambda indices: boot_target(indices)[
            inactive_selected]
        inactive_observed = target_observed[inactive_selected]
        sampler = lambda: np.random.choice(n, size=(n, ), replace=True)

        mv.setup_sampler(sampler)
        target_sampler = mv.setup_target(inactive_target, inactive_observed)
        test_stat = lambda x: np.linalg.norm(x)

        pval = target_sampler.hypothesis_test(test_stat,
                                              inactive_observed,
                                              alternative='greater')
        return pval
Example no. 14
    def __init__(self,
                  loglike,
                  groups,
                  weights,
                  ridge_term,
                  randomizer,
                  perturb=None):
         r"""
         Create a new post-selection object for the LASSO problem

         Parameters
         ----------

         loglike : `regreg.smooth.glm.glm`
             A (negative) log-likelihood as implemented in `regreg`.

         weights : dict
             Weights for the group lasso penalty, one per group,
             as passed to `rr.group_lasso`.

         ridge_term : float
             How big a ridge term to add?

         randomizer : object
             Randomizer -- contains representation of randomization density.

         perturb : np.ndarray
             Random perturbation subtracted as a linear
             term in the objective function.
         """

         self.loglike = loglike
         self.nfeature = p = self.loglike.shape[0]

         self.ridge_term = ridge_term
         self.penalty = rr.group_lasso(groups,
                                       weights=weights,
                                       lagrange=1.)
         self._initial_omega = perturb  # random perturbation

         self.randomizer = randomizer
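A hypothetical instantiation sketch for the constructor above (the enclosing class name is not shown in this snippet, so the final call is left as a comment under a placeholder name; numpy, regreg and the randomization module are imported as in the surrounding tests):

import numpy as np
import regreg.api as rr
from selection.api import randomization

n, p = 100, 10
X = np.random.standard_normal((n, p))
y = X[:, :3].sum(1) + np.random.standard_normal(n)

loglike = rr.glm.gaussian(X, y)
groups = np.array([0, 0, 1, 1, 1, 2, 2, 3, 3, 3])  # four groups of features
weights = dict((g, np.sqrt((groups == g).sum())) for g in np.unique(groups))
randomizer = randomization.isotropic_gaussian((p,), scale=1.)

# group_lasso_query is a placeholder for the class whose __init__ is shown above:
# query = group_lasso_query(loglike, groups, weights, 1. / np.sqrt(n), randomizer)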
Example no. 15
def test_group_lasso_weightedl1_bound():
    n, p = 100, 50

    X = np.random.standard_normal((n, p))
    Y = np.random.standard_normal(n)

    loss = rr.glm.gaussian(X, Y)
    weights = np.ones(p)
    weights[-2:] = np.inf
    weights[:2] = 0
    weight_dict = dict([(i, w) for i, w in enumerate(weights)])
    bound1 = rr.weighted_l1norm(weights, bound=2)
    bound2 = rr.group_lasso(np.arange(p), weights=weight_dict, bound=2)

    problem1 = rr.simple_problem(loss, bound1)
    problem2 = rr.simple_problem(loss, bound2)

    beta1 = problem1.solve(tol=1.e-14, min_its=500)
    beta2 = problem2.solve(tol=1e-14, min_its=500)

    npt.assert_allclose(beta1, beta2)
Example no. 16
def test_group_lasso_equivalent():
    """
    with 0 as lasso weights should be group lasso
    """
    pen1 = sparse_group_lasso([1,1,2,2,2], 
                              np.zeros(5), 
                              weights={1:0.2, 2:0.1},
                              lagrange=0.4)
    pen2 = rr.group_lasso([1,1,2,2,2], {1:0.2, 2:0.1}, lagrange=0.4)

    Z = np.array([3,2,4,6,7])
    np.testing.assert_allclose(pen1.lagrange_prox(Z), pen2.lagrange_prox(Z))

    Z = np.random.standard_normal(5) * 100
    np.testing.assert_allclose(pen1.lagrange_prox(Z), pen2.lagrange_prox(Z))

    dual1 = pen1.conjugate
    dual2 = pen2.conjugate

    np.testing.assert_allclose(Z, pen1.lagrange_prox(Z) + dual1.bound_prox(Z))
    np.testing.assert_allclose(dual1.bound_prox(Z), dual2.bound_prox(Z))
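The last two assertions are Moreau's decomposition in regreg notation: for a penalty $p$ in Lagrange form, whose conjugate $p^*$ is a bound constraint on the dual norm, every $z$ splits as

$$z = \mathrm{prox}_{p}(z) + \mathrm{prox}_{p^*}(z),$$

i.e. pen1.lagrange_prox(Z) + dual1.bound_prox(Z) recovers Z exactly.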
Example no. 17
def test_path_group_lasso():
    '''
    Compare the solution paths of three different parameterizations
    of the same group lasso problem.
    '''
    n = 100
    X = np.random.standard_normal((n,10))
    U = np.random.standard_normal((n,2))
    Y = np.random.standard_normal(100)
    betaX = np.array([3,4,5,0,0] + [0]*5)
    betaU = np.array([10,-5])
    Y += (np.dot(X, betaX) + np.dot(U, betaU)) * 5

    Xn = rr.normalize(np.hstack([np.ones((100,1)),X]), inplace=True, center=True, scale=True, intercept_column=0).normalized_array()
    lasso = rr.lasso.squared_error(Xn[:,1:] ,Y, penalty_structure=[0]*7+[1]*3, nstep=10)

    sol = lasso.main(inner_tol=1.e-12, verbose=True)
    beta = np.array(sol['beta'].todense())

    sols = []
    sols_sep = []
    for l in sol['lagrange']:
        loss = rr.squared_error(Xn, Y, coef=1./n)
        penalty = rr.group_lasso([rr.UNPENALIZED] + [0]*7 + [1]*3, l) # matrix contains an intercept...
        problem = rr.simple_problem(loss, penalty)
        sols.append(problem.solve(tol=1.e-12).copy())

        sep = rr.separable((11,), [rr.l2norm((7,),np.sqrt(7)*l), rr.l2norm((3,),np.sqrt(3)*l)],[np.arange(1,8),np.arange(8,11)])
        sep_problem = rr.simple_problem(loss, sep)
        sols_sep.append(sep_problem.solve(tol=1.e-12).copy())

    sols = np.array(sols).T
    sols_sep = np.array(sols_sep).T

    nt.assert_true(np.linalg.norm(beta - sols) / (1 + np.linalg.norm(beta)) <= 1.e-4)
    nt.assert_true(np.linalg.norm(beta - sols_sep) / (1 + np.linalg.norm(beta)) <= 1.e-4)
Example no. 18
def test_group_lasso_equivalent():
    """
    with 0 as lasso weights should be group lasso
    """
    pen1 = sparse_group_lasso([1, 1, 2, 2, 2],
                              np.zeros(5),
                              weights={
                                  1: 0.2,
                                  2: 0.1
                              },
                              lagrange=0.4)
    pen2 = rr.group_lasso([1, 1, 2, 2, 2], {1: 0.2, 2: 0.1}, lagrange=0.4)

    Z = np.array([3, 2, 4, 6, 7])
    np.testing.assert_allclose(pen1.lagrange_prox(Z), pen2.lagrange_prox(Z))

    Z = np.random.standard_normal(5) * 100
    np.testing.assert_allclose(pen1.lagrange_prox(Z), pen2.lagrange_prox(Z))

    dual1 = pen1.conjugate
    dual2 = pen2.conjugate

    np.testing.assert_allclose(Z, pen1.lagrange_prox(Z) + dual1.bound_prox(Z))
    np.testing.assert_allclose(dual1.bound_prox(Z), dual2.bound_prox(Z))
Example no. 19
    def __init__(self,
                 loglike,
                 feature_weights,
                 candidate,
                 randomizer_scale,
                 active=None,
                 randomizer='gaussian',
                 parametric_cov_estimator=False):
        r"""

        Create a new post-selection object for the stepwise problem

        Parameters
        ----------

        loglike : `regreg.smooth.glm.glm`
            A (negative) log-likelihood as implemented in `regreg`.

        feature_weights : np.ndarray
            Feature weights for L-1 penalty. If a float,
            it is broadcast to all features.

        candidate : np.bool
            Which groups of variables are candidates
            for inclusion in this step.

        randomizer_scale : float
            Scale for IID components of randomization.

        active : np.bool (optional)
            Which groups of variables make up $E$, the
            set of variables we partially minimize over.

        randomizer : str (optional)
            One of ['laplace', 'logistic', 'gaussian']


        """

        self.active = active
        self.candidate = candidate

        self.loglike = loglike
        self.nfeature = p = loglike.shape[0]

        if np.asarray(feature_weights).shape == ():
            feature_weights = np.ones(loglike.shape) * feature_weights
        self.feature_weights = np.asarray(feature_weights)

        self.parametric_cov_estimator = parametric_cov_estimator

        nrandom = candidate.sum()
        if randomizer == 'laplace':
            self.randomizer = randomization.laplace((nrandom, ),
                                                    scale=randomizer_scale)
        elif randomizer == 'gaussian':
            self.randomizer = randomization.isotropic_gaussian(
                (nrandom, ), randomizer_scale)
        elif randomizer == 'logistic':
            self.randomizer = randomization.logistic((nrandom, ),
                                                     scale=randomizer_scale)

        self.penalty = rr.group_lasso(np.arange(p),
                                      weights=dict(
                                          zip(np.arange(p),
                                              self.feature_weights)),
                                      lagrange=1.)
Example no. 20
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal(
                (n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n,
                                                      p=p,
                                                      s=s,
                                                      rho=rho,
                                                      signal=signal,
                                                      sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=rho,
                                                   signal=signal,
                                                   random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)
    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ),
                                                      scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    #W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()
    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = set(nonzero).issubset(np.nonzero(active_union)[0])

    if not check_screen or screened:

        #if nactive==s:
        #    return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(
                p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(
            loss, active_union, inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(
            loss_indep, active_union, inactive=~active_union)
        target_observed = (boot_target_observed1 -
                           boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(
                indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(
            target_observed, parameter=np.zeros(nactive), sample=target_sample)

        #test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        #observed_test_value = test_stat(target_observed)
        #pivots = target_sampler.hypothesis_test(test_stat,
        #                                       observed_test_value,
        #                                       alternative='twosided',
        #                                       parameter = beta[active_union],
        #                                       ndraw=ndraw,
        #                                       burnin=burnin,
        #                                       stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)
        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
Example no. 21
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):

    print(n, p, s, condition_on_CVR, scale1, scale2)
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p, ), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code

    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized Lasso, just looking how many vars it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est.nboot = nboot

    mv = multiple_queries([cv, M_est])
    mv.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                        marginalizing_groups=np.ones(p, bool))

        selected_features = np.zeros(p, bool)
        selected_features[active_set] = True

        unpenalized_mle = restricted_Mest(M_est.loss, selected_features)

        form_covariances = glm_nonparametric_bootstrap(n, n)
        target_info, target_observed = pairs_bootstrap_glm(M_est.loss,
                                                           selected_features,
                                                           inactive=None)

        cov_info = M_est.setup_sampler()
        target_cov, score_cov = form_covariances(target_info,
                                                 cross_terms=[cov_info],
                                                 nsample=M_est.nboot)

        opt_sample = M_est.sampler.sample(ndraw, burnin)

        pvalues = M_est.sampler.coefficient_pvalues(
            unpenalized_mle,
            target_cov,
            score_cov,
            parameter=np.zeros(selected_features.sum()),
            sample=opt_sample)
        intervals = M_est.sampler.confidence_intervals(unpenalized_mle,
                                                       target_cov,
                                                       score_cov,
                                                       sample=opt_sample)

        L, U = intervals.T
        sel_covered = np.zeros(nactive, bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(np.diag(target_cov),
                                              target_observed)
        naive_covered = np.zeros(nactive, bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(np.diag(target_cov), target_observed,
                                    true_vec)

        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >=
                                                    true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
        return sel_covered, sel_length, naive_pvals, naive_covered, naive_length, active_var, BH_decisions, active_var
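The Benjamini-Hochberg step at the end relies on statsmodels; the corresponding import, not shown in this snippet, would presumably be:

from statsmodels.stats.multitest import multipletests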
Example no. 22
    def __init__(self,
                 loglike,
                 feature_weights,
                 ridge_term,
                 randomizer_scale,
                 randomizer='gaussian',
                 covariance_estimator=None):
        r"""

        Create a new post-selection object for the LASSO problem

        Parameters
        ----------

        loglike : `regreg.smooth.glm.glm`
            A (negative) log-likelihood as implemented in `regreg`.

        feature_weights : np.ndarray
            Feature weights for L-1 penalty. If a float,
            it is broadcast to all features.

        ridge_term : float
            How big a ridge term to add?

        randomizer_scale : float
            Scale for IID components of randomization.

        randomizer : str
            One of ['laplace', 'logistic', 'gaussian']

        covariance_estimator : callable (optional)
            If None, use the parametric
            covariance estimate of the selected model.

        Notes
        -----

        If not None, `covariance_estimator` should 
        take arguments (beta, active, inactive)
        and return an estimate of the covariance of
        $(\bar{\beta}_E, \nabla \ell(\bar{\beta}_E)_{-E})$,
        the unpenalized estimator and the inactive
        coordinates of the gradient of the likelihood at
        the unpenalized estimator.

        """

        self.loglike = loglike
        self.nfeature = p = self.loglike.shape[0]

        if np.asarray(feature_weights).shape == ():
            feature_weights = np.ones(loglike.shape) * feature_weights
        self.feature_weights = np.asarray(feature_weights)

        self.covariance_estimator = covariance_estimator

        if randomizer == 'laplace':
            self.randomizer = randomization.laplace((p, ),
                                                    scale=randomizer_scale)
        elif randomizer == 'gaussian':
            self.randomizer = randomization.isotropic_gaussian(
                (p, ), randomizer_scale)
        elif randomizer == 'logistic':
            self.randomizer = randomization.logistic((p, ),
                                                     scale=randomizer_scale)

        self.ridge_term = ridge_term

        self.penalty = rr.group_lasso(np.arange(p),
                                      weights=dict(
                                          zip(np.arange(p),
                                              self.feature_weights)),
                                      lagrange=1.)
Example no. 23
def test_multiple_queries(s=3,
                          n=300,
                          p=20,
                          signal=7,
                          rho=0.1,
                          lam_frac=0.7,
                          nviews=4,
                          intervals='new',
                          ndraw=10000,
                          burnin=2000,
                          solve_args={
                              'min_its': 50,
                              'tol': 1.e-10
                          },
                          check_screen=True):

    randomizer = randomization.laplace((p, ), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    for i in range(nviews):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nviews):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])

    if check_screen and not screen:
        return None

    if True:
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        ## bootstrap
        target_sampler_boot, target_observed = glm_target(loss,
                                                          active_union,
                                                          mv,
                                                          bootstrap=True)

        if intervals == 'old':
            target_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                            burnin=burnin)
            LU_boot = target_sampler_boot.confidence_intervals(
                target_observed, sample=target_sample_boot, level=0.9)
            pivots_boot = target_sampler_boot.coefficient_pvalues(
                target_observed, parameter=true_vec, sample=target_sample_boot)
        else:
            full_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                          burnin=burnin,
                                                          keep_opt=True)
            LU_boot = target_sampler_boot.confidence_intervals_translate(
                target_observed, sample=full_sample_boot, level=0.9)
            pivots_boot = target_sampler_boot.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample_boot)
        ## CLT plugin
        target_sampler, _ = glm_target(loss, active_union, mv, bootstrap=False)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)

            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        covered_boot, ci_length_boot = coverage(LU_boot)
        covered_naive, ci_length_naive = coverage(LU_naive)

        active_var = np.zeros(nactive, bool)
        for j in range(nactive):
            active_var[j] = active_set[j] in nonzero

        return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
                active_var, covered_naive, ci_length_naive
Example no. 24
def test_scaling(
    snr=15,
    s=5,
    n=200,
    p=20,
    rho=0.1,
    burnin=20000,
    ndraw=30000,
    scale=0.9,
    nsim=None,  # needed for decorator
    frac=0.5
):  # 0.9 has roughly same screening probability as 50% data splitting, i.e. around 10%

    randomizer = randomization.laplace((p, ), scale=scale)
    X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()

    if set(nonzero).issubset(np.nonzero(active)[0]):

        pvalues = []
        active_set = np.nonzero(active)[0]
        inactive_selected = I = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] not in nonzero
        ]
        active_selected = A = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] in nonzero
        ]

        if not I:
            return None
        idx = I[0]
        inactive = ~M_est.selection_variable['variables']
        boot_target, target_observed = pairs_bootstrap_glm(loss,
                                                           active,
                                                           inactive=inactive)

        if DEBUG:
            sampler = lambda: np.random.choice(n, size=(n, ), replace=True)
            print(boot_target(sampler())[-3:], 'boot target')

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        # null saturated

        def null_target(indices):
            result = boot_target(indices)
            return result[idx]

        null_observed = np.zeros(1)
        null_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(null_target, null_observed)

        #target_scaling = 5 * np.linalg.svd(target_sampler.target_transform[0][0])[1].max()**2# should have something do with noise scale too

        print(target_sampler.crude_lipschitz(), 'crude')

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(null_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # true saturated

        idx = A[0]

        def active_target(indices):
            result = boot_target(indices)
            return result[idx]

        active_observed = np.zeros(1)
        active_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(active_target, active_observed)
        target_scaling = 5 * np.linalg.svd(
            target_sampler.target_transform[0]
            [0])[1].max()**2  # should have something do with noise scale too

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(active_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # null selected

        idx = I[0]

        def null_target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        null_observed = np.zeros_like(null_target(range(n)))
        null_observed[0] = target_observed[idx]
        null_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(null_target,
                                         null_observed)  #, target_set=[0])
        target_scaling = 5 * np.linalg.svd(
            target_sampler.target_transform[0]
            [0])[1].max()**2  # should have something do with noise scale too

        print(target_sampler.crude_lipschitz(), 'crude')

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(null_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # true selected

        idx = A[0]

        def active_target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        active_observed = np.zeros_like(active_target(range(n)))
        active_observed[0] = target_observed[idx]
        active_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(active_target,
                                         active_observed)  #, target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(active_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # condition on opt variables

        ### NOT WORKING -- need to implement conditioning within M_estimator!!!

        if False:

            # null saturated

            idx = I[0]

            def null_target(indices):
                result = boot_target(indices)
                return result[idx]

            null_observed = np.zeros(1)
            null_observed[0] = target_observed[idx]

            target_sampler = mv.setup_target(null_target, null_observed)

            print(target_sampler.crude_lipschitz(), 'crude')

            test_stat = lambda x: x[0]
            pval = target_sampler.hypothesis_test(
                test_stat,
                test_stat(null_observed),
                burnin=burnin,
                ndraw=ndraw,
                stepsize=.5 /
                target_sampler.crude_lipschitz())  # twosided by default
            pvalues.append(pval)

            # true saturated

            idx = A[0]

            def active_target(indices):
                result = boot_target(indices)
                return result[idx]

            active_observed = np.zeros(1)
            active_observed[0] = target_observed[idx]

            sampler = lambda: np.random.choice(n, size=(n, ), replace=True)

            target_sampler = mv.setup_target(active_target, active_observed)

            test_stat = lambda x: x[0]
            pval = target_sampler.hypothesis_test(
                test_stat,
                test_stat(active_observed),
                burnin=burnin,
                ndraw=ndraw,
                stepsize=.5 /
                target_sampler.crude_lipschitz())  # twosided by default
            pvalues.append(pval)

        # oracle p-value -- draws a new data set (covers both a null and a truly
        # active selected coordinate)

        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        X_E = X[:, active_set]

        active_var = [False, True, False, True]

        if statsmodels_available:
            try:
                model = sm.GLM(y, X_E, family=sm.families.Binomial())
                model_results = model.fit()
                pvalues.extend(
                    [model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
                active_var.extend([False, True])
            except sm.tools.sm_exceptions.PerfectSeparationError:
                pass

        # data-splitting-style p-value -- draws a new data set of smaller size;
        # frac is the fraction of data presumed used in stage 1, so stage 2 gets
        # (1 - frac) * n observations (frac defaults to 0.5)

        Xs, ys, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        Xs = Xs[:int((1 - frac) * n)]
        ys = ys[:int((1 - frac) * n)]
        X_Es = Xs[:, active_set]

        if statsmodels_available:
            try:
                model = sm.GLM(ys, X_Es, family=sm.families.Binomial())
                model_results = model.fit()
                pvalues.extend(
                    [model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
                active_var.extend([False, False])
            except sm.tools.sm_exceptions.PerfectSeparationError:
                pass

        return pvalues, active_var
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={
                       'min_its': 50,
                       'tol': 1.e-10
                   }):
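    # Summary of the code below: fit one randomized group-lasso query and, when the
    # true support is screened, compare selective confidence intervals and pivots
    # (sample-based for intervals='old', translate-based otherwise) with naive
    # intervals on the selected coefficients.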

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p, ), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])  # integer division for Python 3
    #print(groups)
    #groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    #M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    #mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=target_sampler.reference,
                sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(
                target_observed, parameter=true_vec, sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=target_sampler.reference,
                sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(
                target_observed, parameter=true_vec, sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(
                target_observed,
                parameter=np.zeros_like(true_vec),
                sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        L, U = LU.T
        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >=
                                                    true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel,\
               naive_pvals, naive_covered, ci_length_naive, active_var
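
# Usage sketch (not part of the original example): repeat the test_intervals defined
# above and report empirical coverage of the selective intervals.  The helper name
# _summarize_interval_coverage is illustrative.
def _summarize_interval_coverage(nsim=20, **kwargs):
    covers = []
    for _ in range(nsim):
        result = test_intervals(**kwargs)
        if result is not None:
            covers.extend(result[3])  # `covered` indicators, one per active variable
    return np.mean(covers) if covers else None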
Exemplo n.º 26
0
def test_sqrt_lasso(n=500,
                    p=20,
                    s=3,
                    signal=10,
                    K=5,
                    rho=0.,
                    randomizer='gaussian',
                    randomizer_scale=1.,
                    scale1=0.1,
                    scale2=0.2,
                    lam_frac=1.,
                    bootstrap=False,
                    condition_on_CVR=False,
                    marginalize_subgrad=True,
                    ndraw=10000,
                    burnin=2000):
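    # Summary of the code below: pick the square-root-lasso tuning parameter with and
    # without randomization, fit a non-randomized square-root Lasso for reference,
    # then run a randomized group-lasso query (optionally marginalizing the
    # subgradient) and report selective p-values and interval coverage.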

    print(n, p, s)
    if randomizer == 'laplace':
        randomizer = randomization.laplace((p, ), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p, ), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p, ), scale=randomizer_scale)

    X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=rho,
                                                   signal=signal,
                                                   sigma=1)
    lam_nonrandom = choose_lambda(X)
    lam_random = choose_lambda_with_randomization(X, randomizer)
    loss = l2norm_glm(X, y)
    #sqloss = rr.glm.gaussian(X, y)
    epsilon = 1. / n

    # non-randomized sqrt-Lasso, just to see how many variables it selects
    problem = rr.simple_problem(loss, rr.l1norm(p, lagrange=lam_nonrandom))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized sqrt-root Lasso active set", np.where(beta_hat)[0])
    print("non-randomized sqrt-lasso", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam_random
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1. / np.sqrt(n))
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_set = M_est._overall
    nactive = np.sum(active_set)

    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_set)[0]):

        active_set = np.nonzero(active_set)[0]
        true_vec = beta[active_set]

        if marginalize_subgrad:
            M_est.decompose_subgradient(
                conditioning_groups=np.zeros(p, dtype=bool),
                marginalizing_groups=np.ones(p, bool))

        selected_features = np.zeros(p, bool)
        selected_features[active_set] = True

        unpenalized_mle = restricted_Mest(M_est.loss, selected_features)

        form_covariances = glm_nonparametric_bootstrap(n, n)
        boot_target, boot_target_observed = pairs_bootstrap_glm(
            M_est.loss, selected_features, inactive=None)
        target_info = boot_target

        cov_info = M_est.setup_sampler()
        target_cov, score_cov = form_covariances(target_info,
                                                 cross_terms=[cov_info],
                                                 nsample=M_est.nboot)

        opt_sample = M_est.sampler.sample(ndraw, burnin)

        pvalues = M_est.sampler.coefficient_pvalues(
            unpenalized_mle,
            target_cov,
            score_cov,
            parameter=np.zeros(selected_features.sum()),
            sample=opt_sample)
        intervals = M_est.sampler.confidence_intervals(unpenalized_mle,
                                                       target_cov,
                                                       score_cov,
                                                       sample=opt_sample)

        true_vec = beta[M_est.selection_variable['variables']]

        L, U = intervals.T

        covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pvalues, covered, active_var
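# Example call (illustrative MCMC sizes; returns None if nothing is selected or the
# true support is not screened):
# pvalues, covered, active_var = test_sqrt_lasso(ndraw=2000, burnin=500)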
Exemplo n.º 27
0
def test_nonrandomized(s=0,
                       n=200,
                       p=10,
                       signal=7,
                       rho=0,
                       lam_frac=0.8,
                       loss='gaussian',
                       solve_args={
                           'min_its': 20,
                           'tol': 1.e-10
                       }):
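    # Summary of the code below: fit a non-randomized penalized M-estimator and, when
    # the true support is screened, return coefficient pivots and confidence-interval
    # coverage computed from the observed internal state.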
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                       p=p,
                                                       s=s,
                                                       rho=rho,
                                                       signal=signal,
                                                       sigma=1)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal(
                (n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n,
                                          p=p,
                                          s=s,
                                          rho=rho,
                                          signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(
            np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2,
                                                   (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    print("lam", lam)
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    true_vec = beta
    M_est = M_estimator(lam, loss, penalty)
    M_est.solve()
    active = M_est._overall
    nactive = np.sum(active)
    print("nactive", nactive)
    if nactive == 0:
        return None

    #score_mean = M_est.observed_internal_state.copy()
    #score_mean[nactive:] = 0
    M_est.setup_sampler(score_mean=np.zeros(p))
    #M_est.setup_sampler(score_mean=score_mean)
    #M_est.sample(ndraw = 1000, burnin=1000, stepsize=1./p)

    if set(nonzero).issubset(np.nonzero(active)[0]):
        check_screen = True
        #test_stat = lambda x: np.linalg.norm(x)
        #return M_est.hypothesis_test(test_stat, test_stat(M_est.observed_internal_state), stepsize=1./p)

        ci = M_est.confidence_intervals(M_est.observed_internal_state)
        pivots = M_est.coefficient_pvalues(M_est.observed_internal_state)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)

            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered = coverage(ci)[0]
        #print(pivots)
        #print(coverage)
        return pivots, covered
Exemplo n.º 28
0
def randomized_lasso_trial(X,
                           y,
                           beta,
                           sigma,
                           lam,
                           loss='logistic',
                           randomizer='gaussian',
                           estimation='parametric'):
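    # Summary of the code below: fit an approximate randomized lasso, then compare
    # naive (unadjusted) intervals with credible intervals from posterior samples
    # under a diffuse Gaussian prior, reporting coverage, interval length and Bayes
    # risk for both.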

    from selection.api import randomization

    n, p = X.shape
    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)

    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    # use a distinct name so the imported `randomization` module is not shadowed
    randomizer_instance = randomization.isotropic_gaussian((p, ), scale=1.)

    M_est = M_estimator_approx_logistic(loss, epsilon, penalty,
                                        randomizer_instance, randomizer,
                                        estimation)
    M_est.solve_approx()
    active = M_est._overall
    #print("here",glm.shape)
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)
    glm = M_est.observed_score_state[:nactive]

    prior_variance = 100000.

    #generative_mean = np.zeros(p)
    #sel_split = selection_probability_random_lasso(M_est, generative_mean)
    #test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))

    #print("gradient at test point", sel_split.smooth_objective(test_point, mode= "grad"))

    class target_class(object):
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    unadjusted_intervals = (naive_confidence_intervals(
        target, M_est.target_observed)).T

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()
    adjusted_intervals = np.vstack([
        np.percentile(samples, 5, axis=0),
        np.percentile(samples, 95, axis=0)
    ])

    selective_mean = np.mean(samples, axis=0)

    true_val = np.zeros(nactive)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    for l in range(nactive):
        if (adjusted_intervals[0, l] <=
                true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <=
                true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0,
                                                                           l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(glm - true_val, 2.).sum() / nactive

    return np.vstack(
        [sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
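
# Usage sketch (not from the original example; assumes logistic_instance and numpy
# are importable here as in the other examples, and uses illustrative names/values):
def _randomized_lasso_trial_demo(n=200, p=20, s=3):
    X_demo, y_demo, beta_demo, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)
    lam_demo = np.mean(
        np.fabs(np.dot(X_demo.T, np.random.binomial(1, 0.5, (n, 10000)))).max(0))
    return randomized_lasso_trial(X_demo, y_demo, beta_demo, sigma=1., lam=lam_demo)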
def test_intervals(s=3,
                   n=200,
                   p=50,
                   snr=7,
                   rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={
                       'min_its': 50,
                       'tol': 1.e-10
                   }):

    randomizer = randomization.laplace((p, ), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)

    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss, active_union, mv)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        if intervals == 'old':
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)

            LU = target_sampler.confidence_intervals_translate(
                target_observed, sample=full_sample, level=0.9)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(
            target_observed,
            parameter=target_sampler.reference,
            sample=target_sample)

        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=true_vec,
                                                          sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(
            target_observed,
            parameter=np.zeros_like(true_vec),
            sample=target_sample)
        unpenalized_mle = restricted_Mest(
            loss,
            M_est1.selection_variable['variables'],
            solve_args=solve_args)

        L, U = LU.T

        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >=
                                                    true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
Exemplo n.º 30
0
def test_fixedX(ndraw=10000, burnin=2000):  # nsim needed for decorator
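    # Summary of the code below: randomized fixed-X group lasso on a Gaussian
    # instance; when enough true variables are screened, compute saturated- and
    # selected-model p-values for one inactive and one truly active coordinate.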
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=1.)
    X, Y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=0.1,
                                                   snr=7)

    lam_frac = 1.
    lam = lam_frac * np.mean(
        np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0)) * sigma
    W = np.ones(p) * lam
    epsilon = 1. / np.sqrt(n)

    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = fixedX_group_lasso(X, Y, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()

    if set(nonzero).issubset(
            np.nonzero(active)[0]) and active.sum() > len(nonzero):

        pvalues = []
        active_set = np.nonzero(active)[0]
        inactive_selected = I = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] not in nonzero
        ]
        active_selected = A = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] in nonzero
        ]

        if not I:
            return None

        idx = I[0]
        boot_target, target_observed = resid_bootstrap(M_est.loss, active)

        X_active = X[:, active]
        beta_hat = np.linalg.pinv(X_active).dot(Y)
        resid_hat = Y - X_active.dot(beta_hat)
        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        # null saturated

        def null_target(Y_star):
            result = boot_target(Y_star)
            return result[idx]

        null_observed = np.zeros(1)
        null_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(null_target, null_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, null_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # null selected

        def null_target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        null_observed = np.zeros_like(null_target(
            np.random.standard_normal(n)))
        null_observed[0] = target_observed[idx]
        null_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(null_target,
                                         null_observed,
                                         target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, null_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # true saturated

        idx = A[0]

        def active_target(Y_star):
            result = boot_target(Y_star)
            return result[idx]

        active_observed = np.zeros(1)
        active_observed[0] = target_observed[idx]

        sampler = lambda: np.random.choice(n, size=(n, ), replace=True)

        target_sampler = mv.setup_target(active_target, active_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, active_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # true selected

        def active_target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        active_observed = np.zeros_like(
            active_target(np.random.standard_normal(n)))
        active_observed[0] = target_observed[idx]
        active_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(active_target,
                                         active_observed,
                                         target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, active_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        return pvalues, [False, False, True, True]
Exemplo n.º 31
0
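# Note: the fragment below assumes earlier code defined the design X, response Y,
# dimensions n and p, and the mutation labels NRTI_muts (HIV NRTI drug-resistance
# data), and that sm is statsmodels.api.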
ols_fit = sm.OLS(Y, X).fit()
sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1)
OLS_3TC = ols_fit.params

lam_frac = 1.
loss = rr.glm.gaussian(X, Y)
epsilon = 1. / np.sqrt(n)
lam = lam_frac * np.mean(
    np.fabs(np.dot(X.T, np.random.standard_normal(
        (n, 2000)))).max(0)) * sigma_3TC
print(lam)

W = np.ones(p) * lam
penalty = rr.group_lasso(np.arange(p),
                         weights=dict(zip(np.arange(p), W)),
                         lagrange=1.)

# use a distinct name so the imported `randomization` module is not shadowed
randomizer_instance = randomization.isotropic_gaussian((p, ), scale=1.)

M_est = M_estimator_approx(loss,
                           epsilon,
                           penalty,
                           randomizer_instance,
                           randomizer='gaussian')
M_est.solve_approx()
active = M_est._overall
active_set = np.asarray([i for i in range(p) if active[i]])
nactive = np.sum(active)

active_set_0 = [NRTI_muts[i] for i in range(p) if active[i]]
Exemplo n.º 32
0
def test_split_compare(s=3,
                       n=200,
                       p=20,
                       signal=7,
                       rho=0.1,
                       split_frac=0.8,
                       lam_frac=0.7,
                       ndraw=10000,
                       burnin=2000,
                       solve_args={'min_its': 50, 'tol': 1.e-10},
                       check_screen=True):
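    # Summary of the code below: select with a data-split group lasso, then compare
    # coverage and length of selective intervals built from a bootstrap covariance,
    # a plug-in CLT covariance, a standard data-splitting fit on the held-out data,
    # and naive (unadjusted) intervals.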

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p)*lam
    W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)), lagrange=1.)

    m = int(split_frac * n)

    M_est1 = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables'] #+ M_est2.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    leftout_indices = M_est1.randomized_loss.saturated_loss.case_weights == 0

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])

    if check_screen and not screen:
        return None

    if True:
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        ## bootstrap
        target_sampler_boot, target_observed = glm_target(loss,
                                                          active_union,
                                                          mv,
                                                          bootstrap=True)

        target_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                        burnin=burnin)
        LU_boot = target_sampler_boot.confidence_intervals(
            target_observed, sample=target_sample_boot, level=0.9)
        pivots_boot = target_sampler_boot.coefficient_pvalues(
            target_observed, parameter=true_vec, sample=target_sample_boot)

        ## CLT plugin
        target_sampler, _ = glm_target(loss,
                                       active_union,
                                       mv,
                                       bootstrap=False)

        target_sample = target_sampler.sample(ndraw=ndraw,
                                              burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=true_vec,
                                                    sample=target_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        if X.shape[0] - leftout_indices.sum() > nactive:
            LU_split = standard_split_ci(rr.glm.logistic, X, y, active_union, leftout_indices)
        else:
            LU_split = np.ones((nactive, 2)) * np.nan

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)

            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        covered_boot, ci_length_boot = coverage(LU_boot)
        covered_split, ci_length_split = coverage(LU_split)
        covered_naive, ci_length_naive = coverage(LU_naive)

        active_var = np.zeros(nactive, bool)
        for j in range(nactive):
            active_var[j] = active_set[j] in nonzero

        return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
               covered_split, ci_length_split, active_var, covered_naive, ci_length_naive