# Shared imports for these tests; the numpy/regreg/statsmodels paths are standard.
# The remaining helpers (logistic_instance, gaussian_instance, randomization,
# glm_group_lasso, multiple_queries, glm_target, ...) are assumed to come from
# the selective-inference `selection` package; exact module paths vary by
# version, so they are not pinned down here.
import time

import numpy as np
import regreg.api as rr
from statsmodels.stats.multitest import multipletests


def test_multiple_queries_individual_coeff(ndraw=10000, burnin=2000):

    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=5)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    active_set = np.nonzero(active_union)[0]

    pvalues = []
    true_beta = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        for j in range(nactive):

            subset = np.zeros(p, bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_union * ~subset,
                                                         mv,
                                                         subset=subset,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: np.atleast_1d(x)

            pval = target_sampler.hypothesis_test(test_stat,
                                                  np.atleast_1d(target_observed - true_beta[j]),
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        active_var = np.zeros_like(pvalues, bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_parametric_covariance(ndraw=10000, burnin=2000):

    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, signal=12)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est1, M_est2])
    mv.solve()

    target = M_est1.selection_variable['variables'].copy()
    if target[-1] or M_est2.selection_variable['variables'][-1]:
        return None
    if target[-2] or M_est2.selection_variable['variables'][-2]:
        return None

    # we should check they are different sizes
    target[-2:] = 1

    if set(nonzero).issubset(np.nonzero(target)[0]):

        form_covariances = glm_parametric_covariance(loss)
        mv.setup_sampler(form_covariances)

        target_observed = restricted_Mest(loss, target)
        linear_func = np.zeros((2, target_observed.shape[0]))
        linear_func[0, -1] = 1.  # we know this one is null
        linear_func[1, -2] = 1.  # also null

        target_observed = linear_func.dot(target_observed)
        target_sampler = mv.setup_target((target, linear_func),
                                         target_observed,
                                         parametric=True)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(target_observed),
                                              alternative='greater',
                                              ndraw=ndraw,
                                              burnin=burnin)

        return [pval], [False]
def test_logistic_pvals(n=500, p=200, s=3, rho=0.3, signal=15.):

    X, y, beta, true_active = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                signal=signal,
                                                equicorrelated=False)

    X = np.hstack([np.ones((n, 1)), X])

    print(true_active, 'true')

    # shift the active indices by one for the prepended intercept column
    active = np.array(true_active)
    active += 1
    active = [0] + list(active)
    true_active = active

    L = lasso.logistic(X, y, [0] * 1 + [1.2] * p)
    L.fit()

    S = L.summary('onesided')

    print(true_active, L.active)
    if set(true_active).issubset(L.active):
        return S['pval'], [v in true_active for v in S['variable']]
def generate_data(s=5, n=200, p=20, rho=0.1, signal=15):
    return logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal,
                             scale=False, center=False)
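def _smoke_generate_data():
    # Hedged usage sketch (added for illustration; not one of the original
    # tests). logistic_instance is assumed to return (X, y, beta, active)
    # as in test_logistic_pvals above.
    X, y, beta, active = generate_data(s=5, n=200, p=20)
    assert X.shape == (200, 20)
    assert y.shape == (200,)
    assert np.all(beta[np.asarray(active)] != 0)  # s truly active coefficients
    return X, y, beta, active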
def test_reconstruction(s=3,
                        n=200,
                        p=50,
                        snr=7,
                        rho=0.1,
                        split_frac=0.8,
                        lam_frac=0.7,
                        ndraw=100,
                        burnin=200,
                        bootstrap=True,
                        solve_args={'min_its': 50, 'tol': 1.e-10},
                        reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    nactive = np.sum(M_est.selection_variable['variables'])
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        target_sampler, target_observed = glm_target(loss,
                                                     M_est.selection_variable['variables'],
                                                     mv)

        target_sample = target_sampler.sample(ndraw=ndraw,
                                              burnin=burnin,
                                              keep_opt=True)
        reconstruction = target_sampler.reconstruction_map(target_sample)
        logdens = target_sampler.log_randomization_density(target_sample)
        return logdens.shape
def test_approximate_mle(n=100,
                         p=10,
                         s=3,
                         snr=5,
                         rho=0.1,
                         lam_frac=1.,
                         loss='gaussian',
                         randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        mle_active = np.zeros(nactive)
        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]

        print("mle for target", mle_active)
def test_multiple_queries_individual_coeff_small(ndraw=10000,
                                                 burnin=2000,
                                                 bootstrap=True):

    s, n, p = 3, 100, 20

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=20.)

    nonzero = np.where(beta)[0]
    lam_frac = 3.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_vars = M_est.selection_variable['variables']
    nactive = np.sum(active_vars)
    active_set = np.nonzero(active_vars)[0]

    pvalues = []
    true_beta = beta[active_vars]

    print(nonzero, active_set)
    if set(nonzero).issubset(active_set):

        for j in range(nactive):
            print(j)
            subset = np.zeros(p, bool)
            subset[active_set[j]] = True

            target_sampler, target_observed = glm_target(loss,
                                                         active_vars,
                                                         mv,
                                                         subset=subset,
                                                         bootstrap=bootstrap,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: x

            pval = target_sampler.hypothesis_test(test_stat,
                                                  target_observed,
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                      signal=signal, sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal, random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)

    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = set(nonzero).issubset(np.nonzero(active_union)[0])

    if not check_screen or screened:

        # if nactive == s:
        #     return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(loss, active_union,
                                                                  inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(loss_indep, active_union,
                                                                  inactive=~active_union)
        target_observed = (boot_target_observed1 - boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=np.zeros(nactive),
                                                    sample=target_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
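# The interval bookkeeping above (per-coordinate coverage indicator and CI
# length) recurs in several of these tests. A standalone, vectorized version
# might look like this; `coverage_stats` is a hypothetical utility, not part
# of the selection package.
def coverage_stats(LU, truth):
    """Return (covered, length) arrays for intervals LU against `truth`.

    LU is an (nactive, 2) array of [lower, upper] endpoints.
    """
    LU = np.asarray(LU)
    truth = np.asarray(truth)
    L, U = LU[:, 0], LU[:, 1]
    covered = ((L <= truth) & (U >= truth)).astype(float)
    length = U - L
    return covered, length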
# Simulation driver: n, p, s, snr and randomized_lasso_trial are assumed to
# be defined earlier in the originating script.
niter = 3

ad_cov = 0.
unad_cov = 0.
ad_len = 0.
unad_len = 0.
ad_risk = 0.
unad_risk = 0.

for i in range(niter):

    ### GENERATE X, Y BASED ON SEED
    np.random.seed(i + 68)  # ensures different X and y
    # X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, sigma=1., rho=0., snr=snr)
    # lam = 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    X, y, beta, nonzero = logistic_instance(n=n, p=p, s=s, rho=0., snr=snr)
    lam = 1.5 * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    sigma = 1.

    ### RUN LASSO AND TEST
    lasso = randomized_lasso_trial(X, y, beta, sigma, lam)

    if lasso is not None:
        ad_cov += lasso[0, 0]
        unad_cov += lasso[1, 0]
        ad_len += lasso[2, 0]
        unad_len += lasso[3, 0]
        ad_risk += lasso[4, 0]
        unad_risk += lasso[5, 0]
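# Illustrative follow-up one might add after the loop to report the Monte
# Carlo averages accumulated above; it assumes every trial succeeded
# (trials returning None would make these averages conservative).
print("adjusted coverage / length / risk:",
      ad_cov / niter, ad_len / niter, ad_risk / niter)
print("unadjusted coverage / length / risk:",
      unad_cov / niter, unad_len / niter, unad_risk / niter)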
def test_data_carving_logistic(n=700,
                               p=300,
                               s=5,
                               rho=0.05,
                               signal=12.,
                               split_frac=0.8,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               compute_intervals=True,
                               use_full_cov=False,
                               return_only_screening=True):

    X, y, beta, true_active, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal,
                                                   equicorrelated=False)
    mu = X.dot(beta)
    prob = np.exp(mu) / (1 + np.exp(mu))

    X = np.hstack([np.ones((n, 1)), X])

    # shift the active indices to account for the intercept column
    active = np.array(true_active)
    active += 1
    s += 1
    active = [0] + list(active)
    true_active = active

    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]
    n1 = len(stage_one)

    lam_theor = 1.0 * np.ones(p + 1)
    lam_theor[0] = 0.
    DC = data_carving.logistic(X, y,
                               feature_weights=lam_theor,
                               stage_one=stage_one)

    DC.fit()

    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.logistic(X, y,
                                     feature_weights=lam_theor,
                                     stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for data splitting second stage')
        print(DC.active)
        data_split = False

    print(true_active, DC.active)
    if set(true_active).issubset(DC.active):

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split.append(DS.hypothesis_test(var))
            else:
                split.append(np.random.sample())

        Xa = X[:, DC.active]
        active = np.zeros(p + 1, bool)
        active[true_active] = 1
        v = (carve, split, active)
        return v
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])
    # print(groups)
    # groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                            parameter=target_sampler.reference,
                                                            sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                      parameter=target_sampler.reference,
                                                                      sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        L, U = LU.T

        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel, \
            naive_pvals, naive_covered, ci_length_naive, active_var
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):  # NOTE: assumes a module-level `nboot` supplies the default

    print(n, p, s, condition_on_CVR, scale1, scale2)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code
    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized lasso, just to see how many variables it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad:
            M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                         marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(glm_loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        L, U = LU.T
        sel_covered = np.zeros(nactive, bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        naive_covered = np.zeros(nactive, bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]

        return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, \
            naive_length, active_var, BH_decisions, active_var
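# cv.one_SD_rule above picks the most regularized lambda whose CV error is
# within one standard error of the minimizer. A self-contained numpy sketch
# of that rule (illustrative only; not the package's implementation):
def one_sd_rule_sketch(lam_grid, cv_err, cv_se):
    """lam_grid: candidate penalty values; cv_err, cv_se: cross-validation
    error and its standard error at each candidate."""
    lam_grid = np.asarray(lam_grid)
    cv_err = np.asarray(cv_err)
    jmin = np.argmin(cv_err)                  # lambda minimizing CV error
    cutoff = cv_err[jmin] + cv_se[jmin]       # one-SE band above the minimum
    eligible = lam_grid[cv_err <= cutoff]
    return eligible.max()                     # most regularization within one SE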
def test_condition(s=0,
                   n=100,
                   p=200,
                   rho=0.1,
                   signal=10,
                   lam_frac=1.4,
                   ndraw=10000,
                   burnin=2000,
                   loss='logistic',
                   nviews=4,
                   scalings=True):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomizer = randomization.laplace((p,), scale=0.6)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        if nactive == s:
            return None

        if scalings:  # try condition on some scalings
            for i in range(nviews // 2):
                conditioning_groups = np.zeros(p, bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.ones(p, bool)
                marginalizing_groups[:p // 2] = False
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)
                views[i].condition_on_scalings()
        else:
            for i in range(nviews):
                views[i].decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                               marginalizing_groups=np.ones(p, bool))

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries)

        test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        observed_test_value = test_stat(target_observed)

        pivots = target_sampler.hypothesis_test(test_stat,
                                                observed_test_value,
                                                alternative='twosided',
                                                parameter=beta[active_union],
                                                ndraw=ndraw,
                                                burnin=burnin)

        return [pivots], [False]
def test_multiple_queries(ndraw=10000, burnin=2000, bootstrap=False,
                          test='selected zeros'):

    s, n, p = 3, 600, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=4)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        if nactive == s:
            return None

        active_set = np.nonzero(active_union)[0]

        if test == 'selected zeros':
            inactive_selected = np.array([active_union[i] and i not in nonzero
                                          for i in range(p)])
            true_active = (beta != 0)
            reference = np.zeros(inactive_selected.sum())
            target_sampler, target_observed = glm_target(loss,
                                                         true_active,
                                                         mv,
                                                         subset=inactive_selected,
                                                         bootstrap=bootstrap,
                                                         reference=reference)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         active_union,
                                                         mv,
                                                         bootstrap=bootstrap)

        test_stat = lambda x: np.linalg.norm(x)
        observed_test_value = test_stat(target_observed)

        pivot = target_sampler.hypothesis_test(test_stat,
                                               observed_test_value,
                                               alternative='twosided',
                                               ndraw=ndraw,
                                               burnin=burnin)

        # recompute the pivot from a sample that keeps the optimization
        # variables, using the "translate" version of the test
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        pivot = target_sampler.hypothesis_test_translate(full_sample,
                                                         test_stat,
                                                         target_observed,
                                                         alternative='twosided')

        return [pivot], [False]
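# Under the null, valid selective pivots should be close to Uniform(0, 1).
# A hedged harness (illustrative only; not an original test) that pools
# pivots over repeated runs and applies a Kolmogorov-Smirnov check:
def check_pivot_uniformity(nsim=50):
    from scipy.stats import kstest
    pivots = []
    for _ in range(nsim):
        result = test_multiple_queries()
        if result is not None:       # screening can fail, returning None
            pivots.extend(result[0])
    stat, pvalue = kstest(pivots, 'uniform')
    print("KS test against Uniform(0,1): stat=%0.3f, p=%0.3f" % (stat, pvalue))
    return pivots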
def test_split(s=3,
               n=200,
               p=50,
               signal=7,
               rho=0.1,
               split_frac=0.8,
               lam_frac=0.7,
               ndraw=10000,
               burnin=2000,
               bootstrap=True,
               solve_args={'min_its': 50, 'tol': 1.e-10},
               reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)

    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    nactive = np.sum(M_est.selection_variable['variables'])
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        if bootstrap:
            target_sampler, target_observed = glm_target(loss,
                                                         M_est.selection_variable['variables'],
                                                         mv)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         M_est.selection_variable['variables'],
                                                         mv,
                                                         bootstrap=True)

        reference_known = True
        if reference_known:
            reference = beta[M_est.selection_variable['variables']]
        else:
            reference = target_observed

        target_sampler.reference = reference
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample).T
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=beta[M_est.selection_variable['variables']],
                                                          sample=target_sample)

        true_vec = beta[M_est.selection_variable['variables']]
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        L, U = LU

        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):

    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        if not parametric:
            views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))
        else:
            views.append(glm_group_lasso_parametric(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        check_screen = True

        if nactive == s:
            return None

        # BUG: if this scalings code is moved after the decompose_subgradient,
        # code seems to run fine
        if scalings:  # try condition on some scalings
            for i in range(nviews):
                views[i].condition_on_scalings()
        if subgrad:
            for i in range(nviews):
                conditioning_groups = np.zeros(p, dtype=bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.zeros(p, dtype=bool)
                marginalizing_groups[p // 2:] = True
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries,
                                                     bootstrap=False,
                                                     parametric=parametric)
        # reference=beta[active_union])

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        elif intervals == 'new':
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                  parameter=true_vec,
                                                                  sample=full_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
def test_multiple_queries_small(ndraw=10000, burnin=2000, nsim=None):
    # nsim needed for decorator

    s, n, p = 2, 100, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=3)

    nonzero = np.where(beta)[0]
    lam_frac = .6

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_union = M_est.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        if nactive == s:
            return None

        active_set = np.nonzero(active_union)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0])
                                 if active_set[i] not in nonzero]

        if not I:
            return None

        inactive_indicators_mat = np.zeros((len(inactive_selected), nactive))
        j = 0
        for i in range(nactive):
            if active_set[i] not in nonzero:
                inactive_indicators_mat[j, i] = 1
                j += 1

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        boot_target, target_observed = pairs_bootstrap_glm(loss, active_union)
        inactive_target = lambda indices: boot_target(indices)[inactive_selected]
        inactive_observed = target_observed[inactive_selected]
        # param_cov = _parametric_cov_glm(loss, active_union)

        alpha_mat = set_alpha_matrix(loss, active_union)
        # target = target_alpha * alpha + reference vector
        target_alpha = np.dot(inactive_indicators_mat, alpha_mat)

        target_sampler = mv.setup_bootstrapped_target(inactive_target,
                                                      inactive_observed,
                                                      target_alpha)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              np.linalg.norm(inactive_observed),
                                              alternative='twosided',
                                              ndraw=ndraw,
                                              burnin=burnin)

        # testing the global null
        all_selected = np.arange(active_set.shape[0])
        target_gn = lambda indices: boot_target(indices)[:nactive]
        target_observed_gn = target_observed[:nactive]
        target_alpha_gn = alpha_mat

        target_sampler_gn = mv.setup_bootstrapped_target(target_gn,
                                                         target_observed_gn,
                                                         target_alpha_gn,
                                                         reference=beta[active_union])

        test_stat_boot_gn = lambda x: np.linalg.norm(x)
        observed_test_value = np.linalg.norm(target_observed_gn - beta[active_union])
        pval_gn = target_sampler_gn.hypothesis_test(test_stat_boot_gn,
                                                    observed_test_value,
                                                    alternative='twosided',
                                                    ndraw=ndraw,
                                                    burnin=burnin)

        return [pval, pval_gn], [False, False]
# NOTE: this redefines test_intervals from above; the two versions originate
# from different test modules.
def test_intervals(s=3,
                   n=200,
                   p=50,
                   snr=7,
                   rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        if intervals == 'old':
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=true_vec,
                                                          sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        unpenalized_mle = restricted_Mest(loss,
                                          M_est1.selection_variable['variables'],
                                          solve_args=solve_args)

        L, U = LU.T

        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_approximate_ci(n=100,
                        p=10,
                        s=0,
                        snr=5,
                        rho=0.1,
                        lam_frac=1.,
                        loss='gaussian',
                        randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # active_bool = np.zeros(p, bool)
    # active_bool[range(3)] = 1
    # inactive_bool = ~active_bool

    GS = greedy_score_step_approx(loss,
                                  penalty,
                                  np.zeros(p, dtype=bool),
                                  np.ones(p, dtype=bool),
                                  randomization,
                                  randomizer)

    GS.solve_approx()
    active = GS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(GS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        toc = time.time()
        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            # print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

        print("confidence intervals", ci_active)
        tic = time.time()
        print('ci time now', tic - toc)
# NOTE: this redefines test_approximate_ci from above; the two versions
# originate from different test modules (greedy step vs. thresholded score).
def test_approximate_ci(n=200,
                        p=50,
                        s=0,
                        snr=5,
                        threshold=3.,
                        rho=0.1,
                        lam_frac=1.,
                        loss='gaussian',
                        randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    active_bool = np.zeros(p, bool)
    # active_bool[range(3)] = 1
    inactive_bool = ~active_bool

    TS = threshold_score_approx(loss,
                                threshold,
                                randomization,
                                active_bool,
                                inactive_bool,
                                randomizer)

    TS.solve_approx()
    active = TS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(TS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        class target_class(object):
            def __init__(self, target_cov):
                self.target_cov = target_cov
                self.shape = target_cov.shape

        target = target_class(TS.target_cov)
        ci_naive = naive_confidence_intervals(target, TS.target_observed)
        naive_pvals = naive_pvalues(target, TS.target_observed, true_vec)
        naive_covered = np.zeros(nactive)

        toc = time.time()
        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

            # naive ci
            if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                naive_covered[j] += 1

        tic = time.time()
        print('ci time now', tic - toc)

        return covered, ci_length, pivots, naive_covered, naive_pvals
def test_overall_null_two_queries(ndraw=10000, burnin=2000, nsim=None):
    # nsim needed for decorator

    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p,), scale=0.5)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W += np.arange(p) / 200
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est1.solve()
    bootstrap_score1 = M_est1.setup_sampler()

    # second randomization -- a greedy step from LASSO
    active = M_est1.selection_variable['variables']
    inactive = ~active
    inactive_randomizer = randomization.laplace((inactive.sum(),), scale=0.5)

    step = glm_greedy_step(loss, penalty, active, inactive, inactive_randomizer)
    step.solve()
    bootstrap_score2 = step.setup_sampler()

    # we take target to be the union of the two active sets
    active = M_est1.selection_variable['variables'] + step.selection_variable['variables']

    if set(nonzero).issubset(np.nonzero(active)[0]):

        boot_target, target_observed = pairs_bootstrap_glm(loss, active)

        # targets are all true null coefficients that were selected
        sampler = lambda: np.random.choice(n, size=(n,), replace=True)
        target_cov, cov1, cov2 = bootstrap_cov(sampler,
                                               boot_target,
                                               cross_terms=(bootstrap_score1, bootstrap_score2))

        active_set = np.nonzero(active)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0])
                                 if active_set[i] not in nonzero]
        if not I:
            return None

        # is it enough only to bootstrap the inactive ones?
        # seems so...

        A1, b1 = M_est1.linear_decomposition(cov1[I],
                                             target_cov[I][:, I],
                                             target_observed[I])
        A2, b2 = step.linear_decomposition(cov2[I],
                                           target_cov[I][:, I],
                                           target_observed[I])

        target_inv_cov = np.linalg.inv(target_cov[I][:, I])

        initial_state = np.hstack([target_observed[I],
                                   M_est1.observed_opt_state,
                                   step.observed_opt_state])

        ntarget = len(I)
        target_slice = slice(0, ntarget)
        opt_slice1 = slice(ntarget, p + ntarget)
        opt_slice2 = slice(p + ntarget, 2 * p + ntarget)

        def target_gradient(state):
            # with many samplers, we will add up the `target_slice` component
            # of many target_grads and only once do the Gaussian addition of
            # full_grad
            target = state[target_slice]
            opt_state1 = state[opt_slice1]
            opt_state2 = state[opt_slice2]

            target_grad1 = M_est1.randomization_gradient(target, (A1, b1), opt_state1)
            target_grad2 = step.randomization_gradient(target, (A2, b2), opt_state2)

            full_grad = np.zeros_like(state)
            full_grad[opt_slice1] = -target_grad1[1]
            full_grad[opt_slice2] = -target_grad2[1]
            full_grad[target_slice] -= target_grad1[0] + target_grad2[0]
            full_grad[target_slice] -= target_inv_cov.dot(target)

            return full_grad

        def target_projection(state):
            opt_state1 = state[opt_slice1]
            state[opt_slice1] = M_est1.projection(opt_state1)
            opt_state2 = state[opt_slice2]
            state[opt_slice2] = step.projection(opt_state2)
            return state

        target_langevin = projected_langevin(initial_state,
                                             target_gradient,
                                             target_projection,
                                             .5 / (2 * p + 1))

        samples = []
        for i in range(ndraw + burnin):
            next(target_langevin)
            if i >= burnin:
                samples.append(target_langevin.state[target_slice].copy())

        test_stat = lambda x: np.linalg.norm(x)
        observed = test_stat(target_observed[I])
        sample_test_stat = np.array([test_stat(x) for x in samples])

        family = discrete_family(sample_test_stat, np.ones_like(sample_test_stat))
        pval = family.ccdf(0, observed)
        return pval, False
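# The sampler above relies on projected Langevin dynamics: each step moves
# along the log-density gradient, adds Gaussian noise, and projects the state
# back onto the feasible set. A self-contained sketch of just the update rule
# (illustrative; the package's projected_langevin class adds more machinery):
def projected_langevin_sketch(state, grad_fn, proj_fn, stepsize, nstep=1000):
    rng = np.random.default_rng()
    for _ in range(nstep):
        noise = rng.standard_normal(state.shape)
        # gradient step + injected noise, then projection
        state = proj_fn(state + stepsize * grad_fn(state)
                        + np.sqrt(2 * stepsize) * noise)
        yield state.copy()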
def test_threshold_score(ndraw=10000, burnin=2000, nsim=None):
    # nsim needed for decorator

    s, n, p = 5, 200, 20
    threshold = 0.5

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)

    active_bool = np.zeros(p, bool)
    active_bool[range(3)] = 1
    inactive_bool = ~active_bool

    randomizer = randomization.laplace((inactive_bool.sum(),), scale=0.5)

    # threshold the score
    thresh = glm_threshold_score(loss,
                                 threshold,
                                 randomizer,
                                 active_bool,
                                 inactive_bool)

    mv = multiple_queries([thresh])
    mv.solve()

    boundary = thresh.selection_variable['boundary_set']
    new_active = np.nonzero(np.arange(3, 20)[boundary])[0]
    active_set = np.array(sorted(set(range(3)).union(new_active)))

    if set(nonzero).issubset(active_set):

        full_active = np.zeros(p, bool)
        full_active[active_set] = 1
        nactive = active_set.shape[0]

        inactive_selected = I = [i for i in np.arange(active_set.shape[0])
                                 if active_set[i] not in nonzero]
        if not I:
            return None

        inactive_indicators_mat = np.zeros((len(inactive_selected), nactive))
        j = 0
        for i in range(nactive):
            if active_set[i] not in nonzero:
                inactive_indicators_mat[j, i] = 1
                j += 1

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        boot_target, target_observed = pairs_bootstrap_glm(loss, full_active)
        inactive_target = lambda indices: boot_target(indices)[inactive_selected]
        inactive_observed = target_observed[inactive_selected]
        # param_cov = _parametric_cov_glm(loss, active_union)

        target_sampler = mv.setup_target(inactive_target, inactive_observed)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              np.linalg.norm(inactive_observed),
                                              alternative='twosided',
                                              ndraw=ndraw,
                                              burnin=burnin)

        print(pval)
        return pval, False
def test_nonrandomized(s=0,
                       n=200,
                       p=10,
                       signal=7,
                       rho=0,
                       lam_frac=0.8,
                       loss='gaussian',
                       solve_args={'min_its': 20, 'tol': 1.e-10}):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    print("lam", lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    true_vec = beta

    M_est = M_estimator(lam, loss, penalty)
    M_est.solve()

    active = M_est._overall
    nactive = np.sum(active)
    print("nactive", nactive)
    if nactive == 0:
        return None

    # score_mean = M_est.observed_internal_state.copy()
    # score_mean[nactive:] = 0
    M_est.setup_sampler(score_mean=np.zeros(p))
    # M_est.setup_sampler(score_mean=score_mean)
    # M_est.sample(ndraw=1000, burnin=1000, stepsize=1./p)

    if set(nonzero).issubset(np.nonzero(active)[0]):
        check_screen = True

        # test_stat = lambda x: np.linalg.norm(x)
        # return M_est.hypothesis_test(test_stat, test_stat(M_est.observed_internal_state), stepsize=1./p)

        ci = M_est.confidence_intervals(M_est.observed_internal_state)
        pivots = M_est.coefficient_pvalues(M_est.observed_internal_state)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered = coverage(ci)[0]
        # print(pivots)
        # print(coverage)
        return pivots, covered
def test_split_compare(ndraw=20000,
                       burnin=10000,
                       solve_args={'min_its': 50, 'tol': 1.e-10},
                       check_screen=True):

    # s, n, p = 0, 200, 10
    s, n, p = 6, 300, 40

    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=5)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    m = int(0.8 * n)

    # first randomization
    M_est1 = split_glm_group_lasso(loss, epsilon, m, penalty)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']  # + M_est2.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    leftout_indices = M_est1.randomized_loss.saturated_loss.case_weights == 0

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])

    if check_screen and not screen:
        return None

    if True:
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        # bootstrap
        target_sampler_boot, target_observed = glm_target(loss,
                                                          active_union,
                                                          mv,
                                                          bootstrap=True)
        target_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin)
        LU_boot = target_sampler_boot.confidence_intervals(target_observed,
                                                           sample=target_sample_boot)
        pivots_boot = target_sampler_boot.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample_boot)

        # CLT plugin
        target_sampler, _ = glm_target(loss, active_union, mv)
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        if X.shape[0] - leftout_indices.sum() > nactive:
            LU_split = standard_ci(X, y, active_union, leftout_indices)
            LU_split_sm = standard_ci_sm(X, y, active_union, leftout_indices)
        else:
            LU_split = LU_split_sm = np.ones((nactive, 2)) * np.nan

        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=true_vec,
                                                    sample=target_sample)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        covered_boot, ci_length_boot = coverage(LU_boot)
        covered_split, ci_length_split = coverage(LU_split)
        covered_naive, ci_length_naive = coverage(LU_naive)

        active_var = np.zeros(nactive, bool)
        for j in range(nactive):
            active_var[j] = active_set[j] in nonzero

        return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
            covered_split, ci_length_split, active_var, covered_naive, ci_length_naive