# Example 1
def test_scaling(
    snr=15,
    s=5,
    n=200,
    p=20,
    rho=0.1,
    burnin=20000,
    ndraw=30000,
    scale=0.9,
    nsim=None,  # needed for decorator
    frac=0.5
):  # 0.9 has roughly same screening probability as 50% data splitting, i.e. around 10%
    """
    Randomized group-lasso selective inference smoke test (logistic model).

    Solves a Laplace-randomized group lasso and, when screening succeeds
    (the true support is contained in the selected set), computes sampler
    p-values for one falsely selected (null) and one truly active
    coordinate under both the saturated and the selected targets, then
    appends oracle and data-splitting-style statsmodels p-values fit on
    freshly generated data.

    Returns
    -------
    (pvalues, active_var) or None
        ``active_var[i]`` is True when ``pvalues[i]`` tests a truly active
        coordinate.  Returns None when no null variable was selected;
        returns nothing (implicit None) when screening fails.
    """

    randomizer = randomization.laplace((p, ), scale=scale)
    X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    # Calibrate lambda to the empirical mean max absolute score under
    # fair-coin responses.
    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()

    if set(nonzero).issubset(np.nonzero(active)[0]):

        pvalues = []
        active_set = np.nonzero(active)[0]
        # Positions within active_set of falsely / truly selected variables.
        inactive_selected = I = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] not in nonzero
        ]
        active_selected = A = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] in nonzero
        ]

        if not I:
            return None
        idx = I[0]
        inactive = ~M_est.selection_variable['variables']
        boot_target, target_observed = pairs_bootstrap_glm(loss,
                                                           active,
                                                           inactive=inactive)

        if DEBUG:
            sampler = lambda: np.random.choice(n, size=(n, ), replace=True)
            print(boot_target(sampler())[-3:], 'boot target')

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        # null saturated

        def null_target(indices):
            result = boot_target(indices)
            return result[idx]

        null_observed = np.zeros(1)
        null_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(null_target, null_observed)

        print(target_sampler.crude_lipschitz(), 'crude')

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(null_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # true saturated

        idx = A[0]

        def active_target(indices):
            result = boot_target(indices)
            return result[idx]

        active_observed = np.zeros(1)
        active_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(active_target, active_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(active_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # null selected

        idx = I[0]

        def null_target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        null_observed = np.zeros_like(null_target(range(n)))
        null_observed[0] = target_observed[idx]
        null_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(null_target,
                                         null_observed)  #, target_set=[0])

        print(target_sampler.crude_lipschitz(), 'crude')

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(null_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # true selected

        idx = A[0]

        def active_target(indices):
            result = boot_target(indices)
            return np.hstack([result[idx], result[nactive:]])

        active_observed = np.zeros_like(active_target(range(n)))
        active_observed[0] = target_observed[idx]
        active_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(active_target,
                                         active_observed)  #, target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat,
            test_stat(active_observed),
            burnin=burnin,
            ndraw=ndraw,
            stepsize=.5 /
            target_sampler.crude_lipschitz())  # twosided by default
        pvalues.append(pval)

        # NOTE(review): a former "condition on opt variables" section lived
        # here but was dead code under `if False:` (conditioning within
        # M_estimator is not implemented) and has been removed.

        # oracle p-value -- draws a new data set

        X, y, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        X_E = X[:, active_set]

        # Flags for the four sampler p-values appended above.
        active_var = [False, True, False, True]

        if statsmodels_available:
            try:
                model = sm.GLM(y, X_E, family=sm.families.Binomial())
                model_results = model.fit()
                pvalues.extend(
                    [model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
                active_var.extend([False, True])
            except sm.tools.sm_exceptions.PerfectSeparationError:
                # Best-effort: skip the oracle p-values on separation.
                pass

        # data splitting-ish p-value -- draws a new data set of smaller size
        # frac is presumed to be how much data was used in stage 1, we get (1-frac)*n for stage 2
        # frac defaults to 0.5

        Xs, ys, beta, _ = generate_data(n=n, p=p, s=s, rho=rho, snr=snr)
        Xs = Xs[:int((1 - frac) * n)]
        ys = ys[:int((1 - frac) * n)]
        X_Es = Xs[:, active_set]

        if statsmodels_available:
            try:
                model = sm.GLM(ys, X_Es, family=sm.families.Binomial())
                model_results = model.fit()
                pvalues.extend(
                    [model_results.pvalues[I[0]], model_results.pvalues[A[0]]])
                active_var.extend([False, False])
            except sm.tools.sm_exceptions.PerfectSeparationError:
                # Best-effort: skip the splitting p-values on separation.
                pass

        return pvalues, active_var
# Example 2
def test_logistic_saturated_active_coordinate():
    """
    Fit a Laplace-randomized group lasso on a logistic instance and, when
    the true support is screened, return a saturated-model p-value for the
    first truly active selected coordinate (with a True flag).
    """
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=14)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    # Empirical calibration of the lagrange parameter from fair-coin draws.
    lam = lam_frac * np.mean(
        np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    weights = dict(zip(np.arange(p), np.ones(p) * lam))
    penalty = rr.group_lasso(np.arange(p), weights=weights, lagrange=1.)

    print(lam)
    # our randomization

    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est1])
    mv.solve()

    active = M_est1.selection_variable['variables']
    nactive = active.sum()

    # Screening failure: fall through with an implicit None.
    if not set(nonzero).issubset(np.nonzero(active)[0]):
        return

    active_set = np.nonzero(active)[0]
    truly_active = [
        pos for pos in range(active_set.shape[0])
        if active_set[pos] in nonzero
    ]

    idx = truly_active[0]
    inactive = ~M_est1.selection_variable['variables']
    boot_target, target_observed = pairs_bootstrap_glm(
        loss, active, inactive=inactive)

    def active_target(indices):
        return boot_target(indices)[idx]

    active_observed = np.zeros(1)
    active_observed[0] = target_observed[idx]

    # Entries beyond the first would only seed the chain (could be 0),
    # so a single coordinate suffices here.

    form_covariances = glm_nonparametric_bootstrap(n, n)

    mv.setup_sampler(form_covariances)
    target_sampler = mv.setup_target(active_target, active_observed)

    def test_stat(x):
        return x[0]

    pval = target_sampler.hypothesis_test(
        test_stat, test_stat(active_observed), burnin=10000,
        ndraw=10000)  # twosided by default
    return pval, True
# Example 3
def test_fixedX(ndraw=10000, burnin=2000, nsim=None):  # nsim needed for decorator
    """
    Randomized group-lasso selective inference smoke test, fixed design.

    Solves a Laplace-randomized group lasso on a Gaussian instance and,
    when screening succeeds with at least one falsely selected variable,
    computes residual-bootstrap sampler p-values for one null and one
    truly active coordinate under the saturated and selected targets.

    Parameters
    ----------
    ndraw, burnin : int
        Sampler chain lengths.
    nsim : None
        Unused; accepted so the test-collection decorator can pass it
        (matching the other tests in this module).

    Returns
    -------
    (pvalues, flags) or None
        ``flags[i]`` is True when ``pvalues[i]`` tests a truly active
        variable.  Returns None when no null variable was selected;
        returns nothing (implicit None) when screening fails.
    """
    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p, ), scale=1.)
    X, Y, beta, nonzero, sigma = gaussian_instance(n=n,
                                                   p=p,
                                                   s=s,
                                                   rho=0.1,
                                                   snr=7)

    lam_frac = 1.
    # Calibrate lambda to the empirical mean max absolute Gaussian score.
    lam = lam_frac * np.mean(
        np.fabs(X.T.dot(np.random.standard_normal((n, 50000)))).max(0)) * sigma
    W = np.ones(p) * lam
    epsilon = 1. / np.sqrt(n)

    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = fixedX_group_lasso(X, Y, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active = M_est.selection_variable['variables']
    nactive = active.sum()

    if set(nonzero).issubset(
            np.nonzero(active)[0]) and active.sum() > len(nonzero):

        pvalues = []
        active_set = np.nonzero(active)[0]
        # Positions within active_set of falsely / truly selected variables.
        inactive_selected = I = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] not in nonzero
        ]
        active_selected = A = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] in nonzero
        ]

        if not I:
            return None

        idx = I[0]
        boot_target, target_observed = resid_bootstrap(M_est.loss, active)

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        # null saturated

        def null_target(Y_star):
            result = boot_target(Y_star)
            return result[idx]

        null_observed = np.zeros(1)
        null_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(null_target, null_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, null_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # null selected

        def null_target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        null_observed = np.zeros_like(null_target(
            np.random.standard_normal(n)))
        null_observed[0] = target_observed[idx]
        null_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(null_target,
                                         null_observed,
                                         target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, null_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # true saturated

        idx = A[0]

        def active_target(Y_star):
            result = boot_target(Y_star)
            return result[idx]

        active_observed = np.zeros(1)
        active_observed[0] = target_observed[idx]

        target_sampler = mv.setup_target(active_target, active_observed)

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, active_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        # true selected

        def active_target(Y_star):
            result = boot_target(Y_star)
            return np.hstack([result[idx], result[nactive:]])

        active_observed = np.zeros_like(
            active_target(np.random.standard_normal(n)))
        active_observed[0] = target_observed[idx]
        active_observed[1:] = target_observed[nactive:]

        target_sampler = mv.setup_target(active_target,
                                         active_observed,
                                         target_set=[0])

        test_stat = lambda x: x[0]
        pval = target_sampler.hypothesis_test(
            test_stat, active_observed, burnin=burnin,
            ndraw=ndraw)  # twosided by default
        pvalues.append(pval)

        return pvalues, [False, False, True, True]
def test_threshold_score(ndraw=10000,
                         burnin=2000,
                         nsim=None):  # nsim needed for decorator
    """
    Smoke test for selective inference after thresholding the GLM score.

    Thresholds the randomized score coordinates of a logistic model over
    a fixed "inactive" set and, when the true support is captured by the
    resulting active set, tests the norm of the falsely selected
    coordinates with a sampler p-value.

    Returns
    -------
    (pval, False) or None
        The tested hypothesis is null, hence the False flag.  Returns
        None when no null variable was selected; returns nothing
        (implicit None) when screening fails.
    """

    s, n, p = 5, 200, 20
    threshold = 0.5

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the
    # equivalent dtype.
    active_bool = np.zeros(p, bool)
    active_bool[range(3)] = 1
    inactive_bool = ~active_bool
    randomizer = randomization.laplace((inactive_bool.sum(), ), scale=0.5)

    # threshold the score

    thresh = glm_threshold_score(loss, threshold, randomizer, active_bool,
                                 inactive_bool)
    mv = multiple_queries([thresh])
    mv.solve()

    boundary = thresh.selection_variable['boundary_set']
    new_active = np.nonzero(np.arange(3, 20)[boundary])[0]
    active_set = np.array(sorted(set(range(3)).union(new_active)))

    if set(nonzero).issubset(active_set):

        full_active = np.zeros(p, bool)
        full_active[active_set] = 1
        nactive = active_set.shape[0]
        # Positions within active_set of falsely selected variables.
        inactive_selected = I = [
            i for i in np.arange(active_set.shape[0])
            if active_set[i] not in nonzero
        ]

        if not I:
            return None

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        boot_target, target_observed = pairs_bootstrap_glm(loss, full_active)
        inactive_target = lambda indices: boot_target(indices)[
            inactive_selected]
        inactive_observed = target_observed[inactive_selected]
        # param_cov = _parametric_cov_glm(loss, active_union)

        target_sampler = mv.setup_target(inactive_target, inactive_observed)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(
            test_stat,
            np.linalg.norm(inactive_observed),
            alternative='twosided',
            ndraw=ndraw,
            burnin=burnin)
        print(pval)
        return pval, False