Exemplo n.º 1
0
def test_selected_targets(n=2000,
                          p=200,
                          signal_fac=1.,
                          s=5,
                          sigma=3,
                          rho=0.4,
                          randomizer_scale=1,
                          full_dispersion=True):
    """
    Run selective-MLE inference after a randomized LASSO fit.

    Instances are drawn from ``gaussian_instance`` until the randomized
    LASSO (``lasso.gaussian``) selects at least one variable; inference is
    then carried out for the "selected" targets.

    Returns
    -------
    tuple
        ``(null_pvalues, alternative_pvalues, coverage, intervals)`` where
        the p-values are split by whether the true coefficient of the
        selected variable is zero, and ``coverage`` flags the intervals
        that strictly contain the projected target.
    """

    signal = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        X, Y, beta = gaussian_instance(n=n,
                                       p=p,
                                       signal=signal,
                                       s=s,
                                       equicorrelated=False,
                                       rho=rho,
                                       sigma=sigma,
                                       random_signs=True)[:3]

        # Report the signal-to-noise ratio under an AR(1)-style covariance
        # rho^|i-j| built from the requested dimension.
        positions = np.arange(p)
        lag = np.abs(np.subtract.outer(positions, positions))
        sigmaX = rho**lag
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma**2.) * n))

        # From here on n and p follow the design that was actually drawn.
        n, p = X.shape

        sigma_ = np.std(Y)
        W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = lasso.gaussian(X,
                              Y,
                              W,
                              randomizer_scale=randomizer_scale * sigma_)

        nonzero = conv.fit() != 0

        # Nothing selected: draw a fresh instance and try again.
        if not nonzero.sum() > 0:
            continue

        if full_dispersion:
            # Residual variance of the full least-squares fit.
            full_fit = X.dot(np.linalg.pinv(X).dot(Y))
            dispersion = np.linalg.norm(Y - full_fit)**2 / (n - p)
        else:
            dispersion = None

        targets = selected_targets(conv.loglike,
                                   conv._W,
                                   nonzero,
                                   dispersion=dispersion)
        observed_target, cov_target, cov_target_score, alternatives = targets

        mle_output = conv.selective_MLE(observed_target, cov_target,
                                        cov_target_score, alternatives)
        estimate, _, _, pval, intervals, _ = mle_output

        # Population target: projection of the true mean onto the
        # selected columns.
        beta_target = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))

        lower, upper = intervals[:, 0], intervals[:, 1]
        coverage = (beta_target > lower) * (beta_target < upper)

        selected_truth = beta[nonzero]
        return (pval[selected_truth == 0], pval[selected_truth != 0],
                coverage, intervals)
Exemplo n.º 2
0
def compare_methods(n=500,
                    p=100,
                    nval=500,
                    rho=0.35,
                    s=5,
                    beta_type=1,
                    snr=0.20,
                    target="selected",
                    randomizer_scale=np.sqrt(0.50),
                    full_dispersion=True,
                    tuning_rand="lambda.theory"):
    """
    Compare selective-MLE inference with ``inference_new`` after a
    randomized LASSO fit on one data set obtained from ``sim_xy``.

    Parameters
    ----------
    n, p, nval, rho, s, beta_type, snr
        Forwarded unchanged to ``sim_xy``.
    target : {"selected", "full"}
        Which targets to form after selection.
    randomizer_scale : float
        Multiplied by ``sqrt(n) * sigma_`` to give the randomizer scale.
    full_dispersion : bool
        If true, estimate the dispersion from the full least-squares
        residuals; otherwise pass ``dispersion=None`` and use ``np.std(y)``
        as the noise scale.
    tuning_rand : str
        Accepted for interface compatibility; not used in this function
        (the theoretical lambda is always used).

    Returns
    -------
    numpy.ndarray
        Column vector of shape (23, 1): 11 summaries for the MLE method,
        11 for the ``inference_new`` method, then a 0/1 flag that is 1
        when nothing was selected.

    Raises
    ------
    ValueError
        If something was selected and ``target`` is not "selected" or
        "full".
    """

    X, y, _, _, _, beta, sigma = sim_xy(n=n,
                                        p=p,
                                        nval=nval,
                                        rho=rho,
                                        s=s,
                                        beta_type=beta_type,
                                        snr=snr)
    print("snr", snr)
    # Centre and scale the design columns, centre the response.
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # Monte Carlo estimate (2000 draws) of E max_j |X_j' eps|, scaled by
    # the noise level.
    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=lam_theory * np.ones(p),
                                      randomizer_scale=np.sqrt(n) *
                                      randomizer_scale * sigma_)

    signs = randomized_lasso.fit()
    nonzero = signs != 0
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    # True where a selected variable is a true signal.  ``np.bool`` was
    # removed from NumPy, so rely on the boolean output of ``np.isin``.
    active_rand_bool = np.isin(active_set_rand, true_set)
    nreport = 0.
    if nonzero.sum() > 0:
        if target == "full":
            target_randomized = beta[nonzero]
            (observed_target, cov_target, cov_target_score,
             alternatives) = full_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)
        elif target == "selected":
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W,
                                              nonzero,
                                              dispersion=dispersion)
        else:
            raise ValueError('not a valid specification of target')

        n_signals = float((beta != 0).sum())

        # --- selective MLE ---
        start = time.time()
        MLE_estimate, _, _, MLE_pval, MLE_intervals, _ = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        time_MLE = time.time() - start

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        # A true signal counts as detected when its interval excludes 0.
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / n_signals
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / n_signals
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        # --- inference_new ---
        start = time.time()
        intervals_uni, pvalue_uni = randomized_lasso.inference_new(
            observed_target, cov_target, cov_target_score, alternatives)
        time_uni = time.time() - start

        # Transposed so that columns 0 and 1 hold the interval endpoints.
        intervals_uni = intervals_uni.T
        cov_uni, selective_uni_power = coverage(intervals_uni, pvalue_uni,
                                                target_randomized,
                                                beta[nonzero])
        length_uni = np.mean(intervals_uni[:, 1] - intervals_uni[:, 0])
        power_uni = ((active_rand_bool) * (np.logical_or(
            (0. < intervals_uni[:, 0]),
            (0. > intervals_uni[:, 1])))).sum() / n_signals
        uni_discoveries = BHfilter(pvalue_uni, q=0.1)
        power_uni_BH = (uni_discoveries * active_rand_bool).sum() / n_signals
        fdr_uni_BH = (uni_discoveries * ~active_rand_bool).sum() / float(
            max(uni_discoveries.sum(), 1.))
        bias_randLASSO = np.mean(randomized_lasso.initial_soln[nonzero] -
                                 target_randomized)

    else:
        # Nothing selected: report zeros and flag the replicate.
        nreport += 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power, time_MLE = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        cov_uni, length_uni, power_uni, power_uni_BH, fdr_uni_BH, bias_randLASSO, selective_uni_power, time_uni = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        uni_discoveries = np.zeros(1)

    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         time_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))

    uni_inf = np.vstack(
        (cov_uni, length_uni, 0., nonzero.sum(), bias_randLASSO,
         selective_uni_power, time_uni, power_uni, power_uni_BH, fdr_uni_BH,
         uni_discoveries.sum()))

    return np.vstack((MLE_inf, uni_inf, nreport))
def risk_comparison(n=500,
                    p=100,
                    nval=500,
                    rho=0.35,
                    s=5,
                    beta_type=1,
                    snr=0.20,
                    randomizer_scale=np.sqrt(0.50),
                    full_dispersion=False,
                    tuning_nonrand="lambda.min",
                    tuning_rand="lambda.1se",
                    ndraw=50):
    """
    Average the relative risk of six estimators over ``ndraw`` data sets
    obtained from ``sim_xy``.

    The six rows of the result are, in order: selective MLE, the
    independent (unbiased) estimator, the randomized LASSO initial
    solution, ``randomized_lasso._beta_full``, post-LASSO OLS, and the
    glmnet LASSO solution.

    Parameters
    ----------
    n, p, nval, rho, s, beta_type, snr
        Forwarded unchanged to ``sim_xy``.
    randomizer_scale : float
        Multiplied by ``sqrt(n) * sigma_`` to give the randomizer scale.
    full_dispersion : bool
        If true, the noise level comes from the full least-squares
        residuals; otherwise it comes from a least-squares refit on the
        ``lambda.min`` active set (falling back to ``np.std(y)`` when that
        set is empty) and ``dispersion=None`` is passed on.
    tuning_nonrand, tuning_rand : {"lambda.min", "lambda.1se", other}
        Tuning choice for the non-randomized and the randomized LASSO; any
        other value selects the theoretical lambda.
    ndraw : int
        Number of simulated data sets.

    Returns
    -------
    numpy.ndarray
        Array of shape (6, 1) with the risks averaged over ``ndraw``.
    """

    risks = np.zeros((6, 1))
    for i in range(ndraw):
        X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                                p=p,
                                                nval=nval,
                                                rho=rho,
                                                s=s,
                                                beta_type=beta_type,
                                                snr=snr)
        print("snr", snr)
        # Centre and scale the design columns, centre the response.
        X -= X.mean(0)[None, :]
        X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
        y = y - y.mean()

        if full_dispersion:
            residual_ss = np.linalg.norm(
                y - X.dot(np.linalg.pinv(X).dot(y)))**2
            print("shapes", y.shape, residual_ss.shape)
            dispersion = residual_ss / (n - p)
            sigma_ = np.sqrt(dispersion)
            # Previously ``_sigma_`` was left undefined on this path, so
            # the next statement raised NameError.  Use the dispersion
            # based estimate for the theoretical lambda as well.
            _sigma_ = sigma_
        else:
            dispersion = None
            _sigma_ = np.std(y)

        # Monte Carlo estimate (2000 draws) of E max_j |X_j' eps|, scaled
        # by the noise level.
        lam_theory = _sigma_ * 1. * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
        glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
            X, y, lam_theory / float(n))

        if not full_dispersion:
            # Refit least squares on the lambda.min active set to get the
            # noise level used for the randomizer scale.
            active_min = (glm_LASSO_min != 0)
            if active_min.sum() > 0:
                sigma_ = np.sqrt(
                    np.linalg.norm(y - X[:, active_min].dot(
                        np.linalg.pinv(X[:, active_min]).dot(y)))**2 /
                    (n - active_min.sum()))
            else:
                sigma_ = _sigma_

        print("true and estimated sigma", sigma, _sigma_, sigma_)

        if tuning_nonrand == "lambda.min":
            glm_LASSO = glm_LASSO_min
        elif tuning_nonrand == "lambda.1se":
            glm_LASSO = glm_LASSO_1se
        else:
            glm_LASSO = glm_LASSO_theory
        active_LASSO = (glm_LASSO != 0)
        rel_LASSO = np.zeros(p)
        if active_LASSO.sum() > 0:
            # Post-LASSO OLS on the selected columns.
            rel_LASSO[active_LASSO] = np.linalg.pinv(
                X[:, active_LASSO]).dot(y)

        # glmnet's lambdas are rescaled by n; the theoretical lambda is
        # used as is.
        if tuning_rand == "lambda.min":
            feature_weights = n * lam_min * np.ones(p)
        elif tuning_rand == "lambda.1se":
            feature_weights = n * lam_1se * np.ones(p)
        else:
            feature_weights = lam_theory * np.ones(p)
        randomized_lasso = lasso.gaussian(
            X,
            y,
            feature_weights=feature_weights,
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        signs = randomized_lasso.fit()
        nonzero = signs != 0
        sel_MLE = np.zeros(p)
        ind_est = np.zeros(p)
        randomized_lasso_est = np.zeros(p)
        randomized_rel_lasso_est = np.zeros(p)

        if nonzero.sum() > 0:
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W,
                                              nonzero,
                                              dispersion=dispersion)

            MLE_estimate, _, _, _, _, ind_unbiased_estimator = randomized_lasso.selective_MLE(
                observed_target, cov_target, cov_target_score, alternatives)
            sel_MLE[nonzero] = MLE_estimate
            ind_est[nonzero] = ind_unbiased_estimator
            randomized_lasso_est = randomized_lasso.initial_soln
            randomized_rel_lasso_est = randomized_lasso._beta_full

        risks += np.vstack(
            (relative_risk(sel_MLE, beta, Sigma),
             relative_risk(ind_est, beta, Sigma),
             relative_risk(randomized_lasso_est, beta, Sigma),
             relative_risk(randomized_rel_lasso_est, beta, Sigma),
             relative_risk(rel_LASSO, beta, Sigma),
             relative_risk(glm_LASSO, beta, Sigma)))
        print("risks so far", risks / (i + 1))

    return risks / ndraw
Exemplo n.º 4
0
def test_randomized_slope(n=500,
                          p=100,
                          signal_fac=1.3,
                          s=5,
                          sigma=3.,
                          rho=0.35,
                          randomizer_scale=np.sqrt(1.),
                          target="selected",
                          use_MLE=True):
    """
    Run selective inference after a randomized SLOPE fit.

    Instances are drawn from ``gaussian_instance`` until randomized SLOPE
    (``slope.gaussian``, with the weight sequence returned by ``slope_R``)
    selects at least one variable.  The response is rescaled by the
    estimated noise level so the fit runs with ``sigma=1``.

    Parameters
    ----------
    n, p, s, sigma, rho
        Forwarded to ``gaussian_instance``.
    signal_fac : float
        The signal strength is ``sqrt(signal_fac * 2 * log(p))``.
    randomizer_scale : float
        Randomizer scale passed to ``slope.gaussian``.
    target : {"selected", "full"}
        Which targets to form after selection.
    use_MLE : bool
        If true use ``selective_MLE``; otherwise use ``summary`` with
        ``compute_intervals=True``.

    Returns
    -------
    tuple
        ``(pval[beta_target == 0], pval[beta_target != 0], coverage,
        intervals)``.

    Raises
    ------
    ValueError
        If ``target`` is not "selected" or "full".  Previously an unknown
        target failed later with an unbound-variable error.
    """

    if target not in ("full", "selected"):
        raise ValueError('not a valid specification of target')

    signal = np.sqrt(signal_fac * 2. * np.log(p))

    while True:
        X, Y, beta = gaussian_instance(n=n,
                                       p=p,
                                       signal=signal,
                                       s=s,
                                       equicorrelated=False,
                                       rho=rho,
                                       sigma=sigma,
                                       random_signs=True)[:3]

        # Noise level from the full least-squares residuals.
        sigma_ = np.sqrt(
            np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y)))**2 / (n - p))

        # Work on the standardized response so that sigma = 1 below.
        Y /= sigma_
        r_beta, r_E, r_lambda_seq, r_sigma = slope_R(
            X,
            Y,
            W=None,
            normalize=True,
            choice_weights="gaussian",
            sigma=1.)

        conv = slope.gaussian(X,
                              Y,
                              r_sigma * r_lambda_seq,
                              sigma=1.,
                              randomizer_scale=randomizer_scale * 1.)

        signs = conv.fit()
        nonzero = signs != 0
        print("dimensions", n, p, nonzero.sum())

        # Nothing selected: draw a fresh instance and try again.
        if not nonzero.sum() > 0:
            continue

        if target == 'full':
            (observed_target, cov_target, cov_target_score,
             alternatives) = full_targets(conv.loglike,
                                          conv._W,
                                          nonzero,
                                          dispersion=1.)
            beta_target = beta[nonzero] / sigma_
        else:
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(conv.loglike,
                                              conv._W,
                                              nonzero,
                                              dispersion=1.)
            # Projection of the true mean onto the selected columns, on
            # the standardized scale.
            beta_target = np.linalg.pinv(X[:, nonzero]).dot(
                X.dot(beta)) / sigma_

        if use_MLE:
            estimate, _, _, pval, intervals, _ = conv.selective_MLE(
                observed_target, cov_target, cov_target_score,
                alternatives)
        else:
            _, pval, intervals = conv.summary(observed_target,
                                              cov_target,
                                              cov_target_score,
                                              alternatives,
                                              compute_intervals=True)

        coverage = (beta_target > intervals[:, 0]) * (beta_target <
                                                      intervals[:, 1])
        return pval[beta_target == 0], pval[
            beta_target != 0], coverage, intervals
Exemplo n.º 5
0
def comparison_cvmetrics_selected(n=500,
                                  p=100,
                                  nval=500,
                                  rho=0.35,
                                  s=5,
                                  beta_type=1,
                                  snr=0.20,
                                  randomizer_scale=np.sqrt(0.50),
                                  full_dispersion=True,
                                  tuning_nonrand="lambda.min",
                                  tuning_rand="lambda.1se"):
    """
    Compare naive, Lee et al. (``selInf_R``) and selective-MLE inference
    for the "selected" targets on one data set obtained from ``sim_xy``.

    Parameters
    ----------
    n, p, nval, rho, s, beta_type, snr
        Forwarded unchanged to ``sim_xy``.
    randomizer_scale : float
        Multiplied by ``sqrt(n) * sigma_`` to give the randomizer scale.
    full_dispersion : bool
        If true, estimate the dispersion from the full least-squares
        residuals; otherwise pass ``dispersion=None`` and use ``np.std(y)``
        as the noise scale.
    tuning_nonrand, tuning_rand : {"lambda.min", "lambda.1se", other}
        Tuning choice for the non-randomized and the randomized LASSO; any
        other value selects the theoretical lambda.

    Returns
    -------
    numpy.ndarray
        Vertical stack of: six relative risks, ten naive summaries, ten
        Lee summaries, ten zeros (unused "Liu" slot), ten MLE summaries,
        six partial risks and three not-reported flags.
    """

    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n,
                                            p=p,
                                            nval=nval,
                                            rho=rho,
                                            s=s,
                                            beta_type=beta_type,
                                            snr=snr)
    print("snr", snr)
    # Centre and scale the design columns, centre the response.
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])
    n_signals = float((beta != 0).sum())

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y)))**2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # Monte Carlo estimate (2000 draws) of E max_j |X_j' eps|, scaled by
    # the noise level.
    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))
    if tuning_nonrand == "lambda.min":
        lam_LASSO = lam_min
        glm_LASSO = glm_LASSO_min
    elif tuning_nonrand == "lambda.1se":
        lam_LASSO = lam_1se
        glm_LASSO = glm_LASSO_1se
    else:
        lam_LASSO = lam_theory / float(n)
        glm_LASSO = glm_LASSO_theory
    active_LASSO = (glm_LASSO != 0)
    nactive_LASSO = active_LASSO.sum()
    active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]])
    # True where a selected variable is a true signal.  ``np.bool`` was
    # removed from NumPy, so rely on the boolean output of ``np.isin``.
    active_LASSO_bool = np.isin(active_set_LASSO, true_set)

    rel_LASSO = np.zeros(p)

    # Defaults reported when the non-randomized LASSO selects nothing or
    # selInf_R returns a p-value vector of unexpected length.
    Lee_nreport = 1
    bias_Lee = 0.
    bias_naive = 0.
    cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
        0., 0., 0., 0., 0., 0., 0.
    ]
    cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
        0., 0., 0., 0., 0., 0.
    ]
    naive_discoveries = np.zeros(1)
    Lee_discoveries = np.zeros(1)
    partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    if nactive_LASSO > 0:
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        rel_LASSO[active_LASSO] = post_LASSO_OLS
        # Projection of the true mean onto the selected columns.
        Lee_target = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta))
        Lee_intervals, Lee_pval = selInf_R(X,
                                           y,
                                           glm_LASSO,
                                           n * lam_LASSO,
                                           sigma_,
                                           Type=0,
                                           alpha=0.1)

        if (Lee_pval.shape[0] == Lee_target.shape[0]):
            Lee_nreport = 0

            cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval,
                                                    Lee_target,
                                                    beta[active_LASSO])
            # Infinite-length intervals are excluded from the mean length
            # and reported separately as a fraction.
            inf_entries_bool = np.isinf(Lee_intervals[:, 1] -
                                        Lee_intervals[:, 0])
            inf_entries = np.mean(inf_entries_bool)
            if inf_entries == 1.:
                length_Lee = 0.
            else:
                length_Lee = np.mean((Lee_intervals[:, 1] -
                                      Lee_intervals[:, 0])[~inf_entries_bool])
            power_Lee = ((active_LASSO_bool) * (np.logical_or(
                (0. < Lee_intervals[:, 0]),
                (0. > Lee_intervals[:, 1])))).sum() / n_signals
            Lee_discoveries = BHfilter(Lee_pval, q=0.1)
            power_Lee_BH = (Lee_discoveries *
                            active_LASSO_bool).sum() / n_signals
            fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(
                max(Lee_discoveries.sum(), 1.))
            bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target)

            naive_sd = sigma_ * np.sqrt(
                np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(
                    X[:, active_LASSO])))))
            naive_intervals = np.vstack([
                post_LASSO_OLS - 1.65 * naive_sd,
                post_LASSO_OLS + 1.65 * naive_sd
            ]).T
            # Two-sided normal p-value: 2 * P(Z > |z|).  The previous
            # 2 * cdf(|z|) produced values in [1, 2].
            naive_pval = 2 * (1. -
                              ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
            cov_naive, selective_naive_power = coverage(
                naive_intervals, naive_pval, Lee_target, beta[active_LASSO])
            length_naive = np.mean(naive_intervals[:, 1] -
                                   naive_intervals[:, 0])
            power_naive = ((active_LASSO_bool) * (np.logical_or(
                (0. < naive_intervals[:, 0]),
                (0. > naive_intervals[:, 1])))).sum() / n_signals
            naive_discoveries = BHfilter(naive_pval, q=0.1)
            power_naive_BH = (naive_discoveries *
                              active_LASSO_bool).sum() / n_signals
            fdr_naive_BH = (naive_discoveries *
                            ~active_LASSO_bool).sum() / float(
                                max(naive_discoveries.sum(), 1.))
            bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target)

            partial_Lasso_risk = (glm_LASSO[active_LASSO] -
                                  Lee_target).T.dot(glm_LASSO[active_LASSO] -
                                                    Lee_target)
            partial_relLasso_risk = (post_LASSO_OLS -
                                     Lee_target).T.dot(post_LASSO_OLS -
                                                       Lee_target)

    # glmnet's lambdas are rescaled by n; the theoretical lambda is used
    # as is.
    if tuning_rand == "lambda.min":
        feature_weights = n * lam_min * np.ones(p)
    elif tuning_rand == "lambda.1se":
        feature_weights = n * lam_1se * np.ones(p)
    else:
        feature_weights = lam_theory * np.ones(p)
    randomized_lasso = lasso.gaussian(
        X,
        y,
        feature_weights=feature_weights,
        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    active_rand_bool = np.isin(active_set_rand, true_set)
    sel_MLE = np.zeros(p)
    ind_est = np.zeros(p)
    randomized_lasso_est = np.zeros(p)
    randomized_rel_lasso_est = np.zeros(p)
    MLE_nreport = 0

    sys.stderr.write("active variables selected by cv LASSO  " +
                     str(nactive_LASSO) + "\n")
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")

    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        (observed_target, cov_target, cov_target_score,
         alternatives) = selected_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)

        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        sel_MLE[nonzero] = MLE_estimate
        ind_est[nonzero] = ind_unbiased_estimator
        randomized_lasso_est = randomized_lasso.initial_soln
        randomized_rel_lasso_est = randomized_lasso._beta_full

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized,
                                                beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / n_signals
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / n_signals
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        # Squared-error risks restricted to the selected coordinates.
        partial_MLE_risk = (MLE_estimate -
                            target_randomized).T.dot(MLE_estimate -
                                                     target_randomized)
        partial_ind_risk = (ind_unbiased_estimator -
                            target_randomized).T.dot(ind_unbiased_estimator -
                                                     target_randomized)
        partial_randLasso_risk = (
            randomized_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_lasso_est[nonzero] -
                                     target_randomized)
        partial_relrandLasso_risk = (
            randomized_rel_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_rel_lasso_est[nonzero] -
                                     target_randomized)
    else:
        # Nothing selected by the randomized LASSO: report zeros.
        MLE_nreport = 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [
            0., 0., 0., 0.
        ]

    risks = np.vstack(
        (relative_risk(sel_MLE, beta, Sigma),
         relative_risk(ind_est, beta, Sigma),
         relative_risk(randomized_lasso_est, beta, Sigma),
         relative_risk(randomized_rel_lasso_est, beta, Sigma),
         relative_risk(rel_LASSO, beta, Sigma),
         relative_risk(glm_LASSO, beta, Sigma)))

    partial_risks = np.vstack(
        (partial_MLE_risk, partial_ind_risk, partial_randLasso_risk,
         partial_relrandLasso_risk, partial_relLasso_risk, partial_Lasso_risk))

    naive_inf = np.vstack(
        (cov_naive, length_naive, 0., nactive_LASSO, bias_naive,
         selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH,
         naive_discoveries.sum()))
    Lee_inf = np.vstack(
        (cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee,
         selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH,
         Lee_discoveries.sum()))
    Liu_inf = np.zeros((10, 1))
    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))
    nreport = np.vstack((Lee_nreport, 0., MLE_nreport))
    return np.vstack(
        (risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport))
Exemplo n.º 6
0
def test_marginal_slope(n=3000, p=1000, signal_fac=1.5, s=30, sigma=2., rho=0.20, randomizer_scale=np.sqrt(0.5),
                        split_proportion=0.67, target="selected"):
    """
    Run one simulation comparing two-stage selective inference with naive and
    data-splitting inference.

    On the full data, variables are first screened with
    ``marginal_screening.type1`` and the surviving columns are then passed to
    ``slope.gaussian``; ``twostage_selective_MLE`` produces p-values and
    intervals for the variables kept by both stages.  For comparison the
    function also computes

    * "naive" OLS intervals on the same selected columns, and
    * a data-splitting analysis in which a random ``split_proportion`` of the
      rows is used for (non-randomized) screening + ``slope_R`` selection and
      the remaining rows for OLS inference.

    The response is divided by the estimated noise level ``sigma_`` before any
    selection, so interval lengths are multiplied back by ``sigma_`` on return.

    Parameters
    ----------
    n, p, s, sigma, rho : passed to ``gaussian_instance``.
    signal_fac : float
        The signal passed to ``gaussian_instance`` is
        ``sqrt(signal_fac * 2 * log(p))``.
    randomizer_scale : float
        Scale handed to the marginal-screening and SLOPE randomizations.
    split_proportion : float
        Fraction of rows used for selection in the data-splitting comparison.
    target : {"selected", "full"}
        Which targets are built for the second stage (``selected_targets`` or
        ``full_targets``).

    Returns
    -------
    tuple
        ``(coverage_adjusted, length_adjusted, power_adjusted,
        coverage_naive, length_naive, coverage_split, length_split,
        power_split, fdr)``.  Coverages are elementwise indicator arrays,
        lengths are per-interval arrays rescaled by ``sigma_``, and powers
        and ``fdr`` are scalars.

    Raises
    ------
    ValueError
        If ``target`` is neither ``"selected"`` nor ``"full"``.
    """

    # Fail before doing any work; previously an unknown target surfaced much
    # later as an UnboundLocalError on `observed_target`.
    if target not in ("selected", "full"):
        raise ValueError("target must be 'selected' or 'full', got %r" % (target,))

    inst = gaussian_instance
    signal = np.sqrt(signal_fac * 2. * np.log(p))
    X, y, beta = inst(n=n,
                      p=p,
                      signal=signal,
                      s=s,
                      equicorrelated=False,
                      rho=rho,
                      sigma=sigma,
                      random_signs=True)[:3]

    # Noise level estimated from the residuals of the full least-squares fit.
    sigma_ = np.sqrt(np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p))
    Y = y / sigma_

    # ---- stage 1: randomized marginal screening on the full data ----
    score = X.T.dot(Y)
    omega = randomization.isotropic_gaussian((p,), randomizer_scale * sigma_).sample()
    W = X.T.dot(X)
    marginal_select = marginal_screening.type1(score,
                                               W,
                                               0.1,
                                               randomizer_scale,
                                               useC=True,
                                               perturb=omega)

    boundary, cond_mean_1, cond_cov_1, affine_con_1, logdens_linear_1, initial_soln_1 = marginal_select.fit()
    nonzero = boundary != 0
    # Integer column indices (into X) of the screened variables.  flatnonzero
    # always returns an integer array, so it stays usable as an index even
    # when nothing is selected.
    first_selected = np.flatnonzero(nonzero)

    X_tilde = X[:, nonzero]

    # ---- stage 2: SLOPE on the screened columns ----
    r_beta, r_E, r_lambda_seq, r_sigma = slope_R(X_tilde,
                                                 Y,
                                                 W=None,
                                                 normalize=True,
                                                 choice_weights="gaussian",  # put gaussian
                                                 sigma=1.)

    conv = slope.gaussian(X_tilde,
                          Y,
                          r_sigma * r_lambda_seq,
                          sigma=1.,
                          randomizer_scale=randomizer_scale * 1.)

    signs, cond_mean_2, cond_cov_2, affine_con_2, logdens_linear_2, initial_soln_2 = conv.fit()
    nonzero_slope = signs != 0
    # Positions (within the screened columns) kept by SLOPE.
    second_selected = np.flatnonzero(nonzero_slope)

    # ---- data-splitting comparison ----
    subsample_size = int(split_proportion * n)
    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is the same dtype.
    sel_idx = np.zeros(n, dtype=bool)
    sel_idx[:subsample_size] = 1
    np.random.shuffle(sel_idx)
    inf_idx = ~sel_idx

    Y_inf = Y[inf_idx]
    X_inf = X[inf_idx, :]
    Y_sel = Y[sel_idx]
    X_sel = X[sel_idx, :]

    # Non-randomized marginal screening on the selection half.
    score_split = X_sel.T.dot(Y_sel)
    stdev_split = np.sqrt(np.diag(X_sel.T.dot(X_sel)))
    threshold_split = stdev_split * ndist.ppf(1. - 0.1 / 2.)
    boundary_split = np.fabs(score_split) >= threshold_split
    nonzero_split = boundary_split != 0
    first_selected_split = np.flatnonzero(nonzero_split)

    X_tilde_sel = X_sel[:, nonzero_split]
    r_beta_split, r_E_split, r_lambda_seq_split, r_sigma_split = slope_R(X_tilde_sel,
                                                                         Y_sel,
                                                                         W=None,
                                                                         normalize=True,
                                                                         choice_weights="gaussian",
                                                                         sigma=1.)

    nonzero_slope_split = (r_beta_split != 0)
    second_selected_split = np.flatnonzero(nonzero_slope_split)

    print("compare dimensions- ms ", nonzero.sum(), nonzero_split.sum())
    print("compare dimensions- slope ", nonzero_slope.sum(), nonzero_slope_split.sum())

    # Columns of X surviving both split-sample selection steps; inference uses
    # only the held-out rows.
    split_columns = first_selected_split[second_selected_split]
    X_inf_selected = X_inf[:, split_columns]

    beta_target_split = np.linalg.pinv(X_inf_selected).dot(
        X_inf[:, first_selected_split].dot(beta[nonzero_split])) / sigma_
    post_split_OLS = np.linalg.pinv(X_inf_selected).dot(Y_inf)
    naive_split_sd = np.sqrt(np.diag(np.linalg.inv(X_inf_selected.T.dot(X_inf_selected))))
    intervals_split = np.vstack([post_split_OLS - 1.65 * naive_split_sd,
                                 post_split_OLS + 1.65 * naive_split_sd]).T
    coverage_split = (beta_target_split > intervals_split[:, 0]) * (beta_target_split < intervals_split[:, 1])
    length_split = intervals_split[:, 1] - intervals_split[:, 0]
    pval_split = 2 * (1. - ndist.cdf(np.abs(post_split_OLS) / naive_split_sd))

    pval_alt_split = (pval_split[beta[split_columns] != 0]) < 0.1
    if pval_alt_split.sum() > 0:
        power_split = np.mean(pval_alt_split)
    else:
        power_split = 0.

    # ---- two-stage selective inference on the full data ----
    selected_columns = first_selected[second_selected]

    if target == "selected":
        _, _, cov_target_score_1, _ = marginal_select.multivariate_targets(selected_columns)

        (observed_target,
         cov_target,
         cov_target_score_2,
         alternatives) = selected_targets(conv.loglike,
                                          conv._W,
                                          nonzero_slope,
                                          dispersion=1.)

        beta_target = np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(X_tilde.dot(beta[nonzero])) / sigma_

    else:  # target == "full", validated at the top of the function
        _, _, cov_target_score_1, _ = marginal_select.marginal_targets(selected_columns)

        (observed_target,
         cov_target,
         cov_target_score_2,
         alternatives) = full_targets(conv.loglike,
                                      conv._W,
                                      nonzero_slope,
                                      dispersion=1.)

        beta_target = beta[selected_columns] / sigma_

    estimate, _, _, pval, intervals, _ = twostage_selective_MLE(observed_target,
                                                                cov_target,
                                                                cov_target_score_1,
                                                                cov_target_score_2,
                                                                initial_soln_1,
                                                                initial_soln_2,
                                                                cond_mean_1,
                                                                cond_mean_2,
                                                                cond_cov_1,
                                                                cond_cov_2,
                                                                logdens_linear_1,
                                                                logdens_linear_2,
                                                                affine_con_1.linear_part,
                                                                affine_con_2.linear_part,
                                                                affine_con_1.offset,
                                                                affine_con_2.offset,
                                                                solve_args={'tol': 1.e-12},
                                                                level=0.9)
    pval_alt = (pval[beta[selected_columns] != 0]) < 0.1
    if pval_alt.sum() > 0:
        power_adjusted = np.mean(pval_alt)
    else:
        power_adjusted = 0.

    # False discovery proportion: false rejections / max(rejections, 1), so a
    # run with no rejections reports 0 instead of 0/0 = nan.
    false_rejections = ((pval[beta[selected_columns] == 0]) < 0.1).sum()
    fdr = false_rejections / float(max((pval < 0.1).sum(), 1))

    coverage_adjusted = (beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1])
    length_adjusted = intervals[:, 1] - intervals[:, 0]

    # ---- naive OLS inference on the same selected columns ----
    X_selected = X_tilde[:, nonzero_slope]
    post_sel_OLS = np.linalg.pinv(X_selected).dot(Y)
    naive_sd = np.sqrt(np.diag(np.linalg.inv(X_selected.T.dot(X_selected))))
    intervals_naive = np.vstack([post_sel_OLS - 1.65 * naive_sd,
                                 post_sel_OLS + 1.65 * naive_sd]).T
    coverage_naive = (beta_target > intervals_naive[:, 0]) * (beta_target < intervals_naive[:, 1])
    length_naive = intervals_naive[:, 1] - intervals_naive[:, 0]

    # Lengths were computed for Y = y / sigma_; rescale to the scale of y.
    return coverage_adjusted, sigma_ * length_adjusted, power_adjusted, coverage_naive, sigma_ * length_naive, \
           coverage_split, sigma_ * length_split, power_split, fdr
def test_selected_targets(n=100,
                          p=500,
                          signal_fac=0.2,
                          s=10,
                          sigma=3.,
                          rho=0.4,
                          randomizer_scale=1.):
    """
    Check an interval for one random linear contrast after the randomized LASSO.

    Gaussian instances are drawn repeatedly until ``lasso.gaussian`` selects at
    least one variable.  For that instance the selective MLE is computed for
    the selected targets, one row of the selected design is picked at random
    as a contrast, and an interval ``est +/- ndist.ppf(0.95) * sd`` is formed
    for the contrast applied to the projected target
    ``pinv(X_E).dot(X.dot(beta))``.

    Returns
    -------
    (coverage, pivot)
        ``coverage`` indicates whether the target lies strictly inside the
        interval; ``pivot`` is ``ndist.cdf((est - target) / sd)``.
    """

    signal = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        X, Y, beta = gaussian_instance(n=n,
                                       p=p,
                                       signal=signal,
                                       s=s,
                                       equicorrelated=False,
                                       rho=rho,
                                       sigma=sigma,
                                       random_signs=True)[:3]

        # Report the signal-to-noise ratio under an AR(1)-type covariance rho^|i-j|.
        positions = np.arange(p)
        sigmaX = rho ** np.abs(np.subtract.outer(positions, positions))
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) * n))

        n, p = X.shape

        sigma_ = np.std(Y)
        feature_weights = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = lasso.gaussian(X,
                              Y,
                              feature_weights,
                              randomizer_scale=randomizer_scale * sigma_)

        active = conv.fit() != 0

        # Nothing selected: draw a fresh instance and try again.
        if active.sum() == 0:
            continue

        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(conv.loglike,
                                          conv._W,
                                          active,
                                          dispersion=None)

        mle_result = conv.selective_MLE(observed_target,
                                        cov_target,
                                        cov_target_score,
                                        alternatives)
        estimate, observed_info_mean, _, _, _, _ = mle_result

        # Use one randomly chosen row of the selected design as the contrast.
        row = np.random.permutation(n)[0]
        contrast = ((X[:, active])[row, :])

        target = contrast.T.dot(np.linalg.pinv(X[:, active]).dot(X.dot(beta)))
        est = contrast.T.dot(estimate)
        sd_est = np.sqrt(contrast.T.dot(observed_info_mean).dot(contrast))

        half_width = ndist.ppf(1 - 0.05) * sd_est
        intervals = np.vstack([est - half_width,
                               est + half_width]).T

        coverage = (target > intervals[0, 0]) * (target < intervals[0, 1])
        pivot = ndist.cdf((est - target) / sd_est)
        return coverage, pivot
def compute_sampler_quantiles(n=500, p=100, signal_fac=1.2, s=5, sigma=1., rho=0., randomizer_scale=1, full_dispersion=True):
    """
    Compare sampled quantiles of the selective MLE with normal quantiles.

    A Gaussian instance is drawn (and redrawn while the randomized LASSO
    selects nothing).  The joint Gaussian law of (target, optimization
    variables) implied by the randomization is assembled, restricted to the
    region ``-opt <= 0`` through an affine constraint, and sampled with
    ``sample_from_constraints``.  The selective MLE is recomputed for every
    sampled target; its empirical 5% / 95% quantiles are printed next to the
    normal quantiles centred at the true projected target with variance
    ``diag(observed_info_mean)``.  Normal probability plots of the sampled
    MLEs are shown with ``pylab.show()``.

    Parameters
    ----------
    n, p, s, sigma, rho : passed to ``gaussian_instance``.
    signal_fac : float
        The signal passed to ``gaussian_instance`` is
        ``sqrt(signal_fac * 2 * log(p))``.
    randomizer_scale : float
        Multiplied by the noise estimate to give the LASSO randomizer scale.
    full_dispersion : bool
        If true, the dispersion is the residual mean square of the full
        least-squares fit.  Otherwise ``dispersion`` is ``None`` and the noise
        level is ``np.std(Y)``.

    Returns
    -------
    None
        Results are printed and plotted only.
    """

    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * 2 * np.log(p))

    while True:
        X, Y, beta = inst(n=n,
                          p=p,
                          signal=signal,
                          s=s,
                          equicorrelated=False,
                          rho=rho,
                          sigma=sigma,
                          random_signs=True)[:3]

        idx = np.arange(p)
        sigmaX = rho ** np.abs(np.subtract.outer(idx, idx))
        print("snr", beta.T.dot(sigmaX).dot(beta) / ((sigma ** 2.) * n))

        n, p = X.shape

        if full_dispersion:
            dispersion = np.linalg.norm(Y - X.dot(np.linalg.pinv(X).dot(Y))) ** 2 / (n - p)
            sigma_ = np.sqrt(dispersion)
        else:
            # Previously neither name was assigned on this path, so the next
            # line raised NameError whenever full_dispersion was False.
            dispersion = None
            sigma_ = np.std(Y)
        W = np.ones(X.shape[1]) * np.sqrt(2 * np.log(p)) * sigma_

        conv = const(X,
                     Y,
                     W,
                     randomizer_scale=randomizer_scale * sigma_)

        signs = conv.fit()
        nonzero = signs != 0

        # Nothing selected: there is no target to study, so draw a new instance.
        if nonzero.sum() == 0:
            continue

        (observed_target,
         cov_target,
         cov_target_score,
         alternatives) = selected_targets(conv.loglike,
                                          conv._W,
                                          nonzero,
                                          dispersion=dispersion)

        true_mean = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        _, observed_info_mean, _, _, _, _ = conv.selective_MLE(observed_target,
                                                               cov_target,
                                                               cov_target_score,
                                                               alternatives)

        # Linear decomposition of the score in terms of the target.
        opt_linear, opt_offset = conv.opt_transform
        target_precision = np.linalg.inv(cov_target)
        randomizer_cov, randomizer_precision = conv.randomizer.cov_prec
        score_linear = np.identity(p)
        target_linear = score_linear.dot(cov_target_score.T.dot(target_precision))
        target_offset = conv.observed_score_state - target_linear.dot(observed_target)

        nopt = opt_linear.shape[1]
        ntarget = target_linear.shape[1]

        # Block precision of the joint (target, optimization variable) Gaussian.
        implied_precision = np.zeros((ntarget + nopt, ntarget + nopt))
        implied_precision[:ntarget, :ntarget] = target_linear.T.dot(randomizer_precision).dot(target_linear) + target_precision
        implied_precision[:ntarget, ntarget:] = target_linear.T.dot(randomizer_precision).dot(opt_linear)
        implied_precision[ntarget:, :ntarget] = opt_linear.T.dot(randomizer_precision).dot(target_linear)
        implied_precision[ntarget:, ntarget:] = opt_linear.T.dot(randomizer_precision).dot(opt_linear)
        implied_cov = np.linalg.inv(implied_precision)

        conditioned_value = target_offset + opt_offset
        implied_mean = implied_cov.dot(np.hstack((target_precision.dot(true_mean)-target_linear.T.dot(randomizer_precision).dot(conditioned_value),
                                                  -opt_linear.T.dot(randomizer_precision).dot(conditioned_value))))

        # Constraint A z <= b with A = [0, -I], b = 0: only the optimization
        # block is constrained (to be non-negative).
        A_scaling = np.zeros((nopt, ntarget+nopt))
        A_scaling[:,ntarget:] = -np.identity(nopt)
        b_scaling = np.zeros(nopt)
        affine_con = constraints(A_scaling,
                                 b_scaling,
                                 mean=implied_mean,
                                 covariance=implied_cov)

        # Start the sampler at target = 0 and the observed optimization state.
        initial_point = np.zeros(ntarget+nopt)
        initial_point[ntarget:] = conv.observed_opt_state

        sampler = sample_from_constraints(affine_con,
                                          initial_point,
                                          ndraw=500000,
                                          burnin=1000)

        print("sampler", sampler.shape, sampler[:,:ntarget].shape)
        mle_sample = []
        for j in range(sampler.shape[0]):
            # Recompute the selective MLE with the sampled target in place of
            # the observed one.
            draw_estimate, _, _, _, _, _ = conv.selective_MLE(sampler[j,:ntarget],
                                                              cov_target,
                                                              cov_target_score,
                                                              alternatives)
            mle_sample.append(draw_estimate)
            print("iteration ", j)
        mle_sample = np.asarray(mle_sample)
        print("check", mle_sample.shape, np.mean(mle_sample, axis=0) - true_mean)

        # One normal probability plot per selected variable.  The original
        # three-digit code `251 + i` only addresses a 2x5 grid for i <= 8; use
        # the explicit (rows, cols, index) form and add rows as needed while
        # keeping the 2x5 layout when it suffices.
        n_active = int(nonzero.sum())
        n_cols = 5
        n_rows = max(2, int(np.ceil(n_active / float(n_cols))))
        for i in range(n_active):
            plt.subplot(n_rows, n_cols, i + 1)
            stats.probplot(mle_sample[:,i], dist="norm", plot=pylab)
        plt.subplots_adjust(hspace=.5, wspace=.5)
        pylab.show()

        sampler_quantiles = np.vstack([np.percentile(mle_sample, 5, axis=0), np.percentile(mle_sample, 95, axis=0)])

        normal_sd = np.sqrt(np.diag(observed_info_mean))
        normal_quantiles = np.vstack((norm.ppf(0.05, loc=true_mean, scale=normal_sd),
                                      norm.ppf(0.95, loc=true_mean, scale=normal_sd)))

        print("sampler quantiles", sampler_quantiles.T)
        print("normal quantiles", normal_quantiles.T)
        break
Exemplo n.º 9
0
def multiple_runs_lasso(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20,
                         randomizer_scale=np.sqrt(0.50), full_dispersion=True):
    """
    Run one simulation of inference after two randomized LASSO fits.

    Data come from ``sim_xy``; ``X`` is centred and scaled and ``y`` centred.
    Two tuning parameters are used: a theoretical one (``lam_theory``) and the
    ``lam_1se`` value returned by ``glmnet_lasso``.

    * Naive branch: the union of the two non-randomized LASSO supports is
      refitted by OLS with unadjusted intervals and p-values.
    * Adjusted branch: ``lasso.gaussian`` is fitted with each tuning parameter,
      the union of the two selections defines the target, and
      ``twostage_selective_MLE`` gives intervals and p-values.

    Parameters
    ----------
    n, p, nval, rho, s, beta_type, snr : passed to ``sim_xy``.
    randomizer_scale : float
        Multiplied by ``sqrt(n) * sigma_`` to give the randomizer scale of both
        randomized LASSO fits.
    full_dispersion : bool
        If true, estimate the dispersion from the residuals of the full
        least-squares fit; otherwise pass ``dispersion=None`` and use
        ``np.std(y)`` as the noise level.

    Returns
    -------
    numpy.ndarray
        Column vector stacking, in order: adjusted (coverage, length, power,
        fdr, number selected), naive (coverage, length, power, fdr, number
        selected), ``nreport`` and ``nreport_nonrand`` (1 when the respective
        selection was empty, else 0).
    """

    X, y, _, _, _, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # Monte Carlo estimate of E max_j |X_j' eps| for standard normal eps,
    # scaled by the noise estimate.
    lam_theory = sigma_ * 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, _, _, lam_1se = glmnet_lasso(X, y, lam_theory / float(n))

    # ---- naive inference on the union of the non-randomized supports ----
    active_LASSO_1 = (glm_LASSO_theory != 0)
    active_LASSO_2 = (glm_LASSO_1se != 0)
    active_LASSO = np.logical_or(active_LASSO_1, active_LASSO_2)
    nreport_nonrand = 0.
    if active_LASSO.sum() > 0:
        X_active = X[:, active_LASSO]
        target_nonrandomized = np.linalg.pinv(X_active).dot(X.dot(beta))
        post_LASSO_OLS = np.linalg.pinv(X_active).dot(y)

        naive_sd = sigma_ * np.sqrt(np.diag(np.linalg.inv(X_active.T.dot(X_active))))
        naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd,
                                     post_LASSO_OLS + 1.65 * naive_sd]).T
        naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
        cov_naive, power_naive = coverage(naive_intervals, naive_pval, target_nonrandomized, beta[active_LASSO])
        length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
        # False discovery proportion with denominator max(rejections, 1), so
        # a run with no rejections gives 0 rather than 0/0 = nan.
        naive_false = ((naive_pval[beta[active_LASSO] == 0]) < 0.1).sum()
        fdr_naive = naive_false / float(max((naive_pval < 0.1).sum(), 1))
    else:
        nreport_nonrand += 1.
        cov_naive, power_naive, length_naive, fdr_naive = [0., 0., 0., 0.]

    # ---- two randomized LASSO fits ----
    randomized_lasso_1 = lasso.gaussian(X,
                                        y,
                                        feature_weights=lam_theory * np.ones(p),
                                        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)

    signs_1 = randomized_lasso_1.fit()
    nonzero_1 = signs_1 != 0

    randomized_lasso_2 = lasso.gaussian(X,
                                        y,
                                        feature_weights=n * lam_1se * np.ones(p),
                                        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)

    signs_2 = randomized_lasso_2.fit()
    nonzero_2 = signs_2 != 0

    # Report on every variable selected by either fit.
    nonzero = np.logical_or(nonzero_1, nonzero_2)
    print("check", nonzero_1.sum(), nonzero_2.sum(), nonzero.sum(), active_LASSO.sum())
    nreport = 0.
    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        observed_target = np.linalg.pinv(X[:, nonzero]).dot(y)
        (_,
         _,
         cov_target_score_1,
         alternatives_1) = selected_targets(randomized_lasso_1.loglike,
                                            randomized_lasso_1._W,
                                            nonzero,
                                            dispersion=dispersion)

        (_,
         cov_target,
         cov_target_score_2,
         alternatives_2) = selected_targets(randomized_lasso_2.loglike,
                                            randomized_lasso_2._W,
                                            nonzero,
                                            dispersion=dispersion)

        estimate, _, _, pval, intervals, _ = twostage_selective_MLE(observed_target,
                                                                    cov_target,
                                                                    cov_target_score_1,
                                                                    cov_target_score_2,
                                                                    randomized_lasso_1.observed_opt_state,
                                                                    randomized_lasso_2.observed_opt_state,
                                                                    randomized_lasso_1.cond_mean,
                                                                    randomized_lasso_2.cond_mean,
                                                                    randomized_lasso_1.cond_cov,
                                                                    randomized_lasso_2.cond_cov,
                                                                    randomized_lasso_1.logdens_linear,
                                                                    randomized_lasso_2.logdens_linear,
                                                                    randomized_lasso_1.con_linear,
                                                                    randomized_lasso_2.con_linear,
                                                                    randomized_lasso_1.con_offset,
                                                                    randomized_lasso_2.con_offset,
                                                                    solve_args={'tol': 1.e-12},
                                                                    level=0.9)

        coverage_adjusted, power_adjusted = coverage(intervals, pval, target_randomized, beta[nonzero])
        length_adjusted = np.mean(intervals[:, 1] - intervals[:, 0])
        adjusted_false = ((pval[beta[nonzero] == 0]) < 0.1).sum()
        fdr_adjusted = adjusted_false / float(max((pval < 0.1).sum(), 1))

    else:
        nreport += 1
        coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted = [0., 0., 0., 0.]

    MLE_inf = np.vstack((coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted, nonzero.sum()))
    Naive_inf = np.vstack((cov_naive, length_naive, power_naive, fdr_naive, active_LASSO.sum()))

    # Was a Python 2 `print` statement, which is a SyntaxError on Python 3 and
    # inconsistent with the print(...) calls used elsewhere in this function.
    print(MLE_inf, Naive_inf)
    return np.vstack((MLE_inf, Naive_inf, nreport, nreport_nonrand))
Exemplo n.º 10
0
def compare_twostage_mle(n=3000, p=1000, nval=3000, rho=0.35, s=35, beta_type=1, snr=0.20,
                         randomizer_scale=np.sqrt(0.50), full_dispersion=True):
    """
    Run one simulation comparing two-stage selective inference with naive inference.

    Data come from ``sim_xy``; the columns of ``X`` are centred and divided by
    ``std * sqrt(n)``, ``y`` is centred and then divided by the noise estimate.

    * Randomized pipeline: ``marginal_screening.type1`` followed by
      ``slope.gaussian`` on the screened columns, with p-values and intervals
      from ``twostage_selective_MLE``.
    * Non-randomized pipeline: thresholding of the marginal scores followed by
      ``slope_R``, with unadjusted OLS intervals and p-values.

    Parameters
    ----------
    n, p, nval, rho, s, beta_type, snr : passed to ``sim_xy``.
    randomizer_scale : float
        Scale handed to the screening and SLOPE randomizations.
    full_dispersion : bool
        If true, estimate the dispersion from the residuals of the full
        least-squares fit; otherwise use ``np.std(y)`` as the noise level.

    Returns
    -------
    numpy.ndarray
        Column vector stacking, in order: adjusted (coverage, length, power,
        fdr, number screened, number kept by SLOPE), naive non-randomized
        (same six quantities), ``nreport`` and ``nreport_nonrand`` (1 when the
        respective final selection was empty, else 0).
    """

    X, y, _, _, _, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s, beta_type=beta_type, snr=snr)
    X -= X.mean(0)[None, :]
    scaling = X.std(0)[None, :] * np.sqrt(n)
    X /= scaling
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    Y = y / sigma_

    # ---- randomized pipeline, stage 1: marginal screening ----
    score = X.T.dot(Y)
    omega = randomization.isotropic_gaussian((p,), randomizer_scale * sigma_).sample()
    W = X.T.dot(X)
    marginal_select = marginal_screening.type1(score,
                                               W,
                                               0.1,
                                               randomizer_scale,
                                               useC=True,
                                               perturb=omega)

    boundary, cond_mean_1, cond_cov_1, affine_con_1, logdens_linear_1, initial_soln_1 = marginal_select.fit()
    nonzero = boundary != 0
    # Integer column indices (into X) of the screened variables.
    first_selected = np.flatnonzero(nonzero)

    X_tilde = X[:, nonzero]

    # ---- randomized pipeline, stage 2: SLOPE on the screened columns ----
    r_beta, r_E, r_lambda_seq, r_sigma = slope_R(X_tilde,
                                                 Y,
                                                 W=None,
                                                 normalize=True,
                                                 choice_weights="gaussian",  # put gaussian
                                                 sigma=1.)

    conv = slope.gaussian(X_tilde,
                          Y,
                          r_sigma * r_lambda_seq,
                          sigma=1.,
                          randomizer_scale=randomizer_scale * 1.)

    signs, cond_mean_2, cond_cov_2, affine_con_2, logdens_linear_2, initial_soln_2 = conv.fit()
    nonzero_slope = signs != 0
    # Positions (within the screened columns) kept by SLOPE.
    second_selected = np.flatnonzero(nonzero_slope)

    # ---- non-randomized pipeline ----
    stdev = np.sqrt(np.diag(X.T.dot(X)))
    # NOTE(review): the original compared `score` itself (one-sided) against
    # the two-sided quantile 1 - 0.10/2; the absolute value is used here to
    # match that quantile and the split-sample screening in
    # test_marginal_slope.
    boundary_nonrand = (np.fabs(score) > stdev * ndist.ppf(1. - 0.10 / 2.))
    nonzero_nonrand = boundary_nonrand != 0
    # Fixed: these indices were previously built from `nonzero` (the
    # randomized selection), so the truth looked up for the non-randomized
    # p-values belonged to the wrong variables.
    first_selected_nonrand = np.flatnonzero(nonzero_nonrand)

    X_tilde_nonrand = X[:, nonzero_nonrand]

    r_beta_nonrand, r_E_nonrand, _, _ = slope_R(X_tilde_nonrand,
                                                Y,
                                                W=None,
                                                normalize=True,
                                                choice_weights="gaussian",  # put gaussian
                                                sigma=1.)

    nonzero_slope_nonrand = (r_beta_nonrand != 0)
    second_selected_nonrand = np.flatnonzero(nonzero_slope_nonrand)

    print("compare dimensions- ms ", nonzero.sum(), nonzero_nonrand.sum())
    print("compare dimensions- slope ", nonzero_slope.sum(), nonzero_slope_nonrand.sum())

    nreport = 0.
    nreport_nonrand = 0.
    if nonzero_slope.sum() > 0:
        selected_columns = first_selected[second_selected]
        _, _, cov_target_score_1, _ = marginal_select.multivariate_targets(selected_columns)

        (observed_target,
         cov_target,
         cov_target_score_2,
         alternatives) = selected_targets(conv.loglike,
                                          conv._W,
                                          nonzero_slope,
                                          dispersion=1.)

        beta_target = np.sqrt(n) * np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(X_tilde.dot(beta[nonzero])) / sigma_

        estimate, _, _, pval, intervals, _ = twostage_selective_MLE(observed_target,
                                                                    cov_target,
                                                                    cov_target_score_1,
                                                                    cov_target_score_2,
                                                                    initial_soln_1,
                                                                    initial_soln_2,
                                                                    cond_mean_1,
                                                                    cond_mean_2,
                                                                    cond_cov_1,
                                                                    cond_cov_2,
                                                                    logdens_linear_1,
                                                                    logdens_linear_2,
                                                                    affine_con_1.linear_part,
                                                                    affine_con_2.linear_part,
                                                                    affine_con_1.offset,
                                                                    affine_con_2.offset,
                                                                    solve_args={'tol': 1.e-12},
                                                                    level=0.9)

        pval_alt = (pval[beta[selected_columns] != 0]) < 0.1
        if pval_alt.sum() > 0:
            power_adjusted = np.mean(pval_alt)
        else:
            power_adjusted = 0.
        # False discovery proportion with denominator max(rejections, 1), so
        # a run with no rejections gives 0 rather than 0/0 = nan.
        adjusted_false = ((pval[beta[selected_columns] == 0]) < 0.1).sum()
        fdr_adjusted = adjusted_false / float(max((pval < 0.1).sum(), 1))

        coverage_adjusted = np.mean((beta_target > intervals[:, 0]) * (beta_target < intervals[:, 1]))
        length_adjusted = sigma_ * np.mean(intervals[:, 1] - intervals[:, 0]) / np.sqrt(n)

        # Naive OLS intervals on the randomized selection.  These two
        # quantities are computed but not part of the returned stack.
        post_sel_OLS = np.linalg.pinv(X_tilde[:, nonzero_slope]).dot(Y)
        naive_sd = np.sqrt(np.diag((np.linalg.inv(X_tilde[:, nonzero_slope].T.dot(X_tilde[:, nonzero_slope])))))
        intervals_naive = np.vstack([post_sel_OLS - 1.65 * naive_sd,
                                     post_sel_OLS + 1.65 * naive_sd]).T
        coverage_naive = np.mean((beta_target > intervals_naive[:, 0]) * (beta_target < intervals_naive[:, 1]))
        length_naive = sigma_ * np.mean(intervals_naive[:, 1] - intervals_naive[:, 0]) / np.sqrt(n)

    else:
        nreport += 1
        coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted, coverage_naive, length_naive = [0., 0., 0., 0., 0., 0.]

    if nonzero_slope_nonrand.sum() > 0:
        selected_columns_nonrand = first_selected_nonrand[second_selected_nonrand]
        X_selected_nonrand = X_tilde_nonrand[:, nonzero_slope_nonrand]

        beta_target_nonrand = np.sqrt(n) * np.linalg.pinv(X_selected_nonrand).dot(X_tilde_nonrand.dot(beta[nonzero_nonrand])) / sigma_
        post_sel_OLS_nonrand = np.linalg.pinv(X_selected_nonrand).dot(Y)
        naive_sd_nonrand = np.sqrt(np.diag(np.linalg.inv(X_selected_nonrand.T.dot(X_selected_nonrand))))
        intervals_naive_nonrand = np.vstack([post_sel_OLS_nonrand - 1.65 * naive_sd_nonrand,
                                             post_sel_OLS_nonrand + 1.65 * naive_sd_nonrand]).T
        coverage_naive_nonrand = np.mean((beta_target_nonrand > intervals_naive_nonrand[:, 0]) * (beta_target_nonrand < intervals_naive_nonrand[:, 1]))
        length_naive_nonrand = sigma_ * np.mean(intervals_naive_nonrand[:, 1] - intervals_naive_nonrand[:, 0]) / np.sqrt(n)
        pval_nonrand = 2 * (1. - ndist.cdf(np.abs(post_sel_OLS_nonrand) / naive_sd_nonrand))

        pval_alt_nonrand = (pval_nonrand[beta[selected_columns_nonrand] != 0]) < 0.1
        if pval_alt_nonrand.sum() > 0:
            power_nonrand = np.mean(pval_alt_nonrand)
        else:
            power_nonrand = 0.
        nonrand_false = ((pval_nonrand[beta[selected_columns_nonrand] == 0]) < 0.1).sum()
        fdr_nonrand = nonrand_false / float(max((pval_nonrand < 0.1).sum(), 1))
    else:
        nreport_nonrand += 1
        # Fixed: the original assigned `power__nonrand` / `fdr__nonrand`
        # (double underscore), leaving the names read below undefined.
        coverage_naive_nonrand, length_naive_nonrand, power_nonrand, fdr_nonrand = [0., 0., 0., 0.]

    MLE_inf = np.vstack((coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted, nonzero.sum(), nonzero_slope.sum()))
    Naive_inf = np.vstack((coverage_naive_nonrand, length_naive_nonrand, power_nonrand, fdr_nonrand, nonzero_nonrand.sum(), nonzero_slope_nonrand.sum()))
    print("inf", MLE_inf, Naive_inf)

    return np.vstack((MLE_inf, Naive_inf, nreport, nreport_nonrand))
Exemplo n.º 11
0
def pivot(n=500,
          p=100,
          nval=500,
          rho=0.,
          s=5,
          beta_type=1,
          snr=0.25,
          randomizer_scale=np.sqrt(1.),
          full_dispersion=True):
    """
    Simulate one data set, fit a randomized LASSO and compare two ways of
    doing selective inference on the selected variables: the selective MLE
    and the sampler-based summary.

    Parameters
    ----------
    n, p, nval, rho, s, beta_type, snr :
        Forwarded unchanged to `sim_xy`, which generates the data.
    randomizer_scale :
        Multiplier for the randomization scale; the value handed to the
        LASSO is `np.sqrt(n) * randomizer_scale * sigma_`.
    full_dispersion :
        If true, the dispersion is the residual sum of squares of the
        least-squares fit on all columns divided by `n - p`, and `sigma_`
        is its square root. Otherwise `dispersion` is `None` and `sigma_`
        is `np.std(y)`.

    Returns
    -------
    When at least one variable is selected, a tuple
    `(pivot_MLE, sampler_pivot, time_MLE, time_sampler,
    mean MLE coverage, mean sampler coverage)`; the two times are
    wall-clock durations of the `selective_MLE` and `summary` calls.
    """

    X, y, _, _, _, beta, sigma = sim_xy(n=n,
                                        p=p,
                                        nval=nval,
                                        rho=rho,
                                        s=s,
                                        beta_type=beta_type,
                                        snr=snr)
    print("snr", snr)

    # Standardize the design in place: center each column, then divide by
    # the column standard deviation times sqrt(n / (n - 1)). Center y too.
    X -= X.mean(0)[None, :]
    column_scale = X.std(0)[None, :] * np.sqrt(n / (n - 1.))
    X /= column_scale
    y = y - y.mean()

    # Noise level estimate used for the penalty and the randomization.
    if not full_dispersion:
        dispersion = None
        sigma_ = np.std(y)
    else:
        least_squares_fit = X.dot(np.linalg.pinv(X).dot(y))
        dispersion = np.linalg.norm(y - least_squares_fit)**2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    print("estimated and true sigma", sigma, sigma_)

    # Penalty level: sigma_ times the Monte Carlo mean (2000 draws) of
    # max_j |X_j^T eps| for standard normal eps.
    gaussian_noise = np.random.standard_normal((n, 2000))
    max_abs_score = np.fabs(np.dot(X.T, gaussian_noise)).max(0)
    lam_theory = sigma_ * np.mean(max_abs_score)

    penalty_weights = lam_theory * np.ones(p)
    perturbation_scale = np.sqrt(n) * randomizer_scale * sigma_
    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=penalty_weights,
                                      randomizer_scale=perturbation_scale)

    signs = randomized_lasso.fit()
    nonzero = signs != 0
    n_selected = nonzero.sum()
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(n_selected) + "\n" + "\n")

    if n_selected > 0:
        # Inferential target: least-squares projection of the true mean
        # X.dot(beta) onto the selected columns.
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        true_selected_coefs = beta[nonzero]

        (observed_target, cov_target, cov_target_score,
         alternatives) = selected_targets(randomized_lasso.loglike,
                                          randomized_lasso._W,
                                          nonzero,
                                          dispersion=dispersion)

        # --- selective MLE ---
        mle_started = time.time()
        (MLE_estimate, observed_info_mean, _, MLE_pval, MLE_intervals,
         _) = randomized_lasso.selective_MLE(observed_target, cov_target,
                                             cov_target_score, alternatives)
        mle_finished = time.time()
        cov_MLE, _ = coverage(MLE_intervals, MLE_pval, target_randomized,
                              true_selected_coefs)

        mle_error = MLE_estimate - target_randomized
        mle_scale = np.sqrt(np.diag(observed_info_mean))
        pivot_MLE = np.true_divide(mle_error, mle_scale)
        time_MLE = mle_finished - mle_started

        # --- sampler-based summary ---
        sampler_started = time.time()
        sampler_pivot, sampler_pval, sampler_intervals = randomized_lasso.summary(
            observed_target,
            cov_target,
            cov_target_score,
            alternatives,
            level=0.9,
            compute_intervals=True,
            ndraw=200000)
        sampler_finished = time.time()

        cov_sampler, _ = coverage(sampler_intervals, sampler_pval,
                                  target_randomized, true_selected_coefs)
        time_sampler = sampler_finished - sampler_started

        mean_cov_MLE = np.mean(cov_MLE)
        mean_cov_sampler = np.mean(cov_sampler)
        return (pivot_MLE, sampler_pivot, time_MLE, time_sampler,
                mean_cov_MLE, mean_cov_sampler)