# NOTE: the import paths below are assumptions based on the selective-inference
# package layout; adjust them to match your install.
import numpy as np
import regreg.api as rr

from selection.tests.instance import gaussian_instance, logistic_instance
from selection.randomized.api import randomization, multiple_queries
from selection.randomized.glm import (glm_group_lasso,
                                      glm_group_lasso_parametric,
                                      split_glm_group_lasso,
                                      glm_target,
                                      restricted_Mest,
                                      standard_split_ci)
from selection.randomized.query import naive_confidence_intervals, naive_pvalues
from selection.randomized.cv_view import CV_view
from statsmodels.stats.multitest import multipletests


def test_multiple_queries_individual_coeff(ndraw=10000, burnin=2000):
    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=5)
    nonzero = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    active_set = np.nonzero(active_union)[0]
    pvalues = []
    true_beta = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        for j in range(nactive):
            subset = np.zeros(p, bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_union * ~subset,
                                                         mv,
                                                         subset=subset,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: np.atleast_1d(x)
            pval = target_sampler.hypothesis_test(test_stat,
                                                  np.atleast_1d(target_observed - true_beta[j]),
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        active_var = np.zeros_like(pvalues, bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
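# A small driver sketch, not part of the original tests: pool the p-values
# from repeated runs of the test above. Under valid selective inference the
# null p-values should look roughly uniform. The helper name, the nsim
# default, and the pooling loop are illustrative assumptions.
def _pool_individual_coeff_pvalues(nsim=20):
    all_pvals, all_signal = [], []
    for _ in range(nsim):
        result = test_multiple_queries_individual_coeff()
        if result is None:  # screening failed on this draw; skip it
            continue
        pvals, is_signal = result
        all_pvals.extend(np.ravel(pvals))
        all_signal.extend(is_signal)
    return np.asarray(all_pvals), np.asarray(all_signal, dtype=bool)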
def test_intervals(s=3,
                   n=200,
                   p=50,
                   snr=7,
                   rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss, active_union, mv)
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        if intervals == 'old':
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=true_vec,
                                                          sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        unpenalized_mle = restricted_Mest(loss,
                                          M_est1.selection_variable['variables'],
                                          solve_args=solve_args)

        L, U = LU.T

        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_multiple_queries(s=3,
                          n=300,
                          p=20,
                          signal=7,
                          rho=0.1,
                          lam_frac=0.7,
                          nviews=4,
                          intervals='new',
                          ndraw=10000,
                          burnin=2000,
                          solve_args={'min_its': 50, 'tol': 1.e-10},
                          check_screen=True):

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    for i in range(nviews):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nviews):
        active_union += view[i].selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    if nactive == 0:
        return None

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])
    if check_screen and not screen:
        return None

    active_set = np.nonzero(active_union)[0]
    true_vec = beta[active_union]

    # bootstrap
    target_sampler_boot, target_observed = glm_target(loss,
                                                      active_union,
                                                      mv,
                                                      bootstrap=True)

    if intervals == 'old':
        target_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin)
        LU_boot = target_sampler_boot.confidence_intervals(target_observed,
                                                           sample=target_sample_boot,
                                                           level=0.9)
        pivots_boot = target_sampler_boot.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample_boot)
    else:
        full_sample_boot = target_sampler_boot.sample(ndraw=ndraw,
                                                      burnin=burnin,
                                                      keep_opt=True)
        LU_boot = target_sampler_boot.confidence_intervals_translate(target_observed,
                                                                     sample=full_sample_boot,
                                                                     level=0.9)
        pivots_boot = target_sampler_boot.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample_boot)

    # CLT plugin
    target_sampler, _ = glm_target(loss, active_union, mv, bootstrap=False)

    if intervals == 'old':
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=true_vec,
                                                    sample=target_sample)
    else:
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        LU = target_sampler.confidence_intervals_translate(target_observed,
                                                           sample=full_sample,
                                                           level=0.9)
        pivots = target_sampler.coefficient_pvalues_translate(target_observed,
                                                              parameter=true_vec,
                                                              sample=full_sample)

    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    def coverage(LU):
        L, U = LU[:, 0], LU[:, 1]
        covered = np.zeros(nactive)
        ci_length = np.zeros(nactive)
        for j in range(nactive):
            if check_screen:
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
            else:
                covered[j] = np.nan
            ci_length[j] = U[j] - L[j]
        return covered, ci_length

    covered, ci_length = coverage(LU)
    covered_boot, ci_length_boot = coverage(LU_boot)
    covered_naive, ci_length_naive = coverage(LU_naive)

    active_var = np.zeros(nactive, bool)
    for j in range(nactive):
        active_var[j] = active_set[j] in nonzero

    return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
        active_var, covered_naive, ci_length_naive
def test_intervals_groups(s=0,
                          n=200,
                          p=10,
                          signal=7,
                          rho=0.,
                          lam_frac=6.,
                          ndraw=10000,
                          burnin=2000,
                          bootstrap=True,
                          loss='gaussian',
                          intervals='old',
                          randomizer='laplace',
                          solve_args={'min_its': 50, 'tol': 1.e-10}):
    # renamed from test_intervals so it does not shadow the earlier test

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]

    epsilon = 1. / np.sqrt(n)
    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])
    # print(groups)
    # groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                            parameter=target_sampler.reference,
                                                            sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                      parameter=target_sampler.reference,
                                                                      sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        L, U = LU.T

        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel, \
            naive_pvals, naive_covered, ci_length_naive, active_var
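# Aggregation sketch (assumed usage, not from the original file): average the
# coverage indicators returned by test_intervals_groups to estimate empirical
# coverage of the selective and naive 90% intervals. Returns nan if no run
# passed the screening check.
def _empirical_coverage(nsim=20):
    sel, naive = [], []
    for _ in range(nsim):
        result = test_intervals_groups()
        if result is None:
            continue
        covered, naive_covered = result[3], result[6]
        sel.extend(covered)
        naive.extend(naive_covered)
    # each entry is 0/1, so the mean is the empirical coverage proportion
    return np.mean(sel), np.mean(naive)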
def test_split_compare(s=3,
                       n=200,
                       p=20,
                       signal=7,
                       rho=0.1,
                       split_frac=0.8,
                       lam_frac=0.7,
                       ndraw=10000,
                       burnin=2000,
                       solve_args={'min_its': 50, 'tol': 1.e-10},
                       check_screen=True):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    m = int(split_frac * n)
    M_est1 = split_glm_group_lasso(loss, epsilon, m, penalty)

    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']  # + M_est2.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    leftout_indices = M_est1.randomized_loss.saturated_loss.case_weights == 0

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])
    if check_screen and not screen:
        return None

    active_set = np.nonzero(active_union)[0]
    true_vec = beta[active_union]

    # bootstrap
    target_sampler_boot, target_observed = glm_target(loss,
                                                      active_union,
                                                      mv,
                                                      bootstrap=True)
    target_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin)
    LU_boot = target_sampler_boot.confidence_intervals(target_observed,
                                                       sample=target_sample_boot,
                                                       level=0.9)
    pivots_boot = target_sampler_boot.coefficient_pvalues(target_observed,
                                                          parameter=true_vec,
                                                          sample=target_sample_boot)

    # CLT plugin
    target_sampler, _ = glm_target(loss, active_union, mv, bootstrap=False)
    target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
    LU = target_sampler.confidence_intervals(target_observed,
                                             sample=target_sample,
                                             level=0.9)
    pivots = target_sampler.coefficient_pvalues(target_observed,
                                                parameter=true_vec,
                                                sample=target_sample)

    LU_naive = naive_confidence_intervals(target_sampler, target_observed)

    if X.shape[0] - leftout_indices.sum() > nactive:
        LU_split = standard_split_ci(rr.glm.logistic, X, y, active_union, leftout_indices)
    else:
        LU_split = np.ones((nactive, 2)) * np.nan

    def coverage(LU):
        L, U = LU[:, 0], LU[:, 1]
        covered = np.zeros(nactive)
        ci_length = np.zeros(nactive)
        for j in range(nactive):
            if check_screen:
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
            else:
                covered[j] = np.nan
            ci_length[j] = U[j] - L[j]
        return covered, ci_length

    covered, ci_length = coverage(LU)
    covered_boot, ci_length_boot = coverage(LU_boot)
    covered_split, ci_length_split = coverage(LU_split)
    covered_naive, ci_length_naive = coverage(LU_naive)

    active_var = np.zeros(nactive, bool)
    for j in range(nactive):
        active_var[j] = active_set[j] in nonzero

    return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
        covered_split, ci_length_split, active_var, covered_naive, ci_length_naive
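# Usage sketch (an assumption, not part of the original file): compare median
# interval lengths of the selective, bootstrap, data-splitting and naive
# intervals returned by test_split_compare across a few simulations.
def _median_ci_lengths(nsim=10):
    lengths = {'sel': [], 'boot': [], 'split': [], 'naive': []}
    for _ in range(nsim):
        result = test_split_compare()
        if result is None:
            continue
        (_, _, _, ci_length, _, ci_length_boot,
         _, ci_length_split, _, _, ci_length_naive) = result
        lengths['sel'].extend(ci_length)
        lengths['boot'].extend(ci_length_boot)
        lengths['split'].extend(ci_length_split)
        lengths['naive'].extend(ci_length_naive)
    # nanmedian skips the nan lengths produced when the split CI is unavailable
    return {key: np.nanmedian(vals) for key, vals in lengths.items()}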
def test_condition(s=0,
                   n=100,
                   p=200,
                   rho=0.1,
                   signal=10,
                   lam_frac=1.4,
                   ndraw=10000,
                   burnin=2000,
                   loss='logistic',
                   nviews=4,
                   scalings=True):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomizer = randomization.laplace((p,), scale=0.6)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if scalings:  # try conditioning on some scalings
            for i in range(nviews // 2):
                conditioning_groups = np.zeros(p, bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.ones(p, bool)
                marginalizing_groups[:p // 2] = False
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)
                views[i].condition_on_scalings()
        else:
            for i in range(nviews):
                views[i].decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                               marginalizing_groups=np.ones(p, bool))

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss, active_union, queries)

        test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        observed_test_value = test_stat(target_observed)
        pivots = target_sampler.hypothesis_test(test_stat,
                                                observed_test_value,
                                                alternative='twosided',
                                                parameter=beta[active_union],
                                                ndraw=ndraw,
                                                burnin=burnin)

        return [pivots], [False]
def test_split(s=3,
               n=200,
               p=50,
               signal=7,
               rho=0.1,
               split_frac=0.8,
               lam_frac=0.7,
               ndraw=10000,
               burnin=2000,
               bootstrap=True,
               solve_args={'min_its': 50, 'tol': 1.e-10},
               reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    nactive = np.sum(M_est.selection_variable['variables'])
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        if bootstrap:
            target_sampler, target_observed = glm_target(loss,
                                                         M_est.selection_variable['variables'],
                                                         mv)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         M_est.selection_variable['variables'],
                                                         mv,
                                                         bootstrap=True)
            reference_known = True

        if reference_known:
            reference = beta[M_est.selection_variable['variables']]
        else:
            reference = target_observed

        target_sampler.reference = reference
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample).T
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=beta[M_est.selection_variable['variables']],
                                                          sample=target_sample)

        true_vec = beta[M_est.selection_variable['variables']]
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        L, U = LU
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_multiple_queries_global(ndraw=10000,
                                 burnin=2000,
                                 bootstrap=False,
                                 test='selected zeros'):
    # renamed from test_multiple_queries so it does not shadow the earlier test
    s, n, p = 3, 600, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=4)
    nonzero = np.where(beta)[0]

    lam_frac = 1.
    loss = rr.glm.logistic(X, y)
    epsilon = 1.
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        active_set = np.nonzero(active_union)[0]

        if test == 'selected zeros':
            inactive_selected = np.array([active_union[i] and i not in nonzero
                                          for i in range(p)])
            true_active = (beta != 0)
            reference = np.zeros(inactive_selected.sum())
            target_sampler, target_observed = glm_target(loss,
                                                         true_active,
                                                         mv,
                                                         subset=inactive_selected,
                                                         bootstrap=bootstrap,
                                                         reference=reference)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         active_union,
                                                         mv,
                                                         bootstrap=bootstrap)

        test_stat = lambda x: np.linalg.norm(x)
        observed_test_value = test_stat(target_observed)

        pivot = target_sampler.hypothesis_test(test_stat,
                                               observed_test_value,
                                               alternative='twosided',
                                               ndraw=ndraw,
                                               burnin=burnin)

        # recompute the pivot with the translate machinery; this overwrites
        # the sampler-based pivot above
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        pivot = target_sampler.hypothesis_test_translate(full_sample,
                                                         test_stat,
                                                         target_observed,
                                                         alternative='twosided')

        return [pivot], [False]
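# Usage sketch (an assumption, not in the original file): run both variants of
# the global test above. The 'selected zeros' branch tests the coefficients
# that were selected but are truly zero; any other value of `test` exercises
# the full selected target. Either call may return None if screening fails.
def _run_global_tests():
    pivot_zeros = test_multiple_queries_global(test='selected zeros')
    pivot_full = test_multiple_queries_global(test='full')
    return pivot_zeros, pivot_full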
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=1000):  # the original referenced a module-level `nboot` here

    print(n, p, s, condition_on_CVR, scale1, scale2)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test, make sure we also run the python code
    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized lasso, just to see how many variables it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad:
            M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                         marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(glm_loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        L, U = LU.T
        sel_covered = np.zeros(nactive, bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        naive_covered = np.zeros(nactive, bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]

        return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, \
            naive_length, active_var, BH_decisions, active_var
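# Standalone illustration of the Benjamini-Hochberg step used above:
# statsmodels' multipletests returns a boolean rejection vector at FDR
# level q as its first element. The toy p-values are made up; with q = 0.2
# only the two smallest fall under their BH thresholds.
def _bh_demo():
    toy_pvals = np.array([0.001, 0.02, 0.2, 0.6, 0.9])
    return multipletests(toy_pvals, alpha=0.2, method="fdr_bh")[0]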
def test_condition_gaussian(ndraw=10000, burnin=2000, scalings=True):
    # renamed from test_condition so it does not shadow the earlier test
    s, n, p = 6, 600, 40

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.2, snr=5)
    randomizer = randomization.isotropic_gaussian((p,), scale=sigma)

    lam_frac = 1.5
    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    nview = 3
    for i in range(nview):
        views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if scalings:  # try conditioning on some scalings
            views[0].condition_on_scalings()
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_scalings()
        else:
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_subgradient()

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss, active_union, queries)

        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     alternative='twosided',
                                                     ndraw=ndraw,
                                                     burnin=burnin)

        active_var = np.zeros_like(pvalues, bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, active_var
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):

    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)
    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        if not parametric:
            views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))
        else:
            views.append(glm_group_lasso_parametric(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        check_screen = True

        if nactive == s:
            return None

        # BUG: if this scalings code is moved after the decompose_subgradient
        # calls, the code seems to run fine
        if scalings:  # try conditioning on some scalings
            for i in range(nviews):
                views[i].condition_on_scalings()
        if subgrad:
            for i in range(nviews):
                conditioning_groups = np.zeros(p, dtype=bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.zeros(p, dtype=bool)
                marginalizing_groups[p // 2:] = True
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries,
                                                     bootstrap=False,
                                                     parametric=parametric)  # reference=beta[active_union]

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        elif intervals == 'new':
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                  parameter=true_vec,
                                                                  sample=full_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
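# Diagnostic sketch (an addition, not in the original file): empirical CDF of
# pooled pivots evaluated on a unit grid. For uniform pivots the ECDF should
# track the diagonal; useful for eyeballing the pivots returned by
# test_marginalize across repeated runs.
def _ecdf(values, ngrid=101):
    values = np.asarray(values)
    grid = np.linspace(0, 1, ngrid)
    return grid, np.array([np.mean(values <= g) for g in grid])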