# Shared imports for these tests; the numpy/regreg/statsmodels paths are standard.
# The remaining helpers (logistic_instance, gaussian_instance, randomization,
# glm_group_lasso, multiple_queries, glm_target, ...) are assumed to come from
# the selective-inference `selection` package; exact module paths vary by
# version, so they are not pinned down here.
import time

import numpy as np
import regreg.api as rr
from statsmodels.stats.multitest import multipletests


def test_multiple_queries_individual_coeff(ndraw=10000, burnin=2000):

    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=5)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    active_set = np.nonzero(active_union)[0]

    pvalues = []
    true_beta = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        for j in range(nactive):

            subset = np.zeros(p, bool)
            subset[active_set[j]] = True
            target_sampler, target_observed = glm_target(loss,
                                                         active_union * ~subset,
                                                         mv,
                                                         subset=subset,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: np.atleast_1d(x)

            pval = target_sampler.hypothesis_test(test_stat,
                                                  np.atleast_1d(target_observed - true_beta[j]),
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        active_var = np.zeros_like(pvalues, bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_parametric_covariance(ndraw=10000, burnin=2000):

    s, n, p = 3, 120, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, signal=12)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)
    # second randomization
    M_est2 = glm_group_lasso_parametric(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est1, M_est2])
    mv.solve()

    target = M_est1.selection_variable['variables'].copy()
    if target[-1] or M_est2.selection_variable['variables'][-1]:
        return None
    if target[-2] or M_est2.selection_variable['variables'][-2]:
        return None

    # we should check they are different sizes
    target[-2:] = 1

    if set(nonzero).issubset(np.nonzero(target)[0]):

        form_covariances = glm_parametric_covariance(loss)
        mv.setup_sampler(form_covariances)

        target_observed = restricted_Mest(loss, target)
        linear_func = np.zeros((2, target_observed.shape[0]))
        linear_func[0, -1] = 1.  # we know this one is null
        linear_func[1, -2] = 1.  # also null

        target_observed = linear_func.dot(target_observed)
        target_sampler = mv.setup_target((target, linear_func),
                                         target_observed,
                                         parametric=True)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              test_stat(target_observed),
                                              alternative='greater',
                                              ndraw=ndraw,
                                              burnin=burnin)

        return [pval], [False]
def test_logistic_pvals(n=500, p=200, s=3, rho=0.3, signal=15.):

    X, y, beta, true_active = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                signal=signal,
                                                equicorrelated=False)

    X = np.hstack([np.ones((n, 1)), X])

    print(true_active, 'true')

    # shift the active indices by one for the prepended intercept column
    active = np.array(true_active)
    active += 1
    active = [0] + list(active)
    true_active = active

    L = lasso.logistic(X, y, [0] * 1 + [1.2] * p)
    L.fit()

    S = L.summary('onesided')

    print(true_active, L.active)
    if set(true_active).issubset(L.active):
        return S['pval'], [v in true_active for v in S['variable']]
def generate_data(s=5, n=200, p=20, rho=0.1, signal=15):
    return logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal,
                             scale=False, center=False)
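def _smoke_generate_data():
    # Hedged usage sketch (added for illustration; not one of the original
    # tests). logistic_instance is assumed to return (X, y, beta, active)
    # as in test_logistic_pvals above.
    X, y, beta, active = generate_data(s=5, n=200, p=20)
    assert X.shape == (200, 20)
    assert y.shape == (200,)
    assert np.all(beta[np.asarray(active)] != 0)  # s truly active coefficients
    return X, y, beta, active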
def test_reconstruction(s=3,
                        n=200,
                        p=50,
                        snr=7,
                        rho=0.1,
                        split_frac=0.8,
                        lam_frac=0.7,
                        ndraw=100,
                        burnin=200,
                        bootstrap=True,
                        solve_args={'min_its': 50, 'tol': 1.e-10},
                        reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    nactive = np.sum(M_est.selection_variable['variables'])
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        target_sampler, target_observed = glm_target(loss,
                                                     M_est.selection_variable['variables'],
                                                     mv)

        target_sample = target_sampler.sample(ndraw=ndraw,
                                              burnin=burnin,
                                              keep_opt=True)
        reconstruction = target_sampler.reconstruction_map(target_sample)
        logdens = target_sampler.log_randomization_density(target_sample)
        return logdens.shape
def test_approximate_mle(n=100,
                         p=10,
                         s=3,
                         snr=5,
                         rho=0.1,
                         lam_frac=1.,
                         loss='gaussian',
                         randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        mle_active = np.zeros(nactive)
        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]

        print("mle for target", mle_active)
def test_multiple_queries_individual_coeff_small(ndraw=10000,
                                                 burnin=2000,
                                                 bootstrap=True):

    s, n, p = 3, 100, 20

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=20.)

    nonzero = np.where(beta)[0]
    lam_frac = 3.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_vars = M_est.selection_variable['variables']
    nactive = np.sum(active_vars)
    active_set = np.nonzero(active_vars)[0]

    pvalues = []
    true_beta = beta[active_vars]

    print(nonzero, active_set)
    if set(nonzero).issubset(active_set):

        for j in range(nactive):
            print(j)
            subset = np.zeros(p, bool)
            subset[active_set[j]] = True

            target_sampler, target_observed = glm_target(loss,
                                                         active_vars,
                                                         mv,
                                                         subset=subset,
                                                         bootstrap=bootstrap,
                                                         reference=np.zeros((1,)))

            test_stat = lambda x: x

            pval = target_sampler.hypothesis_test(test_stat,
                                                  target_observed,
                                                  alternative='twosided',
                                                  ndraw=ndraw,
                                                  burnin=burnin)
            pvalues.append(pval)

        return pvalues, [active_set[j] in nonzero for j in range(nactive)]
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                      signal=signal, sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal, random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)

    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = set(nonzero).issubset(np.nonzero(active_union)[0])

    if not check_screen or screened:

        # if nactive == s:
        #     return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(loss, active_union,
                                                                  inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(loss_indep, active_union,
                                                                  inactive=~active_union)
        target_observed = (boot_target_observed1 - boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=np.zeros(nactive),
                                                    sample=target_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
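# The interval bookkeeping above (per-coordinate coverage indicator and CI
# length) recurs in several of these tests. A standalone, vectorized version
# might look like this; `coverage_stats` is a hypothetical utility, not part
# of the selection package.
def coverage_stats(LU, truth):
    """Return (covered, length) arrays for intervals LU against `truth`.

    LU is an (nactive, 2) array of [lower, upper] endpoints.
    """
    LU = np.asarray(LU)
    truth = np.asarray(truth)
    L, U = LU[:, 0], LU[:, 1]
    covered = ((L <= truth) & (U >= truth)).astype(float)
    length = U - L
    return covered, length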
# Simulation driver: n, p, s, snr and randomized_lasso_trial are assumed to
# be defined earlier in the originating script.
niter = 3

ad_cov = 0.
unad_cov = 0.
ad_len = 0.
unad_len = 0.
ad_risk = 0.
unad_risk = 0.

for i in range(niter):

    ### GENERATE X, Y BASED ON SEED
    np.random.seed(i + 68)  # ensures different X and y
    # X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, sigma=1., rho=0., snr=snr)
    # lam = 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    X, y, beta, nonzero = logistic_instance(n=n, p=p, s=s, rho=0., snr=snr)
    lam = 1.5 * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    sigma = 1.

    ### RUN LASSO AND TEST
    lasso = randomized_lasso_trial(X, y, beta, sigma, lam)

    if lasso is not None:
        ad_cov += lasso[0, 0]
        unad_cov += lasso[1, 0]
        ad_len += lasso[2, 0]
        unad_len += lasso[3, 0]
        ad_risk += lasso[4, 0]
        unad_risk += lasso[5, 0]
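# Illustrative follow-up one might add after the loop to report the Monte
# Carlo averages accumulated above; it assumes every trial succeeded
# (trials returning None would make these averages conservative).
print("adjusted coverage / length / risk:",
      ad_cov / niter, ad_len / niter, ad_risk / niter)
print("unadjusted coverage / length / risk:",
      unad_cov / niter, unad_len / niter, unad_risk / niter)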
def test_data_carving_logistic(n=700,
                               p=300,
                               s=5,
                               rho=0.05,
                               signal=12.,
                               split_frac=0.8,
                               ndraw=8000,
                               burnin=2000,
                               df=np.inf,
                               compute_intervals=True,
                               use_full_cov=False,
                               return_only_screening=True):

    X, y, beta, true_active, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal,
                                                   equicorrelated=False)
    mu = X.dot(beta)
    prob = np.exp(mu) / (1 + np.exp(mu))

    X = np.hstack([np.ones((n, 1)), X])

    # shift the active indices to account for the intercept column
    active = np.array(true_active)
    active += 1
    s += 1
    active = [0] + list(active)
    true_active = active

    idx = np.arange(n)
    np.random.shuffle(idx)
    stage_one = idx[:int(n * split_frac)]
    n1 = len(stage_one)

    lam_theor = 1.0 * np.ones(p + 1)
    lam_theor[0] = 0.
    DC = data_carving.logistic(X, y,
                               feature_weights=lam_theor,
                               stage_one=stage_one)

    DC.fit()

    if len(DC.active) < n - int(n * split_frac):
        DS = data_splitting.logistic(X, y,
                                     feature_weights=lam_theor,
                                     stage_one=stage_one)
        DS.fit(use_full_cov=True)
        data_split = True
    else:
        print('not enough data for data splitting second stage')
        print(DC.active)
        data_split = False

    print(true_active, DC.active)
    if set(true_active).issubset(DC.active):

        carve = []
        split = []
        for var in DC.active:
            carve.append(DC.hypothesis_test(var, burnin=burnin, ndraw=ndraw))
            if data_split:
                split.append(DS.hypothesis_test(var))
            else:
                split.append(np.random.sample())

        Xa = X[:, DC.active]
        active = np.zeros(p + 1, bool)
        active[true_active] = 1
        v = (carve, split, active)
        return v
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])
    # print(groups)
    # groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                            parameter=target_sampler.reference,
                                                            sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                      parameter=target_sampler.reference,
                                                                      sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        L, U = LU.T

        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel, \
            naive_pvals, naive_covered, ci_length_naive, active_var
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=nboot):  # NOTE: assumes a module-level `nboot` supplies the default

    print(n, p, s, condition_on_CVR, scale1, scale2)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test make sure we also run the python code
    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized lasso, just to see how many variables it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso ", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad:
            M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                         marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(glm_loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        L, U = LU.T
        sel_covered = np.zeros(nactive, bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        naive_covered = np.zeros(nactive, bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]

        return pivots_truth, sel_covered, sel_length, naive_pvals, naive_covered, \
            naive_length, active_var, BH_decisions, active_var
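# cv.one_SD_rule above picks the most regularized lambda whose CV error is
# within one standard error of the minimizer. A self-contained numpy sketch
# of that rule (illustrative only; not the package's implementation):
def one_sd_rule_sketch(lam_grid, cv_err, cv_se):
    """lam_grid: candidate penalty values; cv_err, cv_se: cross-validation
    error and its standard error at each candidate."""
    lam_grid = np.asarray(lam_grid)
    cv_err = np.asarray(cv_err)
    jmin = np.argmin(cv_err)                  # lambda minimizing CV error
    cutoff = cv_err[jmin] + cv_se[jmin]       # one-SE band above the minimum
    eligible = lam_grid[cv_err <= cutoff]
    return eligible.max()                     # most regularization within one SE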
def test_condition(s=0,
                   n=100,
                   p=200,
                   rho=0.1,
                   signal=10,
                   lam_frac=1.4,
                   ndraw=10000,
                   burnin=2000,
                   loss='logistic',
                   nviews=4,
                   scalings=True):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomizer = randomization.laplace((p,), scale=0.6)
    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        if nactive == s:
            return None

        if scalings:  # try condition on some scalings
            for i in range(nviews // 2):
                conditioning_groups = np.zeros(p, bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.ones(p, bool)
                marginalizing_groups[:p // 2] = False
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)
                views[i].condition_on_scalings()
        else:
            for i in range(nviews):
                views[i].decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                               marginalizing_groups=np.ones(p, bool))

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries)

        test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        observed_test_value = test_stat(target_observed)

        pivots = target_sampler.hypothesis_test(test_stat,
                                                observed_test_value,
                                                alternative='twosided',
                                                parameter=beta[active_union],
                                                ndraw=ndraw,
                                                burnin=burnin)

        return [pivots], [False]
def test_multiple_queries(ndraw=10000, burnin=2000, bootstrap=False,
                          test='selected zeros'):

    s, n, p = 3, 600, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=4)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    view = []
    nview = 5
    for i in range(nview):
        view.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    mv = multiple_queries(view)
    mv.solve()

    active_union = np.zeros(p, bool)
    for i in range(nview):
        active_union += view[i].selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        if nactive == s:
            return None

        active_set = np.nonzero(active_union)[0]

        if test == 'selected zeros':
            inactive_selected = np.array([active_union[i] and i not in nonzero
                                          for i in range(p)])
            true_active = (beta != 0)
            reference = np.zeros(inactive_selected.sum())
            target_sampler, target_observed = glm_target(loss,
                                                         true_active,
                                                         mv,
                                                         subset=inactive_selected,
                                                         bootstrap=bootstrap,
                                                         reference=reference)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         active_union,
                                                         mv,
                                                         bootstrap=bootstrap)

        test_stat = lambda x: np.linalg.norm(x)
        observed_test_value = test_stat(target_observed)

        pivot = target_sampler.hypothesis_test(test_stat,
                                               observed_test_value,
                                               alternative='twosided',
                                               ndraw=ndraw,
                                               burnin=burnin)

        # recompute the pivot from a sample that keeps the optimization
        # variables, using the "translate" version of the test
        full_sample = target_sampler.sample(ndraw=ndraw,
                                            burnin=burnin,
                                            keep_opt=True)
        pivot = target_sampler.hypothesis_test_translate(full_sample,
                                                         test_stat,
                                                         target_observed,
                                                         alternative='twosided')

        return [pivot], [False]
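# Under the null, valid selective pivots should be close to Uniform(0, 1).
# A hedged harness (illustrative only; not an original test) that pools
# pivots over repeated runs and applies a Kolmogorov-Smirnov check:
def check_pivot_uniformity(nsim=50):
    from scipy.stats import kstest
    pivots = []
    for _ in range(nsim):
        result = test_multiple_queries()
        if result is not None:       # screening can fail, returning None
            pivots.extend(result[0])
    stat, pvalue = kstest(pivots, 'uniform')
    print("KS test against Uniform(0,1): stat=%0.3f, p=%0.3f" % (stat, pvalue))
    return pivots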
def test_split(s=3,
               n=200,
               p=50,
               signal=7,
               rho=0.1,
               split_frac=0.8,
               lam_frac=0.7,
               ndraw=10000,
               burnin=2000,
               bootstrap=True,
               solve_args={'min_its': 50, 'tol': 1.e-10},
               reference_known=False):

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)

    m = int(split_frac * n)
    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1. / np.sqrt(n)

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 2000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = split_glm_group_lasso(loss, epsilon, m, penalty)
    mv = multiple_queries([M_est])
    mv.solve()

    nactive = np.sum(M_est.selection_variable['variables'])
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(M_est.selection_variable['variables'])[0]):

        active_set = np.nonzero(M_est.selection_variable['variables'])[0]

        if bootstrap:
            target_sampler, target_observed = glm_target(loss,
                                                         M_est.selection_variable['variables'],
                                                         mv)
        else:
            target_sampler, target_observed = glm_target(loss,
                                                         M_est.selection_variable['variables'],
                                                         mv,
                                                         bootstrap=True)

        reference_known = True
        if reference_known:
            reference = beta[M_est.selection_variable['variables']]
        else:
            reference = target_observed

        target_sampler.reference = reference
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample).T
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=beta[M_est.selection_variable['variables']],
                                                          sample=target_sample)

        true_vec = beta[M_est.selection_variable['variables']]
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        L, U = LU

        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):

    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        if not parametric:
            views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))
        else:
            views.append(glm_group_lasso_parametric(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        check_screen = True

        if nactive == s:
            return None

        # BUG: if this scalings code is moved after the decompose_subgradient,
        # code seems to run fine
        if scalings:  # try condition on some scalings
            for i in range(nviews):
                views[i].condition_on_scalings()
        if subgrad:
            for i in range(nviews):
                conditioning_groups = np.zeros(p, dtype=bool)
                conditioning_groups[:p // 2] = True
                marginalizing_groups = np.zeros(p, dtype=bool)
                marginalizing_groups[p // 2:] = True
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries,
                                                     bootstrap=False,
                                                     parametric=parametric)
        # reference=beta[active_union])

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        elif intervals == 'new':
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                  parameter=true_vec,
                                                                  sample=full_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
def test_multiple_queries_small(ndraw=10000, burnin=2000, nsim=None):
    # nsim needed for decorator

    s, n, p = 2, 100, 10

    randomizer = randomization.laplace((p,), scale=1)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0, snr=3)

    nonzero = np.where(beta)[0]
    lam_frac = .6

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)

    mv = multiple_queries([M_est])
    mv.solve()

    active_union = M_est.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        if nactive == s:
            return None

        active_set = np.nonzero(active_union)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0])
                                 if active_set[i] not in nonzero]

        if not I:
            return None

        inactive_indicators_mat = np.zeros((len(inactive_selected), nactive))
        j = 0
        for i in range(nactive):
            if active_set[i] not in nonzero:
                inactive_indicators_mat[j, i] = 1
                j += 1

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        boot_target, target_observed = pairs_bootstrap_glm(loss, active_union)
        inactive_target = lambda indices: boot_target(indices)[inactive_selected]
        inactive_observed = target_observed[inactive_selected]
        # param_cov = _parametric_cov_glm(loss, active_union)

        alpha_mat = set_alpha_matrix(loss, active_union)
        # target = target_alpha * alpha + reference vector
        target_alpha = np.dot(inactive_indicators_mat, alpha_mat)

        target_sampler = mv.setup_bootstrapped_target(inactive_target,
                                                      inactive_observed,
                                                      target_alpha)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              np.linalg.norm(inactive_observed),
                                              alternative='twosided',
                                              ndraw=ndraw,
                                              burnin=burnin)

        # testing the global null
        all_selected = np.arange(active_set.shape[0])
        target_gn = lambda indices: boot_target(indices)[:nactive]
        target_observed_gn = target_observed[:nactive]
        target_alpha_gn = alpha_mat

        target_sampler_gn = mv.setup_bootstrapped_target(target_gn,
                                                         target_observed_gn,
                                                         target_alpha_gn,
                                                         reference=beta[active_union])

        test_stat_boot_gn = lambda x: np.linalg.norm(x)
        observed_test_value = np.linalg.norm(target_observed_gn - beta[active_union])
        pval_gn = target_sampler_gn.hypothesis_test(test_stat_boot_gn,
                                                    observed_test_value,
                                                    alternative='twosided',
                                                    ndraw=ndraw,
                                                    burnin=burnin)

        return [pval, pval_gn], [False, False]
# NOTE: this redefines test_intervals from above; the two versions originate
# from different test modules.
def test_intervals(s=3,
                   n=200,
                   p=50,
                   snr=7,
                   rho=0.1,
                   split_frac=0.8,
                   lam_frac=0.7,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   intervals='new',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)

    nonzero = np.where(beta)[0]

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']
    nactive = np.sum(active_union)
    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)

        if intervals == 'old':
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=target_sampler.reference,
                                                        sample=target_sample)
        pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                          parameter=true_vec,
                                                          sample=target_sample)
        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     parameter=np.zeros_like(true_vec),
                                                     sample=target_sample)

        unpenalized_mle = restricted_Mest(loss,
                                          M_est1.selection_variable['variables'],
                                          solve_args=solve_args)

        L, U = LU.T

        covered = np.zeros(nactive, bool)
        naive_covered = np.zeros(nactive, bool)
        active_var = np.zeros(nactive, bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            active_var[j] = active_set[j] in nonzero

        return pivots_mle, pivots_truth, pvalues, covered, naive_covered, active_var
def test_approximate_ci(n=100,
                        p=10,
                        s=0,
                        snr=5,
                        rho=0.1,
                        lam_frac=1.,
                        loss='gaussian',
                        randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # active_bool = np.zeros(p, bool)
    # active_bool[range(3)] = 1
    # inactive_bool = ~active_bool

    GS = greedy_score_step_approx(loss,
                                  penalty,
                                  np.zeros(p, dtype=bool),
                                  np.ones(p, dtype=bool),
                                  randomization,
                                  randomizer)

    GS.solve_approx()
    active = GS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(GS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        toc = time.time()
        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            # print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

        print("confidence intervals", ci_active)
        tic = time.time()
        print('ci time now', tic - toc)
# NOTE: this redefines test_approximate_ci from above; the two versions
# originate from different test modules (greedy step vs. thresholded score).
def test_approximate_ci(n=200,
                        p=50,
                        s=0,
                        snr=5,
                        threshold=3.,
                        rho=0.1,
                        lam_frac=1.,
                        loss='gaussian',
                        randomizer='gaussian'):

    from selection.api import randomization

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    active_bool = np.zeros(p, bool)
    # active_bool[range(3)] = 1
    inactive_bool = ~active_bool

    TS = threshold_score_approx(loss,
                                threshold,
                                randomization,
                                active_bool,
                                inactive_bool,
                                randomizer)

    TS.solve_approx()
    active = TS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(TS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).intersection(set(true_support)) == set(true_support):

        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        class target_class(object):
            def __init__(self, target_cov):
                self.target_cov = target_cov
                self.shape = target_cov.shape

        target = target_class(TS.target_cov)
        ci_naive = naive_confidence_intervals(target, TS.target_observed)
        naive_pvals = naive_pvalues(target, TS.target_observed, true_vec)
        naive_covered = np.zeros(nactive)

        toc = time.time()
        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

            # naive ci
            if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                naive_covered[j] += 1

        tic = time.time()
        print('ci time now', tic - toc)

        return covered, ci_length, pivots, naive_covered, naive_pvals
def test_overall_null_two_queries(ndraw=10000, burnin=2000, nsim=None):
    # nsim needed for decorator

    s, n, p = 5, 200, 20

    randomizer = randomization.laplace((p,), scale=0.5)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W += np.arange(p) / 200
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est1.solve()
    bootstrap_score1 = M_est1.setup_sampler()

    # second randomization -- a greedy step from LASSO
    active = M_est1.selection_variable['variables']
    inactive = ~active
    inactive_randomizer = randomization.laplace((inactive.sum(),), scale=0.5)

    step = glm_greedy_step(loss, penalty, active, inactive, inactive_randomizer)
    step.solve()
    bootstrap_score2 = step.setup_sampler()

    # we take target to be the union of the two active sets
    active = M_est1.selection_variable['variables'] + step.selection_variable['variables']

    if set(nonzero).issubset(np.nonzero(active)[0]):

        boot_target, target_observed = pairs_bootstrap_glm(loss, active)

        # targets are all true null coefficients that were selected
        sampler = lambda: np.random.choice(n, size=(n,), replace=True)
        target_cov, cov1, cov2 = bootstrap_cov(sampler,
                                               boot_target,
                                               cross_terms=(bootstrap_score1, bootstrap_score2))

        active_set = np.nonzero(active)[0]
        inactive_selected = I = [i for i in np.arange(active_set.shape[0])
                                 if active_set[i] not in nonzero]
        if not I:
            return None

        # is it enough only to bootstrap the inactive ones?
        # seems so...

        A1, b1 = M_est1.linear_decomposition(cov1[I],
                                             target_cov[I][:, I],
                                             target_observed[I])
        A2, b2 = step.linear_decomposition(cov2[I],
                                           target_cov[I][:, I],
                                           target_observed[I])

        target_inv_cov = np.linalg.inv(target_cov[I][:, I])

        initial_state = np.hstack([target_observed[I],
                                   M_est1.observed_opt_state,
                                   step.observed_opt_state])

        ntarget = len(I)
        target_slice = slice(0, ntarget)
        opt_slice1 = slice(ntarget, p + ntarget)
        opt_slice2 = slice(p + ntarget, 2 * p + ntarget)

        def target_gradient(state):
            # with many samplers, we will add up the `target_slice` component
            # of many target_grads and only once do the Gaussian addition of
            # full_grad
            target = state[target_slice]
            opt_state1 = state[opt_slice1]
            opt_state2 = state[opt_slice2]

            target_grad1 = M_est1.randomization_gradient(target, (A1, b1), opt_state1)
            target_grad2 = step.randomization_gradient(target, (A2, b2), opt_state2)

            full_grad = np.zeros_like(state)
            full_grad[opt_slice1] = -target_grad1[1]
            full_grad[opt_slice2] = -target_grad2[1]
            full_grad[target_slice] -= target_grad1[0] + target_grad2[0]
            full_grad[target_slice] -= target_inv_cov.dot(target)

            return full_grad

        def target_projection(state):
            opt_state1 = state[opt_slice1]
            state[opt_slice1] = M_est1.projection(opt_state1)
            opt_state2 = state[opt_slice2]
            state[opt_slice2] = step.projection(opt_state2)
            return state

        target_langevin = projected_langevin(initial_state,
                                             target_gradient,
                                             target_projection,
                                             .5 / (2 * p + 1))

        samples = []
        for i in range(ndraw + burnin):
            next(target_langevin)
            if i >= burnin:
                samples.append(target_langevin.state[target_slice].copy())

        test_stat = lambda x: np.linalg.norm(x)
        observed = test_stat(target_observed[I])
        sample_test_stat = np.array([test_stat(x) for x in samples])

        family = discrete_family(sample_test_stat, np.ones_like(sample_test_stat))
        pval = family.ccdf(0, observed)
        return pval, False
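# The sampler above relies on projected Langevin dynamics: each step moves
# along the log-density gradient, adds Gaussian noise, and projects the state
# back onto the feasible set. A self-contained sketch of just the update rule
# (illustrative; the package's projected_langevin class adds more machinery):
def projected_langevin_sketch(state, grad_fn, proj_fn, stepsize, nstep=1000):
    rng = np.random.default_rng()
    for _ in range(nstep):
        noise = rng.standard_normal(state.shape)
        # gradient step + injected noise, then projection
        state = proj_fn(state + stepsize * grad_fn(state)
                        + np.sqrt(2 * stepsize) * noise)
        yield state.copy()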
def test_threshold_score(ndraw=10000, burnin=2000, nsim=None):
    # nsim needed for decorator

    s, n, p = 5, 200, 20
    threshold = 0.5

    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, signal=7)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)

    active_bool = np.zeros(p, bool)
    active_bool[range(3)] = 1
    inactive_bool = ~active_bool

    randomizer = randomization.laplace((inactive_bool.sum(),), scale=0.5)

    # threshold the score
    thresh = glm_threshold_score(loss,
                                 threshold,
                                 randomizer,
                                 active_bool,
                                 inactive_bool)

    mv = multiple_queries([thresh])
    mv.solve()

    boundary = thresh.selection_variable['boundary_set']
    new_active = np.nonzero(np.arange(3, 20)[boundary])[0]
    active_set = np.array(sorted(set(range(3)).union(new_active)))

    if set(nonzero).issubset(active_set):

        full_active = np.zeros(p, bool)
        full_active[active_set] = 1
        nactive = active_set.shape[0]

        inactive_selected = I = [i for i in np.arange(active_set.shape[0])
                                 if active_set[i] not in nonzero]
        if not I:
            return None

        inactive_indicators_mat = np.zeros((len(inactive_selected), nactive))
        j = 0
        for i in range(nactive):
            if active_set[i] not in nonzero:
                inactive_indicators_mat[j, i] = 1
                j += 1

        form_covariances = glm_nonparametric_bootstrap(n, n)
        mv.setup_sampler(form_covariances)

        boot_target, target_observed = pairs_bootstrap_glm(loss, full_active)
        inactive_target = lambda indices: boot_target(indices)[inactive_selected]
        inactive_observed = target_observed[inactive_selected]
        # param_cov = _parametric_cov_glm(loss, active_union)

        target_sampler = mv.setup_target(inactive_target, inactive_observed)

        test_stat = lambda x: np.linalg.norm(x)
        pval = target_sampler.hypothesis_test(test_stat,
                                              np.linalg.norm(inactive_observed),
                                              alternative='twosided',
                                              ndraw=ndraw,
                                              burnin=burnin)

        print(pval)
        return pval, False
def test_nonrandomized(s=0,
                       n=200,
                       p=10,
                       signal=7,
                       rho=0,
                       lam_frac=0.8,
                       loss='gaussian',
                       solve_args={'min_its': 20, 'tol': 1.e-10}):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    print("lam", lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    true_vec = beta

    M_est = M_estimator(lam, loss, penalty)
    M_est.solve()

    active = M_est._overall
    nactive = np.sum(active)
    print("nactive", nactive)
    if nactive == 0:
        return None

    # score_mean = M_est.observed_internal_state.copy()
    # score_mean[nactive:] = 0
    M_est.setup_sampler(score_mean=np.zeros(p))
    # M_est.setup_sampler(score_mean=score_mean)
    # M_est.sample(ndraw=1000, burnin=1000, stepsize=1./p)

    if set(nonzero).issubset(np.nonzero(active)[0]):
        check_screen = True

        # test_stat = lambda x: np.linalg.norm(x)
        # return M_est.hypothesis_test(test_stat, test_stat(M_est.observed_internal_state), stepsize=1./p)

        ci = M_est.confidence_intervals(M_est.observed_internal_state)
        pivots = M_est.coefficient_pvalues(M_est.observed_internal_state)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered = coverage(ci)[0]
        # print(pivots)
        # print(coverage)
        return pivots, covered
def test_split_compare(ndraw=20000,
                       burnin=10000,
                       solve_args={'min_its': 50, 'tol': 1.e-10},
                       check_screen=True):

    # s, n, p = 0, 200, 10
    s, n, p = 6, 300, 40

    randomizer = randomization.laplace((p,), scale=1.)
    X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=0.1, snr=5)

    nonzero = np.where(beta)[0]
    lam_frac = 1.

    loss = rr.glm.logistic(X, y)
    epsilon = 1.

    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    m = int(0.8 * n)

    # first randomization
    M_est1 = split_glm_group_lasso(loss, epsilon, m, penalty)
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])
    mv = multiple_queries([M_est1])
    mv.solve()

    active_union = M_est1.selection_variable['variables']  # + M_est2.selection_variable['variables']
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    leftout_indices = M_est1.randomized_loss.saturated_loss.case_weights == 0

    screen = set(nonzero).issubset(np.nonzero(active_union)[0])

    if check_screen and not screen:
        return None

    if True:
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        # bootstrap
        target_sampler_boot, target_observed = glm_target(loss,
                                                          active_union,
                                                          mv,
                                                          bootstrap=True)
        target_sample_boot = target_sampler_boot.sample(ndraw=ndraw, burnin=burnin)
        LU_boot = target_sampler_boot.confidence_intervals(target_observed,
                                                           sample=target_sample_boot)
        pivots_boot = target_sampler_boot.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample_boot)

        # CLT plugin
        target_sampler, _ = glm_target(loss, active_union, mv)
        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)

        if X.shape[0] - leftout_indices.sum() > nactive:
            LU_split = standard_ci(X, y, active_union, leftout_indices)
            LU_split_sm = standard_ci_sm(X, y, active_union, leftout_indices)
        else:
            LU_split = LU_split_sm = np.ones((nactive, 2)) * np.nan

        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=true_vec,
                                                    sample=target_sample)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = np.nan
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        covered_boot, ci_length_boot = coverage(LU_boot)
        covered_split, ci_length_split = coverage(LU_split)
        covered_naive, ci_length_naive = coverage(LU_naive)

        active_var = np.zeros(nactive, bool)
        for j in range(nactive):
            active_var[j] = active_set[j] in nonzero

        return pivots, pivots_boot, covered, ci_length, covered_boot, ci_length_boot, \
            covered_split, ci_length_split, active_var, covered_naive, ci_length_naive