def choose_lambda_CVR(self, scale1=None, scale2=None, loss=None):
    """
    Minimizes the CV error curve with additive randomization
    (CVR = CV + R1 + R2 = CV1 + R2).
    """
    if loss is None:
        loss = copy.copy(self.loss)

    CV_curve = []
    X, _ = loss.data
    p = X.shape[1]
    for lam in self.lam_seq:
        penalty = rr.l1norm(p, lagrange=lam)
        # CV_curve.append(self.CV_err(penalty, loss) + (lam,))
        CV_curve.append(self.CV_err(penalty, loss))

    CV_curve = np.array(CV_curve)
    rv1, rv2 = np.zeros(self.lam_seq.shape[0]), np.zeros(self.lam_seq.shape[0])

    if scale1 is not None:
        randomization1 = randomization.isotropic_gaussian((self.lam_seq.shape[0],), scale=scale1)
        rv1 = np.asarray(randomization1._sampler(size=(1,)))
    if scale2 is not None:
        randomization2 = randomization.isotropic_gaussian((self.lam_seq.shape[0],), scale=scale2)
        rv2 = np.asarray(randomization2._sampler(size=(1,)))

    CVR_val = CV_curve[:, 0] + rv1.flatten() + rv2.flatten()
    lam_CVR = self.lam_seq[np.argmin(CVR_val)]  # lam_CVR minimizes CVR
    CV1_val = CV_curve[:, 0] + rv1.flatten()

    SD = CV_curve[:, 1]
    return lam_CVR, SD, CVR_val, CV1_val, self.lam_seq
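# A standalone sketch of the CVR rule above, using only NumPy: add two
# independent Gaussian perturbations to a cross-validation error curve and
# minimize the perturbed curve. The helper name and the toy curve are
# illustrative, not part of the CV_view API.
def choose_lambda_cvr_sketch(lam_seq, cv_curve, scale1=0.1, scale2=0.2):
    rv1 = scale1 * np.random.standard_normal(lam_seq.shape[0])  # randomization R1
    rv2 = scale2 * np.random.standard_normal(lam_seq.shape[0])  # randomization R2
    cvr_val = cv_curve + rv1 + rv2                              # CVR = CV + R1 + R2
    return lam_seq[np.argmin(cvr_val)], cvr_val

# toy usage: a synthetic convex CV curve minimized near lam = 1
# lam_grid = np.linspace(0.1, 2., 20)
# lam_cvr, _ = choose_lambda_cvr_sketch(lam_grid, (lam_grid - 1.) ** 2)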
def test_approximate_mle(n=100, p=10, s=3, snr=5, rho=0.1, lam_frac=1.,
                         loss='gaussian', randomizer='gaussian'):

    from selection.api import randomization
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer)
    M_est.solve_approx()

    inf = approximate_conditional_density(M_est)
    inf.solve_approx()

    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).issuperset(true_support):
        mle_active = np.zeros(nactive)
        for j in range(nactive):
            mle_active[j] = inf.approx_MLE_solver(j, nstep=100)[0]
        print("mle for target", mle_active)
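# The lambda used above (and throughout these tests) follows the common
# Monte Carlo calibration
#     lam = lam_frac * E[ max_j |X^T eps|_j ] * sigma,   eps ~ N(0, I),
# estimated with 2000 standard-normal draws. A standalone sketch, with the
# helper name ours:
def theoretical_lambda_sketch(X, sigma=1., lam_frac=1., ndraws=2000):
    n = X.shape[0]
    Z = np.random.standard_normal((n, ndraws))
    # max over coordinates of |X^T Z| for each draw, averaged over draws
    return lam_frac * np.mean(np.fabs(np.dot(X.T, Z)).max(0)) * sigma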
def test_selection():
    n = 500
    p = 100
    s = 0
    signal = 0.

    np.random.seed(3)  # fix the seed so the generated instance (and y) is reproducible
    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, sigma=1., rho=0, signal=signal)
    lam = 1. * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma

    n, p = X.shape

    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    randomizer = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomizer, 'gaussian', 'parametric')
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma ** 2

    generative_mean = np.zeros(p)
    generative_mean[:nactive] = M_est.initial_soln[active]
    sel_split = selection_probability_random_lasso(M_est, generative_mean)
    min_result = sel_split.minimize2(nstep=200)  # renamed from `min` to avoid shadowing the builtin
    print(min_result[0], min_result[1])

    test_point = np.append(M_est.observed_score_state,
                           np.abs(M_est.initial_soln[M_est._overall]))
    print("value of likelihood",
          sel_split.likelihood_loss.smooth_objective(test_point, mode="func"))

    inv_cov = np.linalg.inv(M_est.score_cov)
    lik = (M_est.observed_score_state - generative_mean).T.dot(inv_cov)\
              .dot(M_est.observed_score_state - generative_mean) / 2.
    print("value of likelihood check", lik)

    grad = inv_cov.dot(M_est.observed_score_state - generative_mean)
    print("grad at likelihood loss", grad)
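# A plain-NumPy check of the two quantities test_selection() prints: for a
# Gaussian score with covariance Sigma, the likelihood term at x with mean mu
# is (x - mu)^T Sigma^{-1} (x - mu) / 2 and its gradient is Sigma^{-1}(x - mu).
# All names below are illustrative stand-ins, not library objects.
def check_gaussian_likelihood_grad(k=4, eps=1.e-6):
    A = np.random.standard_normal((k, k))
    Sigma = A.dot(A.T) + np.identity(k)   # a generic positive-definite covariance
    x = np.random.standard_normal(k)      # stand-in for observed_score_state
    mu = np.zeros(k)                      # stand-in for generative_mean

    inv_cov = np.linalg.inv(Sigma)
    lik = (x - mu).dot(inv_cov).dot(x - mu) / 2.
    grad = inv_cov.dot(x - mu)

    # finite-difference check of the gradient
    fd = np.zeros(k)
    for i in range(k):
        xp = x.copy()
        xp[i] += eps
        fd[i] = ((xp - mu).dot(inv_cov).dot(xp - mu) / 2. - lik) / eps
    assert np.allclose(grad, fd, atol=1.e-4)
    return lik, grad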
def test_without_screening(s=10,
                           n=300,
                           p=100,
                           rho=0.,
                           signal=3.5,
                           lam_frac=1.,
                           ndraw=10000,
                           burnin=2000,
                           loss='gaussian',
                           randomizer='laplace',
                           randomizer_scale=1.,
                           scalings=False,
                           subgrad=True,
                           check_screen=False):

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1,
                                                       random_signs=False)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
        X_indep, y_indep, _, _, _ = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                      signal=signal, sigma=1)
        loss_indep = rr.glm.gaussian(X_indep, y_indep)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        X_indep, y_indep, _, _ = logistic_instance(n=n, p=p, s=s, rho=rho,
                                                   signal=signal, random_signs=False)
        loss_indep = rr.glm.logistic(X_indep, y_indep)
    nonzero = np.where(beta)[0]

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=randomizer_scale)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    M_est = glm_group_lasso(loss, epsilon, penalty, randomizer)
    M_est.solve()

    active_union = M_est._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    active_set = np.nonzero(active_union)[0]
    print("active set", active_set)
    print("true nonzero", np.nonzero(beta)[0])

    views = [M_est]
    queries = multiple_queries(views)
    queries.solve()

    screened = set(nonzero).issubset(np.nonzero(active_union)[0])

    if not check_screen or screened:

        # if nactive == s:
        #     return None

        if scalings:  # try condition on some scalings
            M_est.condition_on_subgradient()
            M_est.condition_on_scalings()
        if subgrad:
            M_est.decompose_subgradient(conditioning_groups=np.zeros(p, dtype=bool),
                                        marginalizing_groups=np.ones(p, bool))

        boot_target1, boot_target_observed1 = pairs_bootstrap_glm(loss, active_union,
                                                                  inactive=~active_union)
        boot_target2, boot_target_observed2 = pairs_bootstrap_glm(loss_indep, active_union,
                                                                  inactive=~active_union)
        target_observed = (boot_target_observed1 - boot_target_observed2)[:nactive]

        def _target(indices):
            return boot_target1(indices)[:nactive] - boot_target2(indices)[:nactive]

        form_covariances = glm_nonparametric_bootstrap(n, n)
        queries.setup_sampler(form_covariances)
        queries.setup_opt_state()

        target_sampler = queries.setup_target(_target,
                                              target_observed,
                                              reference=target_observed)

        target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
        LU = target_sampler.confidence_intervals(target_observed,
                                                 sample=target_sample,
                                                 level=0.9)
        pivots = target_sampler.coefficient_pvalues(target_observed,
                                                    parameter=np.zeros(nactive),
                                                    sample=target_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        true_vec = np.zeros(nactive)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                    covered[j] = 1
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
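# The per-coordinate coverage loop above recurs throughout these tests. Under
# the assumption that LU is an (nactive, 2) array of lower/upper endpoints, it
# can be written in vectorized form; the helper name is ours.
def interval_coverage(LU, truth):
    L, U = LU[:, 0], LU[:, 1]
    covered = ((L <= truth) & (truth <= U)).astype(float)  # 1. if truth in [L, U]
    ci_length = U - L
    return covered, ci_length

# usage: covered, ci_length = interval_coverage(LU, np.zeros(nactive))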
def randomized_marginal_lasso_screening(X, y, beta, sigma):
    from selection.api import randomization
    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    Z_stats = X.T.dot(y)
    randomized_Z_stats = np.true_divide(Z_stats, sigma) + random_Z

    active_1 = np.zeros(p, bool)
    active_1[np.fabs(randomized_Z_stats) > 2.33] = 1
    active_signs_1 = np.sign(randomized_Z_stats[active_1])
    nactive_1 = active_1.sum()
    threshold = 2.33 * np.ones(p)
    # print("active_1", active_1, nactive_1)

    X_step2 = X[:, active_1]
    random_Z_2 = np.random.standard_normal(nactive_1)
    sel = selection(X_step2, y, random_Z_2)
    lam, epsilon, active_2, betaE, cube, initial_soln = sel

    noise_variance = 1.
    lagrange = lam * np.ones(nactive_1)
    nactive_2 = betaE.shape[0]
    # print("active_2", active_2, nactive_2)
    active_signs_2 = np.sign(betaE)

    # getting the active indices after both stages
    active = np.zeros(p, bool)
    indices_stage2 = np.where(active_1 == 1)[0]
    active[indices_stage2[active_2]] = 1
    nactive = active.sum()
    print("the active indices after two stages of screening", active.sum())

    primal_feasible_1 = np.fabs(randomized_Z_stats[active_1])
    primal_feasible_2 = np.fabs(betaE)
    feasible_point = np.append(primal_feasible_1, primal_feasible_2)

    randomizer = randomization.isotropic_gaussian((p,), 1.)

    generative_X = X_step2[:, active_2]
    prior_variance = 1000.

    projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
    M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
    M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
    M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
    post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)
    # print("observed data", post_mean)
    post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)
    unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                      post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

    grad_map = sel_prob_gradient_map_ms_lasso(X,
                                              feasible_point,  # in R^{|E|_1 + |E|_2}
                                              active_1,        # active set chosen by randomized marginal screening
                                              active_2,        # active set chosen by randomized lasso
                                              active_signs_1,  # signs of active coordinates chosen by ms
                                              active_signs_2,  # signs of active coordinates chosen by lasso
                                              lagrange,        # in R^p
                                              threshold,       # in R^p
                                              generative_X,    # in R^{p} x R^{n}
                                              noise_variance,
                                              randomizer,
                                              epsilon)

    ms = selective_map_credible_ms_lasso(y, grad_map, prior_variance)
    samples = ms.posterior_samples()
    adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                    np.percentile(samples, 95, axis=0)])
    selective_mean = np.mean(samples, axis=0)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    true_val = projection_active.T.dot(X.dot(beta))
    for l in range(nactive):
        if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / nactive

    return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
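# The unadjusted (conjugate Gaussian) posterior block above is repeated
# verbatim in several of these trials. A minimal standalone version, with the
# helper name ours and only NumPy assumed:
def unadjusted_posterior(X, y, active, noise_variance, prior_variance=1000., z=1.65):
    # posterior of the projected target under a N(0, prior_variance) prior,
    # exactly as computed inline in the trials above
    proj = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
    n = X.shape[0]
    M_1 = prior_variance * X.dot(X.T) + noise_variance * np.identity(n)
    M_2 = prior_variance * X.dot(X.T).dot(proj)
    M_3 = prior_variance * proj.T.dot(X.dot(X.T)).dot(proj)
    M_1_inv = np.linalg.inv(M_1)
    post_mean = M_2.T.dot(M_1_inv).dot(y)
    post_var = M_3 - M_2.T.dot(M_1_inv).dot(M_2)
    half_width = z * np.sqrt(post_var.diagonal())
    intervals = np.vstack([post_mean - half_width, post_mean + half_width])
    return post_mean, post_var, intervals

# usage: post_mean, post_var, unadjusted_intervals = \
#            unadjusted_posterior(X, y, active, sigma ** 2)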
def test_intervals(s=0,
                   n=200,
                   p=10,
                   signal=7,
                   rho=0.,
                   lam_frac=6.,
                   ndraw=10000,
                   burnin=2000,
                   bootstrap=True,
                   loss='gaussian',
                   intervals='old',
                   randomizer='laplace',
                   solve_args={'min_its': 50, 'tol': 1.e-10}):

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=1.)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=1.)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    nonzero = np.where(beta)[0]
    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    groups = np.concatenate([np.arange(10) for i in range(p // 10)])  # integer division for Python 3
    # print(groups)
    # groups = np.arange(p)
    penalty = rr.group_lasso(groups,
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # first randomization
    M_est1 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    mv = multiple_queries([M_est1])
    # second randomization
    # M_est2 = glm_group_lasso(loss, epsilon, penalty, randomizer)
    # mv = multiple_queries([M_est1, M_est2])

    mv.solve()

    active_union = M_est1.selection_variable['variables']
    print("active set", np.nonzero(active_union)[0])
    nactive = np.sum(active_union)

    if nactive == 0:
        return None

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues(target_observed,
                                                            parameter=target_sampler.reference,
                                                            sample=target_sample)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_mle = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                      parameter=target_sampler.reference,
                                                                      sample=full_sample)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        L, U = LU.T

        ci_length_sel = np.zeros(nactive)
        covered = np.zeros(nactive, np.bool)
        naive_covered = np.zeros(nactive, np.bool)
        ci_length_naive = np.zeros(nactive)
        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                covered[j] = 1
            ci_length_sel[j] = U[j] - L[j]
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            ci_length_naive[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots_mle, pivots_truth, pvalues, covered, ci_length_sel, \
               naive_pvals, naive_covered, ci_length_naive, active_var
def randomized_marginal_screening(X, y, beta, sigma):
    from selection.api import randomization
    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    Z_stats = X.T.dot(y)
    randomized_Z_stats = np.true_divide(Z_stats, sigma) + random_Z

    active = np.zeros(p, bool)
    active[np.fabs(randomized_Z_stats) > 2.33] = 1
    active_signs = np.sign(randomized_Z_stats[active])
    nactive = active.sum()
    threshold = 2.33 * np.ones(p)

    if nactive >= 1:
        feasible_point = np.fabs(randomized_Z_stats[active])
        noise_variance = sigma ** 2
        randomizer = randomization.isotropic_gaussian((p,), 1.)
        generative_X = X[:, active]
        prior_variance = 1000.

        grad_map = sel_prob_gradient_map_ms(X,
                                            feasible_point,
                                            active,
                                            active_signs,
                                            threshold,
                                            generative_X,
                                            noise_variance,
                                            randomizer)

        inf = selective_inf_ms(y, grad_map, prior_variance)
        samples = inf.posterior_samples()
        adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                        np.percentile(samples, 95, axis=0)])

        projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
        M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
        M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
        M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
        post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)
        print("observed data", post_mean)
        post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)
        unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                          post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

        coverage_ad = np.zeros(nactive)
        coverage_unad = np.zeros(nactive)
        true_val = projection_active.T.dot(X.dot(beta))
        active_set = [i for i in range(p) if active[i]]

        for l in range(nactive):
            if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
                coverage_ad[l] += 1
            if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
                coverage_unad[l] += 1

        sel_cov = coverage_ad.sum() / nactive
        naive_cov = coverage_unad.sum() / nactive

        return sel_cov, naive_cov
    else:
        return None
def hiv_inference_test():
    if not os.path.exists("NRTI_DATA.txt"):
        NRTI = pandas.read_table(
            "http://hivdb.stanford.edu/pages/published_analysis/genophenoPNAS2006/DATA/NRTI_DATA.txt",
            na_values="NA")
    else:
        NRTI = pandas.read_table("NRTI_DATA.txt")

    NRTI_specific = []
    NRTI_muts = []
    for i in range(1, 241):
        d = NRTI['P%d' % i]
        for mut in np.unique(d):
            if mut not in ['-', '.'] and len(mut) == 1:
                test = np.equal(d, mut)
                if test.sum() > 10:
                    NRTI_specific.append(np.array(np.equal(d, mut)))
                    NRTI_muts.append("P%d%s" % (i, mut))

    NRTI_specific = pandas.DataFrame.from_records(np.array(NRTI_specific).T,
                                                  columns=NRTI_muts)

    X_NRTI = np.array(NRTI_specific, np.float)
    Y = NRTI['3TC']  # shorthand
    keep = ~np.isnan(Y).astype(np.bool)
    X_NRTI = X_NRTI[np.nonzero(keep)]
    Y = Y[keep]
    Y = np.array(np.log(Y), np.float)
    Y -= Y.mean()
    X_NRTI -= X_NRTI.mean(0)[None, :]
    X_NRTI /= X_NRTI.std(0)[None, :]
    X = X_NRTI  # shorthand
    n, p = X.shape
    X /= np.sqrt(n)

    ols_fit = sm.OLS(Y, X).fit()
    sigma_3TC = np.linalg.norm(ols_fit.resid) / np.sqrt(n - p - 1)

    lam_frac = 1.
    loss = rr.glm.gaussian(X, Y)
    epsilon = 1. / np.sqrt(n)
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma_3TC
    print(lam)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    from selection.api import randomization
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    # change grid for parameter for HIV data
    M_est = M_estimator_map(loss, epsilon, penalty, randomization, randomization_scale=0.7)
    M_est.solve_approx()
    active = M_est._overall
    nactive = np.sum(active)

    ci_active = np.zeros((nactive, 2))
    ci_length = np.zeros(nactive)
    mle_active = np.zeros((nactive, 1))

    ci = approximate_conditional_density(M_est)
    ci.solve_approx()

    class target_class(object):
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    ci_naive = naive_confidence_intervals(target, M_est.target_observed)

    for j in range(nactive):
        ci_active[j, :] = np.array(ci.approximate_ci(j))
        ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
        mle_active[j, :] = ci.approx_MLE_solver(j, nstep=100)[0]

    unadjusted_mle = np.zeros((nactive, 1))
    for j in range(nactive):
        unadjusted_mle[j, :] = ci.target_observed[j]

    adjusted_intervals = np.hstack([mle_active, ci_active]).T
    unadjusted_intervals = np.hstack([unadjusted_mle, ci_naive]).T

    print("adjusted confidence", adjusted_intervals)
    print("naive confidence", unadjusted_intervals)

    intervals = np.vstack([unadjusted_intervals, adjusted_intervals])
    return intervals
def randomized_forward_step(X, y, beta, sigma):
    from selection.api import randomization
    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    Z_stats = X.T.dot(y)
    random_obs = X.T.dot(y) + random_Z

    active_index = np.argmax(np.fabs(random_obs))
    active = np.zeros(p, bool)
    active[active_index] = 1
    active_sign = np.sign(random_obs[active_index])
    print("observed statistic", random_obs[active_index], Z_stats[active_index])
    print("first step--chosen index and sign", active_index, active_sign)

    feasible_point = np.fabs(random_obs[active_index])
    noise_variance = sigma ** 2
    randomizer = randomization.isotropic_gaussian((p,), 1.)
    generative_X = X[:, active]
    prior_variance = 1000.

    grad_map = sel_prob_gradient_map_fs(X,
                                        feasible_point,
                                        active,
                                        active_sign,
                                        generative_X,
                                        noise_variance,
                                        randomizer)

    inf = selective_map_credible_fs(y, grad_map, prior_variance)
    samples = inf.posterior_samples()
    adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                    np.percentile(samples, 95, axis=0)])
    selective_mean = np.mean(samples, axis=0)

    projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
    M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
    M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
    M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
    post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)
    print("observed data", post_mean)
    post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)
    unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                      post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

    coverage_ad = np.zeros(1)
    coverage_unad = np.zeros(1)
    ad_length = np.zeros(1)
    unad_length = np.zeros(1)

    true_val = projection_active.T.dot(X.dot(beta))
    if (adjusted_intervals[0, 0] <= true_val[0]) and (true_val[0] <= adjusted_intervals[1, 0]):
        coverage_ad[0] += 1
    ad_length[0] = adjusted_intervals[1, 0] - adjusted_intervals[0, 0]
    if (unadjusted_intervals[0, 0] <= true_val[0]) and (true_val[0] <= unadjusted_intervals[1, 0]):
        coverage_unad[0] += 1
    unad_length[0] = unadjusted_intervals[1, 0] - unadjusted_intervals[0, 0]

    sel_cov = coverage_ad.sum() / 1.
    naive_cov = coverage_unad.sum() / 1.
    ad_len = ad_length.sum() / 1.
    unad_len = unad_length.sum() / 1.
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / 1.
    bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / 1.

    return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
def randomized_lasso_trial(X, y, beta, sigma, lam,
                           loss='gaussian',
                           randomizer='gaussian',
                           estimation='parametric'):

    from selection.api import randomization
    n, p = X.shape

    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer, estimation)
    M_est.solve_approx()
    active = M_est._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    prior_variance = 1000.
    noise_variance = sigma ** 2
    projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
    M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
    M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
    M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
    post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)
    print("observed data", post_mean)
    post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)
    unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                      post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

    # generative_mean = np.zeros(p)
    # sel_split = selection_probability_random_lasso(M_est, generative_mean)
    # test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))
    # print("gradient at test point", sel_split.smooth_objective(test_point, mode="grad"))

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()
    adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                    np.percentile(samples, 95, axis=0)])
    selective_mean = np.mean(samples, axis=0)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    true_val = projection_active.T.dot(X.dot(beta))
    for l in range(nactive):
        if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / nactive

    return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
def test_cv(n=100,
            p=50,
            s=5,
            signal=7.5,
            K=5,
            rho=0.,
            randomizer='gaussian',
            randomizer_scale=1.,
            scale1=0.1,
            scale2=0.2,
            lam_frac=1.,
            glmnet=True,
            loss='gaussian',
            intervals='old',
            bootstrap=False,
            condition_on_CVR=True,
            marginalize_subgrad=True,
            ndraw=10000,
            burnin=2000,
            nboot=-1):  # non-positive leaves the bootstrap replication defaults untouched

    print(n, p, s, condition_on_CVR, scale1, scale2)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        glm_loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        glm_loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)

    # view 1
    cv = CV_view(glm_loss,
                 loss_label=loss,
                 lasso_randomization=randomizer,
                 epsilon=epsilon,
                 scale1=scale1,
                 scale2=scale2)
    if glmnet:
        try:
            cv.solve(glmnet=glmnet)
        except ImportError:
            cv.solve(glmnet=False)
    else:
        cv.solve(glmnet=False)

    # for the test, make sure we also run the pure-python code path
    cv_py = CV_view(glm_loss,
                    loss_label=loss,
                    lasso_randomization=randomizer,
                    epsilon=epsilon,
                    scale1=scale1,
                    scale2=scale2)
    cv_py.solve(glmnet=False)

    lam = cv.lam_CVR
    print("lam", lam)

    if condition_on_CVR:
        cv.condition_on_opt_state()
        lam = cv.one_SD_rule(direction="up")
        print("new lam", lam)

    # non-randomized lasso, just to see how many variables it selects
    problem = rr.simple_problem(glm_loss, rr.l1norm(p, lagrange=lam))
    beta_hat = problem.solve()
    active_hat = beta_hat != 0
    print("non-randomized lasso", active_hat.sum())

    # view 2
    W = lam_frac * np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    M_est1 = glm_group_lasso(glm_loss, epsilon, penalty, randomizer)

    if nboot > 0:
        cv.nboot = M_est1.nboot = nboot

    mv = multiple_queries([cv, M_est1])
    mv.solve()

    active_union = M_est1._overall
    nactive = np.sum(active_union)
    print("nactive", nactive)
    if nactive == 0:
        return None

    nonzero = np.where(beta)[0]
    if set(nonzero).issubset(np.nonzero(active_union)[0]):

        active_set = np.nonzero(active_union)[0]
        true_vec = beta[active_union]

        if marginalize_subgrad:
            M_est1.decompose_subgradient(conditioning_groups=np.zeros(p, bool),
                                         marginalizing_groups=np.ones(p, bool))

        target_sampler, target_observed = glm_target(glm_loss,
                                                     active_union,
                                                     mv,
                                                     bootstrap=bootstrap)

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues(target_observed,
                                                              parameter=true_vec,
                                                              sample=target_sample)
            pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                         parameter=np.zeros_like(true_vec),
                                                         sample=target_sample)
        else:
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots_truth = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                        parameter=true_vec,
                                                                        sample=full_sample)
            pvalues = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                   parameter=np.zeros_like(true_vec),
                                                                   sample=full_sample)

        L, U = LU.T
        sel_covered = np.zeros(nactive, np.bool)
        sel_length = np.zeros(nactive)

        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        naive_covered = np.zeros(nactive, np.bool)
        naive_length = np.zeros(nactive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        active_var = np.zeros(nactive, np.bool)

        for j in range(nactive):
            if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                sel_covered[j] = 1
            if (LU_naive[j, 0] <= true_vec[j]) and (LU_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1
            sel_length[j] = U[j] - L[j]
            naive_length[j] = LU_naive[j, 1] - LU_naive[j, 0]
            active_var[j] = active_set[j] in nonzero

        q = 0.2
        BH_decisions = multipletests(pvalues, alpha=q, method="fdr_bh")[0]
        return (pivots_truth, sel_covered, sel_length, naive_pvals,
                naive_covered, naive_length, active_var, BH_decisions, active_var)
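# A toy illustration of the BH step above: multipletests(..., method="fdr_bh")
# from statsmodels returns a tuple whose first element holds the boolean
# reject decisions at level alpha. (Newer statsmodels releases expose it as
# statsmodels.stats.multitest.multipletests.)
def bh_decisions_example(q=0.2):
    pvals = np.array([0.001, 0.2, 0.04, 0.9])
    reject = multipletests(pvals, alpha=q, method="fdr_bh")[0]
    return reject  # array([True, False, True, False]) for this toy input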
def approximate_inference(X, y, beta, sigma,
                          seed_n=0,
                          lam_frac=1.,
                          loss='gaussian',
                          randomization_scale=1.):

    from selection.api import randomization
    n, p = X.shape
    np.random.seed(seed_n)

    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    randomization = randomization.isotropic_gaussian((p,), scale=randomization_scale)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    GS = greedy_score_map(loss,
                          penalty,
                          np.zeros(p, dtype=bool),
                          np.ones(p, dtype=bool),
                          randomization,
                          randomization_scale)

    GS.solve_approx()
    active = GS._overall
    nactive = np.sum(active)

    if nactive == 0:
        return None
    else:
        active_set = np.asarray([i for i in range(p) if active[i]])
        s = beta.sum()  # treats the sum of beta as the sparsity (assumes 0/1 signals)
        true_support = np.asarray([i for i in range(p) if i < s])
        true_vec = beta[active]

        if set(active_set).issuperset(true_support):
            ci = approximate_conditional_density(GS)
            ci.solve_approx()

            sys.stderr.write("True target to be covered" + str(true_vec) + "\n")

            class target_class(object):
                def __init__(self, target_cov):
                    self.target_cov = target_cov
                    self.shape = target_cov.shape

            target = target_class(GS.target_cov)
            ci_naive = naive_confidence_intervals(target, GS.target_observed)
            naive_covered = np.zeros(nactive)
            naive_risk = np.zeros(nactive)

            ci_sel = np.zeros((nactive, 2))
            sel_MLE = np.zeros(nactive)
            sel_length = np.zeros(nactive)

            for j in range(nactive):
                ci_sel[j, :] = np.array(ci.approximate_ci(j))
                sel_MLE[j] = ci.approx_MLE_solver(j, step=1, nstep=150)[0]
                sel_length[j] = ci_sel[j, 1] - ci_sel[j, 0]

            sel_covered = np.zeros(nactive, np.bool)
            sel_risk = np.zeros(nactive)

            for j in range(nactive):
                sel_risk[j] = (sel_MLE[j] - true_vec[j]) ** 2.
                naive_risk[j] = (GS.target_observed[j] - true_vec[j]) ** 2.
                if (ci_sel[j, 0] <= true_vec[j]) and (ci_sel[j, 1] >= true_vec[j]):
                    sel_covered[j] = 1
                if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                    naive_covered[j] = 1

            print("lengths", sel_length.sum() / nactive)
            print("selective intervals", ci_sel.T)
            print("risks", sel_risk.sum() / nactive)

            return np.transpose(np.vstack((ci_sel[:, 0], ci_sel[:, 1],
                                           ci_naive[:, 0], ci_naive[:, 1],
                                           sel_MLE, GS.target_observed,
                                           sel_covered, naive_covered,
                                           sel_risk, naive_risk)))
def test_condition(ndraw=10000, burnin=2000, scalings=True):
    s, n, p = 6, 600, 40

    X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=0.2, snr=5)

    randomizer = randomization.isotropic_gaussian((p,), scale=sigma)

    lam_frac = 1.5
    loss = rr.glm.gaussian(X, y)
    epsilon = 1. / np.sqrt(n)
    lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    W = np.ones(p) * lam
    W[0] = 0  # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    nview = 3
    for i in range(nview):
        views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, np.bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        if nactive == s:
            return None

        if scalings:  # try condition on some scalings
            views[0].condition_on_scalings()
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_scalings()
        else:
            views[0].condition_on_subgradient()
            views[1].condition_on_subgradient()
            views[2].condition_on_subgradient()

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss, active_union, queries)

        pvalues = target_sampler.coefficient_pvalues(target_observed,
                                                     alternative='twosided',
                                                     ndraw=ndraw,
                                                     burnin=burnin)

        active_var = np.zeros_like(pvalues, np.bool)
        _nonzero = np.array([i in nonzero for i in active_set])
        active_var[_nonzero] = True

        return pvalues, active_var
def test_marginalize(s=4,
                     n=600,
                     p=200,
                     rho=0.,
                     signal=3.5,
                     lam_frac=2.5,
                     ndraw=10000,
                     burnin=2000,
                     loss='gaussian',
                     randomizer='gaussian',
                     randomizer_scale=1.,
                     nviews=3,
                     scalings=True,
                     subgrad=True,
                     parametric=False,
                     intervals='old'):

    print(n, p, s)

    if randomizer == 'laplace':
        randomizer = randomization.laplace((p,), scale=randomizer_scale)
    elif randomizer == 'gaussian':
        randomizer = randomization.isotropic_gaussian((p,), randomizer_scale)
    elif randomizer == 'logistic':
        randomizer = randomization.logistic((p,), scale=randomizer_scale)

    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho,
                                                       signal=signal, sigma=1)
        lam = np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 1000))))) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, signal=signal)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    epsilon = 1. / np.sqrt(n)

    W = lam_frac * np.ones(p) * lam
    # W[0] = 0 # use at least some unpenalized
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    views = []
    for i in range(nviews):
        if not parametric:
            views.append(glm_group_lasso(loss, epsilon, penalty, randomizer))
        else:
            views.append(glm_group_lasso_parametric(loss, epsilon, penalty, randomizer))

    queries = multiple_queries(views)
    queries.solve()

    active_union = np.zeros(p, np.bool)
    for view in views:
        active_union += view.selection_variable['variables']

    nactive = np.sum(active_union)
    print("nactive", nactive)

    nonzero = np.where(beta)[0]
    true_vec = beta[active_union]

    if set(nonzero).issubset(np.nonzero(active_union)[0]):
        check_screen = True

        if nactive == s:
            return None

        # BUG: if this scalings code is moved after the decompose_subgradient,
        # the code seems to run fine
        if scalings:  # try condition on some scalings
            for i in range(nviews):
                views[i].condition_on_scalings()
        if subgrad:
            for i in range(nviews):
                conditioning_groups = np.zeros(p, dtype=bool)
                conditioning_groups[:(p // 2)] = True   # integer division for Python 3
                marginalizing_groups = np.zeros(p, dtype=bool)
                marginalizing_groups[(p // 2):] = True
                views[i].decompose_subgradient(conditioning_groups=conditioning_groups,
                                               marginalizing_groups=marginalizing_groups)

        active_set = np.nonzero(active_union)[0]
        target_sampler, target_observed = glm_target(loss,
                                                     active_union,
                                                     queries,
                                                     bootstrap=False,
                                                     parametric=parametric)
        # reference=beta[active_union])

        if intervals == 'old':
            target_sample = target_sampler.sample(ndraw=ndraw, burnin=burnin)
            LU = target_sampler.confidence_intervals(target_observed,
                                                     sample=target_sample,
                                                     level=0.9)
            pivots = target_sampler.coefficient_pvalues(target_observed,
                                                        parameter=true_vec,
                                                        sample=target_sample)
        elif intervals == 'new':
            full_sample = target_sampler.sample(ndraw=ndraw,
                                                burnin=burnin,
                                                keep_opt=True)
            LU = target_sampler.confidence_intervals_translate(target_observed,
                                                               sample=full_sample,
                                                               level=0.9)
            pivots = target_sampler.coefficient_pvalues_translate(target_observed,
                                                                  parameter=true_vec,
                                                                  sample=full_sample)

        # test_stat = lambda x: np.linalg.norm(x - beta[active_union])
        # observed_test_value = test_stat(target_observed)
        # pivots = target_sampler.hypothesis_test(test_stat,
        #                                         observed_test_value,
        #                                         alternative='twosided',
        #                                         parameter=beta[active_union],
        #                                         ndraw=ndraw,
        #                                         burnin=burnin,
        #                                         stepsize=None)

        def coverage(LU):
            L, U = LU[:, 0], LU[:, 1]
            covered = np.zeros(nactive)
            ci_length = np.zeros(nactive)
            for j in range(nactive):
                if check_screen:
                    if (L[j] <= true_vec[j]) and (U[j] >= true_vec[j]):
                        covered[j] = 1
                else:
                    covered[j] = None
                ci_length[j] = U[j] - L[j]
            return covered, ci_length

        covered, ci_length = coverage(LU)
        LU_naive = naive_confidence_intervals(target_sampler, target_observed)
        covered_naive, ci_length_naive = coverage(LU_naive)
        naive_pvals = naive_pvalues(target_sampler, target_observed, true_vec)

        return pivots, covered, ci_length, naive_pvals, covered_naive, ci_length_naive
def randomized_lasso_trial(X, y, beta, sigma, ndraw=1000, burnin=50):
    n, p = X.shape

    random_Z = np.random.standard_normal(p)
    sel = selection(X, y, random_Z)

    if sel is not None:
        # unpack only after checking the solver actually returned a selection
        lam, epsilon, active, betaE, cube, initial_soln = sel
        lagrange = lam * np.ones(p)
        active_sign = np.sign(betaE)
        nactive = active.sum()
        print("number of selected variables by lasso", nactive)

        feasible_point = np.fabs(betaE)
        noise_variance = sigma ** 2
        randomizer = randomization.isotropic_gaussian((p,), 1.)
        generative_X = X[:, active]
        prior_variance = 1000.

        grad_map = sel_prob_gradient_map_lasso(X,
                                               feasible_point,
                                               active,
                                               active_sign,
                                               lagrange,
                                               generative_X,
                                               noise_variance,
                                               randomizer,
                                               epsilon)

        inf = selective_inf_lasso(y, grad_map, prior_variance)

        # for the tests, just take a few steps
        samples = inf.posterior_samples(langevin_steps=ndraw, burnin=burnin)
        adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                        np.percentile(samples, 95, axis=0)])
        selective_mean = np.mean(samples, axis=0)

        projection_active = X[:, active].dot(np.linalg.inv(X[:, active].T.dot(X[:, active])))
        M_1 = prior_variance * (X.dot(X.T)) + noise_variance * np.identity(n)
        M_2 = prior_variance * ((X.dot(X.T)).dot(projection_active))
        M_3 = prior_variance * (projection_active.T.dot(X.dot(X.T)).dot(projection_active))
        post_mean = M_2.T.dot(np.linalg.inv(M_1)).dot(y)
        print("observed data", post_mean)
        post_var = M_3 - M_2.T.dot(np.linalg.inv(M_1)).dot(M_2)
        unadjusted_intervals = np.vstack([post_mean - 1.65 * (np.sqrt(post_var.diagonal())),
                                          post_mean + 1.65 * (np.sqrt(post_var.diagonal()))])

        coverage_ad = np.zeros(nactive)
        coverage_unad = np.zeros(nactive)
        ad_length = np.zeros(nactive)
        unad_length = np.zeros(nactive)

        true_val = projection_active.T.dot(X.dot(beta))
        for l in range(nactive):
            if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
                coverage_ad[l] += 1
            ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
            if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
                coverage_unad[l] += 1
            unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]

        sel_cov = coverage_ad.sum() / nactive
        naive_cov = coverage_unad.sum() / nactive
        ad_len = ad_length.sum() / nactive
        unad_len = unad_length.sum() / nactive
        bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
        bayes_risk_unad = np.power(post_mean - true_val, 2.).sum() / nactive

        return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
    else:
        return None
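# A hedged driver for randomized_lasso_trial: regenerate data, run the trial,
# and average the six reported summaries over repeats. The instance parameters
# below are illustrative; gaussian_instance is the same generator used in the
# tests above, and its signature may differ across versions of this codebase.
def run_lasso_trials(ntrials=20, n=100, p=20, s=3, signal=3.5):
    results = []
    for _ in range(ntrials):
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s,
                                                       signal=signal, sigma=1.)
        out = randomized_lasso_trial(X, y, beta, sigma)
        if out is not None:
            results.append(out)
    if not results:
        return None
    # rows: sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad
    return np.mean(np.concatenate(results, axis=1), axis=1)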
def test_approximate_ci(n=200, p=50, s=0, snr=5, threshold=3.,
                        rho=0.1, lam_frac=1.,
                        loss='gaussian', randomizer='gaussian'):

    from selection.api import randomization
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    active_bool = np.zeros(p, np.bool)
    # active_bool[range(3)] = 1
    inactive_bool = ~active_bool

    TS = threshold_score_approx(loss,
                                threshold,
                                randomization,
                                active_bool,
                                inactive_bool,
                                randomizer)

    TS.solve_approx()
    active = TS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(TS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).issuperset(true_support):
        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, np.bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        class target_class(object):
            def __init__(self, target_cov):
                self.target_cov = target_cov
                self.shape = target_cov.shape

        target = target_class(TS.target_cov)
        ci_naive = naive_confidence_intervals(target, TS.target_observed)
        naive_pvals = naive_pvalues(target, TS.target_observed, true_vec)
        naive_covered = np.zeros(nactive)

        toc = time.time()
        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

            # naive ci
            if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                naive_covered[j] += 1

        tic = time.time()
        print('ci time now', tic - toc)

        return covered, ci_length, pivots, naive_covered, naive_pvals
def randomized_lasso_trial(X, y, beta, sigma, lam,
                           loss='logistic',
                           randomizer='gaussian',
                           estimation='parametric'):

    from selection.api import randomization
    n, p = X.shape

    if loss == "gaussian":
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        loss = rr.glm.logistic(X, y)

    epsilon = 1. / np.sqrt(n)
    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)
    randomization = randomization.isotropic_gaussian((p,), scale=1.)

    M_est = M_estimator_approx_logistic(loss, epsilon, penalty, randomization,
                                        randomizer, estimation)
    M_est.solve_approx()
    active = M_est._overall
    # print("here", glm.shape)
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    glm = M_est.observed_score_state[:nactive]
    prior_variance = 100000.

    # generative_mean = np.zeros(p)
    # sel_split = selection_probability_random_lasso(M_est, generative_mean)
    # test_point = np.append(M_est.observed_score_state, np.abs(M_est.initial_soln[M_est._overall]))
    # print("gradient at test point", sel_split.smooth_objective(test_point, mode="grad"))

    class target_class(object):
        def __init__(self, target_cov):
            self.target_cov = target_cov
            self.shape = target_cov.shape

    target = target_class(M_est.target_cov)
    unadjusted_intervals = (naive_confidence_intervals(target, M_est.target_observed)).T

    grad_lasso = sel_inf_random_lasso(M_est, prior_variance)
    samples = grad_lasso.posterior_samples()
    adjusted_intervals = np.vstack([np.percentile(samples, 5, axis=0),
                                    np.percentile(samples, 95, axis=0)])
    selective_mean = np.mean(samples, axis=0)

    true_val = np.zeros(nactive)

    coverage_ad = np.zeros(nactive)
    coverage_unad = np.zeros(nactive)
    ad_length = np.zeros(nactive)
    unad_length = np.zeros(nactive)

    for l in range(nactive):
        if (adjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= adjusted_intervals[1, l]):
            coverage_ad[l] += 1
        ad_length[l] = adjusted_intervals[1, l] - adjusted_intervals[0, l]
        if (unadjusted_intervals[0, l] <= true_val[l]) and (true_val[l] <= unadjusted_intervals[1, l]):
            coverage_unad[l] += 1
        unad_length[l] = unadjusted_intervals[1, l] - unadjusted_intervals[0, l]

    sel_cov = coverage_ad.sum() / nactive
    naive_cov = coverage_unad.sum() / nactive
    ad_len = ad_length.sum() / nactive
    unad_len = unad_length.sum() / nactive
    bayes_risk_ad = np.power(selective_mean - true_val, 2.).sum() / nactive
    bayes_risk_unad = np.power(glm - true_val, 2.).sum() / nactive

    return np.vstack([sel_cov, naive_cov, ad_len, unad_len, bayes_risk_ad, bayes_risk_unad])
def test_approximate_ci(n=100, p=10, s=0, snr=5, rho=0.1, lam_frac=1.,
                        loss='gaussian', randomizer='gaussian'):

    from selection.api import randomization
    if loss == "gaussian":
        X, y, beta, nonzero, sigma = gaussian_instance(n=n, p=p, s=s, rho=rho, snr=snr, sigma=1.)
        loss = rr.glm.gaussian(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
    elif loss == "logistic":
        X, y, beta, _ = logistic_instance(n=n, p=p, s=s, rho=rho, snr=snr)
        loss = rr.glm.logistic(X, y)
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))

    if randomizer == 'gaussian':
        randomization = randomization.isotropic_gaussian((p,), scale=1.)
    elif randomizer == 'laplace':
        randomization = randomization.laplace((p,), scale=1.)

    W = np.ones(p) * lam
    penalty = rr.group_lasso(np.arange(p),
                             weights=dict(zip(np.arange(p), W)),
                             lagrange=1.)

    # active_bool = np.zeros(p, np.bool)
    # active_bool[range(3)] = 1
    # inactive_bool = ~active_bool

    GS = greedy_score_step_approx(loss,
                                  penalty,
                                  np.zeros(p, dtype=bool),
                                  np.ones(p, dtype=bool),
                                  randomization,
                                  randomizer)

    GS.solve_approx()
    active = GS._overall
    print("nactive", active.sum())

    ci = approximate_conditional_density(GS)
    ci.solve_approx()

    active_set = np.asarray([i for i in range(p) if active[i]])
    true_support = np.asarray([i for i in range(p) if i < s])
    nactive = np.sum(active)
    print("active set, true_support", active_set, true_support)

    true_vec = beta[active]
    print("true coefficients", true_vec)

    if set(active_set).issuperset(true_support):
        ci_active = np.zeros((nactive, 2))
        covered = np.zeros(nactive, np.bool)
        ci_length = np.zeros(nactive)
        pivots = np.zeros(nactive)

        toc = time.time()
        for j in range(nactive):
            ci_active[j, :] = np.array(ci.approximate_ci(j))
            if (ci_active[j, 0] <= true_vec[j]) and (ci_active[j, 1] >= true_vec[j]):
                covered[j] = 1
            ci_length[j] = ci_active[j, 1] - ci_active[j, 0]
            # print(ci_active[j, :])
            pivots[j] = ci.approximate_pvalue(j, true_vec[j])

        print("confidence intervals", ci_active)
        tic = time.time()
        print('ci time now', tic - toc)
OLS_3TC = ols_fit.params

lam_frac = 1.
loss = rr.glm.gaussian(X, Y)
epsilon = 1. / np.sqrt(n)
lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma_3TC
print(lam)

W = np.ones(p) * lam
penalty = rr.group_lasso(np.arange(p),
                         weights=dict(zip(np.arange(p), W)),
                         lagrange=1.)
randomization = randomization.isotropic_gaussian((p,), scale=1.)

M_est = M_estimator_approx(loss, epsilon, penalty, randomization, randomizer='gaussian')
M_est.solve_approx()
active = M_est._overall
active_set = np.asarray([i for i in range(p) if active[i]])
nactive = np.sum(active)

active_set_0 = [NRTI_muts[i] for i in range(p) if active[i]]

ci_active = np.zeros((nactive, 2))
ci_length = np.zeros(nactive)
def test_approximate_inference(X, y, true_mean, sigma,
                               threshold=3.,
                               seed_n=0,
                               lam_frac=1.,
                               loss='gaussian',
                               randomization_scale=1.):

    from selection.api import randomization
    n, p = X.shape
    np.random.seed(seed_n)

    if loss == "gaussian":
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0)) * sigma
        loss = rr.glm.gaussian(X, y)
    elif loss == "logistic":
        lam = lam_frac * np.mean(np.fabs(np.dot(X.T, np.random.binomial(1, 1. / 2, (n, 10000)))).max(0))
        loss = rr.glm.logistic(X, y)

    active_bool = np.zeros(p, np.bool)
    inactive_bool = ~active_bool

    randomization = randomization.isotropic_gaussian((p,), scale=randomization_scale)

    TS = threshold_score_map(loss,
                             threshold,
                             randomization,
                             active_bool,
                             inactive_bool,
                             randomization_scale)

    TS.solve_approx()
    active = TS._overall
    active_set = np.asarray([i for i in range(p) if active[i]])
    nactive = np.sum(active)

    sys.stderr.write("number of active selected by thresholding" + str(nactive) + "\n")
    sys.stderr.write("Active set selected by thresholding" + str(active_set) + "\n")
    sys.stderr.write("Observed target" + str(TS.target_observed) + "\n")

    if nactive == 0:
        return None
    else:
        true_vec = np.linalg.inv(X[:, active].T.dot(X[:, active])).dot(X[:, active].T).dot(true_mean)
        sys.stderr.write("True target to be covered" + str(true_vec) + "\n")

        class target_class(object):
            def __init__(self, target_cov):
                self.target_cov = target_cov
                self.shape = target_cov.shape

        target = target_class(TS.target_cov)
        ci_naive = naive_confidence_intervals(target, TS.target_observed)
        naive_covered = np.zeros(nactive)
        naive_risk = np.zeros(nactive)

        ci = approximate_conditional_density(TS)
        ci.solve_approx()

        ci_sel = np.zeros((nactive, 2))
        sel_MLE = np.zeros(nactive)
        sel_length = np.zeros(nactive)

        for j in range(nactive):
            ci_sel[j, :] = np.array(ci.approximate_ci(j))
            sel_MLE[j] = ci.approx_MLE_solver(j, step=1, nstep=150)[0]
            sel_length[j] = ci_sel[j, 1] - ci_sel[j, 0]

        sel_covered = np.zeros(nactive, np.bool)
        sel_risk = np.zeros(nactive)

        for j in range(nactive):
            sel_risk[j] = (sel_MLE[j] - true_vec[j]) ** 2.
            naive_risk[j] = (TS.target_observed[j] - true_vec[j]) ** 2.
            if (ci_sel[j, 0] <= true_vec[j]) and (ci_sel[j, 1] >= true_vec[j]):
                sel_covered[j] = 1
            if (ci_naive[j, 0] <= true_vec[j]) and (ci_naive[j, 1] >= true_vec[j]):
                naive_covered[j] = 1

        print("lengths", sel_length.sum() / nactive)
        print("selective intervals", ci_sel.T)
        print("risks", sel_risk.sum() / nactive)

        return np.transpose(np.vstack((ci_sel[:, 0], ci_sel[:, 1],
                                       ci_naive[:, 0], ci_naive[:, 1],
                                       sel_MLE, TS.target_observed,
                                       sel_covered, naive_covered,
                                       sel_risk, naive_risk)))