def method_instance(self):
    """Return the randomized-LASSO solver for this instance, building it on first use.

    The solver is constructed once from ``self.X``, ``self.Y``,
    ``self.lagrange`` and ``self.randomizer_scale`` and cached on
    ``self._method_instance``; later calls return the cached object.
    """
    try:
        # EAFP: the cached solver exists after the first call.
        return self._method_instance
    except AttributeError:
        nsample, nfeature = self.X.shape
        # Penalty and randomizer scale are both put on the sqrt(n) scale,
        # with the randomizer additionally scaled by the response spread.
        self._method_instance = random_lasso_method.gaussian(
            self.X,
            self.Y,
            self.lagrange * np.sqrt(nsample),
            randomizer_scale=self.randomizer_scale * np.std(self.Y) * np.sqrt(nsample))
        return self._method_instance
def method_instance(self):
    """Return the randomized-LASSO solver for this instance, building it on first use.

    The penalty vector is ``choose_lambda(self.X) * self.kappa`` replicated
    across all features; the solver is cached on ``self._method_instance``.
    """
    try:
        # EAFP: reuse the solver built on the first call.
        return self._method_instance
    except AttributeError:
        nsample, nfeature = self.X.shape
        penalty = np.full(nfeature, 1.0) * choose_lambda(self.X) * self.kappa
        self._method_instance = random_lasso_method.gaussian(
            self.X,
            self.Y,
            penalty,
            randomizer_scale=self.randomizer_scale * np.std(self.Y))
        return self._method_instance
def method_instance(self):
    """Return the randomized-LASSO solver for this instance, building it on first use.

    Unlike the plain variant, this one also passes an explicit ``ridge_term``
    derived from the mean squared column norm of ``self.X``.  The solver is
    cached on ``self._method_instance``.
    """
    try:
        # EAFP: return the previously constructed solver if present.
        return self._method_instance
    except AttributeError:
        nsample, nfeature = self.X.shape
        # Average of the squared column norms of X; sets the ridge scale.
        mean_diag = np.mean((self.X ** 2).sum(0))
        root_n = np.sqrt(nsample)
        self._method_instance = random_lasso_method.gaussian(
            self.X,
            self.Y,
            feature_weights=self.lagrange * root_n,
            ridge_term=np.std(self.Y) * np.sqrt(mean_diag) / root_n,
            randomizer_scale=self.randomizer_scale * np.std(self.Y) * root_n)
        return self._method_instance
def test_full_lasso(n=200, p=30, signal_fac=1.5, s=5, ndraw=5000, burnin=1000,
                    sigma=3, full=False, rho=0.4, randomizer_scale=1):
    """
    General LASSO --

    Fits the same randomized LASSO twice (second time reusing the first fit's
    randomization ``_initial_omega``), conditions the second fit's subgradient
    decomposition on all coordinates, and asserts that both fits produce the
    same affine constraints, solution and subgradient.
    """
    inst, const = gaussian_instance, lasso.gaussian
    signal = np.sqrt(signal_fac * np.log(p))
    X, Y, beta = inst(n=n,
                      p=p,
                      signal=signal,
                      s=s,
                      equicorrelated=False,
                      rho=rho,
                      sigma=sigma,
                      random_signs=True)[:3]
    n, p = X.shape

    W = np.ones(X.shape[1]) * np.sqrt(1.5 * np.log(p)) * sigma

    conv = const(X, Y, W, randomizer_scale=randomizer_scale * sigma)
    signs = conv.fit(solve_args={'min_its': 500, 'tol': 1.e-13})
    nonzero = signs != 0

    conv2 = lasso.gaussian(X, Y, W, randomizer_scale=randomizer_scale * sigma)
    # Reuse the first fit's randomization so the two solvers see identical problems.
    conv2.fit(perturb=conv._initial_omega, solve_args={'min_its': 500, 'tol': 1.e-13})
    # BUG FIX: np.bool (deprecated NumPy 1.20, removed 1.24) -> builtin bool.
    conv2.decompose_subgradient(condition=np.ones(p, bool))

    np.testing.assert_allclose(conv2._view.sampler.affine_con.covariance,
                               conv.sampler.affine_con.covariance)
    np.testing.assert_allclose(conv2._view.sampler.affine_con.mean,
                               conv.sampler.affine_con.mean)
    np.testing.assert_allclose(conv2._view.sampler.affine_con.linear_part,
                               conv.sampler.affine_con.linear_part)
    np.testing.assert_allclose(conv2._view.sampler.affine_con.offset,
                               conv.sampler.affine_con.offset)
    np.testing.assert_allclose(conv2._view.initial_soln, conv.initial_soln)
    np.testing.assert_allclose(conv2._view.initial_subgrad, conv.initial_subgrad)
def compare_methods(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20,
                    target="selected", randomizer_scale=np.sqrt(0.50),
                    full_dispersion=True, tuning_rand="lambda.theory"):
    """Compare selective-MLE inference with the 'inference_new' approach on one draw.

    Simulates data, fits a randomized LASSO at the theoretical lambda, then
    computes coverage / length / power / BH-FDR / bias / timing for both
    inference methods.  Returns an array stacking the MLE summary, the
    'uni' summary, and a no-selection indicator.

    Raises
    ------
    ValueError
        If ``target`` is neither "full" nor "selected".
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s,
                                            beta_type=beta_type, snr=snr)
    print("snr", snr)
    # Standardize columns (unbiased-variance scaling) and center the response.
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    # Theoretical lambda: mean of max |X^T Z| over 2000 Gaussian draws.
    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    randomized_lasso = lasso.gaussian(X,
                                      y,
                                      feature_weights=lam_theory * np.ones(p),
                                      randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    # BUG FIX: np.bool (removed in NumPy 1.24) -> builtin bool.
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)
    nreport = 0.

    if nonzero.sum() > 0:
        if target == "full":
            target_randomized = beta[nonzero]
            (observed_target, cov_target, cov_target_score,
             alternatives) = full_targets(randomized_lasso.loglike,
                                          randomized_lasso._W, nonzero,
                                          dispersion=dispersion)
        elif target == "selected":
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W, nonzero,
                                              dispersion=dispersion)
        else:
            raise ValueError('not a valid specification of target')

        toc = time.time()
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        time_MLE = tic - toc

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized, beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        toc = time.time()
        intervals_uni, pvalue_uni = randomized_lasso.inference_new(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        time_uni = tic - toc
        # inference_new returns intervals as (2, k); transpose to (k, 2).
        intervals_uni = intervals_uni.T

        cov_uni, selective_uni_power = coverage(intervals_uni, pvalue_uni,
                                                target_randomized, beta[nonzero])
        length_uni = np.mean(intervals_uni[:, 1] - intervals_uni[:, 0])
        power_uni = ((active_rand_bool) * (np.logical_or(
            (0. < intervals_uni[:, 0]),
            (0. > intervals_uni[:, 1])))).sum() / float((beta != 0).sum())
        uni_discoveries = BHfilter(pvalue_uni, q=0.1)
        power_uni_BH = (uni_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_uni_BH = (uni_discoveries * ~active_rand_bool).sum() / float(
            max(uni_discoveries.sum(), 1.))
        bias_randLASSO = np.mean(randomized_lasso.initial_soln[nonzero] -
                                 target_randomized)
    else:
        # Nothing selected: record a null report and zero out all summaries.
        nreport += 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power, time_MLE = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        cov_uni, length_uni, power_uni, power_uni_BH, fdr_uni_BH, bias_randLASSO, selective_uni_power, time_uni = [
            0., 0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        uni_discoveries = np.zeros(1)

    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         time_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))
    uni_inf = np.vstack(
        (cov_uni, length_uni, 0., nonzero.sum(), bias_randLASSO,
         selective_uni_power, time_uni, power_uni, power_uni_BH, fdr_uni_BH,
         uni_discoveries.sum()))
    return np.vstack((MLE_inf, uni_inf, nreport))
def risk_comparison(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1, snr=0.20,
                    randomizer_scale=np.sqrt(0.50), full_dispersion=False,
                    tuning_nonrand="lambda.min", tuning_rand="lambda.1se",
                    ndraw=50):
    """Average relative risks of six estimators over ``ndraw`` simulated data sets.

    The six rows of the returned (6, 1) array are the mean relative risks of:
    selective MLE, unbiased-independent estimator, randomized LASSO,
    relaxed randomized LASSO, relaxed (non-randomized) LASSO, and glmnet LASSO.
    """
    risks = np.zeros((6, 1))
    for i in range(ndraw):
        X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho,
                                                s=s, beta_type=beta_type, snr=snr)
        print("snr", snr)
        X -= X.mean(0)[None, :]
        X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
        y = y - y.mean()

        # BUG FIX: _sigma_ is needed below for lam_theory on every path, but the
        # original defined it only on the full_dispersion=False branch, raising
        # NameError when full_dispersion=True.  Define it unconditionally.
        _sigma_ = np.std(y)
        if full_dispersion:
            print("shapes", y.shape,
                  (np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2).shape)
            dispersion = np.linalg.norm(
                y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
            sigma_ = np.sqrt(dispersion)
        else:
            dispersion = None

        lam_theory = _sigma_ * 1. * np.mean(
            np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
        glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
            X, y, lam_theory / float(n))

        if full_dispersion is False:
            dispersion = None
            # Re-estimate sigma from the lambda.min active set when possible.
            active_min = (glm_LASSO_min != 0)
            if active_min.sum() > 0:
                sigma_ = np.sqrt(
                    np.linalg.norm(y - X[:, active_min].dot(
                        np.linalg.pinv(X[:, active_min]).dot(y))) ** 2 /
                    (n - active_min.sum()))
            else:
                sigma_ = _sigma_
        print("true and estimated sigma", sigma, _sigma_, sigma_)

        if tuning_nonrand == "lambda.min":
            lam_LASSO = lam_min
            glm_LASSO = glm_LASSO_min
        elif tuning_nonrand == "lambda.1se":
            lam_LASSO = lam_1se
            glm_LASSO = glm_LASSO_1se
        else:
            lam_LASSO = lam_theory / float(n)
            glm_LASSO = glm_LASSO_theory

        active_LASSO = (glm_LASSO != 0)
        rel_LASSO = np.zeros(p)
        if active_LASSO.sum() > 0:
            # Relaxed LASSO: OLS refit on the LASSO active set.
            post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
            rel_LASSO[active_LASSO] = post_LASSO_OLS

        if tuning_rand == "lambda.min":
            randomized_lasso = lasso.gaussian(
                X, y,
                feature_weights=n * lam_min * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        elif tuning_rand == "lambda.1se":
            randomized_lasso = lasso.gaussian(
                X, y,
                feature_weights=n * lam_1se * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        else:
            randomized_lasso = lasso.gaussian(
                X, y,
                feature_weights=lam_theory * np.ones(p),
                randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
        signs = randomized_lasso.fit()
        nonzero = signs != 0

        sel_MLE = np.zeros(p)
        ind_est = np.zeros(p)
        randomized_lasso_est = np.zeros(p)
        randomized_rel_lasso_est = np.zeros(p)
        if nonzero.sum() > 0:
            target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
            (observed_target, cov_target, cov_target_score,
             alternatives) = selected_targets(randomized_lasso.loglike,
                                              randomized_lasso._W, nonzero,
                                              dispersion=dispersion)
            MLE_estimate, _, _, _, _, ind_unbiased_estimator = randomized_lasso.selective_MLE(
                observed_target, cov_target, cov_target_score, alternatives)
            sel_MLE[nonzero] = MLE_estimate
            ind_est[nonzero] = ind_unbiased_estimator
            randomized_lasso_est = randomized_lasso.initial_soln
            randomized_rel_lasso_est = randomized_lasso._beta_full

        risks += np.vstack(
            (relative_risk(sel_MLE, beta, Sigma),
             relative_risk(ind_est, beta, Sigma),
             relative_risk(randomized_lasso_est, beta, Sigma),
             relative_risk(randomized_rel_lasso_est, beta, Sigma),
             relative_risk(rel_LASSO, beta, Sigma),
             relative_risk(glm_LASSO, beta, Sigma)))
        print("risks so far", risks / (i + 1))

    return risks / ndraw
def comparison_cvmetrics_debiased(n=100, p=150, nval=500, rho=0.35, s=5,
                                  beta_type=1, snr=0.20,
                                  randomizer_scale=np.sqrt(0.25),
                                  full_dispersion=False,
                                  tuning_nonrand="lambda.min",
                                  tuning_rand="lambda.1se"):
    """One-draw comparison of naive post-LASSO inference vs selective MLE with debiased targets.

    Returns a stacked array: 6 relative risks, naive summary, (zero-filled)
    Lee and Liu summaries, MLE summary, 6 partial risks, and no-selection counts.
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s,
                                            beta_type=beta_type, snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    # BUG FIX: _sigma_ is used unconditionally for lam_theory below, but the
    # original defined it only on the full_dispersion=False branch, raising
    # NameError when full_dispersion=True.  Define it unconditionally.
    _sigma_ = np.std(y)
    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None

    lam_theory = _sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))
    if full_dispersion is False:
        dispersion = None
        active_min = (glm_LASSO_min != 0)
        if active_min.sum() > 0:
            sigma_ = np.sqrt(
                np.linalg.norm(y - X[:, active_min].dot(
                    np.linalg.pinv(X[:, active_min]).dot(y))) ** 2 /
                (n - active_min.sum()))
        else:
            sigma_ = _sigma_
    print("estimated and true sigma", sigma, _sigma_, sigma_)

    if tuning_nonrand == "lambda.min":
        lam_LASSO = lam_min
        glm_LASSO = glm_LASSO_min
    elif tuning_nonrand == "lambda.1se":
        lam_LASSO = lam_1se
        glm_LASSO = glm_LASSO_1se
    else:
        lam_LASSO = lam_theory / float(n)
        glm_LASSO = glm_LASSO_theory

    active_LASSO = (glm_LASSO != 0)
    nactive_LASSO = active_LASSO.sum()
    active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]])
    # BUG FIX: np.bool (removed in NumPy 1.24) -> builtin bool.
    active_LASSO_bool = np.asarray(
        [(np.in1d(active_set_LASSO[z], true_set).sum() > 0)
         for z in range(nactive_LASSO)], bool)
    rel_LASSO = np.zeros(p)
    Lee_nreport = 0.
    bias_naive = 0.

    if nactive_LASSO > 0:
        rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        Lee_target = beta[active_LASSO]
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        naive_sd = sigma_ * np.sqrt(
            np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO])))))
        naive_intervals = np.vstack([
            post_LASSO_OLS - 1.65 * naive_sd, post_LASSO_OLS + 1.65 * naive_sd
        ]).T
        # BUG FIX: two-sided p-value is 2*(1 - Phi(|z|)); the original
        # 2*Phi(|z|) lies in [1, 2].  Matches multiple_runs_lasso in this file.
        naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
        cov_naive, selective_naive_power = coverage(naive_intervals, naive_pval,
                                                    Lee_target,
                                                    beta[active_LASSO])
        length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
        power_naive = ((active_LASSO_bool) * (np.logical_or(
            (0. < naive_intervals[:, 0]),
            (0. > naive_intervals[:, 1])))).sum() / float((beta != 0).sum())
        naive_discoveries = BHfilter(naive_pval, q=0.1)
        power_naive_BH = (naive_discoveries * active_LASSO_bool).sum() / float(
            (beta != 0).sum())
        fdr_naive_BH = (naive_discoveries * ~active_LASSO_bool).sum() / float(
            max(naive_discoveries.sum(), 1.))
        bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target)

        partial_Lasso_risk = (glm_LASSO[active_LASSO] -
                              Lee_target).T.dot(glm_LASSO[active_LASSO] - Lee_target)
        partial_relLasso_risk = (post_LASSO_OLS -
                                 Lee_target).T.dot(post_LASSO_OLS - Lee_target)
    elif nactive_LASSO == 0:
        Lee_nreport += 1
        cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
            0., 0., 0., 0., 0., 0.
        ]
        naive_discoveries = np.zeros(1)
        partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    if tuning_rand == "lambda.min":
        randomized_lasso = lasso.gaussian(
            X, y,
            feature_weights=n * lam_min * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    elif tuning_rand == "lambda.1se":
        randomized_lasso = lasso.gaussian(
            X, y,
            feature_weights=n * lam_1se * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    else:
        randomized_lasso = lasso.gaussian(
            X, y,
            feature_weights=lam_theory * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    # BUG FIX: np.bool (removed in NumPy 1.24) -> builtin bool.
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)

    sel_MLE = np.zeros(p)
    ind_est = np.zeros(p)
    randomized_lasso_est = np.zeros(p)
    randomized_rel_lasso_est = np.zeros(p)
    MLE_nreport = 0

    if nonzero.sum() > 0:
        target_randomized = beta[nonzero]
        (observed_target, cov_target, cov_target_score,
         alternatives) = debiased_targets(randomized_lasso.loglike,
                                          randomized_lasso._W, nonzero,
                                          penalty=randomized_lasso.penalty,
                                          dispersion=dispersion)
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        sel_MLE[nonzero] = MLE_estimate
        ind_est[nonzero] = ind_unbiased_estimator
        randomized_lasso_est = randomized_lasso.initial_soln
        randomized_rel_lasso_est = randomized_lasso._beta_full

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized, beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        partial_MLE_risk = (MLE_estimate -
                            target_randomized).T.dot(MLE_estimate - target_randomized)
        partial_ind_risk = (ind_unbiased_estimator -
                            target_randomized).T.dot(ind_unbiased_estimator -
                                                     target_randomized)
        partial_randLasso_risk = (
            randomized_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_lasso_est[nonzero] -
                                     target_randomized)
        partial_relrandLasso_risk = (
            randomized_rel_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_rel_lasso_est[nonzero] -
                                     target_randomized)
    else:
        MLE_nreport = 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [
            0., 0., 0., 0.
        ]

    risks = np.vstack(
        (relative_risk(sel_MLE, beta, Sigma),
         relative_risk(ind_est, beta, Sigma),
         relative_risk(randomized_lasso_est, beta, Sigma),
         relative_risk(randomized_rel_lasso_est, beta, Sigma),
         relative_risk(rel_LASSO, beta, Sigma),
         relative_risk(glm_LASSO, beta, Sigma)))
    partial_risks = np.vstack(
        (partial_MLE_risk, partial_ind_risk, partial_randLasso_risk,
         partial_relrandLasso_risk, partial_relLasso_risk, partial_Lasso_risk))
    naive_inf = np.vstack(
        (cov_naive, length_naive, 0., nactive_LASSO, bias_naive,
         selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH,
         naive_discoveries.sum()))
    # Lee / Liu methods are not run for debiased targets; keep zero placeholders
    # so the output layout matches comparison_cvmetrics_full.
    Lee_inf = np.zeros((10, 1))
    Liu_inf = np.zeros((10, 1))
    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))
    nreport = np.vstack((Lee_nreport, 0., MLE_nreport))

    return np.vstack(
        (risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport))
def comparison_cvmetrics_full(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1,
                              snr=0.20, randomizer_scale=np.sqrt(0.25),
                              full_dispersion=True,
                              tuning_nonrand="lambda.min",
                              tuning_rand="lambda.1se"):
    """One-draw comparison of Lee, naive, Liu (ROSI) and selective-MLE inference for full targets.

    Returns a stacked array: 6 relative risks, naive summary, Lee summary,
    Liu summary, MLE summary, 6 partial risks, and no-selection counts.
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s,
                                            beta_type=beta_type, snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()
    true_set = np.asarray([u for u in range(p) if beta[u] != 0])

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))
    if tuning_nonrand == "lambda.min":
        lam_LASSO = lam_min
        glm_LASSO = glm_LASSO_min
    elif tuning_nonrand == "lambda.1se":
        lam_LASSO = lam_1se
        glm_LASSO = glm_LASSO_1se
    else:
        lam_LASSO = lam_theory / float(n)
        glm_LASSO = glm_LASSO_theory

    active_LASSO = (glm_LASSO != 0)
    nactive_LASSO = active_LASSO.sum()
    active_set_LASSO = np.asarray([r for r in range(p) if active_LASSO[r]])
    # BUG FIX: np.bool (removed in NumPy 1.24) -> builtin bool.
    active_LASSO_bool = np.asarray(
        [(np.in1d(active_set_LASSO[z], true_set).sum() > 0)
         for z in range(nactive_LASSO)], bool)

    rel_LASSO = np.zeros(p)
    Lee_nreport = 0
    bias_Lee = 0.
    bias_naive = 0.

    if nactive_LASSO > 0:
        rel_LASSO[active_LASSO] = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        Lee_target = beta[active_LASSO]
        Lee_intervals, Lee_pval = selInf_R(X, y, glm_LASSO, n * lam_LASSO,
                                           sigma_, Type=1, alpha=0.1)

        # selInf_R can drop coordinates; only proceed if shapes agree.
        if (Lee_pval.shape[0] == Lee_target.shape[0]):
            cov_Lee, selective_Lee_power = coverage(Lee_intervals, Lee_pval,
                                                    Lee_target,
                                                    beta[active_LASSO])
            inf_entries_bool = np.isinf(Lee_intervals[:, 1] - Lee_intervals[:, 0])
            inf_entries = np.mean(inf_entries_bool)
            if inf_entries == 1.:
                length_Lee = 0.
            else:
                # Average length over the finite intervals only.
                length_Lee = np.mean(
                    (Lee_intervals[:, 1] - Lee_intervals[:, 0])[~inf_entries_bool])
            power_Lee = ((active_LASSO_bool) * (np.logical_or(
                (0. < Lee_intervals[:, 0]),
                (0. > Lee_intervals[:, 1])))).sum() / float((beta != 0).sum())
            Lee_discoveries = BHfilter(Lee_pval, q=0.1)
            power_Lee_BH = (Lee_discoveries * active_LASSO_bool).sum() / float(
                (beta != 0).sum())
            fdr_Lee_BH = (Lee_discoveries * ~active_LASSO_bool).sum() / float(
                max(Lee_discoveries.sum(), 1.))
            bias_Lee = np.mean(glm_LASSO[active_LASSO] - Lee_target)

            post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
            naive_sd = sigma_ * np.sqrt(
                np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(
                    X[:, active_LASSO])))))
            naive_intervals = np.vstack([
                post_LASSO_OLS - 1.65 * naive_sd,
                post_LASSO_OLS + 1.65 * naive_sd
            ]).T
            # BUG FIX: two-sided p-value is 2*(1 - Phi(|z|)); the original
            # 2*Phi(|z|) lies in [1, 2].  Matches multiple_runs_lasso here.
            naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
            cov_naive, selective_naive_power = coverage(
                naive_intervals, naive_pval, Lee_target, beta[active_LASSO])
            length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
            power_naive = ((active_LASSO_bool) * (np.logical_or(
                (0. < naive_intervals[:, 0]),
                (0. > naive_intervals[:, 1])))).sum() / float((beta != 0).sum())
            naive_discoveries = BHfilter(naive_pval, q=0.1)
            power_naive_BH = (naive_discoveries *
                              active_LASSO_bool).sum() / float((beta != 0).sum())
            fdr_naive_BH = (naive_discoveries *
                            ~active_LASSO_bool).sum() / float(
                                max(naive_discoveries.sum(), 1.))
            bias_naive = np.mean(rel_LASSO[active_LASSO] - Lee_target)

            partial_Lasso_risk = (glm_LASSO[active_LASSO] -
                                  Lee_target).T.dot(glm_LASSO[active_LASSO] -
                                                    Lee_target)
            partial_relLasso_risk = (post_LASSO_OLS -
                                     Lee_target).T.dot(post_LASSO_OLS - Lee_target)
        else:
            Lee_nreport = 1
            cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
                0., 0., 0., 0., 0., 0., 0.
            ]
            cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
                0., 0., 0., 0., 0., 0.
            ]
            naive_discoveries = np.zeros(1)
            Lee_discoveries = np.zeros(1)
            partial_Lasso_risk, partial_relLasso_risk = [0., 0.]
    elif nactive_LASSO == 0:
        Lee_nreport = 1
        cov_Lee, length_Lee, inf_entries, power_Lee, power_Lee_BH, fdr_Lee_BH, selective_Lee_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        cov_naive, length_naive, power_naive, power_naive_BH, fdr_naive_BH, selective_naive_power = [
            0., 0., 0., 0., 0., 0.
        ]
        naive_discoveries = np.zeros(1)
        Lee_discoveries = np.zeros(1)
        partial_Lasso_risk, partial_relLasso_risk = [0., 0.]

    lasso_Liu = ROSI.gaussian(X, y, n * lam_LASSO)
    Lasso_soln_Liu = lasso_Liu.fit()
    active_set_Liu = np.nonzero(Lasso_soln_Liu != 0)[0]
    nactive_Liu = active_set_Liu.shape[0]
    # BUG FIX: np.bool (removed in NumPy 1.24) -> builtin bool.
    active_Liu_bool = np.asarray(
        [(np.in1d(active_set_Liu[a], true_set).sum() > 0)
         for a in range(nactive_Liu)], bool)
    Liu_nreport = 0

    if nactive_Liu > 0:
        Liu_target = beta[Lasso_soln_Liu != 0]
        df = lasso_Liu.summary(level=0.90,
                               compute_intervals=True,
                               dispersion=dispersion)
        Liu_lower, Liu_upper, Liu_pval = np.asarray(df['lower_confidence']), \
                                         np.asarray(df['upper_confidence']), \
                                         np.asarray(df['pval'])
        Liu_intervals = np.vstack((Liu_lower, Liu_upper)).T
        cov_Liu, selective_Liu_power = coverage(Liu_intervals, Liu_pval,
                                                Liu_target,
                                                beta[Lasso_soln_Liu != 0])
        length_Liu = np.mean(Liu_intervals[:, 1] - Liu_intervals[:, 0])
        power_Liu = ((active_Liu_bool) * (np.logical_or(
            (0. < Liu_intervals[:, 0]),
            (0. > Liu_intervals[:, 1])))).sum() / float((beta != 0).sum())
        Liu_discoveries = BHfilter(Liu_pval, q=0.1)
        power_Liu_BH = (Liu_discoveries * active_Liu_bool).sum() / float(
            (beta != 0).sum())
        fdr_Liu_BH = (Liu_discoveries * ~active_Liu_bool).sum() / float(
            max(Liu_discoveries.sum(), 1.))
    else:
        Liu_nreport = 1
        cov_Liu, length_Liu, power_Liu, power_Liu_BH, fdr_Liu_BH, selective_Liu_power = [
            0., 0., 0., 0., 0., 0.
        ]
        Liu_discoveries = np.zeros(1)

    if tuning_rand == "lambda.min":
        randomized_lasso = lasso.gaussian(
            X, y,
            feature_weights=n * lam_min * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    elif tuning_rand == "lambda.1se":
        randomized_lasso = lasso.gaussian(
            X, y,
            feature_weights=n * lam_1se * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    else:
        randomized_lasso = lasso.gaussian(
            X, y,
            feature_weights=lam_theory * np.ones(p),
            randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    active_set_rand = np.asarray([t for t in range(p) if nonzero[t]])
    # BUG FIX: np.bool (removed in NumPy 1.24) -> builtin bool.
    active_rand_bool = np.asarray(
        [(np.in1d(active_set_rand[x], true_set).sum() > 0)
         for x in range(nonzero.sum())], bool)

    sel_MLE = np.zeros(p)
    ind_est = np.zeros(p)
    randomized_lasso_est = np.zeros(p)
    randomized_rel_lasso_est = np.zeros(p)
    MLE_nreport = 0

    if nonzero.sum() > 0:
        target_randomized = beta[nonzero]
        (observed_target, cov_target, cov_target_score,
         alternatives) = full_targets(randomized_lasso.loglike,
                                      randomized_lasso._W, nonzero,
                                      dispersion=dispersion)
        MLE_estimate, _, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        sel_MLE[nonzero] = MLE_estimate
        ind_est[nonzero] = ind_unbiased_estimator
        randomized_lasso_est = randomized_lasso.initial_soln
        randomized_rel_lasso_est = randomized_lasso._beta_full

        cov_MLE, selective_MLE_power = coverage(MLE_intervals, MLE_pval,
                                                target_randomized, beta[nonzero])
        length_MLE = np.mean(MLE_intervals[:, 1] - MLE_intervals[:, 0])
        power_MLE = ((active_rand_bool) * (np.logical_or(
            (0. < MLE_intervals[:, 0]),
            (0. > MLE_intervals[:, 1])))).sum() / float((beta != 0).sum())
        MLE_discoveries = BHfilter(MLE_pval, q=0.1)
        power_MLE_BH = (MLE_discoveries * active_rand_bool).sum() / float(
            (beta != 0).sum())
        fdr_MLE_BH = (MLE_discoveries * ~active_rand_bool).sum() / float(
            max(MLE_discoveries.sum(), 1.))
        bias_MLE = np.mean(MLE_estimate - target_randomized)

        partial_MLE_risk = (MLE_estimate -
                            target_randomized).T.dot(MLE_estimate - target_randomized)
        partial_ind_risk = (ind_unbiased_estimator -
                            target_randomized).T.dot(ind_unbiased_estimator -
                                                     target_randomized)
        partial_randLasso_risk = (
            randomized_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_lasso_est[nonzero] -
                                     target_randomized)
        partial_relrandLasso_risk = (
            randomized_rel_lasso_est[nonzero] -
            target_randomized).T.dot(randomized_rel_lasso_est[nonzero] -
                                     target_randomized)
    else:
        MLE_nreport = 1
        cov_MLE, length_MLE, power_MLE, power_MLE_BH, fdr_MLE_BH, bias_MLE, selective_MLE_power = [
            0., 0., 0., 0., 0., 0., 0.
        ]
        MLE_discoveries = np.zeros(1)
        partial_MLE_risk, partial_ind_risk, partial_randLasso_risk, partial_relrandLasso_risk = [
            0., 0., 0., 0.
        ]

    risks = np.vstack(
        (relative_risk(sel_MLE, beta, Sigma),
         relative_risk(ind_est, beta, Sigma),
         relative_risk(randomized_lasso_est, beta, Sigma),
         relative_risk(randomized_rel_lasso_est, beta, Sigma),
         relative_risk(rel_LASSO, beta, Sigma),
         relative_risk(glm_LASSO, beta, Sigma)))
    partial_risks = np.vstack(
        (partial_MLE_risk, partial_ind_risk, partial_randLasso_risk,
         partial_relrandLasso_risk, partial_relLasso_risk, partial_Lasso_risk))
    naive_inf = np.vstack(
        (cov_naive, length_naive, 0., nactive_LASSO, bias_naive,
         selective_naive_power, power_naive, power_naive_BH, fdr_naive_BH,
         naive_discoveries.sum()))
    Lee_inf = np.vstack(
        (cov_Lee, length_Lee, inf_entries, nactive_LASSO, bias_Lee,
         selective_Lee_power, power_Lee, power_Lee_BH, fdr_Lee_BH,
         Lee_discoveries.sum()))
    # NOTE(review): Liu_inf reuses bias_Lee in its bias slot — looks like a
    # copy-paste carry-over; confirm intent before changing the output layout.
    Liu_inf = np.vstack(
        (cov_Liu, length_Liu, 0., nactive_Liu, bias_Lee, selective_Liu_power,
         power_Liu, power_Liu_BH, fdr_Liu_BH, Liu_discoveries.sum()))
    MLE_inf = np.vstack(
        (cov_MLE, length_MLE, 0., nonzero.sum(), bias_MLE, selective_MLE_power,
         power_MLE, power_MLE_BH, fdr_MLE_BH, MLE_discoveries.sum()))
    nreport = np.vstack((Lee_nreport, Liu_nreport, MLE_nreport))

    return np.vstack(
        (risks, naive_inf, Lee_inf, Liu_inf, MLE_inf, partial_risks, nreport))
def multiple_runs_lasso(n=500, p=100, nval=500, rho=0.35, s=5, beta_type=1,
                        snr=0.20, randomizer_scale=np.sqrt(0.50),
                        full_dispersion=True):
    """Two-stage selective MLE over the union of two randomized LASSO fits vs naive inference.

    Fits randomized LASSOs at the theoretical lambda and at n*lambda.1se,
    runs two-stage selective MLE on the union of their active sets, and
    compares coverage/length/power/FDR with naive post-LASSO OLS inference
    on the union of the glmnet active sets.
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s,
                                            beta_type=beta_type, snr=snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    glm_LASSO_theory, glm_LASSO_1se, glm_LASSO_min, lam_min, lam_1se = glmnet_lasso(
        X, y, lam_theory / float(n))
    active_LASSO_1 = (glm_LASSO_theory != 0)
    active_LASSO_2 = (glm_LASSO_1se != 0)
    active_LASSO = np.logical_or(active_LASSO_1, active_LASSO_2)

    nreport_nonrand = 0.
    if active_LASSO.sum() > 0:
        target_nonrandomized = np.linalg.pinv(X[:, active_LASSO]).dot(X.dot(beta))
        post_LASSO_OLS = np.linalg.pinv(X[:, active_LASSO]).dot(y)
        naive_sd = sigma_ * np.sqrt(
            np.diag((np.linalg.inv(X[:, active_LASSO].T.dot(X[:, active_LASSO])))))
        naive_intervals = np.vstack([post_LASSO_OLS - 1.65 * naive_sd,
                                     post_LASSO_OLS + 1.65 * naive_sd]).T
        naive_pval = 2 * (1. - ndist.cdf(np.abs(post_LASSO_OLS) / naive_sd))
        cov_naive, power_naive = coverage(naive_intervals, naive_pval,
                                          target_nonrandomized,
                                          beta[active_LASSO])
        length_naive = np.mean(naive_intervals[:, 1] - naive_intervals[:, 0])
        # NOTE(review): denominator can be 0 when no p-value falls below 0.1,
        # which raises ZeroDivisionError — behavior preserved; confirm intent.
        fdr_naive = ((naive_pval[beta[active_LASSO] == 0]) < 0.1).sum() / float(
            (naive_pval < 0.1).sum())
    else:
        nreport_nonrand += 1.
        cov_naive, power_naive, length_naive, fdr_naive = [0., 0., 0., 0.]

    randomized_lasso_1 = lasso.gaussian(
        X, y,
        feature_weights=lam_theory * np.ones(p),
        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs_1 = randomized_lasso_1.fit()
    nonzero_1 = signs_1 != 0
    randomized_lasso_2 = lasso.gaussian(
        X, y,
        feature_weights=n * lam_1se * np.ones(p),
        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs_2 = randomized_lasso_2.fit()
    nonzero_2 = signs_2 != 0
    # Union of the two randomized active sets.
    signs = np.logical_or(signs_1, signs_2)
    nonzero = signs != 0
    print("check", nonzero_1.sum(), nonzero_2.sum(), nonzero.sum(),
          active_LASSO.sum())

    nreport = 0.
    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        observed_target = np.linalg.pinv(X[:, nonzero]).dot(y)
        (_, _, cov_target_score_1,
         alternatives_1) = selected_targets(randomized_lasso_1.loglike,
                                            randomized_lasso_1._W, nonzero,
                                            dispersion=dispersion)
        (_, cov_target, cov_target_score_2,
         alternatives_2) = selected_targets(randomized_lasso_2.loglike,
                                            randomized_lasso_2._W, nonzero,
                                            dispersion=dispersion)
        estimate, _, _, pval, intervals, _ = twostage_selective_MLE(
            observed_target, cov_target, cov_target_score_1, cov_target_score_2,
            randomized_lasso_1.observed_opt_state,
            randomized_lasso_2.observed_opt_state,
            randomized_lasso_1.cond_mean, randomized_lasso_2.cond_mean,
            randomized_lasso_1.cond_cov, randomized_lasso_2.cond_cov,
            randomized_lasso_1.logdens_linear, randomized_lasso_2.logdens_linear,
            randomized_lasso_1.con_linear, randomized_lasso_2.con_linear,
            randomized_lasso_1.con_offset, randomized_lasso_2.con_offset,
            solve_args={'tol': 1.e-12}, level=0.9)
        coverage_adjusted, power_adjusted = coverage(intervals, pval,
                                                     target_randomized,
                                                     beta[nonzero])
        length_adjusted = np.mean(intervals[:, 1] - intervals[:, 0])
        fdr_adjusted = ((pval[beta[nonzero] == 0]) < 0.1).sum() / float(
            (pval < 0.1).sum())
    else:
        nreport += 1
        coverage_adjusted, length_adjusted, power_adjusted, fdr_adjusted = [
            0., 0., 0., 0.
        ]

    MLE_inf = np.vstack((coverage_adjusted, length_adjusted, power_adjusted,
                         fdr_adjusted, nonzero.sum()))
    Naive_inf = np.vstack((cov_naive, length_naive, power_naive, fdr_naive,
                           active_LASSO.sum()))
    # BUG FIX: Python-2 print statement ("print MLE_inf, Naive_inf") is a
    # SyntaxError under Python 3; the rest of this module uses print(...).
    print(MLE_inf, Naive_inf)
    return np.vstack((MLE_inf, Naive_inf, nreport, nreport_nonrand))
def pivot(n=500, p=100, nval=500, rho=0., s=5, beta_type=1, snr=0.25,
          randomizer_scale=np.sqrt(1.), full_dispersion=True):
    """Compute MLE and sampler pivots (plus coverage and timing) for one randomized LASSO fit.

    Returns
    -------
    tuple
        (pivot_MLE, sampler_pivot, time_MLE, time_sampler,
         mean MLE coverage, mean sampler coverage).

    Raises
    ------
    ValueError
        If the randomized LASSO selects no variables (previously this path
        crashed with UnboundLocalError on the final return).
    """
    X, y, _, _, Sigma, beta, sigma = sim_xy(n=n, p=p, nval=nval, rho=rho, s=s,
                                            beta_type=beta_type, snr=snr)
    print("snr", snr)
    X -= X.mean(0)[None, :]
    X /= (X.std(0)[None, :] * np.sqrt(n / (n - 1.)))
    y = y - y.mean()

    if full_dispersion:
        dispersion = np.linalg.norm(y - X.dot(np.linalg.pinv(X).dot(y))) ** 2 / (
            n - p)
        sigma_ = np.sqrt(dispersion)
    else:
        dispersion = None
        sigma_ = np.std(y)
    print("estimated and true sigma", sigma, sigma_)

    lam_theory = sigma_ * 1. * np.mean(
        np.fabs(np.dot(X.T, np.random.standard_normal((n, 2000)))).max(0))
    randomized_lasso = lasso.gaussian(
        X, y,
        feature_weights=lam_theory * np.ones(p),
        randomizer_scale=np.sqrt(n) * randomizer_scale * sigma_)
    signs = randomized_lasso.fit()
    nonzero = signs != 0
    sys.stderr.write("active variables selected by randomized LASSO " +
                     str(nonzero.sum()) + "\n" + "\n")

    if nonzero.sum() > 0:
        target_randomized = np.linalg.pinv(X[:, nonzero]).dot(X.dot(beta))
        (observed_target, cov_target, cov_target_score,
         alternatives) = selected_targets(randomized_lasso.loglike,
                                          randomized_lasso._W, nonzero,
                                          dispersion=dispersion)

        toc = time.time()
        MLE_estimate, observed_info_mean, _, MLE_pval, MLE_intervals, ind_unbiased_estimator = randomized_lasso.selective_MLE(
            observed_target, cov_target, cov_target_score, alternatives)
        tic = time.time()
        cov_MLE, _ = coverage(MLE_intervals, MLE_pval, target_randomized,
                              beta[nonzero])
        # Standardized pivot: (estimate - target) / estimated std error.
        pivot_MLE = np.true_divide(MLE_estimate - target_randomized,
                                   np.sqrt(np.diag(observed_info_mean)))
        time_MLE = tic - toc

        toc = time.time()
        sampler_pivot, sampler_pval, sampler_intervals = randomized_lasso.summary(
            observed_target, cov_target, cov_target_score, alternatives,
            level=0.9, compute_intervals=True, ndraw=200000)
        tic = time.time()
        cov_sampler, _ = coverage(sampler_intervals, sampler_pval,
                                  target_randomized, beta[nonzero])
        time_sampler = tic - toc

        # BUG FIX: the original placed this return after the if-block, so a
        # no-selection run crashed with UnboundLocalError on undefined names.
        return pivot_MLE, sampler_pivot, time_MLE, time_sampler, np.mean(
            cov_MLE), np.mean(cov_sampler)

    raise ValueError("randomized LASSO selected no variables; cannot form pivots")