def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):
    """Run one simulated instance with an l1-penalized selection rule.

    A design/response pair is drawn with ``gaussian_instance``; variables are
    selected by solving an l1-penalized quadratic problem on a noisy draw of
    ``X.T.dot(y)``; the result is handed to ``full_model_inference``.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Accepted but not read by this function.
    B
        Forwarded to ``full_model_inference`` as ``B``.

    Returns
    -------
    Whatever ``full_model_inference`` returns.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    gram = X.T.dot(X)
    score = X.T.dot(y)

    # The sampler covariance is built from the nominal noise level sigma**2.
    smooth_sampler = normal_sampler(score, sigma**2 * gram)

    def lasso_selector(gram_matrix, gram_inverse, penalty_weight, sampler):
        # gram_inverse is bound by the partial below but not read here.
        dim = gram_matrix.shape[0]
        quad_loss = rr.quadratic_loss((dim, ), Q=gram_matrix)
        l1_penalty = rr.l1norm(dim, lagrange=penalty_weight)

        perturbed_score = sampler(scale=0.5)
        # Linear term of the objective is minus the noisy score.
        quad_loss.quadratic = rr.identity_quadratic(0, 0, -perturbed_score, 0)
        solution = rr.simple_problem(quad_loss, l1_penalty).solve(max_its=100,
                                                                  tol=1.e-10)
        # Selected variables are the nonzero coordinates of the solution.
        return set(np.nonzero(solution != 0)[0])

    gram_inv = np.linalg.inv(gram)

    # NOTE(review): this residual-based dispersion estimate is computed but
    # nothing below reads it.
    residual = y - X.dot(gram_inv.dot(score))
    dispersion = np.linalg.norm(residual)**2 / (n - p)

    penalty = 4. * np.sqrt(n)
    selection_algorithm = functools.partial(lasso_selector,
                                            gram,
                                            gram_inv,
                                            penalty)

    # run selection algorithm
    network_args = {'epochs': 20,
                    'sizes': [100] * 5,
                    'dropout': 0.,
                    'activation': 'relu'}
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args=network_args)
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):
    """Run one simulated instance with a p-value-filter selection rule.

    A design/response pair is drawn with ``gaussian_instance``. Variables are
    selected by turning the least-squares coefficients into two-sided normal
    p-values and passing them through ``BHfilter`` with ``q=0.2``
    (presumably a Benjamini-Hochberg filter -- confirm against its
    definition). The result is handed to ``full_model_inference``.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Accepted but not read by this function.
    B
        Forwarded to ``full_model_inference`` as ``B``.

    Returns
    -------
    Whatever ``full_model_inference`` returns.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    S = X.T.dot(y)

    # Dispersion is estimated from the full-model least-squares residuals and
    # used both for the sampler covariance and for the z-scores below.
    resid = y - X.dot(XTXi.dot(S))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    covS = dispersion * XTX
    smooth_sampler = normal_sampler(S, covS)

    def meta_algorithm(XTXi, dispersion, sampler):
        # The previous version also built a regreg loss, an l1 penalty and a
        # `success` array here on every call without using them; that dead
        # work (and the unused `lam` argument) has been removed.
        noisy_S = sampler(scale=0.)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)  # two-sided p-values
        return set(BHfilter(pval, q=0.2))

    selection_algorithm = functools.partial(meta_algorithm, XTXi, dispersion)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args={'epochs': 5,
                                          'sizes': [200] * 10,
                                          'dropout': 0.,
                                          'activation': 'relu'})
def simulate(n=1000, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=5000):
    """Run one seeded instance with a noisy-copy comparison selection rule.

    The selection rule regresses a reconstructed response on ``[X, Xnew]``,
    where ``Xnew = rho * X + sqrt(1 - rho**2) * noise`` with ``rho = 0.8``,
    and keeps variable ``j`` when its coefficient is larger in absolute value
    than the coefficient of its noisy copy.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Accepted but not read by this function.
    seed
        Passed to ``np.random.seed`` before the instance is drawn.
    B
        Forwarded to ``full_model_inference`` as ``B``.

    Returns
    -------
    Whatever ``full_model_inference`` returns.
    """

    # description of statistical problem
    np.random.seed(seed)
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False,
                                    center=False)[:3]

    gram = X.T.dot(X)
    score = X.T.dot(y)

    # The sampler covariance is built from the nominal noise level sigma**2.
    smooth_sampler = normal_sampler(score, sigma**2 * gram)

    def copy_comparison(design, gram_inverse, residual, sampler):
        n_features = design.shape[1]
        mix = 0.8

        drawn_score = sampler(scale=0.)
        # Rebuild a response from the drawn score plus the fixed residual;
        # relies on n > p and a non-degenerate design.
        response = design.dot(gram_inverse).dot(drawn_score) + residual

        noise = np.random.standard_normal(design.shape)
        noisy_copy = mix * design + np.sqrt(1 - mix**2) * noise
        stacked = np.hstack([design, noisy_copy])

        magnitudes = np.fabs(np.linalg.pinv(stacked).dot(response))
        beats_copy = magnitudes[:n_features] > magnitudes[n_features:]
        return set(np.nonzero(beats_copy)[0])

    gram_inv = np.linalg.inv(gram)
    resid = y - X.dot(gram_inv.dot(score))

    # NOTE(review): this dispersion estimate is computed but nothing below
    # reads it.
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(copy_comparison, X, gram_inv, resid)

    # run selection algorithm
    network_args = {'epochs': 20,
                    'sizes': [100] * 5,
                    'dropout': 0.,
                    'activation': 'relu'}
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(8, 10),
                                B=B,
                                fit_probability=keras_fit,
                                fit_args=network_args)
def simulate(n=100):
    """Two-dimensional toy example using ``infer_full_target``.

    Draws ``n`` rows of bivariate Gaussian data with mean ``[2, -2] / sqrt(n)``
    and variance 2, builds a sampler around the column sums, runs a
    three-try selection rule, and collects a pivot and interval for every
    selected index.

    Returns
    -------
    pivots, covered, lengths : list, list, list
        One entry per selected index: the pivot, whether the interval
        strictly contains the true mean, and the interval length.
    """

    # description of statistical problem
    truth = np.array([2., -2.]) / np.sqrt(n)
    dispersion = 2

    # Broadcasting adds the mean vector to every row of the noise matrix.
    data = np.sqrt(dispersion) * np.random.standard_normal((n, 2)) + truth
    S = data.sum(axis=0)
    observed_sampler = normal_sampler(S, dispersion * n * np.identity(2))

    threshold = 0.2 * np.sqrt(n) * np.sqrt(dispersion)

    def selection_algorithm(sampler):
        # Three noisy draws; one exceedance is enough to select both indices.
        hits = sum(sampler(scale=0.5).sum() > threshold for _ in range(3))
        return {1, 0} if hits >= 1 else {1}

    # run selection algorithm
    observed_set = selection_algorithm(observed_sampler)

    # find the target, based on the observed outcome
    pivots, covered, lengths = [], [], []
    for idx in observed_set:
        true_target = truth[idx]
        result = infer_full_target(selection_algorithm,
                                   observed_set,
                                   [idx],
                                   observed_sampler,
                                   dispersion,
                                   hypothesis=[true_target],
                                   fit_probability=probit_fit)
        pivot, interval = result[0][:2]
        lower, upper = interval[0], interval[1]

        pivots.append(pivot)
        covered.append((lower < true_target) * (upper > true_target))
        lengths.append(upper - lower)

    return pivots, covered, lengths
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=3000):
    """Run one simulated instance with a ``lasso_glmnet`` selection rule.

    The selection rule draws a noisy score with ``scale=0.5``, rebuilds a
    response from it, fits ``lasso_glmnet`` and returns the first element of
    its ``select()`` output as a set.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Accepted but not read by this function.
    B
        Forwarded to ``full_model_inference`` as ``B``.

    Returns
    -------
    Whatever ``full_model_inference`` returns.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    gram = X.T.dot(X)
    score = X.T.dot(y)

    # The sampler covariance is built from the nominal noise level sigma**2.
    smooth_sampler = normal_sampler(score, sigma**2 * gram)

    def glmnet_selector(design, gram_inverse, residual, sampler):
        drawn_score = sampler(scale=0.5)
        # Rebuild a response from the drawn score plus the fixed residual;
        # relies on n > p and a non-degenerate design.
        response = design.dot(gram_inverse).dot(drawn_score) + residual
        fitted = lasso_glmnet(design, response, None, None, None, None)
        chosen = fitted.select()
        return set(chosen[0])

    gram_inv = np.linalg.inv(gram)
    resid = y - X.dot(gram_inv.dot(score))

    # NOTE(review): this dispersion estimate is computed but nothing below
    # reads it.
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(glmnet_selector, X, gram_inv, resid)

    # run selection algorithm
    return full_model_inference(X,
                                y,
                                truth,
                                selection_algorithm,
                                smooth_sampler,
                                success_params=(1, 1),
                                B=B,
                                fit_probability=gbm_fit_sk,
                                fit_args={'n_estimators': 2000})
def simulate(n=100):
    """Two-dimensional toy example using ``infer_general_target``.

    Draws ``n`` rows of unit-variance bivariate Gaussian data with mean
    ``[2, -2] / sqrt(n)``, builds a sampler around the column means, and runs
    a three-try selection rule. The target is coordinate 0 when the rule
    succeeds and coordinate 1 otherwise.

    Returns
    -------
    pivot, covered, length
        The pivot from ``infer_general_target``, whether the interval
        strictly contains the true target, and the interval length.
    """

    # description of statistical problem
    truth = np.array([2., -2.]) / np.sqrt(n)

    # Broadcasting adds the mean vector to every row of the noise matrix.
    data = np.random.standard_normal((n, 2)) + truth
    S = data.mean(axis=0)
    observed_sampler = normal_sampler(S, np.identity(2) / n)

    threshold = 0.2 / np.sqrt(n)

    def selection_algorithm(sampler):
        # Three noisy draws; the rule succeeds if at least one exceeds the
        # threshold.
        hits = sum(sampler(scale=0.5).sum() > threshold for _ in range(3))
        return hits >= 1

    # run selection algorithm
    observed_outcome = selection_algorithm(observed_sampler)

    # find the target, based on the observed outcome
    coord = 0 if observed_outcome else 1
    true_target = truth[coord]
    observed_target = S[coord]
    target_cov = 1. / n * np.identity(1)
    # Column `coord` of the 2x2 identity, scaled by 1 / n.
    cross_cov = np.eye(2)[coord].reshape((2, 1)) / n

    pivot, interval = infer_general_target(selection_algorithm,
                                           observed_outcome,
                                           observed_sampler,
                                           observed_target,
                                           cross_cov,
                                           target_cov,
                                           hypothesis=true_target,
                                           fit_probability=probit_fit)[:2]

    lower, upper = interval[0], interval[1]
    covered = (lower < true_target) * (upper > true_target)
    return pivot, covered, upper - lower
def simulate(n=400, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, seed=0, B=2000):
    """Run a subsampled ``lasso_glmnet`` selection and add a naive comparison.

    Selection fits ``lasso_glmnet`` on a random subset of 200 rows. When
    ``full_model_inference`` returns a result, the design is regenerated from
    the same seed, 200 rows are set aside at random, and
    ``naive_full_model_inference`` is run on the remaining rows with a freshly
    simulated response; the two result frames are merged on ``'variable'``.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Forwarded to ``naive_full_model_inference``.
    seed
        Passed to ``np.random.seed`` before each call to ``gaussian_instance``.
    B
        Forwarded to ``full_model_inference`` as ``B``.

    Returns
    -------
    The merged frame, or ``None`` when ``full_model_inference`` returns
    ``None``.
    """

    instance_args = dict(n=n,
                         p=p,
                         s=s,
                         equicorrelated=False,
                         rho=0.5,
                         sigma=sigma,
                         signal=signal,
                         random_signs=True,
                         scale=False,
                         center=False)

    # description of statistical problem
    np.random.seed(seed)
    X, y, truth = gaussian_instance(**instance_args)[:3]

    gram = X.T.dot(X)
    score = X.T.dot(y)

    # The sampler covariance is built from the nominal noise level sigma**2.
    smooth_sampler = normal_sampler(score, sigma**2 * gram)

    def subsample_selector(design, gram_inverse, residual, sampler):
        n_rows = design.shape[0]
        # The row subset is drawn before the sampler is called.
        rows = np.random.choice(np.arange(n_rows), 200, replace=False)
        drawn_score = sampler(scale=0.)
        # Rebuild a response from the drawn score plus the fixed residual;
        # relies on n > p and a non-degenerate design.
        response = design.dot(gram_inverse).dot(drawn_score) + residual
        fitted = lasso_glmnet(design[rows], response[rows],
                              None, None, None, None)
        chosen = fitted.select()
        return set(chosen[0])

    gram_inv = np.linalg.inv(gram)
    resid = y - X.dot(gram_inv.dot(score))

    # NOTE(review): this dispersion estimate is computed but nothing below
    # reads it.
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(subsample_selector,
                                            X,
                                            gram_inv,
                                            resid)

    # run selection algorithm
    df = full_model_inference(X,
                              y,
                              truth,
                              selection_algorithm,
                              smooth_sampler,
                              success_params=(1, 1),
                              B=B,
                              fit_probability=keras_fit,
                              fit_args={'epochs': 20,
                                        'sizes': [100] * 5,
                                        'dropout': 0.,
                                        'activation': 'relu'})

    if df is None:
        return df

    observed_set = list(df['variable'])
    # NOTE(review): true_target is computed but not read afterwards.
    true_target = truth[observed_set]

    # Re-seed and draw the instance again with the same arguments.
    np.random.seed(seed)
    X2, _, _ = gaussian_instance(**instance_args)[:3]

    stage_1 = np.random.choice(np.arange(n), 200, replace=False)
    stage_2 = sorted(set(range(n)) - set(stage_1))

    # Second-stage data: held-out rows with a newly simulated response.
    X2 = X2[stage_2]
    n2, p2 = X2.shape
    y2 = X2.dot(truth) + sigma * np.random.standard_normal(n2)

    gram_inv_2 = np.linalg.inv(X2.T.dot(X2))
    resid2 = y2 - X2.dot(gram_inv_2.dot(X2.T.dot(y2)))
    dispersion_2 = np.linalg.norm(resid2)**2 / (n2 - p2)

    naive_df = naive_full_model_inference(X2,
                                          y2,
                                          dispersion_2,
                                          observed_set,
                                          alpha=alpha)
    return pd.merge(df, naive_df, on='variable')
def simulate(n=1000, p=60, s=15, signal=3, sigma=2, alpha=0.1):
    """Run repeated z-score thresholding with a split sampler.

    The selection rule draws seven noisy scores with ``scale=0.5``, converts
    each to z-scores, and keeps variables whose ``|z|`` exceeds 2 in at least
    three draws. ``infer_full_target`` is then called once per selected
    variable.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Forwarded to ``infer_full_target`` and used for the naive interval
        length.

    Returns
    -------
    pivots, covered, lengths, naive_lengths : list, list, list, list
        One entry per selected variable.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True)[:3]

    gram = X.T.dot(X)
    score = X.T.dot(y)
    covS = sigma**2 * gram

    # NOTE(review): smooth_sampler is constructed but not read afterwards.
    smooth_sampler = normal_sampler(score, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS / n)

    def repeated_threshold(gram_matrix, gram_inverse, noise_level, sampler):
        # gram_matrix is only used for its leading dimension.
        min_success, ntries = 3, 7
        scale = 0.5
        frac = 1. / (scale**2 + 1.)
        z_denominator = np.sqrt(noise_level * np.diag(gram_inverse) * frac)

        exceed_count = np.zeros(gram_matrix.shape[0])
        for _ in range(ntries):
            noisy_beta = gram_inverse.dot(sampler(scale=scale))
            exceed_count += np.fabs(noisy_beta / z_denominator) > 2
        return set(np.nonzero(exceed_count >= min_success)[0])

    gram_inv = np.linalg.inv(gram)
    resid = y - X.dot(gram_inv.dot(score))

    # From here on dispersion is the residual-based estimate.
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(repeated_threshold,
                                            gram,
                                            gram_inv,
                                            dispersion)

    # run selection algorithm
    observed_set = selection_algorithm(splitting_sampler)

    # find the target, based on the observed outcome
    naive_width_factor = 2 * ndist.ppf(1 - 0.5 * alpha)

    pivots, covered, lengths, naive_lengths = [], [], [], []
    for idx in observed_set:
        print(idx, len(observed_set))
        true_target = truth[idx]

        result = infer_full_target(selection_algorithm,
                                   observed_set,
                                   [idx],
                                   splitting_sampler,
                                   dispersion,
                                   hypothesis=[true_target],
                                   fit_probability=probit_fit,
                                   success_params=(1, 1),
                                   alpha=alpha,
                                   B=1000)
        pivot, interval = result[0][:2]
        lower, upper = interval[0], interval[1]

        pivots.append(pivot)
        covered.append((lower < true_target) * (upper > true_target))
        lengths.append(upper - lower)

        target_sd = np.sqrt(dispersion * gram_inv[idx, idx])
        naive_lengths.append(naive_width_factor * target_sd)

    return pivots, covered, lengths, naive_lengths
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=2000):
    """Infer one randomly chosen target after ``lasso_glmnet`` selection.

    Selection is run through ``repeat_selection`` with the smooth sampler.
    One selected variable is picked at random; ``infer_full_target`` is run
    for it with the split sampler, and naive normal-theory quantities are
    computed alongside for comparison.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Level for the selective and naive intervals.
    B
        Forwarded to ``infer_full_target`` as ``B``.

    Returns
    -------
    pandas.DataFrame or None
        One row per inferred target with columns ``pivot``, ``target``,
        ``pvalue``, ``coverage``, ``length``, ``naive_pivot``,
        ``naive_pvalue``, ``naive_coverage``, ``naive_length``, ``upper``,
        ``lower``. ``None`` when nothing was selected.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    # The sampler covariances are built from the nominal noise level.
    dispersion = sigma**2

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(X, XTXi, resid, sampler):
        S = sampler(scale=0.)  # deterministic with scale=0
        # will be ok for n>p and non-degen X
        ynew = X.dot(XTXi).dot(S) + resid
        G = lasso_glmnet(X, ynew, *[None] * 4)
        select = G.select()
        return set(list(select[0]))

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))

    # From here on dispersion is the residual-based estimate.
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    selection_algorithm = functools.partial(meta_algorithm, X, XTXi, resid)

    # run selection algorithm
    success_params = (1, 1)
    observed_set = repeat_selection(selection_algorithm,
                                    smooth_sampler,
                                    *success_params)

    # find the target, based on the observed outcome
    # we just take one selected variable, chosen at random
    pivots, covered, lengths, pvalues = [], [], [], []
    lower, upper = [], []
    naive_pvalues, naive_pivots, naive_covered, naive_lengths = [], [], [], []
    targets = []

    observed_list = sorted(observed_set)
    np.random.shuffle(observed_list)

    quantile = ndist.ppf(1 - 0.5 * alpha)

    for idx in observed_list[:1]:
        print("variable: ", idx, "total selected: ", len(observed_set))
        true_target = [truth[idx]]
        targets.extend(true_target)

        (pivot,
         interval,
         pvalue) = infer_full_target(selection_algorithm,
                                     observed_set,
                                     [idx],
                                     splitting_sampler,
                                     dispersion,
                                     hypothesis=true_target,
                                     fit_probability=probit_fit,
                                     success_params=success_params,
                                     alpha=alpha,
                                     B=B,
                                     single=True)[0][:3]

        pvalues.append(pvalue)
        pivots.append(pivot)
        covered.append((interval[0] < true_target[0]) *
                       (interval[1] > true_target[0]))
        lengths.append(interval[1] - interval[0])

        # Naive (unadjusted) normal-theory quantities for the same target.
        target_sd = np.sqrt(dispersion * XTXi[idx, idx])
        observed_target = np.squeeze(XTXi[idx].dot(X.T.dot(y)))
        naive_interval = (observed_target - quantile * target_sd,
                          observed_target + quantile * target_sd)

        naive_pivot = 1 - ndist.cdf((observed_target - true_target[0]) /
                                    target_sd)
        naive_pivot = 2 * min(naive_pivot, 1 - naive_pivot)
        naive_pivots.append(naive_pivot)

        # Two-sided p-value for H0: target == 0. The previous version folded
        # `naive_pivot` here by mistake, so the p-value duplicated the pivot.
        naive_pvalue = 1 - ndist.cdf(observed_target / target_sd)
        naive_pvalue = 2 * min(naive_pvalue, 1 - naive_pvalue)
        naive_pvalues.append(naive_pvalue)

        naive_covered.append((naive_interval[0] < true_target[0]) *
                             (naive_interval[1] > true_target[0]))
        naive_lengths.append(naive_interval[1] - naive_interval[0])

        lower.append(interval[0])
        upper.append(interval[1])

    if len(pvalues) > 0:
        return pd.DataFrame({'pivot': pivots,
                             'target': targets,
                             'pvalue': pvalues,
                             'coverage': covered,
                             'length': lengths,
                             'naive_pivot': naive_pivots,
                             'naive_pvalue': naive_pvalues,
                             'naive_coverage': naive_covered,
                             'naive_length': naive_lengths,
                             'upper': upper,
                             'lower': lower})

    # Nothing was selected, so there is nothing to report.
    return None
def simulate(n=200, p=100, s=10, signal=(0.5, 1), sigma=2, alpha=0.1, B=1000):
    """Infer one randomly chosen target after p-value-filter selection.

    Variables are selected by passing two-sided normal p-values of the
    least-squares coefficients through ``BHfilter`` with ``q=0.2``
    (presumably a Benjamini-Hochberg filter -- confirm against its
    definition). One selected variable is picked at random;
    ``infer_full_target`` is run for it with the split sampler, and naive
    normal-theory quantities are computed alongside for comparison.

    Parameters
    ----------
    n, p, s, signal, sigma
        Forwarded to ``gaussian_instance``.
    alpha
        Level for the selective and naive intervals.
    B
        Forwarded to ``infer_full_target`` as ``B`` and recorded in the
        ``batch_size`` column.

    Returns
    -------
    pandas.DataFrame or None
        One row per inferred target with columns ``pivot``, ``pvalue``,
        ``coverage``, ``length``, ``naive_pivot``, ``naive_pvalue``,
        ``naive_coverage``, ``naive_length``, ``upper``, ``lower``,
        ``targets``, ``batch_size``. ``None`` when nothing was selected.
    """

    # description of statistical problem
    X, y, truth = gaussian_instance(n=n,
                                    p=p,
                                    s=s,
                                    equicorrelated=False,
                                    rho=0.5,
                                    sigma=sigma,
                                    signal=signal,
                                    random_signs=True,
                                    scale=False)[:3]

    XTX = X.T.dot(X)
    XTXi = np.linalg.inv(XTX)
    resid = y - X.dot(XTXi.dot(X.T.dot(y)))
    dispersion = np.linalg.norm(resid)**2 / (n - p)

    S = X.T.dot(y)
    covS = dispersion * X.T.dot(X)
    smooth_sampler = normal_sampler(S, covS)
    splitting_sampler = split_sampler(X * y[:, None], covS)

    def meta_algorithm(XTX, XTXi, dispersion, sampler):
        # XTX is bound by the partial below but not read here.
        noisy_S = sampler(scale=0.)
        soln = XTXi.dot(noisy_S)
        solnZ = soln / (np.sqrt(np.diag(XTXi)) * np.sqrt(dispersion))
        pval = ndist.cdf(solnZ)
        pval = 2 * np.minimum(pval, 1 - pval)  # two-sided p-values
        return set(BHfilter(pval, q=0.2))

    selection_algorithm = functools.partial(meta_algorithm,
                                            XTX,
                                            XTXi,
                                            dispersion)

    # run selection algorithm
    success_params = (1, 1)
    observed_set = repeat_selection(selection_algorithm,
                                    smooth_sampler,
                                    *success_params)

    # find the target, based on the observed outcome
    # we just take one selected variable, chosen at random
    idx = sorted(observed_set)
    np.random.shuffle(idx)
    idx = idx[:1]

    if len(idx) == 0:
        # Nothing was selected, so there is nothing to report.
        return None

    print("variable: ", idx, "total selected: ", len(observed_set))
    true_target = truth[idx]

    results = infer_full_target(selection_algorithm,
                                observed_set,
                                idx,
                                splitting_sampler,
                                dispersion,
                                hypothesis=true_target,
                                fit_probability=logit_fit,
                                fit_args={'df': 20},
                                success_params=success_params,
                                alpha=alpha,
                                B=B,
                                single=True)

    # Each result is indexed as (pivot, interval, pvalue, ...).
    pivots = [r[0] for r in results]
    pvalues = [r[2] for r in results]
    lower = [r[1][0] for r in results]
    upper = [r[1][1] for r in results]
    lengths = np.array(upper) - np.array(lower)
    covered = [(r[1][0] < t) * (r[1][1] > t)
               for r, t in zip(results, true_target)]

    # Naive (unadjusted) normal-theory quantities for the same targets.
    target_sd = np.sqrt(np.diag(dispersion * XTXi)[idx])
    observed_target = XTXi[idx].dot(X.T.dot(y))
    quantile = ndist.ppf(1 - 0.5 * alpha)
    naive_interval = np.vstack([observed_target - quantile * target_sd,
                                observed_target + quantile * target_sd])

    naive_pivots = 1 - ndist.cdf((observed_target - true_target) / target_sd)
    naive_pivots = 2 * np.minimum(naive_pivots, 1 - naive_pivots)

    naive_pvalues = 1 - ndist.cdf(observed_target / target_sd)
    naive_pvalues = 2 * np.minimum(naive_pvalues, 1 - naive_pvalues)

    naive_covered = ((naive_interval[0] < true_target) *
                     (naive_interval[1] > true_target))
    naive_lengths = naive_interval[1] - naive_interval[0]

    return pd.DataFrame({'pivot': pivots,
                         'pvalue': pvalues,
                         'coverage': covered,
                         'length': lengths,
                         'naive_pivot': naive_pivots,
                         'naive_pvalue': naive_pvalues,
                         'naive_coverage': naive_covered,
                         'naive_length': naive_lengths,
                         'upper': upper,
                         'lower': lower,
                         'targets': true_target,
                         # `np.int` was removed in NumPy 1.24; the builtin
                         # `int` is what that alias pointed to.
                         'batch_size': B * np.ones(len(idx), int)})