def test_subset(k=10, ndraw=5000, burnin=5000):
    """Smoke-test forward stepwise when only a subset of the cases is used.

    A random correlated design with 100 rows and 200 columns is generated,
    the last 10 rows are masked out through ``subset``, ``k`` steps are
    taken, and pivots for the third model are printed. The run is then
    repeated without passing a covariance to ``forward_step``.

    Parameters
    ----------
    k : int
        Number of forward stepwise steps to take.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots`` for the non-saturated pivots.
    """
    n, p = 100, 200
    X = (np.random.standard_normal((n, p))
         + 0.4 * np.random.standard_normal(n)[:, None])
    X /= (X.std(0)[None, :] * np.sqrt(n))
    Y = np.random.standard_normal(n) * 0.5

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` gives the
    # same boolean dtype on every NumPy version.
    subset = np.ones(n, bool)
    subset[-10:] = False

    FS = forward_step(X, Y, subset=subset,
                      covariance=0.5**2 * np.identity(n))
    for _ in range(k):
        FS.step()

    print('first %s variables selected' % k, FS.variables)
    print('pivots for last variable of 3rd selected model knowing that we performed %d steps of forward stepwise' % k)
    print(FS.model_pivots(3, saturated=True))
    print(FS.model_pivots(3, saturated=False, which_var=[FS.variables[2]],
                          burnin=burnin, ndraw=ndraw))

    # Same experiment, this time without supplying a covariance.
    FS = forward_step(X, Y, subset=subset)
    for _ in range(k):
        FS.step()
    print(FS.model_pivots(3, saturated=False, which_var=[FS.variables[2]],
                          burnin=burnin, ndraw=ndraw))
def test_subset(k=10, ndraw=5000, burnin=5000, nsim=None):
    """Smoke-test forward stepwise when only a subset of the cases is used.

    A random correlated design with 100 rows and 200 columns is generated,
    the last 10 rows are masked out through ``subset``, ``k`` steps are
    taken, and pivots for the third model are printed. The run is then
    repeated without passing a covariance to ``forward_step``.

    Parameters
    ----------
    k : int
        Number of forward stepwise steps to take.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots`` for the non-saturated pivots.
    nsim : optional
        Accepted for signature compatibility; not used in the body.
    """
    n, p = 100, 200
    X = (np.random.standard_normal((n, p))
         + 0.4 * np.random.standard_normal(n)[:, None])
    X /= (X.std(0)[None, :] * np.sqrt(n))
    Y = np.random.standard_normal(n) * 0.5

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` gives the
    # same boolean dtype on every NumPy version.
    subset = np.ones(n, bool)
    subset[-10:] = False

    FS = forward_step(X, Y, subset=subset,
                      covariance=0.5**2 * np.identity(n))
    for _ in range(k):
        FS.next()

    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3; the printed text is unchanged.
    print('first %s variables selected' % k, FS.variables)
    print('pivots for last variable of 3rd selected model knowing that we performed %d steps of forward stepwise' % k)
    print(FS.model_pivots(3, saturated=True))
    print(FS.model_pivots(3, saturated=False, which_var=[FS.variables[2]],
                          burnin=burnin, ndraw=ndraw))

    # Same experiment, this time without supplying a covariance.
    FS = forward_step(X, Y, subset=subset)
    for _ in range(k):
        FS.next()
    print(FS.model_pivots(3, saturated=False, which_var=[FS.variables[2]],
                          burnin=burnin, ndraw=ndraw))
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, signal=5, ndraw=None,
                    burnin=2000, nstep=200, method='serial'):
    """Run the MCMC test once with each of the "serial" and "parallel" methods.

    Forward stepwise is advanced until the true active set has been
    contained in the selected variables on four separate steps. On the
    following step ``forward_mod.mcmc_test`` is called twice: with
    ``method="serial"`` on ``FS.variables[i-2]`` and with
    ``method="parallel"`` on ``FS.variables[0]``.

    ``ndraw`` and ``method`` are accepted but not read by the body.

    Returns
    -------
    (null_rank, alt_rank)
        Both are ``None`` if the loop finishes before the tests are reached.
    """
    X, y, beta, active, sigma, _ = gaussian_instance(
        n=n, p=p, signal=signal, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    steps_left = 4
    null_rank = alt_rank = None

    for idx in range(min(n, p)):
        FS.step()

        if steps_left > 0:
            # Still counting down the steps on which the truth is screened.
            if set(active).issubset(FS.variables):
                steps_left -= 1
            continue

        model_size = idx + 1
        null_rank = forward_mod.mcmc_test(FS, model_size,
                                          variable=FS.variables[idx - 2],
                                          nstep=nstep,
                                          burnin=burnin,
                                          method="serial")
        alt_rank = forward_mod.mcmc_test(FS, model_size,
                                         variable=FS.variables[0],
                                         burnin=burnin,
                                         nstep=nstep,
                                         method="parallel")
        break

    return null_rank, alt_rank
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, signal=5,
                                ndraw=None, burnin=2000, nstep=200,
                                method='serial'):
    """Collect MCMC test ranks for variables added after the truth is screened.

    Forward stepwise is advanced one step at a time. Once the true active
    set has been seen inside the selected variables, each later step (while
    the countdown of four is still positive) runs ``forward_mod.mcmc_test``
    with ``method="serial"`` on the most recently added variable.

    ``ndraw`` and ``method`` are accepted but not read by the body.

    Returns
    -------
    tuple of int
        The ranks, in the order they were computed.
    """
    X, y, beta, active, sigma, _ = gaussian_instance(
        n=n, p=p, signal=signal, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    steps_left = 4
    screened = False
    ranks = []

    for idx in range(min(n, p)):
        FS.step()

        if steps_left <= 0:
            break

        if screened:
            rank = forward_mod.mcmc_test(FS, idx + 1,
                                         variable=FS.variables[-1],
                                         nstep=nstep,
                                         burnin=burnin,
                                         method="serial")
            ranks.append(int(rank))

        if set(active).issubset(FS.variables):
            steps_left -= 1
            screened = True

    return tuple(ranks)
def test_FS_unknown(k=10, ndraw=5000, burnin=5000):
    """Smoke-test forward stepwise when no covariance is passed.

    Builds a random correlated 100 x 200 design, takes ``k`` steps and
    prints the selected variables together with the non-saturated pivot
    for the third selected variable.

    Parameters
    ----------
    k : int
        Number of steps to take.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots``.
    """
    n, p = 100, 200
    independent = np.random.standard_normal((n, p))
    shared = np.random.standard_normal(n)
    X = independent + 0.4 * shared[:, None]
    column_scale = X.std(0)[None, :] * np.sqrt(n)
    X /= column_scale
    Y = np.random.standard_normal(100) * 0.5

    FS = forward_step(X, Y)
    for _ in range(k):
        FS.next()

    print('first %s variables selected' % k, FS.variables)
    print('pivots for last variable of 3rd selected model knowing that we performed %d steps of forward stepwise' % k)

    third_variable = [FS.variables[2]]
    pivots = FS.model_pivots(3, saturated=False, which_var=third_variable,
                             burnin=burnin, ndraw=ndraw)
    print(pivots)
def test_independence_null_mcmc(n=100, p=40, s=4, rho=0.5, snr=5,
                                ndraw=None, burnin=2000, nsim=None,
                                nstep=200, method='serial'):
    """Collect MCMC test ranks for variables added after the truth is screened.

    Forward stepwise is advanced one step at a time. Once the true active
    set has been seen inside the selected variables, each later step (while
    the countdown of four is still positive) runs ``FS.mcmc_test`` with
    ``method="serial"`` on the most recently added variable.

    ``ndraw``, ``nsim`` and ``method`` are accepted but not read by the body.

    Returns
    -------
    tuple of int
        The ranks, in the order they were computed.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    steps_left = 4
    screened = False
    ranks = []

    for idx in range(min(n, p)):
        FS.next()

        if steps_left <= 0:
            break

        if screened:
            rank = FS.mcmc_test(idx + 1,
                                variable=FS.variables[-1],
                                nstep=nstep,
                                burnin=burnin,
                                method="serial")
            ranks.append(int(rank))

        if set(active).issubset(FS.variables):
            steps_left -= 1
            screened = True

    return tuple(ranks)
def test_mcmc_tests(n=100, p=40, s=4, rho=0.3, snr=5, ndraw=None,
                    burnin=2000, nsim=None, nstep=200, method='serial'):
    """Run the MCMC test once with each of the "serial" and "parallel" methods.

    Forward stepwise is advanced until the true active set has been
    contained in the selected variables on four separate steps. On the
    following step ``FS.mcmc_test`` is called twice: with
    ``method="serial"`` on ``FS.variables[i-2]`` and with
    ``method="parallel"`` on ``FS.variables[0]``.

    ``ndraw``, ``nsim`` and ``method`` are accepted but not read by the body.

    Returns
    -------
    (null_rank, alt_rank)
        Both are ``None`` if the loop finishes before the tests are reached.
    """
    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho, s=s)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    steps_left = 4
    null_rank = alt_rank = None

    for idx in range(min(n, p)):
        FS.next()

        if steps_left > 0:
            # Still counting down the steps on which the truth is screened.
            if set(active).issubset(FS.variables):
                steps_left -= 1
            continue

        model_size = idx + 1
        null_rank = FS.mcmc_test(model_size,
                                 variable=FS.variables[idx - 2],
                                 nstep=nstep,
                                 burnin=burnin,
                                 method="serial")
        alt_rank = FS.mcmc_test(model_size,
                                variable=FS.variables[0],
                                burnin=burnin,
                                nstep=nstep,
                                method="parallel")
        break

    return null_rank, alt_rank
def test_full_pvals(n=100, p=40, rho=0.3, snr=4, ndraw=8000, burnin=2000,
                    nsim=None):
    """Compute selective, saturated and nominal p-values along the full path.

    At every forward stepwise step the newest variable gets three two-sided
    p-values: the non-saturated (sampled) pivot, the saturated pivot, and a
    nominal normal p-value from the least-squares coefficient of that
    variable in the current model, using the known ``sigma``.

    Parameters
    ----------
    n, p, rho, snr
        Forwarded to ``instance`` to generate the data.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots`` for the non-saturated pivot.
    nsim : optional
        Accepted for signature compatibility; not used in the body.

    Returns
    -------
    X, y, beta, active, sigma
        The generated instance.
    pval : np.ndarray
        One row per step:
        ``(variable, selective, saturated, nominal)``.
    completion_index : int or None
        Number of steps after which every variable in ``active`` had been
        selected, or ``None`` if that never happened.
    """
    from scipy.stats import norm as ndist

    X, y, beta, active, sigma = instance(n=n, p=p, snr=snr, rho=rho)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    pval = []
    truth = set(active)
    selected = set()
    # Initialised so the return cannot hit an unbound name when the active
    # set is never fully selected (e.g. when n < p cuts the path short).
    completion_index = None

    for i in range(min(n, p)):
        FS.next()
        var_select, pval_select = FS.model_pivots(
            i + 1, alternative='twosided',
            which_var=[FS.variables[-1]],
            saturated=False,
            burnin=burnin,
            ndraw=ndraw)[0]
        pval_saturated = FS.model_pivots(
            i + 1, alternative='twosided',
            which_var=[FS.variables[-1]],
            saturated=True)[0][1]

        # Nominal p-value: the last row of the pseudo-inverse is the linear
        # functional giving the newest variable's least-squares coefficient.
        LSfunc = np.linalg.pinv(FS.X[:, FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        # Track the selected variables incrementally instead of rebuilding
        # an array from `pval` on every iteration.
        selected.add(var_select)
        if completion_index is None and truth.issubset(selected):
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
def test_FS(k=10, ndraw=5000, burnin=5000):
    """Smoke-test forward stepwise with a known covariance.

    Builds a random correlated 100 x 200 design, takes ``k`` steps (asking
    each step to compute a p-value) and prints the selected variables, the
    pivots of the third model and the result of ``FS.model_quadratic(3)``.

    Parameters
    ----------
    k : int
        Number of steps to take.
    ndraw, burnin : int
        Forwarded to the non-saturated ``FS.model_pivots`` call.
    """
    n, p = 100, 200
    independent = np.random.standard_normal((n, p))
    shared = np.random.standard_normal(n)
    X = independent + 0.4 * shared[:, None]
    column_scale = X.std(0)[None, :] * np.sqrt(n)
    X /= column_scale
    Y = np.random.standard_normal(100) * 0.5

    noise_cov = 0.5**2 * np.identity(n)
    FS = forward_step(X, Y, covariance=noise_cov)
    for _ in range(k):
        FS.next(compute_pval=True)

    print('first %s variables selected' % k, FS.variables)
    print('pivots for 3rd selected model knowing that we performed %d steps of forward stepwise' % k)

    saturated_pivots = FS.model_pivots(3)
    print(saturated_pivots)

    third_variable = [FS.variables[2]]
    selected_pivots = FS.model_pivots(3, saturated=False,
                                      which_var=third_variable,
                                      burnin=burnin, ndraw=ndraw)
    print(selected_pivots)

    quadratic = FS.model_quadratic(3)
    print(quadratic)
def test_full_pvals(n=100, p=40, rho=0.3, signal=4, ndraw=8000, burnin=2000):
    """Compute selective, saturated and nominal p-values along the full path.

    At every forward stepwise step the newest variable gets three two-sided
    p-values: the non-saturated (sampled) pivot, the saturated pivot, and a
    nominal normal p-value from the least-squares coefficient of that
    variable in the current model, using the known ``sigma``.

    Parameters
    ----------
    n, p, rho, signal
        Forwarded to ``gaussian_instance`` to generate the data.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots`` for the non-saturated pivot.

    Returns
    -------
    X, y, beta, active, sigma
        The generated instance.
    pval : np.ndarray
        One row per step:
        ``(variable, selective, saturated, nominal)``.
    completion_index : int or None
        Number of steps after which every variable in ``active`` had been
        selected, or ``None`` if that never happened.
    """
    from scipy.stats import norm as ndist

    X, y, beta, active, sigma, _ = gaussian_instance(
        n=n, p=p, signal=signal, rho=rho)
    FS = forward_step(X, y, covariance=sigma**2 * np.identity(n))

    pval = []
    truth = set(active)
    selected = set()
    # Initialised so the return cannot hit an unbound name when the active
    # set is never fully selected (e.g. when n < p cuts the path short).
    completion_index = None

    for i in range(min(n, p)):
        FS.step()
        var_select, pval_select = FS.model_pivots(
            i + 1, alternative='twosided',
            which_var=[FS.variables[-1]],
            saturated=False,
            burnin=burnin,
            ndraw=ndraw)[0]
        pval_saturated = FS.model_pivots(
            i + 1, alternative='twosided',
            which_var=[FS.variables[-1]],
            saturated=True)[0][1]

        # Nominal p-value: the last row of the pseudo-inverse is the linear
        # functional giving the newest variable's least-squares coefficient.
        LSfunc = np.linalg.pinv(FS.X[:, FS.variables])
        Z = np.dot(LSfunc[-1], FS.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)
        pval_nominal = 2 * ndist.sf(np.fabs(Z))
        pval.append((var_select, pval_select, pval_saturated, pval_nominal))

        # Track the selected variables incrementally instead of rebuilding
        # an array from `pval` on every iteration.
        selected.add(var_select)
        if completion_index is None and truth.issubset(selected):
            completion_index = i + 1

    return X, y, beta, active, sigma, np.array(pval), completion_index
def test_forward_step():
    """
    Check that forward step results agree with R

    The R package ``selectiveInference`` is run through ``rpy`` on a fixed
    simulated data set; its selected variables and sequential p-values are
    compared with the ones computed here, one step at a time.
    """
    tol = 1.e-5

    # The script needs one statement per line: on a single line the `#`
    # comments would swallow every statement that follows them.
    R_code = """

    library(selectiveInference)
    set.seed(33)
    n = 50
    p = 10
    sigma = 1.1
    x = matrix(rnorm(n*p),n,p)
    beta = c(3,2,rep(0,p-2))
    y = x%*%beta + sigma*rnorm(n)

    # run forward stepwise
    fsfit = fs(x,y)
    beta_hat = fsfit$beta

    # compute sequential p-values and confidence intervals
    out.seq = fsInf(fsfit,sigma=sigma)
    vars = out.seq$vars
    pval = out.seq$pv
    vlo = out.seq$vlo
    vup = out.seq$vup
    """

    rpy.r(R_code)
    R_pvals = np.asarray(rpy.r('pval'))
    sigma = float(np.asarray(rpy.r('sigma')))
    selected_vars = np.asarray(rpy.r('vars'))
    y = np.asarray(rpy.r('y'))
    x = np.asarray(rpy.r('x'))

    # Center the response and the columns of the design before fitting.
    y = y.reshape(-1)
    y -= y.mean()
    x -= x.mean(0)[None, :]

    vlo = np.asarray(rpy.r('vlo'))
    vup = np.asarray(rpy.r('vup'))
    print(np.vstack([vlo, vup]).T)

    FS = forward_step(x, y, covariance=sigma**2 * np.identity(y.shape[0]))
    steps = []
    for i in range(x.shape[1]):
        FS.step()
        # Pivot for the variable added at this step only.
        steps.extend(FS.model_pivots(i + 1,
                                     which_var=FS.variables[-1:],
                                     alternative='onesided'))

    # R reports 1-based variable indices.
    python_vars = [i + 1 for i, p in steps]
    python_pvals = [p for i, p in steps]

    print(selected_vars, python_vars)
    print(FS.variables, FS.signs)

    np.testing.assert_array_equal(selected_vars, python_vars)
    np.testing.assert_allclose(python_pvals, R_pvals, atol=tol, rtol=tol)
def simulate_null(saturated=True, ndraw=8000, burnin=2000):
    """Draw one set of pivots for the third model under a pure-noise response.

    A random correlated 100 x 40 design and an independent Gaussian
    response are generated, five forward stepwise steps are taken and the
    pivots of the model of size three are computed.

    Parameters
    ----------
    saturated : bool
        Forwarded to ``FS.model_pivots``.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots``.

    Returns
    -------
    list
        The last entry of each item returned by ``FS.model_pivots``.
    """
    n, p = 100, 40
    independent = np.random.standard_normal((n, p))
    shared = np.random.standard_normal(n)
    X = independent + 0.4 * shared[:, None]
    column_scale = X.std(0)[None, :] * np.sqrt(n)
    X /= column_scale
    Y = np.random.standard_normal(100) * 0.5

    FS = forward_step(X, Y, covariance=0.5**2 * np.identity(n))
    for _ in range(5):
        FS.step()

    pivots = FS.model_pivots(3, saturated=saturated, ndraw=ndraw,
                             burnin=burnin)
    return [entry[-1] for entry in pivots]
def simulate_null(saturated=True, ndraw=8000, burnin=2000):
    """Draw one set of pivots for the third model under a pure-noise response.

    A random correlated 100 x 40 design and an independent Gaussian
    response are generated, five forward stepwise steps are taken and the
    pivots of the model of size three are computed.

    Parameters
    ----------
    saturated : bool
        Forwarded to ``FS.model_pivots``.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots``.

    Returns
    -------
    list
        The last entry of each item returned by ``FS.model_pivots``.
    """
    n, p = 100, 40
    independent = np.random.standard_normal((n, p))
    shared = np.random.standard_normal(n)
    X = independent + 0.4 * shared[:, None]
    column_scale = X.std(0)[None, :] * np.sqrt(n)
    X /= column_scale
    Y = np.random.standard_normal(100) * 0.5

    FS = forward_step(X, Y, covariance=0.5**2 * np.identity(n))
    for _ in range(5):
        FS.next()

    pivots = FS.model_pivots(3, saturated=saturated, ndraw=ndraw,
                             burnin=burnin)
    return [entry[-1] for entry in pivots]
def test_FS_unknown(k=10, ndraw=5000, burnin=5000, nsim=None):
    """Smoke-test forward stepwise when no covariance is passed.

    Builds a random correlated 100 x 200 design, takes ``k`` steps and
    prints the selected variables together with the non-saturated pivot
    for the third selected variable.

    Parameters
    ----------
    k : int
        Number of steps to take.
    ndraw, burnin : int
        Forwarded to ``FS.model_pivots``.
    nsim : optional
        Accepted for signature compatibility; not used in the body.
    """
    n, p = 100, 200
    X = (np.random.standard_normal((n, p))
         + 0.4 * np.random.standard_normal(n)[:, None])
    X /= (X.std(0)[None, :] * np.sqrt(n))
    Y = np.random.standard_normal(n) * 0.5

    FS = forward_step(X, Y)
    for _ in range(k):
        FS.next()

    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3; the printed text is unchanged.
    print('first %s variables selected' % k, FS.variables)
    print('pivots for last variable of 3rd selected model knowing that we performed %d steps of forward stepwise' % k)
    print(FS.model_pivots(3, saturated=False, which_var=[FS.variables[2]],
                          burnin=burnin, ndraw=ndraw))
def test_forward_step_all():
    """Check that the p-values of all variables in the 5-step model agree with R.

    The R package ``selectiveInference`` is run through ``rpy`` with
    ``type='all', k=5``; its selected variables and p-values are compared
    with ``FS.model_pivots`` for the model of size five.
    """
    tol = 1.e-5
    # Must match `k=5` in the R script below.
    nsteps = 5

    # The script needs one statement per line: on a single line the `#`
    # comments would swallow every statement that follows them.
    R_code = """

    library(selectiveInference)
    set.seed(33)
    n = 50
    p = 10
    sigma = 1.1
    x = matrix(rnorm(n*p),n,p)
    beta = c(3,2,rep(0,p-2))
    y = x%*%beta + sigma*rnorm(n)

    # run forward stepwise
    fsfit = fs(x,y)
    beta_hat = fsfit$beta

    # compute sequential p-values and confidence intervals
    out.seq = fsInf(fsfit,sigma=sigma, type='all', k=5)
    vars = out.seq$vars
    pval = out.seq$pv
    """

    rpy.r(R_code)
    R_pvals = np.asarray(rpy.r('pval'))
    sigma = float(np.asarray(rpy.r('sigma')))
    selected_vars = np.asarray(rpy.r('vars'))
    y = np.asarray(rpy.r('y'))
    x = np.asarray(rpy.r('x'))

    # Center the response and the columns of the design before fitting.
    y = y.reshape(-1)
    y -= y.mean()
    x -= x.mean(0)[None, :]

    FS = forward_step(x, y, covariance=sigma**2 * np.identity(y.shape[0]))
    for _ in range(nsteps):
        FS.next()
    steps = FS.model_pivots(nsteps, alternative='onesided')

    # R reports 1-based variable indices.
    python_vars = [i + 1 for i, p in steps]
    python_pvals = [p for i, p in steps]

    np.testing.assert_array_equal(selected_vars, python_vars)
    np.testing.assert_allclose(python_pvals, R_pvals, atol=tol, rtol=tol)

    print(R_pvals, python_pvals)
def test_forward_step():
    """Check that sequential forward step p-values agree with R.

    The R package ``selectiveInference`` is run through ``rpy`` on a fixed
    simulated data set; its selected variables and sequential p-values are
    compared with the ones computed here, one step at a time.
    """
    tol = 1.e-5

    # The script needs one statement per line: on a single line the `#`
    # comments would swallow every statement that follows them.
    R_code = """

    library(selectiveInference)
    set.seed(33)
    n = 50
    p = 10
    sigma = 1.1
    x = matrix(rnorm(n*p),n,p)
    beta = c(3,2,rep(0,p-2))
    y = x%*%beta + sigma*rnorm(n)

    # run forward stepwise
    fsfit = fs(x,y)
    beta_hat = fsfit$beta

    # compute sequential p-values and confidence intervals
    out.seq = fsInf(fsfit,sigma=sigma)
    vars = out.seq$vars
    pval = out.seq$pv
    """

    rpy.r(R_code)
    R_pvals = np.asarray(rpy.r('pval'))
    sigma = float(np.asarray(rpy.r('sigma')))
    selected_vars = np.asarray(rpy.r('vars'))
    y = np.asarray(rpy.r('y'))
    x = np.asarray(rpy.r('x'))

    # Center the response and the columns of the design before fitting.
    y = y.reshape(-1)
    y -= y.mean()
    x -= x.mean(0)[None, :]

    FS = forward_step(x, y, covariance=sigma**2 * np.identity(y.shape[0]))
    steps = []
    for i in range(x.shape[1]):
        FS.next()
        # Pivot for the variable added at this step only.
        steps.extend(FS.model_pivots(i + 1,
                                     which_var=FS.variables[-1:],
                                     alternative='onesided'))

    # R reports 1-based variable indices.
    python_vars = [i + 1 for i, p in steps]
    python_pvals = [p for i, p in steps]

    np.testing.assert_array_equal(selected_vars, python_vars)
    np.testing.assert_allclose(python_pvals, R_pvals, atol=tol, rtol=tol)

    print(R_pvals, python_pvals)
def test_FS(k=10, ndraw=5000, burnin=5000, nsim=None):
    """Smoke-test forward stepwise with a known covariance.

    Builds a random correlated 100 x 200 design, takes ``k`` steps (asking
    each step to compute a p-value) and prints the selected variables, the
    pivots of the third model and the result of ``FS.model_quadratic(3)``.

    Parameters
    ----------
    k : int
        Number of steps to take.
    ndraw, burnin : int
        Forwarded to the non-saturated ``FS.model_pivots`` call.
    nsim : optional
        Accepted for signature compatibility; not used in the body.
    """
    n, p = 100, 200
    X = (np.random.standard_normal((n, p))
         + 0.4 * np.random.standard_normal(n)[:, None])
    X /= (X.std(0)[None, :] * np.sqrt(n))
    Y = np.random.standard_normal(n) * 0.5

    FS = forward_step(X, Y, covariance=0.5**2 * np.identity(n))
    for _ in range(k):
        FS.next(compute_pval=True)

    # print() calls instead of Python 2 print statements, which are a
    # SyntaxError on Python 3; the printed text is unchanged.
    print('first %s variables selected' % k, FS.variables)
    print('pivots for 3rd selected model knowing that we performed %d steps of forward stepwise' % k)
    print(FS.model_pivots(3))
    print(FS.model_pivots(3, saturated=False, which_var=[FS.variables[2]],
                          burnin=burnin, ndraw=ndraw))
    print(FS.model_quadratic(3))
def compute_pvalues(y, X, active_set=None, sigma=1., maxstep=np.inf,
                    compute_maxT_identify=True,
                    burnin=2000, ndraw=8000,
                    accept_reject_params=(100, 15, 2000),
                    shortcut=True):
    r"""
    Run forward stepwise and record several p-values for each added variable.

    Parameters
    ----------

    y : np.float(n)
        The target, in the model $y = X\beta$

    X : np.float((n, p))
        The data, in the model $y = X\beta$

    active_set : [] (optional)
        The true active set. For steps beyond this the selected model
        methods can (for the purposes of simulation) just draw
        np.random.sample()

    sigma : np.float
        Standard deviation of the gaussian distribution :
        The covariance matrix is
        `sigma**2 * np.identity(X.shape[0])`.
        Defaults to 1.

    maxstep : int
        Upper bound on the number of steps; `min(n, p, maxstep)` steps
        are taken.

    compute_maxT_identify : bool
        If True, compute the maxT test having identified
        the variable and sign (i.e. conditioning
        on the variable added and its sign).
        If False, uniform random draws are recorded in its place.

    burnin, ndraw : int
        Forwarded to every `step` call that computes a p-value.

    accept_reject_params : tuple
        Not used by this function.

    shortcut : bool
        If True and `active_set` is given, every step with index greater
        than `3 * len(active_set)` is treated as screened.

    Returns
    -------

    results : pd.DataFrame
        One row per step with columns `variable_selected`,
        `maxT_identify_pvalue`, `saturated_pvalue`, `nominal_pvalue`,
        `nominalT_pvalue`, `maxT_pvalue`, `maxT_unknown_pvalue`,
        `maxT_identify_unknown_pvalue`.

    FS_maxT : forward_step
        The forward stepwise instance used for the maxT p-values.
    """

    n, p = X.shape

    FS_identity = forward_step(X, y, covariance=sigma**2 * np.identity(n))
    FS_maxT = forward_step(X, y, covariance=sigma**2 * np.identity(n))
    # The "_U" instances are stepped with sigma_known=False.
    FS_identity_U = forward_step(X, y, covariance=np.identity(n))
    FS_maxT_U = forward_step(X, y, covariance=np.identity(n))

    results = []

    for i in range(min([n, p, maxstep])):

        if active_set is not None:
            # Screened once the truth has been selected, or -- with
            # `shortcut` -- once we are so far along the path that there
            # cannot be any power left.
            screened = (set(active_set).issubset(FS_maxT.variables)
                        or ((i > (3 * len(active_set))) and shortcut))
        else:
            screened = False

        if not screened:
            # take a step of FS
            pval_maxT = FS_maxT.step(compute_maxZ_pval=True,
                                     use_identity=False,
                                     ndraw=ndraw,
                                     burnin=burnin)
            pval_maxT_U = FS_maxT_U.step(compute_maxZ_pval=True,
                                         use_identity=False,
                                         ndraw=ndraw,
                                         burnin=burnin,
                                         sigma_known=False)

            if compute_maxT_identify:
                pval_maxT_identify = FS_identity.step(compute_maxZ_pval=True,
                                                      use_identity=True,
                                                      ndraw=ndraw,
                                                      burnin=burnin)
                pval_maxT_identify_U = FS_identity_U.step(
                    compute_maxZ_pval=True,
                    use_identity=True,
                    ndraw=ndraw,
                    burnin=burnin,
                    sigma_known=False)
            else:
                FS_identity.step(compute_maxZ_pval=False)
                pval_maxT_identify = np.random.sample()
                pval_maxT_identify_U = np.random.sample()
        else:
            # Only FS_maxT is advanced here; the four sampled p-values are
            # replaced by uniform random draws.
            FS_maxT.step(compute_maxZ_pval=False)
            (pval_maxT, pval_maxT_identify,
             pval_maxT_U, pval_maxT_identify_U) = np.random.sample(4)

        alternative = 'onesided'
        var_select, pval_saturated = FS_maxT.model_pivots(
            i + 1, alternative=alternative,
            which_var=[FS_maxT.variables[-1]],
            saturated=True)[0]

        # now, nominal ones: the last row of the pseudo-inverse gives the
        # least-squares coefficient of the newest variable.
        LSfunc = np.linalg.pinv(FS_maxT.X[:, FS_maxT.variables])
        Z = np.dot(LSfunc[-1], FS_maxT.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)

        # assuming known variance
        pval_nominal = 2 * ndist.sf(np.fabs(Z))

        # using T
        OLS_model = sm.OLS(y, np.hstack([FS_maxT.fixed_regressors,
                                         X[:, FS_maxT.variables]]))
        pval_nominalT = OLS_model.fit().pvalues[-1]

        results.append((var_select, pval_maxT_identify, pval_saturated,
                        pval_nominal, pval_nominalT, pval_maxT,
                        pval_maxT_U, pval_maxT_identify_U))

    results = np.array(results).T
    # `np.int` was removed in NumPy 1.24; the builtin `int` is equivalent.
    return pd.DataFrame({'variable_selected': results[0].astype(int),
                         'maxT_identify_pvalue': results[1],
                         'saturated_pvalue': results[2],
                         'nominal_pvalue': results[3],
                         'nominalT_pvalue': results[4],
                         'maxT_pvalue': results[5],
                         'maxT_unknown_pvalue': results[6],
                         'maxT_identify_unknown_pvalue': results[7]}), FS_maxT
def compute_pvalues(y, X, active_set=None, sigma=1., maxstep=np.inf,
                    compute_maxT_identify=True,
                    burnin=2000, ndraw=8000,
                    accept_reject_params=(100, 15, 2000),
                    shortcut=True):
    r"""
    Run forward stepwise and record several p-values for each added variable.

    Parameters
    ----------

    y : np.float(n)
        The target, in the model $y = X\beta$

    X : np.float((n, p))
        The data, in the model $y = X\beta$

    active_set : [] (optional)
        The true active set. For steps beyond this the selected model
        methods can (for the purposes of simulation) just draw
        np.random.sample()

    sigma : np.float
        Standard deviation of the gaussian distribution :
        The covariance matrix is
        `sigma**2 * np.identity(X.shape[0])`.
        Defaults to 1.

    maxstep : int
        Upper bound on the number of steps; `min(n, p, maxstep)` steps
        are taken.

    compute_maxT_identify : bool
        If True, compute the maxT test having identified
        the variable and sign (i.e. conditioning
        on the variable added and its sign).
        If False, uniform random draws are recorded in its place.

    burnin, ndraw : int
        Forwarded to every `next` call that computes a p-value.

    accept_reject_params : tuple
        Not used by this function.

    shortcut : bool
        If True and `active_set` is given, every step with index greater
        than `3 * len(active_set)` is treated as screened.

    Returns
    -------

    results : pd.DataFrame
        One row per step with columns `variable_selected`,
        `maxT_identify_pvalue`, `saturated_pvalue`, `nominal_pvalue`,
        `nominalT_pvalue`, `maxT_pvalue`, `maxT_unknown_pvalue`,
        `maxT_identify_unknown_pvalue`.

    FS_maxT : forward_step
        The forward stepwise instance used for the maxT p-values.
    """

    n, p = X.shape

    FS_identity = forward_step(X, y, covariance=sigma**2 * np.identity(n))
    FS_maxT = forward_step(X, y, covariance=sigma**2 * np.identity(n))
    # The "_U" instances are stepped with sigma_known=False.
    FS_identity_U = forward_step(X, y, covariance=np.identity(n))
    FS_maxT_U = forward_step(X, y, covariance=np.identity(n))

    # NOTE(review): presumably iter() (re)initialises each instance before
    # stepping -- confirm against forward_step.__iter__.
    iter(FS_identity)
    iter(FS_maxT)
    iter(FS_identity_U)
    iter(FS_maxT_U)

    results = []

    for i in range(min([n, p, maxstep])):

        if active_set is not None:
            # Screened once the truth has been selected, or -- with
            # `shortcut` -- once we are so far along the path that there
            # cannot be any power left.
            screened = (set(active_set).issubset(FS_maxT.variables)
                        or ((i > (3 * len(active_set))) and shortcut))
        else:
            screened = False

        if not screened:
            # take a step of FS
            pval_maxT = FS_maxT.next(compute_pval=True,
                                     use_identity=False,
                                     ndraw=ndraw,
                                     burnin=burnin)
            pval_maxT_U = FS_maxT_U.next(compute_pval=True,
                                         use_identity=False,
                                         ndraw=ndraw,
                                         burnin=burnin,
                                         sigma_known=False)

            if compute_maxT_identify:
                pval_maxT_identify = FS_identity.next(compute_pval=True,
                                                      use_identity=True,
                                                      ndraw=ndraw,
                                                      burnin=burnin)
                pval_maxT_identify_U = FS_identity_U.next(
                    compute_pval=True,
                    use_identity=True,
                    ndraw=ndraw,
                    burnin=burnin,
                    sigma_known=False)
            else:
                FS_identity.next(compute_pval=False)
                pval_maxT_identify = np.random.sample()
                pval_maxT_identify_U = np.random.sample()
        else:
            # Only FS_maxT is advanced here; the four sampled p-values are
            # replaced by uniform random draws.
            FS_maxT.next(compute_pval=False)
            (pval_maxT, pval_maxT_identify,
             pval_maxT_U, pval_maxT_identify_U) = np.random.sample(4)

        # One-sided alternative in the direction of the newest variable's sign.
        alternative = {1: 'greater', -1: 'less'}[FS_maxT.signs[-1]]
        var_select, pval_saturated = FS_maxT.model_pivots(
            i + 1, alternative=alternative,
            which_var=[FS_maxT.variables[-1]],
            saturated=True)[0]

        # now, nominal ones: the last row of the pseudo-inverse gives the
        # least-squares coefficient of the newest variable.
        LSfunc = np.linalg.pinv(FS_maxT.X[:, FS_maxT.variables])
        Z = np.dot(LSfunc[-1], FS_maxT.Y) / (np.linalg.norm(LSfunc[-1]) * sigma)

        # assuming known variance
        pval_nominal = 2 * ndist.sf(np.fabs(Z))

        # using T
        OLS_model = sm.OLS(y, np.hstack([FS_maxT.fixed_regressors,
                                         X[:, FS_maxT.variables]]))
        pval_nominalT = OLS_model.fit().pvalues[-1]

        results.append((var_select, pval_maxT_identify, pval_saturated,
                        pval_nominal, pval_nominalT, pval_maxT,
                        pval_maxT_U, pval_maxT_identify_U))

    results = np.array(results).T
    # `np.int` was removed in NumPy 1.24; the builtin `int` is equivalent.
    return pd.DataFrame({
        'variable_selected': results[0].astype(int),
        'maxT_identify_pvalue': results[1],
        'saturated_pvalue': results[2],
        'nominal_pvalue': results[3],
        'nominalT_pvalue': results[4],
        'maxT_pvalue': results[5],
        'maxT_unknown_pvalue': results[6],
        'maxT_identify_unknown_pvalue': results[7]
    }), FS_maxT