def test_partialcorr_sample(self):
    """
    Check the covariance matrix returned for ``method="partialcorr"``.

    Two properties are verified with a tolerance of 1e-4: the diagonal
    is (on average) equal to one, and the (0, 1) entry equals
    ``-1 / (p - 1)``.
    """
    p, rho = 50, 0.99
    _, _, _, _, V = dgp.DGP().sample_data(p=p, method="partialcorr", rho=rho)

    # Mean absolute deviation of the diagonal from one
    diag_diff = np.abs(np.diag(V) - 1).mean()
    self.assertTrue(
        diag_diff < 1e-4,
        f"Partial corr Sigma={V} for rho={rho} is not a correlation matrix",
    )

    # Off-diagonal entry compared against its expected value
    expected = -1 / (p - 1)
    pairwise_corr = V[0, 1]
    deviation = np.abs(pairwise_corr - expected)
    self.assertTrue(
        deviation < 1e-4,
        f"Partial corr pairwise_corr {pairwise_corr} deviates from expectation {expected} for rho={rho}",
    )
def test_complex_group_solns(self):
    """
    Check the solutions of the PSGD solver for group knockoffs.

    For each covariance type, the group SDP solution serves as both the
    baseline and the initialization of the PSGD solver. The PSGD output
    must pass ``check_S_properties`` and must not have a larger MVR loss
    than the SDP solution. The test is reported as skipped (rather than
    silently passing) when torch is unavailable.
    """
    if not TORCH_AVAILABLE:
        # skipTest raises, so the runner records a skip instead of a pass
        self.skipTest("torch is not available; cannot test the PSGD solver")
    from knockpy import kpytorch

    # Construct graph + groups
    np.random.seed(110)
    p = 50
    groups = knockpy.utilities.preprocess_groups(
        np.random.randint(1, p + 1, p)
    )
    for method in ["ar1", "ver"]:
        dgprocess = dgp.DGP()
        _, _, _, _, Sigma = dgprocess.sample_data(method=method, p=p)

        # Use SDP as baseline
        init_S = knockpy.mac.solve_group_SDP(Sigma, groups)
        init_loss = mrc.mvr_loss(Sigma, init_S)

        # Apply gradient solver, warm-started at the SDP solution
        opt_S = kpytorch.mrcgrad.solve_mrc_psgd(
            Sigma=Sigma,
            groups=groups,
            init_S=init_S,
            tol=1e-5,
            max_epochs=100,
            line_search_iter=10,
        )
        psgd_loss = mrc.mvr_loss(Sigma, opt_S)

        # Check S matrix
        self.check_S_properties(Sigma, opt_S, groups)

        # Check new loss <= init_loss
        self.assertTrue(
            psgd_loss <= init_loss,
            msg=f"For {method}, PSGD solver has higher loss {psgd_loss} v. sdp {init_loss}",
        )
def test_nested_AR1(self):
    """
    Check that the ``a`` and ``b`` parameters of ``nestedar1`` take effect.

    The mean of the first superdiagonal of the sampled covariance is
    compared (to two decimals) with a closed-form expression in a and b.
    """
    np.random.seed(110)
    a, b = 100, 40
    sample_kwargs = dict(
        p=500, method="nestedar1", a=a, b=b, nest_size=2, num_nests=1
    )
    _, _, _, _, Sigma = dgp.DGP().sample_data(**sample_kwargs)

    # Average correlation between neighbouring features
    mean_rho = np.mean(np.diag(Sigma, k=1))
    expected = a / (2 * (a + b)) + (a / (a + b)) ** 2 / 2
    np.testing.assert_almost_equal(
        mean_rho,
        expected,
        decimal=2,
        err_msg=f"random nested AR1 gen has unexpected avg rho {mean_rho}, should be ~ {expected} ",
    )
def test_gmliu2019_sample(self):
    """
    Check that ``coeff_dist="gmliu2019"`` respects the sparsity level.

    The number of non-zero coefficients must equal ``sparsity * p``
    (60 for the values used here).
    """
    n = 300
    p = 1000
    rho = 0.8
    sparsity = 0.06
    # Derive the expectation from the inputs instead of hard-coding it
    expected_nonnulls = round(sparsity * p)

    np.random.seed(110)
    dgprocess = dgp.DGP()
    _, _, beta, _, _ = dgprocess.sample_data(
        rho=rho,
        gamma=1,
        p=p,
        n=n,
        sparsity=sparsity,
        method="blockequi",
        coeff_dist="gmliu2019",
    )

    # assertEqual reports both counts on failure
    num_nonnulls = int((beta != 0).sum())
    self.assertEqual(
        num_nonnulls,
        expected_nonnulls,
        f"Sparsity constraint for gmliu2019 violated: {num_nonnulls} non-nulls, expected {expected_nonnulls}",
    )
def test_smoothing(self):
    """
    Smoothing is not required for this, but this is a nice check anyway.

    Solves for the S-matrix with the (smoothed) MVR solver and with the
    SDP solver on a ``partialcorr`` covariance, then checks that the mean
    SDP diagonal exceeds the mean MVR diagonal by less than 1e-3.
    """
    smoothing = 0.1
    # NOTE(review): no ``p`` is passed here, so sample_data uses whatever
    # dimension it defaults to.
    _, _, _, _, V = dgp.DGP().sample_data(method="partialcorr", rho=0.1)

    # NOTE(review): an earlier comment tagged the smoothing argument as
    # "not implemented yet" -- confirm whether solve_mvr honours it.
    S_MVR = mrc.solve_mvr(Sigma=V, smoothing=smoothing)
    S_SDP = mac.solve_SDP(Sigma=V, tol=1e-5)

    # Compare average diagonal entries (one-sided gap)
    sdp_mean = np.diag(S_SDP).mean()
    mvr_mean = np.diag(S_MVR).mean()
    gap = sdp_mean - mvr_mean
    self.assertTrue(
        gap < 1e-3,
        f"Highly smoothed S_MVR ({S_MVR}) too far from S_SDP ({S_SDP}) for equi partial corr",
    )
def test_beta_corr_signals(self):
    """
    Check that ``corr_signals=True`` groups the non-null signals together.

    Over ten draws, the number of non-zero coefficients must match the
    requested sparsity, and the entry right after the first non-zero
    coefficient must also be non-zero.
    """
    p = 4
    sparsity = 0.5
    expected_nn = int(sparsity * p)
    for _ in range(10):
        _, _, beta, _, _ = dgp.DGP().sample_data(
            p=p, sparsity=0.5, corr_signals=True
        )
        nn_flags = beta != 0

        # Sparsity must be unaffected by corr_signals
        num_nonnulls = nn_flags.sum()
        self.assertTrue(
            num_nonnulls == expected_nn,
            f"Corr_signals breaks sparsity (beta = {beta}, should have {expected_nn} non-nulls)",
        )

        # The neighbour of the first non-null must itself be non-null
        first_nonzero = np.nonzero(nn_flags)[0].min()
        neighbour_is_nonnull = nn_flags[first_nonzero + 1]
        self.assertTrue(
            neighbour_is_nonnull,
            f"Corr_signals does not produce correlated signals (beta = {beta})",
        )
def test_large_ising_samples(self):
    """
    Smoke test: sampling knockoffs for a p=625 Ising model must not raise.

    The covariance and precision matrices are read from the
    ``test_covs`` directory next to this file; the precision matrix is
    first checked to be (numerically) zero wherever the Gibbs graph has
    no edge.
    """
    np.random.seed(110)
    n, p = 100, 625
    mu = np.zeros(p)
    dgprocess = dgp.DGP()
    X, _, _, _, _ = dgprocess.sample_data(
        n=n, p=p, method="ising", x_dist="gibbs"
    )

    # The diagonal is filled in place on the DGP's own graph array
    gibbs_graph = dgprocess.gibbs_graph
    np.fill_diagonal(gibbs_graph, 1)

    # We load custom cov/q matrices for this
    file_directory = os.path.dirname(os.path.abspath(__file__))
    V = np.loadtxt(f"{file_directory}/test_covs/vout{p}.txt")
    Q = np.loadtxt(f"{file_directory}/test_covs/qout{p}.txt")

    # Precision entries off the graph must vanish
    nonedge_entries = Q[gibbs_graph == 0]
    max_nonedge = np.abs(nonedge_entries).max()
    self.assertTrue(
        max_nonedge < 1e-5,
        f"Estimated precision for ising{p} has max_nonedge {max_nonedge} >= 1e-5",
    )

    # Initialize sampler
    sampler_kwargs = dict(
        X=X,
        gibbs_graph=gibbs_graph,
        mu=mu,
        Sigma=V,
        Q=Q,
        max_width=5,
        method="equicorrelated",
    )
    metro_sampler = metro.GibbsGridSampler(**sampler_kwargs)

    # Sample and hope for no errors (the result itself is not inspected)
    metro_sampler.sample_knockoffs()
def test_maxent(self):
    """
    Both maxent/mmi work properly

    Verifies to three decimals that the ``maxent`` and ``mmi`` names
    give matching results through ``smatrix.compute_smatrix``, through
    the ``mrc`` solvers, and through the ``mrc`` loss functions.
    """
    # Sample data
    dgprocess = dgp.DGP()
    dgprocess.sample_data(p=50, method='ar1', a=3)
    Sigma = dgprocess.Sigma

    def seeded(solver, **solver_kwargs):
        # Reset the RNG so both solvers start from the same state
        np.random.seed(110)
        return solver(Sigma, **solver_kwargs)

    # Check compute_smatrix with method='maxent' / method='mmi'
    S_ME = seeded(smatrix.compute_smatrix, method='maxent')
    S_MMI = seeded(smatrix.compute_smatrix, method='mmi')
    np.testing.assert_array_almost_equal(
        S_ME,
        S_MMI,
        decimal=3,
        err_msg=f"compute_smatrix yields diff answers for mmi, maxent",
    )

    # Check solve_maxent/solve_mmi
    S_ME = seeded(mrc.solve_maxent)
    S_MMI = seeded(mrc.solve_mmi)
    np.testing.assert_array_almost_equal(
        S_ME,
        S_MMI,
        decimal=3,
        err_msg=f"solve_maxent and solve_mmi yield different answers",
    )

    # Check maxent_loss/mmi_loss
    L_ME = mrc.maxent_loss(Sigma, S_ME)
    L_MMI = mrc.mmi_loss(Sigma, S_MMI)
    np.testing.assert_almost_equal(
        L_ME,
        L_MMI,
        decimal=3,
        err_msg=f"maxent_loss and mmi_loss yield different answers",
    )
def test_consistency_of_inferring_sigma(self):
    """
    Checks that the same knockoffs are produced whether you infer the
    covariance matrix first and pass it to the gaussian_knockoffs
    generator, or you let the generator do the work for you.

    Both samplers run from the same RNG seed and their outputs are
    compared to five decimals.
    """
    n, p, rho = 25, 300, 0.5
    X, _, _, _, _ = dgp.DGP().sample_data(n=n, p=p, rho=rho, method="AR1")

    # Method 1: infer cov first
    V, _ = utilities.estimate_covariance(X, tol=1e-2)
    np.random.seed(110)
    explicit_sampler = knockoffs.GaussianSampler(X=X, Sigma=V, method="sdp")
    Xk1 = explicit_sampler.sample_knockoffs()

    # Method 2: Infer during
    np.random.seed(110)
    implicit_sampler = knockoffs.GaussianSampler(X=X, method="sdp")
    Xk2 = implicit_sampler.sample_knockoffs()

    np.testing.assert_array_almost_equal(
        Xk1, Xk2, decimal=5, err_msg="Knockoff gen is inconsistent"
    )
def test_misaligned_covariance_estimation(self):
    """
    Smoke test: graphical-lasso covariance estimation must not raise.

    Samples a ``blockequi`` design (n=640, p=300, rho=0.8, gamma=1) and
    calls ``utilities.estimate_covariance`` with
    ``shrinkage="graphicallasso"``; only the absence of an exception is
    checked.
    """
    # Inputs
    sample_kwargs = dict(n=640, p=300, method="blockequi", gamma=1, rho=0.8)

    # Create data generating process
    np.random.seed(110)
    X, _, _, _, _ = dgp.DGP().sample_data(**sample_kwargs)

    # Make sure this does not raise an error.
    # NOTE(review): the earlier comment described the input as
    # ill-conditioned and said the graph lasso "doesn't fail" -- confirm
    # whether an internal failure/fallback is what is being exercised.
    utilities.estimate_covariance(X, shrinkage="graphicallasso")
def test_small_ising_samples(self):
    """
    Check that knockoffs sampled for a small (p=9) Ising model look valid.

    With n=100000 samples, the empirical mean, covariance and marginal
    fourth moments of the knockoffs are compared with those of X, and
    the sampler's own ``check_xk_validity`` is run at the end.
    """
    np.random.seed(110)
    n, p = 100000, 9
    mu = np.zeros(p)
    dgprocess = dgp.DGP()
    X, _, _, _, _ = dgprocess.sample_data(
        n=n, p=p, method="ising", x_dist="gibbs"
    )

    # The diagonal is filled in place on the DGP's own graph array
    gibbs_graph = dgprocess.gibbs_graph
    np.fill_diagonal(gibbs_graph, 1)

    # We load custom cov/q matrices for this
    file_directory = os.path.dirname(os.path.abspath(__file__))
    V = np.loadtxt(f"{file_directory}/test_covs/vout{p}.txt")
    Q = np.loadtxt(f"{file_directory}/test_covs/qout{p}.txt")

    # Precision entries off the graph must vanish
    nonedge_entries = Q[gibbs_graph == 0]
    max_nonedge = np.abs(nonedge_entries).max()
    self.assertTrue(
        max_nonedge < 1e-5,
        f"Estimated precision for ising{p} has max_nonedge {max_nonedge} >= 1e-5",
    )

    # Initialize sampler
    sampler_kwargs = dict(
        X=X, gibbs_graph=gibbs_graph, mu=mu, Sigma=V, Q=Q, max_width=2
    )
    metro_sampler = metro.GibbsGridSampler(**sampler_kwargs)

    # Sample
    Xk = metro_sampler.sample_knockoffs()

    # Check empirical means
    mu_hat = np.mean(X, axis=0)
    muk_hat = Xk.mean(axis=0)
    np.testing.assert_almost_equal(
        muk_hat,
        mu_hat,
        decimal=2,
        err_msg=f"For Ising sampler, empirical mean of Xk does not match mean of X",
    )

    # Check empirical covariance matrix (halved before comparison)
    V_hat = np.cov(X.T)
    Vk_hat = np.cov(Xk.T)
    np.testing.assert_almost_equal(
        V_hat / 2,
        Vk_hat / 2,
        decimal=1,
        err_msg=f"For Ising sampler, empirical covariance of Xk does not match cov of X",
    )

    # Check that marginal fourth moments match (scaled down by 10)
    X4th = np.power(X, 4).mean(axis=0)
    Xk4th = np.power(Xk, 4).mean(axis=0)
    np.testing.assert_almost_equal(
        X4th / 10,
        Xk4th / 10,
        decimal=1,
        err_msg=f"For Ising sampler, fourth moment of Xk does not match theoretical fourth moment",
    )

    # Run a ton of KS tests
    metro_sampler.check_xk_validity(X, Xk, testname="SMALL_ISING")
def ARsample():
    """
    Call ``sample_data`` with ``method="AR1"`` and ``rho=1.5``.

    Presumably passed as the callable of an expected-error assertion --
    confirm against the caller.
    """
    dgp.DGP().sample_data(method="AR1", rho=1.5)
def test_divconquer_likelihoods(self):
    """
    Check that splitting cliques (divide-and-conquer) keeps the likelihood.

    The log-potentials summed over all of the sampler's cliques are
    compared, per observation, with the sum obtained from the sampler's
    divide-and-conquer bookkeeping: separator-to-separator cliques plus
    the cliques of every block, for each divide-and-conquer key.
    """
    np.random.seed(110)
    n = 10
    p = 625
    mu = np.zeros(p)
    dgprocess = dgp.DGP()
    X, _, _, _, _ = dgprocess.sample_data(
        n=n, p=p, method="ising", x_dist="gibbs",
    )
    gibbs_graph = dgprocess.gibbs_graph
    np.fill_diagonal(gibbs_graph, 1)

    # Read V
    file_directory = os.path.dirname(os.path.abspath(__file__))
    V = np.loadtxt(f"{file_directory}/test_covs/vout{p}.txt")

    # Initialize sampler
    metro_sampler = metro.GibbsGridSampler(
        X=X, gibbs_graph=gibbs_graph, mu=mu, Sigma=V, max_width=2,
    )

    # Non-divided likelihood
    nondiv_like = 0
    for clique, lp in zip(metro_sampler.cliques, metro_sampler.log_potentials):
        nondiv_like += lp(X[:, np.array(clique)])

    # Divided likelihood for the many keys
    many_div_like = np.zeros(n)
    for dc_key in metro_sampler.dc_keys:
        # Helpful constants
        seps = metro_sampler.separators[dc_key]
        n_inds = metro_sampler.X_ninds[dc_key]
        # Rows handled under this key (selected once, reused below)
        keyX = X[n_inds]

        # Initialize likelihood for these data points
        div_like = 0

        # Add separator-to-separator cliques manually
        for clique, lp in zip(metro_sampler.cliques, metro_sampler.log_potentials):
            if clique[0] not in seps or clique[1] not in seps:
                continue
            div_like += lp(keyX[:, np.array(clique)])

        # Now loop through other blocks
        for block_dict in metro_sampler.divconq_info[dc_key]:
            blockX = keyX[:, block_dict["inds"]]
            for clique, lp in zip(block_dict["cliques"], block_dict["lps"]):
                div_like += lp(blockX[:, clique])
        many_div_like[n_inds] = np.array(div_like)

    # Test to make sure these likelihoods agree. The message reports the
    # full per-observation vector, which is always defined (the per-key
    # accumulator is unbound when there are no keys).
    np.testing.assert_almost_equal(
        nondiv_like,
        many_div_like,
        decimal=5,
        err_msg=f"Non-divided clique potentials {nondiv_like} do not agree with divided cliques {many_div_like}",
    )
def test_blockt_samples(self):
    """
    Check knockoffs from ``metro.BlockTSampler`` on block-t data.

    Test to make sure low df --> heavy tails and therefore
    acceptances < 1. For S equal to the identity and for S=None, the
    knockoffs' empirical mean, covariance and marginal fourth moments
    are checked, followed by the sampler's ``check_xk_validity``.
    """
    np.random.seed(110)
    n, p, df_t = 2000000, 6, 5
    sample_kwargs = dict(
        n=n,
        p=p,
        method="blockequi",
        rho=0.4,
        gamma=0,
        block_size=3,
        x_dist="blockt",
        df_t=df_t,
    )
    X, _, _, _, V = dgp.DGP().sample_data(**sample_kwargs)

    for S in (np.eye(p), None):
        # Sample t
        tsampler = metro.BlockTSampler(
            X=X, Sigma=V, df_t=df_t, S=S, metro_verbose=True
        )
        Xk = tsampler.sample_knockoffs()

        # Check empirical means (should be zero)
        muk_hat = Xk.mean(axis=0)
        np.testing.assert_almost_equal(
            muk_hat,
            np.zeros(p),
            decimal=2,
            err_msg=f"For block T sampler, empirical mean of Xk does not match mean of X",
        )

        # Check empirical covariance matrix
        Vk_hat = np.cov(Xk.T)
        np.testing.assert_almost_equal(
            V,
            Vk_hat,
            decimal=2,
            err_msg=f"For block T sampler, empirical covariance of Xk does not match cov of X",
        )

        # Check that marginal fourth moments match (scaled down by 10)
        X4th = np.power(X, 4).mean(axis=0)
        Xk4th = np.power(Xk, 4).mean(axis=0)
        np.testing.assert_almost_equal(
            X4th / 10,
            Xk4th / 10,
            decimal=1,
            err_msg=f"For block T sampler, fourth moment of Xk does not match theoretical fourth moment",
        )

        # Run a ton of KS tests
        tsampler.check_xk_validity(X, Xk, testname="BLOCKT")
def test_tmarkov_samples(self):
    """
    Check knockoffs from ``metro.ARTKSampler`` on AR1 t-distributed data.

    Test to make sure low df --> heavy tails and therefore
    acceptances < 1. For S=None and for S equal to the identity, the
    sampler must report width 1, and the knockoffs' empirical mean,
    correlation matrix and marginal fourth moments are checked, followed
    by the sampler's ``check_xk_validity``.
    """
    np.random.seed(110)
    n, p, df_t = 1000000, 5, 3
    sample_kwargs = dict(
        n=n, p=p, method="AR1", rho=0.3, x_dist="ar1t", df_t=df_t
    )
    X, _, _, _, V = dgp.DGP().sample_data(**sample_kwargs)

    for S in (None, np.eye(p)):
        # Sample t
        tsampler = metro.ARTKSampler(
            X=X, Sigma=V, df_t=df_t, S=S, metro_verbose=True
        )

        # Correct junction tree
        self.assertTrue(
            tsampler.width == 1,
            f"tsampler should have width 1, not {tsampler.width}",
        )

        # Sample
        Xk = tsampler.sample_knockoffs()

        # Check empirical means (should be zero)
        muk_hat = Xk.mean(axis=0)
        np.testing.assert_almost_equal(
            muk_hat,
            np.zeros(p),
            decimal=2,
            err_msg=f"For ARTK sampler, empirical mean of Xk does not match mean of X",
        )

        # Check empirical correlation matrix against V
        Vk_hat = np.corrcoef(Xk.T)
        np.testing.assert_almost_equal(
            V,
            Vk_hat,
            decimal=2,
            err_msg=f"For ARTK sampler, empirical covariance of Xk does not match cov of X",
        )

        # Check that marginal fourth moments match (scaled down by 10)
        X4th = np.power(X, 4).mean(axis=0)
        Xk4th = np.power(Xk, 4).mean(axis=0)
        np.testing.assert_almost_equal(
            X4th / 10,
            Xk4th / 10,
            decimal=1,
            err_msg=f"For ARTK sampler, fourth moment of Xk does not match theoretical fourth moment",
        )

        # Run a ton of KS tests
        tsampler.check_xk_validity(X, Xk, testname="ARTK")
def test_tmarkov_likelihood(self):
    """
    Check ``metro.t_markov_loglike`` against reference likelihoods.

    Three comparisons are made on log-likelihood values:

    1. With all ``rhos`` equal to zero, the log-likelihood ratio between
       two datasets must match the one from independent scipy
       t-densities evaluated on rescaled data.
    2. With a very large ``df_t``, the ratio must match the one from a
       multivariate normal with the sampled covariance.
    3. ``metro.ARTKSampler.lf`` must agree with the standalone function.
    """
    # Data
    np.random.seed(110)
    n = 15
    p = 10
    df_t = 5
    X1 = np.random.randn(n, p)
    X2 = np.random.randn(n, p)

    # Scipy likelihood ratio for X, scale matrix
    inv_scale = np.sqrt(df_t / (df_t - 2))
    sp_like1 = stats.t.logpdf(inv_scale * X1, df=df_t).sum(axis=1)
    sp_like2 = stats.t.logpdf(inv_scale * X2, df=df_t).sum(axis=1)
    sp_ratio = sp_like1 - sp_like2

    # General likelihood
    rhos = np.zeros(p - 1)
    ar1_like1 = metro.t_markov_loglike(X1, rhos, df_t=df_t)
    ar1_like2 = metro.t_markov_loglike(X2, rhos, df_t=df_t)
    ar1_ratio = ar1_like1 - ar1_like2
    self.assertTrue(
        np.abs(ar1_ratio - sp_ratio).sum() < 0.01,
        f"AR1 ratio {ar1_ratio} and scipy ratio {sp_ratio} disagree for independent t vars",
    )

    # Test again with df_t --> infinity, so it should be approx gaussian
    dgprocess = dgp.DGP()
    X1, _, _, _, V = dgprocess.sample_data(
        n=n, p=p, method="AR1", a=3, b=1
    )
    X2 = np.random.randn(n, p)

    # Ratio using normals
    df_t = 100000
    mu = np.zeros(p)
    norm_like1 = stats.multivariate_normal(mean=mu, cov=V).logpdf(X1)
    norm_like2 = stats.multivariate_normal(mean=mu, cov=V).logpdf(X2)
    norm_ratio = norm_like1 - norm_like2

    # Ratios using T (rhos taken from the first superdiagonal of V)
    rhos = np.diag(V, 1)
    ar1_like1 = metro.t_markov_loglike(X1, rhos, df_t=df_t)
    ar1_like2 = metro.t_markov_loglike(X2, rhos, df_t=df_t)
    ar1_ratio = ar1_like1 - ar1_like2
    self.assertTrue(
        np.abs(ar1_ratio - norm_ratio).mean() < 0.01,
        f"AR1 ratio {ar1_ratio} and gaussian ratio {norm_ratio} disagree for corr. t vars, df={df_t}",
    )

    # Check consistency of tsampler class
    tsampler = metro.ARTKSampler(X=X1, Sigma=V, df_t=df_t)
    new_ar1_like1 = tsampler.lf(tsampler.X)
    # The message reports the two quantities actually being compared
    self.assertTrue(
        np.abs(ar1_like1 - new_ar1_like1).sum() < 0.01,
        f"AR1 loglike inconsistent between class ({new_ar1_like1}) and function ({ar1_like1})",
    )
def test_dense_sample(self):
    """
    Check the Metropolized sampler on a dense (single-block) Gaussian design.

    With a Gaussian likelihood, the mean acceptance probability must be
    within 1e-3 of ``gamma`` (or above it), and the empirical correlation
    of [X, Xk] must match the matrix [[V, V - S], [V - S, V]] to two
    decimals.
    """
    # Fake data
    np.random.seed(110)
    n, p = 10000, 4
    sample_kwargs = dict(
        method="blockequi", rho=0.6, n=n, p=p, gamma=1, block_size=p
    )
    X, _, _, Q, V = dgp.DGP().sample_data(**sample_kwargs)
    gaussian_sampler = knockpy.knockoffs.GaussianSampler(
        X=X, Sigma=V, method="mvr"
    )
    S = gaussian_sampler.fetch_S()

    # Network graph: non-negligible precision entries, diagonal removed
    Q_graph = (np.abs(Q) > 1e-5) - np.eye(p)
    undir_graph = nx.Graph(Q_graph)
    _, T = treewidth.treewidth_decomp(undir_graph)
    order, active_frontier = metro.get_ordering(T)

    # Metro sampler and likelihood
    mvn = stats.multivariate_normal(mean=np.zeros(p), cov=V)

    def mvn_likelihood(X):
        return mvn.logpdf(X)

    gamma = 0.99999
    metro_sampler = metro.MetropolizedKnockoffSampler(
        lf=mvn_likelihood,
        X=X,
        mu=np.zeros(p),
        Sigma=V,
        order=order,
        active_frontier=active_frontier,
        gamma=gamma,
        S=S,
        metro_verbose=True,
    )

    # Output knockoffs
    Xk = metro_sampler.sample_knockoffs()

    # Acceptance rate should be exactly one
    acc_rate = metro_sampler.final_acc_probs.mean()
    self.assertTrue(
        acc_rate - gamma > -1e-3,
        msg=f"For equi gaussian design, metro has acc_rate={acc_rate} < gamma={gamma}",
    )

    # Check covariance matrix
    features = np.concatenate([X, Xk], axis=1)
    emp_corr_matrix = np.corrcoef(features.T)
    left_cols = np.concatenate([V, V - S])
    right_cols = np.concatenate([V - S, V])
    G = np.concatenate([left_cols, right_cols], axis=1)
    np.testing.assert_almost_equal(
        emp_corr_matrix,
        G,
        decimal=2,
        err_msg=f"For equi gaussian design, metro does not match theoretical matrix",
    )
def non_ar1_t():
    """
    Call ``sample_data`` with ``method="ver"`` and ``x_dist="ar1t"``.

    ``n`` and ``p`` are read from the enclosing scope. Presumably passed
    as the callable of an expected-error assertion -- confirm against
    the caller.
    """
    dgp.DGP().sample_data(n=n, p=p, method="ver", x_dist="ar1t")
def test_complex_solns(self):
    """
    Check the solution of the various solvers for non-grouped knockoffs.

    For each covariance type the following chain is verified:

    * PSGD (initialized at the SDP solution) has MVR loss <= SDP's;
    * the coordinate-descent MVR solver has MVR loss <= PSGD's;
    * the coordinate-descent MMI solver has MMI loss <= PSGD's.

    Every S-matrix is also passed through ``check_S_properties``. The
    test is reported as skipped (rather than silently passing) when
    torch is unavailable.
    """
    # Check availability
    if not TORCH_AVAILABLE:
        # skipTest raises, so the runner records a skip instead of a pass
        self.skipTest("torch is not available; cannot test the PSGD solver")
    from knockpy import kpytorch

    np.random.seed(110)
    p = 100
    methods = ["ar1", "ver"]
    groups = np.arange(1, p + 1, 1)
    for method in methods:
        dgprocess = dgp.DGP()
        _, _, _, _, Sigma = dgprocess.sample_data(method=method, p=p)

        # Use SDP as baseline
        init_S = knockpy.mac.solve_group_SDP(Sigma, groups)
        sdp_mvr_loss = mrc.mvr_loss(Sigma, init_S)

        # Apply gradient solver, warm-started at the SDP solution
        opt_S = kpytorch.mrcgrad.solve_mrc_psgd(
            Sigma=Sigma,
            groups=groups,
            init_S=init_S,
            tol=1e-5,
            max_epochs=100,
            line_search_iter=10,
        )
        psgd_mvr_loss = mrc.mvr_loss(Sigma, opt_S)

        # Check S matrix
        self.check_S_properties(Sigma, opt_S, groups)

        # Check new loss <= init_loss
        self.assertTrue(
            psgd_mvr_loss <= sdp_mvr_loss,
            msg=f"For {method}, PSGD solver has higher loss {psgd_mvr_loss} v. sdp {sdp_mvr_loss}",
        )

        # MVR solver outperforms PSGD
        opt_S_mvr = mrc.solve_mvr(Sigma=Sigma)
        self.check_S_properties(Sigma, opt_S_mvr, groups)
        cd_mvr_loss = mrc.mvr_loss(Sigma, opt_S_mvr)
        self.assertTrue(
            cd_mvr_loss <= psgd_mvr_loss,
            msg=f"For {method}, coord descent MVR solver has higher loss {cd_mvr_loss} v. PSGD {psgd_mvr_loss}",
        )

        # mmi solver outperforms PSGD
        opt_S_mmi = mrc.solve_mmi(Sigma=Sigma)
        self.check_S_properties(Sigma, opt_S_mmi, groups)
        cd_mmi_loss = mrc.mmi_loss(Sigma, opt_S_mmi)
        psgd_mmi_loss = mrc.mmi_loss(Sigma, opt_S)
        self.assertTrue(
            cd_mmi_loss <= psgd_mmi_loss,
            msg=f"For {method}, coord descent mmi solver has higher loss {cd_mmi_loss} v. PSGD {psgd_mmi_loss}",
        )
def bad_xdist():
    """
    Call ``sample_data`` with ``method="ver"`` and ``x_dist="t_dist"``.

    Presumably passed as the callable of an expected-error assertion --
    confirm against the caller.
    """
    dgp.DGP().sample_data(method="ver", x_dist="t_dist")
def test_ar1_sample(self):
    """
    Check the Metropolized sampler on an AR1 Gaussian design.

    With a Gaussian likelihood, the mean acceptance probability must be
    within 1e-3 of ``gamma`` (or above it), and the empirical correlation
    of [X, Xk] must match the matrix [[V, V - S], [V - S, V]] to two
    decimals.
    """
    # Fake data
    np.random.seed(110)
    n, p = 30000, 8
    X, _, _, Q, V = dgp.DGP().sample_data(method="AR1", n=n, p=p)
    gaussian_sampler = knockpy.knockoffs.GaussianSampler(
        X=X, Sigma=V, method="mvr"
    )
    S = gaussian_sampler.fetch_S()

    # Graph structure: non-negligible precision entries, diagonal removed
    Q_graph = (np.abs(Q) > 1e-5) - np.eye(p)

    # Metro sampler + likelihood
    mvn = stats.multivariate_normal(mean=np.zeros(p), cov=V)

    def mvn_likelihood(X):
        return mvn.logpdf(X)

    gamma = 0.9999
    metro_sampler = metro.MetropolizedKnockoffSampler(
        lf=mvn_likelihood,
        X=X,
        mu=np.zeros(p),
        Sigma=V,
        undir_graph=Q_graph,
        S=S,
        gamma=gamma,
    )

    # Output knockoffs
    Xk = metro_sampler.sample_knockoffs()

    # Acceptance rate should be exactly one
    acc_rate = metro_sampler.final_acc_probs.mean()
    self.assertTrue(
        acc_rate - gamma > -1e-3,
        msg=f"For AR1 gaussian design, metro has acc_rate={acc_rate} < gamma={gamma}",
    )

    # Check covariance matrix
    features = np.concatenate([X, Xk], axis=1)
    emp_corr_matrix = np.corrcoef(features.T)
    left_cols = np.concatenate([V, V - S])
    right_cols = np.concatenate([V - S, V])
    G = np.concatenate([left_cols, right_cols], axis=1)
    np.testing.assert_almost_equal(
        emp_corr_matrix,
        G,
        decimal=2,
        err_msg=f"For AR1 gaussian design, metro does not match theoretical matrix",
    )
def test_debiased_lasso(self):
    """
    Exercise the debiased lasso option of ``kstats.LassoStatistic``.

    Fits the statistic with ``debias=True`` on Gaussian data, then
    checks that a ValueError is raised when ``Ginv`` is missing and when
    the response is binary.
    """
    # Create data generating process
    n, p, rho = 200, 20, 0.3
    np.random.seed(110)
    sample_kwargs = dict(
        n=n,
        p=p,
        y_dist="gaussian",
        coeff_size=100,
        sign_prob=0.5,
        method="blockequi",
        rho=rho,
    )
    X, y, beta, _, corr_matrix = dgp.DGP().sample_data(**sample_kwargs)
    groups = np.arange(1, p + 1, 1)

    # Create knockoffs
    S = (1 - rho) * np.eye(p)
    ksampler = knockpy.knockoffs.GaussianSampler(
        X=X, groups=groups, Sigma=corr_matrix, verbose=False, S=S
    )
    Xk = ksampler.sample_knockoffs()

    # Joint matrix [[Sigma, Sigma - S], [Sigma - S, Sigma]] and its inverse
    cross_block = corr_matrix - S
    left_cols = np.concatenate([corr_matrix, cross_block])
    right_cols = np.concatenate([cross_block, corr_matrix])
    G = np.concatenate([left_cols, right_cols], axis=1)
    Ginv = utilities.chol2inv(G)

    # Debiased lasso - test accuracy
    dlasso_stat = kstats.LassoStatistic()
    dlasso_stat.fit(
        X, Xk, y, use_lars=False, cv_score=False, debias=True, Ginv=Ginv
    )
    W = dlasso_stat.W
    l2norm = np.mean(np.power(W - beta, 2))
    # NOTE(review): the condition requires l2norm > 1 while the message
    # describes a poor fit -- the comparison looks inverted (and beta is
    # signed here). Confirm the intended direction and threshold.
    self.assertTrue(
        l2norm > 1,
        msg=f"Debiased lasso fits gauissan very poorly (l2norm = {l2norm} btwn real/fitted coeffs)",
    )

    # Test that this throws the correct errors
    # first for Ginv
    def debiased_lasso_sans_Ginv():
        dlasso_stat.fit(
            X, Xk, y, use_lars=False, cv_score=False, debias=True, Ginv=None
        )

    self.assertRaisesRegex(
        ValueError, "Ginv must be provided", debiased_lasso_sans_Ginv
    )

    # Second for logistic data (y is rebound before the closure runs)
    y = np.random.binomial(1, 0.5, n)

    def binomial_debiased_lasso():
        dlasso_stat.fit(
            X, Xk, y, use_lars=False, cv_score=False, debias=True, Ginv=Ginv
        )

    self.assertRaisesRegex(
        ValueError,
        "Debiased lasso is not implemented for binomial data",
        binomial_debiased_lasso,
    )
def test_cv_scoring(self):
    """
    Check cross-validated scoring (``cv_score=True``) of feature statistics.

    1. ``LassoStatistic`` with ``use_lars=True`` must report
       ``score_type == "mse_cv"``.
    2. ``OLSStatistic`` must report ``score_type == "mse_cv"`` and a
       score below 2.
    3. Requesting ``cv_score=True`` with the pyglm group-lasso backend
       must raise a ValueError mentioning "must be sklearn estimator".
    """
    # Create data generating process
    n = 100
    p = 20
    np.random.seed(110)
    dgprocess = dgp.DGP()
    X, y, _, _, _ = dgprocess.sample_data(
        n=n, p=p, y_dist="gaussian", coeff_size=100, sign_prob=1
    )

    # These are not real, just helpful syntactically
    fake_knockoffs = np.zeros((n, p))

    # 1. Test lars cv scoring
    lars_stat = kstats.LassoStatistic()
    lars_stat.fit(X, fake_knockoffs, y, use_lars=True, cv_score=True)
    self.assertTrue(
        lars_stat.score_type == "mse_cv",
        msg=f"cv_score=True fails to create cross-validated scoring for lars (score_type={lars_stat.score_type})",
    )

    # 2. Test OLS cv scoring
    ols_stat = kstats.OLSStatistic()
    ols_stat.fit(X, fake_knockoffs, y, cv_score=True)
    # The message reports the OLS statistic being checked
    self.assertTrue(
        ols_stat.score_type == "mse_cv",
        msg=f"cv_score=True fails to create cross-validated scoring for OLS (score_type={ols_stat.score_type})",
    )
    self.assertTrue(
        ols_stat.score < 2,
        msg=f"cv scoring fails for ols_stat as cv_score={ols_stat.score} >= 2",
    )

    # 3. Test that throws correct error for non-sklearn backend
    def non_sklearn_backend_cvscore():
        dgprocess = dgp.DGP()
        X, y, _, _, _ = dgprocess.sample_data(
            n=n, p=p, y_dist="binomial", coeff_size=100, sign_prob=1
        )
        # Pass the preprocessed groups on to the fit
        groups = np.random.randint(1, p + 1, size=(p,))
        groups = utilities.preprocess_groups(groups)
        pyglm_logit = kstats.LassoStatistic()
        pyglm_logit.fit(
            X,
            fake_knockoffs,
            y,
            use_pyglm=True,
            group_lasso=True,
            groups=groups,
            cv_score=True,
        )

    self.assertRaisesRegex(
        ValueError, "must be sklearn estimator", non_sklearn_backend_cvscore
    )
def check_kstat_fit(
    self,
    fstat,
    fstat_name,
    fstat_kwargs=None,
    min_power=0.8,
    max_l2norm=9,
    seed=110,
    group_features=False,
    **sample_kwargs,
):
    """
    Fit a feature statistic on sampled data and check accuracy and power.

    fstat should be a class instance inheriting from FeatureStatistic.

    Parameters
    ----------
    fstat : object
        Feature statistic to fit; its ``fit`` method, ``W`` attribute
        and (for grouped features) ``Z`` attribute are used.
    fstat_name : str
        Name used in failure messages.
    fstat_kwargs : dict, optional
        Extra keyword arguments forwarded to ``fstat.fit``. Defaults to
        no extra arguments.
    min_power : float
        Minimum power required of the selections ``W >= T``.
    max_l2norm : float
        Upper bound on the mean squared difference between the fitted
        statistics and ``|beta|``.
    seed : int
        Seed passed to ``np.random.seed`` before sampling.
    group_features : bool
        If True, random groups are drawn; otherwise each feature is its
        own group.
    **sample_kwargs
        Forwarded to ``sample_data``. Missing entries default to
        method="blockequi", gamma=1, n=200, p=50, rho=0.5,
        y_dist="gaussian".
    """
    # Avoid a shared mutable default argument
    fstat_kwargs = {} if fstat_kwargs is None else fstat_kwargs

    # Add defaults to sample kwargs
    defaults = {
        "method": "blockequi",
        "gamma": 1,
        "n": 200,
        "p": 50,
        "rho": 0.5,
        "y_dist": "gaussian",
    }
    for key, value in defaults.items():
        sample_kwargs.setdefault(key, value)
    n = sample_kwargs["n"]
    p = sample_kwargs["p"]
    rho = sample_kwargs["rho"]
    y_dist = sample_kwargs["y_dist"]

    # Create data generating process
    np.random.seed(seed)
    dgprocess = dgp.DGP()
    X, y, beta, _, corr_matrix = dgprocess.sample_data(**sample_kwargs)

    # Create groups
    if group_features:
        groups = np.random.randint(1, p + 1, size=(p,))
        groups = utilities.preprocess_groups(groups)
    else:
        groups = np.arange(1, p + 1, 1)

    # Create knockoffs using S = (1 - rho) * I
    ksampler = knockpy.knockoffs.GaussianSampler(
        X=X,
        groups=groups,
        Sigma=corr_matrix,
        verbose=False,
        S=(1 - rho) * np.eye(p),
    )
    Xk = ksampler.sample_knockoffs()

    # Fit and extract coeffs/T
    fstat.fit(X, Xk, y, groups=groups, **fstat_kwargs)
    W = fstat.W
    T = data_dependent_threshhold(W, fdr=0.2)

    # Test L2 norm. With one feature per group W is compared directly;
    # otherwise per-feature statistics are rebuilt from the Z statistics.
    m = np.unique(groups).shape[0]
    if m == p:
        pair_W = W
    else:
        pair_W = kstats.combine_Z_stats(fstat.Z, antisym="cd")
    l2norm = np.power(pair_W - np.abs(beta), 2).mean()
    self.assertTrue(
        l2norm < max_l2norm,
        msg=f"{fstat_name} fits {y_dist} data very poorly (l2norm = {l2norm} btwn real {beta} / fitted {pair_W} coeffs)",
    )

    # Test power for non-grouped setting.
    # (For group setting, power will be much lower.)
    selections = (W >= T).astype("float32")
    group_nnulls = utilities.fetch_group_nonnulls(beta, groups)
    power = ((group_nnulls != 0) * selections).sum() / np.sum(group_nnulls != 0)
    self.assertTrue(
        power >= min_power,
        msg=f"Power {power} for {fstat_name} in equicor case (n={n},p={p},rho={rho}, y_dist {y_dist}, grouped={group_features}) should be > {min_power}. W stats are {W}, beta is {beta}",
    )
def check_fdr_control(
    self,
    reps=NUM_REPS,
    q=0.2,
    alpha=0.05,
    filter_kwargs=None,
    S=None,
    infer_sigma=False,
    test_grouped=True,
    S_method="mvr",
    **kwargs,
):
    """
    Repeatedly run the knockoff filter and check null symmetry and FDR.

    A DGP is built once to fix ``beta`` and ``Sigma``; data is then
    re-sampled ``reps`` times (seeded with the repetition index) for an
    ungrouped and, optionally, a randomly grouped setting. In every
    repetition the null W-statistics are tested for symmetry, and the
    false discovery proportion is recorded. After all repetitions, a
    one-sided normal test at level ``alpha`` checks that the mean FDP
    does not significantly exceed ``q``.

    Parameters
    ----------
    reps : int
        Number of repetitions per setting.
    q : float
        Target FDR level passed to the filter as ``fdr``.
    alpha : float
        Level of the one-sided test on the empirical FDR.
    filter_kwargs : dict, optional
        May hold "ksampler" and "fstat" (used to construct the filter),
        "knockoff_kwargs" (forwarded as such), and other entries that
        are forwarded to ``forward``. Neither this dict nor the nested
        "knockoff_kwargs" dict is modified.
    S : optional
        S-matrix forwarded through ``knockoff_kwargs``.
    infer_sigma : bool
        If True, no mu/Sigma/invSigma is given to the filter.
    test_grouped : bool
        If False, the grouped setting is skipped.
    S_method : str
        Method passed to the ``GaussianSampler`` constructed below.
    **kwargs
        "mu", "Sigma", "invSigma" and "beta" configure the initial DGP;
        all remaining entries are forwarded to ``sample_data``.
    """
    np.random.seed(110)
    # Work on copies so that the caller's dictionaries stay untouched
    filter_kwargs = {} if filter_kwargs is None else filter_kwargs.copy()
    kwargs = kwargs.copy()
    fixedX = filter_kwargs.get("ksampler") == "fx"

    # Create and name DGP
    mu = kwargs.pop("mu", None)
    Sigma = kwargs.pop("Sigma", None)
    invSigma = kwargs.pop("invSigma", None)
    beta = kwargs.pop("beta", None)
    dgprocess = dgp.DGP(mu=mu, Sigma=Sigma, invSigma=invSigma, beta=beta)
    X0, _, beta, _, Sigma = dgprocess.sample_data(**kwargs)
    basename = "".join(f"{key}={value} " for key, value in kwargs.items())

    # Two settings: one grouped, one not
    p = Sigma.shape[0]
    groups1 = np.arange(1, p + 1, 1)
    name1 = basename + " (ungrouped)"
    groups2 = np.random.randint(1, p + 1, size=(p,))
    groups2 = utilities.preprocess_groups(groups2)
    name2 = basename + " (grouped)"

    # Split filter_kwargs
    init_filter_kwargs = {
        "ksampler": filter_kwargs.pop("ksampler", "gaussian"),
        "fstat": filter_kwargs.pop("fstat", "lasso"),
    }
    # The nested dict is written to below, so it is copied as well
    knockoff_kwargs = dict(filter_kwargs.pop("knockoff_kwargs", {}))

    # Quantile for the one-sided test on the empirical FDR
    norm_quant = stats.norm.ppf(1 - alpha)

    for name, groups in zip([name1, name2], [groups1, groups2]):
        if not test_grouped and np.all(groups == groups2):
            continue

        # Solve for S matrix.
        # NOTE(review): the sampler is constructed but its S-matrix is
        # never fetched, so S_method only matters for the constructor's
        # side effects -- confirm whether S should be taken from it.
        if S is None and not fixedX and not infer_sigma:
            knockpy.knockoffs.GaussianSampler(
                X=X0, Sigma=Sigma, groups=groups, method=S_method,
            )
        if not fixedX:
            invSigma = utilities.chol2inv(Sigma)
        group_nonnulls = utilities.fetch_group_nonnulls(beta, groups)

        # Container for fdps
        fdps = []

        # Sample data reps times
        for j in range(reps):
            np.random.seed(j)
            dgprocess = dgp.DGP(Sigma=Sigma, beta=beta)
            X, y, _, _, _ = dgprocess.sample_data(**kwargs)
            gibbs_graph = dgprocess.gibbs_graph

            # Run (MX) knockoff filter
            if fixedX or infer_sigma:
                mu_arg = None
                Sigma_arg = None
                invSigma_arg = None
            else:
                mu_arg = np.zeros(p)
                Sigma_arg = Sigma
                invSigma_arg = invSigma

            # Initialize filter
            knockoff_filter = KnockoffFilter(**init_filter_kwargs)

            # Knockoff kwargs
            knockoff_kwargs["S"] = S
            knockoff_kwargs["invSigma"] = invSigma_arg
            knockoff_kwargs["verbose"] = False
            if "df_t" in kwargs:
                knockoff_kwargs["df_t"] = kwargs["df_t"]
            if kwargs.get("x_dist") == "gibbs":
                # The gibbs setting passes the graph and no S-matrix
                knockoff_kwargs["gibbs_graph"] = gibbs_graph
                knockoff_kwargs.pop("S", None)

            selections = knockoff_filter.forward(
                X=X,
                y=y,
                mu=mu_arg,
                Sigma=Sigma_arg,
                groups=groups,
                knockoff_kwargs=knockoff_kwargs,
                fdr=q,
                **filter_kwargs,
            )

            # Check null W-statistics are symmetric (one-sided test on
            # the proportion of positive null statistics)
            pos_prop = (knockoff_filter.W[group_nonnulls == 0] > 0).mean()
            pos_prop_se = np.sqrt(
                pos_prop * (1 - pos_prop) / (1 - group_nonnulls).sum()
            )
            Zstat = (pos_prop - 0.5) / pos_prop_se
            pval = 1 - stats.norm.cdf(Zstat)
            self.assertTrue(
                pval >= 0.001,
                msg=f"MX filter null W-stats have pos_prob {pos_prop} with p={p} and pval={pval}",
            )

            # Calculate fdp
            fdp = np.sum(selections * (1 - group_nonnulls)) / max(
                1, np.sum(selections)
            )
            fdps.append(fdp)
            del knockoff_filter

        # The empirical FDR must not exceed q by more than sampling noise
        fdps = np.array(fdps)
        fdr = fdps.mean()
        fdr_se = fdps.std() / np.sqrt(reps)
        self.assertTrue(
            fdr - norm_quant * fdr_se <= q,
            msg=f"MX filter FDR is {fdr} with SE {fdr_se} with q = {q} for DGP {name}",
        )
def sample_bad_dist():
    """
    Call ``sample_data`` with ``p=100`` and ``coeff_dist="bad_dist_arg"``.

    Presumably passed as the callable of an expected-error assertion --
    confirm against the caller.
    """
    dgp.DGP().sample_data(p=100, coeff_dist="bad_dist_arg")