def test_chol2inv(self):
    """Check that chol2inv inverts a random positive-definite matrix."""
    # Build a positive-definite matrix as A^T A for a random A
    A = np.random.randn(100, 100)
    X = A.T @ A
    # Inverting via the Cholesky decomposition should reproduce identity
    X_inv = utilities.chol2inv(X)
    np.testing.assert_array_almost_equal(
        np.eye(100),
        X @ X_inv,
        decimal=6,
        err_msg='chol2inv fails to correctly calculate inverses',
    )
def test_corrmatrix_errors(self):
    """ Tests that SDP raises informative errors when sigma is not scaled properly"""
    # Invert a random precision matrix WITHOUT rescaling to a correlation
    # matrix -- the SDP solver is expected to reject it.
    np.random.seed(110)
    Q = graphs.ErdosRenyi(p=50, tol=1e-1)
    V = utilities.chol2inv(Q)
    # Five groups of ten features, labelled 1..5
    groups = np.repeat(np.arange(1, 6), 10).astype('int32')

    # Wrap the call so assertRaisesRegex can invoke it
    def SDP_solver():
        return knockoffs.solve_group_SDP(V, groups)

    # The solver should raise an informative ValueError
    self.assertRaisesRegex(
        ValueError, "Sigma is not a correlation matrix", SDP_solver
    )
def check_S_properties(self, V, S, groups):
    """Assert S is a valid knockoff S-matrix for covariance V: S and
    2V - S have strictly positive minimum eigenvalues, the implied
    conditional knockoff covariance is positive, and S is block-diagonal
    with blocks given by `groups`.

    NOTE(review): the checks require mineig > 0 (strict definiteness)
    while the messages say "semidefinite" -- presumably intentional slack
    in wording; confirm.
    """
    # Smallest eigenvalue of S must be positive
    min_S_eig = np.linalg.eigh(S)[0].min()
    self.assertTrue(
        min_S_eig > 0,
        f'S matrix is not positive semidefinite: mineig is {min_S_eig}')
    # Likewise for 2V - S, which valid knockoffs require
    min_diff_eig = np.linalg.eigh(2 * V - S)[0].min()
    self.assertTrue(
        min_diff_eig > 0,
        f"2Sigma-S matrix is not positive semidefinite: mineig is {min_diff_eig}"
    )
    # Conditional knockoff covariance: 2S - S V^{-1} S
    invV = utilities.chol2inv(V)
    Vk = 2 * S - S @ (invV @ S)
    # Its smallest eigenvalue must also be positive
    min_Vk_eig = np.linalg.eigh(Vk)[0].min()
    self.assertTrue(
        min_Vk_eig > 0,
        f"conditional knockoff matrix is not positive semidefinite: mineig is {min_Vk_eig}"
    )
    # Rebuild S keeping only the within-group blocks; off-block entries of
    # the reconstruction stay zero, so equality means S is block-diagonal.
    p = V.shape[0]
    S_test = np.zeros((p, p))
    for g in np.unique(groups):
        inds = np.where(groups == g)[0]
        block = np.ix_(inds, inds)
        S_test[block] = S[block]
    np.testing.assert_almost_equal(
        S_test, S, decimal=5,
        err_msg="S matrix is not a block matrix of the correct shape")
def test_sdp_tolerance(self):
    """Check the SDP solver controls the minimum eigenvalue of the
    knockoff Gram matrix G at each requested tolerance."""
    # Get graph
    np.random.seed(110)
    Q = graphs.ErdosRenyi(p=50, tol=1e-1)
    V = cov2corr(utilities.chol2inv(Q))
    # Five groups of ten features, labelled 1..5
    groups = np.concatenate([np.zeros(10) + j for j in range(5)]) + 1
    groups = groups.astype('int32')
    # Solve SDP at several tolerances
    for tol in [1e-3, 0.01, 0.02]:
        S = knockoffs.compute_S_matrix(Sigma=V,
                                       groups=groups,
                                       method='sdp',
                                       objective="pnorm",
                                       num_iter=10,
                                       tol=tol)
        G = np.hstack([np.vstack([V, V - S]), np.vstack([V - S, V])])
        # G is symmetric, so use eigh: np.linalg.eig can return complex
        # eigenvalues with tiny imaginary parts, making the comparison
        # below fragile. eigh also matches check_S_properties.
        mineig = np.linalg.eigh(G)[0].min()
        # Allow mineig to undershoot tol by at most tol/10
        self.assertTrue(
            tol - mineig > -1 * tol / 10,
            f'sdp solver fails to control minimum eigenvalues: tol is {tol}, val is {mineig}'
        )
        self.check_S_properties(V, S, groups)
def test_debiased_lasso(self):
    """Fit the debiased lasso on Gaussian data, then check it raises the
    documented errors when Ginv is missing and on binomial data."""
    # Create data generating process
    n = 200
    p = 20
    rho = 0.3
    np.random.seed(110)
    X, y, beta, _, corr_matrix = graphs.sample_data(
        n=n, p=p, y_dist='gaussian', coeff_size=100,
        sign_prob=0.5, method='daibarber2016', rho=rho,
    )
    groups = np.arange(1, p + 1, 1)

    # Create knockoffs. Local renamed from `knockoffs` to `Xk` so it no
    # longer shadows the `knockoffs` module used elsewhere in this file.
    Xk, S = knockadapt.knockoffs.gaussian_knockoffs(
        X=X, groups=groups, Sigma=corr_matrix, return_S=True,
        verbose=False, sdp_verbose=False, S=(1 - rho) * np.eye(p),
    )
    Xk = Xk[:, :, 0]
    # Joint covariance of [X, Xk] and its inverse for debiasing
    G = np.concatenate(
        [np.concatenate([corr_matrix, corr_matrix - S]),
         np.concatenate([corr_matrix - S, corr_matrix])],
        axis=1,
    )
    Ginv = utilities.chol2inv(G)

    # Debiased lasso - test accuracy
    dlasso_stat = kstats.LassoStatistic()
    dlasso_stat.fit(
        X, Xk, y, use_lars=False, cv_score=False, debias=True, Ginv=Ginv,
    )
    W = dlasso_stat.W
    l2norm = np.power(W - beta, 2).mean()
    # NOTE(review): asserting l2norm > 1 while the failure message claims
    # a poor fit looks inverted -- confirm the intended direction.
    self.assertTrue(l2norm > 1,
                    msg=f'Debiased lasso fits gaussian very poorly (l2norm = {l2norm} btwn real/fitted coeffs)'
                    )

    # Test that this throws the correct errors
    # first for Ginv
    def debiased_lasso_sans_Ginv():
        dlasso_stat.fit(
            X, Xk, y, use_lars=False, cv_score=False, debias=True, Ginv=None,
        )
    self.assertRaisesRegex(
        ValueError, "Ginv must be provided", debiased_lasso_sans_Ginv
    )

    # Second for logistic data
    y = np.random.binomial(1, 0.5, n)

    def binomial_debiased_lasso():
        dlasso_stat.fit(
            X, Xk, y, use_lars=False, cv_score=False, debias=True, Ginv=Ginv,
        )
    self.assertRaisesRegex(
        ValueError, "Debiased lasso is not implemented for binomial data",
        binomial_debiased_lasso
    )
def single_dataset_power_fdr(
    seed,
    Sigma,
    beta,
    groups,
    normalize=True,
    sample_kwargs=None,
    filter_kwargs=None,
    S_matrices=None,
    time0=None,
    S_curve=False,
):
    """
    Samples one dataset, runs the knockoff filter for every S matrix, and
    records power/FDP for each antisymmetric function and q value.

    Knockoff kwargs should be included in filter_kwargs.

    Returns a list of rows:
    [S_method, power, fdp, q, score, score_type, pair_agg, W, Z, tildeZ,
     selection, seed]
    """
    # Default the kwarg dicts here rather than in the signature: mutable
    # defaults are shared across calls, and this function mutates the
    # *nested* 'knockoff_kwargs' dict, so a shared default would leak
    # state ('S', 'method', '_sdp_degen', 'verbose') between calls.
    if sample_kwargs is None:
        sample_kwargs = {}
    if filter_kwargs is None:
        filter_kwargs = {
            'feature_stat_kwargs': {},
            'knockoff_kwargs': {},
        }

    # Fetch groups, infer q
    if Sigma is not None:
        p = Sigma.shape[0]
    else:
        p = sample_kwargs['p']
    if groups is None:
        groups = np.arange(1, p + 1, 1)

    # Prevent global changes to kwargs. The nested knockoff_kwargs dict is
    # copied too because it is mutated inside the S-matrix loop below --
    # a shallow copy alone would still mutate the caller's dict.
    filter_kwargs = filter_kwargs.copy()
    sample_kwargs = sample_kwargs.copy()
    if 'knockoff_kwargs' in filter_kwargs:
        filter_kwargs['knockoff_kwargs'] = filter_kwargs['knockoff_kwargs'].copy()

    # Sample data, record time
    localtime = time.time()
    np.random.seed(seed)
    X, y, beta, _, Sigma = knockadapt.graphs.sample_data(
        corr_matrix=Sigma, beta=beta, **sample_kwargs
    )

    # Parse nonnulls (this allows for group knockoffs although
    # we do not actually use this in our experiments)
    group_nonnulls = knockadapt.utilities.fetch_group_nonnulls(beta, groups)

    # Some updates for fixedX knockoffs
    # and MX knockoffs when Sigma must be inferred
    fixedX = fetch_kwarg(filter_kwargs, 'fixedx', default=False)
    infer_sigma = fetch_kwarg(filter_kwargs, 'infer_sigma', default=False)

    # For the metro sampler, we can compute better S matrices if we have a
    # guess of the rejection rate. We do not use this in our experiments
    # by default.
    rej_rate = fetch_kwarg(filter_kwargs, 'rej_rate', default=0)

    # We calculate S-matrices if we do not already know them.
    if 'knockoff_kwargs' in filter_kwargs:
        kwargs = filter_kwargs['knockoff_kwargs']
    else:
        kwargs = {}

    # Case 1: We have to estimate Sigma from the data
    if infer_sigma:
        shrinkage = fetch_kwarg(filter_kwargs, 'shrinkage', default='ledoitwolf')
        Sigma, _ = knockadapt.utilities.estimate_covariance(X, shrinkage=shrinkage)
        Sigma = utilities.cov2corr(Sigma)
        invSigma = utilities.chol2inv(Sigma)

    # Case 2: We are running FX knockoffs
    if fixedX:
        Sigma = np.dot(X.T, X)
        invSigma = None

    # Calculate S-matrices from Sigma
    if infer_sigma or fixedX or S_matrices is None:
        verbose = fetch_kwarg(kwargs, 'verbose', default=False)
        S_matrices = fetch_competitor_S(Sigma=Sigma,
                                        groups=groups,
                                        time0=time0,
                                        rej_rate=rej_rate,
                                        verbose=verbose,
                                        S_curve=S_curve,
                                        **kwargs)

    # Now we loop through the S matrices
    degen_flag = 'sdp_perturbed' in S_matrices
    output = []
    for S_method in S_matrices:
        # Pull S matrix
        S = S_matrices[S_method]
        # If the method produces fully degenerate knockoffs,
        # signal this as part of the filter kwargs.
        _sdp_degen = (degen_flag and S_method == 'sdp')
        # Do NOT run OLS or debiased lasso for degenerate case,
        # because this will lead to linear algebra errors.
        if _sdp_degen and 'feature_stat' in filter_kwargs:
            if filter_kwargs['feature_stat'] == 'dlasso':
                # Emit zero-power placeholder rows and skip this S matrix
                for pair_agg in PAIR_AGGS:
                    for q in q_values:
                        output.append([
                            S_method, 0, 0, q, 0, "NULL", pair_agg,
                            np.zeros(p), np.zeros(p), np.zeros(p),
                            np.zeros(p), seed
                        ])
                continue

        # Create knockoff_kwargs
        if 'knockoff_kwargs' not in filter_kwargs:
            filter_kwargs['knockoff_kwargs'] = {}
        filter_kwargs['knockoff_kwargs']['S'] = S
        filter_kwargs['knockoff_kwargs']['method'] = S_method
        filter_kwargs['knockoff_kwargs']['_sdp_degen'] = _sdp_degen
        if 'verbose' not in filter_kwargs['knockoff_kwargs']:
            filter_kwargs['knockoff_kwargs']['verbose'] = False

        # Possibly pass a few parameters to metro sampler
        if 'x_dist' in sample_kwargs:
            # For Ising
            if 'gibbs_graph' in sample_kwargs:
                filter_kwargs['knockoff_kwargs'][
                    'gibbs_graph'] = sample_kwargs['gibbs_graph']
            # For t-distributions
            if str(sample_kwargs['x_dist']).lower() in ['ar1t', 'blockt']:
                if 'df_t' in sample_kwargs:
                    filter_kwargs['knockoff_kwargs']['df_t'] = sample_kwargs['df_t']
                else:
                    filter_kwargs['knockoff_kwargs']['df_t'] = DEFAULT_DF_T  # This matters

        # Run MX knockoff filter to obtain Z statistics
        knockoff_filter = KnockoffFilter(fixedX=fixedX)
        knockoff_filter.forward(X=X,
                                y=y,
                                mu=np.zeros(p),
                                Sigma=Sigma,
                                groups=groups,
                                **filter_kwargs)
        Z = knockoff_filter.Z
        score = knockoff_filter.score
        score_type = knockoff_filter.score_type

        # Calculate power/fdp/score for a variety of
        # antisymmetric functions
        for pair_agg in PAIR_AGGS:
            for q in q_values:
                # Start by creating selections
                selections, W = Z2selections(Z=Z,
                                             groups=groups,
                                             q=q,
                                             pair_agg=pair_agg)
                # Then create power/fdp
                power, fdp = selection2power(selections, group_nonnulls)
                output.append([
                    S_method,
                    power,
                    fdp,
                    q,
                    score,
                    score_type,
                    pair_agg,
                    np.around(W, 4),
                    np.around(Z[0:p], 4),
                    np.around(Z[p:], 4),
                    selections,
                    seed,
                ])

    # Possibly log progress. Logging is best-effort: in notebooks the
    # lookups below may fail (e.g. missing 'n', time0 is None), so
    # swallow ordinary exceptions -- but not KeyboardInterrupt/SystemExit,
    # which the previous bare `except:` also caught.
    try:
        if seed % 10 == 0 and sample_kwargs['n'] == MAXIMUM_N:
            overall_cost = time.time() - time0
            local_cost = time.time() - localtime
            print(
                f"Finished one seed {seed}, took {local_cost} per seed, {overall_cost} total"
            )
    except Exception:
        # In notebooks this will error
        pass

    # Output: list of
    # [S_method, power, fdp, q, score, score_type, pair_agg, W, Z, tildeZ, selection, seed]
    return output