def test_unbatched(self):
    """Check data_dependent_threshhold on single (unbatched) W vectors.

    Covers three regimes: a W vector where the smallest |W| is selectable
    at the given FDR, a vector where no threshold exists (T == inf), and
    a vector with a unique positive threshold.
    """
    # Many positives: threshold should drop to the smallest |W| (0.5 here).
    W1 = np.array([1, -2, 3, 6, 3, -2, 1, 2, 5, 3, 0.5, 1, 1, 1, 1, 1, 1, 1])
    T1 = data_dependent_threshhold(W1, fdr=0.2)
    expected = np.abs(W1).min()
    self.assertTrue(
        T1 == expected,
        # BUG FIX: message previously claimed "T1 should be 0" even though
        # the asserted value is np.abs(W1).min(); report the real expectation.
        msg=f"Incorrect data dependent threshhold: T1 should be {expected}, not {T1}",
    )
    # All-negative W: no valid threshold, so T must be infinite.
    W2 = np.array([-1, -2, -3])
    T2 = data_dependent_threshhold(W2, fdr=0.3)
    self.assertTrue(
        T2 == np.inf,
        msg=f"Incorrect data dependent threshhold: T2 should be inf, not {T2}",
    )
    # Mixed signs with a unique threshold of 5 at FDR 0.2.
    W3 = np.array([-5, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    T3 = data_dependent_threshhold(W3, fdr=0.2)
    self.assertTrue(
        T3 == 5,
        msg=f"Incorrect data dependent threshhold: T3 should be 5, not {T3}",
    )
def Z2selections(Z, groups, q, **kwargs):
    """Turn feature/knockoff Z statistics into binary selections.

    Combines the Z statistics into W statistics via
    ``kstats.combine_Z_stats``, computes the data-dependent threshold at
    FDR level ``q``, and flags every group whose W meets the threshold.

    Returns a tuple ``(selected_flags, W)`` where ``selected_flags`` is a
    float32 0/1 array and ``W`` is the combined statistic vector.
    """
    W = kstats.combine_Z_stats(Z, groups, **kwargs)
    threshold = kstats.data_dependent_threshhold(W=W, fdr=q)
    selected_flags = (W >= threshold).astype("float32")
    return selected_flags, W
def test_batched(self):
    """Check data_dependent_threshhold on a batch of W vectors.

    Stacks three W vectors column-wise and verifies the per-column
    thresholds: all-positive -> smallest |W|, mixed -> 2, all-negative
    -> inf.
    """
    columns = [
        np.array([1] * 10),                            # all positive
        np.array([-2, -1, 1, 2, 3, 4, 5, 6, 7, 8]),    # mixed signs
        np.array([-1] * 10),                           # all negative
    ]
    combined = np.stack(columns).transpose()
    Ts = data_dependent_threshhold(combined, fdr=0.2)
    expected = np.array([1, 2, np.inf])
    np.testing.assert_array_almost_equal(
        Ts,
        expected,
        err_msg=f"Incorrect data dependent threshhold (batched): Ts should be {expected}, not {Ts}",
    )
def check_kstat_fit(
    self,
    fstat,
    fstat_name,
    fstat_kwargs=None,
    min_power=0.8,
    max_l2norm=9,
    seed=110,
    group_features=False,
    **sample_kwargs,
):
    """Fit a feature statistic on synthetic data and check fit quality.

    Parameters
    ----------
    fstat : object
        A class instance inheriting from FeatureStatistic.
    fstat_name : str
        Human-readable name, used only in assertion messages.
    fstat_kwargs : dict, optional
        Extra keyword args forwarded to ``fstat.fit``. Defaults to {}.
        (BUG FIX: was a mutable default argument ``fstat_kwargs={}``;
        now uses None-sentinel to avoid cross-call sharing.)
    min_power : float
        Minimum acceptable selection power at FDR 0.2.
    max_l2norm : float
        Maximum acceptable mean squared error between fitted and true
        coefficient magnitudes.
    seed : int
        Numpy random seed for reproducibility.
    group_features : bool
        If True, assign features to random groups; otherwise each
        feature is its own group.
    **sample_kwargs
        Forwarded to ``dgp.DGP().sample_data``; sensible defaults are
        filled in below.
    """
    fstat_kwargs = {} if fstat_kwargs is None else fstat_kwargs

    # Fill in default sampling parameters without overriding caller values.
    defaults = {
        "method": "blockequi",
        "gamma": 1,
        "n": 200,
        "p": 50,
        "rho": 0.5,
        "y_dist": "gaussian",
    }
    for key, value in defaults.items():
        sample_kwargs.setdefault(key, value)
    n = sample_kwargs["n"]
    p = sample_kwargs["p"]
    rho = sample_kwargs["rho"]
    y_dist = sample_kwargs["y_dist"]

    # Create data generating process
    np.random.seed(seed)
    dgprocess = dgp.DGP()
    X, y, beta, _, corr_matrix = dgprocess.sample_data(**sample_kwargs)

    # Create groups: random partition into up to p groups, or singletons.
    if group_features:
        groups = np.random.randint(1, p + 1, size=(p,))
        groups = utilities.preprocess_groups(groups)
    else:
        groups = np.arange(1, p + 1, 1)

    # Sample Gaussian knockoffs with an explicit equicorrelated S matrix.
    ksampler = knockpy.knockoffs.GaussianSampler(
        X=X,
        groups=groups,
        Sigma=corr_matrix,
        verbose=False,
        S=(1 - rho) * np.eye(p),
    )
    Xk = ksampler.sample_knockoffs()
    # NOTE(review): S is fetched but unused below; kept in case fetch_S has
    # side effects or is needed for debugging — confirm before removing.
    S = ksampler.fetch_S()

    # Fit the feature statistic and compute the selection threshold.
    fstat.fit(X, Xk, y, groups=groups, **fstat_kwargs)
    W = fstat.W
    T = data_dependent_threshhold(W, fdr=0.2)

    # Test L2 norm between fitted coefficient magnitudes and |beta|.
    # When features are grouped (m < p), recombine per-feature Z stats.
    m = np.unique(groups).shape[0]
    if m == p:
        pair_W = W
    else:
        pair_W = kstats.combine_Z_stats(fstat.Z, antisym="cd")
    l2norm = np.power(pair_W - np.abs(beta), 2)
    l2norm = l2norm.mean()
    self.assertTrue(
        l2norm < max_l2norm,
        msg=f"{fstat_name} fits {y_dist} data very poorly (l2norm = {l2norm} btwn real {beta} / fitted {pair_W} coeffs)",
    )

    # Test power for non-grouped setting.
    # (For group setting, power will be much lower.)
    selections = (W >= T).astype("float32")
    group_nnulls = utilities.fetch_group_nonnulls(beta, groups)
    power = ((group_nnulls != 0) * selections).sum() / np.sum(group_nnulls != 0)
    # fdp is computed as a diagnostic; it is not asserted on.
    fdp = ((group_nnulls == 0) * selections).sum() / max(np.sum(selections), 1)
    self.assertTrue(
        power >= min_power,
        msg=f"Power {power} for {fstat_name} in equicor case (n={n},p={p},rho={rho}, y_dist {y_dist}, grouped={group_features}) should be > {min_power}. W stats are {W}, beta is {beta}",
    )