示例#1
0
    def test_unbatched(self):
        """Check data_dependent_threshhold on three one-dimensional W vectors."""

        # Case 1: the expected threshold is the smallest |W| entry (0.5 here).
        W1 = np.array(
            [1, -2, 3, 6, 3, -2, 1, 2, 5, 3, 0.5, 1, 1, 1, 1, 1, 1, 1])
        T1 = data_dependent_threshhold(W1, fdr=0.2)
        expected = np.abs(W1).min()
        # The message interpolates the real expectation; it previously claimed
        # "should be 0", which contradicted the value being asserted.
        self.assertEqual(
            T1,
            expected,
            msg=
            f"Incorrect data dependent threshhold: T1 should be {expected}, not {T1}",
        )

        # Case 2: every statistic is negative; the expected threshold is inf.
        W2 = np.array([-1, -2, -3])
        T2 = data_dependent_threshhold(W2, fdr=0.3)
        self.assertEqual(
            T2,
            np.inf,
            msg=
            f"Incorrect data dependent threshhold: T2 should be inf, not {T2}",
        )

        # Case 3: two negative entries among twelve; the expected threshold is 5.
        W3 = np.array([-5, -1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        T3 = data_dependent_threshhold(W3, fdr=0.2)
        self.assertEqual(
            T3,
            5,
            msg=
            f"Incorrect data dependent threshhold: T3 should be 5, not {T3}",
        )
示例#2
0
def Z2selections(Z, groups, q, **kwargs):
    """Turn Z statistics into selection flags at the requested FDR level.

    Z and groups are handed to ``kstats.combine_Z_stats`` together with any
    extra keyword arguments; the resulting W statistics are thresholded with
    ``kstats.data_dependent_threshhold`` using ``fdr=q``.

    Returns a tuple ``(selected_flags, W)`` where ``selected_flags`` is a
    float32 array holding 1.0 wherever ``W >= T`` and 0.0 elsewhere.
    """
    # Combine the Z statistics into W statistics.
    feature_stats = kstats.combine_Z_stats(Z, groups, **kwargs)

    # Threshold the W statistics and flag everything at or above the cutoff.
    cutoff = kstats.data_dependent_threshhold(W=feature_stats, fdr=q)
    is_selected = feature_stats >= cutoff
    return is_selected.astype("float32"), feature_stats
示例#3
0
    def test_batched(self):
        """Threshold three W columns in one call and compare with expectations."""

        # One column per scenario: all ones, mixed signs, all minus ones.
        columns = [
            np.full(10, 1),
            np.array([-2, -1, 1, 2, 3, 4, 5, 6, 7, 8]),
            np.full(10, -1),
        ]
        # Stacking yields shape (3, 10); transposing puts each scenario in a column.
        combined = np.stack(columns).T

        expected = np.array([1, 2, np.inf])
        Ts = data_dependent_threshhold(combined, fdr=0.2)
        failure_note = f"Incorrect data dependent threshhold (batched): Ts should be {expected}, not {Ts}"
        np.testing.assert_array_almost_equal(Ts, expected, err_msg=failure_note)
示例#4
0
    def check_kstat_fit(
        self,
        fstat,
        fstat_name,
        fstat_kwargs=None,
        min_power=0.8,
        max_l2norm=9,
        seed=110,
        group_features=False,
        **sample_kwargs,
    ):
        """Fit a feature statistic on synthetic data and check its fit and power.

        Data are drawn with ``dgp.DGP().sample_data``, Gaussian knockoffs are
        sampled, ``fstat`` is fitted, and two assertions are made: the mean
        squared gap between the W statistics and ``|beta|`` is below
        ``max_l2norm``, and the power of the selections at ``fdr=0.2`` is at
        least ``min_power``.

        Args:
            fstat: Should be a class instance inheriting from FeatureStatistic.
                Its ``fit`` method is called and its ``W`` attribute is read
                (``Z`` as well when features are grouped).
            fstat_name: Label used in the failure messages.
            fstat_kwargs: Extra keyword arguments forwarded to ``fstat.fit``.
                ``None`` (the default) means no extra arguments.
            min_power: Smallest acceptable power.
            max_l2norm: Strict upper bound on the mean squared difference
                between the W statistics and ``|beta|``.
            seed: Value passed to ``np.random.seed`` before sampling.
            group_features: If true, random group labels are drawn and passed
                through ``utilities.preprocess_groups``; otherwise every
                feature is its own group.
            **sample_kwargs: Forwarded to ``sample_data``. Defaults are filled
                in for ``method``, ``gamma``, ``n``, ``p``, ``rho`` and
                ``y_dist`` when the caller does not supply them.
        """
        # A None sentinel avoids sharing one mutable dict across calls.
        if fstat_kwargs is None:
            fstat_kwargs = {}

        # Add defaults to sample kwargs (caller-supplied values take priority)
        sample_defaults = (
            ("method", "blockequi"),
            ("gamma", 1),
            ("n", 200),
            ("p", 50),
            ("rho", 0.5),
            ("y_dist", "gaussian"),
        )
        for key, value in sample_defaults:
            sample_kwargs.setdefault(key, value)
        n = sample_kwargs["n"]
        p = sample_kwargs["p"]
        rho = sample_kwargs["rho"]
        y_dist = sample_kwargs["y_dist"]

        # Create data generating process
        np.random.seed(seed)
        dgprocess = dgp.DGP()
        X, y, beta, _, corr_matrix = dgprocess.sample_data(**sample_kwargs)

        # Create groups
        if group_features:
            groups = np.random.randint(1, p + 1, size=(p, ))
            groups = utilities.preprocess_groups(groups)
        else:
            groups = np.arange(1, p + 1, 1)

        # Create knockoffs
        ksampler = knockpy.knockoffs.GaussianSampler(
            X=X,
            groups=groups,
            Sigma=corr_matrix,
            verbose=False,
            S=(1 - rho) * np.eye(p),
        )
        Xk = ksampler.sample_knockoffs()
        # NOTE(review): the result is not used in this check. The call is kept
        # in case fetch_S has side effects or raises on a bad sampler state;
        # confirm and drop it if it is a plain accessor.
        ksampler.fetch_S()

        # Fit and extract coeffs/T
        fstat.fit(
            X,
            Xk,
            y,
            groups=groups,
            **fstat_kwargs,
        )
        W = fstat.W
        T = data_dependent_threshhold(W, fdr=0.2)

        # Test L2 norm. With one feature per group W can be compared with
        # |beta| directly; otherwise per-feature statistics are rebuilt from Z.
        m = np.unique(groups).shape[0]
        if m == p:
            pair_W = W
        else:
            pair_W = kstats.combine_Z_stats(fstat.Z, antisym="cd")
        l2norm = np.power(pair_W - np.abs(beta), 2).mean()
        self.assertLess(
            l2norm,
            max_l2norm,
            msg=
            f"{fstat_name} fits {y_dist} data very poorly (l2norm = {l2norm} btwn real {beta} / fitted {pair_W} coeffs)",
        )

        # Test power for non-grouped setting.
        # (For group setting, power will be much lower.)
        selections = (W >= T).astype("float32")
        group_nnulls = utilities.fetch_group_nonnulls(beta, groups)
        # Power is the fraction of non-null groups that were selected.
        power = (
            (group_nnulls != 0) * selections).sum() / np.sum(group_nnulls != 0)
        self.assertGreaterEqual(
            power,
            min_power,
            msg=
            f"Power {power} for {fstat_name} in equicor case (n={n},p={p},rho={rho}, y_dist {y_dist}, grouped={group_features}) should be > {min_power}. W stats are {W}, beta is {beta}",
        )