Example #1
0
	def test_chol2inv(self):
		""" Tests that chol2inv correctly inverts a positive definite matrix. """

		# BUGFIX: seed the RNG for reproducibility -- the test was
		# nondeterministic, unlike the other tests in this file (seed 110).
		np.random.seed(110)

		# X.T @ X is positive definite almost surely for a square Gaussian draw
		X = np.random.randn(100, 100)
		X = np.dot(X.T, X)

		# chol2inv should return the inverse: X @ inverse must equal identity
		inverse = utilities.chol2inv(X)
		np.testing.assert_array_almost_equal(
			np.eye(100), np.dot(X, inverse), decimal = 6,
			err_msg = 'chol2inv fails to correctly calculate inverses'
		)
Example #2
0
    def test_corrmatrix_errors(self):
        """ Tests that SDP raises informative errors when sigma is not scaled properly"""

        # Build a covariance (not correlation) matrix by inverting an
        # Erdos-Renyi precision matrix -- deliberately left unscaled
        np.random.seed(110)
        Q = graphs.ErdosRenyi(p=50, tol=1e-1)
        V = utilities.chol2inv(Q)

        # Five contiguous groups of ten features, labelled 1..5
        groups = (np.concatenate([np.zeros(10) + j for j in range(5)]) + 1).astype('int32')

        # Wrap the solver call so assertRaisesRegex can invoke it
        def SDP_solver():
            return knockoffs.solve_group_SDP(V, groups)

        # The solver must reject a matrix whose diagonal is not all ones
        self.assertRaisesRegex(ValueError, "Sigma is not a correlation matrix",
                               SDP_solver)
Example #3
0
    def check_S_properties(self, V, S, groups):
        """ Verifies that S is a valid knockoff S-matrix for covariance V:
        S, 2V - S, and the conditional knockoff covariance must all be
        positive definite, and S must be block-diagonal w.r.t. `groups`. """

        def min_eig(M):
            # Smallest eigenvalue of a symmetric matrix
            return np.linalg.eigh(M)[0].min()

        # S itself must be positive definite
        eig_S = min_eig(S)
        self.assertTrue(
            eig_S > 0,
            f'S matrix is not positive semidefinite: mineig is {eig_S}')

        # So must 2V - S (necessary for valid knockoffs)
        eig_diff = min_eig(2 * V - S)
        self.assertTrue(
            eig_diff > 0,
            f"2Sigma-S matrix is not positive semidefinite: mineig is {eig_diff}"
        )

        # Conditional knockoff covariance: 2S - S V^{-1} S
        invV = utilities.chol2inv(V)
        Vk = 2 * S - np.dot(S, np.dot(invV, S))

        # It must be positive definite as well
        eig_Vk = min_eig(Vk)
        self.assertTrue(
            eig_Vk > 0,
            f"conditional knockoff matrix is not positive semidefinite: mineig is {eig_Vk}"
        )

        # Copy only the within-group blocks of S; if S is truly block-diagonal
        # w.r.t. the groups, the copy reproduces S exactly
        p = V.shape[0]
        blocked = np.zeros((p, p))
        for g in np.unique(groups):
            members = np.where(groups == g)[0]
            block = np.ix_(members, members)
            blocked[block] = S[block]

        # Compare the block-restricted copy with the original S
        np.testing.assert_almost_equal(
            blocked,
            S,
            decimal=5,
            err_msg="S matrix is not a block matrix of the correct shape")
Example #4
0
    def test_sdp_tolerance(self):
        """ Tests that the SDP solver respects the minimum-eigenvalue
        tolerance passed via `tol`. """

        # Build a correlation matrix from an Erdos-Renyi precision matrix
        np.random.seed(110)
        Q = graphs.ErdosRenyi(p=50, tol=1e-1)
        V = cov2corr(utilities.chol2inv(Q))
        # Five groups of ten features, labelled 1..5
        groups = np.concatenate([np.zeros(10) + j for j in range(5)]) + 1
        groups = groups.astype('int32')

        # Solve SDP at several tolerances
        for tol in [1e-3, 0.01, 0.02]:
            S = knockoffs.compute_S_matrix(Sigma=V,
                                           groups=groups,
                                           method='sdp',
                                           objective="pnorm",
                                           num_iter=10,
                                           tol=tol)
            # G is the joint (feature, knockoff) covariance matrix
            G = np.hstack([np.vstack([V, V - S]), np.vstack([V - S, V])])
            # BUGFIX: use eigh (not eig) -- G is symmetric, and eig may return
            # complex eigenvalues with spurious imaginary parts, making .min()
            # compare complex numbers. Also consistent with the other tests.
            mineig = np.linalg.eigh(G)[0].min()
            # mineig should be at least tol, up to 10% numerical slack
            self.assertTrue(
                tol - mineig > -1 * tol / 10,
                f'sdp solver fails to control minimum eigenvalues: tol is {tol}, val is {mineig}'
            )
            self.check_S_properties(V, S, groups)
Example #5
0
	def test_debiased_lasso(self):
		"""
		Tests the debiased-lasso feature statistic: checks its accuracy on
		gaussian data and that it raises informative errors when Ginv is
		missing or the response is binomial.
		"""

		# Create data generating process (equicorrelated design, strong signal)
		n = 200
		p = 20
		rho = 0.3
		np.random.seed(110)
		X, y, beta, _, corr_matrix = graphs.sample_data(
			n = n, p = p, y_dist = 'gaussian', 
			coeff_size = 100, sign_prob = 0.5,
			method = 'daibarber2016',
			rho=rho
		)       
		groups = np.arange(1, p+1, 1)

		# Create gaussian knockoffs with a fixed equicorrelated S matrix
		knockoffs, S = knockadapt.knockoffs.gaussian_knockoffs(
			X=X, 
			groups=groups,
			Sigma=corr_matrix,
			return_S=True,
			verbose=False,
			sdp_verbose=False,
			S = (1-rho)*np.eye(p)
		)
		knockoffs = knockoffs[:, :, 0]
		# G is the joint covariance of [X, knockoffs]; its inverse is
		# required by the debiasing step
		G = np.concatenate([
				np.concatenate([corr_matrix, corr_matrix-S]),
				np.concatenate([corr_matrix-S, corr_matrix])],
				axis=1
			)
		Ginv = utilities.chol2inv(G)

		# Debiased lasso - test accuracy
		dlasso_stat = kstats.LassoStatistic()
		dlasso_stat.fit(
			X,
			knockoffs,
			y,
			use_lars=False,
			cv_score=False,
			debias=True,
			Ginv=Ginv
		)
		W = dlasso_stat.W
		l2norm = np.power(W - beta, 2).mean()
		# NOTE(review): this asserts l2norm > 1 (i.e. W does NOT closely match
		# beta), yet the failure message describes a poor fit -- the inequality
		# direction looks inverted; confirm the intended semantics.
		# BUGFIX: corrected the misspelling 'gauissan' -> 'gaussian' in the
		# failure message.
		self.assertTrue(l2norm > 1,
			msg = f'Debiased lasso fits gaussian very poorly (l2norm = {l2norm} btwn real/fitted coeffs)'
		)

		# Test that this throws the correct errors
		# first for Ginv
		def debiased_lasso_sans_Ginv():
			dlasso_stat.fit(
				X,
				knockoffs,
				y,
				use_lars=False,
				cv_score=False,
				debias=True,
				Ginv=None
			)
		self.assertRaisesRegex(
			ValueError, "Ginv must be provided",
			debiased_lasso_sans_Ginv
		)

		# Second for logistic data
		y = np.random.binomial(1, 0.5, n)
		def binomial_debiased_lasso():
			dlasso_stat.fit(
				X,
				knockoffs,
				y,
				use_lars=False,
				cv_score=False,
				debias=True,
				Ginv=Ginv,
			)
		self.assertRaisesRegex(
			ValueError, "Debiased lasso is not implemented for binomial data",
			binomial_debiased_lasso
		)
Example #6
0
def single_dataset_power_fdr(
    seed,
    Sigma,
    beta,
    groups,
    normalize=True,
    sample_kwargs=None,
    filter_kwargs=None,
    S_matrices=None,
    time0=None,
    S_curve=False,
):
    """
    Samples a single dataset and evaluates knockoff power/FDP for each
    candidate S-matrix, FDR level `q`, and antisymmetric aggregation
    function.

    Knockoff kwargs should be included in filter_kwargs under the
    'knockoff_kwargs' key.

    Parameters
    ----------
    seed : int
        Random seed for data generation (also recorded in every output row).
    Sigma : np.ndarray or None
        Feature covariance matrix; if None, ``sample_kwargs['p']`` must give
        the dimension.
    beta : np.ndarray or None
        Coefficient vector, passed through to the data sampler.
    groups : np.ndarray or None
        Group labels for group knockoffs; defaults to one group per feature.
    normalize : bool
        Not read in this function body; kept for interface compatibility.
    sample_kwargs : dict, optional
        Keyword arguments for ``knockadapt.graphs.sample_data``.
    filter_kwargs : dict, optional
        Keyword arguments for the knockoff filter.
    S_matrices : dict or None
        Precomputed S-matrices keyed by method name; recomputed when None,
        when Sigma is inferred, or when FX knockoffs are used.
    time0 : float or None
        Wall-clock start time, used for progress logging.
    S_curve : bool
        Passed through to ``fetch_competitor_S``.

    Returns
    -------
    list of lists
        Rows of [S_method, power, fdp, q, score, score_type, pair_agg,
        W, Z, tildeZ, selections, seed].
    """
    # BUGFIX: the defaults used to be mutable dict literals. The shallow
    # .copy() below did not protect the *nested* 'knockoff_kwargs' dict, so
    # assignments like filter_kwargs['knockoff_kwargs']['S'] = S leaked into
    # the shared default (and into the caller's dict) across calls.
    if sample_kwargs is None:
        sample_kwargs = {}
    if filter_kwargs is None:
        filter_kwargs = {
            'feature_stat_kwargs': {},
            'knockoff_kwargs': {},
        }

    # Fetch groups, infer q
    if Sigma is not None:
        p = Sigma.shape[0]
    else:
        p = sample_kwargs['p']
    if groups is None:
        groups = np.arange(1, p + 1, 1)

    # Prevent global changes to kwargs, including the nested knockoff kwargs
    # which are mutated below (see BUGFIX note above)
    filter_kwargs = filter_kwargs.copy()
    if 'knockoff_kwargs' in filter_kwargs:
        filter_kwargs['knockoff_kwargs'] = filter_kwargs['knockoff_kwargs'].copy()
    sample_kwargs = sample_kwargs.copy()

    # Sample data, record time
    localtime = time.time()
    np.random.seed(seed)
    X, y, beta, _, Sigma = knockadapt.graphs.sample_data(corr_matrix=Sigma,
                                                         beta=beta,
                                                         **sample_kwargs)

    # Parse nonnulls (this allows for group knockoffs although
    # we do not actually use this in our experiments)
    group_nonnulls = knockadapt.utilities.fetch_group_nonnulls(beta, groups)

    # Some updates for fixedX knockoffs
    # and MX knockoffs when Sigma must be inferred
    fixedX = fetch_kwarg(filter_kwargs, 'fixedx', default=False)
    infer_sigma = fetch_kwarg(filter_kwargs, 'infer_sigma', default=False)

    # For the metro sampler, we can compute better S matrices if we have a
    # guess of the rejection rate. We do not use this in our experiments
    # by default.
    rej_rate = fetch_kwarg(filter_kwargs, 'rej_rate', default=0)

    # We calculate S-matrices if we do not already know them.
    if 'knockoff_kwargs' in filter_kwargs:
        kwargs = filter_kwargs['knockoff_kwargs']
    else:
        kwargs = {}
    # Case 1: We have to estimate Sigma from the data
    if infer_sigma:
        shrinkage = fetch_kwarg(filter_kwargs,
                                'shrinkage',
                                default='ledoitwolf')
        Sigma, _ = knockadapt.utilities.estimate_covariance(
            X, shrinkage=shrinkage)
        Sigma = utilities.cov2corr(Sigma)
        # NOTE(review): invSigma is assigned but never read later in this
        # function -- kept to preserve behavior (chol2inv would raise on a
        # singular Sigma); confirm whether it can be removed.
        invSigma = utilities.chol2inv(Sigma)
    # Case 2: We are running FX knockoffs
    if fixedX:
        Sigma = np.dot(X.T, X)
        invSigma = None
    # Calculate S-matrices from Sigma
    if infer_sigma or fixedX or S_matrices is None:
        verbose = fetch_kwarg(kwargs, 'verbose', default=False)
        S_matrices = fetch_competitor_S(Sigma=Sigma,
                                        groups=groups,
                                        time0=time0,
                                        rej_rate=rej_rate,
                                        verbose=verbose,
                                        S_curve=S_curve,
                                        **kwargs)

    # Now we loop through the S matrices
    degen_flag = 'sdp_perturbed' in S_matrices
    output = []
    for S_method in S_matrices:

        # Pull S matrix
        S = S_matrices[S_method]

        # If the method produces fully degenerate knockoffs,
        # signal this as part of the filter kwargs.
        _sdp_degen = (degen_flag and S_method == 'sdp')

        # Do NOT run OLS or debiased lasso for degenerate case,
        # because this will lead to linear algebra errors.
        # Emit all-zero placeholder rows instead so downstream
        # aggregation still sees every (method, q, pair_agg) cell.
        if _sdp_degen and 'feature_stat' in filter_kwargs:
            if filter_kwargs['feature_stat'] == 'dlasso':
                for pair_agg in PAIR_AGGS:
                    for q in q_values:
                        output.append([
                            S_method, 0, 0, q, 0, "NULL", pair_agg,
                            np.zeros(p),
                            np.zeros(p),
                            np.zeros(p),
                            np.zeros(p), seed
                        ])
                continue

        # Create knockoff_kwargs
        if 'knockoff_kwargs' not in filter_kwargs:
            filter_kwargs['knockoff_kwargs'] = {}
        filter_kwargs['knockoff_kwargs']['S'] = S
        filter_kwargs['knockoff_kwargs']['method'] = S_method
        filter_kwargs['knockoff_kwargs']['_sdp_degen'] = _sdp_degen
        if 'verbose' not in filter_kwargs['knockoff_kwargs']:
            filter_kwargs['knockoff_kwargs']['verbose'] = False

        # Possibly pass a few parameters to metro sampler
        if 'x_dist' in sample_kwargs:
            # For Ising
            if 'gibbs_graph' in sample_kwargs:
                filter_kwargs['knockoff_kwargs'][
                    'gibbs_graph'] = sample_kwargs['gibbs_graph']
            # For t-distributions
            if str(sample_kwargs['x_dist']).lower() in ['ar1t', 'blockt']:
                if 'df_t' in sample_kwargs:
                    filter_kwargs['knockoff_kwargs']['df_t'] = sample_kwargs[
                        'df_t']
                else:
                    filter_kwargs['knockoff_kwargs'][
                        'df_t'] = DEFAULT_DF_T  # This matters

        # Run MX knockoff filter to obtain
        # Z statistics
        knockoff_filter = KnockoffFilter(fixedX=fixedX)
        knockoff_filter.forward(X=X,
                                y=y,
                                mu=np.zeros(p),
                                Sigma=Sigma,
                                groups=groups,
                                **filter_kwargs)
        Z = knockoff_filter.Z
        score = knockoff_filter.score
        score_type = knockoff_filter.score_type

        # Calculate power/fdp/score for a variety of
        # antisymmetric functions
        for pair_agg in PAIR_AGGS:
            for q in q_values:
                # Start by creating selections
                selections, W = Z2selections(Z=Z,
                                             groups=groups,
                                             q=q,
                                             pair_agg=pair_agg)
                # Then create power/fdp
                power, fdp = selection2power(selections, group_nonnulls)
                output.append([
                    S_method,
                    power,
                    fdp,
                    q,
                    score,
                    score_type,
                    pair_agg,
                    np.around(W, 4),
                    np.around(Z[0:p], 4),
                    np.around(Z[p:], 4),
                    selections,
                    seed,
                ])

    # Possibly log progress
    try:
        if seed % 10 == 0 and sample_kwargs['n'] == MAXIMUM_N:
            overall_cost = time.time() - time0
            local_cost = time.time() - localtime
            print(
                f"Finished one seed {seed}, took {local_cost} per seed, {overall_cost} total"
            )
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit. Logging is best-effort (e.g. 'n'
        # may be missing in notebooks), so swallow only ordinary exceptions.
        pass

    # Output: list of
    # [S_method, power, fdp, q, score, score_type, pair_agg, W, Z, tildeZ, selection, seed]
    return output