예제 #1
0
    def test_use_sample_weights(self):
        """Check that fitting with sample weights does not lower the weighted micro-F1.

        Most class-0 rows are removed from the second multinomial fixture
        (only the last three are kept), 'balanced' sample weights are
        computed, and the first sample's weight is zeroed.  Two otherwise
        identical models are then fitted, one without and one with the
        weights, and both are scored with the same weighted micro-F1.
        """
        features, labels = self.multinomial[1]

        # Drop every class-0 row except the last three.
        class_0_rows = np.where(labels == 0)[0]
        keep_mask = np.ones(len(labels), dtype=bool)
        keep_mask[class_0_rows[:-3]] = False
        labels = labels[keep_mask]
        features = features[keep_mask, :]

        weights = class_weight.compute_sample_weight('balanced', labels)
        weights[0] = 0.

        def weighted_micro_f1(model):
            # Both models are judged with the same sample weights.
            return f1_score(labels,
                            model.predict(features),
                            sample_weight=weights,
                            average='micro')

        plain_model = LogitNet(random_state=2, scoring='f1_micro')
        plain_model = plain_model.fit(features, labels)
        plain_score = weighted_micro_f1(plain_model)

        weighted_model = LogitNet(random_state=2, scoring='f1_micro')
        weighted_model = weighted_model.fit(features, labels,
                                            sample_weight=weights)
        weighted_score = weighted_micro_f1(weighted_model)

        self.assertTrue(weighted_score >= plain_score)
예제 #2
0
    def test_predict_without_cv(self):
        """predict() without an explicit lambda must raise for a model built with n_folds=0."""
        features, target = self.binomial[0]
        model = LogitNet(n_folds=0, random_state=399001).fit(features, target)

        # No lambda is passed here, so the call is expected to be rejected.
        with self.assertRaises(ValueError):
            model.predict(features)
예제 #3
0
    def test_predict_without_cv(self):
        """A model fitted with n_splits=0 must refuse to predict when no lambda is given."""
        data, labels = self.binomial[0]
        fitted = LogitNet(n_splits=0, random_state=399001)
        fitted = fitted.fit(data, labels)

        # Calling predict without a lambda value has to fail with ValueError.
        self.assertRaises(ValueError, fitted.predict, data)
예제 #4
0
    def test_lambda_clip_warning(self):
        """Lambdas beyond either end of lambda_path_ must trigger a RuntimeWarning."""
        features, target = self.binomial[0]
        model = LogitNet(n_folds=0, random_state=1729).fit(features, target)

        # One value past the first path entry, one past the last.
        beyond_start = model.lambda_path_[0] + 1
        beyond_end = model.lambda_path_[-1] - 1

        for out_of_range in (beyond_start, beyond_end):
            with self.assertWarns(RuntimeWarning):
                model.predict(features, lamb=out_of_range)
예제 #5
0
    def test_lambda_clip_warning(self):
        """predict() warns (RuntimeWarning) when lamb lies off the ends of lambda_path_."""
        data, labels = self.binomial[0]
        fitted = LogitNet(n_splits=0, random_state=1729)
        fitted = fitted.fit(data, labels)

        # Past the first entry of the path.
        with self.assertWarns(RuntimeWarning):
            too_far_start = fitted.lambda_path_[0] + 1
            fitted.predict(data, lamb=too_far_start)

        # Past the last entry of the path.
        with self.assertWarns(RuntimeWarning):
            too_far_end = fitted.lambda_path_[-1] - 1
            fitted.predict(data, lamb=too_far_end)
예제 #6
0
 def test_one_row_predict(self):
     """Predicting a single row must return an array of shape (1,)."""
     model = LogitNet(random_state=42)
     datasets = itertools.chain(self.binomial, self.multinomial)
     for features, target in datasets:
         model.fit(features, target)
         # Reshape the first sample into a one-row 2-D input.
         single_row = features[0].reshape((1, -1))
         prediction = model.predict(single_row)
         assert prediction.shape == (1, )
예제 #7
0
 def test_one_row_predict(self):
     """One input row in, exactly one prediction out, for every fixture dataset."""
     classifier = LogitNet(random_state=42)
     for data, labels in itertools.chain(self.binomial, self.multinomial):
         classifier.fit(data, labels)
         first_sample = data[0].reshape((1, -1))
         output_shape = classifier.predict(first_sample).shape
         assert output_shape == (1,)
예제 #8
0
 def test_one_row_predict_with_lambda(self):
     """A single row predicted at several lambdas gives a (1, n_lambdas) array."""
     model = LogitNet(random_state=42)
     lambdas = [0.01, 0.02, 0.04, 0.1]
     expected_shape = (1, len(lambdas))
     datasets = itertools.chain(self.binomial, self.multinomial)
     for features, target in datasets:
         model.fit(features, target)
         single_row = features[0].reshape((1, -1))
         prediction = model.predict(single_row, lamb=lambdas)
         assert prediction.shape == expected_shape
예제 #9
0
 def test_one_row_predict_with_lambda(self):
     """Output for one row and a list of lambdas is 2-D: one row, one column per lambda."""
     lambda_values = [0.01, 0.02, 0.04, 0.1]
     classifier = LogitNet(random_state=42)
     for data, labels in itertools.chain(self.binomial, self.multinomial):
         classifier.fit(data, labels)
         first_sample = data[0].reshape((1, -1))
         result = classifier.predict(first_sample, lamb=lambda_values)
         assert result.shape == (1, len(lambda_values))
예제 #10
0
    def test_cv_scoring_multinomial(self):
        """Exercise every scoring method on multinomial data.

        Methods listed in ``self.multinomial_scoring`` must fit and pass
        ``check_accuracy`` with a 0.65 threshold; every other method must
        make ``fit`` raise ValueError.
        """
        features, target = self.multinomial[0]
        for method in self.scoring:
            model = LogitNet(scoring=method, random_state=488881)

            if method not in self.multinomial_scoring:
                # Not in the multinomial list: fitting has to be rejected.
                with self.assertRaises(ValueError):
                    model.fit(features, target)
                continue

            model = model.fit(features, target)
            check_accuracy(target, model.predict(features), 0.65,
                           scoring=method)
예제 #11
0
    def test_cv_scoring_multinomial(self):
        """Scorers in self.multinomial_scoring must fit and score; the rest must raise on fit."""
        data, labels = self.multinomial[0]
        for scorer in self.scoring:
            classifier = LogitNet(scoring=scorer, random_state=488881)
            accepted = scorer in self.multinomial_scoring

            if accepted:
                classifier = classifier.fit(data, labels)
                predictions = classifier.predict(data)
                check_accuracy(labels, predictions, 0.65, scoring=scorer)
            else:
                # Scorer not listed for multinomial targets: expect a ValueError.
                self.assertRaises(ValueError, classifier.fit, data, labels)
예제 #12
0
    def test_with_defaults(self):
        """Fit a default LogitNet on every fixture and sanity-check the result."""
        model = LogitNet(random_state=29341)
        datasets = itertools.chain(self.binomial, self.multinomial)
        for features, target in datasets:
            model = model.fit(features, target)
            sanity_check_logistic(model, features)

            # The selected lambda index must not exceed lambda_max_inx_.
            assert model.lambda_best_inx_ <= model.lambda_max_inx_

            # Predicting over the full path: last axis has one entry per lambda.
            path_predictions = model.predict(features, lamb=model.lambda_path_)
            assert path_predictions.shape[-1] == model.lambda_path_.size
예제 #13
0
    def test_with_defaults(self):
        """Default-parameter LogitNet: sanity check, lambda index ordering, full-path predict."""
        classifier = LogitNet(random_state=29341)
        for data, labels in itertools.chain(self.binomial, self.multinomial):
            classifier = classifier.fit(data, labels)
            sanity_check_logistic(classifier, data)

            # lambda_best_inx_ may not be greater than lambda_max_inx_.
            best_inx = classifier.lambda_best_inx_
            max_inx = classifier.lambda_max_inx_
            ok_(best_inx <= max_inx)

            # A prediction along the whole lambda path has one slot per lambda.
            full_path_prediction = classifier.predict(
                data, lamb=classifier.lambda_path_)
            eq_(full_path_prediction.shape[-1], classifier.lambda_path_.size)
예제 #14
0
    def test_use_sample_weights(self):
        """A model fitted with sample weights must score at least as well as one without.

        Scoring uses micro-averaged F1 weighted by the same sample weights
        for both models.
        """
        data, labels = self.multinomial[1]

        # Build a keep-mask that removes all but the last three class-0 rows.
        zero_class_positions = np.where(labels == 0)[0]
        discard = zero_class_positions[:-3]
        keep = np.ones(len(labels), dtype=bool)
        keep[discard] = False
        labels = labels[keep]
        data = data[keep, :]

        weights = class_weight.compute_sample_weight('balanced', labels)
        weights[0] = 0.

        score_kwargs = dict(sample_weight=weights, average='micro')

        baseline = LogitNet(random_state=2, scoring='f1_micro')
        baseline = baseline.fit(data, labels)
        baseline_score = f1_score(labels, baseline.predict(data),
                                  **score_kwargs)

        reweighted = LogitNet(random_state=2, scoring='f1_micro')
        reweighted = reweighted.fit(data, labels, sample_weight=weights)
        reweighted_score = f1_score(labels, reweighted.predict(data),
                                    **score_kwargs)

        self.assertTrue(reweighted_score >= baseline_score)
예제 #15
0
파일: stats.py 프로젝트: arose13/rosey
class PenalisedFDRControl:
    """
    Penalised regression/classification with permutation-based error control.

    Method 'yi' implements the 'FDR control using data permutation' method from
    Penalised Multimarker vs Single-Marker Regression Methods for Genome-Wide Association Studies of Quantitative Traits
    (Yi et al 2015)

    Method 'arbet' implements the type-1-error control (permutations to select lambda) from
    'Resampling-based tests for Lasso in genome-wide association studies'
    (Arbet et al 2017)

    >>> import numpy as np
    >>> from sklearn.datasets import load_breast_cancer, load_boston
    >>> x, y = load_boston(True)
    >>> reg_fdr = PenalisedFDRControl().fit(x, y)
    >>> np.isclose(reg_fdr.model.score(x,y,), 0.618, atol=0.01)
    True
    >>> x, y = load_breast_cancer(True)
    >>> clf_fdr = PenalisedFDRControl(is_regression=False).fit(x, y)
    >>> np.isclose(clf_fdr.model.score(x,y,), 0.984, atol=0.01)
    True

    """
    def __init__(self,
                 penalty_free_indices=None,
                 min_lambda_ratio=1e-3,
                 n_lambdas=250,
                 cv=10,
                 is_regression=True,
                 norm_num=1):
        """
        Build the underlying glmnet model and reset all fitted state.

        :param penalty_free_indices: list or np.ndarray of feature indices whose
            penalty weight is set to 0; ``None`` (the default) means no such features
        :param min_lambda_ratio: passed positionally as the third glmnet model argument
        :param n_lambdas: passed positionally as the second glmnet model argument
        :param cv: forwarded to the glmnet model as ``n_splits``
        :param is_regression: ``True`` builds an ``ElasticNet``, ``False`` a ``LogitNet``
        :param norm_num: passed positionally as the first glmnet model argument
        :raises ValueError: if ``penalty_free_indices`` is neither a list nor an np.ndarray
        """
        from glmnet import ElasticNet, LogitNet

        # A fresh list per instance avoids sharing one mutable default object.
        if penalty_free_indices is None:
            penalty_free_indices = []

        if not isinstance(penalty_free_indices, (list, np.ndarray)):
            raise ValueError(
                'penalty_free_indices must be a list or np.array')

        model_cls = ElasticNet if is_regression else LogitNet
        self.model = model_cls(norm_num,
                               n_lambdas,
                               min_lambda_ratio,
                               n_splits=cv,
                               n_jobs=cpu_count())

        self.norm_num = norm_num
        self.ols_idx = penalty_free_indices
        self.is_regression = is_regression

        # Everything below is populated by fit().
        self.n = None
        self.p = None
        self.coef_path = None
        self.lambdas = None
        self.fdr_grid = None
        self.fdr_analytic_grid = None
        self.n_nonzero_true_coefs = None
        self.mean_n_false_positive_coefs = None

    def _penalty_weights(self):
        """Return a length-``p`` vector of ones with zeros at the penalty-free indices."""
        penalty_weights = np.ones(self.p)
        penalty_weights[self.ols_idx] = 0
        return penalty_weights

    def plot_coef_path(self,
                       only_penalised_coefs=True,
                       complete=False,
                       show_graph=False):
        """
        Plot coefficient values against lambda, skipping coefficients that are ~0 everywhere.

        :param only_penalised_coefs: if True, skip the penalty-free coefficients
        :param complete: if True, use the model's full path instead of the stored subset
        :param show_graph: if True, call ``show()`` after plotting
        :raises NotFittedError: if ``fit`` has not populated ``fdr_grid`` yet
        """
        import matplotlib.pyplot as graph

        if self.fdr_grid is None:
            raise NotFittedError

        coef_path = self.model.coef_path_ if complete else self.coef_path
        lambdas = self.model.lambda_path_ if complete else self.lambdas

        for f in range(coef_path.shape[0]):
            if np.isclose(coef_path[f, :], 0).all():
                continue
            if only_penalised_coefs and f in self.ols_idx:
                continue
            graph.plot(lambdas, coef_path[f, :], linewidth=2, alpha=0.25)
        graph.ylabel(r'$\beta$')
        graph.xlabel(r'$\lambda$')

        if show_graph:
            graph.show()

    def fit(self, X, y, n_permutations=500, n_jobs=-1, verbose=False):
        """
        Fit the model, then build the permutation and analytic FDR grids.

        :param X: 2-D feature array; its shape is stored as ``(n, p)``
        :param y: target vector (copied before each permutation job)
        :param n_permutations: number of permutation jobs to run
        :param n_jobs: passed to ``Parallel``
        :param verbose: print progress messages and show tqdm progress bars
        :return: self
        """
        from scipy.stats import norm

        self.n, self.p = X.shape
        penalties = self._penalty_weights()

        # Fit real model (R)
        if verbose:
            print('Regression Model' if self.
                  is_regression else 'Classification Model')
            print('Fitting y -> R(alpha)')
        # A separate weight vector is handed to the solver, as in the original
        # code, so `penalties` is untouched whatever the solver does with it.
        self.model.fit(X, y, relative_penalties=self._penalty_weights())

        # Get lasso path: keep lambdas that are at least 0.8 * lambda_best_
        subset_idx = np.argwhere(
            self.model.lambda_path_ >= self.model.lambda_best_ *
            0.8).flatten()
        self.lambdas = self.model.lambda_path_[subset_idx]
        self.coef_path = np.squeeze(self.model.coef_path_)[:, subset_idx]

        # Compute R(alpha) and multiply by penalties to prevent counting OLS coefs
        self.n_nonzero_true_coefs = np.sign(
            np.abs(self.coef_path) * vec_to_array(penalties)).sum(axis=0)

        # Silence RuntimeWarnings only for the duration of the computations
        # below; the caller's warning filters are restored on exit instead of
        # being overwritten globally.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', RuntimeWarning)

            # Compute Permutation FDR Control F(b, alpha)
            iter_perm = range(n_permutations)
            iter_perm = tqdm(
                iter_perm,
                desc='Computing permuted FDR y -> F(b, lambda)...'
            ) if verbose else iter_perm
            f_grid = Parallel(n_jobs)(
                delayed(_parallel_permute_count_nonzero_penalised_coefs)(
                    X, y.copy(), self.lambdas, penalties, self.norm_num,
                    self.is_regression) for _ in iter_perm)
            self.mean_n_false_positive_coefs = np.vstack(f_grid).mean(axis=0)
            self.fdr_grid = self.mean_n_false_positive_coefs / self.n_nonzero_true_coefs
            self.fdr_grid[np.isnan(self.fdr_grid)] = 0

            # Compute Analytic FDR Control
            if self.is_regression is False:
                # Issued once, rather than once per lambda.
                warnings.warn(
                    'Analytic FDR was not intended for classification')

            analytic_fdr = []
            prediction_lam = self.model.predict(X, self.lambdas)
            iter_lam = range(len(self.lambdas))
            iter_lam = tqdm(
                iter_lam,
                desc='Computing analytic FDR') if verbose else iter_lam
            for i in iter_lam:
                # Number of non-zero penalised coefficients at this lambda
                rej = np.sign(np.abs(self.coef_path[:, i]) * penalties).sum()
                residuals = y - prediction_lam[:, i]

                test_val = -((self.lambdas[i] * self.n) /
                             np.sqrt(residuals.T @ residuals))
                probit = norm.cdf(test_val)

                fdr_hat = (2 * self.p * probit) / rej
                analytic_fdr.append(fdr_hat)
            self.fdr_analytic_grid = np.array(analytic_fdr).flatten()

        return self

    def compute_coef_stability(self,
                               X,
                               y,
                               penalty,
                               n_samples=500,
                               n_jobs=-1,
                               verbose=False):
        """
        Uses resampling to compute the Prob(beta_i != 0 | penalty)

        :param X: feature array handed to every resampling job
        :param y: target vector handed to every resampling job
        :param penalty: lambda at which the L1 model is fitted
        :param n_samples: effectively controls the resolution as 1/n is the sample resolution
        :param n_jobs: passed to ``Parallel``
        :param verbose: show a tqdm progress bar
        :return: column-wise mean of the stacked per-job results
        """
        from copy import copy
        from .models import L1Classifier, L1Regressor

        model = L1Regressor(
            lam=penalty) if self.is_regression else L1Classifier(lam=penalty)
        penalties = self._penalty_weights()
        iterator = range(n_samples)
        iterator = tqdm(iterator,
                        desc=f'Prob(beta_i != 0 | lam={penalty:0.3f})'
                        ) if verbose else iterator

        # Suppress warnings only while the jobs run, then restore the
        # caller's filters.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            is_nonzero = Parallel(n_jobs)(
                delayed(_resampled_model)(X, y, copy(model), penalties)
                for _ in iterator)

        is_nonzero = np.vstack(is_nonzero)
        return is_nonzero.mean(axis=0)

    def estimate_fpr(self, penalty):
        """
        Estimates the FPR for any given lambda. This will return the Expected FDR at this rate.
        The approximate False Positive Rate is estimated using permutation testing.

        :param penalty: Lasso alpha param
        :return: The approximate FPR
        """
        from .helpers import find_nearest

        fpr = self.mean_n_false_positive_coefs / self.p
        fpr[np.isnan(fpr)] = 0
        return fpr[find_nearest(self.lambdas, penalty, return_idx=True)]

    def sharp_threshold(self, X: np.ndarray, verbose=False):
        """
        This method finds the lowest value for alpha where the coefficients are all extremely likely to be replicable.

        'Sharp Thresholds for High-Dimensional and Noisy Sparsity Recovery Using l1 Constrained Quadratic Programming'
        (Wainwright 2009)

        :param X: feature array passed to ``l1_coef_is_probably_correct``
        :param verbose: if True, plot the FDR grid and the threshold results
        :return: the stored lambda at the selected index
        """
        results = np.array([
            l1_coef_is_probably_correct(self.coef_path[:, lambda_pos],
                                        X,
                                        return_result=True)
            for lambda_pos in range(self.coef_path.shape[1])
        ])

        if verbose:
            # matplotlib is only needed when a plot is actually drawn
            import matplotlib.pyplot as graph

            graph.plot(self.lambdas, self.fdr_grid, label='FWER')
            graph.plot(self.lambdas, results, label='Thresholds')
            graph.show()

        # np.argmin on the boolean array gives the first position where
        # `results >= 1` is False (or 0 when it is True everywhere).
        # NOTE(review): confirm this is the intended selection rule.
        lowest_alpha_idx = np.argmin(results >= 1)
        return self.lambdas[lowest_alpha_idx]

    def fdr_alpha(self, alpha, method='yi', verbose=False):
        """
        Returns the alpha for L1 regression that best controls for FDR at the rate requested

        :param alpha: FDR (Yi) or FPR (Arbet) or FDR (analytic)
        :param method:
            `yi` uses Yi's FDR = E(F)/R,
            `arbet` uses Arbet's FPR = N_fp / N_features,
            `analytic` uses Yi's analytic FDR control FDR ~= 2p N.cdf((-lam N) / sqrt(r.T @ r)) / R
        :param verbose: print the achieved metric and the chosen lambda
        :return: the stored lambda whose metric is nearest to ``alpha``
        :raises ValueError: if ``method`` is not one of the supported names
        """
        method = method.lower()
        if method not in ('yi', 'arbet', 'analytic'):
            # Previously the exception was constructed but never raised.
            raise ValueError(
                'Only supported methods are `yi` `arbet` and `analytic`')

        from .helpers import find_nearest

        if method == 'yi':
            # Penalized Multimarker vs Single-Marker Regression Methods for Genome-Wide Association Studies
            fpr_grid = self.mean_n_false_positive_coefs / self.n_nonzero_true_coefs
            metric = 'FDR (False Discovery Rate)'

        elif method == 'arbet':
            # Resampling-based tests for Lasso in genome-wide association studies
            fpr_grid = self.mean_n_false_positive_coefs / self.p
            metric = 'FPR (Expected False Positives per Feature)'

        else:
            # Penalized Multimarker vs Single-Marker Regression Methods for Genome-Wide Association Studies (Analytic)
            # This is the stored grid itself, so the NaN clean-up below
            # modifies it in place (unchanged from the original behaviour).
            fpr_grid = self.fdr_analytic_grid
            metric = 'aFDR (analytic False Discovery Rate)'

        fpr_grid[np.isnan(fpr_grid)] = 0

        approx_idx = find_nearest(fpr_grid, alpha, return_idx=True)
        if verbose:
            print(
                f'{metric} ~{fpr_grid[approx_idx]} @ alpha {self.lambdas[approx_idx]}'
            )
        return self.lambdas[approx_idx]
예제 #16
0
 def test_cv_scoring(self):
     """Each scoring method must pass check_accuracy at 0.85 on the first binomial fixture."""
     features, target = self.binomial[0]
     for method in self.scoring:
         model = LogitNet(scoring=method, random_state=52633)
         model = model.fit(features, target)
         predictions = model.predict(features)
         check_accuracy(target, predictions, 0.85, scoring=method)
예제 #17
0
 def test_cv_scoring(self):
     """Fit one LogitNet per scoring method and check its accuracy against 0.85."""
     data, labels = self.binomial[0]
     for scorer in self.scoring:
         fitted = LogitNet(scoring=scorer, random_state=52633).fit(data, labels)
         check_accuracy(labels, fitted.predict(data), 0.85, scoring=scorer)
예제 #18
0
 def test_alphas(self):
     """Every alpha in self.alphas must pass check_accuracy at 0.85 on binomial data."""
     features, target = self.binomial[0]
     for alpha in self.alphas:
         model = LogitNet(alpha=alpha, random_state=41041)
         model = model.fit(features, target)
         predictions = model.predict(features)
         check_accuracy(target, predictions, 0.85, alpha=alpha)
예제 #19
0
 def test_alphas(self):
     """Fit one LogitNet per alpha value and check its accuracy against 0.85."""
     data, labels = self.binomial[0]
     for alpha in self.alphas:
         fitted = LogitNet(alpha=alpha, random_state=41041).fit(data, labels)
         check_accuracy(labels, fitted.predict(data), 0.85, alpha=alpha)