def test_use_sample_weights(self):
    """Check that fitting with sample weights does not lower the weighted micro-F1."""
    features, labels = self.multinomial[1]

    # Unbalance the data: drop every class-0 row except the last three.
    zero_rows = np.where(labels == 0)[0]
    keep_mask = np.ones(len(labels), dtype=bool)
    keep_mask[zero_rows[:-3]] = False
    labels = labels[keep_mask]
    features = features[keep_mask, :]

    weights = class_weight.compute_sample_weight('balanced', labels)
    weights[0] = 0.

    def weighted_micro_f1(model):
        # Both models are scored with the same weights so the scores compare.
        return f1_score(labels, model.predict(features),
                        sample_weight=weights, average='micro')

    plain_model = LogitNet(random_state=2, scoring='f1_micro').fit(features, labels)
    unweighted_acc = weighted_micro_f1(plain_model)

    weighted_model = LogitNet(random_state=2, scoring='f1_micro').fit(
        features, labels, sample_weight=weights)
    weighted_acc = weighted_micro_f1(weighted_model)

    self.assertTrue(weighted_acc >= unweighted_acc)
def test_predict_without_cv(self):
    """predict() with no explicit lambda must raise when fitted with n_folds=0."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=399001).fit(features, labels)

    # should not make prediction unless value is passed for lambda
    self.assertRaises(ValueError, model.predict, features)
def test_predict_without_cv(self):
    """predict() with no explicit lambda must raise when fitted with n_splits=0."""
    data, target = self.binomial[0]
    estimator = LogitNet(n_splits=0, random_state=399001)
    estimator = estimator.fit(data, target)

    # should not make prediction unless value is passed for lambda
    self.assertRaises(ValueError, estimator.predict, data)
def test_lambda_clip_warning(self):
    """Predicting with a lambda outside the fitted path must emit a RuntimeWarning."""
    features, labels = self.binomial[0]
    model = LogitNet(n_folds=0, random_state=1729).fit(features, labels)

    # One value beyond the first entry of the path and one beyond the last.
    beyond_first = model.lambda_path_[0] + 1
    beyond_last = model.lambda_path_[-1] - 1

    for out_of_range in (beyond_first, beyond_last):
        with self.assertWarns(RuntimeWarning):
            model.predict(features, lamb=out_of_range)
def test_lambda_clip_warning(self):
    """A lambda outside the fitted lambda path must trigger a RuntimeWarning."""
    data, target = self.binomial[0]
    estimator = LogitNet(n_splits=0, random_state=1729)
    estimator = estimator.fit(data, target)

    # Past the first entry of the path.
    self.assertWarns(RuntimeWarning, estimator.predict, data,
                     lamb=estimator.lambda_path_[0] + 1)
    # Past the last entry of the path.
    self.assertWarns(RuntimeWarning, estimator.predict, data,
                     lamb=estimator.lambda_path_[-1] - 1)
def test_one_row_predict(self):
    """Predicting on a single row must return exactly one prediction."""
    # Verify that predicting on one row gives only one row of output
    model = LogitNet(random_state=42)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model.fit(features, labels)
        first_row = features[0].reshape((1, -1))
        prediction = model.predict(first_row)
        assert prediction.shape == (1, )
def test_one_row_predict(self):
    """A one-row input must produce a prediction array of shape (1,)."""
    # Verify that predicting on one row gives only one row of output
    estimator = LogitNet(random_state=42)
    for data, target in itertools.chain(self.binomial, self.multinomial):
        estimator.fit(data, target)
        single = data[0].reshape((1, -1))
        result_shape = estimator.predict(single).shape
        assert result_shape == (1,)
def test_one_row_predict_with_lambda(self):
    """A one-row input with several lambdas must give a (1, n_lambdas) output."""
    # One row to predict along with lambdas should give 2D output
    model = LogitNet(random_state=42)
    lambdas = [0.01, 0.02, 0.04, 0.1]
    expected_shape = (1, len(lambdas))
    for features, labels in itertools.chain(self.binomial, self.multinomial):
        model.fit(features, labels)
        first_row = features[0].reshape((1, -1))
        prediction = model.predict(first_row, lamb=lambdas)
        assert prediction.shape == expected_shape
def test_cv_scoring_multinomial(self):
    """Scorers listed in multinomial_scoring must fit; all others must raise."""
    features, labels = self.multinomial[0]
    for scorer_name in self.scoring:
        model = LogitNet(scoring=scorer_name, random_state=488881)

        if scorer_name not in self.multinomial_scoring:
            # Any other scorer is expected to be rejected at fit time.
            self.assertRaises(ValueError, model.fit, features, labels)
            continue

        model = model.fit(features, labels)
        check_accuracy(labels, model.predict(features), 0.65, scoring=scorer_name)
def test_with_defaults(self):
    """Fit with default settings on every dataset and sanity-check the result."""
    model = LogitNet(random_state=29341)
    datasets = itertools.chain(self.binomial, self.multinomial)
    for features, labels in datasets:
        model = model.fit(features, labels)
        sanity_check_logistic(model, features)

        # check selection of lambda_best
        best_idx, max_idx = model.lambda_best_inx_, model.lambda_max_inx_
        assert best_idx <= max_idx

        # check full path predict
        full_path = model.predict(features, lamb=model.lambda_path_)
        assert full_path.shape[-1] == model.lambda_path_.size
def test_with_defaults(self):
    """Default-configured LogitNet must pass the sanity checks on all datasets."""
    estimator = LogitNet(random_state=29341)
    for data, target in itertools.chain(self.binomial, self.multinomial):
        estimator = estimator.fit(data, target)
        sanity_check_logistic(estimator, data)

        # check selection of lambda_best
        best_not_past_max = estimator.lambda_best_inx_ <= estimator.lambda_max_inx_
        ok_(best_not_past_max)

        # check full path predict
        path_prediction = estimator.predict(data, lamb=estimator.lambda_path_)
        n_columns = path_prediction.shape[-1]
        eq_(n_columns, estimator.lambda_path_.size)
def test_use_sample_weights(self):
    """Sample-weighted fitting must score at least as well as unweighted fitting."""
    data, target = self.multinomial[1]

    # Keep only the last three class-0 rows so the classes are unbalanced.
    class_0_rows = np.where(target == 0)[0]
    rows_to_drop = class_0_rows[:-3]
    mask = np.ones(len(target), dtype=bool)
    mask[rows_to_drop] = False
    target = target[mask]
    data = data[mask, :]

    sample_weight = class_weight.compute_sample_weight('balanced', target)
    sample_weight[0] = 0.

    # Score both fits with the same weighted micro-averaged F1.
    score_kwargs = dict(sample_weight=sample_weight, average='micro')

    baseline = LogitNet(random_state=2, scoring='f1_micro')
    baseline = baseline.fit(data, target)
    baseline_score = f1_score(target, baseline.predict(data), **score_kwargs)

    reweighted = LogitNet(random_state=2, scoring='f1_micro')
    reweighted = reweighted.fit(data, target, sample_weight=sample_weight)
    reweighted_score = f1_score(target, reweighted.predict(data), **score_kwargs)

    self.assertTrue(reweighted_score >= baseline_score)
class PenalisedFDRControl:
    """
    Choose an L1 penalty (lambda) that controls false discoveries.

    Three selection rules are available through :meth:`fdr_alpha`:

    * ``'yi'`` implements the 'FDR control using data permutation' method from
      'Penalised Multimarker vs Single-Marker Regression Methods for
      Genome-Wide Association Studies of Quantitative Traits' (Yi et al 2015).
    * ``'arbet'`` implements the type-1-error control that uses permutations to
      select lambda, from 'Resampling-based tests for Lasso in genome-wide
      association studies' (Arbet et al 2017).
    * ``'analytic'`` uses the analytic FDR approximation computed in :meth:`fit`.

    NOTE(review): the examples below were written against an older
    scikit-learn; ``load_boston`` was removed in scikit-learn 1.2.

    >>> import numpy as np
    >>> from sklearn.datasets import load_breast_cancer, load_boston
    >>> x, y = load_boston(True)
    >>> reg_fdr = PenalisedFDRControl().fit(x, y)
    >>> np.isclose(reg_fdr.model.score(x,y,), 0.618, atol=0.01)
    True
    >>> x, y = load_breast_cancer(True)
    >>> clf_fdr = PenalisedFDRControl(is_regression=False).fit(x, y)
    >>> np.isclose(clf_fdr.model.score(x,y,), 0.984, atol=0.01)
    True
    """

    def __init__(self, penalty_free_indices=None, min_lambda_ratio=1e-3,
                 n_lambdas=250, cv=10, is_regression=True, norm_num=1):
        """
        Build the underlying glmnet model and reset all fitted state.

        :param penalty_free_indices: list or np.ndarray of feature indices that
            receive a relative penalty of 0; ``None`` (default) means none
        :param min_lambda_ratio: passed positionally to the glmnet model as its
            third argument
        :param n_lambdas: passed positionally to the glmnet model as its
            second argument
        :param cv: forwarded to the glmnet model as ``n_splits``
        :param is_regression: use ``ElasticNet`` when truthy, else ``LogitNet``
        :param norm_num: passed positionally to the glmnet model as its first
            argument (presumably the elastic-net mixing parameter -- confirm
            against the installed glmnet version)
        :raises ValueError: if ``penalty_free_indices`` is neither a list nor
            an np.ndarray
        """
        from glmnet import ElasticNet, LogitNet

        # A fresh list per instance: a mutable default would be shared (via
        # ``self.ols_idx``) between every instance built without the argument.
        if penalty_free_indices is None:
            penalty_free_indices = []
        if not isinstance(penalty_free_indices, (list, np.ndarray)):
            raise ValueError('penalty_free_indices must be a list or np.array')

        model_cls = ElasticNet if is_regression else LogitNet
        self.model = model_cls(norm_num, n_lambdas, min_lambda_ratio,
                               n_splits=cv, n_jobs=cpu_count())

        self.norm_num = norm_num
        self.ols_idx = penalty_free_indices
        self.is_regression = is_regression

        # Everything below is populated by fit().
        self.n = None
        self.p = None
        self.coef_path = None
        self.lambdas = None
        self.fdr_grid = None
        self.fdr_analytic_grid = None
        self.n_nonzero_true_coefs = None
        self.mean_n_false_positive_coefs = None

    def _penalty_weights(self):
        """Return a length-``p`` vector of ones with zeros at the penalty-free indices."""
        penalty_weights = np.ones(self.p)
        penalty_weights[self.ols_idx] = 0
        return penalty_weights

    def plot_coef_path(self, only_penalised_coefs=True, complete=False,
                       show_graph=False):
        """
        Plot each coefficient against lambda.

        :param only_penalised_coefs: skip the penalty-free coefficients
        :param complete: plot the model's full path instead of the subset
            retained by :meth:`fit`
        :param show_graph: call ``show()`` once plotting is done
        :raises NotFittedError: if :meth:`fit` has not populated ``fdr_grid``
        """
        import matplotlib.pyplot as graph

        if self.fdr_grid is None:
            raise NotFittedError

        coef_path = self.model.coef_path_ if complete else self.coef_path
        lambdas = self.model.lambda_path_ if complete else self.lambdas

        for f in range(coef_path.shape[0]):
            # Coefficients that are zero along the whole path add nothing.
            if np.isclose(coef_path[f, :], 0).all():
                continue
            if only_penalised_coefs and f in self.ols_idx:
                continue
            graph.plot(lambdas, coef_path[f, :], linewidth=2, alpha=0.25)

        graph.ylabel(r'$\beta$')
        graph.xlabel(r'$\lambda$')
        if show_graph:
            graph.show()

    def fit(self, X, y, n_permutations=500, n_jobs=-1, verbose=False):
        """
        Fit the model, then compute the permutation and analytic FDR grids.

        :param X: 2-D feature matrix; its shape sets ``self.n`` and ``self.p``
        :param y: target vector
        :param n_permutations: number of permutation fits to average over
        :param n_jobs: passed to ``Parallel`` for the permutation fits
        :param verbose: print progress and wrap the loops in ``tqdm``
        :return: self
        """
        from scipy.stats import norm

        self.n, self.p = X.shape
        penalties = self._penalty_weights()

        # Fit real model (R)
        if verbose:
            print('Regression Model' if self.is_regression else 'Classification Model')
            print('Fitting y -> R(alpha)')
        self.model.fit(X, y, relative_penalties=penalties)

        # Get lasso path: keep the lambdas at or above 80% of the CV-selected one
        subset_idx = np.argwhere(
            self.model.lambda_path_ >= self.model.lambda_best_ * 0.8).flatten()
        self.lambdas = self.model.lambda_path_[subset_idx]
        self.coef_path = np.squeeze(self.model.coef_path_)[:, subset_idx]

        # Compute R(alpha) and multiply by penalties to prevent counting OLS coefs
        self.n_nonzero_true_coefs = np.sign(
            np.abs(self.coef_path) * vec_to_array(penalties)).sum(axis=0)

        # About to fit single lambda path models, ignore RuntimeWarnings.
        # catch_warnings restores the caller's filters on exit, including
        # when one of the steps below raises.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', RuntimeWarning)

            # Compute Permutation FDR Control F(b, alpha)
            iter_perm = range(n_permutations)
            if verbose:
                iter_perm = tqdm(
                    iter_perm,
                    desc='Computing permuted FDR y -> F(b, lambda)...')
            f_grid = Parallel(n_jobs)(
                delayed(_parallel_permute_count_nonzero_penalised_coefs)(
                    X, y.copy(), self.lambdas, penalties, self.norm_num,
                    self.is_regression)
                for _ in iter_perm)
            self.mean_n_false_positive_coefs = np.vstack(f_grid).mean(axis=0)
            self.fdr_grid = self.mean_n_false_positive_coefs / self.n_nonzero_true_coefs
            # 0 / 0 (nothing selected, nothing falsely selected) counts as 0
            self.fdr_grid[np.isnan(self.fdr_grid)] = 0

            # Compute Analytic FDR Control
            if not self.is_regression:
                # Emitted once rather than once per lambda.
                warnings.warn(
                    'Analytic FDR was not intended for classification')

            analytic_fdr = []
            prediction_lam = self.model.predict(X, self.lambdas)
            iter_lam = range(len(self.lambdas))
            if verbose:
                iter_lam = tqdm(iter_lam, desc='Computing analytic FDR')
            for i in iter_lam:
                # Number of non-zero penalised coefficients at this lambda
                rej = np.sign(np.abs(self.coef_path[:, i]) * penalties).sum()
                residuals = y - prediction_lam[:, i]
                test_val = -(
                    (self.lambdas[i] * self.n) / np.sqrt(residuals.T @ residuals))
                probit = norm.cdf(test_val)
                fdr_hat = (2 * self.p * probit) / rej
                analytic_fdr.append(fdr_hat)
            self.fdr_analytic_grid = np.array(analytic_fdr).flatten()

        return self

    def compute_coef_stability(self, X, y, penalty, n_samples=500, n_jobs=-1,
                               verbose=False):
        """
        Uses resampling to compute the Prob(beta_i != 0 | penalty)

        :param X: feature matrix
        :param y: target vector
        :param penalty: lambda at which the coefficients are checked
        :param n_samples: effectively controls the resolution as 1/n is the
            sample resolution
        :param n_jobs: passed to ``Parallel`` for the resampled fits
        :param verbose: wrap the resampling loop in ``tqdm``
        :return: the column-wise mean of the stacked per-resample results
        """
        from copy import copy
        from .models import L1Classifier, L1Regressor

        if self.is_regression:
            model = L1Regressor(lam=penalty)
        else:
            model = L1Classifier(lam=penalty)
        penalties = self._penalty_weights()

        iterator = range(n_samples)
        if verbose:
            iterator = tqdm(iterator,
                            desc=f'Prob(beta_i != 0 | lam={penalty:0.3f})')

        # Silence warnings only for the resampled fits; the caller's filters
        # are restored afterwards, including when a fit raises.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            is_nonzero = Parallel(n_jobs)(
                delayed(_resampled_model)(X, y, copy(model), penalties)
                for _ in iterator)

        return np.vstack(is_nonzero).mean(axis=0)

    def estimate_fpr(self, penalty):
        """
        Estimates the FPR for any given lambda. This will return the Expected
        FDR at this rate. The approximate False Positive Rate is estimated
        using permutation testing.

        :param penalty: Lasso alpha param
        :return: The approximate FPR at the fitted lambda nearest to ``penalty``
        """
        from .helpers import find_nearest

        fpr = self.mean_n_false_positive_coefs / self.p
        fpr[np.isnan(fpr)] = 0
        return fpr[find_nearest(self.lambdas, penalty, return_idx=True)]

    def sharp_threshold(self, X: np.ndarray, verbose=False):
        """
        This method finds the lowest value for alpha where the coefficients
        are all extremely likely to be replicable.

        'Sharp Thresholds for High-Dimensional and Noisy Sparsity Recovery
        Using l1 Constrained Quadratic Programming' (Wainwright 2009)

        :param X: feature matrix passed to ``l1_coef_is_probably_correct``
        :param verbose: plot the FDR grid and the threshold results
        :return: the lambda at the selected position of the retained path
        """
        import matplotlib.pyplot as graph

        results = np.array([
            l1_coef_is_probably_correct(self.coef_path[:, lambda_pos], X,
                                        return_result=True)
            for lambda_pos in range(self.coef_path.shape[1])
        ])

        if verbose:
            graph.plot(self.lambdas, self.fdr_grid, label='FWER')
            graph.plot(self.lambdas, results, label='Thresholds')
            graph.show()

        # NOTE(review): argmin over a boolean array returns the first position
        # where ``results >= 1`` is False, or 0 when it is True everywhere --
        # confirm this is the intended selection.
        lowest_alpha_idx = np.argmin(results >= 1)
        return self.lambdas[lowest_alpha_idx]

    def fdr_alpha(self, alpha, method='yi', verbose=False):
        """
        Returns the alpha for L1 regression that best controls for FDR at the
        rate requested

        :param alpha: FDR (Yi) or FPR (Arbet) or FDR (analytic)
        :param method: `yi` uses Yi's FDR = E(F)/R, `arbet` uses Arbet's
            FPR = N_fp / N_features, `analytic` uses Yi's analytic FDR control
            FDR ~= 2p N.cdf((-lam N) / sqrt(r.T @ r)) / R
        :param verbose: print the achieved rate and the selected lambda
        :return: the fitted lambda whose rate is nearest to ``alpha``
        :raises ValueError: if ``method`` is not one of the supported names
        """
        method = method.lower()
        # Reject unknown methods before doing any other work.
        if method not in ('yi', 'arbet', 'analytic'):
            raise ValueError(
                'Only supported methods are `yi` `arbet` and `analytic`')

        from .helpers import find_nearest

        if method == 'yi':
            # Penalized Multimarker vs Single-Marker Regression Methods for
            # Genome-Wide Association Studies
            fpr_grid = self.mean_n_false_positive_coefs / self.n_nonzero_true_coefs
            metric = 'FDR (False Discovery Rate)'
        elif method == 'arbet':
            # Resampling-based tests for Lasso in genome-wide association studies
            fpr_grid = self.mean_n_false_positive_coefs / self.p
            metric = 'FPR (Expected False Positives per Feature)'
        else:
            # Penalized Multimarker vs Single-Marker Regression Methods for
            # Genome-Wide Association Studies (Analytic).
            # Work on a copy so the stored grid is not modified by a lookup.
            fpr_grid = self.fdr_analytic_grid.copy()
            metric = 'aFDR (analytic False Discovery Rate)'
        fpr_grid[np.isnan(fpr_grid)] = 0

        approx_idx = find_nearest(fpr_grid, alpha, return_idx=True)
        if verbose:
            print(
                f'{metric} ~{fpr_grid[approx_idx]} @ alpha {self.lambdas[approx_idx]}'
            )
        return self.lambdas[approx_idx]
def test_cv_scoring(self):
    """Every scorer in self.scoring must reach the 0.85 threshold on binomial data."""
    features, labels = self.binomial[0]
    for scorer_name in self.scoring:
        model = LogitNet(scoring=scorer_name, random_state=52633).fit(features, labels)
        predictions = model.predict(features)
        check_accuracy(labels, predictions, 0.85, scoring=scorer_name)
def test_alphas(self):
    """Every alpha in self.alphas must reach the 0.85 threshold on binomial data."""
    features, labels = self.binomial[0]
    for mixing in self.alphas:
        model = LogitNet(alpha=mixing, random_state=41041).fit(features, labels)
        predictions = model.predict(features)
        check_accuracy(labels, predictions, 0.85, alpha=mixing)