def test_randomized_logistic():
    """Check randomized sparse logistic regression.

    Feature rankings from stability selection should match the ANOVA
    F-score ranking on a binary subset of iris, fitting must not modify
    the input, and an ill-shaped ``C`` must be rejected.
    """
    iris = load_iris()
    data = iris.data[:, [0, 2]]
    target = iris.target
    binary = target != 2  # keep only the first two classes
    data, target = data[binary], target[binary]

    anova_scores, _ = f_classif(data, target)

    # parameters shared by the two valid estimators below
    common = dict(verbose=False, random_state=42, scaling=0.3,
                  n_resampling=50, tol=1e-3)

    clf = RandomizedLogisticRegression(C=1., **common)
    data_before = data.copy()
    stability_scores = clf.fit(data, target).scores_
    assert_array_equal(data, data_before)  # fit does not modify X
    assert_array_equal(np.argsort(anova_scores),
                       np.argsort(stability_scores))

    # a list of C values is also accepted
    clf = RandomizedLogisticRegression(C=[1., 0.5], **common)
    stability_scores = clf.fit(data, target).scores_
    assert_array_equal(np.argsort(anova_scores),
                       np.argsort(stability_scores))

    # a 2-D C is invalid and must raise
    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, data, target)
def test_randomized_logistic_sparse():
    """Check randomized sparse logistic regression on sparse data.

    Fitting on a CSR matrix must produce exactly the same feature scores
    as fitting on the equivalent dense array.
    """
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)
    X_sp = sparse.csr_matrix(X)

    # NOTE: the original also computed `F, _ = f_classif(X, y)` but never
    # used it; the dead computation has been removed.
    params = dict(verbose=False, C=1., random_state=42, scaling=0.3,
                  n_resampling=50, tol=1e-3)

    feature_scores = RandomizedLogisticRegression(**params).fit(X, y).scores_
    feature_scores_sp = (RandomizedLogisticRegression(**params)
                         .fit(X_sp, y).scores_)

    # dense and sparse code paths must agree exactly
    assert_array_equal(feature_scores, feature_scores_sp)
def test_randomized_logistic():
    """Check randomized sparse logistic regression.

    The stability-selection ranking of features should agree with the
    ANOVA F-score ranking on a binary subset of iris, for both a scalar
    and a list-valued regularization strength ``C``.
    """
    iris = load_iris()
    data = iris.data[:, [0, 2]]
    target = iris.target
    binary = target != 2  # restrict to a two-class problem
    data, target = data[binary], target[binary]

    anova_scores, _ = f_classif(data, target)

    for C in (1., [1., 0.5]):
        clf = RandomizedLogisticRegression(verbose=False, C=C,
                                           random_state=42, scaling=0.3,
                                           n_resampling=50, tol=1e-3)
        stability_scores = clf.fit(data, target).scores_
        assert_array_equal(np.argsort(anova_scores),
                           np.argsort(stability_scores))
def test_randomized_logistic():
    """Check randomized sparse logistic regression.

    The stability-selection ranking of features must match the ANOVA
    F-score ranking on a binary subset of iris.
    """
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    # BUG FIX: plain assert_equal cannot compare ndarrays -- the elementwise
    # `==` yields an array whose truth value is ambiguous, so the assertion
    # raised ValueError instead of testing anything. Use the array-aware
    # comparison helper.
    np.testing.assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[1., 0.5],
                                       random_state=42, scaling=scaling,
                                       n_resampling=50, tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    np.testing.assert_array_equal(np.argsort(F), np.argsort(feature_scores))
def get_lasso_feature_scores(x, y, mode=CLASSIFICATION, scaling=0.5,
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''Calculate feature scores with a randomized lasso (regression) or a
    randomized logistic regression (classification), also known as
    stability selection.

    See http://scikit-learn.org/stable/modules/feature_selection.html
    for details.

    Parameters
    ----------
    x : structured array
    y : 1D nd.array
    mode : {CLASSIFICATION, REGRESSION}
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random
                   subset of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use,
                   defaults to None.

    Returns
    -------
    pandas DataFrame
        sorted in descending order of tuples with uncertainty and
        feature scores

    '''
    uncs = recfunctions.get_names(x.dtype)
    x = _prepare_experiments(x)

    if mode == CLASSIFICATION:
        selector = RandomizedLogisticRegression(
            scaling=scaling,
            sample_fraction=sample_fraction,
            n_resampling=n_resampling,
            random_state=random_state)
        selector.fit(x, y)
    elif mode == REGRESSION:
        # use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0],
                             .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso over that alpha grid
        selector = RandomizedLasso(alpha=alphas,
                                   scaling=scaling,
                                   sample_fraction=sample_fraction,
                                   n_resampling=n_resampling,
                                   random_state=random_state)
        selector.fit(x, y)
    else:
        raise ValueError('{} invalid value for mode'.format(mode))

    # pair each uncertainty with its score, best first
    ranked = sorted(zip(uncs, selector.scores_),
                    key=itemgetter(1), reverse=True)
    return pd.DataFrame(ranked)
def get_lasso_feature_scores(results, classify, scaling=0.5,
                             sample_fraction=0.75, n_resampling=200,
                             random_state=None):
    '''Calculate feature scores with a randomized lasso (regression) or a
    randomized logistic regression (classification), also known as
    stability selection.

    See http://scikit-learn.org/stable/modules/feature_selection.html
    for details.

    Parameters
    ----------
    results : tuple
    classify : callable or str
               a classify function or variable analogous to PRIM
    scaling : float, optional
              scaling parameter, should be between 0 and 1
    sample_fraction : float, optional
                      the fraction of samples used in each randomized
                      dataset
    n_resampling : int, optional
                   the number of times the model is trained on a random
                   subset of the data
    random_state : int, optional
                   if it is an int, it specifies the seed to use,
                   defaults to None.

    Returns
    -------
    list of tuples
        sorted in descending order of tuples with uncertainty and
        feature scores

    '''
    experiments, outcomes = results

    uncs = recfunctions.get_names(experiments.dtype)
    x = _prepare_experiments(experiments)
    y, categorical = _prepare_outcomes(outcomes, classify)

    if categorical:
        selector = RandomizedLogisticRegression(
            scaling=scaling,
            sample_fraction=sample_fraction,
            n_resampling=n_resampling,
            random_state=random_state)
        selector.fit(x, y)
    else:
        # use LassoLarsCV to determine alpha, see
        # http://scikit-learn.org/stable/auto_examples/linear_model/plot_sparse_recovery.html
        lars_cv = LassoLarsCV(cv=6).fit(x, y)
        alphas = np.linspace(lars_cv.alphas_[0],
                             .1 * lars_cv.alphas_[0], 6)

        # fit the randomized lasso over that alpha grid
        selector = RandomizedLasso(alpha=alphas,
                                   scaling=scaling,
                                   sample_fraction=sample_fraction,
                                   n_resampling=n_resampling,
                                   random_state=random_state)
        selector.fit(x, y)

    # pair each uncertainty with its score, best first
    return sorted(zip(uncs, selector.scores_),
                  key=itemgetter(1), reverse=True)