예제 #1
0
def test_select_kbest_classif():
    # Check the k-best heuristic on an easy, well separated classification
    # problem: SelectKBest and the equivalent GenericUnivariateSelect
    # configuration must agree, and the test expects the first five
    # columns (3 informative + 2 redundant, unshuffled) to be kept.
    dataset_params = dict(n_samples=200,
                          n_features=20,
                          n_informative=3,
                          n_redundant=2,
                          n_repeated=0,
                          n_classes=8,
                          n_clusters_per_class=1,
                          flip_y=0.0,
                          class_sep=10,
                          shuffle=False,
                          random_state=0)
    X, y = make_classification(**dataset_params)

    # Dedicated selector.
    kbest = SelectKBest(f_classif, k=5)
    kbest.fit(X, y)
    reduced = kbest.transform(X)

    # Generic selector configured to behave like SelectKBest(k=5).
    generic = GenericUnivariateSelect(f_classif, mode='k_best', param=5)
    generic.fit(X, y)
    reduced_generic = generic.transform(X)

    assert_array_equal(reduced, reduced_generic)

    # Expected mask: ones on the first five features, zeros elsewhere.
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(kbest.get_support(), expected_mask)
예제 #2
0
def test_mutual_info_classif():
    # Check mutual_info_classif as a score function: both the k-best and
    # the percentile selectors are expected to keep the first two columns,
    # and GenericUnivariateSelect must match each dedicated selector.
    dataset_params = dict(n_samples=100,
                          n_features=5,
                          n_informative=1,
                          n_redundant=1,
                          n_repeated=0,
                          n_classes=2,
                          n_clusters_per_class=1,
                          flip_y=0.0,
                          class_sep=10,
                          shuffle=False,
                          random_state=0)
    X, y = make_classification(**dataset_params)

    # Same expected mask in both modes: first two features selected.
    expected_mask = np.zeros(5)
    expected_mask[:2] = 1

    # (dedicated selector, equivalent generic mode, generic parameter);
    # KBest mode is exercised first, Percentile mode second.
    cases = [
        (SelectKBest(mutual_info_classif, k=2), 'k_best', 2),
        (SelectPercentile(mutual_info_classif, percentile=40),
         'percentile', 40),
    ]
    for selector, mode, param in cases:
        reduced = selector.fit(X, y).transform(X)
        generic = GenericUnivariateSelect(mutual_info_classif, mode=mode,
                                          param=param)
        reduced_generic = generic.fit(X, y).transform(X)
        assert_array_equal(reduced, reduced_generic)
        assert_array_equal(selector.get_support(), expected_mask)
예제 #3
0
def test_select_heuristics_classif():
    # Test whether the relative univariate feature selection
    # gets the correct items in a simple classification problem
    # with the fdr, fwe and fpr heuristics.
    #
    # SelectFwe(alpha=0.01) acts as the reference; every generic mode run
    # with param=0.01 must return the same columns and the same mask.
    X, y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectFwe(f_classif, alpha=0.01)
    X_r = univariate_filter.fit(X, y).transform(X)

    # Expected mask: the first five features are kept.
    gtruth = np.zeros(20)
    gtruth[:5] = 1

    # The reference selector is fitted once, so its mask is verified once
    # rather than being re-asserted unchanged on every loop iteration.
    assert_array_almost_equal(univariate_filter.get_support(), gtruth)

    for mode in ['fdr', 'fpr', 'fwe']:
        generic_filter = GenericUnivariateSelect(f_classif, mode=mode,
                                                 param=0.01)
        X_r2 = generic_filter.fit(X, y).transform(X)
        assert_array_equal(X_r, X_r2)
        # Also check the mask of the selector actually under test for
        # this heuristic, not only the transformed output.
        assert_array_almost_equal(generic_filter.get_support(), gtruth)
예제 #4
0
def test_select_percentile_classif_sparse():
    # Check the percentile heuristic on CSR input: the dedicated and the
    # generic selectors must agree, the first five columns are expected to
    # be kept, and inverse_transform must give back a sparse matrix of the
    # original shape holding only the selected columns.
    dataset_params = dict(n_samples=200,
                          n_features=20,
                          n_informative=3,
                          n_redundant=2,
                          n_repeated=0,
                          n_classes=8,
                          n_clusters_per_class=1,
                          flip_y=0.0,
                          class_sep=10,
                          shuffle=False,
                          random_state=0)
    X, y = make_classification(**dataset_params)
    X = sparse.csr_matrix(X)

    percentile_filter = SelectPercentile(f_classif, percentile=25)
    percentile_filter.fit(X, y)
    reduced = percentile_filter.transform(X)

    generic_filter = GenericUnivariateSelect(f_classif, mode='percentile',
                                             param=25)
    generic_filter.fit(X, y)
    reduced_generic = generic_filter.transform(X)
    assert_array_equal(reduced.toarray(), reduced_generic.toarray())

    support = percentile_filter.get_support()
    expected_mask = np.zeros(20)
    expected_mask[:5] = 1
    assert_array_equal(support, expected_mask)

    # Map the reduced data back into the original feature space.
    restored = percentile_filter.inverse_transform(reduced_generic)
    assert sparse.issparse(restored)
    column_mask = safe_mask(restored, support)
    assert restored.shape == X.shape
    assert_array_equal(restored[:, column_mask].toarray(),
                       reduced.toarray())
    # Equal non-zero counts mean the non-selected columns are empty.
    assert restored.getnnz() == reduced.getnnz()
예제 #5
0
def test_select_kbest_all():
    # With k="all" the selector must hand back the input unchanged,
    # i.e. every feature is kept.
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0)

    keep_everything = SelectKBest(f_classif, k='all')
    keep_everything.fit(X, y)
    transformed = keep_everything.transform(X)
    assert_array_equal(X, transformed)
예제 #6
0
def test_select_kbest_zero():
    # With k=0 the support mask must be all False, and transform must
    # warn that nothing was selected while returning a zero-column array.
    X, y = make_classification(
        n_samples=20, n_features=10, shuffle=False, random_state=0)

    empty_selector = SelectKBest(f_classif, k=0)
    empty_selector.fit(X, y)

    expected_mask = np.zeros(10, dtype=bool)
    assert_array_equal(empty_selector.get_support(), expected_mask)

    # The helper checks the warning and forwards transform's return value.
    transformed = assert_warns_message(
        UserWarning, 'No features were selected',
        empty_selector.transform, X)
    assert transformed.shape == (20, 0)
예제 #7
0
def test_f_classif_multi_class():
    # Sanity-check the F test on a simulated 8-class problem: scores are
    # positive, p-values lie strictly inside (0, 1), the first five
    # features look significant and the remaining ones do not look
    # extremely significant.
    dataset_params = dict(n_samples=200,
                          n_features=20,
                          n_informative=3,
                          n_redundant=2,
                          n_repeated=0,
                          n_classes=8,
                          n_clusters_per_class=1,
                          flip_y=0.0,
                          class_sep=10,
                          shuffle=False,
                          random_state=0)
    X, y = make_classification(**dataset_params)

    f_scores, p_values = f_classif(X, y)

    assert (f_scores > 0).all()
    assert (p_values > 0).all()
    assert (p_values < 1).all()

    # Split the p-values into the first five features and the rest.
    leading_pv = p_values[:5]
    trailing_pv = p_values[5:]
    assert (leading_pv < 0.05).all()
    assert (trailing_pv > 1.e-4).all()
예제 #8
0
def test_f_classif_constant_feature():
    # Test that f_classif warns if a feature is constant throughout.

    # A fixed seed makes the generated data identical on every run, so a
    # failure can always be reproduced.
    X, y = make_classification(n_samples=10, n_features=5, random_state=0)
    # Overwrite the first column so that it holds a single constant value.
    X[:, 0] = 2.0
    assert_warns(UserWarning, f_classif, X, y)
예제 #9
0
Using a sub-pipeline, the fitted coefficients can be mapped back into
the original feature space.
"""
from mrex import svm
from mrex.datasets import samples_generator
from mrex.feature_selection import SelectKBest, f_classif, f_regression
from mrex.metrics import classification_report
from mrex.model_selection import train_test_split
from mrex.pipeline import make_pipeline

print(__doc__)

# Generate a synthetic 4-class problem with 20 features, 3 of which are
# informative. The fixed seed keeps the printed report reproducible.
X, y = samples_generator.make_classification(n_features=20,
                                             n_informative=3,
                                             n_redundant=0,
                                             n_classes=4,
                                             n_clusters_per_class=2,
                                             random_state=42)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# ANOVA SVM-C
# 1) anova filter, take 3 best ranked features.
#    The target is a class label, so the ANOVA F-test for classification
#    (f_classif) is the matching score function; f_regression is meant for
#    a continuous target.
anova_filter = SelectKBest(f_classif, k=3)
# 2) svm
clf = svm.LinearSVC()

# Chain the filter and the classifier, fit on the training split and
# report per-class metrics on the held-out split.
anova_svm = make_pipeline(anova_filter, clf)
anova_svm.fit(X_train, y_train)
y_pred = anova_svm.predict(X_test)
print(classification_report(y_test, y_pred))