Example #1
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2, f_classif

def apply_feature_selection(X_train, y_train, X_test, features):
    if CONFIG['preprocessing']['use_feature_selection'] == 'random_forest':
        clf = RandomForestClassifier()
        clf = clf.fit(X_train.toarray(), y_train)
        features_scores = [(feature, score) for (score, feature) in sorted(
            zip(clf.feature_importances_, features), reverse=True)]
        selected_features = features_scores[:CONFIG['preprocessing']
                                            ['top_features_to_select']]
        selected_indices = np.searchsorted(features,
                                           [f[0] for f in selected_features])
        X_train = X_train[:, selected_indices]
        X_test = X_test[:, selected_indices]
        return X_train, y_train, X_test, selected_features
    if CONFIG['preprocessing']['use_feature_selection'] == 'chi2':
        algorithm = chi2
    elif CONFIG['preprocessing']['use_feature_selection'] == 'ANOVA':
        algorithm = f_classif
    else:
        raise ValueError("No implementation for " +
                         str(CONFIG['preprocessing']['use_feature_selection']))
    feature_selector = SelectKBest(
        algorithm, k=CONFIG['preprocessing']['top_features_to_select'])
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)
    features = [(feature, score)
                for (score, feature) in sorted(zip(feature_selector.scores_,
                                                   features), reverse=True)]
    selected_features = features[:CONFIG['preprocessing']
                                 ['top_features_to_select']]
    return X_train, y_train, X_test, selected_features
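For context, a minimal driver sketch for this function follows; the CONFIG dictionary and the toy corpus here are assumptions for illustration, not from the original repository.

from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical CONFIG; the original repository's configuration is not shown.
CONFIG = {
    'preprocessing': {
        'use_feature_selection': 'chi2',  # or 'random_forest' / 'ANOVA'
        'top_features_to_select': 2,
    }
}

train_texts = ["spam offer now", "meeting at noon",
               "cheap spam deal", "lunch at noon"]
y_train = [1, 0, 1, 0]
test_texts = ["spam deal", "noon meeting"]

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)
features = vectorizer.get_feature_names_out()  # sorted, so searchsorted is safe

X_train, y_train, X_test, selected = apply_feature_selection(
    X_train, y_train, X_test, features)
print(selected)  # [(term, score), ...] for the top-k terms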
Example #2
    def fit(self, k=100, percent=None):
        # Resolve how many features to keep before fitting the selector.
        if k is not None:
            select = k
        elif percent is not None:
            select = int(self.doc_vecs.shape[1] * percent)
        else:
            raise ValueError('One of `k` or `percent` must not be None.')

        selector = SelectKBest(k=select)
        selector.fit(self.doc_vecs.todense(), np.asarray(self.labels))

        scores = selector.scores_
        # np.argsort is ascending, so reverse to rank features best-first
        indices = np.argsort(scores)[::-1][:select]
        self._filtered_words = [self.words[i] for i in indices]
Example #3
from sklearn.feature_selection import SelectKBest, chi2

def feature_reduce(X, Y, num_features_to_keep):
    # use the chi-squared score to rank features (chi2 needs non-negative X)
    test = SelectKBest(score_func=chi2, k=num_features_to_keep)
    fit = test.fit(X, Y)

    #return the data with reduced features
    return fit.transform(X)
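One usage note: chi2 requires non-negative feature values (counts, TF-IDF, and the like), so a toy call might look like this (made-up data):

import numpy as np

X = np.array([[1, 0, 3],
              [0, 2, 1],
              [4, 0, 0],
              [0, 3, 2]])
Y = np.array([0, 1, 0, 1])

X_reduced = feature_reduce(X, Y, num_features_to_keep=2)
print(X_reduced.shape)  # (4, 2)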
Example #4
import re
import numpy as np
from sklearn.feature_selection import SelectKBest

def feature_selection(feat_select, X, y):
    """Implements various kinds of feature selection."""
    # K-best
    if re.match('.*-best', feat_select) is not None:
        n = int(feat_select.split('-')[0])

        selector = SelectKBest(k=n)

        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter('ignore', category=UserWarning)
            features_selected = np.where(
                selector.fit(X, y).get_support())[0]

    elif re.match('.*-randombest', feat_select) is not None:
        n = int(feat_select.split('-')[0])

        from random import shuffle
        features = list(range(X.shape[1]))
        shuffle(features)

        features_selected = features[:n]
    else:
        raise ValueError('Unknown feature selection method: %s' % feat_select)

    return features_selected
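The feat_select string packs both k and the strategy together (e.g. '10-best', '10-randombest'); a quick sketch of a call, using synthetic data as an assumption:

from sklearn.datasets import make_classification

X, y = make_classification(n_samples=50, n_features=20, random_state=0)
idx = feature_selection('5-best', X, y)
print(idx)  # column indices of the 5 highest-scoring features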
Example #5
from sklearn.feature_selection import SelectKBest, f_classif

def feature_reduce_f_class_if(X, Y, num_features_to_keep):
    # use the ANOVA F-test (f_classif) to rank and keep the top features
    test = SelectKBest(score_func=f_classif, k=num_features_to_keep)
    fit = test.fit(X, Y)

    #return the data with reduced features
    return fit.transform(X)
Example #6
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.datasets import make_classification
from sklearn.feature_selection import (GenericUnivariateSelect, SelectKBest,
                                       f_classif)

def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(n_samples=200,
                               n_features=20,
                               n_informative=3,
                               n_redundant=2,
                               n_repeated=0,
                               n_classes=8,
                               n_clusters_per_class=1,
                               flip_y=0.0,
                               class_sep=10,
                               shuffle=False,
                               random_state=0)

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode='k_best',
                                   param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #7
def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_classif, mode="k_best", param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #8
    def get_best_features(self, data, labels, k=3):
        '''
        Using the scikit-learn library, narrow down feature set.
        '''
        num_feat = len(data.columns)
        while num_feat > k:
            num_feat = max(k, num_feat // 2)
            selector = SelectKBest(f_classif, k=num_feat)
            selector.fit(data, labels)

            chosen = selector.get_support()
            if sum(selector.pvalues_[chosen]) > 0:
                data = data[data.columns[chosen]]
            else:
                # Many of our p-vals are zero. Accept all.
                data = data[data.columns[selector.pvalues_ == 0]]
                num_feat = k

        return data.columns
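As an aside, `pvalues_` (used above) is the public attribute on a fitted sklearn selector; a single halving step of the loop reduces to roughly this sketch, with synthetic data standing in for the real inputs:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif

X, y = make_classification(n_samples=100, n_features=12, random_state=0)
data = pd.DataFrame(X, columns=['f%d' % i for i in range(12)])

selector = SelectKBest(f_classif, k=6).fit(data, y)
chosen = selector.get_support()
print(data.columns[chosen])       # the 6 best-scoring columns
print(selector.pvalues_[chosen])  # their ANOVA p-values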
Example #9
import re
import numpy as np
from sklearn.feature_selection import SelectKBest

def feature_selection(feat_select, X, y):
    """Implements various kinds of feature selection."""
    # K-best
    if re.match('.*-best', feat_select) is not None:
        n = feat_select.split('-')[0]

        selector = SelectKBest(k=int(n))

        features_selected = np.where(
            selector.fit(X, y).get_support())[0]
    else:
        raise ValueError('Unknown feature selection method: %s' % feat_select)

    return features_selected
Example #10
import numpy as np
from numpy.testing import assert_array_equal
from sklearn.datasets import make_regression
from sklearn.feature_selection import (GenericUnivariateSelect, SelectKBest,
                                       f_regression)

def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode="k_best", param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #11
def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20,
                           n_informative=5, shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(f_regression, mode='k_best',
                                   param=5).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)
    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
Example #12
	# (fragment: the loop that fills X_test / y_test per category is truncated)
	y_test[idx_start:idx_end] = cat
	idx_start += N_test

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

print("start classification")

# vectorization
vectorizer = TfidfVectorizer(strip_accents="unicode", ngram_range=(1,1))
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# feature scoring (k="all" keeps every feature; see the note below)
ch2 = SelectKBest(chi2, k="all")
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

# training
clf = LinearSVC()
clf.fit(X_train, y_train)

if validation_mode == "train":
	X_test = X_train
	y_test = y_train

# predict categories
predicted = clf.predict(X_test)

print(numpy.mean(predicted == y_test))
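Note that k="all" keeps every feature, so the selection step above is effectively a pass-through; to actually shrink the vocabulary one would pass an integer k, as in this sketch (the 1000 is an assumption and presumes a vocabulary at least that large):

from sklearn.feature_selection import SelectKBest, chi2

ch2 = SelectKBest(chi2, k=1000)  # keep the 1000 highest-scoring terms
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)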
Example #13
from sklearn import datasets

from sklearn.feature_selection import SelectKBest, chi2

iris = datasets.load_iris()

k_best0 = SelectKBest(score_func=chi2, k=2)
fit = k_best0.fit(iris.data, iris.target)
print(fit.scores_)

features = fit.transform(iris.data)
print(features)

k_best1 = SelectKBest(score_func=chi2, k=4)
newX = k_best1.fit_transform(iris.data, iris.target)
print(newX)