def test_select_kbest_classif():
    """
    Check the k-best univariate selector on a toy classification task.

    The data set is generated without shuffling, with 3 informative and
    2 redundant features out of 20, and the test expects exactly the first
    five columns to be kept.  ``GenericUnivariateSelect`` in ``k_best``
    mode has to yield the same reduced matrix as ``SelectKBest``.
    """
    dataset_options = dict(n_samples=200, n_features=20, n_informative=3,
                           n_redundant=2, n_repeated=0, n_classes=8,
                           n_clusters_per_class=1, flip_y=0.0,
                           class_sep=10, shuffle=False, random_state=0)
    X, Y = make_classification(**dataset_options)

    # Dedicated k-best selector.
    kbest = SelectKBest(f_classif, k=5)
    kbest.fit(X, Y)
    reduced_kbest = kbest.transform(X)

    # The same selection expressed through the generic selector.
    generic = GenericUnivariateSelect(f_classif, mode='k_best', param=5)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)
    assert_array_equal(reduced_kbest, reduced_generic)

    # Only the five leading features should be flagged as selected.
    expected_mask = np.concatenate([np.ones(5), np.zeros(15)])
    assert_array_equal(kbest.get_support(), expected_mask)
def test_select_kbest_classif():
    """
    Verify k-best univariate feature selection for classification.

    ``SelectKBest`` and ``GenericUnivariateSelect(mode="k_best")`` are both
    fitted with ``f_classif`` on the same unshuffled synthetic data; their
    transformed outputs must match, and the support mask of the k-best
    selector must mark the first five of the twenty features.
    """
    n_features = 20
    n_selected = 5

    X, Y = make_classification(
        n_samples=200, n_features=n_features,
        n_informative=3, n_redundant=2, n_repeated=0,
        n_classes=8, n_clusters_per_class=1,
        flip_y=0.0, class_sep=10,
        shuffle=False, random_state=0,
    )

    selector = SelectKBest(f_classif, k=n_selected)
    via_kbest = selector.fit(X, Y).transform(X)

    generic_selector = GenericUnivariateSelect(
        f_classif, mode="k_best", param=n_selected
    )
    via_generic = generic_selector.fit(X, Y).transform(X)

    # Both selectors must keep exactly the same columns.
    assert_array_equal(via_kbest, via_generic)

    # Ground truth: ones for the leading ``n_selected`` features, zeros after.
    mask = selector.get_support()
    expected = (np.arange(n_features) < n_selected).astype(float)
    assert_array_equal(mask, expected)
示例#3
0
 def corr_matrix_of_important_words(term_doc_mat, word_list, scores,
                                    n_features_to_keep):
     """
     Return the correlation matrix of the most informative words.

     A ``SelectKBest`` selector (with its default score function) is fitted
     on ``term_doc_mat`` against ``scores``; the ``n_features_to_keep``
     selected columns are densified into a DataFrame labelled with the
     matching entries of ``word_list``, a ``'Score'`` column is appended,
     and ``DataFrame.corr()`` of the result is returned.

     ``term_doc_mat`` must support ``[:, indices]`` slicing and
     ``.todense()`` (presumably a scipy sparse matrix -- TODO confirm).
     """
     kbest = SelectKBest(k=n_features_to_keep)
     kbest.fit(term_doc_mat, scores)
     kept_columns = kbest.get_support(indices=True)

     kept_words = []
     for column in kept_columns:
         kept_words.append(word_list[column])

     dense_counts = term_doc_mat[:, kept_columns].todense()
     frame = pd.DataFrame(dense_counts, columns=kept_words)

     # NOTE(review): the 'Score' column is read from the module-level
     # ``df_one_company`` rather than from the ``scores`` argument --
     # confirm this is intended.
     frame['Score'] = df_one_company.Rating
     return frame.corr()
def test_select_kbest_regression():
    """
    Check the k-best univariate selector on a toy regression task.

    The unshuffled data set has 5 informative features out of 20, and the
    test expects exactly the first five columns to be kept.
    ``GenericUnivariateSelect`` in ``k_best`` mode has to yield the same
    reduced matrix as ``SelectKBest``.
    """
    X, Y = make_regression(
        n_samples=200,
        n_features=20,
        n_informative=5,
        shuffle=False,
        random_state=0,
    )

    # Dedicated k-best selector.
    kbest = SelectKBest(f_regression, k=5)
    kbest.fit(X, Y)
    reduced_kbest = kbest.transform(X)

    # The same selection expressed through the generic selector.
    generic = GenericUnivariateSelect(f_regression, mode="k_best", param=5)
    generic.fit(X, Y)
    reduced_generic = generic.transform(X)
    assert_array_equal(reduced_kbest, reduced_generic)

    # Only the five leading features should be flagged as selected.
    expected_mask = np.concatenate([np.ones(5), np.zeros(15)])
    assert_array_equal(kbest.get_support(), expected_mask)
示例#5
0
def test_select_kbest_regression():
    """
    Verify k-best univariate feature selection for regression.

    ``SelectKBest`` and ``GenericUnivariateSelect(mode='k_best')`` are both
    fitted with ``f_regression`` on the same unshuffled synthetic data;
    their transformed outputs must match, and the support mask of the
    k-best selector must mark the first five of the twenty features.
    """
    n_features = 20
    n_selected = 5

    regression_options = dict(n_samples=200, n_features=n_features,
                              n_informative=5, shuffle=False,
                              random_state=0)
    X, Y = make_regression(**regression_options)

    selector = SelectKBest(f_regression, k=n_selected)
    via_kbest = selector.fit(X, Y).transform(X)

    generic_selector = GenericUnivariateSelect(f_regression, mode='k_best',
                                               param=n_selected)
    via_generic = generic_selector.fit(X, Y).transform(X)

    # Both selectors must keep exactly the same columns.
    assert_array_equal(via_kbest, via_generic)

    # Ground truth: ones for the leading ``n_selected`` features, zeros after.
    mask = selector.get_support()
    expected = (np.arange(n_features) < n_selected).astype(float)
    assert_array_equal(mask, expected)
示例#6
0
    def get_best_features(self, data, labels, k=3):
        '''
        Using the scikit-learn library, narrow down feature set.

        The number of retained columns is halved on every pass (never
        dropping below ``k``) by fitting ``SelectKBest(f_classif)`` on the
        columns that are still in play.

        Parameters
        ----------
        data : object with ``.columns`` that supports ``data[columns]``
            Feature table (presumably a pandas DataFrame -- TODO confirm).
        labels : target values handed to ``SelectKBest.fit``.
        k : int, default 3
            Number of features to narrow down to.

        Returns
        -------
        The ``columns`` of the narrowed-down ``data``.  When ``data``
        already has ``k`` or fewer columns it is returned unchanged.  When
        the p-values of the chosen features sum to zero, *every* column
        whose p-value is exactly zero is kept and the search stops, so the
        result may then contain more than ``k`` columns.
        '''
        num_feat = len(data.columns)
        while num_feat > k:
            num_feat = max(k, num_feat // 2)
            selector = SelectKBest(f_classif, k=num_feat)
            selector.fit(data, labels)

            # Prefer the public ``pvalues_`` attribute; fall back to the
            # private ``_pvalues`` used by old scikit-learn releases so the
            # method keeps working on either.
            pvalues = getattr(selector, 'pvalues_', None)
            if pvalues is None:
                pvalues = selector._pvalues

            chosen = selector.get_support()
            if sum(pvalues[chosen]) > 0:
                data = data[data.columns[chosen]]
            else:
                # Many of our p-vals are zero. Accept all.
                data = data[data.columns[pvalues == 0]]
                num_feat = k

        return data.columns
 # Stratified shuffle split of the labels ``y`` holding out 40% for testing.
 # NOTE(review): the positional ``1`` is presumably ``n_iter`` of the legacy
 # ``sklearn.cross_validation`` API (a single split) -- confirm.
 sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)
 y_train = []
 y_test = []
 for train, test in sss:
     # Parenthesised print behaves the same on Python 2 and Python 3.
     print(train)
     # Persist the split indices; they are read back from disk below.
     np.save('train_vect', train)
     np.save('test_vect', test)
     y_train = y[train]
     y_test = y[test]

 processed_comment_list = extract_global_bag_of_words_processed(commentList)
 train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
 # Pick the processed comments belonging to each side of the split.
 train_list = [processed_comment_list[v] for v in train_v]
 test_list = [processed_comment_list[v] for v in test_v]

 # Unigram alternative to the bigram features used below:
 #train, test, terms = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
 # From here on ``train`` / ``test`` hold the extracted features, no longer
 # the split indices of the loop above.
 train, test, terms = extract_words(CountVectorizer(analyzer=BigramAnalyzer(), dtype=float), train_list, test_list)

 # Keep at most the 50 highest chi2-scoring features (fewer when the
 # feature matrix has fewer columns).
 selector2 = SelectKBest(score_func=chi2, k=min(50, train.shape[1])).fit(train, y_train)
 ind = list(selector2.get_support(indices=True))
 # Show the terms that were selected.
 print(np.asarray(terms)[selector2.get_support()])
 
 
示例#8
0
    # NOTE(review): ``sss``, ``y`` and ``commentList`` are not defined in
    # this fragment; ``sss`` is iterated as (train, test) index pairs --
    # confirm against the code that precedes this snippet.
    y_train = []
    y_test = []
    for train, test in sss:
        # Parenthesised print behaves the same on Python 2 and Python 3.
        print(train)
        # Persist the split indices; they are read back from disk below.
        np.save('train_vect', train)
        np.save('test_vect', test)
        y_train = y[train]
        y_test = y[test]

    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    # Pick the processed comments belonging to each side of the split.
    train_list = [processed_comment_list[v] for v in train_v]
    test_list = [processed_comment_list[v] for v in test_v]

    # Unigram alternative to the bigram features used below:
    #train, test, terms = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
    # From here on ``train`` / ``test`` hold the extracted features, no
    # longer the split indices of the loop above.
    train, test, terms = extract_words(
        CountVectorizer(analyzer=BigramAnalyzer(), dtype=float), train_list,
        test_list)

    # Keep at most the 50 highest chi2-scoring features (fewer when the
    # feature matrix has fewer columns).
    selector2 = SelectKBest(score_func=chi2,
                            k=min(50, train.shape[1])).fit(train, y_train)
    ind = list(selector2.get_support(indices=True))
    # Show the terms that were selected.
    print(np.asarray(terms)[selector2.get_support()])