示例#1
0
def test_chi2():
    """Check chi2-based k-best feature selection on dense and sparse input.

    Uses ``X``, ``y`` and the ``mkchi2`` factory, none of which are defined
    in this block (presumably module-level fixtures; ``mkchi2(k=...)`` is
    expected to build a selector that keeps the ``k`` best features).
    """
    # Dense input, k=1: only feature 0 should be kept.  A single fit is
    # enough; the original fitted an identical selector twice in a row.
    selector = mkchi2(k=1).fit(X, y)
    assert_array_equal(selector.get_support(indices=True), [0])
    assert_array_equal(selector.transform(X), np.array(X)[:, [0]])

    # Dense input, k=2: features 0 and 2 (order of the support is not fixed,
    # hence the sort).
    selector = mkchi2(k=2).fit(X, y)
    assert_array_equal(sorted(selector.get_support(indices=True)), [0, 2])

    # Sparse (CSR) input must select the same features as the dense case.
    Xsp = csr_matrix(X, dtype=np.float64)
    selector = mkchi2(k=2).fit(Xsp, y)
    assert_array_equal(sorted(selector.get_support(indices=True)), [0, 2])
    Xtrans = selector.transform(Xsp)
    assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices, so compare dense copies;
    # fit().transform() and fit_transform() must agree.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_array_almost_equal(Xtrans, Xtrans2)
示例#2
0
def test_chi2():
    """Verify that chi2 k-best selection keeps the expected columns.

    ``X``, ``y`` and ``mkchi2`` come from outside this block (presumably
    module-level test fixtures and a selector factory taking ``k``).
    """
    # k=1 on dense data selects feature 0.  The original repeated this fit
    # on a fresh selector, which only discarded the first result.
    k_best = mkchi2(k=1).fit(X, y)
    assert_array_equal(k_best.get_support(indices=True), [0])
    assert_array_equal(k_best.transform(X), np.array(X)[:, [0]])

    # k=2 on dense data selects features 0 and 2; sort because the order of
    # the returned indices is not part of what is being checked.
    k_best = mkchi2(k=2).fit(X, y)
    assert_array_equal(sorted(k_best.get_support(indices=True)), [0, 2])

    # The same selection must hold for a CSR sparse matrix.
    Xsp = csr_matrix(X, dtype=np.float64)
    k_best = mkchi2(k=2).fit(Xsp, y)
    assert_array_equal(sorted(k_best.get_support(indices=True)), [0, 2])
    Xtrans = k_best.transform(Xsp)
    assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices: densify before comparing
    # transform() against fit_transform().
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_array_almost_equal(Xtrans, Xtrans2)
示例#3
0
文件: test.py 项目: mb16/Kaggle
def test_chi2():
    """Print Chi2 feature-selection results next to their expected values.

    Uses ``X``, ``y`` and ``mkchi2``, which are defined outside this block.
    Only the final comparison is an assertion; everything else is printed
    as ``<actual> <expected>`` for manual inspection.
    """
    # Raw result of the chi2 score function on the full data set.
    chi2_result = sklearn.feature_selection.chi2(X, y)
    print(chi2_result)

    # k=1 on dense data.  One fit is enough; the original fitted twice.
    # print() with a single pre-formatted string gives the same output on
    # Python 2 and Python 3.
    selector = mkchi2(k=1).fit(X, y)
    print("%s %s" % (selector.get_support(indices=True), [0]))
    print("%s %s" % (selector.transform(X), np.array(X)[:, [0]]))

    # k=2 on dense data.
    selector = mkchi2(k=2).fit(X, y)
    print("%s %s" % (sorted(selector.get_support(indices=True)), [0, 2]))

    # k=2 on sparse data.  np.float was only an alias of the builtin float
    # and no longer exists in current NumPy; np.float64 is the same dtype.
    Xsp = csr_matrix(X, dtype=np.float64)
    selector = mkchi2(k=2).fit(Xsp, y)
    print("%s %s" % (sorted(selector.get_support(indices=True)), [0, 2]))
    Xtrans = selector.transform(Xsp)
    print("%s %s" % (Xtrans.shape, [Xsp.shape[0], 2]))

    # == doesn't work on scipy.sparse matrices, so compare dense copies of
    # the transform() and fit_transform() outputs.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_equal(Xtrans, Xtrans2)
示例#4
0
文件: test.py 项目: zyx061212/Kaggle
def test_chi2():
    """Show Chi2 feature-selection output alongside the expected values.

    ``X``, ``y`` and ``mkchi2`` are not defined in this block.  Intermediate
    results are printed as ``<actual> <expected>``; only the last step
    asserts.
    """
    # Result of calling the chi2 score function directly.
    chi2_result = sklearn.feature_selection.chi2(X, y)
    print(chi2_result)

    # Dense data, k=1 (the duplicate fit of the original is dropped).
    # Formatting into one string keeps the printed text identical on
    # Python 2 and Python 3.
    k_best = mkchi2(k=1).fit(X, y)
    print("%s %s" % (k_best.get_support(indices=True), [0]))
    print("%s %s" % (k_best.transform(X), np.array(X)[:, [0]]))

    # Dense data, k=2.
    k_best = mkchi2(k=2).fit(X, y)
    print("%s %s" % (sorted(k_best.get_support(indices=True)), [0, 2]))

    # Sparse data, k=2.  np.float64 replaces the removed np.float alias
    # (same dtype).
    Xsp = csr_matrix(X, dtype=np.float64)
    k_best = mkchi2(k=2).fit(Xsp, y)
    print("%s %s" % (sorted(k_best.get_support(indices=True)), [0, 2]))
    Xtrans = k_best.transform(Xsp)
    print("%s %s" % (Xtrans.shape, [Xsp.shape[0], 2]))

    # == doesn't work on scipy.sparse matrices: densify, then check that
    # transform() and fit_transform() agree.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_equal(Xtrans, Xtrans2)
#print scaled_features 

# Hold out 10% of the scaled samples for testing (fixed seed for repeatability).
features_train, features_test, labels_train, labels_test = \
    train_test_split(scaled_features, labels, test_size=0.1, random_state=42)

# Several values of k (the number of top features to keep) were tried by hand;
# for chi-squared, k=10 gave the best results across the different methods and
# classifiers.

# k is passed by keyword: current scikit-learn accepts it as keyword-only.
# NOTE: this rebinds the name `chi2` from the score function to the selector;
# the name is kept because later code may rely on it.
chi2 = SelectKBest(chi2, k=10)
features_train = chi2.fit_transform(features_train, labels_train)
features_test = chi2.transform(features_test)

# Keep the selected feature names.
# i+1 because features_list still has "poi" as its first entry, while the
# actual features matrix does not.

features_list_new = [features_list[i+1] for i in chi2.get_support(indices=True)]

features_list = ["poi"] + features_list_new
print("chi2 selected features_list = ")
pprint(features_list)

# Apply featureFormat to the new features_list (the 10 best features) and
# extract new labels/features, so the same variety of classifiers can be run
# on them and their scores compared.

data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.1, random_state=42)


clf = GaussianNB()