def test_chi2():
    """Test Chi2 feature extraction.

    Exercises the selector returned by ``mkchi2`` on dense input (k=1 and
    k=2) and on a CSR sparse copy of the same data (k=2), then checks that
    ``fit`` followed by ``transform`` agrees with ``fit_transform``.
    """
    # NOTE(review): mkchi2, X and y are defined outside this block;
    # presumably mkchi2(k) builds a chi2-scored top-k selector -- confirm.

    # Dense input, single best feature: column 0 is expected to win.
    chi2 = mkchi2(k=1).fit(X, y)
    assert_array_equal(chi2.get_support(indices=True), [0])
    assert_array_equal(chi2.transform(X), np.array(X)[:, [0]])

    # Dense input, two best features. The support indices are sorted before
    # comparing so the check does not depend on their order.
    chi2 = mkchi2(k=2).fit(X, y)
    assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])

    # Same selection on a sparse (CSR) copy of the data.
    Xsp = csr_matrix(X, dtype=np.float64)
    chi2 = mkchi2(k=2).fit(Xsp, y)
    assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])
    Xtrans = chi2.transform(Xsp)
    assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices, so densify before comparing.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_array_almost_equal(Xtrans, Xtrans2)
def test_chi2():
    """Test Chi2 feature extraction.

    Debugging variant: prints each observed value next to the value it is
    expected to match instead of asserting on it. Only the final comparison
    between ``transform`` and ``fit_transform`` output is an assertion.
    """
    # Local import so the array-aware comparison below is always in scope.
    from numpy.testing import assert_array_almost_equal

    # NOTE(review): mkchi2, X and y are defined outside this block;
    # presumably mkchi2(k) builds a chi2-scored top-k selector -- confirm.

    # Raw result of the chi2 scoring function on the dense data.
    chi = sklearn.feature_selection.chi2(X, y)
    print(chi)

    # Each print below passes one pre-formatted string, so the output is the
    # same whether print is a statement (Python 2) or a function (Python 3).
    chi2 = mkchi2(k=1).fit(X, y)
    print("%s %s" % (chi2.get_support(indices=True), [0]))
    print("%s %s" % (chi2.transform(X), np.array(X)[:, [0]]))

    chi2 = mkchi2(k=2).fit(X, y)
    print("%s %s" % (sorted(chi2.get_support(indices=True)), [0, 2]))

    # np.float64 replaces the np.float alias, which newer NumPy has removed.
    Xsp = csr_matrix(X, dtype=np.float64)
    chi2 = mkchi2(k=2).fit(Xsp, y)
    print("%s %s" % (sorted(chi2.get_support(indices=True)), [0, 2]))
    Xtrans = chi2.transform(Xsp)
    print("%s %s" % (Xtrans.shape, [Xsp.shape[0], 2]))

    # == doesn't work on scipy.sparse matrices, so densify before comparing.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    # Element-wise array comparison; a scalar-style equality assert on whole
    # arrays can fail on the ambiguous truth value of the result.
    assert_array_almost_equal(Xtrans, Xtrans2)
# Hold out 10% of the scaled data; the selector below is fitted on the
# training part only.
features_train, features_test, labels_train, labels_test = train_test_split(
    scaled_features, labels, test_size=0.1, random_state=42)

# Several values of k (the number of top features to keep) were tried
# manually; for Chi-squared, k=10 returned the best results across the
# different methods and classifiers.
# k is passed by keyword because current scikit-learn no longer accepts it
# positionally.
# NOTE(review): this rebinds `chi2` from (presumably) the imported score
# function to the selector instance; kept because later code may rely on it.
chi2 = SelectKBest(chi2, k=10)
features_train = chi2.fit_transform(features_train, labels_train)
features_test = chi2.transform(features_test)

# Keep the selected feature names.
# i + 1 because features_list still has "poi" as its first entry, while the
# actual feature matrix does not.
features_list_new = [features_list[i + 1]
                     for i in chi2.get_support(indices=True)]
features_list = ["poi"] + features_list_new
print("chi2 selected features_list = ")
pprint(features_list)

# Re-apply featureFormat with the reduced feature_list (the 10 best members)
# and extract new labels/features, so the same variety of classifiers can be
# run on them and their scores compared.
data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.1, random_state=42)

clf = GaussianNB()