def test_select_kbest_classif():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple classification problem
    with the k best heuristic
    """
    X, Y = make_classification(
        n_samples=200,
        n_features=20,
        n_informative=3,
        n_redundant=2,
        n_repeated=0,
        n_classes=8,
        n_clusters_per_class=1,
        flip_y=0.0,
        class_sep=10,
        shuffle=False,
        random_state=0,
    )

    univariate_filter = SelectKBest(f_classif, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_classif, mode="k_best", param=5
    ).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
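These test functions are excerpted without their module preamble; a minimal sketch of the imports they appear to assume (all names are from scikit-learn's and NumPy's public APIs):

import numpy as np
from numpy.testing import assert_array_equal

from sklearn.datasets import make_classification, make_regression
from sklearn.feature_selection import (GenericUnivariateSelect, SelectKBest,
                                       f_classif, f_regression)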
def corr_matrix_of_important_words(term_doc_mat, word_list, scores,
                                   n_features_to_keep):
    # Keep only the k most informative terms (SelectKBest defaults to f_classif).
    selector = SelectKBest(k=n_features_to_keep).fit(term_doc_mat, scores)
    informative_words_index = selector.get_support(indices=True)
    labels = [word_list[i] for i in informative_words_index]
    # Densify just the selected columns of the sparse term-document matrix.
    data = pd.DataFrame(term_doc_mat[:, informative_words_index].todense(),
                        columns=labels)
    # Relies on a module-level df_one_company DataFrame for the ratings column.
    data['Score'] = df_one_company.Rating
    return data.corr()
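A hypothetical call site for the helper above, assuming the term-document matrix comes from a CountVectorizer; the reviews, the toy df_one_company, and the ratings are illustrative stand-ins, not from the original (get_feature_names_out assumes scikit-learn >= 1.0):

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest

reviews = ["great team, great pay", "poor management, long hours",
           "great culture", "long hours and poor pay"]
df_one_company = pd.DataFrame({'Rating': [5, 1, 5, 2]})

vectorizer = CountVectorizer()
term_doc_mat = vectorizer.fit_transform(reviews)      # sparse count matrix
word_list = list(vectorizer.get_feature_names_out())  # column labels
corr = corr_matrix_of_important_words(term_doc_mat, word_list,
                                      df_one_company.Rating, 3)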
def test_select_kbest_regression():
    """
    Test whether the relative univariate feature selection
    gets the correct items in a simple regression problem
    with the k best heuristic
    """
    X, Y = make_regression(n_samples=200, n_features=20, n_informative=5,
                           shuffle=False, random_state=0)

    univariate_filter = SelectKBest(f_regression, k=5)
    X_r = univariate_filter.fit(X, Y).transform(X)
    X_r2 = GenericUnivariateSelect(
        f_regression, mode="k_best", param=5
    ).fit(X, Y).transform(X)
    assert_array_equal(X_r, X_r2)

    support = univariate_filter.get_support()
    gtruth = np.zeros(20)
    gtruth[:5] = 1
    assert_array_equal(support, gtruth)
def get_best_features(self, data, labels, k=3):
    '''
    Using the scikit-learn library, narrow down feature set.
    '''
    num_feat = len(data.columns)
    while num_feat > k:
        # Halve the candidate count each pass, but never drop below k.
        num_feat = max(k, num_feat // 2)
        selector = SelectKBest(f_classif, k=num_feat)
        selector.fit(data, labels)
        chosen = selector.get_support()
        # pvalues_ is the public attribute in current scikit-learn; the
        # original snippet used the older private name _pvalues.
        if sum(selector.pvalues_[chosen]) > 0:
            data = data[data.columns[chosen]]
        else:
            # Many of our p-vals are zero. Accept all of them and stop iterating.
            data = data[data.columns[selector.pvalues_ == 0]]
            num_feat = k
    return data.columns
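A minimal sketch of exercising the method above on synthetic data; the toy DataFrame and labels are illustrative, and since self is unused the sketch simply passes None:

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_classif

rng = np.random.RandomState(0)
data = pd.DataFrame(rng.rand(100, 16), columns=['f%d' % i for i in range(16)])
labels = rng.randint(0, 2, size=100)

best = get_best_features(None, data, labels, k=3)  # self is unused
print(list(best))                                  # the three surviving column names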
sss = StratifiedShuffleSplit(y, 1, test_size=0.40, random_state=42)  # legacy sklearn.cross_validation API

y_train = []
y_test = []
for train, test in sss:
    print(train)
    np.save('train_vect', train)
    np.save('test_vect', test)
    y_train = y[train]
    y_test = y[test]

processed_comment_list = extract_global_bag_of_words_processed(commentList)
train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')

train_list = []
test_list = []
for v in train_v:
    train_list.append(processed_comment_list[v])
for v in test_v:
    test_list.append(processed_comment_list[v])

#train, test, terms = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
train, test, terms = extract_words(
    CountVectorizer(analyzer=BigramAnalyzer(), dtype=float),
    train_list, test_list)

# Cap k at the vocabulary size so SelectKBest never requests more features than exist.
selector2 = SelectKBest(score_func=chi2, k=min(50, train.shape[1])).fit(train, y_train)
ind = list(selector2.get_support(indices=True))
print(np.asarray(terms)[selector2.get_support()])
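For context, scikit-learn's chi2 scorer requires non-negative feature values, which is why it pairs naturally with CountVectorizer counts as above. A self-contained sketch of the same selection step on toy data (docs and y are illustrative, and get_feature_names_out assumes scikit-learn >= 1.0):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

docs = ["good movie", "bad movie", "good plot twist", "bad acting overall"]
y = np.array([1, 0, 1, 0])

vec = CountVectorizer(ngram_range=(2, 2))  # bigram features, as in the snippet above
X = vec.fit_transform(docs)

# min() guards against requesting more features than the vocabulary holds.
sel = SelectKBest(score_func=chi2, k=min(50, X.shape[1])).fit(X, y)
print(np.asarray(vec.get_feature_names_out())[sel.get_support()])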