def test_chi2():
    """Test chi2 feature selection on dense and sparse input.

    Uses ``mkchi2``, ``X`` and ``y`` from the enclosing module.
    """
    # With k=1 the selector is expected to keep only column 0.
    chi2 = mkchi2(k=1).fit(X, y)
    assert_array_equal(chi2.get_support(indices=True), [0])
    assert_array_equal(chi2.transform(X), np.array(X)[:, [0]])

    # With k=2 the expected support is columns 0 and 2.
    chi2 = mkchi2(k=2).fit(X, y)
    assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])

    # Same selection on a CSR matrix; the output keeps every row and
    # exactly two columns.
    Xsp = csr_matrix(X, dtype=np.float64)
    chi2 = mkchi2(k=2).fit(Xsp, y)
    assert_array_equal(sorted(chi2.get_support(indices=True)), [0, 2])
    Xtrans = chi2.transform(Xsp)
    assert_array_equal(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices, so compare dense copies.
    # fit(...).transform(...) and fit_transform(...) must agree.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_array_almost_equal(Xtrans, Xtrans2)
def ex_feature(train_set,test_set,t_train,t_test,hash=False,use_tf=False,K=2000):
    '''
    Vectorize the training and test documents and reduce their dimensionality.

    The stop-word list is read from ``chinese_stopword.txt`` in the current
    working directory; the file is opened even when ``hash`` is true.
    Every stateful transformer is fitted on ``train_set`` only and then
    applied unchanged to ``test_set``, so both matrices share one feature
    space and one set of IDF weights.

    :param train_set: training documents handed to the scikit-learn text
        vectorizer (presumably an iterable of strings -- confirm with caller)
    :param test_set: test documents, same form as ``train_set``
    :param t_train: target values for ``train_set``; only used by the chi2
        selection branch
    :param t_test: target values for ``test_set``; currently unused
    :param hash: use HashingVectorizer; takes precedence over ``use_tf``
    :param use_tf: when ``hash`` is false, keep the ``K`` most frequent terms
        (``CountVectorizer(max_features=K)``) and tf-idf weight them
    :param K: number of features to keep; with ``use_tf == False`` (and
        ``hash == False``) the ``K`` best tf-idf features are selected by chi2
    :return: tuple ``(x_train, x_test)`` of feature matrices
    '''
    with open('chinese_stopword.txt', 'r', encoding='utf-8-sig') as f:
        stop_words = f.read().splitlines()

    total_size_mb = size_mb(train_set) + size_mb(test_set)
    start_time = time.time()
    print('extracting features......')

    if hash:
        from sklearn.feature_extraction.text import HashingVectorizer
        # NOTE(review): ``non_negative`` was removed in scikit-learn 0.21
        # (``alternate_sign=False`` replaces it) -- confirm the pinned version.
        vectorizer = HashingVectorizer(non_negative=True)
        x_train = vectorizer.fit_transform(train_set)
        x_test = vectorizer.transform(test_set)
    elif use_tf:
        vectorizer = CountVectorizer(max_features=K,stop_words=stop_words,
                                     decode_error='strict')
        tfidf_transformer = TfidfTransformer()
        x_train = tfidf_transformer.fit_transform(
            vectorizer.fit_transform(train_set))
        # Share the fitted vectorizer AND the fitted IDF weights: re-fitting
        # on the test set would weight test rows differently from train rows.
        x_test = tfidf_transformer.transform(vectorizer.transform(test_set))
    else:
        from sklearn.feature_selection import SelectKBest
        from sklearn.feature_selection import chi2
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(stop_words=stop_words)
        x_train_tfidf_matrix = vectorizer.fit_transform(train_set)
        x_test_tfidf_matrix = vectorizer.transform(test_set)
        # Never ask for more features than the vocabulary holds; this mirrors
        # how ``max_features=K`` behaves in the term-frequency branch.
        selector = SelectKBest(chi2, k=min(K, x_train_tfidf_matrix.shape[1]))
        x_train = selector.fit_transform(x_train_tfidf_matrix, t_train)
        x_test = selector.transform(x_test_tfidf_matrix)

    elapsed = time.time() - start_time
    # Guard against a zero reading from a coarse clock.
    throughput = total_size_mb / elapsed if elapsed > 0 else float('inf')
    print('extract features took %.2f s at %0.2fMB/S' % (elapsed, throughput))
    return x_train, x_test
def test_chi2():
    """Test Chi2 feature extraction, printing actual next to expected values.

    Uses ``mkchi2``, ``X`` and ``y`` from the enclosing module. Only the
    final comparison is an assertion; the earlier checks are printed for
    visual inspection.
    """
    def _show(actual, expected):
        # A single-argument print call emits the same text under Python 2
        # and Python 3: "<actual> <expected>".
        print("%s %s" % (actual, expected))

    # Raw output of the chi2 score function for reference.
    chi = sklearn.feature_selection.chi2(X, y)
    print(chi)

    chi2 = mkchi2(k=1).fit(X, y)
    _show(chi2.get_support(indices=True), [0])
    _show(chi2.transform(X), np.array(X)[:, [0]])

    chi2 = mkchi2(k=2).fit(X, y)
    _show(sorted(chi2.get_support(indices=True)), [0, 2])

    # np.float64 is the dtype the removed ``np.float`` alias resolved to.
    Xsp = csr_matrix(X, dtype=np.float64)
    chi2 = mkchi2(k=2).fit(Xsp, y)
    _show(sorted(chi2.get_support(indices=True)), [0, 2])

    Xtrans = chi2.transform(Xsp)
    _show(Xtrans.shape, [Xsp.shape[0], 2])

    # == doesn't work on scipy.sparse matrices
    # NOTE(review): assumes ``assert_equal`` is an array-aware comparison
    # (e.g. numpy.testing.assert_equal) -- confirm against the imports.
    Xtrans = Xtrans.toarray()
    Xtrans2 = mkchi2(k=2).fit_transform(Xsp, y).toarray()
    assert_equal(Xtrans, Xtrans2)
### Splitting Data into Train and Test using a StratifiedShuffleSplit sss = StratifiedShuffleSplit(Y, 10, test_size=0.3, random_state=0) ### Using the generated indices to create Test and Train datasets for train_index, test_index in sss: #print("TRAIN:", train_index, "TEST:", test_index) X_train, X_test = X[train_index], X[test_index] y_train, y_test = Y[train_index], Y[test_index] ### Select K best features based on a chi-squared test feature_chi = 72 ## selecting best 2/3rd features chi2 = SelectKBest(chi2, k=feature_chi) X_train = chi2.fit_transform(X_train, y_train) X_test = chi2.transform(X_test) #print(X_train) ### Defining a function to print statistics which helps us benchmark classifier performance def benchmark(clf): clf_descr = str(clf).split('(')[0] ## store name of the classifier print(clf_descr) ## print name t0 = time() ## store current time in t0 clf.fit(X_train, y_train) ## run classifier to fit data train_time = time() - t0 ## Calculate time take to train print("train time: %0.3fs" % train_time) ## print statistic t0 = time() ## store current time in t0 pred = clf.predict(X_test) ## use trained classifer to predict class for test data test_time = time() - t0 ## Calculate time take to predict for test data
# Load the pickled tuple (x_train, x_test, y_train, y_test), select features
# with a chi-squared test and evaluate a multinomial naive Bayes classifier.
# NOTE(review): unpickling executes arbitrary code -- only load trusted files.
data_path = '/Users/zhangzhaopeng/统计学习/机器学习/Text_Classification/data_preprocessing.pkl'
fp = open(data_path, 'rb')
x_train, x_test, y_train, y_test = pickle.load(fp)
fp.close()

## Feature selection with a chi-squared test
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Term counts; terms that appear in fewer than 2 documents are dropped.
vectorizer = CountVectorizer(min_df=2)
x_train_tf = vectorizer.fit_transform(x_train)
x_test_tf = vectorizer.transform(x_test)  # reuse the vocabulary fitted on the training set

# NOTE(review): ``chi2`` is rebound here from the imported score function to
# the selector object.
chi2 = SelectKBest(chi2, k=4000)
x_train_chi2 = chi2.fit_transform(x_train_tf, y_train)
x_test_chi2 = chi2.transform(x_test_tf)

## naive bayes
naive_chi2 = naive_bayes.MultinomialNB().fit(x_train_chi2, y_train)
naive_chi2_preds = naive_chi2.predict(x_test_chi2)

# Accuracy = share of test samples whose prediction equals the true label.
count_accu = 0
for i in range(len(y_test)):
    if y_test[i] == naive_chi2_preds[i]:
        count_accu += 1
naive_accu_chi2 = count_accu / len(y_test)
#naive_accu2 = metrics.accuracy_score(naive_preds, y_test)
print("Test set accuracy: ", naive_accu_chi2)

# confusion_matrix: 2x2 table of counts, initialised to zero
conf_arr_naive_chi2 = [[0, 0], [0, 0]]
for i in range(len(y_test)):
    if y_test[i] == 0:
# SelectKBest with the chi-squared scoring function, used to choose the 10
# best features. The features are min-max scaled first; chi2 scoring needs
# non-negative feature values.
scalar = MinMaxScaler()
scaled_features = scalar.fit_transform(features)
#print scaled_features
features_train, features_test, labels_train, labels_test = \
    train_test_split(scaled_features, labels, test_size=0.1, random_state=42)

# Several values of k (the number of top features to select) were tried
# manually; for chi-squared, k=10 returned the best results across the
# different methods and classifiers.
# ``k`` is passed by keyword because it is keyword-only in current
# scikit-learn releases.
# NOTE: ``chi2`` is rebound here from the score function to the selector.
chi2 = SelectKBest(chi2, k=10)
features_train = chi2.fit_transform(features_train, labels_train)
features_test = chi2.transform(features_test)

# Keep the selected feature names.
# i+1 because features_list still has "poi" as its first name, while the
# actual features matrix does not contain that column.
features_list_new = [features_list[i+1] for i in chi2.get_support(indices=True)]
features_list = ["poi"] + features_list_new
# Single-argument print call: identical output on Python 2 and Python 3.
print("chi2 selected features_list = ")
pprint(features_list)

# Apply featureFormat to the new features_list with the 10 best members and
# extract new labels/features, so the same variety of classifiers can be run
# on them and their scores compared.
data = featureFormat(my_dataset, features_list)
labels, features = targetFeatureSplit(data)
def baoxian():
    """Classify the input rows whose tag contains ``label`` and save the hits.

    Python 2 code (``cPickle``, byte-string file I/O). Steps:

    1. Load the pickled vectorizer, feature selector and classifier from
       the model directory.
    2. Read the forbidden keywords, one per line, from ``keyword.txt`` in
       the current working directory.
    3. For every tab-separated input row whose first field contains
       ``label``: segment the last field with jieba, drop forbidden words
       and purely numeric tokens, vectorize, select features and predict.
    4. Rows predicted as ``'1'`` are written to the output file as
       ``<row>\\t1``.

    The tag and the raw prediction of every examined row are printed.
    """
    model_dir = u"D://workspace//python//classify_WeChat//baoxian//bxtfidf"
    inputpath = u"D://workspace//python//classify_WeChat//data//financeoutput1_final//financeoutput1_final.txt"
    outputpath = u"D://workspace//python//classify_WeChat//data//financeoutput1_final//baoxian.txt"
    forbidpath = u"keyword.txt"
    label = "保险"

    def _load(name):
        # NOTE: unpickling executes arbitrary code; these artifacts must come
        # from a trusted source.
        with open(os.path.join(model_dir, name), "rb") as fh:
            return cPickle.load(fh)

    vectorizer = _load("vectorizer.data")
    selector = _load("ch2.data")
    clf = _load("SGD_l2.model")

    # Only membership is ever tested, so a set is sufficient.
    with open(forbidpath, "rb") as f:
        forbidkword = set(line.strip() for line in f)

    # Matches tokens that start with digits and run to the end of the token.
    numeric_token = re.compile(r"\d+$")

    # Both files are closed even if segmentation or prediction raises.
    with open(outputpath, "wb") as outfile, open(inputpath, "rb") as f:
        for line in f:
            row = line.strip()
            splits = row.split("\t")
            tag = splits[0]
            if tag.find(label) == -1:
                continue
            print(tag)

            # jieba yields unicode tokens; encode them so they compare
            # against the byte-string keywords read above.
            tokens = (w.strip().encode("utf-8")
                      for w in jieba.cut(splits[-1], cut_all=False))
            seglist = [w for w in tokens
                       if w not in forbidkword and not numeric_token.match(w)]

            # The transformers expect a collection of documents, hence the
            # one-element list.
            X_test = selector.transform(vectorizer.transform([" ".join(seglist)]))
            pred = clf.predict(X_test)
            print(pred)

            lb = str(pred[0])
            if lb == '1':
                outfile.write(row + "\t" + lb + "\n")