def classificationTest(
        train_set, train_label, test_set, test_label,
        lowFreqK=2,
        classifier=None):
    """Train a bag-of-words classifier and score it on a held-out set.

    The texts are lower-cased, stripped of ',' and '.', split on
    whitespace, filtered against a small stop-word list and then pruned of
    tokens whose total count does not exceed ``lowFreqK``.  A gensim
    dictionary built from the training texts only is used to turn every
    document into a bag-of-words vector.

    Args:
        train_set: Sequence of training texts (strings).
        train_label: Labels for ``train_set``.
        test_set: Sequence of test texts (strings).
        test_label: Labels for ``test_set``; the per-class metrics below
            are computed for labels 1 and 0.
        lowFreqK: Tokens are kept only if they occur strictly more than
            this many times.
        classifier: Estimator exposing ``fit`` and ``predict``.  When
            ``None`` a fresh ``MultinomialNB()`` is created for this call
            (e.g. ``RandomForestClassifier(n_estimators=100)`` is an
            alternative).

    Returns:
        A list ``[accuracy, precision(1), precision(0), recall(1),
        recall(0), f1(1), f1(0)]``.
    """
    # A default estimator in the signature would be built once at definition
    # time and shared by every call; build a new one per call instead.
    if classifier is None:
        classifier = MultinomialNB()

    # Single-argument print(...) gives the same output with the Python 2
    # print statement and the Python 3 print function.
    print('classification test processing ...')

    n_train = len(train_set)
    documents = list(train_set)
    documents.extend(test_set)

    # Remove stop words (and strip commas / full stops before splitting).
    stoplist = set('for a of the and to in'.split())
    tokenized = [
        [word
         for word in text.lower().replace(',', '').replace('.', '').split()
         if word not in stoplist]
        for text in documents
    ]

    # Remove low-frequency words.
    # NOTE(review): counts are taken over train + test together, so test
    # documents influence which tokens survive -- confirm this is intended.
    frequency = defaultdict(int)
    for tokens in tokenized:
        for token in tokens:
            frequency[token] += 1
    tokenized = [
        [token for token in tokens if frequency[token] > lowFreqK]
        for tokens in tokenized
    ]

    # Build the dictionary from the training documents only.
    dictionary = corpora.Dictionary(tokenized[:n_train])

    # How to turn the bag-of-words dicts into list-style vectors:
    # http://www.mamicode.com/info-detail-1518042.html
    # dict2matrix is defined elsewhere; its result is densified here with
    # .toarray() (presumably a scipy sparse matrix -- TODO confirm).
    num_terms = len(dictionary.keys())
    bows = [dictionary.doc2bow(tokens) for tokens in tokenized]
    all_features = dict2matrix(bows, num_terms).toarray()
    train_data_features = all_features[:n_train]
    test_data_features = all_features[n_train:]

    classifier = classifier.fit(train_data_features, train_label)
    result = classifier.predict(test_data_features)

    # The pos_label values must correspond to the actual class labels.
    return [
        accuracy_score(test_label, result),
        precision_score(test_label, result, pos_label=1),
        precision_score(test_label, result, pos_label=0),
        recall_score(test_label, result, pos_label=1),
        recall_score(test_label, result, pos_label=0),
        f1_score(test_label, result, pos_label=1),
        f1_score(test_label, result, pos_label=0),
    ]
def classify_test_21(train_set, train_label, test_set, test_label,
                     reverseVetorize=False, lowFreqK=2):
    """Train a MultinomialNB bag-of-words model and print its test results.

    The texts are lower-cased, stripped of ',' and '.', split on
    whitespace, filtered against a small stop-word list and then pruned of
    tokens whose total count does not exceed ``lowFreqK``.  A gensim
    dictionary built from the training texts only is used to vectorize
    every document.  The predictions, the accuracy and the confusion matrix
    (label order ``[1, 0]``) are printed; nothing is returned.

    Args:
        train_set: Sequence of training texts (strings).
        train_label: Labels for ``train_set``.
        test_set: Sequence of test texts (strings).
        test_label: Labels for ``test_set``.
        reverseVetorize: Not used by this function; kept so existing
            callers that pass it keep working.
        lowFreqK: Tokens are kept only if they occur strictly more than
            this many times (defaults to the previously hard-coded 2).
    """
    from sklearn.metrics import accuracy_score, confusion_matrix

    # Single-argument print(...) gives the same output with the Python 2
    # print statement and the Python 3 print function.
    print('final_sa_method:classify_test')

    n_train = len(train_set)
    documents = list(train_set)
    documents.extend(test_set)

    # Remove stop words (and strip commas / full stops before splitting).
    stoplist = set('for a of the and to in'.split())
    tokenized = [
        [word
         for word in text.lower().replace(',', '').replace('.', '').split()
         if word not in stoplist]
        for text in documents
    ]

    # Remove low-frequency words.
    # NOTE(review): counts are taken over train + test together, so test
    # documents influence which tokens survive -- confirm this is intended.
    frequency = defaultdict(int)
    for tokens in tokenized:
        for token in tokens:
            frequency[token] += 1
    tokenized = [
        [token for token in tokens if frequency[token] > lowFreqK]
        for tokens in tokenized
    ]

    # Build the dictionary from the training documents only.
    dictionary = corpora.Dictionary(tokenized[:n_train])

    # How to turn the bag-of-words dicts into list-style vectors:
    # http://www.mamicode.com/info-detail-1518042.html
    # dict2matrix is defined elsewhere; its result is densified here with
    # .toarray() (presumably a scipy sparse matrix -- TODO confirm).
    num_terms = len(dictionary.keys())
    bows = [dictionary.doc2bow(tokens) for tokens in tokenized]
    all_features = dict2matrix(bows, num_terms).toarray()
    train_data_features = all_features[:n_train]
    test_data_features = all_features[n_train:]

    # RandomForestClassifier(n_estimators=100) is a possible alternative.
    model = MultinomialNB()
    model = model.fit(train_data_features, train_label)

    print("Predicting test labels...")
    result = model.predict(test_data_features)

    # Two spaces on purpose: the former `print 'result: ', result` emitted
    # the literal's trailing space plus the print-statement separator.
    print('result:  %s' % (result,))
    print(accuracy_score(test_label, result))

    printlabels = [1, 0]  # must correspond to the actual class labels
    print(printlabels)
    print(confusion_matrix(test_label, result, labels=printlabels))