class Sentiment:
    """Document-level sentiment classifier built on a naive-Bayes model
    plus a word segmenter.

    NOTE(review): relies on module-level names ``Bayes``, ``Seg`` and
    ``stop_words`` defined elsewhere in this project.
    """

    def __init__(self, seg_fname='seg.pickle'):
        """Create an untrained classifier and load the segmenter model.

        seg_fname -- path to the pickled segmenter model. Defaults to
        the previously hard-coded 'seg.pickle', so existing callers are
        unaffected.
        """
        self.classifier = Bayes()
        self.seg = Seg()
        self.seg.load(seg_fname)

    def save(self, fname):
        """Persist the trained Bayes model to *fname*."""
        self.classifier.save(fname)

    def load(self, fname):
        """Load a previously saved Bayes model from *fname*."""
        # Bayes.load returns the loaded model rather than mutating in
        # place, so the attribute must be rebound.
        self.classifier = self.classifier.load(fname)

    def handle(self, doc):
        """Segment *doc* into words and drop stop words."""
        words = self.seg.seg(doc)
        words = self.filter_stop(words)
        return words

    def train(self, neg_docs, pos_docs):
        """Train the classifier on negative and positive corpora.

        Each document is segmented/filtered via :meth:`handle` and
        labeled 'neg' or 'pos'.
        """
        datas = []
        for doc in neg_docs:
            datas.append([self.handle(doc), 'neg'])
        for doc in pos_docs:
            datas.append([self.handle(doc), 'pos'])
        self.classifier.train(datas)

    def classify(self, doc):
        """Return the probability that *doc* is positive (float in [0, 1])."""
        ret, prob = self.classifier.classify(self.handle(doc))
        if ret == 'pos':
            return prob
        return 1 - prob

    @staticmethod
    def filter_stop(words):
        """Remove stop words (module-level ``stop_words``) from *words*."""
        return [w for w in words if w not in stop_words]
# NOTE(review): flattened paste of a script that trains a Bayes model on a
# JSON message corpus (argv[1]), loads a JSON test corpus (argv[2]), and
# defines prob_class(string, clazz) -> probability that the word set of
# *string* belongs to *clazz*. The trailing main() is truncated mid-body
# here ("for u in URGENCIES.keys():" has no body in this chunk), so the
# code is left untouched rather than guessed at.
from bayes import Bayes, URGENCIES from sys import argv import json from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) f = open(argv[1], 'r') json_str = f.read() message_list = json.loads(json_str) bae = Bayes(message_list) bae.train() test_data = open(argv[2], 'r') test_data = test_data.read() test_data = json.loads(test_data) def prob_class(string, clazz): S = set(string.split()) fv = bae.gen_feature_vector(S) return bae.prob_class(fv, clazz) def main(): tests = test_data out = {} for test in tests: o = {} for u in URGENCIES.keys():
data = pda.read_csv("./iris.csv")
# Standardize all feature columns (everything except the label).
data_standard = preprocessing.scale(data.iloc[:, :-1])
# Split into train/test (80/20) to guard against overfitting.
# DataFrame.as_matrix() was removed in pandas 1.0 — to_numpy() is the
# drop-in replacement.
train_data, test_data, train_labels, test_labels = train_test_split(
    data_standard,
    data.to_numpy()[:, -1],
    test_size=0.2,
    random_state=int(time.time()))
# ---- Naive Bayes classification ----
print("---------------------------贝叶斯----------------------------------")
# time.clock() was removed in Python 3.8; perf_counter() is the
# recommended replacement for elapsed-time measurement.
start = time.perf_counter()
by = Bayes()
by.train(list(train_data), list(train_labels))
test_data_size = test_data.shape[0]
error_count = 0
for index, td in enumerate(list(test_data)):
    this_label = by.test(td)
    print("预测类别:{0},真实类别:{1}".format(this_label, test_labels[index]))
    if this_label != test_labels[index]:
        error_count += 1
end = time.perf_counter()
error_rate = (error_count / test_data_size) * 100
time_consuming = end - start
print("错误率为:{0:.2f}%".format(error_rate))
print("耗时:{0:.4f}s".format(time_consuming))
# ---- k-nearest-neighbor classification (continues past this chunk) ----
import pandas
from matplotlib import pyplot
from bayes import Bayes
from utils import generate_datasets

# Load the leaf dataset; the "species" column is the label, which is then
# dropped from the feature frame.
data = pandas.read_csv('./datasets/leaf.csv')
labels = data["species"]
data.drop(data.columns[-1], axis=1, inplace=True)
print(data.index)

# Sweep the training split from 60% to 90% in 5% steps for every dataset
# variant, recording accuracy for the final comparison plot.
percents = range(60, 91, 5)
for dataset in generate_datasets(data, labels):
    print('\n' + dataset.name)
    for training_percent in percents:
        classifier = Bayes(dataset.data, labels, training_percent)
        classifier.train()
        classifier.test()
        accuracy = classifier.get_accuracy()
        dataset.result.append(accuracy)
        print('Training percent: ' + str(training_percent)
              + '%, accuracy: ' + str(accuracy))
    pyplot.plot(percents, dataset.result, label=dataset.name)

pyplot.xlabel('Training percent')
pyplot.ylabel('Accuracy')
pyplot.legend()
pyplot.savefig('plot', dpi=200, bbox_inches='tight')
def train_model(ngrams_file, output_file):
    """Train a Bayes model on the n-grams in *ngrams_file* and write the
    serialized model to *output_file*.
    """
    classifier = Bayes()
    training_data = Ngrams._load_data(ngrams_file)
    classifier.train(training_data)
    classifier.serialize(output_file)
def main():
    """Train a Bayes sentiment classifier on the train reviews, evaluate
    it on the positive/negative test sets, print per-set counts, compute
    precision/recall/F1 for both classes, persist the stats, and return
    the trained classifier.
    """
    # Making list of .txt-files (per sentiment)
    print("\tLOADING FILES")
    path = Path('..').joinpath('Data')
    test_ = path.joinpath('test')
    train = path.joinpath('train')
    tp_reviews = txtToList(test_.joinpath('pos'))
    tn_reviews = txtToList(test_.joinpath("neg"))
    pos_reviews = txtToList(train.joinpath("pos"))
    neg_reviews = txtToList(train.joinpath("neg"))
    print("\tFILES LOADED")

    # Cleaning reviews (in place, so the per-set lists stay updated)
    reviews = [pos_reviews, neg_reviews, tp_reviews, tn_reviews]
    print("\tCLEANING REVIEWS")
    for list_ in reviews:
        for i, review in enumerate(list_):
            list_[i] = clean_text(review)

    # Joining the reviews into one string (per sentiment).
    # (The original wrapped these in identity comprehensions; join takes
    # the list directly.)
    pos_string = "".join(pos_reviews)
    neg_string = "".join(neg_reviews)

    # Counting the frequency of words (per sentiment and total)
    posCounter = Counter(pos_string.split())
    negCounter = Counter(neg_string.split())
    vocabCounter = Counter(pos_string.split() + neg_string.split())

    # Drop hapax legomena (terms seen exactly once) from each class —
    # iterate over a snapshot since we delete while looping.
    for term in list(posCounter):
        if posCounter[term] == 1:
            del posCounter[term]
    for term in list(negCounter):
        if negCounter[term] == 1:
            del negCounter[term]

    classifier = Bayes(vocab_counts=vocabCounter)
    classifier.train(posCounter, negCounter)

    # Evaluate: first the positive test set, then the negative one.
    testSets = [tp_reviews, tn_reviews]
    n_pos_tp, n_neg_tp = 0, 0
    n_pos_tn, n_neg_tn = 0, 0
    for i, testSet in enumerate(testSets):
        print("_" * 15 + "RESULTS" + "_" * 15)
        n_pos, n_neg = 0, 0
        for review in testSet:
            pos, neg = classifier.test(review)
            # Ties count as positive.
            if pos >= neg:
                n_pos += 1
            else:
                n_neg += 1
        if i == 0:
            print("Positive Testset: ")
            n_pos_tp, n_neg_tp = n_pos, n_neg
        else:
            print("Negative Testset: ")
            n_pos_tn, n_neg_tn = n_pos, n_neg
        print("Positive reviews: {}".format(n_pos))
        print("Negative reviews: {}".format(n_neg))

    # Precision/recall/F1 per class. len(tn_reviews) - n_neg_tn is the
    # false-positive count (negatives predicted positive), and vice versa.
    # NOTE(review): these divisions raise ZeroDivisionError when a test
    # set is empty or the classifier predicts no reviews of one class —
    # confirm whether callers guarantee non-degenerate inputs.
    pos_prec = n_pos_tp / (n_pos_tp + len(tn_reviews) - n_neg_tn)
    pos_rec = n_pos_tp / len(tp_reviews)
    pos_f1 = 2 * ((pos_prec * pos_rec) / (pos_prec + pos_rec))
    neg_prec = n_neg_tn / (n_neg_tn + len(tp_reviews) - n_pos_tp)
    neg_rec = n_neg_tn / len(tn_reviews)
    neg_f1 = 2 * ((neg_prec * neg_rec) / (neg_prec + neg_rec))

    scores = [pos_prec, pos_rec, pos_f1, neg_prec, neg_rec, neg_f1]
    save_stats(scores)
    print_stats(scores)
    return classifier