def test_doc2words():
    """Print the first positive document, then every word extracted from it.

    Only the first item produced by ``load_positive_contents()`` is consumed.
    """
    documents = load_positive_contents()
    classifier = Classify()
    first_doc = next(documents)
    print(first_doc)
    for token in classifier.doc2words(first_doc):
        # Under Python 2 the trailing comma suppresses the newline, so the
        # words are emitted on one line separated by spaces.
        print(token),
def test_gen_dict():
    """Build the dictionary from every positive document and dump it.

    Prints one ``tokenid docfreq word`` line for each entry of
    ``cfy.dict.dfs``, where ``word`` is looked up with ``cfy.dict.get``.
    """
    contents = load_positive_contents()
    cfy = Classify()
    # Build a real list so gen_dict receives a sequence regardless of
    # whether map() is eager (Python 2) or lazy (Python 3).
    word_lists = [cfy.doc2words(doc) for doc in contents]
    cfy.gen_dict(word_lists)
    for tokenid, docfreq in cfy.dict.dfs.items():
        word = cfy.dict.get(tokenid)
        # A single formatted string prints the same space-separated line
        # under Python 2 and Python 3; the bare print statement used
        # before is a SyntaxError on Python 3.
        print("%s %s %s" % (tokenid, docfreq, word))
def test_negative_contents():
    """Train on the positive documents and tally predictions on the negatives.

    Prints how many negative documents were predicted as -1 and how many
    as 1, in that order.
    """
    positives = load_positive_contents()
    classifier = Classify(500, 0.99, 0.2, 0.05)
    classifier.train(positives)

    negatives = load_negative_contents()
    # Coerce every prediction to a plain int so it can be counted by value.
    labels = [int(label) for label in classifier.predict(negatives)]
    print(labels.count(-1), labels.count(1))
def test_predict():
    """Train a 10-feature classifier and print predictions for 10 negatives.

    The classifier is trained on all positive documents; the first ten
    items of ``load_negative_contents()`` are then classified.
    """
    positives = load_positive_contents()
    classifier = Classify(feature_num=10)
    classifier.train(positives)

    negatives = load_negative_contents()
    sample = []
    for _ in range(10):
        sample.append(next(negatives))

    predictions = classifier.predict(sample)
    print(predictions)
def test_doc2vector():
    """Print the vectors of the first ten positive documents.

    The dictionary is generated from all positive documents first; the
    documents are then loaded again for a fresh pass before vectorising.
    """
    corpus = load_positive_contents()
    classifier = Classify()
    tokenised = [classifier.doc2words(doc) for doc in corpus]
    classifier.gen_dict(tokenised)

    corpus = load_positive_contents()
    vectors = [classifier.doc2vector(next(corpus)) for _ in range(10)]
    print(vectors)
def load_data(self):
    """Tokenise both corpora and build the train/test split.

    Sets:
        self.train_data: word lists of the first 80% of the positive
            documents.
        self.test_data: word lists of every negative document followed
            by the remaining positive documents.
        self.test_y: expected labels aligned with ``self.test_data``
            (-1 for each negative document, 1 for each positive one).
    """
    temp_cfy = Classify()
    temp_cfy.add_jieba_dict(self.words)
    p_wls = [temp_cfy.doc2words(c) for c in load_positive_contents()]
    n_wls = [temp_cfy.doc2words(c) for c in load_negative_contents()]

    # Floor division keeps the split index an int on Python 2 and 3 alike.
    n = len(p_wls) * 80 // 100

    # Train only on the first 80%. Previously the whole positive set was
    # used for training, so the positives placed in the test set below had
    # already been seen by the model and the scores were inflated.
    self.train_data = p_wls[:n]
    held_out = p_wls[n:]

    self.test_data = n_wls + held_out
    self.test_y = [-1] * len(n_wls) + [1] * len(held_out)
def test(self, params):
    """Train one classifier configuration and score it on the test set.

    Args:
        params: sequence of four values, unpacked in order and passed
            positionally to ``Classify``.

    Returns:
        Tuple ``(precision, recall, f1)`` computed for label 1 against
        ``self.test_y``. The third value is ``5*P*R / (4*P + R)``. All
        three are 0 when there are no true positives.
    """
    fnum, na, nu, gs = params
    classifier = Classify(fnum, na, nu, gs)
    classifier.add_jieba_dict(self.words)
    classifier.use_dict(self.words)
    classifier.train_by_wls(self.train_data)
    predicted = classifier.predict_by_wls(self.test_data)

    # Pair each integer prediction with its expected label.
    outcomes = list(zip([int(label) for label in predicted], self.test_y))
    tp = outcomes.count((1, 1))
    fp = outcomes.count((1, -1))
    fn = outcomes.count((-1, 1))

    if tp:
        precision = float(tp) / (tp + fp)
        recall = float(tp) / (tp + fn)
        f1 = 5 * precision * recall / (4 * precision + recall)
    else:
        # Without true positives every ratio is reported as 0, which also
        # avoids dividing by zero.
        precision = recall = f1 = 0

    print(fnum, na, nu, gs, precision, recall, f1)
    return (precision, recall, f1)
def test_gen_matrix():
    """Print the matrix generated from the first 20 positive documents.

    The dictionary is generated from all positive documents first; the
    documents are then loaded again and the first 20 are tokenised and
    passed to ``gen_matrix``. The result is printed via ``toarray()``
    (presumably a sparse matrix -- confirm against ``Classify.gen_matrix``).
    """
    contents = load_positive_contents()
    cfy = Classify()
    # Build a real list so gen_dict receives a sequence regardless of
    # whether map() is eager (Python 2) or lazy (Python 3).
    word_lists = [cfy.doc2words(doc) for doc in contents]
    cfy.gen_dict(word_lists)

    contents = load_positive_contents()
    wls = [cfy.doc2words(next(contents)) for _ in range(20)]
    matrix = cfy.gen_matrix(wls)
    # Single-argument print(...) behaves the same on Python 2 and 3; the
    # bare print statement used before is a SyntaxError on Python 3.
    print(matrix.toarray())
def test_save_load():
    """Check that a saved and reloaded classifier predicts the same labels.

    Trains on the positive documents, predicts the first ten negative
    documents, saves the classifier to a ``save`` path next to this file,
    reloads it with ``load_cfy`` and predicts again. A message is printed
    for every position where the two predictions differ.
    """
    positives = load_positive_contents()
    classifier = Classify(500, 0.99, 0.2, 0.05)
    classifier.train(positives)

    negatives = load_negative_contents()
    sample = [next(negatives) for _ in range(10)]
    before = classifier.predict(sample)

    here = os.path.dirname(os.path.realpath(__file__))
    save_path = os.path.join(here, 'save')
    classifier.save(save_path)

    restored = load_cfy(save_path)
    after = restored.predict(sample)

    for idx in range(10):
        if before[idx] != after[idx]:
            print("save load error, got different svm")