def __init__(self, threshold, stop_words='', words_bag_root='', mode='c'):
    """Set up tokenizer and paths.

    Empty-string arguments fall back to the module-level defaults
    (``stop_words_file`` / ``words_bag_path``).
    """
    # NOTE(review): fallbacks come from module-level globals not visible here.
    self.stop_words_file = stop_words if stop_words else stop_words_file
    self.words_bag_root = words_bag_root if words_bag_root else words_bag_path
    self.threshold = threshold
    self.jt = JiebaTokenizer(self.stop_words_file, mode=mode)
class JiebaTokenizerTestCase(unittest.TestCase):
    """Unit tests for JiebaTokenizer's token stream output."""

    def setUp(self):
        self.jt = JiebaTokenizer("../data/stopwords.txt")

    def testTokens(self):
        """Tokenizing a known Chinese paragraph yields the expected tokens."""
        in_text = (
            u"完整的单元测试很少只执行一个测试用例,"
            u"开发人员通常都需要编写多个测试用例才能"
            u"对某一软件功能进行比较完整的测试,这些"
            u"相关的测试用例称为一个测试用例集,在"
            u"PyUnit中是用TestSuite类来表示的。"
        )
        tokens_text = (
            u"完整/单元/测试/单元测试/只/执行/"
            u"一个/测试/试用/测试用例/开发/发人/"
            u"人员/开发人员/通常/需要/编写/多个/"
            u"测试/试用/测试用例/软件/功能/进行/"
            u"比较/完整/测试/相关/测试/试用/测试用例/"
            u"称为/一个/测试/试用/测试用例/集/PyUnit/"
            u"中是/TestSuite/类来/表示"
        )
        actual = u'/'.join(self.jt.tokens(in_text))
        self.assertEqual(tokens_text, actual, "Tokenization Results differ")
def setUp(self):
    """Create a fresh tokenizer before each test."""
    stopwords_path = "../data/stopwords.txt"
    self.jt = JiebaTokenizer(stopwords_path)
class TextHelper:
    """Helper for Chinese text similarity matching and classification.

    Wraps a JiebaTokenizer and offers two services:
      * ``compare_similarity`` — pairwise document similarity via simhash
        (hamming distance) or VSM (cosine distance);
      * ``init_bag``/``classify`` — bag-of-words logistic-regression
        classification over a MongoDB-style collection.
    """

    def __init__(self, threshold, stop_words='', words_bag_root='', mode='c'):
        """Configure tokenizer, paths and decision threshold.

        :param threshold: similarity decision threshold (compared as float)
        :param stop_words: stopword file path; falls back to the
            module-level ``stop_words_file`` when empty
        :param words_bag_root: bag-of-words storage root; falls back to the
            module-level ``words_bag_path`` when empty
        :param mode: tokenizer mode forwarded to JiebaTokenizer
        """
        if stop_words:
            self.stop_words_file = stop_words
        else:
            self.stop_words_file = stop_words_file
        if words_bag_root:
            self.words_bag_root = words_bag_root
        else:
            self.words_bag_root = words_bag_path
        self.threshold = threshold
        self.jt = JiebaTokenizer(self.stop_words_file, mode=mode)

    def compare_similarity(self, input_tpl, compare_tpl, way=2):
        """Compare two texts; return ``(is_similar, distance)``.

        :param way: 1 = simhash + hamming distance (similar when distance
            is *below* threshold); 2 = VSM + cosine distance (similar when
            distance is *above* threshold).
        :raises ValueError: if ``way`` is not 1 or 2 (the original code
            silently returned None here, breaking tuple-unpacking callers).
        """
        # Normalize input encoding to unicode (Python 2 code path).
        if not isinstance(input_tpl, unicode):
            input_tpl = input_tpl.decode('utf8')
        if not isinstance(compare_tpl, unicode):
            compare_tpl = compare_tpl.decode('utf8')

        doc_token_1 = self.jt.tokens(input_tpl)
        doc_token_2 = self.jt.tokens(compare_tpl)
        word_list = list(set(doc_token_1 + doc_token_2))

        # Build word -> index dict over the combined vocabulary.
        word_dict = {}
        for idx, ascword in enumerate(word_list):
            word_dict[ascword] = idx

        # Build nonzero feature vectors for both documents.
        fb = FeatureBuilder(word_dict)
        doc_feat_1 = fb.compute(doc_token_1)
        doc_feat_2 = fb.compute(doc_token_2)

        # Init simhash builder and wrap the features.
        smb = SimhashBuilder(word_list)
        doc_fl_1 = DocFeatLoader(smb, doc_feat_1)
        doc_fl_2 = DocFeatLoader(smb, doc_feat_2)

        threshold = float(self.threshold)  # convert once, not per branch
        if way == 1:
            # Simhash fingerprints: smaller hamming distance = more similar.
            dist = hamming_distance(doc_fl_1.fingerprint, doc_fl_2.fingerprint)
            return dist < threshold, dist
        elif way == 2:
            # VSM: larger cosine similarity = more similar.
            dist = cosine_distance_nonzero(doc_fl_1.feat_vec,
                                           doc_fl_2.feat_vec, norm=False)
            return dist > threshold, dist
        raise ValueError("way must be 1 (simhash) or 2 (cosine), got %r" % (way,))

    def init_bag(self, coll, del_old=True):
        """(Re)build the word-bag dictionary from a collection and train
        a logistic-regression model over it.

        :param coll: collection exposing ``find()`` yielding dicts with a
            ``'content'`` field
        :param del_old: when True, discard any previously stored bag first
        """
        self.words_bag = BagOfWords(self.jt, self.words_bag_root)
        if del_old:
            self.words_bag.del_old()
        # Rebuild the dictionary from every document's token set.
        dict_set = set()
        for data in coll.find():
            words = self.jt.tokens(data['content'])
            dict_set |= set(words)
        self.words_bag.build_dictionary(dict_set)
        train_feature, train_target = self.words_bag.transform_data(coll)
        logreg = linear_model.LogisticRegression(C=1e5)
        logreg.fit(train_feature, train_target)
        self.words_bag.save_model(logreg)

    def classify(self, text):
        """Classify *text* with the previously trained model.

        Requires ``init_bag`` to have been called on this instance
        (``self.words_bag`` must exist).
        """
        # NOTE(review): model path is hard-coded; init_bag saves via
        # words_bag.save_model — confirm both refer to 'lr.model'.
        lr = joblib.load('lr.model')
        # Load the bag-of-words dictionary.
        BOW = self.words_bag.load_dictionary()
        # Delegate the actual prediction.
        pred = TextClassify.find_classify(text, BOW, lr)
        return pred[0]