def test_single_email(self, content):
    """Classify the raw text of one e-mail as spam or not spam.

    The text is split on newlines, each line is normalised with
    ``clean_str`` and tokenised with ``jieba.cut`` (accurate mode), and
    the per-word scores returned by ``get_spam_word_log_prob`` and
    ``get_ham_word_log_prob`` are summed separately for both classes.

    Args:
        content: Full text of the e-mail as a single string.

    Returns:
        True when the accumulated spam score is greater than or equal to
        the ham score (ties, including empty input, count as spam),
        otherwise False.
    """
    # Train lazily, exactly as test_email does, so that scoring never runs
    # against an untrained model.
    if not self.flag:
        self._count()

    spam_prob = 0.0
    ham_prob = 0.0
    for line in content.split('\n'):
        for x in jieba.cut(clean_str(line), cut_all=False):
            # Scores are in the log domain (per the helper names), so the
            # per-word contributions are added rather than multiplied.
            spam_prob += self.get_spam_word_log_prob(x)
            ham_prob += self.get_ham_word_log_prob(x)
    return spam_prob >= ham_prob
def test_email(self, email):
    """Classify the e-mail stored at path ``email`` as spam or not spam.

    If the model has not been built yet (``self.flag`` is falsy) it is
    trained first via ``self._count()``. The file is then read as UTF-8
    line by line; every line is cleaned with ``clean_str``, tokenised
    with ``jieba.cut`` and scored word by word.

    Args:
        email: Path of the e-mail file to classify.

    Returns:
        True if the file is judged to be spam (spam score >= ham score),
        False otherwise.
    """
    if not self.flag:
        self._count()

    spam_score = 0.0
    ham_score = 0.0
    with open(email, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            tokens = jieba.cut(clean_str(raw_line), cut_all=False)
            for token in tokens:
                # Probabilities were converted to logarithms, so they are
                # accumulated by addition.
                spam_score += self.get_spam_word_log_prob(token)
                ham_score += self.get_ham_word_log_prob(token)

    # Spam yields True, anything else yields False.
    return spam_score >= ham_score
def _count(self):
    """Train the model from ``self.train_file_list`` and save it.

    All previously learned statistics are discarded first. Each training
    file is read as UTF-8, cleaned with ``clean_str`` and tokenised with
    ``jieba.cut``; a word is counted at most once per file, so the
    counters hold the number of files of each class containing the word.
    Files that cannot be decoded as UTF-8 are skipped entirely and do not
    contribute to any count.

    Afterwards the method stores:
      * ``self.spam_prob`` / ``self.ham_prob`` - the class priors
        (class file count divided by total file count);
      * ``self.spam_word_prob`` / ``self.ham_word_prob`` - per-word
        probabilities smoothed as ``(1 + c) / (2 * class_file_count)``.

    Finally ``self.flag`` is set to True and the model is written through
    ``self.save_model`` to ``<self.meta_path>/model``.

    Raises:
        ZeroDivisionError: If no training file could be processed.
    """
    # Every retraining starts from scratch: drop the previous statistics.
    self.spam_word_prob = dict()
    self.ham_word_prob = dict()

    # Spam / non-spam file counters.
    self.spam_count = 0
    self.ham_count = 0
    self.total_count = 0

    self.spam_prob = 0.0
    self.ham_prob = 0.0

    spam_counter = Counter()
    ham_counter = Counter()

    for file in self.train_file_list:
        # Read and tokenise the whole file BEFORE touching any counter.
        # A decode error can then never leave spam_count / ham_count
        # incremented for a file that is subsequently skipped.
        try:
            with open(file, 'r', encoding='utf-8') as f:
                unique_word_set = set()
                for line in f:
                    unique_word_set.update(
                        jieba.cut(clean_str(line), cut_all=False))
        except UnicodeDecodeError:
            continue

        # update() adds the counts in place without the pruning pass over
        # the whole counter that ``+=`` performs; all counts here are
        # positive, so the resulting counters are identical.
        if is_spam(file, self.spam_set):
            self.spam_count += 1
            spam_counter.update(unique_word_set)
        else:
            self.ham_count += 1
            ham_counter.update(unique_word_set)

        self.total_count += 1
        if self.total_count % 1000 == 0:
            print("{} 文件已处理!".format(self.total_count))

    # Internal sanity check: every processed file belongs to one class.
    assert (self.ham_count == self.total_count - self.spam_count)

    # Class priors, needed for the total-probability formula. The priors
    # of spam and non-spam must not be assumed to be equal.
    self.ham_prob = float(self.ham_count) / float(self.total_count)
    self.spam_prob = float(self.spam_count) / float(self.total_count)

    # Turn counts into probabilities with +1 smoothing so that no stored
    # probability ends up being 0.
    for w, c in spam_counter.most_common():
        self.spam_word_prob[w] = (1.0 + float(c)) / (
            float(self.spam_count) * 2.0)
    for w, c in ham_counter.most_common():
        self.ham_word_prob[w] = (1.0 + float(c)) / (
            float(self.ham_count) * 2.0)

    # Mark the model as non-empty and save it.
    self.flag = True
    model_path = os.path.join(self.meta_path, 'model')
    self.save_model(model_path)