Exemplo n.º 1
0
 def test_single_email(self, content):
     """Classify an e-mail given as a string; return True when it scores as spam.

     ``content`` is split on newlines; every line is passed through
     ``clean_str`` and tokenised with ``jieba.cut`` (``cut_all=False``).
     For each token the values returned by ``get_spam_word_log_prob`` and
     ``get_ham_word_log_prob`` are added to two running totals.

     Returns:
         True when the spam total is greater than or equal to the ham
         total (so a tie is reported as spam), otherwise False.
     """
     spam_score = 0.0
     ham_score = 0.0
     for raw_line in content.split('\n'):
         tokens = jieba.cut(clean_str(raw_line), cut_all=False)
         for token in tokens:
             # The per-token scores are summed, not multiplied.
             spam_score += self.get_spam_word_log_prob(token)
             ham_score += self.get_ham_word_log_prob(token)
     return spam_score >= ham_score
Exemplo n.º 2
0
    def test_email(self, email):
        """Classify the e-mail stored at path ``email``; return True for spam.

        If ``self.flag`` is falsy, ``self._count()`` is called first. The
        file is then opened as UTF-8 text and read line by line; each line is
        passed through ``clean_str`` and tokenised with ``jieba.cut``
        (``cut_all=False``). For every token the values returned by
        ``get_spam_word_log_prob`` and ``get_ham_word_log_prob`` are added to
        two running totals.

        Returns:
            True when the spam total is greater than or equal to the ham
            total (a tie is reported as spam), otherwise False.
        """
        if not self.flag:
            self._count()

        spam_score = 0.0
        ham_score = 0.0
        with open(email, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                tokens = jieba.cut(clean_str(raw_line), cut_all=False)
                for token in tokens:
                    # Scores are in log space, so they are added up.
                    spam_score += self.get_spam_word_log_prob(token)
                    ham_score += self.get_ham_word_log_prob(token)

        # Spam -> True, anything else -> False.
        return spam_score >= ham_score
Exemplo n.º 3
0
    def _count(self):
        """Rebuild the spam/ham word-probability tables from the training files.

        Each path in ``self.train_file_list`` is opened as UTF-8 text, every
        line is passed through ``clean_str`` and tokenised with ``jieba.cut``
        (``cut_all=False``), and the file is reduced to its set of distinct
        tokens, so a token is counted at most once per file. ``is_spam``
        decides which of the two per-class counters receives that set.

        A file that raises ``UnicodeDecodeError`` while being read is skipped
        completely: it contributes to no counter. (Previously the class
        counter was bumped before the read, so a file failing mid-read left
        ``spam_count``/``ham_count`` out of step with ``total_count``.)

        Side effects:
            Resets and repopulates ``spam_word_prob``, ``ham_word_prob``,
            ``spam_count``, ``ham_count``, ``total_count``, ``spam_prob`` and
            ``ham_prob``; prints a progress line every 1000 processed files;
            sets ``self.flag`` to True; calls ``self.save_model`` with the
            path ``os.path.join(self.meta_path, 'model')``.

        Raises:
            ZeroDivisionError: if no training file was successfully read
                (``total_count`` stays 0 when the priors are computed).
        """
        # Start from a clean slate on every (re)training run.
        self.spam_word_prob = dict()
        self.ham_word_prob = dict()
        # Per-class and overall document counts.
        self.spam_count = 0
        self.ham_count = 0
        self.total_count = 0
        self.spam_prob = 0.0
        self.ham_prob = 0.0
        spam_counter = Counter()
        ham_counter = Counter()

        for path in self.train_file_list:
            # Read and tokenise the whole file BEFORE touching any counter,
            # so an undecodable file cannot leave the counts inconsistent.
            unique_word_set = set()
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    for line in f:
                        unique_word_set.update(
                            jieba.cut(clean_str(line), cut_all=False))
            except UnicodeDecodeError:
                continue

            # Each distinct token adds 1 to the document frequency of its class.
            if is_spam(path, self.spam_set):
                self.spam_count += 1
                spam_counter.update(unique_word_set)
            else:
                self.ham_count += 1
                ham_counter.update(unique_word_set)

            self.total_count += 1
            if self.total_count % 1000 == 0:
                print("{} 文件已处理!".format(self.total_count))

        # Handy when debugging: the tokens most typical of each class.
        # print(spam_counter.most_common(20))
        # print(ham_counter.most_common(20))
        assert (self.ham_count == self.total_count - self.spam_count)

        # Class priors. They are computed from the data rather than assumed
        # to be equal for spam and ham.
        self.ham_prob = float(self.ham_count) / float(self.total_count)
        self.spam_prob = float(self.spam_count) / float(self.total_count)

        # Turn document frequencies into probabilities with +1 smoothing in
        # the numerator so that no stored probability is 0.
        # NOTE(review): the denominator is 2 * count, not count + 2 as in
        # textbook add-one smoothing; left unchanged - confirm it matches the
        # default used for unseen words elsewhere in the class.
        # most_common() is kept so the dicts are filled in descending-count
        # order, exactly as before.
        spam_denominator = 2.0 * self.spam_count
        for word, doc_freq in spam_counter.most_common():
            self.spam_word_prob[word] = (1.0 + doc_freq) / spam_denominator
        ham_denominator = 2.0 * self.ham_count
        for word, doc_freq in ham_counter.most_common():
            self.ham_word_prob[word] = (1.0 + doc_freq) / ham_denominator

        # print(self.spam_word_prob.get('的'))
        # Mark the model as populated, then persist it.
        self.flag = True
        model_path = os.path.join(self.meta_path, 'model')
        self.save_model(model_path)