forked from silverzhaojr/spam-filter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
judgemail.py
76 lines (58 loc) · 1.7 KB
/
judgemail.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/python2
# -*- coding: utf-8 -*-
import sys
from emailparser import EmailParser
from splitwords import SplitWords
from trainmodule import TrainModule
class JudgeMail:
    '''
    Estimate the probability that a mail is spam.

    Each distinct word of the mail gets a spam probability via Bayes' rule
    from the statistics held by TrainModule; the probabilities of the
    top-scoring words are then combined into one value.
    '''

    # Number of highest-probability words that take part in the final score.
    TOP_WORD_COUNT = 15

    # Tokens that are dropped from the word list before scoring.
    # NOTE(review): the original list repeated the ASCII marks a second time
    # next to the CJK full stop and comma; those repeats were possibly
    # full-width punctuation before the file was normalised -- confirm
    # against upstream.  Only the distinct entries are kept here.
    PUNCTUATION = frozenset([
        ';', ':', ',', '.', '?', '!', '(', ')', ' ', '/',
        '。', '、',
    ])

    def __init__(self, mail_file):
        '''
        mail_file: the object handed to EmailParser when judge() runs
                   (main() passes an open file object).
        '''
        self.mail_file = mail_file
        self.train_module = TrainModule()
        # Prior probability of spam / normal mail.
        self.P_SPAM = 0.5
        self.P_NORMAL = 1 - self.P_SPAM
        # Spam probability assigned to a word without usable statistics.
        self.P_SPAM_WORD = 0.4
        # A mail scoring above this limit is recorded as spam.
        self.P_IS_SPAM_LIMIT = 0.9
        self.train_module.set_dic_word_freq()

    def _spam_given_word(self, p_w_n, p_w_s):
        '''
        Apply Bayes' rule to one word.

        p_w_n, p_w_s: the two values stored for the word in dic_word_freq
                      (presumably P(word | normal) and P(word | spam),
                      judging by the names used in this file).
        Returns P(spam | word), or P_SPAM_WORD when both inputs are zero and
        the rule would divide by zero.
        '''
        spam_part = p_w_s * self.P_SPAM
        normal_part = p_w_n * self.P_NORMAL
        total = spam_part + normal_part
        if total == 0:
            # No evidence either way: treat the word like an unseen one.
            return self.P_SPAM_WORD
        return spam_part / total

    @staticmethod
    def _combine_probabilities(probabilities, fallback):
        '''
        Combine per-word spam probabilities into a single one.

        Computes prod(p) / (prod(p) + prod(1 - p)).  When the list holds
        both a 1.0 and a 0.0 the two products are zero and the formula is
        undefined; fallback is returned in that case instead of raising
        ZeroDivisionError.  An empty list yields 0.5.
        '''
        p = 1.0
        rest_p = 1.0
        for prob in probabilities:
            p *= prob
            rest_p *= (1 - prob)
        total = p + rest_p
        if total == 0:
            return fallback
        return p / total

    def judge(self):
        '''
        Score the mail, record the verdict in the train module, and return
        the spam probability as a float.

        Side effects: prints every word used for the score together with its
        probability, and calls train_module.update() with the resulting
        label ('spam' or 'normal') and the mail's distinct words.
        '''
        mail_content = EmailParser(self.mail_file).get_mail_content()
        res_list = SplitWords(mail_content).get_word_list()
        # Distinct words only, with punctuation tokens filtered out.
        word_list = [w for w in set(res_list) if w not in self.PUNCTUATION]

        dic_word_freq = self.train_module.dic_word_freq
        word_freq = []
        for word in word_list:
            if word in dic_word_freq:
                p_w_n = dic_word_freq[word][0]
                p_w_s = dic_word_freq[word][1]
                word_freq.append((word, self._spam_given_word(p_w_n, p_w_s)))
            else:
                word_freq.append((word, self.P_SPAM_WORD))

        # NOTE(review): this keeps the words with the HIGHEST spam
        # probability, not the ones furthest from neutral; kept as in the
        # original -- confirm that this is intended.
        word_freq_most = sorted(
            word_freq, key=lambda x: x[1], reverse=True)[:self.TOP_WORD_COUNT]

        for word, prob in word_freq_most:
            # Same output as the Python 2 statement `print word, prob`.
            print('%s %s' % (word, prob))

        p_spam = self._combine_probabilities(
            [prob for _, prob in word_freq_most], self.P_SPAM)

        mail_type = 'spam' if p_spam > self.P_IS_SPAM_LIMIT else 'normal'
        self.train_module.update(mail_type, word_list)
        return p_spam
def main():
    '''
    Command-line entry point.

    Opens the file named by the first command-line argument, runs
    JudgeMail.judge() on it and prints the resulting spam probability.
    Exits with a usage message when no file argument is given.
    '''
    if len(sys.argv) < 2:
        sys.exit('usage: %s MAIL_FILE' % sys.argv[0])

    # `with` closes the file even when judge() raises.
    with open(sys.argv[1], 'r') as fp:
        p = JudgeMail(fp).judge()

    # Two spaces after '=' reproduce the output of the original
    # Python 2 statement `print 'SPAM: p = ', p`.
    print('SPAM: p =  %s' % p)


if __name__ == '__main__':
    main()