import os
import random
import pickle
import pyensae
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

# word_tokenize() needs the punkt tokenizer models
nltk.download("punkt", quiet=True)
# download the six preprocessed Enron-Spam datasets
for i in range(1, 7):
    pyensae.download_data(f"enron{i}.tar.gz",
                          url="http://www.aueb.gr/users/ion/data/enron-spam/preprocessed/",
                          whereTo="website/dataSources/enron")
pathToDatasets = "website/dataSources/enron"
# turn a list of words into a bag-of-words feature dictionary for the classifier
def createDictionary(words):
    dictionary = dict([(word, True) for word in words])
    return dictionary
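# e.g. createDictionary(["free", "prize", "now"]) returns {"free": True, "prize": True, "now": True},
# the presence-only feature set that NLTK's NaiveBayesClassifier expects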
ham_tab = []
spam_tab = []
# Read every file, split its text into words with word_tokenize() and build the feature dictionaries
for folders, subfolders, files in os.walk(pathToDatasets):
    label = os.path.split(folders)[1]
    if label in ("ham", "spam"):
        for file in files:
            try:
                with open(os.path.join(folders, file), encoding="latin-1") as nextFile:
                    data = nextFile.read()
                # break the string into words with the nltk tokenizer
                words = word_tokenize(data)
                if label == "ham":
                    ham_tab.append((createDictionary(words), "ham"))
                else:
                    spam_tab.append((createDictionary(words), "spam"))
            except OSError:
                # report unreadable files and move on
                print(file)
# shuffle so ham and spam are mixed, then split into 80% training and 20% test data
combined_tab = ham_tab + spam_tab
random.shuffle(combined_tab)
training_part = int(len(combined_tab) * .8)
training_set = combined_tab[:training_part]
test_set = combined_tab[training_part:]
# Train the Naive Bayes classifier
classifier = NaiveBayesClassifier.train(training_set)
# uncomment to report accuracy on the held-out test set:
# print(nltk.classify.util.accuracy(classifier, test_set))
# save the trained classifier to disk with pickle
with open('website/website/NB_classifier.pkl', 'wb') as file:
pickle.dump(classifier, file)
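
# Illustrative usage sketch: reload the saved classifier and score a single message.
# The sample text below is made up purely for demonstration.
with open('website/website/NB_classifier.pkl', 'rb') as savedFile:
    loaded_classifier = pickle.load(savedFile)
sample = "Congratulations, you have won a free prize, click the link to claim it"
print(loaded_classifier.classify(createDictionary(word_tokenize(sample))))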