Пример #1
0
def test1():
    """Train a NaiveBayes model on 'res/test1.txt' and report its accuracy.

    The training messages are tokenized with a ``\\w+`` regexp tokenizer and
    the resulting ham/spam word lists are handed to ``NaiveBayes.load``.
    The messages from 'res/test1_check.txt' are then classified one by one
    and the number (and percentage) of correct answers is printed separately
    for spam and for ham.

    A positive answer from ``is_positive`` is counted as "ham" here, so a
    positive answer on a spam message is an error and vice versa.
    """
    tokenizer = RegexpTokenizer(r'\w+')

    # load_data is given empty lists and its return value is ignored, so it
    # is expected to fill the ham/spam lists in place.
    spams = []
    hams = []
    load_data(hams, spams, 'res/test1.txt')

    spam_words = []
    ham_words = []
    for spam in spams:
        spam_words.extend(tokenizer.tokenize(spam))
    for ham in hams:
        ham_words.extend(tokenizer.tokenize(ham))

    naive_bayes = NaiveBayes()
    naive_bayes.load(ham_words, spam_words)

    test_spams = []
    test_hams = []
    load_data(test_hams, test_spams, 'res/test1_check.txt')

    # --- spam messages: a positive (ham) verdict is a misclassification ---
    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1

    print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect)
    spam_total = spam_incorrect + spam_correct
    if spam_total:
        print('spam:', (spam_correct / spam_total) * 100, '%')
    else:
        # Avoid ZeroDivisionError when the check file contains no spam.
        print('spam:', 'no test messages')

    # --- ham messages: a positive (ham) verdict is the right answer ---
    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1

    print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect)
    ham_total = ham_incorrect + ham_correct
    if ham_total:
        print('ham:', (ham_correct / ham_total) * 100, '%')
    else:
        # Avoid ZeroDivisionError when the check file contains no ham.
        print('ham:', 'no test messages')
Пример #2
0
def test2(is_from_begginning=True, training_percent=70):
    """Split the SMS collection into training/test parts and report accuracy.

    Args:
        is_from_begginning: When true, the first ``training_percent`` percent
            of the records are used for training and the remainder for
            testing. When false, the *last* ``training_percent`` percent are
            used for training and the leading remainder for testing.
        training_percent: Share of the data set (0-100) used for training.
            Values outside that range are clamped to the data set size.

    The training messages are tokenized with a ``\\w+`` regexp tokenizer and
    loaded into a ``NaiveBayes`` model; every test message is then classified
    and the number (and percentage) of correct answers is printed separately
    for spam and for ham. A positive answer from ``is_positive`` is counted
    as "ham".
    """
    tokenizer = RegexpTokenizer(r'\w+')

    data = get_data('res/SMSSpamCollection.txt')

    training_data_length = int((len(data) * training_percent) / 100)
    # Clamp so that an out-of-range percentage cannot produce a negative or
    # oversized split index.
    training_data_length = max(0, min(len(data), training_data_length))

    # Use one explicit split index for both halves. The previous negative
    # slices broke on an empty half (data[-0:] is the whole list), and in the
    # "from the end" mode the test slice was identical to the training slice,
    # so the model was evaluated on the data it had been trained on.
    if is_from_begginning:
        split_index = training_data_length
        training_data = data[:split_index]
        test_data = data[split_index:]
    else:
        split_index = len(data) - training_data_length
        test_data = data[:split_index]
        training_data = data[split_index:]

    # divide_data is given empty lists and its return value is ignored, so it
    # is expected to fill the ham/spam lists in place.
    training_hams = []
    training_spams = []
    divide_data(training_data, training_hams, training_spams)

    training_spam_words = []
    training_ham_words = []
    for ham in training_hams:
        training_ham_words.extend(tokenizer.tokenize(ham))
    for spam in training_spams:
        training_spam_words.extend(tokenizer.tokenize(spam))

    naive_bayes = NaiveBayes()
    naive_bayes.load(training_ham_words, training_spam_words)

    test_hams = []
    test_spams = []
    divide_data(test_data, test_hams, test_spams)

    # --- spam messages: a positive (ham) verdict is a misclassification ---
    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1

    print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect)
    spam_total = spam_incorrect + spam_correct
    if spam_total:
        print('spam:', (spam_correct / spam_total) * 100, '%')
    else:
        # Avoid ZeroDivisionError when the test part contains no spam.
        print('spam:', 'no test messages')

    # --- ham messages: a positive (ham) verdict is the right answer ---
    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1

    print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect)
    ham_total = ham_incorrect + ham_correct
    if ham_total:
        print('ham:', (ham_correct / ham_total) * 100, '%')
    else:
        # Avoid ZeroDivisionError when the test part contains no ham.
        print('ham:', 'no test messages')
Пример #3
0
from classifier import NaiveBayes
from util import load_data

# Interactive spam/ham classifier: trains NaiveBayes on the whole SMS
# collection, then classifies messages typed by the user until "stop".
# NOTE(review): RegexpTokenizer is not imported in the visible part of this
# file - presumably nltk.tokenize.RegexpTokenizer; confirm the import exists.
tokenizer = RegexpTokenizer(r'\w+')

# load_data is given empty lists and its return value is ignored, so it is
# expected to fill the ham/spam lists in place.
spams = []
hams = []
load_data(hams, spams, 'res/SMSSpamCollection.txt')

spam_words = []
ham_words = []
for spam in spams:
    spam_words.extend(tokenizer.tokenize(spam))
for ham in hams:
    ham_words.extend(tokenizer.tokenize(ham))

naive_bayes = NaiveBayes()
naive_bayes.load(ham_words, spam_words)

message = ""

while True:
    message = input("Enter your SMS:")
    # Check the sentinel before classifying, so that the "stop" command
    # itself is not reported as a ham/spam message.
    if message == "stop":
        break
    # A positive answer from the classifier means "ham".
    if naive_bayes.is_positive(tokenizer.tokenize(message)):
        print("ham")
    else:
        print("spam")