def test1():
    """Train a NaiveBayes model on 'res/test1.txt' and score it on 'res/test1_check.txt'.

    Messages are tokenized with the regular expression ``\\w+``. All training
    tokens of each class are concatenated into one list per class and handed to
    ``NaiveBayes.load``. Every check message is then classified and the
    per-class number of correct/incorrect verdicts plus the accuracy percentage
    are printed.

    A truthy result from ``naive_bayes.is_positive`` is counted as a "ham"
    verdict: it is an error for a spam message and a hit for a ham message.
    """

    def _report(label, correct, incorrect):
        # Print the raw counts and the accuracy percentage for one class.
        print(label + ':', 'correct', correct, 'incorrect', incorrect)
        total = incorrect + correct
        if total == 0:
            # No check messages of this class: the ratio is undefined, so say
            # so instead of failing with ZeroDivisionError.
            print(label + ':', 'n/a (no test messages)')
            return
        print(label + ':', (correct / total) * 100, '%')

    tokenizer = RegexpTokenizer(r'\w+')

    # load_data receives two empty lists and is expected to fill them in place
    # (ham list first, spam list second).
    spams = []
    hams = []
    load_data(hams, spams, 'res/test1.txt')

    spam_words = []
    ham_words = []
    for spam in spams:
        spam_words.extend(tokenizer.tokenize(spam))
    for ham in hams:
        ham_words.extend(tokenizer.tokenize(ham))

    naive_bayes = NaiveBayes()
    naive_bayes.load(ham_words, spam_words)

    test_spams = []
    test_hams = []
    load_data(test_hams, test_spams, 'res/test1_check.txt')

    # Spam messages: a truthy (ham) verdict is a misclassification.
    spam_correct = 0
    spam_incorrect = 0
    for message in test_spams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            spam_incorrect += 1
        else:
            spam_correct += 1
    _report('spam', spam_correct, spam_incorrect)

    # Ham messages: a truthy (ham) verdict is the expected outcome.
    ham_correct = 0
    ham_incorrect = 0
    for message in test_hams:
        if naive_bayes.is_positive(tokenizer.tokenize(message)):
            ham_correct += 1
        else:
            ham_incorrect += 1
    _report('ham', ham_correct, ham_incorrect)
from classifier import NaiveBayes
from util import load_data

# Interactive SMS classifier: trains a NaiveBayes model on the SMS corpus and
# then labels every message typed by the user as "ham" or "spam".
# NOTE(review): RegexpTokenizer is used below but its import is not visible in
# this section -- confirm it is imported earlier in the file.
tokenizer = RegexpTokenizer(r'\w+')

# load_data receives two empty lists and is expected to fill them in place
# (ham list first, spam list second).
spams = []
hams = []
load_data(hams, spams, 'res/SMSSpamCollection.txt')

# Flatten every message of a class into one token list per class.
spam_words = []
ham_words = []
for spam in spams:
    spam_words += tokenizer.tokenize(spam)
for ham in hams:
    ham_words += tokenizer.tokenize(ham)

naive_bayes = NaiveBayes()
naive_bayes.load(ham_words, spam_words)

message = ""
while True:
    try:
        message = input("Enter your SMS:")
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave the loop quietly
        # instead of crashing with a traceback.
        break
    if message == "stop":
        # "stop" is the quit command, not an SMS -- exit before classifying it.
        break
    if naive_bayes.is_positive(tokenizer.tokenize(message)):
        print("ham")
    else:
        print("spam")
def test2(is_from_begginning=True, training_percent=70): tokenizer = RegexpTokenizer(r'\w+') data = get_data('res/SMSSpamCollection.txt') training_data_length = int((len(data) * training_percent) / 100) if is_from_begginning: training_data = data[:training_data_length] test_data_length = len(data) - training_data_length test_data = data[-test_data_length:] else: training_data = data[-training_data_length:] test_data_length = len(data) - training_data_length test_data = data[test_data_length:] training_hams = [] training_spams = [] divide_data(training_data, training_hams, training_spams) training_spam_words = [] training_ham_words = [] for ham in training_hams: training_ham_words += tokenizer.tokenize(ham) for spam in training_spams: training_spam_words += tokenizer.tokenize(spam) naive_bayes = NaiveBayes() naive_bayes.load(training_ham_words, training_spam_words) test_hams = [] test_spams = [] divide_data(test_data, test_hams, test_spams) spam_correct = 0 spam_incorrect = 0 for word in test_spams: result = naive_bayes.is_positive(tokenizer.tokenize(word)) if result: spam_incorrect += 1 else: spam_correct += 1 print('spam:', 'correct', spam_correct, 'incorrect', spam_incorrect) print('spam:', (spam_correct / (spam_incorrect + spam_correct)) * 100, '%') ham_correct = 0 ham_incorrect = 0 for word in test_hams: result = naive_bayes.is_positive(tokenizer.tokenize(word)) if result: ham_correct += 1 else: ham_incorrect += 1 print('ham:', 'correct', ham_correct, 'incorrect', ham_incorrect) print('ham:', (ham_correct / (ham_incorrect + ham_correct)) * 100, '%')
# 10-fold cross-validation: each fold is held out once as the test set while
# the remaining nine folds form the training set.
folds = np.array_split(data, 10)  # make 10 folds in the dataset
test_set = 0  # index of the fold currently held out for testing

for y in range(len(folds)):
    # Tie the held-out fold to the loop index. Nothing in this section advances
    # test_set, so without this every iteration would evaluate on fold 0.
    # NOTE(review): harmless if code past this section also increments test_set.
    test_set = y

    # Collect the non-test folds; the held-out fold supplies X_test / y_test.
    train_folds = []
    for x in range(len(folds)):
        if x == test_set:
            y_test = folds[x]['class'].values
            X_test = folds[x].drop(['class'], axis=1)
        else:
            train_folds.append(folds[x])

    # DataFrame.append was removed in pandas 2.0; a single concat builds the
    # same training frame without re-copying it once per fold.
    X_train = pd.concat(train_folds)
    y_train = X_train['class'].values
    X_train = X_train.drop(['class'], axis=1)

    nb = NaiveBayes()  # initialize Naive Bayes classifier
    nb.fit(X_train, y_train)  # train the model on the training folds
    y_pred = nb.predict(X_test)  # predict labels for the held-out fold

    # Error with respect to the zero-one loss; reported as accuracy (1 - error).
    error = nb.zero_one_loss_function(y_test, y_pred)
    printstr = "\nAccuracy of 0-1 loss for fold {0} ::: {1}".format(
        y, (1 - error))
    print_both(file, printstr)
    accuracy_list.append((1 - error))

    # Accuracy, precision and recall as returned by nb.confusion_matrix.
    acc, precision, recall = nb.confusion_matrix(y_test, y_pred)
    printstr = "\nCF for fold {0} ::: acc:: {1} :: precision:: {2} :: recall :: {3}".format(
        y, acc, precision, recall)
    print_both(file, printstr)
    CF_accuracy_list.append(acc)
    CF_precision_list.append(precision)