# classify the test dataset

# Imports this excerpt relies on; nw (the vocabulary size) and df_train (the
# training counts) are defined in the training section of the same script.
import csv
import os

import numpy as np
import pandas as pd

import nb

# read the test dataset: first pass collects the labels
label_test_buf = list()
test_path = os.path.expanduser('./spam_classification/SPARSE.TEST')
with open(test_path, newline='') as test:
    reader = csv.reader(test, delimiter=' ')
    for row in reader:
        label_test_buf.append(int(row[0]))
label_test = np.asarray(label_test_buf, dtype=int)
nd_test = len(label_test)

# second pass fills the document-by-word count matrix
count_d_w_test = np.zeros([nd_test, nw], dtype=int)
with open(test_path, newline='') as test:
    reader = csv.reader(test, delimiter=' ')
    for d_id, row in enumerate(reader):
        # each remaining field is a "word_id:count" pair
        current_email = csv.reader(row[2:-1], delimiter=':')
        for pair in current_email:
            w_id = int(pair[0])
            count = int(pair[1])
            count_d_w_test[d_id, w_id - 1] = count
df_test = pd.DataFrame(count_d_w_test)

# train on the training counts, then evaluate on the test counts
nb_model = nb.train(df_train)
nb_predictions = nb.test(nb_model, df_test)
y = pd.Series(label_test)
nb_error = nb.compute_error(y, nb_predictions)
print('NB Test error: {}'.format(nb_error))

words = nb.k_most_indicative_words(5, nb_model.to_dataframe().iloc[:, :-1])
print('The {} most spam-worthy words are: {}'.format(len(words), words))
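# Note on the SPARSE.TEST layout implied by the slicing above (inferred from
# the parser, not from a documented spec): each line is space-separated as
#   <label> <field skipped by the parser> <word_id>:<count> ... <trailing space>
# row[0] is the label, row[2:-1] holds the word_id:count pairs, and the
# final -1 drops the empty token produced by the trailing space. A
# hypothetical line "1 9 42:2 107:1 " would set column 41 to 2 and column
# 106 to 1 for that document (word ids are 1-based, columns 0-based).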
import nb
import preprocess
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np

# train on train.tsv with no smoothing
train_id, train_sentence, train_class = preprocess.parse_tsv("train.tsv")
tokenized_sentence = [preprocess.better_tokenize(line) for line in train_sentence]
p_x, p_y, p_x_y = nb.train(tokenized_text=tokenized_sentence,
                           label=train_class,
                           smoothing_alpha=0)

# score the dev set
dev_id, dev_sentence, dev_class = preprocess.parse_tsv("dev.tsv")
tokenized_dev_sentence = [preprocess.better_tokenize(line) for line in dev_sentence]
y_pred = [nb.classify(doc, p_y, p_x_y) for doc in tokenized_dev_sentence]
f1 = f1_score(y_true=dev_class, y_pred=y_pred)
print("f1 score for dev.tsv is %f" % f1)

# sweep the smoothing strength and record the dev F1 for each value
smoothing_values = np.arange(0.0, 5, 0.05)
f1_scores = []
for value in smoothing_values:
    p_x, p_y, p_x_y = nb.train(tokenized_text=tokenized_sentence,
                               label=train_class,
                               smoothing_alpha=value)
    y_pred = [nb.classify(doc, p_y, p_x_y) for doc in tokenized_dev_sentence]
    f1_scores.append(f1_score(y_true=dev_class, y_pred=y_pred))
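# The pyplot/PdfPages imports above suggest the sweep was plotted and saved
# to a PDF; a minimal sketch of that step (the file name and axis labels are
# assumptions, not taken from the original script):
with PdfPages("smoothing_vs_f1.pdf") as pdf:
    plt.figure()
    plt.plot(smoothing_values, f1_scores)
    plt.xlabel("smoothing_alpha")
    plt.ylabel("dev F1")
    pdf.savefig()
    plt.close()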
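# For reference, a self-contained sketch of the add-alpha-smoothed multinomial
# Naive Bayes that nb.train/nb.classify are assumed to implement. Everything
# below (names, return values) is an assumption about the nb module, not its
# actual API; p_x (the word marginals nb.train also returns) is omitted
# because nb.classify is only ever called with p_y and p_x_y.
import math
from collections import Counter

def train_sketch(tokenized_text, label, smoothing_alpha):
    """Return class priors p_y and smoothed conditionals p_x_y."""
    n = len(label)
    p_y = {c: sum(1 for l in label if l == c) / n for c in set(label)}
    counts = {c: Counter() for c in p_y}
    for doc, c in zip(tokenized_text, label):
        counts[c].update(doc)
    vocab = {w for ctr in counts.values() for w in ctr}
    p_x_y = {}
    for c, ctr in counts.items():
        total = sum(ctr.values()) + smoothing_alpha * len(vocab)
        p_x_y[c] = {w: (ctr[w] + smoothing_alpha) / total for w in vocab}
    return p_y, p_x_y

def classify_sketch(doc, p_y, p_x_y):
    """Argmax over classes of log p(y) + sum_w log p(w | y)."""
    best_class, best_score = None, -math.inf
    for c, prior in p_y.items():
        score = math.log(prior)
        for w in doc:
            p = p_x_y[c].get(w)
            if p:  # skip out-of-vocabulary and zero-probability words
                score += math.log(p)
        if score > best_score:
            best_class, best_score = c, score
    return best_class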
        # tail of the theta_gt fixture (its opening lines precede this excerpt)
        'a': 0.25,
        'c': 0.16666666666666666,
        'd': 0.25,
        'b': 0.25
    }
}
scores_gt = [[(-5.903088603156555, 'class2'), (-4.852030263919617, 'class1')],
             [(-7.040921604977946, 'class2'), (-8.147867129923947, 'class1')]]
pi_gt = {'class2': 0.5, 'class1': 0.5}

vocab = vocabulary(train_data)
print('Vocabulary result: {}'.format(vocab_gt == vocab))

theta, pi = train(train_data, train_labels, vocab)
theta_success = True
for class_name in theta_gt:
    for word in theta[class_name]:
        if abs(theta[class_name][word] - theta_gt[class_name][word]) > 10**-5:
            theta_success = False
print('Theta result: {}'.format(theta_success))

pi_success = True
for class_name in pi_gt:
    if abs(pi_gt[class_name] - pi[class_name]) > 10**-5:
        pi_success = False
print('Pi result: {}'.format(pi_success))

scores = test(theta, pi, vocab, test_data)
scores_success = True
for score_gt, score_pred in zip(scores_gt, scores):
    # compare per-document (log_score, class) pairs with the same tolerance
    # used for theta and pi above
    for (s_gt, c_gt), (s_pred, c_pred) in zip(score_gt, score_pred):
        if c_gt != c_pred or abs(s_gt - s_pred) > 10**-5:
            scores_success = False
print('Scores result: {}'.format(scores_success))
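# The scores_gt fixture suggests test() returns, for each document, a list of
# (log_score, class) pairs; turning those into hard predictions is then just
# a max over the log scores (hypothetical usage, not part of the test above):
predicted_labels = [max(doc_scores)[1] for doc_scores in scores]
print('Predicted labels: {}'.format(predicted_labels))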