Example #1

import csv
import os

import numpy as np
import pandas as pd

import nb  # the project's Naive Bayes module

# nw (vocabulary size) and df_train (training counts) are assumed to have
# been built in the preceding training section, which is not shown here.
# classify the test dataset
# read the test dataset
label_test_buf = list()
test_path = os.path.expanduser('./spam_classification/SPARSE.TEST')
with open(test_path, newline='') as test:
    reader = csv.reader(test, delimiter=' ')
    for row in reader:
        label_test_buf.append(int(row[0]))
label_test = np.asarray(label_test_buf, dtype=int)

nd_test = len(label_test)
count_d_w_test = np.zeros([nd_test, nw], dtype=int)
with open(test_path, newline='') as test:
    reader = csv.reader(test, delimiter=' ')
    for d_id, row in enumerate(reader):
        # row[0] is the label and row[1] an unused field; the trailing space
        # leaves an empty row[-1], so the word_id:count pairs are row[2:-1]
        pairs = csv.reader(row[2:-1], delimiter=':')
        for pair in pairs:
            w_id = int(pair[0])
            count = int(pair[1])
            count_d_w_test[d_id, w_id - 1] = count  # word ids are 1-based

df_test = pd.DataFrame(count_d_w_test)
nb_model = nb.train(df_train)
nb_predictions = nb.test(nb_model, df_test)
y = pd.Series(label_test)
nb_error = nb.compute_error(y, nb_predictions)
print('NB Test error: {}'.format(nb_error))

words = nb.k_most_indicative_words(5, nb_model.to_dataframe().iloc[:, :-1])
print('The {} most spam-worthy words are: {}'.format(len(words), words))
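# The slicing above implies each SPARSE.TEST line is: a label, one further
# field, a run of word_id:count pairs, and a trailing space. A self-contained
# sketch of that decoding on a made-up sample line (the line contents and
# nw_demo are purely illustrative):

sample_row = '1 9 3:2 7:1 12:4 '.split(' ')

nw_demo = 15
counts = np.zeros(nw_demo, dtype=int)
for pair in sample_row[2:-1]:
    w_id, count = (int(x) for x in pair.split(':'))
    counts[w_id - 1] = count  # word ids in the file are 1-based

print(counts)  # 2 at index 2, 1 at index 6, 4 at index 11, zeros elsewhere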
Example #2
import nb
import preprocess
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import numpy as np

train_id, train_sentence, train_class = preprocess.parse_tsv("train.tsv")
tokenized_sentence = [
    preprocess.better_tokenize(line) for line in train_sentence
]
p_x, p_y, p_x_y = nb.train(tokenized_text=tokenized_sentence,
                           label=train_class,
                           smoothing_alpha=0)
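# nb.train's internals are not shown, but the smoothing_alpha argument
# suggests add-alpha (Laplace) smoothing of the per-class word probabilities.
# A minimal sketch of that estimate; smoothed_word_probs is a hypothetical
# name, not part of the nb module:
from collections import Counter

def smoothed_word_probs(class_docs, vocab, alpha):
    """Add-alpha estimate of P(word | class) over one class's documents."""
    counts = Counter(word for doc in class_docs for word in doc)
    denom = sum(counts.values()) + alpha * len(vocab)
    return {word: (counts[word] + alpha) / denom for word in vocab}

# With alpha=0 any word unseen in a class gets probability zero, which is
# exactly what the smoothing sweep below probes.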

dev_id, dev_sentence, dev_class = preprocess.parse_tsv("dev.tsv")
tokenized_dev_sentence = [
    preprocess.better_tokenize(line) for line in dev_sentence
]
y_pred = [nb.classify(doc, p_y, p_x_y) for doc in tokenized_dev_sentence]

f1 = f1_score(y_true=dev_class, y_pred=y_pred)
print("f1 score for dev.tsv is %f" % f1)

smoothing_values = np.arange(0.0, 5, 0.05)
f1_scores = []
for value in smoothing_values:
    p_x, p_y, p_x_y = nb.train(tokenized_text=tokenized_sentence,
                               label=train_class,
                               smoothing_alpha=value)
    y_pred = [nb.classify(doc, p_y, p_x_y) for doc in tokenized_dev_sentence]
    f1_scores.append(f1_score(y_true=dev_class, y_pred=y_pred))

# plot dev-set F1 against the smoothing value; the output filename here is
# illustrative
with PdfPages('smoothing_f1.pdf') as pdf:
    plt.plot(smoothing_values, f1_scores)
    plt.xlabel('smoothing alpha')
    plt.ylabel('F1 on dev.tsv')
    pdf.savefig()
    plt.close()
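# A natural follow-up to the sweep (not in the original snippet) is to read
# off the best-performing smoothing value:
best_alpha = smoothing_values[int(np.argmax(f1_scores))]
print('best smoothing_alpha on dev.tsv: %.2f' % best_alpha)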
Example #3

# Only the tail of this example's ground-truth theta_gt dict survives in the
# excerpt; train_data, train_labels, test_data and vocab_gt are defined in
# the part not shown.
        'a': 0.25,
        'c': 0.16666666666666666,
        'd': 0.25,
        'b': 0.25
    }
}

scores_gt = [[(-5.903088603156555, 'class2'), (-4.852030263919617, 'class1')],
             [(-7.040921604977946, 'class2'), (-8.147867129923947, 'class1')]]

pi_gt = {'class2': 0.5, 'class1': 0.5}

vocab = vocabulary(train_data)
print('Vocabulary result: {}'.format(vocab_gt == vocab))
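# vocabulary() comes from the module under test; given how it is used, it
# plausibly collects the distinct tokens of the training set. A minimal
# sketch under that assumption (train_data as a list of token lists);
# vocabulary_sketch is a hypothetical name:
def vocabulary_sketch(docs):
    return {word for doc in docs for word in doc}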

theta, pi = train(train_data, train_labels, vocab)
theta_success = True
# compare against every ground-truth entry so a missing word is also caught
for class_name in theta_gt:
    for word in theta_gt[class_name]:
        if abs(theta[class_name][word] - theta_gt[class_name][word]) > 1e-5:
            theta_success = False
print('Theta result: {}'.format(theta_success))

pi_success = True
for class_name in pi_gt:
    if abs(pi_gt[class_name] - pi[class_name]) > 1e-5:
        pi_success = False
print('Pi result: {}'.format(pi_success))
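# test() is also defined elsewhere. The shape of scores_gt, one
# (log-score, class) pair per class for every test document, suggests it
# computes log-space Naive Bayes scores, log pi[c] + sum(log theta[c][w]).
# A sketch under that assumption; score_documents is a hypothetical
# stand-in, not the actual test():
import math

def score_documents(theta, pi, vocab, docs):
    all_scores = []
    for doc in docs:
        pairs = []
        for c in pi:
            # assumes a smoothed theta, so every vocab word has p > 0
            log_score = math.log(pi[c]) + sum(
                math.log(theta[c][word]) for word in doc if word in vocab)
            pairs.append((log_score, c))
        all_scores.append(pairs)
    return all_scores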
scores = test(theta, pi, vocab, test_data)
scores_success = True
for score_gt, score_pred in zip(scores_gt, scores):