# hw1_Naive_Bayes.py
# coding: utf-8
import numpy as np
import pandas as pd
from scipy.stats import t
import matplotlib.pyplot as plt
# Directory holding the homework CSV files (machine-specific absolute path).
path = '/Users/jingweili/Documents/CU/Courses/EE6720/Homework/HW1/hw1_data_csv'
# Every file is read without a header row, so columns are labelled 0, 1, 2, ...
xtrain, ytrain, xtest, ytest = [
    pd.read_csv(path + suffix, header=None)
    for suffix in ('/Xtrain.csv', '/ytrain.csv', '/Xtest.csv', '/ytest.csv')
]
def parameters(x):
    """Return the Student-t posterior-predictive parameters for one feature.

    The update formulas below correspond to a Normal-Gamma prior with prior
    mean 0, prior pseudo-count 1 and Gamma shape/rate both equal to 1.

    Parameters
    ----------
    x : array-like
        Observations along the first axis (pandas Series, numpy array or
        list).  A 2-D input is processed column by column.

    Returns
    -------
    tuple
        ``(df, mu, scale)``: degrees of freedom, location and scale, in the
        order expected by ``scipy.stats.t``.
    """
    values = np.asarray(x, dtype=float)
    n = values.shape[0]
    if n == 0:
        # No data: the posterior equals the prior, so avoid the NaN that a
        # mean over zero elements would produce.
        mean = np.zeros(values.shape[1:])
        sum_sq_dev = np.zeros(values.shape[1:])
    else:
        mean = values.mean(axis=0)
        # Computed directly rather than as var * (n - 1): that form depends
        # on the ddof convention (pandas 1, numpy 0) and is NaN for n == 1.
        sum_sq_dev = ((values - mean) ** 2).sum(axis=0)
    mu = (n * mean) / (n + 1)
    kappa = n + 1
    alpha = 1 + n / 2.0
    beta = 1 + sum_sq_dev / 2.0 + (n * mean ** 2) / (2.0 * (n + 1))
    df = 2 * alpha
    scale = np.sqrt((beta * (1 + kappa)) / (alpha * kappa))
    return (df, mu, scale)
def naiveBayes(xtrain, ytrain, xtest):
    """Classify test rows with a two-class Bayesian naive Bayes model.

    The class prior is the smoothed frequency ``(1 + n_y) / (ntrain + 2)``.
    Each feature contributes the Student-t predictive density whose
    parameters are returned by ``parameters`` for that class and feature.

    Parameters
    ----------
    xtrain : pandas.DataFrame
        Training features, one row per sample.
    ytrain : pandas.DataFrame
        Training labels (0 or 1) stored in column ``0``, index-aligned with
        ``xtrain``.
    xtest : pandas.DataFrame
        Test features carrying the same column labels as ``xtrain``.

    Returns
    -------
    tuple
        ``(prob, pred)``.  ``prob`` is a DataFrame with columns ``[0, 1]``
        holding the normalised class posteriors, indexed ``0..ntest-1``.
        ``pred`` is a Series with the predicted label per test row; class 1
        is predicted unless class 0 is strictly more probable.
    """
    ntrain = xtrain.shape[0]
    ntest = xtest.shape[0]
    classes = [0, 1]
    log_joint = np.empty((ntest, len(classes)))
    for k, y in enumerate(classes):
        xtrain_y = xtrain[ytrain[0] == y]
        n_y = xtrain_y.shape[0]
        prob_y = (1.0 + n_y) / (ntrain + 2)
        log_post = np.full(ntest, np.log(prob_y))
        # Use every feature column present instead of a hard-coded count.
        for d in xtrain.columns:
            df, loc, scale = parameters(xtrain_y[d])
            # logpdf stays finite where log(pdf) would underflow to -inf.
            log_post += t.logpdf(np.asarray(xtest[d], dtype=float), df, loc, scale)
        log_joint[:, k] = log_post
    # Shift by the row maximum before exponentiating so that at least one
    # class has weight 1; otherwise both can underflow to 0 and give 0/0.
    log_joint -= log_joint.max(axis=1, keepdims=True)
    weights = np.exp(log_joint)
    prob = pd.DataFrame(weights / weights.sum(axis=1, keepdims=True),
                        index=np.arange(ntest), columns=classes)
    pred = pd.Series(np.where(prob[0] > prob[1], 0, 1), index=prob.index)
    return (prob, pred)
from sklearn.metrics import confusion_matrix

prob, pred = naiveBayes(xtrain, ytrain, xtest)
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual labels and columns the predicted ones.  Print the matrix, because a
# bare expression shows nothing when the file runs as a script.
print(confusion_matrix(ytest[0], pred))
# Misclassified digits
# With 0/1 labels the absolute difference is 1 exactly where the prediction
# disagrees with the true label.
mis_pred = (ytest[0] - pred).abs()
mis_list = mis_pred.index[mis_pred == 1].tolist()
for i in mis_list[:3]:
    # Posterior probability the model assigned to the (wrong) predicted class.
    mis_prob = prob.loc[i, pred[i]]
    print('No.{0} digit is misclassified as {1} with probability {2}'.format(i, pred[i], mis_prob))
# Reconstruct images
# NOTE(review): Q is presumably the projection basis that maps the reduced
# features back to 28 x 28 pixel space -- confirm against the data set notes.
Q = pd.read_csv(path + '/Q.csv', header=None)
x = np.asarray(xtest)[mis_list[0:3]]
x2 = x.dot(np.asarray(Q).T)
# Iterate over the rows actually present: there may be fewer than three
# misclassified digits, in which case indexing 0..2 would raise IndexError.
for row in x2:
    image = row.reshape((28, 28))
    plt.matshow(image, cmap=plt.cm.gray)
# Reconstruct most ambiguous images
# The closer the class-0 posterior is to 0.5, the less certain the prediction.
ambig = abs(prob[0] - 0.5).sort_values()
ambig_top3 = list(ambig.index[0:3])
print('The most ambiguous images are {}'.format(ambig_top3))
print('No.\tActual\tProb:4\tProb:9')
for i in ambig_top3:
    print('{0}\t{1}\t{2:1.4f}\t{3:1.4f}'.format(i, ytest[0][i], prob[0][i], prob[1][i]))
ambig_x = np.asarray(xtest)[ambig_top3]
ambig_x2 = ambig_x.dot(np.asarray(Q).T)
# Iterate over the rows actually present so that a test set with fewer than
# three samples does not raise IndexError.
for row in ambig_x2:
    image = row.reshape((28, 28))
    plt.matshow(image, cmap=plt.cm.gray)
# Without an explicit show() the figures are never displayed when this file
# is run as a plain script.
plt.show()