def test_prob_density_function(self):
    """Check ``prob_den_func`` against hand-computed density values.

    The expected numbers equal the Gaussian density evaluated with
    ``mean[i]`` / ``variance[i]`` for the index given as first argument
    and the sample value given as second argument.
    """
    print("test_prob_density_function")

    classifier = NaiveBayesClassifier()

    # One class: mean 1, variance 3, evaluated at x = 3.
    classifier.mean = [1]
    classifier.variance = [3]
    single_class_density = classifier.prob_den_func(0, 3)
    self.assertAlmostEqual(single_class_density, 0.11825507)

    # Two classes: query index 1 (mean 2, variance 1) at x = 0.1.
    classifier.mean = [1, 2]
    classifier.variance = [3, 1]
    second_class_density = classifier.prob_den_func(1, 0.1)
    self.assertAlmostEqual(second_class_density, 0.06561581)
def test_fit(self):
    """Check that ``fit`` returns None and fills in the class priors.

    Two samples with one label each are fitted; both entries of
    ``prior_probs`` are expected to move from 0 to 0.5.
    """
    print("test_fit")

    classifier = NaiveBayesClassifier()
    samples = np.array([[3, 4], [2, 3]])
    labels = [0, 1]
    classifier.prior_probs = np.zeros(2, dtype=np.float64)

    # Before fitting, the priors still hold the zeros assigned above.
    for class_idx in (0, 1):
        self.assertEqual(classifier.prior_probs[class_idx], 0)

    # fit() is expected to work in place and return nothing.
    self.assertEqual(classifier.fit(samples, labels), None)

    # One sample per class out of two samples -> prior of 0.5 each.
    for class_idx in (0, 1):
        self.assertEqual(classifier.prior_probs[class_idx], 0.5)
def __init__(self, file_location):
    """Build the vocabulary from a headline CSV and train the classifier.

    Every row of the CSV must provide a ``headline`` column and the six
    score columns ``' anger'``, ``' disgust'``, ``' fear'``, ``' joy'``,
    ``' sadness'`` and ``' surprise'`` (the leading space is part of the
    column name).  Headlines are lower-cased and tokenized with
    ``nltk.word_tokenize``.  Tokens that occur more than
    ``self.threshold`` times over the whole file become the feature set
    ``self.features``.

    Each headline produces six training samples, one per score column, of
    the form ``(feature_vector, label_index, score / 100)`` where
    ``feature_vector`` holds 1/0 per feature depending on whether the
    feature token occurs in the headline.  The samples are handed to
    ``NaiveBayesClassifier(6, len(self.features)).train``.

    Args:
        file_location: Path of the CSV file to read.
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter

    # Column order defines the label index (0 = anger ... 5 = surprise).
    score_columns = (
        ' anger', ' disgust', ' fear', ' joy', ' sadness', ' surprise',
    )

    self.features = set()
    word_freq = Counter()
    raw_data = []

    # The csv module needs a text-mode file opened with newline=''.
    # NOTE(review): assumes the data file is UTF-8 encoded -- confirm.
    with open(file_location, newline='', encoding='utf-8') as data:
        for row in csv.DictReader(data):
            h_tokens = nltk.word_tokenize(row['headline'].lower())
            word_freq.update(h_tokens)
            # A set makes the per-feature membership test below O(1);
            # the six samples of one headline share the same token set.
            token_set = frozenset(h_tokens)
            for label, column in enumerate(score_columns):
                raw_data.append((token_set, label, float(row[column]) / 100))

    # self.threshold is not assigned here; it is presumably a class
    # attribute defined outside this method -- verify.
    self.features.update(
        token for token, count in word_freq.items() if count > self.threshold
    )
    print("F-vec size: " + str(len(self.features)))

    # self.features is not modified while the vectors are built, so every
    # vector uses the same feature order.
    training_data = [
        ([1 if feature in tokens else 0 for feature in self.features],
         label,
         score)
        for tokens, label, score in raw_data
    ]

    self.classifier = NaiveBayesClassifier(6, len(self.features))
    self.classifier.train(training_data)
def test_pred(self):
    """Check ``predict`` on an unprepared and on a hand-prepared model.

    With only ``mean`` and ``variance`` assigned, ``predict`` is expected
    to return None for any input.  Once ``in_classes`` and ``prior_probs``
    are assigned as well, every one of the four inputs is expected to map
    to ``[0]``.
    """
    print("test_pred")

    classifier = NaiveBayesClassifier()
    classifier.mean = [1, 2]
    classifier.variance = [3, 1]

    # Model not trained: predict() must give None whatever the input is.
    untrained_inputs = (
        [1.4, 12, 3, 9],
        [2, 3, 4, 5],
        [1],
        [2, -3, 4, -5, -7, -7],
    )
    for sample in untrained_inputs:
        self.assertEqual(classifier.predict(sample), None)

    # Simulate a trained model by filling in its state by hand.
    classifier.mean = [1, 1]
    classifier.variance = [1, 5]
    classifier.in_classes = [1, 0]
    classifier.prior_probs = [0.3, 0.2]

    pred = classifier.predict([5, 3, 4, 7])
    for position in range(4):
        self.assertEqual(pred[position], [0])
from NaiveBayes import NaiveBayesClassifier  # classifier under evaluation
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt


def accuracy(y_actual, yhat):
    """Return the fraction of predictions that equal the actual labels.

    Computed as (number of matching labels) / (total number of samples).
    Both inputs are converted to flat NumPy arrays first, so plain Python
    lists and column-shaped ``(n, 1)`` predictions are compared element by
    element instead of being compared as whole objects or broadcast
    against each other.

    Args:
        y_actual: Ground-truth labels (array-like).
        yhat: Predicted labels (array-like) with the same number of
            elements as ``y_actual``.

    Returns:
        The accuracy as a float between 0 and 1.

    Raises:
        ValueError: If the inputs are empty or differ in element count.
    """
    actual = np.asarray(y_actual).ravel()
    predicted = np.asarray(yhat).ravel()
    if actual.size != predicted.size:
        raise ValueError(
            "y_actual and yhat must contain the same number of labels "
            "(got %d and %d)" % (actual.size, predicted.size)
        )
    if actual.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return np.sum(actual == predicted) / actual.size


# Synthetic two-class problem; random_state fixed so runs are repeatable.
X, y = datasets.make_classification(
    n_samples=1000, n_features=10, n_classes=2, random_state=123
)

# Split the dataset 80/20 into training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)

nb = NaiveBayesClassifier()
nb.fit(X_train, y_train)  # fit the model on the training portion
predictions = nb.predict(X_test)  # predict on the held-out test portion

print("Accuracy", accuracy(y_test, predictions))