class CrossValidation(object):
    """Hold-out evaluation of a ``NaiveBayes`` classifier on labelled tweets.

    Tweets are fetched per category, shuffled, split once into a training
    portion and a test portion; the classifier is fitted on the former and
    asked to predict the latter.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Return a shuffled list of ``(tweet, category)`` pairs.

        user_ids: mapping from category to the IDs handed to ``get_tweets``.
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            # Label every tweet fetched for these IDs with their category.
            data.extend((tweet, category) for tweet in tweets)
        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Inverse of zip for a list of pairs; safe on an empty list."""
        if not pairs:
            # zip(*[]) yields nothing, which callers could not unpack into
            # two sequences, so hand back two empty tuples instead.
            return (), ()
        items, labels = zip(*pairs)
        return items, labels

    def split(self, data, test_percentage):
        """Split paired data into unzipped training and test portions.

        The last ``int(len(data) * test_percentage)`` pairs form the test
        portion, the rest the training portion. Each portion is returned as
        an ``(items, labels)`` pair of tuples; an empty portion is returned
        as ``((), ())`` so it can always be unpacked.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print each tweet underneath its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Fit the classifier on one part of the tweets and predict the rest.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
                         from tweets.
        verbose: when true, print every test tweet with its predicted label.

        Returns ``(results, answers)``: the classifier's predictions for the
        test tweets and their true categories.

        Raises ValueError if test_percentage is outside [0, 1].
        """
        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)
        return results, answers
class CrossValidation(object):
    """Hold-out evaluation of a ``NaiveBayes`` classifier on labelled tweets.

    Tweets are fetched per category, shuffled, split once into a training
    portion and a test portion; the classifier is fitted on the former and
    asked to predict the latter.
    """

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Return a shuffled list of ``(tweet, category)`` pairs.

        user_ids: mapping from category to the IDs handed to ``get_tweets``.
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            # Label every tweet fetched for these IDs with their category.
            data.extend((tweet, category) for tweet in tweets)
        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Inverse of zip for a list of pairs; safe on an empty list."""
        if not pairs:
            # zip(*[]) yields nothing, which callers could not unpack into
            # two sequences, so hand back two empty tuples instead.
            return (), ()
        items, labels = zip(*pairs)
        return items, labels

    def split(self, data, test_percentage):
        """Split paired data into unzipped training and test portions.

        The last ``int(len(data) * test_percentage)`` pairs form the test
        portion, the rest the training portion. Each portion is returned as
        an ``(items, labels)`` pair of tuples; an empty portion is returned
        as ``((), ())`` so it can always be unpacked.
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test

        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print each tweet underneath its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Fit the classifier on one part of the tweets and predict the rest.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted
                         from tweets.
        verbose: when true, print every test tweet with its predicted label.

        Returns ``(results, answers)``: the classifier's predictions for the
        test tweets and their true categories.

        Raises ValueError if test_percentage is outside [0, 1].
        """
        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)
        return results, answers
# Tally instances of the positive class versus every other class.
# NOTE(review): c_true, pos and neg are not bound in the visible code;
# presumably this runs inside a loop over the data set - confirm against
# the full file.
if c_true == pos_class:
    pos += 1
else:
    neg += 1

# One list per scoring scheme. Every entry is a tuple of
# (score, tie-break flag, true class, first NB prediction).
result_pos = []
result_neg = []
result_dif = []
result_nor = []
for (v, c_true) in d.test_set:
    """ prepare predictions for sorting in case of equal weight, positive instances come first store both true class and first NB prediction """
    c_pred_nb = prnb.predict(v)
    # wy accumulates the weight of pos_class, wn the weight of all others.
    wy = 0
    wn = 0
    for c in prnb.clssprobs:
        if c == pos_class:
            wy += prnb.value_weight(v, c)
        else:
            wn += prnb.value_weight(v, c)
    # Score: difference of the two weights.
    result_dif.append((wy - wn, c_true == pos_class, c_true, c_pred_nb[0]))
    # Score: positive weight only.
    result_pos.append((wy, c_true == pos_class, c_true, c_pred_nb[0]))
    # Score: negative weight only; note the tie-break flag is inverted here.
    result_neg.append((wn, c_true != pos_class, c_true, c_pred_nb[0]))
    # Score: positive weight as a share of the total.
    # NOTE(review): raises ZeroDivisionError if wy + wn == 0 - confirm
    # that value_weight can never return all zeros.
    result_nor.append(
        (wy / (wy + wn), c_true == pos_class, c_true, c_pred_nb[0]))

plt.plot([-0.001, 1.001], [-0.001, 1.001], color="orange")  # diagonal reference
trpos = 0
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the fraction of positions where y_pred equals y_true."""
    n_correct = np.sum(y_true == y_pred)
    return n_correct / len(y_true)


# Synthetic two-class problem with a fixed seed so runs are repeatable.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=123,
)

# Keep 20% of the samples aside for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
from data import Data
from naivebayes import NaiveBayes

# Data set to load; the commented-out lines are alternative choices.
filename = "datasets/weatherNominal.td"
## filename = "datasets/titanic.td"
## filename = "datasets/cmc.td"

# Load the data and print its report.
d = Data(filename)
d.report()

# Train the predictor on the loaded data and display it.
pr = NaiveBayes(d)
pr.train()
pr.show()

# Show, for every test instance, the first prediction next to the truth.
for v, c_true in d.test_set:
    ranked = pr.predict(v)
    c_pred = ranked[0]
    print(v, ":")
    print(" ", c_pred, "( true class:", c_true, ")")

## print(pr.predict(("Class:1st","Sex:Female","Age:Child")))
## print(pr.predict(("Class:Crew","Sex:Female","Age:Child")))
def accuracy(y_true, y_pred):
    """Return the fraction of positions where y_pred equals y_true."""
    n_matches = np.sum(y_true == y_pred)
    return n_matches / len(y_true)


# Three 2-D blobs generated with a fixed seed.
X, y = datasets.make_blobs(
    n_samples=1000,
    n_features=2,
    centers=3,
    cluster_std=1.0,
    center_box=(-10.0, 10.0),
    shuffle=True,
    random_state=123,
    return_centers=False,
)

# Note: test_size=0.9 leaves only 10% of the samples for fitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.9,
    random_state=1234,
)

clf = NaiveBayes()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy(y_test, y_pred))

# Colour every test point by its predicted label.
color_map = {0: 'r', 1: 'k', 2: 'g'}
label_color = [color_map[label] for label in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
filename = "ds/titanicTr.txt" d = Data(filename, 75) prmap = MaxAPost(d) prmap.train() prnb = NaiveBayes(d) prnb.train() cmmap = ConfMat(prmap.clsscnts) cmnb = ConfMat(prnb.clsscnts) comparing = set([]) for (v, c_true) in d.test_set: c_pred_map = tuple(prmap.predict(v)) c_pred_nb = tuple(prnb.predict(v)) if len(c_pred_map) and len(c_pred_nb): warn = (c_pred_map[0] != c_pred_nb[0]) cmmap.mat[c_pred_map[0], c_true] += 1 cmnb.mat[c_pred_nb[0], c_true] += 1 else: warn = True if warn: comparing.add((v, c_true, c_pred_map, c_pred_nb)) print for r in sorted(comparing): print r[0], ": true class ", r[1] print " MAP pred", r[2], print " NB pred", r[3]
# Number of examples requested for the validation split.
validation_set_size = 10000
train_set, validation_set = split_train_validation(dataset, validation_set_size)

# Cap on the number of rows used for training; it is also reused below to
# cap the number of test-file lines that are processed.
num_to_train_on = 10000000

# NOTE(review): both models are trained on `dataset`, not on `train_set`
# (which is unused in the visible code). If validation_set is drawn from
# dataset, the error count below is measured on data the models have
# already seen - confirm whether this is intended.
time_before("training adaboost")
ab.train_set(dataset[:num_to_train_on])
time_after("training adaboost")

time_before("training naive bayes")
nb.train_set(dataset[:num_to_train_on])
time_after("training naive bayes")

# Per-example flags: True when the prediction equals the first field.
kg_validations_nb = []
kg_validations_ab = []
for i in validation_set:
    # i[0] is compared against the prediction made from the remaining fields.
    kg_validations_nb.append(nb.predict(*i[1:]) == i[0])
    kg_validations_ab.append(ab.predict(*i[1:]) == i[0])

# print("Errors nb: %s " % sum([0 if i else 1 for i in kg_validations_nb]))
# Count the examples the adaboost model got wrong.
print("Errors ab: %s " % sum([0 if i else 1 for i in kg_validations_ab]))

# import pdb; pdb.set_trace()

predictions = []
print("creating predictions...")
with open(testset, "r") as testfile:
    data = testfile.read()
    # Drop the first line (presumably a header - verify against the file
    # format), then cap the number of lines.
    lines = data.split('\n')[1:][:num_to_train_on]
    for line in lines:
        if not line:
# Warming up textrocessing engines textPrep = TextPreprocessing(ngrams_n=4, ngrams_count=2000).load(dataset.stopWords) # Warming up the FeaturesMatrixBuilder featuresMatrix = FeaturesMatrixBuilder(dataset, textPrep) # Doing the actual training on the first 22000 reviews XTrain, yTrain = featuresMatrix.buildTrainingData() nb = NaiveBayes() nb.fit(XTrain[:22000, :], yTrain[:22000]) # Validating on the remaining y = yTrain[22000:] yhat = nb.predict(XTrain[22000:, :]) m = getConfusionMatrix(yTrain[22000:], yhat) print("\n=== RESULTS ===") endTimer(t) printResults(m) # Running the model on the test set print("Training using the whole training set this time") nb.fit(XTrain, yTrain) (XTest, ids) = featuresMatrix.buildTestData() yhat = nb.predict(XTest) with open("output/test.txt", "w") as f: t = timer() print("Writing the test results file") f.write("Id,Category\n") for i, yi in tqdm(enumerate(yhat)):
def simplified_bayes(train_letters, test_letters, prior):
    """Predict every test letter and return the predictions as one string.

    A ``NaiveBayes`` model is built from ``train_letters`` and ``prior``;
    each item of ``test_letters`` is passed to its ``predict`` method and
    the resulting strings are concatenated in order.
    """
    model = NaiveBayes(train_letters, prior)
    predicted = []
    for letter in test_letters:
        predicted.append(model.predict(letter))
    return ''.join(predicted)