def computeNaiveBayes(args, dict_algorithms):
    """Run the naive Bayes model and record its result.

    :param args: options object; ``args.debug`` enables progress output and
        the whole object is handed to the ``NaiveBayes`` constructor.
    :param dict_algorithms: mapping that receives the value returned by
        ``compute()`` under the key ``"naive_bayes"``.
    :return: None
    """
    if args.debug:
        # No newline: the trailing "ok!" completes the same output line.
        print("Running naive bayes...", end='')

    dict_algorithms["naive_bayes"] = NaiveBayes(args).compute()

    if args.debug:
        print("ok!")
def _populate(self, tweets):
    """
    :param tweets: A python dictionary containing trends as keys and
        list of tweets as values against each trend.
    :return: None

    This is a private method used by the constructor to populate the
    inverted index object. For every trend it records the trend name,
    classifies the concatenated tweet texts, and files each tweet under
    the posting list of its author's screen name.
    """
    # The classifier is created lazily and then reused for every trend, so
    # the stored model is read from the database at most once per call
    # (and not at all when `tweets` is empty).
    # NOTE(review): assumes classify() does not depend on a freshly loaded
    # model for each call - confirm against NaiveBayes.
    model = None

    for trendName in tweets:
        trendTweets = tweets[trendName]
        self.trends.append(trendName)
        self.totalTweets += len(trendTweets)

        # classify trend
        if model is None:
            model = NaiveBayes()
            model.loadModelFromDB()
        tweetsDoc = " ".join([tweet.text for tweet in trendTweets])
        self.categories.append(model.classify(tweetsDoc))

        # Position of this trend in self.trends; looked up once per trend
        # instead of once per tweet.
        trendIndex = self.trends.index(trendName)

        for tweet in trendTweets:
            handle = tweet.user.screen_name
            if handle not in self.twitterHandles:
                # First tweet seen for this handle: open a new posting list.
                self.twitterHandles.append(handle)
                self.indexLists.append([(trendIndex, tweet)])
            else:
                posts = self.indexLists[self.twitterHandles.index(handle)]
                posts.append((trendIndex, tweet))

    self.logger.debug(
        'Created and populated Inverted Index: Trends-{}, Tweets-{}'.
        format(len(self.trends), self.totalTweets))
def evaluate(trainfile, testfile):
    """Train a naive Bayes classifier and print its accuracy on a test file.

    Both files are read as UTF-8 text, one document per line. Each line is
    split on whitespace; for test lines the first token is the correct
    category and the remaining tokens are the document's words. Blank lines
    are ignored.

    :param trainfile: path of the training data file.
    :param testfile: path of the test data file.
    :return: None; the trained classifier and the accuracy are printed.
    :raises ZeroDivisionError: if the test file contains no documents.
    """
    # Load the training data.
    trainData = []
    with codecs.open(trainfile, "r", "utf-8") as fp:
        for line in fp:
            temp = line.split()
            if temp:  # skip blank lines
                trainData.append(temp)

    # Train the naive Bayes classifier.
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)

    # Evaluate on the test data.
    hit = 0
    numTest = 0
    with codecs.open(testfile, "r", "utf-8") as fp:
        for line in fp:
            temp = line.split()
            if not temp:
                # A blank line has no category token; indexing it would crash.
                continue
            correct = temp[0]   # correct category
            words = temp[1:]    # document: collection of words
            predict = nb.classify(words)  # predicted category
            if correct == predict:
                hit += 1        # prediction matches the correct answer
            numTest += 1

    # Single-argument print with %s prints the same text under Python 2's
    # print statement and Python 3's print function.
    print("accuracy: %s" % (float(hit) / float(numTest)))
def main(args):
    """
    HamOrSpam entrypoint.

    Trains a two-class ('ham'/'spam') classifier from a tab-separated
    training file (``<type>\\t<message>`` per line) and prints the predicted
    class of every non-empty line of the test file.

    :param args: argv-style sequence ``[script, train_path, test_path]``.
    :return: None. Exits with status -1 when ``args`` does not hold exactly
        three items.
    :raises ValueError: if a non-empty training line contains no tab.
    """
    # Get arguments
    try:
        _script, train, test = args
    except ValueError:
        print('usage: python hamorspam.py train.txt test.txt')
        sys.exit(-1)

    classifier = NaiveBayes(['ham', 'spam'])

    # Train classifier; `with` closes the file even if teaching fails.
    with open(train, 'r') as train_file:
        for line in train_file:
            # Discard empty lines
            if not line.strip():
                continue
            typ, message = line.split('\t', 1)
            classifier.teach(typ, message)

    # Query classifier
    with open(test, 'r') as test_file:
        for line in test_file:
            # Discard empty lines
            if not line.strip():
                continue
            print(classifier.classify(line))
def crossValidation(data, N=num, randomize=False):
    """Return the mean accuracy of an N-fold cross validation.

    Item ``i`` of ``data`` belongs to fold ``i % N``. For each fold a new
    ``NaiveBayes`` is trained on the other folds and scored on the held-out
    one. In every item the first element is the correct category and the
    remaining elements are the words.

    :param data: sequence of ``[category, word, word, ...]`` items.
    :param N: number of folds.
    :param randomize: shuffle the items before folding. The shuffle works on
        a copy, so the caller's sequence is left untouched.
    :return: average of the per-fold accuracies.
    :raises ZeroDivisionError: if a fold is empty or ``N`` is 0.
    """
    if randomize:
        from random import shuffle
        data = list(data)  # copy: do not reorder the caller's list
        shuffle(data)

    # Cross Validation
    accuracyList = []
    for n in range(N):
        # split train and test data
        trainData = [d for i, d in enumerate(data) if i % N != n]
        testData = [d for i, d in enumerate(data) if i % N == n]

        # train data
        nb = NaiveBayes()
        nb.train(trainData)

        # accuracy of test data
        # NOTE(review): the method called here is `classifier`, not
        # `classify` - confirm this is the intended NaiveBayes API.
        hit = sum(1 for d in testData if d[0] == nb.classifier(d[1:]))
        accuracyList.append(float(hit) / float(len(testData)))

    return sum(accuracyList) / float(N)
def cross_validate(folds, method): if folds < 2: print 'Must have at least 2 folds.. evaluating 2-fold cross validation' folds = 2 test_size = 100/folds training_size = 100 - test_size songs_by_class = split_by_class() sentiment_accuracy_sum = 0.0 emotion_accuracy_sum = 0.0 for f in range(0,folds): test_set = songs_by_class['+'][int(test_size*f):int(test_size+test_size*f)] + songs_by_class['0'][int(test_size*f):int(test_size+test_size*f)] +songs_by_class['-'][int(test_size*f):int(test_size+test_size*f)] training_set = songs_by_class['+'][int(test_size+test_size*f):] + songs_by_class['+'][:int(test_size*f)] + songs_by_class['0'][int(test_size+test_size*f):] + songs_by_class['0'][:int(test_size*f)] + songs_by_class['-'][int(test_size+test_size*f):] + songs_by_class['-'][:int(test_size*f)] if method == 'nb': nb = NaiveBayes() nb.train_model(training_set) sentiment_accuracy, emotion_accuracy = nb.evaluate_model(test_set, len(training_set)) emotion_accuracy_sum += emotion_accuracy sentiment_accuracy_sum += sentiment_accuracy elif method == 'sa': sa = SimpleAveraging() avgs = sa.train(training_set) sentiment_accuracy, emotion_accuracy = sa.evaluate(test_set, avgs) emotion_accuracy_sum += emotion_accuracy sentiment_accuracy_sum += sentiment_accuracy # elif method == 'pool': # pool = AffectPool(NaiveBayes(), SimpleAveraging()) # pool.simple_train(training_set) elif method =='r': nb = NaiveBayes() nb.train_model(test_set + training_set) print "EMOTION ACCURACY ", emotion_accuracy_sum / folds, " SENTIMENT ACCURACY: ", sentiment_accuracy_sum / folds
def test_naivebayes(traindata, testdata):
    """Print NaiveBayes accuracy for growing ordered training subsets.

    For 10%, 20%, ... 100% of the training data (taken with
    ``traindata.orderedout``) a new classifier is trained on raw pixel
    features and scored against ``testdata``.
    """
    # raw pixel feature: one value domain per pixel
    n_pixels = traindata.width * traindata.height
    feature_domians = [list(np.arange(0, 1.1, 0.5)) for _ in range(n_pixels)]

    for percent in range(10, 101, 10):
        print("Training with %d" % int(percent * traindata.number * 0.01))
        classifier = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(percent)
        classifier.train(images, labels)
        predictions = classifier.classify(testdata.images)
        print(Accuracy(predictions, testdata.labels))
def main():
    """Classify the web page at argv[1] with the saved model at argv[2].

    The page is reduced to the list of its tag names, which is what the
    classifier receives. The resulting class is printed.
    """
    url = sys.argv[1]
    model = sys.argv[2]

    classifier = NaiveBayes()
    classifier.load(model)

    soup = BeautifulSoup(urlopen(url).read())
    # findAll(True) matches every tag; only the names are used as features.
    tag_names = [element.name for element in soup.findAll(True)]

    print("Classified as: %s" % classifier.classify(tag_names))
def test_naivebayes_compare(self):
    """Check that predict_all() maps each known country name to itself.

    The classifier's human labels come from the 'translate'/'country'
    entry of ocr.json; any prediction that differs from its input raises.
    """
    countries = Serializer.load_json(
        os.path.join('../resource/', 'ocr.json'))['translate']['country']

    naivebayes = NaiveBayes()
    naivebayes.human_labels = countries

    x_list = ['ネツァワル王国', 'カセドリア連合王国', 'ゲブランド帝国', 'ホルデイン王国', 'エルソード王国']
    print(countries)

    for position, predicted in enumerate(naivebayes.predict_all(x_list)):
        expected = x_list[position]
        if expected == predicted:
            continue
        raise Exception('compare x:{0},predict:{1}'.format(
            expected, predicted))
class CrossValidation(object):
    """Hold-out evaluation of a NaiveBayes tweet classifier."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Return a shuffled list of (tweet, category) pairs.

        user_ids: mapping of category -> Twitter IDs; the tweets of each
        ID group are fetched with get_tweets().
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))
        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Turn [(a, b), ...] into ((a, ...), (b, ...)); empty -> ((), ())."""
        if not pairs:
            # zip(*[]) yields nothing, which cannot be unpacked into two
            # sequences; return two empty columns instead.
            return (), ()
        first, second = zip(*pairs)
        return first, second

    def split(self, data, test_percentage):
        """Split (tweet, category) pairs into training and test parts.

        The last int(len(data) * test_percentage) pairs form the test part.
        Each part is returned as a (tweets, categories) pair of tuples, so
        it can always be unpacked into two sequences - also when the part
        is empty (test_percentage of 0 or 1).
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test
        # unzip (inverse of zip)
        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print every tweet below its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Train on one part of the tweets and predict the rest.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted from
        tweets.
        verbose: print the test tweets with their predicted labels.

        Returns (results, answers): the predictions for the test tweets and
        their true categories. Raises ValueError when test_percentage is
        outside [0, 1].
        """
        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)
        return results, answers
def main(flag=True):
    """Train and validate the spam model, or load a previously saved one.

    :param flag: when true, train on 80% of the labelled e-mails listed in
        the trec06c index, validate on the remaining 20%, print metrics and
        pickle the model to ``bayes_model.obj``. When false, unpickle the
        model from ``bayes_model.obj`` instead.
    :return: the trained or loaded model.
    """
    if flag:
        start = timer()

        # Load the labels of the e-mail data.
        label_df = pd.read_csv("./input/trec06c/full/index_bak", sep=' ..', names=['label', 'filename'])
        for key in label_df['label'].unique():
            print(key, len(label_df[label_df['label'] == key]))

        train, valid = train_test_split(label_df, test_size=0.2, random_state=2018)
        normFilelen = train[train['label'] == 'ham'].shape[0]
        spamFilelen = train[train['label'] == 'spam'].shape[0]
        model = NaiveBayes(normFilelen, spamFilelen)

        for index, row in tqdm(train.iterrows(), total=train.shape[0]):
            # Store the words that occur in each e-mail in the word list.
            model.get_word_list('./input/trec06c' + row['filename'], row['label'])
        # NOTE(review): '%2f' is width 2 with default precision; '%.2f' may
        # have been intended - left unchanged.
        print('训练集学习完毕,已耗时%2fs' % (timer() - start))

        for index, row in tqdm(valid.iterrows(), total=valid.shape[0]):
            # The 'test' slot is reused for every validation e-mail.
            if 'test' in model.wordDict.keys():
                model.wordDict['test'].clear()
            model.get_word_list('./input/trec06c' + row['filename'], 'test')
            wordProbList = model.getTestWords(model.wordDict['test'])
            # Compute the Bayesian probability from the words selected for
            # this e-mail (15 of them, per the original comment).
            trash_p = model.calBayes(wordProbList)
            # NOTE(review): spam is counted as the negative class here
            # (spam detected -> 'TN'); confirm this matches calMetric().
            if row['label'] == 'spam':
                if trash_p > 0.9:
                    model.validResult['TN'] += 1  # trash
                else:
                    model.validResult['FN'] += 1  # normal
            else:
                if trash_p > 0.9:
                    model.validResult['FP'] += 1  # trash
                else:
                    model.validResult['TP'] += 1  # normal

        model.calMetric()
        print('验证集处理完毕,已耗时%2fs' % (timer() - start))

        # `with` guarantees the file is flushed and closed.
        with open('bayes_model.obj', 'wb') as model_file:
            pickle.dump(model, model_file)
    else:
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load a bayes_model.obj that this program wrote itself.
        with open('bayes_model.obj', 'rb') as model_file:
            model = pickle.load(model_file)
        print("模型加载成功!")
    return model
class CrossValidation(object):
    """Hold-out evaluation of a NaiveBayes tweet classifier."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def create_data(self, user_ids):
        """Return a shuffled list of (tweet, category) pairs.

        user_ids: mapping of category -> Twitter IDs; the tweets of each
        ID group are fetched with get_tweets().
        """
        data = []
        for category, ids in user_ids.items():
            tweets = get_tweets(ids)
            categories = [category] * len(tweets)
            data += list(zip(tweets, categories))
        np.random.shuffle(data)
        return data

    @staticmethod
    def _unzip(pairs):
        """Turn [(a, b), ...] into ((a, ...), (b, ...)); empty -> ((), ())."""
        if not pairs:
            # zip(*[]) yields nothing, which cannot be unpacked into two
            # sequences; return two empty columns instead.
            return (), ()
        first, second = zip(*pairs)
        return first, second

    def split(self, data, test_percentage):
        """Split (tweet, category) pairs into training and test parts.

        The last int(len(data) * test_percentage) pairs form the test part.
        Each part is returned as a (tweets, categories) pair of tuples, so
        it can always be unpacked into two sequences - also when the part
        is empty (test_percentage of 0 or 1).
        """
        n_test = int(len(data) * test_percentage)
        n_training = len(data) - n_test
        # unzip (inverse of zip)
        training = self._unzip(data[:n_training])
        test = self._unzip(data[n_training:])
        return training, test

    def show_tweets_with_labels(self, tweets, labels):
        """Print every tweet below its label."""
        for tweet, label in zip(tweets, labels):
            print("{}:\n{}\n".format(label, tweet))

    def evaluate(self, user_ids, test_percentage=0.2, verbose=True):
        """
        Train on one part of the tweets and predict the rest.

        user_ids: Twitter IDs separated into categories.
        test_percentage: Ratio of the amount of test data extracted from
        tweets.
        verbose: print the test tweets with their predicted labels.

        Returns (results, answers): the predictions for the test tweets and
        their true categories. Raises ValueError when test_percentage is
        outside [0, 1].
        """
        if not (0 <= test_percentage <= 1):
            raise ValueError("test_percentage must be between 0 and 1 "
                             "(inclusive).")

        data = self.create_data(user_ids)
        training, test = self.split(data, test_percentage)

        tweets, categories = training
        self.classifier.fit(tweets, categories)

        tweets, answers = test
        results = self.classifier.predict(tweets)

        if verbose:
            self.show_tweets_with_labels(tweets, results)
        return results, answers
def train(self, train_set):
    """Teach the classifier from labeled document instances.

    Every document is first added to ``self.corpus``. A ``NaiveBayes``
    classifier is then trained on the corpus, plus one sub-classifier
    (stored in ``self.subclassifiers`` by class label) for each corpus
    class that itself has more than one class. Any other classifier is
    trained on (features, first label) pairs.
    """
    for document in train_set:
        self.corpus.add_doc(document)
    print('Training on %d documents...\n' % len(train_set))

    if not isinstance(self.classifier, NaiveBayes):
        # for nltk classifiers
        self.classifier.train(
            [(document.get_features(), document.get_labels()[0])
             for document in train_set])
        return

    self.classifier.train(self.corpus)
    for corpus_class in self.corpus.get_classes():
        if len(corpus_class.get_classes()) <= 1:
            continue
        subclassifier = NaiveBayes()
        subclassifier.train(corpus_class)
        self.subclassifiers[corpus_class.get_label()] = subclassifier
def dev_train():
    """Train a NaiveBayes classifier on '../papers' and save it to disk."""
    documents = build_doc_set('../papers')
    driver = Processor()
    for document in documents:
        driver.process_document(document)

    classifier_wrapper = driver.clf
    classifier_wrapper.set_classifier(NaiveBayes())
    # driver.clf is looked up again, exactly as the two separate
    # attribute accesses of a plain call sequence would.
    driver.clf.train(documents)

    driver.save_classifier('saved_classifier-367-1')
class Classifier(object):
    """Interactive front end around a NaiveBayes tweet classifier."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Fit the classifier with the tweets of the given users.

        user_ids : A list of twitter ids whose tweets belong to the
        category.
        category : The category assigned to every fetched tweet.
        """
        tweets = get_tweets(user_ids)
        labels = [category for _ in range(len(tweets))]
        self.classifier.fit(tweets, labels)
        print("Training...")

    def predict_user_input(self):
        """Classify sentences typed by the user until 'exit' is entered."""
        # iter(callable, sentinel) stops as soon as the prompt returns 'exit'.
        for sentence in iter(lambda: input("input =>"), 'exit'):
            print("{}\n".format(self.classifier.predict_(sentence)))

    def save(self, filename):
        """Write the model to `filename` as JSON."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Read the model back from the JSON file `filename`."""
        self.classifier.load_json(filename)
class Classifier(object):
    """Interactive front end around a NaiveBayes tweet classifier."""

    def __init__(self):
        self.classifier = NaiveBayes()

    def learn_from_tweets(self, user_ids, category):
        """
        Fit the classifier with the tweets of the given users.

        user_ids : A list of twitter ids whose tweets belong to the
        category.
        category : The category assigned to every fetched tweet.
        """
        tweets = get_tweets(user_ids)
        labels = [category for _ in range(len(tweets))]
        self.classifier.fit(tweets, labels)
        print("Training...")

    def predict_user_input(self):
        """Classify sentences typed by the user until 'exit' is entered."""
        # iter(callable, sentinel) stops as soon as the prompt returns 'exit'.
        for sentence in iter(lambda: input("input =>"), 'exit'):
            print("{}\n".format(self.classifier.predict_(sentence)))

    def save(self, filename):
        """Write the model to `filename` as JSON."""
        self.classifier.dump_json(filename)

    def load(self, filename):
        """Read the model back from the JSON file `filename`."""
        self.classifier.load_json(filename)
def dev_train_test():
    """Train and test a new NaiveBayes classifier on the '../papers' docs.

    The documents are processed one by one, then handed to the driver's
    classifier wrapper for a train/test run with split=.07.
    """
    documents = build_doc_set('../papers')
    print('Processing docset with %d docs...' % len(documents))

    driver = Processor()
    for document in documents:
        driver.process_document(document)

    driver.clf.set_classifier(NaiveBayes())
    driver.clf.train_and_test(documents, split=.07)
def test_naivebayes_labeling(self):
    """Check that every label in corpus.tsv is an integer of at most 5.

    Column 0 of each row is the text, column 1 the label. A label above 5
    raises an Exception carrying that label. The collected labels are
    finally converted to a uint8 array (the result is not kept).
    """
    naivebayes = NaiveBayes()
    data, target = [], []

    for row in Serializer.load_csv('../resource/corpus.tsv'):
        data.append(str(row[0]))
        label = int(row[1])
        if label <= 5:
            target.append(label)
            continue
        raise Exception(label)

    np.array(target, dtype=np.uint8, ndmin=1)
class Ranking(object):
    """Runs OCR on a ranking image and post-processes the recognized names."""

    def __init__(self, config):
        # NOTE(review): `config` is accepted but not used here.
        self.ocr = OCREngine()
        self.naivebayes = NaiveBayes()
        self.naivebayes.human_labels = self.ocr.settings['translate'][
            'country']

    def create_TemporyFile(self, buffer, verbose=False):
        """
        Write the buffer's bytes to a new temporary file.

        The file is NOT deleted automatically; the caller must remove it.

        @param {io.BytesIO}buffer
               {bool}verbose log the file name when true
        @return {string} path of the created temporary file
        """
        with tempfile.NamedTemporaryFile(delete=False) as temp:
            temp.write(buffer.getvalue())
            temp_file_name = temp.name
        if verbose:
            logger.info(temp_file_name)
        return temp_file_name

    def getResult(self, src, save_image=False):
        """
        Recognize the image `src` and return the OCR document.

        The recognized names are also appended, under a timestamp header,
        to '../temp/corpus.txt'.

        @param {string} src
               {bool}save_image output debug image
        @return {OCRDocument} document, or None when the image cannot be
                prepared
        """
        pro = DataProcessor(src, ImageType.RAW, save_image=save_image)
        if pro.prepare() is None:
            logger.error('image error:{0}'.format(src))
            return None

        buffer = pro.tobinary(pro.batch())
        temp_file_name = self.create_TemporyFile(buffer, True)
        try:
            document = self.ocr.recognize(temp_file_name)
        finally:
            # Remove the temp file even when recognition fails.
            os.remove(temp_file_name)

        # Same text as '%F %T.%f', spelled with directives that every
        # platform's strftime supports; [:-3] trims microseconds to ms.
        stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')[:-3]
        output = '#' + stamp + '\n'
        output += '\n'.join(document.names()) + '\n'
        with Serializer.open_stream('../temp/corpus.txt', mode='a') as file:
            file.write(output)

        # ocr corpus data -> NaiveBayes classifier
        # ranking name swap: the prediction is computed but not yet applied
        # to the document.
        self.naivebayes.predict_all(document.names())
        document.dump()
        return document
def train(instances):
    """Build and train the clustering model chosen by ``args.algorithm``.

    'lambda_means' builds a LambdaMeans, 'nb_clustering' a NaiveBayes; any
    other value trains nothing.

    :param instances: training instances handed to the model's train().
    :return: the trained model, or None for an unknown algorithm.
    """
    print('starting training')

    algorithm = args.algorithm
    if algorithm == 'lambda_means':
        model = LambdaMeans(args.cluster_lambda, max_max_index,
                            args.clustering_training_iterations)
    elif algorithm == 'nb_clustering':
        model = NaiveBayes(args.num_clusters, max_max_index,
                           args.clustering_training_iterations)
    else:
        model = None

    if model is not None:
        model.train(instances)

    print('ending training')
    return model
def main():
    """Print the NaiveBayes test error for several train sizes.

    For train sizes 0.5 ... 0.9 the data is split with a fixed random
    state and a model is learned twice: ignoring and not ignoring features
    with missing values.
    """
    # DataFrame.append was removed in pandas 2.0; concat is the supported
    # replacement. Imported here so the function is self-contained.
    import pandas as pd

    rare_country = "Holand-Netherlands"
    df = data()
    for train_size in np.linspace(0.5, 0.9, 5):
        train, test = train_test_split(df, train_size=train_size, random_state=42)

        # Since there is only 1 sample with native-country == Holand-Netherlands,
        # ensure that this sample is in the training set
        if rare_country in test["native-country"].unique():
            is_rare = test["native-country"] == rare_country
            train = pd.concat([train, test[is_rare]])
            test = test[test["native-country"] != rare_country]

        for ignore_missing in [True, False]:
            nb = NaiveBayes(ignore_missing=ignore_missing)
            nb.learn_parameters(train)
            # `test` holds no rare-country rows at this point, so it can be
            # scored directly.
            acc = nb.score(test)
            print(
                "\nTrain size: {} Test error: {} Ignore features with missing values: {}".format(
                    train_size, (1 - acc), ignore_missing
                )
            )
def main():
    """Train the tutorial NaiveBayes model and print one prediction."""
    nb = NaiveBayes()
    nb.load_data_training()
    nb.mulai_training()

    # TODO: [STEP-10] Try making a prediction!
    # If the weather is 'Hujan', the temperature is 'Dingin', the laziness
    # level is 'Tinggi' and the student woke up late ('Ya'), does the
    # student attend or skip class?
    kondisi = {
        'nilai_cuaca': 'Hujan',
        'nilai_suhu': 'Dingin',
        'nilai_tingkat_malas': 'Tinggi',
        'nilai_bangun_siang': 'Ya',
    }
    hasil_prediksi = nb.prediksi(**kondisi)

    print('=====================================')
    pesan = 'Hasil akhir prediksi = {}, dengan peluang sebesar {}%'
    print(pesan.format(hasil_prediksi['hasil'],
                       (hasil_prediksi['peluang'] * 100)))
def trainTrendClassifier():
    """
    :return: None

    Trains a NaiveBayes model on the categorized trends data found in the
    folder named by config['training']['trends'] (stop-word handling and
    stemming switched on) and stores the trained model in the database for
    later classification.
    """
    logger.debug("trainTrendsClassifier()")

    trainingDocs, trainingLabels = getData(config['training']['trends'])
    logger.debug("documents: {}, labels: {}".format(
        len(trainingDocs), len(trainingLabels)))

    model = NaiveBayes()
    model.train(trainingDocs, trainingLabels, stopWordsFlag=True, stem=True)
    model.saveToDB()
def main():
    """Train on 60% of the dataset, report performance, save the model.

    argv[1] is the dataset file, argv[2] the path the trained classifier
    is saved to. Overall and per-class accuracy, recall and F1 are printed.
    """
    dataset_file = sys.argv[1]
    model = sys.argv[2]
    rule = "================"

    train, test = split_dataset(load_classes(dataset_file), 0.6)
    classifier = NaiveBayes()
    classifier.train(train)

    accuracy, recall, f1 = classifier.perfomance(test)
    print("Total perfomance")
    print(rule)
    for template, value in (("Accuracy: %f", accuracy),
                            ("Recall: %f", recall),
                            ("F1: %f", f1)):
        print(template % value)
    print("\n")

    class_accuracy, class_recall, class_f1 = classifier.class_perfomance(test)
    sections = (
        ("Class accuracy", class_accuracy),
        ("Class recall", class_recall),
        ("Class F1", class_f1),
    )
    for title, per_class in sections:
        print(title)
        print(rule)
        for klass in per_class:
            print("%s: %f" % (klass, per_class[klass]))
        print("\n")

    classifier.save(model)
with open("winter-" + classifier + ".json") as json_file: for line in json_file: json_obj = json.loads(line) reviews += [(classifier, json_obj)] # Creating model objects model = args.model if (model == "baseline"): model_obj = BaseLine(reviews, categories) elif (model == "logreg"): model_obj = LogReg(reviews) elif (model == "multinomialNB"): model_obj = NaiveBayes(reviews, "multinomial") elif (model == "lda"): model_obj = TopicModel(reviews) elif (model == "kNearestNeighbors"): model_obj = knn(reviews, target) else: # put additional models here. print("Argument Error: invalid model specified") sys.exit() model_classified = [] # classifications stored here reviews = [] # resetting reviews list to save memory # Reading test data into reviews list
# Alternative experiment configurations (enable one datafile together with
# one of its pos_class values):
##datafile = "../data/weather.nominal.txt"
##pos_class = "play:yes"
##pos_class = "play:no"
##datafile = "haireyescolor.txt"
##pos_class = "Sex:Male"
##pos_class = "Sex:Female"
##datafile = "../data/cmc-full.txt"
##pos_class = "contraceptive-method:none"
##pos_class = "contraceptive-method:long-term"
##pos_class = "contraceptive-method:short-term"

d = Data(datafile)

# Predictor under study; MaxAPost is the alternative.
prnb = NaiveBayes(d)
##prnb = MaxAPost(d)
prnb.train()

# Count the test instances of the positive class and of all other classes.
pos = 0.0
neg = 0.0
for (v, c_true) in d.test_set:
    if c_true != pos_class:
        neg += 1
    else:
        pos += 1

result_pos, result_neg, result_dif = [], [], []
df['text'], df['is_spam'], test_size=0.2, random_state=191) print('Data set:') print('{} total'.format(df.shape[0])) for t, t_name in zip(targets, target_names): print('{} {}'.format(len(df[df['is_spam'] == t]), t_name)) print('\nTraining set:') print('{} total'.format(len(X_train))) for t, t_name in zip(targets, target_names): print('{} {}'.format(sum([y == t for y in y_train]), t_name)) print('\nTest set:') print('{} total'.format(len(X_test))) for t, t_name in zip(targets, target_names): print('{} {}'.format(sum([y == t for y in y_test]), t_name)) print('') # Build Classifier gvoc_model = NaiveBayes('General Vocabulary', X_train, y_train, targets, target_names) gvoc_model.train() gvoc_model.evaluate(X_test, y_test, show_top_features=10) rvoc_model = NaiveBayes('Reduced Vocabulary', X_train, y_train, targets, target_names, max_features=200) rvoc_model.train() rvoc_model.evaluate(X_test, y_test, show_top_features=10)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from naivebayes import NaiveBayes


def accuracy(y_true, y_pred):
    """Return the share of predictions that equal the true labels."""
    n_correct = np.sum(y_true == y_pred)
    return n_correct / len(y_true)


# Synthetic two-class data set with a fixed seed for repeatable results.
X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2, random_state=123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Fit on the training part and score on the held-out 20%.
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))
def accuracy(y_true, y_pred):
    """Return the share of predictions that equal the true labels."""
    n_correct = np.sum(y_true == y_pred)
    return n_correct / len(y_true)


# Three Gaussian blobs in two dimensions, generated with a fixed seed.
X, y = datasets.make_blobs(n_samples=1000, n_features=2, centers=3, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=123, return_centers=False)
# Only 10% of the samples are used for fitting; 90% are held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=1234)

clf = NaiveBayes()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy(y_test, y_pred))

# Scatter the held-out points coloured by their predicted class.
color_map = {0: 'r', 1: 'k', 2: 'g'}
label_color = [color_map[l] for l in y_pred]
plt.scatter(X_test[:, 0], X_test[:, 1], c=label_color)
plt.show()
def __init__(self):
    """Create the object with a new, default-constructed NaiveBayes model."""
    self.classifier = NaiveBayes()
from naivebayes import NaiveBayes nb = NaiveBayes(3, 3) dataset = [ ([0, 0, 1], 1), ([0, 1, 0], 0), ([0, 1, 1], 1), ([1, 0, 0], 0), ([1, 1, 0], 0), ([1, 1, 1], 2), ([1, 0, 1], 2), ([0, 1, 1], 1), ([0, 1, 1], 1), ([0, 1, 1], 1), ([0, 1, 1], 1), ([0, 1, 1], 2), ([0, 0, 1], 1), ([1, 0, 1], 2), ([1, 1, 0], 0) ] for i,t in dataset: nb.update(i,t) print nb.class_count print nb.feature_count for i,t in dataset:
def evaluate_naivebayes():
    """Evaluate NaiveBayes from the command line and report the outcome.

    argv[1] is passed to the NaiveBayes constructor (in evaluate mode),
    argv[2] to its evaluate(); the output goes to process() under the
    title 'Naive Bayes'.
    """
    classifier = NaiveBayes(sys.argv[1], evaluate=True)
    process(classifier.evaluate(sys.argv[2]), 'Naive Bayes')
def _plot_std_mean(traindata, testdata, build_and_train, runs, label,
                   print_every_run):
    """Plot std and mean test accuracy (in %) against training-set size.

    For 10%, 20%, ... 100% of the training data, `runs` models are trained
    on freshly shuffled subsets. `build_and_train(images, labels)` must
    return a trained model offering classify(). When `print_every_run` is
    true each run's accuracy (in %) is printed, otherwise the last run's
    raw accuracy is printed once per size.
    """
    stds, means, percents = [], [], []
    for p in range(10, 101, 10):
        scores = []
        accuracy = None
        for _ in range(runs):
            images, labels = traindata.shuffleout(p)
            model = build_and_train(images, labels)
            accuracy = Accuracy(model.classify(testdata.images),
                                testdata.labels)
            scores.append(accuracy * 100)
            if print_every_run:
                print(accuracy * 100)
        stds.append(np.std(scores))
        means.append(np.mean(scores))
        percents.append(p)
        if not print_every_run:
            print(accuracy)
    plt.plot(percents, stds, label=label + " std")
    plt.plot(percents, means, label=label + " mean")


def stdmean():
    """Plot std/mean accuracy of three classifiers on digit and face data.

    Perceptron and NeuralNetwork are trained 5 times per training-set
    size, NaiveBayes 3 times; all six curve pairs end up in one figure.
    """
    ratio = 0.8
    times = 5       # runs per size for Perceptron and NeuralNetwork
    nb_times = 3    # runs per size for NaiveBayes

    for name, loader in (("digit", dataloader_digit),
                         ("face", dataloader_face)):
        print(name)
        traindata, testdata = loader()
        n_pixels = traindata.width * traindata.height
        labeldomain = traindata.labeldomain
        feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                           for _ in range(n_pixels)]

        # The builders below are used only inside this loop iteration, so
        # they always see the current data set's sizes and labels.
        def perceptron(images, labels):
            model = Perceptron(n_pixels, labeldomain)
            model.train(images, labels, 3, ratio)
            return model

        def naive_bayes(images, labels):
            model = NaiveBayes(feature_domians, labeldomain, 1)
            model.train(images, labels)
            return model

        def neural_network(images, labels):
            model = NeuralNetwork((n_pixels, 15, 15, len(labeldomain)),
                                  labeldomain)
            model.train(images, labels, 50, ratio)
            return model

        prefix = name + "data "
        _plot_std_mean(traindata, testdata, perceptron, times,
                       prefix + "Perceptron", True)
        _plot_std_mean(traindata, testdata, naive_bayes, nb_times,
                       prefix + "NaiveBayes", False)
        _plot_std_mean(traindata, testdata, neural_network, times,
                       prefix + "NeuralNetwork", False)

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    # The curves show accuracy statistics in percent, not run time.
    plt.ylabel("accuracy (%)")
    plt.show()
def naivebayes(trainf, testf):
    """Build a NaiveBayes from `trainf` and classify `testf` with it.

    The value returned by classify() is not used; nothing is returned.
    """
    classifier = NaiveBayes(trainf)
    classifier.classify(testf)
from naivebayes import NaiveBayes from data import Data print_numbers = False datafile = "ds/titanicTr.txt" pos_class = "Survived:Yes" #pos_class = "Survived:No" # datafile = "cmcTr.txt" # pos_class = "contraceptive-method:none" d = Data(datafile) prnb = NaiveBayes(d) prnb.train() r = Roc(prnb, pos_class) r.do_curve() print "Predicting", pos_class, "for data file", datafile, print "with", int(r.curve[2]), "positive instances and", int( r.curve[3]), "negative instances" if print_numbers: prnb.show() print "Scores for predicting", pos_class, ":" for e in sorted(r.preds):
from data import Data
from naivebayes import NaiveBayes

# Data set to use; the other candidates are kept for quick switching.
filename = "datasets/weatherNominal.td"
## filename = "datasets/titanic.td"
## filename = "datasets/cmc.td"

d = Data(filename)
d.report()

# Train the predictor and show the trained model.
pr = NaiveBayes(d)
pr.train()
pr.show()

# Show the predicted class next to the true class for every test instance.
for (v, c_true) in d.test_set:
    prediction = pr.predict(v)
    c_pred = prediction[0]
    print(v, ":")
    print(" ", c_pred, "( true class:", c_true, ")")

## print(pr.predict(("Class:1st","Sex:Female","Age:Child")))
## print(pr.predict(("Class:Crew","Sex:Female","Age:Child")))
def test_naivebayes_argmax_all():
    """Plot NaiveBayes accuracy against training-set size.

    Four curves are drawn: digit and face data, each with ordered and
    with randomly shuffled training subsets of 10% ... 100%.
    """

    def plot_curve(traindata, testdata, feature_domians, take_subset, label):
        # One accuracy curve; take_subset(p) returns (images, labels).
        accuracies, percents = [], []
        for p in range(10, 101, 10):
            print("Training with %d" % int(p * traindata.number * 0.01))
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            images, labels = take_subset(p)
            nb.train(images, labels)
            a = Accuracy(nb.classify(testdata.images), testdata.labels)
            accuracies.append(a * 100)
            percents.append(p)
            print(a)
        plt.plot(percents, accuracies, label=label)

    traindata, testdata = dataloader_digit()
    n_pixels = traindata.width * traindata.height
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(n_pixels)]
    plot_curve(traindata, testdata, feature_domians,
               traindata.orderedout, "digitdata order")
    plot_curve(traindata, testdata, feature_domians,
               traindata.shuffleout, "digitdata random")

    traindata, testdata = dataloader_face()
    n_pixels = traindata.width * traindata.height
    # The face features use a two-valued domain per pixel.
    feature_domians = [[0, 1] for _ in range(n_pixels)]
    plot_curve(traindata, testdata, feature_domians,
               traindata.orderedout, "facedata order")
    plot_curve(traindata, testdata, feature_domians,
               traindata.shuffleout, "facedata random")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("accuracy")
    plt.show()
def spamHamtoyExample() -> None:
    '''
    Train a naive Bayes classifier on the spam/ham e-mails of the 'train'
    folder, print its most popular and most indicative words and its log
    prior, then classify the 'test' folder with 1 to 50 features.
    Plots the result - x-axis = number of features, y-axis = classification
    score.
    '''
    filedir = '../data/emails/'
    test_dir = os.path.join(filedir, 'test/')

    naivebay = NaiveBayes()
    naivebay.train(os.path.join(filedir, 'train/'))

    numOfItemsToPrint = 4
    for report in (naivebay.printMostPopularHamWords,
                   naivebay.printMostPopularSpamWords,
                   naivebay.printMostindicativeHamWords,
                   naivebay.printMostindicativeSpamWords):
        report(numOfItemsToPrint)
    print('Model logPrior: {}'.format(naivebay.logPrior))

    features = [1, 2, 5, 10, 20, 30, 40, 50]
    accuracy = []
    for feature_count in features:
        score = naivebay.classifyAndEvaluateAllInFolder(test_dir,
                                                        feature_count)
        accuracy.append(score)
        print(feature_count, "features, classification score:", score)

    plt.figure("Naive results: #features vs classification error rate")
    plt.plot(features, accuracy)
    plt.grid(True)
    plt.xlabel('Number of Features')
    plt.ylabel('Classification Score')
    plt.show()
def _plot_time_to_limit(traindata, testdata, make_model, limit, ratio,
                        max_rounds, label):
    """Plot the time an iterative model needs to exceed `limit` accuracy.

    For 20%, 30%, ... 100% of the (ordered) training data a fresh model is
    trained one round at a time until its test accuracy exceeds `limit` or
    `max_rounds` rounds have run. The accuracy (in %) is printed after
    every round.
    """
    import time

    elapsed, percents = [], []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        start = time.time()
        model = make_model()
        for _ in range(max_rounds):
            model.train(images, labels, 1, ratio)
            accuracy = Accuracy(model.classify(testdata.images),
                                testdata.labels)
            print(accuracy * 100)
            if accuracy > limit:
                break
        # Taken after the loop so a duration is recorded even when the
        # limit is never reached within max_rounds.
        elapsed.append(time.time() - start)
        percents.append(p)
    plt.plot(percents, elapsed, label=label)


def _plot_naive_bayes_time(traindata, testdata, label):
    """Plot the time NaiveBayes needs to train and classify once.

    The measured span covers construction, subset selection, training,
    classification and scoring; the accuracy is printed for every size.
    """
    import time

    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    elapsed, percents = [], []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        accuracy = Accuracy(nb.classify(testdata.images), testdata.labels)
        end = time.time()
        elapsed.append(end - start)
        percents.append(p)
        print(accuracy)
    plt.plot(percents, elapsed, label=label)


def timeana():
    """Plot training time against training-set size for three classifiers.

    Perceptron and NeuralNetwork are trained until they exceed 70% test
    accuracy (at most 200 rounds); NaiveBayes is trained once. Digit and
    face data are both measured and drawn into one figure.
    """
    limit = 0.7
    ratio = 1
    times = 200

    for name, loader in (("digit", dataloader_digit),
                         ("face", dataloader_face)):
        print(name)
        traindata, testdata = loader()
        n_pixels = traindata.width * traindata.height
        labeldomain = traindata.labeldomain

        # The factories are used only inside this loop iteration, so they
        # always see the current data set's sizes and labels.
        def make_perceptron():
            return Perceptron(n_pixels, labeldomain)

        def make_network():
            return NeuralNetwork((n_pixels, 15, 15, len(labeldomain)),
                                 labeldomain)

        prefix = name + "data "
        _plot_time_to_limit(traindata, testdata, make_perceptron, limit,
                            ratio, times, prefix + "Perceptron")
        _plot_naive_bayes_time(traindata, testdata, prefix + "NaiveBayes")
        _plot_time_to_limit(traindata, testdata, make_network, limit,
                            ratio, times, prefix + "NeuralNetwork")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size precentage")
    plt.ylabel("time(in second)")
    plt.show()
import os

from naivebayes import NaiveBayes
from TwitterAPI import TwitterAPI

# SECURITY: API credentials must not live in source control. They are read
# from the environment instead; a missing variable raises KeyError at
# start-up. Any credentials that were ever committed should be revoked.
consumer_key = os.environ["TWITTER_CONSUMER_KEY"]
consumer_secret = os.environ["TWITTER_CONSUMER_SECRET"]
access_token_key = os.environ["TWITTER_ACCESS_TOKEN_KEY"]
access_token_secret = os.environ["TWITTER_ACCESS_TOKEN_SECRET"]

api = TwitterAPI(consumer_key, consumer_secret, access_token_key, access_token_secret)

training_set = []
# Stream statuses from the bounding box -74,40,-73,41 (lon/lat pairs).
r = api.request('statuses/filter', {'locations': '-74,40,-73,41'})

nb = NaiveBayes()
nb.load()

# Label tweets interactively: 'n' marks non-basic, 'quit' stops, anything
# else (including just Enter) marks basic.
cin = ''
for item in r:
    print(item['text'])
    cin = raw_input('Basic? Y/n/quit: ')
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    elif cin == 'quit':
        break
    else:
        training_set.append(('basic', item['text']))

# Train on the newly labelled tweets and persist the updated model.
nb.train(training_set)
nb.save()