def main(args):
    """ HamOrSpam entrypoint """
    # Get arguments
    try:
        script, train, test = args
    except ValueError:
        print('usage: python hamorspam.py train.txt test.txt')
        sys.exit(-1)

    classifier = NaiveBayes(['ham', 'spam'])

    # Train classifier: each line is "<label>\t<message>"
    train_file = open(train, 'r')
    for line in train_file:
        # Discard empty lines
        if not line.strip():
            continue
        typ, message = line.split('\t', 1)
        classifier.teach(typ, message)
    train_file.close()

    # Query classifier
    test_file = open(test, 'r')
    for line in test_file:
        # Discard empty lines
        if not line.strip():
            continue
        print(classifier.classify(line))
    test_file.close()
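# Hedged usage sketch (not in the original source): the usage string and the
# `script, train, test = args` unpacking suggest main() is handed sys.argv
# directly, so a conventional entry-point guard would be:
if __name__ == '__main__':
    main(sys.argv)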
def evaluate(trainfile, testfile):
    # Load the training data
    trainData = []
    fp = codecs.open(trainfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        trainData.append(temp)
    fp.close()

    # Train the Naive Bayes classifier
    nb = NaiveBayes()
    nb.train(trainData)
    print(nb)

    # Evaluate on the test data
    hit = 0
    numTest = 0
    fp = codecs.open(testfile, "r", "utf-8")
    for line in fp:
        line = line.rstrip()
        temp = line.split()
        correct = temp[0]             # correct category
        words = temp[1:]              # document: a bag of words
        predict = nb.classify(words)  # predict the category with Naive Bayes
        if correct == predict:
            hit += 1                  # hit when prediction matches the label
        numTest += 1
    print("accuracy:", float(hit) / float(numTest))
    fp.close()
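# Hedged usage sketch (placeholder file names, not from the original code):
# judging from the parsing above, each line of both files is expected to be
# "<category> <word1> <word2> ...", and `codecs` plus the NaiveBayes class
# are assumed to be imported at module level.
if __name__ == "__main__":
    evaluate("train.txt", "test.txt")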
def _populate(self, tweets):
    """
    :param tweets: A python dictionary containing trends as keys and a list
                   of tweets as values against each trend.
    :return: None

    This is a private method used by the constructor to populate the
    inverted index object.
    """
    for trendName in tweets:
        self.trends.append(trendName)
        self.totalTweets += len(tweets[trendName])

        # Classify the trend from the concatenated text of its tweets
        tweetsDoc = " ".join([tweet.text for tweet in tweets[trendName]])
        model = NaiveBayes()
        model.loadModelFromDB()
        self.categories.append(model.classify(tweetsDoc))

        # Build the inverted index: twitter handle -> (trend index, tweet)
        for tweet in tweets[trendName]:
            if tweet.user.screen_name not in self.twitterHandles:
                self.twitterHandles.append(tweet.user.screen_name)
                posts = [(self.trends.index(trendName), tweet)]
                self.indexLists.append(posts)
            else:
                posts = self.indexLists[self.twitterHandles.index(
                    tweet.user.screen_name)]
                posts.append((self.trends.index(trendName), tweet))

    self.logger.debug(
        'Created and populated Inverted Index: Trends-{}, Tweets-{}'.format(
            len(self.trends), self.totalTweets))
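# Hedged illustration (hypothetical data, not from the original code): the
# `tweets` argument is expected to map a trend name to a list of tweet
# objects exposing `.text` and `.user.screen_name`, roughly shaped like:
#
#   from types import SimpleNamespace
#   tweets = {
#       "#python": [
#           SimpleNamespace(text="Tried a Naive Bayes tweet classifier",
#                           user=SimpleNamespace(screen_name="alice")),
#       ],
#   }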
def test_naivebayes(traindata, testdata):
    # Raw pixel features: each pixel takes one of the values 0, 0.5, 1
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        print(a)
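# Hedged usage sketch: dataloader_digit() appears elsewhere in this code base,
# so a typical invocation would presumably pair it with this test driver.
if __name__ == "__main__":
    traindata, testdata = dataloader_digit()
    test_naivebayes(traindata, testdata)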
def main():
    url, model = sys.argv[1], sys.argv[2]
    classifier = NaiveBayes()
    classifier.load(model)

    # Fetch the page and classify it by the HTML tags it contains
    page = urlopen(url).read()
    soup = BeautifulSoup(page)
    tags = [tag.name for tag in soup.findAll(True)]
    classification = classifier.classify(tags)
    print("Classified as: %s" % classification)
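# Hedged usage sketch: main() reads the URL and model path from sys.argv, so
# the script is presumably run as `python <script>.py <url> <model>`. It
# assumes `sys`, `urlopen` (urllib.request on Python 3) and BeautifulSoup
# (from bs4) are imported at module level.
if __name__ == '__main__':
    main()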
def stdmean():
    limit = 0.7
    ratio = 0.8
    times = 5

    print("digit")
    traindata, testdata = dataloader_digit()

    # Perceptron on the digit data: std/mean of accuracy over shuffled runs
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="digitdata Perceptron std")
    plt.plot(pal, mal, label="digitdata Perceptron mean")

    # Naive Bayes on the digit data
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NaiveBayes std")
    plt.plot(pal, mal, label="digitdata NaiveBayes mean")

    # Neural network on the digit data
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="digitdata NeuralNetwork std")
    plt.plot(pal, mal, label="digitdata NeuralNetwork mean")

    print("face")
    traindata, testdata = dataloader_face()

    # Perceptron on the face data
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = Perceptron(traindata.width * traindata.height,
                            traindata.labeldomain)
            pc.train(images, labels, 3, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
    plt.plot(pal, sal, label="facedata Perceptron std")
    plt.plot(pal, mal, label="facedata Perceptron mean")

    # Naive Bayes on the face data
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        for i in range(3):
            images, labels = traindata.shuffleout(p)
            nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
            nb.train(images, labels)
            x = nb.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NaiveBayes std")
    plt.plot(pal, mal, label="facedata NaiveBayes mean")

    # Neural network on the face data
    sal = []
    mal = []
    pal = []
    for p in range(10, 101, 10):
        al = []
        il = []
        for i in range(times):
            images, labels = traindata.shuffleout(p)
            pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                                len(traindata.labeldomain)),
                               traindata.labeldomain)
            pc.train(images, labels, 50, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
        sal.append(np.std(al))
        mal.append(np.mean(al))
        pal.append(p)
        print(a)
    plt.plot(pal, sal, label="facedata NeuralNetwork std")
    plt.plot(pal, mal, label="facedata NeuralNetwork mean")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size percentage")
    plt.ylabel("accuracy std/mean (%)")
    plt.show()
def timeana():
    import time
    limit = 0.7
    ratio = 1
    times = 200

    print("digit")
    traindata, testdata = dataloader_digit()

    # Perceptron on the digit data: time until accuracy exceeds the limit
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            end = time.time()
            if a > limit:
                break
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata Perceptron")

    # Naive Bayes on the digit data: a single training pass is timed
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata NaiveBayes")

    # Neural network on the digit data
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)),
                           traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            end = time.time()
            if a > limit:
                break
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="digitdata NeuralNetwork")

    print("face")
    traindata, testdata = dataloader_face()

    # Perceptron on the face data
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = Perceptron(traindata.width * traindata.height,
                        traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            end = time.time()
            if a > limit:
                break
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata Perceptron")

    # Naive Bayes on the face data
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]
    fal = []
    pal = []
    for p in range(20, 101, 10):
        start = time.time()
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        end = time.time()
        fal.append(end - start)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata NaiveBayes")

    # Neural network on the face data
    fal = []
    pal = []
    for p in range(20, 101, 10):
        images, labels = traindata.orderedout(p)
        al = []
        il = []
        start = time.time()
        pc = NeuralNetwork((traindata.width * traindata.height, 15, 15,
                            len(traindata.labeldomain)),
                           traindata.labeldomain)
        for i in range(times):
            pc.train(images, labels, 1, ratio)
            x = pc.classify(testdata.images)
            a = Accuracy(x, testdata.labels)
            al.append(a * 100)
            il.append(i + 1)
            print(a * 100)
            end = time.time()
            if a > limit:
                break
        fal.append(end - start)
        pal.append(p)
    plt.plot(pal, fal, label="facedata NeuralNetwork")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size percentage")
    plt.ylabel("time (seconds)")
    plt.show()
def test_naivebayes_argmax_all():
    traindata, testdata = dataloader_digit()
    feature_domians = [[i for i in np.arange(0, 1.1, 0.5)]
                       for _ in range(traindata.width * traindata.height)]

    # Digit data, training examples taken in order
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata order")

    # Digit data, training examples sampled at random
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.shuffleout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="digitdata random")

    traindata, testdata = dataloader_face()
    feature_domians = [[0, 1]
                       for _ in range(traindata.width * traindata.height)]

    # Face data, training examples taken in order
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.orderedout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata order")

    # Face data, training examples sampled at random
    fal = []
    pal = []
    for p in range(10, 101, 10):
        print("Training with %d" % int(p * traindata.number * 0.01))
        nb = NaiveBayes(feature_domians, traindata.labeldomain, 1)
        images, labels = traindata.shuffleout(p)
        nb.train(images, labels)
        x = nb.classify(testdata.images)
        a = Accuracy(x, testdata.labels)
        fal.append(a * 100)
        pal.append(p)
        print(a)
    plt.plot(pal, fal, label="facedata random")

    leg = plt.legend(ncol=1, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.xlabel("data size percentage")
    plt.ylabel("accuracy")
    plt.show()
def naivebayes(trainf, testf):
    nb = NaiveBayes(trainf)
    nb.classify(testf)
nb = NaiveBayes()
nb.load()
cin = ''

# First pass: collect labels from the user to build an initial training set
for item in r:
    print(item['text'])
    cin = input('Basic? Y/n/quit: ')
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    elif cin == 'quit':
        break
    else:
        training_set.append(('basic', item['text']))

nb.train(training_set)
nb.save()

# Second pass: show the classifier's prediction, then collect more labels
for item in r:
    print(item['text'])
    print(nb.classify(item['text']))
    cin = input('Basic? Y/n/quit: ')
    if cin == 'n':
        training_set.append(('non-basic', item['text']))
    elif cin == 'quit':
        break
    else:
        training_set.append(('basic', item['text']))

nb.save()
# Small sanity check: three classes, three binary features
nb = NaiveBayes(3, 3)

dataset = [
    ([0, 0, 1], 1),
    ([0, 1, 0], 0),
    ([0, 1, 1], 1),
    ([1, 0, 0], 0),
    ([1, 1, 0], 0),
    ([1, 1, 1], 2),
    ([1, 0, 1], 2),
    ([0, 1, 1], 1),
    ([0, 1, 1], 1),
    ([0, 1, 1], 1),
    ([0, 1, 1], 1),
    ([0, 1, 1], 2),
    ([0, 0, 1], 1),
    ([1, 0, 1], 2),
    ([1, 1, 0], 0),
]

# Feed every (features, label) pair to the classifier
for i, t in dataset:
    nb.update(i, t)

print(nb.class_count)
print(nb.feature_count)

# Compare predictions against the true labels
for i, t in dataset:
    print(nb.classify(i), t)