import numpy as np

# BayesClassifier and Graph come from this example's companion modules;
# the module names below are assumptions.
from bayes import BayesClassifier
from graph import Graph


def build_bayes_graph(img, labels, sigma=1e2, kappa=2):
    """Build a graph from the 4-neighborhood of pixels.

    Foreground and background are determined from labels (1 for
    foreground, 0 for background) and are modeled with naive Bayes
    classifiers.
    """
    m, n = img.shape[:2]

    # RGB vector version (one pixel per row)
    vim = img.reshape((-1, 3))

    # RGB values of the foreground and background seed pixels
    foreground = img[labels == 1].reshape((-1, 3))
    background = img[labels == 0].reshape((-1, 3))
    train_data = [foreground, background]

    # train a naive Bayes classifier on the seed pixels
    bc = BayesClassifier()
    bc.train(train_data)

    # get foreground/background probabilities for all pixels
    bc_labels, prob = bc.classify(vim)
    prob_fg, prob_bg = prob[0], prob[1]

    # create a graph with m*n pixel nodes plus source and sink
    gr = Graph()
    gr.add_node(range(m * n + 2))
    source = m * n    # second to last node is the source
    sink = m * n + 1  # last node is the sink

    # normalize pixel vectors so edge weights depend on color, not brightness
    for i in range(vim.shape[0]):
        vim[i] = vim[i] / np.linalg.norm(vim[i])

    # go through all pixel nodes and add edges
    for i in range(m * n):
        # edge from source, weighted by foreground probability
        gr.add_edge((source, i), prob_fg[i] / (prob_fg[i] + prob_bg[i]))

        # edge to sink, weighted by background probability
        gr.add_edge((i, sink), prob_bg[i] / (prob_fg[i] + prob_bg[i]))

        # edges to 4-neighbors, weighted by color similarity
        if i % n != 0:  # left neighbor exists
            edge_wt = kappa * np.exp(-1.0 * np.sum((vim[i] - vim[i - 1])**2) / sigma)
            gr.add_edge((i, i - 1), edge_wt)
        if (i + 1) % n != 0:  # right neighbor exists
            edge_wt = kappa * np.exp(-1.0 * np.sum((vim[i] - vim[i + 1])**2) / sigma)
            gr.add_edge((i, i + 1), edge_wt)
        if i // n != 0:  # upper neighbor exists
            edge_wt = kappa * np.exp(-1.0 * np.sum((vim[i] - vim[i - n])**2) / sigma)
            gr.add_edge((i, i - n), edge_wt)
        if i // n != m - 1:  # lower neighbor exists
            edge_wt = kappa * np.exp(-1.0 * np.sum((vim[i] - vim[i + n])**2) / sigma)
            gr.add_edge((i, i + n), edge_wt)

    gr.build_flow(source, sink)
    return gr
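
# A minimal usage sketch, assuming companion modules matching the imports
# above. The image, the seed layout, and the -1 "unlabeled" convention are
# all illustrative; how the segmentation is read back from the minimum cut
# depends on the Graph class.
if __name__ == '__main__':
    img = np.random.rand(50, 50, 3)              # stand-in for a real RGB image
    labels = -np.ones(img.shape[:2], dtype=int)  # -1 marks unlabeled pixels
    labels[15:35, 15:35] = 1                     # foreground seeds (1)
    labels[:3, :] = 0                            # background seeds (0) on the
    labels[-3:, :] = 0                           # image border
    labels[:, :3] = 0
    labels[:, -3:] = 0
    gr = build_bayes_graph(img, labels, sigma=1e2, kappa=2)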
import re

# BayesClassifier comes from this example's companion module;
# the module name below is an assumption.
from bayes import BayesClassifier


def tokenize(line):
    """Split a data line into its label and a set of lowercase word tokens."""
    line_split = line.lower().split()
    msg_type = line_split[0]
    words = re.findall("[a-z0-9']+", " ".join(line_split[1:]))
    return msg_type, set(words)


if __name__ == '__main__':
    with open('data/data.txt', 'r') as f:
        messages = []
        for line in f:
            msg_type, words = tokenize(line)
            messages.append({'msg_type': msg_type, 'words': words})

    # 75/25 train/test split
    training_set = messages[:int(len(messages) * 0.75)]
    testing_set = messages[int(len(messages) * 0.75):]

    bayes = BayesClassifier()
    bayes.train(training_set)
    classified = bayes.classify(testing_set)

    # confusion-matrix counts, with 'spam' as the positive class
    true_positive = len([
        1 for message in classified
        if message['msg_type'] == 'spam' and message['prob_spam'] > 0.5
    ])
    false_positive = len([
        1 for message in classified
        if message['msg_type'] == 'ham' and message['prob_spam'] > 0.5
    ])
    true_negative = len([
        1 for message in classified
        if message['msg_type'] == 'ham' and message['prob_spam'] <= 0.5
    ])
    false_negative = len([
        1 for message in classified
        if message['msg_type'] == 'spam' and message['prob_spam'] <= 0.5
    ])
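    # From the four counts above, precision and recall follow directly
    # (a minimal follow-up sketch; guard the denominators if a class is empty)
    precision = true_positive / (true_positive + false_positive)
    recall = true_positive / (true_positive + false_negative)
    print('precision: {:.3f}, recall: {:.3f}'.format(precision, recall))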
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys

from preprocess import Preprocessor
from features import FeatureSelector
from bayes import BayesClassifier

if __name__ == '__main__':
    train_file = sys.argv[1]
    test_file = sys.argv[2]

    # build the vocabulary and category list from the training data
    pr = Preprocessor()
    pr.build_vocabulary_and_categories(train_file)

    # run feature selection on the training data
    fs = FeatureSelector(train_file, ck=500)
    fs.select_features()

    # train and evaluate a Bernoulli naive Bayes model
    bc = BayesClassifier(train_file, test_file, model='bernoulli')
    bc.train()
    bc.test()
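
# The companion BayesClassifier is not shown here. For reference, a
# self-contained sketch of the Bernoulli naive Bayes decision rule that the
# 'bernoulli' model name presumably refers to; every name below is
# illustrative, not the module's actual API.
import math

def bernoulli_nb_log_score(doc_words, vocabulary, prior, word_prob):
    """Log-score one class: every vocabulary word contributes,
    whether it is present in the document or absent from it."""
    score = math.log(prior)
    for w in vocabulary:
        p = word_prob[w]  # smoothed P(word present | class)
        score += math.log(p) if w in doc_words else math.log(1.0 - p)
    return score

# toy usage with made-up class priors and word probabilities
vocab = {'free', 'meeting', 'winner'}
spam_p = {'free': 0.8, 'meeting': 0.1, 'winner': 0.7}
ham_p = {'free': 0.2, 'meeting': 0.6, 'winner': 0.05}
doc = {'free', 'winner'}
print(bernoulli_nb_log_score(doc, vocab, 0.4, spam_p) >
      bernoulli_nb_log_score(doc, vocab, 0.6, ham_p))  # True -> classified as spam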