class Feature_Chooser:
    """Build and prune a word vocabulary used as boolean features.

    Typical flow, as far as this class shows: ``choose`` collects every
    distinct word from the two directories, ``train`` fits the boolean model
    and keeps only the features whose two fitted parameters differ by more
    than ``self.threshold``, and ``pickle`` writes the surviving feature list
    to disk.
    """

    def __init__(self):
        # Ordered list of candidate feature words (order defines vector index).
        self.features = []
        self.model = Boolean_Model()
        # Minimum absolute gap between an attribute's two parameters for the
        # corresponding feature to be kept by ``train``.
        self.threshold = .03

    def choose(self, spam_dir, ham_dir):
        """Collect every distinct word found in both directories.

        Words are appended to ``self.features`` in first-seen order (spam
        files first, then ham files) and the resulting list is handed to the
        model via ``set_features``.
        """
        # A set mirrors the list so the duplicate check is O(1) instead of a
        # linear scan of an ever-growing list; the list keeps the ordering.
        seen = set(self.features)
        for directory in (spam_dir, ham_dir):
            for f in get_files(directory):
                for word in munge_All_Words(f):
                    if word not in seen:
                        seen.add(word)
                        self.features.append(word)
        print(len(self.features))
        self.model.set_features(self.features)
        print("finished choosing features")

    def train(self, spam_dir, ham_dir):
        """Fit the model on both directories, then prune weak features.

        Spam files are observed with label 1 and ham files with label 0.
        After ``build_network`` each feature is kept only if the absolute
        difference between its two entries in ``model.attribute_params``
        exceeds ``self.threshold``; ``self.features`` is replaced by the
        surviving list.
        """
        N = 0
        for f in get_files(spam_dir):
            N += 1
            if N % 23 == 0:
                print(N)
            self.model.observe_example(self.munge(f), 1)
        for f in get_files(ham_dir):
            N += 1
            # NOTE(review): unlike the spam loop, the counter is printed for
            # every ham file (and twice on multiples of 23). This looks like
            # leftover debug output; kept to preserve the existing output.
            print(N)
            if N % 23 == 0:
                print(N)
            self.model.observe_example(self.munge(f), 0)
        self.model.build_network()

        new_features = []
        for i, attribute in enumerate(self.model.attribute_params):
            gap = attribute[1] - attribute[0]
            # Single formatted string so the output is "attribute gap" on
            # both Python 2 and Python 3.
            print("%s %s" % (attribute, gap))
            if abs(gap) > self.threshold:
                new_features.append(self.features[i])
        print(new_features)
        self.features = new_features
        print("finished training")

    def munge(self, email_file):
        """Return a 0/1 vector marking which features occur in ``email_file``.

        The file is read in binary mode and split on runs of non-word
        characters; entry ``k`` is 1 when ``self.features[k]`` is one of the
        resulting tokens, otherwise 0.
        """
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(email_file, 'rb') as f:
            text = f.read()
        # A set makes each per-feature membership test O(1).
        words = set(re.split(r'\W+', text))
        return [int(token in words) for token in self.features]

    def pickle(self, features_file):
        """Serialize ``self.features`` to ``features_file`` with ``pickle``."""
        # Closing the file explicitly ensures the pickled data is flushed to
        # disk before this method returns.
        with open(features_file, 'wb') as output:
            pickle.dump(self.features, output)
        print(self.features)
        print("pickled")
class NB_Boolean(NaiveBayesModel):
    """Naive Bayes classifier over boolean word-presence features.

    Relies on ``self.features`` (the ordered feature words) being provided
    outside this class, presumably by ``NaiveBayesModel`` -- TODO confirm.
    """

    def classify(self, example, cost_ratio):
        """Classify a boolean feature vector.

        Two log-likelihoods are accumulated: the first starts from
        ``log(base_param)`` and uses entry 0 of each attribute's parameters,
        the second starts from ``log(1 - base_param)`` and uses entry 1.
        For a feature equal to 1 the parameter itself is used, otherwise its
        complement.

        Returns 1 when the first log-likelihood, reduced by
        ``log(cost_ratio)``, is strictly greater than the second; else 0.
        ``math.log`` raises ``ValueError`` if any probability involved is 0.
        """
        log_likelihood1 = math.log(self.model.base_param)
        log_likelihood2 = math.log(1 - self.model.base_param)
        for i, params in enumerate(self.model.attribute_params):
            first = params[0]
            second = params[1]
            if example[i] == 1:
                log_likelihood1 += math.log(first)
                log_likelihood2 += math.log(second)
            else:
                log_likelihood1 += math.log(1 - first)
                log_likelihood2 += math.log(1 - second)
        return int(log_likelihood1 - math.log(cost_ratio) > log_likelihood2)

    def train(self, spam_dir, ham_dir):
        """Fit a fresh ``Boolean_Model`` on the given directories.

        Files from ``spam_dir`` are observed with label 1 and files from
        ``ham_dir`` with label 0, after which ``build_network`` is called.
        The running file count is printed before each file is processed.
        """
        self.model = Boolean_Model()
        self.model.set_features(self.features)
        N = 0
        for f in get_files(spam_dir):
            print(N)
            N += 1
            self.model.observe_example(self.munge(f), 1)
        for f in get_files(ham_dir):
            print(N)
            N += 1
            self.model.observe_example(self.munge(f), 0)
        self.model.build_network()
        print("finished training")

    def munge(self, email_file):
        """Return a 0/1 vector marking which features occur in ``email_file``.

        The file is read in binary mode and split on runs of non-word
        characters; entry ``k`` is 1 when ``self.features[k]`` is one of the
        resulting tokens, otherwise 0.
        """
        # ``with`` guarantees the handle is closed even if reading fails.
        with open(email_file, 'rb') as f:
            text = f.read()
        # A set makes each per-feature membership test O(1).
        words = set(re.split(r'\W+', text))
        return [int(token in words) for token in self.features]