def test(self):
    """Classify every testing document and report evaluation metrics.

    Returns a dict mapping each class to its 'precision', 'recall', 'f1'
    and 'features' (feature-set size), plus a 'global_accuracy' entry.
    """
    stats = dict.fromkeys(self.classes)
    success = 0.0
    errors = 0.0
    for k in stats:
        stats[k] = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
    start = time.time()
    for doc in self.testing_docs.find():
        # Skip documents without usable text.
        if not database.doc_has_data(doc):
            continue
        cls = self.classify(doc)
        if doc['field'] == cls:
            success += 1
            stats[cls]['tp'] += 1
            log.debug('Success (%s/%s)', doc['author'], doc['field'])
        else:
            errors += 1
            # doc['field'] is the correct label, cls is the wrong prediction.
            # The predicted class scored a false positive...
            stats[cls]['fp'] += 1
            # ...and the true class was missed: a false negative.
            stats[doc['field']]['fn'] += 1
            log.debug('Error (%s/%s)', doc['author'], doc['field'])
    log.info("Took %.3f secs to classify", time.time() - start)
    info_stats = {}
    for cls, stat in stats.iteritems():
        log.info('Class %s (tp=%d,fp=%d,fn=%d,tn=%d)', cls,
                 stat['tp'], stat['fp'], stat['fn'], stat['tn'])
        try:
            # All three ratios can divide by zero (class never predicted,
            # class absent from the test set, or both metrics zero), so
            # compute them all under one guard.
            precision = stat['tp'] / float(stat['tp'] + stat['fp'])
            recall = stat['tp'] / float(stat['tp'] + stat['fn'])
            f1 = 2.0 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            # Use float('nan'), not the string 'nan', so the %.3f log
            # formats below stay valid.
            precision = recall = f1 = float('nan')
        log.debug('\tPrecision=%.3f, Recall=%.3f, F1=%.3f',
                  precision, recall, f1)
        info_stats[cls] = {}
        info_stats[cls]['precision'] = precision
        info_stats[cls]['recall'] = recall
        info_stats[cls]['f1'] = f1
        info_stats[cls]['features'] = len(self.features)
    total = success + errors
    # Guard the empty-test-set case instead of dividing by zero.
    global_accuracy = success / total if total else float('nan')
    log.info('Global accuracy: %.2f %%', global_accuracy * 100.0)
    info_stats['global_accuracy'] = global_accuracy
    return info_stats
def test(self):
    """Evaluate the classifier on the testing collection.

    Returns a dict with per-class 'precision', 'recall', 'f1' and
    'features' entries, and a top-level 'global_accuracy'.
    """
    stats = dict.fromkeys(self.classes)
    success = 0.0
    errors = 0.0
    for k in stats:
        stats[k] = {'tp': 0, 'fp': 0, 'fn': 0, 'tn': 0}
    start = time.time()
    for doc in self.testing_docs.find():
        # Documents without usable text are ignored.
        if not database.doc_has_data(doc):
            continue
        cls = self.classify(doc)
        if doc['field'] == cls:
            success += 1
            stats[cls]['tp'] += 1
            log.debug('Success (%s/%s)', doc['author'], doc['field'])
        else:
            errors += 1
            # On a miss, the predicted class (cls) gets the false
            # positive and the true class (doc['field']) the false
            # negative — the previous accounting had them swapped.
            stats[cls]['fp'] += 1
            stats[doc['field']]['fn'] += 1
            log.debug('Error (%s/%s)', doc['author'], doc['field'])
    log.info("Took %.3f secs to classify", time.time() - start)
    info_stats = {}
    for cls, stat in stats.iteritems():
        log.info('Class %s (tp=%d,fp=%d,fn=%d,tn=%d)', cls,
                 stat['tp'], stat['fp'], stat['fn'], stat['tn'])
        try:
            # precision belongs inside the guard too: tp+fp can be zero
            # for a class that was never predicted.
            precision = stat['tp'] / float(stat['tp'] + stat['fp'])
            recall = stat['tp'] / float(stat['tp'] + stat['fn'])
            f1 = 2.0 * precision * recall / (precision + recall)
        except ZeroDivisionError:
            # float('nan') keeps the %.3f formatting below working; the
            # string 'nan' would raise at log time.
            precision = recall = f1 = float('nan')
        log.debug('\tPrecision=%.3f, Recall=%.3f, F1=%.3f',
                  precision, recall, f1)
        info_stats[cls] = {}
        info_stats[cls]['precision'] = precision
        info_stats[cls]['recall'] = recall
        info_stats[cls]['f1'] = f1
        info_stats[cls]['features'] = len(self.features)
    total = success + errors
    # Avoid ZeroDivisionError when the test collection is empty.
    global_accuracy = success / total if total else float('nan')
    log.info('Global accuracy: %.2f %%', global_accuracy * 100.0)
    info_stats['global_accuracy'] = global_accuracy
    return info_stats
def train(self):
    """
    Performs a naive-bayes on the given features.
    Fills out dictionaries prior and cond with prior probabilities
    P(c) of a class 'c' and conditional P(term|c).

    Both dictionaries store log-probabilities (numpy.log) so that
    classification can sum instead of multiply; conditionals use
    add-one (Laplace) smoothing. Results are stored on self.prior
    and self.cond.
    """
    docs = self.training_docs
    n = docs.count()
    prior = {}
    cond = {}
    # memo caches numpy.log per distinct smoothed value: many terms share
    # the same count, so this avoids recomputing identical logs.
    memo = {}
    log.debug('starting training on %d documents..', n)
    start_time = time.time()
    for cls in self.classes:
        # Compute prior probabilities
        nc = docs.find({'field': cls}).count()
        # Maximum Likelihood Estimate (MLE)
        # NOTE(review): nc == 0 would give log(0) = -inf here; presumably
        # every class has at least one training document — confirm.
        prior[cls] = numpy.log(nc / float(n))
        # Join all documents for faster counting
        textfile = StringIO.StringIO()
        for doc in docs.find({'field': cls}):
            if not database.doc_has_data(doc):
                continue
            # Tokens are re-joined into one space-separated byte string;
            # 'replace' avoids raising on unencodable characters.
            textfile.write((' '.join(tokenize(doc['data'])) + ' ').encode(
                'utf-8', 'replace'))
        # Count vocabulary occurences on joined documents
        nterm = dict.fromkeys(self.features, 0)
        for term in textfile.getvalue().split():
            if term in self.features:
                nterm[term] += 1
        # Precompute denominator for conditional prob. estimator
        # (total feature occurrences + vocabulary size: Laplace smoothing)
        base = float(sum(nterm.values()) + len(nterm))
        # Compute conditional probabilities
        for term in self.features:
            if term not in cond:
                cond[term] = {}
            # Add-one smoothed estimate of P(term|cls).
            val = (nterm[term] + 1) / base
            if val not in memo:
                memo[val] = numpy.log(val)
            cond[term][cls] = memo[val]
    log.info('finished training (took %.3f secs)', time.time() - start_time)
    self.prior = prior
    self.cond = cond
def train(self):
    """Fit the naive-Bayes model on the selected features.

    Populates self.prior with log P(c) for each class and self.cond
    with add-one-smoothed log P(term|c) for each feature/class pair.
    """
    collection = self.training_docs
    total_docs = collection.count()
    log_priors = {}
    conditionals = {}
    # Cache of numpy.log keyed by the smoothed probability value — many
    # terms share a count, so identical logs are computed only once.
    log_cache = {}
    log.debug('starting training on %d documents..', total_docs)
    start_time = time.time()
    for cls in self.classes:
        # Prior: maximum-likelihood estimate of P(cls), in log space.
        class_count = collection.find({'field': cls}).count()
        log_priors[cls] = numpy.log(class_count / float(total_docs))
        # Concatenate the class's documents into one buffer so the
        # vocabulary can be counted in a single pass.
        buf = StringIO.StringIO()
        for doc in collection.find({'field': cls}):
            if not database.doc_has_data(doc):
                continue
            joined = ' '.join(tokenize(doc['data'])) + ' '
            buf.write(joined.encode('utf-8', 'replace'))
        # Per-feature occurrence counts; counts shares its key set with
        # self.features, so membership tests against it are equivalent.
        counts = dict.fromkeys(self.features, 0)
        for token in buf.getvalue().split():
            if token in counts:
                counts[token] += 1
        # Laplace denominator: total occurrences plus vocabulary size.
        denominator = float(sum(counts.values()) + len(counts))
        for feature in self.features:
            smoothed = (counts[feature] + 1) / denominator
            if smoothed not in log_cache:
                log_cache[smoothed] = numpy.log(smoothed)
            conditionals.setdefault(feature, {})[cls] = log_cache[smoothed]
    log.info('finished training (took %.3f secs)', time.time() - start_time)
    self.prior = log_priors
    self.cond = conditionals
def select_features(self):
    """
    Select most frequent terms.

    Counts every term across the training documents and keeps those
    whose count reaches (max + mean) / self.size_divider. Returns a
    dict mapping each kept term to its count.
    """
    start_time = time.time()
    log.info('selecting features..')
    f = {}
    for doc in self.training_docs.find():
        if not database.doc_has_data(doc):
            continue
        for term in bag.bag_of_words(doc['data']):
            # Seed with 0, not 1: the original seeded new terms with 1
            # and then incremented, counting first occurrences twice.
            f.setdefault(term, 0)
            f[term] += 1
    # Guard the empty corpus: numpy.max([]) raises.
    if not f:
        log.info('selected %d terms (took %.3f secs)', 0,
                 time.time() - start_time)
        return {}
    counts = f.values()
    cut = (numpy.max(counts) + numpy.mean(counts)) / self.size_divider
    higher = dict(filter(lambda n: n[1] >= cut, f.iteritems()))
    log.info('selected %d terms (took %.3f secs)', len(higher),
             time.time() - start_time)
    return higher
def select_features(self):
    """
    Select most frequent terms.

    Counts every term across the training documents and keeps those
    whose count reaches (max + mean) / divider. Returns a dict mapping
    each kept term to its count.
    """
    start_time = time.time()
    log.info('selecting features..')
    f = {}
    for doc in self.training_docs.find():
        if not database.doc_has_data(doc):
            continue
        for term in bag.bag_of_words(doc['data']):
            # Seed with 0, not 1: the original seeded new terms with 1
            # and then incremented, counting first occurrences twice.
            f.setdefault(term, 0)
            f[term] += 1
    # Guard the empty corpus: numpy.max([]) raises.
    if not f:
        log.info('selected %d terms (took %.3f secs)', 0,
                 time.time() - start_time)
        return {}
    # Use the configurable divider when the instance defines one
    # (consistent with the sibling implementation); fall back to the
    # previous hard-coded 8.0 for backward compatibility.
    divider = getattr(self, 'size_divider', 8.0)
    counts = f.values()
    cut = (numpy.max(counts) + numpy.mean(counts)) / divider
    higher = dict(filter(lambda n: n[1] >= cut, f.iteritems()))
    log.info('selected %d terms (took %.3f secs)', len(higher),
             time.time() - start_time)
    return higher