def train(self, training_data=None):
    """(Re)build per-category word counts and per-word class-conditional
    probabilities from every document seen so far.

    Args:
        training_data: optional iterable of ``(category, document)`` pairs;
            appended to ``self.docs`` before training.

    Side effects:
        Mutates ``self.docs``, ``self.word_counts``, ``self.vocabulary`` and
        ``self.probabilities`` in place.
    """
    if training_data:
        self.docs.extend(training_data)

    # Rebuild all counts from scratch: the loop below walks the FULL
    # self.docs list, so accumulating on top of a previous call would
    # double-count every already-trained document. Clear in place so any
    # external references to these dicts stay valid.
    for counts in self.word_counts.values():
        counts.clear()
    self.vocabulary.clear()

    for category, doc in self.docs:
        for word in th.tokenize(doc):
            self.word_counts[category][word] = self.word_counts[category].get(word, 0.0) + 1
            self.vocabulary[word] = self.vocabulary.get(word, 0.0) + 1

    # Hoist per-category totals out of the vocabulary loop; the original
    # recomputed sum(...) for every (word, category) pair.
    totals = {c: sum(counts.values()) for c, counts in self.word_counts.items()}
    for word in self.vocabulary:
        # P(word | category) = count(word in c) / total words in c.
        self.probabilities[word] = {
            c: self.word_counts[c].get(word, 0.0) / totals[c]
            for c in self.word_counts
        }
def classify(self, sentence):
    """Return the most probable category for *sentence*.

    Falls back to the string ``'unknown'`` when no category receives any
    probability mass (e.g. every token is unseen).
    """
    categories = list(self.word_counts)
    # Row of zeros used for tokens that never appeared during training.
    zero_row = {c: 0.0 for c in categories}
    token_rows = [
        self.probabilities.get(token, zero_row)
        for token in th.tokenize(sentence)
    ]

    # Prior for each class is proportional to its vocabulary size.
    prior_denom = sum(len(self.word_counts[c]) for c in categories)

    numerators = {}
    for c in categories:
        # Zero likelihoods are skipped rather than zeroing the product.
        likelihoods = [row[c] for row in token_rows if row[c] != 0.0]
        likelihood = self.__product(likelihoods) if likelihoods else 0.0
        numerators[c] = likelihood * (len(self.word_counts[c]) / prior_denom)

    evidence = sum(numerators.values())
    if not evidence:
        return 'unknown'
    # Dividing by the (positive) evidence does not change the argmax.
    return max(categories, key=lambda c: numerators[c] / evidence)