def train(self):
    """
    Consume every pending ``(category, path)`` pair in ``self.to_train``.

    For each pair the file is opened in binary mode, wrapped in an
    ``EmailObject``, and its body is passed through
    ``Tokenizer.unique_tokenizer``. Every yielded token increments
    ``self.training[category][token]`` as well as the ``'_all'`` and
    per-category entries of ``self.totals``. The category is recorded in
    ``self.categories``. Afterwards ``self.to_train`` is reset so that a
    repeated call does no further work.

    :return: None
    """
    for category, path in self.to_train:
        # Keep the handle open until the body has been fully tokenized (the
        # EmailObject may read lazily), then close it deterministically
        # instead of leaking one descriptor per training file.
        with io.open(path, 'rb') as handle:
            email = EmailObject(handle)
            self.categories.add(category)
            # NOTE(review): totals are advanced once per token here, which
            # matches their use as denominators for token counts in score()
            # -- confirm against the intended layout.
            for token in Tokenizer.unique_tokenizer(email.body()):
                self.training[category][token] += 1
                self.totals['_all'] += 1
                self.totals[category] += 1
    # Reset to the same empty value the code has always used; iterating it
    # yields nothing, so subsequent train() calls are no-ops.
    self.to_train = {}
def score(self, email):
    """
    Compute a score for ``email`` in every known category.

    Pending training data is consumed first via ``self.train()``. Each
    category starts from its share of ``self.totals['_all']`` and is then
    multiplied, for every token produced from ``email.body()``, by
    ``(count + 1) / (category total + 1)``.

    NOTE(review): this source defines ``score`` a second time right after
    this definition; if both live in the same class body, the later one
    replaces this one.

    :param email: object exposing ``body()``; presumably an EmailObject
    :return: dict mapping each category in ``self.categories`` to its score
    :raises ZeroDivisionError: with plain Python numbers, when categories
        exist but ``self.totals['_all']`` is zero
    """
    self.train()

    cat_totals = self.totals
    aggregates = {
        cat: cat_totals[cat] / cat_totals['_all']
        for cat in self.categories
    }

    for token in Tokenizer.unique_tokenizer(email.body()):
        for cat in self.categories:
            # Use .get() rather than subscripting: if the per-category table
            # is a defaultdict, a [] lookup would insert a zero entry for
            # every unseen token and scoring would silently grow the model.
            value = self.training[cat].get(token, 0)
            # The +1 terms keep an unseen token from zeroing the product.
            r = (value + 1) / (cat_totals[cat] + 1)
            aggregates[cat] *= r

    return aggregates
def score(self, email):
    """
    Calculates a score for the email in every known category.

    Pending training data is consumed first via ``self.train()``. Each
    category's score starts as ``totals[category] / totals['_all']`` and is
    multiplied by ``(count + 1) / (totals[category] + 1)`` for every token
    produced from ``email.body()``, where ``count`` is the stored training
    count of that token in that category (0 when unseen).

    NOTE(review): an earlier definition of ``score`` with the same body
    precedes this one in the source; if both are in the same class body,
    this later definition is the one that takes effect.

    :param email: EmailObject (anything exposing ``body()`` is used the same way)
    :return: dict mapping each category in ``self.categories`` to its score
    :raises ZeroDivisionError: with plain Python numbers, when categories
        exist but ``self.totals['_all']`` is zero
    """
    self.train()

    cat_totals = self.totals
    aggregates = {
        cat: cat_totals[cat] / cat_totals['_all']
        for cat in self.categories
    }

    for token in Tokenizer.unique_tokenizer(email.body()):
        for cat in self.categories:
            # .get() avoids the defaultdict side effect of a [] lookup, which
            # would add a zero-count entry to the training table for every
            # token that was never seen during training.
            value = self.training[cat].get(token, 0)
            # Add-one on numerator and denominator so an unseen token scales
            # the score down instead of collapsing it to zero.
            r = (value + 1) / (cat_totals[cat] + 1)
            aggregates[cat] *= r

    return aggregates