def __init__(self, tokenizer=None, cache_path='/tmp/'):
    """
    Sets up a classifier with a new category store and no cached
    probabilities.

    :param tokenizer: callable used to split text into tokens; any falsy
        value selects ``SimpleBayes.tokenize_text``
    :type tokenizer: function (optional)
    :param cache_path: directory in which the cache file is stored
    :type cache_path: str
    """
    self.categories = BayesCategories()

    # Fall back to the built-in tokenizer when no override was supplied
    if not tokenizer:
        tokenizer = SimpleBayes.tokenize_text
    self.tokenizer = tokenizer

    self.cache_path = cache_path
    self.probabilities = {}
def test_persist_cache(self, dump_mock, open_mock):
    """cache_persist() should dump the categories to the cache file."""
    open_mock.return_value = 'opened'

    trained = BayesCategories()
    trained.categories = {'foo': 'bar'}

    classifier = SimpleBayes()
    classifier.cache_path = '/tmp/'
    classifier.categories = trained

    classifier.cache_persist()

    # The cache file is opened for binary writing inside the cache path...
    open_mock.assert_called_once_with('/tmp/_simplebayes.pickle', 'wb')
    # ...and whatever open() returned is handed to the dump call together
    # with the category store.
    dump_mock.assert_called_once_with(trained, 'opened')
def test_cache_train(self, exists_mock, load_mock, open_mock, calc_mock):
    """cache_train() should load the categories from the cache file."""
    cached = BayesCategories()
    cached.categories = {'foo': 'bar'}

    load_mock.return_value = cached
    open_mock.return_value = 'opened'
    exists_mock.return_value = True

    classifier = SimpleBayes(cache_path='foo')
    classifier.cache_train()

    # A trailing slash is appended to the cache path before the file name
    expected_file = 'foo/_simplebayes.pickle'
    exists_mock.assert_called_once_with(expected_file)
    open_mock.assert_called_once_with(expected_file, 'rb')
    load_mock.assert_called_once_with('opened')

    # Probabilities are recalculated once the categories are replaced
    calc_mock.assert_called_once_with()
    self.assertEqual(classifier.categories, cached)
def __init__(self, tokenizer=None, cache_path='/tmp/', cache_data=None,
             cache_in_memory=False):
    """
    Sets up a classifier with a new category store and no cached
    probabilities.

    :param tokenizer: callable used to split text into tokens; any falsy
        value selects ``SimpleBayes.tokenize_text``
    :type tokenizer: function (optional)
    :param cache_path: path to data storage
    :type cache_path: str
    :param cache_data: data from an existing cache; stored unchanged on
        the instance
    :type cache_data: pickle.dumps object
    :param cache_in_memory: True if the persistent cache is kept in
        memory; forced to True whenever ``cache_data`` is supplied
    :type cache_in_memory: bool
    """
    self.categories = BayesCategories()

    # Fall back to the built-in tokenizer when no override was supplied
    if not tokenizer:
        tokenizer = SimpleBayes.tokenize_text
    self.tokenizer = tokenizer

    self.cache_path = cache_path
    self.cache_data = cache_data
    # Supplying cache data implies the in-memory cache, whatever the flag
    self.cache_in_memory = cache_data is not None or cache_in_memory
    self.probabilities = {}
def test_get_categories(self):
    """get_categories() should return the store's category mapping."""
    store = BayesCategories()
    store.add_category('foo')

    returned = store.get_categories()
    self.assertEqual(returned, store.categories)
def test_get_category(self):
    """get_category() should return a BayesCategory for a known name."""
    store = BayesCategories()
    store.add_category('foo')

    fetched = store.get_category('foo')
    self.assertIsInstance(fetched, BayesCategory)
def test_add_category(self):
    """add_category() should register a BayesCategory under its name."""
    store = BayesCategories()
    store.add_category('foo')

    registered = store.categories
    self.assertIn('foo', registered)
    self.assertIsInstance(registered['foo'], BayesCategory)
def flush(self):
    """
    Drops all tokens & categories by swapping in a newly constructed
    category store
    """
    fresh_store = BayesCategories()
    self.categories = fresh_store
class SimpleBayes(object):
    """A memory-based, optional-persistence naïve bayesian text classifier."""

    # File name of the pickle cache, created inside ``cache_path``
    cache_file = '_simplebayes.pickle'

    def __init__(self, tokenizer=None, cache_path='/tmp/'):
        """
        :param tokenizer: A tokenizer override
        :type tokenizer: function (optional)
        :param cache_path: path to data storage
        :type cache_path: str
        """
        self.categories = BayesCategories()
        self.tokenizer = tokenizer or SimpleBayes.tokenize_text
        self.cache_path = cache_path
        self.probabilities = {}

    @classmethod
    def tokenize_text(cls, text):
        """
        Default tokenize method; can be overridden

        Splits on whitespace and drops tokens of two characters or fewer.

        :param text: the text we want to tokenize
        :type text: str
        :return: list of tokenized text
        :rtype: list
        """
        return [w for w in text.split() if len(w) > 2]

    @classmethod
    def count_token_occurrences(cls, words):
        """
        Creates a key/value set of word/count for a given sample of text

        :param words: full list of all tokens, non-unique
        :type words: list
        :return: key/value pairs of words and their counts in the list
        :rtype: dict
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def flush(self):
        """
        Deletes all tokens & categories
        """
        self.categories = BayesCategories()

    def calculate_category_probability(self):
        """
        Caches the individual probabilities for each category

        Entries are written into ``self.probabilities`` keyed by category
        name; entries for categories that are no longer present are left
        untouched.
        """
        total_tally = 0.0
        probs = {}
        for category, bayes_category in \
                self.categories.get_categories().items():
            count = bayes_category.get_tally()
            total_tally += count
            probs[category] = count

        # Calculating the probability (only values are replaced, so the
        # dict can safely be updated while iterating over it)
        for category, count in probs.items():
            if total_tally > 0:
                probs[category] = float(count) / float(total_tally)
            else:
                probs[category] = 0.0

        # The sum is the same for every category, so compute it once
        # rather than once per category.
        probability_sum = sum(probs.values())

        for category, probability in probs.items():
            self.probabilities[category] = {
                # Probability that any given token is of this category
                'prc': probability,
                # Probability that any given token is not of this category
                'prnc': probability_sum - probability
            }

    def train(self, category, text):
        """
        Trains a category with a sample of text

        The category is created if it does not exist yet.

        :param category: the name of the category we want to train
        :type category: str
        :param text: the text we want to train the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            bayes_category = self.categories.add_category(category)

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.train_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def untrain(self, category, text):
        """
        Untrains a category with a sample of text

        Does nothing when the category is unknown.

        :param category: the name of the category we want to untrain
        :type category: str
        :param text: the text we want to untrain the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            return

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.untrain_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def classify(self, text):
        """
        Chooses the highest scoring category for a sample of text

        :param text: sample text to classify
        :type text: str
        :return: the "winning" category, or None when nothing scored
        :rtype: str
        """
        score = self.score(text)
        if not score:
            return None
        # A stable sort keeps the original tie-breaking: among equal
        # scores the category that comes last in iteration order wins.
        return sorted(score.items(), key=lambda v: v[1])[-1][0]

    def score(self, text):
        """
        Scores a sample of text

        :param text: sample text to score
        :type text: str
        :return: dict of scores per category; categories whose score is
            not greater than zero are omitted
        :rtype: dict
        """
        # The text is coerced with str() exactly like train()/untrain() do,
        # so the same input is tokenized the same way everywhere.
        occurs = self.count_token_occurrences(self.tokenizer(str(text)))

        categories = self.categories.get_categories()
        scores = dict.fromkeys(categories, 0)

        for word, count in occurs.items():
            token_scores = {}

            # Adding up individual token scores
            for category, bayes_category in categories.items():
                token_scores[category] = \
                    float(bayes_category.get_token_count(word))

            # We use this to get token-in-category probabilities
            token_tally = sum(token_scores.values())

            # If this token isn't found anywhere its probability is 0
            if token_tally == 0.0:
                continue

            # Calculating bayes probability for this token
            # http://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
            for category, token_score in token_scores.items():
                # Bayes probability * the number of occurrences of this token
                scores[category] += count * \
                    self.calculate_bayesian_probability(
                        category,
                        token_score,
                        token_tally
                    )

        # Removing empty categories from the results
        return {
            category: score
            for category, score in scores.items()
            if score > 0
        }

    def calculate_bayesian_probability(self, cat, token_score, token_tally):
        """
        Calculates the bayesian probability for a given token/category

        :param cat: The category we're scoring for this token
        :type cat: str
        :param token_score: The tally of this token for this category
        :type token_score: float
        :param token_tally: The tally total for this token from all categories
        :type token_tally: float
        :return: bayesian probability
        :rtype: float
        """
        # P that any given token IS in this category
        prc = self.probabilities[cat]['prc']
        # P that any given token is NOT in this category
        prnc = self.probabilities[cat]['prnc']
        # P that this token is NOT of this category
        prtnc = (token_tally - token_score) / token_tally
        # P that this token IS of this category
        prtc = token_score / token_tally

        # Assembling the parts of the bayes equation
        numerator = (prtc * prc)
        denominator = (numerator + (prtnc * prnc))

        # Returning the calculated bayes probability unless the denom. is 0
        return numerator / denominator if denominator != 0.0 else 0.0

    def tally(self, category):
        """
        Gets the tally for a requested category

        :param category: The category we want a tally for
        :type category: str
        :return: tally for a given category, 0 when the category is unknown
        :rtype: int
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            return 0

        return bayes_category.get_tally()

    def get_cache_location(self):
        """
        Gets the location of the cache file

        :return: the location of the cache file
        :rtype: string
        """
        directory = self.cache_path
        # Make sure exactly one separator sits between path and file name
        if not directory.endswith('/'):
            directory += '/'
        return directory + self.cache_file

    def cache_persist(self):
        """
        Saves the current trained data to the cache.
        This is initiated by the program using this module
        """
        filename = self.get_cache_location()
        # NOTE(review): the handle is never closed explicitly. The result of
        # open() is handed to pickle untouched on purpose: the unit tests
        # patch open() to return a plain string, so wrapping this in a
        # ``with`` block needs a matching change to those tests.
        pickle.dump(self.categories, open(filename, 'wb'))

    def cache_train(self):
        """
        Loads the data for this classifier from a cache file

        :return: whether or not we were successful
        :rtype: bool
        :raises AssertionError: when the cache does not hold a
            BayesCategories instance
        """
        filename = self.get_cache_location()

        if not os.path.exists(filename):
            return False

        # SECURITY: unpickling can execute arbitrary code; only point
        # cache_path at locations that untrusted parties cannot write to.
        # NOTE(review): handle left un-managed for the same reason as in
        # cache_persist().
        categories = pickle.load(open(filename, 'rb'))

        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged.
        if not isinstance(categories, BayesCategories):
            raise AssertionError("Cache data is either corrupt or invalid")

        self.categories = categories

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

        return True
class SimpleBayes(object):
    """A memory-based, optional-persistence naïve bayesian text classifier."""

    # File name of the pickle cache, created inside ``cache_path``
    cache_file = '_simplebayes.pickle'

    def __init__(self, tokenizer=None, cache_path='/tmp/'):
        """
        :param tokenizer: A tokenizer override
        :type tokenizer: function (optional)
        :param cache_path: path to data storage
        :type cache_path: str
        """
        self.categories = BayesCategories()
        self.tokenizer = tokenizer or SimpleBayes.tokenize_text
        self.cache_path = cache_path
        self.probabilities = {}

    @classmethod
    def tokenize_text(cls, text):
        """
        Default tokenize method; can be overridden

        Splits on whitespace and drops tokens of two characters or fewer.

        :param text: the text we want to tokenize
        :type text: str
        :return: list of tokenized text
        :rtype: list
        """
        return [w for w in text.split() if len(w) > 2]

    @classmethod
    def count_token_occurrences(cls, words):
        """
        Creates a key/value set of word/count for a given sample of text

        :param words: full list of all tokens, non-unique
        :type words: list
        :return: key/value pairs of words and their counts in the list
        :rtype: dict
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def flush(self):
        """
        Deletes all tokens & categories
        """
        self.categories = BayesCategories()

    def calculate_category_probability(self):
        """
        Caches the individual probabilities for each category

        Entries are written into ``self.probabilities`` keyed by category
        name; entries for categories that are no longer present are left
        untouched.
        """
        total_tally = 0.0
        probs = {}
        for category, bayes_category in \
                self.categories.get_categories().items():
            count = bayes_category.get_tally()
            total_tally += count
            probs[category] = count

        # Calculating the probability (only values are replaced, so the
        # dict can safely be updated while iterating over it)
        for category, count in probs.items():
            if total_tally > 0:
                probs[category] = float(count) / float(total_tally)
            else:
                probs[category] = 0.0

        # The sum is the same for every category, so compute it once
        # rather than once per category.
        probability_sum = sum(probs.values())

        for category, probability in probs.items():
            self.probabilities[category] = {
                # Probability that any given token is of this category
                'prc': probability,
                # Probability that any given token is not of this category
                'prnc': probability_sum - probability
            }

    def train(self, category, text):
        """
        Trains a category with a sample of text

        The category is created if it does not exist yet.

        :param category: the name of the category we want to train
        :type category: str
        :param text: the text we want to train the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            bayes_category = self.categories.add_category(category)

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.train_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def untrain(self, category, text):
        """
        Untrains a category with a sample of text

        Does nothing when the category is unknown.

        :param category: the name of the category we want to untrain
        :type category: str
        :param text: the text we want to untrain the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            return

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.untrain_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def classify(self, text):
        """
        Chooses the highest scoring category for a sample of text

        :param text: sample text to classify
        :type text: str
        :return: the "winning" category, or None when nothing scored
        :rtype: str
        """
        score = self.score(text)
        if not score:
            return None
        # A stable sort keeps the original tie-breaking: among equal
        # scores the category that comes last in iteration order wins.
        return sorted(score.items(), key=lambda v: v[1])[-1][0]

    def score(self, text):
        """
        Scores a sample of text

        :param text: sample text to score
        :type text: str
        :return: dict of scores per category; categories whose score is
            not greater than zero are omitted
        :rtype: dict
        """
        # The text is coerced with str() exactly like train()/untrain() do,
        # so the same input is tokenized the same way everywhere.
        occurs = self.count_token_occurrences(self.tokenizer(str(text)))

        categories = self.categories.get_categories()
        scores = dict.fromkeys(categories, 0)

        for word, count in occurs.items():
            token_scores = {}

            # Adding up individual token scores
            for category, bayes_category in categories.items():
                token_scores[category] = \
                    float(bayes_category.get_token_count(word))

            # We use this to get token-in-category probabilities
            token_tally = sum(token_scores.values())

            # If this token isn't found anywhere its probability is 0
            if token_tally == 0.0:
                continue

            # Calculating bayes probability for this token
            # http://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
            for category, token_score in token_scores.items():
                # Bayes probability * the number of occurrences of this token
                scores[category] += count * \
                    self.calculate_bayesian_probability(
                        category,
                        token_score,
                        token_tally
                    )

        # Removing empty categories from the results
        return {
            category: score
            for category, score in scores.items()
            if score > 0
        }

    def calculate_bayesian_probability(self, cat, token_score, token_tally):
        """
        Calculates the bayesian probability for a given token/category

        :param cat: The category we're scoring for this token
        :type cat: str
        :param token_score: The tally of this token for this category
        :type token_score: float
        :param token_tally: The tally total for this token from all categories
        :type token_tally: float
        :return: bayesian probability
        :rtype: float
        """
        # P that any given token IS in this category
        prc = self.probabilities[cat]['prc']
        # P that any given token is NOT in this category
        prnc = self.probabilities[cat]['prnc']
        # P that this token is NOT of this category
        prtnc = (token_tally - token_score) / token_tally
        # P that this token IS of this category
        prtc = token_score / token_tally

        # Assembling the parts of the bayes equation
        numerator = (prtc * prc)
        denominator = (numerator + (prtnc * prnc))

        # Returning the calculated bayes probability unless the denom. is 0
        return numerator / denominator if denominator != 0.0 else 0.0

    def tally(self, category):
        """
        Gets the tally for a requested category

        :param category: The category we want a tally for
        :type category: str
        :return: tally for a given category, 0 when the category is unknown
        :rtype: int
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            return 0

        return bayes_category.get_tally()

    def get_cache_location(self):
        """
        Gets the location of the cache file

        :return: the location of the cache file
        :rtype: string
        """
        directory = self.cache_path
        # Make sure exactly one separator sits between path and file name
        if not directory.endswith('/'):
            directory += '/'
        return directory + self.cache_file

    def cache_persist(self):
        """
        Saves the current trained data to the cache.
        This is initiated by the program using this module
        """
        filename = self.get_cache_location()
        # NOTE(review): the handle is never closed explicitly. The result of
        # open() is handed to pickle untouched on purpose: the unit tests
        # patch open() to return a plain string, so wrapping this in a
        # ``with`` block needs a matching change to those tests.
        pickle.dump(self.categories, open(filename, 'wb'))

    def cache_train(self):
        """
        Loads the data for this classifier from a cache file

        :return: whether or not we were successful
        :rtype: bool
        :raises AssertionError: when the cache does not hold a
            BayesCategories instance
        """
        filename = self.get_cache_location()

        if not os.path.exists(filename):
            return False

        # SECURITY: unpickling can execute arbitrary code; only point
        # cache_path at locations that untrusted parties cannot write to.
        # NOTE(review): handle left un-managed for the same reason as in
        # cache_persist().
        categories = pickle.load(open(filename, 'rb'))

        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type is unchanged.
        if not isinstance(categories, BayesCategories):
            raise AssertionError("Cache data is either corrupt or invalid")

        self.categories = categories

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

        return True