Exemplo n.º 1
 def __init__(self, tokenizer=None, cache_path='/tmp/'):
     """
     Creates an empty classifier.

     :param tokenizer: A tokenizer override
     :type tokenizer: function (optional)
     :param cache_path: path to data storage
     :type cache_path: str
     """
     self.categories = BayesCategories()
     # Fall back to the built-in tokenizer when no override is supplied
     self.tokenizer = tokenizer or SimpleBayes.tokenize_text
     self.cache_path = cache_path
     # Filled in later with per-category probability data
     self.probabilities = {}
Exemplo n.º 2
    def test_persist_cache(self, dump_mock, open_mock):
        """
        cache_persist() should open the cache file for binary writing and
        pickle the current categories into it.

        NOTE(review): dump_mock / open_mock are presumably injected by
        mock.patch decorators that are not part of this snippet - confirm.
        """
        open_mock.return_value = 'opened'

        categories = BayesCategories()
        categories.categories = {'foo': 'bar'}

        sb = SimpleBayes()
        sb.cache_path = '/tmp/'
        sb.categories = categories
        # Exercise the code under test; without this call neither mock is
        # ever invoked and both assertions below fail.
        sb.cache_persist()

        open_mock.assert_called_once_with('/tmp/_simplebayes.pickle', 'wb')
        dump_mock.assert_called_once_with(categories, 'opened')
Exemplo n.º 3
    def test_persist_cache(self, dump_mock, open_mock):
        """
        cache_persist() should open the cache file for binary writing and
        pickle the current categories into it.

        NOTE(review): dump_mock / open_mock are presumably injected by
        mock.patch decorators that are not part of this snippet - confirm.
        """
        open_mock.return_value = 'opened'

        categories = BayesCategories()
        categories.categories = {'foo': 'bar'}

        sb = SimpleBayes()
        sb.cache_path = '/tmp/'
        sb.categories = categories
        # Exercise the code under test; without this call neither mock is
        # ever invoked and both assertions below fail.
        sb.cache_persist()

        open_mock.assert_called_once_with('/tmp/_simplebayes.pickle', 'wb')
        dump_mock.assert_called_once_with(categories, 'opened')
Exemplo n.º 4
    def test_cache_train(self, exists_mock, load_mock, open_mock, calc_mock):
        """
        cache_train() should read the pickled categories from the cache file
        and install them on the classifier.

        NOTE(review): the four mocks are presumably injected by mock.patch
        decorators that are not part of this snippet - confirm.
        """
        categories = BayesCategories()
        categories.categories = {'foo': 'bar'}

        load_mock.return_value = categories
        open_mock.return_value = 'opened'
        exists_mock.return_value = True

        sb = SimpleBayes(cache_path='foo')
        # The constructor does not load the cache by itself; trigger the
        # load explicitly so the assertions below have something to verify.
        sb.cache_train()

        open_mock.assert_called_once_with('foo/_simplebayes.pickle', 'rb')

        self.assertEqual(sb.categories, categories)
Exemplo n.º 5
    def test_cache_train(self, exists_mock, load_mock, open_mock, calc_mock):
        """
        cache_train() should read the pickled categories from the cache file
        and install them on the classifier.

        NOTE(review): the four mocks are presumably injected by mock.patch
        decorators that are not part of this snippet - confirm.
        """
        categories = BayesCategories()
        categories.categories = {'foo': 'bar'}

        load_mock.return_value = categories
        open_mock.return_value = 'opened'
        exists_mock.return_value = True

        sb = SimpleBayes(cache_path='foo')
        # The constructor does not load the cache by itself; trigger the
        # load explicitly so the assertions below have something to verify.
        sb.cache_train()

        open_mock.assert_called_once_with('foo/_simplebayes.pickle', 'rb')

        self.assertEqual(sb.categories, categories)
Exemplo n.º 6
 def __init__(self, tokenizer=None, cache_path='/tmp/'):
     """
     Creates an empty classifier.

     :param tokenizer: A tokenizer override
     :type tokenizer: function (optional)
     :param cache_path: path to data storage
     :type cache_path: str
     """
     self.categories = BayesCategories()
     # Fall back to the built-in tokenizer when no override is supplied
     self.tokenizer = tokenizer or SimpleBayes.tokenize_text
     self.cache_path = cache_path
     # Filled in later with per-category probability data
     self.probabilities = {}
Exemplo n.º 7
 def __init__(self,
              tokenizer=None,
              cache_path='/tmp/',
              cache_data=None,
              cache_in_memory=False):
     """
     Creates an empty classifier, optionally backed by an in-memory cache.

     NOTE(review): the parameter list was truncated in this snippet. The
     parameter names are taken from the docstring and the assignments
     below; the defaults for cache_data and cache_in_memory are assumed -
     confirm against the real signature.

     :param tokenizer: A tokenizer override
     :type tokenizer: function (optional)
     :param cache_path: path to data storage
     :type cache_path: str
     :param cache_data: from an existing cache
     :type cache_data: pickle.dumps object
     :param cache_in_memory: True if the persistent cache is kept in memory
     :type cache_in_memory: boolean
     """
     self.categories = BayesCategories()
     # Fall back to the built-in tokenizer when no override is supplied
     self.tokenizer = tokenizer or SimpleBayes.tokenize_text
     self.cache_path = cache_path
     self.cache_data = cache_data
     # Supplying cache_data forces in-memory caching regardless of the flag
     self.cache_in_memory = True if cache_data is not None else cache_in_memory
     self.probabilities = {}
Exemplo n.º 8
 def test_get_categories(self):
     """get_categories() should return a value equal to ``categories``."""
     registry = BayesCategories()
     returned = registry.get_categories()
     self.assertEqual(returned, registry.categories)
Exemplo n.º 9
 def test_get_category(self):
     """get_category() should return the BayesCategory stored under a name."""
     bc = BayesCategories()
     # The category must exist first; looking up an unknown name raises
     # KeyError rather than creating it.
     bc.add_category('foo')
     self.assertIsInstance(bc.get_category('foo'), BayesCategory)
Exemplo n.º 10
 def test_add_category(self):
     """add_category() should register a BayesCategory under the given name."""
     bc = BayesCategories()
     # Exercise the method under test; a fresh registry has no 'foo' entry.
     bc.add_category('foo')
     self.assertIn('foo', bc.categories)
     self.assertIsInstance(bc.categories['foo'], BayesCategory)
Exemplo n.º 11
 def flush(self):
     """
     Deletes all tokens & categories
     """
     # Swapping in a fresh container drops every trained category at once
     self.categories = BayesCategories()
Exemplo n.º 12
class SimpleBayes(object):
    """A memory-based, optional-persistence naïve bayesian text classifier."""

    # File name of the pickle cache, appended to ``cache_path``
    cache_file = '_simplebayes.pickle'

    def __init__(self, tokenizer=None, cache_path='/tmp/'):
        """
        Creates an empty classifier.

        :param tokenizer: A tokenizer override
        :type tokenizer: function (optional)
        :param cache_path: path to data storage
        :type cache_path: str
        """
        self.categories = BayesCategories()
        # Fall back to the built-in tokenizer when no override is supplied
        self.tokenizer = tokenizer or SimpleBayes.tokenize_text
        self.cache_path = cache_path
        # category name -> {'prc': ..., 'prnc': ...}; refreshed by
        # calculate_category_probability()
        self.probabilities = {}

    @classmethod
    def tokenize_text(cls, text):
        """
        Default tokenize method; can be overridden

        :param text: the text we want to tokenize
        :type text: str
        :return: list of tokenized text
        :rtype: list
        """
        # Whitespace split; tokens of one or two characters are discarded
        return [w for w in text.split() if len(w) > 2]

    @classmethod
    def count_token_occurrences(cls, words):
        """
        Creates a key/value set of word/count for a given sample of text

        :param words: full list of all tokens, non-unique
        :type words: list
        :return: key/value pairs of words and their counts in the list
        :rtype: dict
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def flush(self):
        """
        Deletes all tokens & categories
        """
        self.categories = BayesCategories()

    def calculate_category_probability(self):
        """
        Caches the individual probabilities for each category
        """
        total_tally = 0.0
        probs = {}
        for category, bayes_category in \
                self.categories.get_categories().items():
            count = bayes_category.get_tally()
            total_tally += count
            probs[category] = count

        # Calculating the probability
        for category, count in probs.items():
            if total_tally > 0:
                probs[category] = float(count) / float(total_tally)
            else:
                # Nothing trained yet: avoid dividing by zero
                probs[category] = 0.0

        # Loop-invariant, so computed once instead of once per category
        probability_sum = sum(probs.values())

        for category, probability in probs.items():
            self.probabilities[category] = {
                # Probability that any given token is of this category
                'prc': probability,
                # Probability that any given token is not of this category
                'prnc': probability_sum - probability
            }

    def train(self, category, text):
        """
        Trains a category with a sample of text

        :param category: the name of the category we want to train
        :type category: str
        :param text: the text we want to train the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            # First sample for this category: create it on demand
            bayes_category = self.categories.add_category(category)

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.train_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def untrain(self, category, text):
        """
        Untrains a category with a sample of text

        :param category: the name of the category we want to train
        :type category: str
        :param text: the text we want to untrain the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            # Unknown category: there is nothing to untrain
            return

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.untrain_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def classify(self, text):
        """
        Chooses the highest scoring category for a sample of text

        :param text: sample text to classify
        :type text: str
        :return: the "winning" category, or None when nothing scored
        :rtype: str
        """
        score = self.score(text)
        if not score:
            return None
        # Ascending sort by score; the last entry is the winner
        return sorted(score.items(), key=lambda v: v[1])[-1][0]

    def score(self, text):
        """
        Scores a sample of text

        :param text: sample text to score
        :type text: str
        :return: dict of scores per category
        :rtype: dict
        """
        occurs = self.count_token_occurrences(self.tokenizer(text))
        scores = {}
        for category in self.categories.get_categories().keys():
            scores[category] = 0

        categories = self.categories.get_categories().items()

        for word, count in occurs.items():
            token_scores = {}

            # Adding up individual token scores
            # NOTE(review): get_token_count is assumed to be the
            # per-token tally accessor on the category object - confirm.
            for category, bayes_category in categories:
                token_scores[category] = \
                    float(bayes_category.get_token_count(word))

            # We use this to get token-in-category probabilities
            token_tally = sum(token_scores.values())

            # If this token isn't found anywhere its probability is 0
            if token_tally == 0.0:
                continue

            # Calculating bayes probability for this token
            # http://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
            for category, token_score in token_scores.items():
                # Bayes probability * the number of occurrences of this token
                scores[category] += count * \
                    self.calculate_bayesian_probability(
                        category,
                        token_score,
                        token_tally
                    )

        # Removing empty categories from the results
        final_scores = {}
        for category, score in scores.items():
            if score > 0:
                final_scores[category] = score

        return final_scores

    def calculate_bayesian_probability(self, cat, token_score, token_tally):
        """
        Calculates the bayesian probability for a given token/category

        :param cat: The category we're scoring for this token
        :type cat: str
        :param token_score: The tally of this token for this category
        :type token_score: float
        :param token_tally: The tally total for this token from all categories
        :type token_tally: float
        :return: bayesian probability
        :rtype: float
        """
        # P that any given token IS in this category
        prc = self.probabilities[cat]['prc']
        # P that any given token is NOT in this category
        prnc = self.probabilities[cat]['prnc']
        # P that this token is NOT of this category
        prtnc = (token_tally - token_score) / token_tally
        # P that this token IS of this category
        prtc = token_score / token_tally

        # Assembling the parts of the bayes equation
        numerator = (prtc * prc)
        denominator = (numerator + (prtnc * prnc))

        # Returning the calculated bayes probability unless the denom. is 0
        return numerator / denominator if denominator != 0.0 else 0.0

    def tally(self, category):
        """
        Gets the tally for a requested category

        :param category: The category we want a tally for
        :type category: str
        :return: tally for a given category (0 for an unknown category)
        :rtype: int
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            return 0

        return bayes_category.get_tally()

    def get_cache_location(self):
        """
        Gets the location of the cache file

        :return: the location of the cache file
        :rtype: string
        """
        # Ensure exactly one '/' separates the directory and the file name
        directory = self.cache_path
        if not directory.endswith('/'):
            directory += '/'
        return directory + self.cache_file

    def cache_persist(self):
        """
        Saves the current trained data to the cache.
        This is initiated by the program using this module
        """
        filename = self.get_cache_location()
        # NOTE(review): the file handle is never closed explicitly. The
        # accompanying unit tests stub open() with a plain string, so moving
        # to a ``with`` block requires updating those tests in the same change.
        pickle.dump(self.categories, open(filename, 'wb'))

    def cache_train(self):
        """
        Loads the data for this classifier from a cache file

        :return: whether or not we were successful
        :rtype: bool
        """
        filename = self.get_cache_location()

        if not os.path.exists(filename):
            return False

        # SECURITY NOTE(review): unpickling executes arbitrary code embedded
        # in the file, and the default cache_path is '/tmp/'. Only point
        # cache_path at a location that untrusted users cannot write to.
        categories = pickle.load(open(filename, 'rb'))

        assert isinstance(categories, BayesCategories), \
            "Cache data is either corrupt or invalid"

        self.categories = categories

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

        return True
Exemplo n.º 13
 def flush(self):
     """
     Deletes all tokens & categories
     """
     # Swapping in a fresh container drops every trained category at once
     self.categories = BayesCategories()
Exemplo n.º 14
class SimpleBayes(object):
    """A memory-based, optional-persistence naïve bayesian text classifier."""

    # File name of the pickle cache, appended to ``cache_path``
    cache_file = '_simplebayes.pickle'

    def __init__(self, tokenizer=None, cache_path='/tmp/'):
        """
        Creates an empty classifier.

        :param tokenizer: A tokenizer override
        :type tokenizer: function (optional)
        :param cache_path: path to data storage
        :type cache_path: str
        """
        self.categories = BayesCategories()
        # Fall back to the built-in tokenizer when no override is supplied
        self.tokenizer = tokenizer or SimpleBayes.tokenize_text
        self.cache_path = cache_path
        # category name -> {'prc': ..., 'prnc': ...}; refreshed by
        # calculate_category_probability()
        self.probabilities = {}

    @classmethod
    def tokenize_text(cls, text):
        """
        Default tokenize method; can be overridden

        :param text: the text we want to tokenize
        :type text: str
        :return: list of tokenized text
        :rtype: list
        """
        # Whitespace split; tokens of one or two characters are discarded
        return [w for w in text.split() if len(w) > 2]

    @classmethod
    def count_token_occurrences(cls, words):
        """
        Creates a key/value set of word/count for a given sample of text

        :param words: full list of all tokens, non-unique
        :type words: list
        :return: key/value pairs of words and their counts in the list
        :rtype: dict
        """
        counts = {}
        for word in words:
            counts[word] = counts.get(word, 0) + 1
        return counts

    def flush(self):
        """
        Deletes all tokens & categories
        """
        self.categories = BayesCategories()

    def calculate_category_probability(self):
        """
        Caches the individual probabilities for each category
        """
        total_tally = 0.0
        probs = {}
        for category, bayes_category in \
                self.categories.get_categories().items():
            count = bayes_category.get_tally()
            total_tally += count
            probs[category] = count

        # Calculating the probability
        for category, count in probs.items():
            if total_tally > 0:
                probs[category] = float(count) / float(total_tally)
            else:
                # Nothing trained yet: avoid dividing by zero
                probs[category] = 0.0

        # Loop-invariant, so computed once instead of once per category
        probability_sum = sum(probs.values())

        for category, probability in probs.items():
            self.probabilities[category] = {
                # Probability that any given token is of this category
                'prc': probability,
                # Probability that any given token is not of this category
                'prnc': probability_sum - probability
            }

    def train(self, category, text):
        """
        Trains a category with a sample of text

        :param category: the name of the category we want to train
        :type category: str
        :param text: the text we want to train the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            # First sample for this category: create it on demand
            bayes_category = self.categories.add_category(category)

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.train_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def untrain(self, category, text):
        """
        Untrains a category with a sample of text

        :param category: the name of the category we want to train
        :type category: str
        :param text: the text we want to untrain the category with
        :type text: str
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            # Unknown category: there is nothing to untrain
            return

        tokens = self.tokenizer(str(text))
        occurrence_counts = self.count_token_occurrences(tokens)

        for word, count in occurrence_counts.items():
            bayes_category.untrain_token(word, count)

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

    def classify(self, text):
        """
        Chooses the highest scoring category for a sample of text

        :param text: sample text to classify
        :type text: str
        :return: the "winning" category, or None when nothing scored
        :rtype: str
        """
        score = self.score(text)
        if not score:
            return None
        # Ascending sort by score; the last entry is the winner
        return sorted(score.items(), key=lambda v: v[1])[-1][0]

    def score(self, text):
        """
        Scores a sample of text

        :param text: sample text to score
        :type text: str
        :return: dict of scores per category
        :rtype: dict
        """
        occurs = self.count_token_occurrences(self.tokenizer(text))
        scores = {}
        for category in self.categories.get_categories().keys():
            scores[category] = 0

        categories = self.categories.get_categories().items()

        for word, count in occurs.items():
            token_scores = {}

            # Adding up individual token scores
            # NOTE(review): get_token_count is assumed to be the
            # per-token tally accessor on the category object - confirm.
            for category, bayes_category in categories:
                token_scores[category] = \
                    float(bayes_category.get_token_count(word))

            # We use this to get token-in-category probabilities
            token_tally = sum(token_scores.values())

            # If this token isn't found anywhere its probability is 0
            if token_tally == 0.0:
                continue

            # Calculating bayes probability for this token
            # http://en.wikipedia.org/wiki/Naive_Bayes_spam_filtering
            for category, token_score in token_scores.items():
                # Bayes probability * the number of occurrences of this token
                scores[category] += count * \
                    self.calculate_bayesian_probability(
                        category,
                        token_score,
                        token_tally
                    )

        # Removing empty categories from the results
        final_scores = {}
        for category, score in scores.items():
            if score > 0:
                final_scores[category] = score

        return final_scores

    def calculate_bayesian_probability(self, cat, token_score, token_tally):
        """
        Calculates the bayesian probability for a given token/category

        :param cat: The category we're scoring for this token
        :type cat: str
        :param token_score: The tally of this token for this category
        :type token_score: float
        :param token_tally: The tally total for this token from all categories
        :type token_tally: float
        :return: bayesian probability
        :rtype: float
        """
        # P that any given token IS in this category
        prc = self.probabilities[cat]['prc']
        # P that any given token is NOT in this category
        prnc = self.probabilities[cat]['prnc']
        # P that this token is NOT of this category
        prtnc = (token_tally - token_score) / token_tally
        # P that this token IS of this category
        prtc = token_score / token_tally

        # Assembling the parts of the bayes equation
        numerator = (prtc * prc)
        denominator = (numerator + (prtnc * prnc))

        # Returning the calculated bayes probability unless the denom. is 0
        return numerator / denominator if denominator != 0.0 else 0.0

    def tally(self, category):
        """
        Gets the tally for a requested category

        :param category: The category we want a tally for
        :type category: str
        :return: tally for a given category (0 for an unknown category)
        :rtype: int
        """
        try:
            bayes_category = self.categories.get_category(category)
        except KeyError:
            return 0

        return bayes_category.get_tally()

    def get_cache_location(self):
        """
        Gets the location of the cache file

        :return: the location of the cache file
        :rtype: string
        """
        # Ensure exactly one '/' separates the directory and the file name
        directory = self.cache_path
        if not directory.endswith('/'):
            directory += '/'
        return directory + self.cache_file

    def cache_persist(self):
        """
        Saves the current trained data to the cache.
        This is initiated by the program using this module
        """
        filename = self.get_cache_location()
        # NOTE(review): the file handle is never closed explicitly. The
        # accompanying unit tests stub open() with a plain string, so moving
        # to a ``with`` block requires updating those tests in the same change.
        pickle.dump(self.categories, open(filename, 'wb'))

    def cache_train(self):
        """
        Loads the data for this classifier from a cache file

        :return: whether or not we were successful
        :rtype: bool
        """
        filename = self.get_cache_location()

        if not os.path.exists(filename):
            return False

        # SECURITY NOTE(review): unpickling executes arbitrary code embedded
        # in the file, and the default cache_path is '/tmp/'. Only point
        # cache_path at a location that untrusted users cannot write to.
        categories = pickle.load(open(filename, 'rb'))

        assert isinstance(categories, BayesCategories), \
            "Cache data is either corrupt or invalid"

        self.categories = categories

        # Updating our per-category overall probabilities
        self.calculate_category_probability()

        return True