示例#1
0
  def test_ngrams(self):
    """Bigrams of a three-word string begin with a NUL-padded pair."""
    bigrams = Tokenizer.ngram("quick brown fox", 2)

    # The first pair is padded on the left with the NUL character.
    self.assertEqual(bigrams, [
      [u'\u0000', "quick"],
      ["quick", "brown"],
      ["brown", "fox"],
    ])
示例#2
0
    def test_ngrams(self):
        """Check the 2-grams produced for 'quick brown fox'."""
        # Leading pair: NUL padding followed by the first word.
        padded_pair = [u'\u0000', 'quick']
        wanted = [padded_pair, ['quick', 'brown'], ['brown', 'fox']]

        self.assertEqual(Tokenizer.ngram('quick brown fox', 2), wanted)
  def test_ngrams(self):
    """Tokenizer.ngram with n=2 returns NUL-padded word pairs in order."""
    source_text = "quick brown fox"
    window = 2
    result = Tokenizer.ngram(source_text, window)

    self.assertEqual(
        result,
        [[u'\u0000', "quick"], ["quick", "brown"], ["brown", "fox"]],
    )
示例#4
0
    def train(self):
        """
        Consume every pending ``(category, path)`` pair in ``self.to_train``.

        Each file is opened in binary mode, wrapped in an ``EmailObject`` and
        tokenised. Every token increments the per-category token count in
        ``self.training`` as well as the ``'_all'`` and per-category entries
        of ``self.totals``. The pending collection is emptied afterwards, so
        calling ``train`` again without new entries does nothing.
        """
        for category, path in self.to_train:
            # Close the handle deterministically instead of leaking one per
            # training file. It stays open while the body is tokenised in
            # case EmailObject reads from it lazily.
            with io.open(path, 'rb') as handle:
                email = EmailObject(handle)

                self.categories.add(category)

                for token in Tokenizer.unique_tokenizer(email.body()):
                    self.training[category][token] += 1
                    self.totals['_all'] += 1
                    self.totals[category] += 1

        # Reset to an empty container (type kept as in the original code).
        self.to_train = {}
示例#5
0
    def score(self, email):
        """
        Compute an unnormalised score per category for ``email``.

        ``self.train()`` is called first. Each category starts at the ratio
        of its total to the ``'_all'`` total and is then multiplied, for every
        token of the email body, by ``(count + 1) / (category_total + 1)``.

        :param email: object exposing ``body()``
        :return: dict mapping each category to its score
        """
        self.train()

        totals = self.totals

        scores = {}
        for label in self.categories:
            scores[label] = totals[label] / totals['_all']

        for word in Tokenizer.unique_tokenizer(email.body()):
            for label in self.categories:
                seen = self.training[label][word]
                # Add-one on both sides keeps unseen tokens from zeroing the product.
                scores[label] *= (seen + 1) / (totals[label] + 1)

        return scores
    def score(self, email):
        """
        Score an email against every known category.

        Runs ``self.train()`` first, then seeds each category with
        ``totals[category] / totals['_all']`` and multiplies in
        ``(count + 1) / (totals[category] + 1)`` for each token of the body.
        The scores are not normalised.

        :param email: EmailObject (only ``body()`` is used here)
        :return: dict mapping category to its score
        """
        self.train()

        totals = self.totals
        categories = self.categories

        result = dict(
            (name, totals[name] / totals['_all']) for name in categories
        )

        for token in Tokenizer.unique_tokenizer(email.body()):
            for name in categories:
                count = self.training[name][token]
                result[name] *= (count + 1) / (totals[name] + 1)

        return result
示例#7
0
  def test_downcasing(self):
    """Upper-case input is tokenized into lower-case words."""
    tokens = Tokenizer.tokenize("THIS IS ALL CAPS")
    self.assertEqual(tokens, ["this", "is", "all", "caps"])
示例#8
0
    def test_cowncasting(self):
        """Tokenizing an all-caps string yields lower-case tokens."""
        # NOTE(review): the method name looks like a typo of
        # "test_downcasing"; left unchanged so test selection by name still works.
        lowered = ['this', 'is', 'all', 'caps']

        self.assertEqual(Tokenizer.tokenize('THIS IS ALL CAPS'), lowered)
  def test_downcasing(self):
    """Tokenizer.tokenize lower-cases every word of its input."""
    raw = "THIS IS ALL CAPS"
    wanted = ["this", "is", "all", "caps"]

    self.assertEqual(Tokenizer.tokenize(raw), wanted)