def test_corpus_token_counts_split_with_max_lines(self):
    # Counts the corpus with newline splitting enabled and a 5-line cap, then
    # checks that u"slept" was counted while u"Mitch" was not.
    # NOTE(review): presumably u"Mitch" only occurs past the fifth line of
    # the fixture corpus -- confirm against the fixture setup.
    options = dict(corpus_max_lines=5, split_on_newlines=True)
    counts = tokenizer.corpus_token_counts(self.corpus_path, **options)

    self.assertIn(u"slept", counts)
    self.assertNotIn(u"Mitch", counts)
def test_corpus_token_counts_no_split_with_max_lines(self):
    # Counts the corpus with newline splitting disabled and a 5-line cap.
    # Checks that u"slept" is present, u"Mitch" is absent, and that the
    # newline-bearing tokens listed below carry exactly these counts.
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=5, split_on_newlines=False)

    self.assertIn(u"slept", token_counts)
    self.assertNotIn(u"Mitch", token_counts)

    expected = {
        u".\n\n": 1,
        u"\n": 2,
        u".\n": 1,
    }
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12. Compare against the projection of the result onto the expected
    # keys instead; a missing token shows up as None in the failure diff.
    observed = {token: token_counts.get(token) for token in expected}
    self.assertEqual(expected, observed)
def test_corpus_token_counts_split_on_newlines(self):
    # Counts the corpus with newline splitting enabled and corpus_max_lines=0.
    # Checks the exact counts of the tokens listed below and that no
    # newline-bearing token (u".\n\n", u"\n") appears in the result.
    # NOTE(review): corpus_max_lines=0 presumably means "no line limit" --
    # confirm against tokenizer.corpus_token_counts.
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=0, split_on_newlines=True)

    expected = {
        u"'": 2,
        u".": 2,
        u". ": 1,
        u"... ": 1,
        u"Groucho": 1,
        u"Marx": 1,
        u"Mitch": 1,
        u"Hedberg": 1,
        u"I": 3,
        u"in": 2,
        u"my": 2,
        u"pajamas": 2,
    }
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12. Compare against the projection of the result onto the expected
    # keys instead; a missing token shows up as None in the failure diff.
    observed = {token: token_counts.get(token) for token in expected}
    self.assertEqual(expected, observed)

    self.assertNotIn(u".\n\n", token_counts)
    self.assertNotIn(u"\n", token_counts)
def test_corpus_token_counts_no_split_on_newlines(self):
    # Counts the corpus with newline splitting disabled and
    # corpus_max_lines=0, then checks that the newline-bearing tokens below
    # carry exactly these counts.
    # NOTE(review): corpus_max_lines=0 presumably means "no line limit" --
    # confirm against tokenizer.corpus_token_counts.
    token_counts = tokenizer.corpus_token_counts(
        self.corpus_path, corpus_max_lines=0, split_on_newlines=False)

    expected = {
        u".\n\n": 2,
        u"\n": 3,
    }
    # assertDictContainsSubset was deprecated in Python 3.2 and removed in
    # 3.12. Compare against the projection of the result onto the expected
    # keys instead; a missing token shows up as None in the failure diff.
    observed = {token: token_counts.get(token) for token in expected}
    self.assertEqual(expected, observed)