import unittest

# Import path assumed from the surrounding project layout.
from cornac.data.text import BaseTokenizer, DEFAULT_PRE_RULES


class TestBaseTokenizer(unittest.TestCase):

    def setUp(self):
        self.tok = BaseTokenizer()

    def test_init(self):
        self.assertEqual(self.tok.sep, ' ')

    def test_tokenize(self):
        tokens = self.tok.tokenize('a b c')
        self.assertListEqual(tokens, ['a', 'b', 'c'])

    def test_batch_tokenize(self):
        token_list = self.tok.batch_tokenize(['a b c', 'd e f'])
        self.assertListEqual(token_list, [['a', 'b', 'c'], ['d', 'e', 'f']])

    def test_default_rules(self):
        tok = BaseTokenizer(pre_rules=DEFAULT_PRE_RULES)
        token_list = tok.tokenize('<t>a</t> B |{ C ]?&$ d123 E')
        self.assertListEqual(token_list, ['a', 'b', 'c', 'd', 'e'])

    def test_stopwords(self):
        text = 'this is a nice house'

        tok = BaseTokenizer(stop_words='english')
        self.assertListEqual(tok.tokenize(text), ['nice', 'house'])

        tok = BaseTokenizer(stop_words=['is', 'a'])
        self.assertListEqual(tok.tokenize(text), ['this', 'nice', 'house'])

        # The original try/except passed even when no exception was raised;
        # assertRaises makes the expected failure explicit.
        with self.assertRaises(ValueError):
            BaseTokenizer(stop_words='vietnamese')
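# Standard unittest entry point so the file runs standalone (assumed; not in
# the original snippet).
if __name__ == '__main__':
    unittest.main()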