def test_set_delim_set(self):
    """Verify that set_delim_set swaps in a new delimiter set and that
    subsequent tokenization reflects the replacement."""
    tokenizer = DelimiterTokenizer(['*', '.'])
    # Initial delimiters behave as configured at construction time.
    self.assertSetEqual(tokenizer.get_delim_set(), {'*', '.'})
    self.assertEqual(tokenizer.tokenize('ab cd*ef.*bb. gg.'),
                     ['ab cd', 'ef', 'bb', ' gg'])
    # Replacing the delimiter set reports success ...
    self.assertEqual(tokenizer.set_delim_set({'..', 'ab'}), True)
    self.assertSetEqual(tokenizer.get_delim_set(), {'..', 'ab'})
    # ... and tokenization now splits on the new delimiters.
    self.assertEqual(tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                     [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
def test_set_return_set(self):
    """Verify that set_return_set toggles duplicate elimination in the
    token list, in both directions."""
    tokenizer = DelimiterTokenizer(set(['..', 'ab']))
    # Default: duplicates are kept (return_set is False).
    self.assertEqual(tokenizer.get_return_set(), False)
    self.assertEqual(tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                     [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
    # Switch to set semantics: repeated tokens collapse to one occurrence.
    self.assertEqual(tokenizer.set_return_set(True), True)
    self.assertEqual(tokenizer.get_return_set(), True)
    self.assertEqual(tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                     [' cd', 'ef', 'bb', 'gg', 'gh'])
    # Switch back: duplicates reappear.
    self.assertEqual(tokenizer.set_return_set(False), True)
    self.assertEqual(tokenizer.get_return_set(), False)
    self.assertEqual(tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
                     [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
class DelimiterTokenizerTestCases(unittest.TestCase):
    """Unit tests for DelimiterTokenizer: tokenization with various
    delimiter sets, the return_set flag, accessors, mutators, and
    rejection of invalid inputs."""

    def setUp(self):
        # Tokenizers exercised by the tests below, covering the default
        # delimiter, single- and multi-character delimiters, list vs. set
        # construction (with a duplicate entry), and return_set=True.
        self.delim_tok1 = DelimiterTokenizer()
        self.delim_tok2 = DelimiterTokenizer(set([',']))
        self.delim_tok3 = DelimiterTokenizer(set(['*', '.']))
        self.delim_tok4 = DelimiterTokenizer(set(['..', 'ab']))
        self.delim_tok4_list = DelimiterTokenizer(['..', 'ab', '..'])
        self.delim_tok4_return_set = DelimiterTokenizer(set(['..', 'ab']),
                                                        return_set=True)

    def test_delimiter_valid(self):
        """Tokenization with each configured tokenizer on valid input."""
        self.assertEqual(self.delim_tok1.tokenize('data science'),
                         ['data', 'science'])
        self.assertEqual(self.delim_tok2.tokenize('data,science'),
                         ['data', 'science'])
        # A comma tokenizer leaves space-separated text intact.
        self.assertEqual(self.delim_tok2.tokenize('data science'),
                         ['data science'])
        self.assertEqual(self.delim_tok3.tokenize('ab cd*ef.*bb. gg.'),
                         ['ab cd', 'ef', 'bb', ' gg'])
        self.assertEqual(
            self.delim_tok4.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        # List construction (with a duplicated delimiter) matches the
        # set-constructed tokenizer's behavior.
        self.assertEqual(
            self.delim_tok4_list.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        # With return_set=True, duplicate tokens are dropped.
        self.assertEqual(
            self.delim_tok4_return_set.tokenize(
                'ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', 'gh'])

    def test_get_return_set(self):
        """get_return_set reflects the value given at construction."""
        self.assertEqual(self.delim_tok4.get_return_set(), False)
        self.assertEqual(self.delim_tok4_return_set.get_return_set(), True)

    def test_get_delim_set(self):
        """get_delim_set returns the configured delimiters, deduplicated."""
        self.assertSetEqual(self.delim_tok1.get_delim_set(), {' '})
        self.assertSetEqual(self.delim_tok3.get_delim_set(), {'*', '.'})
        self.assertSetEqual(self.delim_tok4_list.get_delim_set(),
                            {'..', 'ab'})

    def test_set_return_set(self):
        """set_return_set toggles duplicate elimination in both directions."""
        tokenizer = DelimiterTokenizer(set(['..', 'ab']))
        self.assertEqual(tokenizer.get_return_set(), False)
        self.assertEqual(
            tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])
        self.assertEqual(tokenizer.set_return_set(True), True)
        self.assertEqual(tokenizer.get_return_set(), True)
        self.assertEqual(
            tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', 'gh'])
        self.assertEqual(tokenizer.set_return_set(False), True)
        self.assertEqual(tokenizer.get_return_set(), False)
        self.assertEqual(
            tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])

    def test_set_delim_set(self):
        """set_delim_set replaces the delimiters and changes tokenization."""
        tokenizer = DelimiterTokenizer(['*', '.'])
        self.assertSetEqual(tokenizer.get_delim_set(), {'*', '.'})
        self.assertEqual(tokenizer.tokenize('ab cd*ef.*bb. gg.'),
                         ['ab cd', 'ef', 'bb', ' gg'])
        self.assertEqual(tokenizer.set_delim_set({'..', 'ab'}), True)
        self.assertSetEqual(tokenizer.get_delim_set(), {'..', 'ab'})
        self.assertEqual(
            tokenizer.tokenize('ab cd..efabbb....ggab cd..efabgh'),
            [' cd', 'ef', 'bb', 'gg', ' cd', 'ef', 'gh'])

    @raises(TypeError)
    def test_delimiter_invalid1(self):
        # A non-string delimiter must be rejected at construction.
        invalid_delim_tok = DelimiterTokenizer(set([',', 10]))

    @raises(TypeError)
    def test_delimiter_invalid2(self):
        # None is not a tokenizable input.
        self.delim_tok1.tokenize(None)

    @raises(TypeError)
    def test_delimiter_invalid3(self):
        # A non-string input must be rejected.
        self.delim_tok1.tokenize(99)