Example #1
 def tokenize(self, text):
     # One Token per whitespace-separated chunk; see the helper sketch below.
     return [Token(text, Span(s, e)) for s, e in _white_space_spans(text)]
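
The snippet above relies on names that are not defined in this section: Token, Span, and _white_space_spans. Below is a minimal sketch of shapes they could plausibly have, assuming half-open character spans over the original string; these are assumptions for illustration, not the original definitions.

 import re
 from typing import Iterator, NamedTuple, Tuple

 class Span(NamedTuple):
     start: int
     end: int

 class Token(NamedTuple):
     text: str   # the full source string the token was cut from
     span: Span  # half-open [start, end) character range into text

     @property
     def surface(self) -> str:
         # The token's own string, e.g. 'Hello' for Span(0, 5) of 'Hello world'.
         return self.text[self.span.start:self.span.end]

 def _white_space_spans(text: str) -> Iterator[Tuple[int, int]]:
     # Yield one (start, end) pair per maximal run of non-whitespace characters.
     for match in re.finditer(r'\S+', text):
         yield match.start(), match.end()

Under these assumptions, tokenize('Hello world') returns two tokens whose surfaces are 'Hello' and 'world'.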
Example #2
 def test_infix(self):
     text = 'Hel?lo my 55!5! dear,est (world?'
     #       01234567890123456789012345678901
     res = self._tokz.tokenize(text)
     self.assertEqual(res, [Token(text, Span(0, 3)),
                            Token(text, Span(3, 4)),
                            Token(text, Span(4, 6)),
                            Token(text, Span(7, 9)),
                            Token(text, Span(10, 12)),
                            Token(text, Span(12, 13)),
                            Token(text, Span(13, 14)),
                            Token(text, Span(14, 15)),
                            Token(text, Span(16, 20)),
                            Token(text, Span(20, 21)),
                            Token(text, Span(21, 24)),
                            Token(text, Span(25, 26)),
                            Token(text, Span(26, 31)),
                            Token(text, Span(31, 32))
                            ])
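
Note that the expected output here splits punctuation away from adjacent word characters ('Hel', '?', 'lo', ...), so the self._tokz used in this test must be an infix-aware tokenizer rather than the plain whitespace tokenizer from Example #1. A sketch of a span generator that would produce exactly these spans follows; the name _infix_spans is hypothetical.

 import re

 def _infix_spans(text):
     # First isolate whitespace-separated chunks, then split each chunk into
     # runs of word characters and single punctuation marks, e.g.
     # 'Hel?lo' -> (0, 3), (3, 4), (4, 6).
     for chunk in re.finditer(r'\S+', text):
         for piece in re.finditer(r'\w+|[^\w\s]', chunk.group()):
             yield chunk.start() + piece.start(), chunk.start() + piece.end()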
Example #3
 def test_singleword(self):
     text = 'This'
     tokenz = self._tokz.tokenize(text)
     self.assertEqual(tokenz[0], Token(text, Span(0, 4)))
Example #4
 def test_empty(self):
     text = ''
     tokenz = self._tokz.tokenize(text)
     # An empty string contains no tokens, so the result must be an empty list;
     # indexing tokenz[0] here would raise IndexError.
     self.assertEqual(tokenz, [])
Example #5
    def tokenize(self, text):
        return [Token(text, Span(s, e)) for s, e in _white_space_spans(text)]