def tokenize(self, text):
    # Build one Token per whitespace-delimited span of the input text.
    return [Token(text, Span(s, e)) for s, e in _white_space_spans(text)]
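# NOTE: _white_space_spans is referenced above but not defined in this
# section. A minimal sketch, assuming it yields one (start, end) pair per
# maximal run of non-whitespace characters (name and contract inferred
# from the call site):
import re

def _white_space_spans(text):
    # Yields nothing for the empty string, so tokenize('') returns [].
    return ((m.start(), m.end()) for m in re.finditer(r'\S+', text))

# Note that test_infix below exercises self._tokz, which splits punctuation
# out of words ('Hel', '?', 'lo'); plain whitespace spans alone would keep
# 'Hel?lo' as a single token, so that tokenizer presumably applies infix
# rules on top of the whitespace split.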
def test_infix(self):
    text = 'Hel?lo my 55!5! dear,est (world?'
    #       01234567890123456789012345678901
    res = self._tokz.tokenize(text)
    self.assertEqual(res, [
        Token(text, Span(0, 3)),    # 'Hel'
        Token(text, Span(3, 4)),    # '?'
        Token(text, Span(4, 6)),    # 'lo'
        Token(text, Span(7, 9)),    # 'my'
        Token(text, Span(10, 12)),  # '55'
        Token(text, Span(12, 13)),  # '!'
        Token(text, Span(13, 14)),  # '5'
        Token(text, Span(14, 15)),  # '!'
        Token(text, Span(16, 20)),  # 'dear'
        Token(text, Span(20, 21)),  # ','
        Token(text, Span(21, 24)),  # 'est'
        Token(text, Span(25, 26)),  # '('
        Token(text, Span(26, 31)),  # 'world'
        Token(text, Span(31, 32)),  # '?'
    ])
def test_singleword(self):
    text = 'This'
    tokenz = self._tokz.tokenize(text)
    self.assertEqual(tokenz[0], Token(text, Span(0, 4)))
def test_empty(self):
    # An empty string has no spans, so no tokens should be produced.
    text = ''
    tokenz = self._tokz.tokenize(text)
    self.assertEqual(tokenz, [])
def tokenize(self, text):
    # Base-class default: concrete tokenizers must override this method.
    raise NotImplementedError('Default tokenizer method not implemented')
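# NOTE: Token and Span are also undefined in this section. A minimal
# sketch, assuming both are plain value types; frozen dataclasses supply
# the value-based __eq__ that the assertEqual checks above depend on
# (field names are assumptions):
from dataclasses import dataclass

@dataclass(frozen=True)
class Span:
    start: int
    end: int

@dataclass(frozen=True)
class Token:
    text: str  # the full source string the token was cut from
    span: Span  # the token's character range within that string

# Example: with the sketched helpers, tokenizing 'hi there' would yield
# [Token('hi there', Span(0, 2)), Token('hi there', Span(3, 8))].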