def test_broken_example(self):
    """Smoke-test span_tokenize on a real message (example from Unbabel).

    The sample mixes English text with a Japanese reply header. Nothing is
    asserted about the returned spans: the test passes as long as the call
    completes without raising.
    """
    sample = u'''Dear Pinterest Help, I checked my account right now but it've had same problem…. Best regards, Shoichi Hirota 2015/12/18 18:02、 Pinterest Help < [email protected]> のメール:'''
    # Result intentionally discarded; only "does not raise" is being checked.
    ChineseTokenizor().span_tokenize(sample)
def test_canonical_cases(self):
    """Check span_tokenize against a table of small hand-written inputs.

    Each row pairs an input text with the list of [start, end] spans the
    tokenizer is expected to return. Every row runs inside its own subTest,
    keyed by the row index.
    """
    # NOTE(review): assuming spans are character offsets into the input,
    # several expectations do not fit the literals as written here (e.g.
    # [4, 6] for a 4-character text). The whitespace runs in these literals
    # may have been collapsed -- confirm against the intended fixtures.
    cases = [
        [u"A。B", [[0, 2], [2, 3]]],  # two sentences
        [u"A。", [[0, 2]]],  # one sentence
        [u"", []],  # empty string
        [u" ", []],  # whitespace-only string
        [u" A。 ", [[4, 6]]],  # one sentence with surrounding spaces
        [u" A。B", [[3, 5], [5, 6]]],  # two sentences, leading space
        [u"A。B ", [[0, 2], [2, 3]]],  # two sentences, trailing space
        [u"A。 B", [[0, 2], [5, 6]]],  # two sentences, space in between
        [u" A。 B ", [[4, 6], [9, 10]]],  # two sentences, spaces everywhere
    ]
    tokenizer = ChineseTokenizor()
    for index, case in enumerate(cases):
        sentence, expected_spans = case
        with self.subTest(i=index):
            self.assertEqual(expected_spans, tokenizer.span_tokenize(sentence))