import nltk


def tokenize(text):
    hold_back = None
    skip = False
    for word in nltk.tokenize.word_tokenize(text):
        if hold_back is not None:
            if word == hold_back[0]:
                # The held-back split was right: emit the opening
                # apostrophe, the word, and the closing apostrophe.
                yield Token(hold_back[0])
                yield Token(hold_back[1])
                yield Token(word)
                skip = True
            else:
                # No closing apostrophe followed: emit the original,
                # unsplit token.
                yield Token(hold_back[0] + hold_back[1])
            hold_back = None

        if not skip:
            if word.startswith(Token.APOSTROPHE):
                # Use hold_back to fix tokenization errors of the form:
                # | input  | output  | expected |
                # | ------ | ------- | -------- |
                # | 'word' | 'word ' | ' word ' |
                hold_back = (word[0], word[1:])
            else:
                hold_back = None

            if hold_back is None:
                yield Token(word)

        skip = False

    if hold_back is not None:
        yield Token(hold_back[0] + hold_back[1])
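A quick usage sketch of the generator (hypothetical driver code; it assumes NLTK's punkt data has been downloaded, e.g. via nltk.download("punkt")):

for token in tokenize("a 'quoted' word"):
    print(token.word)

NLTK's word_tokenize leaves the leading apostrophe attached, producing 'quoted followed by a bare ' — exactly the case in the comment table above. The hold_back pass repairs it, so the stream comes out as a, ', quoted, ', word.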
def as_sentence(words):
    sentence_builder = SentenceBuilder()
    for i, word in enumerate(words):
        if isinstance(word, Token):
            token = word
        else:
            token = Token(word)
        # We can only complete a sentence when we're at the final token.
        #                                                               v
        sentence_builder.process(token, can_complete=i + 1 == len(words))
    return sentence_builder.build()
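Note that as_sentence calls len(words), so it needs a sized sequence rather than a bare iterator; the generator returned by tokenize has to be materialized first. A usage sketch (what SentenceBuilder.build() returns is assumed from context):

sentence = as_sentence(list(tokenize("some input text")))

# Plain strings work too; they are wrapped in Token on the fly.
sentence = as_sentence(["hello", "world", "."])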
def test_canonicalization_strict(self):
    with self.assertRaisesRegex(ValueError, "invalid character"):
        Token("€")

    with self.assertRaisesRegex(ValueError, "invalid character"):
        Token("â€s")
def test_canonicalization_strict(self):
    # Counterpart to the rejecting test above: here invalid characters are
    # kept in the literal, while "â" still canonicalizes to "a".
    token = Token("€")
    self.assertEqual(token.literal, "€")

    token = Token("â€s")
    self.assertEqual(token.literal, "a€s")
def test_canonicalization(self):
    token = Token("Über")
    self.assertEqual(token.literal, "uber")

    token = Token("łeet")
    self.assertEqual(token.literal, "leet")
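One plausible implementation of the canonicalization these tests pin down (a sketch, not necessarily the project's code): NFKD decomposition handles accented characters such as "Ü", but "ł" has no Unicode decomposition, so characters like it need an explicit override table. The canonicalize helper and _OVERRIDES table below are hypothetical names.

import unicodedata

# Hypothetical override table for characters NFKD cannot decompose.
_OVERRIDES = {"ł": "l", "Ł": "l"}

def canonicalize(word):
    word = "".join(_OVERRIDES.get(c, c) for c in word.lower())
    # NFKD splits "ü" into "u" plus a combining diaeresis; drop the marks.
    decomposed = unicodedata.normalize("NFKD", word)
    return "".join(c for c in decomposed if not unicodedata.combining(c))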
def test_quote(self):
    value = '"'
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.is_open(), False)
    self.assertEqual(token.is_close(), False)
    self.assertEqual(token.is_quote(), True)

    value = '"s'
    token = Token(value)
    self.assertEqual(token.word, value)

    value = 's"'
    token = Token(value)
    self.assertEqual(token.word, value)

    value = '"s"'
    token = Token(value)
    self.assertEqual(token.word, value)

    value = 'wo"rd'
    token = Token(value)
    self.assertEqual(token.word, value)
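The is_open()/is_close() predicates presumably track NLTK's distinct open and close quote markers (an assumption here), which is why a bare '"' is a quote but neither open nor close. The Treebank-style word_tokenize rewrites double quotes into those markers:

import nltk

print(nltk.tokenize.word_tokenize('he said "hi"'))
# ['he', 'said', '``', 'hi', "''"]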
def test_apostrophe(self):
    value = "'"
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.is_open(), False)
    self.assertEqual(token.is_close(), False)
    self.assertEqual(token.is_quote(), False)
    self.assertEqual(token.is_apostrophe(), True)

    value = "'s"
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.is_apostrophe(), False)

    value = "s'"
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.is_apostrophe(), False)

    value = "'s'"
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.is_apostrophe(), False)

    value = "wo'rd"
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.is_apostrophe(), False)
def test_token(self):
    value = "MyWord"
    token = Token(value)
    self.assertEqual(token.word, value)
    self.assertEqual(token.literal, value.lower())
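Taken together, the tests pin down a small surface for Token. A minimal sketch consistent with them, assuming the canonicalize helper sketched earlier; the open/close checks against NLTK's `` and '' markers are an assumption, and the strict-mode validation exercised above is omitted:

class Token:
    APOSTROPHE = "'"
    QUOTE = '"'

    def __init__(self, word):
        self.word = word
        self.literal = canonicalize(word)

    def is_apostrophe(self):
        # True only for a bare apostrophe, never one embedded in a word.
        return self.word == Token.APOSTROPHE

    def is_quote(self):
        return self.word == Token.QUOTE

    def is_open(self):
        return self.word == "``"  # assumed: NLTK's opening-quote marker

    def is_close(self):
        return self.word == "''"  # assumed: NLTK's closing-quote marker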