Example #1
    def tokenize(text):
        hold_back = None
        skip = False

        for word in nltk.tokenize.word_tokenize(text):
            if hold_back is not None:
                if word == hold_back[0]:
                    yield Token(hold_back[0])
                    yield Token(hold_back[1])
                    yield Token(word)
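                    # The held-back pieces and this closing apostrophe were
                    # just emitted, so skip re-processing this word below.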
                    skip = True
                else:
                    yield Token(hold_back[0] + hold_back[1])

                hold_back = None

            if not skip:
                if word.startswith(Token.APOSTROPHE):
                    # Use hold_back to fix tokenization errors of the form:
                    # | input  | output  | expected |
                    # | ------ | ------- | -------- |
                    # | 'word' | 'word ' | ' word ' |
                    hold_back = (word[0], word[1:])
                else:
                    hold_back = None

                if hold_back is None:
                    yield Token(word)

            skip = False

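        # Flush a token that was still held back when the input ended.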
        if hold_back is not None:
            yield Token(hold_back[0] + hold_back[1])
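A quick way to exercise the generator above on its own. The Token stub here is an assumed stand-in (only the word and the APOSTROPHE constant are needed), and nltk plus its punkt tokenizer data must be installed:

    import nltk

    class Token:
        """Assumed minimal stand-in so tokenize() can run in isolation."""
        APOSTROPHE = "'"

        def __init__(self, word):
            self.word = word

        def __repr__(self):
            return f"Token({self.word!r})"

    print(list(tokenize("he said 'word' twice")))
    # Per the table in the comment above, nltk splits 'word' into "'word"
    # and "'", and the hold_back repair re-splits the leading apostrophe:
    # [Token('he'), Token('said'), Token("'"), Token('word'), Token("'"),
    #  Token('twice')]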
Example #2
def as_sentence(words):
    sentence_builder = SentenceBuilder()

    for i, word in enumerate(words):
        if isinstance(word, Token):
            token = word
        else:
            token = Token(word)

        # We can only complete a sentence when we're at the final token.
        #                               v
        sentence_builder.process(token, can_complete=i + 1 == len(words))

    return sentence_builder.build()
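as_sentence() only shows the calling side. A minimal SentenceBuilder sketch that would satisfy this contract (an assumption; the real class is not part of these examples):

    class SentenceBuilder:
        def __init__(self):
            self._tokens = []
            self._complete = False

        def process(self, token, can_complete=False):
            self._tokens.append(token)
            # Matches the contract in as_sentence(): only the final token
            # is allowed to complete the sentence.
            if can_complete:
                self._complete = True

        def build(self):
            if not self._complete:
                raise ValueError("sentence never completed")
            return " ".join(token.word for token in self._tokens)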
Example #3
    def test_canonicalization_strict(self):
        with self.assertRaisesRegex(ValueError, "invalid character"):
            Token("€")

        with self.assertRaisesRegex(ValueError, "invalid character"):
            Token("â€s")
Example #4
    def test_canonicalization_strict(self):
        token = Token("€")
        self.assertEqual(token.literal, "€")

        token = Token("â€s")
        self.assertEqual(token.literal, "a€s")
Example #5
    def test_canonicalization(self):
        token = Token("Über")
        self.assertEqual(token.literal, "uber")

        token = Token("łeet")
        self.assertEqual(token.literal, "leet")
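One plausible implementation of this canonicalization, sketched with the standard unicodedata module (an assumption, not the project's actual code). Note that "ł" has no NFKD decomposition, so it needs an explicit mapping; "€" passes through intact, which also matches the lenient test in Example #4:

    import unicodedata

    # Letters NFKD cannot decompose into a base letter plus combining marks.
    EXTRA_MAPPINGS = {"ł": "l", "Ł": "L"}

    def canonicalize(word):
        word = "".join(EXTRA_MAPPINGS.get(ch, ch) for ch in word)
        decomposed = unicodedata.normalize("NFKD", word)
        # Drop combining marks: "Über" decomposes to "U" + diaeresis + "ber".
        stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        return stripped.lower()

    assert canonicalize("Über") == "uber"
    assert canonicalize("łeet") == "leet"
    assert canonicalize("â€s") == "a€s"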
Example #6
    def test_quote(self):
        value = '"'
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.is_open(), False)
        self.assertEqual(token.is_close(), False)
        self.assertEqual(token.is_quote(), True)

        value = '"s'
        token = Token(value)
        self.assertEqual(token.word, value)

        value = 's"'
        token = Token(value)
        self.assertEqual(token.word, value)

        value = '"s"'
        token = Token(value)
        self.assertEqual(token.word, value)

        value = 'wo"rd'
        token = Token(value)
        self.assertEqual(token.word, value)
Example #7
    def test_apostrophe(self):
        value = "'"
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.is_open(), False)
        self.assertEqual(token.is_close(), False)
        self.assertEqual(token.is_quote(), False)
        self.assertEqual(token.is_apostrophe(), True)

        value = "'s"
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.is_apostrophe(), False)

        value = "s'"
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.is_apostrophe(), False)

        value = "'s'"
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.is_apostrophe(), False)

        value = "wo'rd"
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.is_apostrophe(), False)
Example #8
    def test_token(self):
        value = "MyWord"
        token = Token(value)
        self.assertEqual(token.word, value)
        self.assertEqual(token.literal, value.lower())
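Tying the tests together, a minimal Token sketch that would pass Examples #4 through #8 (every detail here is an assumption; the project's real class is not shown). It reuses the hypothetical canonicalize() helper from the sketch under Example #5:

    class Token:
        QUOTE = '"'
        APOSTROPHE = "'"

        def __init__(self, word):
            self.word = word
            self.literal = canonicalize(word)

        def is_quote(self):
            # True only when the token is exactly the double-quote character.
            return self.word == Token.QUOTE

        def is_apostrophe(self):
            # True only when the token is exactly the apostrophe character.
            return self.word == Token.APOSTROPHE

        def is_open(self):
            # Assumed semantics: opening bracket punctuation. A bare quote
            # reports False for both is_open() and is_close() in test_quote.
            return self.word in "([{"

        def is_close(self):
            return self.word in ")]}"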