Example #1
    def test_tokenizer_encodes_and_decodes_simple_text(self):
        X = 'abcdef'
        tokenizer = TokenEncoder().fit(X)

        # Default (no-argument) encoding is character-level: six characters
        # in, six integer ids out.
        encoded = tokenizer.transform(['fedcba'])[0]
        assert len(encoded) == 6
        assert all(isinstance(idx, int) for idx in encoded)

        decoded = tokenizer.inverse_transform([encoded])[0]
        assert decoded == 'fedcba'
Example #2
    def test_tokenizer_with_special_special_token(self, line):
        # '$' is registered as a special token (and happens to be a regex
        # metacharacter); the encode/decode round trip must leave it intact.
        tokenizer = TokenEncoder(special_tokens=['$']).fit(['hi'])

        encoded = tokenizer.transform([line])
        decoded = tokenizer.inverse_transform(encoded)[0]
        assert decoded == line
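Example #2 and most of the examples that follow take line (and sometimes expected) as test arguments; in the original suite these values are supplied by a pytest parametrize decorator that the excerpts omit. A minimal sketch of that wiring, with hypothetical sample inputs and assuming TokenEncoder is importable from the module under test:

    import pytest

    # Hypothetical sample values; the real parametrized inputs are not
    # shown in these excerpts.
    @pytest.mark.parametrize('line', ['hi$', '$hi', 'h$i'])
    def test_round_trip_with_special_token(line):
        tokenizer = TokenEncoder(special_tokens=['$']).fit(['hi'])
        encoded = tokenizer.transform([line])
        decoded = tokenizer.inverse_transform(encoded)[0]
        assert decoded == line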
Example #3
    def test_tokenizer_wo_args(self, line, expected):
        tokenizer = TokenEncoder().fit(line)

        # Decode the ids back to tokens via id2token_ and compare against
        # the expected token sequence.
        result = tokenizer.transform([line])[0]
        result = [tokenizer.id2token_[idx] for idx in result]
        assert result == expected
Example #4
    def test_detokenize_with_space(self, line):
        # With separator=" " the encoder splits on spaces and
        # inverse_transform re-joins with the same separator, so the
        # round trip should reproduce the original line.
        tokenizer = TokenEncoder(separator=" ").fit([line])
        encoded = tokenizer.transform([line])
        decoded = tokenizer.inverse_transform(encoded)[0]

        assert decoded == line
Example #5
    def test_tokenizer_split_on_space(self, line, expected):
        tokenizer = TokenEncoder(separator=" ").fit([line])

        # Map the expected tokens to ids through the fitted vocabulary and
        # compare with the encoded sequence.
        encoded = tokenizer.transform([line])[0]
        expected = [tokenizer.token2id_[token] for token in expected]
        assert encoded == expected
Example #6
    def test_tokenizer_with_special_tokens_more_matches(self, line, expected):
        # ALPHABET is a module-level constant in the original suite (not
        # shown in this excerpt). Registering 'abc' as a special token means
        # it should be matched as one unit rather than three characters.
        tokenizer = TokenEncoder(special_tokens=['abc']).fit(ALPHABET)

        encoded = tokenizer.transform([line])[0]
        expected = [tokenizer.token2id_[token] for token in expected]
        assert encoded == expected
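Taken together, the examples exercise a scikit-learn-style interface: fit builds a vocabulary from an iterable of strings, transform maps each string to a list of integer ids, and inverse_transform maps id lists back to strings, with token2id_ and id2token_ exposed as fitted attributes. The sketch below is a minimal reconstruction inferred from these tests alone, not the actual implementation; in particular, character-level splitting as the default and first-match handling of special_tokens are assumptions.

    import re

    class TokenEncoder:
        """Minimal sketch inferred from the tests above (assumption, not
        the real implementation): no separator means character-level
        splitting, a separator means str.split on it, and entries in
        special_tokens are carved out first as atomic units."""

        def __init__(self, separator=None, special_tokens=None):
            self.separator = separator
            self.special_tokens = list(special_tokens or [])

        def _tokenize(self, line):
            # Carve out special tokens first so they stay atomic.
            if self.special_tokens:
                pattern = '|'.join(re.escape(t) for t in self.special_tokens)
                parts = re.split('(%s)' % pattern, line)
            else:
                parts = [line]
            tokens = []
            for part in parts:
                if part in self.special_tokens:
                    tokens.append(part)
                elif part:
                    if self.separator:
                        tokens.extend(part.split(self.separator))
                    else:
                        tokens.extend(part)  # character-level split
            return tokens

        def fit(self, X):
            self.token2id_ = {}
            for token in self.special_tokens:
                self.token2id_.setdefault(token, len(self.token2id_))
            for line in X:
                for token in self._tokenize(line):
                    self.token2id_.setdefault(token, len(self.token2id_))
            self.id2token_ = {i: t for t, i in self.token2id_.items()}
            return self

        def transform(self, X):
            return [[self.token2id_[t] for t in self._tokenize(line)]
                    for line in X]

        def inverse_transform(self, X):
            sep = self.separator or ''
            return [sep.join(self.id2token_[i] for i in ids) for ids in X]

This sketch satisfies the round-trip assertions above for inputs whose tokens all appear in the fitted vocabulary; out-of-vocabulary tokens would raise a KeyError, and the real implementation may handle them differently.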