def test_tokenizer_encoded_and_decodes_simple_text(self):
    """Fit on 'abcdef', then encode/decode a permutation of it round-trip.

    Checks that transform emits one integer id per character and that
    inverse_transform restores the original string exactly.
    """
    X = 'abcdef'
    tokenizer = TokenEncoder().fit(X)
    encoded = tokenizer.transform(['fedcba'])[0]
    # One id per input character.
    assert len(encoded) == 6
    # `type(x) is int` is the idiomatic exact-type check (type objects are
    # singletons, so `is` is correct and intent-revealing); a generator
    # avoids materializing a throwaway list inside all().
    assert all(type(idx) is int for idx in encoded)
    decoded = tokenizer.inverse_transform([encoded])[0]
    assert decoded == 'fedcba'
def test_tokenizer_encoded_and_decodes_simple_text(self):
    """Fit on 'abcdef', then encode/decode a permutation of it round-trip.

    Checks that transform emits one integer id per character and that
    inverse_transform restores the original string exactly.
    """
    X = 'abcdef'
    tokenizer = TokenEncoder().fit(X)
    encoded = tokenizer.transform(['fedcba'])[0]
    # One id per input character.
    assert len(encoded) == 6
    # `type(x) is int` is the idiomatic exact-type check (type objects are
    # singletons, so `is` is correct and intent-revealing); a generator
    # avoids materializing a throwaway list inside all().
    assert all(type(idx) is int for idx in encoded)
    decoded = tokenizer.inverse_transform([encoded])[0]
    assert decoded == 'fedcba'
def test_tokenizer_with_special_special_token(self, line):
    """A '$' special token (regex metacharacter) must still round-trip `line`."""
    tokenizer = TokenEncoder(special_tokens=['$']).fit(['hi'])
    ids = tokenizer.transform([line])
    restored = tokenizer.inverse_transform(ids)[0]
    assert restored == line
def test_tokenizer_wo_args(self, line, expected):
    """Default-configured tokenizer maps `line` to the `expected` token list."""
    tokenizer = TokenEncoder().fit(line)
    ids = tokenizer.transform([line])[0]
    # Map ids back to tokens so the comparison is against readable tokens.
    tokens = [tokenizer.id2token_[i] for i in ids]
    assert tokens == expected
def test_detokenize_with_space(self, line):
    """Space-separated tokenization must be reversible via inverse_transform."""
    tokenizer = TokenEncoder(separator=" ").fit([line])
    ids = tokenizer.transform([line])
    roundtripped = tokenizer.inverse_transform(ids)[0]
    assert roundtripped == line
def test_tokenizer_split_on_space(self, line, expected):
    """Encoding `line` with a space separator yields the ids of `expected` tokens."""
    tokenizer = TokenEncoder(separator=" ").fit([line])
    got = tokenizer.transform([line])[0]
    # Translate the expected tokens into ids under this tokenizer's vocab.
    want = [tokenizer.token2id_[tok] for tok in expected]
    assert got == want
def test_tokenizer_with_special_tokens_more_matches(self, line, expected):
    """With 'abc' as a special token (fit on ALPHABET), `line` encodes to `expected`."""
    tokenizer = TokenEncoder(special_tokens=['abc']).fit(ALPHABET)
    got = tokenizer.transform([line])[0]
    # Translate the expected tokens into ids under this tokenizer's vocab.
    want = [tokenizer.token2id_[tok] for tok in expected]
    assert got == want
def test_tokenizer_with_special_special_token(self, line):
    """A '$' special token (regex metacharacter) must still round-trip `line`."""
    tokenizer = TokenEncoder(special_tokens=['$']).fit(['hi'])
    ids = tokenizer.transform([line])
    restored = tokenizer.inverse_transform(ids)[0]
    assert restored == line
def test_tokenizer_wo_args(self, line, expected):
    """Default-configured tokenizer maps `line` to the `expected` token list."""
    tokenizer = TokenEncoder().fit(line)
    ids = tokenizer.transform([line])[0]
    # Map ids back to tokens so the comparison is against readable tokens.
    tokens = [tokenizer.id2token_[i] for i in ids]
    assert tokens == expected
def test_detokenize_with_space(self, line):
    """Space-separated tokenization must be reversible via inverse_transform."""
    tokenizer = TokenEncoder(separator=" ").fit([line])
    ids = tokenizer.transform([line])
    roundtripped = tokenizer.inverse_transform(ids)[0]
    assert roundtripped == line
def test_tokenizer_split_on_space(self, line, expected):
    """Encoding `line` with a space separator yields the ids of `expected` tokens."""
    tokenizer = TokenEncoder(separator=" ").fit([line])
    got = tokenizer.transform([line])[0]
    # Translate the expected tokens into ids under this tokenizer's vocab.
    want = [tokenizer.token2id_[tok] for tok in expected]
    assert got == want
def test_tokenizer_with_special_tokens_more_matches(self, line, expected):
    """With 'abc' as a special token (fit on ALPHABET), `line` encodes to `expected`."""
    tokenizer = TokenEncoder(special_tokens=['abc']).fit(ALPHABET)
    got = tokenizer.transform([line])[0]
    # Translate the expected tokens into ids under this tokenizer's vocab.
    want = [tokenizer.token2id_[tok] for tok in expected]
    assert got == want