def test_save(tokenizer, tmp_path):
    """Round-trip a fitted tokenizer through save/load and check it still tokenizes.

    Args:
        tokenizer: fitted TransformersTokenizer fixture (defined elsewhere in this file).
        tmp_path: pytest-provided temporary directory for the serialized tokenizer.
    """
    tokenizer.save(tmp_path)

    # A freshly constructed tokenizer must restore full state from disk alone.
    loaded_tokenizer = TransformersTokenizer()
    loaded_tokenizer.load(tmp_path)

    tokens = loaded_tokenizer.tokenize("This is a test")
    assert len(tokens) == 4
def test_lowercase():
    """With lowercase=False, tokenization must preserve the original casing."""
    tokenizer = TransformersTokenizer(lowercase=False)
    # `texts` is the module-level training corpus fixture defined elsewhere in this file.
    tokenizer.fit(texts)

    tokens = tokenizer.tokenize("This is a test")
    # Leading capital survives because lowercasing is disabled.
    assert tokens[0] == "This"
def test_bpe_model():
    """The BPE model variant should tokenize a simple sentence into its 4 words."""
    tokenizer = TransformersTokenizer(model="bpe")
    # `texts` is the module-level training corpus fixture defined elsewhere in this file.
    tokenizer.fit(texts)

    tokens = tokenizer.tokenize("This is a test")
    assert len(tokens) == 4