예제 #1
0
def test_save(tokenizer, tmp_path):
    """Saving then reloading a tokenizer must preserve tokenization behavior."""
    tokenizer.save(tmp_path)

    # Reload into a fresh instance and confirm the round-trip works.
    restored = TransformersTokenizer()
    restored.load(tmp_path)
    assert len(restored.tokenize("This is a test")) == 4
예제 #2
0
def test_lowercase():
    """With lowercase=False, the original casing survives tokenization."""
    tok = TransformersTokenizer(lowercase=False)
    tok.fit(texts)
    first_token = tok.tokenize("This is a test")[0]
    assert first_token == "This"
예제 #3
0
def test_bpe_model():
    """A BPE-backed tokenizer splits the sample sentence into four tokens."""
    tok = TransformersTokenizer(model="bpe")
    tok.fit(texts)
    assert len(tok.tokenize("This is a test")) == 4