def test_basic_encode(self, roberta_files):
    tokenizer = ByteLevelBPETokenizer.from_file(
        roberta_files["vocab"], roberta_files["merges"]
    )

    output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
    assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
    assert output.tokens == [
        "The",
        "Ġquick",
        "Ġbrown",
        "Ġfox",
        "Ġjumps",
        "Ġover",
        "Ġthe",
        "Ġlazy",
        "Ġdog",
    ]
    assert output.offsets == [
        (0, 3),
        (3, 9),
        (9, 15),
        (15, 19),
        (19, 25),
        (25, 30),
        (30, 34),
        (34, 39),
        (39, 43),
    ]
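# Companion check (illustrative, not from the original tests): the offsets
# asserted above are character spans into the input string, and the Ġ prefix
# on a token is the byte-level BPE marker for the leading space it consumed.
text = "The quick brown fox jumps over the lazy dog"
assert text[0:3] == "The"      # token "The", offset (0, 3)
assert text[3:9] == " quick"   # token "Ġquick", offset (3, 9) includes the space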
@classmethod
def load(cls, dir_path, name='bbpe', **kwargs):
    """Load a byte-level BPE tokenizer from vocab/merges files in dir_path."""
    tokenizer = cls()
    vocab_file = os.path.join(dir_path, f'{name}-vocab.json')
    merges_file = os.path.join(dir_path, f'{name}-merges.txt')
    tokenizer.hf_tokenizers = ByteLevelBPETokenizer.from_file(
        vocab_filename=vocab_file, merges_filename=merges_file, **kwargs)
    return tokenizer
@classmethod
def load(cls, dir_path, name='bbpe', lowercase=True):
    """Variant of load() with an explicit lowercase flag."""
    tokenizer = cls()
    # os.path.join handles dir_path with or without a trailing separator;
    # plain string concatenation breaks when the separator is missing.
    vocab_file = os.path.join(dir_path, f'{name}-vocab.json')
    merges_file = os.path.join(dir_path, f'{name}-merges.txt')
    tokenizer.hf_tokenizers = ByteLevelBPETokenizer.from_file(
        vocab_filename=vocab_file, merges_filename=merges_file,
        lowercase=lowercase)
    return tokenizer
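# Usage sketch (illustrative; 'corpus.txt' and the directory layout are
# assumptions, not from the original source): the two load() variants above
# expect '<name>-vocab.json' and '<name>-merges.txt' inside dir_path. With
# the HuggingFace `tokenizers` package, save_model() produces exactly that
# pair of files.
def _example_save_then_load(dir_path="checkpoints", name="bbpe"):
    import os
    from tokenizers import ByteLevelBPETokenizer

    bbpe = ByteLevelBPETokenizer()
    bbpe.train(files=["corpus.txt"], vocab_size=30000)
    os.makedirs(dir_path, exist_ok=True)
    bbpe.save_model(dir_path, name)  # writes bbpe-vocab.json and bbpe-merges.txt

    # Mirrors what load() does internally:
    return ByteLevelBPETokenizer.from_file(
        os.path.join(dir_path, f'{name}-vocab.json'),
        os.path.join(dir_path, f'{name}-merges.txt'),
    )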
def test_lowerspace(self, roberta_files):
    tokenizer = ByteLevelBPETokenizer.from_file(
        roberta_files["vocab"],
        roberta_files["merges"],
        add_prefix_space=True,
        lowercase=True,
    )

    output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
    assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
    assert output.tokens == [
        "Ġthe",
        "Ġquick",
        "Ġbrown",
        "Ġfox",
        "Ġjumps",
        "Ġover",
        "Ġthe",
        "Ġlazy",
        "Ġdog",
    ]
def test_multiprocessing_with_parallelism(self, roberta_files):
    tokenizer = ByteLevelBPETokenizer.from_file(
        roberta_files["vocab"], roberta_files["merges"]
    )
    multiprocessing_with_parallelism(tokenizer, False)
    multiprocessing_with_parallelism(tokenizer, True)
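# Sketch of what the multiprocessing_with_parallelism() helper used above
# checks. The real helper lives in the test utilities of the `tokenizers`
# repo; this reconstruction is an assumption, not the actual implementation.
# The idea: encode once in the parent so the Rust-side thread pool is
# initialized, then fork a child and encode again. With TOKENIZERS_PARALLELISM
# enabled, the forked child can deadlock; with it disabled, the child should
# finish promptly. Assumes the 'fork' start method (the Linux default), since
# the nested target function and the tokenizer itself are not picklable.
import os
from multiprocessing import Process

def multiprocessing_with_parallelism_sketch(tokenizer, enabled: bool):
    os.environ["TOKENIZERS_PARALLELISM"] = "true" if enabled else "false"
    tokenizer.encode("warm up the parent process")  # triggers pool init

    def encode_in_child():
        tokenizer.encode("encode after fork")

    p = Process(target=encode_in_child)
    p.start()
    p.join(timeout=1)  # a hung child indicates the fork/parallelism deadlock
    if p.is_alive():
        p.terminate()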