# Module-level imports (the __init__ below lives inside a wrapper class).
from tokenizers import Tokenizer, models, pre_tokenizers, decoders


def __init__(self, vocab_path, bpe_merges_path):
    # Build a BPE model from the vocabulary and merges files
    tokenizer = Tokenizer(models.BPE.from_files(vocab_path, bpe_merges_path))

    # Use the byte level
    add_prefix_space = False  # Whether to automatically prefix the sequences with a space if none found
    tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space))
    tokenizer.with_decoder(decoders.ByteLevel.new())

    # Set up truncation if needed
    truncate = False
    max_length = 1024
    if truncate:
        stride = 0
        strategy = "longest_first"  # Can also be `only_first` or `only_second`
        tokenizer.with_truncation(max_length, stride, strategy)

    # Set up padding if needed
    padding = False
    # Whether to always pad to max_length. If this is False, we will pad to
    # the longest sequence in the batch instead.
    pad_to_max_length = False
    padding_side = "right"  # Can also be "left"
    pad_token_id = 0
    pad_token_type_id = 0
    pad_token = "[PAD]"
    if padding:
        tokenizer.with_padding(
            max_length if pad_to_max_length else None,
            padding_side,
            pad_token_id,
            pad_token_type_id,
            pad_token,
        )

    self.tokenizer = tokenizer
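# Usage sketch for the wrapper above. The class name `BPETokenizerWrapper` and
# the file paths are hypothetical (the original only shows __init__), and
# `encode` returning an Encoding with `.ids` and `.tokens` is assumed from the
# early `tokenizers` bindings that the `with_*` calls above belong to.
wrapper = BPETokenizerWrapper("./models/117M/encoder.json", "./models/117M/vocab.bpe")
encoding = wrapper.tokenizer.encode("Hello, world!")
print(encoding.ids)     # byte-level BPE token ids
print(encoding.tokens)  # the corresponding token strings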
# `args.model_name` comes from an argparse parser defined elsewhere. GPT-2
# checkpoints ship their vocabulary as encoder.json next to vocab.bpe (the
# merges file), so the vocab path below is the assumed counterpart.
vocab = "./models/%s/encoder.json" % args.model_name
merges = "./models/%s/vocab.bpe" % args.model_name
tokenizer = Tokenizer(models.BPE.from_files(vocab, merges))

# Use the byte level
add_prefix_space = False  # Whether to automatically prefix the sequences with a space if none found
tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_space))
tokenizer.with_decoder(decoders.ByteLevel.new())

# Set up truncation if needed
truncate = False
max_length = 1024
if truncate:
    stride = 0
    strategy = "longest_first"  # Can also be `only_first` or `only_second`
    tokenizer.with_truncation(max_length, stride, strategy)

# Set up padding if needed
padding = False
# Whether to always pad to max_length. If this is False, we will pad to the
# longest sequence in the batch instead.
pad_to_max_length = False
padding_side = "right"  # Can also be "left"
pad_token_id = 0
pad_token_type_id = 0
pad_token = "[PAD]"
if padding:
    tokenizer.with_padding(
        max_length if pad_to_max_length else None,
        padding_side,
        pad_token_id,
        pad_token_type_id,
        pad_token,
    )
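# Round-trip sketch, assuming the early `tokenizers` bindings configured
# above: `encode` is assumed to return an Encoding carrying ids and tokens,
# and `decode` to map ids back to text through the byte-level decoder that
# `with_decoder` installed.
encoding = tokenizer.encode("The quick brown fox")
print(encoding.ids)
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))  # should reproduce the input text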