def __init__(self, vocab_path, bpe_merges_path):
    tokenizer = Tokenizer(models.BPE.from_files(vocab_path, bpe_merges_path))

    # Use the byte level
    add_prefix_spaces = False  # Whether to automatically prefix the sequences with a space if none found
    tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_spaces))
    tokenizer.with_decoder(decoders.ByteLevel.new())

    # Setup truncation if needed
    truncate = False
    max_length = 1024
    if truncate:
        stride = 0
        strategy = 'longest_first'  # Can also be `only_first` or `only_second`
        tokenizer.with_truncation(max_length, stride, strategy)

    # Setup padding if needed
    padding = False
    # Whether to always pad to max_length. If this is false, we will pad to the
    # longest sequence in the batch.
    pad_to_max_length = False
    padding_side = "right"  # Can also be "left"
    pad_token_id = 0
    pad_token_type_id = 0
    pad_token = "[PAD]"
    if padding:
        tokenizer.with_padding(
            max_length if pad_to_max_length else None,
            padding_side,
            pad_token_id,
            pad_token_type_id,
            pad_token
        )

    self.tokenizer = tokenizer
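# Minimal usage sketch for the initializer above. It assumes the legacy
# `tokenizers` API used in this file (`from tokenizers import Tokenizer,
# models, pre_tokenizers, decoders`) and a hypothetical enclosing class name
# `BPETokenizer`; the real class name is not shown in this section.
#
#     bpe = BPETokenizer("vocab.json", "merges.txt")
#     encoded = bpe.tokenizer.encode("Hello, world!")
#     encoded.ids      # token ids produced by the byte-level BPE
#     encoded.tokens   # the corresponding token strings
#     bpe.tokenizer.decode(encoded.ids)  # round-trip back to text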
# http://stackoverflow.com/questions/1624883/alternative-way-to-split-a-list-into-groups-of-n
import itertools

def group(n, iterable, fillvalue=None):
    "group(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)

import tflex_utils
import tqdm
import time
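# Usage sketch for group() above (illustrative only, not part of the original
# code): it yields fixed-size tuples, padding the final one with `fillvalue`.
#
#     list(group(3, [1, 2, 3, 4, 5]))
#     # -> [(1, 2, 3), (4, 5, None)]
#
#     # Drop the padding from the last chunk if it is unwanted:
#     [[x for x in chunk if x is not None] for chunk in group(3, [1, 2, 3, 4, 5])]
#     # -> [[1, 2, 3], [4, 5]]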