Пример #1
0
 def __init__(self, vocab_path, bpe_merges_path):
   """Build a byte-level BPE tokenizer and keep it on ``self.tokenizer``.

   The BPE model is loaded from ``vocab_path`` and ``bpe_merges_path``.
   Truncation and padding are both guarded by hard-coded flags that are
   currently False, so only the byte-level pre-tokenizer and decoder are
   installed.
   """
   bpe_model = models.BPE.from_files(vocab_path, bpe_merges_path)
   tok = Tokenizer(bpe_model)

   # Byte-level pre-tokenization and decoding. The flag says whether a
   # space is automatically prefixed to sequences that do not have one.
   prefix_space = False
   tok.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(prefix_space))
   tok.with_decoder(decoders.ByteLevel.new())

   # Optional truncation (disabled).
   use_truncation = False
   length_limit = 1024
   if use_truncation:
     # Stride 0; the strategy can also be `only_first` or `only_second`.
     tok.with_truncation(length_limit, 0, 'longest_first')

   # Optional padding (disabled).
   use_padding = False
   pad_to_limit = False
   if use_padding:
     # A target of None means "pad to the longest sequence in the batch"
     # rather than always padding up to length_limit.
     target_length = length_limit if pad_to_limit else None
     # Arguments: target length, side ("right" or "left"), pad token id,
     # pad token type id, pad token.
     tok.with_padding(target_length, "right", 0, 0, "[PAD]")

   self.tokenizer = tok
Пример #2
0
    strategy = 'longest_first'  # Can also be `only_first` or `only_second`
    tokenizer.with_truncation(max_length, stride, strategy)

# Padding configuration. Padding is off unless `padding` is set to True.
padding = False
padding_side = "right"  # "left" is the other accepted value
pad_token = "[PAD]"
pad_token_id = 0
pad_token_type_id = 0
# True: always pad up to max_length. False: pad to the longest sequence in
# the batch.
pad_to_max_length = False

if padding:
    tokenizer.with_padding(
        max_length if pad_to_max_length else None,
        padding_side,
        pad_token_id,
        pad_token_type_id,
        pad_token,
    )

# http://stackoverflow.com/questions/1624883/alternative-way-to-split-a-list-into-groups-of-n
import itertools


def group(n, iterable, fillvalue=None):
    """Yield successive n-tuples from *iterable*, padding the last one.

    group(3, 'ABCDEFG', 'x') --> ABC DEF Gxx
    """
    # The same iterator object is repeated n times, so each tuple slot pulls
    # the next item from one shared stream.
    shared = iter(iterable)
    slots = (shared,) * n
    return itertools.zip_longest(*slots, fillvalue=fillvalue)


import tflex_utils
import tqdm
import time