def __init__(self, vocab_path, bpe_merges_path):
    tokenizer = Tokenizer(models.BPE.from_files(vocab_path, bpe_merges_path))

    # Use the byte level
    add_prefix_spaces = False  # Whether to automatically prefix the sequences with a space if none found
    tokenizer.with_pre_tokenizer(pre_tokenizers.ByteLevel.new(add_prefix_spaces))
    tokenizer.with_decoder(decoders.ByteLevel.new())

    # Setup truncation if needed
    truncate = False
    max_length = 1024
    if truncate:
        stride = 0
        strategy = 'longest_first'  # Can also be `only_first` or `only_second`
        tokenizer.with_truncation(max_length, stride, strategy)

    # Setup padding if needed
    padding = False
    # Whether to always pad to max_length. If this is false, we will pad to the
    # longest sequence in the batch.
    pad_to_max_length = False
    padding_side = "right"  # Can also be "left"
    pad_token_id = 0
    pad_token_type_id = 0
    pad_token = "[PAD]"
    if padding:
        tokenizer.with_padding(
            max_length if pad_to_max_length else None,
            padding_side,
            pad_token_id,
            pad_token_type_id,
            pad_token
        )

    self.tokenizer = tokenizer
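# Minimal usage sketch for the initializer above. It assumes the legacy
# `tokenizers` API used in this file (`from tokenizers import Tokenizer,
# models, pre_tokenizers, decoders`) and a hypothetical enclosing class name
# `BPETokenizer`; the real class name is not shown in this section.
#
#     bpe = BPETokenizer("vocab.json", "merges.txt")
#     encoded = bpe.tokenizer.encode("Hello, world!")
#     encoded.ids      # token ids produced by the byte-level BPE
#     encoded.tokens   # the corresponding token strings
#     bpe.tokenizer.decode(encoded.ids)  # round-trip back to text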
# http://stackoverflow.com/questions/1624883/alternative-way-to-split-a-list-into-groups-of-n
import itertools

def group(n, iterable, fillvalue=None):
    "group(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return itertools.zip_longest(*args, fillvalue=fillvalue)

import tflex_utils
import tqdm
import time
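# Usage sketch for group() above (illustrative only, not part of the original
# code): it yields fixed-size tuples, padding the final one with `fillvalue`.
#
#     list(group(3, [1, 2, 3, 4, 5]))
#     # -> [(1, 2, 3), (4, 5, None)]
#
#     # Drop the padding from the last chunk if it is unwanted:
#     [[x for x in chunk if x is not None] for chunk in group(3, [1, 2, 3, 4, 5])]
#     # -> [[1, 2, 3], [4, 5]]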