Example #1

# Imports assumed by this example (the original listing omits them).
import numpy as np
from tqdm import tqdm
from tokenizers.processors import BertProcessing
from transformers import (PreTrainedTokenizer, PreTrainedTokenizerFast,
                          RobertaConfig)

def load_tokenizer(tknzr_file,
                   flag_tknzr_fast,
                   pad_token=None,
                   mask_token=None):
    """
    Interestingly, HuggingFace does not allow the base tokenizer to be called.
    This is a bizarre choice, but accordingly we have to look for something else
    , which is why I use the PreTrainedTokenizerFast to wrap the base tokenizer.
    Written in Rust, it's faster than the base tokenizer class, but also lets
    you call the tokenizer as tknzr('text to be tokenized').

    Input
        tknzr_file (str) : .json file of the tokenizer trained previously
        *_tokens (str)  : tokens that are to be used in the corresponding context
                            Some of them are not implemented yet...
    Output
        tknzr     : tokenizer as PreTrainedTokenizerFast class to be passed on
    """
    if flag_tknzr_fast:
        tknzr = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
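        # Note: the slow PreTrainedTokenizer base class does not load a trained
        # tokenizer from a .json file, so this branch may not behave as intended.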
        tknzr = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tknzr.pad_token = pad_token
    tknzr.mask_token = mask_token

    return tknzr
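
# A minimal usage sketch for load_tokenizer (not part of the original example);
# 'tokenizer.json' is a placeholder path to a previously trained tokenizer.
tknzr = load_tokenizer('tokenizer.json', flag_tknzr_fast=True,
                       pad_token='[PAD]', mask_token='[MASK]')
print(tknzr('text to be tokenized'))
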
def preprocess(texts, tokenizer_path, max_len=32):

    input_ids, input_masks = [], []

    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"

    for text in tqdm(texts):
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])

    return [np.array(input_ids), np.array(input_masks)]
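
# Example usage of preprocess (illustrative only; the texts and tokenizer path
# below are placeholders, not part of the original example).
sample_ids, sample_masks = preprocess(['first item name', 'second item name'],
                                      'tokenizer.json', max_len=32)
print(sample_ids.shape, sample_masks.shape)  # (2, 32) (2, 32)
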
tokenizer_path = dataset_path / 'tokenizer1'
tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path / "tokenizer.json"))

# Re-create as a RoBERTa-compatible tokenizer
tokenizer_path = dataset_path / 'tokenizer1'
print(tokenizer_path)

tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path / "tokenizer.json"))
tokenizer2._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer2._tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer2._tokenizer.token_to_id("<s>")),
)
tokenizer2._tokenizer.enable_truncation(max_length=128)  # 512
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"
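
# Quick sanity check (not part of the original snippet): the re-created
# tokenizer should now wrap encodings with <s>/</s> and truncate at 128 tokens.
print(tokenizer2('sample text')['input_ids'])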

# 3. Train a language model
config = RobertaConfig(
    vocab_size=tokenizer2._tokenizer.get_vocab_size(),
    hidden_size=240,
    intermediate_size=2048,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
    bos_token_id=tokenizer2._tokenizer.token_to_id("<s>"),
    eos_token_id=tokenizer2._tokenizer.token_to_id("</s>"),
    pad_token_id=tokenizer2._tokenizer.token_to_id("<pad>"),
    # attention_probs_dropout_prob=0.0,
    # hidden_dropout_prob=0.0,
)
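
# Not part of the original snippet (which is truncated here): the config above
# would typically be used to instantiate a RoBERTa masked-LM model for training.
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)
print(f"Number of parameters: {model.num_parameters():,}")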
Example #4

# Imports assumed by this example (the original listing omits them).
import argparse
import os

from omegaconf import OmegaConf
from transformers import (DataCollatorForLanguageModeling, DistilBertConfig,
                          DistilBertForMaskedLM, LineByLineTextDataset,
                          PreTrainedTokenizerFast)

DATA_PATH = 'data/item_name.txt'

parser = argparse.ArgumentParser(description='Training language model')
parser.add_argument('--config_path', type=str, default='src/configs/train_lm1.yaml',
                    help='path to config file')
args = parser.parse_args()

config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))

os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = "[PAD]"
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
tokenizer.unk_token = "[UNK]"

distilbert_config = DistilBertConfig(vocab_size=config.vocab_size,
                                     n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=DATA_PATH,
    block_size=64)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
)
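
# Not part of the original example (the snippet is truncated here): a minimal
# sketch of how the model, dataset, and collator are typically wired into a
# Trainer run; the output path and hyperparameters below are illustrative.
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='output/lm1',          # placeholder output directory
    num_train_epochs=1,
    per_device_train_batch_size=32,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)
trainer.train()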
Example #5
    def __init__(
        self,
        args: GenerationTrainArguments,
        tokenizer: PreTrainedTokenizerFast,
        corpus,
        mode: Optional[str] = "train",
        convert_examples_to_features_fn=_convert_examples_to_generation_features,
    ):
        if corpus is not None:
            self.corpus = corpus
        else:
            raise KeyError("corpus is not valid")
        if mode not in ["train", "val", "test"]:
            raise KeyError(f"mode({mode}) is not a valid split name")
        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            args.downstream_corpus_root_dir,
            args.downstream_corpus_name,
            "cached_{}_{}_{}_{}_{}".format(
                mode,
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
                args.downstream_corpus_name,
                args.downstream_task_name,
            ),
        )

        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        lock_path = cached_features_file + ".lock"
        with FileLock(lock_path):

            if os.path.exists(
                    cached_features_file) and not args.overwrite_cache:
                start = time.time()
                self.features = torch.load(cached_features_file)
                logger.info(
                    "Loading features from cached file %s [took %.3f s]",
                    cached_features_file, time.time() - start)
            else:
                corpus_path = os.path.join(
                    args.downstream_corpus_root_dir,
                    args.downstream_corpus_name,
                )
                logger.info(
                    f"Creating features from dataset file at {corpus_path}")
                examples = self.corpus.get_examples(corpus_path, mode)
                tokenizer.pad_token = tokenizer.eos_token
                self.features = convert_examples_to_features_fn(
                    examples,
                    tokenizer,
                    args,
                )
                start = time.time()
                logger.info(
                    "Saving features into cached file, it could take a lot of time..."
                )
                torch.save(self.features, cached_features_file)
                logger.info(
                    "Saving features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start)
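
    # Not part of the original snippet: dataset classes like this one typically
    # expose the cached features so a PyTorch DataLoader can consume them.
    # A minimal sketch of the usual accessors:
    def __len__(self):
        return len(self.features)

    def __getitem__(self, i):
        return self.features[i]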