Python PreTrainedTokenizer._convert_token_to_id示例

编程语言: Python

命名空间/包名称: transformers

方法/功能: _convert_token_to_id

hotexamples.com的示例: 2

Python PreTrainedTokenizer._convert_token_to_id - 已找到2个示例。这些示例摘自开源项目，是评分最高的transformers.PreTrainedTokenizer._convert_token_to_id真实Python用例。您可以为示例评分，帮助我们提高示例质量。

常用方法

显示隐藏

PreTrainedTokenizer(6)

_convert_token_to_id(2)

__call__(1)

__init__(1)

_convert_id_to_token(1)

示例#1

显示文件

文件： create.py 项目： martijnvanbeers/diagnnose

def create_tokenizer(path: str,
                     notify_unk: bool = False) -> PreTrainedTokenizer:
    """Build a tokenizer from a vocabulary file or from a model name.

    When ``path`` (after ``~`` expansion) exists on disk it is treated
    as a vocabulary file, loaded through ``token_to_index`` and wrapped
    in a word-level tokenizer as used by older LSTM models. Otherwise
    ``path`` is passed to ``AutoTokenizer.from_pretrained`` as the name
    of a Transformer model.

    Parameters
    ----------
    path : str
        Either the location of a vocabulary file, or the model name of
        a Huggingface Transformer.
    notify_unk : bool, optional
        Forwarded to ``W2I`` for file-based vocabularies; toggles
        notifying the user about tokens that are missing from the
        vocabulary. Defaults to False.

    Returns
    -------
    tokenizer : PreTrainedTokenizer
        The instantiated tokenizer that maps tokens to their indices.
    """
    vocab_file_exists = os.path.exists(os.path.expanduser(path))

    if not vocab_file_exists:
        # Subword-based vocabulary, used by Transformer models
        hf_tokenizer = AutoTokenizer.from_pretrained(path)

        if hasattr(hf_tokenizer, "encoder"):
            # GPT-2 & Roberta keep their vocab dictionary under `encoder`
            # instead of `vocab`; mirror it under the common names.
            subword_to_id: Dict[str, int] = hf_tokenizer.encoder
            hf_tokenizer.vocab = W2I(
                subword_to_id, unk_token=hf_tokenizer.unk_token
            )
            hf_tokenizer.ids_to_tokens = hf_tokenizer.decoder

        # Fall back to the unknown token when no padding token is set.
        if getattr(hf_tokenizer, "pad_token", None) is None:
            hf_tokenizer.pad_token = hf_tokenizer.unk_token

        return hf_tokenizer

    # Word-based vocabulary, used by older LSTM models
    word_vocab = W2I(token_to_index(path), notify_unk=notify_unk)

    def split_words_and_punctuation(s):
        # Separates punctuation from a token: "hi!" -> ["hi", "!"]
        return re.findall(r"[\w']+|[.,!?;]", s)

    def lookup_token_id(w):
        return word_vocab[w]

    word_tokenizer = PreTrainedTokenizer()

    # Expose the same mapping under both the "added tokens" attributes
    # and the plain vocab / ids_to_tokens attributes.
    word_tokenizer.added_tokens_encoder = word_vocab
    word_tokenizer.added_tokens_decoder = {
        index: word for word, index in word_vocab.items()
    }
    word_tokenizer.vocab = word_tokenizer.added_tokens_encoder
    word_tokenizer.ids_to_tokens = word_tokenizer.added_tokens_decoder

    # Special tokens are taken from the vocabulary object; the mask
    # token reuses the unknown token.
    word_tokenizer.unk_token = word_vocab.unk_token
    word_tokenizer.eos_token = word_vocab.eos_token
    word_tokenizer.pad_token = word_vocab.pad_token
    word_tokenizer.mask_token = word_vocab.unk_token

    word_tokenizer._tokenize = split_words_and_punctuation
    word_tokenizer._convert_token_to_id = lookup_token_id

    return word_tokenizer

示例#2

显示文件

文件： create.py 项目： i-machine-think/diagNNose

def create_tokenizer(path: str,
                     notify_unk: bool = False,
                     cache_dir: Optional[str] = None,
                     **kwargs) -> PreTrainedTokenizer:
    """Build a tokenizer from a vocabulary file or from a model name.

    When ``path`` (after ``~`` expansion) exists on disk it is treated
    as a vocabulary file, loaded through ``token_to_index`` and wrapped
    in a word-level tokenizer as used by older LSTM models. Otherwise
    ``path`` is passed to ``AutoTokenizer.from_pretrained`` as the name
    of a Transformer model.

    Parameters
    ----------
    path : str
        Either the location of a vocabulary file, or the model name of
        a Huggingface Transformer.
    notify_unk : bool, optional
        Forwarded to ``W2I`` for file-based vocabularies; toggles
        notifying the user about tokens that are missing from the
        vocabulary. Defaults to False.
    cache_dir : str, optional
        Cache directory handed to ``AutoTokenizer.from_pretrained``.
    **kwargs
        Extra keyword arguments forwarded to ``W2I`` when a file-based
        vocabulary is created; unused for Transformer tokenizers.

    Returns
    -------
    tokenizer : PreTrainedTokenizer
        The instantiated tokenizer that maps tokens to their indices.
    """
    vocab_file_exists = os.path.exists(os.path.expanduser(path))

    if not vocab_file_exists:
        # Subword-based vocabulary, used by Transformer models
        hf_tokenizer = AutoTokenizer.from_pretrained(
            path, cache_dir=cache_dir, use_fast=False
        )

        # Mirror whichever vocab representation the tokenizer carries
        # under the common `vocab` / `ids_to_tokens` attribute names.
        if hasattr(hf_tokenizer, "encoder"):
            # GPT-2 & Roberta keep their vocab dictionary under `encoder`.
            subword_to_id: Dict[str, int] = hf_tokenizer.encoder
            hf_tokenizer.vocab = W2I(
                subword_to_id, unk_token=hf_tokenizer.unk_token
            )
            hf_tokenizer.ids_to_tokens = hf_tokenizer.decoder
        elif hasattr(hf_tokenizer, "sp_model"):
            # XLNet uses a sentencepiece tokenizer without an explicit
            # vocab dict, so one is built by enumerating every piece id.
            sp_model = hf_tokenizer.sp_model
            piece_to_id = {
                sp_model.id_to_piece(piece_id): piece_id
                for piece_id in range(len(sp_model))
            }
            hf_tokenizer.vocab = piece_to_id
            hf_tokenizer.ids_to_tokens = {
                piece_id: piece for piece, piece_id in piece_to_id.items()
            }
        elif hasattr(hf_tokenizer, "sym2idx"):
            # Tokenizers exposing sym2idx / idx2sym mappings directly.
            hf_tokenizer.vocab = hf_tokenizer.sym2idx
            hf_tokenizer.ids_to_tokens = hf_tokenizer.idx2sym

        # Fall back to the unknown token when no padding token is set.
        if getattr(hf_tokenizer, "pad_token", None) is None:
            hf_tokenizer.pad_token = hf_tokenizer.unk_token

        return hf_tokenizer

    # Word-based vocabulary, used by older LSTM models
    word_vocab = W2I(token_to_index(path), notify_unk=notify_unk, **kwargs)

    def split_on_spaces(s):
        # Tokens are delimited by single space characters only.
        return s.split(" ")

    def lookup_token_id(w):
        return word_vocab[w]

    word_tokenizer = PreTrainedTokenizer()

    # Expose the same mapping under both the "added tokens" attributes
    # and the plain vocab / ids_to_tokens attributes.
    word_tokenizer.added_tokens_encoder = word_vocab
    word_tokenizer.added_tokens_decoder = {
        index: word for word, index in word_vocab.items()
    }
    word_tokenizer.vocab = word_tokenizer.added_tokens_encoder
    word_tokenizer.ids_to_tokens = word_tokenizer.added_tokens_decoder

    # Special tokens are taken from the vocabulary object.
    word_tokenizer.unk_token = word_vocab.unk_token
    word_tokenizer.eos_token = word_vocab.eos_token
    word_tokenizer.pad_token = word_vocab.pad_token

    word_tokenizer._tokenize = split_on_spaces
    word_tokenizer._convert_token_to_id = lookup_token_id

    return word_tokenizer