import torch
from typing import Tuple
from transformers import BertTokenizer
# JumanTokenizer is assumed to be the project's own Juman-based tokenizer wrapper (its import is not shown in the original snippets).


def mask_tokens(inputs, bert_tokenizer: BertTokenizer, jp_tokenizer: JumanTokenizer, args):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()

    # Padding positions (-1) must also be excluded from masking.
    # Keep a padding mask that remembers where the -1 positions are.
    padding_mask = labels.clone()
    # Map padding back to 0 ([UNK]) so the input ids stay recognizable to the model.
    inputs[padding_mask == -1] = 0

    # Probabilistically decide which positions are used for prediction (True = 1.0).
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        bert_tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    probability_matrix.masked_fill_(padding_mask == -1, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(bert_tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
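# --- Aside on the ignore index (added illustration, not from the original snippets) ---
# The variant above marks non-masked positions with -1, so the MLM loss must be told to
# skip -1; the variants below use -100, which is CrossEntropyLoss's default ignore_index.
# A minimal sketch with dummy shapes and values:
import torch

logits = torch.randn(2, 5, 32000)   # (batch, seq_len, vocab) - dummy values
labels = torch.full((2, 5), -1)     # every position ignored ...
labels[0, 2] = 17                   # ... except one masked position with its true token id
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))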
def mask_token(inputs: torch.Tensor, tokenizer: BertTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    if tokenizer.mask_token is None:
        raise ValueError(
            'This tokenizer does not have a mask token, which is necessary for masked language modeling. '
            'Remove the --mlm flag if you want to use this tokenizer.'
        )
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    # Filter out existing special tokens, which are never masked.
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    # Filter out existing pad tokens, which are never masked either.
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    # Draw the candidate positions: roughly 15% of the remaining tokens are picked for masking.
    masked_indices = torch.bernoulli(probability_matrix).bool()
    # We only compute loss on masked positions; all other label ids are set to -100 (the ignore index).
    labels[~masked_indices] = -100

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10%) we keep the masked input tokens unchanged
    return inputs, labels
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: every selected token is replaced with [MASK] (no 80/10/10 split). """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Unlike the 80/10/10 variant, every selected token is replaced with [MASK].
    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    return inputs, labels
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # TODO replace all tokens-to-be-changed with [MASK] (prob 80% -> 100%)
    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
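# --- Usage sketch (added illustration, not part of the original snippets) ---
# A minimal, assumed example of how the 80/10/10 mask_tokens above could be driven:
# the checkpoint name, the SimpleNamespace stand-in for argparse args, and the sample
# sentences are illustrative only. It also assumes a transformers version where the
# tokenizer is callable and still exposes the private `_pad_token` attribute used above.
import torch
from types import SimpleNamespace
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint choice
args = SimpleNamespace(mlm_probability=0.15)                    # stand-in for parsed CLI args

batch = tokenizer(["i like tea", "i like embeddings very much"],
                  padding=True, return_tensors="pt")
inputs, labels = mask_tokens(batch["input_ids"].clone(), tokenizer, args)

# Only positions with labels != -100 contribute to the MLM loss (~15% of non-special tokens).
print(inputs)
print((labels != -100).sum().item(), "positions selected for prediction")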
# Scratch exploration of special-token attributes and encode/decode round trips.
tokenizer.eos_token_id
tokenizer.all_special_ids
tokenizer.special_tokens_map
tokenizer.additional_special_tokens

y = "<BOS> I like embeddings <EOS> [SEP] i like tea"
z = tokenizer.encode(y)
tokenizer.convert_ids_to_tokens(z)
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(z))

tokenizer.encode("embeddings embedding")
tokenizer.encode("i like tea")
tokenizer.decode(tokenizer.encode("embeddings embedding"))

tokenizer.get_special_tokens_mask([100, 101, 102], [1, 2, 3])
tokenizer.get_special_tokens_mask([100, 101, 102, 1, 2, 3])
tokenizer("s")

# Fast tokenizer with custom BOS/EOS tokens registered at load time.
from transformers import BertTokenizerFast

t1 = BertTokenizerFast.from_pretrained("bert-base-uncased", bos_token="<BOS>", eos_token="<EOS>")
t1.tokenize("<BOS> I like embeddings <EOS> [SEP] i like tea")
t1.special_tokens_map
y = t1.encode("<BOS> I like embeddings <EOS> [SEP] i like tea")
t1.create_token_type_ids_from_sequences(y)
t1("abd")
t1.convert_ids_to_tokens(y)