Example #1
import torch
from transformers import BertTokenizer
# JumanTokenizer is a project-specific class (presumably a Juman-based Japanese tokenizer); import not shown


def mask_tokens(inputs, bert_tokenizer: BertTokenizer,
                jp_tokenizer: JumanTokenizer, args):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # Padding positions (marked with -1) must also be excluded from prediction
    # Keep a copy of the labels that still carries the -1 padding markers
    padding_mask = labels.clone()
    # Reset padding in the inputs to 0 ([UNK]), an id the model can actually embed
    inputs[padding_mask == -1] = 0
    # Decide probabilistically which positions are used for prediction (True = 1.0)
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        bert_tokenizer.get_special_tokens_mask(val,
                                               already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                 dtype=torch.bool),
                                    value=0.0)
    masked_indices = torch.bernoulli(probability_matrix)
    labels[(masked_indices !=
            1.0)] = -1  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8))
    inputs[(indices_replaced == 1.0)
           & (labels != -1)] = bert_tokenizer.convert_tokens_to_ids(
               bert_tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5))
    random_words = torch.randint(len(bert_tokenizer),
                                 labels.shape,
                                 dtype=torch.long)
    inputs[(indices_random == 1.0) & (indices_replaced != 1.0) &
           (labels != -1)] = random_words[(indices_random == 1.0)
                                          & (indices_replaced != 1.0) &
                                          (labels != -1)]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
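
A minimal usage sketch for this variant (hedged: the checkpoint name, the SimpleNamespace standing in for the argparse args, and the toy batch are all assumptions; jp_tokenizer is never used inside the function body, so None is passed):

from types import SimpleNamespace
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
args = SimpleNamespace(mlm_probability=0.15)

ids = tokenizer("i like tea", return_tensors="pt")["input_ids"]  # shape (1, seq_len)
batch = torch.full((1, ids.size(1) + 2), -1, dtype=torch.long)   # -1 marks padding, as the function expects
batch[:, :ids.size(1)] = ids
inputs, labels = mask_tokens(batch, tokenizer, None, args)       # jp_tokenizer unused, hence None
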
Example #2
def mask_token(inputs: torch.Tensor, tokenizer: BertTokenizer,
               args) -> Tuple[torch.Tensor, torch.Tensor]:
    if tokenizer.mask_token is None:
        raise ValueError(
            'This tokenizer does not have a mask token which is necessary for masked language model. Remove the --mlm flag if you want to use this tokenizer.'
        )
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    # Filter out existing special tokens; they are never masked
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                 dtype=torch.bool),
                                    value=0.0)
    # Filter out existing pad tokens as well; they are never masked either
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    # Sample the positions to mask: a 1.0 means the token is selected, so roughly args.mlm_probability (15%) of the remaining ordinary tokens are picked
    masked_indices = torch.bernoulli(probability_matrix).bool()
    # Loss is only computed at the masked positions; every other label id is set to -100 (the ignore index)
    labels[~masked_indices] = -100

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape,
                                                  0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = torch.bernoulli(torch.full(
        labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer),
                                 labels.shape,
                                 dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10%) we keep the masked input tokens unchanged
    return inputs, labels
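
The -100 fill value is not arbitrary: it is the default ignore_index of torch.nn.CrossEntropyLoss, which masked-LM heads use, so positions labelled -100 simply drop out of the loss. A small sketch with made-up numbers:

import torch
import torch.nn as nn

logits = torch.randn(4, 6)                  # toy logits: 4 positions, vocabulary of 6
labels = torch.tensor([-100, 3, -100, 5])   # only positions 1 and 3 were masked

loss_fn = nn.CrossEntropyLoss()             # ignore_index defaults to -100
loss = loss_fn(logits, labels)              # positions labelled -100 contribute nothing
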
Example #3
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer,
                args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling. Unlike the 80/10/10 scheme above, this variant replaces every selected token with [MASK]. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                 dtype=torch.bool),
                                    value=0.0)
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)
    return inputs, labels
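
Because there is no 80/10/10 split here, the [MASK] positions in inputs coincide exactly with the positions where labels != -100, which is not true of the other variants. A quick hedged check (checkpoint name and batch are assumptions, and the original batch is assumed to contain no [MASK] tokens):

from types import SimpleNamespace
import torch
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
args = SimpleNamespace(mlm_probability=0.15)
batch = tokenizer(["i like tea", "i like embeddings a lot"],
                  padding=True, return_tensors="pt")["input_ids"]

inputs, labels = mask_tokens(batch.clone(), tokenizer, args)
mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
assert torch.equal(inputs.eq(mask_id), labels.ne(-100))
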
Example #4
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer,
                args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask,
                                                 dtype=torch.bool),
                                    value=0.0)
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # TODO replace all tokens-to-be-changed with [MASK] (prob 80% -> 100%)
    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape,
                                                  0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(
        tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with random word
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer),
                                 labels.shape,
                                 dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
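
A hedged end-to-end sketch of how the returned pair is typically consumed (checkpoint name and batch are assumptions; assumes a recent transformers version where model outputs expose .loss). BertForMaskedLM uses the same -100 ignore index, so the labels plug in directly:

from types import SimpleNamespace
import torch
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")   # assumed checkpoint
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
args = SimpleNamespace(mlm_probability=0.15)

batch = tokenizer(["i like tea", "i like embeddings a lot"],
                  padding=True, return_tensors="pt")["input_ids"]
inputs, labels = mask_tokens(batch, tokenizer, args)

outputs = model(input_ids=inputs, labels=labels)  # attention_mask omitted for brevity
outputs.loss.backward()                           # loss is averaged over positions with label != -100
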
Example #5
# `tokenizer` is assumed to have been created earlier, e.g.
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.eos_token_id          # id of the EOS token (None if the tokenizer defines no EOS)
tokenizer.all_special_ids       # ids of every registered special token

tokenizer.special_tokens_map    # e.g. {'unk_token': '[UNK]', 'sep_token': '[SEP]', ...}
tokenizer.additional_special_tokens
y = "<BOS> I like embeddings <EOS> [SEP] i like tea"
z = tokenizer.encode(y)                       # string -> token ids (special tokens added)
tokenizer.convert_ids_to_tokens(z)            # ids -> wordpiece tokens
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(z))  # tokens -> plain text

tokenizer.encode("embeddings embedding")
tokenizer.encode("i like tea")
tokenizer.encode("i like tea")
tokenizer.decode(tokenizer.encode("embeddings embedding"))

# With the default already_has_special_tokens=False, the mask marks where special
# tokens would be inserted when building a sequence pair / a single sequence
tokenizer.get_special_tokens_mask([100, 101, 102], [1, 2, 3])
tokenizer.get_special_tokens_mask([100, 101, 102, 1, 2, 3])

tokenizer("s")
from transformers import BertTokenizerFast
t1 = BertTokenizerFast.from_pretrained("bert-base-uncased",
                                       bos_token="<BOS>",
                                       eos_token="<EOS>")

t1.tokenize("<BOS> I like embeddings <EOS> [SEP] i like tea")
t1.special_tokens_map
y = t1.encode("<BOS> I like embeddings <EOS> [SEP] i like tea")
t1.create_token_type_ids_from_sequences(y)

t1("abd")
t1.convert_ids_to_tokens(y)
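
The already_has_special_tokens=True form used throughout the mask_tokens examples above works on sequences that were already encoded; a short hedged sketch (checkpoint name is an assumption):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # assumed checkpoint
ids = tokenizer.encode("i like tea")                 # encode wraps the text in [CLS] ... [SEP]
print(tokenizer.convert_ids_to_tokens(ids))
# The mask flags the special tokens already present in the sequence, which is exactly
# what the mask_tokens functions use to keep [CLS]/[SEP] out of the masking pool.
print(tokenizer.get_special_tokens_mask(ids, already_has_special_tokens=True))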