import torch
from typing import Tuple
from transformers import BertTokenizer
# JumanTokenizer is assumed to be the project's own Juman-based tokenizer wrapper (its import is not shown in the original snippets).


def mask_tokens(inputs, bert_tokenizer: BertTokenizer, jp_tokenizer: JumanTokenizer, args):
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()

    # Padding positions (-1) must also be excluded from masking.
    # Keep a padding mask that remembers where the -1 positions are.
    padding_mask = labels.clone()
    # Map padding back to 0 ([UNK]) so the input ids stay recognizable to the model.
    inputs[padding_mask == -1] = 0

    # Probabilistically decide which positions are used for prediction (True = 1.0).
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        bert_tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    probability_matrix.masked_fill_(padding_mask == -1, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = bert_tokenizer.convert_tokens_to_ids(bert_tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(bert_tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
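# --- Aside on the ignore index (added illustration, not from the original snippets) ---
# The variant above marks non-masked positions with -1, so the MLM loss must be told to
# skip -1; the variants below use -100, which is CrossEntropyLoss's default ignore_index.
# A minimal sketch with dummy shapes and values:
import torch

logits = torch.randn(2, 5, 32000)   # (batch, seq_len, vocab) - dummy values
labels = torch.full((2, 5), -1)     # every position ignored ...
labels[0, 2] = 17                   # ... except one masked position with its true token id
loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))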
def mask_token(inputs: torch.Tensor, tokenizer: BertTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    if tokenizer.mask_token is None:
        raise ValueError(
            'This tokenizer does not have a mask token, which is necessary for masked language modeling. '
            'Remove the --mlm flag if you want to use this tokenizer.'
        )
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    # Filter out existing special tokens, which are never masked.
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    # Filter out existing pad tokens, which are never masked either.
    if tokenizer.pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    # Draw the candidate positions: roughly 15% of the remaining tokens are picked for masking.
    masked_indices = torch.bernoulli(probability_matrix).bool()
    # We only compute loss on masked positions; all other label ids are set to -100 (the ignore index).
    labels[~masked_indices] = -100

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10%) we keep the masked input tokens unchanged
    return inputs, labels
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: every selected token is replaced with [MASK] (no 80/10/10 split). """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Unlike the 80/10/10 variant, every selected token is replaced with [MASK].
    inputs[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    return inputs, labels
def mask_tokens(inputs: torch.Tensor, tokenizer: BertTokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training
    # (with probability args.mlm_probability, defaults to 0.15 in BERT/RoBERTa).
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # TODO replace all tokens-to-be-changed with [MASK] (prob 80% -> 100%)
    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
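# --- Usage sketch (added illustration, not part of the original snippets) ---
# A minimal, assumed example of how the 80/10/10 mask_tokens above could be driven:
# the checkpoint name, the SimpleNamespace stand-in for argparse args, and the sample
# sentences are illustrative only. It also assumes a transformers version where the
# tokenizer is callable and still exposes the private `_pad_token` attribute used above.
import torch
from types import SimpleNamespace
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # hypothetical checkpoint choice
args = SimpleNamespace(mlm_probability=0.15)                    # stand-in for parsed CLI args

batch = tokenizer(["i like tea", "i like embeddings very much"],
                  padding=True, return_tensors="pt")
inputs, labels = mask_tokens(batch["input_ids"].clone(), tokenizer, args)

# Only positions with labels != -100 contribute to the MLM loss (~15% of non-special tokens).
print(inputs)
print((labels != -100).sum().item(), "positions selected for prediction")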
# Scratch exploration of special-token attributes and encode/decode round trips.
tokenizer.eos_token_id
tokenizer.all_special_ids
tokenizer.special_tokens_map
tokenizer.additional_special_tokens

y = "<BOS> I like embeddings <EOS> [SEP] i like tea"
z = tokenizer.encode(y)
tokenizer.convert_ids_to_tokens(z)
tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(z))

tokenizer.encode("embeddings embedding")
tokenizer.encode("i like tea")
tokenizer.decode(tokenizer.encode("embeddings embedding"))

tokenizer.get_special_tokens_mask([100, 101, 102], [1, 2, 3])
tokenizer.get_special_tokens_mask([100, 101, 102, 1, 2, 3])
tokenizer("s")

# Fast tokenizer with custom BOS/EOS tokens registered at load time.
from transformers import BertTokenizerFast

t1 = BertTokenizerFast.from_pretrained("bert-base-uncased", bos_token="<BOS>", eos_token="<EOS>")
t1.tokenize("<BOS> I like embeddings <EOS> [SEP] i like tea")
t1.special_tokens_map
y = t1.encode("<BOS> I like embeddings <EOS> [SEP] i like tea")
t1.create_token_type_ids_from_sequences(y)
t1("abd")
t1.convert_ids_to_tokens(y)