def mask_tokens(inputs: torch.Tensor, tokenizer: Tokenizer, args) -> Tuple[torch.Tensor, torch.Tensor]:
    """ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. """
    if tokenizer.mask_token is None:
        raise ValueError(
            "This tokenizer does not have a mask token which is necessary for masked language modeling. Remove the --mlm flag if you want to use this tokenizer."
        )

    labels = inputs.clone()
    # We sample a few tokens in each sequence for masked-LM training (with probability args.mlm_probability, which defaults to 0.15 in BERT/RoBERTa)
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    if tokenizer._pad_token is not None:
        padding_mask = labels.eq(tokenizer.pad_token_id)
        probability_matrix.masked_fill_(padding_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, we replace masked input tokens with a random word
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]

    # The rest of the time (10% of the time) we keep the masked input tokens unchanged
    return inputs, labels
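
# --- Usage sketch (illustrative, not from the original script) ---
# A minimal example of how mask_tokens could be called on a small batch. It assumes the
# function above is in scope and the transformers version it was written against; the
# `args` object here is a hypothetical namespace carrying only the mlm_probability
# hyper-parameter used by the function.
from argparse import Namespace

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
batch = tokenizer(
    ["This is an input example", "A second, slightly longer input example"],
    return_tensors="pt",
    padding=True,
)
args = Namespace(mlm_probability=0.15)  # probability of selecting each token for masking

inputs, labels = mask_tokens(batch["input_ids"], tokenizer, args)
print(inputs)   # masked positions hold the [MASK] id, a random id, or the original id
print(labels)   # original ids at masked positions, -100 (ignored by the loss) elsewhere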
torch.set_grad_enabled(False)

# Store the model we want to use
MODEL_NAME = "bert-base-cased"

# We need to create the model and tokenizer
model = AutoModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokens come from a process that splits the input into sub-entities with interesting linguistic properties.
tokens = tokenizer.tokenize("This is an input example")
print("Tokens: {}".format(tokens))

# This is not sufficient for the model, as it requires integers as input;
# not a problem, let's convert the tokens to ids.
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Tokens id: {}".format(tokens_ids))

# Add the required special tokens
tokens_ids = tokenizer.build_inputs_with_special_tokens(tokens_ids)

# We need to convert to a deep-learning-framework-specific format, let's use PyTorch for now.
tokens_pt = torch.tensor([tokens_ids])
print("Tokens PyTorch: {}".format(tokens_pt))

# Now we're ready to go through BERT with our input
outputs, pooled = model(tokens_pt)
print("Token wise output: {}, Pooled output: {}".format(outputs.shape, pooled.shape))

# Same thing factored into one line as follows
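
# --- One-line equivalent (illustrative sketch) ---
# A hedged sketch of the one-line pattern the comment above refers to: tokenization,
# id conversion, special tokens and tensor creation collapsed into a single tokenizer
# call. It assumes the same `tokenizer` and `model` objects; indexing the output by
# position keeps it working whether the model returns a plain tuple or a ModelOutput,
# depending on the transformers version installed.
tokens_pt2 = tokenizer("This is an input example", return_tensors="pt")
print("input_ids: {}".format(tokens_pt2["input_ids"]))

outputs2 = model(**tokens_pt2)
print("Token wise output: {}, Pooled output: {}".format(outputs2[0].shape, outputs2[1].shape))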