def forward(self, features, **kwargs):
    x = self.dense(features)
    x = gelu(x)
    x = self.layer_norm(x)

    # project back to size of vocabulary with bias
    x = self.decoder(x)

    return x
def forward(self, features, **kwargs):
    x = self.dense(features)
    x = gelu(x)
    x = self.layer_norm(x)

    # project back to size of labels
    x = self.decoder(x)

    return x
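# A minimal, self-contained sketch of the head module the two forwards above
# assume: dense transform -> GELU -> LayerNorm -> linear projection. The module
# name and the hidden/output sizes here are illustrative assumptions, not part
# of the original code.
import torch.nn as nn

class LMHead(nn.Module):
    def __init__(self, hidden_size=768, output_size=30522):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        # Projects the transformed features back to the output space
        # (vocabulary or label set), with a bias term.
        self.decoder = nn.Linear(hidden_size, output_size)

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = nn.functional.gelu(x)
        x = self.layer_norm(x)
        x = self.decoder(x)
        return x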
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
    r"""
    masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
        Labels for computing the masked language modeling loss.
        Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
        Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens
        with labels in ``[0, ..., config.vocab_size]``.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.DistilBertConfig`) and inputs:
        loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        from transformers import DistilBertTokenizer, DistilBertForMaskedLM
        import torch

        tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
        model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids, masked_lm_labels=input_ids)
        loss, prediction_scores = outputs[:2]
    """
    dlbrt_output = self.distilbert(
        input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
    )
    hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
    prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
    prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

    outputs = (prediction_logits,) + dlbrt_output[1:]
    if masked_lm_labels is not None:
        mlm_loss = self.mlm_loss_fct(
            prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
        )
        outputs = (mlm_loss,) + outputs

    return outputs  # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Labels for computing the masked language modeling loss.
        Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
        Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
        with labels in ``[0, ..., config.vocab_size]``.
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    dlbrt_output = self.distilbert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
    prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
    prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

    mlm_loss = None
    if labels is not None:
        mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))

    if not return_dict:
        output = (prediction_logits,) + dlbrt_output[1:]
        return ((mlm_loss,) + output) if mlm_loss is not None else output

    return MaskedLMOutput(
        loss=mlm_loss,
        logits=prediction_logits,
        hidden_states=dlbrt_output.hidden_states,
        attentions=dlbrt_output.attentions,
    )
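# A hedged usage sketch for the return_dict-aware forward above, mirroring the
# transformers MLM examples. The checkpoint name is illustrative and this
# assumes a transformers version (>= 3.x) that supports `return_dict`.
from transformers import DistilBertTokenizer, DistilBertForMaskedLM

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

# return_dict=True yields a MaskedLMOutput with named fields...
outputs = model(**inputs, labels=inputs["input_ids"], return_dict=True)
loss, logits = outputs.loss, outputs.logits

# ...while return_dict=False falls back to the legacy tuple layout.
loss, logits = model(**inputs, labels=inputs["input_ids"], return_dict=False)[:2]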
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
):
    outputs = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
    )
    # Per-layer hidden states; requires the model to be configured with
    # config.output_hidden_states=True, otherwise outputs[2] does not exist.
    hidden_output = outputs[2]
    # Stack the [CLS] vectors of the last three layers: (bs, 3, dim)
    last_cat = torch.stack(
        (hidden_output[-1][:, 0], hidden_output[-2][:, 0], hidden_output[-3][:, 0]),
        1,
    )
    # Mix the three [CLS] vectors with learned softmax weights: (bs, 1, dim)
    last_cat = torch.matmul(torch.nn.functional.softmax(self.avg_weight, dim=-1), last_cat)
    last_cat = torch.squeeze(last_cat, 1)  # (bs, dim); keep the batch dim even when bs == 1
    last_cat = gelu(last_cat)
    logits = self.classifier(last_cat)

    outputs = [nn.functional.softmax(logits, -1)]
    if labels is not None:
        if self.num_labels == 1:
            # We are doing regression
            loss_fct = MSELoss()
            loss = loss_fct(logits.view(-1), labels.view(-1))
        else:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        outputs = [loss] + outputs

    return outputs  # (loss), probabilities
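# A standalone sketch of the pooling trick in the forward above: take the [CLS]
# vector from the last three hidden layers and mix them with a learned softmax
# weighting. All shapes here, and an avg_weight of shape (1, 3), are
# illustrative assumptions rather than values from the original model.
import torch

bs, seq_len, dim = 4, 16, 768
hidden_output = [torch.randn(bs, seq_len, dim) for _ in range(7)]  # per-layer states
avg_weight = torch.nn.Parameter(torch.ones(1, 3))

# (bs, 3, dim): [CLS] vectors (position 0) of the last three layers
last_cat = torch.stack(
    (hidden_output[-1][:, 0], hidden_output[-2][:, 0], hidden_output[-3][:, 0]), 1
)
# Softmax turns avg_weight into mixing coefficients; matmul broadcasts
# (1, 3) @ (bs, 3, dim) -> (bs, 1, dim)
pooled = torch.matmul(torch.nn.functional.softmax(avg_weight, dim=-1), last_cat)
pooled = pooled.squeeze(1)  # (bs, dim)
print(pooled.shape)  # torch.Size([4, 768])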
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    # New argument:
    select_positions=None,
    **kwargs
):
    r"""
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Labels for computing the masked language modeling loss.
        Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
        Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
        with labels in ``[0, ..., config.vocab_size]``.
    kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
        Used to hide legacy arguments that have been deprecated.
    """
    if "masked_lm_labels" in kwargs:
        warnings.warn(
            "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
            FutureWarning,
        )
        labels = kwargs.pop("masked_lm_labels")
    assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    dlbrt_output = self.distilbert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)

    ### START MODIFICATION
    # Only apply the MLM head to the desired positions.
    if select_positions is not None:
        hidden_states = hidden_states[[[i] for i in range(hidden_states.shape[0])], select_positions, :]
    ### END MODIFICATION

    prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
    prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

    mlm_loss = None
    if labels is not None:
        mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))

    if not return_dict:
        output = (prediction_logits,) + dlbrt_output[1:]
        return ((mlm_loss,) + output) if mlm_loss is not None else output

    return MaskedLMOutput(
        loss=mlm_loss,
        logits=prediction_logits,
        hidden_states=dlbrt_output.hidden_states,
        attentions=dlbrt_output.attentions,
    )
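# A hypothetical demonstration of the select_positions modification above:
# run the MLM head only at the positions of interest (e.g. where the [MASK]
# token sits) instead of the full sequence. Shapes and the position values are
# illustrative assumptions.
import torch

bs, seq_len = 2, 10
# One position of interest per example.
select_positions = torch.tensor([[3], [7]])  # (bs, 1)

# With hidden_states of shape (bs, seq_len, dim), the fancy indexing
# hidden_states[[[i] for i in range(bs)], select_positions, :]
# picks example i's hidden state at select_positions[i], giving (bs, 1, dim),
# so the vocab transform/projector only run on the selected tokens.
hidden_states = torch.randn(bs, seq_len, 768)
picked = hidden_states[[[i] for i in range(bs)], select_positions, :]
print(picked.shape)  # torch.Size([2, 1, 768])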
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None,
            output_attentions=None, output_hidden_states=None, **kwargs):
    r"""
    labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
        Labels for computing the masked language modeling loss.
        Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
        Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens
        with labels in ``[0, ..., config.vocab_size]``.
    kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
        Used to hide legacy arguments that have been deprecated.

    Returns:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.DistilBertConfig`) and inputs:
        loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
            Masked language modeling loss.
        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is
            passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or
            when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """
    if "masked_lm_labels" in kwargs:
        warnings.warn(
            "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
            DeprecationWarning,
        )
        labels = kwargs.pop("masked_lm_labels")
    assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

    dlbrt_output = self.distilbert(
        input_ids=input_ids,
        attention_mask=attention_mask,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
    )
    hidden_states = dlbrt_output[0]  # (bs, seq_length, dim)
    prediction_logits = self.vocab_transform(hidden_states)  # (bs, seq_length, dim)
    prediction_logits = gelu(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_layer_norm(prediction_logits)  # (bs, seq_length, dim)
    prediction_logits = self.vocab_projector(prediction_logits)  # (bs, seq_length, vocab_size)

    outputs = (prediction_logits,) + dlbrt_output[1:]
    if labels is not None:
        mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
        outputs = (mlm_loss,) + outputs

    return outputs  # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)