def forward(
    self,
    input_ids=None,
    past_key_values=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    use_lm=False,
    **kwargs,
):
    if "past" in kwargs:
        warnings.warn(
            "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
            FutureWarning,
        )
        past_key_values = kwargs.pop("past")
    assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])
    if position_ids is not None:
        position_ids = position_ids.view(-1, input_shape[-1])

    if past_key_values is None:
        past_length = 0
        past_key_values = [None] * len(self.h)
    else:
        past_length = past_key_values[0][0].size(-2)

    device = input_ids.device if input_ids is not None else inputs_embeds.device
    if position_ids is None:
        position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

    # Attention mask.
    if attention_mask is not None:
        assert batch_size > 0, "batch_size has to be defined and > 0"
        attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        attention_mask = attention_mask[:, None, None, :]

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        attention_mask = (1.0 - attention_mask) * -10000.0

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.add_cross_attention and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
        encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
    elif use_lm:
        encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
    else:
        encoder_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicates we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # head_mask has shape n_layer x batch x n_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.n_layer)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    if token_type_ids is not None:
        token_type_embeds = self.wte(token_type_ids)
    else:
        token_type_embeds = 0
    hidden_states = inputs_embeds + position_embeds + token_type_embeds
    hidden_states = self.drop(hidden_states)

    output_shape = input_shape + (hidden_states.size(-1),)

    presents = () if use_cache else None
    all_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None

    if use_lm:
        hidden_states = torch.cat((encoder_hidden_states, inputs_embeds), 1)
        attention_mask = torch.cat((encoder_attention_mask, attention_mask), 3)
        y_len = inputs_embeds.shape[-2]
        encoder_hidden_states = None
        encoder_attention_mask = None
    else:
        y_len = None

    for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

        outputs = block(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask[i],
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            decoder_input_length=y_len,
        )

        hidden_states, present = outputs[:2]
        if use_cache is True:
            presents = presents + (present,)

        if output_attentions:
            all_attentions = all_attentions + (outputs[2],)

    hidden_states = self.ln_f(hidden_states)
    if use_lm:
        hidden_states = hidden_states[:, -output_shape[-2]:, :]
    else:
        hidden_states = hidden_states.view(*output_shape)
    # Add last hidden state
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_attentions,
    )
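# Usage sketch for the forward above. `model` is a placeholder for an instance of
# the surrounding module; the vocabulary size and tensor shapes below are
# illustrative assumptions, not values taken from this file.
#
#   import torch
#
#   input_ids = torch.randint(0, 50257, (1, 8))        # (batch_size, seq_len)
#   attention_mask = torch.ones_like(input_ids)        # 1 = attend, 0 = masked
#   out = model(input_ids=input_ids,
#               attention_mask=attention_mask,
#               use_cache=True,
#               return_dict=True)
#   # out.last_hidden_state: (1, 8, hidden_size); out.past_key_values holds the
#   # per-layer key/value cache, so the next step only needs the newest token:
#   step = model(input_ids=input_ids[:, -1:],
#                past_key_values=out.past_key_values,
#                use_cache=True,
#                return_dict=True)
#
#   # With use_lm=True, encoder_hidden_states (already in the model's hidden size)
#   # are prepended to the token embeddings along the sequence dimension and the two
#   # additive masks are concatenated, so encoder_attention_mask should be a
#   # (batch, src_len) 0/1 mask covering the prepended states.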
def forward(
    self,
    input_ids=None,
    past=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    use_cache=True,
    return_dict=None,
):
    r"""
    Return:
        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration
        (:class:`~transformers.GPT2Config`) and inputs:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the last layer of the model.
            If `past` is used, only the last hidden-state of the sequences of shape
            :obj:`(batch_size, 1, hidden_size)` is output.
        past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape
            :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`):
            Contains pre-computed hidden-states (key and values in the attention blocks).
            Can be used (see `past` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
            Attention weights after the attention softmax, used to compute the weighted average in the
            self-attention heads.

    Examples::

        from transformers import GPT2Tokenizer, GPT2Model
        import torch

        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        model = GPT2Model.from_pretrained('gpt2')

        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
        outputs = model(input_ids)

        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
    """
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])
    if position_ids is not None:
        position_ids = position_ids.view(-1, input_shape[-1])

    if past is None:
        past_length = 0
        past = [None] * len(self.h)
    else:
        past_length = past[0][0].size(-2)
    if position_ids is None:
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

    # Attention mask.
    if attention_mask is not None:
        assert batch_size > 0, "batch_size has to be defined and > 0"
        attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
        attention_mask = (1.0 - attention_mask) * -10000.0

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # head_mask has shape n_layer x batch x n_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.n_layer)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    if token_type_ids is not None:
        token_type_embeds = self.wte(token_type_ids)
    else:
        token_type_embeds = 0
    hidden_states = inputs_embeds + position_embeds + token_type_embeds
    hidden_states = self.drop(hidden_states)
    if _USE_GROVER:
        hidden_states = self.emb_norm(hidden_states)

    output_shape = input_shape + (hidden_states.size(-1),)

    presents = ()
    all_attentions = []
    all_hidden_states = ()
    for i, (block, layer_past) in enumerate(zip(self.h, past)):
        if self.output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)

        outputs = block(
            hidden_states,
            layer_past=layer_past,
            attention_mask=attention_mask,
            head_mask=head_mask[i],
            use_cache=use_cache,
        )

        hidden_states, present = outputs[:2]
        if use_cache is True:
            presents = presents + (present,)

        if self.output_attentions:
            all_attentions.append(outputs[2])

    if not _USE_GROVER:
        hidden_states = self.ln_f(hidden_states)

    hidden_states = hidden_states.view(*output_shape)
    # Add last hidden state
    if self.output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    # outputs = (hidden_states,)
    # if use_cache is True:
    #     outputs = outputs + (presents,)
    # if self.output_hidden_states:
    #     outputs = outputs + (all_hidden_states,)
    # if self.output_attentions:
    #     # let the number of heads free (-1) so we can extract attention even after head pruning
    #     attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
    #     all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
    #     outputs = outputs + (all_attentions,)
    # return outputs  # last hidden state, (presents), (all hidden_states), (attentions)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_attentions,
    )
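# Sketch of cached decoding with this variant, which still takes the legacy `past`
# argument and reads `output_*` flags from the instance. `model` and `tokenizer`
# follow the docstring example above; the token chosen for the second step is an
# illustrative assumption.
#
#   input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)
#   outputs = model(input_ids, use_cache=True, return_dict=True)
#   # outputs.past_key_values: one cached key/value tensor per block.
#   next_token = torch.tensor([[tokenizer.encode(" and")[0]]])
#   outputs = model(next_token, past=outputs.past_key_values, use_cache=True, return_dict=True)
#   # outputs.last_hidden_state now has sequence length 1 (only the new token).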
def forward(
    self,
    input_ids,
    encoder_hidden_states,
    encoder_padding_mask,
    decoder_padding_mask,
    decoder_causal_mask,
    past_key_values=None,
    use_cache=False,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=False,
    **unused,
):
    """
    Includes several features from "Jointly Learning to Align and Translate with
    Transformer Models" (Garg et al., EMNLP 2019).

    Args:
        input_ids (LongTensor): previous decoder outputs of shape
            `(batch, tgt_len)`, for teacher forcing
        encoder_hidden_states: output from the encoder, used for
            encoder-side attention
        encoder_padding_mask: for ignoring pad tokens
        past_key_values (dict or None): dictionary used for storing state during generation

    Returns:
        BaseModelOutputWithPast or tuple:
            - the decoder's features of shape `(batch, tgt_len, embed_dim)`
            - the cache
            - hidden states
            - attentions
    """
    if "decoder_cached_states" in unused:
        warnings.warn(
            "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
            FutureWarning,
        )
        past_key_values = unused.pop("decoder_cached_states")
    if "decoder_past_key_values" in unused:
        warnings.warn(
            "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
            FutureWarning,
        )
        past_key_values = unused.pop("decoder_past_key_values")

    # check attention mask and invert
    if encoder_padding_mask is not None:
        encoder_padding_mask = invert_mask(encoder_padding_mask)

    _, seq_len = input_ids.shape[:2]
    # decoder positions start at max_encoder_length (default: 128)
    positions = (
        self.position_start_idx + torch.arange(seq_len, dtype=torch.long, device=input_ids.device)
    ).unsqueeze(0)

    if use_cache:
        input_ids = input_ids[:, -1:]
        positions = positions[:, -1:]  # happens after we embed them
        # assert input_ids.ne(self.padding_idx).any()

    x = self.embed_tokens(input_ids)
    x = x * (self.embed_dim ** 0.5)
    embed_pos = self.embed_positions(positions)
    x += embed_pos
    x = F.dropout(x, p=self.dropout, training=self.training)

    # Convert to the layers' format: (BS, seq_len, model_dim) -> (seq_len, BS, model_dim)
    x = x.transpose(0, 1)
    encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

    # decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = []
    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (x,)

        layer_state = past_key_values[idx] if past_key_values is not None else None

        x, layer_self_attn, layer_past = decoder_layer(
            x,
            encoder_hidden_states,
            encoder_attn_mask=encoder_padding_mask,
            decoder_padding_mask=decoder_padding_mask,
            layer_state=layer_state,
            causal_mask=decoder_causal_mask,
            output_attentions=output_attentions,
        )

        if use_cache:
            next_decoder_cache.append(layer_past.copy())

        if output_attentions:
            all_self_attns += (layer_self_attn,)

    # Convert back to standard output format: (seq_len, BS, model_dim) -> (BS, seq_len, model_dim)
    if output_hidden_states:
        all_hidden_states = tuple(hidden_state.transpose(0, 1) for hidden_state in all_hidden_states)
    x = x.transpose(0, 1)
    encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

    next_cache = next_decoder_cache if use_cache else None

    if not return_dict:
        return tuple(v for v in [x, next_cache, all_hidden_states, all_self_attns] if v is not None)
    return BaseModelOutputWithPast(
        last_hidden_state=x,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )
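# Call sketch for the decoder forward above. `decoder` stands for an instance of
# this module and `encoder_out` for the encoder's (batch, src_len, embed_dim)
# output; these names, the 1-means-keep padding-mask convention, and the causal
# mask construction are illustrative assumptions, not helpers defined in this file.
#
#   import torch
#
#   batch, src_len, tgt_len, embed_dim = 2, 16, 5, 512
#   encoder_out = torch.randn(batch, src_len, embed_dim)
#   encoder_padding_mask = torch.ones(batch, src_len)   # assumed 1 = real token; invert_mask() flips it
#   decoder_input_ids = torch.randint(0, 1000, (batch, tgt_len))
#   # Additive causal mask: 0 on/below the diagonal, -inf above it.
#   causal_mask = torch.triu(torch.full((tgt_len, tgt_len), float("-inf")), diagonal=1)
#   out = decoder(
#       decoder_input_ids,
#       encoder_out,
#       encoder_padding_mask,
#       decoder_padding_mask=None,
#       decoder_causal_mask=causal_mask,
#       use_cache=False,
#       return_dict=True,
#   )
#   out.last_hidden_state  # (batch, tgt_len, embed_dim)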
def forward(
    self,
    input_ids=None,
    past_key_values=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
    elif input_ids is not None:
        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])
        batch_size = input_ids.shape[0]
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size = inputs_embeds.shape[0]
    else:
        raise ValueError("You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if token_type_ids is not None:
        token_type_ids = token_type_ids.view(-1, input_shape[-1])
    if position_ids is not None:
        position_ids = position_ids.view(-1, input_shape[-1])

    if past_key_values is None:
        # print("!!!past_key_values is None!!!")
        past_length = 0
        past_key_values = tuple([None] * len(self.h))
    else:
        past_length = past_key_values[0][0].size(-2)

    if position_ids is None:
        position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
        position_ids = position_ids.unsqueeze(0).view(-1, input_shape[-1])

    # Attention mask.
    if attention_mask is not None:
        assert batch_size > 0, "batch_size has to be defined and > 0"
        global_attention_mask = attention_mask.view(batch_size, -1)
        # We create a 3D attention mask from a 2D tensor mask.
        # Sizes are [batch_size, 1, 1, to_seq_length]
        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
        # this attention mask is more simple than the triangular masking of causal attention
        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
        global_attention_mask = global_attention_mask[:, None, None, :]

        # Since global_attention_mask is 1.0 for positions we want to attend and 0.0 for
        # masked positions, this operation will create a tensor which is 0.0 for
        # positions we want to attend and -10000.0 for masked positions.
        # Since we are adding it to the raw scores before the softmax, this is
        # effectively the same as removing these entirely.
        global_attention_mask = global_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        global_attention_mask = (1.0 - global_attention_mask) * -10000.0
    else:
        global_attention_mask = None

    # Local causal attention mask
    batch_size, seq_length = input_shape
    full_seq_length = seq_length + past_length

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x num_heads x N x N
    # head_mask has shape n_layer x batch x num_heads x N x N
    head_mask = self.get_head_mask(head_mask, self.config.num_layers)

    if inputs_embeds is None:
        inputs_embeds = self.wte(input_ids)
    position_embeds = self.wpe(position_ids)
    hidden_states = inputs_embeds + position_embeds

    if token_type_ids is not None:
        token_type_embeds = self.wte(token_type_ids)
        hidden_states = hidden_states + token_type_embeds

    hidden_states = self.drop(hidden_states)

    output_shape = input_shape + (hidden_states.size(-1),)

    presents = () if use_cache else None
    all_self_attentions = () if output_attentions else None
    all_hidden_states = () if output_hidden_states else None
    for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
        attn_type = self.config.attention_layers[i]
        attn_mask = global_attention_mask

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if getattr(self.config, "gradient_checkpointing", False) and self.training:
            if use_cache:
                logger.warning(
                    "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting "
                    "`use_cache=False`..."
                )
                use_cache = False

            def create_custom_forward(module):
                def custom_forward(*inputs):
                    # None for past_key_value
                    return module(*inputs, use_cache, output_attentions)

                return custom_forward

            outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(block),
                hidden_states,
                None,
                attn_mask,
                head_mask[i],
            )
        else:
            try:
                outputs = block(
                    hidden_states,
                    layer_past=layer_past,
                    attention_mask=attn_mask,
                    head_mask=head_mask[i],
                    use_cache=use_cache,
                    output_attentions=output_attentions,
                )
            except AfterStoppingPointException as e:
                raise e
            except Exception as e:
                print("failed with:")
                print(f"\t block {i}")
                print(f"\t input_ids.shape {input_ids.shape}")
                print(f"\t hidden_states.shape {hidden_states.shape}")
                print(f"\t past shapes {layer_past[0].shape if layer_past else layer_past}")
                raise e

        hidden_states = outputs[0]
        if use_cache is True:
            presents = presents + (outputs[1],)

        if output_attentions:
            all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)

    hidden_states = self.ln_f(hidden_states)

    hidden_states = hidden_states.view(*output_shape)
    # Add last hidden state
    if output_hidden_states:
        all_hidden_states = all_hidden_states + (hidden_states,)

    if not return_dict:
        return tuple(v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None)

    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=presents,
        hidden_states=all_hidden_states,
        attentions=all_self_attentions,
    )
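# Usage sketch for the forward above; `model` is a placeholder instance and the
# token ids and shapes are illustrative. The left-padded attention mask shows the
# additive-mask trick used internally: (1.0 - mask) * -10000.0 turns masked slots
# into large negative biases that the softmax effectively zeroes out.
#
#   import torch
#
#   input_ids = torch.tensor([[0, 0, 15496, 11, 995]])       # two padding slots, then tokens
#   attention_mask = torch.tensor([[0, 0, 1, 1, 1]])         # 0 = ignore, 1 = attend
#   # Internally: (1.0 - attention_mask) * -10000.0 -> [-10000, -10000, 0, 0, 0]
#   out = model(input_ids=input_ids,
#               attention_mask=attention_mask,
#               use_cache=True,
#               output_hidden_states=True,
#               return_dict=True)
#   # out.hidden_states: the embedding output plus one entry per block;
#   # out.past_key_values feeds the next incremental call together with an
#   # attention_mask extended by one more 1.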