def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    # This variant ignores most of the arguments above and always takes the
    # defaults from the config; `input_ids` is required.
    output_attentions = self.config.output_attentions
    output_hidden_states = self.config.output_hidden_states
    return_dict = self.config.use_return_dict
    use_cache = False

    input_shape = input_ids.size()
    batch_size, seq_length = input_shape
    device = input_ids.device
    past_key_values_length = 0

    attention_mask = torch.ones(
        ((batch_size, seq_length + past_key_values_length)), device=device)
    token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)
    encoder_extended_attention_mask = None
    head_mask = self.get_head_mask(None, self.config.num_hidden_layers)

    # Benchmarking hook: make sure all queued CUDA work is done before
    # reading the clock.
    torch.cuda.synchronize()
    start_time = datetime.datetime.now()

    embedding_output = self.embeddings(
        input_ids=input_ids,
        token_type_ids=token_type_ids,
    )
    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = (self.pooler(sequence_output)
                     if self.pooler is not None else None)

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        past_key_values=encoder_outputs.past_key_values,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
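# The call to get_extended_attention_mask above turns a (batch, seq) 0/1 padding
# mask into an additive (batch, 1, 1, seq) mask. A minimal, self-contained sketch
# of that transformation (standalone illustration, not the model's own helper):
import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])               # last two tokens are padding
extended = attention_mask[:, None, None, :].to(torch.float32)  # (batch, 1, 1, seq)
extended = (1.0 - extended) * -10000.0                         # 0.0 for kept tokens, -10000.0 for padding
print(extended.shape)                                          # torch.Size([1, 1, 1, 5])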
def sentemb_forward(
    cls,
    encoder,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    labels=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    return_dict = return_dict if return_dict is not None else cls.config.use_return_dict

    outputs = encoder(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
        head_mask=head_mask,
        inputs_embeds=inputs_embeds,
        output_attentions=output_attentions,
        output_hidden_states=True
        if cls.pooler_type in ['avg_top2', 'avg_first_last'] else False,
        return_dict=True,
    )

    pooler_output = cls.pooler(attention_mask, outputs)
    if cls.pooler_type == "cls" and not cls.model_args.mlp_only_train:
        pooler_output = cls.mlp(pooler_output)

    if not return_dict:
        return (outputs[0], pooler_output) + outputs[2:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        pooler_output=pooler_output,
        last_hidden_state=outputs.last_hidden_state,
        hidden_states=outputs.hidden_states,
    )
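# A hypothetical sketch of an "avg"-style pooler compatible with the call
# cls.pooler(attention_mask, outputs) above (the real Pooler class is not shown
# here): a mask-weighted mean of last_hidden_state over non-padding positions.
import torch
from types import SimpleNamespace


def avg_pool(attention_mask, outputs):
    hidden = outputs.last_hidden_state                    # (batch, seq, dim)
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)  # (batch, seq, 1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)   # (batch, dim)


outputs = SimpleNamespace(last_hidden_state=torch.randn(2, 4, 8))
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
print(avg_pool(attention_mask, outputs).shape)             # torch.Size([2, 8])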
def forward(
    self,
    input_ids=None,
    bbox=None,
    image=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    # Default the text bounding boxes before they are used below to build the
    # visual bounding-box grid (the original check came after that use and
    # could never be reached with bbox=None).
    if bbox is None:
        bbox = torch.zeros(tuple(list(input_shape) + [4]),
                           dtype=torch.long,
                           device=device)

    visual_shape = list(input_shape)
    visual_shape[1] = self.config.image_feature_pool_shape[
        0] * self.config.image_feature_pool_shape[1]
    visual_shape = torch.Size(visual_shape)
    final_shape = list(input_shape)
    final_shape[1] += visual_shape[1]
    final_shape = torch.Size(final_shape)

    visual_bbox_x = (torch.arange(
        0,
        1000 * (self.config.image_feature_pool_shape[1] + 1),
        1000,
        device=device,
        dtype=bbox.dtype,
    ) // self.config.image_feature_pool_shape[1])
    visual_bbox_y = (torch.arange(
        0,
        1000 * (self.config.image_feature_pool_shape[0] + 1),
        1000,
        device=device,
        dtype=bbox.dtype,
    ) // self.config.image_feature_pool_shape[0])
    visual_bbox = torch.stack(
        [
            visual_bbox_x[:-1].repeat(
                self.config.image_feature_pool_shape[0], 1),
            visual_bbox_y[:-1].repeat(
                self.config.image_feature_pool_shape[1], 1).transpose(0, 1),
            visual_bbox_x[1:].repeat(
                self.config.image_feature_pool_shape[0], 1),
            visual_bbox_y[1:].repeat(
                self.config.image_feature_pool_shape[1], 1).transpose(0, 1),
        ],
        dim=-1,
    ).view(-1, bbox.size(-1))
    visual_bbox = visual_bbox.repeat(final_shape[0], 1, 1)
    final_bbox = torch.cat([bbox, visual_bbox], dim=1)

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    visual_attention_mask = torch.ones(visual_shape, device=device)
    final_attention_mask = torch.cat(
        [attention_mask, visual_attention_mask], dim=1)

    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    if position_ids is None:
        seq_length = input_shape[1]
        position_ids = self.embeddings.position_ids[:, :seq_length]
        position_ids = position_ids.expand_as(input_ids)

    visual_position_ids = torch.arange(0,
                                       visual_shape[1],
                                       dtype=torch.long,
                                       device=input_ids.device).repeat(
                                           input_shape[0], 1)
    final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)

    text_layout_emb = self._calc_text_embeddings(
        input_ids=input_ids,
        bbox=bbox,
        token_type_ids=token_type_ids,
        position_ids=position_ids,
    )
    visual_emb = self._calc_img_embeddings(
        image=image,
        bbox=visual_bbox,
        position_ids=visual_position_ids,
    )
    final_emb = torch.cat([text_layout_emb, visual_emb], dim=1)

    extended_attention_mask = final_attention_mask.unsqueeze(1).unsqueeze(2)
    extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)
    extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

    if head_mask is not None:
        if head_mask.dim() == 1:
            head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
                -1).unsqueeze(-1)
            head_mask = head_mask.expand(self.config.num_hidden_layers, -1,
                                         -1, -1, -1)
        elif head_mask.dim() == 2:
            head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
        head_mask = head_mask.to(dtype=next(self.parameters()).dtype)
    else:
        head_mask = [None] * self.config.num_hidden_layers

    encoder_outputs = self.encoder(
        final_emb,
        extended_attention_mask,
        bbox=final_bbox,
        position_ids=final_position_ids,
        head_mask=head_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(sequence_output)

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
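# Toy illustration of the visual bounding-box grid built above, assuming a 2x2
# image feature pool (image_feature_pool_shape = [2, 2]) and the usual 0-1000
# normalized coordinates; the real forward does the same with the configured shape.
import torch

pool_h, pool_w = 2, 2
x_edges = torch.arange(0, 1000 * (pool_w + 1), 1000) // pool_w   # tensor([0, 500, 1000])
y_edges = torch.arange(0, 1000 * (pool_h + 1), 1000) // pool_h
visual_bbox = torch.stack(
    [
        x_edges[:-1].repeat(pool_h, 1),                  # x0 of each cell
        y_edges[:-1].repeat(pool_w, 1).transpose(0, 1),  # y0 of each cell
        x_edges[1:].repeat(pool_h, 1),                   # x1 of each cell
        y_edges[1:].repeat(pool_w, 1).transpose(0, 1),   # y1 of each cell
    ],
    dim=-1,
).view(-1, 4)
print(visual_bbox)
# tensor([[   0,    0,  500,  500],
#         [ 500,    0, 1000,  500],
#         [   0,  500,  500, 1000],
#         [ 500,  500, 1000, 1000]])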
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    bboxes=None,  # added bboxes argument
) -> BaseModelOutputWithPoolingAndCrossAttentions:
    """
    `bboxes` is a tensor of shape (batch_size, sequence_length, 4);
    see `RobertaModel` for the remaining args
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    if not self.config.is_decoder:
        use_cache = False

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size, seq_length = input_shape
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    past_key_values_length = past_key_values[0][0].shape[
        2] if past_key_values is not None else 0

    if attention_mask is None:
        attention_mask = torch.ones(
            ((batch_size, seq_length + past_key_values_length)),
            device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)

    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape,
                                                device=device)
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    embedding_output = self.embeddings(
        input_ids=input_ids,
        position_ids=position_ids,
        token_type_ids=token_type_ids,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
        bboxes=bboxes,  # added bboxes argument
    )

    # extended attention mask is applied additively, so we can simply inject
    # it with the relative biases
    bias = self.relative_bias(input_ids, bboxes)
    extended_attention_mask = extended_attention_mask + bias

    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(
        sequence_output) if self.pooler is not None else None

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        past_key_values=encoder_outputs.past_key_values,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
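# Minimal sketch of why the relative bias can be summed into the extended
# attention mask above: both are added to the raw attention scores before the
# softmax, so padded key positions stay at ~-10000 no matter what the bias is.
# The random tensors below are stand-ins for the model's scores and learned bias.
import torch

batch, heads, seq = 1, 2, 4
attention_mask = torch.tensor([[1, 1, 1, 0]])                        # last token is padding
extended_mask = (1.0 - attention_mask[:, None, None, :]) * -10000.0  # (batch, 1, 1, seq)
relative_bias = torch.randn(batch, heads, seq, seq)                  # stand-in for relative_bias(...)
scores = torch.randn(batch, heads, seq, seq)                         # stand-in for q @ k^T / sqrt(d)
probs = torch.softmax(scores + extended_mask + relative_bias, dim=-1)
print(probs[..., -1].max())                                          # ~0: the padded key stays masked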
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    soft_position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
    past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
        Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
        If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
        (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
        instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
    use_cache (:obj:`bool`, `optional`):
        If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
        decoding (see :obj:`past_key_values`).
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if self.config.is_decoder:
        use_cache = use_cache if use_cache is not None else self.config.use_cache
    else:
        use_cache = False

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size, seq_length = input_shape
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    # past_key_values_length
    past_key_values_length = past_key_values[0][0].shape[
        2] if past_key_values is not None else 0

    if attention_mask is None:
        attention_mask = torch.ones(
            ((batch_size, seq_length + past_key_values_length)),
            device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves in which case we
    # just need to make it broadcastable to all heads.
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape,
                                                device=device)
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    embedding_output = self.embeddings(
        input_ids=input_ids,
        position_ids=position_ids,
        soft_position_ids=soft_position_ids,
        token_type_ids=token_type_ids,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
    )
    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict)
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(
        sequence_output) if self.pooler is not None else None

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        past_key_values=encoder_outputs.past_key_values,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
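# Hedged sketch of the caching described in the docstring above: cached keys and
# values have shape (batch, num_heads, past_len, head_dim), so a decoding step
# only projects the newest token and concatenates it onto the cache.
import torch

batch, heads, head_dim = 1, 2, 8
past_k = torch.randn(batch, heads, 5, head_dim)    # cache from 5 earlier positions
past_v = torch.randn(batch, heads, 5, head_dim)
new_k = torch.randn(batch, heads, 1, head_dim)     # projection of the newest token only
new_v = torch.randn(batch, heads, 1, head_dim)
k = torch.cat([past_k, new_k], dim=2)              # (batch, heads, 6, head_dim)
v = torch.cat([past_v, new_v], dim=2)
q = torch.randn(batch, heads, 1, head_dim)         # query for the newest token
attn = torch.softmax(q @ k.transpose(-1, -2) / head_dim ** 0.5, dim=-1) @ v
print(attn.shape)                                  # torch.Size([1, 2, 1, 8])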
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)

    # TODO: remove redundant code
    # if token_type_ids is None:
    #     token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves in which case we
    # just need to make it broadcastable to all heads.
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape,
                                                device=device)
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    embedding_output = self.embeddings(input_ids=input_ids,
                                       position_ids=position_ids,
                                       token_type_ids=token_type_ids,
                                       inputs_embeds=inputs_embeds)

    # adapter stuff: also build the cross-attention mask when encoder states
    # are passed in even though the model is not configured as a decoder
    if (encoder_attention_mask is not None) and (encoder_hidden_states is not None):
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)

    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]

    # must not use the pooler output because CLS is the 2nd token now
    pooled_output = self.pooler(
        sequence_output) if self.pooler is not None else None

    # length embedding stuff: the first position carries the length logits
    if self.add_length_embedding:
        return {
            "length_logits": sequence_output[:, :1, :],
            "last_hidden_state": sequence_output[:, 1:, :]
        }

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    num_layers=None,
    num_layers_total=None,
    rng_seed=None,
    no_grad_embedding=False,
    drop_unused_layers=False,
    approximate_unused_layers=False,
    start_sampling_from=0,
    exclude_layers=tuple(),
    keep_last_layer=False,
    run_large_encoder=False,
):
    r"""
    encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
        ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
    past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
        Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
        If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
        (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
        instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
    use_cache (:obj:`bool`, `optional`):
        If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
        decoding (see :obj:`past_key_values`).
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    if not self.config.is_decoder:
        use_cache = False

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size, seq_length = input_shape
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    # past_key_values_length
    past_key_values_length = past_key_values[0][0].shape[
        2] if past_key_values is not None else 0

    if attention_mask is None:
        attention_mask = torch.ones(
            ((batch_size, seq_length + past_key_values_length)),
            device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves in which case we
    # just need to make it broadcastable to all heads.
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape,
                                                device=device)
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    with torch.set_grad_enabled(not no_grad_embedding and self.training
                                and start_sampling_from == 0
                                and torch.is_grad_enabled()):
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
            layer_normalizer=self.layer_normalizers[0]
            if self.layer_normalizers is not None else None,
        )
    # print(embedding_output[:, :6, :6])

    encoder_inputs = dict(
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        num_layers=num_layers,
        num_layers_total=num_layers_total,
        rng_seed=rng_seed,
        drop_unused_layers=drop_unused_layers,
        approximate_unused_layers=approximate_unused_layers,
        start_sampling_from=start_sampling_from,
        exclude_layers=exclude_layers,
        keep_last_layer=keep_last_layer,
    )

    if run_large_encoder:
        encoder_inputs.update(dict(layer_normalizers=self.layer_normalizers))
        encoder_outputs = self.encoder(embedding_output, **encoder_inputs)
    else:
        encoder_inputs.update(
            dict(layer_normalizers=self.layer_normalizers_small))
        encoder_outputs = self.small_encoder(
            self.embedding2encoder(embedding_output), **encoder_inputs)

    sequence_output = encoder_outputs[0]

    if not return_dict:
        return (sequence_output, None) + encoder_outputs[1:]

    rv = BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=None,
        past_key_values=encoder_outputs.past_key_values,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
    rv["layer_scales_loss"] = encoder_outputs["layer_scales_loss"]
    rv["selected_layers"] = encoder_outputs["selected_layers"]
    rv["n_grad_forward_layers"] = encoder_outputs["n_grad_forward_layers"]
    rv["n_forward_layers"] = encoder_outputs["n_forward_layers"]
    return rv
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if self.config.is_decoder:
        use_cache = use_cache if use_cache is not None else self.config.use_cache
    else:
        use_cache = False

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
        batch_size, seq_length = input_shape
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
        batch_size, seq_length = input_shape
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    # past_key_values_length
    past_key_values_length = past_key_values[0][0].shape[
        2] if past_key_values is not None else 0

    if attention_mask is None:
        attention_mask = torch.ones(
            ((batch_size, seq_length + past_key_values_length)),
            device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    extended_attention_mask = self.creat_diagonal_mask(attention_mask, device)
    encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    embedding_output, combined_embedding = self.embeddings(
        input_ids=input_ids,
        position_ids=position_ids,
        token_type_ids=token_type_ids,
        inputs_embeds=inputs_embeds,
        past_key_values_length=past_key_values_length,
    )
    encoder_outputs = self.encoder(
        embedding_output,
        combined_hidden=combined_embedding,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        past_key_values=past_key_values,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(
        sequence_output) if self.pooler is not None else None

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        past_key_values=encoder_outputs.past_key_values,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )
def forward(self, x: BaseModelOutputWithPoolingAndCrossAttentions):
    """
    S3E pooling. `x` is the model output and is expected to additionally expose
    `sequence_output`, `input_ids` and `padding_mask`; the precomputed S3E
    statistics come from `self.s3e_stats`:

    - token_weights: dict with key=token_id, value=weight in corpus
    - centroids: numpy array of shape (n_clusters, emb_dim) that describes the
      centroids of our clusters in the embedding space
    - token_to_cluster: numpy array of shape (vocab_size, 1) where
      token_to_cluster[i] = cluster_id that the token with id i belongs to
    - svd_components (optional): components from a truncated singular value
      decomposition (SVD, aka LSA) to be removed from the final sentence
      embeddings in a postprocessing step. The SVD must be fit on a
      representative sample of sentence embeddings first and can then be
      removed from all subsequent embeddings in this function. We expect
      sklearn.decomposition.TruncatedSVD.fit(<your_embeddings>).components_
      to be passed here.

    :return: embeddings matrix of shape
        (batch_size, emb_dim + n_clusters * (n_clusters + 1) / 2)
    """
    token_vecs = x.sequence_output.cpu().numpy()
    # we only take the aggregated value of non-padding tokens
    padding_mask = x.padding_mask.cpu().numpy()
    ignore_mask_2d = x.padding_mask == 0
    # sometimes we want to exclude the CLS token as well from our aggregation operation
    if self.ignore_first_token:
        ignore_mask_2d[:, 0] = True
    ignore_mask_3d = np.zeros(token_vecs.shape, dtype=bool)
    ignore_mask_3d[:, :, :] = ignore_mask_2d[:, :, np.newaxis]

    mask = x.padding_mask == 0
    token_weights = self.s3e_stats["token_weights"]
    centroids = self.s3e_stats["centroids"]
    token_to_cluster = self.s3e_stats["token_to_cluster"]
    svd_components = self.s3e_stats.get("svd_components", None)

    embeddings = []
    n_clusters = centroids.shape[0]
    emb_dim = token_vecs.shape[2]
    # n_tokens = token_embs.shape[1]
    n_samples = token_vecs.shape[0]

    # Mask tokens that should be ignored (e.g. Padding, CLS ...)
    x.input_ids[mask] = -1
    # Work on a numpy copy so that token ids are plain ints (usable as dict keys).
    input_ids = x.input_ids.cpu().numpy()

    # Process each sentence in the batch at a time
    for sample_idx in range(n_samples):
        stage_vec = [{}]
        # 1) create a dict with key=tok_id, value=embedding
        for tok_idx, tok_id in enumerate(input_ids[sample_idx, :]):
            if tok_id != -1:
                stage_vec[-1][tok_id] = token_vecs[sample_idx, tok_idx]

        # 2) create a second dict with key=cluster_id, val=[embeddings] (= C)
        stage_vec.append({})
        for k, v in stage_vec[-2].items():
            cluster = token_to_cluster[k]
            if cluster in stage_vec[-1]:
                stage_vec[-1][cluster].append(stage_vec[-2][k] *
                                              token_weights[k])
            else:
                stage_vec[-1][cluster] = []
                stage_vec[-1][cluster].append(stage_vec[-2][k] *
                                              token_weights[k])

        # VLAD for each cluster
        for k, v in stage_vec[-1].items():
            # Centroids
            centroid_vec = centroids[k]
            # Residual
            v = [wv - centroid_vec for wv in v]
            stage_vec[-1][k] = np.sum(v, 0)

        # Compute Sentence Embedding (weighted avg, dim = original embedding dim)
        sentvec = []
        vec = np.zeros((emb_dim))
        for key, value in stage_vec[0].items():
            # print(token_weights[key])
            vec = vec + value * token_weights[key]
        sentvec.append(vec / len(stage_vec[0].keys()))

        # Covariance Descriptor (dim = k*(k+1)/2, with k=n_clusters)
        matrix = np.zeros((n_clusters, emb_dim))
        for j in range(n_clusters):
            if j in stage_vec[-1]:
                matrix[j, :] = stage_vec[-1][j]
        matrix_no_mean = matrix - matrix.mean(1)[:, np.newaxis]
        cov = matrix_no_mean.dot(matrix_no_mean.T)

        # Generate Embedding
        iu1 = np.triu_indices(cov.shape[0])
        iu2 = np.triu_indices(cov.shape[0], 1)
        cov[iu2] = cov[iu2] * np.sqrt(2)
        vec = cov[iu1]
        vec = vec / np.linalg.norm(vec)
        sentvec.append(vec)

        # Concatenate weighted avg + covariance descriptors
        sentvec = np.concatenate(sentvec)
        embeddings.append(sentvec)

    embeddings = np.vstack(embeddings)

    # Post processing (removal of first principal component)
    if svd_components is not None:
        embeddings = embeddings - embeddings.dot(
            svd_components.transpose()) * svd_components
    return embeddings
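# Toy version of the covariance descriptor computed above: with n_clusters = 3
# the flattened upper triangle has 3 * (3 + 1) / 2 = 6 entries; scaling the
# off-diagonal entries by sqrt(2) preserves the Frobenius norm of the symmetric matrix.
import numpy as np

n_clusters, emb_dim = 3, 5
matrix = np.random.randn(n_clusters, emb_dim)        # stand-in for the per-cluster VLAD residuals
matrix_no_mean = matrix - matrix.mean(1)[:, np.newaxis]
cov = matrix_no_mean.dot(matrix_no_mean.T)           # (n_clusters, n_clusters)
iu1 = np.triu_indices(cov.shape[0])                  # upper triangle incl. the diagonal
iu2 = np.triu_indices(cov.shape[0], 1)               # strictly upper triangle
cov[iu2] = cov[iu2] * np.sqrt(2)
vec = cov[iu1]
vec = vec / np.linalg.norm(vec)
print(vec.shape)                                     # (6,)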
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    shifter_embedding=None,
    position_ids=None,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
    """
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves in which case we
    # just need to make it broadcastable to all heads.
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape,
                                                device=device)
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    # use audio embedding if non-zero
    inputs_embeds = self.word_embedding_predictor(shifter_embedding)
    inputs_embeds_original = self.get_input_embeddings()(input_ids)
    inputs_embeds = torch.where(
        torch.sum(torch.abs(shifter_embedding), dim=-1, keepdim=True) > 0.0,
        inputs_embeds, inputs_embeds_original)

    if hasattr(self.config, 'word_predictor_pre_training'
               ) and self.config.word_predictor_pre_training:
        loss_fct = L1Loss()
        word_predictor_loss = loss_fct(inputs_embeds,
                                       inputs_embeds_original.detach())
        word_predictor_loss = word_predictor_loss * 100

    # inputs_embeds_shifter = self.word_embedding_shifter(shifter_embedding)
    # inputs_embeds = inputs_embeds + inputs_embeds_shifter
    # inputs_embeds = torch.where(torch.sum(torch.abs(shifter_embedding), dim=-1, keepdim=True) > 0.0,
    #                             inputs_embeds, inputs_embeds_original)

    embedding_output = self.embeddings(input_ids=input_ids,
                                       position_ids=position_ids,
                                       token_type_ids=token_type_ids,
                                       inputs_embeds=inputs_embeds)
    encoder_outputs = self.encoder(
        embedding_output,
        # shifter_embedding,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(
        sequence_output) if self.pooler is not None else None

    # length_output = sequence_output.permute(0, 2, 1)
    # length_output = self.length_predictor(length_output)
    # length_output = length_output.squeeze(1)

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    if hasattr(self.config, 'word_predictor_pre_training'
               ) and self.config.word_predictor_pre_training:
        return BaseModelOutputWithPoolingAndCrossAttentionsAndWordPredictorLoss(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
            word_predictor_loss=word_predictor_loss,
        )

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
        # length_output=length_output,
    )
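# Toy version of the torch.where gating above: positions whose shifter embedding
# is all zeros keep the original word embedding, non-zero positions take the
# predicted one. The constant tensors stand in for the two embedding sources.
import torch

batch, seq, dim = 1, 3, 4
shifter_embedding = torch.zeros(batch, seq, 8)
shifter_embedding[:, 1] = 1.0                        # only position 1 carries an audio signal
predicted = torch.full((batch, seq, dim), 2.0)       # stand-in for word_embedding_predictor(...)
original = torch.zeros(batch, seq, dim)              # stand-in for get_input_embeddings()(input_ids)
gate = torch.sum(torch.abs(shifter_embedding), dim=-1, keepdim=True) > 0.0
mixed = torch.where(gate, predicted, original)
print(mixed[0, :, 0])                                # tensor([0., 2., 0.])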
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    token_type_ids=None,
    position_ids=None,
    entity_ids=None,
    entity_masking=None,
    entity_probs=None,
    entity_word_embeds_dropout=False,
    head_mask=None,
    inputs_embeds=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
    embedding_output=None,
):
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (output_hidden_states if output_hidden_states
                            is not None else self.config.output_hidden_states)
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both input_ids and inputs_embeds at the same time"
        )
    elif input_ids is not None:
        input_shape = input_ids.size()
    elif inputs_embeds is not None:
        input_shape = inputs_embeds.size()[:-1]
    else:
        raise ValueError(
            "You have to specify either input_ids or inputs_embeds")

    device = input_ids.device if input_ids is not None else inputs_embeds.device

    if attention_mask is None:
        attention_mask = torch.ones(input_shape, device=device)
    if token_type_ids is None:
        token_type_ids = torch.zeros(input_shape,
                                     dtype=torch.long,
                                     device=device)

    # We can provide a self-attention mask of dimensions
    # [batch_size, from_seq_length, to_seq_length] ourselves in which case we
    # just need to make it broadcastable to all heads.
    extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
        attention_mask, input_shape, device)

    # If a 2D or 3D attention mask is provided for the cross-attention
    # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
    if self.config.is_decoder and encoder_hidden_states is not None:
        encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
        encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
        if encoder_attention_mask is None:
            encoder_attention_mask = torch.ones(encoder_hidden_shape,
                                                device=device)
        encoder_extended_attention_mask = self.invert_attention_mask(
            encoder_attention_mask)
    else:
        encoder_extended_attention_mask = None

    # Prepare head mask if needed
    # 1.0 in head_mask indicate we keep the head
    # attention_probs has shape bsz x n_heads x N x N
    # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
    # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
    head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

    if embedding_output is None:
        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            entity_ids=entity_ids,
            entity_masking=entity_masking,
            entity_probs=entity_probs,
            entity_word_embeds_dropout=entity_word_embeds_dropout)
    encoder_outputs = self.encoder(
        embedding_output,
        attention_mask=extended_attention_mask,
        head_mask=head_mask,
        encoder_hidden_states=encoder_hidden_states,
        encoder_attention_mask=encoder_extended_attention_mask,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )
    sequence_output = encoder_outputs[0]
    pooled_output = self.pooler(
        sequence_output) if self.pooler is not None else None

    if not return_dict:
        return (sequence_output, pooled_output) + encoder_outputs[1:]

    return BaseModelOutputWithPoolingAndCrossAttentions(
        last_hidden_state=sequence_output,
        pooler_output=pooled_output,
        hidden_states=encoder_outputs.hidden_states,
        attentions=encoder_outputs.attentions,
        cross_attentions=encoder_outputs.cross_attentions,
    )