def forward( self, hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None if attention_mask is not None: # make sure padded tokens output 0 hidden_states[~attention_mask] = 0.0 # extend attention_mask attention_mask = (1.0 - attention_mask[:, None, None, :].to( dtype=hidden_states.dtype)) * -10000.0 attention_mask = attention_mask.expand(attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]) position_embeddings = self.pos_conv_embed(hidden_states) hidden_states = hidden_states + position_embeddings hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() for layer in self.layers: if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = np.random.uniform(0, 1) skip_the_layer = True if self.training and ( dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: # under deepspeed zero3 all gpus must run in sync if getattr(self.config, "gradient_checkpointing", False) and self.training: # create gradient checkpointing function def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs, output_attentions) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, ) else: layer_outputs = layer(hidden_states, attention_mask=attention_mask, output_attentions=output_attentions) hidden_states = layer_outputs[0] if skip_the_layer: layer_outputs = (None, None) if output_attentions: all_self_attentions = all_self_attentions + ( layer_outputs[1], ) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states, ) if not return_dict: return tuple( v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions, )
def forward( self, hidden_states, attention_mask=None, output_attentions=False, output_hidden_states=False, return_dict=True, ): all_hidden_states = () if output_hidden_states else None all_self_attentions = () if output_attentions else None if attention_mask is not None: # make sure padded tokens output 0 hidden_states[~attention_mask] = 0.0 input_lengths = (attention_mask.long()).sum(-1) # apply pooling formula to get real output_lengths output_lengths = input_lengths // self.config.squeeze_factor max_encoder_length = hidden_states.shape[1] // self.config.squeeze_factor attention_ids = ( torch.arange(0, max_encoder_length, device=output_lengths.device) .view(1, -1) .expand(output_lengths.shape[0], -1) ) attention_mask = (attention_ids < output_lengths.view(-1, 1)).long() # extend attention_mask attention_mask = (1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)) * -10000.0 attention_mask = attention_mask.expand( attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1] ) n_input_timesteps = hidden_states.shape[1] hidden_states = hidden_states.transpose(1, 2) position_embeddings = self.pos_conv_embed(hidden_states) pooled_hidden_states = self.pool(hidden_states) min_length = min(position_embeddings.size(-1), pooled_hidden_states.size(-1)) hidden_states = pooled_hidden_states[..., :min_length] + position_embeddings[..., :min_length] hidden_states = hidden_states.transpose(1, 2) hidden_states = self.layer_norm(hidden_states) hidden_states = self.dropout(hidden_states) deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled() for layer in self.layers: if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) dropout_probability = np.random.uniform(0, 1) skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False if not skip_the_layer or deepspeed_zero3_is_enabled: # under deepspeed zero3 all gpus must run in sync if self.gradient_checkpointing and self.training: # create gradient checkpointing function def create_custom_forward(module): def custom_forward(*inputs): return module(*inputs, output_attentions) return custom_forward layer_outputs = torch.utils.checkpoint.checkpoint( create_custom_forward(layer), hidden_states, attention_mask, ) else: layer_outputs = layer( hidden_states, attention_mask=attention_mask, output_attentions=output_attentions ) hidden_states = layer_outputs[0] if skip_the_layer: layer_outputs = (None, None) if output_attentions: all_self_attentions = all_self_attentions + (layer_outputs[1],) if output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) hidden_states = self.upsample(hidden_states) if hidden_states.shape[1] < n_input_timesteps: hidden_states = nn.functional.pad(hidden_states, (0, 0, 0, n_input_timesteps - hidden_states.shape[1])) if not return_dict: return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions, )