def forward(self, src_tokens, src_lengths):
    x, input_lengths = self.subsample(src_tokens, src_lengths)
    x = self.embed_scale * x

    encoder_padding_mask = lengths_to_padding_mask(input_lengths)
    positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
    x += positions
    x = self.dropout_module(x)

    for layer in self.transformer_layers:
        x = layer(x, encoder_padding_mask)

    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    return EncoderOut(
        encoder_out=x,
        encoder_padding_mask=encoder_padding_mask,
        encoder_embedding=None,
        encoder_states=None,
        src_tokens=None,
        src_lengths=None,
    )

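# Every encoder in this section relies on fairseq's `lengths_to_padding_mask` helper to turn
# a (B,) lengths tensor into a boolean B x T mask that is True at padded positions. The
# sketch below is a reference-only approximation of that behavior; the name
# `_lengths_to_padding_mask_sketch` is hypothetical, and the shipped helper in
# fairseq.data.data_utils should be used in practice.
import torch


def _lengths_to_padding_mask_sketch(lens: torch.Tensor) -> torch.Tensor:
    # lens: (B,) int lengths -> (B, T_max) bool mask, True where the position is padding
    bsz, max_len = lens.size(0), int(lens.max().item())
    positions = torch.arange(max_len, device=lens.device).expand(bsz, max_len)
    return positions >= lens.unsqueeze(1)
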
def forward(self, src_tokens, src_lengths=None, speaker=None,
            durations=None, pitches=None, energies=None, **kwargs):
    x = self.embed_tokens(src_tokens)

    enc_padding_mask = src_tokens.eq(self.padding_idx)
    x += self.pos_emb_alpha * self.embed_positions(enc_padding_mask)
    x = self.dropout_module(x)

    for layer in self.encoder_fft_layers:
        x = layer(x, enc_padding_mask)

    if self.embed_speaker is not None:
        bsz, seq_len, _ = x.size()
        emb = self.embed_speaker(speaker).expand(bsz, seq_len, -1)
        x = self.spk_emb_proj(torch.cat([x, emb], dim=2))

    x, out_lens, log_dur_out, pitch_out, energy_out = self.var_adaptor(
        x, enc_padding_mask, durations, pitches, energies
    )

    dec_padding_mask = lengths_to_padding_mask(out_lens)
    x += self.dec_pos_emb_alpha * self.embed_positions(dec_padding_mask)
    for layer in self.decoder_fft_layers:
        x = layer(x, dec_padding_mask)

    x = self.out_proj(x)

    x_post = None
    if self.postnet is not None:
        x_post = x + self.postnet(x)
    return x, x_post, out_lens, log_dur_out, pitch_out, energy_out

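# `var_adaptor` above is a FastSpeech2-style variance adaptor whose internals are not shown
# in this excerpt. Its core length-regulation step expands each encoder state by its
# (predicted or ground-truth) duration. A minimal sketch of that step only, assuming integer
# durations of shape (B, T_enc); the helper name `_length_regulate_sketch` is hypothetical,
# and the real adaptor additionally predicts durations, pitch, and energy.
def _length_regulate_sketch(x: torch.Tensor, durations: torch.Tensor):
    # x: (B, T_enc, C); durations: (B, T_enc) long, zeros at padded positions
    out_lens = durations.sum(dim=1)
    out = x.new_zeros(x.size(0), int(out_lens.max()), x.size(2))
    for b in range(x.size(0)):
        # repeat each encoder frame durations[b, t] times along the time axis
        expanded = torch.repeat_interleave(x[b], durations[b], dim=0)  # (T_out_b, C)
        out[b, : expanded.size(0)] = expanded
    return out, out_lens
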
def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
    x, input_lengths = self.subsample(src_tokens, src_lengths)
    x = self.embed_scale * x

    encoder_padding_mask = lengths_to_padding_mask(input_lengths)
    positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
    x += positions
    x = self.dropout_module(x)

    encoder_states = []

    for layer in self.transformer_layers:
        x = layer(x, encoder_padding_mask)
        if return_all_hiddens:
            encoder_states.append(x)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [encoder_padding_mask]
        if encoder_padding_mask.any()
        else [],  # B x T
        "encoder_embedding": [],  # B x T x C
        "encoder_states": encoder_states,  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [],
    }

def forward(self, x, padding_mask: Optional[torch.Tensor]):
    if self.layernorm is not None:
        x = self.layernorm(x)

    if self.proj is not None:
        x = x + 0.5 * self.proj(x)
        x = self.proj_ln(x)

    # T x B x C -> B x C x T
    x = x.transpose(0, 1).transpose(1, 2)

    out_lens = None
    if padding_mask is not None:
        out_lens = (~padding_mask).sum(1).float()

    for layer in self.layers:
        # LayerDrop: during training, skip this conv block with probability self.layerdrop
        layerdrop_prob = np.random.random()
        if not self.training or (layerdrop_prob > self.layerdrop):
            x = nn.functional.glu(layer(x), dim=1)
            if padding_mask is not None:
                out_lens = ((out_lens - 1) / self.stride + 1).floor()

    # B x C x T -> T x B x C
    x = x.transpose(1, 2).transpose(0, 1)

    if self.post_proj is not None:
        x = x + 0.5 * self.post_proj(x)
        x = self.post_proj_ln(x)

    out_padding_mask = None
    if padding_mask is not None:
        out_padding_mask = lengths_to_padding_mask(out_lens.long())
    return x, out_padding_mask

def forward(self, src_tokens, src_lengths=None, **kwargs): padding_mask = lengths_to_padding_mask(src_lengths) if not padding_mask.any(): padding_mask = None out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True) x = out["encoder_out"] enc_padding_mask = None if out["encoder_padding_mask"] is not None: enc_padding_mask = out["encoder_padding_mask"].transpose( 0, 1 ) # T X B --> B X T x, enc_padding_mask = self.adaptor(x, enc_padding_mask) for layer in self.shared_layers: x, _ = layer(x, enc_padding_mask) if self.final_layer_norm is not None: x = self.final_layer_norm(x) return { "encoder_out": [x], # T x B x C "encoder_padding_mask": [enc_padding_mask] if enc_padding_mask is not None else [], # B x T "encoder_embedding": [], # B x T x C "encoder_states": [], # List[T x B x C] "src_tokens": [], "src_lengths": [], }
def forward(self, src_tokens, src_lengths=None, **kwargs):
    if (
        self.freezing_updates is not None
        and self.num_updates > self.freezing_updates
    ):
        for p in self.w2v_encoder.w2v_model.parameters():
            p.requires_grad = True

    padding_mask = lengths_to_padding_mask(src_lengths)
    out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
    x, padding_mask = out["encoder_out"], out["padding_mask"]

    if self.w2v_proj_ln is not None:
        x = self.w2v_proj_ln(x)

    if self.adaptor is not None:
        x, padding_mask = self.adaptor(x, padding_mask)

    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": []
        if padding_mask is None
        else [padding_mask],  # B x T
        "encoder_embedding": [],  # B x T x C
        "encoder_states": [],  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [],
    }

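# `self.num_updates` above is not set anywhere in this excerpt. In fairseq it is usually
# kept in sync with the trainer through a `set_num_updates` hook on the encoder. A minimal
# sketch of that common pattern, assuming the encoder subclasses FairseqEncoder (which
# propagates the trainer's update count to submodules via `set_num_updates`):
def set_num_updates(self, num_updates):
    super().set_num_updates(num_updates)
    self.num_updates = num_updates
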
def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs):
    padding_mask = lengths_to_padding_mask(src_lengths)
    if not padding_mask.any():
        padding_mask = None

    out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
    x = out["encoder_out"]
    enc_padding_mask = None
    if out["padding_mask"] is not None:
        enc_padding_mask = out["padding_mask"]  # B X T

    x, enc_padding_mask = self.adaptor(x, enc_padding_mask)

    encoder_states = []

    for layer in self.mbart_encoder_layers:
        x = layer(x, enc_padding_mask)
        if return_all_hiddens:
            encoder_states.append(x)

    if self.final_layer_norm is not None:
        x = self.final_layer_norm(x)

    return {
        "encoder_out": [x],  # T x B x C
        "encoder_padding_mask": [enc_padding_mask]
        if enc_padding_mask is not None
        else [],  # B x T
        "encoder_embedding": [],  # B x T x C
        "encoder_states": encoder_states,  # List[T x B x C]
        "src_tokens": [],
        "src_lengths": [],
    }

def forward(self, src_tokens, src_lengths=None, **kwargs): """ Args src_tokens: padded tensor (B, T, C * feat) src_lengths: tensor of original lengths of input utterances (B,) """ bsz, max_seq_len, _ = src_tokens.size() # (B, C, T, feat) x = ( src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) .transpose(1, 2) .contiguous() ) for input_layer in self.input_layers: x = input_layer(x) x = torch.tanh(x) for conv_layer in self.conv_layers: x = conv_layer(x) bsz, _, output_seq_len, _ = x.size() # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> # (T, B, C * feat) x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1) input_lengths = src_lengths.clone() for k, s in self.conv_kernel_sizes_and_strides: p = k // 2 input_lengths = (input_lengths.float() + 2 * p - k) / s + 1 input_lengths = input_lengths.floor().long() packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths) h0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_() c0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_() packed_outs, _ = self.lstm(packed_x, (h0, c0)) # unpack outputs and apply dropout x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_outs) if self.dropout is not None: x = self.dropout(x) encoder_padding_mask = ( lengths_to_padding_mask(output_lengths).to(src_tokens.device).t() ) return { "encoder_out": x, # (T, B, C) "encoder_padding_mask": encoder_padding_mask, # (T, B) }
def forward(self, src_tokens, src_lengths, return_all_hiddens=False): """ Args: src_tokens: Input source tokens Tensor of shape B X T X C src_lengths: Lengths Tensor corresponding to input source tokens return_all_hiddens: If true will append the self attention states to the encoder states Returns: encoder_out: Tensor of shape B X T X C encoder_padding_mask: Optional Tensor with mask encoder_embedding: Optional Tensor. Always empty here encoder_states: List of Optional Tensors wih self attention states src_tokens: Optional Tensor. Always empty here src_lengths: Optional Tensor. Always empty here """ x, input_lengths = self.subsample(src_tokens, src_lengths) # returns T X B X C encoder_padding_mask = lengths_to_padding_mask(input_lengths) x = self.embed_scale * x if self.pos_enc_type == "rel_pos": positions = self.embed_positions(x) elif self.pos_enc_type == "rope": positions = None else: positions = self.embed_positions(encoder_padding_mask).transpose( 0, 1) x += positions positions = None x = self.linear(x) x = self.dropout(x) encoder_states = [] # x is T X B X C for layer in self.conformer_layers: x, _ = layer(x, encoder_padding_mask, positions) if return_all_hiddens: encoder_states.append(x) return { "encoder_out": [x], # T x B x C "encoder_padding_mask": [encoder_padding_mask] if encoder_padding_mask.any() else [], # B x T "encoder_embedding": [], # B x T x C "encoder_states": encoder_states, # List[T x B x C] "src_tokens": [], "src_lengths": [], }
def forward(self, src_tokens, src_lengths): """Encode input sequence. :param torch.Tensor xs: input tensor :param torch.Tensor masks: input mask :return: position embedded tensor and mask :rtype Tuple[torch.Tensor, torch.Tensor]: """ bsz, max_seq_len, _ = src_tokens.size() x = ( src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim) .transpose(1, 2) .contiguous() ) x = self.conv(x) bsz, _, output_seq_len, _ = x.size() x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1) x = self.out(x) x = self.embed_scale * x subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5) input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long() input_len_1 = x.size(0) * torch.ones([src_lengths.size(0)]).long().to( input_len_0.device ) input_lengths = torch.min(input_len_0, input_len_1) encoder_padding_mask = lengths_to_padding_mask(input_lengths) positions = self.embed_positions(encoder_padding_mask).transpose(0, 1) x += positions x = F.dropout(x, p=self.dropout, training=self.training) for layer in self.transformer_layers: x = layer(x, encoder_padding_mask) if not encoder_padding_mask.any(): maybe_encoder_padding_mask = None else: maybe_encoder_padding_mask = encoder_padding_mask return { "encoder_out": [x], "encoder_padding_mask": [maybe_encoder_padding_mask] if maybe_encoder_padding_mask is not None else [], "encoder_embedding": [], "encoder_states": [], "src_tokens": [], "src_lengths": [], }
def forward(
    self,
    src_tokens,
    src_lengths,
    return_all_hiddens: bool = False,
):
    x, input_lengths = self.subsample(src_tokens, src_lengths)
    x = self.embed_scale * x

    encoder_padding_mask = lengths_to_padding_mask(input_lengths)
    positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
    x += positions
    x = self.dropout_module(x)

    encoder_states = [] if return_all_hiddens else None

    x_ctc = None
    ctc_padding_mask = None
    for l_idx, layer in enumerate(self.transformer_layers):
        x = layer(x, encoder_padding_mask)
        if self.ctc_compress_out and self.ctc_layer == l_idx + 1:
            ctc_padding_mask = encoder_padding_mask
            x_ctc, x, src_lengths = self.average_same_ctc_features(x, src_lengths)
            encoder_padding_mask = self.create_mask(src_lengths)

    if not encoder_padding_mask.any():
        encoder_padding_mask = None

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    if self.ctc_compress_out:
        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask],  # B x T
            "encoder_embedding": None,
            "encoder_states": encoder_states,  # List[T x B x C]
            "ctc_out": x_ctc,  # T x B x D
            "ctc_padding_mask": ctc_padding_mask,
        }
    else:
        return {
            "encoder_out": [x],
            "encoder_padding_mask": [encoder_padding_mask],
            "encoder_embedding": None,
            "encoder_states": None,
            "src_tokens": None,
            "src_lengths": None,
        }

def forward(self, x, padding_mask):
    # T x B x C -> B x C x T
    x = x.transpose(0, 1).transpose(1, 2)

    for i, layer in enumerate(self.layers):
        x = nn.functional.glu(layer(x), dim=1)
        if self.layernorms is not None:
            x = self.layernorms[i](x.transpose(1, 2)).transpose(1, 2)

    # B x C x T -> T x B x C
    x = x.transpose(1, 2).transpose(0, 1)

    if padding_mask is None:
        out_padding_mask = None
    else:
        out_lengths = self.get_out_seq_lens_tensor((~padding_mask).sum(1))
        out_padding_mask = lengths_to_padding_mask(out_lengths)
    return x, out_padding_mask

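# `get_out_seq_lens_tensor` is referenced above but not included in this excerpt. A plausible
# sketch, assuming each conv layer subsamples the time dimension with stride 2 (the
# convention used by fairseq's Conv1dSubsampler); the real method may use a different stride
# or kernel configuration, so treat this only as an illustration of the length recurrence.
def _get_out_seq_lens_tensor_sketch(self, in_seq_lens_tensor: torch.Tensor) -> torch.Tensor:
    out = in_seq_lens_tensor.clone()
    for _ in range(len(self.layers)):
        # standard output-length recurrence for a stride-2 convolution
        out = ((out.float() - 1) / 2 + 1).floor().long()
    return out
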
def forward(self, src_tokens, src_lengths=None, **kwargs): padding_mask = lengths_to_padding_mask(src_lengths) out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True) x = out["encoder_out"] enc_padding_mask = None if out["encoder_padding_mask"] is not None: enc_padding_mask = out["encoder_padding_mask"].transpose( 0, 1) # T X B --> B X T x, enc_padding_mask = self.adaptor(x, enc_padding_mask) return { "encoder_out": [x], # T x B x C "encoder_padding_mask": [enc_padding_mask] if enc_padding_mask.any() else [], # B x T "encoder_embedding": [], # B x T x C "encoder_states": [], # List[T x B x C] "src_tokens": [], "src_lengths": [], }
def extract_features(
    self,
    prev_outputs,
    encoder_out=None,
    incremental_state=None,
    target_lengths=None,
    speaker=None,
    **kwargs,
):
    alignment_layer = self.n_transformer_layers - 1
    self_attn_padding_mask = lengths_to_padding_mask(target_lengths)
    positions = self.embed_positions(
        self_attn_padding_mask, incremental_state=incremental_state
    )

    if incremental_state is not None:
        prev_outputs = prev_outputs[:, -1:, :]
        self_attn_padding_mask = self_attn_padding_mask[:, -1:]
        if positions is not None:
            positions = positions[:, -1:]

    x = self.prenet(prev_outputs)
    x += self.pos_emb_alpha * positions
    x = self.dropout_module(x)

    # B x T x C -> T x B x C
    x = x.transpose(0, 1)

    if not self_attn_padding_mask.any():
        self_attn_padding_mask = None

    attn: Optional[torch.Tensor] = None
    inner_states: List[Optional[torch.Tensor]] = [x]
    for idx, transformer_layer in enumerate(self.transformer_layers):
        if incremental_state is None:
            self_attn_mask = self.buffered_future_mask(x)
        else:
            self_attn_mask = None

        x, layer_attn, _ = transformer_layer(
            x,
            encoder_out["encoder_out"][0]
            if (encoder_out is not None and len(encoder_out["encoder_out"]) > 0)
            else None,
            encoder_out["encoder_padding_mask"][0]
            if (
                encoder_out is not None
                and len(encoder_out["encoder_padding_mask"]) > 0
            )
            else None,
            incremental_state,
            self_attn_mask=self_attn_mask,
            self_attn_padding_mask=self_attn_padding_mask,
            need_attn=bool((idx == alignment_layer)),
            need_head_weights=bool((idx == alignment_layer)),
        )
        inner_states.append(x)
        if layer_attn is not None and idx == alignment_layer:
            attn = layer_attn.float().to(x)

    if attn is not None:
        # average probabilities over heads, transpose to
        # (B, src_len, tgt_len)
        attn = attn.mean(dim=0).transpose(2, 1)

    if self.layer_norm is not None:
        x = self.layer_norm(x)

    # T x B x C -> B x T x C
    x = x.transpose(0, 1)

    return x, {"attn": attn, "inner_states": inner_states}

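# `buffered_future_mask` above is the usual fairseq causal-mask helper and is not shown in
# this excerpt. A simplified, uncached sketch of its behavior (the real method caches the
# mask on the module and only rebuilds it when the sequence grows); the name
# `_buffered_future_mask_sketch` is hypothetical.
def _buffered_future_mask_sketch(self, tensor: torch.Tensor) -> torch.Tensor:
    dim = tensor.size(0)  # x is T x B x C here, so dim is the current target length
    # upper-triangular -inf mask so position t can only attend to positions <= t
    return torch.triu(
        torch.full((dim, dim), float("-inf"), device=tensor.device), diagonal=1
    )
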