Example #1
    def forward(self, src_tokens, src_lengths):
        x, input_lengths = self.subsample(src_tokens, src_lengths)
        x = self.embed_scale * x

        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
        x += positions
        x = self.dropout_module(x)

        for layer in self.transformer_layers:
            x = layer(x, encoder_padding_mask)

        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        return EncoderOut(
            encoder_out=x,
            encoder_padding_mask=encoder_padding_mask,
            encoder_embedding=None,
            encoder_states=None,
            src_tokens=None,
            src_lengths=None,
        )
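
The examples here lean heavily on fairseq's `lengths_to_padding_mask` helper. As a point of reference, here is a minimal sketch of the behavior it is assumed to have (a hypothetical re-implementation, not the library source): a 1-D tensor of lengths (B,) becomes a (B, T) boolean mask that is True at padded positions.

    import torch

    def lengths_to_padding_mask_sketch(lengths: torch.Tensor) -> torch.Tensor:
        # (B,) lengths -> (B, T) bool mask, True where the position is padding.
        bsz, max_len = lengths.size(0), int(lengths.max())
        positions = torch.arange(max_len, device=lengths.device).expand(bsz, max_len)
        return positions >= lengths.unsqueeze(1)

    # e.g. lengths [3, 1] over T=3 -> [[False, False, False], [False, True, True]]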
Example #2
    def forward(self,
                src_tokens,
                src_lengths=None,
                speaker=None,
                durations=None,
                pitches=None,
                energies=None,
                **kwargs):
        x = self.embed_tokens(src_tokens)

        enc_padding_mask = src_tokens.eq(self.padding_idx)
        x += self.pos_emb_alpha * self.embed_positions(enc_padding_mask)
        x = self.dropout_module(x)

        for layer in self.encoder_fft_layers:
            x = layer(x, enc_padding_mask)

        if self.embed_speaker is not None:
            bsz, seq_len, _ = x.size()
            emb = self.embed_speaker(speaker).expand(bsz, seq_len, -1)
            x = self.spk_emb_proj(torch.cat([x, emb], dim=2))

        x, out_lens, log_dur_out, pitch_out, energy_out = self.var_adaptor(
            x, enc_padding_mask, durations, pitches, energies)

        dec_padding_mask = lengths_to_padding_mask(out_lens)
        x += self.dec_pos_emb_alpha * self.embed_positions(dec_padding_mask)
        for layer in self.decoder_fft_layers:
            x = layer(x, dec_padding_mask)

        x = self.out_proj(x)
        x_post = None
        if self.postnet is not None:
            x_post = x + self.postnet(x)
        return x, x_post, out_lens, log_dur_out, pitch_out, energy_out
Example #3
    def _forward(self, src_tokens, src_lengths, return_all_hiddens=False):
        x, input_lengths = self.subsample(src_tokens, src_lengths)
        x = self.embed_scale * x

        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
        x += positions
        x = self.dropout_module(x)

        encoder_states = []

        for layer in self.transformer_layers:
            x = layer(x, encoder_padding_mask)
            if return_all_hiddens:
                encoder_states.append(x)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask]
            if encoder_padding_mask.any()
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }
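
Unlike Example #1, which wraps the result in the `EncoderOut` named tuple, this variant returns the newer dictionary-of-lists encoder output. A minimal sketch of how a caller might unpack it (the helper name is hypothetical):

    from typing import Dict, List, Optional, Tuple
    import torch

    def unpack_encoder_out(
        enc: Dict[str, List[torch.Tensor]]
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], List[torch.Tensor]]:
        # Each field is wrapped in a (possibly empty) list; pull the tensors back out.
        x = enc["encoder_out"][0]  # T x B x C
        mask = enc["encoder_padding_mask"][0] if enc["encoder_padding_mask"] else None  # B x T
        return x, mask, enc["encoder_states"]  # one T x B x C tensor per layer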
Example #4
    def forward(self, x, padding_mask: Optional[torch.Tensor]):
        if self.layernorm is not None:
            x = self.layernorm(x)

        if self.proj is not None:
            x = x + 0.5 * self.proj(x)
            x = self.proj_ln(x)

        # T x B x C -> B x C x T
        x = x.transpose(0, 1).transpose(1, 2)
        out_lens = None
        if padding_mask is not None:
            out_lens = (~padding_mask).sum(1).float()

        for layer in self.layers:
            layerdrop_prob = np.random.random()
            if not self.training or (layerdrop_prob > self.layerdrop):
                x = nn.functional.glu(layer(x), dim=1)
                if padding_mask is not None:
                    out_lens = ((out_lens - 1) / self.stride + 1).floor()
        # B x C x T -> T x B x C
        x = x.transpose(1, 2).transpose(0, 1)

        if self.post_proj is not None:
            x = x + 0.5 * self.post_proj(x)
            x = self.post_proj_ln(x)

        out_padding_mask = None
        if padding_mask is not None:
            out_padding_mask = lengths_to_padding_mask(out_lens.long())
        return x, out_padding_mask
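
The loop above applies LayerDrop: at training time each layer in `self.layers` is skipped with probability `self.layerdrop`, while at inference every layer runs. A minimal sketch of that keep/skip test in isolation (the 0.1 rate is an arbitrary assumption):

    import numpy as np

    layerdrop, training = 0.1, True
    keep = (not training) or (np.random.random() > layerdrop)
    # During training, keep is False for roughly a layerdrop fraction of draws;
    # at eval time it is always True, so every layer runs.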
Example #5
    def forward(self, src_tokens, src_lengths=None, **kwargs):
        padding_mask = lengths_to_padding_mask(src_lengths)
        if not padding_mask.any():
            padding_mask = None

        out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
        x = out["encoder_out"]
        enc_padding_mask = None
        if out["encoder_padding_mask"] is not None:
            enc_padding_mask = out["encoder_padding_mask"].transpose(
                0, 1
            )  # T X B --> B X T

        x, enc_padding_mask = self.adaptor(x, enc_padding_mask)
        for layer in self.shared_layers:
            x, _ = layer(x, enc_padding_mask)
        if self.final_layer_norm is not None:
            x = self.final_layer_norm(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [enc_padding_mask]
            if enc_padding_mask is not None
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }
Example #6
    def forward(self, src_tokens, src_lengths=None, **kwargs):
        if (
            self.freezing_updates is not None
            and self.num_updates > self.freezing_updates
        ):
            for p in self.w2v_encoder.w2v_model.parameters():
                p.requires_grad = True

        padding_mask = lengths_to_padding_mask(src_lengths)
        out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
        x, padding_mask = out["encoder_out"], out["padding_mask"]
        if self.w2v_proj_ln is not None:
            x = self.w2v_proj_ln(x)

        if self.adaptor is not None:
            x, padding_mask = self.adaptor(x, padding_mask)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": []
            if padding_mask is None
            else [padding_mask],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }
Example #7
    def forward(self, src_tokens, src_lengths=None, return_all_hiddens=False, **kwargs):
        padding_mask = lengths_to_padding_mask(src_lengths)
        if not padding_mask.any():
            padding_mask = None

        out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
        x = out["encoder_out"]
        enc_padding_mask = None
        if out["padding_mask"] is not None:
            enc_padding_mask = out["padding_mask"]  # B X T

        x, enc_padding_mask = self.adaptor(x, enc_padding_mask)
        encoder_states = []
        for layer in self.mbart_encoder_layers:
            x = layer(x, enc_padding_mask)
            if return_all_hiddens:
                encoder_states.append(x)
        if self.final_layer_norm is not None:
            x = self.final_layer_norm(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [enc_padding_mask]
            if enc_padding_mask is not None
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }
Example #8
    def forward(self, src_tokens, src_lengths=None, **kwargs):
        """
        Args:
            src_tokens: padded tensor (B, T, C * feat)
            src_lengths: tensor of original lengths of input utterances (B,)
        """
        bsz, max_seq_len, _ = src_tokens.size()
        # (B, C, T, feat)
        x = (
            src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
            .transpose(1, 2)
            .contiguous()
        )

        for input_layer in self.input_layers:
            x = input_layer(x)
            x = torch.tanh(x)

        for conv_layer in self.conv_layers:
            x = conv_layer(x)

        bsz, _, output_seq_len, _ = x.size()

        # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) ->
        # (T, B, C * feat)
        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)

        input_lengths = src_lengths.clone()
        for k, s in self.conv_kernel_sizes_and_strides:
            p = k // 2
            input_lengths = (input_lengths.float() + 2 * p - k) / s + 1
            input_lengths = input_lengths.floor().long()

        packed_x = nn.utils.rnn.pack_padded_sequence(x, input_lengths)

        h0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_()
        c0 = x.new(2 * self.num_blstm_layers, bsz, self.lstm_size).zero_()
        packed_outs, _ = self.lstm(packed_x, (h0, c0))

        # unpack outputs and apply dropout
        x, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_outs)
        if self.dropout is not None:
            x = self.dropout(x)

        encoder_padding_mask = (
            lengths_to_padding_mask(output_lengths).to(src_tokens.device).t()
        )

        return {
            "encoder_out": x,  # (T, B, C)
            "encoder_padding_mask": encoder_padding_mask,  # (T, B)
        }
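
The length bookkeeping above is the standard convolution output-size formula: with kernel `k`, stride `s`, and padding `p = k // 2`, the output length is `floor((L + 2p - k) / s + 1)`. A small self-contained check against `nn.Conv1d` (the values are arbitrary):

    import torch
    import torch.nn as nn

    k, s, L = 5, 2, 37  # kernel size, stride, input length
    conv = nn.Conv1d(4, 4, kernel_size=k, stride=s, padding=k // 2)
    x = torch.randn(1, 4, L)
    expected = (L + 2 * (k // 2) - k) // s + 1  # floor((L + 2p - k) / s + 1)
    assert conv(x).size(-1) == expected  # 19 for these values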
Example #9
    def forward(self, src_tokens, src_lengths, return_all_hiddens=False):
        """
        Args:
            src_tokens: Input source tokens Tensor of shape B X T X C
            src_lengths: Lengths Tensor corresponding to input source tokens
            return_all_hiddens: If True, append the self-attention states to the encoder states
        Returns:
            encoder_out: Tensor of shape T X B X C
            encoder_padding_mask: Optional Tensor with mask
            encoder_embedding: Optional Tensor. Always empty here
            encoder_states: List of Optional Tensors with self-attention states
            src_tokens: Optional Tensor. Always empty here
            src_lengths: Optional Tensor. Always empty here
        """
        x, input_lengths = self.subsample(src_tokens,
                                          src_lengths)  # returns T X B X C
        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        x = self.embed_scale * x
        if self.pos_enc_type == "rel_pos":
            positions = self.embed_positions(x)

        elif self.pos_enc_type == "rope":
            positions = None

        else:
            positions = self.embed_positions(encoder_padding_mask).transpose(
                0, 1)
            x += positions
            positions = None

        x = self.linear(x)
        x = self.dropout(x)
        encoder_states = []

        # x is T X B X C
        for layer in self.conformer_layers:
            x, _ = layer(x, encoder_padding_mask, positions)
            if return_all_hiddens:
                encoder_states.append(x)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [encoder_padding_mask]
            if encoder_padding_mask.any()
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": encoder_states,  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }
Example #10
    def forward(self, src_tokens, src_lengths):
        """Encode input sequence.
        :param torch.Tensor xs: input tensor
        :param torch.Tensor masks: input mask
        :return: position embedded tensor and mask
        :rtype Tuple[torch.Tensor, torch.Tensor]:
        """
        bsz, max_seq_len, _ = src_tokens.size()
        x = (
            src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
            .transpose(1, 2)
            .contiguous()
        )
        x = self.conv(x)
        bsz, _, output_seq_len, _ = x.size()
        x = x.transpose(1, 2).transpose(0, 1).contiguous().view(output_seq_len, bsz, -1)
        x = self.out(x)
        x = self.embed_scale * x

        subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
        input_len_0 = (src_lengths.float() / subsampling_factor).ceil().long()
        input_len_1 = x.size(0) * torch.ones([src_lengths.size(0)]).long().to(
            input_len_0.device
        )
        input_lengths = torch.min(input_len_0, input_len_1)

        encoder_padding_mask = lengths_to_padding_mask(input_lengths)

        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
        x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        for layer in self.transformer_layers:
            x = layer(x, encoder_padding_mask)

        if not encoder_padding_mask.any():
            maybe_encoder_padding_mask = None
        else:
            maybe_encoder_padding_mask = encoder_padding_mask

        return {
            "encoder_out": [x],
            "encoder_padding_mask": [maybe_encoder_padding_mask]
            if maybe_encoder_padding_mask is not None
            else [],
            "encoder_embedding": [],
            "encoder_states": [],
            "src_tokens": [],
            "src_lengths": [],
        }
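
Rather than replaying the convolution arithmetic as in Example #8, this encoder estimates a single rounded subsampling factor from the observed input and output sequence lengths and derives per-utterance lengths from it, clamped to the actual output length. A small numeric illustration (arbitrary values):

    import torch

    max_seq_len, output_seq_len = 100, 25      # lengths before/after the conv stack
    src_lengths = torch.tensor([100, 73, 40])  # original utterance lengths
    factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)  # rounded ratio: 4
    lengths = (src_lengths.float() / factor).ceil().long()  # tensor([25, 19, 10])
    lengths = torch.min(lengths, torch.full_like(lengths, output_seq_len))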
Example #11
    def forward(
        self,
        src_tokens,
        src_lengths,
        return_all_hiddens: bool = False,
    ):
        x, input_lengths = self.subsample(src_tokens, src_lengths)
        x = self.embed_scale * x

        encoder_padding_mask = lengths_to_padding_mask(input_lengths)
        positions = self.embed_positions(encoder_padding_mask).transpose(0, 1)
        x += positions
        x = self.dropout_module(x)

        encoder_states = [] if return_all_hiddens else None

        x_ctc = None
        ctc_padding_mask = None
        for l_idx, layer in enumerate(self.transformer_layers):
            x = layer(x, encoder_padding_mask)
            if self.ctc_compress_out and self.ctc_layer == l_idx + 1:
                ctc_padding_mask = encoder_padding_mask
                x_ctc, x, src_lengths = self.average_same_ctc_features(
                    x, src_lengths)
                encoder_padding_mask = self.create_mask(src_lengths)

        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        if self.ctc_compress_out:
            return {
                "encoder_out": [x],  # T x B x C
                "encoder_padding_mask": [encoder_padding_mask],  # B x T
                "encoder_embedding": None,
                "encoder_states": encoder_states,  # List[T x B x C]
                "ctc_out": x_ctc,  # T x B x D
                "ctc_padding_mask": ctc_padding_mask
            }
        else:
            return {
                "encoder_out": [x],
                "encoder_padding_mask": [encoder_padding_mask],
                "encoder_embedding": None,
                "encoder_states": None,
                "src_tokens": None,
                "src_lengths": None,
            }
Example #12
    def forward(self, x, padding_mask):
        # T x B x C -> B x C x T
        x = x.transpose(0, 1).transpose(1, 2)
        for i, layer in enumerate(self.layers):
            x = nn.functional.glu(layer(x), dim=1)
            if self.layernorms is not None:
                x = self.layernorms[i](x.transpose(1, 2)).transpose(1, 2)
        # B x C x T -> T x B x C
        x = x.transpose(1, 2).transpose(0, 1)

        if padding_mask is None:
            out_padding_mask = None
        else:
            out_lengths = self.get_out_seq_lens_tensor((~padding_mask).sum(1))
            out_padding_mask = lengths_to_padding_mask(out_lengths)
        return x, out_padding_mask
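
`nn.functional.glu` splits the chosen dimension in half and gates one half with the sigmoid of the other, so each layer's output above has its channel dimension halved by the GLU. A quick standalone illustration:

    import torch
    import torch.nn as nn

    x = torch.randn(2, 8, 10)        # B x C x T
    y = nn.functional.glu(x, dim=1)  # first half * sigmoid(second half)
    assert y.shape == (2, 4, 10)     # channel dimension is halved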
Example #13
    def forward(self, src_tokens, src_lengths=None, **kwargs):
        padding_mask = lengths_to_padding_mask(src_lengths)
        out = self.w2v_encoder.forward(src_tokens, padding_mask, tbc=True)
        x = out["encoder_out"]
        enc_padding_mask = None
        if out["encoder_padding_mask"] is not None:
            enc_padding_mask = out["encoder_padding_mask"].transpose(
                0, 1)  # T X B --> B X T

        x, enc_padding_mask = self.adaptor(x, enc_padding_mask)

        return {
            "encoder_out": [x],  # T x B x C
            "encoder_padding_mask": [enc_padding_mask]
            if enc_padding_mask is not None and enc_padding_mask.any()
            else [],  # B x T
            "encoder_embedding": [],  # B x T x C
            "encoder_states": [],  # List[T x B x C]
            "src_tokens": [],
            "src_lengths": [],
        }
Example #14
    def extract_features(self,
                         prev_outputs,
                         encoder_out=None,
                         incremental_state=None,
                         target_lengths=None,
                         speaker=None,
                         **kwargs):
        alignment_layer = self.n_transformer_layers - 1
        self_attn_padding_mask = lengths_to_padding_mask(target_lengths)
        positions = self.embed_positions(self_attn_padding_mask,
                                         incremental_state=incremental_state)

        if incremental_state is not None:
            prev_outputs = prev_outputs[:, -1:, :]
            self_attn_padding_mask = self_attn_padding_mask[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        x = self.prenet(prev_outputs)
        x += self.pos_emb_alpha * positions
        x = self.dropout_module(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        if not self_attn_padding_mask.any():
            self_attn_padding_mask = None

        attn: Optional[torch.Tensor] = None
        inner_states: List[Optional[torch.Tensor]] = [x]
        for idx, transformer_layer in enumerate(self.transformer_layers):
            if incremental_state is None:
                self_attn_mask = self.buffered_future_mask(x)
            else:
                self_attn_mask = None

            x, layer_attn, _ = transformer_layer(
                x,
                encoder_out["encoder_out"][0] if
                (encoder_out is not None
                 and len(encoder_out["encoder_out"]) > 0) else None,
                encoder_out["encoder_padding_mask"][0] if
                (encoder_out is not None
                 and len(encoder_out["encoder_padding_mask"]) > 0) else None,
                incremental_state,
                self_attn_mask=self_attn_mask,
                self_attn_padding_mask=self_attn_padding_mask,
                need_attn=bool((idx == alignment_layer)),
                need_head_weights=bool((idx == alignment_layer)),
            )
            inner_states.append(x)
            if layer_attn is not None and idx == alignment_layer:
                attn = layer_attn.float().to(x)

        if attn is not None:
            # average probabilities over heads, transpose to
            # (B, src_len, tgt_len)
            attn = attn.mean(dim=0).transpose(2, 1)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        return x, {"attn": attn, "inner_states": inner_states}