class Tacotron3(nn.Module):
    """Sequence model assembled from an encoder, a decoder and a postnet.

    Args:
        activate_encoder: When truthy, ``Encoder`` is used; otherwise
            ``SimpleEncoder``. The flag is also passed to ``Decoder``.
    """

    def __init__(self, activate_encoder):
        super(Tacotron3, self).__init__()
        if activate_encoder:
            self.encoder = Encoder()
        else:
            self.encoder = SimpleEncoder()
        self.decoder = Decoder(activate_encoder)
        self.postnet = Postnet()

    @staticmethod
    def _zero_padded_frames(mel, padding_index):
        """Zero the padded positions of ``mel`` in place and return it.

        The write goes through a ``permute(0, 2, 1)`` view, so ``mel`` itself
        is modified; the returned tensor is that view permuted back to the
        original axis order.

        Args:
            mel: 3-D tensor; assumed to be (batch, channels, frames) —
                TODO confirm against the decoder/postnet outputs.
            padding_index: Index object produced by ``get_reverse_mask``. It
                is applied to the leading axes of the permuted view
                (presumably a (batch, frames) mask — verify).

        Returns:
            ``mel`` (same storage, same axis order) with the selected
            positions set to 0.
        """
        frames_last = mel.permute(0, 2, 1)
        frames_last[padding_index, :] = 0
        return frames_last.permute(0, 2, 1)

    def forward(self, inputs):
        """Run the teacher-forced pass.

        Args:
            inputs: 4-tuple ``(mel_source, mel_lengths, embedding,
                mel_target)``.

        Returns:
            Tuple ``(mel_outputs, mel_outputs_postnet, alignments)`` where
            ``mel_outputs_postnet`` is the decoder output plus the postnet
            output, both zeroed at padded positions.
        """
        mel_source, mel_lengths, embedding, mel_target = inputs
        mel_lengths = mel_lengths.data

        encoder_outputs = self.encoder(mel_source, embedding)
        mel_outputs, alignments = self.decoder(
            encoder_outputs, mel_target, mel_lengths)
        end_padding_ind = get_reverse_mask(mel_lengths)

        # The postnet consumes the decoder output before padding is zeroed.
        mel_outputs_postnet = self.postnet(mel_outputs)

        mel_outputs = self._zero_padded_frames(mel_outputs, end_padding_ind)
        mel_outputs_postnet = self._zero_padded_frames(
            mel_outputs_postnet, end_padding_ind)

        # Residual connection: the postnet output is added to the decoder output.
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        return mel_outputs, mel_outputs_postnet, alignments

    def inference(self, inputs):
        """Run the decoder in inference mode.

        Args:
            inputs: 4-tuple ``(mel_source, mel_lengths, embedding, _)``; the
                last element is ignored.

        Returns:
            The decoder output plus the postnet output, both zeroed at the
            positions selected by ``get_reverse_mask(mel_lengths)``.
        """
        mel_source, mel_lengths, embedding, _ = inputs

        encoder_outputs = self.encoder(mel_source, embedding)
        mel_outputs, _ = self.decoder.inference(encoder_outputs)
        end_padding_ind = get_reverse_mask(mel_lengths)

        # The postnet consumes the decoder output before padding is zeroed.
        mel_outputs_postnet = self.postnet(mel_outputs)

        mel_outputs = self._zero_padded_frames(mel_outputs, end_padding_ind)
        mel_outputs_postnet = self._zero_padded_frames(
            mel_outputs_postnet, end_padding_ind)

        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        return mel_outputs_postnet
class Tacotron2(nn.Module):
    """Model built from a symbol embedding, an encoder, a decoder and a postnet.

    Sizes come from the module-level ``hps`` object (``n_symbols`` and
    ``character_embedding_dim``).
    """

    def __init__(self):
        super(Tacotron2, self).__init__()
        self.embedding = nn.Embedding(
            hps.n_symbols, hps.character_embedding_dim)
        # Uniform init on [-val, val]; with val = sqrt(3) * std the resulting
        # distribution has standard deviation ``std``.
        std = sqrt(2.0 / (hps.n_symbols + hps.character_embedding_dim))
        val = sqrt(3.0) * std
        self.embedding.weight.data.uniform_(-val, val)
        self.encoder = Encoder()
        self.decoder = Decoder()
        self.postnet = PostNet()

    def parse_outputs(self, mel_outputs, mel_outputs_postnet, gate_outputs,
                      output_lengths):
        """Overwrite padded positions of the outputs in place.

        Mel tensors get 0.0 and gate outputs get 1e3 wherever the inverted
        ``get_mask_from_lengths(output_lengths, pad=True)`` mask is set.

        Args:
            mel_outputs: Tensor indexed as (B, channels, Frames). Any channel
                count is accepted; it is no longer fixed at 80.
            mel_outputs_postnet: Tensor with the same layout as
                ``mel_outputs``.
            gate_outputs: Tensor that must be fillable with a (B, Frames)
                mask.
            output_lengths: Lengths passed to ``get_mask_from_lengths``.

        Returns:
            The same three tensors ``(mel_outputs, mel_outputs_postnet,
            gate_outputs)`` after the in-place fill.
        """
        # Inverted length mask; used below as (B, Frames).
        padding = ~get_mask_from_lengths(output_lengths, pad=True)

        # (B, 1, Frames) broadcasts across the channel axis, so the mask
        # adapts to however many mel channels the tensors carry.
        mel_mask = padding.unsqueeze(1)
        mel_outputs.data.masked_fill_(mel_mask, 0.0)
        mel_outputs_postnet.data.masked_fill_(mel_mask, 0.0)

        # NOTE(review): an earlier comment described gate outputs as
        # (B, Frames // 3), but the mask has always been applied with a frame
        # step of 1, i.e. one gate value per frame — confirm against Decoder.
        gate_outputs.data.masked_fill_(padding, 1e3)

        return mel_outputs, mel_outputs_postnet, gate_outputs

    def forward(self, inputs):
        """Run the teacher-forced pass.

        Args:
            inputs: 4-tuple ``(text_inputs, input_lengths, mel_targets,
                output_lengths)``.

        Returns:
            Tuple ``(mel_outputs, mel_outputs_postnet, gate_outputs,
            alignments)``; the first three have been passed through
            ``parse_outputs``.
        """
        text_inputs, input_lengths, mel_targets, output_lengths = inputs

        # Swap axes 1 and 2 of the embedding output; presumably
        # (B, seq_len, embedding_dim) -> (B, embedding_dim, seq_len).
        character_embedding = self.embedding(text_inputs).transpose(1, 2)
        encoder_outputs = self.encoder(character_embedding, input_lengths)

        mel_outputs, alignments, gate_outputs = self.decoder(
            encoder_outputs, mel_targets, input_lengths)

        # Residual connection: the postnet output is added to the decoder output.
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        mel_outputs, mel_outputs_postnet, gate_outputs = self.parse_outputs(
            mel_outputs, mel_outputs_postnet, gate_outputs, output_lengths)
        return mel_outputs, mel_outputs_postnet, gate_outputs, alignments

    def inference(self, inputs):
        """Run the model in inference mode on symbol ids.

        Args:
            inputs: Tensor of symbol ids accepted by the embedding layer.

        Returns:
            Tuple ``(mel_outputs, mel_outputs_postnet, alignments)``. No
            padding mask is applied on this path.
        """
        embedded_inputs = self.embedding(inputs).transpose(1, 2)
        encoder_outputs = self.encoder.inference(embedded_inputs)
        mel_outputs, alignments, gate_outputs = self.decoder.inference(
            encoder_outputs)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        return mel_outputs, mel_outputs_postnet, alignments