def test_in_out(self):
    """Check L1LossMasked on identical, opposite, and padded inputs."""
    layer = L1LossMasked()

    # test input == target -> zero loss
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.ones(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    assert output.item() == 0.0

    # test input != target -> unit loss
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    # BUG FIX: failure messages used `output.data[0]`; indexing a 0-dim
    # tensor raises IndexError on modern PyTorch -- use .item() instead.
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())

    # test if padded values of input make any difference
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.arange(5, 9)).long()
    mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
    output = layer(dummy_input + mask, dummy_target, dummy_length)
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())

    dummy_input = T.rand(4, 8, 128).float()
    dummy_target = dummy_input.detach()
    dummy_length = (T.arange(5, 9)).long()
    mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
    output = layer(dummy_input + mask, dummy_target, dummy_length)
    assert output.item() == 0, "0 vs {}".format(output.item())
def compute_masks(self, text_lengths, mel_lengths):
    """Build boolean padding masks for encoder inputs and decoder outputs.

    Args:
        text_lengths: LongTensor (B,) with true input lengths.
        mel_lengths: LongTensor (B,) with true output lengths, or None.

    Returns:
        Tuple (input_mask, output_mask); output_mask is None when
        mel_lengths is None.
    """
    device = text_lengths.device
    # B x T_in_max (boolean)
    input_mask = sequence_mask(text_lengths).to(device)
    output_mask = None
    if mel_lengths is not None:
        max_len = mel_lengths.max()
        r = self.decoder.r
        # round the mask length up to the next multiple of the decoder's
        # reduction factor `r`
        remainder = max_len % r
        if remainder > 0:
            max_len = max_len + (r - remainder)
        output_mask = sequence_mask(mel_lengths, max_len=max_len).to(device)
    return input_mask, output_mask
def forward(self, x, target, length):
    """Masked MSE loss.

    Args:
        x: FloatTensor (batch, max_len, dim) -- predicted values.
        target: FloatTensor (batch, max_len, dim) -- ground-truth values.
        length: LongTensor (batch,) -- valid length of each sequence.

    Returns:
        Scalar loss averaged over the non-padded positions.
    """
    # targets never need gradients
    target.requires_grad = False
    # (batch, max_len, 1): 1.0 on valid steps, 0.0 on padding
    valid = sequence_mask(
        sequence_length=length, max_len=target.size(1)).unsqueeze(2).float()
    if self.seq_len_norm:
        # weight every sequence equally regardless of its length
        step_weights = valid / valid.sum(dim=1, keepdim=True)
        elem_weights = step_weights.div(target.shape[0] * target.shape[2])
        valid = valid.expand_as(x)
        loss = functional.mse_loss(
            x * valid, target * valid, reduction='none')
        return loss.mul(elem_weights.to(loss.device)).sum()
    valid = valid.expand_as(x)
    loss = functional.mse_loss(
        x * valid, target * valid, reduction='sum')
    return loss / valid.sum()
def forward(self, x, target, length):
    """Masked binary cross-entropy (with logits), e.g. for stop tokens.

    Args:
        x: FloatTensor (batch, max_len) -- raw (unnormalized) logits.
        target: FloatTensor (batch, max_len) -- binary targets per step.
        length: LongTensor (batch,) -- valid length of each sequence.

    Returns:
        Scalar loss averaged over the non-padded positions.
    """
    target.requires_grad = False
    # (batch, max_len): 1.0 on valid steps, 0.0 on padding
    mask = sequence_mask(sequence_length=length,
                         max_len=target.size(1)).float()
    # BUG FIX: the previous form `bce(x * mask, target * mask)` zeroed
    # logits/targets at padded positions, but sigmoid(0) = 0.5 still
    # contributed log(2) per padded element to the summed loss. Passing
    # `weight=mask` zeroes those contributions while leaving valid
    # positions (mask == 1) mathematically unchanged.
    loss = functional.binary_cross_entropy_with_logits(
        x, target, weight=mask, pos_weight=self.pos_weight, reduction='sum')
    return loss / mask.sum()
def forward(self, text, text_lengths, mel_specs=None, speaker_ids=None, ref_cond=True):
    """Forward pass with optional VAE-GST prosody conditioning.

    Args:
        text: input character ids.
        text_lengths: valid length of each input sequence.
        mel_specs: target mel spectrograms (also the VAE-GST reference).
        speaker_ids: optional speaker ids for multi-speaker conditioning.
        ref_cond: when True, condition on the VAE-GST prosody embedding and
            additionally return (mu, logvar, z).

    Returns:
        Decoder/postnet outputs, alignments and stop tokens; plus backward
        decoder outputs when `self.bidirectional_decoder` is set, plus
        (mu, logvar, z) when `ref_cond` is True.
    """
    self._init_states()
    # compute mask for padding
    mask = sequence_mask(text_lengths).to(text.device)
    embedded_inputs = self.embedding(text).transpose(1, 2)
    encoder_outputs = self.encoder(embedded_inputs, text_lengths)
    encoder_outputs = self._add_speaker_embedding(encoder_outputs, speaker_ids)
    if ref_cond:
        # condition encoder outputs on the reference prosody embedding
        prosody_outputs, mu, logvar, z = self.vae_gst(mel_specs)
        prosody_outputs = prosody_outputs.unsqueeze(1).expand_as(encoder_outputs)
        encoder_outputs = encoder_outputs + prosody_outputs
    decoder_outputs, alignments, stop_tokens = self.decoder(
        encoder_outputs, mel_specs, mask)
    postnet_outputs = self.postnet(decoder_outputs)
    postnet_outputs = decoder_outputs + postnet_outputs
    decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
        decoder_outputs, postnet_outputs, alignments)
    if self.bidirectional_decoder:
        decoder_outputs_backward, alignments_backward = self._backward_inference(
            mel_specs, encoder_outputs, mask)
        # BUG FIX: this branch tested the undefined name `ref_con`, which
        # raised NameError at runtime -- the flag is `ref_cond`.
        if ref_cond:
            return (decoder_outputs, postnet_outputs, alignments, stop_tokens,
                    decoder_outputs_backward, alignments_backward, mu, logvar, z)
        return (decoder_outputs, postnet_outputs, alignments, stop_tokens,
                decoder_outputs_backward, alignments_backward)
    if ref_cond:
        return decoder_outputs, postnet_outputs, alignments, stop_tokens, mu, logvar, z
    return decoder_outputs, postnet_outputs, alignments, stop_tokens
def forward(self, characters, text_lengths, mel_specs):
    """Tacotron forward pass: embed, encode, decode, and refine to linear.

    Returns:
        mel_outputs, linear_outputs, alignments, stop_tokens.
    """
    batch_size = characters.size(0)
    # boolean mask over padded input steps
    mask = sequence_mask(text_lengths).to(characters.device)
    embedded = self.embedding(characters)
    encoder_outputs = self.encoder(embedded)
    mel_outputs, alignments, stop_tokens = self.decoder(
        encoder_outputs, mel_specs, mask)
    # reshape decoder output to (B, T, mel_dim)
    mel_outputs = mel_outputs.view(batch_size, -1, self.mel_dim)
    linear_outputs = self.last_linear(self.postnet(mel_outputs))
    return mel_outputs, linear_outputs, alignments, stop_tokens
def forward(self, text, text_lengths, mel_specs=None):
    """Tacotron2 forward pass: encode text, decode mels, refine with postnet."""
    # boolean mask over padded input steps
    mask = sequence_mask(text_lengths).to(text.device)
    embedded = self.embedding(text).transpose(1, 2)
    encoded = self.encoder(embedded, text_lengths)
    mel_outputs, stop_tokens, alignments = self.decoder(
        encoded, mel_specs, mask)
    # residual postnet refinement
    mel_outputs_postnet = mel_outputs + self.postnet(mel_outputs)
    mel_outputs, mel_outputs_postnet, alignments = self.shape_outputs(
        mel_outputs, mel_outputs_postnet, alignments)
    return mel_outputs, mel_outputs_postnet, alignments, stop_tokens
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
    """Forward pass with speaker embedding and GST conditioning.

    Returns:
        mel_outputs, linear_outputs, alignments, stop_tokens.
    """
    batch_size = characters.size(0)
    # boolean mask over padded input steps
    mask = sequence_mask(text_lengths).to(characters.device)
    encoder_outputs = self.encoder(self.embedding(characters))
    encoder_outputs = self._add_speaker_embedding(encoder_outputs, speaker_ids)
    # add global style tokens computed from the reference mel spectrogram,
    # broadcast over every input time step
    gst_outputs = self.gst(mel_specs)
    encoder_outputs = encoder_outputs + gst_outputs.expand(
        -1, encoder_outputs.size(1), -1)
    mel_outputs, alignments, stop_tokens = self.decoder(
        encoder_outputs, mel_specs, mask)
    # reshape decoder output to (B, T, mel_dim)
    mel_outputs = mel_outputs.view(batch_size, -1, self.mel_dim)
    linear_outputs = self.last_linear(self.postnet(mel_outputs))
    return mel_outputs, linear_outputs, alignments, stop_tokens
def forward(self, text, text_lengths, mel_specs=None, speaker_ids=None):
    """Tacotron2 forward pass with optional speaker conditioning.

    Returns four outputs, plus backward decoder outputs and alignments
    when `self.bidirectional_decoder` is set.
    """
    self._init_states()
    # boolean mask over padded input steps
    mask = sequence_mask(text_lengths).to(text.device)
    embedded = self.embedding(text).transpose(1, 2)
    encoder_outputs = self._add_speaker_embedding(
        self.encoder(embedded, text_lengths), speaker_ids)
    decoder_outputs, alignments, stop_tokens = self.decoder(
        encoder_outputs, mel_specs, mask)
    # residual postnet refinement
    postnet_outputs = decoder_outputs + self.postnet(decoder_outputs)
    decoder_outputs, postnet_outputs, alignments = self.shape_outputs(
        decoder_outputs, postnet_outputs, alignments)
    if self.bidirectional_decoder:
        decoder_outputs_backward, alignments_backward = self._backward_inference(
            mel_specs, encoder_outputs, mask)
        return (decoder_outputs, postnet_outputs, alignments, stop_tokens,
                decoder_outputs_backward, alignments_backward)
    return decoder_outputs, postnet_outputs, alignments, stop_tokens
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
    """Tacotron forward pass with optional multi-speaker and GST conditioning.

    Shapes:
        - characters: B x T_in
        - text_lengths: B
        - mel_specs: B x T_out x D
        - speaker_ids: B x 1
    """
    self._init_states()
    # boolean mask over padded input steps
    mask = sequence_mask(text_lengths).to(characters.device)
    # B x T_in x embed_dim
    inputs = self.embedding(characters)
    # B x speaker_embed_dim
    self.compute_speaker_embedding(speaker_ids)
    if self.num_speakers > 1:
        # B x T_in x (embed_dim + speaker_embed_dim)
        inputs = self._concat_speaker_embedding(inputs, self.speaker_embeddings)
    # B x T_in x encoder_dim
    encoder_outputs = self.encoder(inputs)
    if self.gst:
        # condition the encoder outputs on the global style tokens
        encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
    if self.num_speakers > 1:
        encoder_outputs = self._concat_speaker_embedding(
            encoder_outputs, self.speaker_embeddings)
    # decoder_outputs: B x decoder_dim x T_out
    decoder_outputs, alignments, stop_tokens = self.decoder(
        encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected)
    postnet_outputs = self.postnet(decoder_outputs)
    postnet_outputs = self.last_linear(postnet_outputs)
    # back to B x T_out x decoder_dim
    decoder_outputs = decoder_outputs.transpose(1, 2).contiguous()
    if self.bidirectional_decoder:
        decoder_outputs_backward, alignments_backward = self._backward_inference(
            mel_specs, encoder_outputs, mask)
        return (decoder_outputs, postnet_outputs, alignments, stop_tokens,
                decoder_outputs_backward, alignments_backward)
    return decoder_outputs, postnet_outputs, alignments, stop_tokens
def test_in_out(self):
    """Check L1LossMasked on identical, opposite, and padded inputs."""
    layer = L1LossMasked()

    # input == target -> zero loss
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.ones(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    assert output.item() == 0.0

    # input != target -> unit loss
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.ones(4) * 8).long()
    output = layer(dummy_input, dummy_target, dummy_length)
    # BUG FIX: failure messages used `output.data[0]`; indexing a 0-dim
    # tensor raises IndexError on modern PyTorch -- use .item() instead.
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())

    # padded values of the input must not change the loss
    dummy_input = T.ones(4, 8, 128).float()
    dummy_target = T.zeros(4, 8, 128).float()
    dummy_length = (T.arange(5, 9)).long()
    mask = ((sequence_mask(dummy_length).float() - 1.0) * 100.0).unsqueeze(2)
    output = layer(dummy_input + mask, dummy_target, dummy_length)
    assert output.item() == 1.0, "1.0 vs {}".format(output.item())
def forward(self, characters, text_lengths, mel_specs, speaker_ids=None):
    """Tacotron forward pass with optional speaker and GST conditioning.

    Returns:
        mel_outputs, linear_outputs, alignments, stop_tokens.
    """
    batch_size = characters.size(0)
    # boolean mask over padded input steps
    mask = sequence_mask(text_lengths).to(characters.device)
    inputs = self.embedding(characters)
    self._init_states()
    self.compute_speaker_embedding(speaker_ids)
    if self.num_speakers > 1:
        inputs = self._concat_speaker_embedding(inputs, self.speaker_embeddings)
    encoder_outputs = self.encoder(inputs)
    if self.gst:
        # condition on global style tokens computed from the reference mel
        encoder_outputs = self.compute_gst(encoder_outputs, mel_specs)
    if self.num_speakers > 1:
        encoder_outputs = self._concat_speaker_embedding(
            encoder_outputs, self.speaker_embeddings)
    mel_outputs, alignments, stop_tokens = self.decoder(
        encoder_outputs, mel_specs, mask, self.speaker_embeddings_projected)
    # reshape decoder output to (B, T, mel_dim)
    mel_outputs = mel_outputs.view(batch_size, -1, self.mel_dim)
    linear_outputs = self.last_linear(self.postnet(mel_outputs))
    return mel_outputs, linear_outputs, alignments, stop_tokens
def forward(self, x, target, length):
    """Masked L1 loss.

    Args:
        x: FloatTensor (batch, max_len, dim) -- predicted values.
        target: FloatTensor (batch, max_len, dim) -- ground-truth values.
        length: LongTensor (batch,) -- valid length of each sequence.

    Returns:
        Scalar L1 loss averaged over the non-padded positions.
    """
    # targets never need gradients
    target.requires_grad = False
    # (batch, max_len, 1): 1.0 on valid steps, 0.0 on padding
    valid = sequence_mask(sequence_length=length,
                          max_len=target.size(1)).unsqueeze(2).float()
    valid = valid.expand_as(x)
    total = functional.l1_loss(x * valid, target * valid, reduction="sum")
    return total / valid.sum()
def _make_masks(ilens, olens):
    """Build a joint attention mask from input and output lengths.

    Returns a boolean tensor that is True only where both the output step
    and the input step fall inside the corresponding valid lengths.
    """
    src_mask = sequence_mask(ilens)
    tgt_mask = sequence_mask(olens)
    # outer AND: (B, T_out, 1) & (B, 1, T_in)
    return tgt_mask.unsqueeze(-1) & src_mask.unsqueeze(-2)
stop_targets = (stop_targets.sum(2) > 0.0).unsqueeze(2).float().squeeze(2) # dispatch data to GPU if use_cuda: text_input = text_input.cuda() text_lengths = text_lengths.cuda() mel_input = mel_input.cuda() mel_lengths = mel_lengths.cuda() if linear_input is not None: linear_input = linear_input.cuda() stop_targets = stop_targets.cuda() if speaker_ids is not None: speaker_ids = speaker_ids.cuda() mask = sequence_mask(text_lengths) # print(text_input, text_lengths, mel_input, speaker_ids) mel_outputs, postnet_outputs, alignments, stop_tokens = model( text_input, text_lengths, mel_input, speaker_ids=speaker_ids) # print(mel_outputs, postnet_outputs, alignments, stop_tokens) # compute mel specs from linear spec if model is Tacotron mel_specs = [] if C.model == "Tacotron": postnet_outputs = postnet_outputs.data.cpu().numpy() for b in range(postnet_outputs.shape[0]): postnet_output = postnet_outputs[b] mel_specs.append( torch.FloatTensor(ap.out_linear_to_mel( postnet_output.T).T).cuda()) postnet_outputs = torch.stack(mel_specs)