示例#1
0
    def forward(self, batch_wave, lengths, phone, len_phone, label_smooth=0., threshold=0.95):
        """Compute the CTC, quantity and phone cross-entropy training losses.

        The batch is passed through ``self.splayer`` and ``self.encoder``.
        The encoder output then feeds two branches: a CTC head
        (``self.ctc_fc``) and a CIF branch, in which ``self.assigner``
        produces per-step weights (``alphas``) that ``self.cif`` consumes
        together with the encoder output before ``self.phone_fc`` projects
        the result to phone logits.

        Args:
            batch_wave: Input batch; its ``device`` is used for the sampled
                noise.
            lengths: Lengths belonging to ``batch_wave``; forwarded to
                ``self.splayer``.
            phone: Target phone ids. Positions equal to 0 are flagged as
                padding for ``cal_ce_loss``.
            len_phone: Number of target phones per sample; used as the
                target quantity and as the CTC target lengths.
            label_smooth: Forwarded to ``cal_ce_loss``.
            threshold: Forwarded to ``self.cif``.

        Returns:
            Tuple ``(ctc_loss, qua_loss, ce_phone_loss)``.
        """
        device = batch_wave.device
        phone_paddings = phone.eq(0).float()

        encoder_outputs, encoder_output_lengths = self.splayer(batch_wave, lengths)
        encoder_outputs, encoder_output_lengths = self.encoder(encoder_outputs, encoder_output_lengths)
        ctc_logits = self.ctc_fc(encoder_outputs)

        len_logits_ctc = encoder_output_lengths
        alphas = self.assigner(encoder_outputs, encoder_output_lengths)

        # Predicted quantity: sum of the raw (unscaled) weights of each sample.
        _num = alphas.sum(-1)
        # Target quantity, jittered with uniform noise in [-0.45, 0.45).
        # The noise is sampled directly on the target device.
        num = len_phone.float()
        num_noise = num + 0.9 * torch.rand(alphas.size(0), device=device) - 0.45
        # Rescale every row of alphas so that it sums to the noisy target
        # quantity. The (batch, 1) factor broadcasts over the last axis, and
        # the multiply is out-of-place so the tensor returned by the assigner
        # is not mutated under autograd. The cast keeps alphas' dtype in case
        # the factor was promoted to a wider float type.
        scale = (num_noise / _num)[:, None]
        alphas = (alphas * scale).to(alphas.dtype)

        cif_outputs = self.cif(encoder_outputs, alphas, threshold=threshold)

        logits_IPA = self.phone_fc(cif_outputs)

        ctc_loss = cal_ctc_loss(ctc_logits, len_logits_ctc, phone, len_phone)
        # The quantity loss compares the unscaled sum with the clean target.
        qua_loss = cal_qua_loss(_num, num)
        ce_phone_loss = cal_ce_loss(logits_IPA, phone, phone_paddings, label_smooth)

        return ctc_loss, qua_loss, ce_phone_loss
示例#2
0
    def forward(self, batch_wave, lengths, target_ids, target_labels=None, target_paddings=None, label_smooth=0., threshold=0.95):
        """Compute the CTC, quantity and decoder cross-entropy training losses.

        The batch is passed through ``self.splayer`` and ``self.encoder``.
        The encoder output then feeds two branches: a CTC head
        (``self.ctc_fc``) and a CIF branch, in which ``self.assigner``
        produces per-step weights (``alphas``) that ``self.cif`` consumes
        together with the encoder output before ``self.decoder`` produces
        the output logits.

        Args:
            batch_wave: Input batch; its ``device`` is used for the sampled
                noise.
            lengths: Lengths belonging to ``batch_wave``; forwarded to
                ``self.splayer``.
            target_ids: Decoder input; forwarded to ``self.decoder``.
            target_labels: Targets for both the CTC and the cross-entropy
                loss.
            target_paddings: Padding mask of the targets. Required despite
                the ``None`` default: target lengths are computed as
                ``sum(1 - target_paddings)`` over the last axis.
            label_smooth: Forwarded to ``cal_ce_loss``.
            threshold: Forwarded to ``self.cif``.

        Returns:
            Tuple ``(ctc_loss, qua_loss, ce_loss)``.

        Raises:
            TypeError: If ``target_paddings`` is ``None``.
        """
        device = batch_wave.device
        if target_paddings is None:
            # Target lengths are derived from the padding mask, so fail with
            # a clear message instead of an opaque arithmetic error on None.
            raise TypeError(
                "target_paddings must be a tensor; target lengths are derived from it"
            )
        target_lengths = torch.sum(1 - target_paddings, dim=-1).long()

        encoder_outputs, encoder_output_lengths = self.splayer(batch_wave, lengths)
        encoder_outputs, encoder_output_lengths = self.encoder(encoder_outputs, encoder_output_lengths)
        ctc_logits = self.ctc_fc(encoder_outputs)

        len_logits_ctc = encoder_output_lengths
        alphas = self.assigner(encoder_outputs, encoder_output_lengths)

        # Predicted quantity: sum of the raw (unscaled) weights of each sample.
        _num = alphas.sum(-1)
        # Target quantity, jittered with uniform noise in [-0.45, 0.45).
        # The noise is sampled directly on the target device.
        num = target_lengths.float()
        num_noise = num + 0.9 * torch.rand(alphas.size(0), device=device) - 0.45
        # Rescale every row of alphas so that it sums to the noisy target
        # quantity. The (batch, 1) factor broadcasts over the last axis, and
        # the multiply is out-of-place so the tensor returned by the assigner
        # is not mutated under autograd. The cast keeps alphas' dtype in case
        # the factor was promoted to a wider float type.
        scale = (num_noise / _num)[:, None]
        alphas = (alphas * scale).to(alphas.dtype)

        cif_outputs = self.cif(encoder_outputs, alphas, threshold=threshold)

        logits = self.decoder(cif_outputs, target_ids, target_lengths)

        ctc_loss = cal_ctc_loss(ctc_logits, len_logits_ctc, target_labels, target_lengths)
        # The quantity loss compares the unscaled sum with the clean target.
        qua_loss = cal_qua_loss(_num, num)
        ce_loss = cal_ce_loss(logits, target_labels, target_paddings, label_smooth)

        return ctc_loss, qua_loss, ce_loss