def validate(self):
        """Run one evaluation pass over the dev set and checkpoint on improvement.

        Switches the model (and the optional embedding decoder) to eval mode,
        decodes every dev batch, logs example decodings at the median batch,
        averages the attention/CTC error rates, saves ``best_{task}.pth`` when a
        task's dev WER beats ``self.best_wer[task]``, always saves
        ``latest.pth``, then restores train mode.
        """
        # Eval mode
        self.model.eval()
        if self.emb_decoder is not None:
            self.emb_decoder.eval()
        dev_wer = {'att': [], 'ctc': []}

        for i, data in enumerate(self.dv_set):
            self.progress('Valid step - {}/{}'.format(i + 1, len(self.dv_set)))
            # Fetch data
            feat, feat_len, txt, txt_len = self.fetch_data(data)

            # Forward model (decode at most DEV_STEP_RATIO x reference length)
            with torch.no_grad():
                ctc_output, encode_len, att_output, att_align, dec_state = \
                    self.model(feat, feat_len, int(max(txt_len)*self.DEV_STEP_RATIO),
                               emb_decoder=self.emb_decoder)

            dev_wer['att'].append(cal_er(self.tokenizer, att_output, txt))
            dev_wer['ctc'].append(
                cal_er(self.tokenizer, ctc_output, txt, ctc=True))

            # Show some examples on tensorboard (median batch only).
            # Fix: use a dedicated index instead of shadowing the outer loop
            # variable ``i`` — the shadowing was harmless only because
            # enumerate reassigns ``i`` on the next iteration.
            if i == len(self.dv_set) // 2:
                for ex in range(min(len(txt), self.DEV_N_EXAMPLE)):
                    if self.step == 1:
                        # Ground truth is logged once, at the first valid step.
                        self.write_log('true_text{}'.format(ex),
                                       self.tokenizer.decode(txt[ex].tolist()))
                    if att_output is not None:
                        self.write_log(
                            'att_align{}'.format(ex),
                            feat_to_fig(att_align[ex, 0, :, :].cpu().detach()))
                        self.write_log(
                            'att_text{}'.format(ex),
                            self.tokenizer.decode(
                                att_output[ex].argmax(dim=-1).tolist()))
                    if ctc_output is not None:
                        self.write_log(
                            'ctc_text{}'.format(ex),
                            self.tokenizer.decode(
                                ctc_output[ex].argmax(dim=-1).tolist(),
                                ignore_repeat=True))

        # Ckpt if performance improves
        for task in ['att', 'ctc']:
            dev_wer[task] = sum(dev_wer[task]) / len(dev_wer[task])
            if dev_wer[task] < self.best_wer[task]:
                self.best_wer[task] = dev_wer[task]
                self.save_checkpoint('best_{}.pth'.format(task), 'wer',
                                     dev_wer[task])
            self.write_log('wer', {'dv_' + task: dev_wer[task]})
        self.save_checkpoint('latest.pth',
                             'wer',
                             dev_wer['att'],
                             show_msg=False)

        # Resume training
        self.model.train()
        if self.emb_decoder is not None:
            self.emb_decoder.train()
# ----- 示例 #2 (example separator — scrape artifact, not executable code) -----
    def exec(self):
        """Main dual-learning training loop (paired ASR/TTS + unpaired cycles).

        Alternates per step between a speech-first cycle (speech -> text ->
        speech) on even steps and a text-first cycle (text -> speech -> text)
        on odd steps.  Paired losses (CTC for ASR, mel+linear frequency loss
        for TTS) are always applied; unpaired reconstruction losses are added
        only after their configured start steps.  Handles CUDA OOM by freeing
        gradients and retrying; runs validation every ``self.valid_step`` steps.
        """
        self.verbose(
            ['Total training steps {}.'.format(human_format(self.max_step))])
        self.timer.set()
        # Running values kept across steps so logging works even on steps
        # where the corresponding branch did not execute.
        unpair_speech_loss, unpair_text_loss, unsup_pred, unsup_trans, unsup_align = None, None, None, None, None
        ctc_nan_flag, ignore_speech_flag = 0, 0
        tok_usage, gt_usage = [], []
        # Counters reset at every progress log: CTC NaN hits and how many
        # unpaired-speech / unpaired-text batches were consumed.
        cnter = {'ctc_nan': 0, 'unp_sph': 0, 'unp_txt': 0}

        while self.step < self.max_step:
            # --------------------- Load data ----------------------- #
            # Unpair setting
            unpair_mel, unpair_aug_mel, unpair_linear, unpair_text, unpair_sid = None, None, None, None, None
            post_pred, asr_post_loss = None, None  # For ASR postnet only
            use_unpair_text = self.unpair_text_weight > 0 and self.step > self.unpair_text_start_step
            use_unpair_speech = self.unpair_speech_weight > 0 and self.step > self.unpair_speech_start_step

            tf_rate = self.optimizer.pre_step(
                self.step)  # Catch the returned tf_rate if needed
            # ToDo : change # of sup. step = 2 x # of unsup. step ?
            mel, aug_mel, linear, text, sid = self.fetch_data(
                iter_name='pair_iter')

            # Load unpaired data only when use_unpair_xxx == True
            if self.step % 2 == 0:  #2
                # if True:
                # ASR first
                speech_first = True
                if use_unpair_speech:
                    unpair_mel, unpair_aug_mel, unpair_linear, unpair_text, unpair_sid = \
                                                    self.fetch_data(iter_name='unpair_iter')
            else:
                # TTS first
                speech_first = False
                if use_unpair_text:
                    cnter['unp_txt'] += 1
                    unpair_mel, unpair_aug_mel, unpair_linear, unpair_text, unpair_sid = \
                                                    self.fetch_data(iter_name='unpair_iter')

            total_loss = 0
            bs = len(mel)
            self.timer.cnt('rd')
            try:
                # ----------------------- Forward ------------------------ #
                if speech_first:
                    # Cycle : speech -> text -> speech
                    pair_prob, _, unpair_prob, unpair_latent, unpair_latent_len, pair_post_prob, _ = \
                                self.model.speech_to_text(paired_mel=aug_mel, unpaired_mel= unpair_aug_mel)

                    # Check to involve unsupervised Speech2Speech
                    if unpair_latent is not None:
                        # ASR output is the representataion for speech2speech
                        cnter['unp_sph'] += 1
                        ignore_speech_cycle = False
                        unpaired_teacher = unpair_mel
                    else:
                        # ASR output is all blank (cannot be passed to TTS) only paired text is used
                        ignore_speech_cycle = True
                        unpaired_teacher = None

                    # text -> speech
                    pair_mel_pred, pair_linear_pred, pair_align, _, \
                    unpair_mel_pred, unpair_linear_pred, unpair_align, _ =\
                                self.model.text_to_speech(paired_text = text,
                                                          paired_sid=sid,
                                                          unpaired_sid=unpair_sid,
                                                          unpaired_latent = unpair_latent,
                                                          unpaired_text= None,
                                                          unpaired_latent_len = unpair_latent_len,
                                                          paired_teacher = mel,
                                                          unpaired_teacher = unpaired_teacher,
                                                          tf_rate = tf_rate
                                                         )
                else:
                    # Cycle : text -> speech -> text
                    pair_mel_pred, pair_linear_pred, pair_align, _, \
                    unpair_mel_pred, unpair_linear_pred, unpair_align, _ =\
                                self.model.text_to_speech(paired_text=text,
                                                          paired_sid=sid,
                                                          unpaired_sid=unpair_sid,
                                                          unpaired_latent=None,
                                                          unpaired_text=unpair_text,
                                                          unpaired_latent_len=None,
                                                          paired_teacher=mel,
                                                          unpaired_teacher=None,
                                                          tf_rate=tf_rate
                                                         )
                    if use_unpair_text:
                        unpair_mel_pred = unpair_mel_pred.detach(
                        )  # Stop-grad for tts in text2text
                    pair_prob, _, unpair_prob, unpair_latent, unpair_latent_len, pair_post_prob, _ = \
                                self.model.speech_to_text(paired_mel=aug_mel,
                                                          unpaired_mel=unpair_mel_pred,  #None, #unpair_mel_pred, #None, #unpaired_mel= unpair_mel_pred,
                                                          using_fake_mel=use_unpair_text)

                # Paired ASR loss
                asr_loss = self.compute_ctcloss(aug_mel, pair_prob, text)
                if self.model.use_asr_postnet:
                    # Split the ASR weight between the main and postnet CTC
                    # losses according to asr_postnet_weight.
                    total_loss = total_loss + self.asr_weight * (
                        1 - self.model.asr_postnet_weight) * asr_loss
                    asr_post_loss = self.compute_ctcloss(aug_mel,
                                                         pair_post_prob,
                                                         text,
                                                         apply_log=False)
                    total_loss = total_loss + self.asr_weight * self.model.asr_postnet_weight * asr_post_loss
                else:
                    total_loss = total_loss + self.asr_weight * asr_loss
                # NOTE(review): asr_loss is added to total_loss BEFORE this NaN
                # check, so a NaN CTC loss still poisons total_loss; only the
                # logged value is zeroed.  Also, setting asr_loss = 0 (int)
                # makes the later asr_loss.item() call fail if this fires on a
                # logging step — verify intended behavior.
                if math.isnan(asr_loss) or math.isinf(asr_loss):
                    cnter['ctc_nan'] += 1
                    asr_loss = 0

                # Paired TTS loss
                mel_loss = self.freq_loss(pair_mel_pred, mel)
                linear_loss = self.freq_loss(pair_linear_pred, linear)
                tts_loss = mel_loss + linear_loss
                total_loss = total_loss + self.tts_weight * tts_loss

                # Unpaired loss
                if speech_first:
                    # Unpaired speech reconstruction loss
                    if not ignore_speech_cycle:
                        unpair_speech_loss = self.freq_loss(unpair_mel_pred, unpair_mel) +\
                                            self.freq_loss(unpair_linear_pred, unpair_linear)
                        #total_loss += self.unpair_speech_weight*unpair_speech_loss
                        if self.step > self.unpair_speech_start_step:
                            total_loss += self.unpair_speech_weight * unpair_speech_loss
                elif use_unpair_text:
                    # Unpaired text reconstruction loss (CTC on the ASR output
                    # of the TTS-generated mel).
                    ctc_input = (unpair_prob + EPS).transpose(0, 1).log()
                    if self.paras.actual_len:
                        # Estimate frame length from non-padding token count,
                        # rounded up to the model's frames-per-step multiple.
                        asr_input_len = (unpair_text != 0).sum(
                            dim=-1) * FRAME_PHN_RATIO
                        asr_input_len = asr_input_len + asr_input_len % self.model.n_frames_per_step
                        ctc_len = 1 + (asr_input_len //
                                       self.model.time_reduce_factor)
                    else:
                        ctc_len = torch.LongTensor(
                            [unpair_prob.shape[1]] *
                            unpair_prob.shape[0]).to(device=self.device)
                    unpair_text_loss = self.ctc_loss(
                        ctc_input,
                        unpair_text.to_sparse().values(), ctc_len,
                        torch.sum(unpair_text != 0, dim=-1))
                    # NOTE(review): same pattern as asr_loss — zeroing to int 0
                    # breaks the later .item() call on a logging step.
                    if math.isnan(unpair_text_loss) or math.isinf(
                            unpair_text_loss):
                        cnter['ctc_nan'] += 1
                        unpair_text_loss = 0
                    total_loss += self.unpair_text_weight * unpair_text_loss

                # VQ-loss
                # if vq_loss>0:
                #     total_loss += self.model.vq_weight*vq_loss
                # if commit_loss>0:
                #     total_loss += self.model.commit_weight*commit_loss

                # Statics (over unsup. speech only)
                if speech_first and use_unpair_speech:
                    unsup_pred = unpair_prob.argmax(dim=-1).cpu()
                    unsup_trans = unpair_text.cpu()
                    tok_usage += unsup_pred.flatten().tolist()
                    gt_usage += unsup_trans.flatten().tolist()
                    if unpair_align is not None:
                        unsup_align = unpair_align.detach().cpu()
                    else:
                        unsup_align = [None] * bs

                self.timer.cnt('fw')

                # ----------------------- Backward ------------------------ #
                grad_norm = self.backward(total_loss)
                # For debugging
                # if math.isnan(grad_norm):
                # import IPython
                # IPython.embed()
                self.step += 1

                # Log
                if (self.step == 1) or (self.step % self._PROGRESS_STEP == 0):
                    self.progress('Tr stat | Loss - {:.2f} (CTC-nan/unp-sph/unp-txt={}/{}/{}) | Grad. Norm - {:.2f} | {} '\
                                  .format(total_loss.cpu().item(), cnter['ctc_nan'], cnter['unp_sph'], cnter['unp_txt'],
                                          grad_norm, self.timer.show()))
                    self.write_log(
                        'txt_loss', {
                            'pair':
                            asr_loss.item() if asr_loss is not None else None,
                            'unpair':
                            unpair_text_loss.item()
                            if unpair_text_loss is not None else None,
                            'post':
                            asr_post_loss.item()
                            if asr_post_loss is not None else None
                        })
                    self.write_log(
                        'speech_loss', {
                            'pair':
                            tts_loss.item() if tts_loss is not None else None,
                            'unpair':
                            unpair_speech_loss.item()
                            if unpair_speech_loss is not None else None
                        })
                    #self.write_log('stop_err',{'tr':stop_err})
                    # if commit_loss>0:
                    #     self.write_log('commit',{'tr':commit_loss})
                    # if vq_loss>0:
                    #     self.write_log('commit',{'vq':vq_loss})
                    # self.write_log('temperature',{'temp':self.model.codebook.temp.data})
                    # self.write_log('ppx',{'tr':cal_ppx(p_code)})
                    # Reset per-window counters after logging them.
                    for k in cnter.keys():
                        cnter[k] = 0
                    if (self.step == 1) or (self.step % ATTENTION_PLOT_STEP
                                            == 0):
                        align = pair_align.cpu()  # align shape BxDsxEs
                        sup_pred = pair_prob.argmax(dim=-1).cpu()
                        sup_trans = text.cpu()
                        if self.model.use_asr_postnet:
                            post_pred = pair_post_prob.argmax(dim=-1).cpu()
                        self.write_log(
                            'per', {
                                'pair': cal_per(sup_pred, sup_trans),
                                'unpair': cal_per(unsup_pred, unsup_trans),
                                'post': cal_per(post_pred, sup_trans)
                            })
                        self.write_log(
                            'unpair_hist',
                            data_to_bar(tok_usage, gt_usage, self.vocab_size,
                                        self.tokenizer._vocab_list))
                        for i in range(LISTEN_N_EXAMPLES):
                            self.write_log(
                                'pair_align{}'.format(i),
                                feat_to_fig(align[i].cpu().detach()))
                            if unsup_align is not None and unsup_align[
                                    i] is not None:
                                self.write_log(
                                    'unpair_align{}'.format(i),
                                    feat_to_fig(unsup_align[i].cpu().detach()))
                        tok_usage, gt_usage = [], []

                # Validation
                if (self.step == 1) or (self.step % self.valid_step == 0):
                    self.validate()

                # End of step
                self.timer.set()
                if self.step > self.max_step: break

            except RuntimeError as e:
                if 'out of memory' in str(e):
                    # CUDA OOM: drop gradients, flush the cache, and retry
                    # the loop with the next batch.
                    self.verbose('WARNING: ran out of memory, retrying batch')
                    for p in self.model.parameters():
                        if p.grad is not None:
                            del p.grad  # free some memory
                    torch.cuda.empty_cache()
                else:
                    print(repr(e))
                    errorout()
# ----- 示例 #3 (example separator — scrape artifact, not executable code) -----
    def validate(self):
        """Evaluate ASR (PER) and TTS (frequency loss) on the dev set.

        Runs the ASR branch and a teacher-free TTS decode on every dev batch,
        averages the metrics, checkpoints on improvement (by PER when
        ``paras.store_best_per`` is set, otherwise by TTS loss and PER
        separately), logs example spectrograms/waveforms, and restores train
        mode.
        """
        # Eval mode
        self.model.eval()
        dev_tts_loss, dev_per, dev_post_per, dev_stop_err = [], [], [], []

        for i in range(len(self.dev_set)):
            self.progress('Valid step - {}/{}'.format(i + 1,
                                                      len(self.dev_set)))
            # Fetch data
            mel, aug_mel, linear, text, sid = self.fetch_data(
                iter_name='dev_iter')

            # Forward model
            with torch.no_grad():
                # test ASR
                pair_prob, _, _, _, _, pair_post_prob, _ = self.model.speech_to_text(
                    paired_mel=mel, unpaired_mel=None)
                dev_per.append(cal_per(pair_prob, text))
                if pair_post_prob is not None:
                    dev_post_per.append((cal_per(pair_post_prob, text)))

                # test TTS (Note: absolute dec step now)
                # tf_rate=0.0 -> fully free-running decode; paired_teacher is
                # the number of decoder steps (mel length), not a teacher mel.
                pair_mel_pred, pair_linear_pred, pair_align, _, _, _, _, _ = \
                        self.model.text_to_speech(paired_text = text,
                                                  paired_sid=sid,
                                                  unpaired_sid=None,
                                                  unpaired_latent=None,
                                                  unpaired_text=None,
                                                  unpaired_latent_len=None,
                                                  paired_teacher=mel.shape[1],
                                                  unpaired_teacher=None,
                                                  tf_rate=0.0)
                dev_tts_loss.append(
                    self.freq_loss(pair_mel_pred, mel) +
                    self.freq_loss(pair_linear_pred, linear))

            if i == len(self.dev_set) // 2:
                # pick n longest samples in the median batch
                # NOTE(review): these sample_* / *_p variables are only bound
                # inside this branch; an empty dev_set would leave them
                # undefined for the logging section below.
                sample_txt = text.cpu()[:LISTEN_N_EXAMPLES]
                hyp = pair_prob.argmax(dim=-1).cpu()[:LISTEN_N_EXAMPLES]
                mel_p = pair_mel_pred.cpu()[:LISTEN_N_EXAMPLES]
                linear_p = pair_linear_pred.cpu()[:LISTEN_N_EXAMPLES]
                #post_mel_p = tts_pred.cpu()[:LISTEN_N_EXAMPLES,1] # PostNet product
                align_p = pair_align.cpu()[:LISTEN_N_EXAMPLES]
                sample_mel = mel.cpu()[:LISTEN_N_EXAMPLES]
                sample_linear = linear.cpu()[:LISTEN_N_EXAMPLES]

        # Ckpt if performance improves
        dev_tts_loss = sum(dev_tts_loss) / len(dev_tts_loss)
        dev_per = sum(dev_per) / len(dev_per)
        dev_post_per = sum(dev_post_per) / len(dev_post_per) if len(
            dev_post_per) > 0 else None
        #dev_stop_err = sum(dev_stop_err)/len(dev_stop_err)

        if self.paras.store_best_per:
            if dev_per < self.best_per:
                self.best_per = dev_per
                self.save_checkpoint('best_per.pth', dev_per)
            # NOTE(review): this compares dev_post_per against best_per, which
            # may have just been lowered by the branch above — confirm the
            # shared-best behavior is intentional.
            if (dev_post_per is not None) and (dev_post_per < self.best_per):
                self.best_per = dev_post_per
                self.save_checkpoint('best_post_per.pth', dev_post_per)
        else:
            if dev_tts_loss < self.best_tts_loss:
                self.best_tts_loss = dev_tts_loss
                if self.step > 1:
                    self.save_checkpoint('tts_{}.pth'.format(self.step),
                                         dev_tts_loss)
            if dev_per < self.best_per:
                self.best_per = dev_per
                if self.step > 1:
                    self.save_checkpoint('asr_{}.pth'.format(self.step),
                                         dev_per)
            if (dev_post_per is not None) and (dev_post_per < self.best_per):
                self.best_per = dev_post_per
                self.save_checkpoint(
                    'best_post_per.pth', dev_post_per
                )  # Note: didnot recode best per from postnet or not

        if ((self.step > 1) and
            (self.step % CKPT_STEP == 0)) and not self.paras.store_best_per:
            # Regular ckpt
            self.save_checkpoint('step_{}.pth'.format(self.step), dev_tts_loss)

        # Logger
        # Write model output (no G-F-lim if picking per)
        for i, (m_p, l_p, a_p,
                h_p) in enumerate(zip(mel_p, linear_p, align_p, hyp)):
            self.write_log('hyp_text{}'.format(i),
                           self.tokenizer.decode(h_p.tolist()))
            self.write_log('mel_spec{}'.format(i), feat_to_fig(m_p))
            self.write_log('linear_spec{}'.format(i), feat_to_fig(l_p))
            self.write_log('dv_align{}'.format(i), feat_to_fig(a_p))
            if not self.paras.store_best_per:
                self.write_log('mel_wave{}'.format(i),
                               self.audio_converter.feat_to_wave(m_p))
                self.write_log('linear_wave{}'.format(i),
                               self.audio_converter.feat_to_wave(l_p))
        # Write ground truth (only once, at the first validation)
        if self.step == 1:
            for i, (mel, linear, gt_txt) in enumerate(
                    zip(sample_mel, sample_linear, sample_txt)):
                self.write_log('truth_text{}'.format(i),
                               self.tokenizer.decode(gt_txt.tolist()))
                self.write_log('mel_spec{}_gt'.format(i), feat_to_fig(mel))
                self.write_log('mel_wave{}_gt'.format(i),
                               self.audio_converter.feat_to_wave(mel))
                self.write_log('linear_spec{}_gt'.format(i),
                               feat_to_fig(linear))
                self.write_log('linear_wave{}_gt'.format(i),
                               self.audio_converter.feat_to_wave(linear))

        self.write_log('speech_loss', {'dev': dev_tts_loss})
        self.write_log('per', {'dev': dev_per, 'dev_post': dev_post_per})
        self.write_log('codebook', (self.model.codebook.embedding.weight.data,
                                    self.tokenizer._vocab_list))
        #self.write_log('stop_err',{'dev':dev_stop_err})
        # Resume training
        self.model.train()
    def validate(self, _dv_set, _name):
        """Evaluate one named dev set, tracking WER/CER and the chosen metric.

        Args:
            _dv_set: iterable of dev batches to decode.
            _name: label for this dev set; used in log keys, checkpoint file
                names, and as the key into ``self.best_wer[task]``.

        Computes per-batch WER, CER, and the configured ``self.val_mode``
        metric for both attention and CTC outputs, checkpoints when the
        val-mode metric improves (and at the final step), then restores train
        mode — re-freezing transferred layers if transfer learning is active.
        """
        # Eval mode
        self.model.eval()
        if self.emb_decoder is not None: self.emb_decoder.eval()
        dev_wer = {'att': [], 'ctc': []}
        dev_cer = {'att': [], 'ctc': []}
        # dev_er holds the metric actually used for model selection
        # (self.val_mode — 'wer' or 'cer').
        dev_er = {'att': [], 'ctc': []}

        for i, data in enumerate(_dv_set):
            self.progress('Valid step - {}/{}'.format(i + 1, len(_dv_set)))
            # Fetch data
            feat, feat_len, txt, txt_len = self.fetch_data(data)

            # Forward model (decode at most DEV_STEP_RATIO x reference length)
            with torch.no_grad():
                ctc_output, encode_len, att_output, att_align, dec_state = \
                    self.model( feat, feat_len, int(max(txt_len)*self.DEV_STEP_RATIO),
                                    emb_decoder=self.emb_decoder)

            if att_output is not None:
                dev_wer['att'].append(
                    cal_er(self.tokenizer, att_output, txt, mode='wer'))
                dev_cer['att'].append(
                    cal_er(self.tokenizer, att_output, txt, mode='cer'))
                dev_er['att'].append(
                    cal_er(self.tokenizer, att_output, txt,
                           mode=self.val_mode))
            if ctc_output is not None:
                dev_wer['ctc'].append(
                    cal_er(self.tokenizer,
                           ctc_output,
                           txt,
                           mode='wer',
                           ctc=True))
                dev_cer['ctc'].append(
                    cal_er(self.tokenizer,
                           ctc_output,
                           txt,
                           mode='cer',
                           ctc=True))
                dev_er['ctc'].append(
                    cal_er(self.tokenizer,
                           ctc_output,
                           txt,
                           mode=self.val_mode,
                           ctc=True))

            # Show some example on tensorboard (median batch only)
            # NOTE(review): the inner loop shadows the outer index ``i``;
            # harmless here because enumerate reassigns it, but fragile.
            if i == len(_dv_set) // 2:
                for i in range(min(len(txt), self.DEV_N_EXAMPLE)):
                    if self.step == 1:
                        self.write_log('true_text_{}_{}'.format(_name, i),
                                       self.tokenizer.decode(txt[i].tolist()))
                    if att_output is not None:
                        self.write_log(
                            'att_align_{}_{}'.format(_name, i),
                            feat_to_fig(att_align[i, 0, :, :].cpu().detach()))
                        self.write_log(
                            'att_text_{}_{}'.format(_name, i),
                            self.tokenizer.decode(
                                att_output[i].argmax(dim=-1).tolist()))
                    if ctc_output is not None:
                        self.write_log(
                            'ctc_text_{}_{}'.format(_name, i),
                            self.tokenizer.decode(
                                ctc_output[i].argmax(dim=-1).tolist(),
                                ignore_repeat=True))

        # Ckpt if performance improves — only for decoder branches that
        # actually produced output on this dev set.
        tasks = []
        if len(dev_er['att']) > 0:
            tasks.append('att')
        if len(dev_er['ctc']) > 0:
            tasks.append('ctc')

        for task in tasks:
            dev_er[task] = sum(dev_er[task]) / len(dev_er[task])
            dev_wer[task] = sum(dev_wer[task]) / len(dev_wer[task])
            dev_cer[task] = sum(dev_cer[task]) / len(dev_cer[task])
            if dev_er[task] < self.best_wer[task][_name]:
                self.best_wer[task][_name] = dev_er[task]
                self.save_checkpoint(
                    'best_{}_{}.pth'.format(
                        task, _name +
                        (self.save_name if self.transfer_learning else '')),
                    self.val_mode, dev_er[task], _name)
            if self.step >= self.max_step:
                # Always keep a final-step snapshot per task/dev-set.
                self.save_checkpoint(
                    'last_{}_{}.pth'.format(
                        task, _name +
                        (self.save_name if self.transfer_learning else '')),
                    self.val_mode, dev_er[task], _name)
            self.write_log(self.WER,
                           {'dv_' + task + '_' + _name.lower(): dev_wer[task]})
            self.write_log('cer',
                           {'dv_' + task + '_' + _name.lower(): dev_cer[task]})
            # if self.transfer_learning:
            #     print('[{}] WER {:.4f} / CER {:.4f} on {}'.format(human_format(self.step), dev_wer[task], dev_cer[task], _name))

        # Resume training; re-freeze transferred layers since .train() may
        # have reset their mode.
        self.model.train()
        if self.transfer_learning:
            self.model.encoder.fix_layers(self.fix_enc)
            if self.fix_dec and self.model.enable_att:
                self.model.decoder.fix_layers()
            if self.fix_dec and self.model.enable_ctc:
                self.model.fix_ctc_layer()

        if self.emb_decoder is not None: self.emb_decoder.train()