Example #1
def tts_train_loop(model, optimizer, train_set, lr, total_steps):

    for p in optimizer.param_groups:
        p['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(epochs):

        start = time.time()
        running_loss = 0

        for i, (x, m, _, _) in enumerate(train_set, 1):

            optimizer.zero_grad()

            x, m = x.cuda(), m.cuda()

            m1_hat, m2_hat, attention = model(x, m)

            m1_loss = F.l1_loss(m1_hat, m)
            m2_loss = F.l1_loss(m2_hat, m)

            loss = m1_loss + m2_loss

            running_loss += loss.item()

            loss.backward()

            if hp.tts_clip_grad_norm:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hp.tts_clip_grad_norm)

            optimizer.step()

            step = model.get_step()
            k = step // 1000

            speed = i / (time.time() - start)

            avg_loss = running_loss / i

            if step % hp.tts_checkpoint_every == 0:
                model.checkpoint(paths.tts_checkpoints)

            if step % hp.tts_plot_every == 0:
                save_attention(attention[0], f'{paths.tts_attention}{k}k')

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        model.save(paths.tts_latest_weights)
        model.log(paths.tts_log, msg)
        print(' ')
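
Note: every loop in these examples reports progress through a `stream` helper that is not shown. A minimal sketch, assuming it simply rewrites the current console line in place:

import sys

def stream(message):
    # Overwrite the current console line so per-batch progress updates
    # in place instead of scrolling; the flush makes it appear immediately.
    sys.stdout.write(f'\r{message}')
    sys.stdout.flush()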
Example #2
def voc_train_loop(model, loss_func, optimiser, train_set, test_set, lr,
                   total_steps):

    for p in optimiser.param_groups:
        p['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m, s_e) in enumerate(train_set, 1):

            x, m, y, spk_embd = x.cuda(), m.cuda(), y.cuda(), s_e.cuda()

            y_hat = model(x, m, spk_embd)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            running_loss += loss.item()

            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                model.checkpoint(paths.voc_checkpoints)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        model.save(paths.voc_latest_weights)
        model.log(paths.voc_log, msg)
        print(' ')
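
The `epochs` arithmetic shared by these loops converts a global step budget into a remaining-epoch count. A hypothetical numeric walk-through:

# Hypothetical numbers illustrating the epoch computation used above:
total_steps = 500_000   # global step budget passed into the loop
current_step = 20_000   # model.get_step() when resuming from a checkpoint
total_iters = 1_200     # batches per epoch, i.e. len(train_set)
epochs = (total_steps - current_step) // total_iters + 1
print(epochs)           # (480_000 // 1_200) + 1 = 401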
Example #3
def train_loop(model, optimiser, train_set, test_set, lr):

    for p in optimiser.param_groups:
        p['lr'] = lr

    total_iters = len(train_set)
    epochs = (hp.total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            y_hat = model(x, m)
            y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
            y = y.unsqueeze(-1)
            loss = F.cross_entropy(y_hat, y)

            optimiser.zero_grad()
            loss.backward()
            optimiser.step()
            running_loss += loss.item()

            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if step % hp.checkpoint_every == 0:
                gen_testset(model, test_set, hp.test_samples, hp.batched,
                            hp.target, hp.overlap, paths.output)
                model.checkpoint(paths.checkpoints)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:#.4} | {speed:#.2} steps/s | Step: {k}k | '
            stream(msg)

        model.save(paths.latest_weights)
        model.log(paths.log, msg)
        print(' ')
Example #4
    def evaluate(self, model, val_set, msg) -> float:
        model.tacotron.eval()
        val_loss = 0
        device = next(model.tacotron.parameters()).device
        for i, batch in enumerate(val_set, 1):
            stream(msg + f'| Evaluating {i}/{len(val_set)}')
            seqs, mels, stops, ids, lens = batch
            seqs, mels, stops, lens = \
                seqs.to(device), mels.to(device), stops.to(device), lens.to(device)
            with torch.no_grad():
                pred = model.tacotron(seqs, mels)
                lin_mels, post_mels, att = pred
                lin_loss = F.l1_loss(lin_mels, mels)
                post_loss = F.l1_loss(post_mels, mels)
                val_loss += lin_loss + post_loss
            if i == 1:
                self.generate_samples(model, batch, pred)

        val_loss /= len(val_set)
        return float(val_loss)
Example #5
def create_gta_features(model, train_set, save_path):

    iters = len(train_set)

    for i, (x, mels, ids, mel_lens) in enumerate(train_set, 1):

        x, mels = x.cuda(), mels.cuda()

        with torch.no_grad():
            _, gta, _ = model(x, mels)

        gta = gta.cpu().numpy()

        for j in range(len(ids)):
            mel = gta[j][:, :mel_lens[j]]
            mel = (mel + 4) / 8
            item_id = ids[j]
            np.save(f'{save_path}{item_id}.npy', mel, allow_pickle=False)

        bar = progbar(i, iters)
        msg = f'{bar} {i}/{iters} Batches '
        stream(msg)
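
`progbar` is assumed to render a fixed-width textual progress bar; a minimal sketch compatible with the call above:

def progbar(i, n, size=16):
    # Render item i of n as a fixed-width text bar, e.g. '████░░░░░░░░░░░░'.
    done = (i * size) // n
    return '█' * done + '░' * (size - done)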
Example #6
    def train_session(self, model: ModelPackage, session: Session):
        model.r = session.r
        cfg = self.cfg
        tacotron, gan = model.tacotron, model.gan
        taco_opti, gen_opti, disc_opti = \
            model.taco_opti, model.gen_opti, model.disc_opti
        device = next(tacotron.parameters()).device
        display_params([('Session', session.index), ('Reduction', session.r),
                        ('Max Step', session.max_step),
                        ('Learning Rate', session.lr),
                        ('Batch Size', session.bs),
                        ('Steps per Epoch', len(session.train_set))])

        for g in taco_opti.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()

        while tacotron.get_step() <= session.max_step:

            for i, (seqs, mels, stops, ids,
                    lens) in enumerate(session.train_set):
                seqs, mels, stops, lens = \
                    seqs.to(device), mels.to(device), stops.to(device), lens.to(device)
                t_start = time.time()
                block_step = tacotron.get_step() % cfg.steps_to_eval + 1

                tacotron.train()
                lin_mels, post_mels, att = tacotron(seqs, mels)

                lin_loss = self.criterion(lin_mels, mels, lens)
                post_loss = self.criterion(post_mels, mels, lens)

                loss = lin_loss + post_loss
                loss_avg.add(loss)

                taco_opti.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(tacotron.parameters(), 1.0)
                taco_opti.step()

                duration_avg.add(time.time() - t_start)
                steps_per_s = 1. / duration_avg.get()
                self.writer.add_scalar('Loss/train', loss, tacotron.get_step())
                self.writer.add_scalar('Params/reduction_factor', session.r,
                                       tacotron.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       tacotron.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       tacotron.get_step())

                msg = f'{block_step}/{cfg.steps_to_eval} | Step: {tacotron.get_step()} ' \
                      f'| {steps_per_s:#.2} steps/s | Avg. Loss: {loss_avg.get():#.4} '
                stream(msg)

                if tacotron.step % cfg.steps_to_checkpoint == 0:
                    self.save_model(model, step=tacotron.get_step())

                if tacotron.step % self.cfg.steps_to_eval == 0:
                    val_loss = self.evaluate(model, session.val_set, msg)
                    self.writer.add_scalar('Loss/val', val_loss, tacotron.step)
                    self.save_model(model)
                    stream(msg + f'| Val Loss: {float(val_loss):#0.4} \n')
                    loss_avg.reset()
                    duration_avg.reset()

            if tacotron.step > session.max_step:
                return
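
`Averager`, used here for the smoothed loss and duration readouts, is not shown; a minimal running-mean sketch matching the add/get/reset calls above:

class Averager:
    # Running mean for smoothed logging; reset() starts a new window.
    def __init__(self):
        self.count = 0
        self.total = 0.

    def add(self, value):
        self.count += 1
        self.total += float(value)

    def get(self):
        return self.total / self.count if self.count > 0 else 0.

    def reset(self):
        self.count = 0
        self.total = 0.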
Example #7
    def dual_transform(self, model_tts, model_asr, optimizer_tts,
                       optimizer_asr, asr_test_set, m_loss_avg, dur_loss_avg,
                       device, asr_current_step, e, epochs, duration_avg,
                       total_iters, tts_s_loss, asr_s_loss, tts_lr,
                       tts_dt_path):
        print('\n\nStarting DualTransformation loop...\n')
        tmp_dir = './checkpoints/sme_speech_tts.asr_forward/dual_transform_tmp'
        os.makedirs(tmp_dir, exist_ok=True)
        # generate tmp ASR training data
        asr_train_data = []
        input_set = get_unpaired_txt(35)
        text = [clean_text(v) for v in input_set]
        inputs = [text_to_sequence(t) for t in text]

        # generate unpaired data for ASR from TTS
        for i, x in enumerate(inputs, 1):
            _, m, dur = model_tts.generate(x, alpha=1.)
            wav = reconstruct_waveform(m, n_iter=32)
            wav_path = os.path.join(tmp_dir, f'{i}.wav')
            save_wav(wav, wav_path)
            asr_train_data.append((wav_path, text[i - 1]))

        dt_asr_data = load_dt_data(asr_train_data)
        # reinit trainer with only tmp train data
        asr_trainer_dt = init_trainer(dt_asr_data, None)
        dt_train = asr_trainer_dt.get_train_dataloader()

        # unsuper train loop for ASR
        for step, inputs in enumerate(dt_train, 1):
            model_asr.train()
            model_asr.to(device)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)
            outputs = model_asr(**inputs)
            asr_u_loss = outputs["loss"] if isinstance(outputs,
                                                       dict) else outputs[0]

            msg_asr = f'| ASR MODEL (unsupervised training) : '\
                      f'| Epoch: {e}/{epochs} ({step}/{len(dt_train)}) | Loss ASR: {asr_u_loss:#.4} '\
                      f' ||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr)

        # for f in os.listdir(tmp_dir):
        #     file_path = os.path.join(tmp_dir, f)
        #     if f.endswith('.wav'):
        #         os.unlink(file_path)

        # generate tmp TTS data from ASR
        asr_predict_for_dt(model_asr)

        subprocess.check_output(
            'python preprocess.py -p "./data/speech-sme-tts" -d=True',
            shell=True,
            stderr=subprocess.STDOUT)
        print('Finished preprocessing for tmp data!')

        tmp_tts_train = get_tts_datasets(tts_dt_path,
                                         batch_size=2,
                                         r=1,
                                         model_type='forward_dt')
        print("Loaded tmp dataset!")
        # unsuper TTS training

        for i, (x, m, ids, x_lens, mel_lens,
                dur) in enumerate(tmp_tts_train, 1):
            start = time.time()
            model_tts.to(device)
            model_tts.train()
            # optimizer_tts.zero_grad()
            x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\
                                                 x_lens.to(device), mel_lens.to(device)

            m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

            m1_loss = self.l1_loss(m1_hat, m, mel_lens)
            m2_loss = self.l1_loss(m2_hat, m, mel_lens)

            dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                    x_lens)

            tts_u_loss = m1_loss + m2_loss + 0.1 * dur_loss
            # backward/step are deferred: tts_u_loss enters the combined
            # dual-transformation loss below
            torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                           hp.tts_clip_grad_norm)
            m_loss_avg.add(m1_loss.item() + m2_loss.item())
            dur_loss_avg.add(dur_loss.item())
            step = model_tts.get_step()
            k = step // 1000

            duration_avg.add(time.time() - start)
            # pitch_loss_avg.add(pitch_loss.item())

            speed = 1. / duration_avg.get()
            msg_tts = f'| TTS MODEL (unsupervised training): '\
                  f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                  f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                  f'| {speed:#.2} steps/s | Step: {k}k | '

            stream(msg_tts)
        # TODO: combine losses and update
        combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss +
                                                           asr_u_loss)
        # backpropagate the combined dual-transformation loss
        combined_loss = combined_loss.to(device)
        combined_loss.backward()
        optimizer_tts.step()

        for state in optimizer_asr.state.values():
            for k, v in state.items():
                if torch.is_tensor(v):
                    state[k] = v.to(device)

        optimizer_asr.step()

        m_loss_avg.reset()
        duration_avg.reset()
        # pitch_loss_avg.reset()
        dt_msg = f'\n\nFinished DT loop in epoch {e}!\n'
        stream(dt_msg)
        print(' ')
        return tts_u_loss, asr_u_loss
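
For reference, the dual-transformation objective assembled above weights the two supervised terms by 0.5 and adds the unsupervised terms unscaled; with hypothetical loss values:

# Hypothetical values illustrating the loss combination above:
tts_s_loss, asr_s_loss = 0.8, 1.2   # supervised TTS / ASR losses
tts_u_loss, asr_u_loss = 0.5, 0.9   # unsupervised losses from the DT loop
combined_loss = 0.5 * (tts_s_loss + asr_s_loss) + (tts_u_loss + asr_u_loss)
print(combined_loss)                # 0.5 * 2.0 + 1.4 = 2.4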
Example #8
    def train_session(self, model_tts: ForwardTacotron,
                      model_asr: Wav2Vec2ForCTC, optimizer_tts: Optimizer,
                      tts_session: ForwardSession, asr_session: ASRSession,
                      asr_trainer, optimizer_asr) -> None:
        asr_trainer_state = {'logs': []}
        current_step = model_tts.get_step()
        tts_training_steps = tts_session.max_step - current_step
        try:
            _, asr_current_step = get_last_checkpoint(
                './checkpoints/sme_speech_tts.asr_forward/', 'model_at')
            asr_training_steps = tts_session.max_step - asr_current_step
        except Exception:
            asr_current_step = 0
            asr_training_steps = tts_training_steps

        total_iters = len(tts_session.train_set)
        epochs = tts_training_steps // total_iters + 1
        simple_table([
            ('TTS Steps', str(tts_training_steps // 1000) + 'k Steps'),
            ('ASR Steps', str(asr_training_steps // 1000) + 'k Steps'),
            ('Batch Size TTS', tts_session.bs),
            ('Learning Rate', tts_session.lr)
        ])

        for g in optimizer_tts.param_groups:
            g['lr'] = tts_session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()

        device = next(model_tts.parameters()).device  # use same device as model parameters
        warnings.filterwarnings('ignore', category=UserWarning)
        for e in range(1, epochs + 1):

            #tts train loop for epoch
            for i, (x, m, ids, x_lens, mel_lens,
                    dur) in enumerate(tts_session.train_set, 1):
                start = time.time()
                model_tts.train()
                x, m, dur, x_lens, mel_lens = x.to(device), m.to(device), dur.to(device),\
                                                     x_lens.to(device), mel_lens.to(device)

                m1_hat, m2_hat, dur_hat = model_tts(x, m, dur, mel_lens)

                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)

                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1),
                                        x_lens)

                tts_s_loss = m1_loss + m2_loss + 0.1 * dur_loss
                optimizer_tts.zero_grad()
                # backward/step are deferred: tts_s_loss enters the combined
                # loss inside dual_transform
                torch.nn.utils.clip_grad_norm_(model_tts.parameters(),
                                               hp.tts_clip_grad_norm)
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model_tts.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                # pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg_tts = f'| TTS MODEL (supervised training): '\
                      f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward',
                                    self.paths,
                                    model_tts,
                                    optimizer_tts,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.forward_plot_every == 0:

                    self.generate_plots(model_tts, tts_session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/batch_size', tts_session.bs,
                                       model_tts.get_step())
                self.writer.add_scalar('Params/learning_rate', tts_session.lr,
                                       model_tts.get_step())

                stream(msg_tts)

            for step, inputs in enumerate(asr_session.train_set):

                optimizer_asr.zero_grad()

                model_asr.to(device)
                for k, v in inputs.items():
                    if isinstance(v, torch.Tensor):
                        inputs[k] = v.to(device)
                model_asr.train()
                outputs = model_asr(**inputs)
                asr_s_loss = outputs["loss"] if isinstance(
                    outputs, dict) else outputs[0]
                # asr_s_loss = asr_s_loss.mean()

                msg_asr = f'| ASR MODEL (supervised training) : '\
                          f'| Epoch: {e}/{epochs} ({step}/{len(asr_session.train_set)}) | Loss ASR: {asr_s_loss:#.4} '\
                          f' ||||||||||||||||||||||'

                stream(msg_asr)

            m_val_loss, dur_val_loss = self.evaluate(model_tts,
                                                     tts_session.val_set)
            eval_tts_msg = f'| TTS MODEL (supervised eval): '\
                        f'| Epoch: {e}/{epochs} | Val Loss: {m_val_loss:#.4} ' \
                        f'| Dur Val Loss: {dur_val_loss:#.4} '

            stream(eval_tts_msg)
            tts_eval_loss = m_val_loss + dur_val_loss

            # ASR eval supervised
            print('\nEvaluating ASR model ...')
            asr_eval_loss = 0
            eval_wer = 0

            # enumerate from 1 so the loss/WER averaging by `step` below is correct
            for step, inputs in enumerate(asr_session.test_set, 1):
                asr_eval_loss_i, logits_a, labels_a = asr_trainer.prediction_step(
                    model_asr, inputs, False)
                asr_eval_loss += asr_eval_loss_i
                logits_a.to('cpu')
                eval_wer_i = asr_trainer.compute_metrics(
                    EvalPrediction(predictions=logits_a, label_ids=labels_a))
                eval_wer += eval_wer_i['wer']
            eval_wer = eval_wer / step
            asr_eval_loss = asr_eval_loss / step

            msg_asr_eval = f'| ASR MODEL (supervised eval) : Epoch {e}/{epochs} | Loss ASR: {asr_eval_loss:#.4} | WER: {eval_wer} |||||||||||||||||||||||||||||||||||||||||||||||||||||'
            stream(msg_asr_eval)

            # dual transformation loop
            tts_u_loss, asr_u_loss = self.dual_transform(
                model_tts, model_asr, optimizer_tts, optimizer_asr,
                asr_session.test_set, m_loss_avg, dur_loss_avg, device,
                asr_current_step, e, epochs, duration_avg, total_iters,
                tts_s_loss, asr_s_loss, tts_session.lr, tts_session.path)
            step += 1
            asr_path = 'checkpoint-27364'
            modelasr_folder = './checkpoints/sme_speech_tts.asr_forward/'
            new_check = modelasr_folder + asr_path
            os.makedirs(new_check, exist_ok=True)

            save_checkpoint('forward',
                            self.paths,
                            model_tts,
                            optimizer_tts,
                            is_silent=True)

            if "logs" not in asr_trainer_state:
                asr_trainer_state['logs'] = []
            asr_trainer_state['logs'].append({
                'step':
                step,
                'epoch':
                e,
                'asr_s_loss':
                int(asr_s_loss),
                'asr_u_loss':
                int(asr_u_loss),
                'tts_s_loss':
                int(tts_s_loss),
                'tts_u_loss':
                int(tts_u_loss),
                'tts_eval_loss':
                int(tts_eval_loss),
                'asr_eval_loss':
                int(asr_eval_loss),
                'eval_wer':
                eval_wer
            })

            with open(f'{modelasr_folder + asr_path}/dt_trainer_state.json',
                      'w') as f:
                json.dump(asr_trainer_state, f)

            model_asr.save_pretrained(new_check)

            torch.save(optimizer_asr.state_dict(), f'{new_check}/optimizer.pt')

            print("Exiting due to cuda OOM!")
            exit(11)
Example #9
    def train_session(self, model: Tacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        model.r = session.r
        simple_table([(f'Steps with r={session.r}',
                       str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Outputs/Step (r)', model.r)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens,
                    mel_lens) in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                x, m = x.to(device), m.to(device)

                m1_hat, m2_hat, attention = model(x, m)

                m1_loss = F.l1_loss(m1_hat, m)
                m2_loss = F.l1_loss(m2_hat, m)
                loss = m1_loss + m2_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               hp.tts_clip_grad_norm)
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.tts_checkpoint_every == 0:
                    ckpt_name = f'taco_step{k}K'
                    save_checkpoint('tts',
                                    self.paths,
                                    model,
                                    optimizer,
                                    name=ckpt_name,
                                    is_silent=True)

                if step % hp.tts_plot_every == 0:
                    self.generate_plots(model, session)

                _, att_score = attention_score(attention, mel_lens)
                att_score = torch.mean(att_score)
                self.writer.add_scalar('Attention_Score/train', att_score,
                                       model.get_step())
                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/reduction_factor', session.r,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss, val_att_score = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            self.writer.add_scalar('Attention_Score/val', val_att_score,
                                   model.get_step())
            save_checkpoint('tts',
                            self.paths,
                            model,
                            optimizer,
                            is_silent=True)

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
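
`attention_score` above is assumed to rate alignment quality; a hedged stand-in that scores only attention sharpness (mean peak weight per valid, unpadded decoder step):

import torch

def attention_sharpness(att, mel_lens):
    # att: (B, T_mel, T_text) attention weights; mel_lens: (B,) valid lengths.
    # Returns the mean peak attention weight per valid decoder step (B,);
    # higher values mean sharper, more confident alignments.
    t_mel = att.size(1)
    mask = torch.arange(t_mel, device=att.device)[None, :] < mel_lens[:, None]
    peaks = att.max(dim=2).values            # (B, T_mel)
    return (peaks * mask).sum(dim=1) / mel_lens.clamp(min=1)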
Example #10
def voc_train_loop(model, loss_func, optimiser, train_set, eval_set, test_set,
                   lr, total_steps, device, hp):

    for p in optimiser.param_groups:
        p['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1
    trg = None
    patience = hp.patience
    min_val_loss = np.inf

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.
        running_pase_reg_loss = 0.
        running_nll_loss = 0.
        pase_reg_loss = None

        for i, (x, y, xm) in enumerate(train_set, 1):
            if len(xm) == 2:
                # expand short and long term m
                xm, xlm = xm
                xm, xlm = xm.to(device), xlm.to(device)
                xm = xm.unsqueeze(1)
                xlm = xlm.unsqueeze(1)
            else:
                xm = xm.to(device).unsqueeze(1)
                xlm = None
            x, y = x.to(device), y.to(device)

            if hp.pase_ft:
                m = hp.pase(xm, xlm)
            else:
                with torch.no_grad():
                    m = hp.pase(xm, xlm)
            if hp.pase_lambda > 0:
                raise NotImplementedError
                # use an MSE loss weighted with pase_lambda
                # that ties the distorted PASE output
                # to the clean PASE soft-labels (loaded as m_clean)
                pase_reg_loss = hp.pase_lambda * F.mse_loss(m, m_clean)

            y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            running_nll_loss += loss.item()

            optimiser.zero_grad()
            if pase_reg_loss is not None:
                total_loss = loss + pase_reg_loss
                running_pase_reg_loss += pase_reg_loss.item()
                pase_reg_avg_loss = running_pase_reg_loss / i
            else:
                total_loss = loss
            total_loss.backward()
            optimiser.step()
            running_loss += total_loss.item()

            speed = i / (time.time() - start)
            nll_avg_loss = running_nll_loss / i
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_write_every == 0:
                hp.writer.add_scalar('train/nll', nll_avg_loss, step)
                if pase_reg_loss is not None:
                    hp.writer.add_scalar('train/pase_reg_loss',
                                         pase_reg_avg_loss, step)

            if step % hp.voc_checkpoint_every == 0:
                if eval_set is not None:
                    print('Validating')
                    # validate the model
                    val_loss = voc_eval_loop(model, loss_func, eval_set,
                                             device)
                    if val_loss <= min_val_loss:
                        patience = hp.patience
                        print('Val loss improved: {:.4f} -> '
                              '{:.4f}'.format(min_val_loss, val_loss))
                        min_val_loss = val_loss
                    else:
                        patience -= 1
                        print('Val loss did not improve. Patience '
                              '{}/{}'.format(patience, hp.patience))
                        if patience == 0:
                            print('Out of patience. Breaking the loop')
                            break
                    # set to train mode again
                    model.train()
                # generate some test samples
                gen_genh_testset(model,
                                 test_set,
                                 hp.voc_gen_at_checkpoint,
                                 hp.voc_gen_batched,
                                 hp.voc_target,
                                 hp.voc_overlap,
                                 paths.voc_output,
                                 hp=hp,
                                 device=device)
                model.checkpoint(paths.voc_checkpoints)
                if hp.pase_ft:
                    hp.pase.train()
                    hp.pase.save(paths.voc_checkpoints, step)

            if pase_reg_loss is None:
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | NLLoss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            else:
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Total Loss: {avg_loss:.4f} | NLLoss: {nll_avg_loss:.4f} | PASE reg loss: {pase_reg_avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        model.save(paths.voc_latest_weights)
        model.log(paths.voc_log, msg)
        print(' ')
Example #11
def voc_eval_loop(model, loss_func, eval_set, device):

    total_iters = len(eval_set)
    trg = None
    model.eval()

    with torch.no_grad():
        start = time.time()
        running_loss = 0.
        running_pase_reg_loss = 0.
        running_nll_loss = 0.
        pase_reg_loss = None

        for i, (x, y, xm) in enumerate(eval_set, 1):
            if len(xm) == 2:
                # expand short and long term m
                xm, xlm = xm
                xm, xlm = xm.to(device), xlm.to(device)
                xm = xm.unsqueeze(1)
                xlm = xlm.unsqueeze(1)
            else:
                xm = xm.to(device).unsqueeze(1)
                xlm = None
            x, y = x.to(device), y.to(device)

            m = hp.pase(xm, xlm)
            if hp.pase_lambda > 0:
                raise NotImplementedError
                # use an MSE loss weighted with pase_lambda
                # that ties the distorted PASE output
                # to the clean PASE soft-labels (loaded as m_clean)
                pase_reg_loss = hp.pase_lambda * F.mse_loss(m, m_clean)

            y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            running_nll_loss += loss.item()

            if pase_reg_loss is not None:
                total_loss = loss + pase_reg_loss
                running_pase_reg_loss += pase_reg_loss.item()
                pase_reg_avg_loss = running_pase_reg_loss / i
            else:
                total_loss = loss

            running_loss += total_loss.item()

            speed = i / (time.time() - start)
            nll_avg_loss = running_nll_loss / i
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if pase_reg_loss is None:
                msg = f'| EVAL {i}/{total_iters} | NLLoss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            else:
                msg = f'| EVAL {i}/{total_iters} | Total Loss: {avg_loss:.4f} | NLLoss: {nll_avg_loss:.4f} | PASE reg loss: {pase_reg_avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        hp.writer.add_scalar('eval/nll', nll_avg_loss, step)
        if pase_reg_loss is not None:
            hp.writer.add_scalar('eval/pase_reg_loss', pase_reg_avg_loss, step)

        print(' ')
        return avg_loss
Example #12
    def train_session(self, model: ForwardTacotron, optimizer: Optimizer,
                      session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                batch = to_device(batch, device=device)
                start = time.time()
                model.train()

                pitch_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['pitch_zoneout']
                energy_zoneout_mask = torch.rand(
                    batch['x'].size()) > self.train_cfg['energy_zoneout']

                pitch_target = batch['pitch'].detach().clone()
                energy_target = batch['energy'].detach().clone()
                batch['pitch'] = batch['pitch'] * pitch_zoneout_mask.to(
                    device).float()
                batch['energy'] = batch['energy'] * energy_zoneout_mask.to(
                    device).float()

                pred = model(batch)

                m1_loss = self.l1_loss(pred['mel'], batch['mel'],
                                       batch['mel_len'])
                m2_loss = self.l1_loss(pred['mel_post'], batch['mel'],
                                       batch['mel_len'])

                dur_loss = self.l1_loss(pred['dur'].unsqueeze(1),
                                        batch['dur'].unsqueeze(1),
                                        batch['x_len'])
                pitch_loss = self.l1_loss(pred['pitch'],
                                          pitch_target.unsqueeze(1),
                                          batch['x_len'])
                energy_loss = self.l1_loss(pred['energy'],
                                           energy_target.unsqueeze(1),
                                           batch['x_len'])

                loss = m1_loss + m2_loss \
                       + self.train_cfg['dur_loss_factor'] * dur_loss \
                       + self.train_cfg['pitch_loss_factor'] * pitch_loss \
                       + self.train_cfg['energy_loss_factor'] * energy_loss

                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()

                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.forward_checkpoints /
                                    f'forward_step{k}k.pt')

                if step % self.train_cfg['plot_every'] == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss,
                                       model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss,
                                       model.get_step())
                self.writer.add_scalar('Energy_Loss/train', energy_loss,
                                       model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss,
                                       model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_out = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', val_out['mel_loss'],
                                   model.get_step())
            self.writer.add_scalar('Duration_Loss/val', val_out['dur_loss'],
                                   model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', val_out['pitch_loss'],
                                   model.get_step())
            self.writer.add_scalar('Energy_Loss/val', val_out['energy_loss'],
                                   model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.forward_checkpoints /
                            'latest_model.pt')

            m_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
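
`self.l1_loss` in this trainer is assumed to be a length-masked L1 over padded (B, C, T) batches; a minimal sketch under that assumption:

import torch

def masked_l1_loss(y_hat, y, lens):
    # L1 loss that ignores padded time steps so padding does not dilute it.
    # y_hat, y: (B, C, T); lens: (B,) valid lengths along the last axis.
    max_len = y.size(-1)
    mask = torch.arange(max_len, device=lens.device)[None, :] < lens[:, None]
    mask = mask[:, None, :].expand_as(y).float()
    return (torch.abs(y_hat - y) * mask).sum() / mask.sum().clamp(min=1.)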
Example #13
def voc_train_loop(model, loss_func, optimizer, train_set, test_set, init_lr, final_lr, total_steps):

    total_iters = len(train_set)
    epochs = int((total_steps - model.get_step()) // total_iters + 1)

    if hp.amp:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    torch.backends.cudnn.benchmark = True

    for e in range(1, epochs + 1):

        adjust_learning_rate(optimizer, e, epochs, init_lr, final_lr)

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.cuda(), m.cuda(), y.cuda()

            y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()

            if hp.amp:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1)
            else:
                loss.backward()
                grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

            optimizer.step()
            running_loss += loss.item()

            speed = i / (time.time() - start)
            avg_loss = running_loss / i

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                model.eval()
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint, hp.voc_gen_batched,
                            hp.voc_target, hp.voc_overlap, paths.voc_output)
                model.checkpoint(paths.voc_checkpoints)
                model.train()

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        model.save(paths.voc_latest_weights)
        model.log(paths.voc_log, msg)
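
`adjust_learning_rate` is not shown; a plausible sketch, assuming a simple linear anneal from init_lr to final_lr over the planned epochs (the original schedule may differ):

def adjust_learning_rate(optimizer, epoch, total_epochs, init_lr, final_lr):
    # Linearly interpolate the learning rate for this epoch and apply it.
    t = (epoch - 1) / max(total_epochs - 1, 1)
    lr = init_lr + (final_lr - init_lr) * t
    for g in optimizer.param_groups:
        g['lr'] = lr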
Example #14
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, init_lr, final_lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # for g in optimizer.param_groups: g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        adjust_learning_rate(optimizer, e, epochs, init_lr,
                             final_lr)  # anneal from the initial to the final learning rate (Begee)
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(
                device)  # x/y: (Batch, sub_bands, T)

            #########################  MultiBand-WaveRNN   #########################
            if hp.voc_multiband:
                y0 = y[:, 0, :].squeeze(0).unsqueeze(
                    -1)  # y0/y1/y2/y3: (Batch, T, 1)
                y1 = y[:, 1, :].squeeze(0).unsqueeze(-1)
                y2 = y[:, 2, :].squeeze(0).unsqueeze(-1)
                y3 = y[:, 3, :].squeeze(0).unsqueeze(-1)

                y_hat = model(x, m)  # (Batch, T, num_classes, sub_bands)

                if model.mode == 'RAW':
                    y_hat0 = y_hat[:, :, :, 0].transpose(1, 2).unsqueeze(
                        -1)  # (Batch, num_classes, T, 1)
                    y_hat1 = y_hat[:, :, :, 1].transpose(1, 2).unsqueeze(-1)
                    y_hat2 = y_hat[:, :, :, 2].transpose(1, 2).unsqueeze(-1)
                    y_hat3 = y_hat[:, :, :, 3].transpose(1, 2).unsqueeze(-1)

                elif model.mode == 'MOL':
                    y0 = y0.float()
                    y1 = y1.float()
                    y2 = y2.float()
                    y3 = y3.float()

                loss = loss_func(y_hat0, y0) + loss_func(
                    y_hat1, y1) + loss_func(y_hat2, y2) + loss_func(
                        y_hat3, y3)

            #########################  MultiBand-WaveRNN   #########################

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm).cpu()
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
Example #15
    def train_session(self, model: WaveRNN, optimizer: Optimizer,
                      session: VocSession, train_gta: bool) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Sequence Length', self.train_cfg['seq_len']),
                      ('GTA Training', train_gta)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters

        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                batch = to_device(batch, device=device)
                x, y = batch['x'], batch['y']
                y_hat = model(x, batch['mel'])
                if model.mode == 'RAW':
                    y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
                elif model.mode == 'MOL':
                    y = batch['y'].float()
                y = y.unsqueeze(-1)

                loss = self.loss_func(y_hat, y)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['gen_samples_every'] == 0:
                    stream(msg + 'generating samples...')
                    gen_result = self.generate_samples(model, session)
                    if gen_result is not None:
                        mel_loss, gen_wav = gen_result
                        self.writer.add_scalar('Loss/generated_mel_l1',
                                               mel_loss, model.get_step())
                        self.track_top_models(mel_loss, gen_wav, model)

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.voc_checkpoints /
                                    f'wavernn_step{k}k.pt')

                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.voc_checkpoints /
                            'latest_model.pt')

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
Example #16
    def train_session(self, model: ForwardTacotron,
                      optimizer: Optimizer, session: TTSSession) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([('Steps', str(training_steps // 1000) + 'k Steps'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr)])

        for g in optimizer.param_groups:
            g['lr'] = session.lr

        m_loss_avg = Averager()
        dur_loss_avg = Averager()
        duration_avg = Averager()
        pitch_loss_avg = Averager()
        device = next(model.parameters()).device  # use same device as model parameters
        for e in range(1, epochs + 1):
            for i, (x, m, ids, x_lens, mel_lens, dur, pitch, puncts) in enumerate(
                session.train_set, 1
            ):
                start = time.time()
                model.train()
                x, m, dur, x_lens, mel_lens, pitch, puncts = (
                    x.to(device),
                    m.to(device),
                    dur.to(device),
                    x_lens.to(device),
                    mel_lens.to(device),
                    pitch.to(device),
                    puncts.to(device),
                )
                # print("*" * 20)
                # print(x)
                # print("*" * 20)
                m1_hat, m2_hat, dur_hat, pitch_hat = model(
                    x, m, dur, mel_lens, pitch, puncts
                )
                m1_loss = self.l1_loss(m1_hat, m, mel_lens)
                m2_loss = self.l1_loss(m2_hat, m, mel_lens)
                dur_loss = self.l1_loss(dur_hat.unsqueeze(1), dur.unsqueeze(1), x_lens)
                pitch_loss = self.l1_loss(pitch_hat, pitch.unsqueeze(1), x_lens)
                loss = m1_loss + m2_loss + 0.3 * dur_loss + 0.1 * pitch_loss
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), hp.tts_clip_grad_norm)
                optimizer.step()
                m_loss_avg.add(m1_loss.item() + m2_loss.item())
                dur_loss_avg.add(dur_loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                pitch_loss_avg.add(pitch_loss.item())

                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Mel Loss: {m_loss_avg.get():#.4} ' \
                      f'| Dur Loss: {dur_loss_avg.get():#.4} | Pitch Loss: {pitch_loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % hp.forward_checkpoint_every == 0:
                    ckpt_name = f'forward_step{k}K'
                    save_checkpoint('forward', self.paths, model, optimizer,
                                    name=ckpt_name, is_silent=True)

                if step % hp.forward_plot_every == 0:
                    self.generate_plots(model, session)

                self.writer.add_scalar('Mel_Loss/train', m1_loss + m2_loss, model.get_step())
                self.writer.add_scalar('Pitch_Loss/train', pitch_loss, model.get_step())
                self.writer.add_scalar('Duration_Loss/train', dur_loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs, model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr, model.get_step())

                stream(msg)

            m_val_loss, dur_val_loss, pitch_val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Mel_Loss/val', m_val_loss, model.get_step())
            self.writer.add_scalar('Duration_Loss/val', dur_val_loss, model.get_step())
            self.writer.add_scalar('Pitch_Loss/val', pitch_val_loss, model.get_step())
            save_checkpoint('forward', self.paths, model, optimizer, is_silent=True)

            m_loss_avg.reset()
            duration_avg.reset()
            pitch_loss_avg.reset()
            print(' ')
Example #17
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # set learning rate
    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    total_number_of_batches = len(train_set)

    writer = SummaryWriter("runs/{0}-{1}".format(
        model_name_prefix,
        datetime.now().strftime("%Y%m%d-%H%M%S")))
    scheduler = StepLR(optimizer, step_size=1, gamma=0.983)

    for e in range(EPOCH, epochs + 1):

        start = time.time()
        running_loss = 0.
        avg_loss = 0

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            # Write to tensorboard per batch
            writer.add_scalar('Epoch loss', loss.item(),
                              e * total_number_of_batches + i)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)
        """
        ####################### Testing ############################
        torch.cuda.empty_cache()
        loss_test = 0
        for _, (x_test, y_test, m_test) in enumerate(test_set, 1):
            x_test, m_test, y_test = x_test.to(device), m_test.to(device), y_test.to(device)
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                raise RuntimeError("Unsupported")
            else:
                y_test_hat = model(x_test, m_test)

            if model.mode == 'RAW':
                y_test_hat = y_test_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y_test = y_test.float()

            y_test = y_test.unsqueeze(-1)

            loss_test += loss_func(y_test_hat, y_test).item()
        avg_loss_test = loss_test / len(test_set)
        msg = f'| Epoch: {e}/{epochs} | Test-Loss: {loss_test:.4f} | Test-AvgLoss: {avg_loss_test:.4f} | '
        stream("\n")
        stream(msg)

        writer.add_scalar('Test loss', loss_test, e)
        writer.add_scalar('Average test loss', avg_loss_test, e)
        ############################################################
        """

        # Write to tensorboard per epoch
        writer.add_scalar('Running loss', running_loss, e)
        writer.add_scalar('Average loss', avg_loss, e)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc',
                        paths,
                        model,
                        optimizer,
                        name="{0}-epoch-{1}-loss-{2}".format(
                            model_name_prefix, e, avg_loss),
                        is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
        scheduler.step()
        print('Epoch:', e, 'LR:', scheduler.get_lr())
Example #18
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUs using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
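
`data_parallel_workaround` replicates the model across visible GPUs by hand (historically to dodge a Python/DataParallel bug). A simplified sketch of the scatter/apply/gather pattern it is assumed to follow:

import torch

def data_parallel_workaround(model, *inputs):
    # Scatter inputs across all visible GPUs, run a model replica on each
    # shard, then gather the outputs back onto the first device.
    device_ids = list(range(torch.cuda.device_count()))
    replicas = torch.nn.parallel.replicate(model, device_ids)
    scattered = torch.nn.parallel.scatter(inputs, device_ids)
    replicas = replicas[:len(scattered)]
    outputs = torch.nn.parallel.parallel_apply(replicas, scattered)
    return torch.nn.parallel.gather(outputs, device_ids[0])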
Example #19
    return basename


if __name__ == '__main__':
    wav_paths = list(Path(hp.wav_dir).rglob('*.wav'))
    print(f'\n{len(wav_paths)} wav files found in "{hp.wav_dir}"\n')

    if len(wav_paths) == 0:
        print('Please point wav_dir in hparams.py to your dataset.')

    else:
        os.makedirs(hp.data_dir, exist_ok=True)
        os.makedirs(os.path.join(hp.data_dir, 'mel'), exist_ok=True)
        os.makedirs(os.path.join(hp.data_dir, 'quant'), exist_ok=True)

        pool = Pool(processes=cpu_count() - 1)
        basenames = []

        for i, basename in enumerate(
                pool.imap_unordered(process_wav, wav_paths), 1):
            basenames.append(basename)
            bar = progbar(i, len(wav_paths))
            message = f'{bar} {i}/{len(wav_paths)} '
            stream(message)

        with open(hp.data_dir + '/basenames.pkl', 'wb') as f:
            pickle.dump(basenames, f)

        print('\n\nCompleted.\n')
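
`process_wav`, the worker mapped over the wav paths, appears here only as its final `return basename`. A hypothetical sketch consistent with the `mel/` and `quant/` directories created above; `convert_file` stands in for the actual feature extraction:

import os
import numpy as np

def process_wav(path):
    # Compute features for one wav and save them under hp.data_dir,
    # returning the basename so the parent process can track progress.
    basename = os.path.splitext(os.path.basename(path))[0]
    mel, quant = convert_file(path)  # hypothetical mel + quantised-audio extractor
    np.save(os.path.join(hp.data_dir, 'mel', f'{basename}.npy'), mel)
    np.save(os.path.join(hp.data_dir, 'quant', f'{basename}.npy'), quant)
    return basename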