示例#1
0
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
示例#2
0
 def train(self,
           model: WaveRNN,
           optimizer: Optimizer,
           train_gta=False) -> None:
     voc_schedule = self.train_cfg['schedule']
     voc_schedule = parse_schedule(voc_schedule)
     for i, session_params in enumerate(voc_schedule, 1):
         lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set, val_set_samples = get_vocoder_datasets(
                 path=self.paths.data,
                 batch_size=bs,
                 train_gta=train_gta,
                 max_mel_len=self.train_cfg['max_mel_len'],
                 hop_length=self.dsp.hop_length,
                 voc_pad=model.pad,
                 voc_seq_len=self.train_cfg['seq_len'],
                 voc_mode=self.dsp.voc_mode,
                 bits=self.dsp.bits,
                 num_gen_samples=self.train_cfg['num_gen_samples'])
             session = VocSession(index=i,
                                  lr=lr,
                                  max_step=max_step,
                                  bs=bs,
                                  train_set=train_set,
                                  val_set=val_set,
                                  val_set_samples=val_set_samples)
             self.train_session(model, optimizer, session, train_gta)
示例#3
0
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {wav.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")


    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
示例#4
0
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x,
                 save_path / '%sk_steps_%s_target.wav' % (repr1(k), repr1(i)))

        batch_str = 'gen_batched_target%s_overlap%s' % (
            repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / '%sk_steps_%s_%s.wav' %
                       (repr1(k), repr1(i), repr1(batch_str)))

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
示例#5
0
 def train(self, model: WaveRNN, optimizer: Optimizer, train_gta=False) -> None:
     for i, session_params in enumerate(hp.voc_schedule, 1):
         lr, max_step, bs = session_params
         if model.get_step() < max_step:
             train_set, val_set, val_set_samples = get_vocoder_datasets(
                 path=self.paths.data, batch_size=bs, train_gta=train_gta)
             session = VocSession(
                 index=i, lr=lr, max_step=max_step,
                 bs=bs, train_set=train_set, val_set=val_set,
                 val_set_samples=val_set_samples)
             self.train_session(model, optimizer, session, train_gta)
示例#6
0
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):

    k = model.get_step() // 1000
    os.makedirs(save_path/'test', exist_ok=True)
    for file_name in tqdm(os.listdir(load_path)):
        if file_name.endswith('.npy'):
            mel = np.load(os.path.join(load_path, file_name))
            mel = torch.tensor(mel).unsqueeze(0)

            batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
            save_str = save_path/f'test/{file_name}__{k}k_steps_{batch_str}.wav'

            _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
示例#7
0
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):

    k = model.get_step() // 1000

    mypqmf = PQMF()
    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        if hp.voc_multiband:

            x = x[0].numpy()

            bits = 16 if hp.voc_mode == 'MOL' else hp.bits

            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2 ** bits, from_labels=True)
            else:
                x = label_2_float(x, bits)

            source = mypqmf.synthesis(
                torch.tensor(x, dtype=torch.float).unsqueeze(
                    0)).numpy()  # (1, sub_band, T//sub_band) -> (1, 1, T)
            source = source.squeeze() # (T,)
            save_wav(source,save_path/f'{k}k_steps_{i}_target.wav')
            # np.save(save_path/f'{k}k_steps_{i}_target.npy', x, allow_pickle=False)

        else:
            x = x[0].numpy()

            bits = 16 if hp.voc_mode == 'MOL' else hp.bits

            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = label_2_float(x, bits)

            save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')   # 返回PQMF后
        _ = model.generate(m, save_str,batched, target, overlap, hp.mu_law)
示例#8
0
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(
            wav, save_path / '__%s__%sk_steps_target.wav' %
            (repr1(file_name), repr1(k)))
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!' %
                (repr1(wav.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0,1] but was instead [%s, %s]'
                % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!' %
                         (repr1(suffix)))

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = 'gen_batched_target%s_overlap%s' % (
        repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
    save_str = save_path / '__%s__%sk_steps_%s.wav' % (
        repr1(file_name), repr1(k), repr1(batch_str))

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
示例#9
0
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path / f'{prefix}{file_name}.target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")

    m = torch.tensor(mel).unsqueeze(0)

    save_str_wavernn = save_path / f'{prefix}{file_name}.wavernn.wav'
    save_str_griffinlim = save_path / f'{prefix}{file_name}.griffinlim.wav'

    wav = reconstruct_waveform(mel, n_iter=32)
    save_wav(wav, save_str_griffinlim)

    _ = model.generate(m, save_str_wavernn, batched, target, overlap,
                       hp.mu_law)
示例#10
0
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):
    '''
    :param model:
    :param test_set: 测试集,包含了mel或sp+f0特征,以及原音频的载入文件
    :param samples: 要生成的样本量,也就是要生成的音频个数
    :param batched: 在这个脚本中batched为True
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: 生成的音频文件
    '''

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path / f'{k}k_steps_{i}_target.wav')  # 保存原音频文件

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
示例#11
0
def main():

    # Parse Arguments
    parser = argparse.ArgumentParser(description='Train WaveRNN Vocoder')
    parser.add_argument('--lr',
                        '-l',
                        type=float,
                        help='[float] override hparams.py learning rate')
    parser.add_argument('--batch_size',
                        '-b',
                        type=int,
                        help='[int] override hparams.py batch size')
    parser.add_argument('--force_train',
                        '-f',
                        action='store_true',
                        help='Forces the model to train past total steps')
    parser.add_argument('--gta',
                        '-g',
                        action='store_true',
                        help='train wavernn on GTA features')
    parser.add_argument(
        '--force_cpu',
        '-c',
        action='store_true',
        help='Forces CPU-only training, even when in CUDA capable environment')
    parser.add_argument('--hp_file',
                        metavar='FILE',
                        default='hparams.py',
                        help='The file to use for the hyperparameters')
    args = parser.parse_args()

    hp.configure(args.hp_file)  # load hparams from file
    if args.lr is None:
        args.lr = hp.voc_lr
    if args.batch_size is None:
        args.batch_size = hp.voc_batch_size

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    batch_size = args.batch_size
    force_train = args.force_train
    train_gta = args.gta
    lr = args.lr

    if not args.force_cpu and torch.cuda.is_available():
        device = torch.device('cuda')
        if batch_size % torch.cuda.device_count() != 0:
            raise ValueError(
                '`batch_size` must be evenly divisible by n_gpus!')
    else:
        device = torch.device('cpu')
    print('Using device:', device)

    print('\nInitialising Model...\n')

    # Instantiate WaveRNN Model
    voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                        fc_dims=hp.voc_fc_dims,
                        bits=hp.bits,
                        pad=hp.voc_pad,
                        upsample_factors=hp.voc_upsample_factors,
                        feat_dims=hp.num_mels,
                        compute_dims=hp.voc_compute_dims,
                        res_out_dims=hp.voc_res_out_dims,
                        res_blocks=hp.voc_res_blocks,
                        hop_length=hp.hop_length,
                        sample_rate=hp.sample_rate,
                        mode=hp.voc_mode).to(device)

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    optimizer = optim.Adam(voc_model.parameters())
    restore_checkpoint('voc',
                       paths,
                       voc_model,
                       optimizer,
                       create_if_missing=True)

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size), ('LR', lr),
        ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)
    ])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(paths, voc_model, loss_func, optimizer, train_set, test_set,
                   lr, total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
示例#12
0
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
示例#13
0
                         postnet_K=hp.tts_postnet_K,
                         num_highways=hp.tts_num_highways,
                         dropout=hp.tts_dropout,
                         stop_threshold=hp.tts_stop_threshold).to(device)

    tts_model.load('quick_start/tts_weights/latest_weights.pyt')

    if input_text:
        inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
    else:
        with open('sentences.txt') as f:
            inputs = [
                text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f
            ]

    voc_k = voc_model.get_step() // 1000
    tts_k = tts_model.get_step() // 1000

    r = tts_model.r

    simple_table([('WaveRNN', str(voc_k) + 'k'),
                  (f'Tacotron(r={r})', str(tts_k) + 'k'),
                  ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', 11_000 if batched else 'N/A'),
                  ('Overlap Samples', 550 if batched else 'N/A')])

    for i, x in enumerate(inputs, 1):

        print(f'\n| Generating {i}/{len(inputs)}')
        _, m, attention = tts_model.generate(x)
示例#14
0
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, init_lr, final_lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # for g in optimizer.param_groups: g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    for e in range(1, epochs + 1):

        adjust_learning_rate(optimizer, e, epochs, init_lr,
                             final_lr)  # 初始学习率与最终学习率-Begee
        start = time.time()
        running_loss = 0.

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(
                device)  # x/y: (Batch, sub_bands, T)

            #########################  MultiBand-WaveRNN   #########################
            if hp.voc_multiband:
                y0 = y[:, 0, :].squeeze(0).unsqueeze(
                    -1)  # y0/y1/y2/y3: (Batch, T, 1)
                y1 = y[:, 1, :].squeeze(0).unsqueeze(-1)
                y2 = y[:, 2, :].squeeze(0).unsqueeze(-1)
                y3 = y[:, 3, :].squeeze(0).unsqueeze(-1)

                y_hat = model(x, m)  # (Batch, T, num_classes, sub_bands)

                if model.mode == 'RAW':
                    y_hat0 = y_hat[:, :, :, 0].transpose(1, 2).unsqueeze(
                        -1)  # (Batch, num_classes, T, 1)
                    y_hat1 = y_hat[:, :, :, 1].transpose(1, 2).unsqueeze(-1)
                    y_hat2 = y_hat[:, :, :, 2].transpose(1, 2).unsqueeze(-1)
                    y_hat3 = y_hat[:, :, :, 3].transpose(1, 2).unsqueeze(-1)

                elif model.mode == 'MOL':
                    y0 = y0.float()
                    y1 = y1.float()
                    y2 = y2.float()
                    y3 = y3.float()

                loss = loss_func(y_hat0, y0) + loss_func(
                    y_hat1, y1) + loss_func(y_hat2, y2) + loss_func(
                        y_hat3, y3)

            #########################  MultiBand-WaveRNN   #########################

            optimizer.zero_grad()
            loss.backward()

            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm).cpu()
                if np.isnan(grad_norm):
                    print('grad_norm was NaN!')
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            if step % hp.voc_checkpoint_every == 0:
                gen_testset(model, test_set, hp.voc_gen_at_checkpoint,
                            hp.voc_gen_batched, hp.voc_target, hp.voc_overlap,
                            paths.voc_output)
                ckpt_name = f'wave_step{k}K'
                save_checkpoint('voc',
                                paths,
                                model,
                                optimizer,
                                name=ckpt_name,
                                is_silent=True)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc', paths, model, optimizer, is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
示例#15
0
    def train_session(self, model: WaveRNN, optimizer: Optimizer,
                      session: VocSession, train_gta: bool) -> None:
        current_step = model.get_step()
        training_steps = session.max_step - current_step
        total_iters = len(session.train_set)
        epochs = training_steps // total_iters + 1
        simple_table([(f'Steps ', str(training_steps // 1000) + 'k'),
                      ('Batch Size', session.bs),
                      ('Learning Rate', session.lr),
                      ('Sequence Length', self.train_cfg['seq_len']),
                      ('GTA Training', train_gta)])
        for g in optimizer.param_groups:
            g['lr'] = session.lr

        loss_avg = Averager()
        duration_avg = Averager()
        device = next(
            model.parameters()).device  # use same device as model parameters

        for e in range(1, epochs + 1):
            for i, batch in enumerate(session.train_set, 1):
                start = time.time()
                model.train()
                batch = to_device(batch, device=device)
                x, y = batch['x'], batch['y']
                y_hat = model(x, batch['mel'])
                if model.mode == 'RAW':
                    y_hat = y_hat.transpose(1, 2).unsqueeze(-1)
                elif model.mode == 'MOL':
                    y = batch['y'].float()
                y = y.unsqueeze(-1)

                loss = self.loss_func(y_hat, y)
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), self.train_cfg['clip_grad_norm'])
                optimizer.step()
                loss_avg.add(loss.item())
                step = model.get_step()
                k = step // 1000

                duration_avg.add(time.time() - start)
                speed = 1. / duration_avg.get()
                msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {loss_avg.get():#.4} ' \
                      f'| {speed:#.2} steps/s | Step: {k}k | '

                if step % self.train_cfg['gen_samples_every'] == 0:
                    stream(msg + 'generating samples...')
                    gen_result = self.generate_samples(model, session)
                    if gen_result is not None:
                        mel_loss, gen_wav = gen_result
                        self.writer.add_scalar('Loss/generated_mel_l1',
                                               mel_loss, model.get_step())
                        self.track_top_models(mel_loss, gen_wav, model)

                if step % self.train_cfg['checkpoint_every'] == 0:
                    save_checkpoint(model=model,
                                    optim=optimizer,
                                    config=self.config,
                                    path=self.paths.voc_checkpoints /
                                    f'wavernn_step{k}k.pt')

                self.writer.add_scalar('Loss/train', loss, model.get_step())
                self.writer.add_scalar('Params/batch_size', session.bs,
                                       model.get_step())
                self.writer.add_scalar('Params/learning_rate', session.lr,
                                       model.get_step())

                stream(msg)

            val_loss = self.evaluate(model, session.val_set)
            self.writer.add_scalar('Loss/val', val_loss, model.get_step())
            save_checkpoint(model=model,
                            optim=optimizer,
                            config=self.config,
                            path=self.paths.voc_checkpoints /
                            'latest_model.pt')

            loss_avg.reset()
            duration_avg.reset()
            print(' ')
示例#16
0
                        sample_rate=hp.sample_rate,
                        pad_val=hp.voc_pad_val,
                        mode=hp.voc_mode).cuda()

    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    voc_model.restore(paths.voc_latest_weights)

    optimizer = torch.optim.Adam(voc_model.parameters())

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size, train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([('Remaining', str((total_steps - voc_model.get_step())//1000) + 'k Steps'),
                  ('Batch Size', batch_size),
                  ('Initial learning rate', init_lr),
                  ('Final learnging rate', final_lr),
                  ('Sequence Len', hp.voc_seq_len),
                  ('GTA Train', train_gta)])

    loss_func = torch.nn.functional.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(voc_model, loss_func, optimizer, train_set, test_set, init_lr, final_lr, total_steps)

    print('Training Complete.')
    print('To continue training increase voc_total_steps in hparams.py or use --force_train')
示例#17
0
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    pad_val=hp.voc_pad_val,
                    mode=hp.voc_mode).cuda()

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    restore_path = args.weights if args.weights else paths.voc_latest_weights

    model.restore(restore_path)
    model.eval()
    if hp.amp:
        model, _ = amp.initialize(model, [], opt_level='O3')

    simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    k = model.get_step() // 1000

    for file_name in os.listdir(args.dir):
        if file_name.endswith('.npy'):
            mel = np.load(os.path.join(args.dir, file_name))
            mel = torch.tensor(mel).unsqueeze(0)

            batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
            save_str = f'{file_name}__{k}k_steps_{batch_str}.wav'

            model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
示例#18
0
    # Check to make sure the hop length is correctly factorised
    assert np.cumprod(hp.voc_upsample_factors)[-1] == hp.hop_length

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    voc_model.restore(paths.voc_latest_weights)

    optimiser = optim.Adam(voc_model.parameters())

    train_set, test_set = get_vocoder_datasets(paths.data, batch_size,
                                               train_gta)

    total_steps = 10_000_000 if force_train else hp.voc_total_steps

    simple_table([
        ('Remaining', str(
            (total_steps - voc_model.get_step()) // 1000) + 'k Steps'),
        ('Batch Size', batch_size), ('LR', lr),
        ('Sequence Len', hp.voc_seq_len), ('GTA Train', train_gta)
    ])

    loss_func = F.cross_entropy if voc_model.mode == 'RAW' else discretized_mix_logistic_loss

    voc_train_loop(voc_model, loss_func, optimiser, train_set, test_set, lr,
                   total_steps)

    print('Training Complete.')
    print(
        'To continue training increase voc_total_steps in hparams.py or use --force_train'
    )
示例#19
0
def voc_train_loop(paths: Paths, model: WaveRNN, loss_func, optimizer,
                   train_set, test_set, lr, total_steps):
    # Use same device as model parameters
    device = next(model.parameters()).device

    # set learning rate
    for g in optimizer.param_groups:
        g['lr'] = lr

    total_iters = len(train_set)
    epochs = (total_steps - model.get_step()) // total_iters + 1

    total_number_of_batches = len(train_set)

    writer = SummaryWriter("runs/{0}-{1}".format(
        model_name_prefix,
        datetime.now().strftime("%Y%m%d-%H%M%S")))
    scheduler = StepLR(optimizer, step_size=1, gamma=0.983)

    for e in range(EPOCH, epochs + 1):

        start = time.time()
        running_loss = 0.
        avg_loss = 0

        for i, (x, y, m) in enumerate(train_set, 1):
            x, m, y = x.to(device), m.to(device), y.to(device)

            # Parallelize model onto GPUS using workaround due to python bug
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                y_hat = data_parallel_workaround(model, x, m)
            else:
                y_hat = model(x, m)

            if model.mode == 'RAW':
                y_hat = y_hat.transpose(1, 2).unsqueeze(-1)

            elif model.mode == 'MOL':
                y = y.float()

            y = y.unsqueeze(-1)

            loss = loss_func(y_hat, y)

            optimizer.zero_grad()
            loss.backward()
            if hp.voc_clip_grad_norm is not None:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hp.voc_clip_grad_norm)
            optimizer.step()

            running_loss += loss.item()
            avg_loss = running_loss / i

            speed = i / (time.time() - start)

            step = model.get_step()
            k = step // 1000

            # Write to tensorboard per batch
            writer.add_scalar('Epoch loss', loss.item(),
                              e * total_number_of_batches + i)

            msg = f'| Epoch: {e}/{epochs} ({i}/{total_iters}) | Loss: {avg_loss:.4f} | {speed:.1f} steps/s | Step: {k}k | '
            stream(msg)
        """
        ####################### Testing ############################
        torch.cuda.empty_cache()
        loss_test = 0
        for _, (x_test, y_test, m_test) in enumerate(test_set, 1):
            x_test, m_test, y_test = x_test.to(device), m_test.to(device), y_test.to(device)
            if device.type == 'cuda' and torch.cuda.device_count() > 1:
                raise RuntimeError("Unsupported")
            else:
                y_test_hat = model(x_test, m_test)

            if model.mode == 'RAW':
                y_test_hat = y_test_hat.transpose(1, 2).unsqueeze(-1)
            elif model.mode == 'MOL':
                y_test = y_test.float()

            y_test = y_test.unsqueeze(-1)

            loss_test += loss_func(y_test_hat, y_test).item()
        avg_loss_test = loss_test / len(test_set)
        msg = f'| Epoch: {e}/{epochs} | Test-Loss: {loss_test:.4f} | Test-AvgLoss: {avg_loss_test:.4f} | '
        stream("\n")
        stream(msg)

        writer.add_scalar('Test loss', loss_test, e)
        writer.add_scalar('Average test loss', avg_loss_test, e)
        ############################################################
        """

        # Write to tensorboard per epoch
        writer.add_scalar('Running loss', running_loss, e)
        writer.add_scalar('Average loss', avg_loss, e)

        # Must save latest optimizer state to ensure that resuming training
        # doesn't produce artifacts
        save_checkpoint('voc',
                        paths,
                        model,
                        optimizer,
                        name="{0}-epoch-{1}-loss-{2}".format(
                            model_name_prefix, e, avg_loss),
                        is_silent=True)
        model.log(paths.voc_log, msg)
        print(' ')
        scheduler.step()
        print('Epoch:', e, 'LR:', scheduler.get_lr())
示例#20
0
    def TTS_Wave(self):
        os.makedirs('quick_start/tts_weights/', exist_ok=True)
        os.makedirs('quick_start/voc_weights/', exist_ok=True)

        zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r')
        zip_ref.extractall('quick_start/voc_weights/')
        zip_ref.close()

        zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r')
        zip_ref.extractall('quick_start/tts_weights/')
        zip_ref.close()

        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS Generator')
        parser.add_argument('-name', metavar='name', type=str,help='name of pdf')
        parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
        parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
        parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
        parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
        parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
        parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
        parser.set_defaults(batched=hp.voc_gen_batched)
        parser.set_defaults(target=hp.voc_target)
        parser.set_defaults(overlap=hp.voc_overlap)
        parser.set_defaults(input_text=None)
        parser.set_defaults(weights_path=None)
        args = parser.parse_args()

        batched = args.batched
        target = args.target
        overlap = args.overlap
        input_text = args.input_text
        weights_path = args.weights_path

        if not args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        print('\nInitialising WaveRNN Model...\n')

        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode='MOL').to(device)

        voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                             num_chars=len(symbols),
                             encoder_dims=hp.tts_encoder_dims,
                             decoder_dims=hp.tts_decoder_dims,
                             n_mels=hp.num_mels,
                             fft_bins=hp.num_mels,
                             postnet_dims=hp.tts_postnet_dims,
                             encoder_K=hp.tts_encoder_K,
                             lstm_dims=hp.tts_lstm_dims,
                             postnet_K=hp.tts_postnet_K,
                             num_highways=hp.tts_num_highways,
                             dropout=hp.tts_dropout).to(device)


        tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

        if input_text:
            inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
        else:
            with open('final.txt') as f:
                inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([('WaveRNN', str(voc_k) + 'k'),
                      (f'Tacotron(r={r})', str(tts_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

        for i, x in enumerate(inputs, 1):

            print("f'\n| Generating {i}/{len(inputs)}'")
            _, m, attention = tts_model.generate(x)

            if input_text:
                save_path = './output_audio/'+str(i)+'.wav'
            else:
                save_path = './output_audio/'+str(i)+'.wav'

            # save_attention(attention, save_path)

            m = torch.tensor(m).unsqueeze(0)
            m = (m + 4) / 8

            voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)


            if i == 2:

                temp1 = AudioSegment.from_wav("./output_audio/"+str(i-1)+".wav")
                temp2 = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = temp1 + temp2

                os.remove("./output_audio/"+str(i-1)+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")

            elif i > 2:

                preTemp = AudioSegment.from_wav("./output_audio/"+self.path[:-4]+".wav")

                newTemp = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = preTemp + newTemp

                os.remove("./output_audio/"+self.path[:-4]+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")


        print("Done")