Example #1
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path/f'__{file_name}__{k}k_steps_target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!')
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]')
    else:
        raise ValueError(f"Expected an extension of .wav or .npy, but got {suffix}!")


    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = save_path/f'__{file_name}__{k}k_steps_{batch_str}.wav'

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
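The .npy branch above rejects mels outside [0, 1]. A minimal sketch of the normalization this check assumes: dB mel values are clipped against a noise floor and rescaled into [0, 1]. Note that min_level_db = -100 is an assumed common default, not necessarily this repo's value.

import numpy as np

def normalize_mel(mel_db: np.ndarray, min_level_db: float = -100.0) -> np.ndarray:
    # Clip dB values to [min_level_db, 0] and rescale into [0, 1].
    return np.clip((mel_db - min_level_db) / -min_level_db, 0.0, 1.0)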
Example #2
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x,
                 save_path / ('%sk_steps_%s_target.wav' %
                              (repr1(k), repr1(i))))

        batch_str = 'gen_batched_target%s_overlap%s' % (
            repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / ('%sk_steps_%s_%s.wav' %
                                    (repr1(k), repr1(i), repr1(batch_str))))

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #3
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
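The bit-depth handling above leans on two DSP helpers. A hedged restatement using the standard mu-law formulas follows; the repo's own decode_mu_law/label_2_float may differ in minor details.

import numpy as np

def label_2_float(x: np.ndarray, bits: int) -> np.ndarray:
    # Map integer class labels in [0, 2**bits) to floats in [-1, 1].
    return 2 * x / (2 ** bits - 1.0) - 1.0

def decode_mu_law(y: np.ndarray, mu: int, from_labels: bool = True) -> np.ndarray:
    # Inverse mu-law companding: expand y in [-1, 1] back to a linear waveform.
    if from_labels:
        y = label_2_float(y, int(np.log2(mu)))
    mu = mu - 1
    return np.sign(y) / mu * ((1 + mu) ** np.abs(y) - 1)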
Example #4
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched, target, overlap):

    k = model.get_step() // 1000
    os.makedirs(save_path/'test', exist_ok=True)
    for file_name in tqdm(os.listdir(load_path)):
        if file_name.endswith('.npy'):
            mel = np.load(os.path.join(load_path, file_name))
            mel = torch.tensor(mel).unsqueeze(0)

            batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
            save_str = save_path/f'test/{file_name}__{k}k_steps_{batch_str}.wav'

            _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
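A small hedged variant of the directory scan above: pathlib keeps path handling consistent with the Path-typed arguments, and sorting makes the generation order deterministic. Purely a stylistic alternative, not the repo's code.

from pathlib import Path

def npy_files(load_path: Path) -> list:
    # Collect .npy feature files in a stable, sorted order.
    return sorted(p for p in load_path.iterdir() if p.suffix == '.npy')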
Example #5
    def generate_samples(self, model: WaveRNN,
                         session: VocSession) -> Tuple[float, list]:
        """
        Generates audio samples to cherry-pick models. To evaluate audio quality
        we calculate the l1 distance between mels of predictions and targets.
        """
        model.eval()
        mel_losses = []
        gen_wavs = []
        device = next(model.parameters()).device
        for i, sample in enumerate(session.val_set_samples, 1):
            m, x = sample['mel'], sample['x']
            if i > self.train_cfg['num_gen_samples']:
                break
            x = x[0].numpy()
            bits = 16 if self.dsp.voc_mode == 'MOL' else self.dsp.bits
            if self.dsp.mu_law and self.dsp.voc_mode != 'MOL':
                x = DSP.decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = DSP.label_2_float(x, bits)
            gen_wav = model.generate(mels=m,
                                     batched=self.train_cfg['gen_batched'],
                                     target=self.train_cfg['target'],
                                     overlap=self.train_cfg['overlap'],
                                     mu_law=self.dsp.mu_law,
                                     silent=True)

            gen_wavs.append(gen_wav)
            y_mel = self.dsp.wav_to_mel(x.squeeze(), normalize=False)
            y_mel = torch.tensor(y_mel).to(device)
            y_hat_mel = self.dsp.wav_to_mel(gen_wav, normalize=False)
            y_hat_mel = torch.tensor(y_hat_mel).to(device)
            loss = F.l1_loss(y_hat_mel, y_mel)
            mel_losses.append(loss.item())

            self.writer.add_audio(tag=f'Validation_Samples/target_{i}',
                                  snd_tensor=x,
                                  global_step=model.step,
                                  sample_rate=self.dsp.sample_rate)
            self.writer.add_audio(tag=f'Validation_Samples/generated_{i}',
                                  snd_tensor=gen_wav,
                                  global_step=model.step,
                                  sample_rate=self.dsp.sample_rate)

        return sum(mel_losses) / len(mel_losses), gen_wavs[0]
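The mel-L1 comparison above assumes both spectrograms share a frame count. A minimal sketch that also crops to the shorter length, since generated audio can come back slightly shorter or longer than the target; the cropping is an added assumption, not part of the original.

import torch
import torch.nn.functional as F

def mel_l1(y_mel: torch.Tensor, y_hat_mel: torch.Tensor) -> float:
    # Crop both mels to the shorter frame count, then take the mean L1 distance.
    n = min(y_mel.shape[-1], y_hat_mel.shape[-1])
    return F.l1_loss(y_hat_mel[..., :n], y_mel[..., :n]).item()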
Example #6
    def generate_samples(self, model: WaveRNN,
                         session: VocSession) -> Tuple[float, list]:
        """
        Generates audio samples to cherry-pick models. To evaluate audio quality
        we calculate the l1 distance between mels of predictions and targets.
        """
        model.eval()
        mel_losses = []
        gen_wavs = []
        device = next(model.parameters()).device
        for i, (m, x) in enumerate(session.val_set_samples, 1):
            if i > hp.voc_gen_num_samples:
                break
            x = x[0].numpy()
            bits = 16 if hp.voc_mode == 'MOL' else hp.bits
            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = label_2_float(x, bits)
            gen_wav = model.generate(mels=m,
                                     save_path=None,
                                     batched=hp.voc_gen_batched,
                                     target=hp.voc_target,
                                     overlap=hp.voc_overlap,
                                     mu_law=hp.mu_law,
                                     silent=True)

            gen_wavs.append(gen_wav)
            y_mel = raw_melspec(x.squeeze())
            y_mel = torch.tensor(y_mel).to(device)
            y_hat_mel = raw_melspec(gen_wav)
            y_hat_mel = torch.tensor(y_hat_mel).to(device)
            loss = F.l1_loss(y_hat_mel, y_mel)
            mel_losses.append(loss.item())

            self.writer.add_audio(tag=f'Validation_Samples/target_{i}',
                                  snd_tensor=x,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)
            self.writer.add_audio(tag=f'Validation_Samples/generated_{i}',
                                  snd_tensor=gen_wav,
                                  global_step=model.step,
                                  sample_rate=hp.sample_rate)

        return sum(mel_losses) / len(mel_losses), gen_wavs[0]
Example #7
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap, save_path: Path):

    k = model.get_step() // 1000

    mypqmf = PQMF()
    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        if hp.voc_multiband:

            x = x[0].numpy()

            bits = 16 if hp.voc_mode == 'MOL' else hp.bits

            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2 ** bits, from_labels=True)
            else:
                x = label_2_float(x, bits)

            source = mypqmf.synthesis(
                torch.tensor(x, dtype=torch.float).unsqueeze(0)
            ).numpy()  # (1, sub_band, T//sub_band) -> (1, 1, T)
            source = source.squeeze()  # (T,)
            save_wav(source, save_path/f'{k}k_steps_{i}_target.wav')
            # np.save(save_path/f'{k}k_steps_{i}_target.npy', x, allow_pickle=False)

        else:
            x = x[0].numpy()

            bits = 16 if hp.voc_mode == 'MOL' else hp.bits

            if hp.mu_law and hp.voc_mode != 'MOL':
                x = decode_mu_law(x, 2**bits, from_labels=True)
            else:
                x = label_2_float(x, bits)

            save_wav(x, save_path/f'{k}k_steps_{i}_target.wav')

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path/f'{k}k_steps_{i}_{batch_str}.wav')  # output path for the PQMF-synthesized waveform
        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #8
def gen_testset(model: WaveRNN, test_set_wav, samples, batched, target,
                overlap, save_path: Path):
    '''
    :param model:
    :param test_set_wav: test-set directory containing the mel (or sp+f0) feature files extracted from the source audio
    :param samples: number of samples to generate, i.e. how many audio files to produce
    :param batched: batched is True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: the generated audio files
    '''

    for i in os.listdir(test_set_wav):
        m = np.expand_dims(np.load(join(test_set_wav, i)).T, 0)
        file_name = basename(i)[:-4]

        wave_path = "/emotion_wav/"
        save_str = wave_path + str(file_name) + ".wav"

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
Example #9
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):
    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(
            wav, os.path.join(save_path, "target",
                              os.path.basename(load_path)))
        print("Generating from {0}".format(load_path))
        mel = melspectrogram(wav)
        print("Melspectrograms generated!")
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
    save_str = os.path.join(save_path, os.path.basename(load_path))

    beg = time.time()
    print("Start generating... [{0}]".format(beg))
    output = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
    end = time.time()
    print("Done generating... [{0}] -> delta: [{1}]".format(end, end - beg))
    save_wav(output, save_str)
Example #10
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(
            wav, save_path / ('__%s__%sk_steps_target.wav' %
                              (repr1(file_name), repr1(k))))
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                'Expected a numpy array shaped (n_mels, n_hops), but got %s!' %
                (repr1(mel.shape)))
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                'Expected spectrogram range in [0,1] but was instead [%s, %s]'
                % (repr1(_min), repr1(_max)))
    else:
        raise ValueError('Expected an extension of .wav or .npy, but got %s!' %
                         (repr1(suffix)))

    mel = torch.tensor(mel).unsqueeze(0)

    batch_str = 'gen_batched_target%s_overlap%s' % (
        repr1(target), repr1(overlap)) if batched else 'gen_NOT_BATCHED'
    save_str = save_path / ('__%s__%sk_steps_%s.wav' % (
        repr1(file_name), repr1(k), repr1(batch_str)))

    _ = model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
Example #11
def gen_from_file(model: WaveRNN, load_path: Path, save_path: Path, batched,
                  target, overlap):

    k = model.get_step() // 1000
    file_name = load_path.stem

    suffix = load_path.suffix
    if suffix == ".wav":
        wav = load_wav(load_path)
        save_wav(wav, save_path / f'{prefix}{file_name}.target.wav')
        mel = melspectrogram(wav)
    elif suffix == ".npy":
        mel = np.load(load_path)
        if mel.ndim != 2 or mel.shape[0] != hp.num_mels:
            raise ValueError(
                f'Expected a numpy array shaped (n_mels, n_hops), but got {mel.shape}!'
            )
        _max = np.max(mel)
        _min = np.min(mel)
        if _max >= 1.01 or _min <= -0.01:
            raise ValueError(
                f'Expected spectrogram range in [0,1] but was instead [{_min}, {_max}]'
            )
    else:
        raise ValueError(
            f"Expected an extension of .wav or .npy, but got {suffix}!")

    m = torch.tensor(mel).unsqueeze(0)

    save_str_wavernn = save_path / f'{prefix}{file_name}.wavernn.wav'
    save_str_griffinlim = save_path / f'{prefix}{file_name}.griffinlim.wav'

    wav = reconstruct_waveform(mel, n_iter=32)
    save_wav(wav, save_str_griffinlim)

    _ = model.generate(m, save_str_wavernn, batched, target, overlap,
                       hp.mu_law)
Example #12
def gen_testset(model: WaveRNN, test_set, samples, batched, target, overlap,
                save_path: Path):
    '''
    :param model:
    :param test_set: test set containing the mel (or sp+f0) features and the loaded source audio
    :param samples: number of samples to generate, i.e. how many audio files to produce
    :param batched: batched is True in this script
    :param target: 11000
    :param overlap: 550
    :param save_path: model_outputs_*
    :return: the generated audio files
    '''

    k = model.get_step() // 1000

    for i, (m, x) in enumerate(test_set, 1):

        if i > samples: break

        print('\n| Generating: %i/%i' % (i, samples))

        x = x[0].numpy()

        bits = 16 if hp.voc_mode == 'MOL' else hp.bits

        if hp.mu_law and hp.voc_mode != 'MOL':
            x = decode_mu_law(x, 2**bits, from_labels=True)
        else:
            x = label_2_float(x, bits)

        save_wav(x, save_path / f'{k}k_steps_{i}_target.wav')  # save the original (target) audio file

        batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
        save_str = str(save_path / f'{k}k_steps_{i}_{batch_str}.wav')

        _ = model.generate(m, save_str, batched, target, overlap, hp.mu_law)
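The target/overlap parameters in the docstring describe batched generation: the mel is decoded as overlapping windows in parallel and the resulting audio segments are cross-faded back together. A hedged sketch of that stitching idea follows; it mirrors the concept behind WaveRNN's fold/unfold, not the repo's exact implementation.

import numpy as np

def xfade_and_join(segments: np.ndarray, overlap: int) -> np.ndarray:
    # segments: (n_segments, seg_len) array where neighbours overlap by `overlap` samples.
    fade_in = np.linspace(0.0, 1.0, overlap)
    out = segments[0].astype(np.float64)
    for seg in segments[1:]:
        # Linearly cross-fade the tail of the running output with the head of the next segment.
        out[-overlap:] = out[-overlap:] * (1.0 - fade_in) + seg[:overlap] * fade_in
        out = np.concatenate([out, seg[overlap:]])
    return out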
Example #13
        _, m, dur, pitch = tts_model.generate(x,
                                              alpha=args.alpha,
                                              pitch_function=pitch_function)

        if args.vocoder == 'griffinlim':
            v_type = args.vocoder
        elif args.vocoder == 'wavernn' and args.batched:
            v_type = 'wavernn_batched'
        else:
            v_type = 'wavernn_unbatched'

        if input_text:
            save_path = paths.forward_output / f'{input_text[:10]}_{args.alpha}_{v_type}_{tts_k}k_amp{args.amp}.wav'
        else:
            save_path = paths.forward_output / f'{i}_{v_type}_{tts_k}k_alpha{args.alpha}_amp{args.amp}.wav'

        if args.vocoder == 'wavernn':
            m = torch.tensor(m).unsqueeze(0)
            voc_model.generate(m, save_path, batched, hp.voc_target,
                               hp.voc_overlap, hp.mu_law)
        elif args.vocoder == 'melgan':
            m = torch.tensor(m).unsqueeze(0)
            torch.save(
                m, paths.forward_output /
                f'{i}_{tts_k}_alpha{args.alpha}_amp{args.amp}.mel')
        elif args.vocoder == 'griffinlim':
            wav = reconstruct_waveform(m, n_iter=args.iters)
            save_wav(wav, save_path)

    print('\n\nDone.\n')
Example #14
    model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                    fc_dims=hp.voc_fc_dims,
                    bits=hp.bits,
                    pad=hp.voc_pad,
                    upsample_factors=hp.voc_upsample_factors,
                    feat_dims=hp.num_mels,
                    compute_dims=hp.voc_compute_dims,
                    res_out_dims=hp.voc_res_out_dims,
                    res_blocks=hp.voc_res_blocks,
                    hop_length=hp.hop_length,
                    sample_rate=hp.sample_rate,
                    pad_val=hp.voc_pad_val,
                    mode=hp.voc_mode).cuda()

    paths = Paths(hp.data_path, hp.voc_model_id, hp.tts_model_id)

    restore_path = args.weights if args.weights else paths.voc_latest_weights

    model.restore(restore_path)
    model.eval()
    if hp.amp:
        model, _ = amp.initialize(model, [], opt_level='O3')

    simple_table([('Generation Mode', 'Batched' if batched else 'Unbatched'),
                  ('Target Samples', target if batched else 'N/A'),
                  ('Overlap Samples', overlap if batched else 'N/A')])

    k = model.get_step() // 1000

    for file_name in os.listdir(args.dir):
        if file_name.endswith('.npy'):
            mel = np.load(os.path.join(args.dir, file_name))
            mel = torch.tensor(mel).unsqueeze(0)

            batch_str = f'gen_batched_target{target}_overlap{overlap}' if batched else 'gen_NOT_BATCHED'
            save_str = f'{file_name}__{k}k_steps_{batch_str}.wav'

            model.generate(mel, save_str, batched, target, overlap, hp.mu_law)
Example #15
    def TTS_Wave(self):
        os.makedirs('quick_start/tts_weights/', exist_ok=True)
        os.makedirs('quick_start/voc_weights/', exist_ok=True)

        zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r')
        zip_ref.extractall('quick_start/voc_weights/')
        zip_ref.close()

        zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r')
        zip_ref.extractall('quick_start/tts_weights/')
        zip_ref.close()

        # Parse Arguments
        parser = argparse.ArgumentParser(description='TTS Generator')
        parser.add_argument('-name', metavar='name', type=str, help='name of pdf')
        parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
        parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
        parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
        parser.add_argument('--target', '-t', type=int, help='[int] number of samples in each batch index')
        parser.add_argument('--overlap', '-o', type=int, help='[int] number of crossover samples')
        parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
        parser.set_defaults(batched=hp.voc_gen_batched)
        parser.set_defaults(target=hp.voc_target)
        parser.set_defaults(overlap=hp.voc_overlap)
        parser.set_defaults(input_text=None)
        parser.set_defaults(weights_path=None)
        args = parser.parse_args()

        batched = args.batched
        target = args.target
        overlap = args.overlap
        input_text = args.input_text
        weights_path = args.weights_path

        if not args.force_cpu and torch.cuda.is_available():
            device = torch.device('cuda')
            torch.cuda.set_device(0)
        else:
            device = torch.device('cpu')
        print('Using device:', device)

        print('\nInitialising WaveRNN Model...\n')

        # Instantiate WaveRNN Model
        voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
                            fc_dims=hp.voc_fc_dims,
                            bits=hp.bits,
                            pad=hp.voc_pad,
                            upsample_factors=hp.voc_upsample_factors,
                            feat_dims=hp.num_mels,
                            compute_dims=hp.voc_compute_dims,
                            res_out_dims=hp.voc_res_out_dims,
                            res_blocks=hp.voc_res_blocks,
                            hop_length=hp.hop_length,
                            sample_rate=hp.sample_rate,
                            mode='MOL').to(device)

        voc_model.restore('quick_start/voc_weights/latest_weights.pyt')

        print('\nInitialising Tacotron Model...\n')

        # Instantiate Tacotron Model
        tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
                             num_chars=len(symbols),
                             encoder_dims=hp.tts_encoder_dims,
                             decoder_dims=hp.tts_decoder_dims,
                             n_mels=hp.num_mels,
                             fft_bins=hp.num_mels,
                             postnet_dims=hp.tts_postnet_dims,
                             encoder_K=hp.tts_encoder_K,
                             lstm_dims=hp.tts_lstm_dims,
                             postnet_K=hp.tts_postnet_K,
                             num_highways=hp.tts_num_highways,
                             dropout=hp.tts_dropout).to(device)


        tts_model.restore('quick_start/tts_weights/latest_weights.pyt')

        if input_text:
            inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
        else:
            with open('final.txt') as f:
                inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

        voc_k = voc_model.get_step() // 1000
        tts_k = tts_model.get_step() // 1000

        r = tts_model.get_r()

        simple_table([('WaveRNN', str(voc_k) + 'k'),
                      (f'Tacotron(r={r})', str(tts_k) + 'k'),
                      ('Generation Mode', 'Batched' if batched else 'Unbatched'),
                      ('Target Samples', target if batched else 'N/A'),
                      ('Overlap Samples', overlap if batched else 'N/A')])

        for i, x in enumerate(inputs, 1):

            print("f'\n| Generating {i}/{len(inputs)}'")
            _, m, attention = tts_model.generate(x)

            save_path = './output_audio/' + str(i) + '.wav'

            # save_attention(attention, save_path)

            m = torch.tensor(m).unsqueeze(0)
            m = (m + 4) / 8

            voc_model.generate(m, save_path, batched, hp.voc_target, hp.voc_overlap, hp.mu_law)


            if i == 2:

                temp1 = AudioSegment.from_wav("./output_audio/"+str(i-1)+".wav")
                temp2 = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = temp1 + temp2

                os.remove("./output_audio/"+str(i-1)+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")

            elif i > 2:

                preTemp = AudioSegment.from_wav("./output_audio/"+self.path[:-4]+".wav")

                newTemp = AudioSegment.from_wav("./output_audio/"+str(i)+".wav")

                combined_sounds = preTemp + newTemp

                os.remove("./output_audio/"+self.path[:-4]+".wav")
                os.remove("./output_audio/"+str(i)+".wav")

                combined_sounds.export("./output_audio/"+self.path[:-4]+".wav", format="wav")


        print("Done")