Example #1
def test(hparams,
         mel,
         output_path="test.wav",
         ref_level_db=20,
         magnitude_power=1.5):
    taco_stft = TacotronSTFT(hparams)
    stime = time.time()
    mel_decompress = mel_denormalize(mel).unsqueeze(0)
    mel_decompress = taco_stft.spectral_de_normalize(mel_decompress +
                                                     ref_level_db)**(
                                                         1 / magnitude_power)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    # Approximate mel-to-linear inversion: project through the mel
    # filterbank and apply an empirical scaling factor.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel),
                           taco_stft.stft_fn, 60)
    waveform = waveform[0].data.cpu().numpy()
    # Peak-normalize and scale into the int16 range before writing.
    waveform = waveform / abs(waveform).max() * 0.99 * 2**15
    waveform = waveform.astype(dtype=np.int16)
    dec_time = time.time() - stime
    len_audio = float(len(waveform)) / float(hparams.sampling_rate)
    msg = "audio length: {:.2f} sec,  mel_to_wave time: {:.2f}".format(
        len_audio, dec_time)
    print(msg)
    write(output_path, hparams.sampling_rate, waveform)
Example #2
def mel_spectrogram_and_waveform_generation(checkpoint_path, text, hparams):

    # Griffin Lim iterations
    n_iter = 60
    # Load model from checkpoint
    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.eval()

    # Prepare text input
    #text = "amor é fogo que arde sem se ver."
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    # Decode text input

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, n_iter)
    return waveform
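For reference, a minimal usage sketch of the function above; the checkpoint path and sentence are placeholders, and the int16 conversion mirrors Example #1:

# Hypothetical usage: path and text are placeholders.
hparams = create_hparams()
waveform = mel_spectrogram_and_waveform_generation(
    'checkpoints/tacotron2.pt', 'amor é fogo que arde sem se ver.', hparams)
audio = waveform[0].data.cpu().numpy()
# Peak-normalize into the int16 range, as in Example #1.
audio = audio / abs(audio).max() * 0.99 * 2**15
write('sample.wav', hparams.sampling_rate, audio.astype(np.int16))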
Example #3
def inference_texts(model,
                    hp,
                    target_texts,
                    step,
                    model_name,
                    vocoder,
                    waveglow,
                    f_type='mel',
                    _type='train',
                    postnet=True):
    model.eval()
    for param in model.parameters():
        param.requires_grad = False
    sample_rate = hp.sampling_rate
    original_audio, texts = target_texts
    save_target = 'generate/{}-step-{}'.format(model_name, step)
    stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
                        hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin,
                        hp.mel_fmax)

    os.makedirs(save_target, exist_ok=True)
    for i, text in enumerate(texts):
        print(text)
        if original_audio:
            target_name = '{}-target-{}.wav'.format(_type, i)
            path = os.path.join(save_target, target_name)
            shutil.copy2(original_audio[i], path)
        inputs = prepare_inputs(hp, text)
        # Unwrap DataParallel when running on multiple GPUs.
        net = model.module if torch.cuda.device_count() > 1 else model
        with torch.no_grad():
            predict = net.inference(inputs, postnet=postnet)
        name = '{}-{}-{}-{}.wav'.format(_type, f_type, i, vocoder)

        path = os.path.join(save_target, name)
        if vocoder == 'griffin_lim':
            mel_decompress = stft.spectral_de_normalize(predict)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0], stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            print(spec_from_mel.size())
            waveform = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                stft.stft_fn, 60)
            write(path, sample_rate, waveform[0].data.cpu().numpy())
        elif vocoder == 'waveglow' and waveglow:
            with torch.no_grad():
                audio = MAX_WAV_VALUE * waveglow.infer(predict, sigma=1.0)[0]
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            write(path, sample_rate, audio)
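Note that casting a float array with `astype('int16')` wraps values outside the int16 range rather than clipping them, so a peak at or above MAX_WAV_VALUE can flip sign. A hedged sketch of a safer conversion (the WaveGlow reference code defines MAX_WAV_VALUE as 32768.0):

# Sketch: clip to the int16 range before casting to avoid wrap-around.
import numpy as np

def to_int16(audio, max_wav_value=32768.0):
    audio = np.clip(audio, -max_wav_value, max_wav_value - 1)
    return audio.astype(np.int16)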
Example #4
def inference(args):
    hparams = create_hparams()

    sentences = get_sentences(args)
    # sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i in range(0, len(sentences), hparams.tacotron_synthesis_batch_size)]

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint_path)['state_dict'])
    model.cuda().eval()  #.half()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    test_sampler = DistributedSampler(
        test_set) if hparams.distributed_run else None
    test_loader = DataLoader(test_set,
                             num_workers=0,
                             shuffle=False,
                             sampler=test_sampler,
                             batch_size=hparams.batch_size,
                             pin_memory=False,
                             drop_last=False,
                             collate_fn=test_collate_fn)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)

            for j in range(mel_outputs.size(0)):

                mel_decompress = taco_stft.spectral_de_normalize(
                    mel_outputs_postnet[j])
                mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
                spec_from_mel_scaling = 1000
                spec_from_mel = torch.mm(mel_decompress[0],
                                         taco_stft.mel_basis)
                spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
                spec_from_mel = spec_from_mel * spec_from_mel_scaling

                audio = griffin_lim(
                    torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                    taco_stft.stft_fn, args.griffin_iters)

                audio = audio.squeeze()
                audio = audio.cpu().numpy()
                #audio = audio.astype('int16')
                audio_path = os.path.join(
                    args.out_filename,
                    'batch_{}_sentence_{}.wav'.format(i, j))
                write(audio_path, hparams.sampling_rate, audio)
                print(audio_path)
Example #5
def synthesis_griffin_lim(mel, hparams):
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    return waveform
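The helper above packages the mel-to-linear projection and Griffin-Lim decode that the other examples inline. A hedged usage sketch, assuming `mel_outputs_postnet` and `hparams` come from an inference run like those above:

# Hypothetical usage: mel_outputs_postnet comes from model.inference(...).
waveform = synthesis_griffin_lim(mel_outputs_postnet, hparams)
write('griffin_lim_sample.wav', hparams.sampling_rate,
      waveform[0].data.cpu().numpy())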
Example #6
def main(text, checkpoint_path, path, name):
    #### Setup hparams
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    #### Load model from checkpoint
    model = get_model(hparams, checkpoint_path)

    #### Prepare text input
    sequence = get_input(get_pinyin(text))

    #### inference
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    #### tacotron result
    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    write(
        os.path.join(path, name) + '_tacotron.wav',
        16000,  # sample rate is hardcoded to 16 kHz in this example
        waveform[0].data.cpu().numpy())

    #### transform tacotron mel to wavenet mel
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    #### save
    np.save(
        os.path.join(path, name) + '_mel.npy',
        mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(
        os.path.join(path, name) + '_alig.npy',
        alignments.data.cpu().numpy()[0])
    np.save(os.path.join(path, name) + '.npy', wavenet_mel)
Example #7
def generate(self, text=None):
    text = ch2p(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()
    mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
        sequence)
    taco_stft = TacotronSTFT(self.hparams.filter_length,
                             self.hparams.hop_length,
                             self.hparams.win_length,
                             sampling_rate=self.hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, 60)
    return waveform
Example #8
def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  #.half()

    sequence = np.array(text_to_sequence(text, ['chinese_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)

    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    #audio = audio.astype('int16')
    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)
    plot_alignment_to_numpy(
        alignments.squeeze().cpu().detach().numpy().T,
        os.path.join('samples', "{}_attention.png".format(out_filename)))
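A minimal argparse entry point for `infer`, sketched here with placeholder defaults; the flag names are assumptions, not part of the original script:

# Hedged sketch of a CLI wrapper; flag names are hypothetical.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--checkpoint_path', required=True)
    parser.add_argument('-t', '--text', required=True)
    parser.add_argument('-o', '--out_filename', default='sample')
    parser.add_argument('-g', '--griffin_iters', type=int, default=60)
    args = parser.parse_args()
    infer(args.checkpoint_path, args.griffin_iters, args.text,
          args.out_filename)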
Example #9
def infer(checkpoint_path, griffin_iters, text, out_filename):
    hparams = create_hparams()

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  #.half()

    with codecs.open(hparams.merge_cedict, 'r', 'utf-8') as f:
        mask_dict = json.load(f)

    # poly_yinsu_to_mask_inference is assumed to return both the input id
    # sequence and, per character, the polyphone classes that remain valid;
    # the commented-out variant in the original suggested this signature.
    sequence, mask_sequence = poly_yinsu_to_mask_inference(text, mask_dict)
    sequence = np.array(sequence)[None, :]
    print('CHECK INPUT sequence:', sequence)

    # Build the polyphone class mask: -inf everywhere, zero for the valid
    # classes of each character, so invalid classes drop out of the softmax.
    mask_padded = torch.FloatTensor(len(mask_sequence), hparams.num_classes)
    mask_padded.fill_(-float('inf'))
    for j in range(len(mask_sequence)):
        mask_character = mask_sequence[j]
        for k in range(len(mask_character)):
            index = torch.LongTensor([[j, mask_character[k]]])
            value = torch.zeros(index.shape[0])
            mask_padded.index_put_(tuple(index.t()), value)

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        [sequence], mask_padded.cuda())

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)

    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    #audio = audio.astype('int16')
    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)