def test(hparams, mel, output_path="test.wav", ref_level_db=20,
         magnitude_power=1.5):
    """Invert a normalized mel spectrogram with Griffin-Lim and save it as WAV.

    The mel is de-normalized, shifted by ``ref_level_db``, passed through
    ``TacotronSTFT.spectral_de_normalize`` and raised to
    ``1 / magnitude_power``. It is then multiplied by the STFT object's
    ``mel_basis``, scaled, and handed to ``griffin_lim`` (60 iterations).
    The result is peak-normalized to 16-bit PCM and written to
    ``output_path``. Audio length and elapsed time are printed.

    Args:
        hparams: Hyper-parameter object; passed to ``TacotronSTFT`` and read
            for ``sampling_rate``.
        mel: Normalized mel spectrogram accepted by ``mel_denormalize``.
            Assumed to be 2-D (a batch axis is added with ``unsqueeze(0)``)
            -- TODO confirm against callers.
        output_path: Destination WAV file path.
        ref_level_db: Offset added to the de-normalized mel before spectral
            de-normalization.
        magnitude_power: The de-normalized spectrum is raised to
            ``1 / magnitude_power``.
    """
    taco_stft = TacotronSTFT(hparams)
    stime = time.time()

    mel_decompress = mel_denormalize(mel).unsqueeze(0)
    mel_decompress = taco_stft.spectral_de_normalize(
        mel_decompress + ref_level_db) ** (1 / magnitude_power)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()

    # Fixed gain applied to the estimated spectrogram before Griffin-Lim.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :]),
                           taco_stft.stft_fn, 60)
    waveform = waveform[0].data.cpu().numpy()

    # Peak-normalize to 99% of int16 full scale. A completely silent signal
    # has peak 0; dividing by it would produce NaNs before the integer cast,
    # so silence is left as zeros.
    peak = abs(waveform).max()
    if peak > 0:
        waveform = waveform / peak * 0.99 * 2**15
    waveform = waveform.astype(dtype=np.int16)

    dec_time = time.time() - stime
    len_audio = float(len(waveform)) / float(hparams.sampling_rate)
    # Named `summary` rather than `str` so the builtin is not shadowed.
    summary = "audio length: {:.2f} sec,  mel_to_wave time: {:.2f}".format(
        len_audio, dec_time)
    print(summary)

    write(output_path, hparams.sampling_rate, waveform)
def mel_spectrogram_and_waveform_generation(checkpoint_path, text, hparams):
    """Synthesize ``text`` with a checkpointed model and Griffin-Lim.

    Loads the model weights from ``checkpoint_path``, converts ``text`` to a
    symbol-id sequence using the ``basic_cleaners`` pipeline, runs
    ``model.inference`` and inverts the post-net mel spectrogram.

    Args:
        checkpoint_path: File readable by ``torch.load`` containing a
            ``'state_dict'`` entry.
        text: Input text to synthesize.
        hparams: Hyper-parameters; read for the STFT geometry and
            ``sampling_rate`` and passed to ``load_model``.

    Returns:
        Whatever ``griffin_lim`` returns for the estimated spectrogram
        (60 iterations).
    """
    griffin_lim_iterations = 60
    spectrogram_gain = 1000

    # Restore the model and put it in evaluation mode.
    model = load_model(hparams)
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    # Encode the text as a (1, n_symbols) LongTensor on the GPU.
    symbol_ids = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    symbol_ids = torch.from_numpy(symbol_ids)
    symbol_ids = torch.autograd.Variable(symbol_ids).cuda().long()

    # Only the post-net mel is used below; the other three outputs are
    # unpacked so the call still requires exactly four return values.
    _coarse_mel, postnet_mel, _, _alignments = model.inference(symbol_ids)

    stft = TacotronSTFT(hparams.filter_length,
                        hparams.hop_length,
                        hparams.win_length,
                        sampling_rate=hparams.sampling_rate)

    mel_frames = stft.spectral_de_normalize(postnet_mel)
    mel_frames = mel_frames.transpose(1, 2).data.cpu()

    # Multiply by the mel basis, then restore the (1, freq, frames) layout.
    magnitudes = torch.mm(mel_frames[0], stft.mel_basis)
    magnitudes = magnitudes.transpose(0, 1).unsqueeze(0) * spectrogram_gain

    # The final frame is dropped before phase reconstruction.
    return griffin_lim(torch.autograd.Variable(magnitudes[:, :, :-1]),
                       stft.stft_fn,
                       griffin_lim_iterations)
def inference_texts(model, hp, target_texts, step, model_name, vocoder,
                    waveglow, f_type='mel', _type='train', postnet=True):
    """Synthesize a list of texts and write the audio under ``generate/``.

    For every text the model's ``inference`` output is vocoded either with
    Griffin-Lim (``vocoder == 'griffin_lim'``) or with the supplied WaveGlow
    model (``vocoder == 'waveglow'`` and ``waveglow`` truthy). Any other
    ``vocoder`` value produces no audio file for that text. When reference
    audio paths are supplied they are copied next to the generated files.

    Args:
        model: Acoustic model exposing ``inference(inputs, postnet=...)``,
            optionally wrapped so that the real model is at ``model.module``.
        hp: Hyper-parameters (STFT geometry, mel settings, sampling rate).
        target_texts: Pair ``(original_audio, texts)``; ``original_audio`` is
            a sequence of file paths indexed like ``texts`` or a falsy value.
        step: Training step, used in the output directory name.
        model_name: Model identifier, used in the output directory name.
        vocoder: ``'griffin_lim'`` or ``'waveglow'``.
        waveglow: WaveGlow model exposing ``infer``; only used for the
            ``'waveglow'`` vocoder.
        f_type: Feature tag embedded in output file names.
        _type: Split tag embedded in output file names.
        postnet: Forwarded to ``model.inference``.

    NOTE(review): this function leaves ``model`` in eval mode with
    ``requires_grad`` disabled on every parameter; callers that continue
    training afterwards must restore both themselves.
    """
    model.eval()
    for param in model.parameters():
        param.requires_grad = False

    # Use the configured rate (the same one the STFT is built with) so the
    # WAV header matches the features instead of a hard-coded 22050.
    sample_rate = hp.sampling_rate
    original_audio, texts = target_texts
    save_target = 'generate/{}-step-{}'.format(model_name, step)
    stft = TacotronSTFT(hp.filter_length, hp.hop_length, hp.win_length,
                        hp.n_mel_channels, hp.sampling_rate, hp.mel_fmin,
                        hp.mel_fmax)
    os.makedirs(save_target, exist_ok=True)

    # Unwrap a parallel wrapper based on the model itself, not on how many
    # GPUs the host has: a plain model on a multi-GPU machine has no
    # ``.module`` and a wrapped model on a single-GPU machine has no
    # ``.inference``.
    net = getattr(model, 'module', model)

    for i, text in enumerate(texts):
        print(text)
        if original_audio:
            target_name = '{}-target-{}.wav'.format(_type, i)
            path = os.path.join(save_target, target_name)
            shutil.copy2(original_audio[i], path)

        inputs = prepare_inputs(hp, text)
        with torch.no_grad():
            predict = net.inference(inputs, postnet=postnet)

        name = '{}-{}-{}-{}.wav'.format(_type, f_type, i, vocoder)
        path = os.path.join(save_target, name)

        if vocoder == 'griffin_lim':
            mel_decompress = stft.spectral_de_normalize(predict)
            mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
            # Fixed gain applied to the estimated spectrogram.
            spec_from_mel_scaling = 1000
            spec_from_mel = torch.mm(mel_decompress[0], stft.mel_basis)
            spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
            spec_from_mel = spec_from_mel * spec_from_mel_scaling
            print(spec_from_mel.size())
            # The final frame is dropped before phase reconstruction.
            waveform = griffin_lim(
                torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                stft.stft_fn, 60)
            write(path, sample_rate, waveform[0].data.cpu().numpy())
        elif vocoder == 'waveglow' and waveglow:
            with torch.no_grad():
                audio = MAX_WAV_VALUE * waveglow.infer(predict, sigma=1.0)[0]
            audio = audio.cpu().numpy()
            audio = audio.astype('int16')
            write(path, sample_rate, audio)
def inference(args):
    """Batch-synthesize sentences and write one Griffin-Lim audio file each.

    Sentences come from ``get_sentences(args)``. They are batched through a
    ``DataLoader``, run through ``model.inference`` and inverted with
    Griffin-Lim. Files are written to
    ``<args.out_filename>/batch_<i>_sentence_<j>``.

    Args:
        args: Namespace providing ``checkpoint_path``, ``griffin_iters`` and
            ``out_filename`` (used as the output directory), plus whatever
            ``get_sentences`` reads.
    """
    hparams = create_hparams()
    sentences = get_sentences(args)

    model = load_model(hparams)
    model.load_state_dict(torch.load(args.checkpoint_path)['state_dict'])
    model.cuda().eval()

    test_set = TextMelLoaderEval(sentences, hparams)
    test_collate_fn = TextMelCollateEval(hparams)
    # The sampler must wrap the dataset built above; the previous `valset`
    # name was not defined in this function.
    test_sampler = (DistributedSampler(test_set)
                    if hparams.distributed_run else None)
    # shuffle=False: `shuffle` was an undefined name, and file names encode
    # batch/sentence indices, so order must be deterministic.
    # drop_last=False: every sentence must be synthesized, including the
    # final, possibly incomplete batch.
    test_loader = DataLoader(test_set, num_workers=0, shuffle=False,
                             sampler=test_sampler,
                             batch_size=hparams.batch_size,
                             pin_memory=False, drop_last=False,
                             collate_fn=test_collate_fn)

    taco_stft = TacotronSTFT(hparams.filter_length, hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    # Make sure the destination directory exists before the first write.
    if args.out_filename:
        os.makedirs(args.out_filename, exist_ok=True)

    # Fixed gain applied to the estimated spectrogram.
    spec_from_mel_scaling = 1000

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
                batch)
            for j in range(mel_outputs.size(0)):
                mel = mel_outputs_postnet[j]
                # Indexing a (batch, n_mel, frames) tensor drops the batch
                # axis; the transpose(1, 2) / [0] below need it back.
                if mel.dim() == 2:
                    mel = mel.unsqueeze(0)
                mel_decompress = taco_stft.spectral_de_normalize(mel)
                mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
                spec_from_mel = torch.mm(mel_decompress[0],
                                         taco_stft.mel_basis)
                spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
                spec_from_mel = spec_from_mel * spec_from_mel_scaling
                # The final frame is dropped before phase reconstruction.
                audio = griffin_lim(
                    torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                    taco_stft.stft_fn, args.griffin_iters)
                audio = audio.squeeze()
                audio = audio.cpu().numpy()

                audio_path = os.path.join(
                    args.out_filename,
                    'batch_{}_sentence_{}'.format(i, j))
                write(audio_path, hparams.sampling_rate, audio)
                print(audio_path)
def synthesis_griffin_lim(mel, hparams, n_iter=60):
    """Reconstruct a waveform from a mel spectrogram with Griffin-Lim.

    Args:
        mel: Normalized mel spectrogram tensor. It must be 3-D: the code
            transposes axes 1 and 2 and then matrix-multiplies item ``[0]``
            with the mel basis, so only the first item is synthesized.
        hparams: Hyper-parameters providing ``filter_length``,
            ``hop_length``, ``win_length`` and ``sampling_rate``.
        n_iter: Number of Griffin-Lim iterations. Defaults to 60, the value
            that was previously hard-coded.

    Returns:
        Whatever ``griffin_lim`` returns for the estimated spectrogram.
    """
    taco_stft = TacotronSTFT(
        hparams.filter_length, hparams.hop_length, hparams.win_length,
        sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()

    # Fixed gain applied to the estimated spectrogram.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    # The final frame is dropped before phase reconstruction.
    waveform = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                           taco_stft.stft_fn, n_iter)
    return waveform
def main(text, checkpoint_path, path, name):
    """Synthesize ``text`` and save audio, mel, alignment and WaveNet mel.

    Files written, all prefixed with ``os.path.join(path, name)``:
    ``_tacotron.wav`` (Griffin-Lim audio, written with a 16000 Hz header),
    ``_mel.npy`` (post-net mel), ``_alig.npy`` (alignments) and ``.npy``
    (the output of ``to_wavenet_mel``).

    Args:
        text: Input text; converted with ``get_pinyin`` then ``get_input``.
        checkpoint_path: Checkpoint handed to ``get_model``.
        path: Output directory.
        name: Base name for all output files.
    """
    # Hyper-parameters, with the STFT geometry overridden explicitly.
    hparams = create_hparams("distributed_run=False,mask_padding=False")
    hparams.filter_length = 1024
    hparams.hop_length = 256
    hparams.win_length = 1024

    model = get_model(hparams, checkpoint_path)
    sequence = get_input(get_pinyin(text))

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        sequence, drop_prob=0.25)

    # Griffin-Lim preview of the Tacotron output.
    stft = TacotronSTFT(hparams.filter_length,
                        hparams.hop_length,
                        hparams.win_length,
                        sampling_rate=hparams.sampling_rate)
    mel_frames = stft.spectral_de_normalize(mel_outputs_postnet)
    mel_frames = mel_frames.transpose(1, 2).data.cpu()

    gain = 1000
    magnitudes = torch.mm(mel_frames[0], stft.mel_basis)
    magnitudes = magnitudes.transpose(0, 1).unsqueeze(0) * gain

    # The final frame is dropped before phase reconstruction.
    waveform = griffin_lim(torch.autograd.Variable(magnitudes[:, :, :-1]),
                           stft.stft_fn, 60)

    out_prefix = os.path.join(path, name)
    write(out_prefix + '_tacotron.wav', 16000,
          waveform[0].data.cpu().numpy())

    # Convert the Tacotron mel into the layout expected by the WaveNet side.
    wavenet_mel = to_wavenet_mel(mel_outputs_postnet.data.cpu().numpy()[0].T)

    np.save(out_prefix + '_mel.npy',
            mel_outputs_postnet.data.cpu().numpy()[0])
    np.save(out_prefix + '_alig.npy', alignments.data.cpu().numpy()[0])
    np.save(out_prefix + '.npy', wavenet_mel)
def generate(self, text=None):
    """Synthesize ``text`` with ``self.model`` and Griffin-Lim.

    The text is converted with ``ch2p``, encoded with the
    ``basic_cleaners`` pipeline, run through ``self.model.inference`` and
    the post-net mel is inverted with 60 Griffin-Lim iterations.

    Args:
        text: Input text handed to ``ch2p``.

    Returns:
        The output of ``griffin_lim``. Previously the waveform was computed
        and then discarded, so the method always returned ``None``.
    """
    text = ch2p(text)
    sequence = np.array(text_to_sequence(text, ['basic_cleaners']))[None, :]
    sequence = torch.autograd.Variable(
        torch.from_numpy(sequence)).cuda().long()

    mel_outputs, mel_outputs_postnet, _, alignments = self.model.inference(
        sequence)

    taco_stft = TacotronSTFT(self.hparams.filter_length,
                             self.hparams.hop_length,
                             self.hparams.win_length,
                             sampling_rate=self.hparams.sampling_rate)
    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()

    # Fixed gain applied to the estimated spectrogram.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    # The final frame is dropped before phase reconstruction.
    waveform = griffin_lim(
        torch.autograd.Variable(spec_from_mel[:, :, :-1]),
        taco_stft.stft_fn, 60)
    return waveform
def infer(checkpoint_path, griffin_iters, text, out_filename):
    """Synthesize ``text`` and save audio plus attention plot in ``samples/``.

    Writes ``samples/<out_filename>_synthesis.wav`` and hands the transposed
    alignment matrix to ``plot_alignment_to_numpy`` together with the path
    ``samples/<out_filename>_attention.png``.

    Args:
        checkpoint_path: File readable by ``torch.load`` containing a
            ``'state_dict'`` entry.
        griffin_iters: Number of Griffin-Lim iterations.
        text: Input text; encoded with the ``chinese_cleaners`` pipeline.
        out_filename: Base name for the output files.
    """
    hparams = create_hparams()
    hparams.sampling_rate = 22050

    # Restore the model on the GPU in evaluation mode.
    model = load_model(hparams)
    state = torch.load(checkpoint_path)['state_dict']
    model.load_state_dict(state)
    model.cuda().eval()

    # Encode the text as a (1, n_symbols) LongTensor on the GPU.
    symbol_ids = np.array(text_to_sequence(text, ['chinese_cleaners']))
    symbol_ids = torch.from_numpy(symbol_ids[None, :])
    symbol_ids = torch.autograd.Variable(symbol_ids).cuda().long()

    _coarse_mel, postnet_mel, _, alignments = model.inference(symbol_ids)

    stft = TacotronSTFT(hparams.filter_length,
                        hparams.hop_length,
                        hparams.win_length,
                        sampling_rate=hparams.sampling_rate)
    mel_frames = stft.spectral_de_normalize(postnet_mel)
    mel_frames = mel_frames.transpose(1, 2).data.cpu()

    gain = 1000
    magnitudes = torch.mm(mel_frames[0], stft.mel_basis)
    magnitudes = magnitudes.transpose(0, 1).unsqueeze(0) * gain

    # The final frame is dropped before phase reconstruction.
    audio = griffin_lim(torch.autograd.Variable(magnitudes[:, :, :-1]),
                        stft.stft_fn, griffin_iters)
    audio = audio.squeeze().cpu().numpy()

    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)

    attention_path = os.path.join('samples',
                                  "{}_attention.png".format(out_filename))
    attention = alignments.squeeze().cpu().detach().numpy().T
    plot_alignment_to_numpy(attention, attention_path)
def infer(checkpoint_path, griffin_iters, text, out_filename):
    """Synthesize ``text`` with a polyphone-mask model and Griffin-Lim.

    Loads the model from ``checkpoint_path``, reads a JSON dictionary from
    ``hparams.merge_cedict``, builds a ``-inf``-filled mask tensor, runs
    ``model.inference`` and writes the Griffin-Lim audio to
    ``samples/<out_filename>_synthesis.wav``.

    Args:
        checkpoint_path: File readable by ``torch.load`` containing a
            ``'state_dict'`` entry.
        griffin_iters: Number of Griffin-Lim iterations.
        text: Input text; passed to ``poly_yinsu_to_mask_inference`` and
            later rebound to ``[list(text)]``.
        out_filename: Base name of the output WAV file.

    NOTE(review): ``mask_sequence`` is read by the loop below but is never
    assigned in this function (its only assignments are commented out).
    Unless it exists as a module-level global, the loop raises
    ``NameError``. ``model.inference`` is also called twice and the first
    result is overwritten. This looks like unfinished work -- confirm the
    intended data flow before relying on this function.
    """
    hparams = create_hparams()

    model = load_model(hparams)
    model.load_state_dict(torch.load(checkpoint_path)['state_dict'])
    _ = model.cuda().eval()  #.half()

    # JSON dictionary consumed by poly_yinsu_to_mask_inference.
    with codecs.open(hparams.merge_cedict, 'r', 'utf-8') as usernames:
        mask_dict = json.load(usernames)

    # [None, :] adds a leading axis of size 1, so len(sequence) below is 1.
    sequence = np.array(poly_yinsu_to_mask_inference(text, mask_dict))[None, :]
    print('CHECK INPUT sequence:', sequence)
    # sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()

    # sequence, mask_sequence = poly_yinsu_to_mask_inference(text, mask_dict)
    # print('CHECK INPUT sequence:', sequence)
    #tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
    #text_seq = tokenizer.convert_tokens_to_ids(text)
    # The text is split into a single-item batch of characters.
    text = [list(text)]
    print('CHECK INPUT mask_sequence:', text)

    # Mask starts fully blocked (-inf everywhere); the loop below writes 0
    # at the (row, class) positions listed in mask_sequence.
    mask_padded = torch.FloatTensor(len(sequence), hparams.num_classes)
    mask_padded.fill_(-float('inf'))
    # NOTE(review): this result is overwritten by the second inference call.
    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(text)
    # sequence_id = np.array(sequence_id)[None, :]
    # mask_sequence = np.array(mask_sequence)[None, :]
    # sequence_id = torch.autograd.Variable(torch.from_numpy(sequence_id)).cuda().long()
    # mask_sequence = torch.autograd.Variable(torch.from_numpy(mask_sequence)).cuda().long()
    # mask_sequence = batch[ids_sorted_decreasing[i]][1]
    # NOTE(review): mask_sequence is undefined at this point (see docstring).
    for j in range(len(mask_sequence)):
        mask_character = mask_sequence[j]
        for k in range(len(mask_character)):
            index = torch.LongTensor([[j, mask_character[k]]])
            value = torch.zeros(index.shape[0])
            mask_padded.index_put_(tuple(index.t()), value)

    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(
        [sequence], mask_padded.cuda())
    # mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence, mask_sequence)

    taco_stft = TacotronSTFT(hparams.filter_length,
                             hparams.hop_length,
                             hparams.win_length,
                             sampling_rate=hparams.sampling_rate)

    mel_decompress = taco_stft.spectral_de_normalize(mel_outputs_postnet)
    mel_decompress = mel_decompress.transpose(1, 2).data.cpu()
    # Fixed gain applied to the estimated spectrogram.
    spec_from_mel_scaling = 1000
    spec_from_mel = torch.mm(mel_decompress[0], taco_stft.mel_basis)
    spec_from_mel = spec_from_mel.transpose(0, 1).unsqueeze(0)
    spec_from_mel = spec_from_mel * spec_from_mel_scaling

    # The final frame is dropped before phase reconstruction.
    audio = griffin_lim(torch.autograd.Variable(spec_from_mel[:, :, :-1]),
                        taco_stft.stft_fn, griffin_iters)

    audio = audio.squeeze()
    audio = audio.cpu().numpy()
    #audio = audio.astype('int16')
    audio_path = os.path.join('samples',
                              "{}_synthesis.wav".format(out_filename))
    write(audio_path, hparams.sampling_rate, audio)
    print(audio_path)