def infer_wavenet(args):
    import sys
    sys.path.append('thirdparty/wavenet_vocoder')
    from train import build_model
    from synthesis import wavegen
    from tqdm import tqdm

    target_sample_rate = 22050
    hparams, model = load_model(args.model_name)
    meller = MelSpectrogram()
    files = [
        item for item in os.listdir(args.folder_in)
        if item.endswith('wav')
    ]
    for idx, audio in enumerate(files):
        wav_path = os.path.join(args.folder_in, audio)
        wav = load_wav(wav_path, target_sample_rate)
        c = meller(wav)[0]
        # wavegen expects mels shaped (time, num_mels); swap axes if needed
        if c.shape[1] != hparams.num_mels:
            c = c.transpose(0, 1)
        # Range [0, 4] was used for training Tacotron2 but WaveNet vocoder assumes [0, 1]
        # c = np.interp(c, (0, 4), (0, 1))

        # Generate
        waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)

        path = os.path.join(args.folder_out, audio)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        # wavegen returns a NumPy array; torchaudio.save expects a (channels, time) tensor
        torchaudio.save(path, torch.from_numpy(waveform).float().unsqueeze(0),
                        hparams.sample_rate)
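# Hypothetical driver for infer_wavenet; the flag names below are assumptions
# read off the attributes used above (model_name, folder_in, folder_out), not
# a confirmed CLI.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Vocode wav files with a WaveNet vocoder')
    parser.add_argument('--model_name', required=True)
    parser.add_argument('--folder_in', required=True)
    parser.add_argument('--folder_out', required=True)
    infer_wavenet(parser.parse_args())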
def genspec(pkl_path, write_name, save_dir="./result_wav/"):
    spect_vc = pickle.load(open(pkl_path, "rb"))
    for i, spect in enumerate(spect_vc, start=1):
        c = spect[1]
        waveform = wavegen(model, c=c)
        librosa.output.write_wav(save_dir + write_name + '_' + str(i) + '.wav',
                                 waveform, sr=16000)
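# A minimal sketch of producing a pickle genspec can consume: a list of
# (name, mel) pairs, matching the spect_vc structures used elsewhere in this
# file. The shape and dtype are illustrative assumptions, and the call is
# commented out because it needs a loaded global `model`.
import pickle
import numpy as np
demo_spects = [('utt_0_to_spk1', np.random.rand(128, 80).astype(np.float32))]
with open('demo_results.pkl', 'wb') as f:
    pickle.dump(demo_spects, f)
# genspec('demo_results.pkl', 'demo')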
def generateAudioGroup(original_audio, ref_audios,
                       autovc_checkpoint='checkpoints_fully/autovc_700000.pt',
                       vocoder_checkpoint="../checkpoint_step001000000_ema.pth"):
    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = 'cuda:0'
    # The checkpoint stores the whole Generator module, so the freshly built
    # G is replaced by the loaded one.
    G = Generator(32, 256, 512, 32).eval().to(device)
    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    G = g_checkpoint.eval()

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    emb_refs = []
    ref_files = os.listdir(ref_audios)
    for i, file in enumerate(ref_files, start=1):
        print("{}/{}".format(i, len(ref_files)))
        emb_ref = get_verification_pytorch_1000(ref_audios + file, 1)
        if emb_ref is not None:
            emb_refs.append(emb_ref)
    # Average the per-file reference embeddings into one target embedding
    emb_refs = np.mean(emb_refs, axis=0)

    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    emb_refs = torch.FloatTensor(emb_refs).unsqueeze(0).cuda()

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_refs)
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])
    waveform = wavegen(model, c=uttr_trg)
    return waveform
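# Standalone sketch of the reference-embedding averaging performed above:
# one d-vector per reference file, reduced to a single target embedding.
# The 256-dim size is an illustrative assumption, not the verified output
# size of get_verification_pytorch_1000.
import numpy as np
example_embs = [np.random.randn(256).astype(np.float32) for _ in range(3)]
mean_emb = np.mean(example_embs, axis=0)  # shape (256,)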
def generateAudio(original_audio, ref_audio, autovc_checkpoint,
                  vocoder_checkpoint, english=False):
    mel_org = makeSpect(original_audio, None)

    def pad_seq(x, base=32):
        len_out = int(base * ceil(float(x.shape[0]) / base))
        len_pad = len_out - x.shape[0]
        assert len_pad >= 0
        return np.pad(x, ((0, len_pad), (0, 0)), 'constant'), len_pad

    device = 'cuda:0'
    # The checkpoint stores the whole Generator module, so the freshly built
    # G is replaced by the loaded one.
    G = Generator(32, 256, 512, 32).eval().to(device)
    g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
    G = g_checkpoint.eval()

    x_org = mel_org
    x_org, len_pad = pad_seq(x_org)
    uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

    emb_org = get_verification_pytorch_1000(original_audio)
    if not english:
        emb_ref = get_verification_pytorch_1000(ref_audio)
    else:
        emb_ref = get_verification_eng(ref_audio)
    if emb_org is None or emb_ref is None:
        return None

    emb_org = torch.FloatTensor(emb_org).unsqueeze(0).cuda()
    if not english:
        emb_ref = torch.FloatTensor(emb_ref).unsqueeze(0).cuda()
    else:
        emb_ref = emb_ref.type(torch.cuda.FloatTensor)

    with torch.no_grad():
        _, x_identic_psnt, _ = G(uttr_org, emb_org, emb_ref)
    if len_pad == 0:
        uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
    else:
        uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

    device = torch.device("cuda")
    model = build_model().to(device)
    checkpoint = torch.load(vocoder_checkpoint, map_location=torch.device('cuda'))
    model.load_state_dict(checkpoint["state_dict"])
    waveform = wavegen(model, c=uttr_trg)
    return waveform
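# Hypothetical end-to-end call; the audio paths are placeholders, and writing
# with soundfile mirrors the other snippets in this file. Commented out since
# it needs a GPU and the two checkpoints.
# waveform = generateAudio('source.wav', 'reference.wav',
#                          'checkpoints_fully/autovc_700000.pt',
#                          '../checkpoint_step001000000_ema.pth')
# if waveform is not None:
#     sf.write('converted.wav', waveform, 16000)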
def __decode__(self):
    spect_vc = pickle.load(open('results.pkl', 'rb'))
    # device = torch.device("cuda")
    model = build_model()  # .to(device)
    checkpoint = torch.load("checkpoint_step001000000_ema.pth",
                            map_location=torch.device('cpu'))
    model.load_state_dict(checkpoint["state_dict"])
    for spect in spect_vc:
        name = spect[0]
        c = spect[1]
        print(name)
        waveform = wavegen(model, c=c)
        # NOTE: the output path is fixed, so each iteration overwrites the
        # previous waveform; only the last spectrogram survives on disk.
        save_path = os.path.join("audio/download/audio.wav")
        librosa.output.write_wav(save_path, waveform, sr=16000)
    return save_path
wav = load_wav(src_wav_path)
emb = np.load(src_emb_path)
emb_tgt = np.load(tgt_emb_path)

mel = melspectrogram(wav)
# Pad the time axis up to the next multiple of 32
pad_len = math.ceil(mel.shape[1] / 32) * 32 - mel.shape[1]
mel = np.pad(mel, ((0, 0), (0, pad_len)), mode='constant')

mel = torch.FloatTensor(mel)
emb = torch.FloatTensor(emb)
emb_tgt = torch.FloatTensor(emb_tgt)

model = Generator(dim_neck, dim_emb, dim_pre, freq)
checkpoint = torch.load(autovc_checkpoint_path, map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model'])
model.eval()

x = mel.unsqueeze(0).transpose(2, 1)
e = emb.unsqueeze(0)
et = emb_tgt.unsqueeze(0)
mel_outputs, mel_outputs_postnet, codes = model(x, e, et)

mel_rec = mel_outputs_postnet.transpose(2, 1).cpu().detach().numpy()[0]
# Guard the un-padding: mel_rec[:, :-0] would return an empty array
if pad_len > 0:
    mel_rec = mel_rec[:, :-pad_len]
c = np.transpose(mel_rec, (1, 0))
waveform = wavegen(wavnet, device, c=c)
librosa.output.write_wav(output_path, waveform, sr=16000)
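# Worked example of the pad-to-multiple-of-32 arithmetic above (presumably
# required by the generator's downsampling):
import math
assert math.ceil(100 / 32) * 32 - 100 == 28  # a 100-frame mel gets 28 padded frames
assert math.ceil(128 / 32) * 32 - 128 == 0   # an exact multiple needs no padding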
if g is not None:
    print("Global conditioned by speaker id {}".format(g))

# Paths
dst_wav_path = join(
    dst_dir, "{}_{}{}_predicted.wav".format(idx, checkpoint_name, file_name_suffix))
target_wav_path = join(
    dst_dir, "{}_{}{}_target.wav".format(idx, checkpoint_name, file_name_suffix))

# Generate
waveform = wavegen(model, length, c=c, g=g, initial_value=initial_value,
                   fast=True, tqdm=_tqdm)

# save
librosa.output.write_wav(dst_wav_path, waveform, sr=hparams.sample_rate)
librosa.output.write_wav(target_wav_path, P.inv_mulaw_quantize(x),
                         sr=hparams.sample_rate)

# log
if output_html:
    print("""
device = 'cuda:0'
# The checkpoint stores the whole Generator module, not a state dict
g_checkpoint = torch.load(autovc_checkpoint, map_location=torch.device('cuda'))
G = g_checkpoint

x_org = mel_org
x_org, len_pad = pad_seq(x_org)
uttr_org = torch.FloatTensor(x_org[np.newaxis, :, :]).to(device)

with torch.no_grad():
    _, x_identic_psnt, _ = G(uttr_org, emb_ref)
if len_pad == 0:
    uttr_trg = x_identic_psnt[0, 0, :, :].cpu().numpy()
else:
    uttr_trg = x_identic_psnt[0, 0, :-len_pad, :].cpu().numpy()

device = torch.device("cuda")
model = build_model().to(device)
checkpoint = torch.load(
    "../drive/MyDrive/MultiSpeaker_Tacotron2/checkpoint_step001000000_ema.pth",
    map_location=torch.device('cuda'))
model.load_state_dict(checkpoint["state_dict"])
waveform = wavegen(model, c=uttr_trg)
sf.write('{}-{}.wav'.format(original_name, ref_name), waveform, 16000,
         subtype='PCM_24')
condition), uttr_trg))

# %%
# spectrogram to waveform
import torch
import librosa
import pickle
import os

from synthesis import build_model
from synthesis import wavegen

if not os.path.exists('results'):
    os.makedirs('results')

model = build_model().to(device)
checkpoint = torch.load(
    "/datapool/home/zxt20/JieWang2020ICASSP/speechflow_plus-grl11/pre-trained-model/wave_netcheckpoint_step001000000_ema.pth"
)  # pretrained WaveNet vocoder
model.load_state_dict(checkpoint["state_dict"])

for spect in spect_vc:
    name = spect[0]
    c = spect[1]
    print(name)
    waveform = wavegen(model, c=c)
    librosa.output.write_wav('results_L1/' + name + '.wav', waveform, sr=16000)

# %%
if in_path[-1] == str(os.sep):
    in_path = in_path[:-1]

model = build_model().to(device)
model.load_state_dict(checkpoint["state_dict"])

wav_paths = [in_path + os.sep + "{}".format(fi)
             for fi in os.listdir(in_path) if ".wav" in fi]
out_dir = in_path + "_mel"
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

for wp in wav_paths:
    print("Saving mels for {}".format(wp))
    _process_utterance(wp, out_dir)

mel_dir = out_dir
wav_out_dir = mel_dir + "_wavenet_render"
if not os.path.exists(wav_out_dir):
    os.mkdir(wav_out_dir)

sample_rate = 22050
mel_paths = [mel_dir + os.sep + "{}".format(fi)
             for fi in os.listdir(mel_dir) if "mel" in fi]
for mel_path in mel_paths:
    c = np.load(mel_path)
    if c.shape[1] != hparams.num_mels:
        # np.swapaxes does not modify in place; the result must be assigned back
        c = np.swapaxes(c, 0, 1)
    waveform = wavegen(model, c=c, fast=True, tqdm=tqdm)
    fname = mel_path.split(os.sep)[-1].split(".")[0]
    fpath = wav_out_dir + str(os.sep) + '{}.wav'.format(fname)
    wavfile.write(fpath, sample_rate, waveform)
    print("Saved HD audio {}".format(fpath))
for j in range(len(all_spmels)):
    plt.subplot(1, len(all_spmels), j + 1)
    if j == 0:
        plt.title('original_' + name)
    elif j == 1:
        plt.title('resynthOrg_' + name)
    else:
        try:
            plt.title(name + '_to_' + str(style_names[j - num_unconv_styles]))
        except:
            pdb.set_trace()
    plt.imshow(np.rot90(all_spmels[j]))
plt.savefig(subdir_for_wavs + '/example' + str(counter) + '_spmels')

# synthesize a waveform for every spectrogram
for k, spmel in enumerate(all_spmels):
    # x_identic_psnt = tensor.squeeze(0).squeeze(0).detach().cpu().numpy()
    waveform = wavegen(model, config.which_cuda, c=spmel)
    # librosa.output.write_wav(name + '.wav', waveform, sr=16000)
    # if k == 0:
    #     sf.write(subdir_for_wavs + f'/example{counter}_{name}_ORG.wav', waveform, samplerate=16000)
    if k == 0:
        sf.write(subdir_for_wavs + f'/example{counter}_{name}_synthed_from_org.wav',
                 waveform, samplerate=16000)
    else:
        sf.write(subdir_for_wavs + f'/example{counter}_{name}_to_{style_names[k-1]}.wav',
                 waveform, samplerate=16000)
counter += 2
# if not os.path.exists('results_p'):
#     os.makedirs('results_p')
if not os.path.exists('results_nop_1_rhym'):
    os.makedirs('results_nop_1_rhym')

model = build_model().to(device)
# checkpoint = torch.load("/home/jie-wang19/Speechsplitexp/speech_split_baseline/Base_origin/pre-trained-model/checkpoint_step001000000_ema.pth")
checkpoint = torch.load(
    "/datapool/home/zxt20/JieWang2020ICASSP/SpeechFlow-master_ordin/pre-trained-model/wave_netcheckpoint_step001000000_ema.pth"
)  # pretrained WaveNet vocoder
model.load_state_dict(checkpoint["state_dict"])

# i = 0
# for spect in spect_vc:
#     # i += 1
#     name = spect[0]
#     c = spect[1]
#     # waveform = audio.inv_mel_spectrogram(c.T, hparams)
#     print(name)
#     waveform = wavegen(model, c=c)
#     librosa.output.write_wav('results_p/' + name + '.wav', waveform, sr=16000)

for sp in spect_vc_NOP:
    nn = sp[0]
    c = sp[1]
    print(nn)
    waveform_NOP = wavegen(model, c=c)
    librosa.output.write_wav('results_nop_1_rhym/' + nn + '.wav', waveform_NOP, sr=16000)

# %%
def vocode_spec(spec, model, out_name):
    waveform = wavegen(model, c=spec)
    librosa.output.write_wav(out_name, waveform, sr=16000)
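# Hypothetical usage, assuming a WaveNet model loaded via build_model() /
# load_state_dict as in the snippets above and a mel `spec` shaped
# (time, n_mels):
# vocode_spec(spec, model, 'out.wav')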
def step(self, spect):
    waveform = wavegen(self.model, c=spect)
    return waveform
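# Sketch of driving this step method; `Vocoder` is a hypothetical owner class
# that stores a loaded WaveNet in self.model:
# vocoder = Vocoder()
# wav = vocoder.step(mel)  # mel: (time, n_mels) array
# sf.write('step_out.wav', wav, 16000)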
f0_onehot = torch.from_numpy(f0_onehot).to(device)

# Concatenate the pitch contour along the frequency axis (columns)
S = S[np.newaxis, :192, :]
S, _ = pad_seq_to_2(S, 192)
uttr = torch.from_numpy(S.astype(np.float32)).to(device)
# f0_onehot = torch.zeros_like(f0_onehot)
uttr_f0 = torch.cat((uttr, f0_onehot), dim=-1)

# Generate back from components
emb = torch.zeros(1, 82).to(device)
print(uttr_f0.shape, uttr.shape, emb.shape)
# uttr_f0 = torch.zeros_like(uttr_f0)
out = G(uttr_f0, uttr, emb)

# Synthesize wav back
model = build_model().to(device)
checkpoint = torch.load("assets/checkpoint_step001000000_ema.pth", map_location=device)
model.load_state_dict(checkpoint["state_dict"])
print(out.shape)
waveform = wavegen(model, c=out.squeeze().cpu())
# librosa.output.write_wav('results/' + name + '.wav', waveform, sr=16000)
sf.write('results/back_synthesized-zeros-pitch.wav', waveform, 16000, subtype='PCM_24')