def prepare_test(self, dim_spec, source_path=None, target_path=None):
    if source_path is None:
        source_path = "/mnt/lustre/dengkangle/cmu/datasets/audio/test/trump_02.wav"
    if target_path is None:
        target_path = "/mnt/lustre/dengkangle/cmu/datasets/audio/test/female.wav"
    # source_path = "/home/kangled/datasets/audio/Chaplin_01.wav"
    # target_path = "/home/kangled/datasets/audio/Obama_01.wav"
    mel_basis80 = librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=80)

    # Source: wav -> pre-emphasized linear spectrogram -> normalized mel
    wav, sr = librosa.load(source_path, hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                     win_length=hparams.win_size))
    mel_spec = mel_basis80.dot(linear_spec)
    mel_db = 20 * np.log10(mel_spec)
    source_spec = np.clip((mel_db + 120) / 125, 0, 1)  # map roughly [-120, 5] dB to [0, 1]
    # source_spec = mel_spec
    # one-hot speaker embeddings: source = [0, 1], target = [1, 0]
    self.source_embed = torch.from_numpy(np.array([0, 1])).float().unsqueeze(0)
    self.source_wav = wav

    # Target: same recipe
    wav, sr = librosa.load(target_path, hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                     win_length=hparams.win_size))
    mel_spec = mel_basis80.dot(linear_spec)
    mel_db = 20 * np.log10(mel_spec)
    target_spec = np.clip((mel_db + 120) / 125, 0, 1)
    # target_spec = mel_spec
    self.target_embed = torch.from_numpy(np.array([1, 0])).float().unsqueeze(0)
    self.target_wav = wav

    # Pad the frame axis to a multiple of hparams.freq and add a batch dimension
    self.source_spec = torch.Tensor(pad_seq(source_spec.T, hparams.freq)).unsqueeze(0)
    self.target_spec = torch.Tensor(pad_seq(target_spec.T, hparams.freq)).unsqueeze(0)
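# pad_seq is not defined in this excerpt. A minimal sketch, assuming it pads
# a (frames, mels) array along the frame axis to a multiple of `base`
# (hparams.freq) and returns only the padded array, matching the call sites
# in this file:
def pad_seq(x, base=32):
    len_pad = (base - x.shape[0] % base) % base  # frames needed to reach a multiple of base
    return np.pad(x, ((0, len_pad), (0, 0)), mode='constant')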
def __init__(self, npy_path=None, wav_path=None, speaker_id=0, speaker_nums=2,
             sample_frames=128, length=-1):
    super(SampleDataset, self).__init__()
    if npy_path is not None:
        self.raw_data = np.load(npy_path)
        print('Loading ', npy_path, "\tshape:", self.raw_data.shape)
    elif wav_path is not None:
        print('Encoding ', wav_path)
        wav, sr = librosa.load(wav_path, hparams.sample_rate)
        wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
        wav = wav / (np.abs(wav).max() * 1.1)  # peak-normalize with ~10% headroom
        self.wav = audio.encode_mu_law(wav, mu=2 ** hparams.bits)
        mel_basis = librosa.filters.mel(hparams.sample_rate, hparams.n_fft,
                                        n_mels=hparams.num_mels)
        linear_spec = np.abs(
            librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                         win_length=hparams.win_size))
        mel_spec = mel_basis.dot(linear_spec)
        mel_db = 20 * np.log10(mel_spec)
        # print(in_fpath, mel_db.min(), mel_db.max())
        self.raw_data = np.clip((mel_db + 120) / 125, 0, 1)
        print('Raw_Data Shape:', self.raw_data.shape)  # (num_mels, num_frames)
    else:
        raise ValueError("No data input: provide npy_path or wav_path")
    # one-hot speaker embedding
    self.speaker = np.zeros(speaker_nums)
    self.speaker[speaker_id % speaker_nums] = 1
    self.sample_frames = sample_frames
    if length > 0:
        self.length = length
    else:
        self.length = max(self.raw_data.shape[1] // sample_frames, 50 * 32)
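# audio.encode_mu_law comes from the repo's audio module, which is not shown
# here. A minimal sketch, assuming it is the usual WaveRNN-style mu-law
# companding plus quantization (mu = 2 ** hparams.bits):
def encode_mu_law(x, mu):
    mu = mu - 1
    fx = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1 + mu)  # compand to [-1, 1]
    return np.floor((fx + 1) / 2 * mu + 0.5).astype(np.int64)      # quantize to [0, mu]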
def voice_conversion(G, input_wavfile, parallel=True):
    source_path = input_wavfile
    wav, sr = librosa.load(source_path, hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                     win_length=hparams.win_size))
    mel_basis = librosa.filters.mel(hparams.sample_rate, hparams.n_fft,
                                    n_mels=hparams.num_mels)
    mel_spec = mel_basis.dot(linear_spec)
    mel_db = 20 * np.log10(mel_spec)
    source_spec = np.clip((mel_db + 120) / 125, 0, 1)
    # one-hot speaker embeddings: source = [0, 1], target = [1, 0]
    source_embed = torch.from_numpy(np.array([0, 1])).float()
    target_embed = torch.from_numpy(np.array([1, 0])).float()
    source_spec = pad_seq(source_spec.T, hparams.freq)
    with torch.no_grad():
        s2t_spec = G.conversion(source_embed.unsqueeze(0),
                                target_embed.unsqueeze(0),
                                torch.Tensor(source_spec).unsqueeze(0),
                                device).cpu()
    if parallel:
        # batched vocoding: 8000-sample folds with 800-sample overlap
        s2t_wav = G.vocoder.generate(s2t_spec.transpose(1, 2), True, 8000, 800, mu_law=True)
    else:
        s2t_wav = G.vocoder.generate(s2t_spec.transpose(1, 2), False, None, None, mu_law=True)
    s2t_wav = inv_preemphasis(s2t_wav, hparams.preemphasis, hparams.preemphasize)
    # librosa.output was removed in librosa 0.8; this call requires librosa < 0.8
    librosa.output.write_wav(args.output_file, s2t_wav.astype(np.float32), hparams.sample_rate)
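# preemphasis / inv_preemphasis are imported from elsewhere in the repo. A
# minimal sketch of the standard first-order filters they are assumed to
# implement, with the (wav, k, flag) signature used above
# (k = hparams.preemphasis, typically 0.97):
from scipy.signal import lfilter

def preemphasis(wav, k, preemphasize=True):
    # y[t] = x[t] - k * x[t-1]: boosts high frequencies before analysis
    return lfilter([1, -k], [1], wav) if preemphasize else wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    # inverse IIR filter restores the original spectral tilt after vocoding
    return lfilter([1], [1, -k], wav) if inv_preemphasize else wav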
# Remap checkpoint keys to the current model's naming before loading
for key in list(newdict.keys()):
    newkey = key
    if 'wavenet' in key:
        newdict[key.replace('wavenet', 'vocoder')] = newdict.pop(key)
        newkey = key.replace('wavenet', 'vocoder')
    if not args.multigpu and 'module' in key:
        newdict[newkey.replace('module.', '', 1)] = newdict.pop(newkey)
        newkey = newkey.replace('module.', '', 1)
    if newkey not in G.state_dict():
        newdict.pop(newkey)  # drop keys the current model does not expect
G.load_state_dict(newdict)
print("AutoVC Model Loaded")

wav, sr = librosa.load(args.wav_path, hparams.sample_rate)
mel_basis = librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=hparams.num_mels)
wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
linear_spec = np.abs(
    librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                 win_length=hparams.win_size))
mel_spec = mel_basis.dot(linear_spec)
mel_db = 20 * np.log10(mel_spec)
# print(in_fpath, mel_db.min(), mel_db.max())
test_data = np.clip((mel_db + 120) / 125, 0, 1)
test_data = torch.Tensor(pad_seq(test_data.T, hparams.freq)).unsqueeze(0).to(device)
speaker = torch.from_numpy(np.array([0, 1])).float()
with torch.no_grad():
    mel_outputs_postnet, v_stage1, v_stage2 = G.generate(test_data, speaker, device)
print(v_stage2.shape)
# list of (H, W, C) frames for GIF export
gif_list = list(v_stage2.squeeze().cpu().numpy().transpose(0, 2, 3, 1))
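# Hypothetical export of gif_list to disk; assumes the frames are in [-1, 1]
# as in test_audiovideo, and uses imageio, which is not part of this repo:
import imageio
frames = [(((f + 1) / 2) * 255).clip(0, 255).astype(np.uint8) for f in gif_list]
imageio.mimsave('generated.gif', frames)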
def test_audiovideo(self, device, writer, niter):
    source_path = self.test_path
    mel_basis80 = librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=80)
    wav, sr = librosa.load(source_path, hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                     win_length=hparams.win_size))
    mel_spec = mel_basis80.dot(linear_spec)
    mel_db = 20 * np.log10(mel_spec)
    source_spec = np.clip((mel_db + 120) / 125, 0, 1)
    source_embed = torch.from_numpy(np.array([0, 1])).float().unsqueeze(0)
    source_wav = wav
    source_spec = torch.Tensor(pad_seq(source_spec.T, hparams.freq)).unsqueeze(0)
    # print(source_spec.shape)
    with torch.no_grad():
        generated_spec, v_mid, v_hat = self.generate(source_spec, source_embed, device)
        generated_spec, v_mid, v_hat = generated_spec.cpu(), v_mid.cpu(), v_hat.cpu()
    print("Generating Wavfile...")
    with torch.no_grad():
        # DataParallel wraps the vocoder in .module when running multi-GPU
        vocoder = self.vocoder if not self.multigpu else self.vocoder.module
        generated_wav = inv_preemphasis(
            vocoder.generate(generated_spec.to(device).transpose(2, 1),
                             False, None, None, mu_law=True),
            hparams.preemphasis, hparams.preemphasize)
    # videos are in [-1, 1]; map to [0, 1] for TensorBoard
    writer.add_video('generated', (v_hat.numpy() + 1) / 2, global_step=niter)
    writer.add_video('mid', (v_mid.numpy() + 1) / 2, global_step=niter)
    writer.add_audio('ground_truth', source_wav, niter, sample_rate=hparams.sample_rate)
    writer.add_audio('generated_wav', generated_wav, niter, sample_rate=hparams.sample_rate)
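# The wav -> normalized-mel recipe above is duplicated in every entry point
# of this file. A sketch of a shared helper; the name wav_to_mel is
# illustrative, not part of the repo:
def wav_to_mel(path, n_mels=80):
    wav, _ = librosa.load(path, hparams.sample_rate)
    wav = preemphasis(wav, hparams.preemphasis, hparams.preemphasize)
    linear_spec = np.abs(
        librosa.stft(wav, n_fft=hparams.n_fft, hop_length=hparams.hop_size,
                     win_length=hparams.win_size))
    mel_basis = librosa.filters.mel(hparams.sample_rate, hparams.n_fft, n_mels=n_mels)
    mel_db = 20 * np.log10(np.maximum(mel_basis.dot(linear_spec), 1e-6))  # avoid log(0)
    return wav, np.clip((mel_db + 120) / 125, 0, 1)  # map roughly [-120, 5] dB to [0, 1]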