def speaker_f0(wav, sr=24000, lo=100, hi=600):
    """Return the speaker-normalised F0 track of ``wav``.

    RAPT is run with a 256-sample hop and ``otype=2`` (log-F0 output per the
    pysptk documentation). Frames whose value equals -1e10 are left out of
    the mean / standard-deviation statistics. The raw track, the frame mask
    and those statistics are handed to ``speaker_normalization`` and its
    result is returned unchanged.

    Args:
        wav: Array-like with an ``astype`` method; it is cast to float32 and
            scaled by 32768 before analysis.
        sr: Sampling rate passed to RAPT.
        lo: Passed to RAPT as ``min``.
        hi: Passed to RAPT as ``max``.

    Returns:
        Whatever ``speaker_normalization`` returns for this track.
    """
    # Scale up to the 16-bit sample range before pitch tracking.
    scaled = wav.astype(np.float32) * 32768
    f0_rapt = sptk.rapt(scaled, sr, 256, min=lo, max=hi, otype=2)

    # -1e10 is the value this code treats as "no pitch for this frame".
    pitched = f0_rapt != -1e10
    pitched_values = f0_rapt[pitched]

    return speaker_normalization(
        f0_rapt,
        pitched,
        np.mean(pitched_values),
        np.std(pitched_values),
    )
def get_f0(self, audio, f0_mean=None, f0_var=None, sampling_rate=22050,
           frame_length=1024, hop_length=256, f0_min=80, f0_max=880,
           harm_thresh=0.25, mel_fmin=70.0):
    """Extract an F0 track with RAPT and standardise its non-zero frames.

    The track is computed with ``otype=2`` and clipped to ``[0, f0_max]``.
    Frames that are zero after clipping end up at ``-10.0``; every other
    frame is replaced by ``(value - f0_mean) / f0_var``.

    Args:
        audio: Signal to analyse; it is multiplied by 32768 before RAPT.
        f0_mean: Mean used for standardisation. When ``None`` it is computed
            from the non-zero frames of this track.
        f0_var: Divisor used for standardisation. When ``None`` it is the
            standard deviation (``np.std``) of the non-zero frames, despite
            the parameter name.
        sampling_rate: Sampling rate passed to RAPT.
        frame_length: Unused; kept so existing callers keep working (the
            function previously wrapped a ``compute_yin`` call taking it).
        hop_length: Hop size passed to RAPT.
        f0_min: Passed to RAPT as ``min``.
        f0_max: Passed to RAPT as ``max`` and used as the upper clip bound.
        harm_thresh: Unused; kept for signature compatibility.
        mel_fmin: Unused; kept for signature compatibility.

    Returns:
        The processed F0 array.
    """
    f0 = sptk.rapt(audio * 32768, sampling_rate, hop_length,
                   min=f0_min, max=f0_max, otype=2)
    f0 = np.clip(f0, 0, f0_max)

    index_nonzero = np.nonzero(f0)
    # Net effect of the next two lines: non-zero frames are unchanged,
    # zero frames become -10.0.
    f0[index_nonzero] += 10.0
    f0 -= 10.0

    # Identity checks: ``== None`` is the wrong test for "argument omitted"
    # and misbehaves if an array is ever passed in.
    if f0_mean is None:
        f0_mean = np.mean(f0[index_nonzero])
    if f0_var is None:
        f0_var = np.std(f0[index_nonzero])

    f0[index_nonzero] = (f0[index_nonzero] - f0_mean) / f0_var
    return f0
def extract_f0_func_audiofile(audio_file, gender='M'):
    """Compute a normalised mel spectrogram and normalised F0 for one file.

    Args:
        audio_file: Path (or file object) readable by ``sf.read``.
        gender: ``'M'`` selects a 50-250 pitch search range, ``'F'`` selects
            100-600.

    Returns:
        ``(S, f0_norm)``: ``S`` is the dB mel spectrogram mapped through
        ``(dB + 100) / 100``; ``f0_norm`` is the output of
        ``speaker_normalization`` for the RAPT track.

    Raises:
        ValueError: If ``gender`` is neither ``'M'`` nor ``'F'``.
    """
    # Validate first so a bad argument fails before any filter design or I/O.
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(
            "gender must be 'M' or 'F', got {!r}".format(gender))

    # NOTE(review): the mel basis and high-pass filter are built for 16000 Hz
    # while RAPT below uses the file's own rate `fs` — confirm inputs are
    # always 16 kHz.
    mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)
    prng = RandomState(0)

    x, fs = sf.read(audio_file)
    if x.ndim >= 2:
        # Multi-channel input: keep the first channel only.
        x = x[:, 0]
    if x.shape[0] % 256 == 0:
        # Append one near-silent sample when the length is an exact multiple
        # of 256 (presumably to keep the STFT and RAPT frame counts equal).
        x = np.concatenate((x, np.array([1e-06])), axis=0)

    y = signal.filtfilt(b, a, x)
    # Attenuate slightly and add a tiny deterministic dither (seed 0).
    wav = y * 0.95 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

    # Mel spectrogram in dB, offset by 16 and mapped through (dB + 100) / 100.
    D = pySTFT(wav).T
    D_mel = np.dot(D, mel_basis)
    D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
    S = (D_db + 100) / 100

    # F0 track; frames equal to -1e10 are excluded from the statistics.
    f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256,
                        min=lo, max=hi, otype=2)
    index_nonzero = (f0_rapt != -1e10)
    tmp = f0_rapt[index_nonzero]
    mean_f0, std_f0 = np.mean(tmp), np.std(tmp)
    f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0)

    return S, f0_norm
def get_f0(audio, sampling_rate, frame_length, hop_length, f0_min, f0_max,
           harm_thresh):
    """Run RAPT on ``audio`` and clip the resulting track to ``[0, f0_max]``.

    Args:
        audio: Signal to analyse; it is multiplied by 32768 before RAPT.
        sampling_rate: Sampling rate passed to RAPT.
        frame_length: Accepted but not used by this implementation.
        hop_length: Hop size passed to RAPT.
        f0_min: Passed to RAPT as ``min``.
        f0_max: Passed to RAPT as ``max``; also the upper clip bound.
        harm_thresh: Accepted but not used by this implementation.

    Returns:
        The clipped F0 array.
    """
    raw_track = sptk.rapt(
        audio * 32768,
        sampling_rate,
        hop_length,
        min=f0_min,
        max=f0_max,
        otype=2,
    )
    # Anything below zero is raised to 0, anything above f0_max is capped.
    return np.clip(raw_track, 0, f0_max)
def get_f0_noisy(wav, duration=None):
    """Estimate F0 with RAPT using the settings stored in ``hparams``.

    RAPT is asked for log-F0 (``otype=2``); the track is optionally truncated
    to ``sum(duration)`` frames and then exponentiated.

    Args:
        wav: Array-like with an ``astype`` method; it is cast to float32 and
            scaled by ``hparams.max_wav_value``.
        duration: Optional iterable of per-unit frame counts. When given,
            only the first ``sum(duration)`` frames are kept.

    Returns:
        ``np.exp`` of the (possibly truncated) log-F0 track.
    """
    scaled = wav.astype(np.float32) * hparams.max_wav_value
    # NOTE(review): `hparams.encoder_hidden` is passed in RAPT's hop-size
    # position — confirm this is intentional and not meant to be a hop length.
    log_f0 = sptk.rapt(
        scaled,
        hparams.sampling_rate,
        hparams.encoder_hidden,
        min=hparams.f0_min,
        max=hparams.f0_max,
        otype=2,
    )

    if duration is not None:
        log_f0 = log_f0[:sum(duration)]

    return np.exp(log_f0)
def _processing_data(hparams, full_path, spk_label, spk_emb, gender, npz_name, pbar, i): if gender == 'M': lo, hi = 50, 250 elif gender == 'F': lo, hi = 100, 600 else: raise ValueError prng = RandomState(int(random.random())) x, fs = librosa.load(full_path, sr=hparams.sample_rate) assert fs == hparams.sample_rate if x.shape[0] % hparams.hop_size == 0: x = np.concatenate((x, np.array([1e-06])), axis=0) y = signal.filtfilt(b, a, x) wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06 # compute spectrogram D = pySTFT(wav).T D_mel = np.dot(D, mel_basis) D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - hparams.ref_level_db S = (D_db + 100) / 100 # extract f0 f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, hparams.hop_size, min=lo, max=hi, otype=2) index_nonzero = (f0_rapt != -1e10) mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std( f0_rapt[index_nonzero]) f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0) assert len(S) == len(f0_rapt) data = { 'mel': S.astype(np.float32), 'f0': f0_norm.astype(np.float32), 'spk_label': spk_label } if spk_emb is not None: data['spk_emb'] = spk_emb np.savez(npz_name, **data) pbar.update(i)
def Pattern_Generate(path, n_fft: int, num_mels: int, sample_rate: int,
                     hop_size: int, win_size: int, fmin: int, fmax: int,
                     center: bool = False, top_db=60):
    """Load one audio file and derive spectrogram, mel and log-F0 features.

    The audio is loaded at ``sample_rate``, trimmed with
    ``librosa.effects.trim``, normalised and scaled by 0.95, then cut to a
    whole number of hops. A length mismatch between the log-F0 track and the
    mel frames is only reported with ``print``; nothing is corrected.

    Args:
        path: Audio file to load.
        n_fft, hop_size, win_size, center: Forwarded to ``spectrogram`` and
            ``mel_spectrogram``.
        num_mels, fmin, fmax: Forwarded to ``mel_spectrogram``.
        sample_rate: Load rate; also passed to ``mel_spectrogram`` and RAPT.
        top_db: Threshold forwarded to ``librosa.effects.trim``.

    Returns:
        ``(audio, spect, mel, log_f0)``.
    """
    audio, _ = librosa.load(path, sr=sample_rate)
    trimmed = librosa.effects.trim(
        audio, top_db=top_db, frame_length=512, hop_length=256)
    audio = librosa.util.normalize(trimmed[0]) * 0.95

    # Drop trailing samples that do not fill a whole hop.
    whole_hops = audio.shape[0] - (audio.shape[0] % hop_size)
    audio = audio[:whole_hops]

    def _as_batch():
        # Float tensor of the audio with one leading dimension added.
        return torch.from_numpy(audio).float().unsqueeze(0)

    spect = spectrogram(
        y=_as_batch(),
        n_fft=n_fft,
        hop_size=hop_size,
        win_size=win_size,
        center=center,
    ).squeeze(0).T.numpy()

    mel = mel_spectrogram(
        y=_as_batch(),
        n_fft=n_fft,
        num_mels=num_mels,
        sampling_rate=sample_rate,
        hop_size=hop_size,
        win_size=win_size,
        fmin=fmin,
        fmax=fmax,
        center=center,
    ).squeeze(0).T.numpy()

    # NOTE(review): rapt runs with its default pitch range and output type;
    # if it reports unvoiced frames as 0, np.log turns them into -inf —
    # confirm downstream code expects that.
    log_f0 = np.log(rapt(x=audio * 32768, fs=sample_rate, hopsize=hop_size))

    if log_f0.shape[0] != mel.shape[0]:
        print(path, audio.shape[0], log_f0.shape[0], mel.shape[0])

    return audio, spect, mel, log_f0
def extract_f0(wav, fs):
    """Turn a waveform into a quantised F0 tensor on ``device``.

    ``lo``, ``hi`` and ``device`` are not parameters; they are read from an
    enclosing scope. The RAPT track is speaker-normalised, quantised with
    ``quantize_f0_numpy``, given a leading axis, and padded with
    ``pad_seq_to_2(..., 192)`` when its second dimension is at most 192.

    Args:
        wav: Array-like with an ``astype`` method; cast to float32 and scaled
            by 32768 before analysis.
        fs: Sampling rate passed to RAPT.

    Returns:
        A torch tensor built from the quantised array and moved to ``device``.
    """
    scaled = wav.astype(np.float32) * 32768
    f0_rapt = sptk.rapt(scaled, fs, 256, min=lo, max=hi, otype=2)

    # Frames equal to -1e10 are excluded from the normalisation statistics.
    pitched = f0_rapt != -1e10
    pitched_values = f0_rapt[pitched]
    f0_norm = speaker_normalization(
        f0_rapt, pitched, np.mean(pitched_values), np.std(pitched_values))

    # Keep only the first return value and prepend a new axis.
    f0_onehot = quantize_f0_numpy(f0_norm)[0][np.newaxis, :, :]
    print(f0_onehot.shape)

    if f0_onehot.shape[1] <= 192:
        f0_onehot, _ = pad_seq_to_2(f0_onehot, 192)

    return torch.from_numpy(f0_onehot).to(device)
def extract_f0_func(gender):
    """Batch-extract mel spectrograms and normalised F0 for a dataset tree.

    Every entry of ``<ROOT>/audio`` is treated as a sub-directory. For each
    file in it, a mel spectrogram is saved under ``<ROOT>/mel-sp/<subdir>/``
    and a speaker-normalised F0 track under ``<ROOT>/f0/<subdir>/``, both as
    float32 arrays named after the file without its extension.

    Args:
        gender: ``'M'`` selects a 50-250 pitch search range, ``'F'`` selects
            100-600. The same range is used for every file.

    Raises:
        ValueError: If ``gender`` is neither ``'M'`` nor ``'F'``.
    """
    # The pitch range does not depend on the file, so resolve and validate it
    # once, before any directory is created.
    if gender == 'M':
        lo, hi = 50, 250
    elif gender == 'F':
        lo, hi = 100, 600
    else:
        raise ValueError(
            "gender must be 'M' or 'F', got {!r}".format(gender))

    mel_basis = mel(16000, 1024, fmin=90, fmax=7600, n_mels=80).T
    min_level = np.exp(-100 / 20 * np.log(10))
    b, a = butter_highpass(30, 16000, order=5)

    # Set the directory you want to start from
    ROOT = r'E:\Dataset\VCTK\test_audio'
    rootDir = os.path.join(ROOT, 'audio')
    targetDir_f0 = os.path.join(ROOT, 'f0')
    targetDir = os.path.join(ROOT, 'mel-sp')

    # NOTE(review): glob.glob1 is an undocumented helper that newer Python
    # releases deprecate; consider glob.glob(..., root_dir=...) when the
    # minimum supported Python version allows it.
    pt = glob.glob1(rootDir, '*')

    dirName, _, _ = next(os.walk(rootDir))
    print('Found directory: %s' % dirName)

    for subdir in sorted(pt):
        print(subdir)
        # exist_ok avoids the check-then-create race of an explicit
        # os.path.exists test.
        os.makedirs(os.path.join(targetDir, subdir), exist_ok=True)
        os.makedirs(os.path.join(targetDir_f0, subdir), exist_ok=True)
        _, _, fileList = next(os.walk(os.path.join(dirName, subdir)))

        # The dither generator is re-seeded for every sub-directory.
        prng = RandomState(0)
        for fileName in sorted(fileList):
            print(subdir, fileName)
            x, fs = sf.read(os.path.join(dirName, subdir, fileName))
            if x.ndim >= 2:
                # Multi-channel input: keep the first channel only.
                x = x[:, 0]
            if x.shape[0] % 256 == 0:
                # Append one near-silent sample when the length is an exact
                # multiple of 256.
                x = np.concatenate((x, np.array([1e-06])), axis=0)

            y = signal.filtfilt(b, a, x)
            wav = y * 0.95 + (prng.rand(y.shape[0]) - 0.5) * 1e-06

            # Mel spectrogram in dB, offset by 16, mapped via (dB + 100) / 100.
            D = pySTFT(wav).T
            D_mel = np.dot(D, mel_basis)
            D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16
            S = (D_db + 100) / 100

            # F0 track; frames equal to -1e10 are excluded from the stats.
            f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256,
                                min=lo, max=hi, otype=2)
            index_nonzero = (f0_rapt != -1e10)
            tmp = f0_rapt[index_nonzero]
            mean_f0, std_f0 = np.mean(tmp), np.std(tmp)
            f0_norm = speaker_normalization(
                f0_rapt, index_nonzero, mean_f0, std_f0)

            if len(S) != len(f0_norm):
                # NOTE(review): this drops into the interactive debugger and
                # will stall an unattended run — consider raising or logging.
                pdb.set_trace()

            # splitext removes the real extension whatever its length; the
            # previous fileName[:-4] was only right for 3-letter extensions.
            stem = os.path.splitext(fileName)[0]
            np.save(os.path.join(targetDir, subdir, stem),
                    S.astype(np.float32), allow_pickle=False)
            np.save(os.path.join(targetDir_f0, subdir, stem),
                    f0_norm.astype(np.float32), allow_pickle=False)
            print(S.shape)
            print(f0_norm.shape)
# assert fs == 16000 if x.shape[0] % 256 == 0: x = np.concatenate((x, np.array([1e-06])), axis=0) y = signal.filtfilt(b, a, x) wav = y * 0.96 + (prng.rand(y.shape[0]) - 0.5) * 1e-06 # compute spectrogram D = pySTFT(wav).T D_mel = np.dot(D, mel_basis) D_db = 20 * np.log10(np.maximum(min_level, D_mel)) - 16 S = (D_db + 100) / 100 # extract f0 f0_rapt = sptk.rapt(wav.astype(np.float32) * 32768, fs, 256, min=lo, max=hi, otype=2) index_nonzero = (f0_rapt != -1e10) mean_f0, std_f0 = np.mean(f0_rapt[index_nonzero]), np.std( f0_rapt[index_nonzero]) f0_norm = speaker_normalization(f0_rapt, index_nonzero, mean_f0, std_f0) assert len(S) == len(f0_rapt) np.save(os.path.join(targetDir, basename), S.astype(np.float32), allow_pickle=False) np.save(os.path.join(targetDir_f0, basename), f0_norm.astype(np.float32),
[fs, sig] = util.WavRead(args.i) sig = sig.astype('float32') if not args.isolate: x_axis = np.arange(0, len(sig) / fs, 1 / fs) plot.subplot(211) plot.plot(x_axis, sig) plot.grid() plot.title(args.i.split('/')[-1] + ' Signal') plot.xlabel('Time (s)') plot.ylabel('Amplitude') plot.subplot(212) fmin = float(args.fmin) fmax = float(args.fmax) pitch = sptk.rapt(sig, fs, 250, float(120), float(400)) x_axis = np.linspace(0, len(sig) / fs, len(pitch)) if args.smooth: pitch = smooth_pitch(pitch) plot.title(args.i.split('/')[-1] + ' Pitch Contour(Smoothed)') else: plot.title(args.i.split('/')[-1] + ' Pitch Contour') plot.plot(x_axis, pitch) plot.ylim([fmin, fmax]) plot.grid() if args.mean: length = 0 add = 0
def synthesize_with_reference(idx_info, name, noisy_input, audio_path,
                              tg_path, speaker_id, inspection):
    """Synthesize every entry of ``sentences`` using one reference utterance.

    Steps: load (or predict) the speaker embedding, create the output
    directory, extract F0 / energy / mel from the reference audio, plot the
    reference, then call ``synthesize`` once per sentence.

    ``model``, ``vocoder``, ``step``, ``device``, ``sentences``, ``args`` and
    ``speaker_embedder`` are read from an enclosing scope.

    Args:
        idx_info: Indexable pair; ``idx_info[0] + 1`` and ``idx_info[1]`` are
            printed as the progress counter.
        name: Utterance name; used in output paths and, when ``speaker_id``
            is ``None``, split on ``"_"`` to guess the speaker id.
        noisy_input: When no TextGrid exists, truthy selects RAPT for F0 and
            falsy selects ``pw.dio``. It is also forwarded to
            ``get_processed_data_from_wav``.
        audio_path: Reference ``.wav`` path; the transcript is read from the
            same path with ``.wav`` replaced by ``.txt``.
        tg_path: TextGrid path. If it is not an existing file, features are
            computed directly from the waveform.
        speaker_id: Speaker whose stored embedding is loaded, or ``None``.
        inspection: Forwarded to ``synthesize``.
    """
    global model, vocoder, step
    start_time = time.perf_counter()

    def _embed_path(spk):
        # Path of the stored embedding file for speaker `spk`.
        return os.path.join(
            hp.preprocessed_path, "spker_embed",
            "{}-spker_embed-{}.npy".format(hp.dataset, spk))

    # Prepare Reference Data
    if speaker_id is not None:
        speaker_embed = torch.from_numpy(
            np.load(_embed_path(speaker_id))).to(device)
    else:
        try:
            # VCTK fileformat
            speaker_id = name.split("_")[0]
            speaker_embed = torch.from_numpy(
                np.load(_embed_path(speaker_id))).to(device)
        except Exception:
            # General cases. `Exception` rather than a bare except, so that
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            # NOTE(review): unlike the branches above, this embedding is not
            # moved to `device` — confirm that is intended.
            speaker_id = None
            speaker_embed = torch.from_numpy(
                embedding.predict_embedding(speaker_embedder, audio_path))

    # Outdir
    outdir = os.path.join(
        hp.test_path(), "{}_by_{}_{}".format(name, speaker_id, step))
    os.makedirs(outdir, exist_ok=True)

    text = utils.get_transcript(audio_path.replace(".wav", ".txt"))

    if not os.path.isfile(tg_path):
        # No alignment available: derive the features from the raw waveform.
        tg_path = "NO TextGrid"
        _, wav = read(audio_path)
        if noisy_input:
            # NOTE(review): `hp.encoder_hidden` is passed in RAPT's hop-size
            # position while the dio branch uses `hp.hop_length` — confirm.
            f0 = sptk.rapt(wav.astype(np.float32) * hp.max_wav_value,
                           hp.sampling_rate, hp.encoder_hidden,
                           min=hp.f0_min, max=hp.f0_max, otype=2)  # log f0
            f0 = np.exp(f0)
        else:
            f0, _ = pw.dio(
                wav.astype(np.float64), hp.sampling_rate,
                frame_period=hp.hop_length / hp.sampling_rate * 1000)
        mel, energy, _ = Audio.tools.get_mel_from_wav(
            torch.FloatTensor(np.array(wav)))
        mel = mel.T.numpy().astype(np.float32)
        energy = energy.numpy().astype(np.float32)
    else:
        f0, energy, mel = get_processed_data_from_wav(
            audio_path, tg_path, noisy_input)

    # Both branches produced the same plot call; it is issued once here.
    utils.plot_data(
        [(mel.T, f0, energy)], ['Reference Spectrogram'],
        filename=os.path.join(
            outdir, '{}_{}_{}.png'.format("Reference", name, text[:100])))

    # Prepare Audio Inputs
    energy = (energy - hp.energy_min) / (hp.energy_max - hp.energy_min)
    f0_norm = utils.speaker_normalization(f0)
    mel, mel_len, energy, f0, f0_norm = preprocess_audio(
        mel, energy, f0, f0_norm)

    print("\n\n---------------- [{}/{}]: {} ----------------".format(
        idx_info[0] + 1, idx_info[1], audio_path.split('/')[-1]))
    print('Audio Path:', audio_path)
    print('TextGrid Path:', tg_path)
    print('Speaker ID:', speaker_id)

    # Synthesize
    success = 0
    for sentence in sentences:
        text = preprocess_text(sentence)
        synthesize(outdir, model, vocoder, text, sentence, speaker_embed,
                   speaker_id, inspection, mel, mel_len, f0, f0_norm, energy,
                   args.duration_control, args.pitch_control,
                   args.energy_control)
        success += 1

    print("Synthesized {} out of {} in {:.3f}s".format(
        success, len(sentences), time.perf_counter() - start_time))