import numpy as np
import pyworld
import pysptk
from scipy.io import wavfile
from nnmnkwii import preprocessing as P

# audio_world_config is the project-specific configuration module (not shown here).


def get_feature(wav_path, preprocessing=False, getsize=False):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    # F0 extraction: Harvest, or DIO refined by StoneMask
    if audio_world_config.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    # Spectral envelope, aperiodicity, and their compact parameterizations
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=audio_world_config.mgc_dim, alpha=alpha)

    # log-F0 and voiced/unvoiced flags
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if audio_world_config.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=audio_world_config.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if audio_world_config.mod_spec_smoothing:
        hop_length = int(fs * (audio_world_config.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=audio_world_config.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, audio_world_config.windows)
    lf0 = P.delta_features(lf0, audio_world_config.windows)
    bap = P.delta_features(bap, audio_world_config.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    if preprocessing:
        # Save features next to the wavs, swapping the "wav" path component
        # and extension for "world".
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
    elif getsize:
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    else:
        return features
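
# Minimal usage sketch for get_feature() (not part of the original code). It
# assumes the imports above and a populated audio_world_config; the wav path
# is an illustrative placeholder.
if __name__ == "__main__":
    feats = get_feature("data/wav/utt0001.wav")
    print(feats.shape)  # (num_frames, mgc + lf0 + vuv + bap dimensions)

    # With preprocessing=True the features are saved as .npy (the "wav" part
    # of the path is swapped for "world") instead of being returned.
    get_feature("data/wav/utt0001.wav", preprocessing=True)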
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp_acoustic.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # 50 Hz parameter trajectory smoothing
    hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
    modfs = fs / hop_length
    mgc = P.modspec_smoothing(mgc, modfs, cutoff=50)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
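
# A sketch of how a collect_features() like the ones above is typically wired
# into nnmnkwii's dataset API. This wrapper is an assumption about the
# surrounding code, not part of the original snippet: the data source lists
# matching wav/label paths and FileSourceDataset calls collect_features()
# lazily, one utterance at a time.
from glob import glob
from os.path import join

from nnmnkwii.datasets import FileDataSource, FileSourceDataset


class WORLDAcousticSource(FileDataSource):
    """Hypothetical data source pairing wav files with HTS-style label files."""

    def __init__(self, wav_root, label_root):
        self.wav_root = wav_root
        self.label_root = label_root
        self.alpha = None

    def collect_files(self):
        wav_paths = sorted(glob(join(self.wav_root, "*.wav")))
        label_paths = sorted(glob(join(self.label_root, "*.lab")))
        # Returning a tuple of lists makes nnmnkwii pass
        # (wav_path, label_path) pairs to collect_features().
        return wav_paths, label_paths

    # collect_features(self, wav_path, label_path) would be one of the
    # methods defined above.


# Example (paths are placeholders):
# dataset = FileSourceDataset(WORLDAcousticSource("data/wav", "data/label_phone_align"))
# features = dataset[0]  # (num_frames, mgc + lf0 + vuv + bap dims), silence removed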
def proc_wav(wav_path, out_dir, index, spkid, sr=16000):
    fs, signal = wav.read(wav_path)

    # Pad the signal so that it covers an integer number of analysis frames.
    frame_count = int(len(signal) / hop_size)
    pad_len = (frame_count - 1) * hop_size + frame_length
    padded = np.pad(signal, (0, pad_len - len(signal)),
                    mode="constant", constant_values=0)

    # log-energy, mfcc[1:13]
    mfcc = python_speech_features.mfcc(padded, fs, winlen=0.025, winstep=0.01,
                                       nfilt=40, numcep=13)
    out = padded[:frame_count * hop_size]
    assert len(out) % mfcc.shape[0] == 0

    # F0, voiced/unvoiced flags, and interpolated log-F0
    padded = padded.astype(np.float64)
    f0, timeaxis = pw.harvest(padded, fs, frame_period=frame_period)
    f0 = f0[:frame_count]
    vuv = np.zeros(len(f0))
    vuv[f0 > 0] = 1
    logf0 = np.zeros(len(f0))
    logf0[f0 > 0] = np.log(f0[f0 > 0])
    continuous_lf0 = interp1d(logf0, kind="slinear")

    print(continuous_lf0.shape, vuv.shape, out.shape, mfcc.shape)
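
# proc_wav() relies on module-level constants that are not part of this
# snippet. The values below are assumptions chosen to match the 25 ms window /
# 10 ms step used for the MFCCs above at 16 kHz; adjust them to the original
# configuration.
hop_size = 160       # samples per frame shift (0.010 s * 16000)
frame_length = 400   # samples per analysis window (0.025 s * 16000)
frame_period = 10    # frame shift in ms passed to pyworld.harvest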
def _process_utterance(lf0_dir, mgc_dir, bap_dir, cmp_dir, linear_dir,
                       basename, wav_path, text, hparams):
    """Preprocesses a single utterance wav/text pair.

    Extracts WORLD/SPTK features (lf0, mgc, bap), merges them into a cmp file,
    computes the linear-scale spectrogram, writes everything to disk, and
    returns a tuple to write to the train.txt file.

    Args:
        - lf0_dir: the directory to write the (log-)F0 files into
        - mgc_dir: the directory to write the mel-generalized cepstra into
        - bap_dir: the directory to write the band aperiodicities into
        - cmp_dir: the directory to write the merged cmp features into
        - linear_dir: the directory to write the linear spectrograms into
        - basename: base name used for all output files
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (cmp_filename, linear_filename, cmp_frames, text)
    """
    if hparams.trim_silence:
        tar_wavfile = wav_path[:-4] + "_trim.wav"
        print("raw wav path: %s" % wav_path)
        wav_raw, fs = sf.read(wav_path)
        wav_trim = audio.trim_silence(wav_raw, hparams)
        sf.write(tar_wavfile, wav_trim, fs)
        wav_path = tar_wavfile

    nFFTHalf, alpha, bap_dim = audio.get_config(hparams.sample_rate)
    mcsize = hparams.num_mgc - 1
    filename = basename  # os.path.basename(wav_path).split(".")[0]
    print('extract feats for %s' % wav_path)

    # extract f0, sp, ap with the WORLD "analysis" tool (raw float64 output)
    os.system("analysis %s %s/%s.f0 %s/%s.sp %s/%s.bapd" %
              (wav_path, lf0_dir, filename, mgc_dir, filename,
               bap_dir, filename))

    # interpolate f0 over unvoiced regions
    f0 = np.fromfile("%s/%s.f0" % (lf0_dir, filename), dtype=np.float64)
    continuous_f0 = interp1d(f0, kind="slinear")
    continuous_f0.tofile("%s/%s.f0c" % (lf0_dir, filename))

    # convert f0 to lf0
    os.system("x2x +da %s/%s.f0c > %s/%s.f0a" %
              (lf0_dir, filename, lf0_dir, filename))
    os.system(
        "x2x +af %s/%s.f0a | sopr -magic 0.0 -LN -MAGIC -1.0E+10 > %s/%s.lf0"
        % (lf0_dir, filename, lf0_dir, filename))

    # convert sp to mgc
    os.system("x2x +df %s/%s.sp | sopr -R -m 32768.0 | "
              "mcep -a %f -m %d -l %d -e 1.0E-8 -j 0 -f 0.0 -q 3 "
              "> %s/%s.mgc" %
              (mgc_dir, filename, alpha, mcsize, nFFTHalf, mgc_dir, filename))

    # convert ap to bap
    os.system("x2x +df %s/%s.bapd > %s/%s.bap" %
              (bap_dir, filename, bap_dir, filename))

    # merge mgc, lf0 and bap to cmp
    os.system("merge +f -s 0 -l 1 -L %d %s/%s.mgc < %s/%s.lf0 > %s/%s.ml" %
              ((mcsize + 1), mgc_dir, filename, lf0_dir, filename,
               cmp_dir, filename))
    os.system("merge +f -s 0 -l %d -L %d %s/%s.ml < %s/%s.bap > %s/%s.cmp" %
              (bap_dim, (mcsize + 2), cmp_dir, filename, bap_dir, filename,
               cmp_dir, filename))

    # Compute the linear scale spectrogram from the wav
    wav = audio.load_wav(wav_path, hparams.sample_rate)
    linear_spectrogram = audio.linearspectrogram(wav, hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    lf0 = np.fromfile("%s/%s.lf0" % (lf0_dir, filename), dtype=np.float32)
    mgc = np.fromfile("%s/%s.mgc" % (mgc_dir, filename), dtype=np.float32)
    bap = np.fromfile("%s/%s.bap" % (bap_dir, filename), dtype=np.float32)
    cmp = np.fromfile("%s/%s.cmp" % (cmp_dir, filename), dtype=np.float32)

    cmp_dim = mcsize + 1 + 1 + bap_dim
    cmp_frames = cmp.shape[0] // cmp_dim

    print(lf0.shape, continuous_f0.shape, mgc.shape, bap.shape, cmp_frames)
    print(continuous_f0.dtype, mgc.dtype, bap.dtype)

    # sanity check: all streams must have the same number of frames
    assert (mgc.shape[0] // (mcsize + 1)) == continuous_f0.shape[0] \
        == (bap.shape[0] // bap_dim) == cmp_frames
    assert cmp_dim == hparams.num_mels

    # Write the cmp features and spectrogram to disk
    cmp_mat = cmp.reshape(-1, cmp_dim)
    cmp_filename = 'cmp-{}.npy'.format(basename)
    linear_filename = 'linear-{}.npy'.format(basename)
    np.save(os.path.join(cmp_dir, cmp_filename), cmp_mat, allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example
    return (cmp_filename, linear_filename, cmp_frames, text)
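
# Hypothetical driver for _process_utterance() (not part of the original
# code). The `utterances` iterable, `out_dir`, and the metadata layout are
# assumptions, mirroring the Tacotron-style preprocessing that the returned
# tuple suggests.
def build_from_path_sketch(utterances, lf0_dir, mgc_dir, bap_dir,
                           cmp_dir, linear_dir, out_dir, hparams):
    metadata = []
    for basename, wav_path, text in utterances:
        metadata.append(_process_utterance(
            lf0_dir, mgc_dir, bap_dir, cmp_dir, linear_dir,
            basename, wav_path, text, hparams))
    # One line per utterance: cmp file | linear file | frame count | text
    with open(os.path.join(out_dir, "train.txt"), "w", encoding="utf-8") as f:
        for m in metadata:
            f.write("|".join(map(str, m)) + "\n")
    return metadata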