def mcep_dir(srcroot, tgtroot, n_mcep=40, alpha=0.42):
    src = pathlib.Path(srcroot)
    tgt = pathlib.Path(tgtroot)
    if not src.exists():
        raise ValueError('src does not exist: {}'.format(src))
    for p in sorted(src.glob('**/*.wav')):
        print(p)
        tgt_dir = tgt / p.parent.relative_to(src)
        tgt_stem = (tgt_dir / p.name).with_suffix('')
        tgt_dir.mkdir(parents=True, exist_ok=True)
        mcep_path = tgt_stem.with_suffix('.mcep.npy')
        c0_path = tgt_stem.with_suffix('.c0.npy')
        f0_path = tgt_stem.with_suffix('.f0.npy')
        ap_path = tgt_stem.with_suffix('.ap.npy')
        if mcep_path.exists() and c0_path.exists() and f0_path.exists() and ap_path.exists():
            print('skip')
            continue

        sr, wav = wavfile.read(p)
        x = (wav / 32768.0).astype(np.float64)
        f0, sp, ap = pyworld.wav2world(x, sr)
        # mel-cepstrum from the spectral envelope
        mfcc = pysptk.sp2mc(sp, n_mcep, alpha)
        f0 = f0.astype(np.float32)
        mfcc = mfcc.T.astype(np.float32)
        ap = ap.T.astype(np.float32)
        c0 = mfcc[0, :]
        mfcc = np.ascontiguousarray(mfcc[1:, :])
        ap = ap[192, :]  # keep a single aperiodicity frequency bin

        np.save(mcep_path, mfcc)
        np.save(c0_path, c0)
        np.save(f0_path, f0)
        np.save(ap_path, ap)
        print(tgt_stem, flush=True)
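For completeness, a hedged loader sketch for the files mcep_dir writes; load_mcep_features is a hypothetical helper that only mirrors the <stem>.mcep.npy / .c0.npy / .f0.npy / .ap.npy layout used in the loop above.

import pathlib
import numpy as np

def load_mcep_features(stem):
    # Hypothetical inverse of mcep_dir's np.save calls: read the four arrays
    # written for one utterance back into a dict keyed by feature name.
    stem = pathlib.Path(stem)
    return {key: np.load(stem.with_suffix('.{}.npy'.format(key)))
            for key in ('mcep', 'c0', 'f0', 'ap')}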
def world_extract(x, fs, f0min, f0max):
    # scale from [-1, 1] to [-32768, 32767]
    x = x * np.iinfo(np.int16).max
    x = np.array(x, dtype=np.float64)
    x = low_cut_filter(x, fs)

    # extract features
    f0, time_axis = pw.harvest(x, fs, f0_floor=f0min, f0_ceil=f0max,
                               frame_period=MCEP_SHIFT)
    sp = pw.cheaptrick(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    ap = pw.d4c(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    mcep = pysptk.sp2mc(sp, MCEP_DIM, MCEP_ALPHA)
    npow = spc2npow(sp)

    return {
        "sp": sp,
        "mcep": mcep,
        "ap": ap,
        "f0": f0,
        "npow": npow,
    }
def __test(order, alpha, fftlen):
    np.random.seed(98765)
    sp = np.random.rand(int(fftlen // 2 + 1))
    mc = pysptk.sp2mc(sp, order, alpha)
    approx_sp = pysptk.mc2sp(mc, alpha, fftlen)
    # TODO: tolerance should be more carefully chosen
    assert np.allclose(sp, approx_sp, atol=0.9)
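The random spectrum above only checks a loose tolerance. A minimal sketch of the same sp2mc/mc2sp round trip on a WORLD CheapTrick spectrum, assuming numpy, pysptk, and pyworld are installed and using pysptk.util.mcepalpha to pick the warping coefficient for the sampling rate:

import numpy as np
import pysptk
import pyworld

fs = 16000
x = np.random.randn(fs).astype(np.float64)      # stand-in for one second of real audio
f0, t = pyworld.dio(x, fs)
sp = pyworld.cheaptrick(x, f0, t, fs)           # (frames, fftlen // 2 + 1) power spectrum

alpha = pysptk.util.mcepalpha(fs)               # ~0.41 at 16 kHz
fftlen = pyworld.get_cheaptrick_fft_size(fs)
mc = pysptk.sp2mc(sp, order=24, alpha=alpha)    # (frames, 25) mel-cepstrum
sp_hat = pysptk.mc2sp(mc, alpha=alpha, fftlen=fftlen)
print(np.mean(np.abs(np.log(sp) - np.log(sp_hat))))   # rough reconstruction error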
def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
    x = wave.wave.astype(numpy.float64)
    fs = wave.sampling_rate

    f0, t = pyworld.harvest(
        x,
        fs,
        frame_period=frame_period,
        f0_floor=f0_floor,
        f0_ceil=f0_ceil,
    )
    f0 = pyworld.stonemask(x, f0, t, fs)
    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
    ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)
    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, fs)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    feature = feature.astype_only_float(dtype)
    feature.validate()
    return feature
def collect_features(self, path):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
def generate_changed_voice(model, input_path):
    fs, x = wavfile.read(input_path)
    x = x.astype(np.float64)
    if len(x.shape) > 1:
        x = x.mean(axis=1)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    gen_data = model.predict(mc)
    gen_data = np.hstack([c0.reshape((-1, 1)), gen_data])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        gen_data.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform
def collect_features(self, wav_path):
    # x: raw audio, (sample_length,)
    x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)

    # f0: F0, (frame_length,)
    # lf0: log(f0), interpolated over unvoiced regions, (frame_length,)
    # vuv: voiced/unvoiced flag, (frame_length,)
    f0, timeaxis = pyworld.dio(x, self.target_sr, frame_period=self.hop_sz_in_ms)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    lf0 = f0.copy()
    lf0[np.nonzero(f0)] = np.log(f0[np.nonzero(f0)])
    # compute vuv before interpolation so unvoiced frames stay marked as zero
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    # spec: spectrogram, (frame_length x dim), dim = 513
    # bap: coded aperiodicity, (frame_length,)
    # mgc: mel-cepstrum, (frame_length x dim), dim = 60
    spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

    # stack features (f0, lf0, vuv, bap, mgc, spec)
    features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None], bap, mgc, spec))
    return features.astype(np.float32)
def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
    x = wave.wave.astype(numpy.float64)
    fs = wave.sampling_rate

    f0, t = cls.extract_f0(x=x, fs=fs, frame_period=frame_period,
                           f0_floor=f0_floor, f0_ceil=f0_ceil)
    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
    ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)
    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, fs)
    voiced: numpy.ndarray = ~(f0 == 0)

    # drop the last frame when the waveform length is not an exact multiple of fft_length
    if len(x) % fft_length > 0:
        f0 = f0[:-1]
        t = t[:-1]
        sp = sp[:-1]
        ap = ap[:-1]
        mc = mc[:-1]
        coded_ap = coded_ap[:-1]
        voiced = voiced[:-1]

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    feature = feature.astype_only_float(dtype)
    return feature
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def wav2world(wavfile, frame_period):
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)
    if hp.use_harvest:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    else:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # print(mgc.shape, lf0.shape, vuv.shape, bap.shape)
    features = np.hstack((mgc, lf0, vuv, bap))
    return features.astype(np.float32)
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0, maxf0, type):
    wav, sr = load(filename, sr=None)

    # get f0
    x = wav.astype(float)
    _f0, t = world.harvest(x, sr, f0_floor=minf0, f0_ceil=maxf0,
                           frame_period=winstep * 1000)
    f0 = world.stonemask(x, _f0, t, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    # get mel
    if type == 'mcc':
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        h = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T
    else:
        h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)
    h = np.vstack((h, f0))

    maxlen = len(x) // hop_size + 2
    h = repeat_last_padding(h, maxlen)
    id = os.path.basename(filename).replace(".wav", "")
    return (id, x, h)
def collect_features(x, fs):
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
def _process_feature(out_dir, index, wav_path, label_path):
    # get list of wav files
    wav_files = os.listdir(os.path.dirname(wav_path))
    # check wav files
    assert len(wav_files) != 0 and wav_files[0][-4:] == '.wav', "no wav files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # get list of lab files
    lab_files = os.listdir(os.path.dirname(label_path))
    # check lab files
    assert len(lab_files) != 0 and lab_files[0][-4:] == '.lab', "no lab files found!"

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic features to disk
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32), allow_pickle=False)
    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'), 'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example
    return (acoustic_filename, n_frames, voiced_frames)
def get_features(wav_path):
    x, fs = librosa.load(wav_path, sr=config.fs)
    x = x.astype(np.float64)
    f0, time_axis = pyworld.dio(x, fs, frame_period=config.frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs)
    mc = pysptk.sp2mc(spectrogram, order=config.order, alpha=config.alpha)
    return mc, aperiodicity, f0
def collect_features(self, path):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(spectrogram)
    mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
    return mc.astype(np.float32)
def collect_features(self, path):
    x, fs = librosa.load(path, sr=config.fs)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=config.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(spectrogram)
    mc = pysptk.sp2mc(spectrogram, order=config.order, alpha=config.alpha)
    return mc
def get_MCEPs(self, wav):
    wav = np.float64(wav)
    _f0_h, t_h = pw.harvest(wav, self.sr)
    f0_h = pw.stonemask(wav, _f0_h, t_h, self.sr)
    sp_h = pw.cheaptrick(wav, f0_h, t_h, self.sr)
    ap_h = pw.d4c(wav, f0_h, t_h, self.sr)
    mc = pysptk.sp2mc(sp_h, order=self.order, alpha=self.alpha)
    return mc, f0_h, ap_h
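A hedged resynthesis sketch for the features get_MCEPs returns, assuming pw is pyworld, that fs and alpha match the analysis settings above, and the default 5 ms frame period of pw.harvest; mcep_to_wave is a hypothetical helper, not part of the original class.

import numpy as np
import pyworld as pw
import pysptk

def mcep_to_wave(mc, f0, ap, fs, alpha, frame_period=5.0):
    # Invert sp2mc with mc2sp, then run the WORLD synthesizer on (f0, sp, ap).
    fftlen = pw.get_cheaptrick_fft_size(fs)
    sp = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
    return pw.synthesize(f0.astype(np.float64), sp, ap.astype(np.float64), fs, frame_period)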
def spec_to_waveform(spectrogram, x, order, fs, frame_period):
    # x is the excitation (source) signal driving the MLSA filter
    alpha = pysptk.util.mcepalpha(fs)
    hop_length = int(fs * (frame_period * 0.001))
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)
    return waveform
def _get_mcep(x, fs, frame_period=5, order=24):
    alpha = pysptk.util.mcepalpha(fs)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(spectrogram)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
def get_feature(wav_path, preprocessing=False, getsize=False):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if audio_world_config.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=audio_world_config.mgc_dim, alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if audio_world_config.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=audio_world_config.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if audio_world_config.mod_spec_smoothing:
        hop_length = int(fs * (audio_world_config.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=audio_world_config.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, audio_world_config.windows)
    lf0 = P.delta_features(lf0, audio_world_config.windows)
    bap = P.delta_features(bap, audio_world_config.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    if preprocessing:
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
    elif getsize:
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    else:
        return features
def analyze(x, fs, f0_floor, f0_ceil, frame_period=20.0, pitchshift=None):
    if pitchshift is not None:
        f0, spc, ap = analyze_world(x, fs * pitchshift, f0_floor, f0_ceil,
                                    frame_period / pitchshift)
    else:
        f0, spc, ap = analyze_world(x, fs, f0_floor, f0_ceil, frame_period)
    mcep = pysptk.sp2mc(spc, 24, 0.410)
    codeap = pyworld.code_aperiodicity(ap, fs)
    # return x, fs, f0, time_axis, spc, ap, mcep, codeap
    return f0, mcep, codeap
def load_wav_extract_mcep(converted_dir, sent):
    if '.wav' in os.path.join(converted_dir, sent):
        wav, _ = librosa.load(os.path.join(converted_dir, sent), sr=22050, mono=True)
        _, _, sp, _ = world_decompose(wav=wav, fs=22050, frame_period=5.0)
        mcep = pysptk.sp2mc(sp, 36 - 1, 0.455)
        return mcep
    else:
        print('{}: not wav'.format(os.path.join(converted_dir, sent)))
def get_mc(wav):
    y, sr = sf.read(wav)
    y = y.astype(np.float64)
    f0, timeaxis = pyworld.dio(y, sr, frame_period=5)
    f0 = pyworld.stonemask(y, f0, timeaxis, sr)
    spectrogram = pyworld.cheaptrick(y, f0, timeaxis, sr)
    mc = pysptk.sp2mc(spectrogram, order=24, alpha=0.41)
    mc = mc.astype(np.float32)
    return mc
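Several of these snippets hard-code the all-pass constant (0.41, 0.42, 0.455). A small sketch of how pysptk.util.mcepalpha derives an fs-appropriate value, which is where those numbers come from (0.41 for 16 kHz, 0.455 for 22.05 kHz):

import pysptk

# Print the frequency-warping coefficient mcepalpha suggests for common sampling rates.
for fs in (8000, 16000, 22050, 44100, 48000):
    print(fs, round(pysptk.util.mcepalpha(fs), 3))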
def gaussian_voice_conversion(model, audio_path, windows=default_windows,
                              frame_period=default_frame_period, order=default_order,
                              alpha=default_alpha, hop_length=default_hop_length):
    paramgen = utilities.math.MLPG(model, windows=windows, diff=True)

    sampling_rate, audio_data = scipy.io.wavfile.read(audio_path)
    audio_data = audio_data.astype(numpy.float64)

    # WORLD analysis
    fundamental_frequency, time_axis = pyworld.dio(audio_data, sampling_rate,
                                                   frame_period=frame_period)
    fundamental_frequency = pyworld.stonemask(audio_data, fundamental_frequency,
                                              time_axis, sampling_rate)
    spectrogram = pyworld.cheaptrick(audio_data, fundamental_frequency,
                                     time_axis, sampling_rate)
    aperiodicity = pyworld.d4c(audio_data, fundamental_frequency,
                               time_axis, sampling_rate)

    # Mel-cepstral analysis and MLPG-based conversion
    mel_coefficients = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mel_coefficients = mel_coefficients[:, 0], mel_coefficients[:, 1:]
    mel_coefficients = utilities.math.apply_delta(mel_coefficients, windows)
    mel_coefficients = paramgen.transform(mel_coefficients)
    mel_coefficients = numpy.hstack((c0[:, None], mel_coefficients))

    # Differential synthesis with the MLSA filter
    mel_coefficients[:, 0] = 0
    engine = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    mlsa_coefficients = pysptk.mc2b(mel_coefficients.astype(numpy.float64), alpha=alpha)
    waveform = engine.synthesis(audio_data, mlsa_coefficients)

    # numpy.int16 is really important; otherwise scipy saves nonsensical wavefiles
    return numpy.asarray(waveform, dtype=numpy.int16)
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
def get_features(x, fs):
    # calculate f0
    _f0, t = pw.dio(x, fs)
    f0 = pw.stonemask(x, _f0, t, fs)

    # calculate mcep
    sp = trim_zeros_frames(pw.cheaptrick(x, f0, t, fs))
    mcep = pysptk.sp2mc(sp, order=24, alpha=pysptk.util.mcepalpha(fs))

    # calculate bap
    ap = pw.d4c(x, f0, t, fs)
    bap = pw.code_aperiodicity(ap, fs)

    return f0, mcep, bap
def wav2mcep(WAV_FILE, dim):
    fs, data = wavfile.read(WAV_FILE)
    # WORLD requires float input
    data = data.astype(np.float64)
    _f0, _time = pw.dio(data, fs)            # F0 estimation; pw.dio measures F0 every 0.005 s and returns a numpy array
    f0 = pw.stonemask(data, _f0, _time, fs)  # F0 refinement
    sp = pw.cheaptrick(data, f0, _time, fs)  # spectral envelope extraction
    ap = pw.d4c(data, f0, _time, fs)         # aperiodicity extraction
    mcep = pysptk.sp2mc(sp, dim, 0.42)
    return torch.Tensor(mcep)
def mgc_lf0_vuv(f0, sp, ap, fs=22050, order=13, alpha=None):
    if alpha is None:
        alpha = pysptk.util.mcepalpha(fs)
    # https://github.com/r9y9/gantts/blob/master/prepare_features_tts.py
    mgc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    # f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (ap[:, 0] < 0.5).astype(np.float32)[:, None]
    return mgc, lf0[:, None], vuv
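A hedged inverse sketch for the lf0/vuv pair returned above (both shaped (T, 1)); lf0_vuv_to_f0 is a hypothetical helper that exponentiates lf0 and zeroes frames flagged as unvoiced so that WORLD synthesis treats them as such.

import numpy as np

def lf0_vuv_to_f0(lf0, vuv, threshold=0.5):
    # Recover a synthesizable f0 track: exp() undoes the log taken on voiced
    # frames, and unvoiced frames are forced back to zero.
    f0 = np.exp(lf0[:, 0].astype(np.float64))
    f0[vuv[:, 0] < threshold] = 0.0
    return f0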
def feature_extract(wav_list, arr):
    n_sample = 0
    n_frame = 0
    max_frame = 0
    count = 1
    coeff = np.array([-0.5, 0.5, 0.0])
    for wav_name in wav_list:
        # load wavfile and apply low cut filter
        fs, x = read_wav(wav_name, cutoff=70)
        n_sample += x.shape[0]
        logging.info(wav_name + " " + str(x.shape[0]) + " " +
                     str(n_sample) + " " + str(count))

        # check sampling frequency
        if not fs == args.fs:
            logging.debug("ERROR: sampling frequency is not matched.")
            sys.exit(1)

        hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(".wav", ".h5")

        # estimate f0 and ap
        time_axis, f0, spc, ap = analyze_range(x, fs=args.fs, minf0=minf0,
                                               maxf0=maxf0, fperiod=args.shiftms,
                                               fftl=args.fftl)
        write_hdf5(hdf5name, '/ap', ap)
        write_hdf5(hdf5name, "/f0", f0)

        # convert to continuous f0 and apply low-pass filter
        uv, cont_f0 = convert_continuos_f0(np.array(f0))
        cont_f0_lpf = low_pass_filter(cont_f0, int(1.0 / (args.shiftms * 0.001)),
                                      cutoff=20)
        cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
        uv = np.expand_dims(uv, axis=-1)
        write_hdf5(hdf5name, "/lcf0", np.log(cont_f0_lpf))
        write_hdf5(hdf5name, "/uv", uv)

        # estimate coded aperiodicity
        codeap = pw.code_aperiodicity(ap, args.fs)
        if codeap.ndim == 1:
            # when fs == 16000
            codeap = np.expand_dims(codeap, axis=-1)
        write_hdf5(hdf5name, "/codeap", codeap)

        # mcep
        mcep = ps.sp2mc(spc, args.mcep_dim, mcep_alpha)
        write_hdf5(hdf5name, "/mcep", mcep)
def process(filename):
    '''
    Decompose a wav file into F0, mel-cepstral coefficients, and aperiodicity.

    :param filename: path to wav file
    :return: writes .lf0, .mgc and .bap files
    '''
    file_id = os.path.basename(filename).split(".")[0]
    print('\n' + file_id)

    ### WORLD ANALYSIS -- extract vocoder parameters ###
    # x, fs = librosa.core.load(filename, sr=16000)
    fs, x = wavfile.read(filename)

    # warning: this parameter is important
    alpha = pysptk.util.mcepalpha(fs)
    hopesize = int(0.005 * fs)

    f0 = pysptk.rapt(x.astype(np.float32), fs=fs, hopsize=hopesize,
                     min=60, max=600, voice_bias=0.0, otype=1)
    f0 = f0.astype(np.float64)
    x = x.astype(np.float64) / (2 ** 15)

    _, timeaxis = pyworld.harvest(x, fs, frame_period=5, f0_floor=60.0, f0_ceil=600)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    f0 = f0[:, None]
    lf0 = f0.copy()
    lf0 = lf0.astype(np.float32)
    nonzero_indices = np.where(f0 != 0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    zero_indices = np.where(f0 == 0)
    lf0[zero_indices] = -1.0E+10
    write_binfile(lf0, os.path.join(lf0_dir, file_id + '.lf0'), dtype=np.float32)

    mc = pysptk.sp2mc(spectrogram, mcsize, alpha=alpha)
    mc = mc.astype(np.float32)
    write_binfile(mc, os.path.join(mgc_dir, file_id + '.mgc'), dtype=np.float32)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    bap = bap.astype(np.float32)
    write_binfile(bap, os.path.join(bap_dir, file_id + '.bap'), dtype=np.float32)
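The .lf0/.mgc/.bap files above are written with write_binfile as float32. A hedged reader sketch, assuming that helper simply dumps the array in C order so each file is a flat float32 stream; read_binfile and the per-frame dimensions (e.g. mcsize + 1 columns for .mgc) are assumptions, not part of the original code.

import numpy as np

def read_binfile(path, dim):
    # Assumed inverse of write_binfile: flat float32 stream reshaped to (frames, dim).
    data = np.fromfile(path, dtype=np.float32)
    return data.reshape(-1, dim)

# e.g. mgc = read_binfile('arctic_a0001.mgc', mcsize + 1)
#      lf0 = read_binfile('arctic_a0001.lf0', 1)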
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def collect_features(self, wav_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = P.trim_zeros_frames(spectrogram)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=self.alpha)
    # Drop 0-th coefficient
    mgc = mgc[:, 1:]
    # 50Hz cut-off MS smoothing
    hop_length = int(fs * (hp.frame_period * 0.001))
    modfs = fs / hop_length
    mgc = P.modspec_smoothing(mgc, modfs, cutoff=50)
    # Add delta
    mgc = P.delta_features(mgc, hp.windows)
    return mgc.astype(np.float32)
def __call__(self, data: Wave, test=None):
    x = data.wave.astype(numpy.float64)
    fs = data.sampling_rate

    if self._f0_estimating_method == 'dio':
        _f0, t = pyworld.dio(
            x,
            fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    else:
        from world4py.np import apis
        _f0, t = apis.harvest(
            x,
            fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )

    f0 = pyworld.stonemask(x, _f0, t, fs)
    spectrogram = pyworld.cheaptrick(x, f0, t, fs)
    aperiodicity = pyworld.d4c(x, f0, t, fs)

    mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None].astype(self._dtype),
        spectrogram=spectrogram.astype(self._dtype),
        aperiodicity=aperiodicity.astype(self._dtype),
        mfcc=mfcc.astype(self._dtype),
        voiced=voiced[:, None],
    )
    feature.validate()
    return feature
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs