def test_modspec_smoothing():
    """Check ``P.modspec_smoothing`` on a small random trajectory.

    For every combination of ``log_domain``, ``norm`` and FFT size ``n``:

    * a cutoff equal to ``modfs // 2`` must leave the input unchanged
      (checked with ``np.allclose``);
    * a cutoff of ``modfs // 4`` must simply run without raising.

    Afterwards two invalid calls are made through ``@raises``-decorated
    helpers: ``n=32`` with 64 input frames (``RuntimeError`` expected) and
    a cutoff of ``modfs // 2 + 1`` (``ValueError`` expected).
    """
    num_frames, num_dims = 64, 2

    # Fixed seed so the random trajectory is reproducible.
    np.random.seed(1234)
    y = np.random.rand(num_frames, num_dims)

    modfs = 200
    nyquist = modfs // 2

    # Same iteration order as three nested loops: log_domain, norm, n.
    settings = [
        (log_domain, norm, n)
        for log_domain in (True, False)
        for norm in (None, "ortho")
        for n in (1024, 2048)
    ]
    for log_domain, norm, n in settings:
        # Nyquist freq: smoothing at the Nyquist cutoff is a no-op.
        restored = P.modspec_smoothing(
            y, modfs, n=n, norm=norm, cutoff=nyquist,
            log_domain=log_domain)
        assert np.allclose(y, restored)

        # Smooth: only checks that the call succeeds.
        P.modspec_smoothing(
            y, modfs, n=n, norm=norm, cutoff=modfs // 4,
            log_domain=log_domain)

    # Cutoff frequency larger than modfs//2
    @raises(ValueError)
    def _cutoff_above_nyquist(y, modfs):
        P.modspec_smoothing(y, modfs, n=2048, cutoff=modfs // 2 + 1)

    # FFT size should be larger than time length
    @raises(RuntimeError)
    def _fft_shorter_than_signal(y, modfs):
        P.modspec_smoothing(y, modfs, n=32, cutoff=modfs // 2)

    _fft_shorter_than_signal(y, modfs)
    _cutoff_above_nyquist(y, modfs)
def generate_changed_voice(model, input_path):
    """Convert the voice in a wav file with ``model`` and resynthesize it.

    The file is analysed with WORLD (DIO + StoneMask F0, CheapTrick
    spectral envelope, D4C aperiodicity). The mel-cepstrum without its
    0-th coefficient is smoothed, extended with delta features and passed
    to ``model.predict``. The prediction is re-joined with the original
    0-th coefficient, converted back to a spectral envelope and
    synthesized with the original F0 and aperiodicity.

    Args:
        model: Object exposing ``predict(features)``.
        input_path: Path of the wav file to convert.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    fs, samples = wavfile.read(input_path)
    x = samples.astype(np.float64)
    if x.ndim > 1:
        # More than one axis: average over axis 1 to get a 1-D signal.
        x = x.mean(axis=1)

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    envelope = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the 0-th coefficient is set aside and re-attached
    # unchanged after conversion.
    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(envelope, order=hp.order, alpha=alpha)
    c0 = mcep[:, 0]
    feats = mcep[:, 1:]

    # NOTE(review): the modulation frequency comes from the module-level
    # FS / HOP_LENGHT, not from this file's own ``fs`` — confirm they agree.
    feats = P.modspec_smoothing(feats, FS / HOP_LENGHT, cutoff=50)
    feats = P.delta_features(feats, hp.windows).astype(np.float32)

    predicted = model.predict(feats)
    converted = np.hstack([c0.reshape((-1, 1)), predicted])

    # Back to a spectral envelope, then synthesize.
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_sp = pysptk.mc2sp(
        converted.astype(np.float64), alpha=alpha, fftlen=fftlen)
    return pyworld.synthesize(
        f0, converted_sp, aperiodicity, fs, hp.frame_period)
def get_feature(wav_path, preprocessing=False, getsize=False):
    """Extract WORLD-based acoustic features from a wav file.

    The feature matrix is the horizontal stack of, in this order:
    mel-cepstrum with deltas, interpolated log-F0 with deltas, the
    voiced/unvoiced flag, and coded aperiodicity with deltas.

    Args:
        wav_path: Path of the wav file to analyse.
        preprocessing: If true, the features are written with ``np.save``
            to a path derived from ``wav_path`` and nothing is returned.
            Takes precedence over ``getsize``.
        getsize: If true (and ``preprocessing`` is false), return the
            features together with the first-axis lengths of the
            mgc / lf0 / bap streams.

    Returns:
        ``None`` when ``preprocessing`` is true;
        ``(features, mgc.shape[0], lf0.shape[0], bap.shape[0])`` when
        ``getsize`` is true; otherwise ``features`` alone.
    """
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    cfg = audio_world_config
    if cfg.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=cfg.frame_period,
            f0_floor=cfg.f0_floor, f0_ceil=cfg.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=cfg.frame_period,
            f0_floor=cfg.f0_floor, f0_ceil=cfg.f0_ceil)
        # NOTE(review): StoneMask refinement is applied on the DIO path
        # only; confirm this matches the intended branch structure.
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=cfg.mgc_dim, alpha=alpha)

    # Log-F0 as a column vector; frames with F0 == 0 stay at 0.
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])

    if cfg.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)

    # Fill the zero (unvoiced) stretches after the V/UV flag was taken.
    lf0 = P.interp1d(lf0, kind=cfg.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if cfg.mod_spec_smoothing:
        # frame_period is scaled by 0.001 (presumably ms -> s) to get the
        # hop in samples; frames per second is then fs / hop_length.
        hop_length = int(fs * (cfg.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=cfg.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, cfg.windows)
    lf0 = P.delta_features(lf0, cfg.windows)
    bap = P.delta_features(bap, cfg.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    if preprocessing:
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
        return None
    if getsize:
        # Fix: this branch used to evaluate a bare tuple that referenced
        # the undefined name ``feature`` and returned nothing.
        # NOTE(review): shape[0] is kept as written; if per-stream
        # dimensionality was intended, shape[1] would be needed — confirm.
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    return features
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on a wav file and synthesize the result.

    Args:
        model: Callable as ``model(features, R)`` and returning a pair
            whose second element is the static-feature prediction; it is
            put in eval mode first.
        path: Path of the source wav file.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``; its first
            ``static_dim`` entries are used for de-normalization.
        data_std: Standard deviation counterpart of ``data_mean``.
        diffvc: If true, the predicted-minus-input mel-cepstrum is used
            as a filter applied to the input waveform (MLSA filter);
            otherwise the waveform is resynthesized with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesized waveform,
        the static input mel-cepstrum (without the 0-th coefficient), and
        the de-normalized static prediction.
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    # Fix: hop_length was read as a free variable here. It is derived
    # from this file's sampling rate and the analysis frame period, so
    # the modulation frequency and the synthesizer hop always match.
    hop_length = int(fs * (hp.frame_period * 0.001))

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # The 0-th coefficient is kept aside and not fed to the model.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]

    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential spectrum: prediction minus the (smoothed) input.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(
            MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def collect_features(self, wav_path, label_path):
    """Build the acoustic feature matrix for one utterance.

    Columns are, in order: mel-cepstrum with deltas, interpolated log-F0
    with deltas, voiced/unvoiced flag, coded aperiodicity with deltas.
    The matrix is cut to the number of frames in the HTS label file and
    the label's silence frames are removed.

    Args:
        wav_path: Path of the wav file.
        label_path: Path of the HTS label file loaded with ``hts.load``.

    Returns:
        The feature matrix as ``np.float32``.

    Side effects:
        Sets ``self.alpha`` from the sampling rate if it is still ``None``.
    """
    fs, samples = wavfile.read(wav_path)
    x = samples.astype(np.float64)

    # Both F0 estimators take the same keyword arguments here.
    f0_options = dict(
        frame_period=hp_acoustic.frame_period,
        f0_floor=hp_acoustic.f0_floor,
        f0_ceil=hp_acoustic.f0_ceil,
    )
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(x, fs, **f0_options)
    else:
        f0, timeaxis = pyworld.dio(x, fs, **f0_options)
        # NOTE(review): StoneMask refinement is applied on the DIO path
        # only; confirm this matches the intended branch structure.
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    envelope = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(envelope, order=hp_acoustic.order, alpha=self.alpha)

    # Log-F0 column; frames where F0 is zero are left at zero.
    f0 = f0[:, None]
    lf0 = f0.copy()
    has_pitch = f0 != 0
    lf0[has_pitch] = np.log(f0[has_pitch])

    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # Interpolate only after the V/UV flag has been taken.
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        mgc = P.modspec_smoothing(
            mgc, fs / hop_length,
            cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    windows = hp_acoustic.windows
    mgc = P.delta_features(mgc, windows)
    lf0 = P.delta_features(lf0, windows)
    bap = P.delta_features(bap, windows)
    stacked = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    stacked = stacked[:labels.num_frames()]
    silence = labels.silence_frame_indices()
    return np.delete(stacked, silence, axis=0).astype(np.float32)
def collect_features(self, wav_path):
    """Extract smoothed mel-cepstrum features with deltas from a wav file.

    The spectral envelope is estimated with WORLD (DIO + StoneMask F0,
    CheapTrick), passed through ``P.trim_zeros_frames``, converted to
    mel-cepstrum, stripped of its 0-th coefficient, smoothed with a 50 Hz
    modulation-spectrum cutoff and extended with delta features.

    Args:
        wav_path: Path of the wav file.

    Returns:
        The feature matrix as ``np.float32``.

    Side effects:
        Sets ``self.alpha`` from the sampling rate if it is still ``None``.
    """
    fs, samples = wavfile.read(wav_path)
    x = samples.astype(np.float64)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    envelope = P.trim_zeros_frames(pyworld.cheaptrick(x, f0, timeaxis, fs))

    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)

    # Drop 0-th coefficient
    mcep = pysptk.sp2mc(envelope, order=hp.order, alpha=self.alpha)[:, 1:]

    # 50Hz cut-off MS smoothing; frames per second is fs / hop_length.
    hop_length = int(fs * (hp.frame_period * 0.001))
    smoothed = P.modspec_smoothing(mcep, fs / hop_length, cutoff=50)

    # Add delta
    with_deltas = P.delta_features(smoothed, hp.windows)
    return with_deltas.astype(np.float32)
def collect_features(self, wav_path, label_path):
    """Build the acoustic feature matrix for one utterance (DIO F0).

    Columns are, in order: mel-cepstrum with deltas (smoothed with a
    50 Hz modulation-spectrum cutoff), interpolated log-F0 with deltas,
    voiced/unvoiced flag, coded aperiodicity with deltas. The matrix is
    cut to the number of frames in the HTS label file and the label's
    silence frames are removed.

    Args:
        wav_path: Path of the wav file.
        label_path: Path of the HTS label file loaded with ``hts.load``.

    Returns:
        The feature matrix as ``np.float32``.

    Side effects:
        Sets ``self.alpha`` from the sampling rate if it is still ``None``.
    """
    fs, samples = wavfile.read(wav_path)
    x = samples.astype(np.float64)
    frame_period = hp_acoustic.frame_period

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    envelope = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(envelope, order=hp_acoustic.order, alpha=self.alpha)

    # Log-F0 column; frames where F0 is zero are left at zero so the
    # V/UV flag can be read off before interpolation.
    f0 = f0[:, None]
    lf0 = f0.copy()
    has_pitch = f0 != 0
    lf0[has_pitch] = np.log(f0[has_pitch])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # 50hz parameter trajectory smoothing
    hop_length = int(fs * (frame_period * 0.001))
    mgc = P.modspec_smoothing(mgc, fs / hop_length, cutoff=50)

    windows = hp_acoustic.windows
    mgc = P.delta_features(mgc, windows)
    lf0 = P.delta_features(lf0, windows)
    bap = P.delta_features(bap, windows)
    stacked = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    stacked = stacked[:labels.num_frames()]
    silence = labels.silence_frame_indices()
    return np.delete(stacked, silence, axis=0).astype(np.float32)
def __test_invalid_time_length(y, modfs):
    """Call ``P.modspec_smoothing`` with ``n=32`` and a ``modfs // 2`` cutoff.

    Intended as an invalid-input probe: elsewhere in this file the same
    call is expected to raise ``RuntimeError`` because the FFT size is
    not larger than the time length of ``y``.
    """
    fft_size = 32
    nyquist = modfs // 2
    P.modspec_smoothing(y, modfs, n=fft_size, cutoff=nyquist)
def __test_invalid_param(y, modfs):
    """Call ``P.modspec_smoothing`` with a cutoff of ``modfs // 2 + 1``.

    Intended as an invalid-input probe: elsewhere in this file the same
    call is expected to raise ``ValueError`` because the cutoff is larger
    than ``modfs // 2``.
    """
    above_nyquist = modfs // 2 + 1
    P.modspec_smoothing(y, modfs, n=2048, cutoff=above_nyquist)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on a waveform and synthesize the result.

    Args:
        model: Conversion model; put in eval mode first. If
            ``model.include_parameter_generation()`` is true it is called
            as ``model(features, R, lengths=...)`` and returns a pair whose
            second element is the static prediction; otherwise it is
            called as ``model(features, lengths=...)`` and the static
            features are obtained with ``multi_stream_mlpg``.
        x: Source waveform (converted to ``np.float64`` here).
        fs: Sampling rate of ``x``.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation counterpart of ``data_mean``.
        diffvc: If true, the predicted-minus-input mel-cepstrum is used
            as a filter applied to ``x`` (MLSA filter); otherwise the
            waveform is resynthesized with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesized waveform,
        the static input mel-cepstrum (without the 0-th coefficient), and
        the de-normalized static prediction.
    """
    model.eval()

    frame_period = hp.frame_period
    hop_length = int(fs * (frame_period * 0.001))

    # WORLD analysis of the source waveform.
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    envelope = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the 0-th coefficient is kept aside.
    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(envelope, order=hp.order, alpha=alpha)
    c0 = mcep[:, 0]
    feats = mcep[:, 1:]
    n_static = feats.shape[-1]

    feats = P.modspec_smoothing(feats, fs / hop_length, cutoff=50)
    feats = P.delta_features(feats, hp.windows).astype(np.float32)
    num_frames = feats.shape[0]

    inputs = feats[:, :n_static].copy()

    # Normalization
    scaled = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(scaled)]

    # Add batch axis
    batch = scaled.view(1, -1, scaled.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, num_frames))

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        _, static_out = model(batch, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        raw_out = model(batch, lengths=lengths)
        static_out = multi_stream_mlpg(
            raw_out, R, hp.stream_sizes, hp.has_dynamic_features)

    predicted = static_out.data.cpu().numpy().reshape(-1, n_static)

    # Denormalize
    predicted = P.inv_scale(
        predicted, data_mean[:n_static], data_std[:n_static])
    outputs = predicted.copy()

    if diffvc:
        # Differential filter: prediction minus the (smoothed) input,
        # applied to the original waveform.
        diff_mc = np.hstack((c0[:, None], predicted - feats[:, :n_static]))
        diff_mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(
            MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(diff_mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
        return waveform, inputs, outputs

    # Vocoder path: rebuild the spectral envelope and resynthesize.
    full_mc = np.hstack((c0[:, None], predicted))
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_sp = pysptk.mc2sp(
        full_mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, converted_sp, aperiodicity, fs, frame_period)
    return waveform, inputs, outputs
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert a waveform with ``model`` and synthesize the converted voice.

    Args:
        model: Conversion model; switched to eval mode first. When
            ``model.include_parameter_generation()`` is true it is called
            with the MLPG matrix and returns ``(y_hat, y_hat_static)``;
            otherwise its output is passed through ``multi_stream_mlpg``.
        x: Source waveform (cast to ``np.float64`` here).
        fs: Sampling rate of ``x``.
        data_mean: Mean used for scaling; the first ``static_dim`` entries
            are used to de-normalize the prediction.
        data_std: Standard deviation counterpart of ``data_mean``.
        diffvc: Selects MLSA differential filtering of ``x`` (true) or
            WORLD resynthesis (false).

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` is the
        static input mel-cepstrum without the 0-th coefficient and
        ``outputs`` is the de-normalized static prediction.
    """
    model.eval()

    period = hp.frame_period
    hop_length = int(fs * (period * 0.001))

    # Analysis: F0, spectral envelope and aperiodicity from WORLD.
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    sp = pyworld.cheaptrick(x, f0, timeaxis, fs)
    ap = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    all_mc = pysptk.sp2mc(sp, order=hp.order, alpha=alpha)
    # Split off the 0-th coefficient; it bypasses the model.
    c0, src = all_mc[:, 0], all_mc[:, 1:]
    dim = src.shape[-1]

    src = P.modspec_smoothing(src, fs / hop_length, cutoff=50)
    src = P.delta_features(src, hp.windows).astype(np.float32)
    n_frames = src.shape[0]

    inputs = src[:, :dim].copy()

    # Normalization
    net_in = P.scale(src, data_mean, data_std)
    net_in = Variable(torch.from_numpy(net_in))
    lengths = [len(net_in)]

    # Add batch axis
    net_in = net_in.view(1, -1, net_in.size(-1))

    # For MLPG
    mlpg_matrix = unit_variance_mlpg_matrix(hp.windows, n_frames)
    mlpg_matrix = torch.from_numpy(mlpg_matrix)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(net_in, mlpg_matrix, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(net_in, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, mlpg_matrix, hp.stream_sizes, hp.has_dynamic_features)

    converted = y_hat_static.data.cpu().numpy().reshape(-1, dim)

    # Denormalize
    converted = P.inv_scale(converted, data_mean[:dim], data_std[:dim])
    outputs = converted.copy()

    if not diffvc:
        # Vocoder path: re-attach c0 and resynthesize with WORLD.
        out_mc = np.hstack((c0[:, None], converted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        out_sp = pysptk.mc2sp(
            out_mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(f0, out_sp, ap, fs, period)
        return waveform, inputs, outputs

    # Differential path: filter the source with (prediction - input).
    out_mc = np.hstack((c0[:, None], converted - src[:, :dim]))
    out_mc[:, 0] = 0  # remove power coefficients
    engine = Synthesizer(
        MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(out_mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)
    return waveform, inputs, outputs