def test_meanvar():
    """Check P.meanvar / P.meanstd / P.scale / P.inv_scale on acoustic features.

    Statistics are computed once on the variable-length dataset and once on
    the padded dataset (with explicit lengths); both must agree.
    """
    # Acoustic features from the example data source
    _, source = example_file_data_sources_for_acoustic_model()
    dataset = FileSourceDataset(source)
    lengths = list(map(len, dataset))
    feat_dim = dataset[0].shape[-1]

    mean, var = P.meanvar(dataset)
    std = np.sqrt(var)
    for stat in (mean, var):
        assert np.isfinite(stat).all()
    for stat in (mean, var):
        assert stat.shape[-1] == feat_dim

    # meanstd must report the square root of the variance from meanvar
    _, std_hat = P.meanstd(dataset)
    assert np.allclose(std, std_hat)

    first = dataset[0]
    assert np.isfinite(P.scale(first, mean, std)).all()

    # Padded dataset: passing the true lengths should give the same statistics
    _, source = example_file_data_sources_for_acoustic_model()
    padded = PaddedFileSourceDataset(source, 1000)
    mean_hat, var_hat = P.meanvar(padded, lengths)
    assert np.allclose(mean, mean_hat)
    assert np.allclose(var, var_hat)

    # scale followed by inv_scale must round-trip
    sample = padded[0]
    scaled = P.scale(sample, mean, std)
    restored = P.inv_scale(scaled, mean, std)
    assert np.allclose(sample, restored, atol=1e-5)
def __getitem__(self, idx):
    """Return the normalized ``(x, y)`` pair stored at position ``idx``.

    ``self.X[idx]`` goes through ``P.minmax_scale`` with feature range
    (0.01, 0.99); ``self.Y[idx]`` goes through ``P.scale`` with the stored
    mean and standard deviation.
    """
    inputs = P.minmax_scale(
        self.X[idx],
        min_=self.X_data_min,
        scale_=self.X_data_scale,
        feature_range=(0.01, 0.99),
    )
    targets = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)
    return inputs, targets
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on the wav file at ``path`` and synthesize audio.

    Args:
        model: Network called as ``model(features, R)``; must return a pair
            whose second element holds the predicted static features.
        path: Path of the wav file to convert (read with ``wavfile.read``).
        data_mean: Mean used for ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation used for ``P.scale`` / ``P.inv_scale``.
        diffvc: If true, the input static features are subtracted from the
            prediction and the original waveform is filtered with an MLSA
            filter; otherwise the waveform is synthesized with
            ``pyworld.synthesize``.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesized waveform, a
        copy of the static input features, and a copy of the denormalized
        predicted static features.
    """
    model.eval()

    fs, x = wavfile.read(path)
    # Derive the hop size from the sampling rate of this very file instead of
    # relying on a module-level ``hop_length`` that may not match it.
    hop_length = int(fs * (hp.frame_period * 0.001))

    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Column 0 is set aside and re-attached just before synthesis
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize (only the static part of the statistics applies)
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def __getitem__(self, idx):
    """Return padded, normalized input/target arrays plus their raw lengths.

    Inputs are min-max scaled with the stored ``min``/``max`` statistics into
    (0.01, 0.99); targets are scaled with the stored mean and the square root
    of the stored variance. Both are then passed through ``self._padding``.

    Returns:
        ``(padded_x, padded_t, len(self.xs[idx]), len(self.ts[idx]))``
    """
    raw_x = self.xs[idx]
    raw_t = self.ts[idx]

    scaled_x = minmax_scale(
        raw_x,
        self.x_stat['min'],
        self.x_stat['max'],
        feature_range=(0.01, 0.99),
    )
    t_std = np.sqrt(self.t_stat['var'])
    scaled_t = scale(raw_t, self.t_stat['mean'], t_std)

    padded_x = self._padding(scaled_x)
    padded_t = self._padding(scaled_t)
    # Lengths are taken from the stored (unpadded) sequences
    return padded_x, padded_t, len(self.xs[idx]), len(self.ts[idx])
def __getitem__(self, index):
    """Load the ``index``-th utterance from disk and return normalized arrays.

    The file stem comes from ``self.metadata``; ``<stem>.npy`` is read from
    both ``self.X_path`` and ``self.Y_path`` and reshaped to
    ``(-1, self.x_dim)`` and ``(-1, self.y_dim)``. The statistics used for
    normalization are selected with ``self.train``.
    """
    stem = self.metadata[index]
    x_file = os.path.join(self.X_path, '{}.npy'.format(stem))
    y_file = os.path.join(self.Y_path, '{}.npy'.format(stem))

    x = np.load(x_file).reshape(-1, self.x_dim)
    y = np.load(y_file).reshape(-1, self.y_dim)

    split = self.train
    norm_x = minmax_scale(
        x, self.X_min[split], self.X_max[split], feature_range=(0.01, 0.99))
    norm_y = scale(y, self.Y_mean[split], self.Y_scale[split])
    return norm_x, norm_y
def __getitem__(self, idx):
    """Return the normalized ``(x, y)`` pair stored at position ``idx``.

    ``x`` is min-max scaled into (0.01, 0.99) and ``y`` is scaled with the
    stored mean / standard deviation. When ``hp.recompute_delta_features`` is
    set, ``y`` is additionally passed through ``recompute_delta_features``.
    """
    inputs = P.minmax_scale(
        self.X[idx],
        min_=self.X_data_min,
        scale_=self.X_data_scale,
        feature_range=(0.01, 0.99),
    )
    targets = P.scale(self.Y[idx], self.Y_data_mean, self.Y_data_std)

    if not hp.recompute_delta_features:
        return inputs, targets

    # Normalization leaves the static/delta relationship inconsistent;
    # recomputing the deltas is required for the MSE + MGE loss to work.
    targets = recompute_delta_features(
        targets,
        self.Y_data_mean,
        self.Y_data_std,
        hp.windows,
        hp.stream_sizes,
        hp.has_dynamic_features,
    )
    return inputs, targets
def __getitem__(self, index):
    """Return ``(index, mu-law samples, conditioning features)`` for one segment.

    The segment is ``self.X_raw[sample_start[index]:sample_end[index]]``. When
    ``self.train_flag`` is ``True`` it is right-aligned inside a zero buffer
    of 5000 samples; otherwise it is used as is.

    The conditioning matrix stacks log-F0 (0 where F0 is 0) with either the
    CheapTrick spectrum divided by ``self.pyspec_max`` (``cond_sel ==
    'pyspec'``) or 25 MFCCs scaled with ``self.mfcc_mean`` / ``self.mfcc_std``
    (``cond_sel == 'mfcc'``). It is then resampled from 200 to 16000 and
    fixed to the number of audio samples.

    Returns:
        Tuple ``(index, torch.LongTensor of mu-law codes, float32 array)``.

    Raises:
        ValueError: If ``self.cond_sel`` is neither ``'pyspec'`` nor ``'mfcc'``.
    """
    start = self.sample_start[index]
    end = self.sample_end[index]

    if self.train_flag is True:
        # x: Zero-padded raw_audio (zeros first, samples at the end)
        x = np.zeros(5000, dtype=np.float64)
        zpad_end = 5000 - (end - start)
        x[zpad_end:] = self.X_raw[start:end].astype(np.float64)
    else:
        x = self.X_raw[start:end].astype(np.float64)

    # x_mulaw: 8bit Mulaw encoded
    x_mulaw = mu_law_encode(x).astype(np.uint8)

    # cond: features to be used as a conditional input. mfcc or pyspec
    f0, timeaxis = pyworld.dio(x, 16000, frame_period=5)  # (T,)
    f0 = pyworld.stonemask(x, f0, timeaxis, 16000)
    lf0 = f0.copy()
    nonzero = np.nonzero(f0)
    lf0[nonzero] = np.log(f0[nonzero])  # (T,); entries with f0 == 0 stay 0

    # Compare by value: identity comparison against a string literal depends
    # on interning and is not a reliable equality test.
    if self.cond_sel == 'pyspec':
        cond = pyworld.cheaptrick(x, f0, timeaxis, 16000)  # (T,d=513)
        cond = cond / self.pyspec_max
    elif self.cond_sel == 'mfcc':
        melspec = librosa.feature.melspectrogram(
            y=x, sr=16000, power=2.0, n_fft=400, hop_length=80, n_mels=128)
        cond = librosa.feature.mfcc(
            S=librosa.power_to_db(melspec), sr=16000, n_mfcc=25)  # (d=25, T)
        cond = np.transpose(cond)  # (T,d)
        cond = scale(cond, self.mfcc_mean, self.mfcc_std)
    else:
        raise ValueError(
            "unsupported cond_sel: {!r} (expected 'pyspec' or 'mfcc')".format(
                self.cond_sel))

    # Stack cond
    cond = np.hstack((lf0[:, None], cond))

    # cond: transpose to (d,T) and Resample
    cond = librosa.core.resample(np.transpose(cond), 200, 16000,
                                 res_type='kaiser_fast', fix=True, scale=False)
    # Resize and transpose
    cond = librosa.util.fix_length(cond, len(x_mulaw), mode='edge')  # (d,T)

    return index, torch.LongTensor(x_mulaw), cond.astype(np.float32)
def __call__(self, path):
    """Load a feature array from ``path`` and prepare it per ``self.domain``.

    NOTE(review): this method was recovered from a copy whose indentation had
    been lost. The nesting below is the reading in which every statement is
    type-valid (the final ``torch.from_numpy`` receives a NumPy array), but it
    is an assumption and must be confirmed against the original file. In this
    reading the ``'wld'`` branch builds ``feat`` and then falls off the end of
    the method (implicit ``None``).
    """
    feat = np.load(path).astype(np.float32)
    if self.domain == 'wld':
        #feat= P.scale(feat, self.mean, self.std)
        # Per-file statistics are used here instead of self.mean / self.std
        mean = feat.mean(axis=0)
        std = feat.std(axis=0)
        feat = P.scale(feat, mean, std)
        # Stack three copies of the mean and of the std and append them below
        # the scaled features (assumes feat is 2-D — TODO confirm)
        mean = np.vstack((mean, np.vstack((mean, mean))))
        std = np.vstack((std, np.vstack((std, std))))
        feat = np.vstack((np.vstack((feat, mean)), std))
    elif self.domain == 'mel':
        # NOTE(review): divides by std squared, not std — confirm this is intended
        feat = (feat - self.mean) / (self.std * self.std)
        feat = torch.from_numpy(feat)
        if feat.shape[0] % 3 != 0:
            #print(feat.shape,5-feat.shape[0]%5)
            # Append rows so the first dimension becomes a multiple of 3
            feat = F.pad(feat, (0, 0, 0, 3 - feat.shape[0] % 3))
        # Group every 3 consecutive rows into one row of width self.dim * 3
        feat = feat.contiguous().view(-1, self.dim * 3)
        feat = F.pad(feat, (0, 0, 1, 0), value=1) #pad sos
        feat = F.pad(feat, (0, 0, 0, 1)) #pad eos
        return feat.type(torch.FloatTensor)
    else:
        # Any other domain: return the loaded array as a tensor, unchanged
        return torch.from_numpy(feat)
def __getitem__(self, idx):
    """Return ``(x, y)`` at ``idx``, both scaled with the same statistics.

    ``self.data_mean`` and ``self.data_std`` are applied to the entry from
    ``self.X`` as well as to the entry from ``self.Y``.
    """
    mean, std = self.data_mean, self.data_std
    scaled_x = P.scale(self.X[idx], mean, std)
    scaled_y = P.scale(self.Y[idx], mean, std)
    return scaled_x, scaled_y
def normalize(feature, mean, std):
    """Scale ``feature`` with ``mean`` and ``std`` by delegating to ``P.scale``."""
    normalized = P.scale(feature, mean, std)
    return normalized
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert the waveform ``x`` with ``model`` and synthesize the result.

    Args:
        model: Conversion network. If ``model.include_parameter_generation()``
            is true it is called with the MLPG matrix and returns the static
            features itself; otherwise ``multi_stream_mlpg`` is applied to
            its output.
        x: Input waveform samples.
        fs: Sampling rate of ``x``.
        data_mean: Mean used for ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation used for ``P.scale`` / ``P.inv_scale``.
        diffvc: If true, the input static features are subtracted from the
            prediction and the input waveform is filtered with an MLSA
            filter; otherwise ``pyworld.synthesize`` is used.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: synthesized waveform, a copy of
        the static input features, and a copy of the denormalized prediction.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))

    # WORLD analysis of the input waveform
    wav = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(wav, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(wav, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)

    mcep = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    power = mcep[:, 0]
    static = mcep[:, 1:]
    static_dim = static.shape[-1]

    smoothed = P.modspec_smoothing(static, fs / hop_length, cutoff=50)
    feats = P.delta_features(smoothed, hp.windows).astype(np.float32)
    num_frames = feats.shape[0]

    inputs = feats[:, :static_dim].copy()

    # Normalize and add a batch axis
    net_in = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(net_in)]
    net_in = net_in.view(1, -1, net_in.size(-1))

    # MLPG matrix
    mlpg = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, num_frames))

    if model.include_parameter_generation():
        # The model performs parameter generation itself
        # (multi-stream features cannot be used in this case)
        _, static_hat = model(net_in, mlpg, lengths=lengths)
    else:
        # Generic model (possibly a sequence model): apply MLPG afterwards
        assert hp.has_dynamic_features is not None
        full_hat = model(net_in, lengths=lengths)
        static_hat = multi_stream_mlpg(
            full_hat, mlpg, hp.stream_sizes, hp.has_dynamic_features)

    pred = static_hat.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize with the static part of the statistics
    pred = P.inv_scale(pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = pred.copy()

    if not diffvc:
        mcep_out = np.hstack((power[:, None], pred))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mcep_out.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)
        return waveform, inputs, outputs

    diff = pred - feats[:, :static_dim]
    mcep_out = np.hstack((power[:, None], diff))
    mcep_out[:, 0] = 0  # remove power coefficients
    engine = Synthesizer(
        MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
    coef = pysptk.mc2b(mcep_out.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(wav, coef)
    return waveform, inputs, outputs
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on waveform ``x`` and return the synthesized audio.

    Args:
        model: Conversion network; queried through
            ``include_parameter_generation()`` to decide whether MLPG is done
            inside the model or afterwards via ``multi_stream_mlpg``.
        x: Input waveform samples.
        fs: Sampling rate of ``x``.
        data_mean: Mean passed to ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation passed to ``P.scale`` / ``P.inv_scale``.
        diffvc: Selects MLSA filtering of the input waveform with the
            feature difference (true) or ``pyworld.synthesize`` (false).

    Returns:
        ``(waveform, inputs, outputs)`` where ``inputs`` is a copy of the
        static input features and ``outputs`` a copy of the denormalized
        predicted static features.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))

    # Feature extraction
    signal = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(signal, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(signal, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(signal, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    cepstrum = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = cepstrum[:, 0]
    features = cepstrum[:, 1:]
    static_dim = features.shape[-1]
    features = P.modspec_smoothing(features, fs / hop_length, cutoff=50)
    features = P.delta_features(features, hp.windows).astype(np.float32)

    frame_count = features.shape[0]
    inputs = features[:, :static_dim].copy()

    # Normalization, then reshape to (1, frames, dim) for the model
    scaled = Variable(torch.from_numpy(P.scale(features, data_mean, data_std)))
    lengths = [len(scaled)]
    batch = scaled.view(1, -1, scaled.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, frame_count))

    # Apply model
    if model.include_parameter_generation():
        # Parameter generation happens inside the model;
        # multi-stream features cannot be used in this case
        _, predicted_static = model(batch, R, lengths=lengths)
    else:
        # Generic model (can be a sequence model)
        assert hp.has_dynamic_features is not None
        predicted = model(batch, lengths=lengths)
        predicted_static = multi_stream_mlpg(
            predicted, R, hp.stream_sizes, hp.has_dynamic_features)

    converted = predicted_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    converted = P.inv_scale(
        converted, data_mean[:static_dim], data_std[:static_dim])
    outputs = converted.copy()

    if diffvc:
        # Filter the input waveform with the difference of the features
        delta = converted - features[:, :static_dim]
        mc = np.hstack((c0[:, None], delta))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(
            MLSADF(order=hp.order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(signal, b)
    else:
        # Vocoder synthesis from the converted spectrum
        mc = np.hstack((c0[:, None], converted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs