def transform(self, XY):
    """Align every (x, y) utterance pair with FastDTW.

    Args:
        XY (tuple): Pair ``(X, Y)`` of 3d arrays. Iterating the first axis
            yields one zero-padded 2d sequence per utterance; the second
            axis is the frame axis.

    Returns:
        tuple: ``(X_aligned, Y_aligned)``. Both start out shaped like the
        input with the longer frame axis and are zero-padded further along
        that axis whenever a warped sequence does not fit.
    """
    X, Y = XY
    assert X.ndim == 3 and Y.ndim == 3

    longer_features = X if X.shape[1] > Y.shape[1] else Y
    X_aligned = np.zeros_like(longer_features)
    Y_aligned = np.zeros_like(longer_features)

    for idx, (x, y) in enumerate(zip(X, Y)):
        x, y = trim_zeros_frames(x), trim_zeros_frames(y)
        dist, path = fastdtw(x, y, radius=self.radius, dist=self.dist)
        # Normalise by the combined length; only used for the verbose report.
        dist /= (len(x) + len(y))
        pathx = [step[0] for step in path]
        pathy = [step[1] for step in path]
        # Repeat frames along the warping path so both sides line up.
        x, y = x[pathx], y[pathy]

        max_len = max(len(x), len(y))
        if max_len > X_aligned.shape[1] or max_len > Y_aligned.shape[1]:
            # Grow both outputs by the larger shortfall. The original compared
            # (``>``) instead of subtracting for the Y term, which yielded a
            # boolean rather than a frame count.
            pad_size = max(max_len - X_aligned.shape[1],
                           max_len - Y_aligned.shape[1])
            X_aligned = np.pad(X_aligned, [(0, 0), (0, pad_size), (0, 0)],
                               mode="constant", constant_values=0)
            Y_aligned = np.pad(Y_aligned, [(0, 0), (0, pad_size), (0, 0)],
                               mode="constant", constant_values=0)

        X_aligned[idx][:len(x)] = x
        Y_aligned[idx][:len(y)] = y
        if self.verbose > 0:
            print("{}, distance: {}".format(idx, dist))

    return X_aligned, Y_aligned
def transform(self, XY):
    """Align (x, y) pairs with DTW, refining the source over several passes.

    Each of the ``self.n_iter`` passes aligns the current source estimate
    ``Xc`` against ``Y`` with FastDTW, fits a joint GMM on the aligned
    frames, and replaces ``Xc`` with the output of an ``MLPG`` built on that
    GMM. After the last pass the stored source-side warping paths are
    applied to the *original* ``X``.

    Args:
        XY (tuple): Pair ``(X, Y)`` of 3d arrays. Iterating the first axis
            yields one zero-padded 2d sequence per utterance; the second
            axis is the frame axis.

    Returns:
        tuple: ``(X_aligned, Y_aligned)``, zero-padded along the frame axis.
    """
    X, Y = XY
    assert X.ndim == 3 and Y.ndim == 3

    longer_features = X if X.shape[1] > Y.shape[1] else Y

    Xc = X.copy()  # this will be updated iteratively
    X_aligned = np.zeros_like(longer_features)
    Y_aligned = np.zeros_like(longer_features)
    # ``np.object`` was a deprecated alias that newer NumPy releases removed;
    # the builtin ``object`` is the supported spelling.
    refined_paths = np.empty(len(X), dtype=object)

    # NOTE(review): X_aligned / Y_aligned are not cleared between passes, so
    # frames written by an earlier, longer path can remain past the new
    # length -- confirm whether that is intended.
    for _ in range(self.n_iter):  # pass counter is unused; avoid shadowing idx
        for idx, (x, y) in enumerate(zip(Xc, Y)):
            x, y = trim_zeros_frames(x), trim_zeros_frames(y)
            dist, path = fastdtw(x, y, radius=self.radius, dist=self.dist)
            # Normalise by the combined length; only used for verbose output.
            dist /= (len(x) + len(y))
            pathx = [step[0] for step in path]
            pathy = [step[1] for step in path]

            refined_paths[idx] = pathx
            x, y = x[pathx], y[pathy]
            max_len = max(len(x), len(y))
            if max_len > X_aligned.shape[1] or max_len > Y_aligned.shape[1]:
                # Grow both outputs by the larger shortfall. The original
                # compared (``>``) instead of subtracting for the Y term.
                pad_size = max(max_len - X_aligned.shape[1],
                               max_len - Y_aligned.shape[1])
                X_aligned = np.pad(X_aligned,
                                   [(0, 0), (0, pad_size), (0, 0)],
                                   mode="constant", constant_values=0)
                Y_aligned = np.pad(Y_aligned,
                                   [(0, 0), (0, pad_size), (0, 0)],
                                   mode="constant", constant_values=0)
            X_aligned[idx][:len(x)] = x
            Y_aligned[idx][:len(y)] = y
            if self.verbose > 0:
                print("{}, distance: {}".format(idx, dist))

        # Fit a joint GMM on the concatenated, currently aligned features.
        gmm = GaussianMixture(n_components=self.n_components_gmm,
                              covariance_type="full",
                              max_iter=self.max_iter_gmm)
        joint = np.concatenate((X_aligned, Y_aligned),
                               axis=-1).reshape(-1, X.shape[-1] * 2)
        gmm.fit(joint)
        windows = [(0, 0, np.array([1.0]))]  # no delta
        paramgen = MLPG(gmm, windows=windows)
        # Replace the working source copy with its converted version.
        for idx in range(len(Xc)):
            x = trim_zeros_frames(Xc[idx])
            Xc[idx][:len(x)] = paramgen.transform(x)

    # Finally we can get aligned X: warp the original (unconverted) source
    # with the paths found in the last pass.
    for idx in range(len(X_aligned)):
        x = X[idx][refined_paths[idx]]
        X_aligned[idx][:len(x)] = x

    return X_aligned, Y_aligned
def plot_parallel(x, y):
    """Show ``x`` and ``y`` as two stacked spectrogram panels on a new figure.

    Each input is passed through ``trim_zeros_frames`` and transposed before
    being handed to ``librosa.display.specshow``. ``fs`` and ``hop_length``
    are read from the enclosing scope.

    Args:
        x: 2d feature matrix drawn in the upper panel.
        y: 2d feature matrix drawn in the lower panel.
    """
    figure(figsize=(16, 7))
    for row, features in enumerate((x, y), start=1):
        subplot(2, 1, row)
        librosa.display.specshow(
            trim_zeros_frames(features).T,
            sr=fs,
            hop_length=hop_length,
            x_axis="time",
        )
        colorbar()
def collect_features(self, path):
    """Compute mel-cepstrum features for the audio file at ``path``.

    The audio is loaded with ``librosa.load`` at ``config.fs``, analysed with
    pyworld (``dio`` -> ``stonemask`` -> ``cheaptrick``), passed through
    ``trim_zeros_frames`` and converted with ``pysptk.sp2mc`` using
    ``config.order`` and ``config.alpha``.

    Args:
        path: Location of the audio file.

    Returns:
        The array returned by ``pysptk.sp2mc``.
    """
    waveform, fs = librosa.load(path, sr=config.fs)
    waveform = waveform.astype(np.float64)  # pyworld is fed float64 samples

    coarse_f0, timeaxis = pyworld.dio(
        waveform, fs, frame_period=config.frame_period)
    refined_f0 = pyworld.stonemask(waveform, coarse_f0, timeaxis, fs)

    envelope = pyworld.cheaptrick(waveform, refined_f0, timeaxis, fs)
    envelope = trim_zeros_frames(envelope)

    return pysptk.sp2mc(envelope, order=config.order, alpha=config.alpha)
def _get_mcep(x, fs, frame_period=5, order=24):
    """Compute mel-cepstrum coefficients from a waveform.

    Args:
        x: Waveform samples; converted to ``float64`` before analysis.
        fs: Sampling rate, also used to pick the all-pass constant via
            ``pysptk.util.mcepalpha``.
        frame_period: Forwarded to ``pyworld.dio``.
        order: Forwarded to ``pysptk.sp2mc``.

    Returns:
        The array returned by ``pysptk.sp2mc``.
    """
    warp = pysptk.util.mcepalpha(fs)
    samples = x.astype(np.float64)

    coarse_f0, timeaxis = pyworld.dio(samples, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(samples, coarse_f0, timeaxis, fs)
    envelope = trim_zeros_frames(
        pyworld.cheaptrick(samples, refined_f0, timeaxis, fs))

    return pysptk.sp2mc(envelope, order=order, alpha=warp)
def collect_features(self, path):
    """Compute mel-cepstrum features for the wav file at ``path``.

    ``frame_period``, ``order`` and ``alpha`` are read from the enclosing
    scope rather than passed in.

    Args:
        path: Location of the wav file, read with ``wavfile.read``.

    Returns:
        The array returned by ``pysptk.sp2mc``.
    """
    fs, samples = wavfile.read(path)
    samples = samples.astype(np.float64)  # pyworld is fed float64 samples

    coarse_f0, timeaxis = pyworld.dio(samples, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(samples, coarse_f0, timeaxis, fs)
    envelope = trim_zeros_frames(
        pyworld.cheaptrick(samples, refined_f0, timeaxis, fs))

    return pysptk.sp2mc(envelope, order=order, alpha=alpha)
def get_features(x, fs):
    """Extract F0, mel-cepstrum and coded aperiodicity from a waveform.

    Args:
        x: Waveform samples passed straight to the ``pw`` analysis functions.
        fs: Sampling rate.

    Returns:
        tuple: ``(f0, mcep, bap)``. Only the spectral envelope goes through
        ``trim_zeros_frames``, so ``mcep`` may have fewer frames than ``f0``
        and ``bap``.
    """
    # F0: coarse estimate first, then refinement.
    coarse_f0, timeaxis = pw.dio(x, fs)
    f0 = pw.stonemask(x, coarse_f0, timeaxis, fs)

    # Mel-cepstrum from the trimmed spectral envelope.
    envelope = pw.cheaptrick(x, f0, timeaxis, fs)
    mcep = pysptk.sp2mc(
        trim_zeros_frames(envelope),
        order=24,
        alpha=pysptk.util.mcepalpha(fs),
    )

    # Band aperiodicity.
    aperiodicity = pw.d4c(x, f0, timeaxis, fs)
    bap = pw.code_aperiodicity(aperiodicity, fs)

    return f0, mcep, bap
def test_trim_zeros_frames():
    """Check ``trim_zeros_frames`` for the default and each ``trim`` mode."""
    arr = np.array(((0, 0), (0, 0), (1, 1), (2, 2), (0, 0)))

    # (keyword arguments, expected rows). The default behaves like trim='b'.
    cases = (
        ({}, ((0, 0), (0, 0), (1, 1), (2, 2))),
        ({"trim": 'b'}, ((0, 0), (0, 0), (1, 1), (2, 2))),
        ({"trim": 'f'}, ((1, 1), (2, 2), (0, 0))),
        ({"trim": 'fb'}, ((1, 1), (2, 2))),
    )
    for kwargs, expected_rows in cases:
        desired = np.array(expected_rows)
        actual = trim_zeros_frames(arr, **kwargs)
        # The feature dimension must survive trimming.
        assert desired.shape[1] == actual.shape[1]
        np.testing.assert_array_equal(actual, desired)

    # Input without any all-zero frame must come back unchanged.
    non_zeros = np.array(((1, 1), (2, 2), (3, 3), (4, 4), (5, 5)))
    desired_unchanged = np.array(
        ((1, 1), (2, 2), (3, 3), (4, 4), (5, 5)))
    for mode in ('b', 'fb'):
        actual = trim_zeros_frames(non_zeros, trim=mode)
        np.testing.assert_array_equal(actual, desired_unchanged)
def apply_each2d_trim(func2d, X, *args, **kwargs):
    """Run ``func2d`` on every trimmed 2d slice of ``X`` and re-pad the results.

    Args:
        func2d (Function): Callable applied to each trimmed 2d slice; extra
            ``*args`` / ``**kwargs`` are forwarded to it.
        X (numpy.ndarray): Input 3d array of shape (``N x T x D``)

    Returns:
        numpy.ndarray: Zero-initialised output array (``N x T x D'``) where
        each slice holds the result of ``func2d`` in its leading frames.
    """
    assert X.ndim == 3
    n_items, n_frames, _ = X.shape

    # Probe the first slice once to learn the output feature dimension.
    probe = func2d(trim_zeros_frames(X[0]), *args, **kwargs)
    assert probe.ndim == 2
    out_dim = probe.shape[1]

    Y = np.zeros((n_items, n_frames, out_dim))
    for idx, padded in enumerate(X):
        result = func2d(trim_zeros_frames(padded), *args, **kwargs)
        Y[idx][:len(result)] = result
    return Y
def test_trim_remove_zeros_frames():
    """Check that trimming/removing zero frames keeps the feature dimension.

    Runs both helpers on a spectral envelope and an aperiodicity matrix
    computed from the example audio file.
    """
    fs, samples = wavfile.read(example_audio_file())
    frame_period = 5

    samples = samples.astype(np.float64)
    f0, timeaxis = pyworld.dio(samples, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)

    matrices = [spectrogram, aperiodicity]
    for strip in (trim_zeros_frames, remove_zeros_frames):
        for mat in matrices:
            stripped = strip(mat)
            assert stripped.shape[1] == mat.shape[1]
def collect_features(emotion):
    """Load the numbered wav files of one emotion and return padded mel-cepstra.

    Files are read from ``'_<emotion>/NN.wav'`` for ``NN`` = 01 .. ``num_files``
    (single digits are zero-padded). ``num_files``, ``frame_period``,
    ``order``, ``alpha`` and the padding frame ``vuoto`` come from the
    enclosing scope.

    Args:
        emotion: Value interpolated into the directory name via ``str``.

    Returns:
        numpy.ndarray: One entry per file; each entry is the mel-cepstrum as
        a list of frames, extended with ``vuoto`` until it has at least 1000
        frames.
    """
    collected = []
    for file_no in range(1, num_files + 1):
        path = '_' + str(emotion) + '/' + str(file_no).zfill(2) + '.wav'

        samples, fs_ = sf.read(path)
        samples = samples.astype(np.float64)
        coarse_f0, time_axis = pyworld.dio(
            samples, fs_, frame_period=frame_period)
        refined_f0 = pyworld.stonemask(samples, coarse_f0, time_axis, fs_)
        envelope = trim_zeros_frames(
            pyworld.cheaptrick(samples, refined_f0, time_axis, fs_))

        frames = pysptk.sp2mc(envelope, order=order, alpha=alpha).tolist()
        # Pad short utterances up to 1000 frames; longer ones are left as is.
        frames.extend([vuoto] * (1000 - len(frames)))
        collected.append(frames)
    return np.array(collected)
def gen_waveform(y_predicted, do_postfilter=False):
    """Synthesise a waveform from predicted acoustic features.

    ``alpha``, ``fftlen``, ``fs`` and ``frame_period`` are read from the
    enclosing scope.

    Args:
        y_predicted: Predicted feature matrix; passed through
            ``trim_zeros_frames`` and then split by ``gen_parameters``.
        do_postfilter (bool): If true, ``merlin_post_filter`` is applied to
            the mel-generalised cepstrum before synthesis.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    frames = trim_zeros_frames(y_predicted)

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(frames)
    if do_postfilter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64), fs, fftlen)

    # Zero out frames flagged unvoiced, then undo the log on what remains.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        fs,
        frame_period,
    )
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    """Extract features for one utterance and save its mel-cepstrum to disk.

    Args:
        out_dir: Directory that receives ``<wav_id>-mc.npy``.
        index: Not used by this function.
        speaker_id: Returned unchanged as part of the result.
        wav_path: Audio file to analyse; the part of its last ``/``-separated
            component before the first ``.`` becomes ``wav_id``.
        text: Returned unchanged as part of the result.

    Returns:
        tuple: ``(mc_name, timesteps, text, speaker_id, lf0)`` where ``lf0``
        is a list holding the log of every non-zero F0 value and 0 elsewhere.
    """
    samples, fs = librosa.load(wav_path, sr=config.fs)
    samples = samples.astype(np.float64)

    coarse_f0, timeaxis = pyworld.dio(
        samples, fs, frame_period=config.frame_period)
    f0 = pyworld.stonemask(samples, coarse_f0, timeaxis, fs)
    envelope = trim_zeros_frames(pyworld.cheaptrick(samples, f0, timeaxis, fs))
    mc = pysptk.sp2mc(envelope, order=config.order, alpha=config.alpha)
    timesteps = mc.shape[0]

    file_name = wav_path.split("/")[-1]
    wav_id = file_name.split('.')[0]
    mc_name = '{}-mc.npy'.format(wav_id)
    np.save(os.path.join(out_dir, mc_name), mc, allow_pickle=False)

    # compute lf0: take the log only where F0 is non-zero
    lf0 = f0.copy()
    voiced = np.nonzero(f0)
    lf0[voiced] = np.log(f0[voiced])

    # Return a tuple describing this training example:
    return mc_name, timesteps, text, speaker_id, lf0.tolist()
def collect_features(self, wav_path):
    """Compute smoothed mel-cepstrum features with deltas for one wav file.

    Side effect: when ``self.alpha`` is ``None`` it is set from the file's
    sampling rate via ``pysptk.util.mcepalpha`` and kept for later calls.

    Args:
        wav_path: Location of the wav file, read with ``wavfile.read``.

    Returns:
        numpy.ndarray: ``float32`` feature matrix after dropping the 0-th
        coefficient, modulation-spectrum smoothing and delta computation.
    """
    fs, samples = wavfile.read(wav_path)
    samples = samples.astype(np.float64)

    coarse_f0, timeaxis = pyworld.dio(
        samples, fs, frame_period=hp.frame_period)
    refined_f0 = pyworld.stonemask(samples, coarse_f0, timeaxis, fs)
    envelope = P.trim_zeros_frames(
        pyworld.cheaptrick(samples, refined_f0, timeaxis, fs))

    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    features = pysptk.sp2mc(envelope, order=hp.order, alpha=self.alpha)

    # Drop 0-th coefficient
    features = features[:, 1:]

    # 50Hz cut-off MS smoothing
    samples_per_frame = int(fs * (hp.frame_period * 0.001))
    frame_rate = fs / samples_per_frame
    features = P.modspec_smoothing(features, frame_rate, cutoff=50)

    # Add delta
    features = P.delta_features(features, hp.windows)

    return features.astype(np.float32)
def collect_features(self, wav_path):
    """Return delta-augmented, smoothed mel-cepstrum features for a wav file.

    Side effect: when ``self.alpha`` is ``None`` it is set from the file's
    sampling rate via ``pysptk.util.mcepalpha`` and kept for later calls.

    Args:
        wav_path: Location of the wav file, read with ``wavfile.read``.

    Returns:
        numpy.ndarray: ``float32`` feature matrix after dropping the 0-th
        coefficient, modulation-spectrum smoothing and delta computation.
    """
    fs, signal = wavfile.read(wav_path)
    signal = signal.astype(np.float64)

    f0_estimate, timeaxis = pyworld.dio(
        signal, fs, frame_period=hp.frame_period)
    f0_refined = pyworld.stonemask(signal, f0_estimate, timeaxis, fs)
    spectral_env = pyworld.cheaptrick(signal, f0_refined, timeaxis, fs)
    spectral_env = P.trim_zeros_frames(spectral_env)

    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    cepstrum = pysptk.sp2mc(spectral_env, order=hp.order, alpha=self.alpha)

    # Drop 0-th coefficient
    cepstrum = cepstrum[:, 1:]

    # 50Hz cut-off MS smoothing
    hop = int(fs * (hp.frame_period * 0.001))
    smoothed = P.modspec_smoothing(cepstrum, fs / hop, cutoff=50)

    # Add delta
    with_deltas = P.delta_features(smoothed, hp.windows)

    return with_deltas.astype(np.float32)
def _generate_parameters(self, path, var):
    """Generate a feature sequence for ``path`` and split it into streams.

    The sequence from ``self.parameter_generator.generate`` is trimmed and
    sliced by the column indices from ``self.feature_config.get_indices()``.
    ``paramgen.mlpg`` is then run on the mgc, lf0 and bap streams with
    per-frame copies of the matching slices of ``var``.

    Args:
        path: Forwarded to ``self.parameter_generator.generate``.
        var: 1d sequence sliced with the same column indices and tiled over
            all frames.

    Returns:
        tuple: ``(mgc, lf0, vuv, bap)``; ``vuv`` is the raw column taken from
        the sequence, the other three are ``paramgen.mlpg`` outputs.
    """
    seq = trim_zeros_frames(self.parameter_generator.generate(path))
    n_frames = seq.shape[0]

    columns = self.feature_config.get_indices()
    lf0_at, vuv_at, bap_at = columns['lf0'], columns['vuv'], columns['bap']

    stream_slices = (
        slice(None, lf0_at),    # mgc
        slice(lf0_at, vuv_at),  # lf0
        slice(bap_at, None),    # bap
    )
    vuv = seq[:, vuv_at]

    window = self.analysis_config.window
    mgc, lf0, bap = (
        paramgen.mlpg(seq[:, cols], np.tile(var[cols], (n_frames, 1)), window)
        for cols in stream_slices
    )

    return mgc, lf0, vuv, bap
def collect_features(self, path):
    """Return the mel-cepstrum of ``path`` cut to the trimmed envelope length.

    Args:
        path: Wav file analysed with ``kwiiyatta.analyze_wav``.

    Returns:
        The leading frames of ``feature.mel_cepstrum.data``, as many as remain
        after ``trim_zeros_frames`` is applied to the spectrum envelope.
    """
    feature = kwiiyatta.analyze_wav(path)
    kept_frames = len(trim_zeros_frames(feature.spectrum_envelope))
    # Open question from the original author: could the frames that get
    # trimmed be shifted toward the front?
    return feature.mel_cepstrum.data[:kept_frames]
def remove_zero_frames_spectrogram(spectrogram):
    """Strip all-zero frames from ``spectrogram`` via ``trim_zeros_frames``.

    NOTE(review): this calls ``trim_zeros_frames`` with its default mode, which
    presumably trims zero frames only at the end rather than everywhere --
    confirm against the helper.
    """
    trimmed = trim_zeros_frames(spectrogram)
    return trimmed
# NOTE(review): this fragment starts mid-script; `speaker`, `d`, `dst_dir`,
# `X_dataset` and `Y_dataset` are bound before the visible code.
print("Destination dir for {}: {}".format(speaker, d))
if not exists(d):
    os.makedirs(d)

# Convert to arrays
print("Convert datasets to arrays")
X, Y = X_dataset.asarray(verbose=1), Y_dataset.asarray(verbose=1)

# Alignment
print("Perform alignment")
X, Y = DTWAligner().transform((X, Y))

print("Save features to disk")
for idx, (x, y) in tqdm(enumerate(zip(X, Y))):
    # paths: reuse each collected file's base name (extension stripped)
    src_name = splitext(basename(X_dataset.collected_files[idx][0]))[0]
    tgt_name = splitext(basename(Y_dataset.collected_files[idx][0]))[0]
    src_path = join(dst_dir, "X", src_name)
    tgt_path = join(dst_dir, "Y", tgt_name)

    # Trim and adjust frames
    x = P.trim_zeros_frames(x)
    y = P.trim_zeros_frames(y)
    # presumably brings x and y to a common length that is a multiple of 2;
    # confirm against P.adjust_frame_lengths
    x, y = P.adjust_frame_lengths(x, y, pad=True, divisible_by=2)

    # Save (np.save appends ".npy" because the paths carry no extension)
    np.save(src_path, x)
    np.save(tgt_path, y)

# Terminate with a success status once every pair has been written.
sys.exit(0)
# NOTE(review): this fragment starts mid-script; `speaker`, `d`, `dst_dir`,
# `X_dataset` and `Y_dataset` are bound before the visible code.
print("Destination dir for {}: {}".format(speaker, d))
if not exists(d):
    os.makedirs(d)

# Convert to arrays
print("Convert datasets to arrays")
X, Y = X_dataset.asarray(verbose=1), Y_dataset.asarray(verbose=1)

# Alignment
print("Perform alignment")
X, Y = DTWAligner().transform((X, Y))

print("Save features to disk")
for idx, (x, y) in tqdm(enumerate(zip(X, Y))):
    # paths: reuse each collected file's base name (extension stripped)
    src_name = splitext(basename(X_dataset.collected_files[idx][0]))[0]
    tgt_name = splitext(basename(Y_dataset.collected_files[idx][0]))[0]
    src_path = join(dst_dir, "X", src_name)
    tgt_path = join(dst_dir, "Y", tgt_name)

    # Trim and adjust frames
    x = P.trim_zeros_frames(x)
    y = P.trim_zeros_frames(y)
    # presumably brings x and y to a common length that is a multiple of 2;
    # confirm against P.adjust_frame_lengths
    x, y = P.adjust_frame_lengths(x, y, pad=True, divisible_by=2)

    # Save (np.save appends ".npy" because the paths carry no extension)
    np.save(src_path, x)
    np.save(tgt_path, y)

# Terminate with a success status once every pair has been written.
sys.exit(0)
def TrimmedDataset(feature):
    """Return ``feature`` sliced to the length of its trimmed spectrum envelope.

    Args:
        feature: Object exposing ``spectrum_envelope`` and supporting slicing.

    Returns:
        ``feature[:n]`` where ``n`` is the number of frames left after
        ``trim_zeros_frames`` is applied to ``feature.spectrum_envelope``.
    """
    kept_frames = len(trim_zeros_frames(feature.spectrum_envelope))
    # Open question from the original author: could the frames that get
    # trimmed be shifted toward the front?
    return feature[:kept_frames]