def spec_to_waveform(spectrogram, order, fs, frame_period):
    """Filter a signal with the MLSA filter derived from a spectral envelope.

    The spectrogram is converted to mel-cepstrum and then to MLSA filter
    coefficients, which are applied frame by frame to the signal ``x``.

    Args:
        spectrogram: Spectral envelope handed to ``pysptk.sp2mc``.
        order: Mel-cepstrum order; also the order of the MLSA filter.
        fs: Sampling rate, used for the warping constant and the hop size.
        frame_period: Frame shift in milliseconds.

    Returns:
        Whatever ``Synthesizer.synthesis`` returns for ``x``.
    """
    # NOTE(review): ``x`` is neither a parameter nor a local of this function;
    # it is looked up in an enclosing scope at call time -- confirm that the
    # module defines it before this function is called.
    warp = pysptk.util.mcepalpha(fs)
    samples_per_frame = int(fs * (frame_period * 0.001))
    mel_cepstrum = pysptk.sp2mc(spectrogram, order=order, alpha=warp)
    mlsa = Synthesizer(MLSADF(order=order, alpha=warp),
                       hopsize=samples_per_frame)
    coefs = pysptk.mc2b(mel_cepstrum.astype(np.float64), alpha=warp)
    return mlsa.synthesis(x, coefs)
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on the wav file at ``path`` and synthesize audio.

    Args:
        model: Network called as ``model(features, R)``; must return
            ``(y_hat, y_hat_static)``.
        path: Path of the source wav file.
        data_mean: Mean used to scale the static+delta features.
        data_std: Standard deviation used to scale the features.
        diffvc: If True, filter the input waveform with the differential
            mel-cepstrum (MLSA filter); otherwise resynthesize with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` are the static
        input features and ``outputs`` the denormalized static predictions.
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    # Derive the hop size from the sampling rate instead of relying on a
    # module-level ``hop_length`` that this function never defined.
    hop_length = int(fs * (hp.frame_period * 0.001))

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the 0th (power) coefficient aside; only the rest is converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: predicted minus source static features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def pysptk_imfcc(self):
    """Resynthesize a waveform from ``self.mc`` with an MLSA filter and plot it.

    Reads ``self.mc``, ``self.alpha``, ``self.order``, ``self.hop_length``,
    ``self.source_excitation`` and ``self.sr``. Nothing is returned; the
    waveform plot is the only result.
    """
    from pysptk.synthesis import MLSADF, Synthesizer

    # Convert mel-cepstrum to MLSADF coefficients
    b = pysptk.mc2b(self.mc, self.alpha)
    synthesizer = Synthesizer(MLSADF(order=self.order, alpha=self.alpha),
                              self.hop_length)
    x_synthesized = synthesizer.synthesis(self.source_excitation, b)
    librosa.display.waveplot(x_synthesized, sr=self.sr)
def __test_synthesis(filt):
    """Drive ``filt`` with dummy data and check that the output is finite."""
    hopsize = 80

    # dummy source excitation
    excitation = __dummy_source()

    # dummy filter coefficients, one cepstrum vector per windowed frame
    frames = __dummy_windowed_frames(
        excitation, frame_len=512, hopsize=hopsize)
    coefs = np.apply_along_axis(pysptk.mcep, 1, frames, filt.order, 0.0)

    # synthesis
    output = Synthesizer(filt, hopsize).synthesis(excitation, coefs)
    assert np.all(np.isfinite(output))
def __test_synthesis(filt):
    """Smoke-test a synthesis filter: synthesized samples must all be finite."""
    # dummy source excitation
    src = __dummy_source()
    hop = 80

    # dummy filter coef.
    windowed_frames = __dummy_windowed_frames(src, frame_len=512, hopsize=hop)
    filter_coefs = np.apply_along_axis(
        pysptk.mcep, 1, windowed_frames, filt.order, 0.0)

    # synthesis
    engine = Synthesizer(filt, hop)
    synthesized = engine.synthesis(src, filter_coefs)
    finite_mask = np.isfinite(synthesized)
    assert np.all(finite_mask)
def synthesize(self, pitch, mc, unnormalize=False):
    """Synthesize a waveform from a pitch track and mel-cepstrum frames.

    Args:
        pitch: Per-frame pitch values passed to ``sptk.excite``.
        mc: Per-frame mel-cepstrum coefficients (frames x coefficients).
        unnormalize: If True and ``self.mean`` is set, undo mean/variance
            normalization (``mc * stdev + mean``) on the first
            ``self.num_params + 1`` coefficients of the first ``len(pitch)``
            frames before synthesis.

    Returns:
        The synthesized signal from ``Synthesizer.synthesis``.

    Note:
        The caller's ``mc`` is no longer modified in place; un-normalization
        is applied to a float64 copy.
    """
    # Work on a copy so repeated calls do not un-normalize the same data twice.
    mc = np.array(mc, dtype=np.float64)
    pitch = np.asarray(pitch, dtype=np.float64)

    # ``is not None`` instead of ``!= None``: the latter is evaluated
    # element-wise on NumPy arrays and makes the ``if`` raise.
    if unnormalize and self.mean is not None:
        n_frames = len(pitch)
        n_coefs = self.num_params + 1
        mean = np.asarray(self.mean, dtype=np.float64)[:n_coefs]
        stdev = np.asarray(self.stdev, dtype=np.float64)[:n_coefs]
        mc[:n_frames, :n_coefs] = mc[:n_frames, :n_coefs] * stdev + mean

    # Hop size in samples; must be an integer (true division would yield a
    # float under Python 3).
    hop_size = int(self.frame_len * self.sample_rate / 1000)

    b = sptk.mc2b(mc, self.alpha)
    synthesizer = Synthesizer(
        MLSADF(order=self.num_params, alpha=self.alpha), hop_size)
    source_excitation = sptk.excite(pitch, hop_size)
    x_synthesized = synthesizer.synthesis(source_excitation, b)
    return x_synthesized
def __test_synthesis(filt):
    """Check that LSP-driven synthesis with ``filt`` yields finite samples."""
    hopsize = 80

    # dummy source excitation
    excitation = __dummy_source()

    # dummy filter coef.: per-frame LPC converted to LSP
    frames = __dummy_windowed_frames(
        excitation, frame_len=512, hopsize=hopsize)
    lpc_frames = np.apply_along_axis(pysptk.lpc, 1, frames, filt.order)
    lsp_frames = np.apply_along_axis(pysptk.lpc2lsp, 1, lpc_frames)

    # make sure lsp has loggain
    gain = lsp_frames[:, 0]
    lsp_frames[:, 0] = np.log(gain)

    # synthesis
    output = Synthesizer(filt, hopsize).synthesis(excitation, lsp_frames)
    assert np.all(np.isfinite(output))
def __test_synthesis(filt):
    """Smoke-test ``filt`` with line spectral pairs computed from dummy frames."""
    # dummy source excitation
    src = __dummy_source()
    hop = 80

    # dummy filter coef.
    windowed_frames = __dummy_windowed_frames(src, frame_len=512, hopsize=hop)
    lsp = np.apply_along_axis(
        pysptk.lpc2lsp,
        1,
        np.apply_along_axis(pysptk.lpc, 1, windowed_frames, filt.order),
    )
    # make sure lsp has loggain
    lsp[:, 0] = np.log(lsp[:, 0])

    # synthesis
    engine = Synthesizer(filt, hop)
    synthesized = engine.synthesis(src, lsp)
    assert np.all(np.isfinite(synthesized))
def __test_synthesis(filt):
    """Check that PARCOR-driven synthesis with ``filt`` yields finite samples."""
    hopsize = 80

    # dummy source excitation
    excitation = __dummy_source()

    # dummy filter coef.: LPC of each windowed frame converted to PARCOR
    frames = __dummy_windowed_frames(
        excitation, frame_len=512, hopsize=hopsize)
    parcor = pysptk.lpc2par(pysptk.lpc(frames, filt.order))

    # make sure par has loggain
    parcor[:, 0] = np.log(parcor[:, 0])

    # synthesis
    output = Synthesizer(filt, hopsize).synthesis(excitation, parcor)
    assert np.all(np.isfinite(output))
def __test_synthesis_levdur(filt):
    """Smoke-test ``filt`` with LPC obtained via cepstrum -> autocorrelation -> levdur."""
    hopsize = 80

    # dummy source excitation
    excitation = __dummy_source()

    # dummy filter coef.
    frames = __dummy_windowed_frames(
        excitation, frame_len=512, hopsize=hopsize)
    cepstrum = pysptk.mcep(frames, filt.order)
    autocorr = pysptk.c2acr(cepstrum)
    lpc_frames = pysptk.levdur(autocorr)

    # make sure lpc has loggain
    lpc_frames[:, 0] = np.log(lpc_frames[:, 0])

    # synthesis
    output = Synthesizer(filt, hopsize).synthesis(excitation, lpc_frames)
    assert np.all(np.isfinite(output))
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    """Convert one utterance with the GMM and return the synthesized waveform.

    Relies on names from the enclosing module: ``gmm``, ``windows``,
    ``frame_period``, ``order``, ``alpha``, ``use_delta``, ``static_dim``,
    ``hop_length`` and ``fftlen``.

    Args:
        src_path: Path of the source wav file.
        tgt_path: Not used by this function.
        disable_mlpg: If True, use a single identity window for parameter
            generation instead of ``windows``.
        diffvc: If True, apply the differential mel-cepstrum to the input
            waveform with an MLSA filter; otherwise resynthesize with WORLD.

    Returns:
        The converted waveform.
    """
    # GMM-based parameter generation is provided by the library in `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # (A leftover ``pdb.set_trace()`` breakpoint was removed here; it stopped
    # every call in the interactive debugger.)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    # Keep the 0th (power) coefficient aside; only the rest is converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))

    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    return waveform
def apply_mlsa_filter(wav, mcep):
    """Filter ``wav`` with an MLSA filter built from ``mcep``.

    When the sampling rates differ, ``mcep`` is first brought to ``wav.fs``.
    The 0th (power) coefficient is zeroed before filtering.

    Args:
        wav: Object exposing ``fs`` and ``data``.
        mcep: Mel-cepstrum object exposing ``fs``, ``frame_period``, ``order``,
            ``data``, ``alpha()`` and ``extract_spectrum()``.

    Returns:
        ``kwiiyatta.Wavdata`` holding the filtered waveform at ``wav.fs``.
    """
    if mcep.fs > wav.fs:
        mcep = kwiiyatta.resample(mcep, wav.fs)
    elif mcep.fs < wav.fs:
        envelope = kwiiyatta.Synthesizer.resample_spectrum_envelope(
            mcep.extract_spectrum(), mcep.fs, wav.fs)
        cutoff = mcep.fs*envelope.shape[1]//wav.fs
        # Fill every bin from ``cutoff`` upward with the value of the last
        # bin below it.
        edge_column = np.atleast_2d(envelope[:, cutoff-1]).T
        envelope[:, cutoff:] = np.tile(edge_column,
                                       envelope.shape[-1]-cutoff)
        rebuilt = kwiiyatta.MelCepstrum(wav.fs, mcep.frame_period)
        rebuilt.extract(envelope)
        mcep = rebuilt

    # remove power coefficients
    n_frames = mcep.data.shape[0]
    mc = np.hstack((np.zeros((n_frames, 1)), mcep.data[:, 1:]))

    alpha = mcep.alpha()
    mlsa = MLSADF(order=mcep.order, alpha=alpha)
    hop = int(mcep.fs * (mcep.frame_period * 0.001))
    engine = Synthesizer(mlsa, hopsize=hop)
    coefs = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    filtered = engine.synthesis(wav.data, coefs)
    return kwiiyatta.Wavdata(wav.fs, filtered)
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    """Convert the utterance at ``path_src`` with the GMM and synthesize it.

    Uses the module-level ``gmm``, ``windows``, ``frame_period``, ``order``,
    ``alpha``, ``use_delta``, ``static_dim``, ``hop_length`` and ``fftlen``.
    ``path_tgt`` is accepted but not used.

    Returns:
        The converted waveform: MLSA-filtered input when ``diffvc`` is True,
        WORLD resynthesis otherwise.
    """
    # A single identity window stands in for ``windows`` when MLPG is disabled.
    mlpg_windows = [(0, 0, np.array([1.0]))] if disable_mlpg else windows
    paramgen = MLPG(gmm, windows=mlpg_windows, diff=diffvc)

    signal, fs_ = sf.read(path_src)
    signal = signal.astype(np.float64)

    f0, time_axis = pyworld.dio(signal, fs_, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, time_axis, fs_)
    spectrogram = pyworld.cheaptrick(signal, f0, time_axis, fs_)
    aperiodicity = pyworld.d4c(signal, f0, time_axis, fs_)

    cepstrum = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    power, features = cepstrum[:, 0], cepstrum[:, 1:]
    if use_delta:
        features = delta_features(features, windows)
    features = paramgen.transform(features)
    if disable_mlpg and features.shape[-1] != static_dim:
        features = features[:, :static_dim]
    assert features.shape[-1] == static_dim

    mc = np.hstack((power[:, None], features))

    if not diffvc:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        return pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs_, frame_period)

    # Differential conversion: drop the power term and filter the input.
    mc[:, 0] = 0
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    return engine.synthesis(signal, b)
class MCEP(object):
    """Mel-cepstrum analysis and MLSA-filter resynthesis helper.

    All analysis settings (frame length, hop length, sampling rate, order and
    alpha) are read from the module-level ``config.data``.
    """

    def __init__(self, need_synth):
        """Load settings; build the synthesizer right away if ``need_synth``."""
        self.frame_length = config.data.fftl
        self.hop_length = config.data.hop_length
        self.sr = config.data.sr
        self.order = config.data.mcep_order
        self.alpha = config.data.mcep_alpha
        # Stays None until build_synth() is called.
        self.synthesizer = None
        if need_synth:
            self.build_synth()

    def build_synth(self):
        """Create the MLSA-filter synthesizer from the configured settings."""
        self.synthesizer = Synthesizer(
            MLSADF(order=self.order, alpha=self.alpha), self.hop_length)

    def get_MCEP(self, utterance):
        """Extract mel-cepstrum frames and a pitch track from ``utterance``.

        Returns:
            Tuple ``(mcep, pitch)``.
        """
        utterance = librosa.util.normalize(utterance)
        # Add very low-level Gaussian noise, then normalize again.
        # NOTE(review): presumably dithering to keep silent frames from
        # breaking the analysis -- confirm.
        utterance = utterance + np.random.normal(
            loc=0, scale=0.0000001, size=utterance.shape[0])
        utterance = librosa.util.normalize(utterance)

        utterance = utterance.astype(np.float64)  # necessary for synthesizer
        frames = librosa.util.frame(
            utterance,
            frame_length=self.frame_length,
            hop_length=self.hop_length).astype(np.float64).T

        # Windowing
        frames *= pysptk.blackman(self.frame_length)
        assert frames.shape[1] == self.frame_length

        # Pitch
        pitch = pysptk.swipe(utterance.astype(np.float64), fs=self.sr,
                             hopsize=self.hop_length, min=60, max=240,
                             otype="pitch")

        mcep = pysptk.mcep(frames, self.order, self.alpha)
        return mcep, pitch

    def synthesize_from_MCEP(self, mcep, pitch):
        """Synthesize a waveform from mel-cepstrum frames and a pitch track.

        The synthesizer is built on first use, so instances created with
        ``need_synth=False`` can still synthesize.
        """
        if self.synthesizer is None:
            self.build_synth()

        mcep = mcep.copy(order='C')  # fixes "ndarray not C-contiguous" error
        b = pysptk.mc2b(mcep, self.alpha)
        excitation = pysptk.excite(pitch.astype(np.float64), self.hop_length)
        x = self.synthesizer.synthesis(excitation.astype(np.float64),
                                       b.astype(np.float64))
        return x
def __test_synthesis(filt):
    """Smoke-test MLSA synthesis with ``filt``, with and without ``transpose``."""
    hopsize = 80

    # dummy source excitation
    excitation = __dummy_source()

    # dummy filter coef.
    frames = __dummy_windowed_frames(
        excitation, frame_len=512, hopsize=hopsize)
    mel_cepstrum = pysptk.mcep(frames, filt.order, filt.alpha)
    coefs = pysptk.mc2b(mel_cepstrum, filt.alpha)

    # First the default synthesizer, then the transposed variant.
    for extra in ({}, {"transpose": True}):
        engine = Synthesizer(filt, hopsize, **extra)
        output = engine.synthesis(excitation, coefs)
        assert np.all(np.isfinite(output))
def pitch_shift_on_lpc_residual(
    wav,
    sr,
    shift_in_cent,
    frame_length=4096,
    hop_length=240,
    mgc_order=59,
):
    """Pitch-shift ``wav`` by shifting its LPC residual and re-filtering.

    The waveform is inverse-filtered with per-frame LPC coefficients, the
    resulting residual is pitch-shifted, and the shifted residual is filtered
    by the same LPC coefficients.

    Args:
        wav: int16 waveform (enforced by an assertion).
        sr: Sampling rate.
        shift_in_cent: Shift amount in cents (1200 bins per octave).
        frame_length: Analysis frame length in samples.
        hop_length: Frame shift in samples.
        mgc_order: Order of the mel-cepstrum / LPC analysis.

    Returns:
        The shifted waveform as int16, clipped to the int16 range.
    """
    assert wav.dtype == np.int16
    frames = (librosa.util.frame(wav,
                                 frame_length=frame_length,
                                 hop_length=hop_length).astype(np.float64).T)
    frames *= pysptk.blackman(frame_length)
    alpha = pysptk.util.mcepalpha(sr)

    mgc = pysptk.mcep(frames, mgc_order, alpha, eps=1e-5, etype=1)
    # Frequency transform with -alpha, then derive LPC via autocorrelation.
    c = pysptk.freqt(mgc, mgc_order, -alpha)
    lpc = pysptk.levdur(pysptk.c2acr(c, mgc_order, frame_length))
    # remove gain
    lpc[:, 0] = 0

    # Compute LPC residual
    synth = Synthesizer(AllZeroDF(mgc_order), hop_length)
    wav_lpc = synth.synthesis(wav.astype(np.float64), -lpc)
    residual = wav - wav_lpc

    # Pitch-shift on LPC residual
    residual_shifted = librosa.effects.pitch_shift(residual,
                                                   sr=sr,
                                                   n_steps=shift_in_cent,
                                                   bins_per_octave=1200)

    # Filtering by LPC
    synth = Synthesizer(AllPoleDF(mgc_order), hop_length)
    wav_shifted = synth.synthesis(residual_shifted, lpc)

    # Clip before the cast: a bare astype(int16) wraps out-of-range samples
    # around instead of saturating them.
    int16_range = np.iinfo(np.int16)
    wav_shifted = np.clip(wav_shifted, int16_range.min, int16_range.max)
    return wav_shifted.astype(np.int16)
def __test_synthesis(filt):
    """Smoke-test ``filt`` with negated, gain-free LPC coefficients."""
    hopsize = 80

    # dummy source excitation
    excitation = __dummy_source()

    # dummy filter coef.: LPC with the gain term zeroed, then negated
    frames = __dummy_windowed_frames(
        excitation, frame_len=512, hopsize=hopsize)
    lpc_frames = pysptk.lpc(frames, filt.order)
    lpc_frames[:, 0] = 0
    coefs = -lpc_frames

    def run(engine):
        # Every synthesized sample must be finite.
        assert np.all(np.isfinite(engine.synthesis(excitation, coefs)))

    # synthesis
    run(Synthesizer(filt, hopsize))

    # transpose
    run(Synthesizer(filt, hopsize, transpose=True))
# Utterance names to process, one per line.
with open("conf/eval.list", "r") as f:
    datalist = [line.rstrip() for line in f]

for name in datalist:
    outfile = "result/wav/{}_diff.wav".format(name)

    # Converted and source mel-generalized cepstra: raw little-endian float64,
    # ``dim`` values per frame.
    with open("data/SF-TF/mgc/{}.mgc".format(name), "rb") as f:
        conv_mgc = np.fromfile(f, dtype="<f8", sep="")
        conv_mgc = conv_mgc.reshape(len(conv_mgc) // dim, dim)
    with open("data/SF/mgc/{}.mgc".format(name), "rb") as f:
        src_mgc = np.fromfile(f, dtype="<f8", sep="")
        src_mgc = src_mgc.reshape(len(src_mgc) // dim, dim)

    # Load the input speech itself.
    fs, data = wavfile.read("data/SF/wav/{}.wav".format(name))
    # ``np.float`` was removed in NumPy 1.24; float64 is the same type.
    data = data.astype(np.float64)

    # Prepare the differential filter.
    diff_mgc = conv_mgc - src_mgc
    # NOTE(review): the difference computed above is immediately replaced by
    # zeros, so the filter applied below carries no conversion. This is kept
    # as found -- confirm whether the overwrite is intentional.
    diff_mgc = np.zeros(shape=conv_mgc.shape)

    # Apply the differential filter to the input waveform.
    b = np.apply_along_axis(sptk.mc2b, 1, diff_mgc, alpha)
    synthesizer = Synthesizer(MLSADF(order=dim - 1, alpha=alpha), 80)
    owav = synthesizer.synthesis(data, b)

    # Saturate to the int16 range before the cast.
    owav = np.clip(owav, -32768, 32767)
    wavfile.write(outfile, fs, owav.astype(np.int16))
# Pitch extraction, then an excitation signal generated from it.
pitch = pysptk.swipe(x.astype(np.float64), fs=sr, hopsize=hop_length, min=60, max=240, otype="pitch")
source_excitation = pysptk.excite(pitch, hop_length)
# Mel-cepstrum of the given order
mc = pysptk.mcep(frames, order, alpha)
logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
print(mc.shape)
#plt.plot(mc)
#plotname="x_syn_coefs_" + str(order) + ".png"
#plt.savefig(plotname)
# Convert mel-cepstrum to MLSADF coefficients
b = pysptk.mc2b(mc, alpha)
synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
x_synthesized = synthesizer.synthesis(source_excitation, b)
# Output file name carries order + 1.
filenam = "synthesized_sounds/" + "x_syn" + str(order + 1) + ".wav"
#wavfile.write("x.wav", sr, x)
wavfile.write(filenam, sr, x_synthesized)
# Log "<order>,<elapsed seconds>" for this run; ``start`` and ``f`` are
# defined outside this fragment.
time_total = time.time() - start
writestring = str(order) + "," + str(time_total) + "\n"
f.write(writestring)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert the waveform ``x`` with ``model`` and synthesize the result.

    Args:
        model: Conversion network. If ``model.include_parameter_generation()``
            is true it is called with the MLPG matrix and returns
            ``(y_hat, y_hat_static)``; otherwise MLPG is applied afterwards.
        x: Input waveform samples.
        fs: Sampling rate of ``x``.
        data_mean: Mean used to scale the static+delta features.
        data_std: Standard deviation used to scale the features.
        diffvc: If True, filter ``x`` with the differential mel-cepstrum;
            otherwise resynthesize with WORLD.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesized waveform, the
        static input features and the denormalized static predictions.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, static_mc = full_mc[:, 0], full_mc[:, 1:]
    static_dim = static_mc.shape[-1]
    smoothed = P.modspec_smoothing(static_mc, fs / hop_length, cutoff=50)
    feats = P.delta_features(smoothed, hp.windows).astype(np.float32)

    n_frames = feats.shape[0]
    inputs = feats[:, :static_dim].copy()

    # Normalization
    net_in = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(net_in)]

    # Add batch axis
    net_in = net_in.view(1, -1, net_in.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, n_frames))

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        _, y_hat_static = model(net_in, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(net_in, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if not diffvc:
        mc = np.hstack((c0[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)
        return waveform, inputs, outputs

    # Differential features: predicted minus source static features.
    mc = np.hstack((c0[:, None], predicted - feats[:, :static_dim]))
    mc[:, 0] = 0  # remove power coefficients
    engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                         hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)
    return waveform, inputs, outputs
hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (i.e. extraction of the spectral envelope)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert mel-cepstrum coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Build the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, synthesize different voices by varying synthesis parameters

# ### Pitch shift (raise the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # use a factor below 1 to raise the pitch
excitation_pitchhigh = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
y = synthesizer.synthesis(excitation_pitchhigh, mlsa_coef)  # synthesize speech
# NOTE(review): a bare int16 cast wraps out-of-range samples; consider
# clipping first.
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# ### Pitch shift (lower the pitch) ###
OUT_WAVE_FILE = "pitchshift_low.wav"
PITCH_SHIFT = 1.5  # use a factor above 1 to lower the pitch
excitation_pitchlow = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients by linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
# Replace the first coefficient of every frame by its logarithm.
lpc[:, 0] = np.log(lpc[:, 0])

# Convert LPC coefficients to PARCOR coefficients
parcor = pysptk.lpc2par(lpc)

# Build the all-pole filter
synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, parcor)

# Write the audio
# NOTE(review): a bare int16 cast wraps out-of-range samples; consider
# clipping first.
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
# Parameter generation
paramgen = MLPG(gmm, windows=windows, diff=True)

# Create the output directory once, up front; ``exist_ok`` avoids the
# check-then-create race of ``if not exists(...)``.
os.makedirs('resultsVC', exist_ok=True)

# Waveform generation for test set
for idx, path in enumerate(source.test_paths):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    # aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    # Keep the 0th (power) coefficient aside; only the rest is converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    mc = delta_features(mc, windows)

    since = time.time()
    mc = paramgen.transform(mc)
    print("{}, Elapsed time in conversion: {}s".format(idx, time.time() - since))
    assert mc.shape[-1] == static_dim

    mc = np.hstack((c0[:, None], mc))
    mc[:, 0] = 0  # differential filter: drop the power coefficient

    # A fresh synthesizer per utterance; the hop size is fixed at 80 samples.
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=80)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)

    # Saturate to the int16 range; a bare cast would wrap out-of-range samples.
    waveform = np.clip(waveform, -32768, 32767)
    wavfile.write(
        "resultsVC/{}_{}.wav".format(splitext(basename(path))[0], 'mlpg'),
        fs, waveform.astype(np.int16))
hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear predictive coding (LPC) coefficients by linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
# Replace the first coefficient of every frame by its logarithm.
lpc[:, 0] = np.log(lpc[:, 0])

# Convert LPC coefficients to line spectral pairs (LSP)
lsp = pysptk.lpc2lsp(lpc, otype=0, fs=fs)

# Build the all-pole filter
synthesizer = Synthesizer(LSPDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, lsp)

# Write the audio
# NOTE(review): a bare int16 cast wraps out-of-range samples; consider
# clipping first.
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
def build_synth(self):
    """Create the MLSA-filter synthesizer and store it on ``self.synthesizer``."""
    mlsa_filter = MLSADF(order=self.order, alpha=self.alpha)
    self.synthesizer = Synthesizer(mlsa_filter, self.hop_length)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on waveform ``x`` and return audio plus features.

    The waveform is analyzed with WORLD, its mel-cepstrum (without the power
    term) is smoothed, extended with delta features, scaled and fed to
    ``model``. The static prediction is denormalized and used either as a
    differential MLSA filter on ``x`` (``diffvc=True``) or for a WORLD
    resynthesis (``diffvc=False``).

    Returns:
        ``(waveform, inputs, outputs)`` -- synthesized waveform, static input
        features, denormalized static predictions.
    """
    model.eval()

    frame_shift = int(fs * (hp.frame_period * 0.001))
    signal = x.astype(np.float64)

    f0, timeaxis = pyworld.dio(signal, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    envelope = pyworld.cheaptrick(signal, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(signal, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    cepstrum = pysptk.sp2mc(envelope, order=hp.order, alpha=alpha)
    power = cepstrum[:, 0]
    cepstrum = cepstrum[:, 1:]
    static_dim = cepstrum.shape[-1]

    cepstrum = P.modspec_smoothing(cepstrum, fs / frame_shift, cutoff=50)
    cepstrum = P.delta_features(cepstrum, hp.windows).astype(np.float32)
    source_static = cepstrum[:, :static_dim]
    inputs = source_static.copy()

    # Normalization
    scaled = P.scale(cepstrum, data_mean, data_std)
    scaled = Variable(torch.from_numpy(scaled))
    lengths = [len(scaled)]

    # Add batch axis
    batch = scaled.view(1, -1, scaled.size(-1))

    # For MLPG
    mlpg_matrix = unit_variance_mlpg_matrix(hp.windows, cepstrum.shape[0])
    mlpg_matrix = torch.from_numpy(mlpg_matrix)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        _, static_out = model(batch, mlpg_matrix, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        static_out = multi_stream_mlpg(
            model(batch, lengths=lengths), mlpg_matrix,
            hp.stream_sizes, hp.has_dynamic_features)

    # Denormalize
    prediction = P.inv_scale(
        static_out.data.cpu().numpy().reshape(-1, static_dim),
        data_mean[:static_dim], data_std[:static_dim])
    outputs = prediction.copy()

    # Differential conversion filters with (prediction - source).
    target = prediction - source_static if diffvc else prediction
    mc = np.hstack((power[:, None], target))

    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        mlsa = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                           hopsize=frame_shift)
        coefs = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = mlsa.synthesis(signal, coefs)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        envelope = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, envelope, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
# Slice the audio into frames and apply a window
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Extract linear prediction coefficients by linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
# Replace the first coefficient of every frame by its logarithm.
lpc[:, 0] = np.log(lpc[:, 0])

# Build the all-pole filter
synthesizer = Synthesizer(AllPoleDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, lpc)

# Write the audio. Saturate to the int16 range first: a bare astype(int16)
# wraps out-of-range samples around instead of clipping them.
y = np.clip(y, -32768, 32767).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
# Analyze the speech (fundamental frequency, spectral envelope, aperiodicity);
# only the spectral envelope is kept.
# NOTE(review): wav2world is called without an explicit frame period while
# the excitation below uses HOP_LENGTH -- confirm both use the same frame
# shift.
_, sp, _ = pyworld.wav2world(x, fs)

# Extract mel-cepstrum coefficients from the WORLD spectral envelope
mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Convert mel-cepstrum coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mcep, ALPHA)

# Build the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# Drive the MLSA filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# Write the audio. Saturate to the int16 range first: a bare astype(int16)
# wraps out-of-range samples around instead of clipping them.
y = np.clip(y, -32768, 32767).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)