def test_world_array_order():
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)

    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    data = wav.data[::2]  # strided view -> not C-contiguous
    expected_msg = 'ndarray is not C-contiguous'

    with pytest.raises(ValueError) as e:
        f0, timeaxis = pyworld.dio(data, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        f0 = pyworld.stonemask(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.cheaptrick(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.d4c(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs)
    assert expected_msg == str(e.value)
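# A minimal sketch (assuming only numpy and pyworld) of the property the test
# above pins down: strided views such as wav.data[::2] are rejected by
# pyworld's Cython bindings, and np.ascontiguousarray restores the C order
# (and float64 dtype) it requires.
import numpy as np
import pyworld


def to_world_ready(x):
    # pyworld expects float64, C-contiguous input
    return np.ascontiguousarray(x, dtype=np.float64)


# e.g. pyworld.dio(to_world_ready(data), fs) no longer raises ValueError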
def estimate_word(word, name):
    if os.path.exists(f'rijeci_wav/{word}_{name}.wav'):
        f_bef, fs = sf.read(f'rijeci_wav/{word}_{name}.wav')
        f0, timeaxis = pw.harvest(f_bef, fs)
        f0_mask = pw.stonemask(f_bef, f0, timeaxis, fs)
        sp = pw.cheaptrick(f_bef, f0_mask, timeaxis, fs)
        ap = pw.d4c(f_bef, f0_mask, timeaxis, fs)

        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)
        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}-def.wav', y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}-def.png',
                [f_bef, y], word)

        y = pw.synthesize(f0_mask, sp, ap, fs, 3.0)
        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}.wav', y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}.png',
                [f_bef, y], word)

        y = pw.synthesize(f0_mask, sp, ap, fs, 20.0)
        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}-20.wav', y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}-20.png',
                [f_bef, y], word)
def pw2wav(features, feat_dim=513, fs=16000):
    '''NOTE: Use `order='C'` to ensure Cython compatibility.'''
    if isinstance(features, dict):
        en = np.reshape(features['en'], [-1, 1])
        sp = np.power(10., features['sp'])
        sp = en * sp
        return pw.synthesize(
            features['f0'].astype(np.float64).copy(order='C'),
            sp.astype(np.float64).copy(order='C'),
            features['ap'].astype(np.float64).copy(order='C'),
            fs,
        )

    features = features.astype(np.float64)
    sp = features[:, :feat_dim]
    ap = features[:, feat_dim:feat_dim * 2]
    f0 = features[:, feat_dim * 2]
    en = features[:, feat_dim * 2 + 1]
    en = np.reshape(en, [-1, 1])
    sp = np.power(10., sp)
    sp = en * sp
    return pw.synthesize(
        f0.copy(order='C'),
        sp.copy(order='C'),
        ap.copy(order='C'),
        fs,
    )
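# A minimal usage sketch for pw2wav above, assuming soundfile is available;
# the dummy feature values (zeros/ones) and output filename are illustrative,
# only the 513-bin layout follows the function's own defaults.
import numpy as np
import soundfile as sf

T, feat_dim = 200, 513
features = {
    'f0': np.zeros(T),              # all-unvoiced dummy contour
    'sp': np.zeros((T, feat_dim)),  # log10 spectral envelope
    'ap': np.zeros((T, feat_dim)),  # aperiodicity
    'en': np.ones(T),               # per-frame energy
}
y = pw2wav(features, feat_dim=feat_dim, fs=16000)
sf.write('pw2wav_demo.wav', y, 16000)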
def word_synthesis(word, file_name):
    if os.path.exists(f'{PROCESSED_WORDS_DIRECTORY}/{file_name}_{word}.wav'):
        data, samplerate = sf.read(
            f'{PROCESSED_WORDS_DIRECTORY}/{file_name}_{word}.wav')
        f0, timeaxis = pw.harvest(data, samplerate)
        f0_mask = pw.stonemask(data, f0, timeaxis, samplerate)
        spectral_envelop = pw.cheaptrick(data, f0_mask, timeaxis, samplerate)
        aperiodicity = pw.d4c(data, f0_mask, timeaxis, samplerate)

        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate,
                                         pw.default_frame_period)
        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_default.wav',
                 synthesized_word, samplerate)
        savefig(
            f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_default.png',
            [data, synthesized_word], word)

        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate, 3.0)
        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_3.wav',
                 synthesized_word, samplerate)
        savefig(f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_3.png',
                [data, synthesized_word], word)

        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate, 20.0)
        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_20.wav',
                 synthesized_word, samplerate)
        savefig(f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_20.png',
                [data, synthesized_word], word)
def synthesis(resyn=False):
    results_dir = config.test_results_dir
    predicted_mceps = load_pkl(results_dir + '/predicted_mceps.pkl')
    predicted_mceps = data_merge(predicted_mceps)
    uttids = load_pkl(results_dir + '/test_uttids.pkl')
    lengths = load_pkl(results_dir + '/test_lengths.pkl')
    scp_dict = scp2dict()
    data_size = 10  # len(lengths)

    # Get the F0 search range for the known speakers
    if 'bdl' in uttids[0] or 'rms' in uttids[0]:
        f0_floor, f0_ceil = 30.0, 300.0
    elif 'slt' in uttids[0] or 'clb' in uttids[0]:
        f0_floor, f0_ceil = 70.0, 500.0
    else:
        print('Unknown speaker! Check if something is wrong!')
        f0_floor, f0_ceil = 40.0, 600.0

    src_spk = uttids[0].split('_')[0]
    tgt_spk = config.tgt_data_dir.split('/')[-1]

    for i in range(data_size):
        uttid = uttids[i]
        utt_len = lengths[i]
        sp_predict = mceps2sp(predicted_mceps[i][:utt_len, :])
        wav_arr, sr = librosa.load(scp_dict[uttid], sr=None, dtype=np.float64)
        _, t = pw.harvest(wav_arr, sr, f0_floor, f0_ceil)
        f0_raw = read_f0_via_id(uttid, utt_len)
        ap = pw.d4c(wav_arr, f0_raw, t, sr)
        if src_spk != tgt_spk:
            f0_t = f0_transform(f0_raw)
        else:
            f0_t = f0_raw
        y_predict = pw.synthesize(f0_t, sp_predict, ap[:utt_len, :], sr,
                                  pw.default_frame_period)
        y_predict = y_predict.astype(np.float32)
        librosa.output.write_wav(results_dir + '/' + uttid + '_predict.wav',
                                 y_predict, sr)
        if resyn:
            sp = pw.cheaptrick(wav_arr, f0_raw, t, sr)
            y_resyn = pw.synthesize(f0_raw, sp, ap, sr,
                                    pw.default_frame_period)
            y_resyn = y_resyn.astype(np.float32)
            librosa.output.write_wav(results_dir + '/' + uttid + '_resyn.wav',
                                     y_resyn, sr)
            print('Resynthesized %s groundtruth wav files!' % (i + 1))
        print('Synthesized %s wav files!' % (i + 1))
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
def anonymization(fs, waveNDArray, f0Value=0,
                  sp_strechRatio=np.random.uniform(0.6, 2, size=1),
                  gaussian_s=3):
    """Create WAV data with the speaker identity removed from the input WAV.

    Used to build the input audio from the label audio.
    :param fs: sampling rate
    :param waveNDArray: waveform
    :param f0Value: constant F0 value for the output
    :param sp_strechRatio: stretch ratio for the spectral envelope
    :return: synthesized waveform
    """
    waveNDArray = waveNDArray.astype(np.float64)
    _f0, t = pw.dio(waveNDArray, fs)            # extract F0
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)  # refine F0
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)  # spectral envelope
    ap = pw.d4c(waveNDArray, f0, t, fs)         # aperiodicity
    f0_fixed0 = np.ones(f0.shape) * f0Value
    sp_median = np.median(sp)
    ap_median = np.median(ap)

    # Stretch the spectral envelope along the frequency axis
    sp2 = np.ones_like(sp) * np.min(sp)
    for f in range(sp2.shape[1]):
        if int(f / sp_strechRatio) >= sp.shape[1]:
            break
        sp2[:, f] = sp[:, int(f / sp_strechRatio)]

    # Add Gaussian noise to SP/AP
    sp_noised = sp2 + np.random.normal(sp_median, sp_median / 10, sp2.shape)
    ap_noised = ap + np.random.normal(ap_median, ap_median / 10, ap.shape)

    # Gaussian filter
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised, gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised, gaussian_s)

    # Resynthesize from the anonymized envelopes
    synthesized = pw.synthesize(f0_fixed0, sp_gaussian, ap_gaussian, fs)
    return synthesized
def worldSpeechSynthesis(f0: np.ndarray, decoded_sp: np.ndarray,
                         ap: np.ndarray, fs: int = SAMPLE_RATE,
                         frame_period: float = 5.) -> np.ndarray:
    '''Synthesize a waveform with WORLD.

    Parameters
    ----------
    f0: np.ndarray
        Fundamental frequency per frame [Hz]
    decoded_sp: np.ndarray
        Spectral envelope
    ap: np.ndarray
        Aperiodicity
    fs: int, default SAMPLE_RATE
        Sampling rate
    frame_period: float, default 5.
        Frame interval in milliseconds

    Returns
    -------
    wave: np.ndarray
        Synthesized waveform
    '''
    wave = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
    wave = wave.astype(np.float32)
    return wave
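# A hedged usage sketch for worldSpeechSynthesis above. The SAMPLE_RATE value
# and the random stand-in signal are assumptions; any real float64 mono
# waveform analyzed with pyworld.wav2world would work the same way.
import numpy as np
import pyworld

SAMPLE_RATE = 16000  # assumed here

x = np.random.randn(SAMPLE_RATE).astype(np.float64)  # 1 s stand-in audio
f0, sp, ap = pyworld.wav2world(x, SAMPLE_RATE)       # default 5 ms frames
wave = worldSpeechSynthesis(f0, sp, ap, fs=SAMPLE_RATE, frame_period=5.)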
def synthesis_proc(self):
    global audio_file_name, x, fs, _f0, _sp, _ap, _y, audio_out
    print(self.fund.get())
    print(self.form1.get())
    print(self.form2.get())
    print(self.form3.get())
    perc_inc = self.fund.get()
    new_f0 = _f0 + ((perc_inc / 100) * _f0)
    fm, ft = cam_formants(x, fs)
    nos_of_peaks = self.form4.get()
    shiftconst_test = [self.form1.get(), self.form2.get(), self.form3.get()]
    shifted_sp = shift_formants(_sp, ft, fm, fs, nos_of_peaks, shiftconst_test)
    new_y = pw.synthesize(new_f0[0:len(_f0) - 1], shifted_sp,
                          _ap[0:len(_f0) - 1], fs)
    audio_out = ('testaudio/' + 'audio-out' + '_' + str(self.fund.get()) +
                 '_' + str(self.form1.get()) + '_' + str(self.form2.get()) +
                 '_' + str(self.form3.get()) + '.wav')
    wav.write(audio_out, fs, new_y)
    wav.write('testaudio/origfile.wav', fs, x)
    print('done')

    plt.figure()
    plt.subplot(4, 1, 1)
    plt.title('Waveform')
    plt.plot(x)
    plt.plot(new_y)
    plt.subplot(4, 1, 2)
    plt.plot(_f0)
    plt.plot(new_f0)
    plt.subplot(4, 1, 3)
    plt.imshow(shifted_sp.transpose(), origin='lower', interpolation='none',
               aspect='auto', extent=(0, _sp.shape[0], 0, _sp.shape[1]))
    plt.subplot(4, 1, 4)
    plt.imshow(_ap.transpose(), origin='lower', interpolation='none',
               aspect='auto', extent=(0, _ap.shape[0], 0, _ap.shape[1]))
    plt.savefig(audio_out + '.png')
    print('done')
def generate(self, parm_var, do_postfilter=True):
    config = self.analysis_config
    for path in self.paths:
        file_id = splitext(basename(path))[0]
        print('Synthesizing %s ... ' % (file_id), end='')
        mgc, lf0, vuv, bap = self._generate_parameters(path, parm_var)
        if do_postfilter:
            mgc = merlin_post_filter(mgc, config.alpha)
        sp = pysptk.mc2sp(mgc, fftlen=config.fft_length, alpha=config.alpha)
        ap = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                         config.sampling_rate,
                                         config.fft_length)
        f0 = self._lf0_to_f0(lf0, vuv)
        generated = pyworld.synthesize(f0.flatten().astype(np.float64),
                                       sp.astype(np.float64),
                                       ap.astype(np.float64),
                                       config.sampling_rate,
                                       config.frame_period)
        with open(join(self.out_dir, file_id + '.wav'), 'wb') as f:
            f.write(Audio(generated, rate=config.sampling_rate).data)
        print('done!')
def analysis_resynthesis(signal):
    # Extract acoustic features
    f0, t = pw.dio(signal, sample_rate)             # fundamental frequency
    f0 = pw.stonemask(signal, f0, t, sample_rate)   # refinement
    sp = pw.cheaptrick(signal, f0, t, sample_rate)  # spectral envelope
    ap = pw.d4c(signal, f0, t, sample_rate)         # aperiodicity

    # Pitch shift
    modified_f0 = f0_rate * f0

    # Formant shift (uniform stretch of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * sp_rate)
    for f in range(modified_sp.shape[1]):
        if f < sp_range:
            if sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    # Resynthesis
    synth = pw.synthesize(modified_f0, modified_sp, ap, sample_rate)

    return synth
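# A small driver sketch for analysis_resynthesis above, assuming soundfile;
# f0_rate, sp_rate and sample_rate are the module-level globals the function
# reads, and the filenames are illustrative.
import soundfile as sf

sample_rate = 16000
f0_rate = 1.5  # raise pitch by 50%
sp_rate = 1.0  # leave formants unchanged

signal, sr = sf.read('input.wav', dtype='float64')
assert sr == sample_rate
sf.write('output.wav', analysis_resynthesis(signal), sample_rate)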
def synthesis():
    lf0_file = "p225_001.lf0"
    bap_file_name = "p225_001.bap"
    mgc_file_name = "p225_001.mgc"
    fl = 4096
    sr = 48000

    # F0: -1E+10 marks unvoiced frames in the .lf0 file
    lf0 = read_binfile(lf0_file, dim=1, dtype=np.float32)
    zeros_index = np.where(lf0 == -1E+10)
    nonzeros_index = np.where(lf0 != -1E+10)
    f0 = lf0.copy()
    f0[zeros_index] = 0
    f0[nonzeros_index] = np.exp(lf0[nonzeros_index])
    f0 = f0.astype(np.float64)

    # Band aperiodicity -> full-resolution aperiodicity
    bap_dim = 5
    bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
    ap = pyworld.decode_aperiodicity(
        bap.astype(np.float64).reshape(-1, bap_dim), sr, fl)

    # Mel-generalized cepstrum -> spectral envelope
    mc = read_binfile(mgc_file_name, dim=60, dtype=np.float32)
    alpha = pysptk.util.mcepalpha(sr)
    sp = pysptk.mc2sp(mc.astype(np.float64), fftlen=fl, alpha=alpha)

    wav = pyworld.synthesize(f0, sp, ap, sr, 5)  # 5 ms frame period
    x2 = wav * 32768
    x2 = x2.astype(np.int16)
    scipy.io.wavfile.write("resynthesis.wav", sr, x2)
def analysis_for_valid_batch(self, features, output_features, names, out_dir,
                             sample_rate=16000, **kwargs):
    super(F0Model, self).analysis_for_valid_batch(features, output_features,
                                                  names, out_dir, **kwargs)

    # Synthesise outputs using WORLD.
    synth_dir = os.path.join(out_dir, 'synth')
    os.makedirs(synth_dir, exist_ok=True)

    lf0 = output_features['lf0'].cpu().detach().numpy()
    vuv = features['vuv'].cpu().detach().numpy()
    sp = features['sp'].cpu().detach().numpy()
    ap = features['ap'].cpu().detach().numpy()
    n_frames = features['n_frames'].cpu().detach().numpy()

    for i, (n_frame, name) in enumerate(zip(n_frames, names)):
        f0_i = np.exp(lf0[i, :n_frame, 0])
        f0_i = savgol_filter(f0_i, 7, 1)
        f0_i = f0_i * vuv[i, :n_frame, 0]
        f0_i = f0_i.astype(np.float64)

        sp_i = sp[i, :n_frame].astype(np.float64)
        ap_i = ap[i, :n_frame].astype(np.float64)

        wav_path = os.path.join(synth_dir, '{}.wav'.format(name))
        wav = pyworld.synthesize(f0_i, sp_i, ap_i, sample_rate)
        tdt.file_io.save_wav(wav_path, wav, sample_rate=sample_rate)
def Conversion(self):
    print(">Conversion")
    print(f"f0_rate:{self.f0_rate}, sp_rate:{self.sp_rate}")
    self.statusBar().showMessage('Start conversion')
    wavdata = self.streamer.get_all()
    if len(self.wav) <= 0:
        QMessageBox.information(
            self, "Voice conversion",
            "There is no audio to convert.\n"
            "Press the Start button to record, or load audio via\n"
            "File > Load pre-conversion audio")
        return
    self.saveconvAction.setEnabled(True)
    wavdata = np.frombuffer(wavdata, dtype='int16').astype(np.float64)

    f0, t = pw.harvest(wavdata, self.RATE)         # extract F0
    sp = pw.cheaptrick(wavdata, f0, t, self.RATE)  # spectral envelope
    ap = pw.d4c(wavdata, f0, t, self.RATE)         # aperiodicity

    # Pitch shift
    modified_f0 = self.f0_rate * f0

    # Formant shift (uniform stretch of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * self.sp_rate)
    for f in range(modified_sp.shape[1]):
        if f < sp_range:
            if self.sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / self.sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(self.sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    self.synth = pw.synthesize(modified_f0, modified_sp, ap, self.RATE)
    self.curve2.setData(self.synth / 32767.0)
    print(len(self.synth))
    self.statusBar().showMessage('Finish conversion')
def generate_test(filename):
    [sp_min, sp_max, ap_min, ap_max] = np.load(
        'data/timbre_model/min_max_record.npy')
    condi = get_condition(filename)

    sp, raw_sp = generate_timbre(0, sp_max, sp_min, condi, None)
    plt.imshow(np.log(np.transpose(sp)), aspect='auto', origin='lower',
               interpolation='none')
    plt.show()

    sp1 = load_timbre('data/timbre_model/test/sp/' + filename + '_sp.npy',
                      0, sp_max, sp_min)
    plt.imshow(np.log(np.transpose(sp1)), aspect='auto', origin='lower',
               interpolation='none')
    plt.show()

    ap, raw_ap = generate_timbre(1, ap_max, ap_min, condi, raw_sp)
    plt.imshow(np.log(np.transpose(ap)), aspect='auto', origin='lower',
               interpolation='none')
    plt.show()

    ap1 = load_timbre('data/timbre_model/test/ap/' + filename + '_ap.npy',
                      1, ap_max, ap_min)
    plt.imshow(np.log(np.transpose(ap1)), aspect='auto', origin='lower',
               interpolation='none')
    plt.show()

    path = 'data/raw/' + filename + '.raw'
    _f0, _sp, code_sp, _ap, code_ap = process_wav(path)

    # Synthesize with the generated timbre and write the audio
    synthesized = pw.synthesize(_f0, sp, ap, 32000, pw.default_frame_period)
    sf.write('./data/gen_wav/' + filename + '.wav', synthesized, 32000)
def save_world_wav(feats, model_name, filename):
    # feats = [f0, sp, ap, sp_coded, labels]
    if isinstance(feats[3], torch.Tensor):
        feats[3] = feats[3].cpu().numpy()
    if hp.normalise_mels:
        feats[3] = _unnormalise_coded_sp(feats[3])

    path = os.path.join(hp.sample_set_dir, model_name)
    if not os.path.exists(path):
        os.makedirs(path)
    path = os.path.join(path, filename)

    # pyworld requires C-contiguous float64 input
    feats[3] = np.ascontiguousarray(feats[3], dtype=np.float64)
    decoded_sp = decode_spectral_envelope(feats[3], hp.sr, fft_size=hp.n_fft)
    wav = synthesize(feats[0], decoded_sp, feats[1], hp.sr)
    save_wav(wav, path)
def __getitem__(self, key):
    key, pitch_aug_factor, time_aug_factor = key
    wav = self.data[key]
    if self.normalize:
        # soundfile.read normalizes data to [-1,1] if dtype is not given
        array, rate = soundfile.read(wav, always_2d=self.always_2d)
    else:
        array, rate = soundfile.read(wav, dtype=self.dtype,
                                     always_2d=self.always_2d)

    if pitch_aug_factor != 0:
        # Pitch augmentation: shift by `pitch_aug_factor` semitones
        ratio = pow(2, 1 / 12)
        import pyworld as pw

        f0_pw, sp, ap = pw.wav2world(array, rate)  # use default options
        array = pw.synthesize(
            f0_pw * (ratio ** pitch_aug_factor),
            sp,
            ap,
            rate,
            pw.default_frame_period,
        )

    if time_aug_factor != 1:
        # Time augmentation
        array = tsm.wsola(array, time_aug_factor)

    return rate, array
def convert(signal):
    f0_rate = 2.4
    sp_rate = 0.78
    sample_rate = 16000

    f0, t = pyworld.dio(signal, sample_rate)
    f0 = pyworld.stonemask(signal, f0, t, sample_rate)
    sp = pyworld.cheaptrick(signal, f0, t, sample_rate)
    ap = pyworld.d4c(signal, f0, t, sample_rate)

    # Pitch shift
    modified_f0 = f0_rate * f0

    # Formant shift (uniform stretch of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * sp_rate)
    for f in range(modified_sp.shape[1]):
        if f < sp_range:
            if sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    y = pyworld.synthesize(modified_f0, modified_sp, ap, sample_rate)
    return y
def world2wav(feature, frame_period):
    hparams = hp
    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap]

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)

    indexes = (vuv < 0.5).flatten()
    bap[indexes] = np.zeros(hparams.num_bap)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               fs, fftlen)

    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64),
                              fs, frame_period)
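# For reference, a sketch of the analysis direction that produces the
# [mgc | lf0 | vuv | bap] layout world2wav above consumes. The helper itself
# is hypothetical and num_mgc=60 is an assumption mirroring typical setups;
# the pyworld/pysptk calls are the standard ones.
import numpy as np
import pyworld
import pysptk


def wav2world_features(x, fs, num_mgc=60, frame_period=5.0):
    f0, t = pyworld.harvest(x, fs, frame_period=frame_period)
    sp = pyworld.cheaptrick(x, f0, t, fs)
    ap = pyworld.d4c(x, f0, t, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(sp, order=num_mgc - 1, alpha=alpha)
    bap = pyworld.code_aperiodicity(ap, fs)
    vuv = (f0 > 0).astype(np.float64)[:, None]
    lf0 = np.zeros_like(f0)[:, None]
    lf0[f0 > 0, 0] = np.log(f0[f0 > 0])  # log-F0 for voiced frames only
    return np.hstack([mgc, lf0, vuv, bap])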
def world_speech_synthesis(f0, decoded_sp, ap, fs, frame_period):
    # decoded_sp = decoded_sp.astype(np.float64)
    wav = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
    # librosa cannot save the wav without casting to float32
    wav = wav.astype(np.float32)
    return wav
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)
    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               fs, fftlen)

    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Convert range to int16
    generated_waveform = (generated_waveform /
                          np.max(np.abs(generated_waveform)) * 32767)

    # Return features as well to compare natural/generated later
    return generated_waveform, mgc, lf0, vuv, bap
def formant(self, val, f0_v):
    '''Change formant.

    val : formant rate
    f0_v: f0 rate
    '''
    f_rate = self.audio.frame_rate
    # pydub -> np.array(float64) conversion
    np_arr = np.array(self.audio.get_array_of_samples(), dtype=np.float64)

    _f0_val, _time = pyworld.dio(np_arr, f_rate)               # fundamental frequency
    spct = pyworld.cheaptrick(np_arr, _f0_val, _time, f_rate)  # spectral envelope
    aper = pyworld.d4c(np_arr, _f0_val, _time, f_rate)         # aperiodicity

    # Stretch the spectral envelope along the frequency axis
    # (clamp the source index so val < 1 cannot read past the last bin)
    spct_b = np.zeros_like(spct)
    for i in range(spct_b.shape[1]):
        spct_b[:, i] = spct[:, min(int(i / val), spct.shape[1] - 1)]

    ef_audio = pyworld.synthesize(_f0_val * f0_v, spct_b, aper, f_rate)
    ef_audio = ef_audio.astype(np.int16).tobytes()

    new_audio = AudioSegment(
        ef_audio,
        sample_width=self.audio.sample_width,
        frame_rate=f_rate,
        channels=self.audio.channels,
    )
    self.audio = new_audio
    return self
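# A hedged usage sketch for the formant method above, assuming it lives on a
# pydub-backed effect class; VoiceEffect is a hypothetical wrapper whose
# constructor is presumed to load mono 16-bit audio into self.audio.
effect = VoiceEffect('input.wav')   # hypothetical wrapper class
effect = effect.formant(1.2, 0.8)   # raise formants, lower pitch
effect.audio.export('output.wav', format='wav')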
def generate_changed_voice(model, input_path):
    fs, x = wavfile.read(input_path)
    x = x.astype(np.float64)
    if len(x.shape) > 1:
        x = x.mean(axis=1)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    gen_data = model.predict(mc)
    gen_data = np.hstack([c0.reshape((-1, 1)), gen_data])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        gen_data.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform
def synthesis(ori_path, aim_sp, aim_spkid):
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)
    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)

    # Decode the converted sp (513 dims) back to full resolution
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)
    print('decoded aim_decoded_sp shape =')
    print(aim_decoded_sp.shape)

    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav',
                             synwav, sr=hp.SR)
def feats_to_audio_test(in_feats, filename, fs=config.fs,
                        mode=config.comp_mode):
    harm = in_feats[:, :60]
    ap = in_feats[:, 60:-2]
    f0 = in_feats[:, -2:]

    # Convert MIDI note numbers to Hz (A4 = note 69 = 440 Hz)
    f0[:, 0] = f0[:, 0] - 69
    f0[:, 0] = f0[:, 0] / 12
    f0[:, 0] = 2 ** f0[:, 0]
    f0[:, 0] = f0[:, 0] * 440

    # Zero out unvoiced frames (second column is 1 where unvoiced)
    f0 = f0[:, 0] * (1 - f0[:, 1])

    if mode == 'mfsc':
        harm = mfsc_to_mgc(harm)
        ap = mfsc_to_mgc(ap)

    harm = mgc_to_sp(harm, 1025, 0.45)
    ap = mgc_to_sp(ap, 1025, 0.45)

    harm = 10 ** (harm / 10)
    ap = 10 ** (ap / 20)

    y = pw.synthesize(f0.astype('double'), harm.astype('double'),
                      ap.astype('double'), fs, config.hoptime)
    sf.write('./medley_resynth_test/' + filename + '.wav', y, fs)
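# A worked check of the MIDI-to-Hz block above, which computes
# 440 * 2**((n - 69) / 12): note 69 maps to 440 Hz and note 81
# (one octave up) to 880 Hz.
for n in (69, 81):
    print(n, 440 * 2 ** ((n - 69) / 12))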
def looptoweb(f1perc, f2perc):
    # `table_file` is the output HTML table
    table_file = open('Shiftmethod1.html', 'w')
    table_file.write('<!DOCTYPE html><html>' + styletext +
                     '<body><h1>Audio Files ' + path +
                     '</h1><table><tr><th>Form1/Form2</th>')
    for k in np.arange(0, len(f2perc)):
        table_file.write('<th>' + str(f2perc[k]) + '%' + '</th>')
    table_file.write('</tr>')
    for i in np.arange(0, len(f1perc)):
        table_file.write('<tr>')
        table_file.write('<td>' + str(f1perc[i]) + '%' + '</td>')
        for j in np.arange(0, len(f2perc)):
            audio_out = ('testaudio/' + 'testsmile' + '_' + str(f1perc[i]) +
                         '_' + str(f2perc[j]) + '.wav')
            shifted_sp, maximas = shift_formants(sp, ft, fm, fs, 2,
                                                 [f1perc[i], f2perc[j]])
            new_y = pw.synthesize(f0[0:len(f0) - 1], shifted_sp,
                                  ap[0:len(f0) - 1], fs)
            wav.write(audio_out, fs, new_y)
            table_file.write('<td><audio controls>')
            table_file.write('<source src= ' + '"' + audio_out + '"' +
                             ' type="audio/mpeg">')
            table_file.write('</audio></td>')
        table_file.write('</tr>')
    table_file.write('</table></body></html>')
    table_file.close()
def feats_to_audio(in_feats, filename, fs=config.fs, mode=config.comp_mode):
    harm = in_feats[:, :60]
    ap = in_feats[:, 60:-2]
    f0 = in_feats[:, -2:]

    f0[:, 0] = f0_to_hertz(f0[:, 0])
    # Zero out unvoiced frames (second column is 1 where unvoiced)
    f0 = f0[:, 0] * (1 - f0[:, 1])

    if mode == 'mfsc':
        harm = mfsc_to_mgc(harm)
        ap = mfsc_to_mgc(ap)

    harm = mgc_to_sp(harm, 1025, 0.45)
    ap = mgc_to_sp(ap, 1025, 0.45)

    harm = 10 ** (harm / 10)
    ap = 10 ** (ap / 20)

    y = pw.synthesize(f0.astype('double'), harm.astype('double'),
                      ap.astype('double'), fs, config.hoptime * 1000)
    sf.write(config.val_dir + filename + '.wav', y, int(fs))
def feats_to_audio(in_feats, fs=config.fs):
    harm = in_feats[:, :60]
    ap = in_feats[:, 60:-2]
    f0 = in_feats[:, -2:]

    f0[:, 0] = f0_to_hertz(f0[:, 0])
    f0 = f0[:, 0] * (1 - f0[:, 1])

    warped_freq = get_warped_freqs(60, config.fs, 0.45)
    harm = mfsc_to_sp(harm, warped_freq, 1025, config.fs)
    ap = wbap_to_ap(ap, 1025, config.fs)

    # pyworld requires C-contiguous arrays
    harm = np.ascontiguousarray(10 ** ((harm - config.world_offset) / 10))
    ap = np.ascontiguousarray(10 ** (ap / 20))

    y = pw.synthesize(f0.astype('double'), harm.astype('double'),
                      ap.astype('double'), fs, config.hoptime * 1000)
    return y
def synthesis(self, feat, se_kind='sp'):
    batch_size = feat['ap'].size(0)
    device = feat['ap'].device
    audio = []
    for i in range(batch_size):
        ap = feat['ap'][i].detach().t().cpu().double().numpy()
        f0 = feat['f0'][i].detach().view(-1).cpu().double().numpy()
        if se_kind == 'mcc':
            mcc = feat['mcc'][i].detach().t().cpu().double().numpy()
            sp = pysptk.mc2sp(mcc.copy(order='C'), self.mcc_alpha,
                              self.fft_size)
        else:
            sp = feat['sp'][i].detach().t().cpu().double().numpy()
        syn = pyworld.synthesize(f0.copy(order='C'),
                                 sp.copy(order='C'),
                                 ap.copy(order='C'),
                                 self.fs,
                                 frame_period=self.shiftms)
        audio.append(torch.from_numpy(syn).float().view(-1))
    audio = torch.cat([syn.unsqueeze(0) for syn in audio], dim=0).to(device)
    return audio / MAX_WAV_VALUE
def gen_waveform(self, feature):
    mcep_dim = self.config['mcep_order'] + 1
    mgc = feature[:, :mcep_dim]
    lf0 = feature[:, mcep_dim:mcep_dim + 1]
    vuv = feature[:, mcep_dim + 1:mcep_dim + 2]
    bap = feature[:, mcep_dim + 2:]

    spectrogram = pysptk.mc2sp(
        mgc,
        fftlen=self.config['fft_size'],
        alpha=pysptk.util.mcepalpha(self.config['sampling_rate']),
    )
    aperiodicity = pyworld.decode_aperiodicity(
        bap.astype(np.float64),
        self.config['sampling_rate'],
        self.config['fft_size'],
    )

    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    waveform = pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        self.config['sampling_rate'],
        self.config['hop_size_in_ms'],
    )
    return waveform
def decode(
        self,
        acoustic_feature: AcousticFeature,
):
    acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
    out = pyworld.synthesize(
        f0=acoustic_feature.f0.ravel(),
        spectrogram=acoustic_feature.spectrogram,
        aperiodicity=acoustic_feature.aperiodicity,
        fs=self.out_sampling_rate,
        frame_period=self.acoustic_feature_param.frame_period,
    )
    return Wave(out, sampling_rate=self.out_sampling_rate)
def convert_from_feature(self, input: AcousticFeature,
                         out_sampling_rate: Optional[int] = None):
    if out_sampling_rate is None:
        out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

    out = self.convert_to_feature(input=input,
                                  out_sampling_rate=out_sampling_rate)
    out = pyworld.synthesize(
        f0=out.f0.ravel(),
        spectrogram=out.spectrogram,
        aperiodicity=out.aperiodicity,
        fs=out_sampling_rate,
        frame_period=self._param.acoustic_feature_param.frame_period,
    )
    return Wave(out, sampling_rate=out_sampling_rate)
def convert_to_audio(
        self,
        input: numpy.ndarray,
        acoustic_feature: AcousticFeature,
        sampling_rate: int,
):
    acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
    out = pyworld.synthesize(
        f0=acoustic_feature.f0.ravel(),
        spectrogram=input.astype(numpy.float64),
        aperiodicity=acoustic_feature.aperiodicity,
        fs=sampling_rate,
        frame_period=self._param.acoustic_feature_param.frame_period,
    )
    return Wave(out, sampling_rate=sampling_rate)
def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
    """synthesis generates a waveform from F0, mcep, and aperiodicity

    Parameters
    ----------
    f0 : array, shape (`T`, `1`)
        array of F0 sequence
    mcep : array, shape (`T`, `dim`)
        array of mel-cepstrum sequence
    ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
        array of aperiodicity or coded aperiodicity
    rmcep : array, optional, shape (`T`, `dim`)
        array of reference mel-cepstrum sequence
        Default set to None
    alpha : float, optional
        Parameter of the all-pass transfer function
        Default set to 0.42

    Returns
    -------
    wav : array
        Synthesized waveform
    """
    if rmcep is not None:
        # power modification
        mcep = mod_power(mcep, rmcep, alpha=alpha)

    if ap.shape[1] < self.fftl // 2 + 1:
        # decode coded aperiodicity to full-band aperiodicity
        ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

    # mel-cepstrum to spectral envelope
    spc = pysptk.mc2sp(mcep, alpha, self.fftl)

    # generate waveform using the WORLD vocoder from f0, spc, ap
    wav = pyworld.synthesize(f0, spc, ap, self.fs,
                             frame_period=self.shiftms)
    return wav
def synthesis_spc(self, f0, spc, ap):
    """synthesis_spc generates a waveform from F0, spectral envelope, and
    aperiodicity

    Parameters
    ----------
    f0 : array, shape (`T`, `1`)
        array of F0 sequence
    spc : array, shape (`T`, `fftl // 2 + 1`)
        array of spectral envelope sequence
    ap : array, shape (`T`, `fftl // 2 + 1`)
        array of aperiodicity

    Returns
    -------
    wav : vector, shape (`samples`)
        Synthesized waveform
    """
    # generate waveform using the WORLD vocoder from f0, spc, ap
    wav = pyworld.synthesize(f0, spc, ap, self.fs,
                             frame_period=self.shiftms)
    return wav
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs