def wav2pw(wavfile, sr=SR, fft_size=FFT_SIZE, frame_period=FRAME_PERIOD):
    """Extract WORLD features from a wav file.

    Returns (f0, spectral envelope, aperiodicity); f0 is harvest output
    refined with stonemask.
    """
    samples, _ = librosa.load(wavfile, sr=sr, mono=True, dtype=np.float64)
    coarse_f0, times = pw.harvest(samples, sr, frame_period=frame_period)
    refined_f0 = pw.stonemask(samples, coarse_f0, times, sr)
    envelope = pw.cheaptrick(samples, refined_f0, times, sr, fft_size=fft_size)
    aperiodicity = pw.d4c(samples, refined_f0, times, sr, fft_size=fft_size)
    return refined_f0, envelope, aperiodicity
def get_f0(audio, sample_rate, frame_period=5, method='dio'):
    """Extract an F0 track and a voiced/unvoiced mask from a waveform.

    Args:
        audio: 1-D waveform (np.ndarray or torch.Tensor). If multi-dim,
            channel 0 is used.
        sample_rate: sampling rate in Hz.
        frame_period: analysis hop in milliseconds.
        method: one of 'dio', 'harvest', 'swipe', 'rapt'.

    Returns:
        (f0, vuv): torch tensors; vuv is 1 where the frame is voiced
        (f0 != 0), else 0.

    Raises:
        ValueError: if `method` is not one of the supported extractors.
    """
    if isinstance(audio, torch.Tensor):
        if audio.ndim > 1:
            audio = audio[0]
        # detach/cpu so tensors that require grad or live on GPU convert
        # cleanly (plain .numpy() raises for both cases).
        audio = audio.detach().cpu().numpy()
    hop_size = int(frame_period * sample_rate / 1000)
    if method == 'dio':
        f0, _ = pw.dio(audio.astype(np.double), sample_rate,
                       frame_period=frame_period)
    elif method == 'harvest':
        f0, _ = pw.harvest(audio.astype(np.double), sample_rate,
                           frame_period=frame_period)
    elif method == 'swipe':
        f0 = pysptk.sptk.swipe(audio.astype(np.double), sample_rate,
                               hopsize=hop_size)
    elif method == 'rapt':
        f0 = pysptk.sptk.rapt(audio.astype(np.double), sample_rate,
                              hopsize=hop_size)
    else:
        raise ValueError(f'No such f0 extract method, {method}.')
    f0 = torch.from_numpy(f0)
    vuv = 1 * (f0 != 0.0)
    return f0, vuv
def world_extract(x, fs, f0min, f0max):
    """Extract WORLD acoustic features plus mel-cepstrum and frame power.

    Args:
        x: waveform scaled to [-1, 1].
        fs: sampling rate [Hz].
        f0min / f0max: F0 search range [Hz] for WORLD harvest.

    Returns:
        dict with keys "sp" (spectral envelope), "mcep" (mel-cepstrum via
        pysptk.sp2mc), "ap" (aperiodicity), "f0", and "npow" (from the
        project helper spc2npow).
    """
    # scale from [-1, 1] to [-32768, 32767]
    x = x * np.iinfo(np.int16).max
    x = np.array(x, dtype=np.float64)
    # project helper; presumably removes low-frequency rumble — TODO confirm
    x = low_cut_filter(x, fs)

    # extract features
    f0, time_axis = pw.harvest(x, fs, f0_floor=f0min, f0_ceil=f0max,
                               frame_period=MCEP_SHIFT)
    sp = pw.cheaptrick(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    ap = pw.d4c(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    mcep = pysptk.sp2mc(sp, MCEP_DIM, MCEP_ALPHA)
    npow = spc2npow(sp)

    return {
        "sp": sp,
        "mcep": mcep,
        "ap": ap,
        "f0": f0,
        "npow": npow,
    }
def wav2world(wavfile, frame_period):
    """Load a wav and return stacked WORLD features [mgc | lf0 | vuv | bap].

    Side effect: sets hp.num_bap to the number of band-aperiodicity channels.
    Returns a float32 matrix of shape (frames, num_mgc + 1 + 1 + num_bap).
    """
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)
    # F0: harvest, or dio refined with stonemask.
    if hp.use_harvest:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    else:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]
    # Mel-generalized cepstrum from the spectral envelope.
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)
    # log-F0, keeping zeros on unvoiced frames.
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    #print(mgc.shape,lf0.shape,vuv.shape,bap.shape)
    features = np.hstack((mgc, lf0, vuv, bap))
    return features.astype(np.float32)
def data_extraction(np_data, rate):
    """Decompose a waveform into WORLD features.

    Args:
        np_data: waveform samples (any numeric dtype).
        rate: sampling rate [Hz].

    Returns:
        (f0, sp, ap): stonemask-refined F0, spectral envelope, aperiodicity.
    """
    # `np.float` was removed in NumPy 1.24; WORLD expects float64 anyway.
    np_data = np_data.astype(np.float64)
    _f0, t = pw.harvest(np_data, rate)
    f0 = pw.stonemask(np_data, _f0, t, rate)
    sp = pw.cheaptrick(np_data, f0, t, rate)
    ap = pw.d4c(np_data, f0, t, rate)
    return f0, sp, ap
def Conversion(self):
    """Convert the recorded audio with the WORLD vocoder.

    Applies a pitch shift (scale F0 by self.f0_rate) and a formant shift
    (uniform stretch of the spectral envelope's frequency axis by
    self.sp_rate), re-synthesizes, and displays the result on curve2.
    Shows a message box and aborts if no audio has been recorded.
    """
    print(f">Conversion")
    print(f"f0_rate:{self.f0_rate}, sp_rate:{self.sp_rate}")
    self.statusBar().showMessage(f'Start conversion')
    wavdata = self.streamer.get_all()
    if (len(self.wav) <= 0):
        # No source audio: tell the user how to record or load one.
        reply = QMessageBox.information(
            self, "声変換",
            "変換前の音声データがありません\nStartボタンを押して録音するか\nファイル>変換前の音声データの読み込み から音声データを読み込んでください"
        )
        return
    self.saveconvAction.setEnabled(True)
    # Raw int16 byte stream -> float64 samples for WORLD.
    wavdata = np.frombuffer(wavdata, dtype='int16').astype(np.float64)
    f0, t = pw.harvest(wavdata, self.RATE)  # fundamental frequency
    sp = pw.cheaptrick(wavdata, f0, t, self.RATE)  # spectral envelope
    ap = pw.d4c(wavdata, f0, t, self.RATE)  # aperiodicity
    "ピッチシフト"
    # Pitch shift: scale F0 uniformly.
    modified_f0 = self.f0_rate * f0
    "フォルマントシフト(周波数軸の一様な伸縮)"
    # Formant shift: uniformly stretch/compress sp along the frequency axis;
    # bins beyond sp_range (only reachable when sp_rate < 1) are copied as-is.
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * self.sp_rate)
    for f in range(modified_sp.shape[1]):
        if (f < sp_range):
            if self.sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / self.sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(self.sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]
    self.synth = pw.synthesize(modified_f0, modified_sp, ap, self.RATE)
    # Normalize int16 range to [-1, 1] for display.
    self.curve2.setData(self.synth / 32767.0)
    print(len(self.synth))
    self.statusBar().showMessage(f'Finish conversion')
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0,
                 maxf0, type):
    """Compute F0-augmented spectral features for one wav file.

    `type` == 'mcc' selects WORLD mel-cepstral coefficients; anything else
    selects MFCCs. Returns (utterance_id, waveform, feature_matrix).
    """
    wav, sr = load(filename, sr=None)
    signal = wav.astype(float)

    # F0 via WORLD harvest, refined with stonemask.
    raw_f0, times = world.harvest(signal, sr, f0_floor=minf0, f0_ceil=maxf0,
                                  frame_period=winstep * 1000)
    f0 = world.stonemask(signal, raw_f0, times, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    if type == 'mcc':
        envelope = world.cheaptrick(signal, f0, times, sr, f0_floor=minf0)
        feats = sptk.sp2mc(envelope, n_mcep - 1, mcep_alpha).T
    else:
        feats = mfcc(signal, sr, n_mfcc=n_mcep, n_fft=window_size,
                     hop_length=hop_size)

    feats = np.vstack((feats, f0))
    max_frames = len(signal) // hop_size + 2
    feats = repeat_last_padding(feats, max_frames)
    utt_id = os.path.basename(filename).replace(".wav", "")
    return (utt_id, signal, feats)
def word_synthesis(word, file_name):
    """Resynthesize a processed word recording with WORLD at several frame
    periods.

    For each frame period (WORLD default, 3 ms, 20 ms) the word is analyzed
    once, re-synthesized, written to SYNTHESIS_WORDS_DIRECTORY, and plotted
    against the original via savefig(). Does nothing if the input wav is
    missing.
    """
    path = f'{PROCESSED_WORDS_DIRECTORY}/{file_name}_{word}.wav'
    if not os.path.exists(path):
        return
    data, samplerate = sf.read(path)

    # WORLD analysis: harvest F0 refined with stonemask, envelope, aperiodicity.
    f0, timeaxis = pw.harvest(data, samplerate)
    f0_mask = pw.stonemask(data, f0, timeaxis, samplerate)
    spectral_envelop = pw.cheaptrick(data, f0_mask, timeaxis, samplerate)
    aperiodicity = pw.d4c(data, f0_mask, timeaxis, samplerate)

    # One synthesis + wav + comparison plot per (frame_period, name suffix).
    for frame_period, suffix in ((pw.default_frame_period, 'default'),
                                 (3.0, '3'), (20.0, '20')):
        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate,
                                         frame_period)
        sf.write(
            f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_{suffix}.wav',
            synthesized_word, samplerate)
        savefig(
            f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_{suffix}.png',
            [data, synthesized_word], word)
def process_acoustic_parameters(sound, sound_position, word, file_name):
    """Extract and plot F0 (harvest/dio/rapt/swipe), spectral envelope and
    aperiodicity for a single processed sound, if its wav file exists."""
    file = f'{PROCESSED_SOUNDS_DIRECTORY}/{sound_position}/{file_name}_{word}_{sound}.wav'
    if not os.path.exists(file):
        return
    data, samplerate = sf.read(file)
    frame_period = 5  # ms
    hop_length = int(0.001 * samplerate * frame_period)

    f0_dio, timeaxis_dio = pw.dio(data, samplerate)
    # BUG FIX: frame_period was passed as the third positional argument of
    # pyworld.harvest, which is f0_floor (so the floor was silently set to
    # 5 Hz); pass it by keyword instead.
    f0, timeaxis = pw.harvest(data, samplerate, frame_period=frame_period)
    f0_mask = pw.stonemask(data, f0, timeaxis, samplerate)
    spectral_envelop = pw.cheaptrick(data, f0_mask, timeaxis, samplerate)
    aperiodicity = pw.d4c(data, f0_mask, timeaxis, samplerate)
    f0_rapt = pysptk.sptk.rapt(data.astype(np.float32), samplerate,
                               hop_length)
    f0_swipe = pysptk.sptk.swipe(data, samplerate, hop_length)

    # NOTE(review): if `arange` here is numpy.arange, arange(len(...), 0.005)
    # is empty (start > stop); the intent looks like a 5 ms-spaced time axis
    # (e.g. arange(len(f0_rapt)) * 0.005) — confirm whether `arange` is a
    # project helper before changing.
    plot_f0(sound, sound_position, [timeaxis, f0], [timeaxis_dio, f0_dio],
            [arange(len(f0_rapt), 0.005), f0_rapt],
            [arange(len(f0_swipe), 0.005), f0_swipe], f0_mask,
            f'{PLOTS_SOUNDS_DIRECTORY}/{sound_position}/f0/{file_name}_{word}_{sound}.png')
    savefig(
        f'{PLOTS_SOUNDS_DIRECTORY}/{sound_position}/spectral_envelop/{file_name}_{word}_{sound}.png',
        [spectral_envelop], sound)
    savefig(
        f'{PLOTS_SOUNDS_DIRECTORY}/{sound_position}/aperiodicity/{file_name}_{word}_{sound}.png',
        [aperiodicity], sound, log=False)
def estimate_word(word, name):
    """Analyze a word recording with WORLD and re-synthesize it at several
    frame periods (WORLD default, 3 ms, 20 ms), saving wavs and comparison
    plots. Does nothing if the input wav is missing."""
    if not os.path.exists(f'rijeci_wav/{word}_{name}.wav'):
        return
    f_bef, fs = sf.read(f'rijeci_wav/{word}_{name}.wav')
    f0, timeaxis = pw.harvest(f_bef, fs)
    f0_mask = pw.stonemask(f_bef, f0, timeaxis, fs)
    sp = pw.cheaptrick(f_bef, f0_mask, timeaxis, fs)
    ap = pw.d4c(f_bef, f0_mask, timeaxis, fs)
    # (frame_period, filename suffix); '' keeps the original 3 ms file name.
    for frame_period, suffix in ((pw.default_frame_period, '-def'),
                                 (3.0, ''), (20.0, '-20')):
        y = pw.synthesize(f0_mask, sp, ap, fs, frame_period)
        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}{suffix}.wav',
                 y, fs)
        savefig(
            f'slike_rijeci_after_sint/{word}_after_sint_{name}{suffix}.png',
            [f_bef, y], word)
def world_decompose(wav, fs, frame_period=5.0):
    """Decompose a speech signal with WORLD.

    Returns (f0, time_axis, spectral_envelope, aperiodicity); F0 search is
    restricted to 71-800 Hz.
    """
    samples = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(samples, fs, frame_period=frame_period,
                                   f0_floor=71.0, f0_ceil=800.0)
    envelope = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)
    return f0, timeaxis, envelope, aperiodicity
def worldDecompose(
        wave: np.ndarray,
        fs: int = SAMPLE_RATE,
        frame_period: float = 5.) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    '''
    Decompose audio into f0, spectral envelope and aperiodicity using WORLD.

    Parameters
    ----------
    wave: np.ndarray
        Audio waveform data.
    fs: int, default SAMPLE_RATE
        Sampling frequency.
    frame_period: float, default 5.
        Interval between analysis frames [ms].

    Returns
    -------
    f0: np.ndarray
        Per-frame fundamental frequency [Hz].
    sp: np.ndarray
        Spectral envelope.
    ap: np.ndarray
        Aperiodicity.
    '''
    wave = wave.astype(np.float64)
    f0, timeaxis = pyworld.harvest(wave, fs, frame_period=frame_period,
                                   f0_floor=71., f0_ceil=800.)
    sp = pyworld.cheaptrick(wave, f0, timeaxis, fs)
    ap = pyworld.d4c(wave, f0, timeaxis, fs)
    return f0, sp, ap
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs): """world声码器语音转为频谱。""" # 分布提取参数 frame_period = kwargs.get("frame_period", pw.default_frame_period) f0_floor = kwargs.get("f0_floor", pw.default_f0_floor) f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil) fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor)) ap_threshold = kwargs.get("ap_threshold", 0.85) f0_extractor = kwargs.get("f0_extractor", "dio") x = wav.astype(np.double) if f0_extractor == "dio": # 使用DIO算法计算音频的基频F0 f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil) elif f0_extractor == "harvest": f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period) else: f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period) # 使用CheapTrick算法计算音频的频谱包络 sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size) # SP降维 sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num) # 计算aperiodic参数 ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size) # AP降维 ap_enc = pw.code_aperiodicity(ap, sr) return f0, sp_enc, ap_enc
def analyze(self, x):
    """Run WORLD analysis (F0, spectral envelope, aperiodicity).

    Parameters
    ----------
    x : array, shape (`T`)
        Monaural speech signal in the time domain.

    Returns
    -------
    f0 : array, shape (`T`,)
        F0 sequence.
    spc : array, shape (`T`, `fftl / 2 + 1`)
        Spectral envelope sequence.
    ap : array, shape (`T`, `fftl / 2 + 1`)
        Aperiodicity sequence.
    """
    fundamental, frame_times = pyworld.harvest(x, self.fs,
                                               f0_floor=self.minf0,
                                               f0_ceil=self.maxf0,
                                               frame_period=self.shiftms)
    envelope = pyworld.cheaptrick(x, fundamental, frame_times, self.fs,
                                  fft_size=self.fftl)
    aperiodicity = pyworld.d4c(x, fundamental, frame_times, self.fs,
                               fft_size=self.fftl)
    assert envelope.shape == aperiodicity.shape
    return fundamental, envelope, aperiodicity
def process_wav(wav_path):
    """Read a raw 48 kHz PCM16 mono wav, resample to 32 kHz, and extract
    WORLD features.

    Returns (f0, sp, coded_sp, ap, coded_ap) where coded_sp is a 60-dim
    reduction via the project helper code_harmonic and coded_ap is WORLD
    band aperiodicity.
    """
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1,
                     samplerate=48000, endian='LITTLE')
    sr = 32000
    if osr != sr:
        # Keyword arguments: positional orig_sr/target_sr were removed in
        # librosa 0.10.
        y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # F0 via harvest, refined with stonemask.
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max,
                        frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Spectral envelope via CheapTrick, reduced to 60 dimensions.
    _sp = pw.cheaptrick(y, _f0, t, sr)
    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Aperiodicity via D4C, coded to band aperiodicity.
    _ap = pw.d4c(y, _f0, t, sr)
    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)
    return _f0, _sp, code_sp, _ap, code_ap
def harvest(cmd):
    """Compute a WORLD harvest F0 track for a stored recording and attach it.

    Args:
        cmd: dict with key "id" naming the recording document in rec_set.

    Returns:
        {"harvest": <attachment hash>} on success, or
        {"error": ...} if the written track is empty.
    """
    docid = cmd["id"]
    meta = rec_set.get_meta(docid)
    x, fs = librosa.load(os.path.join(get_attachpath(), meta["path"]), sr=None)
    print("SYSTEM: harvesting...")
    hv_start = time.time()
    f0, timeaxis = pyworld.harvest(x.astype(np.float64), fs)
    print(f"SYSTEM: finished harvesting! (took {time.time() - hv_start:.2f}s)")
    # Dump "time f0" pairs, one frame per line, into a temp file so it can
    # be attached to the document store below.
    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False,
                                     mode="w") as harvest_fp:
        for i in range(len(timeaxis)):
            harvest_fp.write(f'{timeaxis[i]} {f0[i]}\n')
    # Empty output means the analysis produced nothing usable.
    if len(open(harvest_fp.name).read().strip()) == 0:
        return {"error": "Harvest computation failed"}
    # XXX: frozen attachdir
    harvesthash = guts.attach(harvest_fp.name, get_attachpath())
    # Record the attachment hash on the recording's metadata.
    guts.bschange(
        rec_set.dbs[docid],
        {
            "type": "set",
            "id": "meta",
            "key": "harvest",
            "val": harvesthash
        },
    )
    return {"harvest": harvesthash}
def analyze_range(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS,
                  fftl=FFTL, f0=None, time_axis=None):
    """WORLD analysis with an explicit F0 search range.

    When f0/time_axis are not supplied they are estimated with harvest
    (bounded to [minf0, maxf0]) and refined with stonemask.

    Returns:
        (time_axis, f0, spectral envelope, aperiodicity)
    """
    if f0 is None or time_axis is None:
        raw_f0, time_axis = pw.harvest(wav, fs, f0_floor=minf0,
                                       f0_ceil=maxf0, frame_period=fperiod)
        f0 = pw.stonemask(wav, raw_f0, time_axis, fs)
    envelope = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    aperiodicity = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, envelope, aperiodicity
def analyze(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS, fftl=FFTL,
            f0=None, time_axis=None):
    """WORLD analysis without a caller-specified F0 range (floor fixed at
    60 Hz, no ceiling).

    Args:
        minf0: Never used.
        maxf0: Never used.

    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    if f0 is None or time_axis is None:
        coarse_f0, time_axis = pw.harvest(wav, fs, f0_floor=60.0,
                                          frame_period=fperiod)
        f0 = pw.stonemask(wav, coarse_f0, time_axis, fs)
    envelope = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    aperiodic = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, envelope, aperiodic
def world_decompose(wav, fs, frame_period=5.0):
    """Decompose a speech signal into f0, spectral envelope and aperiodicity
    using WORLD (harvest / cheaptrick / d4c), with F0 bounded to 71-800 Hz."""
    x = wav.astype(np.float64)
    f0, time_axis = pyworld.harvest(
        x, fs, frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)
    sp_envelope = pyworld.cheaptrick(x, f0, time_axis, fs)
    ap_index = pyworld.d4c(x, f0, time_axis, fs)
    return f0, time_axis, sp_envelope, ap_index
def pre_process(file_name, training_dir):
    """Prepare training features for one utterance.

    Loads the wav and transcript, resamples to params.sample_rate, extracts
    a stonemask-refined harvest F0 track, aligns phoneme labels, and
    extracts timbre (spectral/aperiodic) features.

    Returns [spectral_data, aperiodic_data, label_data, frequency].
    """
    audio_file_name = training_dir + file_name + '.wav'
    lyrics_file_name = training_dir + 'Transcripts/' + file_name + '.txt'
    audio_data, sample_rate = soundfile.read(audio_file_name)
    # Keyword arguments: positional orig_sr/target_sr were removed in
    # librosa 0.10.
    audio_data = librosa.resample(audio_data, orig_sr=sample_rate,
                                  target_sr=params.sample_rate)
    sample_rate = params.sample_rate
    harvest_frequency, timing = pyworld.harvest(
        audio_data, sample_rate,
        f0_floor=params.min_freq, f0_ceil=params.max_freq,
        frame_period=params.frame_period)
    frequency = pyworld.stonemask(audio_data, harvest_frequency, timing,
                                  sample_rate)
    audio_length = len(frequency)
    phoneme_data = extract_phoneme_data(
        [audio_file_name, lyrics_file_name, audio_length])
    frequency_data = process_frequency(frequency)
    label_data = pd.concat([phoneme_data, frequency_data], axis=1)
    spectral_data, aperiodic_data = extract_timbre_data(
        [audio_data, frequency, timing, sample_rate])
    return [spectral_data, aperiodic_data, label_data, frequency]
def wav2mcep(filepath):
    """Compute WORLD features and a coded spectral envelope for a wav file.

    Returns:
        f0: shape [T,]
        ap: shape [T, sampling_rate/2 + 1]
        sp: shape [T, sampling_rate/2 + 1]
        mcep (coded sp): shape [n_mels, T]
    """
    signal, sr = librosa.load(filepath, sr=sampling_rate)
    signal, _ = librosa.effects.trim(signal)  # strip leading/trailing silence
    signal = np.asarray(signal, dtype=np.double)

    f0, timeaxis = pyworld.harvest(signal, sr)
    sp = pyworld.cheaptrick(signal, f0, timeaxis, sampling_rate,
                            fft_size=n_fft)
    ap = pyworld.d4c(signal, f0, timeaxis, sampling_rate, fft_size=n_fft)
    mcep = pyworld.code_spectral_envelope(sp, sampling_rate, n_mels).T  # dim x n

    return (f0.astype(np.float64), ap.astype(np.float64),
            sp.astype(np.float64), mcep.astype(np.float64))
def world_features(wav, sr, fft_size, dim):
    """WORLD analysis plus a dimension-reduced spectral envelope.

    Returns (f0, timeaxis, sp, ap, coded_sp) where coded_sp has `dim`
    coefficients per frame.
    """
    f0, timeaxis = pyworld.harvest(wav, sr)
    envelope = pyworld.cheaptrick(wav, f0, timeaxis, sr, fft_size=fft_size)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, sr, fft_size=fft_size)
    compressed = pyworld.code_spectral_envelope(envelope, sr, dim)
    return f0, timeaxis, envelope, aperiodicity, compressed
def convertWavIntoF0seqMCEPseq(wav, fs, frame_period=5.0, MCEPdim=24):
    """Extract an F0 sequence and a MCEP sequence from a single waveform.

    Args:
        wav (np.ndarray(1,T)): waveform
        fs: sampling frequency
        frame_period (float): [ms]
        MCEPdim (int): dimension of Mel CEPstral analysis

    Returns:
        tuple: f0seq (np.ndarray(1, T/frame_period)) and
            MCEPseq (np.ndarray(MCEPdim, T/frame_period))
    """
    samples = wav.astype(np.float64)  # WORLD requires float64 input
    f0seq, timeaxis = pyworld.harvest(samples, fs, frame_period=frame_period,
                                      f0_floor=71.0, f0_ceil=800.0)
    spectrogram = pyworld.cheaptrick(samples, f0seq, timeaxis, fs)
    MCEPseq = pyworld.code_spectral_envelope(spectrogram, fs, MCEPdim)
    print(
        f"F0&MCEP-nized! {wav.shape[0] / fs} [sec] wav => {f0seq.shape}, {MCEPseq.shape}"
    )
    return f0seq, MCEPseq.T.astype(np.float32)
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005,
             dim=FEATURE_DIM, fft_size=FFTSIZE):
    """Compute WORLD features and a dim-reduced spectral envelope.

    `frame_period` is used only by pad_wav_to_get_fixed_frames (ispad=True).

    Returns a dict:
        'f0'       : (n,)                fundamental frequency
        'ap'       : (n, fft_size//2+1)  aperiodicity
        'sp'       : (n, fft_size//2+1)  spectral envelope
        'coded_sp' : (dim, n)            reduced spectral envelope
    """
    if ispad:
        signal, _pad = pad_wav_to_get_fixed_frames(
            wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)
    else:
        signal = wav_ori
    # Harvest F0 extraction algorithm.
    f0, timeaxis = pyworld.harvest(signal, fs)
    # CheapTrick harmonic spectral envelope estimation.
    sp = pyworld.cheaptrick(signal, f0, timeaxis, fs, fft_size=fft_size)
    # D4C aperiodicity estimation.
    ap = pyworld.d4c(signal, f0, timeaxis, fs, fft_size=fft_size)
    # Reduce the envelope to `dim` coefficients; transpose to dim x n.
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim).T
    return {
        'f0': f0,
        'ap': ap,
        'sp': sp,
        'coded_sp': coded_sp,
    }
def world_decompose(wav, fs):
    """WORLD decomposition with the frame period taken from hp.duration.

    Returns (f0, timeaxis, spectral envelope, aperiodicity); F0 search is
    restricted to 71-800 Hz.
    """
    x = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(x, fs, f0_floor=71.0, f0_ceil=800.0,
                                   frame_period=hp.duration)
    envelope = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    return f0, timeaxis, envelope, aperiodicity
def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
    """Extract an AcousticFeature bundle from a Wave.

    Runs WORLD analysis (harvest + stonemask F0, cheaptrick envelope, d4c
    aperiodicity), derives mel-cepstrum (pysptk.sp2mc) and coded (band)
    aperiodicity, casts floats to `dtype`, validates, and returns the
    feature object.
    """
    x = wave.wave.astype(numpy.float64)
    fs = wave.sampling_rate

    f0, t = pyworld.harvest(
        x, fs,
        frame_period=frame_period,
        f0_floor=f0_floor,
        f0_ceil=f0_ceil,
    )
    f0 = pyworld.stonemask(x, f0, t, fs)
    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
    ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)

    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, fs)
    # Voiced mask: True wherever an F0 was detected (non-zero).
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    feature = feature.astype_only_float(dtype)
    feature.validate()
    return feature
def analyze_range(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS,
                  fftl=FFTL, f0=None, time_axis=None):
    """
    f0 estimation w/ f0_floor & f0_ceil

    Args:
        wav: waveform samples (WORLD expects float64)
        fs: sampling rate [Hz]
        minf0 / maxf0: F0 search range [Hz] for harvest
        fperiod: frame shift [ms]
        fftl: FFT size for envelope/aperiodicity analysis
        f0: Given f0. If not provided, estimated by WORLD harvest/stonemask
            from waveform.
        time_axis: frame time axis matching `f0`

    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    if f0 is None or time_axis is None:
        # pyworld.harvest: Estimate fo.
        _f0, time_axis = pw.harvest(wav, fs, f0_floor=minf0, f0_ceil=maxf0,
                                    frame_period=fperiod)
        # pyworld.stonemask: Refine fo.
        f0 = pw.stonemask(wav, _f0, time_axis, fs)
    # pyworld.cheaptrick: Spectral envelope estimation.
    sp = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    # pyworld.d4c: Aperiodicity estimation.
    ap = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, sp, ap
def synthesis(ori_path, aim_sp, aim_spkid):
    """Synthesize a voice-converted utterance.

    Takes F0 and aperiodicity from the original wav, decodes the converted
    (compressed) spectral envelope `aim_sp` back to fft_size/2+1 bins, and
    WORLD-synthesizes to ./convert_to_<aim_spkid>_test1.wav.
    """
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)
    # Envelope of the source (unused for synthesis; kept for inspection).
    sp_per_timeaxis_before = pw.cheaptrick(wav, f0, timeaxis, hp.SR,
                                           fft_size=hp.N_FFT)
    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    # Decode the compressed target envelope back to full resolution.
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)
    print('解码后的513维度的aim_decoded_sp = ')
    print(aim_decoded_sp.shape)
    print(aim_decoded_sp[399][:])
    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    # librosa.output.write_wav was removed in librosa 0.8; write with
    # soundfile instead (same float wav output).
    import soundfile
    soundfile.write(f'./convert_to_{aim_spkid}_test1.wav', synwav, hp.SR)
def world_decompose(wav, fs=16000, frame_period = 5.0):
    """Run WORLD analysis but return only the F0 track.

    NOTE(review): sp and ap are computed and then discarded — if only F0 is
    ever needed, the cheaptrick/d4c calls are wasted work; confirm callers
    before pruning.
    """
    # Decompose speech signal into f0, spectral envelope and aperiodicity using WORLD
    wav = wav.astype(np.float64)
    # f0_floor / f0_ceil bound the F0 search range; frame_period is the
    # interval between consecutive analysis frames [ms].
    f0, timeaxis = pyworld.harvest(wav, fs, frame_period = frame_period,
                                   f0_floor = 71.0, f0_ceil = 800.0)
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    ap = pyworld.d4c(wav, f0, timeaxis, fs)
    return f0  # only the fundamental frequency is returned
def eval_rmse_f0(x_r, x_s, sr, frame_len='5', method='swipe', tone_shift=None):
    """Evaluate F0 error between a reference (x_r) and synthesized (x_s) signal.

    Returns:
        (f0_rmse_mean, vuv_accuracy, vuv_precision): mean |log2 f0| distance
        in cents over frames voiced in both tracks, plus voiced/unvoiced
        classification accuracy and precision (voiced = positive class).

    Raises:
        ValueError: for an unknown `method`.

    NOTE(review): `frame_len` is unused — harvest/dio run with
    frame_period=50 and swipe/rapt with hopsize=128 regardless; confirm the
    intended hop settings.
    """
    # TODO: make the frame length (ms) / hop_size configurable
    if method == 'harvest':
        f0_r, t = pw.harvest(x_r.astype(np.double), sr, frame_period=50)
        f0_s, t = pw.harvest(x_s.astype(np.double), sr, frame_period=50)
    elif method == 'dio':
        f0_r, t = pw.dio(x_r.astype(np.double), sr, frame_period=50)
        f0_s, t = pw.dio(x_s.astype(np.double), sr, frame_period=50)
    elif method == 'swipe':
        f0_r = pysptk.sptk.swipe(x_r.astype(np.double), sr, hopsize=128)
        f0_s = pysptk.sptk.swipe(x_s.astype(np.double), sr, hopsize=128)
    elif method == 'rapt':
        f0_r = pysptk.sptk.rapt(x_r.astype(np.double), sr, hopsize=128)
        f0_s = pysptk.sptk.rapt(x_s.astype(np.double), sr, hopsize=128)
    else:
        raise ValueError('no such f0 exract method')
    # length align
    f0_s = pad_to(f0_s, len(f0_r))
    # make unvoiced / voiced frame masks (1 where the condition holds)
    f0_r_uv = (f0_r == 0) * 1
    f0_r_v = 1 - f0_r_uv
    f0_s_uv = (f0_s == 0) * 1
    f0_s_v = 1 - f0_s_uv
    # confusion-matrix masks, treating "voiced" as the positive class
    tp_mask = f0_r_v * f0_s_v
    tn_mask = f0_r_uv * f0_s_uv
    fp_mask = f0_r_uv * f0_s_v
    fn_mask = f0_r_v * f0_s_uv
    if tone_shift is not None:
        # compensate an intentional pitch shift of `tone_shift` semitones
        shift_scale = 2**(tone_shift / 12)
        f0_r = f0_r * shift_scale
    # only calculate f0 error for voiced frames; the "+ *_uv" terms keep
    # log2 finite on unvoiced frames (those frames are masked out below)
    y = 1200 * np.abs(np.log2(f0_r + f0_r_uv) - np.log2(f0_s + f0_s_uv))
    y = y * tp_mask
    # print(y.sum(), tp_mask.sum())
    f0_rmse_mean = y.sum() / tp_mask.sum()
    # voiced/unvoiced accuracy and precision
    vuv_precision = tp_mask.sum() / (tp_mask.sum() + fp_mask.sum())
    vuv_accuracy = (tp_mask.sum() + tn_mask.sum()) / len(y)
    return f0_rmse_mean, vuv_accuracy, vuv_precision
def world_decompose(wav, fs, frame_period=5.0):
    """Decompose speech into WORLD parameters.

    Returns:
        (f0, time_axis, spectral_envelope, aperiodicity); F0 search is
        bounded to 71-800 Hz.
    """
    waveform = wav.astype(np.float64)
    fundamental, frame_times = pyworld.harvest(waveform, fs,
                                               frame_period=frame_period,
                                               f0_floor=71.0, f0_ceil=800.0)
    spectral = pyworld.cheaptrick(waveform, fundamental, frame_times, fs)
    aperiodic = pyworld.d4c(waveform, fundamental, frame_times, fs)
    return fundamental, frame_times, spectral, aperiodic
def collect_features(self, wav_path, label_path):
    """Extract acoustic features for one utterance.

    Builds [mgc | lf0 | vuv | bap] (with delta features applied to
    mgc/lf0/bap), then drops silence frames using the HTS label alignment.
    Caches the mel-cepstrum all-pass constant on self.alpha.
    Returns a float32 matrix of shape (kept_frames, feature_dim).
    """
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    # F0: harvest, or dio refined with stonemask.
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    # Cache the all-pass constant for mel-cepstral analysis on first use.
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order,
                       alpha=self.alpha)
    # log-F0, keeping zeros on unvoiced frames until interpolation below.
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # Interpolate lf0 through unvoiced regions.
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)
    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)
    # Append delta features per stream.
    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)
    features = np.hstack((mgc, lf0, vuv, bap))
    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    return features.astype(np.float32)
def main(args):
    """pyworld demo: analyze `utterance/vaiueo2d.wav` several ways and write
    re-synthesized wavs plus comparison plots into a fresh `test/` directory.
    """
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison plots of waveforms, envelopes, aperiodicity, and F0 tracks.
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')