def x2mcep(self, x):
    x = x.astype(np.float64)
    # [TODO] to avoid ValueError: ndarray is not C-contiguous
    if not x.flags['C_CONTIGUOUS']:
        x = x.copy(order='C')
    # if self.itype == 3:
    #     etype = 2
    #     eps = 1e-10
    # else:
    #     etype = 0
    #     eps = 0.0
    etype = 0
    eps = 0.0
    if self.isMatrix:
        return np.asarray([
            mcep(xi, order=self.order, itype=self.itype, etype=etype, eps=eps)
            for xi in x
        ])
    else:
        return mcep(x, order=self.order, itype=self.itype, etype=etype, eps=eps)
def sptk_mcep(x, order, winsz, hopsz, fftsz, fs, window_norm=False,
              noise_floor=1e-8):
    alpha = hz2alpha(fs)
    windowed = sptk_window(x, winsz, hopsz, fftsz, windowing='blackman',
                           normalize=window_norm)
    cep = pysptk.mcep(windowed, order=order, alpha=alpha,
                      miniter=2, maxiter=30, threshold=0.001,
                      etype=1, eps=noise_floor, min_det=1.0e-6, itype=0)
    return cep, alpha
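# The two helpers used by sptk_mcep above (hz2alpha, sptk_window) are
# project-local and not shown in this snippet. Minimal stand-ins that are
# consistent with their call sites, given purely as assumptions:
import numpy as np
import librosa
import pysptk


def hz2alpha(fs):
    # Frequency-warping coefficient matched to the sample rate.
    return pysptk.util.mcepalpha(fs)


def sptk_window(x, winsz, hopsz, fftsz, windowing='blackman', normalize=False):
    # Frame, window, and zero-pad each frame up to the FFT size.
    frames = librosa.util.frame(x, frame_length=winsz,
                                hop_length=hopsz).astype(np.float64).T
    frames *= pysptk.blackman(winsz, normalize=int(normalize))
    return np.pad(frames, ((0, 0), (0, fftsz - winsz)))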
def pysptk_mfcc(self):
    self.frame_length = 1024
    self.hop_length = 80
    self.pitch = pysptk.swipe(self.audio.astype(np.float64), fs=self.sr,
                              hopsize=self.hop_length, min=60, max=240,
                              otype="pitch")
    self.source_excitation = pysptk.excite(self.pitch, self.hop_length)
    # Note that almost all pysptk functions assume the input array is
    # C-contiguous with np.float64 elements.
    frames = librosa.util.frame(self.audio, frame_length=self.frame_length,
                                hop_length=self.hop_length).astype(np.float64).T
    # Windowing
    frames *= pysptk.blackman(self.frame_length)
    assert frames.shape[1] == self.frame_length
    # Order of mel-cepstrum
    self.order = 25
    self.alpha = 0.41
    self.mc = pysptk.mcep(frames, self.order, self.alpha)
    logH = pysptk.mgc2sp(self.mc, self.alpha, 0.0, self.frame_length).real
    librosa.display.specshow(logH.T, sr=self.sr, hop_length=self.hop_length,
                             x_axis="time", y_axis="linear")
def extract_mcep(amp_sp: np.ndarray, num_coded_sps: int, mgc_alpha: float) \
        -> np.ndarray:
    """Extract MCep from the amplitude spectrum with SPTK."""
    mcep = pysptk.mcep(amp_sp, order=num_coded_sps - 1, alpha=mgc_alpha,
                       eps=1.0e-8, min_det=0.0, etype=1, itype=3)
    return mcep.astype(np.float32, copy=False)
def sptk_extract(
    x: np.ndarray,
    fs: int,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
    is_padding: bool = False,
) -> np.ndarray:
    """Extract SPTK-based mel-cepstrum.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Sampling rate.
        n_fft (int): FFT length in points (default=512).
        n_shift (int): Shift length in points (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All-pass filter coefficient (default=0.41).
        is_padding (bool): Whether to pad the end of the signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (N, mcep_dim + 1).

    """
    # perform padding
    if is_padding:
        n_pad = n_fft - (len(x) - n_fft) % n_shift
        x = np.pad(x, (0, n_pad), "reflect")

    # get number of frames
    n_frame = (len(x) - n_fft) // n_shift + 1

    # get window function
    win = pysptk.sptk.hamming(n_fft)

    # check mcep and alpha
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)

    # calculate spectrogram
    mcep = [
        pysptk.mcep(
            x[n_shift * i:n_shift * i + n_fft] * win,
            mcep_dim,
            mcep_alpha,
            eps=1e-6,
            etype=1,
        )
        for i in range(n_frame)
    ]

    return np.stack(mcep)
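# Minimal usage sketch for sptk_extract above; the noisy-sine input is
# illustrative only.
import numpy as np

fs = 16000
x = np.sin(2 * np.pi * 100 * np.arange(fs) / fs) + 0.01 * np.random.randn(fs)
mc = sptk_extract(x, fs)
print(mc.shape)  # (n_frames, mcep_dim + 1) == (n_frames, 26)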
def __test_synthesis(filt):
    # dummy source excitation
    source = __dummy_source()

    hopsize = 80

    # dummy filter coef.
    windowed = __dummy_windowed_frames(source, frame_len=512, hopsize=hopsize)
    b = pysptk.mcep(windowed, filt.order, 0.0)

    # synthesis
    synthesizer = Synthesizer(filt, hopsize)
    y = synthesizer.synthesis(source, b)
    assert np.all(np.isfinite(y))
def freq_to_mcep(mag_spec, sample_rate, dims=60):
    r"""Convert from magnitude frequency space to mel-cepstral space.

    We use the mel-cepstrum (i.e. the mel-generalised cepstrum with
    :math:`\gamma = 0`) as we do not make assumptions about the SNR.
    """
    mag_spec = mag_spec.astype(np.float64)

    # Convert float to signed-int16 domain.
    data_16bit = mag_spec * 2. ** 15

    # maxiter=0, etype=1, eps=1e-8, min_det=0.
    mcep = pysptk.mcep(data_16bit, order=dims - 1,
                       alpha=utils.ALPHA[sample_rate], itype=3)
    return mcep
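# Rough shape/round-trip sketch for freq_to_mcep above: itype=3 means the
# input is a magnitude spectrum of size fftlen/2 + 1, and the log envelope
# can be recovered with pysptk.mgc2sp (gamma=0). utils.ALPHA is a
# project-local lookup table; the 0.42 warping value below is an assumed
# stand-in for 16 kHz, and etype/eps/min_det follow the commented settings.
import numpy as np
import pysptk

fftlen = 1024
mag_spec = np.random.rand(10, fftlen // 2 + 1) + 1e-6
mc = pysptk.mcep(mag_spec * 2. ** 15, order=59, alpha=0.42, itype=3,
                 etype=1, eps=1e-8, min_det=0.)
log_env = pysptk.mgc2sp(mc, 0.42, 0.0, fftlen).real  # log-magnitude envelope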
def _process_wav(file_list, outfile, winlen, winstep, n_mcep, mcep_alpha,
                 minf0, maxf0, q_channels, type):
    data_dict = {}
    enc = encoder(q_channels)
    for f in tqdm(file_list):
        wav, sr = load(f, sr=None)
        x = wav.astype(float)
        # WORLD F0 extraction (the analysis window size can't be adjusted).
        _f0, t = world.harvest(x, sr, f0_floor=minf0, f0_ceil=maxf0,
                               frame_period=winstep * 1000)
        f0 = world.stonemask(x, _f0, t, sr)
        window_size = int(sr * winlen)
        hop_size = int(sr * winstep)
        # get mel
        if type == 'mcc':
            nfft = 2 ** (window_size - 1).bit_length()
            spec = np.abs(stft(x, n_fft=nfft, hop_length=hop_size,
                               win_length=window_size,
                               window='blackman')) ** 2
            # mcep expects frames along the first axis, so transpose the
            # (freq, time) spectrogram; etype=2 treats eps as a per-frame
            # floor in dB.
            h = sptk.mcep(spec.T, n_mcep - 1, mcep_alpha,
                          eps=-60, etype=2, itype=4).T
        else:
            h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=int(sr * winlen),
                     hop_length=int(sr * winstep))
        h = np.vstack((h, f0))
        # mu-law encode
        wav = enc(x).astype(np.uint8)
        id = os.path.basename(f).replace(".wav", "")
        data_dict[id] = wav
        data_dict[id + "_h"] = h
    np.savez(outfile, **data_dict)
def pysptk_features(x):
    import pysptk
    wav_max = 2 ** 15 - 1
    x = (x * wav_max).astype(np.float64)
    frame_length = 512
    hop_length = 160
    # astype copies to a writable array; newer librosa returns a read-only
    # view from util.frame, which would break the in-place windowing below.
    frames = librosa.util.frame(x, frame_length=frame_length,
                                hop_length=hop_length).astype(np.float64).T
    frames *= pysptk.blackman(frame_length)
    order = 25  # seems to be pretty standard, results in 26 values
    alpha = 0.42  # best for 16 kHz sampling according to the SPTK docs:
    # http://ftp.jaist.ac.jp/pub/pkgsrc/distfiles/SPTKref-3.9.pdf
    mcep = pysptk.mcep(frames, order, alpha)
    f0 = pysptk.swipe(x, fs=16000, hopsize=hop_length, min=60, max=240,
                      otype="f0")
    f0 = f0[1:1 + mcep.shape[0]]  # cut off ends to match mcep lengths
    return np.concatenate([f0[:, np.newaxis], mcep], 1).astype(np.float32)
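# Usage sketch for pysptk_features above: it expects a float waveform in
# [-1, 1] at 16 kHz (the rate is hard-coded in the swipe call). Loading
# via librosa and the file name are assumptions.
import librosa

x, _ = librosa.load("speech.wav", sr=16000)
feats = pysptk_features(x)
print(feats.shape)  # (n_frames, 27): F0 column plus 26 mel-cepstral coefficients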
def stft_mcep(x, fftl=512, shiftl=256, dim=25, alpha=0.41, window="hamming",
              is_padding=False):
    """EXTRACT STFT-BASED MEL-CEPSTRUM.

    Args:
        x (ndarray): Numpy double array with the size (T,).
        fftl (int): FFT length in points (default=512).
        shiftl (int): Shift length in points (default=256).
        dim (int): Dimension of mel-cepstrum (default=25).
        alpha (float): All-pass filter coefficient (default=0.41).
        window (str): Analysis window type (default="hamming").
        is_padding (bool): Whether to pad the end of the signal (default=False).

    Returns:
        ndarray: Mel-cepstrum with the size (N, dim + 1).

    """
    # perform padding
    if is_padding:
        n_pad = fftl - (len(x) - fftl) % shiftl
        x = np.pad(x, (0, n_pad), 'reflect')

    # get number of frames
    n_frame = (len(x) - fftl) // shiftl + 1

    # get window function
    win = get_window(window, fftl)

    # calculate spectrogram
    mcep = [
        pysptk.mcep(x[shiftl * i:shiftl * i + fftl] * win, dim, alpha,
                    eps=EPS, etype=1)
        for i in range(n_frame)
    ]

    return np.stack(mcep)
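# Usage sketch for stft_mcep above. EPS is a module-level constant in the
# original source; the 1e-6 floor here is an assumed value, and the
# get_window import is what the function body relies on.
import numpy as np
from scipy.signal import get_window
import pysptk

EPS = 1e-6
x = np.random.randn(16000)
mc = stft_mcep(x)
print(mc.shape)  # (61, 26) for this input length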
def __test_synthesis_levdur(filt):
    # dummy source excitation
    source = __dummy_source()

    hopsize = 80

    # dummy filter coef.
    windowed = __dummy_windowed_frames(source, frame_len=512, hopsize=hopsize)
    c = pysptk.mcep(windowed, filt.order)
    lpc = pysptk.levdur(pysptk.c2acr(c))

    # make sure lpc has loggain
    lpc[:, 0] = np.log(lpc[:, 0])

    # synthesis
    synthesizer = Synthesizer(filt, hopsize)
    y = synthesizer.synthesis(source, lpc)
    assert np.all(np.isfinite(y))
def stft_mcep(x, fftl=512, shiftl=256, dim=25, alpha=0.41, window="hamming",
              is_padding=False):
    """FUNCTION TO EXTRACT STFT-BASED MEL-CEPSTRUM

    Args:
        x (ndarray): numpy double array with the size [T]
        fftl (int): fft length in point (default=512)
        shiftl (int): shift length in point (default=256)
        dim (int): dimension of mel-cepstrum (default=25)
        alpha (float): all pass filter coefficient (default=0.41)
        window (str): analysis window type (default="hamming")
        is_padding (bool): whether to pad the end of signal (default=False)

    Return:
        (ndarray): mel-cepstrum with the size [N, dim + 1]
    """
    # perform padding
    if is_padding:
        n_pad = fftl - (len(x) - fftl) % shiftl
        x = np.pad(x, (0, n_pad), 'reflect')

    # get number of frames
    n_frame = (len(x) - fftl) // shiftl + 1

    # get window function
    win = get_window(window, fftl)

    # calculate spectrogram
    mcep = [
        pysptk.mcep(x[shiftl * i:shiftl * i + fftl] * win, dim, alpha)
        for i in range(n_frame)
    ]

    return np.stack(mcep)
def get_MCEP(self, utterance):
    utterance = librosa.util.normalize(utterance)
    # Add tiny dither to avoid all-zero frames, then renormalize.
    utterance = utterance + np.random.normal(loc=0, scale=0.0000001,
                                             size=utterance.shape[0])
    utterance = librosa.util.normalize(utterance)
    utterance = utterance.astype(np.float64)  # necessary for synthesizer
    frames = librosa.util.frame(utterance, frame_length=self.frame_length,
                                hop_length=self.hop_length).astype(np.float64).T
    # Windowing
    frames *= pysptk.blackman(self.frame_length)
    assert frames.shape[1] == self.frame_length
    # Pitch
    pitch = pysptk.swipe(utterance.astype(np.float64), fs=self.sr,
                         hopsize=self.hop_length, min=60, max=240,
                         otype="pitch")
    mcep = pysptk.mcep(frames, self.order, self.alpha)
    return mcep, pitch
def pitch_shift_on_lpc_residual(
    wav,
    sr,
    shift_in_cent,
    frame_length=4096,
    hop_length=240,
    mgc_order=59,
):
    assert wav.dtype == np.int16
    frames = (librosa.util.frame(wav, frame_length=frame_length,
                                 hop_length=hop_length).astype(np.float64).T)
    frames *= pysptk.blackman(frame_length)
    alpha = pysptk.util.mcepalpha(sr)
    mgc = pysptk.mcep(frames, mgc_order, alpha, eps=1e-5, etype=1)
    c = pysptk.freqt(mgc, mgc_order, -alpha)
    lpc = pysptk.levdur(pysptk.c2acr(c, mgc_order, frame_length))
    # remove gain
    lpc[:, 0] = 0

    # Compute LPC residual
    synth = Synthesizer(AllZeroDF(mgc_order), hop_length)
    wav_lpc = synth.synthesis(wav.astype(np.float64), -lpc)
    residual = wav - wav_lpc

    # Pitch-shift on LPC residual
    residual_shifted = librosa.effects.pitch_shift(
        residual, sr=sr, n_steps=shift_in_cent, bins_per_octave=1200)

    # Filtering by LPC
    synth = Synthesizer(AllPoleDF(mgc_order), hop_length)
    wav_shifted = synth.synthesis(residual_shifted, lpc)

    return wav_shifted.astype(np.int16)
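# Usage sketch for pitch_shift_on_lpc_residual above: it asserts int16
# input, so read the file without normalization. The file names and the
# scipy.io.wavfile dependency are illustrative.
from scipy.io import wavfile

sr, wav = wavfile.read("speech.wav")  # PCM16 -> dtype np.int16
wav_up = pitch_shift_on_lpc_residual(wav, sr, shift_in_cent=100)
wavfile.write("speech_up_100cent.wav", sr, wav_up)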
def __test_broadcast(dtype):
    frames = windowed_dummy_frames(100, 512, dtype=dtype)
    mc = pysptk.mcep(frames, order, alpha)
    assert np.all(np.isfinite(mc))
    assert frames.shape[0] == mc.shape[0]
def __test(order, alpha):
    mc = pysptk.mcep(x, order, alpha)
    assert np.all(np.isfinite(mc))
def gen_data(self, dir_in, dir_out=None, file_id_list=None, id_list=None,
             add_deltas=False, return_dict=False):
    """
    Prepare WORLD features from audio files. If add_deltas is False, labels
    have the dimension num_frames x (num_coded_sps + 3)
    [mgc(num_coded_sps), lf0, vuv, bap(1)]; otherwise deltas and double
    deltas are added between the features, resulting in
    num_frames x (3*num_coded_sps + 7)
    [mgc(3*num_coded_sps), lf0(3*1), vuv, bap(3*1)].

    :param dir_in: Directory where the .wav files are stored for each
                   utterance to process.
    :param dir_out: Main directory where the labels and normalisation
                    parameters are saved to subdirectories. If None, labels
                    are not saved.
    :param file_id_list: Name of the file containing the ids. Normalisation
                         parameters are saved using this name to
                         differentiate parameters between subsets.
    :param id_list: The list of utterances to process. Should have the form
                    uttId1 \\n uttId2 \\n ... \\n uttIdN. If None, all .wav
                    files in dir_in are used.
    :param add_deltas: Add deltas and double deltas to all features except
                       vuv.
    :param return_dict: If True, returns an OrderedDict of all samples as
                        first output.
    :return: Returns the two normalisation parameters as a tuple. If
             return_dict is True, it returns all processed labels in an
             OrderedDict followed by the two normalisation parameters.
    """
    # Fill id_list by .wav files in dir_in if not given and set an
    # appropriate file_id_list_name.
    if id_list is None:
        id_list = list()
        filenames = glob.glob(os.path.join(dir_in, "*.wav"))
        for filename in filenames:
            id_list.append(os.path.splitext(os.path.basename(filename))[0])
        file_id_list_name = "all"
    else:
        file_id_list_name = os.path.splitext(
            os.path.basename(file_id_list))[0]

    # Create directories in dir_out if it is given.
    if dir_out is not None:
        if add_deltas:
            makedirs_safe(os.path.join(dir_out, self.dir_deltas))
        else:
            makedirs_safe(os.path.join(dir_out, self.dir_lf0))
            makedirs_safe(os.path.join(dir_out, self.dir_vuv))
            makedirs_safe(os.path.join(dir_out, self.dir_coded_sps))
            makedirs_safe(os.path.join(dir_out, self.dir_bap))

    # Create the return dictionary if required.
    if return_dict:
        label_dict = OrderedDict()

    if add_deltas:
        # Create normalisation computation units.
        norm_params_ext_coded_sp = MeanCovarianceExtractor()
        norm_params_ext_lf0 = MeanCovarianceExtractor()
        norm_params_ext_bap = MeanCovarianceExtractor()
    else:
        # Create normalisation computation units.
        norm_params_ext_coded_sp = MeanStdDevExtractor()
        norm_params_ext_lf0 = MeanStdDevExtractor()
        # norm_params_ext_vuv = MeanStdDevExtractor()
        norm_params_ext_bap = MeanStdDevExtractor()

    logging.info("Extract WORLD{} features for ".format(
        "" if not add_deltas else " deltas")
        + "[{0}]".format(", ".join(str(i) for i in id_list)))
    for file_name in id_list:
        # Load audio file and extract features.
        audio_name = os.path.join(dir_in, file_name + ".wav")
        raw, fs = soundfile.read(audio_name)
        logging.debug("Extract WORLD{} features from {} at {} Hz.".format(
            "" if not add_deltas else " deltas", file_name, fs))
        f0, sp, ap = pyworld.wav2world(raw, fs)

        file_name = os.path.basename(file_name)  # Remove speaker.

        # Compute lf0 and vuv information.
        lf0 = np.log(f0.clip(min=1E-10), dtype=np.float32)
        lf0[lf0 <= math.log(self.f0_silence_threshold)] = self.lf0_zero
        lf0, vuv = interpolate_lin(lf0)
        lf0 = lf0.astype(dtype=np.float32)
        vuv = vuv.astype(dtype=np.float32)

        # Warn when less than 5% of all frames are unvoiced.
        if vuv.sum() / len(vuv) < 0.05:
            self.logger.warning(
                "Detected only {:.0f}% [{}/{}] unvoiced frames in {}.".format(
                    vuv.sum() / len(vuv) * 100.0, int(vuv.sum()), len(vuv),
                    file_name))

        # Decode spectrum to a lower dimension and aperiodicity to one band
        # aperiodicity.
        # coded_sp = pyworld.code_spectral_envelope(sp, fs, WorldFeatLabelGen.num_coded_sps)
        # Cepstral version.
        coded_sp = np.sqrt(sp) * 32768.0
        coded_sp = np.array(pysptk.mcep(coded_sp,
                                        order=self.num_coded_sps - 1,
                                        alpha=self.mgc_alpha,
                                        eps=1.0e-8, min_det=0.0,
                                        etype=1, itype=3),
                            dtype=np.float32)
        bap = np.array(pyworld.code_aperiodicity(ap, fs), dtype=np.float32)

        if add_deltas:
            # Compute the deltas and double deltas for all features.
            lf0_deltas, lf0_double_deltas = compute_deltas(lf0)
            coded_sp_deltas, coded_sp_double_deltas = compute_deltas(coded_sp)
            bap_deltas, bap_double_deltas = compute_deltas(bap)

            coded_sp = np.concatenate(
                (coded_sp, coded_sp_deltas, coded_sp_double_deltas), axis=1)
            lf0 = np.concatenate((lf0, lf0_deltas, lf0_double_deltas), axis=1)
            bap = np.concatenate((bap, bap_deltas, bap_double_deltas), axis=1)

            # Combine them to a single feature sample.
            labels = np.concatenate((coded_sp, lf0, vuv, bap), axis=1)

            # Save into return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = labels
            if dir_out is not None:
                labels.tofile(os.path.join(dir_out, self.dir_deltas,
                                           file_name + self.ext_deltas))
        else:
            # Save into return dictionary and/or file.
            if return_dict:
                label_dict[file_name] = np.concatenate(
                    (coded_sp, lf0, vuv, bap), axis=1)
            if dir_out is not None:
                coded_sp.tofile(os.path.join(dir_out, self.dir_coded_sps,
                                             file_name + self.ext_coded_sp))
                lf0.tofile(os.path.join(dir_out, self.dir_lf0,
                                        file_name + self.ext_lf0))
                vuv.astype(np.float32).tofile(
                    os.path.join(dir_out, self.dir_vuv,
                                 file_name + self.ext_vuv))
                bap.tofile(os.path.join(dir_out, self.dir_bap,
                                        file_name + self.ext_bap))

        # Add sample to normalisation computation unit.
        norm_params_ext_coded_sp.add_sample(coded_sp)
        norm_params_ext_lf0.add_sample(lf0)
        # norm_params_ext_vuv.add_sample(vuv)
        norm_params_ext_bap.add_sample(bap)

    # Save mean and std dev of all features.
    if not add_deltas:
        norm_params_ext_coded_sp.save(
            os.path.join(dir_out, self.dir_coded_sps, file_id_list_name))
        norm_params_ext_lf0.save(
            os.path.join(dir_out, self.dir_lf0, file_id_list_name))
        # norm_params_ext_vuv.save(os.path.join(dir_out, WorldFeatLabelGen.dir_vuv, file_id_list_name))
        norm_params_ext_bap.save(
            os.path.join(dir_out, self.dir_bap, file_id_list_name))
    else:
        self.logger.info("Write norm_params to {}".format(
            os.path.join(dir_out, self.dir_deltas,
                         "_".join((file_id_list_name, self.dir_coded_sps)))))
        norm_params_ext_coded_sp.save(
            os.path.join(dir_out, self.dir_deltas,
                         "_".join((file_id_list_name, self.dir_coded_sps))))
        norm_params_ext_lf0.save(
            os.path.join(dir_out, self.dir_deltas,
                         "_".join((file_id_list_name, self.dir_lf0))))
        norm_params_ext_bap.save(
            os.path.join(dir_out, self.dir_deltas,
                         "_".join((file_id_list_name, self.dir_bap))))

    # Get normalisation parameters.
    if not add_deltas:
        norm_coded_sp = norm_params_ext_coded_sp.get_params()
        norm_lf0 = norm_params_ext_lf0.get_params()
        # norm_vuv = norm_params_ext_vuv.get_params()
        norm_bap = norm_params_ext_bap.get_params()

        norm_first = np.concatenate(
            (norm_coded_sp[0], norm_lf0[0], (0.0,), norm_bap[0]), axis=0)
        norm_second = np.concatenate(
            (norm_coded_sp[1], norm_lf0[1], (1.0,), norm_bap[1]), axis=0)
    else:
        norm_coded_sp = norm_params_ext_coded_sp.get_params()
        norm_lf0 = norm_params_ext_lf0.get_params()
        # norm_vuv = norm_params_ext_vuv.get_params()
        norm_bap = norm_params_ext_bap.get_params()

        norm_first = (norm_coded_sp[0], norm_lf0[0], (0.0,), norm_bap[0])
        norm_second = (norm_coded_sp[1], norm_lf0[1], (1.0,), norm_bap[1])

    if return_dict:
        # Return dict of labels for all utterances.
        return label_dict, norm_first, norm_second
    else:
        return norm_first, norm_second
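# Hypothetical call sketch for gen_data above, assuming an already
# constructed instance of the surrounding WorldFeatLabelGen-style class
# (constructor arguments and paths are placeholders, not from the
# original source).
generator = WorldFeatLabelGen()
norm_first, norm_second = generator.gen_data(
    dir_in="database/wav",
    dir_out="database/WORLD",
    id_list=["utt0001", "utt0002"],
    add_deltas=False)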
def __test_itype(itype=0):
    pysptk.mcep(x, itype=itype)
def get_random_peseudo_mcep(order=24, alpha=0.42):
    T, N = 100, 513
    frames = np.random.rand(T, N) * pysptk.blackman(N)
    mc = pysptk.mcep(frames, order=order, alpha=alpha)
    return mc
def __test_min_det(min_det):
    pysptk.mcep(x, min_det=min_det)
def sp2mgc(sp, dim, sr):
    return pysptk.mcep(sp, order=dim - 1, alpha=get_world_alpha(sr),
                       maxiter=0, etype=1, eps=0.0, min_det=1e-06, itype=4)
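# Usage sketch for sp2mgc above, feeding it a WORLD spectral envelope
# (itype=4 interprets the input as a power spectrum of size fftlen/2 + 1).
# get_world_alpha is project-local, so pysptk.util.mcepalpha stands in
# here as an assumed equivalent; the noise input is illustrative.
import numpy as np
import pyworld
import pysptk

sr = 16000
x = np.random.randn(sr)  # stand-in for a float64 mono waveform
f0, sp, ap = pyworld.wav2world(x, sr)  # sp: (n_frames, fftlen/2 + 1)
mgc = pysptk.mcep(sp, order=59, alpha=pysptk.util.mcepalpha(sr),
                  maxiter=0, etype=1, eps=0.0, min_det=1e-06, itype=4)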
def load_mfcc_mceps_VCTK(list_train_data, data_folder, speaker,
                         config_mfcc_mceps):
    '''Extract normalized MFCCs and MCEPs from a list of data paths.

    input:
        list_train_data: path to the file listing the audio track titles
                         for the target data
        data_folder: path to data folder
        speaker: code for target speaker
        config_mfcc_mceps: MFCC/MCEP settings dictionary
    return:
        dictionary:
            key: speaker code + _ + audio name
            value: tuple (mfcc normalized, mceps normalized)
        target_scaler: contains the MCEP mean and variance of the target
                       speaker in order to scale the MCEP results back
    '''
    root = data_folder
    _data_x = {}
    target_scaler = {}
    with open(list_train_data, 'r') as ft:
        count_errors = 0
        lines = ft.readlines()
        # Used to store mean and std for denormalizing results.
        total_mceps = np.empty((0, config_mfcc_mceps['order_mcep'] + 1),
                               float)
        for l in lines:
            l = l.strip()
            speaker_f, _ = l.split('_')
            if speaker_f != speaker:
                continue
            wav_path = root + speaker + '/' + l + '.wav'
            try:
                x, _ = librosa.load(
                    wav_path, sr=config_mfcc_mceps["sampling_frequency"])
                mfccs = librosa.feature.mfcc(
                    y=x,
                    sr=config_mfcc_mceps["sampling_frequency"],
                    n_mfcc=config_mfcc_mceps["order_mfcc"],
                    n_fft=config_mfcc_mceps["n_fft"],
                    hop_length=config_mfcc_mceps["hop_length"])
                # Transpose twice in order to normalize on the right axis.
                mfccs = normalize_mfcc(mfccs.T).T
                # Pad x so that framing yields the same number of MCEP and
                # MFCC frames.
                mfcc_l = math.ceil(
                    x.shape[0] / config_mfcc_mceps["hop_length"]
                )  # number of 10 ms frames expected
                mcep_l = math.ceil(
                    (x.shape[0] - config_mfcc_mceps["n_fft"])
                    / config_mfcc_mceps["hop_length"]
                )  # number of 10 ms frames without zero padding
                final_shape = (x.shape[0] + config_mfcc_mceps["hop_length"]
                               * (mfcc_l - mcep_l))
                x.resize((final_shape,))
                frames = librosa.util.frame(
                    x,
                    frame_length=config_mfcc_mceps["n_fft"],
                    hop_length=config_mfcc_mceps["hop_length"]).astype(
                        np.float64).T
                # mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'], etype=1, eps=1e-5)
                mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'])
                total_mceps = np.vstack((total_mceps, mceps))
                id_ = "_" + l
                # Note mfccs.T: both entries now have shape
                # (#frames, #mfcc/mceps).
                _data_x[id_] = (mfccs.T, mceps)
            except:
                # print(f"Error file: {wav_path}")
                count_errors += 1
        # print(f"\nTotal errors: {count_errors}\n")

    # Compute mean and std over all MCEPs.
    target_scaler["mean"] = np.mean(total_mceps, 0)
    target_scaler["std"] = np.std(total_mceps, 0)

    # Apply normalization.
    for k, v in _data_x.items():
        mcep = v[1]
        mcep = (mcep - target_scaler["mean"]) / target_scaler["std"]
        _data_x[k] = (v[0], mcep)

    # Convert to list to save to file.
    target_scaler["mean"] = list(target_scaler["mean"])
    target_scaler["std"] = list(target_scaler["std"])
    print(f"Total seconds of audio: {total_mceps.shape[0] / 100}")
    return _data_x, target_scaler
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (= spectral envelope extraction)
mc = pysptk.mcep(frames, ORDER, ALPHA)  # extract mel-cepstral coefficients

# Convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Build the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# Write out the audio
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
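# The constants referenced by the analysis-synthesis pipeline above are
# defined outside the snippet; plausible definitions consistent with the
# calls (all values and file names here are assumptions):
import numpy as np
from scipy.io import wavfile

FRAME_LENGTH = 1024        # frame length in samples
HOP_LENGTH = 80            # hop size in samples
MIN_F0, MAX_F0 = 60, 240   # F0 search range in Hz for swipe
ORDER = 25                 # mel-cepstrum order
ALPHA = 0.41               # frequency-warping coefficient
OUT_WAVE_FILE = "synthesized.wav"

fs, data = wavfile.read("input.wav")
x = data.astype(np.float64)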
def test_mcep_failure():
    pysptk.mcep(np.ones(256), 40, 0.41)
def __test_eps(etype=0, eps=0.0):
    pysptk.mcep(x, etype=etype, eps=eps)
def load_mfcc_mceps(path_to_data, config_mfcc_mceps):
    '''Extract normalized MFCCs and MCEPs from a list of data paths.

    input:
        path_to_data: directory containing the audio files
        config_mfcc_mceps: MFCC/MCEP settings dictionary
    return:
        dictionary:
            key: speaker code + _ + audio name
            value: tuple (mfcc normalized, mceps normalized)
        target_scaler: contains the MCEP mean and variance of the target
                       speaker in order to scale the MCEP results back
    '''
    _data_x = {}
    path_audios = os.listdir(path_to_data)
    # Used to store mean and std for denormalizing results.
    total_mceps = np.empty((0, config_mfcc_mceps['order_mcep'] + 1), float)
    target_scaler = {}
    for p in path_audios:
        if p.split(".")[-1] != "wav":
            continue
        x, _ = librosa.load(path_to_data + '/' + p,
                            sr=config_mfcc_mceps["sampling_frequency"])
        # Pad x so that framing yields the same number of MCEP and MFCC
        # frames.
        mfcc_l = math.ceil(x.shape[0] / config_mfcc_mceps["hop_length"])
        mcep_l = math.ceil((x.shape[0] - config_mfcc_mceps["n_fft"])
                           / config_mfcc_mceps["hop_length"])
        final_shape = (x.shape[0] + config_mfcc_mceps["hop_length"]
                       * (mfcc_l - mcep_l))
        x.resize((final_shape,))
        frames = librosa.util.frame(
            x,
            frame_length=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"]).astype(np.float64).T
        # Windowing
        frames *= pysptk.blackman(config_mfcc_mceps["n_fft"], normalize=1)
        mceps = pysptk.mcep(frames, config_mfcc_mceps['order_mcep'])  # , alpha)
        total_mceps = np.vstack((total_mceps, mceps))
        mfccs = librosa.feature.mfcc(
            y=x,
            sr=config_mfcc_mceps["sampling_frequency"],
            n_mfcc=config_mfcc_mceps["order_mfcc"],
            n_fft=config_mfcc_mceps["n_fft"],
            hop_length=config_mfcc_mceps["hop_length"])
        # Transpose twice in order to normalize on the right axis.
        mfccs = normalize_mfcc(mfccs.T).T
        id_ = "_" + p
        # Note mfccs.T: both entries now have shape (#frames, #mfcc/mceps).
        _data_x[id_] = (mfccs.T, mceps)

    target_scaler["mean"] = list(np.mean(total_mceps, 0))
    target_scaler["std"] = list(np.std(total_mceps, 0))

    # Apply normalization.
    for k, v in _data_x.items():
        mcep = v[1]
        mcep = (mcep - target_scaler["mean"]) / target_scaler["std"]
        _data_x[k] = (v[0], mcep)

    return _data_x, target_scaler
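# Hypothetical config for the two loaders above: the key names follow
# their usage in the code, while the values are typical choices and are
# assumptions, not from the original source.
config_mfcc_mceps = {
    "sampling_frequency": 16000,
    "order_mfcc": 24,   # number of MFCCs from librosa
    "order_mcep": 24,   # pysptk.mcep order, i.e. 25 coefficients per frame
    "n_fft": 1024,
    "hop_length": 160,  # 10 ms at 16 kHz
}

data_x, target_scaler = load_mfcc_mceps("path/to/wavs", config_mfcc_mceps)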
def test_mc2b():
    x = windowed_dummy_data(1024)
    mc = pysptk.mcep(x)
    assert pysptk.mc2e(mc) > 0
# Windowing
frames *= pysptk.blackman(frame_length)
assert frames.shape[1] == frame_length

pitch = pysptk.swipe(x.astype(np.float64), fs=sr, hopsize=hop_length,
                     min=60, max=240, otype="pitch")
source_excitation = pysptk.excite(pitch, hop_length)

# Order of mel-cepstrum
mc = pysptk.mcep(frames, order, alpha)
logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
print(mc.shape)
# plt.plot(mc)
# plotname = "x_syn_coefs_" + str(order) + ".png"
# plt.savefig(plotname)

# Convert mel-cepstrum to MLSADF coefficients
b = pysptk.mc2b(mc, alpha)
synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
x_synthesized = synthesizer.synthesis(source_excitation, b)
filenam = "synthesized_sounds/" + "x_syn" + str(order + 1) + ".wav"
# wavfile.write("x.wav", sr, x)
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH,
                            hop_length=HOP_LENGTH).astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0,
                     otype="pitch")

# Generate the excitation (glottal source) signal
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (= spectral envelope extraction)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Build the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, synthesize various sounds by changing the synthesis
# #### filter parameters and the like.

# ### Pitch shift (raise the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # use a factor smaller than 1 to raise the pitch
excitation_pitchhigh = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
y = synthesizer.synthesis(excitation_pitchhigh, mlsa_coef)  # synthesize speech
y = y.astype(np.int16)
def mel_cepstrum(frames):
    mc = ps.mcep(frames, ORDER, ALPHA, eps=0, etype=1)
    # logH = ps.mgc2sp(mc, ALPHA, 0.0, FRAME_LENGTH).real
    return mc