def test_definition_ortho(self):
    """Check that orthonormal DCT-II followed by DCT-III returns the input."""
    for sample in X:
        x = np.array(sample, dtype=self.rdt)
        forward = dct(x, norm="ortho", type=2)
        xi = dct(forward, norm="ortho", type=3)
        # The round trip must keep the requested dtype.
        dtype_matches = xi.dtype == self.rdt
        self.assertTrue(
            dtype_matches,
            "Output dtype is %s, expected %s" % (xi.dtype, self.rdt))
        assert_array_almost_equal(xi, x, decimal=self.dec)
def dct_2d_ref(x, **kwargs):
    """Reference 2-D DCT used when testing dct2.

    A copy of ``x`` is transformed one row at a time and then one column at
    a time with the 1-D ``dct``; ``kwargs`` are forwarded to every call.
    The input itself is left untouched.
    """
    out = np.array(x, copy=True)
    # First pass: transform each row in place.
    for r in range(out.shape[0]):
        out[r, :] = dct(out[r, :], **kwargs)
    # Second pass: transform each column of the row-transformed data.
    for c in range(out.shape[1]):
        out[:, c] = dct(out[:, c], **kwargs)
    return out
def test_definition_ortho(self):
    # Orthonormal mode: DCT-II followed by DCT-III must give back the input.
    for sample in X:
        x = np.array(sample, dtype=self.rdt)
        forward = dct(x, norm='ortho', type=2)
        xi = dct(forward, norm="ortho", type=3)
        assert_equal(xi.dtype, self.rdt)
        assert_array_almost_equal(xi, x, decimal=self.dec)
def dct_2d_ref(x, **kwargs):
    """Calculate reference values for testing dct2.

    Works on a copy of ``x``: the 1-D ``dct`` is applied to every row and
    then to every column, forwarding ``kwargs`` to each call.
    """
    result = np.array(x, copy=True)
    n_rows = result.shape[0]
    for i in range(n_rows):
        result[i, :] = dct(result[i, :], **kwargs)
    n_cols = result.shape[1]
    for j in range(n_cols):
        result[:, j] = dct(result[:, j], **kwargs)
    return result
def test_definition_ortho(self):
    # Orthonormal mode: DCT-II then DCT-III is the identity, and the output
    # dtype is the promotion of float32 with the requested dtype.
    dt = np.result_type(np.float32, self.rdt)
    for sample in X:
        x = np.array(sample, dtype=self.rdt)
        forward = dct(x, norm='ortho', type=2)
        xi = dct(forward, norm="ortho", type=3)
        assert_equal(xi.dtype, dt)
        assert_array_almost_equal(xi, x, decimal=self.dec)
def test_definition_ortho(self):
    """Check the orthonormal DCT-II / DCT-III round trip on every sample."""
    for sample in X:
        x = np.array(sample, dtype=self.rdt)
        y = dct(x, norm='ortho', type=2)
        xi = dct(y, norm="ortho", type=3)
        message = "Output dtype is %s, expected %s" % (xi.dtype, self.rdt)
        self.assertTrue(xi.dtype == self.rdt, message)
        assert_array_almost_equal(xi, x, decimal=self.dec)
def test_axis(self): nt = 2 for i in [7, 8, 9, 16, 32, 64]: x = np.random.randn(nt, i) y = dct(x, type=self.type) for j in range(nt): assert_array_almost_equal(y[j], dct(x[j], type=self.type), decimal=self.dec) x = x.T y = dct(x, axis=0, type=self.type) for j in range(nt): assert_array_almost_equal(y[:, j], dct(x[:, j], type=self.type), decimal=self.dec)
def test_axis(self): nt = 2 for i in [7, 8, 9, 16, 32, 64]: x = np.random.randn(nt, i) y = dct(x, type=self.type) for j in range(nt): assert_array_almost_equal(y[j], dct(x[j], type=self.type), decimal=self.dec) x = x.T y = dct(x, axis=0, type=self.type) for j in range(nt): assert_array_almost_equal(y[:,j], dct(x[:,j], type=self.type), decimal=self.dec)
def gene_mfcc(s, fs, nperseg, filterbank):
    """Compute the STFT, a filter-bank spectrum in dB and MFCCs of a signal.

    The STFT of ``s`` is taken with ``signal.stft``; its magnitude, without
    the last row, is projected through ``filterbank`` and converted with
    ``librosa.amplitude_to_db``. A DCT along axis 0 follows, and rows 1..12
    of it are returned as the MFCCs.

    Returns ``(spec, mspec_db, mfcc)``.
    """
    _freqs, _times, spec = signal.stft(s, fs=fs, nperseg=nperseg)
    magnitude = np.abs(spec[:-1])
    mspec_db = librosa.amplitude_to_db(np.dot(filterbank, magnitude))
    cepstrum = dct(mspec_db, axis=0)
    # Skip coefficient 0 and keep the next twelve.
    return spec, mspec_db, cepstrum[1:13]
def st_mfcc(cur_pos_signal, fbank, nceps):
    """Short-term MFCC.

    Projects ``cur_pos_signal`` onto the transposed filter bank, takes the
    base-10 log (offset by ``EPS``), applies an orthonormal DCT-II along the
    last axis and returns the first ``nceps`` entries of the result.
    """
    band_energies = numpy.dot(cur_pos_signal, fbank.T)
    mspec = numpy.log10(band_energies + EPS)
    cepstrum = dct(mspec, type=2, norm='ortho', axis=-1)
    return cepstrum[:nceps]
def create_mfcc(self):
    """Compute MFCC, delta and delta-delta features and store them on self."""
    # Inner product of the mel filter bank with the spectrum
    # (the original comment describes the spectrum as being in dB).
    mel_energies = np.dot(self.mel_filter_bank, self.spec)
    # Discrete cosine transform along axis 0; keep the first MFCC_DIM rows.
    cepstrum = dct(mel_energies, type=2, norm="ortho", axis=0)
    self.mfcc = cepstrum[:MFCC_DIM]
    # Delta MFCC, then delta-delta MFCC computed from the deltas.
    self.d_mfcc = self.create_delta(self.mfcc, DELTA_LENGTH)
    self.dd_mfcc = self.create_delta(self.d_mfcc, DELTA_LENGTH)
def calc_mfcc(wav, hop, win_length, filterbank):
    """
    Calculate Mel Frequency Cepstrum Coefficients (MFCC).

    Parameters:
        wav : ndarray, real-valued
            Time series of measurement values.
        hop : float
            Hop (overlap) size, forwarded to ``utils.stft``.
        win_length : int
            Window size, forwarded to ``utils.stft``.
        filterbank : ndarray
            Mel filter bank applied to the magnitude spectrum.
    Returns:
        mel_spec : ndarray
            ``filterbank`` applied to the magnitude spectrum without its
            last row.
        mfcc : ndarray
            Orthonormal DCT-II of every column of ``mel_spec``; same shape
            as ``mel_spec``.
    """
    emphasized = utils.pre_emphasis(wav, p=0.97)
    spec = utils.stft(emphasized, hop=hop, win_length=win_length)
    magnitude = np.abs(spec[:-1])
    mel_spec = np.dot(filterbank, magnitude)

    # Transform one column at a time.
    mfcc = np.zeros_like(mel_spec)
    for frame_idx in range(mel_spec.shape[1]):
        column = mel_spec[:, frame_idx]
        mfcc[:, frame_idx] = dct(column, type=2, norm="ortho", axis=-1)
    return mel_spec, mfcc
def stMFCC(X, fbank, nceps):
    """
    Computes the MFCCs of a frame, given the fft mag

    ARGUMENTS:
        X:        fft magnitude abs(FFT)
        fbank:    filter bank (see mfccInitFilterBanks)
        nceps:    number of leading DCT entries kept before filtering zeros
    RETURN
        numpy array holding the non-zero entries of the first row of the
        truncated cepstrum (so its length can be smaller than nceps)

    Note:    MFCC calculation is, in general, taken from the scikits.talkbox
             library (MIT Licence), with a small number of modifications to
             make it more compact and suitable for the pyAudioAnalysis Lib

    NOTE(review): the original indentation of this block was lost, so the
    extent of the first loop is a best guess - confirm against the upstream
    file. The reshape also relies on Python 2 integer division
    (``fbank.size / 2``).
    """
    # Number of surplus filter-bank entries relative to the input size.
    qtdDeleted = fbank.T.size - X.size
    for i in range(0, qtdDeleted):
        # numpy.delete without an axis flattens the array and drops one entry.
        fbank = numpy.delete(fbank.T, [0.])
    # Re-shape what is left into two columns.
    fbank = fbank.reshape((fbank.size / 2), 2)
    mspec = numpy.log10(numpy.dot(X, fbank.T) + eps)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:nceps]
    # Keep only the non-zero values of the first row.
    resp = []
    for i in range(0, ceps[0].size):
        if ceps[0][i] != 0.0:
            resp.append(ceps[0][i])
    return numpy.asarray(resp)
def __dct(self, mspec, nceps):
    """Return the first ``nceps`` entries of the orthonormal DCT-II of ``mspec``.

    The transform runs along the last axis; the slice is taken along the
    first axis of the result.
    """
    cepstrum = realtransforms.dct(mspec, type=2, norm="ortho", axis=-1)
    # Keep only the leading nceps features.
    leading = cepstrum[:nceps]
    return leading
# end of class MFCC
def specPS(input_wav, pitch):
    """Average periodograms over segments of ``input_wav`` and return 50 cepstral values.

    The signal is cut into ``samps = N / pitch`` segments of ``frames``
    samples; ``periodogram`` is evaluated on each and the first component of
    the results is averaged. The log-magnitude of the paired components is
    then passed through ``log10`` and an orthonormal DCT-II, and the first
    50 coefficients are returned.

    NOTE(review): the slicing below only works when ``N / pitch`` is an
    integer (Python 2 integer division) - confirm the target interpreter.
    NOTE(review): ``periodogram`` and ``epsilon`` are defined outside this
    block; the meaning of the two components of its result is not visible here.
    """
    N = len(input_wav)
    samps = N / pitch
    if samps == 0:
        # Signal shorter than one pitch period: treat it as a single segment.
        samps = 1
    frames = N / samps
    data = input_wav[0:frames]
    specs = periodogram(data, nfft=4096)
    # Accumulate the first component of the remaining segments into specs[0].
    for i in range(1, int(samps)):
        data = input_wav[frames * i:frames * (i + 1)]
        peri = periodogram(data, nfft=4096)
        for sp in range(len(peri[0])):
            specs[0][sp] += peri[0][sp]
    # Turn the accumulated sum into a mean.
    for s in range(len(specs[0])):
        specs[0][s] /= float(samps)
    # Natural log of the magnitude of each (specs[0], specs[1]) pair;
    # an all-zero pair is replaced by epsilon to avoid log(0).
    peri = []
    for k, l in zip(specs[0], specs[1]):
        if k == 0 and l == 0:
            peri.append(epsilon)
        else:
            peri.append(math.log(math.sqrt((k**2) + (l**2))))
    # Fix values<=0 to prevent nan.
    # NOTE(review): the guard only triggers when a strictly negative value is
    # present, so a list containing zeros but no negatives is left as is.
    if sum(n < 0 for n in peri) > 0:
        eps = np.finfo(float).eps
        peri = [eps if p <= 0 else p for p in peri]
    # Base-10 log of the values (no filter bank is applied in this function).
    mspec = np.log10(peri)
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)
    return ceps[:50]
def smoothData(self, x, y, weight, nMiss=0):
    '''
    Smooth ``y`` by minimising the module-level ``gcv`` score.

    Parameters
    ----------
    x : sequence
        Only its length is used (number of samples ``n``).
    y : ndarray
        Data to smooth.
    weight : ndarray
        Per-sample values; they are inverted and scaled so the largest
        resulting weight is 1 before being handed to ``gcv``.
    nMiss : int, optional
        Forwarded to ``gcv``, which subtracts it from ``n`` when
        normalising the residual.

    Returns
    -------
    (y_hat, solvedGamma)
        ``y_hat`` is the buffer that ``gcv`` fills in place during the
        optimisation; ``solvedGamma`` is ``exp`` of the optimised parameter.
    '''
    # Public entry point; the scipy.optimize.lbfgsb module is private/deprecated.
    from scipy.optimize import fmin_l_bfgs_b

    n = len(x)
    weight = 1. / weight
    # scale 0 to 1
    weight = weight / np.max(weight)

    i = np.arange(1, n + 1)
    eigenvalues = -2. + 2. * np.cos((i - 1) * np.pi / n)
    eigenvalues2 = eigenvalues ** 2

    # The optimiser works on log(gamma); start from log(gamma) = 1.
    x0 = np.atleast_1d(1.)
    y_hat = np.zeros_like(y)
    xpost, _, _ = fmin_l_bfgs_b(
        gcv, x0, fprime=None, factr=10., approx_grad=True,
        args=(y, weight, eigenvalues2, n, nMiss, y_hat))
    solvedGamma = np.exp(xpost)[0]
    return y_hat, solvedGamma
def mfcc(input, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients from a spectrogram object.

    Parameters
    ----------
    input: object
        Spectrogram container; ``input.data`` holds the spectrogram and
        ``input.metadata.sampling_configuration`` supplies ``dft_length``,
        ``fs``, ``window_length`` and ``window_step``.
    nceps: int
        Number of cepstral coefficients kept per row.

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients (first ``nceps`` columns of the DCT).

    Notes
    -----
    The triangular filter bank from ``trfbank`` is cropped to the number of
    rows of ``input.data``, applied to the spectrogram, floored at 1e-7,
    log10-compressed and transformed with an orthonormal DCT-II.

    This is based on the talkbox module:
    http://pydoc.net/Python/scikits.talkbox/0.2.4.dev/scikits.talkbox.features.mfcc/

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    sampling = input.metadata.sampling_configuration
    nfft = sampling.dft_length
    fs = sampling.fs
    # Window overlap; computed as in the original but not used below.
    over = sampling.window_length - sampling.window_step

    # Filter-bank layout constants.
    lowfreq = 133.33
    linsc = 200/3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    bank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]
    # Crop the transposed bank to the number of spectrogram rows.
    bank = bank.T[0:input.data.shape[0], :]

    energies = np.dot(bank.T, input.data)
    mspec = np.log10(np.maximum(energies, 0.0000001)).T

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    cepstrum = dct(mspec, type=2, norm='ortho', axis=-1)
    return cepstrum[:, :nceps]
def mfcc(f, fs, frameLength, nceps=13):
    """Compute MFCC, delta and delta-delta features for a list of frames.

    ``f`` is an iterable of frame objects exposing ``.data``. The result
    stacks, column-wise, DCT coefficients 1..nceps-1, their ``delta`` and
    the ``delta`` of that delta.
    """
    nfft = frameLength * 2
    # Parameters of the triangular filter bank.
    lowfreq = 133.33
    linsc = 200 / 3.
    logsc = 1.0711703
    # Number of filters.
    nlinfil = 13
    nlogfil = 27
    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)

    # Contents of all frames.
    samples = np.array([frame.data for frame in f])

    # Compute the spectrum magnitude
    magnitude = np.abs(fft(samples, nfft, axis=-1))

    # Filter the spectrum through the triangle filterbank.
    # Silent frames are expected to have been removed upstream by a
    # short-time-energy filter, so zero values should not occur here. If that
    # step is dropped, zeros can break the log; the original author suggests:
    #   epsilon = 1e-6
    #   mspec = np.log10(np.dot(np.maximum(spec, epsilon), fbank.T))
    mspec = np.log10(np.dot(magnitude, fbank.T))

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain);
    # the 2nd to 13th DCT coefficients are usually taken as the MFCCs.
    static = dct(mspec, type=2, norm='ortho', axis=-1)[:, 1:nceps]

    # First-order delta MFCC, reflecting the dynamics of the audio.
    first_delta = delta(static)
    # Second-order delta, computed from the first-order delta.
    second_delta = delta(first_delta)

    # Append both deltas after the static coefficients.
    return np.concatenate((static, first_delta, second_delta), axis=1)
def FFTcoefficient(sig, samplerate=16000, win_length=0.025, win_step=0.01, pre_emphasis_coeff=0.97, NFFT=512):
    '''
    Compute the initial cepstral coefficients: DCT of the log power spectrum.

    :param sig: input signal
    :param samplerate: sampling rate; multiplied by the window length/step
        to get frame sizes in samples
    :param win_length: window length (multiplied by ``samplerate``)
    :param win_step: window step (multiplied by ``samplerate``)
    :param pre_emphasis_coeff: coefficient passed to ``pre_emphasis``
    :param NFFT: FFT size passed to ``spectrum_power``
    :return: orthonormal DCT-II (along axis 1) of the log of the per-frame
        values returned by ``spectrum_power``
    '''
    # Pre-processing.
    emphasized = pre_emphasis(sig, pre_emphasis_coeff)
    # Framing: obtain the array of frames.
    frames = audio2frame(emphasized, win_length * samplerate, win_step * samplerate)
    # Windowing.
    frames *= np.hamming(int(round(win_length * samplerate)))
    # FFT: spectrum values per frame.
    fftfeat = spectrum_power(frames, NFFT)
    # Replace exact zeros so the logarithm below stays finite.
    feat = np.where(fftfeat == 0, np.finfo(float).eps, fftfeat)
    # TODO filtering
    # Take the log of the guarded values (the original logged the raw
    # ``fftfeat``, which threw the zero guard away and produced -inf).
    feat = np.log(feat)
    feat = dct(feat, type=2, axis=1, norm='ortho')
    return feat
def st_mfcc(cur_pos_signal, fbank, nceps):
    """Short-term MFCC.

    Computes ``log10(cur_pos_signal . fbank.T + EPS)``, applies an
    orthonormal DCT-II along the last axis and keeps the first ``nceps``
    entries.
    """
    projected = numpy.dot(cur_pos_signal, fbank.T) + EPS
    log_spectrum = numpy.log10(projected)
    coefficients = dct(log_spectrum, type=2, norm='ortho', axis=-1)
    return coefficients[:nceps]
def get_mfcc(path):
    """Finds the MFCCs of the frames of a WAVE file.

    Args:
        path: The path to a WAVE file.

    Returns:
        A numpy array built from the per-frame MFCC vectors. The per-frame
        FFTs are collected in a local list but are NOT returned.

    NOTE(review): ``step`` is used as a window length and in slice bounds,
    so ``COMP_FRAME_SIZE * sample`` and ``frame_index * step`` must be
    integers - confirm the values of the module constants.
    """
    # Read-only use of the module constant; the declaration is kept as is.
    global COMP_FRAME_SIZE

    # Read the file, and determine its length in frames
    (sample, data) = utils.read_wave_from_file(path)
    total_frames = (data.size / sample) / COMP_FRAME_SIZE
    step = COMP_FRAME_SIZE * sample
    window = hamming(step)

    # Per-frame results: FFTs (collected only) and MFCC vectors (returned)
    fft_out = []
    mfcc_out = []

    # Loop invariant:
    #   0 <= frame_index <= total_frames
    #   results in an array (fft_out) of FFTs that correspond to the
    #   frames of the WAVE file
    filterbank_cache = {}
    frame_index = 0

    while frame_index + (1 - FRAME_OVERLAP_FACTOR) < total_frames:
        # Obtain the frame_indexth frame from the data
        frame = data[frame_index * step : (frame_index + 1) * step]

        # Generate the FFT of the frame windowed by the hamming window
        # (n=256 makes rfft crop or zero-pad the windowed frame to 256 samples)
        frame_fft = numpy.fft.rfft(frame * window, n=256)
        # Replace exact zeros so the later log10 does not see 0
        frame_fft[frame_fft == 0] = 0.000003
        nfft = len(frame_fft)

        # Compute the mel triangular filterbank or get a cached version
        fb_key = (sample, nfft)
        if fb_key in filterbank_cache:
            filterbank = filterbank_cache[fb_key]
        else:
            filterbank = triangular_filters(sample, nfft).T
            # Zero filter weights are replaced by a small constant as well
            filterbank[filterbank == 0] = 0.00003
            filterbank_cache[fb_key] = filterbank

        # Magnitude of the frame's FFT (named "power spectrum" by the author)
        power_spectrum = numpy.abs(frame_fft)
        # Filtered by the mel filterbank
        mel_power_spectrum = numpy.log10(numpy.dot(power_spectrum, filterbank))
        # With the discrete cosine transform to find the cepstrum
        cepstrum = dct(mel_power_spectrum, type=2, norm="ortho", axis=-1)

        fft_out.append(frame_fft)
        # Keep only the leading SIGNIFICANT_MFCC fraction of the coefficients
        mfcc_out.append(cepstrum[: int(len(cepstrum) * SIGNIFICANT_MFCC)])
        # Advance by the overlap factor (frames overlap when it is below 1)
        frame_index = frame_index + FRAME_OVERLAP_FACTOR

    return numpy.array(mfcc_out)
def mfcc(input, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients from a spectrogram object.

    Parameters
    ----------
    input: object
        Spectrogram container; ``input.data`` is the spectrogram and
        ``input.metadata.sampling_configuration`` provides ``dft_length``,
        ``fs``, ``window_length`` and ``window_step``.
    nceps: int
        Number of cepstral coefficients kept per row.

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients.

    Notes
    -----
    Steps: build the triangular filter bank with ``trfbank``, crop it to the
    spectrogram height, filter the spectrogram, floor the result at 1e-7,
    take log10 and apply an orthonormal DCT-II along the last axis.

    This is based on the talkbox module:
    http://pydoc.net/Python/scikits.talkbox/0.2.4.dev/scikits.talkbox.features.mfcc/

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    config = input.metadata.sampling_configuration
    nfft = config.dft_length
    fs = config.fs
    # Overlap between windows; evaluated as in the original, unused afterwards.
    over = config.window_length - config.window_step

    lowfreq = 133.33
    linsc = 200 / 3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    full_bank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]
    n_rows = input.data.shape[0]
    fbank = full_bank.T[0:n_rows, :]

    filtered = np.maximum(np.dot(fbank.T, input.data), 0.0000001)
    mspec = np.log10(filtered).T

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]
    return ceps
def test_definition_matlab(self):
    # Compare the orthonormal DCT-II with the MATLAB reference values.
    for idx, sample in enumerate(X):
        x = np.array(sample, dtype=self.rdt)
        reference = Y[idx]
        y = dct(x, norm="ortho", type=2)
        assert_equal(y.dtype, self.rdt)
        assert_array_almost_equal(y, reference, decimal=self.dec)
def test_definition_matlab(self):
    # Orthonormal DCT-II must match the MATLAB reference outputs, with the
    # input cast to the promotion of float32 and the requested dtype.
    dt = np.result_type(np.float32, self.rdt)
    for sample, reference in zip(X, Y):
        x = np.array(sample, dtype=dt)
        y = dct(x, norm="ortho", type=2)
        assert_equal(y.dtype, dt)
        assert_array_almost_equal(y, reference, decimal=self.dec)
def test_definition_matlab(self):
    """Test correspondence with MATLAB (orthonormal mode)."""
    for idx, sample in enumerate(X):
        x = np.array(sample, dtype=self.rdt)
        reference = Y[idx]
        y = dct(x, norm="ortho", type=2)
        message = "Output dtype is %s, expected %s" % (y.dtype, self.rdt)
        self.assertTrue(y.dtype == self.rdt, message)
        assert_array_almost_equal(y, reference, decimal=self.dec)
def test_definition_ortho(self):
    # Orthonormal DCT-I must agree with the naive reference implementation.
    dt = np.result_type(np.float32, self.rdt)
    for sample in X:
        x = np.array(sample, dtype=self.rdt)
        y = dct(x, norm='ortho', type=1)
        reference = naive_dct1(x, norm='ortho')
        assert_equal(y.dtype, dt)
        # Both sides are scaled by the maximum of the computed result.
        peak = np.max(y)
        assert_array_almost_equal(y / peak, reference / peak, decimal=self.dec)
def test_definition_ortho(self):
    # Orthonormal DCT-IV must agree with the naive reference implementation.
    dt = np.result_type(np.float32, self.rdt)
    for sample in X:
        x = np.array(sample, dtype=self.rdt)
        y = dct(x, norm='ortho', type=4)
        reference = naive_dct4(x, norm='ortho')
        assert_equal(y.dtype, dt)
        # Both sides are scaled by the maximum of the computed result.
        peak = np.max(y)
        assert_array_almost_equal(y / peak, reference / peak, decimal=self.dec)
def test_definition(self):
    """Compare dct against the FFTW reference data for every stored size."""
    for size in FFTWDATA_SIZES:
        x, yr = fftw_ref(self.type, size, self.rdt)
        y = dct(x, type=self.type)
        dtype_message = "Output dtype is %s, expected %s" % (y.dtype, self.rdt)
        self.assertTrue(y.dtype == self.rdt, dtype_message)
        # XXX: we divide by np.max(y) because the tests fail otherwise. We
        # should really use something like assert_array_approx_equal. The
        # difference is due to fftw using a better algorithm w.r.t error
        # propagation compared to the ones from fftpack.
        peak = np.max(y)
        assert_array_almost_equal(y / peak, yr / peak, decimal=self.dec,
                                  err_msg="Size %d failed" % size)
def __compute_mfcc_for_window(self, s, window_index):
    """Fill the per-window buffers for ``s`` and return its cepstrum.

    Stores the FFT magnitude of ``s``, the spectrum weighted by each filter,
    the per-filter sums and their log10 in the corresponding
    ``window_index`` slots, then returns the orthonormal DCT of the
    log-sums.
    """
    magnitude = np.abs(np.fft.fft(s, n=self.__fft_window_length))
    self.__fft_mag[window_index, :] = magnitude

    for band in range(0, self.__nfilters):
        weighted = np.multiply(self.__filter_banks[band, :],
                               self.__fft_mag[window_index, :])
        self.__filtered_spectra[window_index, band, :] = weighted
        # Sum what was actually stored for this filter.
        stored = self.__filtered_spectra[window_index, band, :]
        self.__filtered_spectra_sums[window_index, band] = np.sum(stored)

    band_sums = self.__filtered_spectra_sums[window_index, :]
    self.__filtered_spectra_sums_log[window_index, :] = np.log10(band_sums)
    log_sums = self.__filtered_spectra_sums_log[window_index, :]
    return dct(log_sums, norm='ortho')
def test_definition_matlab(self):
    # Orthonormal DCT-II must reproduce the MATLAB reference values.
    dt = np.result_type(np.float32, self.rdt)
    for idx, sample in enumerate(X):
        x = np.array(sample, dtype=dt)
        reference = Y[idx]
        y = dct(x, norm="ortho", type=2)
        assert_equal(y.dtype, dt)
        assert_array_almost_equal(y, reference, decimal=self.dec)
def mfcc_framed(framed, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients of already framed data.

    Parameters
    ----------
    framed: ndarray
        Framed signal; the FFT is taken along its last axis.
    nfft: int
        FFT length.
    fs: int
        Sampling rate passed to ``trfbank``.
    nceps: int
        Number of cepstral coefficients kept per frame.

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients.
    mspec: ndarray
        Log-spectrum in the mel-domain.
    spec: ndarray
        Magnitude spectrum of the frames.

    Notes
    -----
    The magnitude spectrum is filtered with the triangular filter bank from
    ``trfbank``, log10-compressed and transformed with an orthonormal
    DCT-II.

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    # Filter-bank layout constants.
    lowfreq = 133.33
    linsc = 200 / 3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27
    bank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank
    mspec = np.log10(np.dot(spec, bank.T))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    cepstrum = dct(mspec, type=2, norm='ortho', axis=-1)
    return cepstrum[:, :nceps], mspec, spec
def test_definition(self):
    """Compare dct with the FFTW reference data, including the output dtype."""
    for size in FFTWDATA_SIZES:
        x, yr, dt = fftw_dct_ref(self.type, size, self.rdt)
        y = dct(x, type=self.type)
        assert_equal(y.dtype, dt)
        # XXX: we divide by np.max(y) because the tests fail otherwise. We
        # should really use something like assert_array_approx_equal. The
        # difference is due to fftw using a better algorithm w.r.t error
        # propagation compared to the ones from fftpack.
        peak = np.max(y)
        assert_array_almost_equal(y / peak, yr / peak, decimal=self.dec,
                                  err_msg="Size %d failed" % size)
def cepstrum(input, nceps):
    """
    Calculate cepstral coefficients from the mel spectrum with a DCT.

    Args:
        input: array of log outputs of the mel-scale filterbank
            [N x nmelfilters], where N is the number of frames and
            nmelfilters the length of the filterbank
        nceps: number of output cepstral coefficients

    Output:
        array of cepstral coefficients [N x nceps]

    Note: ``dct`` is called with its default arguments.
    """
    transformed = dct(input)
    # Keep the first nceps columns of every frame.
    return transformed[:, :nceps]
def __init_mfcc(self, num_mel_bands=DEFAULT_MFCC_BANDS, num_mfcc=DEFAULT_NUM_MFCC_COEFFICIENTS, delta_N=DEFAULT_MFCC_DELTA_N):
    """Compute MFCCs plus delta and delta-delta features from ``self.specgram``.

    Parameters
    ----------
    num_mel_bands : int
        Passed to ``self.get_mel_binning_matrix``.
    num_mfcc : int
        Number of DCT coefficients kept per row before coefficient 0 is
        dropped.
    delta_N : int
        Half-width of the regression window used for the deltas.

    Returns
    -------
    (ceps, deltas, delta_deltas)
        Three arrays, each transposed so coefficients run along axis 0, with
        the first coefficient row removed.

    NOTE(review): the deltas are computed after ``np.flipud`` reversed the
    row order, and the rows are flipped back at the end - confirm the
    resulting sign convention is intended.
    """

    def _regression_deltas(frames):
        # Regression-based delta of ``frames`` along axis 0.
        n_frames = len(frames)
        result = np.zeros(frames.shape)
        for idx in range(n_frames):
            # Shrink the window near either edge. Using min() also keeps
            # idx + n inside the array when there are fewer than
            # 2 * delta_N + 1 rows, where the old rule indexed past the end.
            half_width = min(delta_N, idx, n_frames - idx - 1)
            if half_width <= 0:
                continue
            offsets = range(1, half_width + 1)
            weighted = sum([n * (frames[idx + n] - frames[idx - n])
                            for n in offsets])
            result[idx] = weighted / (2.0 * sum([n**2 for n in offsets]))
        return result

    mel_bin_matrix, _ = self.get_mel_binning_matrix(num_mel_bands)
    Pxx2 = np.dot(self.specgram.T, mel_bin_matrix)

    # Unlike the mlab implementation, we threshold and log our FFT magnitudes
    # before returning
    Pxx2[Pxx2 < 1e-10] = 1e-10
    Pxx2 = 10. * np.log10(Pxx2)
    Pxx2[Pxx2 <= 0.0] = 0.0

    # http://pydoc.net/Python/scikits.talkbox/0.2.4.dev/scikits.talkbox.features.mfcc/
    ceps = dct(Pxx2, type=2, norm='ortho', axis=-1)[:, :num_mfcc]
    ceps = np.flipud(ceps)

    deltas = _regression_deltas(ceps)
    delta_deltas = _regression_deltas(deltas)

    # Transpose, drop coefficient 0 and undo the earlier row reversal.
    ceps = np.fliplr(ceps.T[1:])
    deltas = np.fliplr(deltas.T[1:])
    delta_deltas = np.fliplr(delta_deltas.T[1:])
    return ceps, deltas, delta_deltas
def stMFCC(X, fbank, nceps):
    """
    Computes the MFCCs of a frame, given the fft mag

    ARGUMENTS:
        X:        fft magnitude abs(FFT)
        fbank:    filter bank (see mfccInitFilterBanks)
        nceps:    number of leading coefficients to keep
    RETURN
        ceps:     the first nceps entries of the orthonormal DCT-II of the
                  log filter-bank output

    Note:    MFCC calculation is, in general, taken from the scikits.talkbox
             library (MIT Licence), with a small number of modifications to
             make it more compact and suitable for the pyAudioAnalysis Lib
    """
    band_energies = numpy.dot(X, fbank.T)
    mspec = numpy.log10(band_energies + eps)
    cepstrum = dct(mspec, type=2, norm='ortho', axis=-1)
    return cepstrum[:nceps]
def short_term_MFCC(X, fbank, nceps):
    """
    Calculate the MFCCs of a frame, given the fft mag

    ARGUMENTS:
        X:        fft magnitude abs(FFT)
        fbank:    filter bank (see mfcc_init_filter_banks)
        nceps:    number of leading coefficients to keep
    RETURN
        ceps:     the first nceps entries of the orthonormal DCT-II of the
                  log filter-bank output

    Note:    MFCC calculation is, in general, taken from the scikits.talkbox
             library (MIT Licence), with a small number of modifications to
             make it more compact and suitable for the pyAudioAnalysis Lib
    """
    filtered = numpy.dot(X, fbank.T) + eps
    log_filtered = numpy.log10(filtered)
    coefficients = dct(log_filtered, type=2, norm='ortho', axis=-1)
    return coefficients[:nceps]
def mfcc(fft_magnitude, fbank, num_mfcc_feats):
    """
    Computes the MFCCs of a frame, given the fft mag

    ARGUMENTS:
        fft_magnitude:  fft magnitude abs(FFT)
        fbank:          filter bank (see mfccInitFilterBanks)
        num_mfcc_feats: number of leading coefficients to keep
    RETURN
        ceps:           the first num_mfcc_feats entries of the orthonormal
                        DCT-II of the log filter-bank output

    Note:    MFCC calculation is, in general, taken from the scikits.talkbox
             library (MIT Licence), with a small number of modifications to
             make it more compact and suitable for the pyAudioAnalysis Lib
    """
    band_energies = np.dot(fft_magnitude, fbank.T)
    log_energies = np.log10(band_energies + eps)
    cepstrum = dct(log_energies, type=2, norm='ortho', axis=-1)
    return cepstrum[:num_mfcc_feats]
def mfcc(x, framesize=1024, hopsize=512, fs=44100, window="hamming", min_freq=0, max_freq=22050, n_mel_bands=40, n_ceps=13, preemp=False):
    """
    Calculate MFCC
    @param x input signal
    @param framesize STFT frame size
    @param hopsize STFT hop size
    @param fs sampling rate
    @param window type of window function
    @param min_freq minimum frequency of mel filterbank
    @param max_freq maximum frequency of mel filterbank
    @param n_mel_bands number of channels of mel filterbank
    @param n_ceps number of coefficients
    @param preemp flag for using pre-emphasis
    @return (MFCC coefficients, center frequencies)
    """
    # Optional pre-emphasis with a fixed coefficient.
    if preemp:
        coef = 0.97
        source = _pre_emphasis(x, coef)
    else:
        source = x

    # Mel-scale spectrogram, log-compressed with a small offset.
    mel_spe, center_freqs = mel_spectrogram(source, framesize, hopsize, fs,
                                            window, min_freq, max_freq,
                                            n_mel_bands)
    log_mel = sp.log10(mel_spe + 1e-10)

    # DCT (conversion to the cepstrum, i.e. MFCC).
    ceps = dct(log_mel, type=2, norm="ortho", axis=-1)[:, :n_ceps]

    # NaN check, then inf check.
    ceps = feature.check_nan_2d(ceps)
    ceps = feature.check_inf_2d(ceps)
    return ceps, center_freqs
def gcv(gamma_, y, weight, eigenvalues2, n, nMiss, y_hat_final):
    """GCV score of the DCT-based smoother for ``gamma = exp(gamma_)``.

    The smoothed series is found by fixed-point iteration (stopping once the
    mean squared change is at most 1e-10) and written into ``y_hat_final``
    in place. The returned score is the weighted mean squared residual
    divided by ``(1 - trace/n)**2``.
    """
    from scipy.fftpack.realtransforms import dct, idct

    gamma = np.exp((gamma_))
    # Per-coefficient attenuation applied in the DCT domain.
    attenuation = 1. / (1 + gamma * eigenvalues2)

    current = y.copy()
    change = 1e20
    while change > 1e-10:
        blended = weight * weight * (y - current) + current
        spectrum = dct(blended, norm='ortho', type=2)
        y_hat = idct(attenuation * spectrum, norm='ortho', type=2)
        step = y_hat - current
        change = np.mean(step * step)
        current = y_hat

    # Hand the converged estimate back through the caller's buffer.
    y_hat_final[:] = y_hat

    residual = weight * (y_hat - y)
    numerator = np.dot(residual, residual) / (n - nMiss)
    traceH = attenuation.sum()
    denominator = (1 - traceH / n) ** 2
    return numerator / denominator
def arspecs(input_wav, order, Atal=False):
    """Return 30 coefficients derived from an AR model of ``input_wav``.

    With ``Atal`` set, the result of ``atal(input_wav, order, 30)`` is
    returned directly. Otherwise the two components returned by
    ``arspec`` are combined into a log-magnitude, exact zeros are replaced
    by a small constant, and the first 30 entries of the orthonormal DCT-II
    of ``log10`` of those values are returned.
    """
    epsilon = 0.0000000001
    if Atal:
        return atal(input_wav, order, 30)

    spectrum = arspec(input_wav, order, 4096)
    log_magnitude = [math.log(math.sqrt((k**2) + (l**2)))
                     for k, l in zip(spectrum[0], spectrum[1])]
    # Replace exact zeros before taking log10.
    log_magnitude = [epsilon if value == 0.0 else value
                     for value in log_magnitude]
    mspec1 = np.log10(log_magnitude)
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    cepstrum = dct(mspec1, type=2, norm='ortho', axis=-1)
    return cepstrum[:30]
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients of a signal.

    Parameters
    ----------
    input: ndarray
        Signal the coefficients are computed from.
    nwin: int
        Window length in samples.
    nfft: int
        FFT length.
    fs: int
        Sampling rate passed to ``trfbank``.
    nceps: int
        Number of cepstral coefficients kept per frame.

    Returns
    -------
    ceps: ndarray
        First ``nceps`` columns of the orthonormal DCT-II of the log
        filter-bank output.

    Notes
    -----
    Non-positive filter-bank weights and spectrum magnitudes are raised to
    small positive constants element-wise so the logarithm stays finite.
    """
    import numpy as numpy
    from scipy.signal import hamming
    from scipy.fftpack import fft
    from scipy.fftpack.realtransforms import dct

    over = nwin - 160
    prefac = 0.97

    # Filter-bank layout constants.
    lowfreq = 133.33
    linsc = 200/3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    w = hamming(nwin, sym=0)

    fbank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]
    # Element-wise floor. The former ``if fbank<=0:`` tested a whole array in
    # a boolean context, which raises for any multi-element filter bank.
    fbank = numpy.where(fbank <= 0, 0.0001, fbank)

    #------------------
    # Compute the MFCC
    #------------------
    extract = preemp(input, prefac)
    framed = segment_axis(extract, nwin, over) * w

    # Compute the spectrum magnitude
    spec = numpy.abs(fft(framed, nfft, axis=-1))
    # Same element-wise floor for the magnitudes.
    spec = numpy.where(spec <= 0, 0.00001, spec)

    # Filter the spectrum through the triangle filterbank
    arr = numpy.dot(spec, fbank.T)
    # Debug output kept from the original; this form prints the same text
    # under both Python 2 and Python 3.
    print("LOG ARRAy = %s" % (arr,))
    mspec = numpy.log10(arr)

    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps]
    return ceps
def __init_mfcc(self, num_mel_bands = DEFAULT_MFCC_BANDS, num_mfcc = DEFAULT_NUM_MFCC_COEFFICIENTS, delta_N = DEFAULT_MFCC_DELTA_N):
    """Compute MFCCs plus delta and delta-delta features from ``self.specgram``.

    Parameters
    ----------
    num_mel_bands : int
        Passed to ``self.get_mel_binning_matrix``.
    num_mfcc : int
        Number of DCT coefficients kept per row before coefficient 0 is
        dropped.
    delta_N : int
        Half-width of the regression window used for the deltas.

    Returns
    -------
    (ceps, deltas, delta_deltas)
        Three arrays, each transposed so coefficients run along axis 0, with
        the first coefficient row removed.

    NOTE(review): the deltas are computed after ``np.flipud`` reversed the
    row order, and the rows are flipped back at the end - confirm the
    resulting sign convention is intended.
    """

    def _regression_deltas(frames):
        # Regression-based delta of ``frames`` along axis 0.
        total = len(frames)
        out = np.zeros(frames.shape)
        for pos in range(total):
            # Shrink the window near either edge. Using min() also keeps
            # pos + n inside the array when there are fewer than
            # 2 * delta_N + 1 rows, where the old rule indexed past the end.
            width = min(delta_N, pos, total - pos - 1)
            if width <= 0:
                continue
            steps = range(1, width + 1)
            numerator = sum([n*(frames[pos + n] - frames[pos - n]) for n in steps])
            out[pos] = numerator / (2.0*sum([n**2 for n in steps]))
        return out

    mel_bin_matrix, _ = self.get_mel_binning_matrix(num_mel_bands)
    Pxx2 = np.dot(self.specgram.T, mel_bin_matrix)

    # Unlike the mlab implementation, we threshold and log our FFT magnitudes
    # before returning
    Pxx2[Pxx2 < 1e-10] = 1e-10
    Pxx2 = 10. * np.log10(Pxx2)
    Pxx2[Pxx2 <= 0.0] = 0.0

    # http://pydoc.net/Python/scikits.talkbox/0.2.4.dev/scikits.talkbox.features.mfcc/
    ceps = dct(Pxx2, type=2, norm='ortho', axis=-1)[:, :num_mfcc]
    ceps = np.flipud(ceps)

    deltas = _regression_deltas(ceps)
    delta_deltas = _regression_deltas(deltas)

    # Transpose, drop coefficient 0 and undo the earlier row reversal.
    ceps = np.fliplr(ceps.T[1:])
    deltas = np.fliplr(deltas.T[1:])
    delta_deltas = np.fliplr(delta_deltas.T[1:])
    return ceps, deltas, delta_deltas
def test_definition(self):
    """Check that idct of the FFTW reference output recovers the reference input."""
    for size in FFTWDATA_SIZES:
        xr, yr = fftw_ref(self.type, size, self.rdt)
        # Forward transform kept from the original; its result is not used.
        y = dct(xr, type=self.type)
        x = idct(yr, type=self.type)
        # Undo the scaling of the unnormalised transform pair.
        scale = 2 * (size - 1) if self.type == 1 else 2 * size
        x /= scale
        dtype_message = "Output dtype is %s, expected %s" % (x.dtype, self.rdt)
        self.assertTrue(x.dtype == self.rdt, dtype_message)
        # XXX: we divide by np.max(x) because the tests fail otherwise. We
        # should really use something like assert_array_approx_equal. The
        # difference is due to fftw using a better algorithm w.r.t error
        # propagation compared to the ones from fftpack.
        peak = np.max(x)
        assert_array_almost_equal(x / peak, xr / peak, decimal=self.dec,
                                  err_msg="Size %d failed" % size)
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13):
    """Compute Mel Frequency Cepstral Coefficients.

    Parameters
    ----------
    input: ndarray
        input from which the coefficients are computed
    nwin: int
        window length in samples
    nfft: int
        FFT length
    fs: int
        sampling rate passed to ``trfbank``
    nceps: int
        number of cepstral coefficients kept per frame

    Returns
    -------
    ceps: ndarray
        Mel-cepstrum coefficients
    mspec: ndarray
        Log-spectrum in the mel-domain.
    spec: ndarray
        Magnitude spectrum of the windowed frames.

    Notes
    -----
    MFCC are computed as follows:
        * Pre-processing in time-domain (pre-emphasizing)
        * Compute the spectrum amplitude by windowing with a Hamming window
        * Filter the signal in the spectral domain with a triangular
          filter-bank, whose filters are approximately linearly spaced on
          the mel scale, and have equal bandwidth in the mel scale
        * Compute the DCT of the log-spectrum

    References
    ----------
    .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric
           representations for monosyllabic word recognition in continuously
           spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc.
           ASSP-28 (4): 357-366, August 1980."""
    # MFCC parameters: taken from auditory toolbox
    over = nwin - 160
    # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the
    # radiation at the lips level)
    prefac = 0.97
    lowfreq = 133.33
    linsc = 200/3.
    logsc = 1.0711703
    nlinfil = 13
    nlogfil = 27

    window = hamming(nwin, sym=0)
    bank = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil)[0]

    # Pre-emphasise, cut into overlapping frames and apply the window.
    emphasized = preemp(input, prefac)
    framed = segment_axis(emphasized, nwin, over) * window

    # Compute the spectrum magnitude
    spec = np.abs(fft(framed, nfft, axis=-1))
    # Filter the spectrum through the triangle filterbank
    mspec = np.log10(np.dot(spec, bank.T))
    # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain)
    cepstrum = dct(mspec, type=2, norm='ortho', axis=-1)
    return cepstrum[:, :nceps], mspec, spec
def mfcc(input, nwin=256, nfft=512, fs=16000, nceps=13): """Compute Mel Frequency Cepstral Coefficients. Parameters ---------- input: ndarray input from which the coefficients are computed Returns ------- ceps: ndarray Mel-cepstrum coefficients mspec: ndarray Log-spectrum in the mel-domain. Notes ----- MFCC are computed as follows: * Pre-processing in time-domain (pre-emphasizing) * Compute the spectrum amplitude by windowing with a Hamming window * Filter the signal in the spectral domain with a triangular filter-bank, whose filters are approximatively linearly spaced on the mel scale, and have equal bandwith in the mel scale * Compute the DCT of the log-spectrum References ---------- .. [1] S.B. Davis and P. Mermelstein, "Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences", IEEE Trans. Acoustics. Speech, Signal Proc. ASSP-28 (4): 357-366, August 1980.""" # Number of overlapping samples in each frame t_overlap = 10*10**(-3) # Time in seconds of overlapping between frames over = int(t_overlap*fs) # over = nwin - 160 # Pre-emphasis factor (to take into account the -6dB/octave rolloff of the # radiation at the lips level) prefac = 0.97 #lowfreq = 400 / 3. lowfreq = 133.33 #highfreq = 6855.4976 linsc = 200/3. logsc = 1.0711703 nlinfil = 13 nlogfil = 27 nfil = nlinfil + nlogfil w = hamming(nwin, sym=0) [fbank, freqs] = trfbank(fs, nfft, lowfreq, linsc, logsc, nlinfil, nlogfil) # "fbank" is a nfil-by-nfft Numpy 2D array. 
''' # Visualizando o banco de filtros: plt.figure() nfiltros,lenfiltros = fbank.shape for i in range(nfiltros): plt.plot(range(lenfiltros),fbank[i,:]) plt.axis([0, lenfiltros, 0, np.max(fbank)]) plt.show() ''' #------------------ # Compute the MFCC #------------------ extract = preemp(input, prefac) framed = segment_axis(extract, nwin, over) * w # Compute the spectrum magnitude spec = np.abs(fft(framed, nfft, axis=-1)) # Filter the spectrum through the triangle filterbank mspec = np.log10(np.dot(spec, fbank.T)) # Use the DCT to 'compress' the coefficients (spectrum -> cepstrum domain) ceps = dct(mspec, type=2, norm='ortho', axis=-1)[:, :nceps] nframes = ceps.shape[0] print 'nframes: ', nframes # ----------------------------------------- # Cepstrum mean subtraction # mean_along_frames = np.mean(mspec,axis=0) # Mean along the vertical dimension of the mel-spectrum # # mean_along_frames_stack = mean_along_frames # for i in range(nframes-1): # mean_along_frames_stack = np.vstack((mean_along_frames_stack, mean_along_frames)) # ceps = ceps - mean_along_frames_stack[:,0:nceps] return ceps, mspec, spec
def test_dct_complex64(self): y = dct(1j * np.arange(5, dtype=np.complex64)) x = 1j * dct(np.arange(5)) assert_array_almost_equal(x, y)
def MFCC(X, fbank, nceps):
    """Return the first ``nceps`` MFCC values of ``X``.

    ``X`` is projected onto the transposed filter bank, offset by ``eps``,
    log10-compressed and transformed with an orthonormal DCT-II along the
    last axis.
    """
    filtered = np.dot(X, fbank.T) + eps
    log_filtered = np.log10(filtered)
    coefficients = dct(log_filtered, type=2, norm='ortho', axis=-1)
    return coefficients[:nceps]
def test_dct_complex(self): y = dct(np.arange(5) * 1j) x = 1j * dct(np.arange(5)) assert_array_almost_equal(x, y)