def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    # A falsy highfreq (None or 0) falls back to the Nyquist frequency.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    # An all-zero spectrum would make the normalisation below 0/0, so clamp
    # exact zeros to the smallest positive float step.
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec)

    # Forward the band edges; previously they were accepted but ignored.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # filterbank energies (the normaliser)

    # Weight for every FFT bin, running linearly from 1 up to samplerate/2.
    # Broadcasting against pspec replaces the explicit numpy.tile.
    bin_weights = numpy.linspace(1,samplerate/2,numpy.size(pspec,1))
    return numpy.dot(pspec*bin_weights,fb.T) / feat
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    # A falsy highfreq (None or 0) falls back to the Nyquist frequency.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems

    # Forward the band edges; previously they were accepted but ignored,
    # so the filterbank always used get_filterbanks' own defaults.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies

    # Weight for every FFT bin, running linearly from 1 up to samplerate/2.
    # Broadcasting against pspec replaces the explicit numpy.tile.
    bin_weights = numpy.linspace(1,samplerate/2,numpy.size(pspec,1))
    return numpy.dot(pspec*bin_weights,fb.T) / feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (summed power spectrum). Exact zeros in either output are replaced by
        the float epsilon so that callers can take a log safely.
    """
    # A falsy highfreq (None or 0) falls back to the Nyquist frequency.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    # Forward the band edges; previously they were accepted but ignored.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (summed power spectrum). Exact zeros in either output are replaced by
        the float epsilon so that callers can take a log safely.
    """
    # A falsy highfreq (None or 0) falls back to the Nyquist frequency.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    # Forward the band edges; previously they were accepted but ignored,
    # so the filterbank always used get_filterbanks' own defaults.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute filterbank energies and per-frame total energy of a signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal, used to convert winlen/winstep to samples.
    :param winlen: the length of the analysis window in seconds.
    :param winstep: the step between successive windows in seconds.
    :param nfilt: the number of filters requested from get_filterbanks.
    :param nfft: the FFT size passed to sigproc.powspec and get_filterbanks.
    :param lowfreq: lowest band edge handed to get_filterbanks.
    :param highfreq: highest band edge handed to get_filterbanks; a falsy value means samplerate/2.
    :param preemph: coefficient passed to sigproc.preemphasis.
    :param winfunc: callable handed to sigproc.framesig as the window function. The default returns an
        all-ones vector of the requested length.
    :returns: 2 values: the filterbank energies (power spectrum projected onto the filterbank) and the summed power
        spectrum of each frame. Exact zeros in either are replaced by the float epsilon so a later log is safe.
    """
    if not highfreq:
        highfreq = samplerate/2
    tiny = numpy.finfo(float).eps

    emphasized = sigproc.preemphasis(signal,preemph)
    framed = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(framed,nfft)

    # Total energy of each frame, with zeros clamped.
    frame_energy = numpy.sum(power,1)
    frame_energy = numpy.where(frame_energy == 0,tiny,frame_energy)

    # Filterbank energies, with zeros clamped.
    bank = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    band_energy = numpy.dot(power,bank.T)
    band_energy = numpy.where(band_energy == 0,tiny,band_energy)

    return band_energy,frame_energy
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Compute MFCC features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: value handed to lifter() for the final cepstral coefficients. Default is 22.
    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :returns: 3 values. The first is the cepstral feature array (first numcep DCT coefficients per frame, after liftering). The
        second is the result of sigproc.magspec on the frames. The third is the log filterbank energies that were fed to the DCT.
    """
    # NOTE(review): the original author's notes say nfft should be even, or the
    # next power of two at or above the window length in samples; this is not
    # enforced here. They also say the power spectrum only covers the unique
    # half of the FFT. Confirm both against sigproc.
    if not highfreq:
        highfreq = samplerate/2
    tiny = numpy.finfo(float).eps

    emphasized = sigproc.preemphasis(signal,preemph)
    # 'hamm' is passed straight through as framesig's window argument.
    framed = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate,'hamm')
    pspec = sigproc.powspec(framed,nfft)
    mspec = sigproc.magspec(framed,nfft)

    # Total energy per frame, clamped so the log below is defined.
    energy = numpy.sum(pspec,1)
    energy = numpy.where(energy == 0,tiny,energy)

    # Filterbank energies, clamped for the same reason.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    band_energy = numpy.dot(pspec,fb.T)
    band_energy = numpy.where(band_energy == 0,tiny,band_energy)
    logmelspec = numpy.log(band_energy)

    # Orthonormal DCT-II across the filter axis; keep the first numcep columns.
    cepstra = dct(logmelspec, type=2, axis=1, norm='ortho')[:,:numcep]
    cepstra = lifter(cepstra,ceplifter)
    if appendEnergy:
        cepstra[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
    return cepstra,mspec,logmelspec
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from features.sigproc import preemphasis, framesig, magspec, powspec

# Read the recording and derive MFCC and log-filterbank features from it.
(rate,sig) = wav.read('../Data/roycer/roycer.wav')
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig, rate)

# NOTE(review): these spectra are computed over the MFCC matrix with an FFT
# size of 1, not over the audio frames. Confirm this is intended.
magspec_result = magspec(mfcc_feat,1)
powspec_result = powspec(mfcc_feat, 1)
sig2 = preemphasis(sig,0.95)

# print() with a single argument behaves the same on Python 2 and Python 3;
# the former print statements were syntax errors on Python 3.
print(mfcc_feat)
plt.hist(mfcc_feat)
#print(mfcc_feat[0:12,0:12])
#print(fbank_feat[0:12,0:12])
#print(magspec_result)
print(powspec_result)

# Counters initialised to zero; presumably per-emotion tallies (angry, happy,
# sad, neutral). Their use is not visible in this part of the file.
enojado = 0
feliz = 0
tristes = 0
normal = 0
def logFilterbankFeatures(signal, samplerate=16000, winlen=0.0255, winstep=0.01,
                          nfilt=40, nfft=512, lowfreq=133.3333, highfreq=6855.4976,
                          preemph=0.97, winSzForDelta=2):
    '''
    Compute log filterbank energies plus log total energy, with deltas.

    The signal is pre-emphasised, cut into frames windowed with `hamming`,
    turned into a power spectrum and projected onto the filters returned by
    features.get_filterbanks. The log of the per-frame total energy is placed
    in front of the log filter energies, and first- and second-order deltas of
    that matrix are appended.

    The core steps are adapted from features.fbank of the package
    'python_speech_features'; per the original author it is re-implemented
    here because that function did not accept a window function. See
    http://python-speech-features.readthedocs.org/en/latest/ or
    https://github.com/jameslyons/python_speech_features

    :parameters:
        - signal : np.ndarray, dtype=float
            input vector of the speech signal
        - samplerate : int
        - winlen: float
            length of analysis window in seconds
        - winstep: float
            step size between successive windows in seconds
        - nfilt: int
            number of filter energies to compute (total energy not included).
            e.g. 40 --> Output dim = (40+1)*3
        - nfft: int
            FFT size
        - lowfreq: float
            lower band edge passed to features.get_filterbanks
        - highfreq: float
            upper band edge passed to features.get_filterbanks;
            a falsy value means samplerate / 2
        - preemph: float
            pre-emphasis coefficient
        - winSzForDelta: int
            window size passed to delta(). E.g. 2 --> t-2, t-1, t+1 and t+2
            are used for calculating the deltas
    :returns:
        - features: numpy.array: float
            feature-matrix. 1st dimension: time steps of 'winstep',
            2nd dim: feature dimension: (nfilt + 1)*3,
            +1 for energy, *3 because of deltas
    '''
    if not highfreq:
        highfreq = samplerate / 2
    tiny = np.finfo(float).eps

    emphasized = sigproc.preemphasis(signal, preemph)
    framed = sigproc.framesig(emphasized, winlen * samplerate, winstep * samplerate,
                              winfunc=hamming)
    power = sigproc.powspec(framed, nfft)

    # Total energy per frame; exact zeros are clamped so the log is defined.
    frame_energy = np.sum(power, 1)
    frame_energy = np.where(frame_energy == 0, tiny, frame_energy)

    # Filterbank energies, clamped for the same reason.
    bank = features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = np.dot(power, bank.T)
    band_energy = np.where(band_energy == 0, tiny, band_energy)

    # Static features: log energy in column 0, followed by log filter energies.
    static = np.column_stack((np.log(frame_energy), np.log(band_energy)))

    # Delta and acceleration (delta of the delta).
    velocity = delta(static, winSzForDelta)
    acceleration = delta(velocity, winSzForDelta)

    return np.concatenate((static, velocity, acceleration), axis=1)
def logFilterbankFeatures(signal,samplerate=16000,winlen=0.0255,winstep=0.01,
                          nfilt=40,nfft=512,lowfreq=133.3333,highfreq=6855.4976,preemph=0.97,
                          winSzForDelta=2):
    '''
    Compute log filterbank energies plus log total energy, with deltas.

    The signal is pre-emphasised, cut into frames windowed with `hamming`,
    turned into a power spectrum and projected onto the filters returned by
    features.get_filterbanks. The log of the per-frame total energy is placed
    in front of the log filter energies, and first- and second-order deltas of
    that matrix are appended.

    The core steps are adapted from features.fbank of the package
    'python_speech_features'; per the original author it is re-implemented
    here because that function did not accept a window function. See
    http://python-speech-features.readthedocs.org/en/latest/ or
    https://github.com/jameslyons/python_speech_features

    :parameters:
        - signal : np.ndarray, dtype=float
            input vector of the speech signal
        - samplerate : int
        - winlen: float
            length of analysis window in seconds
        - winstep: float
            step size between successive windows in seconds
        - nfilt: int
            number of filter energies to compute (total energy not included).
            e.g. 40 --> Output dim = (40+1)*3
        - nfft: int
            FFT size
        - lowfreq: float
            lower band edge passed to features.get_filterbanks
        - highfreq: float
            upper band edge passed to features.get_filterbanks;
            a falsy value means samplerate/2
        - preemph: float
            pre-emphasis coefficient
        - winSzForDelta: int
            window size passed to delta(). E.g. 2 --> t-2, t-1, t+1 and t+2
            are used for calculating the deltas
    :returns:
        - features: numpy.array: float
            feature-matrix. 1st dimension: time steps of 'winstep',
            2nd dim: feature dimension: (nfilt + 1)*3,
            +1 for energy, *3 because of deltas
    '''
    if not highfreq:
        highfreq = samplerate/2
    floor = np.finfo(float).eps

    preemphasized = sigproc.preemphasis(signal,preemph)
    windowed = sigproc.framesig(preemphasized, winlen*samplerate, winstep*samplerate,
                                winfunc=hamming)
    spectrum = sigproc.powspec(windowed,nfft)

    # Per-frame total energy; exact zeros are clamped so the log is defined.
    total = np.sum(spectrum,1)
    total = np.where(total == 0,floor,total)

    # Filterbank energies, clamped for the same reason.
    filters = features.get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    banded = np.dot(spectrum,filters.T)
    banded = np.where(banded == 0,floor,banded)

    # Static features: log energy in column 0, followed by log filter energies.
    base = np.column_stack((np.log(total),np.log(banded)))

    # First-order deltas, then deltas of those (acceleration).
    d1 = delta(base, winSzForDelta)
    d2 = delta(d1, winSzForDelta)

    return np.concatenate((base,d1,d2),axis=1)