def MFCC_SVM(audioFileName, model, DEBUG):
    '''
    Calculate MFCC features from audio frames and use a trained SVM for
    frame-by-frame prediction:
        1. divide the audio file into overlapping frames
        2. window each frame
        3. compute MFCC (+ delta, + delta-delta) features per frame
        4. predict each frame with the SVM trained on those features

    Input:
        audioFileName : path of the test audio file (WAV, read via `read`)
        model         : trained SVM model passed through to `framePrediction`
        DEBUG         : if truthy, plot the waveform and the per-frame
                        SVM score/label tracks
    Output:
        label : list of booleans, one per predicted frame (p_label > 0)
        tag   : list of floats, SVM decision value per predicted frame
    '''
    fs, audio = read (audioFileName)
    # Amplitude normalization. NOTE(review): 4000 looks like an empirical
    # scale for this data set, not the usual 32768 for 16-bit PCM — confirm.
    audio = audio/float(4000)
    # frame duration in ms
    frame_length = 25
    # overlap duration in ms
    frame_overlap = 10
    N = len (audio)
    nsample = round(frame_length*fs/1000)    # samples per frame
    noverlap = round(frame_overlap*fs/1000)  # overlapping samples
    # FFT length (zero-padded to twice the frame length)
    NFFT = 2*nsample
    # Hanning window
    window = hann(nsample)
    offset = nsample-noverlap                # hop size between frames
    max_m = round((N-NFFT)/offset)           # number of whole frames
    numFilter = 26
    fl = 0*fs                                # mel filterbank low edge (0 Hz)
    fh = 0.5*fs                              # mel filterbank high edge (Nyquist)
    melFilBank = util.melFilterBank(numFilter, int(NFFT/2), fs, fl, fh)
    coeffs = 13                              # MFCC coefficients kept per frame
    N1 = 5                                   # window length for delta coefficients
    flag = 0                                 # becomes 1 once delta features start
    count = 0                                # number of delta frames produced so far
    flag2 = 0                                # becomes 1 once delta-delta features start
    frames = int(max_m)
    mQ = []                                  # sliding queue of recent MFCC vectors
    dQ = []                                  # sliding queue of recent delta vectors
    tag = []                                 # SVM decision values, one per prediction
    ZCR = []                                 # sliding queue of zero-crossing rates
    F0 = []                                  # sliding queue of fundamental frequencies
    label = []                               # boolean predictions
    alpha = 0                                # pre-emphasis factor (0 => pre-emphasis is a no-op)
    for m in range(0, frames):
        begin = m*offset
        iend = m*offset + nsample
        frame = audio[begin:iend]
        Frame = preEmphasis(frame, alpha)
        magy = powerSpectrum(Frame, window, NFFT)
        # Keep only the non-redundant half of the power spectrum.
        mfccfeature = util.mfccFeature(magy[0:int(NFFT/2)], melFilBank, coeffs)
        # Per-coefficient max-abs normalization of the MFCC vector.
        mfccfeature = mfccfeature/np.absolute(mfccfeature).max(0)
        mQ.append(mfccfeature)
        zcr_temp = zeroCrossingRate(Frame, window)
        f0 = fundamentalFreq(Frame, window, fs)
        ZCR.append(zcr_temp)
        F0.append(f0)
        # Once N1 MFCC frames are queued (m == N1-1), and on every frame
        # thereafter (flag stays 1), compute delta coefficients over the queue.
        if m%N1 == N1-1 or flag == 1:
            MD = util.deltaCoefficients(np.asarray(mQ))
            MD = MD/np.absolute(MD).max(0)
            dQ.append(MD)
            # Slide the queues: drop the oldest entry of each.
            mQ.pop(0)
            ZCR.pop(0)
            F0.pop(0)
            flag = 1
            # Once N1 delta frames are queued (count == N1-1), and on every
            # delta frame thereafter, compute delta-delta and predict.
            if count%N1 == N1-1 or flag2 == 1:
                MDD = util.deltaCoefficients(np.asarray(dQ))
                MDD = MDD/np.absolute(MDD).max(0)
                dQ.pop(0)
                # Feature vector: oldest queued MFCC, middle queued delta,
                # current delta-delta -> 3*coeffs values, then 4 scalar extras.
                feature = [mQ[0], dQ[2], MDD]
                feature1 = np.reshape(np.vstack(feature), 3*coeffs)
                feature2 = feature1.tolist()
                feature2.append(ZCR[0])
                feature2.append(F0[0])
                feature2.append(ZCR[0]-ZCR[1])  # first difference of ZCR
                feature2.append(F0[0]-F0[1])    # first difference of F0
                p_label, p_val = framePrediction(feature2, model)
                tag.append(p_val[0][0])
                label.append(p_label[0]>0)
                flag2 = 1
            count = count+1
    if (DEBUG):
        # Frame-center times for the whole file.
        T = np.arange(round(nsample/2), N-1-round(nsample/2), (nsample-noverlap))/fs;
        t = np.linspace(0, N, N)/fs;
        # Predictions start at frame index 8 (first delta at m = N1-1 = 4,
        # first delta-delta N1-1 = 4 delta frames later), hence the offset.
        L2 = T [8:len(tag)+8];
        plt.subplot(3, 1, 1)
        plt.plot(t, audio)
        plt.xlabel("Time")
        plt.ylabel("Amplitude")
        plt.title(audioFileName)
        plt.xlim([0, T[-1]])
        plt.subplot(3, 1, 2)
        plt.plot(L2, tag, color = 'r')
        plt.xlabel("Time")
        plt.ylabel("SVM-tag")
        plt.ylim([-1.2, 1.2])
        plt.xlim([0, T[-1]])
        plt.subplot(3, 1, 3)
        plt.plot(L2, label, color = 'g')
        plt.xlabel("Time")
        plt.ylabel("SVM-tag")
        plt.ylim([-1.2, 1.2])
        plt.xlim([0, T[-1]])
        plt.show()
    return label, tag
def MFCCfeatureExtraction(audioFileName, trainingDataFile, label, DEBUG):
    '''
    MFCC feature extraction for training data generation.

    Runs the same framing / MFCC / delta / delta-delta pipeline as
    `MFCC_SVM`, but instead of predicting, writes each assembled feature
    vector (with the given class label) to the training data file.

    Input:
        audioFileName    : path of the audio file (WAV, read via `read`)
        trainingDataFile : destination file for training rows; if falsy,
                           nothing is written (features are still computed)
        label            : class label written alongside every feature row
        DEBUG            : if truthy, plot the waveform and the first three
                           MFCC / delta / delta-delta coefficient tracks
    Output:
        None (side effect: rows appended via `writetoFile`)
    '''
    fs, audio = read (audioFileName)
    # Amplitude normalization. NOTE(review): 4000 looks like an empirical
    # scale for this data set, not the usual 32768 for 16-bit PCM — confirm.
    audio = audio/float(4000)
    # frame duration in ms
    frame_length = 25
    # overlap duration in ms
    frame_overlap = 10
    N = len (audio)
    nsample = round(frame_length*fs/1000)    # samples per frame
    noverlap = round(frame_overlap*fs/1000)  # overlapping samples
    # FFT length (zero-padded to twice the frame length)
    NFFT = 2*nsample
    # Hanning window
    window = hann(nsample)
    offset = nsample-noverlap                # hop size between frames
    max_m = round((N-NFFT)/offset)           # number of whole frames
    numFilter = 26
    fl = 0*fs                                # mel filterbank low edge (0 Hz)
    fh = 0.5*fs                              # mel filterbank high edge (Nyquist)
    melFilBank = util.melFilterBank(numFilter, int(NFFT/2), fs, fl, fh)
    coeffs = 13                              # MFCC coefficients kept per frame
    N1 = 5                                   # window length for delta coefficients
    flag = 0                                 # becomes 1 once delta features start
    count = 0                                # number of delta frames produced so far
    flag2 = 0                                # becomes 1 once delta-delta features start
    frames = int(max_m)
    if (DEBUG):
        # Per-frame coefficient tracks, recorded only for plotting.
        mfccTrack = np.zeros((coeffs, frames))
        deltaTrack = np.zeros((coeffs, frames))
        delta2Track = np.zeros((coeffs, frames))
    mQ = []                                  # sliding queue of recent MFCC vectors
    dQ = []                                  # sliding queue of recent delta vectors
    ZCR = []                                 # sliding queue of zero-crossing rates
    F0 = []                                  # sliding queue of fundamental frequencies
    for m in range(0, frames):
        begin = m*offset
        iend = m*offset + nsample
        # NOTE(review): unlike MFCC_SVM, no preEmphasis call here (alpha
        # there is 0, so behavior matches; confirm this is intentional).
        Frame = audio[begin:iend]
        magy = powerSpectrum(Frame, window, NFFT)
        # Keep only the non-redundant half of the power spectrum.
        mfccfeature = util.mfccFeature(magy[0:int(NFFT/2)], melFilBank, coeffs)
        # Per-coefficient max-abs normalization of the MFCC vector.
        mfccfeature = mfccfeature/np.absolute(mfccfeature).max(0)
        mQ.append(mfccfeature)
        zcr_temp = zeroCrossingRate(Frame, window)
        f0 = fundamentalFreq(Frame, window, fs)
        ZCR.append(zcr_temp)
        F0.append(f0)
        if (DEBUG):
            mfccTrack[0:coeffs, m] = mfccfeature[0:coeffs]
        # Once N1 MFCC frames are queued (m == N1-1), and on every frame
        # thereafter (flag stays 1), compute delta coefficients over the queue.
        if m%N1 == N1-1 or flag == 1:
            MD = util.deltaCoefficients(np.asarray(mQ))
            MD = MD/np.absolute(MD).max(0)
            dQ.append(MD)
            # Slide the queues: drop the oldest entry of each.
            mQ.pop(0)
            ZCR.pop(0)
            F0.pop(0)
            flag = 1
            if (DEBUG):
                deltaTrack[0:coeffs, m] = MD[0:coeffs]
            # Once N1 delta frames are queued (count == N1-1), and on every
            # delta frame thereafter, compute delta-delta and emit a row.
            if count%N1 == N1-1 or flag2 == 1:
                MDD = util.deltaCoefficients(np.asarray(dQ))
                MDD = MDD/np.absolute(MDD).max(0)
                dQ.pop(0)
                # Feature vector: oldest queued MFCC, middle queued delta,
                # current delta-delta -> 3*coeffs values, then 4 scalar extras.
                feature = [mQ[0], dQ[2], MDD]
                feature1 = np.reshape(np.vstack(feature), 3*coeffs)
                feature2 = feature1.tolist()
                feature2.append(ZCR[0])
                feature2.append(F0[0])
                feature2.append(ZCR[0]-ZCR[1])  # first difference of ZCR
                feature2.append(F0[0]-F0[1])    # first difference of F0
                if (trainingDataFile):
                    writetoFile(trainingDataFile, label, feature2)
                flag2 = 1
                if(DEBUG):
                    delta2Track[0:coeffs, m] = MDD[0:coeffs]
            count = count+1
    if (DEBUG):
        # Frame-center times for the whole file.
        T = np.arange(round(nsample/2), N-1-round(nsample/2), (nsample-noverlap))/fs;
        L1 = T [0:int(max_m)];
        t = np.linspace(0, N, N)/fs;
        plt.subplot(4, 1, 1)
        plt.plot(t, audio)
        plt.xlabel("Time")
        plt.ylabel("Amplitude")
        plt.title(audioFileName)
        plt.xlim([0, T[-1]])
        plt.subplot(4, 1, 2)
        plt.plot(L1, mfccTrack[0, :], color='r')
        plt.plot(L1, mfccTrack[1, :], color='g')
        plt.plot(L1, mfccTrack[2, :], color='b')
        plt.xlim([0, T[-1]])
        plt.subplot(4, 1, 3)
        plt.plot(L1, deltaTrack[0, :], color='r')
        plt.plot(L1, deltaTrack[1, :], color='g')
        plt.plot(L1, deltaTrack[2, :], color='b')
        plt.xlim([0, T[-1]])
        plt.subplot(4, 1, 4)
        plt.plot(L1, delta2Track[0, :], color='r')
        plt.plot(L1, delta2Track[1, :], color='g')
        plt.plot(L1, delta2Track[2, :], color='b')
        plt.xlim([0, T[-1]])
        plt.show()