def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames, nfft)
    # Exact zeros would make the centroid below 0/0, so lift them to machine epsilon.
    pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec)

    # Forward the band edges; previously they were accepted but never used.
    # NOTE(review): assumes get_filterbanks accepts (nfilt, nfft, samplerate,
    # lowfreq, highfreq) -- it is defined outside this block; confirm.
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # filterbank energies (the denominator)

    # One weight per spectrum bin, linearly spaced from 1 to samplerate/2;
    # broadcasting applies the same weights to every frame.
    bin_weights = numpy.linspace(1, samplerate/2, numpy.size(pspec, 1))
    return numpy.dot(pspec*bin_weights, fb.T) / feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames, nfft)
    energy = numpy.sum(pspec, 1)  # total energy in each frame

    # Forward the band edges; previously they were accepted but never used.
    # NOTE(review): assumes get_filterbanks accepts (nfilt, nfft, samplerate,
    # lowfreq, highfreq) -- it is defined outside this block; confirm.
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # filterbank energies
    return feat, energy
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames, nfft)
    # An all-zero frame would make the final division 0/0 (NaN), so replace
    # exact zeros in the power spectrum with machine epsilon.
    pspec = numpy.where(pspec == 0, numpy.finfo(float).eps, pspec)

    # Forward the band edges; previously they were accepted but never used.
    # NOTE(review): assumes get_filterbanks accepts (nfilt, nfft, samplerate,
    # lowfreq, highfreq) -- it is defined outside this block; confirm.
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # filterbank energies (the denominator)

    # Tile one row of per-bin weights (linearly spaced from 1 to samplerate/2)
    # so that every frame is weighted identically.
    R = numpy.tile(numpy.linspace(1, samplerate/2, numpy.size(pspec, 1)),
                   (numpy.size(pspec, 0), 1))
    return numpy.dot(pspec*R, fb.T) / feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed). Exact zeros in either
        output are replaced by machine epsilon.
    """
    eps = numpy.finfo(float).eps
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames, nfft)

    energy = numpy.sum(pspec, 1)  # total energy in each frame
    energy = numpy.where(energy == 0, eps, energy)  # zero energy breaks a later log

    # Forward the band edges; previously they were accepted but never used.
    # NOTE(review): assumes get_filterbanks accepts (nfilt, nfft, samplerate,
    # lowfreq, highfreq) -- it is defined outside this block; confirm.
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    feat = numpy.dot(pspec, fb.T)  # filterbank energies
    feat = numpy.where(feat == 0, eps, feat)  # zero features break a later log
    return feat, energy
def feature_extract(wav_name, winlen=0.025, winstep=0.01):
    """Return MFCC feature vectors extracted from the wav file ``wav_name``.

    The samples are averaged across channels, framed, passed through
    ``vad.vad_filter``, reassembled with ``sigproc.deframesig`` and finally
    handed to ``mfcc``.

    :param wav_name: path of the wav file to read.
    :param winlen: frame length in seconds used for the VAD framing.
    :param winstep: frame step in seconds used for the VAD framing.
    :returns: whatever ``mfcc(signal, rate)`` returns for the filtered signal.
    """
    rate, signal = wav.read(wav_name)
    # Only multi-channel data has a channel axis to average over; a 1-D
    # (single-channel) array previously failed on axis=1.
    if numpy.ndim(signal) > 1:
        signal = numpy.sum(signal, axis=1)/signal.shape[1]
    frames = sigproc.framesig(signal, rate*winlen, rate*winstep)
    # presumably drops non-speech frames -- vad_filter is defined elsewhere; confirm
    frames = vad.vad_filter(frames)
    signal = sigproc.deframesig(frames, 0, rate*winlen, rate*winstep)
    # NOTE(review): mfcc runs with its own default window settings, not
    # winlen/winstep -- confirm this is intended.
    return mfcc(signal, rate)
def feature_extract(wav_name, winlen=0.025, winstep=0.01):
    """Return MFCC feature vectors extracted from the wav file ``wav_name``.

    Pipeline: read the file, average the channels, frame the signal, apply
    ``vad.vad_filter`` to the frames, rebuild a signal with
    ``sigproc.deframesig`` and compute ``mfcc`` on it.

    :param wav_name: path of the wav file to read.
    :param winlen: frame length in seconds used for the VAD framing.
    :param winstep: frame step in seconds used for the VAD framing.
    :returns: the result of ``mfcc(signal, rate)`` on the filtered signal.
    """
    rate, signal = wav.read(wav_name)
    if numpy.ndim(signal) > 1:
        # Down-mix by averaging over the channel axis. Guarded because a
        # 1-D (single-channel) array has no axis 1 and used to raise here.
        signal = numpy.sum(signal, axis=1) / signal.shape[1]

    frame_len = rate * winlen
    frame_step = rate * winstep
    # presumably keeps only voiced frames -- vad_filter is defined elsewhere; confirm
    kept = vad.vad_filter(sigproc.framesig(signal, frame_len, frame_step))
    signal = sigproc.deframesig(kept, 0, frame_len, frame_step)

    # NOTE(review): mfcc is called with its default window settings rather
    # than winlen/winstep -- confirm this is intended.
    mfcc_feat = mfcc(signal, rate)
    return mfcc_feat
def doDet(fileIn):
    """Run segment-level detection on a wav file and save/return the result.

    Steps: read the file, split it into segments with ``framesig``, compute
    MFCCs per segment, turn the GMM posteriors of each segment into a
    normalised histogram, build a kernel against ``trdata`` and score it with
    ``svm_predict``. The output column is written to ``<fileIn minus .wav>_res.txt``.

    Relies on module-level names not defined in this block: ``fs``, ``seglen``,
    ``segstp``, ``nComp``, ``wlen``, ``wstep``, ``ncep``, ``numfilt``,
    ``fftsz``, ``gmmmixt``, ``trdata``, ``svmMod``, ``pOrc`` and ``opPoint``.

    :param fileIn: path of the wav file to process.
    :returns: ``np.hstack((allOut, segTimes))`` where ``allOut`` is a column of
        class-1 probabilities (``pOrc == 1``) or of 0/1 decisions obtained by
        thresholding at ``opPoint`` (``pOrc == 0``).
    :raises ValueError: if ``pOrc`` is neither 0 nor 1.
    """
    # NOTE(review): the file's own sample rate (fsr) is read but the global
    # fs is used everywhere below -- confirm they always agree.
    (fsr, sig) = wav.read(fileIn)
    segChunks, segTimes = framesig(sig, seglen*fs, segstp*fs, 'box', 1, fs)
    if len(segChunks.shape) == 1:
        # A single segment comes back 1-D; promote it to one row. The old
        # call subscripted the reshape method and passed the shape wrongly.
        segChunks = np.reshape(segChunks, (1, segChunks.shape[0]))

    allFeats = np.zeros((segChunks.shape[0], nComp))
    for t in range(segChunks.shape[0]):
        seg = segChunks[t, :]
        mfcc_feat, mspec, logmelspec = mfcc(
            seg, fs, winlen=wlen, winstep=wstep, numcep=ncep, nfilt=numfilt,
            nfft=fftsz, lowfreq=0, highfreq=fs/2, preemph=0.97, ceplifter=22,
            appendEnergy=True)
        cdist = gmmmixt.predict_proba(mfcc_feat)
        hist = np.sum(cdist, axis=0)
        # Normalised by the number of histogram bins, as in the original.
        allFeats[t, :] = hist/float(hist.shape[0])

    histKer = computeKernel(allFeats, trdata, 1.0)
    # Prepend a 1-based row id to each kernel row before passing it to svm_predict.
    kerId = np.arange(histKer.shape[0]) + 1
    kerId = np.reshape(kerId, (kerId.shape[0], 1))
    teKer = [list(row) for row in np.hstack((kerId, histKer))]
    telax = [0]*len(teKer)  # dummy labels
    plb, acc, probab = svm_predict(telax, teKer, svmMod, '-b 1 -q')
    lbs = svmMod.get_labels()
    probab = np.array(probab)

    # Pick the probability column that belongs to label 1.
    if lbs[0] == 1:
        prob_f = probab[:, 0]
    elif lbs[1] == 1:
        prob_f = probab[:, 1]
    else:
        print('Not possible')
        sys.exit()

    # The reshape result used to be discarded, leaving a 1-D vector that
    # cannot be hstack-ed with the 2-D segTimes in the pOrc == 1 case.
    prob_f = prob_f.reshape(prob_f.shape[0], 1)
    if pOrc == 1:
        allOut = prob_f
    elif pOrc == 0:
        allOut = (prob_f > opPoint).astype(int)
    else:
        raise ValueError('pOrc must be 0 or 1, got %r' % (pOrc,))

    # str.rstrip('.wav') strips any trailing '.', 'w', 'a', 'v' characters
    # (e.g. 'viva.wav' -> 'vi'), so remove the suffix explicitly instead.
    base = fileIn[:-len('.wav')] if fileIn.endswith('.wav') else fileIn
    np.savetxt(base + '_res' + '.txt', allOut)
    return np.hstack((allOut, segTimes))
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds.
    :param winstep: the step between successive windows in seconds.
    :param nfilt: the number of filters in the filterbank.
    :param nfft: the FFT size.
    :param lowfreq: lowest band edge passed to get_filterbanks.
    :param highfreq: highest band edge passed to get_filterbanks; samplerate/2 when not given.
    :param preemph: coefficient handed to sigproc.preemphasis.
    :param winfunc: window function handed to sigproc.framesig; the default produces a vector of ones.
    :returns: 2 values: the filterbank energies and the per-frame total energy.
        Exact zeros in either are replaced by machine epsilon.
    """
    if not highfreq:
        highfreq = samplerate/2
    tiny = numpy.finfo(float).eps

    emphasized = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(frames, nfft)

    # Total energy per frame; zeros are lifted so a later log stays finite.
    frame_energy = numpy.sum(power, 1)
    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)

    # Project the power spectrum onto the filterbank, with the same zero guard.
    bank = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(power, bank.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)
    return band_energy, frame_energy
def get_lpcc(filename):
    """
    gets the per-frame coefficients returned by ``lpc`` for a wav file

    The signal is cut into 25 ms frames with a 10 ms step and ``lpc(frame, 12)``
    is evaluated on every frame; the first element of its result is kept.

    filename: name of wav file with .wav
    returns a numpy array built from the per-frame coefficient vectors
    (one entry per frame)
    """
    print("Getting LPCC")
    (rate, sig) = wav.read(filename)
    frames = sigproc.framesig(sig, 0.025 * rate, 0.01 * rate)
    # NOTE(review): element [0] of lpc's result is presumably the coefficient
    # vector -- lpc is defined elsewhere; confirm. The former inner loop
    # "feature = float(feature)" only rebound a local and had no effect.
    return numpy.asarray([lpc(frame, 12)[0] for frame in frames])
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Compute MFCC features, plus the magnitude and log-mel spectra, from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: the number of cepstrum to return, default 13
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
    :returns: 3 values: the cepstral features (first numcep DCT coefficients per frame),
        the magnitude spectrum from sigproc.magspec, and the log filterbank energies.
    """
    # NOTE: nfft should be even, or the next power of two after the window
    # length, e.g. NFFT = 2^(ceil(log(winpts)/log(2))).
    if not highfreq:
        highfreq = samplerate/2
    tiny = numpy.finfo(float).eps

    emphasized = sigproc.preemphasis(signal, preemph)
    # 'hamm' is passed as the window argument; presumably this project's
    # sigproc maps it to a Hamming window -- confirm.
    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, 'hamm')
    pspec = sigproc.powspec(frames, nfft)
    mspec = sigproc.magspec(frames, nfft)

    # Total energy per frame, kept strictly positive for the log below.
    energy = numpy.sum(pspec, 1)
    energy = numpy.where(energy == 0, tiny, energy)

    # Filterbank energies, with the same guard against log(0).
    fb = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = numpy.dot(pspec, fb.T)
    band_energy = numpy.where(band_energy == 0, tiny, band_energy)
    logmelspec = numpy.log(band_energy)

    cepstra = dct(logmelspec, type=2, axis=1, norm='ortho')[:, :numcep]
    cepstra = lifter(cepstra, ceplifter)
    if appendEnergy:
        # replace first cepstral coefficient with log of frame energy
        cepstra[:, 0] = numpy.log(energy)
    return cepstra, mspec, logmelspec
def get_words(audio):
    """Split a recording into words and write each one to ``word<N>.wav``.

    Every 10 ms frame is classified as speech when its energy or its
    zero-crossing count exceeds twice the average measured over the first
    ``noise_frames`` frames. A word starts after ``start_speech`` consecutive
    speech frames and ends after ``end_silence`` consecutive non-speech
    frames; ``speech_leader``/``speech_trailer`` frames are added around it.

    :param audio: path of the wav file to segment.
    :returns: the number of word files written.
    """
    winlen = 0.01
    winstep = 0.01
    sample_rate = 44100
    # in terms of number of 10 ms frames
    noise_frames = 10
    start_speech = 10
    end_silence = 5
    speech_leader = 5
    speech_trailer = 5
    samples_per_frame = int(round(winstep * sample_rate))  # 441 samples

    (rate, sig) = wav.read(audio)
    frames = framesig(sig, winlen * sample_rate, winstep * sample_rate,
                      lambda x: numpy.ones((1, x)))

    # Energy and number of sign changes for every frame.
    frame_energy = []
    zcr = []
    for frame in frames:
        frame_energy.append(sum(1.0 * x * x for x in frame))
        zcr.append(sum(1 for prev, cur in zip(frame[:-1], frame[1:])
                       if (cur < 0 and prev > 0) or (cur > 0 and prev < 0)))

    # Noise level from the leading frames. The zero-crossing average used to
    # be computed from frame_energy, and only 9 frames were summed although
    # the sum was divided by 10.
    avg_energy = sum(1.0 * x for x in frame_energy[:noise_frames]) / noise_frames
    avg_zcr = sum(1.0 * x for x in zcr[:noise_frames]) / noise_frames
    upper_energy_threshold = 2 * avg_energy
    upper_zcr_threshold = 2 * avg_zcr

    def write_word(number, first, last):
        # Frame indices are converted to sample offsets for slicing.
        wav.write("word" + str(number) + ".wav", rate,
                  sig[samples_per_frame * first:samples_per_frame * (last + 1)])

    started = False
    start_index = 0
    start_cnt = 0
    stop_cnt = 0
    words = 0
    # Skip the noise-calibration frames. The old range(0, len(frames[10:]))
    # dropped the LAST ten frames instead of the first ten.
    for i in range(noise_frames, len(frames)):
        is_speech = (frame_energy[i] > upper_energy_threshold
                     or zcr[i] > upper_zcr_threshold)
        if not started:
            start_cnt = start_cnt + 1 if is_speech else 0
            if start_cnt == start_speech:
                started = True
                start_index = max(0, i - start_speech + 1 - speech_leader)
        else:
            stop_cnt = 0 if is_speech else stop_cnt + 1
            if stop_cnt == end_silence:
                stop_index = min(len(frames) - 1,
                                 i - end_silence + 1 + speech_trailer)
                write_word(words, start_index, stop_index)
                words += 1
                started = False
                start_index = start_cnt = 0
                stop_cnt = 0

    if started:
        # The recording ended in the middle of a word: keep everything up to the end.
        write_word(words, start_index, len(frames) - 1)
        words += 1
    return words
def chop(output_folder='chopped-words', audio_file='recording.wav'):
    """Cut ``audio_file`` into words and write them to ``output_folder``.

    Each frame is labelled voiced/unvoiced by ``classify``. A word opens after
    ``start_speech`` consecutive voiced frames and closes after ``end_silence``
    consecutive unvoiced ones, padded by ``speech_leader``/``speech_trailer``
    frames. Words are saved as ``<output_folder>/word<N>.wav``.

    Uses the module-level names ``winlen``, ``winstep``, ``sample_rate``,
    ``start_speech``, ``end_silence``, ``speech_leader`` and ``speech_trailer``,
    and rebinds or extends the globals ``voiced``, ``sig``, ``lrms`` and
    ``min_signal``.

    :returns: the number of words written.
    """
    global voiced, sig, lrms, min_signal

    (rate, sig) = wav.read(audio_file)
    frames = framesig(sig, winlen * sample_rate, winstep * sample_rate,
                      lambda x: np.ones((1, x)))
    lrms = [log_root_mean_square(x) for x in frames]
    min_signal = np.mean(lrms) / 2

    total = len(frames)
    # Appended one frame at a time because classify is defined elsewhere and
    # may look at the globals filled so far.
    # NOTE(review): voiced is never cleared here -- confirm chop runs once per process.
    for idx in range(total):
        voiced.append(True if classify(idx) else False)

    def emit(number, first, last):
        # Report the word, then write its samples (441 samples per frame).
        print(">> %s %s %s" % (first, last, (last - first + 1) * 10))
        wav.write(output_folder + "/word" + str(number) + ".wav", rate,
                  sig[441 * first:441 * (last + 1)])

    in_word = False
    first_frame = 0
    voiced_run = 0
    silent_run = 0
    words = 0
    print(">> START_FRAME END_FRAME LENGTH_IN_SECONDS")
    for idx in range(total):
        if in_word:
            silent_run = 0 if voiced[idx] else silent_run + 1
            if silent_run == end_silence:
                last_frame = min(total - 1, idx - end_silence + 1 + speech_trailer)
                emit(words, first_frame, last_frame)
                words += 1
                in_word = False
                first_frame = voiced_run = 0
                silent_run = 0
        else:
            voiced_run = voiced_run + 1 if voiced[idx] else 0
            if voiced_run == start_speech:
                in_word = True
                first_frame = max(0, idx - start_speech + 1 - speech_leader)

    if in_word:
        # The recording ended inside a word: keep everything up to the last frame.
        emit(words, first_frame, total - 1)
        words += 1
    return words
def logFilterbankFeatures(signal, samplerate=16000, winlen=0.0255, winstep=0.01,
                          nfilt=40, nfft=512, lowfreq=133.3333, highfreq=6855.4976, preemph=0.97,
                          winSzForDelta=2):
    '''
    Computes log filterbank energies + log total energy per frame, followed by
    their deltas and the deltas of the deltas.

    The framing/filterbank part follows features.fbank from the package
    'python_speech_features' (https://github.com/jameslyons/python_speech_features),
    re-implemented here so that a Hamming window can be passed to the framing step.

    :parameters:
        - signal : input speech signal
        - samplerate : sample rate of the signal
        - winlen : length of analysis window in seconds
        - winstep : step size between successive windows in seconds
        - nfilt : number of filter energies to compute (total energy not included).
                  e.g. 40 --> Output dim = (40+1)*3
        - nfft : FFT size
        - lowfreq : lower band edge passed to get_filterbanks
        - highfreq : upper band edge passed to get_filterbanks (samplerate/2 if falsy)
        - preemph : pre-emphasis coefficient
        - winSzForDelta : window size handed to delta()
    :returns:
        - features : feature matrix with one row per frame and
                     (nfilt + 1)*3 columns (+1 for energy, *3 because of deltas)
    '''
    if not highfreq:
        highfreq = samplerate / 2
    tiny = np.finfo(float).eps

    emphasized = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasized, winlen * samplerate, winstep * samplerate,
                              winfunc=hamming)
    power = sigproc.powspec(frames, nfft)

    # Total energy per frame, kept strictly positive so the log is finite.
    total_energy = np.sum(power, 1)
    total_energy = np.where(total_energy == 0, tiny, total_energy)

    # Filterbank energies, with the same guard.
    bank = features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    band_energy = np.dot(power, bank.T)
    band_energy = np.where(band_energy == 0, tiny, band_energy)

    # Column 0 is the log energy, the remaining columns the log filterbank energies.
    static = np.column_stack((np.log(total_energy), np.log(band_energy)))
    first_order = delta(static, winSzForDelta)
    second_order = delta(first_order, winSzForDelta)
    return np.concatenate((static, first_order, second_order), axis=1)
def get_words(audio):
    """Split a recording into words and write each one to ``word<N>.wav``.

    A frame counts as speech when its energy or its zero-crossing count
    exceeds twice the average measured over the first ten frames. A word
    starts after ``start_speech`` consecutive speech frames and ends after
    ``end_silence`` consecutive non-speech frames, padded by
    ``speech_leader``/``speech_trailer`` frames.

    Uses the module-level names ``winlen``, ``winstep``, ``sample_rate``,
    ``start_speech``, ``end_silence``, ``speech_leader`` and ``speech_trailer``.

    :param audio: path of the wav file to segment.
    :returns: the number of word files written.
    """
    noise_frames = 10  # leading frames used to estimate the noise level
    # Was the literal 441, which matches 10 ms frames at 44.1 kHz.
    samples_per_frame = int(round(winstep*sample_rate))

    (rate, sig) = wav.read(audio)
    frames = framesig(sig, winlen*sample_rate, winstep*sample_rate,
                      lambda x: numpy.ones((1, x)))

    # Energy and number of sign changes for every frame.
    frame_energy = []
    zcr = []
    for frame in frames:
        frame_energy.append(sum(1.0*x*x for x in frame))
        zcr.append(sum(1 for prev, cur in zip(frame[:-1], frame[1:])
                       if (cur < 0 and prev > 0) or (cur > 0 and prev < 0)))

    # The zero-crossing average used to be computed from frame_energy, and
    # only 9 frames were summed although the sum was divided by 10.
    avg_energy = sum(1.0*x for x in frame_energy[:noise_frames]) / noise_frames
    avg_zcr = sum(1.0*x for x in zcr[:noise_frames]) / noise_frames

    upper_energy_threshold = 2*avg_energy
    upper_zcr_threshold = 2*avg_zcr
    lower_energy_threshold = 0.75*avg_energy
    lower_zcr_threshold = 0.75*avg_zcr
    print(upper_energy_threshold)
    print(upper_zcr_threshold)
    print(lower_energy_threshold)
    print(lower_zcr_threshold)

    def write_word(number, first, last):
        # Report the word, then write the matching sample range.
        print(">> %s %s %s" % (first, last, (last - first + 1) * 10))
        wav.write("word" + str(number) + ".wav", rate,
                  sig[samples_per_frame*first:samples_per_frame*(last+1)])

    started = False
    start_index = 0
    start_cnt = 0
    stop_cnt = 0
    words = 0
    print(">> START_FRAME END_FRAME LENGTH_IN_SECONDS")
    # Skip the noise-calibration frames. The old range(0, len(frames[10:]))
    # dropped the LAST ten frames instead of the first ten.
    for i in range(noise_frames, len(frames)):
        is_speech = (frame_energy[i] > upper_energy_threshold
                     or zcr[i] > upper_zcr_threshold)
        if not started:
            start_cnt = start_cnt + 1 if is_speech else 0
            if start_cnt == start_speech:
                started = True
                start_index = max(0, i - start_speech + 1 - speech_leader)
        else:
            stop_cnt = 0 if is_speech else stop_cnt + 1
            if stop_cnt == end_silence:
                stop_index = min(len(frames)-1,
                                 i - end_silence + 1 + speech_trailer)
                write_word(words, start_index, stop_index)
                words += 1
                started = False
                start_index = start_cnt = 0
                stop_cnt = 0

    if started:
        # The recording ended in the middle of a word: keep everything up to the end.
        write_word(words, start_index, len(frames)-1)
        words += 1
    return words
def chop(output_folder = 'chopped-words', audio_file = 'recording.wav'):
    """Segment ``audio_file`` into words saved as ``<output_folder>/word<N>.wav``.

    Frames are marked voiced or unvoiced through ``classify``. ``start_speech``
    voiced frames in a row open a word, ``end_silence`` unvoiced frames in a
    row close it, and ``speech_leader``/``speech_trailer`` frames of context
    are kept around it.

    Reads the module-level settings ``winlen``, ``winstep``, ``sample_rate``,
    ``start_speech``, ``end_silence``, ``speech_leader`` and ``speech_trailer``,
    and writes the globals ``voiced``, ``sig``, ``lrms`` and ``min_signal``.

    :returns: the number of words written.
    """
    global voiced, sig, lrms, min_signal

    (rate, sig) = wav.read(audio_file)
    ones_window = lambda x: np.ones((1, x))
    frames = framesig(sig, winlen*sample_rate, winstep*sample_rate, ones_window)
    lrms = [log_root_mean_square(x) for x in frames]
    min_signal = np.mean(lrms) / 2

    frame_count = len(frames)
    # One append per frame, in order: classify is defined elsewhere and may
    # depend on the globals populated so far.
    # NOTE(review): voiced keeps growing across calls -- confirm single use.
    for k in range(frame_count):
        voiced.append(bool(classify(k)))

    def flush_word(word_no, begin, end):
        # Log the frame span, then write its samples (441 samples per frame).
        print(">> %s %s %s" % (begin, end, (end - begin + 1) * 10))
        target = output_folder + "/word" + str(word_no) + ".wav"
        wav.write(target, rate, sig[441*begin : 441*(end+1)])

    words = 0
    recording = False
    begin = 0
    run_voiced = 0
    run_silent = 0
    print(">> START_FRAME END_FRAME LENGTH_IN_SECONDS")
    for k in range(frame_count):
        if not recording:
            if voiced[k]:
                run_voiced += 1
            else:
                run_voiced = 0
            if run_voiced == start_speech:
                recording = True
                begin = max(0, k - start_speech + 1 - speech_leader)
            continue

        if voiced[k]:
            run_silent = 0
        else:
            run_silent += 1
        if run_silent == end_silence:
            end = min(frame_count - 1, k - end_silence + 1 + speech_trailer)
            flush_word(words, begin, end)
            words += 1
            recording = False
            begin = run_voiced = 0
            run_silent = 0

    if recording:
        # Input ended while a word was open: close it at the final frame.
        flush_word(words, begin, frame_count - 1)
        words += 1
    return words
# Start segment-level feature extraction
start = time.time()
window = 2
# Number of consecutive per-frame feature vectors stacked into one sample.
bound = 2 * window + 1
allUt = []
allLabels = []
utteranceByFeat = []
labelsByFeat = []
portionSelection = 1.0
for utterance in sessionTrain:
    features = []
    labels = []
    # Keep each utterance with probability portionSelection. The bounds were
    # previously swapped (uniform(1, 0)), which NumPy documents as undefined.
    if np.random.uniform(0, 1) < portionSelection:
        label = utterance[1]
        # utterance[0] is unpacked as a sequence of (channel1, channel2) pairs.
        channel1, channel2 = zip(*utterance[0])
        samplesSize = 1600
        samplesStep = np.ceil(samplesSize * 0.1)  # step is 10% of the frame length
        samplesChn1 = sigproc.framesig(channel1, samplesSize, samplesStep)
        samplesChn2 = sigproc.framesig(channel2, samplesSize, samplesStep)
        allFeaturesVector = []
        for i in range(0, len(samplesChn1)):
            sampleLeft = samplesChn1[i]
            sampleRight = samplesChn2[i]
            currentFeaturesLeft = calcFeaturesVector(sampleLeft, 16000)
            currentFeaturesRight = calcFeaturesVector(sampleRight, 16000)
            allFeaturesVector.append(currentFeaturesLeft + currentFeaturesRight)
            if i >= (bound):
                # Stack the `bound` vectors that precede frame i (frame i
                # itself is not part of the slice).
                features.append(np.concatenate(allFeaturesVector[(i - bound) : i], axis=0))
                allUt.append(np.concatenate(allFeaturesVector[(i - bound) : i], axis=0))
                labels.append(encodeLabels(label))
                allLabels.append(encodeLabels(label))
def logFilterbankFeatures(signal,samplerate=16000,winlen=0.0255,winstep=0.01,
                          nfilt=40,nfft=512,lowfreq=133.3333,highfreq=6855.4976,preemph=0.97,
                          winSzForDelta=2):
    '''
    Computes log filterbank energies and the log total energy of every frame,
    then appends their first- and second-order deltas.

    The framing and filterbank steps mirror features.fbank of the package
    'python_speech_features' (http://python-speech-features.readthedocs.org/en/latest/),
    repeated here so that a Hamming window can be supplied to the framing call.

    :parameters:
        - signal : input speech signal
        - samplerate : sample rate of the signal
        - winlen : length of analysis window in seconds
        - winstep : step size between successive windows in seconds
        - nfilt : number of filter energies to compute (total energy not included).
                  e.g. 40 --> Output dim = (40+1)*3
        - nfft : FFT size
        - lowfreq : lower band edge given to get_filterbanks
        - highfreq : upper band edge given to get_filterbanks (samplerate/2 if falsy)
        - preemph : pre-emphasis coefficient
        - winSzForDelta : window size passed to delta()
    :returns:
        - features : feature matrix, one row per frame,
                     (nfilt + 1)*3 columns (+1 for energy, *3 because of deltas)
    '''
    floor = np.finfo(float).eps

    def positive(values):
        # Exact zeros would give -inf under the log, so lift them to epsilon.
        return np.where(values == 0, floor, values)

    highfreq = highfreq or samplerate/2
    preemphasized = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(preemphasized, winlen*samplerate, winstep*samplerate,
                              winfunc=hamming)
    pspec = sigproc.powspec(frames, nfft)

    energy = positive(np.sum(pspec, 1))  # total energy in each frame
    fb = features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    fbank_energy = positive(np.dot(pspec, fb.T))  # filterbank energies

    # Log energy goes first, followed by the log filterbank energies.
    base = np.column_stack((np.log(energy), np.log(fbank_energy)))
    velocity = delta(base, winSzForDelta)
    acceleration = delta(velocity, winSzForDelta)
    return np.concatenate((base, velocity, acceleration), axis=1)