Пример #1
0
def MFSC(signal,
         samplerate=16000,
         winlen=0.025,
         winstep=0.01,
         numcep=13,
         nfilt=26,
         nfft=512,
         lowfreq=0,
         highfreq=None,
         preemph=0.97,
         ceplifter=22,
         appendEnergy=True):
    """Compute log mel-filterbank features (the MFCC pipeline without the DCT).

    The arguments are forwarded positionally to ``fbank``; the natural log of
    its first output is taken and its second output is passed through as-is.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: not used by this function.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is None.
    :param preemph: preemphasis coefficient handed to ``fbank``. Default is 0.97.
    :param ceplifter: not used by this function.
    :param appendEnergy: not used by this function.
    :returns: a tuple ``(log_feat, energy)``: the element-wise log of the
        first ``fbank`` output and the unmodified second ``fbank`` output.
    """
    filterbank, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                     nfilt, nfft, lowfreq, highfreq, preemph)
    return np.log(filterbank), frame_energy
Пример #2
0
def filter(samplerate, signal, winlen=0.02, winstep=0.01, nfilt=40, nfft=512, lowfreq=100, highfreq=5000, preemph=0.97):
    """Compute mel filterbank energies of a signal via ``speechfeatures.fbank``.

  Args:
    samplerate (int): samples taken per second
    signal(1d numpy array): sample values (converted with ``np.array``)
    winlen(float): sliding window size in seconds
    winstep(float): value forwarded as ``winstep`` (step between windows, in seconds)
    nfilt(int): number of mel filters to apply
    nfft(int): size of the discrete fourier transform to use
    lowfreq(int): lowest frequency to collect
    highfreq(int): highest frequency to collect
    preemph(float): preemphasis factor

  Returns:
    feat(2d numpy array): first output of ``speechfeatures.fbank`` with
      axes 0 and 1 swapped; the second output is discarded

  """
    samples = np.array(signal)
    fbank_options = dict(
        winlen=winlen,
        winstep=winstep,
        nfilt=nfilt,
        nfft=nfft,
        lowfreq=lowfreq,
        highfreq=highfreq,
        preemph=preemph,
    )
    energies, _ = speechfeatures.fbank(samples, samplerate, **fbank_options)
    return np.swapaxes(energies, 0, 1)
Пример #3
0
 def computeLogMelFilterBank(self, file_name):
     '''
     Build the log-mel filterbank feature matrix for a wav file.

     The log of the 40 filterbank outputs is extended with the frame energy
     returned by fbank, then the deltas and double deltas from
     self.computeDeltas are appended as further columns. The result is
     asserted to have 123 columns per row (presumably (40 + 1) * 3).
     '''
     rate, sig = wav.read(file_name)
     mel_energies, frame_energy = fbank(
         sig, rate, winlen=0.025, winstep=0.01, nfilt=40)
     # Stack along rows in transposed space so the energy becomes a column.
     static = np.vstack((np.log(mel_energies).T, frame_energy.T)).T
     deltas = self.computeDeltas(static)
     assert deltas.shape == static.shape, "Shapes not equal {0} and \
     {1}".format(deltas.shape, static.shape)
     double_deltas = self.computeDeltas(deltas)
     feat_vec = np.vstack((static.T, deltas.T, double_deltas.T)).T
     assert len(feat_vec[0]) == 123, "Something wrong with feature vector dimensions..."
     return feat_vec
    def get_features(sample):
        """Build per-frame MFCC + delta + delta-delta + log-energy features.

        :param sample: a ``(rate, sig)`` pair that is forwarded to ``mfcc``
            and ``fbank``.
        :returns: the column-wise concatenation of the MFCCs, their first and
            second differences and the log frame energy, with the first two
            and last two rows removed (those rows have no full difference
            window).
        """
        rate, sig = sample
        mfcc_feats = mfcc(sig, rate)

        def diff(feats):
            # Unnormalised regression delta over a +/-2 frame window:
            #   d[t] = -2*c[t-2] - c[t-1] + c[t+1] + 2*c[t+2]
            # The previous expression used feats[i-2] twice with opposite
            # signs, which dropped the c[t-1] term and flipped the c[t-2] one.
            # Rows without a full window (first/last two) stay at zero.
            feats_diff = numpy.zeros(feats.shape)
            feats_diff[2:-2] = (-2 * feats[:-4] - feats[1:-3]
                                + feats[3:-1] + 2 * feats[4:])
            return feats_diff

        mfcc_diff_feats = diff(mfcc_feats)
        mfcc_diff2_feats = diff(mfcc_diff_feats)

        _, energy_feat = fbank(sig, rate)
        # Turn the per-frame energy into a single column so it can be appended.
        log_energy_feat = numpy.log(energy_feat).reshape(energy_feat.shape[0], 1)

        return numpy.concatenate((mfcc_feats, mfcc_diff_feats, mfcc_diff2_feats, log_energy_feat), axis=1)[2:-2]
Пример #5
0
def wav2fbank(wavFile, fs=16000, maxLen_s=None):
    """Compute log mel-filterbank features for a wav file or a sample array.

    :param wavFile: either a path (``str``) read with ``wavfile.read`` or a
        ``numpy.ndarray`` of samples. A file must be sampled at 16 kHz; an
        array is taken as-is with sample rate ``fs``.
    :param fs: sample rate used when ``wavFile`` is an array. Overwritten by
        the file's own rate when a path is given.
    :param maxLen_s: optional maximum length in seconds; the signal is
        truncated to ``maxLen_s * fs`` samples.
    :returns: the log of the first ``fbank`` output with axes 0 and 1
        swapped.
    :raises TypeError: if ``wavFile`` is neither a ``str`` nor an ndarray.
    """
    if isinstance(wavFile, str):
        (fs, wav) = wavfile.read(wavFile)
        assert fs == 16000  # requirement for now
    elif isinstance(wavFile, np.ndarray):
        wav = wavFile
    else:
        # Previously this fell through and failed later with an unbound local.
        raise TypeError(
            "wavFile must be a path string or a numpy array, got %s"
            % type(wavFile).__name__)

    winlen = 0.025
    winstep = 0.015
    # Smallest power of two that holds one analysis window.
    # (np.int was removed from NumPy; the builtin int is equivalent here.)
    nfft = int(np.power(2, np.ceil(np.log2(winlen * fs))))
    winfunc = np.hanning
    nfilt = 40
    preemph = 0.97

    if np.ndim(wav) == 2:  # Multiple channels; just take left one
        wav = wav[:, 0]
    if maxLen_s is not None:
        # Slice bounds must be integers even when maxLen_s is a float.
        maxSamp = int(maxLen_s * fs)
        wav = wav[:maxSamp]

    # The former ``if True: ... else: mfcc(...)`` switch had an unreachable
    # MFCC branch; only the filterbank path was ever executed.
    M, _ = fbank(wav,
                 fs,
                 winlen=winlen,
                 winstep=winstep,
                 nfilt=nfilt,
                 nfft=nfft,
                 winfunc=winfunc,
                 preemph=preemph)

    logM = np.swapaxes(np.log(M), 0, 1)

    return logM
Пример #6
0
def MFSC(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
      nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Return log mel-filterbank features and frame energy (MFCC without DCT).

    All filterbank settings are handed positionally to ``fbank``. Only the
    first ``fbank`` output is log-compressed; the second is returned unchanged.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: not used by this function.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is None.
    :param preemph: preemphasis coefficient handed to ``fbank``. Default is 0.97.
    :param ceplifter: not used by this function.
    :param appendEnergy: not used by this function.
    :returns: a tuple ``(log_feat, energy)`` where ``log_feat`` is the
        element-wise log of the first ``fbank`` output.
    """
    fbank_args = (signal, samplerate, winlen, winstep, nfilt, nfft,
                  lowfreq, highfreq, preemph)
    filterbank, frame_energy = fbank(*fbank_args)
    log_filterbank = np.log(filterbank)
    return log_filterbank, frame_energy
    def get_features(sample):
        """Assemble MFCC, delta, delta-delta and log-energy features per frame.

        :param sample: a ``(rate, sig)`` pair passed on to ``mfcc`` and
            ``fbank``.
        :returns: MFCCs, their first and second differences and the log frame
            energy joined column-wise; the two leading and two trailing rows
            are dropped because they lack a complete difference window.
        """
        rate, sig = sample
        mfcc_feats = mfcc(sig, rate)

        def diff(feats):
            # Unnormalised +/-2 frame regression delta:
            #   d[t] = -2*c[t-2] - c[t-1] + c[t+1] + 2*c[t+2]
            # The earlier code referenced feats[i - 2] twice, so the c[t-1]
            # term was missing and c[t-2] carried the wrong sign.
            feats_diff = numpy.zeros(feats.shape)
            feats_diff[2:-2] = (
                -2 * feats[:-4] - feats[1:-3] + feats[3:-1] + 2 * feats[4:])
            return feats_diff

        mfcc_diff_feats = diff(mfcc_feats)
        mfcc_diff2_feats = diff(mfcc_diff_feats)

        _, energy_feat = fbank(sig, rate)
        # One column of log energy, one row per frame.
        log_energy_feat = numpy.log(energy_feat).reshape(
            energy_feat.shape[0], 1)

        return numpy.concatenate(
            (mfcc_feats, mfcc_diff_feats, mfcc_diff2_feats, log_energy_feat),
            axis=1)[2:-2]
Пример #8
0
def computeFBANKDeltaDelta(sswi,NFilt=40,NDelta=2):
    """Fill a feature matrix with log filterbank+energy values and two deltas.

    ``sswi`` is iterated as speakerId -> sentenceId -> mapping whose values
    (looked up with keys 1..len) are the destination row indices. Each row
    receives ``NFilt + 1`` static values (log energy first, then the log
    filterbank outputs), followed by the first and second ``computeDelta``
    results, i.e. ``3 * (NFilt + 1)`` columns in total.
    """
    total_frames = speakersent.countFrames(sswi)
    width = NFilt + 1  # energy value counts as one extra column
    features = np.zeros(shape=(total_frames, width * 3))
    util.startprogress("FBANK Features")
    done = 0
    for speaker_id, sentences in sswi.iteritems():
        for sentence_id, frame_rows in sentences.iteritems():
            rate, sig = getWavFile(speaker_id, sentence_id)
            mel, energy = fbank(sig, rate, winlen=winlen, winstep=winstep, nfilt=NFilt)
            # Put the energy in front of the filterbank outputs before the log.
            energy_col = np.reshape(energy, (energy.shape[0], 1))
            static = np.log(np.append(energy_col, mel, axis=1))
            delta_1 = computeDelta(static, N=NDelta)
            delta_2 = computeDelta(delta_1, N=NDelta)
            util.progress(float(done) / total_frames * 100.0)
            for i in range(len(frame_rows)):
                done += 1
                row = frame_rows[i + 1]  # lookup keys start at 1
                features[row, :width] = static[i, :]
                features[row, width:2 * width] = delta_1[i, :]
                features[row, 2 * width:] = delta_2[i, :]
    util.endprogress()
    return features
def energy_feature(sig,rate):
	'''
	Summarise the frame energies returned by fbank with four log statistics.

	INPUT: sig, rate - forwarded to fbank; only its second output
	       (the frame energies) is used
	OUTPUT: list of four values, in this order:
			1. log of the mean energy over all frames
			2. log of the maximum energy
			3. log of the minimum energy
			4. log of the variance of the energy
	'''
	_, frame_energy = fbank(sig, rate)
	statistics = (np.mean, np.max, np.min, np.var)
	return [np.log(stat(frame_energy)) for stat in statistics]
Пример #10
0
def filter(samplerate,
           signal,
           winlen=0.02,
           winstep=0.01,
           nfilt=40,
           nfft=512,
           lowfreq=100,
           highfreq=5000,
           preemph=0.97):
    """Return the mel filterbank energies of a signal, transposed.

  Args:
    samplerate (int): samples taken per second
    signal(1d numpy array): sample values (wrapped in ``np.array``)
    winlen(float): sliding window size in seconds
    winstep(float): value forwarded as ``winstep`` (step between windows, in seconds)
    nfilt(int): number of mel filters to apply
    nfft(int): size of the discrete fourier transform to use
    lowfreq(int): lowest frequency to collect
    highfreq(int): highest frequency to collect
    preemph(float): preemphasis factor

  Returns:
    feat(2d numpy array): first output of ``speechfeatures.fbank`` with
      axes 0 and 1 swapped; the second output is ignored

  """
    samples = np.array(signal)
    result = speechfeatures.fbank(samples,
                                  samplerate,
                                  winlen=winlen,
                                  winstep=winstep,
                                  nfilt=nfilt,
                                  nfft=nfft,
                                  lowfreq=lowfreq,
                                  highfreq=highfreq,
                                  preemph=preemph)
    energies, _ = result
    return np.swapaxes(energies, 0, 1)
Пример #11
0
 def computeLogMelFilterBank(self, file_name):
     '''
     Read a wav file and return its log-mel filterbank features together
     with deltas and double deltas.

     Columns are laid out as: log of the 40 filterbank outputs plus the
     frame energy from fbank, then self.computeDeltas of that block, then
     self.computeDeltas of the deltas. The row width is asserted to be 123
     (presumably (40 + 1) * 3).
     '''
     rate, sig = wav.read(file_name)
     mel_energies, frame_energy = fbank(sig,
                                        rate,
                                        winlen=0.025,
                                        winstep=0.01,
                                        nfilt=40)
     log_mel = np.log(mel_energies)
     # Appending a row in transposed space adds the energy as a column.
     static = np.vstack((log_mel.T, frame_energy.T)).T
     deltas = self.computeDeltas(static)
     assert deltas.shape == static.shape, "Shapes not equal {0} and \
     {1}".format(deltas.shape, static.shape)
     double_deltas = self.computeDeltas(deltas)
     stacked = np.vstack((static.T, deltas.T, double_deltas.T))
     feat_vec = stacked.T
     assert len(
         feat_vec[0]
     ) == 123, "Something wrong with feature vector dimensions..."
     return feat_vec
Пример #12
0
 stop = int(cur_line[1])
 #                audio = audlab.wavread(wave_name,(freq*stop/10**7-freq*start/10**7))
 #                data = audio[0]
 label = cur_line[2]
 audio = f.read_frames(freq * stop / 10**7 -
                       freq * start / 10**7)
 if label in label_dic:
     mono_signal = audio  #audio[:,0]
     energy = np.sum(mono_signal**2, 0) / len(mono_signal)
     signal = mono_signal  #mono_signal/math.sqrt(energy)
     samplerate = f.samplerate
     #                    mfcc = get_mfcc(signal,samplerate,winstep=window_step,nfft=2048,highfreq=8000,lowfreq=100)
     feat, energy = fbank(signal,
                          samplerate,
                          winstep=window_step,
                          nfft=2048,
                          lowfreq=100,
                          highfreq=22050,
                          nfilt=40)
     feat = np.log(feat)
     feat = np.concatenate((feat, energy[:, np.newaxis]), 1)
     #                    d1_mfcc = mfcc_der_1()
     L = (stop - start) / 10.0**7
     N_iter = len(feat) / N  #math.floor(L/window_step/N)
     # apply context window
     if (L / window_step) > N:
         mfcc_matrix = np.zeros((1, 41 * N))
         for k in range(int(N_iter)):
             mfcc_vec = []
             for kk in range(N):
                 mfcc_vec = np.concatenate(
Пример #13
0
        # Walk over the annotation lines; each one is split on whitespace into
        # a start value, a stop value and a label.
        for j in xrange(len(lines)):
            try:
                cur_line = lines[j].split()
                start = int(cur_line[0])
                stop = int(cur_line[1])
#                audio = audlab.wavread(wave_name,(freq*stop/10**7-freq*start/10**7))
#                data = audio[0]
                label = cur_line[2]
                # Read the frames spanned by this segment; start/stop are
                # scaled by 10**7 (presumably 100 ns units -- TODO confirm).
                audio = f.read_frames(freq*stop/10**7-freq*start/10**7)
                if label in label_dic:
                    mono_signal = audio#audio[:,0]
                    # NOTE(review): this energy value is overwritten by the
                    # second fbank output below before it is ever read.
                    energy = np.sum(mono_signal**2,0)/len(mono_signal)
                    signal = mono_signal#mono_signal/math.sqrt(energy)
                    samplerate = f.samplerate
#                    mfcc = get_mfcc(signal,samplerate,winstep=window_step,nfft=2048,highfreq=8000,lowfreq=100)
                    feat,energy = fbank(signal,samplerate,winstep=window_step,nfft=2048,lowfreq=100,highfreq=22050,nfilt=40)
                    # Log of the 40 filterbank outputs, with the (non-log)
                    # frame energy appended as one extra column.
                    feat = np.log(feat)
                    feat = np.concatenate((feat,energy[:,np.newaxis]),1)
#                    d1_mfcc = mfcc_der_1()
                    # Segment duration after scaling by 10**7.
                    L = (stop-start)/10.0**7
                    N_iter = len(feat)/N#math.floor(L/window_step/N)
                    # apply context window
                    if (L/window_step)>N:
                        # Starts with one all-zero row of width 41*N; every
                        # group of N consecutive feature rows is flattened
                        # into a single row and appended below it.
                        mfcc_matrix = np.zeros((1,41*N))
                        for k in range(int(N_iter)):
                            mfcc_vec = []
                            for kk in range(N):
                                mfcc_vec = np.concatenate((mfcc_vec,feat[k*N+kk,:]))
                            mfcc_matrix = np.concatenate((mfcc_matrix,mfcc_vec[np.newaxis,:]))
                    else:
                        print('Input data sequence does not match minimal length requirement: ignoring')
Пример #14
0
def log_energy_lags(num, window=0.1):
    """Return lagged log frame energies for the audio identified by ``num``.

    The audio is framed with ``fbank`` using ``window`` as both the window
    length and the step. The log of the resulting frame energies is passed to
    ``make_lags`` together with ``num`` and ``framesTR=int(2.0 / window)``.
    """
    rate, audio = load_audio(num)
    _, frame_energy = fbank(audio, rate, winlen=window, winstep=window)
    log_energy = np.log(frame_energy)
    frames_per_tr = int(2.0 / window)
    return make_lags(log_energy, num, framesTR=frames_per_tr)
Пример #15
0
                # Read the frames spanned by this segment; start/stop are
                # scaled by 10**7 (presumably 100 ns units -- TODO confirm).
                audio = f.read_frames(freq*stop/10**7-freq*start/10**7)
                if label in label_dic:
                    mono_signal = audio#audio[:,0]
                    # NOTE(review): this energy value is not read before the
                    # fbank call below rebinds the name.
                    energy = np.sum(mono_signal**2,0)/len(mono_signal)
                    signal = mono_signal#mono_signal/math.sqrt(energy)
                    samplerate = f.samplerate
                    # Segment duration after scaling by 10**7.
                    L = (stop-start)/10.0**7
#                    N_iter = int(np.floor(L/(window_step*N)))
                    # apply context window
                    if L>(window_step*N):
#                        mfcc_matrix = np.zeros((1,numcep*N))
                        # Seed with one all-zero row; it is dropped again via
                        # data_conc[1:] when the data is stored below.
                        data_conc = np.zeros((1,nfilt))
                        for k in range(int(np.floor((L-N*window_step)*samplerate/(window_step*samplerate)))):
                            # NOTE(review): feat is not used in the visible code.
                            feat = np.zeros((N,26))
                            # NOTE(review): the slice bounds are products with
                            # window_step; if that is a float they are floats
                            # too -- confirm this indexes as intended.
                            audio_data = audio[k*window_step*samplerate:k*window_step*samplerate+(N+1)*window_step*samplerate]
                            features, energy = fbank(audio_data,samplerate=samplerate,winstep=window_step,nfilt=nfilt,lowfreq=100,highfreq=8000)                       
                            # Select the frame whose filterbank outputs sum to
                            # the largest value (argsort is ascending, so the
                            # last index is the maximum).
                            interm = np.sum(features,1)
                            index = np.argsort(interm)
                            data = np.log(features[index[-1],:])
                            data = dct(data)[0:nfilt]                           
                            data[0] = np.log(energy[index[-1]]) # replace first cepstral coefficient with log of frame energy   
                            data_conc = np.concatenate((data_conc,data[np.newaxis,:]),0)
                    else:
                        print('Input data sequence does not match minimal length requirement: ignoring')
                    # NOTE(review): in the else case data_conc is not assigned
                    # in the visible code, so the lines below would use a
                    # value left over from elsewhere -- verify.
                    # get the numeric label corresponding to the literal label
                    num_label = label_dic[label]*np.ones(len(data_conc)-1)
                    label_vector = np.append(label_vector,num_label.astype(np.float32,copy=False))
                    data_vector = np.concatenate(((data_vector,data_conc[1:,:].astype(np.float32,copy=False))),0)
                    # Record this segment's duration under its class index.
                    for k in range(len(label_dic)):
                        if label_dic[label]==k:
                            time_per_occurrence_class[k].append((stop-start)/(10.0**7))
Пример #16
0
	def dicGen(self, audioPath): # the sub routine in featureDicGen
		"""Build one feature vector per ``.wav`` file found in ``audioPath``.

		For every clip the vector is the concatenation of:
		the column means of MFCC columns 1..12, the column means of
		``readAudio.deltacal`` and ``readAudio.deltadelta`` applied to them,
		the column means of filterbank columns 1..12, and five statistics of
		the frame energy (mean, median, standard deviation, max, min).

		Returns a dict that maps the file name up to its first '.' to that
		feature vector (a plain list).
		"""
		featureDic = {}
		for item in os.listdir(audioPath):
			# Only entries whose last four characters are ".wav" are used.
			if item[-4:] != ".wav":
				continue

			rate, sig = wav.read(os.path.join(audioPath, item))

			# Per-frame matrices: MFCC columns 1..12 and the two delta stages.
			mfcc_feat = mfcc(sig, rate)[:, 1:13]
			delta_mfcc = readAudio.deltacal(self, mfcc_feat)
			deltadelta_mfcc = readAudio.deltadelta(self, delta_mfcc)

			# Collapse each matrix to the mean over all windows per dimension.
			mfcc_mean = np.mean(mfcc_feat, axis=0)
			delta_mean = np.mean(delta_mfcc, axis=0)
			deltadelta_mean = np.mean(deltadelta_mfcc, axis=0)

			fbank_feat, energy = fbank(sig, rate)
			fbank_mean = np.mean(fbank_feat[:, 1:13], axis=0)

			# Mean, median, standard deviation, max and min of the energy.
			energyarray = np.asarray(energy)
			reducers = (np.mean, np.median, np.std, np.amax, np.amin)
			energy_vec = np.asarray(
				[reducer(energyarray, axis=0) for reducer in reducers])

			# Feature vector of the clip currently being processed.
			tempVec = []
			for part in (mfcc_mean, delta_mean, deltadelta_mean,
					fbank_mean, energy_vec):
				tempVec.extend(part)

			# The clip name (text before the first '.') is the dictionary key.
			featureDic[item.split('.')[0]] = tempVec

		return featureDic