def load_data_shared(ind):
    #Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    narr = np.array([13, 26, 39])  # Number of features in each frame
    i = 0
    j = 0

    trainfeature = np.zeros((len(timit_data_train), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            trainfeature[i, :] = mfcc_flat
        elif ind == 1:
            trainfeature[i, :] = fbank_flat
        else:
            trainfeature[i, :] = np.concatenate((mfcc_flat, fbank_flat))
        i = i + 1

    testfeature = np.zeros((len(timit_data_test), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            testfeature[j, :] = mfcc_flat
        elif ind == 1:
            testfeature[j, :] = fbank_flat
        else:
            testfeature[j, :] = np.concatenate((mfcc_flat, fbank_flat))
        j = j + 1

    training_data = (trainfeature, timit_vwlname_train)
    test_data = (testfeature, timit_vwlname_test)
    # For now, I am using test data as validating data. Should change later.
    validation_data = test_data

    def shared(data):
        """Place the data into shared variables.

        This allows Theano to copy the data to the GPU, if one is available.
        """
        shared_x = theano.shared(
            np.asarray(data[0], dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(
            np.asarray(data[1], dtype=theano.config.floatX), borrow=True)
        return shared_x, T.cast(shared_y, "int32")

    return [shared(training_data), shared(validation_data), shared(test_data)]
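# Illustrative sketch (not from the original source): a minimal, self-contained
# demonstration of what the shared() helper above does. It assumes Theano is
# installed; the toy array shapes are arbitrary placeholders.
import numpy as np
import theano
import theano.tensor as T

_X = np.random.rand(4, 91).astype(theano.config.floatX)    # 4 examples, 91 features
_y = np.array([0, 2, 1, 3], dtype=theano.config.floatX)    # labels stored as floats...

_shared_x = theano.shared(_X, borrow=True)                  # lives on the GPU if one is available
_shared_y = T.cast(theano.shared(_y, borrow=True), "int32") # ...then cast to int32 for indexing

print _shared_x.get_value().shape   # (4, 91)
print _shared_y.dtype               # int32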
def svm_baseline():
    #### Change here
    ind = 0  # 0 for mfcc, 1 for filterbank, 2 for both
    narr = np.array([13, 26, 39])  # corresponding length of feature in a frame

    #Training and testing data
    timit_data_train = genfromtxt('timit_data_1280_train.csv', delimiter=',')
    timit_vwlname_train = genfromtxt('timit_vwlname_1280_train.csv', delimiter=',')
    timit_vwlname_train[:] = [x - 1 for x in timit_vwlname_train]
    timit_data_test = genfromtxt('timit_data_1280_test.csv', delimiter=',')
    timit_vwlname_test = genfromtxt('timit_vwlname_1280_test.csv', delimiter=',')
    timit_vwlname_test[:] = [x - 1 for x in timit_vwlname_test]

    fs = 16000
    datalen = 1280
    i = 0
    j = 0

    trainfeature = np.zeros((len(timit_data_train), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_train:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            trainfeature[i, :] = mfcc_flat
        elif ind == 1:
            trainfeature[i, :] = fbank_flat
        else:
            trainfeature[i, :] = np.concatenate((mfcc_flat, fbank_flat))
        i = i + 1

    testfeature = np.zeros((len(timit_data_test), (datalen*100/fs - 1)*narr[ind]))
    for x in timit_data_test:
        fbank_flat = logfbank(x, fs).flatten()
        mfcc_flat = mfcc(x, fs).flatten()
        if ind == 0:
            testfeature[j, :] = mfcc_flat
        elif ind == 1:
            testfeature[j, :] = fbank_flat
        else:
            testfeature[j, :] = np.concatenate((mfcc_flat, fbank_flat))
        j = j + 1

    training_data = (list(trainfeature), timit_vwlname_train)
    test_data = (list(testfeature), timit_vwlname_test)

    # train
    clf = svm.SVC()
    clf.fit(training_data[0], training_data[1])
    # test
    predictions = [int(a) for a in clf.predict(test_data[0])]
    num_correct = sum(int(a == y) for a, y in zip(predictions, test_data[1]))
    print "Using svm_baseline classifier:"
    print "%s of %s values correct. %s percent " % (
        num_correct, len(test_data[1]), (num_correct*100)/len(test_data[1]))
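# Illustrative sketch (not from the original project): the same fit/predict/score
# pattern svm_baseline() uses, run on a tiny synthetic feature matrix so it works
# without the TIMIT CSV files. Only numpy and scikit-learn are assumed.
import numpy as np
from sklearn import svm

_rng = np.random.RandomState(0)
_train_x = _rng.rand(40, 91)              # 40 fake utterances, 91-dim features
_train_y = _rng.randint(0, 4, size=40)    # 4 fake vowel classes
_test_x = _rng.rand(10, 91)
_test_y = _rng.randint(0, 4, size=10)

_clf = svm.SVC()
_clf.fit(_train_x, _train_y)
_preds = [int(a) for a in _clf.predict(_test_x)]
_num_correct = sum(int(a == y) for a, y in zip(_preds, _test_y))
print "%s of %s values correct." % (_num_correct, len(_test_y))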
def getFeatures(signal, rate):
    """ Extracts Important Vocal Features
        author: chris
    """
    if signal.shape[0] > mem_cut_off:
        mfcc, fbank = getFeatures(signal[mem_cut_off:], rate)
        # Process only the current chunk here; the recursive call above already
        # covers the remainder of the signal.
        return (np.concatenate((fs.mfcc(signal[:mem_cut_off], rate), mfcc)),
                np.concatenate((fs.logfbank(signal[:mem_cut_off], rate), fbank)))
    else:
        return fs.mfcc(signal, rate), fs.logfbank(signal, rate)
def compute_mfb(filename):
    '''Compute Mel filterbank features on a song and store them in a binary file
    with the Numpy format.

    Argument:
        filename: filename of the wav file located in settings.DIR_SONGS
            (without path, without wav extension)

    Returns: 0 if success

    The output file is located in settings.DIR_MEL_FEATURES.
    '''
    tmax = settings.TMAX
    (rate, sig) = wav.read(settings.DIR_SONGS + filename + '.wav')
    if rate != 44100:
        print 'Warning : the rate is not 44100.'
    nSamples, nChannels = sig.shape
    if nChannels != 2:
        print 'Warning : the number of channels is not 2.'
    if nSamples > rate * tmax:
        sig = sig[:rate * tmax, :]  # take the 2 first minutes (for memory)
    sig = sig.mean(1)
    #mfcc_feat = mfcc(sig,rate)
    fbank_feat = logfbank(sig, rate)
    numpy.save(settings.DIR_MEL_FEATURES + filename + '.npy', fbank_feat)
    numpy.save(settings.DIR_SAMPLE_RATE + filename + '.npy', rate)
    return 0
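# Illustrative sketch (not from the original project): round-tripping a logfbank
# matrix through numpy.save/numpy.load, as compute_mfb() does with its settings.*
# paths. The synthetic noise signal and 'example_mfb.npy' filename are placeholders.
import numpy
from features import logfbank

_rate = 44100
_sig = numpy.random.randn(_rate * 2)     # 2 seconds of noise as a stand-in signal
_feat = logfbank(_sig, _rate)            # (n_frames, 26) with default settings
numpy.save('example_mfb.npy', _feat)
print numpy.load('example_mfb.npy').shape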
def compute_dynamic_selected_features(filename):
    melFeatures = numpy.load(settings.DIR_MEL_FEATURES + filename + '.npy')
    tmax = settings.TMAX
    nPoints, nChannels = melFeatures.shape
    if nChannels != 26:
        print "Warning : 26 channels expected"
    nChannelsPerChannel = 13
    #timeSize = stft.stft_time_size(melFeatures[:,0], settings.FFT_SIZE, settings.OVERLAP)
    #dynamic_selected_features = numpy.zeros((timeSize, nChannels / 2 * nChannelsPerChannel))
    dynamic_selected_features = []
    for i in range(nChannels / 2):
        #dynamic_selected_features[:, i*nChannelsPerChannel:(i+1)*nChannelsPerChannel] = logfbank(melFeatures[:,i],100,settings.FFT_SIZE, settings.FFT_SIZE / settings.OVERLAP)[:,:13]
        A = logfbank(melFeatures[:, i], nPoints / tmax, settings.FFT_SIZE,
                     float(settings.FFT_SIZE) / settings.OVERLAP)[:, :13]
        if i == 0:
            dynamic_selected_features = A
        else:
            dynamic_selected_features = numpy.append(dynamic_selected_features, A, axis=1)
    dynamic_selected_features = numpy.transpose(dynamic_selected_features)
    nFeatures, timeSize = dynamic_selected_features.shape
    featureVar = numpy.sqrt(abs(dynamic_selected_features * dynamic_selected_features).mean(1))
    dynamic_selected_features = dynamic_selected_features / numpy.tile(
        featureVar.reshape((nFeatures, 1)), (1, timeSize))
    numpy.save(settings.DIR_SELECTED_FEATURES + filename + '.npy', dynamic_selected_features)
    return dynamic_selected_features
def training():
    '''
    Takes input signal and searches current dataset for hit.
    If hit, then add to correct dataset.
    If miss, asks user for correct input and adds to dataset.
    '''
    print("please speak a word into the microphone")
    record_to_file('training.wav')
    print("done - result written to training.wav")

    (rate, sig) = wav.read("training.wav")
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    recording = fbank_feat[1:3, :]

    testing = check_for_match(recording)
    verify = raw_input("did you say " + testing + " ")
    if verify == 'y':   # compare by value, not identity
        parse_array(recording, testing)
    if verify == 'n':
        correct_word = raw_input("what word did you mean? ")
        print correct_word
        parse_array(recording, correct_word)
def LogFBank(data, samp):
    mfcc_feat = logfbank(data, samp)
    mMin = mfcc_feat.min()
    mMax = mfcc_feat.max()
    mfcc_feat -= mMin
    mfcc_feat *= 255 / mfcc_feat.max()
    outImg = np.array(mfcc_feat, np.uint8)
    return outImg
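# Illustrative sketch (not from the original project): writing the uint8 matrix
# that LogFBank() returns to an image file. Pillow is an assumption here; any
# image library that accepts a 2-D uint8 array would work, and the noise signal
# is only a placeholder input.
import numpy as np
from PIL import Image

_rate = 16000
_sig = np.random.randn(_rate)   # 1 second of noise as a stand-in signal
Image.fromarray(LogFBank(_sig, _rate)).save('logfbank_example.png')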
def generate(self, testsample):
    (rate, audio) = wav.read(testsample.path)
    # grab first channel
    one_channel = _extract_single_channel(audio)
    N = len(audio)
    fbank_feat = logfbank(one_channel, samplerate=rate)  #winlen=1.0
    cols = fbank_feat.shape[0] * fbank_feat.shape[1]
    return fbank_feat.reshape((1, cols))[0]
def fbank_feature_extractor(wav_file_path):
    '''
    Extracts log Mel filterbank features for the wav file
    '''
    # Extracts filterbank features every 10 ms (the default window step).
    (rate, sig) = wav.read(wav_file_path)
    fbank_feat = logfbank(sig, rate)
    return fbank_feat
def analyzeLogBinergy(grain):
    windowSize = int(float(grain["frameCount"]))
    (rate, sig) = wav.read(grain["file"])
    windowedSignal = numpy.multiply(signal.hamming(windowSize), sig)
    energies = logfbank(signal=sig,
                        samplerate=rate,
                        winlen=.020,
                        winstep=.020,
                        nfilt=13,
                        nfft=windowSize)
    return energies.tolist()[0]
def sndFeature(snd, graph=False):
    #normalize rms here
    snd /= float(np.linalg.norm(snd))

    ft_mfcc = mfcc(snd, samplerate=sampFreq, nfilt=26, numcep=13)[0]
    ft_logf = logfbank(snd, sampFreq)[0]
    #print '{}\n*******\n{}'.format(ft_mfcc, ft_logf)
    #raw_input()
    ft = np.hstack((ft_mfcc, ft_logf))
    #print ft
    #raw_input()
    return ft
def get_track_features(track_name):
    (rate, sig) = wav.read(track_name)
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    num_segments = len(mfcc_feat)
    num_features = len(mfcc_feat[0])
    features_mean = _get_features_mean(mfcc_feat)
    cov_mat = _get_covariance_matrix(mfcc_feat)
    return (features_mean, cov_mat)
def frequency_banks(self, blockSize=600):
    if self.signal is None:
        self.read_recording()
    fbanks = numpy.zeros((0, 1, 26))
    start = 0
    while start < len(self.signal):
        end = start + blockSize * self.samplerate
        end = end if end < len(self.signal) else len(self.signal)
        block = self.signal[start:end]
        fbank = logfbank(block, self.samplerate, winlen=0.05, winstep=0.025)
        fbanks = numpy.concatenate((fbanks, numpy.reshape(fbank, (len(fbank), 1, 26))))
        start = end
    return fbanks
def make_dataset():
    files = sorted(os.listdir(file_path))
    data_mfcc, data_lmfb, target = [], [], []
    for file_name in files:
        target += [int(x) for x in file_name[:-4].split('_')]
        segments = segmentation(file_name, play=False, display=False)
        for segment in segments:
            data_mfcc.append(mfcc(segment, samplerate=Fs))
            data_lmfb.append(logfbank(segment, samplerate=Fs))
    f = open('dataset.pkl', 'wb')  # binary write mode for pickle
    pickle.dump({'data_mfcc': np.array(data_mfcc),
                 'data_lmfb': np.array(data_lmfb),
                 'target': np.array(target)}, f)
    f.close()
    return {'data_mfcc': np.array(data_mfcc),
            'data_lmfb': np.array(data_lmfb),
            'target': np.array(target)}
def compute_fbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=39, nfft=512,
                  lowfreq=0, highfreq=None, preemph=0.97, include_energy=True,
                  snip_edges=True):
    if snip_edges:
        #snip the edges
        sig = snip(sig, rate, winlen, winstep)

    #compute fbank features and energy
    (feat, energy) = logfbank(sig, rate, winlen, winstep, nfilt, nfft, lowfreq,
                              highfreq, preemph)

    if include_energy:
        #append the energy
        fbank_feat = np.ndarray(shape=(feat.shape[0], feat.shape[1] + 1))
        fbank_feat[:, 0:feat.shape[1]] = feat
        fbank_feat[:, feat.shape[1]] = energy
    else:
        fbank_feat = feat

    return fbank_feat
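# Illustrative sketch (not from the original project): the same "append the frame
# energy as an extra column" idea, written against the python_speech_features-style
# fbank(), which returns (filterbank features, per-frame energies). Whether the
# logfbank used by compute_fbank() above also returns energies depends on the
# surrounding toolkit, so this only shows the column-append step on placeholder data.
import numpy as np
from features import fbank

_rate = 16000
_sig = np.random.randn(_rate)                        # 1 second of noise
_feat, _energy = fbank(_sig, _rate)                  # feat: (frames, 26), energy: (frames,)
_with_energy = np.hstack((np.log(_feat), _energy[:, None]))
print _with_energy.shape                             # (frames, 27)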
def build_codebook(trgfile, codesize=32, fname=None):
    # given a training file constructs the codebook using kmeans
    (rate, sig) = wav.read(trgfile)
    print rate, sig.shape

    #get the spectral vectors
    print("MFCC generation begins")
    mfcc_feat = mfcc(sig, rate)
    print("MFCC generation ends")
    print mfcc_feat.shape

    print("Fbank creation begins")
    fbank_feat = logfbank(sig, rate)  #this has the spectral vectors now
    print("Fbank creation ends")
    print fbank_feat.shape

    print "codesize = ", codesize
    km = KMeans(n_clusters=codesize)
    km.fit(fbank_feat)
    if fname != None:
        pickle.dump(km, open(fname, 'wb'))
    return km
def short_to_mfcc(signal):
    global sampling
    mfcc_features = mfcc(signal, samplerate=sampling, winlen=0.025, winstep=0.01,
                         numcep=13, nfilt=26, nfft=512, lowfreq=0, highfreq=1000,
                         preemph=0.97, ceplifter=22, appendEnergy=True)
    fbank_features = logfbank(signal, sampling)
    #print(fbank_features[1:3,:])
    #return fbank_features[1:3,:]
    return fbank_features[1:2, :]
def vector_quantize(myfiles, outdir, model):
    #given a list of files transform them to spectral vectors and compute the KMeans VQ
    for f in myfiles:
        print "Quantizing: ", f
        (rate, sig) = wav.read(f)
        print rate, sig.shape
        #get the spectral vectors
        mfcc_feat = mfcc(sig, rate)
        print mfcc_feat.shape
        fbank_feat = logfbank(sig, rate)  #this has the spectral vectors now
        print fbank_feat.shape
        val = model.predict(fbank_feat)
        fcomps = os.path.split(f)  #file components path, filename
        fn = fcomps[-1].split('.')[0] + '_vq.txt'
        #outpath = os.path.join(fcomps[0], 'outputs')
        fn = os.path.join(outdir, fn)
        f = open(fn, 'wb')
        for v in val:
            f.write(str(v) + '\n')
        f.close()
        print 'output vector quantized file: ', f, ' written'
    return
def logfbank_feature(sig, rate):
    '''
    this function is used to change the logfbank feature of every frame into statistic values
    output features including:
        1. average of 26 features
        2. maximum ...
        3. minimum ...
        4. variance ...
    INPUT:
        logfbank_feat (FRAMENUM, 26)
    OUTPUT:
        ave_logfbank (26, )
        max_logfbank (26, )
        min_logfbank (26, )
        var_logfbank (26, )
    '''
    logfbank_feat = logfbank(sig, rate)
    ave_logfbank = np.mean(logfbank_feat, axis=0)
    max_logfbank = np.max(logfbank_feat, axis=0)
    min_logfbank = np.min(logfbank_feat, axis=0)
    var_logfbank = np.var(logfbank_feat, axis=0)
    return [ave_logfbank, max_logfbank, min_logfbank, var_logfbank]
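# Illustrative usage sketch for logfbank_feature() above (not from the original
# project): stacking the four per-channel statistics into one fixed-length vector.
# It assumes numpy and logfbank are already imported in this module; the 16 kHz
# noise signal is a placeholder input.
import numpy as np

_rate = 16000
_sig = np.random.randn(_rate)
_stats = logfbank_feature(_sig, _rate)   # [mean, max, min, var], each of shape (26,)
_vector = np.concatenate(_stats)         # shape (104,)
print _vector.shape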
def _speechFeatures():
    filename = sorted(glob.glob(outputDir + '/*.' + audioTargetFormat))[2]
    (rate, sig) = wav.read(filename)
    sig = sig[0:(rate * 10)]
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    print(fbank_feat[1:3, :])
    print(fbank_feat.shape)
    print(mfcc_feat.shape)

    fileoutName = filename.replace('.' + audioTargetFormat, '.png')
    fileoutName = 'test.png'
    print(fileoutName)

    fig = plt.figure(figsize=(12, 4))
    ax = fig.add_subplot(211)
    ax.contourf(np.transpose(mfcc_feat))
    plt.tight_layout()

    ax = fig.add_subplot(212)
    mfcc_sum = np.sum(np.transpose(np.sqrt(mfcc_feat * mfcc_feat)), 0)
    n = 6
    mfcc_sum_ref = mfcc_sum.copy()  # keep an unsmoothed copy (a plain slice of a numpy array is a view)
    for i in range(len(mfcc_sum_ref)):
        minidx = max(0, i - int(n / 2))
        maxidx = min(len(mfcc_sum_ref), i + (n - int(n / 2)))
        mfcc_sum[i] = np.sum(mfcc_sum_ref[minidx:maxidx]) / (maxidx - minidx)
    ax.plot(mfcc_sum)
    #ax.set_yscale('log')
    plt.tight_layout()
    plt.savefig(fileoutName, format='png', dpi=300)
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import pickle
from sklearn.cross_validation import train_test_split
from sklearn.svm import SVC
import numpy as np
from sklearn import metrics
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

X = []
y = []

# Feature extraction using mfcc features
a = "C:\\Project\\speech\\corpus\\"
for num in range(1, 6):
    for ite in range(1, 11):
        (rate, sig) = wav.read(a + str(num) + "\\" + str(ite) + ".wav")
        mfcc_feat = mfcc(sig, rate)
        print(len(logfbank(sig, rate).flatten()))
        X.append(logfbank(sig, rate)[:10000, :].flatten())
        print(num, ite)
        #y.append(num)

pickle.dump(X, open("XX.pkl", "wb"))
#pickle.dump( y, open( "y.pkl", "wb" ))
# print(fbank_feat[:500,:].flatten())
def get_data(path_to_audio, path_to_labels, delimiter_char, nb_features=13):
    (rate, sig) = wav.read(path_to_audio)
    target = np.genfromtxt(path_to_labels, dtype=long, delimiter=delimiter_char)
    labels = target[:, 1]

    if 1:  # Change Window Size
        window_size = 0.1  # Default window size is milliseconds
        if 1:
            mfcc_feat = mfcc(sig, rate, window_size, 0.1, nb_features)
        if 0:
            mfcc_feat = mfcc(sig, rate, window_size, 0.1, nb_features/2)
            fbank_feat = logfbank(sig, rate, window_size, 0.1, nb_features/2)
            #ssc_feat = ssc(sig, rate, window_size, 0.1, nb_features/2)
            temp = np.empty([mfcc_feat.shape[0], nb_features])
            for i in range(len(mfcc_feat)):
                temp[i, :] = np.append(mfcc_feat[i], fbank_feat[i])  # store the concatenated row
            mfcc_feat = temp

    if 0:  # Change Window Size and step size when aggregation is done on labels
        window_size = 0.1
        window_step = 1
        mfcc_feat = mfcc(sig, rate, window_size, window_step, nb_features)

    if 1:  # Normalize features
        print "Normalizing Features"
        for col in range(nb_features):
            min_col = np.amin(mfcc_feat[:, col])
            max_col = np.amax(mfcc_feat[:, col])
            range_col = max_col - min_col
            mfcc_feat[:, col] = (mfcc_feat[:, col] - min_col) / range_col

    if 1:  # Low pass features
        print "Low Pass Filtering features"
        convolute_size = 4
        count = mfcc_feat.shape[0]
        new_feat = np.empty([count, nb_features])
        for i in range(count):
            if (i < convolute_size) or (i > count - 1 - convolute_size):
                new_feat[i, :] = mfcc_feat[i, :]
            else:
                row_ = mfcc_feat[i, :]
                for row_dex in range(1, 1 + convolute_size):
                    row_ = row_ + mfcc_feat[i + row_dex, :]
                    row_ = row_ + mfcc_feat[i - row_dex, :]
                new_feat[i, :] = row_ / (convolute_size * 2 + 1)
        mfcc_feat = new_feat

    if 0:  # Aggregating labels by block
        print "Aggregation of labels on", window_step, "sec"
        count = labels.shape[0]
        aggregate_size = int(window_step * 10)
        size_labels = ceil(float(count) / aggregate_size)
        modified_size = min(mfcc_feat.shape[0], size_labels)
        mfcc_feat = mfcc_feat[0:modified_size, :]
        new_labels = np.empty([modified_size])
        new_index = 0
        for i in range(0, int((modified_size - 1) * aggregate_size + 1), aggregate_size):
            (new_labels[new_index], count_) = stats.mode(labels[i:i + aggregate_size])
            new_index += 1
        labels = new_labels

    if 0:  # Low pass labels
        print "Low Pass Filtering labels"
        convolute_size = 4
        count = labels.shape[0]
        new_labels = np.empty([count])
        for i in range(count):
            if (i < convolute_size) or (i > count - 1 - convolute_size):
                new_labels[i] = labels[i]
            else:
                row_ = labels[i]
                for row_dex in range(1, 1 + convolute_size):
                    row_ = row_ + labels[i + row_dex]
                    row_ = row_ + labels[i - row_dex]
                new_labels[i] = row_ / (convolute_size * 2 + 1)
        labels = new_labels

    if 1:  # get rid of background points = class 5
        print "Removing speaking parts"
        all_sound_count = 0
        non_verbal_count = 0
        for row in labels:
            if row != 5:
                non_verbal_count += 1
            all_sound_count += 1
        new_feat = np.empty([non_verbal_count, nb_features])
        new_target = np.empty([non_verbal_count, 2])
        new_labels = np.empty([non_verbal_count])
        count = 0
        dex = 0
        for row in labels:
            if row != 5:
                new_target[count, :] = target[dex, :]
                new_feat[count, :] = mfcc_feat[dex, :]
                new_labels[count] = labels[dex]
                count += 1
            dex += 1
        mfcc_feat = new_feat
        labels = new_labels

    labels = labels.astype('int')
    n_classes = np.unique(labels)
    return [mfcc_feat, labels, n_classes]
def main(setname):
    print " Extract SIG features for ", setname, " ..."

    if setname == "train":
        waveChannels = config['train_waveChannels'].strip().split()
        transcChannels = config['train_transcChannels_ctm'].strip().split()
    elif setname == "test":
        waveChannels = config['test_waveChannels'].strip().split()
        transcChannels = config['test_transcChannels_ctm'].strip().split()
    else:
        print "Error!!! Define the set name train or test\n"
        return

    CHANNELS = int(config['CHANNELS'])
    for ch in range(CHANNELS):
        print " Extract SIG features for CHANNEL ", str(ch + 1)

        # load the wave files for each recording channel
        wavfiles = "data/lists/" + setname + "_CH_" + str(ch + 1) + "_wav.list"
        if not waveChannels[ch] or not transcChannels[ch]:
            print "ERROR!!! SIG features need .CTM format of transcriptions"
            return
        wavfiles = waveChannels[ch]
        ctmfile = transcChannels[ch]

        # save the SIG features into this file
        sig_feat_file = config['BASEDIR'] + "/data/features/" + setname + "_CH_" + str(ch + 1) + "_SIG.feat"

        ctm_array = load_ctm(ctmfile)
        ind = -1
        feat_matrix = np.array([])
        wav_doc = open(wavfiles, 'r')

        # for each wave file, compute the frame mfcc/energy. then assign the frames to the recognized words
        for wavfile in wav_doc:
            ind += 1
            (rate, sig) = wav.read(wavfile.strip())
            mfcc_feat = mfcc(sig, rate, winlen=0.02, winstep=0.01, numcep=12)
            fbank_feat = logfbank(sig, rate, winlen=0.02, winstep=0.01, nfilt=1)
            w2fr = load_ctm_info(ctm_array[ind], fbank_feat)

            sil_no = 0; sil_e = 0; min_sil_e = 1000; max_sil_e = -1000; sil_dur = 0; std_sil_dur = 0
            wrd_no = 0; wrd_e = 0; min_wrd_e = 1000; max_wrd_e = -1000; wrd_dur = 0; std_wrd_dur = 0

            for elem in w2fr:
                w = elem[0]
                t1 = elem[1]
                t2 = elem[2]
                e = elem[3]
                if w == "@bg":  # if it's noise
                    sil_no += 1
                    sil_e += e
                    sil_dur += t2 - t1 + 1
                    if e < min_sil_e:
                        min_sil_e = e
                    elif e > max_sil_e:
                        max_sil_e = e
                else:  # if it's word
                    wrd_no += 1
                    wrd_e += e
                    wrd_dur += t2 - t1 + 1
                    if e < min_wrd_e:
                        min_wrd_e = e
                    elif e > max_wrd_e:
                        max_wrd_e = e

            # compute the following Features
            feat_vector = np.array([])
            feat_vector = np.append(feat_vector, mfcc_feat.shape[0] / 100)   # total seg duration
            feat_vector = np.append(feat_vector, mfcc_feat.mean(axis=0))     # mean of mfcc
            feat_vector = np.append(feat_vector, fbank_feat.mean(axis=0))    # mean of energy
            feat_vector = np.append(feat_vector, fbank_feat.min(axis=0))     # min of energy
            feat_vector = np.append(feat_vector, fbank_feat.max(axis=0))     # max of energy
            feat_vector = np.append(feat_vector, (sil_e / sil_no))           # mean noise energy
            feat_vector = np.append(feat_vector, min_sil_e)                  # min of noise energy
            feat_vector = np.append(feat_vector, max_sil_e)                  # max of noise energy
            feat_vector = np.append(feat_vector, (wrd_e / wrd_no))           # mean of word energies
            feat_vector = np.append(feat_vector, min_wrd_e)                  # min of word energies
            feat_vector = np.append(feat_vector, max_wrd_e)                  # max of word energies
            feat_vector = np.append(feat_vector, (wrd_e / wrd_no) / (sil_e / sil_no))  # Signal to Noise ratio
            feat_vector = np.append(feat_vector, max_wrd_e - min_sil_e)      # max word energy - min noise energy
            feat_vector = np.append(feat_vector, sil_no)                     # number of silences
            feat_vector = np.append(feat_vector, sil_no / wrd_no)            # silence to word count ratio
            feat_vector = np.append(feat_vector, wrd_no / wrd_dur)           # number of words per second (frame)
            if sil_dur == 0:
                feat_vector = np.append(feat_vector, 0)
            else:
                feat_vector = np.append(feat_vector, sil_no / sil_dur)       # number of silences per second (frame)
            feat_vector = np.append(feat_vector, wrd_dur)                    # total words duration
            feat_vector = np.append(feat_vector, sil_dur)                    # total silence duration
            feat_vector = np.append(feat_vector, wrd_dur / wrd_no)           # mean of words duration
            feat_vector = np.append(feat_vector, sil_dur / sil_no)           # mean of silence duration
            feat_vector = np.append(feat_vector, sil_dur / wrd_dur)          # silence to word duration ratio
            feat_vector = np.append(feat_vector, wrd_dur - sil_dur)          # word duration - silence duration

            for elem in w2fr:
                w = elem[0]
                t1 = elem[1]
                t2 = elem[2]
                if w == "@bg":  # if it's noise
                    std_sil_dur += math.pow((t2 - t1 + 1) - (sil_dur / sil_no), 2)
                else:  # if it's word
                    std_wrd_dur += math.pow((t2 - t1 + 1) - (wrd_dur / wrd_no), 2)
            feat_vector = np.append(feat_vector, math.sqrt(std_wrd_dur) / wrd_no)  # std of the words duration
            feat_vector = np.append(feat_vector, math.sqrt(std_sil_dur) / wrd_no)  # std of the silence duration

            if len(feat_matrix) < 1:
                feat_matrix = feat_vector
            else:
                feat_matrix = np.vstack([feat_matrix, feat_vector])

        np.savetxt(sig_feat_file, feat_matrix, fmt='%.4f')
def extractLogFBank(path):
    os.system(sph2pipe + " -f wav " + path + " tmp.wav")
    (rate, sig) = wav.read("tmp.wav")
    feats = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
    os.remove("tmp.wav")
    return feats
def getFBanks(self, waves):
    fbanks = []
    for wave in waves:
        fbanks.append(logfbank(wave[1], wave[0]))
    return fbanks
sampling_freq, signal = wavfile.read('datas/sounds/random_sound.wav')

# Take the first 10,000 samples for analysis
signal = signal[:10000]

# Extract the MFCC features
features_mfcc = mfcc(signal, sampling_freq)

# Print the parameters for MFCC
print('\nMFCC:\nNumber of windows =', features_mfcc.shape[0])
print('Length of each feature =', features_mfcc.shape[1])

# Plot the features
features_mfcc = features_mfcc.T
plt.matshow(features_mfcc)
plt.title('MFCC')

# Extract the Filter Bank features
features_fb = logfbank(signal, sampling_freq)

# Print the parameters for Filter Bank
print('\nFilter bank:\nNumber of windows =', features_fb.shape[0])
print('Length of each feature =', features_fb.shape[1])

# Plot the features
features_fb = features_fb.T
plt.matshow(features_fb)
plt.title('Filter bank')

plt.show()
import math
from scipy.signal import lfilter
from scikits.talkbox import lpc

path = "/home/ponco/devel/mel_cepstral_coeff_neural/vowels/"

# First set up the figure, the axis, and the plot element we want to animate
fig = plt.figure()
ax = plt.axes(xlim=(0, 25), ylim=(-84, 80))
#ax = plt.axes(xlim=(0, 25), ylim=(0, 20))
line, = ax.plot([], [], lw=2)

#MEL
(rate, sig) = wav.read("Ah.wav")
mfcc_feat = mfcc(sig, rate, numcep=30, appendEnergy=False)
fbank_feat = logfbank(sig, rate, nfilt=40)

# initialization function: plot the background of each frame
def init():
    line.set_data([], [])
    return line,

# animation function. This is called sequentially
def animate(i):
    #x = np.linspace(0, 12,13)
    x = np.linspace(0, 25, 26)
    y = mfcc_feat[i, :]
    #y = fbank_feat[i,:]
    #print("x:" , x.shape)
    #print("y:" , y.shape)
def extractLogFBank(rate, sig):
    feats = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
    return feats
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

(rate, sig) = wav.read("audio/s1_an_1.wav")
mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate)
print fbank_feat[1:3, :]
        samples = trim_or_pad(samples, max_len_seconds * fs)
        if len(np.shape(samples)) == 2:
            samples = samples[:, 0]
        norm = sqrt(np.dot(samples, samples))
        print 'appending', f
        sounds.append((fs, np.array(samples) / norm))
    except (ValueError, TypeError):
        print "Couldn't read wav file"

features = []
for fs, s in sounds:
    mfcc_feat = mfcc(s, fs)
    mfcc_feat = np.reshape(mfcc_feat, (1, np.shape(mfcc_feat)[0] * np.shape(mfcc_feat)[1]))
    ssc_feat = ssc(s, fs)
    ssc_feat = np.reshape(ssc_feat, (1, np.shape(ssc_feat)[0] * np.shape(ssc_feat)[1]))
    lfbank_feat = logfbank(s, fs)
    lfbank_feat = np.reshape(lfbank_feat, (1, np.shape(lfbank_feat)[0] * np.shape(lfbank_feat)[1]))
    #import pdb; pdb.set_trace()
    #ceps, mspec, spec = mfcc(s, fs = fs)
    #ceps = np.reshape(ceps, (1, np.shape(ceps)[0] * np.shape(ceps)[1]))
    features.append(np.hstack([mfcc_feat, ssc_feat, lfbank_feat]))
    #features.append(np.hstack([ssc_feat]))

M = np.vstack(features)
print np.shape(M)

pca = PCA(n_components=500)
V = pca.fit_transform(M)
#!/usr/bin/env python
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
from sklearn.metrics import mean_squared_error

(rate, sig) = wav.read("./testfiles/energy.wav")
mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate)

(rate2, sig2) = wav.read("./testfiles/64energyfiltered.wav")
mfcc_feat2 = mfcc(sig2, rate2)
fbank_feat2 = logfbank(sig2, rate2)

print mean_squared_error(mfcc_feat, mfcc_feat2[:46])
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

(rate, data) = wav.read("demo.wav")
mfcc_feat = mfcc(data, rate)
fbank_feat = logfbank(data, rate)
print fbank_feat[1:3, :]
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import os

keyword = ["1", "2"]
print keyword.index("1")
print keyword.index("2")
print keyword.index("3")  # "3" is not in keyword, so this raises ValueError

sph2pipe = "/Users/evgeny/kaldi3/tools/sph2pipe_v2.5/sph2pipe"
path = "/Users/evgeny/timit/TIMIT/TRAIN/DR1/FCJF0/SA1.WAV"
os.system(sph2pipe + " -f wav " + path + " tmp.wav")

window = 0.025
step = 0.01
nfilt = 40
fftsize = 512

(rate, sig) = wav.read("tmp.wav")
os.remove("tmp.wav")
mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate, window, step, nfilt, fftsize, 0, None, 0)
print sig, rate
print fbank_feat[1:3, :]
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import wavfile
from features import mfcc, logfbank

# Read input sound file
sampling_freq, audio = wavfile.read("input_freq.wav")

# Extract MFCC and Filter bank features
mfcc_features = mfcc(audio, sampling_freq)
filterbank_features = logfbank(audio, sampling_freq)

# Print parameters
print('\nMFCC:\nNumber of windows =', mfcc_features.shape[0])
print('Length of each feature =', mfcc_features.shape[1])
print('\nFilter bank:\nNumber of windows =', filterbank_features.shape[0])
print('Length of each feature =', filterbank_features.shape[1])

# Plot the features
mfcc_features = mfcc_features.T
plt.matshow(mfcc_features)
plt.title('MFCC')

filterbank_features = filterbank_features.T
plt.matshow(filterbank_features)
plt.title('Filter bank')

plt.show()
#get wav
(rate, sig) = wav.read("BF4.wav")

#MFCC
mfcc_feat_not_norm = mfcc(sig, rate)
max_mfcc = np.amax(mfcc_feat_not_norm)
mfcc_feat = (1/max_mfcc) * mfcc_feat_not_norm
mfcc_size = len(mfcc_feat[:, 1])  # x dimensions MFCC

#Log Spec
fbank_feat_not_norm = logfbank(sig, rate)
max_log = np.amax(fbank_feat_not_norm)
fbank_feat = (1/max_log) * fbank_feat_not_norm
logSizeX = len(fbank_feat[1, :])  # y dimensions log spec
logSizeY = len(fbank_feat[:, 1])  # x dimensions log spec

'''
#plotting Log Spec
fig = plt.figure(1)
ax = fig.add_subplot(2, 1, 1, projection='3d')
X = np.arange(0, logSizeX, 1)
Y = np.arange(0, logSizeY, 1)
X, Y = np.meshgrid(X, Y)
R = np.sqrt(X**2 + Y**2)
Z = fbank_feat
print Sxx.shape
'''

with open('data/spectrogram_gabriel.pickle', 'rb') as f:
    (X_gab, y_gab) = pickle.load(f)

wav_file = 'data/SA1_RIFF.WAV'
spect_new = spectrogram_converter.spectrogram(wav_file)

(rate, sig) = wav.read(wav_file)
fbe = logfbank(sig, samplerate=rate, winlen=0.025, winstep=0.01, nfilt=26,
               nfft=512, lowfreq=0, highfreq=None, preemph=0.97)
fbe = np.fliplr(zip(*fbe[::-1]))

print 'Duration: %f' % duration(wav_file)
print 'Array Size Spect: %d' % spect_new.shape[2]
print 'Array Size FBE new: %d' % fbe.shape[1]
print 'Array Size FBE old: %d' % X_gab.shape[3]

f, (plt1, plt2, plt3) = plt.subplots(3, 1, sharey=False)
plt1.imshow(spect_new)
plt1.set_title('new spectrogram')
__author__ = 'jasonboyer'
'''
From https://github.com/jameslyons/python_speech_features example.py
'''
import sys
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav

if len(sys.argv) < 2:
    print("Plays a wave file.\n\nUsage: %s filename.wav" % sys.argv[0])
    sys.exit(-1)

(rate, sig) = wav.read(sys.argv[1])
mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate)
print(fbank_feat[1:3, :])
"""features.mfcc() - Mel Frequency Cepstral Coefficients features.fbank() - Filterbank Energies features.logfbank() - Log Filterbank Energies features.ssc() - Spectral Subband Centroids """ from features import mfcc from features import logfbank import scipy.io.wavfile as wav coun = 0 kkk = 7 while (kkk == 7): (rate, sig) = wav.read("blues.0000" + str(kkk) + ".wav") mfcc_feat = mfcc(sig, rate) fbank_feat = logfbank(sig, rate, winlen=0.03, winstep=0.03) #print fbank_feat[0] normalised = [] for i in fbank_feat: sublist = [] for j in i: sublist.append(int(round(j / 22 * 7))) normalised.append(sublist) with open("blue.txt", "a") as myfile: for i in normalised: print i for j in i: myfile.write(str(j)) coun += 1 kkk = kkk + 1 ''' kkk=10
def testmfcc(wavfile="../thecatsatonthemat.wav"):
    (rate, sig) = wav.read(wavfile)
    print "rate %s, len(sig) %s" % (rate, len(sig))
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)
    return fbank_feat
#y = np.array(pickle.load( open( "yy.pkl", "rb" ) ))
for i in range(1, 17):
    for j in range(1, 10):
        if i != predicted:
            y.append(0)
        else:
            y.append(1)
y = np.array(y)

X1_test = []
(rate, sig) = wav.read("s.wav")
mfcc_feat = mfcc(sig, rate)
#print(len(X1))
#print(len(logfbank(sig,rate)[:10000].flatten()))
X1_test = (logfbank(sig, rate).flatten()[:10000])
X1_test = np.array(X1_test)

model = SVC(kernel="linear")
model.fit(X1, y)
ans = model.predict(X1_test)

print("the prediction from speech :")
if (ans == 1):
    print(predicted)
else:
    print("not")

if (ans == 1):
    print("Validated")
else:
    print("not validated")