def mel_dist(x1, x2, fs, num, wlen, inc):
    """Compute the MFCC parameters of two signals and their per-frame distance.

    :param x1: signal 1
    :param x2: signal 2
    :param fs: sample frequency
    :param num: number of MFCC coefficients kept per frame
    :param wlen: frame length
    :param inc: frame shift
    :return Dcep: Euclidean distance between the two coefficient vectors,
        one value per frame of ``x1``
    :return Ccep1, Ccep2: the first ``num`` MFCC columns of each signal
    """
    M = MFCC()
    # The sampling rate is the ``fs`` argument; it must not be read from a
    # module-level ``Fs`` global, which this function does not define.
    ccc1 = M.mfcc(x1, fs, num, wlen, inc)
    ccc2 = M.mfcc(x2, fs, num, wlen, inc)
    fn1 = np.shape(ccc1)[0]  # number of frames of the first signal
    Ccep1 = ccc1[:, 0:num]
    Ccep2 = ccc2[:, 0:num]
    # Row-wise Euclidean norm of the difference; only the first fn1 frames
    # of the second signal take part, as the distance is indexed by x1.
    delta = Ccep1 - Ccep2[:fn1]
    Dcep = np.sqrt(np.sum(delta ** 2, axis=1))
    return Dcep, Ccep1, Ccep2
def DPGMM_test(cov_type, alpha_val):
    """Train one DPGMM per speaker and print the identification accuracy.

    Files are taken from ``<cwd>/speakers/*.wav``. A file whose two
    characters before the extension are ``00`` starts a speaker; the
    recordings numbered 0-8 of that speaker are concatenated for training.
    Files ending in ``09`` form the test set. A speaker counts as correctly
    identified when, among all test files, the best-scoring one under its
    model carries the same speaker tag (characters ``[-10:-6]`` of the path).

    :param cov_type: forwarded as ``covariance_type`` to ``mixture.DPGMM``
    :param alpha_val: forwarded as ``alpha`` to ``mixture.DPGMM``
    """
    # os.path.join keeps the lookup working on non-Windows systems too.
    files = glob.glob(os.path.join(os.getcwd(), 'speakers', '*.wav'))
    gauss_num = 32
    good = 0
    total = 0

    # Test features do not depend on the speaker being trained, so they are
    # extracted once instead of once per speaker model.
    test_set = []
    for test_file in files:
        if test_file[-6:-4] == '09':
            test_features = MFCC.extract(wav.read(test_file)[1])
            test_set.append((test_file[-10:-6], test_features[:, 1:]))

    for file in files:
        if file[-6:-4] != '00':
            continue
        current_speaker = file[-10:-6]
        # Concatenate recordings 0..8; the leading empty float array keeps
        # the merged signal floating point, as np.append would.
        pieces = [np.array([])]
        for i in range(0, 9):
            pieces.append(np.ravel(wav.read(file[:-5] + str(i) + file[-4:])[1]))
        merged_files = np.concatenate(pieces)
        speaker_MFCC = MFCC.extract(merged_files)
        speaker_MFCC = speaker_MFCC[:, 1:]  # drop the first coefficient column
        g = mixture.DPGMM(n_components=gauss_num, n_iter=100,
                          covariance_type=cov_type, alpha=alpha_val)
        g.fit(speaker_MFCC)

        log_prob = -10000
        winner = 'nobody'
        for test_speaker, test_speaker_MFCC in test_set:
            temp_prob = np.mean(g.score(test_speaker_MFCC))
            if temp_prob > log_prob:
                log_prob = temp_prob
                winner = test_speaker
        if winner == current_speaker:
            good += 1
        total += 1

    # Guard against an empty data set and against integer division.
    efficiency = float(good) / total if total else 0.0
    print("DPGMM (covariance_type - ", cov_type, ", alpha - ", str(alpha_val),
          "), Efficiency = ", str(efficiency))
def calculate_cluster_distance(sample, database):
    """Return the mean DTW distance between ``sample`` and a directory.

    The feature vector of ``sample`` is compared, via
    ``DTW.dynamic_time_warping``, with the feature vector of every entry
    listed in the ``database`` directory; the mean of those distances is
    returned.
    """
    query_features = MFCC.voice_feature_extraction(sample)
    per_entry = [
        DTW.dynamic_time_warping(
            query_features,
            MFCC.voice_feature_extraction(os.path.join(database, entry)))
        for entry in os.listdir(database)
    ]
    return np.mean(per_entry)
def super_vector(test_file_name, ubm_file):
    """Return the means of a UBM adapted to one recording.

    The waveform is read with ``mywave.WaveRead``, reduced to the samples
    selected by ``vad`` (run on the squared signal), converted to MFCC
    features and used to adapt the 128-mixture, 12-dimensional GMM loaded
    from ``ubm_file``.
    """
    samples = mywave().WaveRead(test_file_name)
    voiced_idx = vad(samples ** 2)
    voiced = samples[voiced_idx]

    extractor = MFCC(40, 12, 300, 3400, 0.97, 16000, 50, 0.0256, 256)
    coefficients = extractor.sig2s2mfc(voiced)

    model = GMM(n_mix=128, n_dim=12)
    model.read(ubm_file)
    model.adapt(coefficients)
    return model.means
def super_vector(test_file_name, ubm_file):
    """Adapt the UBM stored in ``ubm_file`` to a recording; return its means.

    Steps: read the waveform, keep only the samples that ``vad`` selects
    from the squared signal, extract MFCC features, load the GMM, adapt it
    to the features.
    """
    reader = mywave()
    signal = reader.WaveRead(test_file_name)
    signal = signal[vad(signal ** 2)]

    features = MFCC(
        40, 12, 300, 3400, 0.97, 16000, 50, 0.0256, 256).sig2s2mfc(signal)

    adapted = GMM(n_mix=128, n_dim=12)
    adapted.read(ubm_file)
    adapted.adapt(features)
    return adapted.means
def mix_feature(tup):
    """Return the MFCC and LPC features of ``tup`` joined column-wise.

    :param tup: passed unchanged to ``MFCC.extract`` and ``LPC.extract``;
        ``tup[1]`` is only used for its length in the diagnostic message
        (presumably the signal of a ``(rate, signal)`` pair - confirm).
    :return: ``np.concatenate((mfcc, lpc), axis=1)``
    """
    mfcc = MFCC.extract(tup)
    lpc = LPC.extract(tup)
    if len(mfcc) == 0:
        # Written directly to stderr so the message works on Python 2 and 3.
        sys.stderr.write(
            "ERROR.. failed to extract mfcc feature: %s\n" % len(tup[1]))
    return np.concatenate((mfcc, lpc), axis=1)
def main():
    """Run the interactive record / identify command loop.

    Commands read from standard input:

    * ``record`` - record 7 seconds of mono audio at 16 kHz.
    * ``who``    - play back the recording, keep its first 2 seconds and
      print the user id predicted by ``ml.predictWAV``.
    * ``exit``   - leave the loop.

    :return: 0 after ``exit``
    """
    # Parse the weights file once; both matrices live in the same .mat file.
    weights = loadmat('ml.mat')
    theta1 = weights['theta1']
    theta2 = weights['theta2']

    signal = []
    sample_rate = 16000
    while True:
        cmd = input("Digite um comando")
        print("CMDZAO = " + str(cmd))
        if cmd == "record":
            seconds = 7
            print("recording...")
            signal = sd.rec(int(seconds * sample_rate),
                            samplerate=sample_rate, channels=1)
            sd.wait()  # block until the recording is complete
        elif cmd == "who":
            if not len(signal):
                print("no signal")
                continue
            sd.play(signal, sample_rate)
            # Only the first two seconds are used for identification.
            signal = signal[0:int(2 * sample_rate)]
            mfcc = MFCC.main(signal, sample_rate)
            mlres = ml.predictWAV(theta1, theta2, mfcc)
            print("user id: {}".format(mlres[0]))
        elif cmd == "exit":
            break
        else:
            print("not found.")
    return 0
def Classify(self, sample, verbose=True):
    """Return the gesture whose templates are closest to ``sample``.

    The sample (a buffer of 16-bit integers) is converted to MFCC frames
    and compared frame by frame, with the city-block distance, against every
    stored template in ``self.params``. A gesture's score is the smallest
    normalised distance over its templates.

    :param sample: raw buffer interpreted as ``numpy.int16``
    :param verbose: print per-gesture scores and the winner
    :return: the best gesture name when its score is below ``THRESHOLD``,
        otherwise ``None`` (also when no template could be scored)
    """
    length = len(sample)
    features = MFCC.extract(numpy.frombuffer(sample, numpy.int16))
    gestures = {}
    lowest = None
    minimum = None
    for gesture in self.params:
        d = []
        for tsample in self.params[gesture]:
            smpl_length = len(tsample)
            if numpy.abs(length - smpl_length) <= 0:
                continue
            frames = min(len(features), len(tsample))
            if frames < 2:
                # With fewer than two frames the normaliser below is zero.
                continue
            total_distance = 0
            for i in range(frames):
                total_distance += dist.cityblock(features[i], tsample[i])
            # NOTE(review): the normaliser is the last frame index
            # (frames - 1), not the frame count; kept because THRESHOLD is
            # presumably tuned against these values - confirm.
            d.append(total_distance / float(frames - 1))
        if not d:
            # No comparable template for this gesture.
            continue
        score = numpy.min(d)
        gestures[gesture] = score
        if verbose:
            print("Gesture %s: %f" % (gesture, score))
        if minimum is None or score < minimum:
            minimum = score
            lowest = gesture
    if lowest is None:
        return None
    if verbose:
        print("%s %s" % (lowest, minimum))
    if minimum < THRESHOLD:
        return lowest
    return None
def test(filename, verbose = False):
    """Extract the MFCC features of a WAV file and run the HMM test on them.

    :param filename: path handed to ``loadWAVfile``
    :param verbose: forwarded to ``HMM_Model.test``
    """
    features = MFCC.extract(loadWAVfile(filename), show=False)
    # Test the HMM
    HMM_Model.test(features, verbose)
def get_label_data(directory):
    """Split the WAV files of ``directory`` into train and test MFCC lists.

    ``TEST_SIZE`` WAV files are drawn at random for the test set; all other
    WAV files form the training set. Non-WAV entries are ignored before
    sampling, so they can no longer take up test slots.

    :param directory: folder holding the recordings of one label
    :return: ``(train_mfcc, test_mfcc)``, two lists of ``MFCC.get_mfcc``
        results
    :raises ValueError: from ``rand.sample`` when the folder holds fewer
        than ``TEST_SIZE`` WAV files
    """
    wav_files = [f for f in os.listdir(directory) if f.endswith("wav")]

    # Randomized test and train set
    test_files = rand.sample(wav_files, TEST_SIZE)
    held_out = set(test_files)

    test_mfcc = [MFCC.get_mfcc(os.path.join(directory, f))
                 for f in test_files]
    train_mfcc = [MFCC.get_mfcc(os.path.join(directory, f))
                  for f in wav_files if f not in held_out]
    return train_mfcc, test_mfcc
def select_events(nevents, nfeatures):
    """Pick random audio events, describe them by MFCC and cluster them.

    Reads the module-level ``faudio``, ``grain_mid`` and ``tracks`` and
    stores the cluster assignment in the module-level ``groups``.

    :param nevents: number of random event positions to draw
    :param nfeatures: number of random spectral feature bins to draw
    :return: ``[events, groups]`` - event start offsets and the label that
        ``kmeans`` assigned to each event
    """
    global groups
    fftbins = 8192
    featurewidth = 16
    window_len = 1000
    print("Selecting %d random spectral features.." % nfeatures)
    # The bins are not used below, but the draw is kept so the random
    # stream (and therefore the selected events) stays the same under a
    # fixed seed.
    np.random.randint(featurewidth // 2, fftbins // 8, nfeatures)
    print("Selecting %d random audio events.." % nevents)
    events = np.random.randint(0, len(faudio) - grain_mid, nevents)
    # Initialise features array with the first variable as index
    features = np.zeros((14, nevents))
    features[0] = np.arange(0, nevents)
    print("Computing audio event spectrograms..")
    full_window = sig.hann(window_len)
    for i in range(0, nevents):
        segment = faudio[events[i]:min(events[i] + window_len, len(faudio))]
        # A segment cut short by the end of the audio needs a window of its
        # own length; multiplying by the full window would not broadcast.
        if len(segment) == window_len:
            window = full_window
        else:
            window = sig.hann(len(segment))
        _fftevent = segment * window
        mfcc = MFCC.extract(_fftevent)
        features[:, i] = np.append(i, mfcc)
    print("Clustering events with K-Means algorithm..")
    groups = kmeans(np.transpose(features), tracks, minit='points', iter=30)[1]
    return [events, groups]
def get_label_data(label):
    """Return the MFCC features of every WAV file in the ``label`` folder.

    Entries whose name does not end in ``wav`` are skipped; the remaining
    paths are passed to ``MFCC.get_mfcc`` in directory-listing order.
    """
    test_mfcc = []
    for entry in os.listdir(label):
        if not entry.endswith("wav"):
            continue
        test_mfcc.append(MFCC.get_mfcc(os.path.join(label, entry)))
    return test_mfcc
def GenerateParams(gestures, verbose=True):
    """Build the MFCC templates for every gesture.

    :param gestures: mapping of gesture name to an iterable of raw sample
        buffers, each interpreted as ``numpy.int16``
    :param verbose: print the name of each gesture as it is processed
    :return: mapping of gesture name to the list of ``MFCC.extract``
        results, one per sample
    """
    params = {}
    for gesture in gestures:
        if verbose:
            # Single-argument call: prints the same on Python 2 and 3.
            print("Processing " + gesture)
        params[gesture] = [
            MFCC.extract(numpy.frombuffer(sample, numpy.int16))
            for sample in gestures[gesture]
        ]
    return params
def train(filename, id):
    """Train the models for one recording.

    The MFCC features of ``filename`` are used to train a ``VQ.Model``
    created for ``id`` and are then handed to ``create_file`` together with
    ``id`` (the HMM training step, per the original comment).
    """
    features = MFCC.extract(loadWAVfile(filename), show=False)

    # Train the VQ
    vq_model = VQ.Model(id)
    vq_model.train(features)

    # Train the HMM
    create_file(features, id)
def load():
    """Load up to 30 MFCC frames per sample file for each subject.

    For every subject name, the files matching ``Samples/<name> *`` are
    read with ``wread`` and the first 30 rows of their MFCC features are
    collected.

    :return: ``(names, sampledict)`` - the subject names and a mapping of
        name to the list of truncated feature arrays
    """
    names = [
        "Mathematics", "Biology", "PoliticalScience", "Statistics",
        "Psychology"
    ]
    sampledict = {}
    for subject in names:
        pattern = "Samples/" + subject + " *"
        sampledict[subject] = [
            MFCC.extract(wread(path)[1])[:30] for path in glob.glob(pattern)
        ]
    return names, sampledict
def getPerson(self):
    """Identify the speaker of the selected file and show the result window.

    Does nothing except print a message when ``self.file[0]`` is empty.
    Otherwise the file is read, its MFCC features are classified with the
    weights stored in ``ml.mat`` and the prediction is shown in a
    ``WhoAmIWindowClass`` window kept on ``self.whoAmIWindow``.
    """
    if self.file[0] == '':
        print('empty file!')
        return
    sample_rate, signal = read(self.file[0])
    # Both weight matrices live in the same file, so parse it only once.
    weights = loadmat('ml.mat')
    theta1 = weights['theta1']
    theta2 = weights['theta2']
    mfcc = MFCC.main(signal, sample_rate)
    mlres = ml.predictWAV(theta1, theta2, mfcc)
    self.whoAmIWindow = WhoAmIWindowClass(mlres)
    self.whoAmIWindow.setWindowTitle('Result')
    self.whoAmIWindow.show()
def add_to_database(parameters, noteNumber, data, orchestra):
    """Analyse one instrument sample and store the results in ``orchestra``.

    :param parameters: sequence whose items 2, 3 and 4 are the instrument
        name, the playing technique and the dynamics
    :param noteNumber: note number, used as the innermost key and passed to
        the masking-curve calculation
    :param data: audio samples of the note
    :param orchestra: nested dictionary updated through ``nested_update``
    :return: the updated ``orchestra``
    """
    instrument = parameters[2]
    tech = parameters[3]
    dyn = parameters[4]

    # function parameters: dynamics, instrument name, dynamics list
    dyn_db = assignDynamics.assign_dynamics(dyn, instrument, dynamics_list)

    data = cutSample(data)
    # Set sound file level according to the loaded text file
    data = normalize_sound_file.normalize_audio_file(data, dyn_db)

    M = len(data)  # Length of data (should be 44100)
    # NOTE(review): the slice [:M // 2 + 1:-1] walks backwards from the last
    # bin down to bin M // 2 + 2, i.e. it takes the upper half of the FFT in
    # reverse order - confirm this is the intended half.
    spectrum = np.fft.fft(data, axis=0)[:M // 2 + 1:-1]
    S = np.abs(spectrum)  # magnitudes
    S = 20 * np.log10(S)  # dB values of data

    try:
        masking_freq, masking_threshold = maskingCurve.maskingCurve(
            S, noteNumber)
    except Exception:
        # Fall back to a flat curve on calculation errors, while letting
        # KeyboardInterrupt / SystemExit propagate.
        print("Masking calculation fail, using flat masking curve")
        masking_freq = constants.threshold[:, 0]
        masking_threshold = np.ones(106)

    # Calculate mfcc and spectral centroid
    mfcc_data, centroid = MFCC.custom_mfcc(data)
    # Calculate LPC-frequency response
    LpcLocs, LpcFreqs = lpc_coeffs.lpc_coeffs(data)

    # Add everything to database (except fft spectrum):
    nested_update(
        orchestra, {
            instrument: {
                tech: {
                    dyn: {
                        noteNumber: {
                            "data": data,
                            "masking_curve": masking_threshold,
                            "masking_locs": masking_freq,
                            "lpc_curve": LpcFreqs,
                            "lpc_locs": LpcLocs,
                            "mfcc": mfcc_data,
                            "centroid": centroid
                        }
                    }
                }
            }
        })
    return orchestra
def produce_mfcc(self, filename):
    """Return the MFCC features and the label of one WAV file.

    Up to ``self.sz`` frames are read and interpreted as 16-bit integers.
    The label is the first group of ``self.lab_extractor`` matched against
    the file name, or ``"unknown"`` when there is no such match.

    :param filename: path of the WAV file
    :return: ``(mfcc, label)``
    """
    wav = wave.open(filename, "r")
    try:
        raw = wav.readframes(self.sz)
    finally:
        # Always release the file handle, even if reading fails.
        wav.close()
    # frombuffer replaces the deprecated binary mode of np.fromstring; the
    # copy keeps the array writable, as fromstring's result was.
    x = np.frombuffer(raw, dtype=np.int16).copy()
    mfcc = MFCC.extract(x)
    match = self.lab_extractor.match(filename)
    try:
        label = match.group(1)
    except (AttributeError, IndexError):
        # No match (match is None) or the pattern has no first group.
        label = "unknown"
        sys.stderr.write("unknown labels encountered\n")
    return (mfcc, label)
def write_to_csv(location):
    """Append the MFCC rows of every WAV file in ``location`` to the CSV.

    The working directory is changed to ``location``. Each feature row is
    prefixed with a column of ones before being appended to the output file.

    :param location: directory containing the ``*.wav`` files
    """
    print("omer")
    os.chdir(location)
    for file in glob.glob("*.wav"):
        print("omer")
        mfcc = MFCC.extract_mfcc(file)
        # Leading column of ones in front of the features.
        mfcc = np.hstack([np.ones((mfcc.shape[0], 1)), mfcc])
        print(mfcc.shape)
        with open("/home/omer/Desktop/Echo/Machine_Learning/Data_Gunshot.csv",
                  'a') as f_handle:
            np.savetxt(f_handle, mfcc, delimiter=",")
def calculate_within_cluster_distance(filepath):
    """Return the largest pairwise DTW distance inside one directory.

    Feature vectors are extracted for every entry of ``filepath`` and each
    unordered pair is compared with ``DTW.dynamic_time_warping``; the
    maximum of those distances is returned.
    """
    vectors = [
        MFCC.voice_feature_extraction(os.path.join(filepath, entry))
        for entry in os.listdir(filepath)
    ]
    count = len(vectors)
    pair_distances = [
        DTW.dynamic_time_warping(vectors[first], vectors[second])
        for first in range(count)
        for second in range(first + 1, count)
    ]
    return np.max(pair_distances)
def test(signal, fs, feat_list):
    """Return the index of the template closest to ``signal`` and its distance.

    The signal is scaled by ``100 / sum(|signal - mean|)``, converted to
    MFCC features with the module-level ``Frame_Len`` / ``Hop_Len`` and
    compared to each entry of ``feat_list`` with ``DTW.DTW``.

    :param signal: audio samples
    :param fs: sampling rate passed to ``MFCC.MFCC``
    :param feat_list: non-empty sequence of template features
    :return: ``(digit, dis)`` - index of the first template with the
        smallest distance, and that distance
    """
    mean = np.average(signal)
    energy = np.sum(np.abs(signal - mean))
    signal = signal / energy * 100
    mfcc = MFCC.MFCC(signal, fs, Frame_Len, Hop_Len)
    digit = 0
    dis = DTW.DTW(mfcc, feat_list[0])
    # Template 0 is already the running best, so start comparing at 1
    # instead of computing its distance a second time.
    for i in range(1, len(feat_list)):
        c = DTW.DTW(mfcc, feat_list[i])
        if c < dis:
            digit = i
            dis = c
    return digit, dis
def write_to_csv(fold0, fold_n, location):
    """Append MFCC rows of the class-6 WAV files of several folds to the CSV.

    For every fold index ``i`` in ``range(fold0, fold_n)`` the working
    directory is changed to ``location + str(i)``; files whose second
    dash-separated name field is ``'6'`` are processed.

    :param fold0: first fold index (inclusive)
    :param fold_n: last fold index (exclusive)
    :param location: path prefix the fold index is appended to, e.g.
        ``".../UrbanSound8K/audio/fold"``
    """
    for i in range(fold0, fold_n):
        # The fold index is appended as text to the path prefix.
        path = location + str(i)
        os.chdir(path)
        print(path)
        for file in glob.glob("*.wav"):
            if file.split('-')[1] == '6':
                mfcc = MFCC.extract_mfcc(file)
                # Leading column of ones in front of the features.
                mfcc = np.hstack([np.ones((mfcc.shape[0], 1)), mfcc])
                with open(
                        "/home/omer/Desktop/UrbanSound8K/UrbanSound8K/audio/Data_Gunskhdot.csv",
                        'a') as f_handle:
                    np.savetxt(f_handle, mfcc, delimiter=",")
def test_mfcc(self):
    """Check ``MFCC.extract_mfcc`` against stored reference coefficients.

    The comparison is numeric with a small relative tolerance: the
    reference values are printed to nine significant digits, and comparing
    a nested list with a list of arrays element by element is ambiguous.
    """
    import numpy as np

    expected = [[
        -1.58999199e+02, 8.34436590e+00, -4.44382643e+01, -1.05713490e+01,
        -4.14216808e+00, 5.43735320e+00, -6.23641973e+00, 1.13643816e+01,
        1.11168843e+01, 2.09593413e+01, 2.08886976e+01, 1.78893376e+01,
        -1.85126261e+00, 1.98630431e+00, -3.58780406e+00, 1.07466142e+01,
        4.06712767e+00, -3.77452706e+00, -9.57172794e+00, 2.71010408e+00,
        2.28370949e-01, -1.67914367e+00, -2.70335598e+00, 9.36659239e+00,
        -1.06643306e+00, -4.19447993e+00, -1.55310523e+00, 9.63509903e+00,
        -2.36770851e+00, 1.16768921e+00, 1.74342284e+00, -6.92783306e-01,
        -2.74215299e+00, 7.46808225e+00, -3.92998483e+00, -1.10826282e+00,
        2.49712828e+00, -1.59097153e+00, -5.17096235e+00, 3.18161592e+00,
        -4.68084505e+00, 4.28643721e+00, -3.98783991e-01, -4.31620744e+00,
        1.85530792e+00, 1.94520311e+00, -3.32610635e+00, 5.60897361e+00,
        -1.59248264e+00, 3.31523211e+00, 3.20098072e-01, 3.58511203e-01,
        3.37264297e+00, -1.70320401e+00, -1.18435935e-01, 1.40946029e+00,
        -4.82136239e+00, 3.66574126e+00, -1.98897953e+00, 1.42700455e+00]]
    actual = np.asarray(MFCC.extract_mfcc(
        "/home/omer/Desktop/UrbanSound8K/UrbanSound8K/audio/fold1/7061-6-0-0.wav"))
    # Shapes are compared explicitly because assert_allclose broadcasts.
    self.assertEqual(np.shape(expected), actual.shape)
    np.testing.assert_allclose(actual, expected, rtol=1e-6)
def runHMM(file_path):
    """Score one sound file against every class model and pick the best.

    The pickled per-class models (``Models/<label>.pkl``) and the pickled
    k-means quantiser are loaded, the MFCC frames of ``file_path`` are
    mapped to cluster indices and each model scores the resulting sequence.

    :return: ``(evals, conclusion)`` - the score per class name and the
        class name with the highest score
    """
    models = {}
    for class_name in CLASS_LABELS:
        model_path = os.path.join("Models", class_name + ".pkl")
        with open(model_path, "rb") as handle:
            models[class_name] = pk.load(handle)

    with open("Models/kmeans.pkl", "rb") as handle:
        kmeans = pk.load(handle)

    # One column of cluster indices, one row per frame.
    symbols = kmeans.predict(MFCC.get_mfcc(file_path)).reshape(-1, 1)

    evals = {}
    for cname, model in models.items():
        evals[cname] = model.score(symbols, [len(symbols)])

    conclusion = max(evals, key=evals.get)
    return evals, conclusion
def get_mfcc_from_melspec(self, melspec, deltamfcc=True, avelocalframes=True, stdlocalframes=True):
    """Extract MFCC statistics from a mel-spectrogram.

    :param melspec: mel-spectrogram handed to ``get_mfccs_from_melspec``
    :param deltamfcc: stack first-order differences (along axis 1) under
        the coefficients
    :param avelocalframes: post-process with ``self.average_local_frames``
    :param stdlocalframes: forwarded as ``getstd`` to that call
    :return: ``pandas.DataFrame`` built from the transposed feature matrix
    """
    extractor = mfc.MFCCs()
    mfcc = extractor.get_mfccs_from_melspec(melspec=melspec,
                                            melsr=self.framessr)
    if deltamfcc:
        step = np.diff(mfcc, axis=1)
        # np.diff yields one column less; repeat the last difference so the
        # delta block has as many columns as the coefficients.
        delta = np.concatenate((step, step[:, -1, None]), axis=1)
        mfcc = np.concatenate([mfcc, delta], axis=0)
    if avelocalframes:
        mfcc = self.average_local_frames(mfcc, getstd=stdlocalframes)
    # NOTE(review): the DataFrame conversion is taken to run regardless of
    # ``avelocalframes`` - confirm against the original indentation.
    mfcc = pd.DataFrame(mfcc.T)
    return mfcc
def feature_get(input_files_list, feature_save_list):
    """Extract and save segment features for a list of audio files.

    Both arguments are text files with one path per line; line *k* of
    ``input_files_list`` is the audio file whose features are written to
    line *k* of ``feature_save_list``. The running index is printed after
    each file.

    :param input_files_list: file listing the audio paths
    :param feature_save_list: file listing the matching output paths
    """
    # Context managers close the list files even if reading fails.
    with open(input_files_list, 'r') as f:
        input_audio_files = f.readlines()
    with open(feature_save_list, 'r') as f:
        save_files = f.readlines()

    for i, (audio_file, save_file) in enumerate(
            zip(input_audio_files, save_files)):
        audio_file = audio_file.strip()
        save_file = save_file.strip()
        marks = get_segment_energy_marks(audio_file)
        feature1 = mfcc._extractor(audio_file, n_mfcc=13, n_fft=200,
                                   hop_length=80)
        feature2 = pitch_based._extractor(audio_file, window_length=200,
                                          hop_length=80)
        features = combine_feature([feature1, feature2])
        features = get_segment_feature(features, marks)
        save_features(features, save_file)
        print(i)
def add_to_database(url_, person_name_):
    """Record a speaker from a radio stream and store their GMM model.

    The audio returned by ``read_radio_stream`` is saved under ``People``
    as ``<person_name_>.wav``, a 128-component GMM is fitted to its MFCC
    features (first column dropped) and the model is added to the
    dictionary that is written to ``mfcc_32.mat``.

    NOTE(review): existing models are loaded from ``mfcc.mat`` but the
    result is saved to ``mfcc_32.mat`` - confirm the two names are meant
    to differ.

    :param url_: stream URL passed to ``read_radio_stream``
    :param person_name_: key of the new model and base name of the WAV file
    """
    gmm_models = {}
    if os.path.isfile('mfcc.mat'):
        gmm_models = sio.loadmat('mfcc.mat')
    print("Recording and processing...\n\n")
    full_sound_model = read_radio_stream(url_)
    # os.path.join keeps the output inside the People folder on every OS.
    wav.write(os.path.join('People', person_name_ + '.wav'), 11025,
              full_sound_model / 32767.0)
    print("Calculating MFCC and saving the model...")
    mfcc_features = MFCC.extract(full_sound_model)
    mfcc_features = mfcc_features[:, 1:]
    g = mixture.GMM(n_components=128)
    g.fit(mfcc_features)
    # weights have to be repeated to properly save the np array
    model = np.array([g.means_, g.covars_,
                      np.repeat(g.weights_[:, np.newaxis], 12, 1)])
    print(len(g.means_))
    gmm_models[person_name_] = model
    sio.savemat('mfcc_32.mat', gmm_models, oned_as='row')
def main_rnn(config):
    """Build a stacked-LSTM sequence generator and run its training loop.

    Reads ``target_size``, ``lstm_hidden_size``, ``learning_rate``,
    ``batch_size``, ``source_size``, ``num_examples`` and ``num_batches``
    from ``config``. Training data comes from ``MFCC.get_stream``.
    """
    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

    # if 'LSTM' in config['model'] :
    #     from models import getLSTMstack
    #     y_hat = getLSTMstack(input_dim=13, input_var=x, depth=int(config['model'][-1]))
    # else :
    #     raise Exception("These are not the LSTM we are looking for")

    # y_hat = model.apply(x)

    emitter = TestEmitter()
    # emitter = TrivialEmitter(readout_dim=config['lstm_hidden_size'])

    # cost_func = SquaredError()

    # @application
    # def qwe(self, readouts, outputs=None):
    #     print(type(self), type(readouts))
    #     x = cost_func.apply(readouts,outputs)
    #     return x
    print(type(emitter.cost))
    # emitter.cost = qwe
    # print(type(qwe))

    steps = 2
    n_samples= config['target_size']

    # Four LSTM layers of equal width, stacked into one recurrent transition.
    transition = [LSTM(config['lstm_hidden_size']) for _ in range(4)]
    transition = RecurrentStack(transition, name="transition", skip_connections=False)

    # Only the state names containing 'states' feed the readout.
    source_names = [name for name in transition.apply.states if 'states' in name]

    readout = Readout(emitter, readout_dim=config['lstm_hidden_size'], source_names=source_names,feedback_brick=None, merge=None, merge_prototype=None, post_merge=None, merged_dim=None)

    seqgen = SequenceGenerator(readout, transition, attention=None, add_contexts=False)
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()

    # The transition's bias initialisation is overridden after the
    # generator-wide configuration has been pushed.
    seqgen.transition.biases_init = IsotropicGaussian(0.01,1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    states = seqgen.transition.apply.outputs

    print('states',states)
    # Zero-initialised shared variable per transition output, passed as
    # keyword arguments to the cost computation.
    states = {name: shared_floatx_zeros((n_samples, config['lstm_hidden_size']))
              for name in states}

    cost_matrix = seqgen.cost_matrix(x, **states)
    cost = cost_matrix.mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)
    #Cost
    # cost = SquaredError().apply(y_hat ,y)
    #cost = CategoricalCrossEntropy().apply(T.flatten(),Y)
    #

    #for sampling
    #cg = ComputationGraph(seqgen.generate(n_steps=steps,batch_size=n_samples, iterate=True))

    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))

    #Getting the stream
    train_stream = MFCC.get_stream(config['batch_size'],config['source_size'],config['target_size'],config['num_examples'])

    #Monitoring stuff
    extensions = [Timing(),
                  FinishAfter(after_n_batches=config['num_batches']),
                  #DataStreamMonitoring([cost, error_rate],test_stream,prefix="test"),
                  TrainingDataMonitoring([cost], prefix="train", every_n_batches=1),
                  #Checkpoint(save_to),
                  ProgressBar(),
                  Printing(every_n_batches=1)]

    main_loop = MainLoop(
        algorithm,
        train_stream,
        # model=model,
        extensions=extensions)

    main_loop.run()
def main():
    """Train the speaker-identification network and save it to ``ml.mat``.

    For each of the first five folders under ``wav``, the first 60 % of the
    WAV files provide training rows and the rest provide test rows; every
    row of a file's MFCC output becomes one example labelled with the
    folder index. The network is optimised with ``opt.fmin_cg``, the test
    accuracy (in percent) is printed and both weight matrices are saved.
    """
    speaker_dirs = os.listdir("wav")
    X, y = [], []
    Xtest, ytest = [], []
    nspeakers = 5

    # feature extraction
    for speaker_id in range(nspeakers):
        speaker_dir = speaker_dirs[speaker_id]
        wav_paths = list(
            glob.glob("wav/" + speaker_dir + "/" + "**/*.wav", recursive=True))
        split = int(len(wav_paths) * 0.6)
        for position, path in enumerate(wav_paths):
            sample_rate, signal = wav.read(path)
            mfcc = MFCC.main(signal, sample_rate)
            # Files before the split index train the model, the rest test it.
            if position < split:
                rows, labels = X, y
            else:
                rows, labels = Xtest, ytest
            for row_idx in range(len(mfcc)):
                rows.append([mfcc[row_idx][col]
                             for col in range(len(mfcc[row_idx]))])
                labels.append(speaker_id)

    y = np.array(y)
    X = np.array(X)
    ytest = np.array(ytest)
    Xtest = np.array(Xtest)

    input_layer_size = 390
    hidden_layer_size = 200
    num_labels = nspeakers
    lmbda = 1

    initial_theta1 = ml.randInitializeWeights(input_layer_size,
                                              hidden_layer_size)
    initial_theta2 = ml.randInitializeWeights(hidden_layer_size, num_labels)
    # Both matrices are unrolled column-major into one parameter vector.
    nn_initial_params = np.hstack(
        (initial_theta1.ravel(order='F'), initial_theta2.ravel(order='F')))
    print(
        ml.nnCostFunc(nn_initial_params, input_layer_size, hidden_layer_size,
                      num_labels, X, y, lmbda))

    theta_opt = opt.fmin_cg(maxiter=50,
                            f=ml.nnCostFunc,
                            x0=nn_initial_params,
                            fprime=ml.nnGrad,
                            args=(input_layer_size, hidden_layer_size,
                                  num_labels, X, y.flatten(), lmbda))

    # Number of entries belonging to the first weight matrix (with bias).
    first_block = hidden_layer_size * (input_layer_size + 1)
    theta1_opt = np.reshape(theta_opt[:first_block],
                            (hidden_layer_size, input_layer_size + 1), 'F')
    theta2_opt = np.reshape(theta_opt[first_block:],
                            (num_labels, hidden_layer_size + 1), 'F')

    pred = ml.predict(theta1_opt, theta2_opt, Xtest, ytest)
    print(np.mean(pred == ytest.flatten()) * 100)

    savemat('ml.mat', {'theta1': theta1_opt, 'theta2': theta2_opt})
# python -i <name of this .py file>
import numpy as np
from scikits.audiolab import Sndfile

SOUND_DIRECTORY = 'small_data_sample/right_whale'
test_file = '%s/train12.aiff' % SOUND_DIRECTORY
f = Sndfile(test_file, 'r')

# Sndfile instances can be queried for the audio file meta-data
fs = f.samplerate
nc = f.channels
enc = f.encoding

# Reading is straightforward
data = f.read_frames(1000)

# This reads the next 1000 frames, e.g. from 1000 to 2000, but as single precision
data_float = f.read_frames(1000, dtype=np.float32)
# Single-argument call: prints the same on Python 2 and 3.
print(data_float.shape)

import MFCC

# data_float is a wave signal saved in a 1-D numpy array
# mfcc is a 2-D numpy array, where each row is the
# MFCC of a frame in data_float
mfcc = MFCC.extract(data_float, show = True)
# This will also plot the MFCC and the spectrogram
# reconstructed from MFCC by inverse DCT
def collect(n=20):
    """Record ``n`` three-second clips and return their MFCC features.

    Each clip is recorded by ``arecord`` into ``test.wav`` (overwritten on
    every iteration), read back and passed to ``MFCC.extract``.

    :param n: number of clips to record
    :return: list with one ``MFCC.extract`` result per clip
    """
    obs = []
    # range works on Python 2 and 3; xrange exists only on Python 2.
    for _ in range(n):
        os.system("arecord -f S16_LE --rate=44100 -D hw:1,0 -d 3 test.wav")
        obs.append(MFCC.extract(wavfile.read("test.wav")[1]))
    return obs
def mfcc(self, m, NumFilters=48):
    """Return the m-th Mel-frequency cepstral coefficient.

    Delegates to ``MFCC.mfcc``, passing this object, ``m`` and
    ``NumFilters`` positionally.
    """
    coefficient = MFCC.mfcc(self, m, NumFilters)
    return coefficient
def GMM_test(ii):
    """Build (or load) one GMM per speaker and print identification accuracy.

    Speaker models are loaded from ``mfcc_32.mat`` when that file exists.
    Otherwise they are trained from ``<cwd>/speakers/*.wav`` (recordings
    0-8 of every speaker whose first file ends in ``00``) and saved there.
    Every file ending in ``09`` is then scored against all models; the
    best-scoring model names the predicted speaker.

    :param ii: number of EM iterations (``n_iter``) for ``mixture.GMM``
    """
    speaker_GMM_dict = {}
    # os.path.join keeps the lookup working on non-Windows systems too.
    files = glob.glob(os.path.join(os.getcwd(), 'speakers', '*.wav'))
    gauss_num = 32
    iterator = 1
    num_iter = ii
    model_file = 'mfcc_' + str(gauss_num) + '.mat'

    if os.path.isfile(model_file):
        speaker_GMM_dict = sio.loadmat(model_file)
        # Drop the bookkeeping entries of the .mat file; tolerate their
        # absence.
        for meta_key in ('__header__', '__version__', '__globals__'):
            speaker_GMM_dict.pop(meta_key, None)
    else:
        for file in files:
            if file[-6:-4] != '00':
                continue
            current_speaker = file[-10:-6]
            print("############# Calculate MFCC and GMM for ", current_speaker,
                  " , speaker no ", str(iterator))
            iterator += 1
            # Concatenate recordings 0..8; the leading empty float array
            # keeps the merged signal floating point, as np.append would.
            pieces = [np.array([])]
            for i in range(0, 9):
                pieces.append(
                    np.ravel(wav.read(file[:-5] + str(i) + file[-4:])[1]))
            merged_files = np.concatenate(pieces)
            speaker_MFCC = MFCC.extract(merged_files)
            speaker_MFCC = speaker_MFCC[:, 1:]  # drop the first column
            g = mixture.GMM(n_components=gauss_num, n_iter=num_iter)
            g.fit(speaker_MFCC)
            # Weights are repeated to 12 columns so the three parts stack
            # into one array that can be saved.
            speaker_model = np.array([
                g.means_, g.covars_,
                np.repeat(g.weights_[:, np.newaxis], 12, 1)])
            speaker_GMM_dict[current_speaker] = speaker_model
        sio.savemat(model_file, speaker_GMM_dict, oned_as='row')

    iterator = 1
    good = 0
    bad = 0
    total = 0
    for file in files:
        if file[-6:-4] != '09':
            continue
        g = mixture.GMM(n_components=gauss_num, n_iter=num_iter)
        current_file = wav.read(file)
        current_speaker = file[-10:-6]
        speaker_MFCC = MFCC.extract(current_file[1])
        speaker_MFCC = speaker_MFCC[:, 1:]
        log_prob = -10000
        winner = 'nobody'
        for key, values in speaker_GMM_dict.items():
            try:
                g.means_ = values[0, :, :]
                g.covars_ = values[1, :, :]
                g.weights_ = values[2, :, 1]
                temp_prob = np.mean(g.score(speaker_MFCC))
                if temp_prob > log_prob:
                    log_prob = temp_prob
                    winner = key
            except TypeError:
                print('error for ', key)
        if current_speaker == winner:
            good += 1
        else:
            bad += 1
        total += 1
        print(current_speaker, " speaker no ", str(iterator),
              " is similar to ", winner, " - log prob = ", str(log_prob))
        print("good = ", str(good), ", bad = ", str(bad), ", total = ",
              str(total))
        iterator += 1

    # Guard against an empty test set and against integer division.
    efficiency = float(good) / total if total else 0.0
    print("GMM, n_iter = ", num_iter, ", Efficiency = ", str(efficiency))
def get_mfcc_worker(fpath):
    """Return at most the first 1500 MFCC rows of the WAV file ``fpath``.

    The path is printed before the file is read.
    """
    print('mfcc: ' + fpath)
    rate, samples = wavfile.read(fpath)
    features = MFCC.extract(rate, samples)
    return features[:1500]
def template(signal, fs):
    """Return the MFCC features of the scaled signal.

    The signal is divided by the sum of absolute deviations from its mean
    and multiplied by 100 before ``MFCC.MFCC`` is applied with the
    module-level ``Frame_Len`` and ``Hop_Len``.
    """
    deviation_sum = np.sum(np.abs(signal - np.average(signal)))
    scaled = signal / deviation_sum * 100
    return MFCC.MFCC(scaled, fs, Frame_Len, Hop_Len)
import MFCC
import os

# One folder per speaker, located next to this script.
name_list = [
    "Uesaka_Sumire_Anime",
    "Komatsu_Mikako_Anime",
    "Okubo_Rumi_Anime",
    "Takamori_Natsumi_Anime",
    "Mikami_Shiori_Anime",
]

# Run MFCC.create_ceps on every file of every speaker folder.
for g in name_list:
    folder = "./" + g + "/"
    files = os.listdir(folder)
    for f in files:
        MFCC.create_ceps(folder + f)
def mfcc2(self, numFilters = 32):
    """Vectorized MFCC computation.

    Delegates to ``MFCC.mfcc2``, passing this object and ``numFilters``
    positionally.
    """
    coefficients = MFCC.mfcc2(self, numFilters)
    return coefficients
add_to_database(fileParts, noteN, data) #print(data.shape) #print(data) #sd.play(data, fs) inst = 'alto_flute' orchestra = {inst: {'data': data}} M = len(data) spectrum = np.fft.fft(orchestra[inst]['data'], axis=0)[:M // 2 + 1:-1] spectrum = np.abs(spectrum) S = 20 * np.log10(spectrum) frq = 30 #mfcc_data=librosa.feature.mfcc(y=data,sr=rate,n_mfcc=12,n_fft=int(M),hop_length=int(M+2))[:,0] mfcc_data, centroid = MFCC.custom_mfcc(data) LpcLocs, LpcFreqs = lpc_coeffs.lpc_coeffs(data) # LPC=librosa.lpc(data, lpc_coeffs) # f,h=freqz(1,LPC, worN=lpc_coeffs, fs=fs) # h=20 * np.log10(np.abs(h)) A = np.linspace(0, len(spectrum), 101) #print("mfccs:") #print(mfcc_data.shape) #print(mfcc_data) #print(centroid) #peaks, _ = findPeaks(S, distance=frq, prominence=20, height=-10) idx, peaks = findPeaks.peaks(S, noteN) frq, thr = maskingCurve.maskingCurve(S, noteN) #peaks = find_peaks_cwt(S,np.arange(1,fs/2+1))
speech = Speech()
xx, fs = speech.audioread(filename, 8000)
xx = xx - np.mean(xx)  # remove the DC offset
x = xx / np.max(xx)  # normalize by the maximum
N = len(x)
time = np.arange(N) / fs
noisy = Noisy()
signal, _ = noisy.Gnoisegen(x, SNR)  # add noise at the given SNR
wnd = np.hamming(wlen)  # window function
overlap = wlen - inc
NIS = int((IS * fs - wlen) / inc + 1)  # number of leading non-speech frames
y = speech.enframe(signal, list(wnd), inc).T
fn = y.shape[1]  # frame number
frameTime = speech.FrameTime(fn, wlen, inc, fs)  # frame index to time

Mfcc = MFCC()
ccc = Mfcc.mfcc(signal, fs, 16, wlen, inc)  # MFCC
fn1 = ccc.shape[0]  # number of MFCC frames
frameTime1 = frameTime[2:fn - 2]
Ccep = ccc[:, 0:16]  # MFCC coefficients
# The mean of the first five frames serves as the approximate noise
# MFCC vector.
C0 = np.mean(Ccep[0:5, :], axis=0)
Dcep = np.zeros(fn)
for i in range(5, fn1):
    Cn = Ccep[i, :]  # MFCC coefficients of one frame
    Dstu = 0
    for k in range(16):
        # squared distance between this frame and the noise estimate
        Dstu += (Cn[k] - C0[k])**2
    Dcep[i] = np.sqrt(Dstu)
# The frames used for the noise estimate take the distance of frame 5.
Dcep[0:5] = Dcep[5]
def CalculateMFCCs(self):
    """Return the MFCC features of ``self.wav_data``.

    The features are computed by ``MFCC.extract`` on every call.
    """
    return MFCC.extract(self.wav_data)
ubm_dir = 'train_data_for_UBM' ubm_data_dirs = os.listdir(ubm_dir) dim = 12 sig = np.array([]) features_M = np.ndarray(shape = (0,dim), dtype = 'float64') features_F = np.ndarray(shape = (0,dim), dtype = 'float64') features = np.ndarray(shape = (0,dim), dtype = 'float64') wav = mywave() print 'hello' for ubm_data_dir in ubm_data_dirs: print 'hello' print ubm_data_dir if ubm_data_dir == '.DS_Store': continue sig = wav.WaveRead(ubm_dir+r'/'+ubm_data_dir) MFCC_obj = MFCC(40,12,300,3400,0.97,16000,50,0.0256,256) MFCC_coef = MFCC_obj.sig2s2mfc(sig) #energy = np.ndarray(shape = (MFCC_coef.shape[0],1),dtype = 'float64') #energy[:,0] = 10*numpy.log10((MFCC_coef**2).sum(axis=1)) #MFCC_coef = np.hstack((MFCC_coef,energy)) """ dtm1 = np.ndarray(shape = MFCC_coef.shape,dtype = 'float64' ) #初始化dtm1 dtm1[0:2,:] = 0 dtm1[MFCC_coef.shape[0]-2:MFCC_coef.shape[0],:] = 0; #计算dtm1 for loop2 in range(2,MFCC_coef.shape[0]-2): dtm1[loop2,:] = -2*MFCC_coef[loop2-2,:]-MFCC_coef[loop2-1,:]+MFCC_coef[loop2+1,:]+2*MFCC_coef[loop2+2,:] dtm1 = dtm1/3; dtm2 = np.ndarray(shape = MFCC_coef.shape,dtype = 'float64' ) #初始化dtm2
ubm_dir = 'train_data_for_UBM' ubm_data_dirs = os.listdir(ubm_dir) dim = 12 sig = np.array([]) features_M = np.ndarray(shape=(0, dim), dtype='float64') features_F = np.ndarray(shape=(0, dim), dtype='float64') features = np.ndarray(shape=(0, dim), dtype='float64') wav = mywave() print 'hello' for ubm_data_dir in ubm_data_dirs: print 'hello' print ubm_data_dir if ubm_data_dir == '.DS_Store': continue sig = wav.WaveRead(ubm_dir + r'/' + ubm_data_dir) MFCC_obj = MFCC(40, 12, 300, 3400, 0.97, 16000, 50, 0.0256, 256) MFCC_coef = MFCC_obj.sig2s2mfc(sig) #energy = np.ndarray(shape = (MFCC_coef.shape[0],1),dtype = 'float64') #energy[:,0] = 10*numpy.log10((MFCC_coef**2).sum(axis=1)) #MFCC_coef = np.hstack((MFCC_coef,energy)) """ dtm1 = np.ndarray(shape = MFCC_coef.shape,dtype = 'float64' ) #初始化dtm1 dtm1[0:2,:] = 0 dtm1[MFCC_coef.shape[0]-2:MFCC_coef.shape[0],:] = 0; #计算dtm1 for loop2 in range(2,MFCC_coef.shape[0]-2): dtm1[loop2,:] = -2*MFCC_coef[loop2-2,:]-MFCC_coef[loop2-1,:]+MFCC_coef[loop2+1,:]+2*MFCC_coef[loop2+2,:] dtm1 = dtm1/3; dtm2 = np.ndarray(shape = MFCC_coef.shape,dtype = 'float64' ) #初始化dtm2
def mfcc2(self, numFilters=32):
    """Vectorized MFCC implementation.

    Thin wrapper: forwards this object and ``numFilters`` positionally to
    ``MFCC.mfcc2`` and returns its result.
    """
    result = MFCC.mfcc2(self, numFilters)
    return result
def read_radio_stream(url_):
    """Play an MP3 radio stream and print the most likely speaker per chunk.

    Each 15000-byte chunk is decoded, queued for playback, decimated by 4
    and converted to MFCC features (first column dropped). The features are
    scored against every GMM stored in ``mfcc_16_fft256_GMM.mat`` and the
    best-scoring key is printed together with its mean log-probability.
    The function returns when the stream delivers no more data.

    NOTE(review): ``urllib.urlopen`` is the Python 2 API; under Python 3
    this call needs ``urllib.request.urlopen`` - confirm the target version.

    :param url_: URL of the MP3 stream
    """
    database = sio.loadmat('mfcc_16_fft256_GMM.mat')
    # Drop the bookkeeping entries of the .mat file; tolerate their absence.
    for meta_key in ('__header__', '__version__', '__globals__'):
        database.pop(meta_key, None)

    r2 = urllib.urlopen(url_)
    pygame.mixer.init(44100, -16, 2, 2048)
    print(pygame.mixer.get_init())
    chan1 = pygame.mixer.find_channel()
    print(sound.getODevices())
    dm = muxer.Demuxer('mp3')
    dec = None
    print(r2.info())
    print('###################\n')

    while True:
        samples = r2.read(15000)
        if not samples:
            # End of stream: stop instead of spinning on empty reads.
            break
        frames = dm.parse(samples)
        if dec is None:
            # Open decoder
            dec = acodec.Decoder(dm.streams[0])
        sound_np_array = ansic_to_numpy(frames, dec)

        # Duplicate the channel into two int16 columns for playback.
        to_play = np.array(np.repeat(sound_np_array[:, np.newaxis], 2, 1),
                           dtype='int16')
        sounds = pygame.sndarray.make_sound(to_play)
        chan1.queue(sounds)

        sound_np_array = decimate(sound_np_array, 4)
        mfcc_features = MFCC.extract(sound_np_array)
        mfcc_features = mfcc_features[:, 1:]

        g = mixture.GMM(n_components=16)
        log_prob = -10000
        winner = 'nobody'
        for key, values in database.items():
            try:
                g.means_ = values[0, :, :]
                g.covars_ = values[1, :, :]
                g.weights_ = values[2, :, 1]
                temp_prob = np.mean(g.score(mfcc_features))
                if temp_prob > log_prob:
                    log_prob = temp_prob
                    winner = key
            except TypeError:
                print('error dla  %s' % key)
        print('%s %s' % (winner, log_prob))
        print('\n###################')
if __name__ == '__main__':
    # Adapt a gender-dependent UBM to every training recording and write
    # the adapted model, under the recording's name, to the 'adaption'
    # directory.
    print(__doc__)
    ubms_dir = 'ubms'
    speaker_model_dir = 'adaption'
    if not os.path.exists(speaker_model_dir):
        os.mkdir(speaker_model_dir)
    train_data_dir = 'train_data'
    train_data = os.listdir(train_data_dir)
    wav = mywave.mywave()
    for train_wav in train_data:
        print(train_wav)
        wave_data = wav.WaveRead(train_data_dir + r'/' + train_wav)
        MFCC_obj = MFCC(40, 12, 300, 3400, 0.97, 16000, 50, 0.0256, 256)
        MFCC_coef = MFCC_obj.sig2s2mfc(wave_data)
        adapted_gmm = GMM()
        # The fifth character from the end of the file name selects the UBM.
        if train_wav[-5] == 'M':
            adapted_gmm.read(ubms_dir + r'/ubm_M')
        elif train_wav[-5] == 'F':
            adapted_gmm.read(ubms_dir + r'/ubm_F')
        else:
            print('train_wav name unexpected')
            # No UBM was loaded, so there is nothing meaningful to adapt
            # or write for this file.
            continue
        adapted_gmm.adapt(MFCC_coef)
        adapted_gmm.write(speaker_model_dir + r'/' + train_wav)
def mfcc(self, m, NumFilters = 48):
    """Compute the m-th Mel-frequency cepstral coefficient.

    Thin wrapper: forwards this object, ``m`` and ``NumFilters``
    positionally to ``MFCC.mfcc`` and returns its result.
    """
    value = MFCC.mfcc(self, m, NumFilters)
    return value