import os

def train_classifier(dataset, n_classes, Fs=16000):
    datalist = os.listdir(dataset)
    n_data_set = len(datalist)
    for i in range(n_data_set):
        filepath = dataset + '/' + datalist[i]
        # str.find returns a match index (or -1), so the original tests
        # `.find('.wav') == 1` and `.find('.raw')` were unreliable;
        # endswith is the correct extension check.
        if datalist[i].endswith('.wav'):
            try:
                [x, Fs, n_channels, n_samples] = read_wave(filepath)
            except Exception as e:
                print(e)
                continue
        elif datalist[i].endswith('.raw'):
            try:
                x = read_raw(filepath)
                if len(x.shape) > 1:
                    n_channels = x.shape[1]
                    n_samples = x.shape[0]
                else:
                    n_channels = 1
                    n_samples = len(x)
            except Exception as e:
                print(e)
                continue
        else:
            continue  # skip files that are neither .wav nor .raw
        features = get_mfcc(x, Fs)
        dimension = features.shape[1]
        size = features.shape[0]
        emHMM_algorithm(features, dimension, 2, size)
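# Quick, self-contained check of why endswith replaces the original find() tests
# (pure-Python string semantics, no project code assumed):
assert 'a.wav'.find('.wav') == 1        # find() == 1 only for one-character stems
assert 'file.wav'.find('.wav') == 4     # so most .wav files failed the original test
assert 'file.wav'.find('.raw') == -1    # -1 is truthy, so the .raw branch misfired
assert 'file.wav'.endswith('.wav')      # extension test that works for any stem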
def predict_from_file(self, file_name, norm=False):
    '''
    Take a file name, try to open it, and return the prediction of the wrapped model
    :param file_name: Name of the file to recognize
    :return: a json-formatted string of the prediction
    '''
    # Drop frames after trunk in the time domain as well as the first mel coefficient
    pred_data = (mf.get_mfcc(self.dirpath + file_name,
                             normalizemean=norm)[:self.trunk, 1:])
    prediction = self.model.predict(pred_data)
    prediction = self.users_map[int(prediction)]
    answ = {"user_prediction": prediction}
    return json.dumps(answ)
def predict_from_audio(self, audio, recognizer):
    audio, google_pred = sprec.recognize(audio, recognizer, return_audio=True)
    sprec.save_audio(audio, self.dirpath + "sample_to_recognize.wav")
    pred_data = mf.get_mfcc(self.dirpath + "sample_to_recognize.wav")[:self.trunk, 1:]
    prediction = self.model.predict(pred_data)
    if isinstance(self.model, Cl.NNClassifier):
        prediction = self.users_map[prediction]
    else:
        prediction = self.users_map[prediction[0]]
    answ = {
        "user_prediction": prediction,
        "google_sentence_prediction": google_pred
    }
    return prediction, google_pred
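# Hedged usage sketch for the two prediction entry points above. The wrapper
# class name and its constructor are placeholders; only the method call comes
# from the code in this section.
# recog = SpeakerRecognitionWrapper(...)          # hypothetical constructor
# print(recog.predict_from_file("sample0001.wav", norm=True))
# => '{"user_prediction": "alice"}'               # illustrative output only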
def extract_features(path):
    df = pd.DataFrame()
    print('Extracting features')
    freq_col = ['pitch']
    mfcc_col = ['mfcc' + str(i + 1) for i in range(110)]
    col = freq_col + mfcc_col
    directory = os.listdir(path + "recorded_audio\\")
    print(directory)
    for wav_file in directory:
        y, sr = librosa.load(path + "recorded_audio\\" + wav_file)
        fs, x = wav.read(path + "recorded_audio\\" + wav_file)
        pitch = get_pitch(fs, x)
        mfcc_features = get_mfcc(y, sr)
        write_features = [pitch] + mfcc_features.tolist()[0]
        df = df.append([write_features])
    df.columns = col
    df.to_csv('recorded_audio_features.csv')
def main(path, gender):
    df = pd.DataFrame()
    print('Extracting features for ' + gender)
    # Column names mirror extract_features above, plus the gender label
    # (`col` was undefined in the original; this definition is inferred
    # from the row layout [pitch] + 110 mfcc values + [gender]).
    col = ['pitch'] + ['mfcc' + str(i + 1) for i in range(110)] + ['gender']
    directory = os.listdir(path)
    for wav_file in directory:
        y, sr = librosa.load(path + wav_file)
        fs, x = wav.read(path + wav_file)
        print(wav_file)
        pitch = get_pitch(fs, x)
        #frequencies = get_frequencies(y, sr)
        #freq_features = get_features(frequencies)
        mfcc_features = get_mfcc(y, sr)
        #write_features = [pitch] + freq_features + mfcc_features.tolist()[0] + [gender]
        write_features = [pitch] + mfcc_features.tolist()[0] + [gender]
        df = df.append([write_features])
        #if wav_file == '00001.wav':
        #    break  # remove break to execute for all files
    df.columns = col
    df.to_csv(gender + '_features.csv')
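# Hedged usage for the extraction script above; the directory layout is a
# placeholder. Each call writes <gender>_features.csv to the working directory.
# main('./audio/female/', 'female')
# main('./audio/male/', 'male')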
try:
    cur_line = lines[j].split()
    start = int(cur_line[0])
    stop = int(cur_line[1])
    label = cur_line[2]
    # Timestamps are in 100-ns units (1e7 per second); convert to seconds
    length = stop / 10.0**7 - start / 10.0**7
    audio = f.read_frames(int(freq * length))
    total_length += length
    if label in label_dic:
        mono_signal = audio  # audio[:,0]
        energy = np.sum(mono_signal**2, 0) / len(mono_signal)
        signal = mono_signal  # mono_signal/math.sqrt(energy)
        samplerate = f.samplerate
        mfcc = get_mfcc(signal, samplerate, winstep=window_step,
                        nfft=2048, highfreq=8000, lowfreq=10)
        N_iter = np.floor(len(mfcc) / N)
        # apply context window
        if (length / window_step) > N:
            mfcc_matrix = np.zeros((1, 13 * N))
            for k in range(int(N_iter)):
                mfcc_vec = []
                for kk in range(N):
                    mfcc_vec = np.concatenate((mfcc_vec, mfcc[k * N + kk, :]))
                mfcc_matrix = np.concatenate((mfcc_matrix, mfcc_vec[np.newaxis, :]))
            # get the numeric label corresponding to the literal label
            num_label = label_dic[label] * np.ones(
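# The context-window loop above concatenates N consecutive 13-dimensional MFCC
# frames into one 13*N vector. A vectorized sketch of the same idea, assuming
# non-overlapping blocks of N frames as in the loop:
import numpy as np

def stack_context(mfcc, N):
    n_iter = len(mfcc) // N
    # Row k holds frames k*N .. k*N + N - 1 laid end to end
    return mfcc[:n_iter * N].reshape(n_iter, N * mfcc.shape[1])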
def calibrate(self, user_name, nb_calibrations="5", existing_samples=False,
              nb_tests="0", noise_red=False, norm=False, downsample=0):
    '''
    Calibrate for a new user of the speaker recognition system
    :param user_name: The name of the user
    :param nb_calibrations: Number of calibration samples
    :param existing_samples: "0" to record new samples, otherwise reuse existing ones
    :param nb_tests: Number of held-out test samples
    :param noise_red, norm, downsample: forwarded to mf.get_mfcc
    :return: None
    '''
    # update the number of users we have
    self.nb_users += 1
    self.users_map.update({self.nb_users - 1: user_name})
    nb_calibrations = int(nb_calibrations)
    nb_tests = int(nb_tests)
    trunk = self.trunk
    if existing_samples == "0":
        print("Recording the calibration samples")
        for i in range(nb_calibrations):
            sprec.get_one_sample(self.dirpath + user_name.lower() +
                                 "{:0>4}.wav".format(i + 1))
    if not self.bootstrap:
        # Drop after trunk in the time domain as well as the first mel coef
        user_calibration_data = [(mf.get_mfcc(
            self.dirpath + user_name.lower() + "{:0>4}.wav".format(i + 1),
            delta=self.delta, noisereduction=noise_red, normalizemean=norm,
            downsample=downsample)[:trunk, 1:])
            for i in range(nb_calibrations)]
        self.train_data = np.concatenate(
            (self.train_data, np.asarray(user_calibration_data)), axis=0)
        self.train_labels = np.concatenate(
            (self.train_labels,
             np.tile([self.nb_users - 1], (nb_calibrations, 1))))
    else:
        print("Bootstrapping")
        '''for i in range(nb_calibrations):
            mfcc = (mf.get_mfcc(self.dirpath + user_name.lower() + "{:0>4}.wav".format(i + 1))[:, 1:])
            n_timedomain = mfcc.shape[0]
            nb_bootstrap = n_timedomain - self.trunk
            assert nb_bootstrap > 0
            #TODO: TRY A NEW WAY OF BOOTSTRAPPING
            cal_data = [mfcc[j: j+self.trunk] for j in range(nb_bootstrap)]
            self.train_data = np.concatenate((self.train_data, np.asarray(cal_data)), axis=0)
            self.train_labels = np.concatenate(
                (self.train_labels, np.tile([self.nb_users - 1], (nb_bootstrap, 1))))'''
        for i in range(nb_calibrations):
            mfcc = (mf.get_mfcc(self.dirpath + user_name.lower() +
                                "{:0>4}.wav".format(i + 1),
                                delta=self.delta, normalizemean=norm,
                                downsample=downsample)[:, 1:])
            n_timedomain = mfcc.shape[0]
            print(mfcc.shape)
            nb_bootstrap = n_timedomain - self.trunk
            step = 25
            assert nb_bootstrap > 0
            # Overlapping windows of length trunk, strided by `step`
            cal_data = [mfcc[j:j + self.trunk]
                        for j in np.arange(0, nb_bootstrap, step)]
            self.train_data = np.concatenate(
                (self.train_data, np.asarray(cal_data)), axis=0)
            self.train_labels = np.concatenate(
                (self.train_labels,
                 np.tile([self.nb_users - 1],
                         (np.arange(0, nb_bootstrap, step).shape[0], 1))))
        print(self.train_labels)
    if nb_tests > 0:
        # Drop after trunk in the time domain as well as the first mel coef
        user_testing_data = [
            (mf.get_mfcc(self.dirpath + user_name.lower() +
                         "{:0>4}.wav".format(i + 1),
                         delta=self.delta, noisereduction=noise_red,
                         normalizemean=norm)[:trunk, 1:])
            for i in range(nb_calibrations, nb_tests + nb_calibrations)
        ]
        self.test_data = np.concatenate(
            (self.test_data, np.asarray(user_testing_data)), axis=0)
        self.test_labels = np.concatenate(
            (self.test_labels, np.tile([self.nb_users - 1], (nb_tests, 1))))
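# Self-contained sketch of the bootstrap windowing used in calibrate():
# overlapping windows of length `trunk`, strided by `step`, cut from a
# (time, coeff) MFCC matrix. Shapes below are illustrative only.
import numpy as np

mfcc = np.random.randn(300, 12)        # 300 frames, 12 coefficients (example)
trunk, step = 100, 25
nb_bootstrap = mfcc.shape[0] - trunk   # number of valid window start offsets
starts = np.arange(0, nb_bootstrap, step)
cal_data = np.asarray([mfcc[j:j + trunk] for j in starts])
print(cal_data.shape)                  # (8, 100, 12): one training sample per window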
def process_file(args):
    fname = args[0]
    frame_size = args[1]
    frame_step = args[2]
    fft_n = args[3]
    mel_filterbank = args[4]
    mfcc_num = args[5]
    counter_queue = args[6]
    transcription_path = args[7]
    # Handle two types of files: wave and sph
    # f_raw is a numpy array
    if fname.endswith(".wav"):
        sample_rate, f_raw = wavfile.read(fname)
    elif fname.endswith(".sph"):
        sph_obj = sph.read(fname)
        sample_rate = sph_obj.framerate
        f_raw = sph_obj.data
    else:
        raise ValueError("Wrong file format: " + str(fname))
    # split into overlapping frames
    frames = split_into_frames(f_raw, frame_size, frame_step,
                               transcription_path, sample_rate)
    frames_buffer_size = 5
    frames_buffer = []
    # middle of the buffer
    processing_frame_index = 2
    features = []
    for frame in frames:
        if len(frames_buffer) < frames_buffer_size:
            frames_buffer.append(get_mfcc(frame, fft_n, mel_filterbank, mfcc_num))
        else:
            processing_frame_mfcc = frames_buffer[processing_frame_index]
            prev_frame_mfcc = frames_buffer[processing_frame_index - 1]
            prev_frame_first_deltas = get_deltas(
                processing_frame_mfcc, frames_buffer[processing_frame_index - 2])
            next_frame_mfcc = frames_buffer[processing_frame_index + 1]
            next_frame_first_deltas = get_deltas(
                frames_buffer[processing_frame_index + 2], processing_frame_mfcc)
            processing_frame_first_deltas = get_deltas(next_frame_mfcc, prev_frame_mfcc)
            processing_frame_second_deltas = get_deltas(next_frame_first_deltas,
                                                        prev_frame_first_deltas)
            features.append((
                processing_frame_mfcc,
                processing_frame_first_deltas,
                processing_frame_second_deltas
            ))
            # Update circular buffer
            frames_buffer.pop(0)
            frames_buffer.append(get_mfcc(frame, fft_n, mel_filterbank, mfcc_num))
    processed_files = counter_queue.get() + 1
    if processed_files % 5 == 0:
        print("Processed " + str(processed_files) + ' files')
    counter_queue.put(processed_files)
    return features
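# A hedged sketch of how process_file might be driven in parallel. The shared
# counter_queue must be seeded with 0 for the get()/put() progress counter to
# work; everything here except process_file itself is an assumption.
from multiprocessing import Manager, Pool

def run_all(file_names, frame_size, frame_step, fft_n, mel_filterbank,
            mfcc_num, transcription_path):
    manager = Manager()
    counter_queue = manager.Queue()
    counter_queue.put(0)  # initial value for the shared progress counter
    arg_list = [(f, frame_size, frame_step, fft_n, mel_filterbank,
                 mfcc_num, counter_queue, transcription_path)
                for f in file_names]
    with Pool() as pool:
        return pool.map(process_file, arg_list)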
import sys
import mfcc
import glob
import os

audio_dir = sys.argv[1]
feature_dir = sys.argv[2]
audio_files = glob.glob('{}/*.mp3'.format(audio_dir))
print('found {} samples in {}'.format(len(audio_files), audio_dir))
for file in audio_files:
    sample_name = file.split('\\')[-1].split('.')[0]
    if os.path.isfile('{}/{}.csv'.format(feature_dir, sample_name)):
        print('feature exists for {}'.format(sample_name))
        continue
    mfcc_feature = mfcc.get_mfcc(file)
    fea_file_name = '{}/{}.csv'.format(feature_dir, sample_name)
    print('writing feature to [ {} ]'.format(fea_file_name))
    with open(fea_file_name, 'w') as f:
        for row in mfcc_feature:
            for col in row:
                f.write('{},'.format(col))
            f.write('\n')
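# Expected invocation, inferred from the sys.argv usage above (the script
# file name is hypothetical):
#   python extract_mp3_features.py <audio_dir> <feature_dir>
# Samples that already have a <name>.csv in <feature_dir> are skipped.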
def main(args):
    audio_path = Path(args.audio_path)
    wave_data, sr = librosa.load(audio_path)
    df1 = pd.read_csv('../data1.csv')
    df2 = pd.read_csv('../data2.csv')
    df3 = pd.read_csv('../data3.csv')
    result_path = Path(args.save_path)
    timestamp = datetime.now().strftime(TIME_TEMPLATE)
    result_path = result_path / timestamp
    if not result_path.exists():
        try:
            result_path.mkdir(parents=True)
        except Exception as err:
            print(err)

    '''clustering data1'''
    clf_1 = kmeans.KMeans(n_clusters=4, init='random', max_ite=300, random_state=44)
    pred_1 = clf_1.fit_predict(df1.values)
    df1['pred'] = pred_1
    # df1.plot(kind="scatter", x='x', y='y', c="pred", cmap='rainbow')
    plt.scatter(df1['x'], df1['y'], c=df1['pred'])
    plt.savefig(result_path / "data1.png")
    plt.clf()
    plt.close()

    '''clustering data2'''
    clf_2 = kmeans.KMeans(n_clusters=2, init='random', max_ite=300, random_state=44)
    pred_2 = clf_2.fit_predict(df2.values)
    df2['pred'] = pred_2
    # df2.plot(kind="scatter", x='x', y='y', c="pred", cmap='rainbow')
    plt.scatter(df2['x'], df2['y'], c=df2['pred'])
    plt.savefig(result_path / "data2.png")
    plt.clf()
    plt.close()

    '''clustering data3'''
    clf_3 = kmeans.KMeans(n_clusters=4, init='random', max_ite=300, random_state=44)
    pred_3 = clf_3.fit_predict(df3.values)
    df3['pred'] = pred_3
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(df3['x'].values, df3['y'].values, df3['z'].values, c=df3['pred'])
    plt.savefig(result_path / 'data3.png')
    plt.clf()
    plt.close()

    '''mfcc'''
    win_size = 2048
    overlap = 0.5
    bank_size = 20
    _mfcc = mfcc.get_mfcc(wave_data, sr, win_size, overlap, bank_size)
    plt.imshow(_mfcc, cmap='rainbow', aspect='auto', origin='lower')
    plt.colorbar()
    plt.savefig(result_path / 'mfcc.png')
    plt.clf()
    plt.close()

    dmfcc = mfcc.calc_delta(_mfcc)
    plt.imshow(dmfcc, cmap='rainbow', aspect='auto', origin='lower')
    plt.colorbar()
    plt.savefig(result_path / 'dmfcc.png')
    plt.clf()
    plt.close()

    ddmfcc = mfcc.calc_delta(dmfcc)
    plt.imshow(ddmfcc, cmap='rainbow', aspect='auto', origin='lower')
    plt.colorbar()
    plt.savefig(result_path / 'ddmfcc.png')
    plt.clf()
    plt.close()
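# mfcc.calc_delta above is a project-local helper; a common definition, shown
# here only as an assumption, is the central difference along the time axis
# with edge padding so the output keeps the input's (coeff, time) shape.
import numpy as np

def calc_delta_sketch(feat):
    padded = np.pad(feat, ((0, 0), (1, 1)), mode='edge')  # pad time axis
    return (padded[:, 2:] - padded[:, :-2]) / 2.0         # central difference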
save_data += L * 10 - data_spacing
if save_data > 1:
    # np.zeros needs an int length; np.floor returns a float in the original
    zero_padding = int(np.floor(save_data))
    save_data = save_data % 1
    shouting_data = np.concatenate((shouting_data, np.zeros(zero_padding, dtype=int)))
    noise_data = np.concatenate((noise_data, np.zeros(zero_padding, dtype=int)))
    label_data = np.concatenate((label_data, np.zeros(zero_padding, dtype=int)))
    error_vector = np.concatenate((error_vector, np.zeros(zero_padding, dtype=int)))
total_data += L * 10
test_length = len(label_data)
if label in label_dic:
    mono_signal = audio  # audio[:,0]
    energy = np.sum(mono_signal**2, 0) / len(mono_signal)
    signal = mono_signal  # mono_signal/math.sqrt(energy)
    samplerate = f1.samplerate
    mfcc = get_mfcc(signal, samplerate, winstep=window_step,
                    nfft=2048, highfreq=8000, lowfreq=10)
    if (L / window_step) < N:
        shouting_data = np.concatenate((shouting_data, np.zeros(data_spacing, dtype=int)))
        noise_data = np.concatenate((noise_data, np.zeros(data_spacing, dtype=int)))
        label_data = np.concatenate((label_data, np.zeros(data_spacing, dtype=int)))
        error_vector = np.concatenate((error_vector, np.zeros(data_spacing, dtype=int)))
    else:
        zeros_to_add = 0
        if label == "Noise":
            error_vector = np.concatenate(
                (error_vector,
                 (test_labels[index:index + data_spacing] -
                  predictionfinal[index:index + data_spacing])))
            shouting_data = np.concatenate((shouting_data, np.zeros(data_spacing, dtype=int)))
            noise_data = np.concatenate((noise_data, np.ones(data_spacing, dtype=int)))
            label_data = np.concatenate((label_data, np.ones(data_spacing, dtype=int)))
            index += data_spacing - 1
        elif label == "Shouting":