def classify():
    # load model
    try:
        model = keras.models.load_model('cnn_speech_trained_model.h5')
    except OSError:
        print('Error: No model file found, please train the classifier first.')
        sys.exit()

    # load the evaluation data
    eval_data = list(wav16khz2mfcc(EVAL_DATA).items())

    # ...aaand classify the given data
    count = 0
    results = []
    for filename, data in eval_data:
        # compute the score of the recording
        cls, score = classify_record(model, data)
        results.append(filename.split('/')[-1].split('.')[0] + ' ' + str(score) + ' ' + str(cls))
        if cls:
            count += 1  # just remember we got a target...

    results.sort()
    with open('audio_convolutionalNN.txt', 'w') as output:
        for result in results:
            print(result, file=output)
    if PRINT_STATS:
        print(f'Targets found: {count}')
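# 'classify_record' is not defined in this snippet. Below is a minimal,
# purely illustrative sketch of what it might look like, assuming the CNN
# trained elsewhere in this repo (SEGMENT_LEN x 13 MFCC segments, 2-way
# softmax with class 0 = target); the segmentation and label convention
# are assumptions, not the author's confirmed implementation.
def classify_record(model, mfcc):
    # drop the tail so the frames divide evenly into SEGMENT_LEN-long segments
    n_segments = len(mfcc) // SEGMENT_LEN
    segments = mfcc[:n_segments * SEGMENT_LEN].reshape(n_segments, SEGMENT_LEN, 13, 1)
    # average the per-segment softmax outputs over the whole recording
    probs = model.predict(segments).mean(axis=0)
    score = probs[0] - probs[1]  # assumed convention: class 0 = target
    return int(score >= 0), score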
def loadEvalData():
    print("STEP4: Loading evaluation data.")
    evalData = wav16khz2mfcc(EVAL_DIR)
    print("STEP4 Done: All evaluation data loaded and prepared for classification.\n")
    return evalData
def loadTrainData():
    trainClasses = {}  # class name -> stacked MFCC frames
    filenames = os.listdir(TRAIN_DIR)

    # Recordings loading
    print("STEP1: Loading training data into internal structure")
    for filename in filenames:
        classRecords = wav16khz2mfcc(TRAIN_DIR + "/" + filename).values()
        trainClasses[filename] = np.vstack(list(classRecords))
    print("STEP1 Done: Training data loaded. Number of training classes: ",
          len(trainClasses), "\n")
    return trainClasses
def main():
    check_dir('eval')
    test = wav16khz2mfcc('eval')
    P_t = 0.5
    P_nt = 1 - P_t

    fname = 'GMM_model.pkl'
    if len(sys.argv) > 1:
        fname = sys.argv[1]  # choose one model
    with open(fname, 'rb') as f:
        Ws_t, MUs_t, COVs_t, Ws_nt, MUs_nt, COVs_nt = pickle.load(f)

    for tst in sorted(test.keys()):
        ll_t = logpdf_gmm(test[tst], Ws_t, MUs_t, COVs_t)
        ll_nt = logpdf_gmm(test[tst], Ws_nt, MUs_nt, COVs_nt)
        scr = (sum(ll_t) + np.log(P_t)) - (sum(ll_nt) + np.log(P_nt))
        tst = tst.split("/")[-1].split(".")[0]
        if scr >= 0:
            print(tst, scr, 1)
        else:
            print(tst, scr, 0)
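# 'check_dir' is not defined in these snippets. A minimal sketch of the
# assumed helper: it aborts when the expected data directory is missing.
import os
import sys

def check_dir(path):
    if not os.path.isdir(path):
        print('Error: directory "{}" not found.'.format(path))
        sys.exit(1)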
# (inside the image-evaluation loop; 'k' is the current test-image path)
    predictions = img_model.predict_proba(i_test_x)
    p = np.argmax(predictions)
    name = re.search(r'.*/(\w*).png', k).group(1)

    # get result and write to output file
    result = predictions[0][1] * 10
    result = float("{:.2f}".format(result))
    if result >= 1.0:
        output_file.write(k + " " + str(result) + " " + "1\n")
    else:
        output_file.write(k + " " + str(result) + " " + "0\n")

    # add prediction result to data dict for detecting person
    data[name] = [predictions[0][1]]

output_file.close()

v_data = wav16khz2mfcc(data_folder_path)
voice_model.summary()
output_file = open('./predict_voice_output.txt', 'w')
print("[INFO] evaluating voice classifier...")
a = []
count = 0
# average value of results
mean = 1020
for k, v in v_data.items():
    # prepare data for model
    v_test_x = np.vstack(tuple(v))
    v_test_x = np.r_[v_test_x]
    v_test_x = v_test_x.astype('float32')
    v_test_x /= 255
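    # (snippet truncated here; presumably the loop continues by scoring the
    # recording with the voice model and writing a verdict, mirroring the
    # image branch above - a hedged, purely illustrative sketch; comparing
    # the summed score against 'mean' is an assumption)
    preds = voice_model.predict(v_test_x)
    score = float(np.sum(preds))  # summed per-frame scores
    a.append(score)
    verdict = 1 if score >= mean else 0
    output_file.write(k + " " + str(round(score, 2)) + " " + str(verdict) + "\n")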
def main():
    check_dir(os.path.dirname(negative_test_path))
    check_dir(os.path.dirname(negative_train_path))
    check_dir(os.path.dirname(positive_test_path))
    check_dir(os.path.dirname(positive_train_path))

    train_m = list(wav16khz2mfcc(positive_train_path).values())
    train_f = list(wav16khz2mfcc(negative_train_path).values())
    test_m = list(wav16khz2mfcc(positive_test_path).values())
    test_f = list(wav16khz2mfcc(negative_test_path).values())

    train_m = np.vstack(train_m)
    train_f = np.vstack(train_f)
    dim = train_m.shape[1]

    # PCA: project onto the two eigenvectors of the total covariance matrix
    cov_tot = np.cov(np.vstack([train_m, train_f]).T, bias=True)
    d, e = scipy.linalg.eigh(cov_tot, eigvals=(dim - 2, dim - 1))
    train_m_pca = train_m.dot(e)
    train_f_pca = train_f.dot(e)
    # Classes are not well separated in 2D PCA subspace

    # LDA: generalized eigenvalue problem with within- and across-class covariances
    n_m = len(train_m)
    n_f = len(train_f)
    cov_wc = (n_m * np.cov(train_m.T, bias=True) + n_f * np.cov(train_f.T, bias=True)) / (n_m + n_f)
    cov_ac = cov_tot - cov_wc
    d, e = scipy.linalg.eigh(cov_ac, cov_wc, eigvals=(dim - 1, dim - 1))

    # Let's define uniform a-priori probabilities of classes:
    P_m = 0.5
    P_f = 1 - P_m

    # Gaussian models with diagonal covariances, testing a male utterance
    ll_m = logpdf_gauss(test_m[0], np.mean(train_m, axis=0), np.var(train_m, axis=0))
    ll_f = logpdf_gauss(test_m[0], np.mean(train_f, axis=0), np.var(train_f, axis=0))
    posterior_m = np.exp(ll_m) * P_m / (np.exp(ll_m) * P_m + np.exp(ll_f) * P_f)

    # Gaussian models with full covariance matrices
    ll_m = logpdf_gauss(test_m[0], *train_gauss(train_m))
    ll_f = logpdf_gauss(test_m[0], *train_gauss(train_f))
    # '*' before 'train_gauss' passes both return values (mean and cov) as parameters of 'logpdf_gauss'
    posterior_m = np.exp(ll_m) * P_m / (np.exp(ll_m) * P_m + np.exp(ll_f) * P_f)
    plt.figure()
    plt.plot(posterior_m, 'b')
    plt.plot(1 - posterior_m, 'r')
    plt.figure()
    plt.plot(ll_m, 'b')
    plt.plot(ll_f, 'r')

    # Again Gaussian models with full covariance matrices. Now testing a female utterance
    ll_m = logpdf_gauss(test_f[1], *train_gauss(train_m))
    ll_f = logpdf_gauss(test_f[1], *train_gauss(train_f))
    # '*' before 'train_gauss' passes both return values (mean and cov) as parameters of 'logpdf_gauss'
    posterior_m = np.exp(ll_m) * P_m / (np.exp(ll_m) * P_m + np.exp(ll_f) * P_f)
    plt.figure()
    plt.plot(posterior_m, 'b')
    plt.plot(1 - posterior_m, 'r')
    plt.figure()
    plt.plot(ll_m, 'b')
    plt.plot(ll_f, 'r')

    # Frame-by-frame log-likelihood ratio scoring with full-covariance Gaussians
    score = []
    mean_m, cov_m = train_gauss(train_m)
    mean_f, cov_f = train_gauss(train_f)
    for tst in test_m:
        ll_m = logpdf_gauss(tst, mean_m, cov_m)
        ll_f = logpdf_gauss(tst, mean_f, cov_f)
        score.append((sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f)))

    # Run recognition with 1-dimensional LDA projected data
    score = []
    mean_m, cov_m = train_gauss(train_m.dot(e))
    mean_f, cov_f = train_gauss(train_f.dot(e))
    for tst in test_m:
        ll_m = logpdf_gauss(tst.dot(e), mean_m, np.atleast_2d(cov_m))
        ll_f = logpdf_gauss(tst.dot(e), mean_f, np.atleast_2d(cov_f))
        score.append((sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f)))

    # GMM initialization: random means drawn from the training data,
    # shared diagonal covariances, uniform weights
    M_m = 12
    MUs_m = train_m[randint(1, len(train_m), M_m)]
    COVs_m = [np.var(train_m, axis=0)] * M_m
    Ws_m = np.ones(M_m) / M_m

    M_f = 7
    MUs_f = train_f[randint(1, len(train_f), M_f)]
    COVs_f = [np.var(train_f, axis=0)] * M_f
    Ws_f = np.ones(M_f) / M_f

    # Run 100 iterations of EM for both models
    for jj in range(100):
        [Ws_m, MUs_m, COVs_m, TTL_m] = train_gmm(train_m, Ws_m, MUs_m, COVs_m)
        [Ws_f, MUs_f, COVs_f, TTL_f] = train_gmm(train_f, Ws_f, MUs_f, COVs_f)

    # Evaluate on target (male) test recordings
    score = []
    testok = 0
    testnok = 0
    for tst in test_m:
        ll_m = logpdf_gmm(tst, Ws_m, MUs_m, COVs_m)
        ll_f = logpdf_gmm(tst, Ws_f, MUs_f, COVs_f)
        scr = (sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f))
        score.append(scr)
        if scr >= 0:
            testok += 1
        else:
            testnok += 1
    print("target is " + str(testok / (testok + testnok)))
    # Evaluate on non-target (female) test recordings
    score = []
    testok = 0
    testnok = 0
    for tst in test_f:
        ll_m = logpdf_gmm(tst, Ws_m, MUs_m, COVs_m)
        ll_f = logpdf_gmm(tst, Ws_f, MUs_f, COVs_f)
        scr = (sum(ll_m) + np.log(P_m)) - (sum(ll_f) + np.log(P_f))
        score.append(scr)
        if scr < 0:
            testok += 1
        else:
            testnok += 1
    print("non target is " + str(testok / (testok + testnok)))

    print('Saved as "GMM_model.pkl"')
    with open('GMM_model.pkl', 'wb') as f:
        pickle.dump([Ws_m, MUs_m, COVs_m, Ws_f, MUs_f, COVs_f], f)
import numpy as np
from ikrlib import wav16khz2mfcc, logistic_sigmoid, train_gauss, train_gmm, logpdf_gmm, gellipse
import matplotlib.pyplot as plt
import scipy.linalg
from numpy.random import randint

train_sample = [None] * 2
test_sample = [None] * 2
real_test_sample = [None] * 2

# Load training data for both classes and cut the beginning and end of each
# record (roughly silence) - to get better results; the trimmed array has to
# be stored back, reassigning the loop variable alone would be a no-op
for class_index in range(len(train_sample)):
    train_sample[class_index] = list(wav16khz2mfcc('data/train/' + str(class_index + 1)).values())
    for i, rec in enumerate(train_sample[class_index]):
        train_sample[class_index][i] = rec[100:len(rec) - 300, :]

# A-priori probabilities estimated from the class frequencies
P_c1 = len(train_sample[0]) / (len(train_sample[0]) + len(train_sample[1]))
P_c = [P_c1, 1 - P_c1]

real_test_sample_dict = wav16khz2mfcc('data/eval/')
real_test_sample = list(real_test_sample_dict.values())
real_test_name = list(real_test_sample_dict.keys())

# Cutting silence of a record - to get better results
for i, rec in enumerate(real_test_sample):
    real_test_sample[i] = rec[100:len(rec) - 300, :]

for class_index in range(len(train_sample)):
from __future__ import division
import numpy as np
from ikrlib import wav16khz2mfcc, train_gmm, logpdf_gmm

NUM_CLASSES = 31

if __name__ == "__main__":
    M = []
    Ws = []
    MUs = []
    COVs = []
    for i in range(NUM_CLASSES):
        class_id = i + 1
        print("Loading data for class {}".format(class_id))
        train = np.vstack(list(wav16khz2mfcc("train/{}".format(class_id)).values()))

        print("Training model for class {}".format(class_id))
        # 32-component GMM: uniform weights, random means drawn from the
        # training data, shared diagonal covariances
        M.append(32)
        Ws.append(np.ones(M[i]) / M[i])
        MUs.append(train[np.random.randint(1, len(train), M[i])])
        COVs.append([np.var(train, axis=0)] * M[i])

        n = 15
        for iteration in range(n):
            [Ws[i], MUs[i], COVs[i], TTL] = train_gmm(train, Ws[i], MUs[i], COVs[i])
            print("Training iteration: {}/{}, total log-likelihood: {}".format(iteration + 1, n, TTL))

    errors = 0
    trials = 0
    for i in range(NUM_CLASSES):
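        # (snippet truncated here; a hedged sketch of the presumable
        # evaluation - score every test recording of class i against all
        # class models and count an error whenever the most likely model is
        # not class i; the "dev" directory name is an assumption)
        for rec in wav16khz2mfcc("dev/{}".format(i + 1)).values():
            lls = [sum(logpdf_gmm(rec, Ws[c], MUs[c], COVs[c])) for c in range(NUM_CLASSES)]
            trials += 1
            if int(np.argmax(lls)) != i:
                errors += 1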
import matplotlib.pyplot as plt
from ikrlib import wav16khz2mfcc
import numpy as np
from keras import layers
from keras import models
from keras import optimizers
from keras import utils
from keras import metrics
import datetime
import time

# Load data
train_t = np.vstack(tuple(wav16khz2mfcc('../data/target_train').values()))
train_n = np.vstack(tuple(wav16khz2mfcc('../data/non_target_train').values()))
val_t = np.vstack(tuple(wav16khz2mfcc('../data/target_dev').values()))
val_n = np.vstack(tuple(wav16khz2mfcc('../data/non_target_dev').values()))

print('train_t: ', train_t.shape)
print('train_n: ', train_n.shape)
print('val_t: ', val_t.shape)
print('val_n: ', val_n.shape)

# Preparing data: stack target and non-target frames and label them 1/0
train_x = np.r_[train_t, train_n]
train_y = np.r_[np.ones(len(train_t)), np.zeros(len(train_n))]
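# The network itself is not part of this snippet; below is a minimal,
# purely illustrative sketch of a dense classifier that these imports
# suggest (the architecture and hyperparameters are assumptions, not the
# author's confirmed model):
val_x = np.r_[val_t, val_n]
val_y = np.r_[np.ones(len(val_t)), np.zeros(len(val_n))]

model = models.Sequential()
model.add(layers.Dense(64, activation='relu', input_shape=(train_x.shape[1],)))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=optimizers.Adam(),
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.fit(train_x, train_y, epochs=10, batch_size=32,
          validation_data=(val_x, val_y))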
import matplotlib.pyplot as plt
from ikrlib import wav16khz2mfcc, logpdf_gauss, train_gauss, train_gmm, logpdf_gmm
from glob import glob
import scipy.linalg
import numpy as np
from numpy.random import randint

train = {}
MUs = {}
COVs = {}
gauss_cnt = 10
Ws = {}

for i in range(1, 32):
    train[i] = list(wav16khz2mfcc('train/' + str(i)).values())
    # plt.plot(train[i][0][:, 0])
    # plt.show()

    # Delete first and last 200 frames of each recording
    # (equivalent to the original element-by-element np.delete loops)
    for j in range(len(train[i])):
        train[i][j] = train[i][j][200:len(train[i][j]) - 200]

    # Delete silence frames
    for j in range(len(train[i])):
        summ = 0
        min_eng = train[i][j][0][0]
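        # (snippet truncated here; presumably the loop accumulates frame
        # energies to derive a threshold and then drops low-energy silence
        # frames - a hedged sketch of that idea with an assumed threshold,
        # treating the first MFCC coefficient as log energy)
        for frame in train[i][j]:
            summ += frame[0]
            min_eng = min(min_eng, frame[0])
        threshold = (summ / len(train[i][j]) + min_eng) / 2
        train[i][j] = train[i][j][train[i][j][:, 0] > threshold]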
def train_classifier(validation=False):
    # First, load the data
    train_t = list(wav16khz2mfcc(TRAIN_TARGET).values())   # target train data
    train_n = list(wav16khz2mfcc(TRAIN_NTARGET).values())  # non-target train data
    if validation:
        test_t = wav16khz2mfcc(TEST_TARGET)   # target test data
        test_n = wav16khz2mfcc(TEST_NTARGET)  # non-target test data

    # Remove silence from the recordings
    target = []
    for rec in train_t:
        target.append(remove_silence(rec))
    ntarget = []
    for rec in train_n:
        ntarget.append(remove_silence(rec))
    X_train_t = np.vstack(target)
    X_train_n = np.vstack(ntarget)

    if validation:
        test_target = []
        for rec in list(test_t.values()):
            test_target.append(remove_silence(rec))
        test_ntarget = []
        for rec in list(test_n.values()):
            test_ntarget.append(remove_silence(rec))
        X_test_t = np.vstack(test_target)
        X_test_n = np.vstack(test_ntarget)

    # Create SEGMENT_LEN x 13 batches from the data
    X_train_t = create_frame_batches(X_train_t)
    X_train_n = create_frame_batches(X_train_n)
    if validation:
        X_test_t = create_frame_batches(X_test_t)
        X_test_n = create_frame_batches(X_test_n)

    # Get all the data to one place
    X_train = np.vstack((X_train_t, X_train_n))
    y_train = np.hstack((np.zeros(X_train_t.shape[0]), np.ones(X_train_n.shape[0])))
    y_train_hot = to_categorical(y_train)
    if validation:
        X_test = np.vstack((X_test_t, X_test_n))
        y_test = np.hstack((np.zeros(X_test_t.shape[0]), np.ones(X_test_n.shape[0])))
        y_test_hot = to_categorical(y_test)
    num_classes = 2

    # Now build the model
    model = Sequential()
    model.add(Conv2D(16, (3, 3), input_shape=(SEGMENT_LEN, 13, 1), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dropout(0.25))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

    if validation:
        model.fit(X_train, y_train_hot, batch_size=5, epochs=5, validation_data=(X_test, y_test_hot))
    else:
        model.fit(X_train, y_train_hot, batch_size=5, epochs=5)

    model.summary()
    model.save('cnn_speech_trained_model.h5')
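# Neither 'remove_silence' nor 'create_frame_batches' is defined in this
# snippet. Minimal sketches under assumed conventions: silence frames are
# dropped by thresholding the first MFCC coefficient (log energy), and
# batches are non-overlapping SEGMENT_LEN x 13 windows with a channel axis.
# Both the threshold heuristic and the windowing are assumptions.
def remove_silence(rec, threshold=None):
    energy = rec[:, 0]
    if threshold is None:
        threshold = energy.mean()  # assumed heuristic threshold
    return rec[energy > threshold]

def create_frame_batches(frames):
    n = len(frames) // SEGMENT_LEN
    return frames[:n * SEGMENT_LEN].reshape(n, SEGMENT_LEN, 13, 1)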
#!/usr/bin/python3.8
import numpy
import io

# slightly modified version of ikrlib
from ikrlib import wav16khz2mfcc, train_gmm, logpdf_gmm

t_train_sound__dict = wav16khz2mfcc("target_train")
t_train_sound = numpy.vstack(list(t_train_sound__dict.values()))
nt_train_sound__dict = wav16khz2mfcc("non_target_train")
nt_train_sound = numpy.vstack(list(nt_train_sound__dict.values()))
eval_sound__dict = wav16khz2mfcc("eval")
eval_sound = list(eval_sound__dict.values())
eval_sound_names = [x.split("\\")[1].split(".")[0] for x in list(eval_sound__dict.keys())]

# Target GMM: random means from the data, shared diagonal covariances,
# uniform weights
P_t = 0.5
M_t = 64
MUs_t = t_train_sound[numpy.random.randint(1, len(t_train_sound), M_t)]
COVs_t = [numpy.var(t_train_sound, axis=0)] * M_t
Ws_t = numpy.ones(M_t) / M_t

# Non-target GMM, initialized the same way
P_nt = 1 - P_t
M_nt = M_t
MUs_nt = nt_train_sound[numpy.random.randint(1, len(nt_train_sound), M_nt)]
COVs_nt = [numpy.var(nt_train_sound, axis=0)] * M_nt
Ws_nt = numpy.ones(M_nt) / M_nt

n = 32
for i in range(n):
    [Ws_t, MUs_t, COVs_t, TTL_t] = train_gmm(
        t_train_sound, Ws_t, MUs_t, COVs_t)
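    # (snippet truncated inside the EM loop; a hedged sketch of the
    # presumable remainder, following the pattern of the other GMM scripts
    # in this repo: train the non-target model in the same loop, then score
    # each eval recording with a log-likelihood ratio)
    [Ws_nt, MUs_nt, COVs_nt, TTL_nt] = train_gmm(
        nt_train_sound, Ws_nt, MUs_nt, COVs_nt)

for name, sound in zip(eval_sound_names, eval_sound):
    ll_t = logpdf_gmm(sound, Ws_t, MUs_t, COVs_t)
    ll_nt = logpdf_gmm(sound, Ws_nt, MUs_nt, COVs_nt)
    score = (sum(ll_t) + numpy.log(P_t)) - (sum(ll_nt) + numpy.log(P_nt))
    print(name, score, 1 if score >= 0 else 0)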