import os
import sys

import librosa
import tflearn

import speech_data as data

# Training and testing data set directories come from the command line.
train_data = sys.argv[1]
test_data = sys.argv[2]

# Grab the speakers (class labels) from the training directory.
speakers = data.get_speakers(train_data)
number_classes = len(speakers)

# Build the MFCC feature matrices (X) and one-hot labels (Y) for training.
audio_files = os.listdir(train_data)
X = []
Y = []
for f in audio_files:
    Y.append(data.one_hot_from_item(data.speaker(f), speakers))
    # os.path.join is robust to a missing trailing slash on train_data,
    # unlike the plain string concatenation it replaces.
    y, sr = librosa.load(os.path.join(train_data, f))
    X.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

# Define the network and the model.
tflearn.init_graph(num_cores=8, gpu_memory_fraction=0.5)
# Input shape assumes 13 MFCCs x 44 frames per clip — TODO confirm all
# clips are segmented to the same length before training.
net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, number_classes, activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net)
import os
import pickle

import IPython.display
import librosa
import librosa.display
import numpy as np
import tensorflow as tf
import tflearn
from pydub import AudioSegment as audio

import speech_data

# Directory of segmented dev-clean training audio.
data = '/home/cc/working/data/devclean_2_seg/'
speakers = speech_data.get_speakers(data)

# Build the MFCC feature matrices and one-hot speaker labels.
audio_files = os.listdir(data)
mfccs = []
Y = []
for f in audio_files:
    Y.append(speech_data.one_hot_from_item(speech_data.speaker(f), speakers))
    # os.path.join instead of string concatenation: safe whether or not
    # the directory path carries a trailing slash.
    y, sr = librosa.load(os.path.join(data, f))
    mfccs.append(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13))

# Network: 13x44 MFCC input -> 64 -> dropout -> 32 -> softmax over speakers.
net = tflearn.input_data(shape=[None, 13, 44])
net = tflearn.fully_connected(net, 64)
net = tflearn.dropout(net, 0.5)
net = tflearn.fully_connected(net, 32)
net = tflearn.fully_connected(net, len(speakers), activation='softmax')
net = tflearn.regression(net, optimizer='adam', loss='categorical_crossentropy')
model = tflearn.DNN(net, tensorboard_dir='/home/cc/working/tboard/',
                    tensorboard_verbose=3)

# Train; snapshot_step controls how often TensorBoard checkpoints are written.
model.fit(mfccs, Y, n_epoch=2000, show_metric=True, snapshot_step=100)

# Move to the held-out test directory for the evaluation that follows.
os.chdir('/home/cc/working/data/devclean_test/')
# NOTE(review): this chunk begins mid-call — `learning_rate=0.001)` closes a
# tflearn call (presumably tflearn.regression on `network`) whose start is
# outside this view.
learning_rate=0.001)

# Training
# Wrap the network in a DNN with checkpointing for evaluation.
model = tflearn.DNN(network, checkpoint_path='model_alexnet', max_checkpoints=1, tensorboard_verbose=2)

# One row with a column per class; accumulates one-hot rows for the true
# speaker of each misclassified sample.
statistic_array = np.zeros((1, number_classes))

# NOTE(review): `finally` (not `except`) means the evaluation below runs even
# when model.load raises — and the load exception would still propagate after
# the finally block finishes. Presumably `except` (or no handler) was
# intended; confirm before relying on this.
try:
    model.load('./saved_model/augment_model.tflearn')
finally:
    ts_path = "./new_data_set/simple_test_set/npys/"
    v_counter = 0
    # Draw random pre-computed .npy spectrum samples from the test set.
    samples = fetch.random_sample(ts_path, 1)
    for sample in samples:
        load_spectrum = np.load(ts_path + sample)
        #demo=np.reshape(load_spectrum,(227,227,1))
        demo = np.array(load_spectrum, dtype=np.float32)
        result1 = model.predict([demo])
        # Map the softmax output back to a speaker label.
        result = data.one_hot_to_item(result1, speakers)
        # -3 selects which filename field encodes the speaker id.
        validity = fetch.check_speaker(result, sample, -3) #-2or-3
        print("predicted speaker for %s : result = %s validity = %d" % (sample, result, validity)) # ~ 97% correct
        if validity:
            v_counter += 1
        else:
            # Tally the true speaker of each miss into the statistics row.
            statistic_array = statistic_array + data.one_hot_from_item( fetch.extract(sample, -3), speakers)
    # Fraction of correctly identified samples.
    print(v_counter / len(samples))