def main(script_purpose, split=False):
    current_filename = os.path.basename(__file__)
    # make sure this session has a unique identifier - link to model name and logging information
    session_name = get_date()
    path_to_data = "./data_3word"
    start = time.time()
    try:
        start_logging(script_purpose)
        logging.info("Running script: {}".format(current_filename))
        logging.info("Session: {}".format(session_name))
        tablename, feature_type, num_features, num_feature_columns, noise = \
            user_input.set_variables()
        paths, labels = featfun.collect_audio_and_labels(path_to_data)
        noise_path = "./_background_noise_/doing_the_dishes.wav"
        label_list = [i[1] for i in labels]
        class_labels = list(set(label_list))
        print("The labels found: {}".format(class_labels))

        # find out the number of recordings from each class and
        # ask the user whether to balance out the data
        dict_class_distribution = featfun.get_class_distribution(
            class_labels, label_list)
        # find the smallest class; it caps how many files each class contributes
        min_val = (1000000, None)
        for key, value in dict_class_distribution.items():
            if value < min_val[0]:
                min_val = (value, key)
        print("Number of wave files for each class:\n\n{}\n\n".format(
            dict_class_distribution))
        print("Chosen max number of files from each class = {}".format(
            min_val[0]))
        print("\n\nDo you approve this? (Y/N)")
        approve = input()
        if 'exit' in approve.lower():
            raise ExitApp()
        elif 'y' in approve.lower():
            pass
        elif 'n' in approve.lower():
            print("Okay... working on that functionality")
            raise ExitApp()
        else:
            raise ExitApp()
        max_num_per_class = min_val[0]

        # create dictionary with indices to labels and paths for each class
        dict_class_index_list = {}
        for label in class_labels:
            dict_class_index_list[label] = []
            for i, label_item in enumerate(label_list):
                if label == label_item:
                    dict_class_index_list[label].append(i)

        # get the number of samples per training/validation/test dataset
        max_nums_train_val_test = get_max_nums_train_val_test(
            max_num_per_class)

        # randomly assign indices to the train, val, and test datasets
        dict_class_dataset_index_list = {}
        for label in class_labels:
            tot_indices = dict_class_index_list[label]
            tot_indices_copy = tot_indices.copy()
            random.shuffle(tot_indices_copy)
            train_end = max_nums_train_val_test[0]
            val_end = train_end + max_nums_train_val_test[1]
            test_end = val_end + max_nums_train_val_test[2]
            train_indices = tot_indices_copy[:train_end]
            val_indices = tot_indices_copy[train_end:val_end]
            test_indices = tot_indices_copy[val_end:test_end]
            dict_class_dataset_index_list[label] = [
                train_indices, val_indices, test_indices
            ]

        print()
        print("Name for directory to save feature images:")
        new_directory = input()
        if 'exit' in new_directory.lower():
            raise ExitApp()
        train_val_test_dirs = []
        for dataset in ["train", "val", "test"]:
            train_val_test_dirs.append(new_directory + "_{}".format(dataset))

        start_feature_extraction = time.time()
        for label in class_labels:
            for i, directory in enumerate(train_val_test_dirs):
                # only the training set (index 0) gets training-specific handling
                train = i == 0
                try:
                    os.makedirs(directory)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                dict_new_paths = {}
                new_path = './{}/{}/'.format(directory, label)
                try:
                    os.makedirs(new_path)
                except OSError as e:
                    if e.errno != errno.EEXIST:
                        raise
                # record the path even if the directory already existed
                dict_new_paths[label] = new_path

                # limit = int(max_nums_train_val_test[i] * .3)
                limit = None
                num_pics = max_nums_train_val_test[i]
                msg = ("\nExtracting features from {} samples."
                       "\nImages will be saved in the directory {}").format(
                           num_pics, new_path)
                print(msg)
                logging.info(msg)
                frame_width = 11
                time_step = 6
                logging.info(
                    "Extracting features from wavefiles. Limit = {}".format(
                        limit))

                paths_list_dataset = []
                labels_list_dataset = []
                train_val_test_index_list = dict_class_dataset_index_list[label]
                for k in train_val_test_index_list[i]:
                    paths_list_dataset.append(paths[k])
                    labels_list_dataset.append(label_list[k])

                print("Extracting features from class: {}".format(label))
                for j, wav in enumerate(paths_list_dataset):
                    # skip files beyond the optional per-class limit
                    if limit and j > limit:
                        continue
                    featfun.save_chroma(
                        wav, split, frame_width, time_step, feature_type,
                        num_features, num_feature_columns, noise,
                        dict_new_paths[labels_list_dataset[j]], train,
                        noise_path, vad=True, add_noise=True)

        end_feature_extraction = time.time()
        logging.info("Duration setup: {} minutes".format(
            round((start_feature_extraction - start) / 60, 2)))
        print("Duration of feature extraction: {} minutes".format(
            round((end_feature_extraction - start_feature_extraction) / 60, 2)))
    except ExitApp:
        print("Have a good day!")
        logging.info("User exited app.")
    except FeatureExtractionError as e:
        logging.exception("Error occurred in feature extraction: {}".format(e))
    except Exception as e:
        logging.exception("Error occurred: {}".format(e))
    finally:
        end = time.time()
        duration = round((end - start) / 60, 2)
        logging.info("Duration: {} minutes".format(duration))
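
# get_max_nums_train_val_test() is called above but defined elsewhere in the
# repo. A minimal sketch of the expected behavior, assuming a conventional
# 80/10/10 train/val/test split (the project's actual ratios may differ):
def get_max_nums_train_val_test(max_num_per_class):
    # split the per-class sample budget into (train, val, test) counts
    train_num = int(max_num_per_class * 0.8)
    val_num = int(max_num_per_class * 0.1)
    # assign the remainder to test so the three counts sum exactly
    test_num = max_num_per_class - train_num - val_num
    return (train_num, val_num, test_num)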
            else:
                col_var += ' ?'
        c.executemany('INSERT INTO mfcc_40 VALUES (%s)' % col_var, x)
        conn.commit()
    else:
        logging.exception(
            "Failed MFCC extraction: {} in the directory: {}".format(
                filename, label))
        return None


if __name__ == '__main__':
    try:
        tr_tot = tracker.SummaryTracker()
        start_logging(script_purpose)
        logging.info("Running script: {}".format(current_filename))
        logging.info("Session: {}".format(session_name))

        # initialize database
        conn = sqlite3.connect(database)
        c = conn.cursor()
        print("Database will be saved as: {}".format(database))
        print("Noisegroup of collected MFCCs: {}".format(noisegroup))
        print("Speech data will be {}".format(dataset_name))
        print("Noise wavefile: {}".format(environment_noise))
        print("Number of MFCCs to be extracted: {}".format(num_mfcc))
        check_variables = input(
            "\nIMPORTANT!!!!\nAre the items listed above correct? (Y or N): ")
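
# For context: the placeholder string built above follows the standard sqlite3
# pattern of one '?' per column, so executemany() can batch-insert rows. A
# self-contained sketch of that pattern (insert_rows and its arguments are
# illustrative, not part of this repo):
def insert_rows(conn, table, rows):
    # rows: a list of equal-length tuples; build one '?' placeholder per column
    col_var = ', '.join('?' for _ in rows[0])
    c = conn.cursor()
    c.executemany('INSERT INTO {} VALUES ({})'.format(table, col_var), rows)
    conn.commit()

# Example usage (names are hypothetical):
#   conn = sqlite3.connect("speech_features.db")
#   insert_rows(conn, "mfcc_40", mfcc_rows)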
def main(script_purpose, database=None, tablename=None):
    current_filename = os.path.basename(__file__)
    # make sure this session has a unique identifier - link to model name and logging information
    session_name = get_date()

    # set default values
    if database is None:
        database = "speech_features.db"
    start = time.time()
    # initialize stage timestamps so the 'finally' block can't raise a
    # NameError if an error occurs before a stage is reached
    end_loaded_data = start_data_prep = start_train = None
    try:
        start_logging(script_purpose)
        separator = "*" * 80
        logging.info(separator)
        logging.info("RUNNING SCRIPT: \n\n{}".format(current_filename))
        logging.info("SESSION: \n\n{}".format(session_name))

        ######################################################################
        # load data
        logging.info("Loading data from \nDatabase: {}\nTable: {}".format(
            database, tablename))
        data = user_input.load_data(database, tablename)
        logging.info("Data successfully loaded")
        end_loaded_data = time.time()

        # !!!! necessary variables for the user to set !!!!
        # these determine most of the subsequent variables
        id_col_index = 2  # index 0 --> sample ID, index 1 --> speaker ID
        context_window_size = 9
        frame_width = context_window_size * 2 + 1
        # if the data contains a column with frequency info, assume it is the
        # second-to-last column; also assume features start right after the
        # relevant ID column
        if 'pitch' in tablename:
            features_start_stop_index = [id_col_index + 1, -2]
        else:
            features_start_stop_index = [id_col_index + 1, -1]
        # assume the last column is the label column
        label_col_index = [-1]
        # add feature columns based on which features are to be expected
        num_features, num_feature_columns = feature_column_prep(tablename)
        print("The original number of features: {}".format(num_features))
        print("Total feature columns: {}".format(num_feature_columns))
        logging.info(
            "Column index for each recording/speaker ID set at: {}".format(
                id_col_index))
        logging.info(
            "Number of original features (e.g. MFCCs or FBANK energy features): {}"
            .format(num_features))
        logging.info(
            "Number of total features (e.g. derivatives, pitch): {}".format(
                num_feature_columns))
        logging.info("Set context window size: {}".format(context_window_size))
        logging.info("Frame width: {}".format(frame_width))

        ######################################################################
        start_data_prep = time.time()
        logging.info("Now prepping data for model training")
        # prep data:
        # 1) make sure each utterance has the same number of samples;
        # if not, zero-pad them so each has the same number of samples
        data_zeropadded, samples_per_utterance, num_utterances, labels_present = \
            featfun.prep_data(data, id_col_index, features_start_stop_index,
                              label_col_index, num_feature_columns,
                              frame_width, session_name)
        logging.info("Data has been zero-padded")
        logging.info("Shape of zero-padded data: {}".format(
            data_zeropadded.shape))
        logging.info("Fixed number of samples per utterance: {}".format(
            samples_per_utterance))
        logging.info("Number of utterances in data: {}".format(num_utterances))

        logging.info("Reshaping data to fit ConvNet + LSTM models")
        X, y = featfun.shape_data_dimensions_CNN_LSTM(
            data_zeropadded, samples_per_utterance, frame_width)
        logging.info("Done reshaping")
        logging.info("Shape of feature data (i.e. 'X'): {}".format(X.shape))
        logging.info("Shape of label data (i.e. 'y'): {}".format(y.shape))

        # separate X and y into training and test datasets
        logging.info("Separating data into train and test datasets")
        test_size = 0.1
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size)
        logging.info("Separated data. Test size = {}".format(test_size))
        logging.info("Shape of train data: \nX = {}\ny = {}".format(
            X_train.shape, y_train.shape))
        logging.info("Shape of test data: \nX = {}\ny = {}".format(
            X_test.shape, y_test.shape))

        ######################################################################
        # train the models!
        logging.info("Now initializing the model and beginning training")
        start_train = time.time()

        # TIME-FREQUENCY CONVNET
        tfcnn = Sequential()
        # 8x4 time-frequency filter (goes along both time and frequency axes)
        color_scale = 1
        # note: the input shape is ultimately supplied via the
        # TimeDistributed wrapper below
        input_size = (frame_width, num_features, color_scale)
        tfcnn.add(Conv2D(num_feature_columns, kernel_size=(8, 4),
                         activation='relu'))
        # non-overlapping 3x3 pooling
        tfcnn.add(MaxPooling2D(pool_size=(3, 3)))
        tfcnn.add(Dropout(0.25))
        tfcnn.add(Flatten())

        # prepare LSTM: the ConvNet runs on each frame-width window, and the
        # LSTM runs across the resulting sequence of windows
        tfcnn_lstm = Sequential()
        timestep = samples_per_utterance // frame_width
        tfcnn_lstm.add(TimeDistributed(
            tfcnn,
            input_shape=(timestep, frame_width, num_feature_columns,
                         color_scale)))
        tfcnn_lstm.add(LSTM(timestep))  # num timesteps
        # binary = "sigmoid"; multiple classification = "softmax"
        tfcnn_lstm.add(Dense(len(labels_present), activation="softmax"))
        print(tfcnn_lstm.summary())

        # set loss: binary = "binary_crossentropy"; multiple (one-hot encoded)
        # = "categorical_crossentropy"; multiple (integer encoded) =
        # "sparse_categorical_crossentropy"
        loss = "sparse_categorical_crossentropy"
        logging.info("Loss set at: '{}'".format(loss))

        # compile model
        tfcnn_lstm.compile(optimizer='adam', loss=loss, metrics=['accuracy'])

        # train model
        epochs = 300
        logging.info("Number of epochs set at: {}".format(epochs))
        model_train_name = "CNN_LSTM_training_{}".format(session_name)
        callback = [
            EarlyStopping(monitor='val_loss', patience=15, verbose=1),
            ReduceLROnPlateau(patience=5, verbose=1),
            CSVLogger(filename='model_log/{}_log.csv'.format(model_train_name)),
            ModelCheckpoint(
                filepath='bestmodel/bestmodel_{}.h5'.format(model_train_name),
                verbose=1, save_best_only=True)
        ]
        history = tfcnn_lstm.fit(X_train, y_train, epochs=epochs,
                                 validation_split=0.15, callbacks=callback)

        score = tfcnn_lstm.evaluate(X_test, y_test, verbose=1)
        acc = round(score[1] * 100, 2)
        print("Model Accuracy on test data:")
        print(acc)
        logging.info("Model Accuracy on TEST data: {}".format(acc))

        modelname = "CNN_LSTM_{}_{}_{}_{}recordings_{}epochs_{}acc".format(
            session_name, database, tablename, num_utterances, epochs, acc)
        print('Saving Model')
        tfcnn_lstm.save(modelname + '.h5')
        print('Done!')
        print("\n\nModel saved as:\n{}".format(modelname))

        print("Now saving history and plots")
        plt.plot(history.history['loss'])
        plt.plot(history.history['val_loss'])
        plt.title("train vs validation loss")
        plt.ylabel("loss")
        plt.xlabel("epoch")
        plt.legend(["train", "validation"], loc="upper right")
        plt.savefig("{}_LOSS.png".format(modelname))
        plt.clf()
        plt.plot(history.history['acc'])
        plt.plot(history.history['val_acc'])
        plt.title("train vs validation accuracy")
        plt.ylabel("accuracy")
        plt.xlabel("epoch")
        plt.legend(["train", "validation"], loc="upper right")
        plt.savefig("{}_ACCURACY.png".format(modelname))
    except ExitApp:
        print("Have a good day!")
        logging.info("User exited app.")
    except Error as e:
        logging.exception("Database error: {}".format(e))
    except Exception as e:
        logging.exception("Error occurred: {}".format(e))
    finally:
        end = time.time()
        duration = round((end - start) / 3600, 2)
        logging.info("Total Duration: {} hours".format(duration))
        # the later timestamps only exist if the corresponding stage was reached
        if end_loaded_data is not None:
            duration_load_data = round((end_loaded_data - start) / 60, 2)
            logging.info("Duration to load data: {} min".format(
                duration_load_data))
        if start_data_prep is not None and start_train is not None:
            duration_prep = round((start_train - start_data_prep) / 60, 2)
            logging.info("Duration to prep data: {} min".format(duration_prep))
        if start_train is not None:
            duration_train = round((end - start_train) / 60, 2)
            logging.info("Duration to train models: {} min".format(
                duration_train))
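
# feature_column_prep() is referenced above but defined elsewhere in the repo.
# A minimal sketch of the expected behavior, assuming the table name encodes
# the feature set (e.g. "mfcc_40_delta_pitch"); the parsing rules and default
# counts below are assumptions, not the project's actual logic:
def feature_column_prep(tablename):
    # base features, e.g. 40 MFCCs; fall back to 13 coefficients (assumption)
    num_features = 40 if '40' in tablename else 13
    num_feature_columns = num_features
    if 'delta' in tablename:
        num_feature_columns *= 3  # features plus first and second derivatives
    if 'pitch' in tablename:
        num_feature_columns += 1  # one extra column for the pitch value
    return num_features, num_feature_columns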