def run():
    """
    Runs the classification program using input values from the command line.
    """
    # Clear previous streams
    Main.clear_streams()

    # Persist classifier if it does not exist
    if len(glob.glob("saved_classifiers/clf_mfcc.h5")) < 1:
        print("Saving model...")
        clf_mfcc, scaler_mfcc = MFCC.train_mfcc_nn(util.data_path + "speech",
                                                   util.data_path + "music",
                                                   20000)
        clf_mfcc.save('saved_classifiers/clf_mfcc.h5')
        # Persist scaler
        joblib.dump(scaler_mfcc, "saved_classifiers/scaler_mfcc.joblib")
    else:
        print("Restoring models...")
        clf_mfcc = tf.keras.models.load_model('saved_classifiers/clf_mfcc.h5')
        scaler_mfcc = joblib.load("saved_classifiers/scaler_mfcc.joblib")

    if live_stream:
        Main.calc_from_stream(station, clf_mfcc, scaler_mfcc, is_mfcc, is_cfa,
                              listening_preference, replacement_path)
    else:
        filename = "stream_long"
        radiorec.record(station, 10, filename)
        file = "data/test/" + filename + ".mp3"
        Main.calc_from_file(file, filename, clf_mfcc, scaler_mfcc, is_mfcc, is_cfa)
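# `run()` reads `live_stream`, `station`, `is_mfcc`, `is_cfa`,
# `listening_preference` and `replacement_path` from module scope. How they are
# populated is not shown in this excerpt; a minimal argparse sketch, with all
# flag names and defaults assumed for illustration:
import argparse

parser = argparse.ArgumentParser(description="Speech/music classification")
parser.add_argument("--live", dest="live_stream", action="store_true",
                    help="Classify a live radio stream instead of a file")
parser.add_argument("--station", default="fm4",
                    help="Station key from radiorec_settings.ini")
parser.add_argument("--mfcc", dest="is_mfcc", action="store_true",
                    help="Use the MFCC feature")
parser.add_argument("--cfa", dest="is_cfa", action="store_true",
                    help="Use the CFA feature")
parser.add_argument("--keep", dest="listening_preference",
                    choices=["music", "speech"], default="music",
                    help="Which class should keep playing")
parser.add_argument("--replacement", dest="replacement_path",
                    default="data/replacement.mp3",
                    help="Audio played while the unwanted class is detected")

args = parser.parse_args()
live_stream = args.live_stream
station = args.station
is_mfcc = args.is_mfcc
is_cfa = args.is_cfa
listening_preference = args.listening_preference
replacement_path = args.replacement_path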
def evaluate_mfcc(x_tst, y_true, clf, scaler, iteration):
    """
    Evaluates the trained classifier with the given data. Prints and stores confusion matrices.
    :param x_tst: The test data
    :param y_true: The true test labels
    :param clf: The classifier
    :param scaler: The scaler
    :param iteration: The current iteration
    """
    print("Evaluating MFCC feature...")
    y_mfccs = []
    for file in tqdm(x_tst):
        # MFCC classification
        mfcc = MFCC.read_mfcc_from_file(file)
        result = MFCC.predict_nn(clf, scaler, mfcc)

        # The fraction of frames classified as music decides the file-level label
        ones = np.count_nonzero(result)
        fraction = ones / len(result)
        if fraction < 0.5:
            y_mfccs.append(0)
        else:
            y_mfccs.append(1)

    print("Evaluation for MFCC feature:")
    y_pred = y_mfccs
    report = sklearn.metrics.classification_report(y_true, y_pred,
                                                   labels=labels,
                                                   target_names=target_names)
    confusion_matrix = sklearn.metrics.confusion_matrix(y_true, y_pred,
                                                        labels=labels)
    print("Report")
    print(report)
    print("Confusion Matrix")
    print(confusion_matrix)
    pretty_print_cm(confusion_matrix, "MFCC", str(iteration))
    print("----------------------------------------------------------------- \n\n")
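# `pretty_print_cm` is called here and in the k-fold evaluation below but is
# not defined in this excerpt. A minimal sketch, assuming it renders the 2x2
# confusion matrix with matplotlib and saves it under a name derived from the
# feature and iteration (file name, layout and class names are assumptions):
import matplotlib.pyplot as plt

def pretty_print_cm(cm, feature_name, iteration):
    """Render a labeled confusion matrix and save it as a PNG (sketch)."""
    class_names = ["Speech", "Music"]
    fig, ax = plt.subplots()
    im = ax.imshow(cm, cmap="Blues")
    ax.set_xticks(range(len(class_names)))
    ax.set_yticks(range(len(class_names)))
    ax.set_xticklabels(class_names)
    ax.set_yticklabels(class_names)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    # Annotate each cell with its count
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            ax.text(col, row, str(cm[row, col]), ha="center", va="center")
    fig.colorbar(im)
    fig.savefig("plots/cm_" + feature_name + "_" + iteration + ".png")
    plt.close(fig)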
def print_mfcc(mfcc, clf, scaler_mfcc, result_mfcc, splits=9):
    """
    Splits the incoming MFCC data into `splits` blocks (9 by default), makes a
    prediction per block and prints the averaged result.
    :param mfcc: The incoming MFCC feature vectors
    :param clf: The trained MFCC classifier
    :param scaler_mfcc: The scaler instance (from training)
    :param result_mfcc: List to which the result is written
    :param splits: How many splits should be performed on the feature vectors
    """
    split = np.array_split(mfcc, splits)
    total = 0
    for i in range(splits):
        result = MFCC.predict_nn(clf, scaler_mfcc, split[i])
        ones = np.count_nonzero(result)
        total += ones / len(result)

    result = round(total / splits, 4)
    result_mfcc[0] = result
    print("MFCC Music: ", str(result))
import glob

import joblib
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

import util
from features import MFCC

# Load dataset (reuse cached MFCCs if they were already extracted)
if len(glob.glob(util.data_path + "mfcc_trn_gtzan.joblib")) < 1:
    X, Y = MFCC.calculate_mfccs(util.data_path + "speech/gtzan",
                                util.data_path + "music/gtzan",
                                max_duration=1800)
    joblib.dump(X, util.data_path + "mfcc_trn_gtzan.joblib")
    joblib.dump(Y, util.data_path + "mfcc_lbls_gtzan.joblib")
else:
    X = joblib.load(util.data_path + "mfcc_trn_gtzan.joblib")
    Y = joblib.load(util.data_path + "mfcc_lbls_gtzan.joblib")

# Prepare configuration for cross validation
seed = 7

# Prepare models
models = [('LR', LogisticRegression()),
          ('SGD', SGDClassifier()),
          ('LDA', LinearDiscriminantAnalysis()),
          ('CART', DecisionTreeClassifier()),
          ('NB', GaussianNB()),
          ('PC', Perceptron()),
          ('MLP', MLPClassifier())]
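# The snippet above sets up `seed` and a model list, but the comparison loop is
# not part of this excerpt. A minimal sketch of the usual pattern, evaluating
# each model with 10-fold cross validation (fold count and scoring metric are
# assumptions):
from sklearn import model_selection

results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold,
                                                 scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))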
def calc_from_stream(station, clf_mfcc, scaler_mfcc, is_mfcc, is_cfa,
                     listening_preference, replacement_path):
    """
    Streams 0.5 seconds of audio at a time and classifies the data.
    :param station: The radio station from which the data should be streamed.
                    See "radiorec_settings.ini" for available stations.
    :param clf_mfcc: The trained neural network classifier for MFCC classification
    :param scaler_mfcc: The scaler instance used to scale the original MFCC training data
    :param is_mfcc: Whether the MFCC value should be calculated and taken into
                    consideration for the final result
    :param is_cfa: Whether the CFA value should be calculated and taken into
                   consideration for the final result
    :param listening_preference: Specifies whether spoken segments or music
                                 segments should be kept
    :param replacement_path: The path to the audio file that is played when the
                             unwanted class is detected
    :return: void
    """
    mixer.init(frequency=16000, channels=1)

    succ_speech = 0
    succ_music = 0

    # Flag for checking if the replacement track is currently playing
    is_replacement = False

    i = 0
    while True:
        results = {}
        current_file = "stream_" + str(i)
        radiorec.record(station, 0.5, current_file)
        path = "data/test/" + current_file + ".mp3"
        wav_path = ac.mp3_to_16_khz_wav(path)
        print("Current: " + current_file)

        # Preprocess audio
        sig, rate, frequencies, times, spectrogram = Processing.preprocessing(wav_path)

        # Take time
        start = time.time()

        # Use features specified in command line arguments
        if is_mfcc:
            # MFCC classification runs in its own thread; pass the callable and
            # its arguments to the Thread instead of calling it in place
            current_mfcc = MFCC.read_mfcc(sig, rate)
            result_mfcc = [-1]
            thread_mfcc = threading.Thread(target=Output.print_mfcc,
                                           args=(current_mfcc, clf_mfcc,
                                                 scaler_mfcc, result_mfcc, 9))

        if is_cfa:
            # CFA classification
            cfa, peakis = CFA.calculate_cfa(spec=spectrogram, threshold=cfa_threshold)
            result = round(cfa, 4)
            results["cfa"] = result
            print("CFA Music: " + str(result))

        if is_mfcc:
            thread_mfcc.start()
            thread_mfcc.join()
            results["mfcc"] = result_mfcc[0]

        # Make a decision and add to blocks
        final_result = decide(results)

        # Add to successive blocks
        if final_result > 0.5:
            succ_music += 1
            succ_speech = 0
        else:
            succ_speech += 1
            succ_music = 0

        result_str = "SPEECH" if final_result <= 0.5 else "MUSIC"
        print("FINAL RESULT: ", final_result, " => " + result_str)
        print("Successive music blocks: ", succ_music)
        print("Successive speech blocks: ", succ_speech)

        # Fade out the track if the currently played type does not correspond to
        # what was specified via the command line. 4 blocks provide a good user
        # experience because single or double blocks are sometimes misclassified.
        if listening_preference == "music":
            if succ_speech > 4 and not is_replacement:
                mixer.music.fadeout(300)
                mixer.music.load(replacement_path)
                mixer.music.play()
                is_replacement = True
            if succ_music > 4 and is_replacement:
                is_replacement = False
                mixer.music.fadeout(300)

        if listening_preference == "speech":
            if succ_music > 4 and not is_replacement:
                mixer.music.fadeout(300)
                mixer.music.load(replacement_path)
                mixer.music.play()
                is_replacement = True
            if succ_speech > 4 and is_replacement:
                is_replacement = False
                mixer.music.fadeout(300)

        if not is_replacement:
            # Play audio stream
            mixer.music.load(wav_path)
            mixer.music.play()

        i += 1

        # Measure execution time
        end = time.time()
        print("Elapsed Time: ", str(end - start))
        print()
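# `decide` combines the per-feature scores into a single music probability but
# is not defined in this excerpt. A minimal sketch, assuming it simply averages
# whichever feature scores are present in the dict:
def decide(results):
    """Average the available feature scores into a final music score (sketch)."""
    if not results:
        return -1  # No feature was enabled
    return sum(results.values()) / len(results)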
def calc_from_file(file, filename, clf_mfcc, scaler_mfcc, is_mfcc, is_cfa):
    """
    Classifies an mp3 audio file and saves the results in a CSV and a PNG file.
    See the "plots" folder.
    :param file: The file to classify
    :param filename: The filename, used to name the CSV and PNG files
    :param clf_mfcc: The trained neural network classifier for MFCC classification
    :param scaler_mfcc: The scaler instance used to scale the original MFCC training data
    :param is_mfcc: Whether the MFCC value should be calculated and taken into
                    consideration for the final result
    :param is_cfa: Whether the CFA value should be calculated and taken into
                   consideration for the final result
    :return: An array covering the duration of the file in steps of 0.5s, and
             the generated speech_music_map
    """
    speech_music_map = []
    succ_speech = 0
    succ_music = 0

    path = file
    wav_path = ac.mp3_to_16_khz_wav(path)

    # Preprocess audio
    sig, rate, frequencies, times, spectrogram = Processing.preprocessing(wav_path)
    half_seconds = math.ceil(util.get_wav_duration(wav_path) * 2)

    mixer.init(frequency=16000, channels=1)
    mixer.music.load(wav_path)
    mixer.music.play()

    time_per_iteration = 0
    for i in range(half_seconds):
        # Take time
        start = time.time()

        results = {}
        print("Current: ")

        # Use features specified in command line arguments
        if is_mfcc:
            # MFCC classification on the current half-second slice of the signal
            startidx = math.floor(len(sig) * i / half_seconds)
            endidx = math.ceil(len(sig) * (i + 1) / half_seconds)
            current_mfcc = MFCC.read_mfcc(sig[startidx:endidx], rate)
            result_mfcc = [-1]
            thread_mfcc = threading.Thread(target=Output.print_mfcc,
                                           args=(current_mfcc, clf_mfcc,
                                                 scaler_mfcc, result_mfcc, 9))

        if is_cfa:
            startidx = math.floor(spectrogram.shape[1] * i / half_seconds)
            endidx = math.ceil(spectrogram.shape[1] * (i + 1) / half_seconds)

            # CFA classification
            cfa, peakis = CFA.calculate_cfa(spec=spectrogram[:, startidx:endidx],
                                            threshold=cfa_threshold)
            result = round(cfa, 4)
            results["cfa"] = result
            print("CFA Music: " + str(result))

        if is_mfcc:
            thread_mfcc.start()
            thread_mfcc.join()
            results["mfcc"] = result_mfcc[0]

        # Make a decision and add to blocks
        final_result = decide(results)

        # Add to successive blocks
        if final_result > 0.5:
            succ_music += 1
            succ_speech = 0
            speech_music_map.append(1)
        else:
            succ_speech += 1
            succ_music = 0
            speech_music_map.append(0)

        result_str = "SPEECH" if final_result <= 0.5 else "MUSIC"
        print("FINAL RESULT: ", final_result, " => " + result_str)
        print("Successive music blocks: ", succ_music)
        print("Successive speech blocks: ", succ_speech)

        # Measure execution time. The first iteration is skipped because it
        # takes longer while numba caches all the functions.
        end = time.time()
        elapsed = end - start
        time_per_iteration += elapsed if i > 0 else 0
        print("Elapsed Time: ", str(elapsed))
        print()

    # Save CSV and PNG of the sequence of classified data (speech_music_map)
    x = np.arange(len(speech_music_map)) / 2  # Convert from 0.5s blocks to seconds
    util.plot_speech_music_map(filename, x, speech_music_map, save_csv=True)

    print("Average time per iteration: ",
          str(time_per_iteration / max(half_seconds - 1, 1)))
    return x, speech_music_map
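# `util.plot_speech_music_map` is called above but not shown in this excerpt.
# A minimal sketch, assuming it draws the 0/1 sequence as a step plot and
# optionally writes a CSV next to the PNG (paths and figure layout are
# assumptions):
import csv
import matplotlib.pyplot as plt

def plot_speech_music_map(filename, x, speech_music_map, save_csv=False):
    """Plot speech (0) vs. music (1) over time and persist the result (sketch)."""
    fig, ax = plt.subplots(figsize=(12, 2))
    ax.step(x, speech_music_map, where="post")
    ax.set_xlabel("Time (s)")
    ax.set_yticks([0, 1])
    ax.set_yticklabels(["Speech", "Music"])
    fig.savefig("plots/" + filename + ".png", bbox_inches="tight")
    plt.close(fig)
    if save_csv:
        with open("plots/" + filename + ".csv", "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerows(zip(x, speech_music_map))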
def evaluate_mfcc_kfold(path_speech, path_music, max_duration, k):
    """
    Trains and evaluates a Keras neural network with k-fold cross validation.
    :param path_speech: The path to the speech data
    :param path_music: The path to the music data
    :param max_duration: The total duration of files that should be selected of
                         each class. For example, 5000 would train the network
                         with 5000 minutes of speech files and 5000 minutes of
                         music files
    :param k: The number of splits on the training data
    :return: None. Prints a report and confusion matrix per fold and the mean
             accuracy over all folds.
    """
    # Use existing training data (= extracted MFCCs) to skip the process of
    # recalculating the MFCCs each time
    if len(glob.glob(util.data_path + "mfcc_trn_kfold.joblib")) < 1:
        trn, lbls = MFCC.calculate_mfccs(path_speech, path_music, max_duration)
        joblib.dump(trn, util.data_path + "mfcc_trn_kfold.joblib")
        joblib.dump(lbls, util.data_path + "mfcc_lbls_kfold.joblib")
    else:
        trn = joblib.load(util.data_path + "mfcc_trn_kfold.joblib")
        lbls = joblib.load(util.data_path + "mfcc_lbls_kfold.joblib")

    # Initialize
    kfold = sklearn.model_selection.KFold(n_splits=k, shuffle=True)

    # The accuracy of each iteration is stored in this list
    accuracies = []

    for fold, (train_index, test_index) in enumerate(kfold.split(trn)):
        print("TRAIN:", train_index, "TEST:", test_index)

        # Take split from the total training data
        x_train, x_test = trn[train_index], trn[test_index]
        y_train, y_test = lbls[train_index], lbls[test_index]

        # Scale data
        scaler = sklearn.preprocessing.StandardScaler()
        x_train = scaler.fit_transform(x_train)

        # Classifier fitting: Keras nn
        clf = tf.keras.models.Sequential([
            tf.keras.layers.Flatten(input_shape=(1, 26)),
            tf.keras.layers.Dense(16, activation=tf.nn.relu),
            tf.keras.layers.Dense(8, activation=tf.nn.relu),
            tf.keras.layers.Dense(2, activation=tf.nn.softmax)
        ])
        clf.compile(optimizer='adam',
                    loss='sparse_categorical_crossentropy',
                    metrics=['accuracy'])

        # Data reshaping required for the network
        x_train = x_train.reshape((x_train.shape[0], 1, x_train.shape[1]))

        # Fit data
        clf.fit(x_train, y_train, epochs=5)

        # Run on test set
        correct = 0
        incorrect = 0
        y_pred = []
        for i in tqdm(range(len(x_test))):
            result = MFCC.predict_nn(clf, scaler, x_test[i].reshape(1, -1))
            ones = np.count_nonzero(result)
            result = ones / len(result)
            if result >= 0.5:
                y_pred.append(1)
                if y_test[i] == 1:
                    correct += 1
                else:
                    incorrect += 1
            else:
                y_pred.append(0)
                if y_test[i] == 0:
                    correct += 1
                else:
                    incorrect += 1

        accuracy = correct / (correct + incorrect)
        accuracies.append(accuracy)
        print("Results for test set: ", str(round(accuracy * 100, 2)) + "%")

        y_pred = np.array(y_pred)
        report = sklearn.metrics.classification_report(y_test, y_pred,
                                                       labels=labels,
                                                       target_names=target_names)
        confusion_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred,
                                                            labels=labels)
        print("Report")
        print(report)
        print("Confusion Matrix")
        print(confusion_matrix)
        pretty_print_cm(confusion_matrix, "MFCC", str(fold))
        print("----------------------------------------------------------------- \n\n")

    print("Mean accuracy:", np.mean(np.array(accuracies)))
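# `MFCC.predict_nn` is used throughout but defined elsewhere in the repository.
# A minimal sketch, assuming it scales the frames with the training scaler,
# reshapes them to the (n, 1, 26) layout the network above expects, and returns
# one 0 (speech) / 1 (music) label per frame via argmax over the softmax outputs:
def predict_nn(clf, scaler, mfcc):
    """Return a per-frame 0/1 prediction for a batch of MFCC frames (sketch)."""
    scaled = scaler.transform(mfcc)
    scaled = scaled.reshape((scaled.shape[0], 1, scaled.shape[1]))
    probabilities = clf.predict(scaled)
    return np.argmax(probabilities, axis=1)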