def inference_from_microphone():
    """Continuously record voice snippets from the microphone and classify them.

    Each press-and-hold of the 'R' key records one audio clip (saved under
    SAVE_AUDIO_TO); on key release the clip is classified and the result is
    printed and spoken aloud. Runs forever; stop with Ctrl-C.
    """
    # Setup model
    model = setup_classifier(load_weights_from=PATH_TO_WEIGHTS)
    setup_classes_labels(load_classes_from=PATH_TO_CLASSES, model=model)

    # Start keyboard listener
    recording_state = Value('i', 0)  # shared int flag for the listener thread
    board = KeyboardMonitor(recording_state, PRINT=False)
    board.start_listen(run_in_new_thread=True)

    # Set up audio recorder
    recorder = AudioRecorder()

    # Others
    tprinter = TimerPrinter()  # for print: rate-limits the usage hint below

    # Start loop
    cnt_voice = 0
    while True:
        # Print the usage hint at most once every T_gap seconds.
        tprinter.print("Usage: keep pressing down 'R' to record audio", T_gap=20)
        board.update_key_state()
        if board.has_just_pressed():
            cnt_voice += 1
            print("Record {}th voice".format(cnt_voice))

            # start recording
            recorder.start_record(folder=SAVE_AUDIO_TO)

            # wait until key release; keep polling the key state so the
            # "just released" edge is observed
            while not board.has_just_released():
                board.update_key_state()
                time.sleep(0.001)

            # stop recording
            recorder.stop_record()

            # Do inference
            audio = lib_datasets.AudioClass(filename=recorder.filename)
            predicted_label = model.predict_audio_label(audio)
            print("\nAll word labels: {}".format(model.classes))
            print("\nPredicted label: {}\n".format(predicted_label))

            # Shout out the results. e.g.: one is two
            lib_datasets.shout_out_result(recorder.filename,
                                          predicted_label,
                                          preposition_word="is",
                                          cache_folder="data/examples/")

            # reset for better printing
            print("\n")
            tprinter.reset()
        time.sleep(0.1)
def record_audio_and_classifiy(self, is_shout_out_result=False):
    """Record one utterance, classify it, and refresh the GUI panels.

    Recording runs until the GUI reports the hotkey released. The top-1
    label is shown as "none" when its probability is not above 0.8.

    Args:
        is_shout_out_result: if True, also play back audio announcing
            the final label.

    Returns:
        (predicted_label, max_prob): raw top-1 label and its probability
        (before the "none" thresholding).
    """
    classifier, class_names = self._model, self._CLASSES
    gui, recorder = self._gui, self._recorder

    # -- Record audio until the key is released
    gui.enable_img1_self_updating()
    recorder.start_record(folder=self._DST_AUDIO_FOLDER)  # Start record
    while not gui.is_key_released():  # Wait for key released
        time.sleep(0.001)
    recorder.stop_record()  # Stop record

    # -- Run inference on the freshly written wav file
    audio = lib_datasets.AudioClass(filename=recorder.filename)
    probs = classifier.predict_audio_label_probabilities(audio)
    top_idx = int(np.argmax(probs))
    predicted_label = class_names[top_idx]
    max_prob = probs[top_idx]
    print("\nAll word labels: {}".format(class_names))
    print("\nPredicted label: {}, probability: {}\n".format(
        predicted_label, max_prob))

    # Low-confidence predictions are reported as "none"
    PROB_THRESHOLD = 0.8
    if max_prob > PROB_THRESHOLD:
        final_label = predicted_label
    else:
        final_label = "none"

    # -- Update the image
    # Image 1: stop self updating and zero recording_length / voice_intensity
    gui.reset_img1()

    # Image 2: the prediction results
    gui.set_img2(
        final_label=final_label,
        predicted_label=predicted_label,
        probability=max_prob,
        length=audio.get_len_s(),
        valid_length=audio.get_len_s(),  # TODO: remove the silent voice,
    )

    # Image 3: per-class probabilities
    gui.set_img3(probabilities=probs)

    # -- Shout out the results. e.g.: two is one
    if is_shout_out_result:
        lib_datasets.shout_out_result(
            recorder.filename,  # Raw audio to shout out
            final_label,
            middle_word="is",
            cache_folder="data/examples/")

    return predicted_label, max_prob
def main(args):
    """Classify every wav file found under args.path_to_data and print
    one line per file with its predicted label."""
    model = setup_classifier(load_weights_from=args.path_to_weights)
    setup_classes_labels(load_classes_from=args.path_to_classes, model=model)

    filenames = get_wav_filenames(path_to_data=args.path_to_data)

    print("\nStart predicting audio label:\n")
    for idx, wav_path in enumerate(filenames):
        clip = lib_datasets.AudioClass(filename=wav_path)
        predicted = model.predict_audio_label(clip)
        print("{:03d}th file: Label = {:<10}, Filename = {}".format(
            idx, predicted, wav_path))
def inference_from_microphone():
    """Continuously record voice snippets from the microphone and classify them.

    Each press-and-hold of the 'R' key records one audio clip (saved under
    DST_AUDIO_FOLDER); on key release the clip is classified and the result
    is printed and spoken aloud. Runs forever; stop with Ctrl-C.
    """
    # Setup model
    model, classes = lib_rnn.setup_default_RNN_model(
        weight_filepath=SRC_WEIGHT_PATH, classes_txt=SRC_CLASSES_PATH)
    print(f"{len(classes)} classes: {classes}")

    # Start keyboard listener (runs in its own thread, watching 'R')
    keyboard = KeyboardInputFromTerminal(
        hotkey="R", is_print=False, run_in_new_thread=True)

    # Set up audio recorder
    recorder = AudioRecorder()

    # Others
    timer_printer = TimerPrinter(print_period=2.0)  # for print: rate-limited hint

    # Start loop
    cnt_voice = 0
    while True:
        timer_printer.print("Usage: keep pressing down 'R' to record audio")
        if keyboard.is_key_pressed():
            cnt_voice += 1
            print("\nRecord {}th voice".format(cnt_voice))

            # Record audio
            recorder.start_record(folder=DST_AUDIO_FOLDER)  # Start record
            while not keyboard.is_key_released():  # Wait for key released
                time.sleep(0.001)
            recorder.stop_record()  # Stop record

            # Do inference
            audio = lib_datasets.AudioClass(filename=recorder.filename)
            predicted_label = model.predict_audio_label(audio)
            print("\nAll word labels: {}".format(model.classes))
            print("\nPredicted label: {}\n".format(predicted_label))

            # Shout out the results. e.g.: one is two
            lib_datasets.shout_out_result(recorder.filename,
                                          predicted_label,
                                          middle_word="is",
                                          cache_folder="data/examples/")
        time.sleep(0.1)
def main(args):
    """Run the RNN classifier over every wav selected by args.data_folder
    and print one line per file with its predicted label."""
    # -- Init model
    model, classes = lib_rnn.setup_default_RNN_model(
        args.weight_path, args.classes_path)

    # -- Collect inputs: a single .wav path yields a one-element list;
    # a folder yields every .wav file inside it.
    filenames = lib_datasets.get_wav_filenames(
        data_folder=args.data_folder, suffix=".wav")

    # -- Classification
    print("\nStart predicting audio label:\n")
    for idx, wav_path in enumerate(filenames):
        clip = lib_datasets.AudioClass(filename=wav_path)
        predicted = model.predict_audio_label(clip)
        print("{:03d}th file: Label = {:<10}, Filename = {}".format(
            idx, predicted, wav_path))
def reset_audio_sample_rate(filename, dst_sample_rate=16000):
    """Resample the audio file at `filename` in place to `dst_sample_rate` Hz.

    If `dst_sample_rate` is None or already equals the file's sample rate,
    the file is rewritten with its data unchanged.

    Args:
        filename: path to a wav file readable by soundfile.
        dst_sample_rate: target sample rate in Hz, or None to keep as-is.
    """
    data, sample_rate = sf.read(filename)
    if (dst_sample_rate is not None) and (dst_sample_rate != sample_rate):
        data = librosa.core.resample(data, sample_rate, dst_sample_rate)
        sample_rate = dst_sample_rate
    sf.write(filename, data, sample_rate)
    # Fix: include the actual filename in the message (it previously printed
    # a placeholder instead of the file it claimed to report).
    print(f"Reset sample rate to {dst_sample_rate} for the file: {filename}")


# ---- Script: randomly augment each noise wav 5 times, write the variants
# to data/data_tmp/, and play each one.
folder = "./data/noises/"
fnames = get_filenames(folder, file_type="*.wav")

Aug = lib_augment.Augmenter
aug = Aug([
    Aug.Crop(time=(0.6, 2.0)),
    Aug.PadZeros(time=(0, 0.3)),
    Aug.PlaySpeed(rate=(0.7, 1.5), keep_size=False),
    Aug.Amplify(rate=(0.6, 1.5)),
])

for name in fnames:
    audio0 = lib_datasets.AudioClass(filename=name)
    for i in range(5):
        print(i)
        # Deep-copy so each augmentation starts from the pristine clip.
        audio = copy.deepcopy(audio0)
        aug(audio)
        # e.g. "dir/a.wav" -> "a_005.wav": add index suffix, keep basename only.
        name_new = lib_commons.add_idx_suffix(name, i).split('/')[-1]
        audio.write_to_file("data/data_tmp/" + name_new)
        audio.play_audio()
'''
Test the function `lib_proc_audio.compute_mfcc`,
which is called by `audio.compute_mfcc()`.
'''

import matplotlib.pyplot as plt

if True:  # Add ROOT and import my libraries.
    import sys
    import os
    # Root of the project: one level above this test file's directory.
    ROOT = os.path.dirname(os.path.abspath(__file__)) + \
        "/../"  # Root of the project.
    sys.path.append(ROOT)
    import utils.lib_datasets as lib_datasets

# Load a sample clip, compute its MFCC features, and show both the
# waveform and the MFCC plot in a blocking matplotlib window.
audio = lib_datasets.AudioClass(filename="test_data/audio_front.wav")
audio.compute_mfcc()
audio.plot_audio_and_mfcc()
plt.show()