def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None, result_callback=None,
                   detection_callback=None, sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Initialize recorder.
    AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
    downsample_factor = 1
    if AUDIO_SAMPLE_RATE_HZ == 48000:
        downsample_factor = 3  # Most microphones support 48 kHz.
    # Because the model expects 16 kHz audio, we downsample by a factor of 3.
    recorder = audio_recorder.AudioRecorder(
        AUDIO_SAMPLE_RATE_HZ,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)
    if commands_file:
        commands = read_commands(commands_file)
    else:
        commands = {}
    logger.info("Loaded commands: %s", str(commands))
    logger.info("Recording")
    timed_out = False
    with recorder:
        last_detection = -1
        while not timed_out:
            try:
                spectrogram = feature_extractor.get_next_spectrogram(recorder)
                set_input(interpreter, spectrogram.flatten())
                interpreter.invoke()
                result = get_output(interpreter)
                if result_callback:
                    result_callback(result, commands, labels)
                if detection_callback:
                    detection = -1
                    # Index 0 is the negative (background) class; only look
                    # for commands when it is below the threshold.
                    if result[0] < negative_threshold:
                        top3 = np.argsort(-result)[:3]
                        for p in range(3):
                            label = labels[top3[p]]
                            if label not in commands:
                                continue
                            # Skip index 0 and require the per-command
                            # confidence threshold to be met.
                            if top3[p] and result[top3[p]] > commands[label]['conf']:
                                if detection < 0:
                                    detection = top3[p]
                    if detection < 0 and last_detection > 0:
                        print("---------------")
                        last_detection = 0
                    if (detection >= 0 and labels[detection] in commands
                            and detection != last_detection):
                        print(labels[detection], commands[labels[detection]])
                        detection_callback(commands[labels[detection]]['key'])
                        last_detection = detection
                if spectrogram.mean() < 0.001:
                    print("Warning: Input audio signal is nearly 0. Mic may be off?")
            except Exception:
                # Stop the loop and surface the error instead of dying silently.
                print("Unexpected error during classification; stopping.")
                timed_out = True
                raise
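# A hedged usage sketch for the variant above. The file names are
# placeholders and the callbacks are hypothetical; only the callback
# signatures and the shape of the mapping that read_commands() must
# produce ({label: {'conf': threshold, 'key': key}}) are taken from the
# dict accesses inside classify_audio.
def _example_result_callback(result, commands, labels):
    top = np.argmax(result)
    print('top label: %s (%.3f)' % (labels[top], result[top]))

def _example_detection_callback(key):
    # Receives commands[label]['key'] for a confident, non-repeated detection.
    print('detected command key:', key)

# interpreter = ...  # an already-initialized TFLite interpreter
# classify_audio(audio_device_index=0,
#                interpreter=interpreter,
#                labels_file='labels.txt',        # placeholder path
#                commands_file='commands.txt',    # placeholder path
#                result_callback=_example_result_callback,
#                detection_callback=_example_detection_callback)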
def classify_audio(model_file, labels_file, callback,
                   audio_device_index=0, sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    downsample_factor = 1
    if sample_rate_hz == 48000:
        downsample_factor = 3  # Most microphones support 48 kHz.
    # Because the model expects 16 kHz audio, we downsample by a factor of 3.
    recorder = audio_recorder.AudioRecorder(
        sample_rate_hz,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)

    interpreter = make_interpreter(model_file)
    interpreter.allocate_tensors()

    keep_listening = True
    prev_detection = -1
    with recorder:
        print("Ready for voice commands...")
        while keep_listening:
            spectrogram = feature_extractor.get_next_spectrogram(recorder)
            if spectrogram.mean() < 0.001:
                print("Warning: Input audio signal is nearly 0. Mic may be off?")
            set_input(interpreter, spectrogram.flatten())
            interpreter.invoke()
            result = get_output(interpreter)
            # Index 0 is the negative (background) class; reset and keep
            # listening if it dominates.
            if result[0] >= negative_threshold:
                prev_detection = -1
                continue
            detection = np.argmax(result)
            if detection == 0:
                prev_detection = -1
                continue
            # Fire the callback once per distinct detection; it returns
            # whether to keep listening.
            if detection != prev_detection:
                keep_listening = callback(labels[detection], result[detection])
                prev_detection = detection
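# A minimal sketch of driving the callback-based variant above. Paths are
# placeholders; the callback contract (return True to keep listening, False
# to stop) follows from the assignment to keep_listening in the loop.
def _example_command_handler(label, score):
    print('Heard %s with score %.3f' % (label, score))
    return label != 'stop'  # hypothetical stop word ends the loop

# classify_audio('model_edgetpu.tflite', 'labels.txt', _example_command_handler)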
def main():
    microphone = audio_recorder.AudioRecorder(INPUT_DEVICE)
    process_audio_forever(microphone)
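# `process_audio_forever` and INPUT_DEVICE are not defined in these
# excerpts. A minimal sketch of such a loop, assuming only the
# audio_recorder.AudioRecorder API used elsewhere in this file (the context
# manager plus get_audio(num_audio_frames), which returns the samples first):
def process_audio_forever_sketch(recorder):
    with recorder:
        while True:
            samples = recorder.get_audio(7921)[0]  # chunk size borrowed from the YAMNet loop below
            print('got %d samples, mean |level| %.4f'
                  % (len(samples), np.abs(samples).mean()))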
def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None, result_callback=None,
                   detection_callback=None, sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Initialize recorder.
    AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
    downsample_factor = 1
    if AUDIO_SAMPLE_RATE_HZ == 48000:
        downsample_factor = 3  # Most microphones support 48 kHz.
    # Because the model expects 16 kHz audio, we downsample by a factor of 3.
    recorder = audio_recorder.AudioRecorder(
        AUDIO_SAMPLE_RATE_HZ,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)
    if commands_file:
        commands = read_commands(commands_file)
    else:
        commands = {}
    logger.info("Loaded commands: %s", str(commands))
    logger.info("Recording")
    timed_out = False

    # Testing: load a known sample from disk instead of the microphone.
    if False:
        sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'
        import tensorflow as tf
        import os

        def decode_audio(audio_binary):
            audio, _ = tf.audio.decode_wav(audio_binary)
            return tf.squeeze(audio, axis=-1)

        def get_label(file_path):
            parts = tf.strings.split(file_path, os.path.sep)
            # Note: use indexing here instead of tuple unpacking so this
            # works in a TensorFlow graph.
            return parts[-2]

        def get_waveform_and_label(file_path):
            label = get_label(file_path)
            audio_binary = tf.io.read_file(file_path)
            waveform = decode_audio(audio_binary)
            return waveform, label

        waveform, label = get_waveform_and_label(sample_data)
        print(waveform.shape)
    # End testing

    with recorder:
        last_detection = -1
        while not timed_out:
            spectrogram = feature_extractor.get_next_spectrogram(recorder).astype('float32')
            # spectrogram = feature_extractor.compute_spectrogram_and_normalize(waveform.numpy()[:15680], 16000)
            # plot_spectrogram(spectrogram)
            # Add the channel and batch dimensions the model expects.
            spectrogram = np.expand_dims(spectrogram, axis=-1)
            spectrogram = np.expand_dims(spectrogram, axis=0)
            input_details = interpreter.get_input_details()
            interpreter.set_tensor(input_details[0]['index'], spectrogram)
            # set_input(interpreter, spectrogram.flatten())
            interpreter.invoke()
            result = get_output(interpreter)
            # NOTE: Add softmax
            # NOTE: Remove negative label
            result = softmax(result)
            # print(result)
            if result_callback:
                result_callback(result, commands, labels)
            if detection_callback:
                detection = -1
                if result[0] < negative_threshold:
                    top3 = np.argsort(-result)[:3]
                    for p in range(3):
                        label = labels[top3[p]]
                        if label not in commands:
                            continue
                        if top3[p] and result[top3[p]] > commands[label]['conf']:
                            if detection < 0:
                                detection = top3[p]
                if detection < 0 and last_detection > 0:
                    print("---------------")
                    last_detection = 0
                if (detection >= 0 and labels[detection] in commands
                        and detection != last_detection):
                    print(labels[detection], commands[labels[detection]])
                    detection_callback(commands[labels[detection]]['key'])
                    last_detection = detection
            if spectrogram.mean() < 0.001:
                print("Warning: Input audio signal is nearly 0. Mic may be off?")
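# `softmax` is called above but not defined in these excerpts. A standard,
# numerically stable NumPy implementation (scipy.special.softmax is
# equivalent):
def softmax(x):
    e = np.exp(x - np.max(x))  # subtract the max so exp() cannot overflow
    return e / e.sum()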
def classify_audio(audio_device_index, interpreter, labels_file,
                   commands_file=None, result_callback=None,
                   detection_callback=None, sample_rate_hz=16000,
                   negative_threshold=0.6, num_frames_hop=33):
    """Acquire audio, preprocess, and classify."""
    # Initialize recorder.
    AUDIO_SAMPLE_RATE_HZ = sample_rate_hz
    downsample_factor = 1
    if AUDIO_SAMPLE_RATE_HZ == 48000:
        downsample_factor = 3  # Most microphones support 48 kHz.
    # Because the model expects 16 kHz audio, we downsample by a factor of 3.
    recorder = audio_recorder.AudioRecorder(
        AUDIO_SAMPLE_RATE_HZ,
        downsample_factor=downsample_factor,
        device_index=audio_device_index)
    feature_extractor = Uint8LogMelFeatureExtractor(
        num_frames_hop=num_frames_hop)
    labels = read_labels(labels_file)
    if commands_file:
        commands = read_commands(commands_file)
    else:
        commands = {}
    logger.info("Loaded commands: %s", str(commands))
    logger.info("Recording")
    timed_out = False

    # Testing: load a known sample from disk instead of the microphone.
    if False:
        sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav'
        import tensorflow as tf
        import os

        def decode_audio(audio_binary):
            audio, _ = tf.audio.decode_wav(audio_binary)
            return tf.squeeze(audio, axis=-1)

        def get_label(file_path):
            parts = tf.strings.split(file_path, os.path.sep)
            # Note: use indexing here instead of tuple unpacking so this
            # works in a TensorFlow graph.
            return parts[-2]

        def get_waveform_and_label(file_path):
            label = get_label(file_path)
            audio_binary = tf.io.read_file(file_path)
            waveform = decode_audio(audio_binary)
            return waveform, label

        waveform, label = get_waveform_and_label(sample_data)
        print(waveform.shape)
    # End testing

    # YAMNet: start testing
    import os
    import soundfile as sf
    import params as yamnet_params
    import yamnet as yamnet_model
    from scipy.io import wavfile

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    if not os.path.exists('yamnet.h5'):
        print('Error: yamnet.h5 not found. Fetch it with: '
              'curl -O https://storage.googleapis.com/audioset/yamnet.h5')
        exit()
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    import pygame
    pygame.init()
    screen = pygame.display.set_mode((640, 480))
    font_header = pygame.font.Font(pygame.font.get_default_font(), 36)
    font = pygame.font.Font(pygame.font.get_default_font(), 36 * 2)
    text_surface = font.render('Hello world', True, (0, 0, 0))
    GRAY = (200, 200, 200)
    # YAMNet: end testing

    with recorder:
        last_detection = -1
        while not timed_out:
            audio_sample = recorder.get_audio(7921)[0]
            if False:
                # Round-trip through a WAV file for debugging.
                wavfile.write('test.wav', 16000, audio_sample)
                wav_data, sr = sf.read('test.wav', dtype=np.int16)
            else:
                wav_data = np.array(audio_sample, dtype=np.int16)
                sr = AUDIO_SAMPLE_RATE_HZ
            assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
            waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
            waveform = waveform.astype('float32')

            # Convert to mono and the sample rate expected by YAMNet.
            if len(waveform.shape) > 1:
                waveform = np.mean(waveform, axis=1)
            if sr != params.sample_rate:
                waveform = resampy.resample(waveform, sr, params.sample_rate)

            print('-------')
            # Predict YAMNet classes.
            scores, embeddings, spectrogram = yamnet(waveform)
            # Scores is a matrix of (time_frames, num_classes) classifier scores.
            # Average them along time to get an overall classifier output for the clip.
            prediction = np.mean(scores, axis=0)
            # Report the highest-scoring classes and their scores.
            top5_i = np.argsort(prediction)[::-1][:5]
            print(':\n' + '\n'.join(
                '  {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
            print('{}:{:.3f}'.format(yamnet_classes[42], prediction[42]))
            print('{}:{:.3f}'.format(yamnet_classes[0], prediction[0]))
            print('{}:{:.3f}'.format(yamnet_classes[494], prediction[494]))

            target_predictions = prediction[42], prediction[0], prediction[494]
            target_classes = (yamnet_classes[42], yamnet_classes[0],
                              yamnet_classes[494])
            index = np.argsort(target_predictions)[::-1][0]
            black = (0, 0, 0)
            green = (0, 255, 0)
            red = (255, 0, 0)
            if index == 0:
                color = red
            elif index == 1:
                color = green
            else:
                color = black
            text1 = font.render(target_classes[index], True, color)
            header1 = font_header.render('R-zero Device Listening for Audio',
                                         True, (0, 0, 0))
            screen.fill(GRAY)
            screen.blit(header1, dest=(20, 100))
            screen.blit(text1, dest=(200, 200))
            pygame.display.update()
            '''
            line = '{}:{:.3f}'.format(yamnet_classes[42], prediction[42])
            label = Tk.Label(None, text=line, font=('Times', '18'), fg='blue')
            label.pack()
            label.mainloop()
            '''
            # End
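# The loop above hard-codes YAMNet class indices (42, 0, 494). A sketch of
# resolving indices by class name instead, assuming only that the first
# argument is the array of names loaded from yamnet_class_map.csv; the
# names in the commented call are illustrative, not necessarily the classes
# the hard-coded indices above map to.
def yamnet_class_index(class_names, name):
    matches = [i for i, c in enumerate(class_names) if c == name]
    if not matches:
        raise KeyError('Unknown YAMNet class: %r' % name)
    return matches[0]

# target_indices = [yamnet_class_index(yamnet_classes, n)
#                   for n in ('Speech', 'Silence')]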