def yamnet_grad_test():
    """Compute the gradient of a YAMNet loss w.r.t. a 440 Hz sine input.

    The loss is the MSE between the model scores and a copy of those scores
    whose class-0 column is forced to 1. Three WAV files are written:
    'sine.wav' (the input tone), 'speechy.wav' (tone + 1000 * gradient) and
    'grad.wav' (1000 * gradient alone).
    """
    sample_rate = 16000
    duration_s = 3
    gradient_gain = 1000

    # Synthesize the tone as a single-row batch.
    time_axis = np.linspace(0, duration_s, num=int(duration_s * sample_rate))
    waveform = np.reshape(np.sin(2 * np.pi * 440 * time_axis), [1, -1])
    print(waveform[0])
    wavfile.write('sine.wav', sample_rate, waveform[0])

    model = yamnet_frames_model(params)
    model.load_weights('yamnet.h5')
    classes = class_names('yamnet_class_map.csv')

    with tf.GradientTape() as grad_tape:
        audio_tensor = tf.convert_to_tensor(np.reshape(waveform, [1, -1]))
        print(f'Audio Tensor is: {type(audio_tensor)}')
        # A plain tensor is not tracked automatically; watch it explicitly.
        grad_tape.watch(audio_tensor)
        scores, spectrograms = model(audio_tensor)
        print(f'Scores is: {type(scores)}')

        # Target = current scores with the class-0 column set to 1.
        forced = scores.numpy()
        assert forced.shape == scores.shape
        forced[:, 0] = 1
        target_scores = tf.convert_to_tensor(forced)
        loss = tf.keras.losses.MSE(target_scores, scores)

    gradient_tensor = grad_tape.gradient(loss, audio_tensor)

    first_frame = scores[0]
    print(first_frame)
    print(classes[np.argsort(first_frame)[-3:]])
    print(gradient_tensor.shape)
    print(audio_tensor.shape)

    # NOTE(review): this *adds* the loss gradient to the audio; if the intent
    # is to reduce the loss, a subtraction may have been meant - confirm.
    scaled_gradient = gradient_gain * gradient_tensor
    output_tensor = audio_tensor + scaled_gradient
    wavfile.write('speechy.wav', sample_rate, output_tensor[0].numpy())
    wavfile.write('grad.wav', sample_rate, gradient_gain * gradient_tensor[0].numpy())
def main(argv):
    """Classify each WAV file named in argv with YAMNet.

    For every file the clip-level prediction (time-average of the frame
    scores) is computed and its five highest-scoring classes are printed.

    Args:
        argv: Non-empty sequence of WAV file paths.
    """
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        # Read int16 PCM and scale it into [-1.0, +1.0] float32.
        samples, source_rate = sf.read(file_name, dtype=np.int16)
        assert samples.dtype == np.int16, 'Bad sample type: %r' % samples.dtype
        waveform = (samples / 32768.0).astype('float32')

        # Down-mix to mono, then match the sample rate the model expects.
        if waveform.ndim > 1:
            waveform = np.mean(waveform, axis=1)
        if source_rate != params.sample_rate:
            waveform = resampy.resample(waveform, source_rate,
                                        params.sample_rate)

        # scores is (time_frames, num_classes); average over time for a
        # single clip-level prediction.
        scores, embeddings, spectrogram = yamnet(waveform)
        prediction = np.mean(scores, axis=0)

        top5_i = np.argsort(prediction)[::-1][:5]
        report_lines = [
            ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
            for i in top5_i
        ]
        print(file_name, ':\n' + '\n'.join(report_lines))
def check_model(model_fn, class_map_path, params):
    """Applies yamnet_test's sanity checks to an instance of YAMNet.

    Three synthetic 3-second clips (silence, uniform white noise and a
    440 Hz sine) are fed to the model; each must have its expected class
    among the top-scoring clip-level predictions.

    Args:
        model_fn: Callable taking a float32 waveform and returning a
            (predictions, embeddings, log_mel_spectrogram) triple.
        class_map_path: Path of the class-map file passed to
            `yamnet.class_names`.
        params: Object exposing `sample_rate`.

    Raises:
        AssertionError: If an expected class is missing from the top
            predictions of its clip.
    """
    yamnet_classes = yamnet.class_names(class_map_path)

    def clip_test(waveform, expected_class_name, top_n=10):
        """Assert that expected_class_name is in the top_n clip predictions."""
        predictions, embeddings, log_mel_spectrogram = model_fn(waveform)
        clip_predictions = np.mean(predictions, axis=0)
        top_n_indices = np.argsort(clip_predictions)[-top_n:]
        top_n_scores = clip_predictions[top_n_indices]
        top_n_class_names = yamnet_classes[top_n_indices]
        top_n_predictions = list(zip(top_n_class_names, top_n_scores))
        assert expected_class_name in top_n_class_names, (
            'Did not find expected class {} in top {} predictions: {}'.format(
                expected_class_name, top_n, top_n_predictions))

    clip_test(
        waveform=np.zeros((int(3 * params.sample_rate),), dtype=np.float32),
        expected_class_name='Silence')

    np.random.seed(51773)  # Ensure repeatability.
    clip_test(
        waveform=np.random.uniform(-1.0, +1.0,
                                   (int(3 * params.sample_rate),)).astype(np.float32),
        expected_class_name='White noise')

    clip_test(
        waveform=np.sin(2 * np.pi * 440 *
                        np.arange(0, 3, 1 / params.sample_rate), dtype=np.float32),
        expected_class_name='Sine wave')
def main(argv):
    """Classify each WAV file in argv with a YAMNet built in its own graph.

    Args:
        argv: Non-empty sequence of WAV file paths.
    """
    assert argv

    # Build the model and load its weights inside a dedicated graph.
    graph = tf.Graph()
    with graph.as_default():
        yamnet = yamnet_model.yamnet_frames_model(params)
        yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        samples, source_rate = sf.read(file_name, dtype=np.int16)
        assert samples.dtype == np.int16, 'Bad sample type: %r' % samples.dtype
        waveform = samples / 32768.0  # Scale into [-1.0, +1.0].

        # Down-mix to mono, then match the sample rate YAMNet expects.
        if waveform.ndim > 1:
            waveform = np.mean(waveform, axis=1)
        if source_rate != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, source_rate,
                                        params.SAMPLE_RATE)

        # The second output (log-mel spectrogram) is not needed here.
        # steps=1 is a workaround for Keras batching limitations.
        batch = np.reshape(waveform, [1, -1])
        with graph.as_default():
            scores, _ = yamnet.predict(batch, steps=1)

        # Average frame scores over time for a clip-level prediction.
        prediction = np.mean(scores, axis=0)
        top5_i = np.argsort(prediction)[::-1][:5]
        report_lines = [
            ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
            for i in top5_i
        ]
        print(file_name, ':\n' + '\n'.join(report_lines))
def embedding(self, input_paths, output_paths, embed_paths=""):
    """Run the YAMNet extraction over all inputs using a single process.

    Args:
        input_paths: Sequence of input paths.
        output_paths: Sequence of output paths, paired with input_paths.
        embed_paths: Sequence of embedding output paths, or "" to disable
            saving embeddings (empty placeholders are then paired with
            the inputs).
    """
    save_embedding = embed_paths != ""
    if not save_embedding:
        embed_paths = [""] * len(input_paths)

    # Load the model once; every work item shares it through the partial.
    params = yamnet_params.Params(sample_rate=self.sample_rate,
                                  patch_hop_seconds=0.48)
    class_names = yamnet_model.class_names(self.class_names)
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights(self.model_checkpoint)

    worker = partial(
        self._embed,
        yamnet=yamnet,
        params=params,
        class_names=class_names,
        save_embedding=save_embedding,
    )
    work_items = list(zip(input_paths, embed_paths, output_paths))
    self.single_process(worker, work_items)
def setUpClass(cls):
    """Build YAMNet once, in its own graph, and share it across the tests."""
    super(YAMNetTest, cls).setUpClass()

    graph = tf.Graph()
    with graph.as_default():
        model = yamnet.yamnet_frames_model(params)
        model.load_weights('yamnet.h5')

    cls._yamnet_graph = graph
    cls._yamnet = model
    cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
def __init__(self):
    """Cap GPU memory (when a GPU exists) and load YAMNet in its own graph.

    Sets `self.graph`, `self.yamnet` and `self.yamnet_classes`.
    """
    physical_devices = tf.config.experimental.list_physical_devices('GPU')
    # Only the first GPU is capped. On a host without any GPU the list is
    # empty, so skip the cap instead of failing with IndexError.
    if physical_devices:
        tf.config.experimental.set_virtual_device_configuration(
            physical_devices[0], [
                tf.config.experimental.VirtualDeviceConfiguration(
                    memory_limit=4096)
            ])

    self.graph = tf.Graph()
    with self.graph.as_default():
        self.yamnet = yamnet_model.yamnet_frames_model(params)
        self.yamnet.load_weights('yamnet/yamnet.h5')
    self.yamnet_classes = yamnet_model.class_names(
        'yamnet/yamnet_class_map.csv')
def main(argv):
    """Score WAV files with the SavedModel stored under 'model'.

    Prints the model's metadata and 'score' signature, then for every
    file prints the raw scores and the five best clip-level classes.

    Args:
        argv: Non-empty sequence of WAV file paths.
    """
    assert argv

    model = tf.saved_model.load('model')
    metadata = model.signatures["metadata"]()
    print('metadata', metadata)

    score_fn = model.signatures["score"]
    print(score_fn)

    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')
    print(yamnet_classes)

    for file_name in argv:
        samples, source_rate = sf.read(file_name, dtype=np.int16)
        assert samples.dtype == np.int16, 'Bad sample type: %r' % samples.dtype
        waveform = samples / 32768.0  # Scale into [-1.0, +1.0].

        # Down-mix to mono, then match the sample rate YAMNet expects.
        if waveform.ndim > 1:
            waveform = np.mean(waveform, axis=1)
        if source_rate != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, source_rate,
                                        params.SAMPLE_RATE)

        # Add a leading axis and a trailing axis around the sample axis.
        waveform = tf.constant(waveform, dtype=tf.float32)
        waveform = tf.expand_dims(tf.expand_dims(waveform, 0), 2)

        hop_samples = tf.constant(
            int(params.PATCH_HOP_SECONDS * params.SAMPLE_RATE), dtype=tf.int64)
        outputs = score_fn(waveform=waveform, context_step_samples=hop_samples)
        # Take the first value of the returned output mapping.
        scores = next(iter(outputs.values())).numpy()
        print(scores)

        # Average the first batch entry's frame scores over time.
        prediction = np.mean(scores[0], axis=0)
        top5_i = np.argsort(prediction)[::-1][:5]
        report_lines = [
            ' {:12s}: {:.5f}'.format(yamnet_classes[i], prediction[i])
            for i in top5_i
        ]
        print(file_name, ':\n' + '\n'.join(report_lines))
def __init__(self, config_path="./config.yaml"):
    """Load the configuration, metadata and the three models used by search.

    Args:
        config_path: Path of the OmegaConf configuration file.
    """
    super().__init__()

    # --- Configuration -------------------------------------------------
    conf = OmegaConf.load(config_path)
    self.dataset_path = conf.dataset_path
    self.audio_path = os.path.join(conf.dataset_path, "podcasts-audio")
    self.es_url = conf.search_es_url  # Elasticsearch URL to query
    self.es_num = conf.search_es_num  # Segments to request per query
    self.sample_rate = 44100  # Hardcoded sample rate of all podcast audio

    # --- Podcast metadata ----------------------------------------------
    self.metadata = load_metadata(self.dataset_path)

    # --- Reranking model -----------------------------------------------
    rerank_name = conf.search_rerank_model
    cache_dir = conf.search_cache_dir
    self.rerank_tokenizer = AutoTokenizer.from_pretrained(
        rerank_name, use_fast=True, cache_dir=cache_dir)
    self.rerank_model = AutoModelForSequenceClassification.from_pretrained(
        rerank_name, cache_dir=cache_dir)
    self.rerank_model.to("cpu", non_blocking=True)
    self.rerank_max_seq_len = 512

    # --- openSMILE extractor -------------------------------------------
    functionals_conf = os.path.join(
        os.getenv("PODCAST_PATH"),
        "data/custom_FrameModeFunctionals.conf.inc",
    )
    self.smile = opensmile.Smile(
        feature_set=opensmile.FeatureSet.eGeMAPSv02,
        feature_level=opensmile.FeatureLevel.Functionals,
        options={"frameModeFunctionalsConf": functionals_conf},
    )

    # --- YAMNet model --------------------------------------------------
    params = yamnet_params.Params(sample_rate=self.sample_rate,
                                  patch_hop_seconds=0.48)
    class_map_path = os.path.join(os.getenv("YAMNET_PATH"),
                                  "yamnet_class_map.csv")
    self.yamnet_classes = yamnet_model.class_names(class_map_path)
    self.yamnet_model = yamnet_model.yamnet_frames_model(params)
    weights_path = os.path.join(os.getenv("PODCAST_PATH"), "data/yamnet.h5")
    self.yamnet_model.load_weights(weights_path)
def main(argv):
    """Classify WAV files with YAMNet and dump all class scores to JSON.

    For each file the five best clip-level classes are printed, and the
    full list of {'label', 'value'} entries, sorted by descending value,
    is written to 'event.json' next to this script. The file is rewritten
    for every input, so it ends up holding the last file's scores.

    Args:
        argv: Non-empty sequence of WAV file paths.
    """
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    # All resources live in the directory that contains this script.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    model_path = os.path.join(base_dir, 'yamnet.h5')
    classes_path = os.path.join(base_dir, 'yamnet_class_map.csv')
    event_path = os.path.join(base_dir, 'event.json')

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights(model_path)
    yamnet_classes = yamnet_model.class_names(classes_path)

    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')

        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)

        # Scores is a matrix of (time_frames, num_classes) classifier
        # scores; average along time for an overall clip-level output.
        scores, embeddings, spectrogram = yamnet(waveform)
        prediction = np.mean(scores, axis=0)

        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(file_name, ':\n' + '\n'.join(
            ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
            for i in top5_i))

        # Collect every class with its score rounded to 6 decimals.
        class_scores = prediction.tolist()
        pred = [
            {'label': cls, 'value': round(class_scores[i], 6)}
            for i, cls in enumerate(yamnet_classes)
        ]
        pred.sort(key=lambda item: item['value'], reverse=True)

        # Use a context manager so the handle is flushed and closed even
        # if serialization fails.
        with codecs.open(event_path, 'w', encoding='utf-8') as event_file:
            json.dump(pred, event_file, separators=(',', ':'),
                      sort_keys=True, indent=4)
def main(argv):
    """Listen to the microphone forever and log/act on YAMNet detections.

    Each time a full analysis window has been gathered, the top clip-level
    class is computed. If its score exceeds THRESHOLD it is appended to
    '/tmp/sound.log' (unless it is one of the noisy classes), and
    doorbell-like classes additionally trigger the HomeKit motion hook.

    Args:
        argv: Unused.
    """
    global analysisdata, frame_counter

    noisy_classes = ("Fireworks", "Silence", "Inside, small room")
    doorbell_classes = ("Beep, bleep", "Doorbell", "Glass", "Ding")

    # The context manager guarantees the log is closed when the endless
    # loop is left through an exception (e.g. KeyboardInterrupt).
    with open('/tmp/sound.log', 'w') as log:
        # Set up yamnet
        params = yamnet_params.Params(sample_rate=ANALYSIS_SAMPLE_RATE,
                                      patch_hop_seconds=0.1)
        yamnet = yamnet_model.yamnet_frames_model(params)
        yamnet.load_weights('/home/pi/models/research/audioset/yamnet/yamnet.h5')
        yamnet_classes = yamnet_model.class_names(
            '/home/pi/models/research/audioset/yamnet/yamnet_class_map.csv')

        # Set up a live callback stream from the microphone
        stream = sd.InputStream(device=1,
                                channels=1,
                                samplerate=RECORD_SAMPLE_RATE,
                                callback=audio_callback,
                                blocksize=BUFFER_SIZE_F)
        window_samples = int(ANALYSIS_LENGTH_S * ANALYSIS_SAMPLE_RATE)

        with stream:
            while True:
                update_analysis_window()
                if frame_counter < window_samples:
                    continue
                frame_counter = 0

                scores = yamnet.predict(analysisdata, steps=1)[0]
                if not len(scores):
                    continue

                # Only the single best clip-level class is considered.
                prediction = np.mean(scores, axis=0)
                best = np.argsort(prediction)[-1]
                if not prediction[best] > THRESHOLD:
                    continue
                top_class_str = yamnet_classes[best]

                # Write any detected class (outside the noisy ones) to the log
                if top_class_str not in noisy_classes:
                    log.write("[%s] %s %0.4f\n" %
                              (datetime.now().strftime("%m/%d/%Y %H:%M:%S"),
                               top_class_str, prediction[best]))
                    log.flush()

                # Doorbell-like classes ping the homebridge server
                if top_class_str in doorbell_classes:
                    trigger_homekit_motion()
def main():
    """Wrap YAMNet in an RfcxFrame and save it as a SavedModel in 'model'.

    The exported model exposes two signatures: "score" and "metadata".
    """
    # Load yamnet
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')

    # Lower-case the class names and strip spaces, parentheses, commas,
    # hyphens and apostrophes from them.
    unwanted = re.compile(r'\ |\(|\)|,|-|\'')
    class_names = []
    for raw_name in yamnet_model.class_names('yamnet_class_map.csv'):
        class_names.append(unwanted.sub('', raw_name.lower()))

    # Convert the model
    frame = RfcxFrame(yamnet, params.SAMPLE_RATE, params.PATCH_WINDOW_SECONDS,
                      class_names, 'pcm_s16le')
    signatures = {"score": frame.score, "metadata": frame.metadata}
    tf.saved_model.save(frame, 'model', signatures=signatures)
def load_model(self, layer=None):
    """Load YAMNet and build a 'dreamer' model exposing one layer's output.

    Parameters
    -----------
    layer (string) : name of the layer whose activations are returned.
        When omitted, the model's last layer is used.

    Returns
    ----------
    (tf.keras.Model) : the dreamer model, mapping the YAMNet input to the
        chosen layer's output
    """
    # Class names, both as loaded and as a tensor.
    self.class_names = yamnet.class_names(self.class_file)
    self.class_names_tensor = tf.constant(self.class_names)

    # Model parameters, model and weights.
    self.params = params.Params(sample_rate=self.sr,
                                patch_hop_seconds=self.patch_hop)
    self.model = yamnet.yamnet_frames_model(self.params)
    self.model.load_weights(self.weights_file)

    # Resolve which layer to expose.
    if layer is None:
        self.__print__("Using last layer.")
        self.layername = self.model.layers[-1].name
    else:
        self.layername = layer
    self.__print__(f"Yamnet loaded, using layer:{self.layername}")

    # The dreamer shares the YAMNet input and outputs the chosen layer.
    self.layers = self.model.get_layer(self.layername).output
    self.dreamer = tf.keras.Model(inputs=self.model.input,
                                  outputs=self.layers)
    self.__print__("Dreamer started.")
    return self.dreamer
def main(argv):
    """Classify each WAV file in argv with the 'yamnet.tflite' model.

    Args:
        argv: Non-empty sequence of WAV file paths.
    """
    assert argv

    # Load the TFLite model and allocate tensors.
    interpreter = tf.lite.Interpreter(model_path="yamnet.tflite")
    interpreter.allocate_tensors()
    inputs = interpreter.get_input_details()
    outputs = interpreter.get_output_details()
    input_index = inputs[0]['index']

    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        samples, source_rate = sf.read(file_name, dtype=np.int16)
        assert samples.dtype == np.int16, 'Bad sample type: %r' % samples.dtype
        waveform = samples / 32768.0  # Scale into [-1.0, +1.0].

        # Down-mix to mono, then match the sample rate YAMNet expects.
        if waveform.ndim > 1:
            waveform = np.mean(waveform, axis=1)
        if source_rate != params.SAMPLE_RATE:
            waveform = resampy.resample(waveform, source_rate,
                                        params.SAMPLE_RATE)

        # Feed the waveform as a float32 batch of one and run inference.
        batch = np.expand_dims(np.array(waveform, dtype=np.float32), axis=0)
        interpreter.set_tensor(input_index, batch)
        interpreter.invoke()
        scores = interpreter.get_tensor(outputs[0]['index'])

        # Average frame scores over time for a clip-level prediction.
        prediction = np.mean(scores, axis=0)
        top5_i = np.argsort(prediction)[::-1][:5]
        report_lines = [
            ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
            for i in top5_i
        ]
        print(file_name, ':\n' + '\n'.join(report_lines))
'Liquid', 'Water', ] # 'Water', 'Pour', 'Drip' waterKeys = ['Water tap, faucet', 'Sink (filling or washing)'] signals = dict.fromkeys(keys, 0.0) picked = dict.fromkeys(keys, 0.0) detected = dict.fromkeys(keys, False) detectThreshold = 0.65 checkThreshold = 0.25 resetThreshold = 0.05 # Set up the YAMNet model. params.PATCH_HOP_SECONDS = 0.48 # 10 Hz scores frame rate. //0.1 yamnet = yamnet_model.yamnet_frames_model(params) yamnet.load_weights('yamnet.h5') class_names = yamnet_model.class_names('yamnet_class_map.csv') CHUNKSIZE = 16000 # fixed chunk size sr = 16000 seconds = 1 predictionPeriod = 2.0 predictionRate = 2.0 predChunkSize = int(sr * predictionPeriod) readChunkSize = int(sr * predictionRate) duration = 50 frames = [] last5secFrames = [] old5secFrames = []
def main(argv):
    """Load each WAV file in argv and show a matplotlib animation for it.

    The YAMNet model and class names are loaded but not used for
    prediction here; the function only prepares the waveform (mono, model
    sample rate), prints diagnostics and opens an animated plot.

    Args:
        argv: Sequence of WAV file paths.
    """
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        samples, source_rate = sf.read(file_name, always_2d=False,
                                       dtype=np.int16)
        assert samples.dtype == np.int16, 'Bad sample type: %r' % samples.dtype
        print('waveform original dtaa', samples.shape)

        # Scale into [-1.0, +1.0] float32.
        waveform = (samples / 32768.0).astype('float32')
        print('waveform normal dtaa', waveform.shape)
        print('sampling rate', source_rate)
        print('sampling rate model params', params.sample_rate)

        # Down-mix to mono, then match the sample rate YAMNet expects.
        if waveform.ndim > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if source_rate != params.sample_rate:
            waveform = resampy.resample(waveform, source_rate,
                                        params.sample_rate)
        print(waveform.shape, min(waveform))

        fig = plt.figure()
        ax = plt.axes(xlim=(0, len(waveform)), ylim=(-0.16, 0.17))
        line, = ax.plot([], [], lw=1)
        sample_axis = np.linspace(0, len(waveform), len(waveform))

        def reset_line():
            """Clear the line before the animation starts."""
            line.set_data([], [])
            return line,

        def draw_frame(i):
            """Update the line for animation frame i."""
            # NOTE(review): y is the single sample waveform[i] while x spans
            # the whole clip - presumably a slice was intended; confirm.
            line.set_data(sample_axis, waveform[i])
            return line,

        # Keep a reference so the animation object stays alive while shown.
        anim = FuncAnimation(fig, draw_frame, init_func=reset_line,
                             frames=200, interval=20, blit=True)
        plt.draw()
        plt.show()
with open('credentials.txt', 'r') as file: credentials = json.loads(file.read()) Baby_Crying = False ##Account Variables account_sid = credentials['account_sid'] auth_token = credentials['auth_token'] messaging_service_sid = credentials['messaging_service_sid'] ##Get Tensorflow Model graph = tf.Graph() with graph.as_default(): yamnet = yamnet_model.yamnet_frames_model(params) yamnet.load_weights('models/research/audioset/yamnet/yamnet.h5') yamnet_classes = yamnet_model.class_names('models/research/audioset/yamnet/yamnet_class_map.csv') ##Set Paremeters for PyAudio RATE=44100 RECORD_SECONDS = 5 CHUNKSIZE = 4096 p = pyaudio.PyAudio() stream = p.open(format=pyaudio.paInt16, channels=1, rate=RATE, input=True, frames_per_buffer=CHUNKSIZE) def get_audio_stream(): stream.start_stream() frames = [] # A python-list of chunks(numpy.ndarray) print('\n***************\n***************\n***recording***\n***************\n***************\n') for _ in range(0, int(RATE / CHUNKSIZE * RECORD_SECONDS)): data = stream.read(CHUNKSIZE) frames.append(numpy.fromstring(data, dtype=numpy.int16))
ip = input_pipeline.InputPipeline(batch_size=32, buffer_size=100) ip.setup_paths(paths) ip.setup_labels_cough(labels_cough) ip.make_datasets() import params import yamnet as yamnet_model import importlib importlib.reload(yamnet_model) import tflite_compat importlib.reload(tflite_compat) params.BATCH_SIZE = ip.batch_size yamnet = yamnet_model.yamnet_frames_model(params) yamnet.load_weights(paths['yamnet_weights']) yamnet_classes = yamnet_model.class_names(paths['yamnet_classes']) embeddings = {} # This loops in a slighly stupid way, because the yamnet # requires fixed length and fixed batch size - for tflite # compatibility. So, it repeats until all ids have randomly # not been dropped in making fixed batch_sizes. Also, since # the length has to match the 10s of audioset, the FSD data # is either croppoed to 10s or padded with zeros. for ds in ['fsd']: embeddings[ds] = {} print(f'{ds}') for split in ['train', 'test', 'val']: print(f'\t{split}') embeddings[ds][split] = {} n_processed = 0
def classify_audio(audio_device_index, interpreter, labels_file, commands_file=None, result_callback=None, dectection_callback=None, sample_rate_hz=16000, negative_threshold=0.6, num_frames_hop=33): """Acquire audio, preprocess, and classify.""" # Initialize recorder. AUDIO_SAMPLE_RATE_HZ = sample_rate_hz downsample_factor = 1 if AUDIO_SAMPLE_RATE_HZ == 48000: downsample_factor = 3 # Most microphones support this # Because the model expects 16KHz audio, we downsample 3 fold recorder = audio_recorder.AudioRecorder( AUDIO_SAMPLE_RATE_HZ, downsample_factor=downsample_factor, device_index=audio_device_index) feature_extractor = Uint8LogMelFeatureExtractor( num_frames_hop=num_frames_hop) labels = read_labels(labels_file) if commands_file: commands = read_commands(commands_file) else: commands = {} logger.info("Loaded commands: %s", str(commands)) logger.info("Recording") timed_out = False # Testing if False: sample_data = 'data/mini_speech_commands/down/e71b4ce6_nohash_1.wav' import tensorflow as tf import os def decode_audio(audio_binary): audio, _ = tf.audio.decode_wav(audio_binary) return tf.squeeze(audio, axis=-1) def get_label(file_path): parts = tf.strings.split(file_path, os.path.sep) # Note: You'll use indexing here instead of tuple unpacking to enable this # to work in a TensorFlow graph. 
return parts[-2] def get_waveform_and_label(file_path): label = get_label(file_path) audio_binary = tf.io.read_file(file_path) waveform = decode_audio(audio_binary) return waveform, label waveform, label = get_waveform_and_label(sample_data) print(waveform.shape) # End Testing # yamnet start testing import os import soundfile as sf import params as yamnet_params import yamnet as yamnet_model from scipy.io import wavfile params = yamnet_params.Params() yamnet = yamnet_model.yamnet_frames_model(params) if not os.path.exists('yamnet.h5'): print( 'Error: curl -O https://storage.googleapis.com/audioset/yamnet.h5') exit() yamnet.load_weights('yamnet.h5') yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv') import pygame pygame.init() screen = pygame.display.set_mode((640, 480)) font_header = pygame.font.Font(pygame.font.get_default_font(), 36) font = pygame.font.Font(pygame.font.get_default_font(), 36 * 2) text_surface = font.render('Hello world', True, (0, 0, 0)) GRAY = (200, 200, 200) # yamnet end testing with recorder: last_detection = -1 while not timed_out: audio_sample = recorder.get_audio(7921)[0] if False: wavfile.write('test.wav', 16000, audio_sample) wav_data, sr = sf.read('test.wav', dtype=np.int16) else: wav_data = np.array(audio_sample, dtype=np.int16) sr = AUDIO_SAMPLE_RATE_HZ assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype waveform = wav_data / 32768.0 # Convert to [-1.0, +1.0] waveform = waveform.astype('float32') # Convert to mono and the sample rate expected by YAMNet. if len(waveform.shape) > 1: waveform = np.mean(waveform, axis=1) if sr != params.sample_rate: waveform = resampy.resample(waveform, sr, params.sample_rate) print('-------') # Predict YAMNet classes. scores, embeddings, spectrogram = yamnet(waveform) # Scores is a matrix of (time_frames, num_classes) classifier scores. # Average them along time to get an overall classifier output for the clip. 
prediction = np.mean(scores, axis=0) # Report the highest-scoring classes and their scores. top5_i = np.argsort(prediction)[::-1][:5] print(':\n' + '\n'.join( ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i]) for i in top5_i)) print('{}:{:.3f}'.format(yamnet_classes[42], prediction[42])) print('{}:{:.3f}'.format(yamnet_classes[0], prediction[0])) print('{}:{:.3f}'.format(yamnet_classes[494], prediction[494])) target_predictions = prediction[42], prediction[0], prediction[494] target_classes = yamnet_classes[42], yamnet_classes[ 0], yamnet_classes[494] index = np.argsort(target_predictions)[::-1][0] black = (0, 0, 0) green = (0, 255, 0) red = (255, 0, 0) if index == 0: color = red elif index == 1: color = green else: color = black text1 = font.render(target_classes[index], True, color) header1 = font_header.render('R-zero Device Listening for Audio', True, (0, 0, 0)) screen.fill(GRAY) screen.blit(header1, dest=(20, 100)) screen.blit(text1, dest=(200, 200)) pygame.display.update() ''' line = '{}:{:.3f}'.format(yamnet_classes[42], prediction[42]) label = Tk.Label(None, text=line, font=('Times', '18'), fg='blue') label.pack() label.mainloop() ''' # End """
def main(argv):
    """Classify WAV files and tally frames whose top class is a clip top-5.

    For each file the clip-level top-5 classes are printed. Then every
    frame is inspected: its class-282 score and own top-5 are printed, and
    0.96 is added to a running total whenever the frame's best class is
    one of the clip-level top-5. Half of that total is printed per file.

    Args:
        argv: Non-empty sequence of WAV file paths.
    """
    assert argv, 'Usage: inference.py <wav file> <wav file> ...'

    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')

    for file_name in argv:
        samples, source_rate = sf.read(file_name, dtype=np.int16)
        assert samples.dtype == np.int16, 'Bad sample type: %r' % samples.dtype
        print('waveform original dtaa', samples.shape)

        # Scale into [-1.0, +1.0] float32.
        waveform = (samples / 32768.0).astype('float32')
        print('waveform normal dtaa', waveform.shape)
        print('sampling rate', source_rate)
        print('sampling rate model params', params.sample_rate)

        # Down-mix to mono, then match the sample rate YAMNet expects.
        if waveform.ndim > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if source_rate != params.sample_rate:
            waveform = resampy.resample(waveform, source_rate,
                                        params.sample_rate)
        print('waveform sample dtaa', waveform.shape)

        scores, embeddings, spectrogram = yamnet(waveform)
        print('scores', scores)

        # Clip-level prediction: time-average of the frame scores.
        prediction = np.mean(scores, axis=0)
        clip_top5 = np.argsort(prediction)[::-1][:5]
        clip_report = [
            ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
            for i in clip_top5
        ]
        print(file_name, ':\n' + '\n'.join(clip_report))

        truth_labels = [yamnet_classes[i] for i in clip_top5]
        print('ground labels', truth_labels)

        total_time = 0
        for frame_idx in range(len(scores)):
            pred = scores[frame_idx]
            water_prob = pred[282].numpy()
            print('water_prob', water_prob)

            frame_top5 = np.argsort(pred)[::-1][:5]
            frame_report = [
                ' {:12s}: {:.3f}'.format(yamnet_classes[k], pred[k])
                for k in frame_top5
            ]
            print(file_name, ':\n' + '\n'.join(frame_report))

            pred_class = yamnet_classes[frame_top5[0]]
            print(pred_class)
            if pred_class in truth_labels:
                total_time += 0.96

        print('total time', total_time / 2)
) import numpy as np import resampy import soundfile as sf import tensorflow as tf import params import yamnet as yamnet_model graph = tf.Graph() with graph.as_default(): yamnet = yamnet_model.yamnet_frames_model(params) yamnet.load_weights("./checkpoint") yamnet_classes = yamnet_model.class_names("yamnet_class_map.csv") def read_wav(w, max_audio_time=30): wav_data, sr = sf.read(w, dtype=np.int16) waveform = wav_data / 32768.0 if len(waveform.shape) > 1: waveform = np.mean(waveform, axis=1) waveform = waveform[:max_audio_time * params.SAMPLE_RATE * 1000] if sr != params.SAMPLE_RATE: waveform = resampy.resample(waveform, sr, params.SAMPLE_RATE) return waveform
def load_model():
    """Build YAMNet with default params and load weights and class names.

    Returns:
        A (model, class_names) pair: the model with 'yamnet.h5' weights
        loaded, and the class names read from 'yamnet_class_map.csv'.
    """
    model = yamnet_model.yamnet_frames_model(yamnet_params.Params())
    model.load_weights('yamnet.h5')
    class_names = yamnet_model.class_names('yamnet_class_map.csv')
    return model, class_names
def setUpClass(cls):
    """Create the params, model and class names shared by all tests."""
    super().setUpClass()

    model_params = params.Params()
    model = yamnet.yamnet_frames_model(model_params)
    model.load_weights('yamnet.h5')

    cls._params = model_params
    cls._yamnet = model
    cls._yamnet_classes = yamnet.class_names('yamnet_class_map.csv')
def main(argv):
    """Classify WAV files and save an animated cumulative 'volume' plot.

    For each file in argv the clip-level top-5 classes are printed, then a
    running total is built frame by frame and animated; the animation is
    saved as '<file_name>_volume_2.5.mp4'.

    Args:
        argv: Sequence of WAV file paths.
    """
    params = yamnet_params.Params()
    yamnet = yamnet_model.yamnet_frames_model(params)
    yamnet.load_weights('yamnet.h5')
    yamnet_classes = yamnet_model.class_names('yamnet_class_map.csv')
    for file_name in argv:
        # Decode the WAV file.
        wav_data, sr = sf.read(file_name, always_2d=False, dtype=np.int16)
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        print('waveform original dtaa', wav_data.shape)
        waveform = wav_data / 32768.0  # Convert to [-1.0, +1.0]
        waveform = waveform.astype('float32')
        print('waveform normal dtaa', waveform.shape)
        print('sampling rate', sr)
        print('sampling rate model params', params.sample_rate)
        # Convert to mono and the sample rate expected by YAMNet.
        if len(waveform.shape) > 1:
            print('entered')
            waveform = np.mean(waveform, axis=1)
        if sr != params.sample_rate:
            waveform = resampy.resample(waveform, sr, params.sample_rate)
        print('waveform normal dtaa', waveform.shape)
        # Only used below to build the output file name.
        scale = 2.5
        # Disabled: animated plot of the raw waveform.
        # fig = plt.figure(figsize=(int(scale*4), int(scale*3)))
        # camera = Camera(fig)
        # for i in range(0,len(waveform),int(0.96*params.sample_rate/int(8))):
        #     plt.plot(waveform[:i],color='b')
        #     plt.xlabel('Samples')
        #     plt.ylabel('Amplitude')
        #     camera.snap()
        # animation = camera.animate()
        # animation.save(file_name+'_filename_'+str(scale)+'.mp4')
        # plt.close()
        # Predict YAMNet classes.
        scores, embeddings, spectrogram = yamnet(waveform)
        print('scores', scores)
        # Scores is a matrix of (time_frames, num_classes) classifier scores.
        # Average them along time to get an overall classifier output for the clip.
        prediction = np.mean(scores, axis=0)
        # Report the highest-scoring classes and their scores.
        top5_i = np.argsort(prediction)[::-1][:5]
        print(
            file_name, ':\n' + '\n'.join(
                ' {:12s}: {:.3f}'.format(yamnet_classes[i], prediction[i])
                for i in top5_i))
        # Disabled: animated per-class probability curves.
        # colors=['b','g','r']
        # fig=plt.figure()
        # camera = Camera(fig)
        # plt.xlabel('Time(0.5s)')
        # plt.ylabel('Probability')
        # for j in range(1,len(scores)):
        #     k=0
        #     for i in top5_i[1:-1]:
        #         x=np.convolve(scores[:j,i].numpy(), np.ones((4,))/4, mode='valid')
        #         # x=scores[:j,i].numpy()
        #         plt.plot(x,color=colors[k])
        #         k+=1
        #     for i in range(1):
        #         camera.snap()
        # plt.legend([yamnet_classes[i] for i in top5_i[1:-1]],loc='upper right')
        # animation = camera.animate(interval=int(1000))
        # # plt.show()
        # # plt.close()
        # animation.save(file_name+'_class_'+str(scale)+'.mp4')
        # NOTE(review): `colors` is not read by the active code below.
        colors = ['b', 'g', 'r']
        fig = plt.figure()
        camera = Camera(fig)
        plt.xlabel('Time(0.5s)')
        plt.ylabel('volume')
        vol_store = []  # running total after each frame
        total_vol = 0
        for j in range(len(scores)):
            vol = []
            # Only the 2nd to 4th clip-level classes are examined.
            for i in top5_i[1:-1]:
                # x=np.convolve(scores[j,i].numpy(), np.ones((4,))/4, mode='valid')
                x = scores[j, i].numpy()
                # Each class scoring above 0.1 contributes a fixed 1/24.
                if x > 0.1:
                    vol.append(float(1 / 24))
            # print(vol)
            # NOTE(review): assumed to run once per frame, after the class
            # loop; the collapsed source does not show the nesting - confirm.
            if vol:
                total_vol += np.mean(vol)
            print(total_vol)
            vol_store.append(total_vol)
            # print(vol_store)
            # Redraw the cumulative curve and capture one animation frame.
            plt.plot(vol_store, color='b')
            camera.snap()
        # plt.legend(,loc='upper right')
        animation = camera.animate(interval=int(1000))
        # plt.show()
        # plt.close()
        animation.save(file_name + '_volume_' + str(scale) + '.mp4')