import numpy as np
from random import shuffle

import vggish_input


def _get_examples_batch():
  """Returns a shuffled batch of examples of all audio classes.

  Note that this is just a toy function because this is a simple demo
  intended to illustrate how the training code might work.

  Returns:
    a tuple (features, labels) where features is a NumPy array of shape
    [batch_size, num_frames, num_bands] where the batch_size is variable and
    each row is a log mel spectrogram patch of shape [num_frames, num_bands]
    suitable for feeding VGGish, while labels is a NumPy array of shape
    [batch_size, num_classes] where each row is a multi-hot label vector that
    provides the labels for corresponding rows in features.
  """
  # Make a waveform for each class.
  num_seconds = 5
  sr = 44100  # Sampling rate.
  t = np.linspace(0, num_seconds, int(num_seconds * sr))  # Time axis.
  # Random sine wave.
  freq = np.random.uniform(100, 1000)
  sine = np.sin(2 * np.pi * freq * t)
  # Random constant signal (a flat waveform; `magnitude * t` would be a ramp).
  magnitude = np.random.uniform(-1, 1)
  const = magnitude * np.ones_like(t)
  # Zero-mean white noise.
  noise = np.random.normal(0, 1, size=t.shape)

  # Make examples of each signal and corresponding labels.
  # Sine is class index 0, Const class index 1, Noise class index 2.
  sine_examples = vggish_input.waveform_to_examples(sine, sr)
  sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0])
  const_examples = vggish_input.waveform_to_examples(const, sr)
  const_labels = np.array([[0, 1, 0]] * const_examples.shape[0])
  noise_examples = vggish_input.waveform_to_examples(noise, sr)
  noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0])

  # Shuffle (example, label) pairs across all classes.
  all_examples = np.concatenate(
      (sine_examples, const_examples, noise_examples))
  all_labels = np.concatenate((sine_labels, const_labels, noise_labels))
  labeled_examples = list(zip(all_examples, all_labels))
  shuffle(labeled_examples)

  # Separate and return the features and labels.
  features = [example for (example, _) in labeled_examples]
  labels = [label for (_, label) in labeled_examples]
  return (features, labels)
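# A minimal sketch of how such a batch might be consumed in a training loop.
# The names `sess`, `features_input`, `labels_input`, `train_op`, and `loss`
# follow the VGGish training demo but are assumptions here, not defined in
# the snippet above; the batch count is arbitrary.
for step in range(100):
  (features, labels) = _get_examples_batch()
  [loss_value, _] = sess.run(
      [loss, train_op],
      feed_dict={features_input: features, labels_input: labels})
  print('Step %d: loss %g' % (step, loss_value))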
checkpoint_path = 'vggish_model.ckpt'
pca_params_path = 'vggish_pca_params.npz'

# Relative tolerance of errors in mean and standard deviation of embeddings.
rel_error = 0.1  # Up to 10%

# Generate a 1 kHz sine wave at 44.1 kHz (we use a high sampling rate
# to test resampling to 16 kHz during feature extraction).
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
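  # Continuation sketch (not part of the original snippet): run the batch
  # through the model and sanity-check the embedding statistics against
  # reference values, using the rel_error tolerance defined above. The
  # reference mean/std below are assumed placeholders; substitute the
  # known-good statistics for your checkpoint.
  [embedding_batch] = sess.run(
      [embedding_tensor], feed_dict={features_tensor: input_batch})
  np.testing.assert_allclose(
      [np.mean(embedding_batch), np.std(embedding_batch)],
      [0.131, 0.238],  # assumed reference mean/std, not from the source
      rtol=rel_error)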
parser.add_argument(
    '--output_path',
    default='/home/ICT2000/jondras/dvra_datasets/mimicry/audio_features')
args = parser.parse_args()

start_time = time.time()
cnt = 0
for audio_filepath in sorted(glob.glob(f'{args.input_path}/*.wav')):
    audio_basename = audio_filepath.split('/')[-1].split('.')[0]
    print(f'Extracting features for {audio_basename}')

    wav_rate, wav_samples = wavfile.read(audio_filepath)
    print(f'\t{wav_rate} {len(wav_samples)}')
    # Zero-pad clips shorter than one second so that at least one
    # log mel example can be produced.
    if len(wav_samples) < wav_rate:
        wav_samples = np.pad(
            wav_samples, (0, wav_rate - len(wav_samples)), 'constant')
    samples = vggish_input.waveform_to_examples(wav_samples, wav_rate)

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, args.model_file)
        samples_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        print(time.time() - start_time)
        [features] = sess.run(
            [features_tensor], feed_dict={samples_tensor: samples})
        print(time.time() - start_time)
        print(f'\t{features.shape}')
        # print(f'\t{features}')
    np.save(f'{args.output_path}/{audio_basename}.npy', features)
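# Note: the loop above rebuilds the graph and reloads the checkpoint for
# every file, which dominates the runtime. A sketch of the usual fix, under
# the same imports and flags, is to create the session once and reuse it:
with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, args.model_file)
    samples_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    for audio_filepath in sorted(glob.glob(f'{args.input_path}/*.wav')):
        wav_rate, wav_samples = wavfile.read(audio_filepath)
        samples = vggish_input.waveform_to_examples(wav_samples, wav_rate)
        [features] = sess.run(
            [features_tensor], feed_dict={samples_tensor: samples})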
def prepareAudio(WAV_FILE):
    fs, data = wavfile.read(WAV_FILE)
    # data = data[0]
    data = truncateAudioFs(data, fs)  # truncateAudioFs is defined elsewhere.
    # Scale signed 16-bit PCM samples to [-1.0, +1.0].
    data = data / 32768.0
    return vggish_input.waveform_to_examples(data, fs)
# Slice audio according to segments
slicer = essentia.standard.Slicer(startTimes=[s[0] for s in segments],
                                  endTimes=[s[1] for s in segments],
                                  timeUnits='samples',
                                  sampleRate=22050)
sliced_audio = np.hstack(slicer(audio))
sliced_duration = essentia.standard.Duration(sampleRate=22050)(sliced_audio)

# Collate the active segments
active_y = np.hstack(
    [audio[segment[0]:segment[1]] for segment in segments])

input_batch = vggish_input.waveform_to_examples(active_y, sampleRate)
predictions = np.argmax(model.predict(input_batch[:, :, :, None]), axis=1)

# Tally the fraction of examples classified as music / speech / sfx.
p_music = 0
p_speech = 0
p_sfx = 0
sum_ = len(predictions)
for n in range(len(predictions)):
    if predictions[n] == 0:
        p_music += 1 / sum_
    elif predictions[n] == 1:
        p_speech += 1 / sum_
    elif predictions[n] == 2:
        p_sfx += 1 / sum_
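# Equivalent vectorized form of the tally above (a sketch; assumes the same
# three-class indexing that the variable names suggest):
class_fractions = np.bincount(predictions, minlength=3) / len(predictions)
p_music, p_speech, p_sfx = class_fractions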
# t = np.arange(0, num_secs, 1 / sr)
# x = np.sin(2 * np.pi * freq * t)
# # Convert to signed 16-bit samples.
# samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
# wav_file = six.BytesIO()
# soundfile.write(wav_file, samples, sr, format='WAV', subtype='PCM_16')
# wav_file.seek(0)
# examples_frames = vggish_input.wavfile_to_frames(wav_file)
# # Reset the seek on the in-memory wav object.
# wav_file.seek(0)
#
# examples_batch = vggish_input.wavfile_to_examples(wav_file)
# wav_file.seek(0)

example_waveform = generate_audio.gen_audio(3, 16000)
examples_batch = vggish_input.waveform_to_examples(example_waveform, 16000)
examples_frames = vggish_input.waveform_to_frames(example_waveform, 16000)

# Since there is no zero padding, the last frames from waveform_to_frames
# are not aligned with any of the log mel features, so they need to be
# discarded.
num_frames_to_keep = examples_batch.shape[0] * examples_batch.shape[1]
examples_frames = examples_frames[0:num_frames_to_keep, :]
print(examples_batch)

# Prepare a postprocessor to munge the model embeddings.
pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

# If needed, prepare a record writer to store the postprocessed embeddings.
writer = tf.python_io.TFRecordWriter(
    FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
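# Worked example of the alignment arithmetic above, assuming the stock
# VGGish framing (0.96 s examples of 96 frames over 64 mel bands, no
# padding): a 3 s, 16 kHz waveform yields examples_batch.shape == (3, 96, 64),
# so num_frames_to_keep == 3 * 96 == 288 and any trailing frames beyond 288
# returned by waveform_to_frames are dropped.
assert examples_batch.shape[1:] == (96, 64)  # NUM_FRAMES, NUM_BANDS
assert examples_frames.shape[0] == num_frames_to_keep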
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model. If none is provided, we generate a synthetic input.
    # print("begin vggish_inference_demo file")
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)

    # Note: this fork's vggish_input returns (csv_dict, examples) tuples.
    output_csv_dict, examples_batch = vggish_input.wavfile_to_examples(
        wav_file)
    print("after initializing wav_file")

    num_secs = 3
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Produce a batch of log mel spectrogram examples.
    _, input_smoke_test_batch = vggish_input.waveform_to_examples(
        x, sr, wav_file)
    print(examples_batch)
    print("Shape of examples batch:", examples_batch.shape,
          "Shape of smoke test batch:", input_smoke_test_batch.shape)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    # Things to try:
    # - input noise tensor
    # - example audio

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        print("embedding_batch")
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print("postprocessed_batch")
        print(postprocessed_batch)
        dict_to_csv(output_csv_dict, postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a
        # similar format as the features released in AudioSet. Each row of
        # the batch of embeddings corresponds to roughly a second of audio
        # (96 10ms frames), and the rows are written as a sequence of
        # bytes-valued features, where each feature value contains the 128
        # bytes of the whitened quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())

    if writer:
        writer.close()
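# Sketch of reading the embeddings back from the TFRecord written above
# (TF1 API, mirroring the writer; not part of the original demo):
def read_embeddings(tfrecord_file):
    for record in tf.python_io.tf_record_iterator(tfrecord_file):
        seq = tf.train.SequenceExample.FromString(record)
        feature_list = seq.feature_lists.feature_list[
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME]
        # Each feature holds 128 bytes of whitened, quantized embedding.
        embeddings = np.array([
            np.frombuffer(feature.bytes_list.value[0], dtype=np.uint8)
            for feature in feature_list.feature
        ])
        print(embeddings.shape)  # (num_seconds, 128)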
def handle_source(json_data):
    data = str(json_data['data'])
    data = data[1:-1]  # Strip the surrounding brackets from the sample list.
    global graph
    # Parse comma-separated int16 samples and scale to [-1.0, +1.0].
    np_wav = np.fromstring(data, dtype=np.int16, sep=',') / 32768.0

    # Compute RMS and convert to dB.
    print('Successfully convert to NP rep', np_wav)
    rms = np.sqrt(np.mean(np_wav**2))
    db = dbFS(rms)
    print('Db...', db)

    # Make predictions.
    print('Making prediction...')
    x = waveform_to_examples(np_wav, RATE)
    predictions = []
    with graph.as_default():
        if x.shape[0] != 0:
            x = x.reshape(len(x), 96, 64, 1)
            print('Successfully reshape x', x.shape)
            pred = model.predict(x)
            predictions.append(pred)

        for prediction in predictions:
            context_prediction = np.take(
                prediction[0], [homesounds.labels[x] for x in active_context])
            m = np.argmax(context_prediction)
            print('Max prediction',
                  str(homesounds.to_human_labels[active_context[m]]),
                  str(context_prediction[m]))
            if (context_prediction[m] > PREDICTION_THRES
                    and db > DBLEVEL_THRES):
                socketio.emit('audio_label',
                              {'label': str(homesounds.to_human_labels[active_context[m]]),
                               'accuracy': str(context_prediction[m])})
                print("Prediction: %s (%0.2f)" % (
                    homesounds.to_human_labels[active_context[m]],
                    context_prediction[m]))
            else:
                socketio.emit('audio_label', {
                    'label': 'Unrecognized Sound',
                    'accuracy': '1.0'
                })


def background_thread():
    """Example of how to send server generated events to clients."""
    count = 0
    while True:
        socketio.sleep(10)
        count += 1
        socketio.emit('my_response',
                      {'data': 'Server generated event', 'count': count},
                      namespace='/test')


@app.route('/')
def index():
    return render_template('index.html')


@socketio.on('send_message')
def handle_message(json_data):
    print('Receive message...' + str(json_data['message']))
    text = json_data['message'].encode('ascii', 'ignore')
    socketio.emit('echo', {'echo': 'Server Says: ' + str(text)})
    print('Sending message back..')


@socketio.on('disconnect_request', namespace='/test')
def disconnect_request():
    @copy_current_request_context
    def can_disconnect():
        disconnect()

    session['receive_count'] = session.get('receive_count', 0) + 1
    # For this emit we use a callback function: when the callback is invoked
    # we know the message has been received and it is safe to disconnect.
    emit('my_response',
         {'data': 'Disconnected!', 'count': session['receive_count']},
         callback=can_disconnect)


@socketio.on('connect', namespace='/test')
def test_connect():
    global thread
    with thread_lock:
        if thread is None:
            thread = socketio.start_background_task(background_thread)
    emit('my_response', {'data': 'Connected', 'count': 0})


@socketio.on('disconnect', namespace='/test')
def test_disconnect():
    print('Client disconnected', request.sid)


if __name__ == '__main__':
    socketio.run(app, debug=True)
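# dbFS is referenced in handle_source above but not defined in this snippet.
# A plausible implementation (an assumption, not taken from the original
# source) is the full-scale decibel level of the RMS amplitude:
def dbFS(rms):
    # Small epsilon guards against log(0) on silent buffers.
    return 20 * np.log10(rms + 1e-12)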
def wav2mel(self, blob, sample_rate):
    self.logger.debug(f'blob: {blob.shape}, sample_rate: {sample_rate}')
    # Note: squeeze() drops the leading batch dimension when exactly one
    # example is produced, so the result is [96, 64] for short clips and
    # [num_examples, 96, 64] otherwise.
    mel_spec = vggish_input.waveform_to_examples(blob, sample_rate).squeeze()
    self.logger.debug(f'mel_spec: {mel_spec.shape}')
    return mel_spec
df = pd.read_csv('data.csv')
df['File'] = df['Category'] + '/' + df['File']
idx, labels, events, files = (df.index.values, df.Category.values,
                              df.Event.values, df.File.values)

df_eval = pd.read_csv('Logsheet_Evaluation.csv')
files_eval = df_eval.File.values

audio_path = '/home/tianxiangchen1/cssvp/Evaluation/16k/'
embedding_path = '/home/tianxiangchen1/cssvp/embeddings/from_file/Evaluation/'

X_eval = []
X_eval_2 = []
for f in files_eval:
    # Log mel spectrogram patches computed directly from the waveform.
    wav = read_audio(audio_path + f)
    Sxx = vggish_input.waveform_to_examples(wav, sample_rate=16000)
    Sxx = np.vstack(Sxx)
    X_eval.append(Sxx.reshape(1, Sxx.shape[0], Sxx.shape[1], 1))
    # Precomputed VGGish embeddings loaded from disk.
    feat = np.load(embedding_path + f + '.npy')
    X_eval_2.append(feat.reshape(1, feat.shape[0], feat.shape[1]))
X_eval = np.vstack(X_eval)
X_eval_2 = np.vstack(X_eval_2)

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Encode the main-task labels as one-hot vectors.
label_enc = LabelEncoder()
enc = OneHotEncoder(sparse=False)
y_int = label_enc.fit_transform(labels)
y_int = y_int.reshape(len(y_int), 1)
y_one_hot = enc.fit_transform(y_int)
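# Sketch of decoding predictions back to label strings after evaluation.
# Assumes some trained two-input `model` over the same classes; it is not
# defined in the snippet above.
pred = model.predict([X_eval, X_eval_2])
pred_labels = label_enc.inverse_transform(np.argmax(pred, axis=1))
print(pred_labels[:10])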