def extract_audioset_features(ids, id2audio_path, id2label):
    # 1) Build one batch of log-mel examples, repeating the label and the
    #    identifier once per 0.96 s patch extracted from each file.
    first_audio = True
    for i in ids:
        if first_audio:
            input_data = vggish_input.wavfile_to_examples(id2audio_path[i])
            ground_truth = np.repeat(id2label[i], input_data.shape[0], axis=0)
            identifiers = np.repeat(i, input_data.shape[0], axis=0)
            first_audio = False
        else:
            tmp_in = vggish_input.wavfile_to_examples(id2audio_path[i])
            input_data = np.concatenate((input_data, tmp_in), axis=0)
            tmp_gt = np.repeat(id2label[i], tmp_in.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(i, tmp_in.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    # 2) Run the batch through VGGish to get one 128-D embedding per patch.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))
    return [feature, ground_truth, identifiers]
def _folder_to_mel(path):
    os.chdir(path)
    files = os.listdir(path)
    sound_examples = vggish_input.wavfile_to_examples(files[0])
    for i in range(1, len(files)):
        sound_examples = np.concatenate(
            (sound_examples, vggish_input.wavfile_to_examples(files[i])))
    return sound_examples
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file,
                      train_dir, output_file):
    print("Input file: " + input_wav_file)
    if os.path.isfile(input_wav_file):
        examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
        pproc = vggish_postprocess.Postprocessor(pca_params)
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)

            # Dequantize the embeddings and pad them into the fixed-size
            # (1, 300, 128) batch expected by the YouTube-8M-style
            # inference() helper.
            num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                            dtype=np.int32)
            video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
            video_batch_val[0, 0:postprocessed_batch.shape[0], :] = (
                utils.Dequantize(postprocessed_batch.astype(float), 2, -2))
            predicted_class = inference(video_batch_val, num_frames_batch_val,
                                        checkpoint_file, train_dir, output_file)
        # The original called tf.reset_default_graph() after the return
        # statement, where it never ran; calling it here actually clears the
        # graph between invocations.
        tf.reset_default_graph()
        return predicted_class
def extraccion_embeddings_audio(self, rutaAudio, embeddings):
    '''Extracts, from a single audio file, the mean and standard deviation of
    the 128 embeddings, returning a 256-feature vector per audio file, or the
    128 spectrogram bins if `embeddings` is False.

    Parameters
    ----------
    rutaAudio : str
        Path to the specific audio file.
    embeddings : boolean
        If True, extract the embeddings; if False, extract the spectrograms.

    Returns
    -------
    final_ccas : numpy.array
        Array of audio features: embeddings or spectrograms.
    '''
    # 1. Extract the MFCC and spectral features.
    input_batch = vggish_input.wavfile_to_examples(rutaAudio)
    # 2. Produce the embeddings with the model, or the spectrograms, as requested.
    if embeddings:
        ccas = self.model.predict(input_batch[:, :, :, None])
    else:
        # "Break up" the 0.96 s groups.
        ccas = input_batch.reshape(-1, input_batch.shape[-1])
    # 3. Take the mean and standard deviation of the embeddings.
    media = np.mean(ccas, axis=0)
    desvs = np.std(ccas, axis=0)
    final_ccas = np.append(media, desvs)
    return final_ccas
def main(_):
    with open(FLAGS.wav_files) as f:
        files_list = [line.replace('\n', '') for line in f]
    n_files = len(files_list)
    output_embedding = np.zeros((n_files, 128))
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    processed_fnames = []
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        for n_file, wav_file in enumerate(files_list):
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
            print(n_file, '/', n_files)
            if examples_batch.shape[0] == 0:
                # Log files too short to yield a single 0.96 s example.
                with open('bad_files.log', 'a') as logf:
                    logf.write(wav_file + '\n')
            else:
                processed_fnames.append(wav_file)
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
                postprocessed_batch = pproc.postprocess(embedding_batch)
                # Average the per-patch embeddings into one vector per file.
                postprocessed_batch_mean = np.mean(postprocessed_batch, axis=0)
                output_embedding[n_file, :] = postprocessed_batch_mean
    np.save(FLAGS.npy_file, output_embedding)
def extract_and_predict(wav):
    wav_file = wav
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings. (Called here with
    # no arguments, so this assumes a Postprocessor variant that defaults its
    # PCA-params path; the stock AudioSet class takes the .npz path.)
    pproc = vggish_postprocess.Postprocessor()

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        postprocessed_batch = [postprocessed_batch[i]
                               for i in range(len(postprocessed_batch))]
        pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
        return str(pred_each_n_seconds)
def extract_and_predict(wav):
    wav_file = wav
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings. (Re-enabled: the
    # original commented this out and then called a non-existent module-level
    # vggish_postprocess.postprocess(); postprocess() is a Postprocessor
    # method, so the instance is needed.)
    pproc = vggish_postprocess.Postprocessor()

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        postprocessed_batch = [postprocessed_batch[i]
                               for i in range(len(postprocessed_batch))]
        pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
        print(str(pred_each_n_seconds))
def extract(wav_file):
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    pproc = vggish_postprocess.Postprocessor(
        '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_pca_params.npz'
    )
    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess,
            '/storage/haibn/yt8m/code/video_classification/feature_extractor/vggish/vggish_model.ckpt'
        )
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        return postprocessed_batch
def main(_):
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Generate a synthetic 5-second, 1 kHz sine wave as an in-memory WAV.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})

    # embedding_batch is the extraction result. Note: this pickle path assumes
    # wav_file is a filesystem path; the synthetic BytesIO branch above would
    # need an explicit output name instead.
    with open(path.splitext(wav_file)[0] + '.pk', "wb") as f:
        pickle.dump(embedding_batch, f)
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model.
    wav_file = FLAGS.wav_file
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)
        np.save("/postprocessed_batch.npy", postprocessed_batch)
def extract_vggish_features(wav_path):
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(wav_path)
    if input_batch.shape[0] < 1:
        # The original passed '{}' and the filename as separate print
        # arguments; format() applies the placeholder as intended.
        print('{}: Audio sample shorter than 1 second. Ignoring ...'.format(
            os.path.basename(wav_path)))
        return None

    # Define VGGish, load the checkpoint, and run the batch through the model
    # to produce embeddings.
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim()
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: input_batch})

    # Postprocess the results to produce whitened quantized embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch
def main(wav_file):
    """Run a wav file through VGGish and a downstream classifier.

    Specify the path for the downloaded or recorded audio files, and the path
    for writing the embeddings or pickle files.
    """
    if wav_file:
        pkl = wav_file[:-4] + '.pkl'
        print(pkl)
        examples_batch = vggish_input.wavfile_to_examples(wav_file)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)
            predict_prob, predictions = model_function.predictions_wavfile(
                postprocessed_batch)
            K.clear_session()
            return predict_prob, predictions
def main(_):
    audio_files = os.listdir(audio_path)
    for each_file in tqdm.tqdm(audio_files):
        file_nm = dest_path + each_file.split('.')[0] + '.npy'
        if not path.exists(file_nm):
            try:
                wav_file = audio_path + each_file
                examples_batch = vggish_input.wavfile_to_examples(wav_file)
                with tf.Graph().as_default(), tf.Session() as sess:
                    vggish_slim.define_vggish_slim(training=False)
                    vggish_slim.load_vggish_slim_checkpoint(
                        sess, FLAGS.checkpoint)
                    features_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.INPUT_TENSOR_NAME)
                    embedding_tensor = sess.graph.get_tensor_by_name(
                        vggish_params.OUTPUT_TENSOR_NAME)
                    [embedding_batch] = sess.run(
                        [embedding_tensor],
                        feed_dict={features_tensor: examples_batch})
                    # The raw embeddings are saved without PCA postprocessing.
                    postprocessed_batch = embedding_batch
                    np.save(dest_path + each_file.split('.')[0] + '.npy',
                            postprocessed_batch)
            except Exception as e:
                # The original bare `except:` only printed "here"; report the
                # failing file instead and move on.
                print('Failed on {}: {}'.format(each_file, e))
                continue
def main(_):
    opt = parse_opt()
    with tf.Graph().as_default(), tf.Session() as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        keys = ['train', 'val', 'test']
        values = [opt.train_range, opt.val_range, opt.test_range]
        for i in range(3):
            h5_path = opt.feat_h5 + '2016' + '_' + keys[i] + '_' + opt.type + '.h5'
            if os.path.exists(h5_path):
                os.remove(h5_path)
            h5 = h5py.File(h5_path, 'w')
            dataset_feats = h5.create_dataset(
                'feats', ((values[i][1] - values[i][0] + 1), opt.feat_size),
                dtype='float32')
            for audio_id in range(values[i][0], values[i][1] + 1):
                wav_file = opt.video_root + 'video' + str(audio_id) + '.mp4.wav'
                if os.path.isfile(wav_file):
                    examples_batch = vggish_input.wavfile_to_examples(wav_file)
                    # Note: pproc and writer are created per file but never
                    # used; the raw (un-postprocessed) embeddings are stored.
                    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
                    writer = tf.python_io.TFRecordWriter(
                        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
                    [embedding_batch] = sess.run(
                        [embedding_tensor],
                        feed_dict={features_tensor: examples_batch})
                    # Mean-pool the per-patch embeddings into one vector per clip.
                    embedding_batch = embedding_batch.mean(0)
                    dataset_feats[audio_id - values[i][0]] = embedding_batch
            if writer:
                writer.close()
def extract_vggish_features(paths, path2gt, model):
    """Extracts VGGish features and their corresponding ground_truth and
    identifiers (the path).

    VGGish features are extracted from non-overlapping audio patches of 0.96
    seconds, where each audio patch covers 64 mel bands and 96 frames of 10 ms
    each. We repeat ground_truth and identifiers to fit the number of
    extracted VGGish features.
    """
    # 1) Extract log-mel spectrograms
    first_audio = True
    for p in paths:
        if first_audio:
            input_data = vggish_input.wavfile_to_examples(
                config['audio_folder'] + p)
            ground_truth = np.repeat(path2gt[p], input_data.shape[0], axis=0)
            identifiers = np.repeat(p, input_data.shape[0], axis=0)
            first_audio = False
        else:
            tmp_in = vggish_input.wavfile_to_examples(
                config['audio_folder'] + p)
            input_data = np.concatenate((input_data, tmp_in), axis=0)
            tmp_gt = np.repeat(path2gt[p], tmp_in.shape[0], axis=0)
            ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
            tmp_id = np.repeat(p, tmp_in.shape[0], axis=0)
            identifiers = np.concatenate((identifiers, tmp_id), axis=0)

    # 2) Load Tensorflow model to extract VGGish features
    tfconfig = tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=True,
        device_count={'CPU': 4},
        intra_op_parallelism_threads=4,
        inter_op_parallelism_threads=2,
    )
    with tf.Graph().as_default(), tf.Session(config=tfconfig) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(
            sess, '/kaggle/input/vggishmodel/vggish_model.ckpt')
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        extracted_feat = sess.run([embedding_tensor],
                                  feed_dict={features_tensor: input_data})
        feature = np.squeeze(np.asarray(extracted_feat))
    return [feature, ground_truth, identifiers]
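# A minimal usage sketch for the function above; the file names and integer
# labels are hypothetical, and config['audio_folder'] is assumed to contain
# the listed WAVs.
paths = ['dog_bark.wav', 'siren.wav']            # hypothetical files
path2gt = {'dog_bark.wav': 0, 'siren.wav': 1}    # hypothetical labels
features, ground_truth, identifiers = extract_vggish_features(
    paths, path2gt, model=None)  # `model` is accepted but unused above
# `features` holds one 128-D VGGish embedding per 0.96 s patch;
# `ground_truth` and `identifiers` are repeated to match the patch count.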
def main(_):
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params)
    # Shrink the hop so consecutive examples overlap by args.overlap.
    vggish_params.EXAMPLE_HOP_SECONDS = (
        1 - args.overlap) * vggish_params.EXAMPLE_WINDOW_SECONDS

    with tf.Graph().as_default(), tf.Session(config=config) as sess:
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        movie_id = args.wav_file[args.wav_file.rfind('/') + 1:
                                 args.wav_file.rfind('.')]
        examples_batch = vggish_input.wavfile_to_examples(args.wav_file)
        # Split the batch into at most 100 chunks of at least 10 examples, so
        # very long files do not have to pass through the network in one run.
        num_splits = min(int(examples_batch.shape[0] / 10), 100)
        num_splits = max(1, num_splits)
        examples_batch = np.array_split(examples_batch, num_splits)
        embedding_batch = []
        for i in range(num_splits):
            [batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: examples_batch[i]})
            embedding_batch.extend(batch)
        postprocessed_batch = pproc.postprocess(np.array(embedding_batch))

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames),
        # and the rows are written as a sequence of bytes-valued features,
        # where each feature value contains the 128 bytes of the whitened
        # quantized embedding. (Note: under Python 3, BytesList requires
        # bytes, so movie_id may need .encode() here.)
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    'movie_id': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[movie_id]))
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        writer = tf.python_io.TFRecordWriter(
            os.path.join(args.write_dir, movie_id + '.tfrecord'))
        writer.write(seq_example.SerializeToString())
        writer.close()
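# A minimal reader sketch (an assumption, not part of the source) showing how
# the SequenceExamples written above can be decoded with the TF1 record
# iterator and the protobuf API.
def read_embeddings(tfrecord_path):
    for record in tf.python_io.tf_record_iterator(tfrecord_path):
        seq = tf.train.SequenceExample.FromString(record)
        movie_id = seq.context.feature['movie_id'].bytes_list.value[0]
        feats = seq.feature_lists.feature_list[
            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME].feature
        # Each feature carries 128 bytes of the whitened, quantized embedding.
        embeddings = np.stack([
            np.frombuffer(f.bytes_list.value[0], dtype=np.uint8)
            for f in feats])
        yield movie_id, embeddings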
def __getitem__(self, idx):
    filename = self.X[idx]
    # Assumes a wavfile_to_examples variant that returns a torch.Tensor (e.g.
    # the torchvggish port); the TensorFlow version returns a NumPy array,
    # which has no .cuda() method.
    wav = wavfile_to_examples(filename)
    wav = wav.cuda()
    target = self.y[idx]
    return wav, target
def _preprocess(self, x, fs):
    # Accept either a raw waveform (with sample rate fs) or a wav file path.
    if isinstance(x, np.ndarray):
        x = vggish_input.waveform_to_examples(x, fs)
    elif isinstance(x, str):
        x = vggish_input.wavfile_to_examples(x)
    else:
        raise AttributeError
    return x
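# A minimal usage sketch (the `extractor` owner instance is hypothetical)
# showing the two input forms _preprocess accepts.
waveform = np.random.uniform(-1.0, 1.0, 16000)         # 1 s of audio in [-1, 1]
batch_a = extractor._preprocess(waveform, fs=16000)    # ndarray branch
batch_b = extractor._preprocess('clip.wav', fs=None)   # str branch (fs unused)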
def main(_):
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        return "No wav file"
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames),
        # and the rows are written as a sequence of bytes-valued features,
        # where each feature value contains the 128 bytes of the whitened
        # quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())
    if writer:
        writer.close()
def readDirectory(dirname, label):
    pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
    for wav_file in glob.glob(dirname + "*.wav"):
        print(wav_file)
        try:
            examples_batch = vggish_input.wavfile_to_examples(wav_file)
        except Exception:
            continue
        writer = tf.python_io.TFRecordWriter(wav_file[:-3] + "tfrecord")
        with tf.Graph().as_default(), tf.Session() as sess:
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)
            try:
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})
            except Exception:
                continue
            postprocessed_batch = pproc.postprocess(embedding_batch)
            # Emit one SequenceExample per block of 10 embeddings (~10 s).
            nBatches = len(postprocessed_batch)
            if nBatches < 10:
                nBatches = 1
            else:
                # Integer division: the original used /, which yields a float
                # under Python 3 and breaks range().
                nBatches = nBatches // 10
            for i in range(nBatches):
                seq_example = tf.train.SequenceExample(
                    context=tf.train.Features(
                        feature={
                            "labels": tf.train.Feature(
                                int64_list=tf.train.Int64List(value=[label]))
                        }),
                    feature_lists=tf.train.FeatureLists(
                        feature_list={
                            vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                                tf.train.FeatureList(feature=[
                                    tf.train.Feature(
                                        bytes_list=tf.train.BytesList(
                                            value=[embedding.tobytes()]))
                                    for embedding in
                                    postprocessed_batch[i * 10:i * 10 + 10]
                                ])
                        }))
                if writer:
                    writer.write(seq_example.SerializeToString())
        if writer:
            writer.close()
def _folder_to_mel(path):
    scaler = StandardScaler()
    os.chdir(path)
    files = os.listdir(".")
    sound_examples = vggish_input.wavfile_to_examples(files[0])
    for i in range(0, sound_examples.shape[0]):
        sound_examples[i, :, :] = scaler.fit_transform(sound_examples[i, :, :])
    sound_examples = sound_examples.reshape(sound_examples.shape[0], 96, 64, 1)
    sound_examples = np.repeat(sound_examples, 3, axis=3)
    for i in range(1, len(files)):
        if sf.SoundFile(files[i]).subtype == "PCM_16":
            temp_example = vggish_input.wavfile_to_examples(files[i])
            for j in range(0, temp_example.shape[0]):
                temp_example[j, :, :] = scaler.fit_transform(
                    temp_example[j, :, :])
            temp_example = temp_example.reshape(temp_example.shape[0], 96, 64, 1)
            temp_example = np.repeat(temp_example, 3, axis=3)
            sound_examples = np.concatenate((sound_examples, temp_example))
    return sound_examples
def main(wav_file, npz_path):
    # We run the examples from a single audio file through the model. If none
    # is provided, we generate a synthetic input. (The original guarded this
    # with a dead `if 1:` left over from a commented-out FLAGS check; testing
    # wav_file restores the evident intent.)
    if not wav_file:
        # Write a WAV of a sine wave into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # Prepare a record writer to store the postprocessed embeddings.
    # (tfrecord_file is assumed to be defined at module level.)
    writer = tf.python_io.TFRecordWriter(tfrecord_file)

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        if 0 in embedding_batch.shape:
            print('NO')
            return 0
        np.savez_compressed(npz_path, postprocessed_batch)
        return 1
def main(_):
    ontology_lookup = {}
    with open(ONTROLOGY, 'r') as f:
        label_json = json.load(f)
    for entry in label_json:
        label_id = entry['id'].replace('/', '_')
        assert label_id not in ontology_lookup.keys()
        ontology_lookup[label_id] = entry
    wav_paths = glob.glob(os.path.join(AUDIO_CHUNKS, '*', '*.wav'))

    # Prepare a postprocessor to munge the model embeddings (not applied
    # below; the raw embeddings are averaged instead).
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    audio_tsv = []
    label_tsv = []
    emb_tsv = []
    for wavfile in tqdm(wav_paths):
        label = Path(Path(wavfile).parent).stem
        filename = Path(wavfile).name
        examples_batch = vggish_input.wavfile_to_examples(wavfile)
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference, then mean-pool the per-patch embeddings into one
            # vector per file.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            emb = np.mean(embedding_batch, axis=0).tolist()
            label_tsv.append([ontology_lookup[label]['name']])
            audio_tsv.append([f'{label}/{filename}'])
            emb_tsv.append(emb)
            assert len(emb_tsv[0]) == len(emb)

    with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
        for emb in emb_tsv:
            csv.writer(f, delimiter='\t').writerow(emb)
    with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
        for label in label_tsv:
            csv.writer(f, delimiter='\t').writerow(label)
    with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
        for audio_path in audio_tsv:
            csv.writer(f, delimiter='\t').writerow(audio_path)
def _pre_process(paths):
    """Individual VGGish preprocessing process."""
    input_path, output_path = paths
    input_path_exists, output_path_exists = FeatureExtractor.feature_path_checker(
        input_path, output_path)
    if input_path_exists and not output_path_exists:
        features = vggish_input.wavfile_to_examples(
            input_path)  # can also handle .ogg files
        pickle.dump(features, open(output_path, "wb"))
        del features
def ProcessWithVGGish(sess, vgg, file_name, start=0, stop=None):
    '''Run the VGGish model on the sound file `file_name` (optionally limited
    to the [start, stop] range) and return the embeddings together with the
    input examples. The sound is scaled internally to floats between -1 and
    +1. (The original docstring promised whitened embeddings, but no
    postprocessor is applied here.)'''
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.wavfile_to_examples(file_name, start, stop)
    [embedding_batch] = sess.run([vgg['embedding']],
                                 feed_dict={vgg['features']: input_batch})
    return embedding_batch, input_batch
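# A minimal sketch (hypothetical helper, not from the source) of how the `vgg`
# dict consumed by ProcessWithVGGish might be assembled from the standard
# VGGish checkpoint.
def CreateVGGishNetwork(sess, checkpoint_path='vggish_model.ckpt'):
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    return {
        'features': sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME),
        'embedding': sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME),
    }

# Usage:
#   with tf.Graph().as_default(), tf.Session() as sess:
#       vgg = CreateVGGishNetwork(sess)
#       embeddings, examples = ProcessWithVGGish(sess, vgg, 'some.wav')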
def audio_inference(wav_file):
    # Assumes module-level `sess`, `features_tensor`, `embedding_tensor`, and
    # `pproc` created beforehand (see the setup sketch below). The original
    # wrapped this body in a fresh `tf.Graph().as_default()`, which would
    # detach the run from the session's own graph, so that wrapper is dropped.
    examples_batch = vggish_input.wavfile_to_examples(wav_file)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch


# postprocessed_batch = audio_inference('../data/audio/00059.wav')
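# A minimal setup sketch (an assumption, not from the source) for the
# module-level names audio_inference() relies on; the file paths are
# placeholders.
sess = tf.Session(graph=tf.Graph())
with sess.graph.as_default():
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
features_tensor = sess.graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
embedding_tensor = sess.graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)
pproc = vggish_postprocess.Postprocessor('vggish_pca_params.npz')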
def get_audio_input(wave_file_address, sess, features_tensor, embedding_tensor,
                    pproc):
    wave_file = wavfile_to_examples(wave_file_address)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: wave_file})
    sample4 = pproc.postprocess(embedding_batch)
    # align() (external helper) fits the embeddings to a fixed five-step
    # window, which is then given a leading batch dimension.
    sample5 = align(sample4)
    sample5 = np.reshape(sample5, (1, 5, 128))
    return sample5
def get_examples_(self, i, audio_name):
    # audio_name = self.labeled_data.iloc[i, 0]
    context_num = self.labeled_data.iloc[i, 1]
    all_examples = vggish_input.wavfile_to_examples(audio_name)
    all_labels = np.array([self.onehot_label[context_num - 1]] *
                          all_examples.shape[0])
    labeled_examples = list(zip(all_examples, all_labels))
    # Separate and return the features and labels.
    features = [example for (example, _) in labeled_examples]
    labels = [label for (_, label) in labeled_examples]
    if not features:
        # The original tested `features == None`, which is never true for a
        # list; an emptiness check matches the evident intent.
        print("ERROR: None features")
        exit()
    return (features, labels, context_num)
def add_augmented_files(X_files, y_categories):
    path_to_dataset = os.path.join(DATA_DIR, AUGMENTATION_DIR)
    augmented_features = []
    augmented_categories = []
    for sound_file in X_files:
        if sound_file[0] != '.':
            sound_category = sound_file.split('_')[0]
            path_to_directory = os.path.join(path_to_dataset, sound_category)
            path_to_file = os.path.join(path_to_directory, sound_file)
            # Get all features of the sound file.
            features = wavfile_to_examples(path_to_file)
            features = np.array(features)
            augmented_features.append(features)
            # The original called append() with no argument, which raises a
            # TypeError; appending the category matches the evident intent.
            augmented_categories.append(sound_category)
    # Returning both lists is assumed; the original snippet ended at the
    # append call.
    return augmented_features, augmented_categories
def OutputAudioEmbeddings(pathIn, row):
    video_id = row['video_id']
    video_path = row['video_path']
    split = row['split']
    full_path = os.path.join(pathIn, video_path)
    # Output file of the downloader path.
    full_path = full_path.replace("%(ext)s", "wav")
    if split == 'train':
        full_path_cut = full_path.replace("train", "train/cut")
    elif split == 'test':
        full_path_cut = full_path.replace("test", "test/cut")

    # Run the examples from this single audio file through the model.
    if os.path.isfile(full_path_cut):
        wav_file = full_path_cut
        examples_batch = vggish_input.wavfile_to_examples(wav_file)

        # Prepare a postprocessor to munge the model embeddings.
        pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

        # If needed, prepare a record writer to store the postprocessed
        # embeddings.
        writer = tf.python_io.TFRecordWriter(
            FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)
            np.save(
                '/lfs01/workdirs/shams010/shams010u1/code/audio_features/' +
                split + '/' + video_id, postprocessed_batch)
def main(_):
    # In this simple example, we run the examples from a single audio file
    # through the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wave into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        print(embedding_batch)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(postprocessed_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames),
        # and the rows are written as a sequence of bytes-valued features,
        # where each feature value contains the 128 bytes of the whitened
        # quantized embedding.
        seq_example = tf.train.SequenceExample(
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in postprocessed_batch
                        ])
                }))
        print(seq_example)
        if writer:
            writer.write(seq_example.SerializeToString())
    if writer:
        writer.close()