def main(_):
  with open(FLAGS.wav_files) as f:
    files_list = [line.replace('\n', '') for line in f]
  n_files = len(files_list)
  output_embedding = np.zeros((n_files, 128))
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  processed_fnames = []
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    for n_file, wav_file in enumerate(files_list):
      examples_batch = vggish_input.wavfile_to_examples(wav_file)
      print(n_file, '/', n_files)
      if examples_batch.shape[0] == 0:
        # Log files too short to yield even one example.
        with open('bad_files.log', 'a') as logf:
          logf.write(wav_file + '\n')
      else:
        processed_fnames.append(wav_file)
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        # Average the per-second embeddings into one 128-D vector per file.
        postprocessed_batch_mean = np.mean(postprocessed_batch, axis=0)
        output_embedding[n_file, :] = postprocessed_batch_mean
  np.save(FLAGS.npy_file, output_embedding)
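# The script above reads several FLAGS values it never defines. A minimal
# sketch of the missing flag definitions, assuming TF1's tf.app.flags; the
# default values here are hypothetical placeholders, not from the original.
flags = tf.app.flags
flags.DEFINE_string('wav_files', 'wav_list.txt',
                    'Text file with one input wav path per line.')
flags.DEFINE_string('pca_params', 'vggish_pca_params.npz',
                    'Path to the VGGish PCA parameters file.')
flags.DEFINE_string('checkpoint', 'vggish_model.ckpt',
                    'Path to the VGGish checkpoint file.')
flags.DEFINE_string('npy_file', 'embeddings.npy',
                    'Where to save the (n_files, 128) embedding matrix.')
FLAGS = flags.FLAGS

if __name__ == '__main__':
  tf.app.run()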
def extract_vggish_features(wav_path):
  # Produce a batch of log mel spectrogram examples.
  input_batch = vggish_input.wavfile_to_examples(wav_path)
  if input_batch.shape[0] < 1:
    print('{}: Audio sample shorter than 1 second. Ignoring ...'.format(
        os.path.basename(wav_path)))
    return None
  # Define VGGish, load the checkpoint, and run the batch through the model
  # to produce embeddings.
  with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim()
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: input_batch})
    # Postprocess the results to produce whitened quantized embeddings.
    pproc = vggish_postprocess.Postprocessor(pca_params_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch
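# A hedged usage sketch for extract_vggish_features; 'example.wav' is a
# placeholder, and the module-level globals checkpoint_path and
# pca_params_path are assumed to point at the downloaded VGGish files.
checkpoint_path = 'vggish_model.ckpt'
pca_params_path = 'vggish_pca_params.npz'

embeddings = extract_vggish_features('example.wav')
if embeddings is not None:
  print(embeddings.shape)  # One 128-D row per ~0.96 s example window.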
def main(_):
  # In this simple example, we run the examples from a single audio file
  # through the model.
  wav_file = FLAGS.wav_file
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)
    np.save('postprocessed_batch.npy', postprocessed_batch)
def extract_n_predict(input_wav_file, pca_params, checkpoint, checkpoint_file,
                      train_dir, output_file):
  print("Input file: " + input_wav_file)
  if os.path.isfile(input_wav_file):
    examples_batch = vggish_input.wavfile_to_examples(input_wav_file)
    pproc = vggish_postprocess.Postprocessor(pca_params)
    with tf.Graph().as_default(), tf.Session() as sess:
      # Define the model in inference mode, load the checkpoint, and
      # locate input and output tensors.
      vggish_slim.define_vggish_slim(training=False)
      vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)
      # Run inference and postprocessing.
      [embedding_batch] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: examples_batch})
      postprocessed_batch = pproc.postprocess(embedding_batch)
      # Dequantize and pad into a (1, 300, 128) batch, as expected by the
      # YouTube-8M style classifier.
      num_frames_batch_val = np.array([postprocessed_batch.shape[0]],
                                      dtype=np.int32)
      video_batch_val = np.zeros((1, 300, 128), dtype=np.float32)
      video_batch_val[0, 0:postprocessed_batch.shape[0], :] = utils.Dequantize(
          postprocessed_batch.astype(float), 2, -2)
    predicted_class = inference(video_batch_val, num_frames_batch_val,
                                checkpoint_file, train_dir, output_file)
    return predicted_class
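# For reference, a sketch of the dequantization used above, mirroring the
# YouTube-8M starter code's utils.Dequantize: it maps uint8 embeddings in
# [0, 255] back to floats in [min_quantized_value, max_quantized_value].
def Dequantize(feat_vector, max_quantized_value=2, min_quantized_value=-2):
  assert max_quantized_value > min_quantized_value
  quantized_range = max_quantized_value - min_quantized_value
  scalar = quantized_range / 255.0
  bias = (quantized_range / 512.0) + min_quantized_value
  return feat_vector * scalar + bias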
def setup(self):
  # Paths to downloaded VGGish files.
  self.checkpoint_path = 'vggish_model.ckpt'
  self.pca_params_path = 'vggish_pca_params.npz'
  self.batch_size = 60
  # If we can't find the trained model files, download them.
  if not os.path.exists(self.checkpoint_path):
    print('AudiosetAnalysis: Downloading model file {} '
          '(please wait - this may take a while)'.format(self.checkpoint_path))
    urllib.request.urlretrieve(
        'https://storage.googleapis.com/audioset/vggish_model.ckpt',
        self.checkpoint_path)
  if not os.path.exists(self.pca_params_path):
    print('AudiosetAnalysis: Downloading params file {} '
          '(please wait - this may take a while)'.format(self.pca_params_path))
    urllib.request.urlretrieve(
        'https://storage.googleapis.com/audioset/vggish_pca_params.npz',
        self.pca_params_path)
  # Define VGGish in a persistent session so repeated calls reuse the graph.
  config = tf.ConfigProto(device_count={'CPU': 4})
  self.sess = tf.Session(config=config)
  # Load the checkpoint.
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(self.sess, self.checkpoint_path)
  self.features_tensor = self.sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  self.embedding_tensor = self.sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
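# A hypothetical companion method (not in the original) showing how the
# persistent session built in setup() would be used; the class name
# AudiosetAnalysis is inferred from the log messages above.
def process_wav(self, wav_path):
  examples = vggish_input.wavfile_to_examples(wav_path)
  [embedding_batch] = self.sess.run(
      [self.embedding_tensor],
      feed_dict={self.features_tensor: examples})
  return embedding_batch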
def input_sound():
  # Compute VGGish features for every .wav file found under DIRNAME and
  # stack them into one feature matrix.
  files = get_file_paths(DIRNAME)  # get the file paths
  File_names = []
  full_feature_vector = np.empty([0, 128])
  for file in sorted(files):  # loop to access each file
    (filepath, ext) = os.path.splitext(file)  # get extension of the file
    file_name = os.path.basename(file)  # get the file name
    if ext == '.wav':
      File_names.append(file_name)
      y, sr = librosa.load(file, sr=None)
      print(sr)
      examples_batch = waveform_to_examples(y, sr)
      pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
      # Note: the graph is rebuilt for every file; hoisting this out of the
      # loop would be faster.
      with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)
        print(np.shape(postprocessed_batch))
        full_feature_vector = np.concatenate(
            (full_feature_vector, postprocessed_batch), axis=0)
  print(np.shape(full_feature_vector))
  return full_feature_vector
def main(wav_file):
  """Specify the path for the downloaded or recorded audio file, and also
  the path for writing the embeddings or pickle files.
  """
  if wav_file:
    pkl = wav_file[:-4] + '.pkl'
    print(pkl)
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    with tf.Graph().as_default(), tf.Session() as sess:
      # Define the model in inference mode, load the checkpoint, and
      # locate input and output tensors.
      vggish_slim.define_vggish_slim(training=False)
      vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)
      # Run inference and postprocessing.
      [embedding_batch] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: examples_batch})
      postprocessed_batch = pproc.postprocess(embedding_batch)
      print(postprocessed_batch)
      predict_prob, predictions = model_function.predictions_wavfile(
          postprocessed_batch)
      K.clear_session()
      return predict_prob, predictions
def main(_):
  audio_files = os.listdir(audio_path)
  for each_file in tqdm.tqdm(audio_files):
    file_nm = dest_path + each_file.split('.')[0] + '.npy'
    if not path.exists(file_nm):
      try:
        wav_file = audio_path + each_file
        examples_batch = vggish_input.wavfile_to_examples(wav_file)
        with tf.Graph().as_default(), tf.Session() as sess:
          vggish_slim.define_vggish_slim(training=False)
          vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
          features_tensor = sess.graph.get_tensor_by_name(
              vggish_params.INPUT_TENSOR_NAME)
          embedding_tensor = sess.graph.get_tensor_by_name(
              vggish_params.OUTPUT_TENSOR_NAME)
          [embedding_batch] = sess.run(
              [embedding_tensor],
              feed_dict={features_tensor: examples_batch})
          # The raw (un-postprocessed) embeddings are saved here.
          postprocessed_batch = embedding_batch
          # indices = np.linspace(0, len(postprocessed_batch), max_frames,
          #                       endpoint=False, dtype=int)
          # postprocessed_batch = postprocessed_batch[indices]
          np.save(file_nm, postprocessed_batch)
      except Exception as e:
        print('Failed on {}: {}'.format(each_file, e))
        continue
def extract(wav_file):
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  pproc = vggish_postprocess.Postprocessor(
      '/storage/haibn/yt8m/code/video_classification/feature_extractor/'
      'vggish/vggish_pca_params.npz')
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(
        sess,
        '/storage/haibn/yt8m/code/video_classification/feature_extractor/'
        'vggish/vggish_model.ckpt')
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)
    return postprocessed_batch
def extract_audioset_features(ids, id2audio_path, id2label):
  first_audio = True
  for i in ids:
    if first_audio:
      input_data = vggish_input.wavfile_to_examples(id2audio_path[i])
      ground_truth = np.repeat(id2label[i], input_data.shape[0], axis=0)
      identifiers = np.repeat(i, input_data.shape[0], axis=0)
      first_audio = False
    else:
      tmp_in = vggish_input.wavfile_to_examples(id2audio_path[i])
      input_data = np.concatenate((input_data, tmp_in), axis=0)
      tmp_gt = np.repeat(id2label[i], tmp_in.shape[0], axis=0)
      ground_truth = np.concatenate((ground_truth, tmp_gt), axis=0)
      tmp_id = np.repeat(i, tmp_in.shape[0], axis=0)
      identifiers = np.concatenate((identifiers, tmp_id), axis=0)
  with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    extracted_feat = sess.run([embedding_tensor],
                              feed_dict={features_tensor: input_data})
    feature = np.squeeze(np.asarray(extracted_feat))
  return [feature, ground_truth, identifiers]
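# A hedged usage sketch for extract_audioset_features; the wav paths are
# placeholders, and each label is assumed to be a (1, num_classes)
# multi-hot row so np.repeat can tile it across that clip's examples.
ids = ['clip1', 'clip2']
id2audio_path = {'clip1': 'clip1.wav', 'clip2': 'clip2.wav'}
id2label = {'clip1': np.array([[1, 0]]), 'clip2': np.array([[0, 1]])}
features, labels, identifiers = extract_audioset_features(
    ids, id2audio_path, id2label)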
def inference(file_path, checkpoint_dir, checkpoint_path):
  '''Inference loop for predicting the audio file.'''
  with tf.Graph().as_default(), tf.compat.v1.Session() as sess:
    logits_inf = vggish_slim.define_audio_slim(training=False, is_reuse=None)
    with tf.compat.v1.variable_scope('mymodel'):
      predict_inf = tf.sigmoid(logits_inf, name='prediction_inf')
    # Add inference ops.
    with tf.compat.v1.variable_scope('train'):
      global_step = tf.Variable(
          0, name='global_step', trainable=False,
          collections=[tf.compat.v1.GraphKeys.GLOBAL_VARIABLES,
                       tf.compat.v1.GraphKeys.GLOBAL_STEP])
    # Initialize all variables in the model.
    sess.run(tf.compat.v1.global_variables_initializer())
    # Restore the model.
    saver = tf.compat.v1.train.Saver()
    saver.restore(sess, checkpoint_path)
    # Locate all the tensors and ops we need for the inference loop.
    features_tensor_inf = sess.graph.get_tensor_by_name(
        'audio/audio_input_features:0')
    prediction_tensor_inf = sess.graph.get_tensor_by_name(
        'mymodel/prediction_inf:0')
    # A separate graph/session hosts the VGGish feature extractor.
    graph = tf.Graph()
    with graph.as_default():
      vggish_slim.define_vggish_slim(training=False)
    sess_ext = tf.compat.v1.Session(graph=graph)
    vggish_slim.load_vggish_slim_checkpoint(
        sess_ext, checkpoint_dir + "vggish_model.ckpt")
    input_tensor = graph.get_tensor_by_name('vggish/input_features:0')
    output_tensor = graph.get_tensor_by_name('vggish/embedding:0')
    pproc = Postprocessor.Postprocessor(
        checkpoint_dir + "vggish_pca_params.npz")
    print('\n###################')
    print('# Inference loop #')
    print('###################')
    try:
      data, samplerate = sf.read(Path(file_path))
      wave_array_example_pre = data_transformation.waveform_to_examples(
          data, samplerate, display=0)
      [embedding_batch] = sess_ext.run(
          [output_tensor],
          feed_dict={input_tensor: wave_array_example_pre})
      wave_arrays = pproc.postprocess(embedding_batch)
      pred_inf_restore = sess.run(
          prediction_tensor_inf,
          feed_dict={features_tensor_inf: wave_arrays})
      return pred_inf_restore
    except Exception:
      print('Unsupported input file format, or the file was not found.')
def extract_and_predict(wav):
  wav_file = wav
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    # Convert to a list of per-second embeddings for the classifier.
    postprocessed_batch = list(postprocessed_batch)
    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    print(str(pred_each_n_seconds))
def main(_):
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
    out_path = path.splitext(wav_file)[0] + '.pk'
  else:
    # Generate a synthetic 1 kHz sine as an in-memory WAV; splitext does not
    # apply to a BytesIO object, so a fixed output name is used instead.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
    out_path = 'sine.pk'
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    # embedding_batch holds the extracted features.
    with open(out_path, "wb") as f:
      pickle.dump(embedding_batch, f)
def create_vggish_frozen_graph():
  """Create the VGGish frozen graph."""
  os.system('git clone https://github.com/tensorflow/models.git')
  sys.path.append('models/research/audioset/vggish/')
  import vggish_slim
  os.system(
      'curl -O https://storage.googleapis.com/audioset/vggish_model.ckpt')
  ckpt_path = 'vggish_model.ckpt'
  with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, ckpt_path)
    saver = tf.train.Saver(tf.all_variables())
    freeze_graph.freeze_graph_with_def_protos(
        sess.graph_def,
        saver.as_saver_def(),
        ckpt_path,
        'vggish/fc2/BiasAdd',
        restore_op_name=None,
        filename_tensor_name=None,
        output_graph='/tmp/mediapipe/vggish_new.pb',
        clear_devices=True,
        initializer_nodes=None)
  os.system('rm -rf models/')
  os.system('rm %s' % ckpt_path)
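# A hedged sketch (not in the original) of loading the frozen graph written
# above; the input/output tensor names follow the VGGish graph definition.
def load_frozen_vggish(pb_path='/tmp/mediapipe/vggish_new.pb'):
  graph = tf.Graph()
  with graph.as_default():
    graph_def = tf.GraphDef()
    with tf.gfile.GFile(pb_path, 'rb') as f:
      graph_def.ParseFromString(f.read())
    tf.import_graph_def(graph_def, name='')
  return graph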
def extract_and_predict(wav):
  wav_file = wav
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    postprocessed_batch = list(postprocessed_batch)
    pred_each_n_seconds = predict_with_saved_model(postprocessed_batch)
    return str(pred_each_n_seconds)
def main(_):
  with tf.Graph().as_default(), tf.Session() as sess:
    embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)
    with tf.variable_scope('mymodel'):
      num_units = 100
      fc = slim.fully_connected(embeddings, num_units)
      logits = slim.fully_connected(
          fc, _NUM_CLASSES, activation_fn=None, scope='logits')
      tf.sigmoid(logits, name='prediction')
      with tf.variable_scope('train'):
        global_step = tf.Variable(
            0, name='global_step', trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                         tf.GraphKeys.GLOBAL_STEP])
        labels = tf.placeholder(
            tf.float32, shape=(None, _NUM_CLASSES), name='labels')
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xent')
        loss = tf.reduce_mean(xent, name='loss_op')
        tf.summary.scalar('loss', loss)
        optimizer = tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON)
        optimizer.minimize(loss, global_step=global_step, name='train_op')
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
    global_step_tensor = sess.graph.get_tensor_by_name(
        'mymodel/train/global_step:0')
    loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')
    for _ in range(FLAGS.num_batches):
      (features, labels) = _get_examples_batch()
      [num_steps, loss, _] = sess.run(
          [global_step_tensor, loss_tensor, train_op],
          feed_dict={features_tensor: features, labels_tensor: labels})
      print('Step %d: loss %g' % (num_steps, loss))
def main(_):
  opt = parse_opt()
  with tf.Graph().as_default(), tf.Session() as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    keys = ['train', 'val', 'test']
    values = [opt.train_range, opt.val_range, opt.test_range]
    for i in range(3):
      h5_path = opt.feat_h5 + '2016_' + keys[i] + '_' + opt.type + '.h5'
      if os.path.exists(h5_path):
        os.remove(h5_path)
      h5 = h5py.File(h5_path, 'w')
      dataset_feats = h5.create_dataset(
          'feats', (values[i][1] - values[i][0] + 1, opt.feat_size),
          dtype='float32')
      for audio_id in range(values[i][0], values[i][1] + 1):
        wav_file = opt.video_root + 'video' + str(audio_id) + '.mp4.wav'
        if os.path.isfile(wav_file):
          examples_batch = vggish_input.wavfile_to_examples(wav_file)
          [embedding_batch] = sess.run(
              [embedding_tensor],
              feed_dict={features_tensor: examples_batch})
          # Mean-pool the per-second embeddings into one vector per clip.
          embedding_batch = embedding_batch.mean(0)
          dataset_feats[audio_id - values[i][0]] = embedding_batch
      h5.close()
def embed(wavform_slice, rate):
  norm_wavform_slice = preprocessing.normalize(wavform_slice)
  examples_batch = vggish_input.waveform_to_examples(norm_wavform_slice, rate)
  print('examples_batch len: ' + str(len(examples_batch)))
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print('postprocessed_batch: ')
    print(postprocessed_batch)
    print(postprocessed_batch.shape)
    return postprocessed_batch
def main(_):
  # Run the examples from a single audio file through the model and write
  # the embeddings as a TFRecord.
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(pca_params)
  # Overlapping windows: hop = (1 - overlap) * window.
  vggish_params.EXAMPLE_HOP_SECONDS = (
      1 - args.overlap) * vggish_params.EXAMPLE_WINDOW_SECONDS
  with tf.Graph().as_default(), tf.Session(config=config) as sess:
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    movie_id = args.wav_file[args.wav_file.rfind('/') + 1:
                             args.wav_file.rfind('.')]
    examples_batch = vggish_input.wavfile_to_examples(args.wav_file)
    # Split into at most 100 chunks of roughly 10+ examples each to bound
    # the per-run memory footprint.
    num_splits = min(int(examples_batch.shape[0] / 10), 100)
    num_splits = max(1, num_splits)
    examples_batch = np.array_split(examples_batch, num_splits)
    embedding_batch = []
    for i in range(num_splits):
      [batch] = sess.run([embedding_tensor],
                         feed_dict={features_tensor: examples_batch[i]})
      embedding_batch.extend(batch)
    postprocessed_batch = pproc.postprocess(np.array(embedding_batch))
    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        context=tf.train.Features(
            feature={
                'movie_id': tf.train.Feature(
                    bytes_list=tf.train.BytesList(
                        value=[movie_id.encode()]))
            }),
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
            }))
    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.write_dir, movie_id + '.tfrecord'))
    writer.write(seq_example.SerializeToString())
    writer.close()
def initialize_classifier(self):
  if not os.path.exists(os.path.join(os.getcwd(), self.model_dir)):
    os.mkdir(os.path.join(os.getcwd(), self.model_dir))
  # Load pre-trained VGGish.
  vggish_slim.load_vggish_slim_checkpoint(self.sess, self.vggish_checkpoint)
  # Save model checkpoint.
  self.save_variables()
def readDirectory(dirname, label):
  pproc = vggish_postprocess.Postprocessor("vggish_pca_params.npz")
  for wav_file in glob.glob(dirname + "*.wav"):
    print(wav_file)
    try:
      examples_batch = vggish_input.wavfile_to_examples(wav_file)
    except Exception:
      continue
    writer = tf.python_io.TFRecordWriter(wav_file[:-3] + "tfrecord")
    with tf.Graph().as_default(), tf.Session() as sess:
      vggish_slim.define_vggish_slim(training=False)
      vggish_slim.load_vggish_slim_checkpoint(sess, "vggish_model.ckpt")
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)
      try:
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
      except Exception:
        continue
      postprocessed_batch = pproc.postprocess(embedding_batch)
      # Emit one SequenceExample per block of 10 one-second embeddings.
      nBatches = len(postprocessed_batch)
      if nBatches < 10:
        nBatches = 1
      else:
        nBatches = nBatches // 10  # integer division so range() gets an int
      for i in range(nBatches):
        seq_example = tf.train.SequenceExample(
            context=tf.train.Features(
                feature={
                    "labels": tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[label]))
                }),
            feature_lists=tf.train.FeatureLists(
                feature_list={
                    vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                        tf.train.FeatureList(feature=[
                            tf.train.Feature(bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                            for embedding in
                            postprocessed_batch[i * 10:i * 10 + 10]
                        ])
                }))
        if writer:
          writer.write(seq_example.SerializeToString())
    if writer:
      writer.close()
def extract_audioset_embedding():
  """Extract the AudioSet (VGGish) embedding of an audio clip."""
  # Arguments & parameters
  mel_bins = vggish_params.NUM_BANDS
  sample_rate = vggish_params.SAMPLE_RATE
  input_len = vggish_params.NUM_FRAMES
  embedding_size = vggish_params.EMBEDDING_SIZE
  # You may modify EXAMPLE_HOP_SECONDS in vggish_params.py to change the
  # hop size.
  # Paths
  audio_path = 'appendixes/01.wav'
  checkpoint_path = os.path.join('vggish_model.ckpt')
  pca_params_path = os.path.join('vggish_pca_params.npz')
  if not os.path.isfile(checkpoint_path):
    raise Exception(
        'Please download vggish_model.ckpt from '
        'https://storage.googleapis.com/audioset/vggish_model.ckpt '
        'and put it in the root of this codebase.')
  if not os.path.isfile(pca_params_path):
    raise Exception(
        'Please download vggish_pca_params.npz from '
        'https://storage.googleapis.com/audioset/vggish_pca_params.npz '
        'and put it in the root of this codebase.')
  # Load model
  sess = tf.Session()
  vggish_slim.define_vggish_slim(training=False)
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  pproc = vggish_postprocess.Postprocessor(pca_params_path)
  # Read audio
  (audio, _) = read_audio(audio_path, target_fs=sample_rate)
  # Extract log mel feature
  logmel = vggish_input.waveform_to_examples(audio, sample_rate)
  # Extract embedding feature
  [embedding_batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: logmel})
  # PCA
  postprocessed_batch = pproc.postprocess(embedding_batch)
  print('Audio length: {}'.format(len(audio)))
  print('Log mel shape: {}'.format(logmel.shape))
  print('Embedding feature shape: {}'.format(postprocessed_batch.shape))
def main(_):
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
  else:
    return "No wav file"
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  # If needed, prepare a record writer to store the postprocessed embeddings.
  writer = tf.python_io.TFRecordWriter(
      FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)
    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
            }))
    print(seq_example)
    if writer:
      writer.write(seq_example.SerializeToString())
  if writer:
    writer.close()
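# A hedged sketch of reading back a SequenceExample like the one written
# above; 'embeddings.tfrecord' is a placeholder path. Each feature holds
# 128 bytes, one uint8 per embedding dimension.
for record in tf.python_io.tf_record_iterator('embeddings.tfrecord'):
  seq = tf.train.SequenceExample()
  seq.ParseFromString(record)
  feats = seq.feature_lists.feature_list[
      vggish_params.AUDIO_EMBEDDING_FEATURE_NAME].feature
  batch = np.stack([np.frombuffer(f.bytes_list.value[0], dtype=np.uint8)
                    for f in feats])
  print(batch.shape)  # (num_seconds, 128)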
def main(_):
  ontology_lookup = {}
  with open(ONTROLOGY, 'r') as f:
    label_json = json.load(f)
  for entry in label_json:
    label_id = entry['id'].replace('/', '_')
    assert label_id not in ontology_lookup
    ontology_lookup[label_id] = entry
  wav_paths = glob.glob(os.path.join(AUDIO_CHUNKS, '*', '*.wav'))
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  audio_tsv = []
  label_tsv = []
  emb_tsv = []
  for wavfile in tqdm(wav_paths):
    label = Path(Path(wavfile).parent).stem
    filename = Path(wavfile).name
    examples_batch = vggish_input.wavfile_to_examples(wavfile)
    with tf.Graph().as_default(), tf.Session() as sess:
      # Define the model in inference mode, load the checkpoint, and
      # locate input and output tensors.
      vggish_slim.define_vggish_slim(training=False)
      vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)
      # Run inference and postprocessing.
      [embedding_batch] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: examples_batch})
      # Mean-pool the per-second embeddings into one vector per chunk.
      emb = np.mean(embedding_batch, axis=0).tolist()
      label_tsv.append([ontology_lookup[label]['name']])
      audio_tsv.append([f'{label}/{filename}'])
      emb_tsv.append(emb)
      assert len(emb_tsv[0]) == len(emb)
  with open(f'{OUTPUTDIR}/emb.tsv', 'w') as f:
    for emb in emb_tsv:
      csv.writer(f, delimiter='\t').writerow(emb)
  with open(f'{OUTPUTDIR}/label.tsv', 'w') as f:
    for label in label_tsv:
      csv.writer(f, delimiter='\t').writerow(label)
  with open(f'{OUTPUTDIR}/audio.tsv', 'w') as f:
    for audio_path in audio_tsv:
      csv.writer(f, delimiter='\t').writerow(audio_path)
def main(wav_file, npz_path):
  # Run the examples from a single audio file through the model.
  # If none is provided, generate a synthetic input.
  if wav_file is None:
    # Write a WAV of a sine wave into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    postprocessed_batch = pproc.postprocess(embedding_batch)
    if 0 in embedding_batch.shape:
      print('NO')
      return 0
    np.savez_compressed(npz_path, postprocessed_batch)
    return 1
def post_init(self):
  self.to_device()
  import tensorflow as tf
  tf.compat.v1.disable_eager_execution()
  self.sess = tf.compat.v1.Session()
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(self.sess, self.model_path)
  self.feature_tensor = self.sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  self.embedding_tensor = self.sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  self.post_processor = vggish_postprocess.Postprocessor(self.pca_path)
def __init__(self):
  self.graph = tf.Graph()
  with self.graph.as_default():
    self.sess = tf.Session(graph=self.graph)
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(self.sess, FLAGS.checkpoint)
    self.features_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    self.embedding_tensor = self.sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)
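# A hypothetical extract method (not in the original) for the wrapper
# above: because the graph and checkpoint live in the object, each call
# avoids the per-file graph rebuild seen in other snippets in this section.
def extract(self, wav_file):
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  [embedding_batch] = self.sess.run(
      [self.embedding_tensor],
      feed_dict={self.features_tensor: examples_batch})
  return embedding_batch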
def _build_model(self):
  # Restore the VGGish model trained on the YouTube-8M dataset and retrieve
  # PCA embeddings of the bottleneck features.
  # Define the model in inference mode, load the checkpoint, and
  # locate input and output tensors.
  vggish_slim.define_vggish_slim(training=False)
  vggish_slim.load_vggish_slim_checkpoint(self.sess, model_checkpoint)
  self.features_tensor = self.sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  self.embedding_tensor = self.sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  # Prepare a postprocessor to munge the model embeddings.
  self.pproc = vggish_postprocess.Postprocessor(pca_params)
def processList(filelist):
  # Extract VGGish features for every file listed in `filelist`, padding
  # clips shorter than one second by wrapping.
  out_file_list = []
  input_list = open(filelist)
  input_lists = input_list.readlines()
  for i, infl in tqdm(enumerate(input_lists)):
    infl_new = audio_root + '/' + infl.replace("\ ", " ")
    y, sr = librosa.load(infl_new.strip(), sr=None)
    if len(y) < sr:
      y = np.pad(y, (0, sr - len(y)), 'wrap')
    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.waveform_to_examples(y, sr)
    print('Log Mel Spectrogram example: ', input_batch[0])
    # Define VGGish, load the checkpoint, and run the batch through the
    # model to produce embeddings.
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5)
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
        gpu_options=gpu_options)) as sess:
      vggish_slim.define_vggish_slim()
      vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)
      [embedding_batch] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: input_batch})
      print('VGGish embedding: done ', i)
      # Postprocess the results to produce whitened quantized embeddings.
      pproc = vggish_postprocess.Postprocessor(pca_params_path)
      postprocessed_batch = pproc.postprocess(embedding_batch)
      infl_list = infl_new.strip().split("/")
      file_name = infl_list[-1].strip()
      out_dir = output_root + "/" + infl_list[-2]
      if not os.path.exists(out_dir):
        os.makedirs(out_dir)
      featfile = str(out_dir) + '/' + str(file_name) + '.txt'
      out_file_list.append(featfile.strip())
      np.savetxt(featfile, postprocessed_batch.astype(int),
                 fmt='%i', delimiter=",")
  create_file(output_file, out_file_list)
def OutputAudioEmbeddings(pathIn, row):
  video_id = row['video_id']
  video_path = row['video_path']
  split = row['split']
  full_path = os.path.join(pathIn, video_path)
  # Output file of the downloader path.
  full_path = full_path.replace("%(ext)s", "wav")
  if split == 'train':
    full_path_cut = full_path.replace("train", "train/cut")
  elif split == 'test':
    full_path_cut = full_path.replace("test", "test/cut")
  # Run the examples from a single audio file through the model.
  if os.path.isfile(full_path_cut):
    wav_file = full_path_cut
    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)
    with tf.Graph().as_default(), tf.Session() as sess:
      # Define the model in inference mode, load the checkpoint, and
      # locate input and output tensors.
      vggish_slim.define_vggish_slim(training=False)
      vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
      features_tensor = sess.graph.get_tensor_by_name(
          vggish_params.INPUT_TENSOR_NAME)
      embedding_tensor = sess.graph.get_tensor_by_name(
          vggish_params.OUTPUT_TENSOR_NAME)
      # Run inference and postprocessing.
      [embedding_batch] = sess.run([embedding_tensor],
                                   feed_dict={features_tensor: examples_batch})
      postprocessed_batch = pproc.postprocess(embedding_batch)
      print(postprocessed_batch)
      np.save(
          '/lfs01/workdirs/shams010/shams010u1/code/audio_features/'
          + split + '/' + video_id, postprocessed_batch)
def main(_):
  with tf.Graph().as_default(), tf.Session() as sess:
    # Define VGGish.
    embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

    # Define a shallow classification model and associated training ops on top
    # of VGGish.
    with tf.variable_scope('mymodel'):
      # Add a fully connected layer with 100 units.
      num_units = 100
      fc = slim.fully_connected(embeddings, num_units)

      # Add a classifier layer at the end, consisting of parallel logistic
      # classifiers, one per class. This allows for multi-class tasks.
      logits = slim.fully_connected(
          fc, _NUM_CLASSES, activation_fn=None, scope='logits')
      tf.sigmoid(logits, name='prediction')

      # Add training ops.
      with tf.variable_scope('train'):
        global_step = tf.Variable(
            0, name='global_step', trainable=False,
            collections=[tf.GraphKeys.GLOBAL_VARIABLES,
                         tf.GraphKeys.GLOBAL_STEP])

        # Labels are assumed to be fed as a batch of multi-hot vectors, with
        # a 1 in the position of each positive class label, and 0 elsewhere.
        labels = tf.placeholder(
            tf.float32, shape=(None, _NUM_CLASSES), name='labels')

        # Cross-entropy label loss.
        xent = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits, labels=labels, name='xent')
        loss = tf.reduce_mean(xent, name='loss_op')
        tf.summary.scalar('loss', loss)

        # We use the same optimizer and hyperparameters as used to train VGGish.
        optimizer = tf.train.AdamOptimizer(
            learning_rate=vggish_params.LEARNING_RATE,
            epsilon=vggish_params.ADAM_EPSILON)
        optimizer.minimize(loss, global_step=global_step, name='train_op')

    # Initialize all variables in the model, and then load the pre-trained
    # VGGish checkpoint.
    sess.run(tf.global_variables_initializer())
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

    # Locate all the tensors and ops we need for the training loop.
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    labels_tensor = sess.graph.get_tensor_by_name('mymodel/train/labels:0')
    global_step_tensor = sess.graph.get_tensor_by_name(
        'mymodel/train/global_step:0')
    loss_tensor = sess.graph.get_tensor_by_name('mymodel/train/loss_op:0')
    train_op = sess.graph.get_operation_by_name('mymodel/train/train_op')

    # The training loop.
    for _ in range(FLAGS.num_batches):
      (features, labels) = _get_examples_batch()
      [num_steps, loss, _] = sess.run(
          [global_step_tensor, loss_tensor, train_op],
          feed_dict={features_tensor: features, labels_tensor: labels})
      print('Step %d: loss %g' % (num_steps, loss))
def main(_):
  # In this simple example, we run the examples from a single audio file
  # through the model. If none is provided, we generate a synthetic input.
  if FLAGS.wav_file:
    wav_file = FLAGS.wav_file
  else:
    # Write a WAV of a sine wav into an in-memory file object.
    num_secs = 5
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)
    # Convert to signed 16-bit samples.
    samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
    wav_file = six.BytesIO()
    wavfile.write(wav_file, sr, samples)
    wav_file.seek(0)
  examples_batch = vggish_input.wavfile_to_examples(wav_file)
  print(examples_batch)

  # Prepare a postprocessor to munge the model embeddings.
  pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

  # If needed, prepare a record writer to store the postprocessed embeddings.
  writer = tf.python_io.TFRecordWriter(
      FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

  with tf.Graph().as_default(), tf.Session() as sess:
    # Define the model in inference mode, load the checkpoint, and
    # locate input and output tensors.
    vggish_slim.define_vggish_slim(training=False)
    vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
    features_tensor = sess.graph.get_tensor_by_name(
        vggish_params.INPUT_TENSOR_NAME)
    embedding_tensor = sess.graph.get_tensor_by_name(
        vggish_params.OUTPUT_TENSOR_NAME)

    # Run inference and postprocessing.
    [embedding_batch] = sess.run([embedding_tensor],
                                 feed_dict={features_tensor: examples_batch})
    print(embedding_batch)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    print(postprocessed_batch)

    # Write the postprocessed embeddings as a SequenceExample, in a similar
    # format as the features released in AudioSet. Each row of the batch of
    # embeddings corresponds to roughly a second of audio (96 10ms frames), and
    # the rows are written as a sequence of bytes-valued features, where each
    # feature value contains the 128 bytes of the whitened quantized embedding.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(bytes_list=tf.train.BytesList(
                            value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
            }))
    print(seq_example)
    if writer:
      writer.write(seq_example.SerializeToString())

  if writer:
    writer.close()
# Generate a 1 kHz sine wave at 44.1 kHz (a high sampling rate exercises
# the resampling to 16 kHz during feature extraction). The num_secs and
# freq definitions were missing from this fragment and are restored here
# with the values used in the VGGish smoke test.
num_secs = 3
freq = 1000
sr = 44100
t = np.linspace(0, num_secs, int(num_secs * sr))
x = np.sin(2 * np.pi * freq * t)

# Produce a batch of log mel spectrogram examples.
input_batch = vggish_input.waveform_to_examples(x, sr)
print('Log Mel Spectrogram example: ', input_batch[0])
np.testing.assert_equal(
    input_batch.shape,
    [num_secs, vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS])

# Define VGGish, load the checkpoint, and run the batch through the model to
# produce embeddings.
with tf.Graph().as_default(), tf.Session() as sess:
  vggish_slim.define_vggish_slim()
  vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)
  features_tensor = sess.graph.get_tensor_by_name(
      vggish_params.INPUT_TENSOR_NAME)
  embedding_tensor = sess.graph.get_tensor_by_name(
      vggish_params.OUTPUT_TENSOR_NAME)
  [embedding_batch] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})
  print('VGGish embedding: ', embedding_batch[0])
  expected_embedding_mean = 0.131
  expected_embedding_std = 0.238
  np.testing.assert_allclose(
      [np.mean(embedding_batch), np.std(embedding_batch)],
      [expected_embedding_mean, expected_embedding_std],
      rtol=rel_error)