def vggish(model_path):
    """Build an inference-mode VGGish graph and restore its checkpoint.

    A fresh ``tf.Graph`` is made the default while the session is created,
    the model is defined with ``training=False`` and the checkpoint at
    ``model_path`` is loaded into the session.

    Args:
        model_path: Path handed to
            ``vggish_slim.load_vggish_slim_checkpoint``.

    Returns:
        A 4-tuple ``(session, embedding, embedding, features)``.  The session
        is left open; closing it is up to the caller.

    NOTE(review): the embedding tensor occupies both the second and third
    slots of the returned tuple.  This looks unintentional, but it is kept
    because callers presumably unpack four values -- confirm before changing.
    """
    with tf.Graph().as_default():
        session = tf.Session()
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(session, model_path)

        # Look the input/output tensors up by their configured names.
        graph = session.graph
        features = graph.get_tensor_by_name(vggish_params.INPUT_TENSOR_NAME)
        embedding = graph.get_tensor_by_name(vggish_params.OUTPUT_TENSOR_NAME)

    return session, embedding, embedding, features
def model(learning_rate=vggish_params.LEARNING_RATE):
    """Build a VGGish graph topped with a small classifier and training ops.

    The graph contains VGGish (trainable or not according to
    ``FLAGS.train_vggish``), a 100-unit fully connected layer, a linear
    ``logits`` layer with ``params.NUM_CLASSES`` outputs, a softmax
    cross-entropy loss and an Adam ``train_op``.  Everything added here lives
    under the ``mymodel`` variable scope; the training ops live under
    ``mymodel/train``.

    Args:
        learning_rate: Learning rate passed to ``tf.train.AdamOptimizer``.

    Returns:
        A tuple ``(graph, prediction)`` where ``prediction`` is the per-example
        argmax over the class axis of the logits.
    """
    graph = tf.Graph()
    with graph.as_default():
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        with tf.variable_scope("mymodel"):
            # Add a fully connected layer with 100 units.
            num_units = 100
            fc = slim.fully_connected(embeddings, num_units)

            # Linear classifier layer producing one logit per class.
            logits = slim.fully_connected(
                fc, params.NUM_CLASSES, activation_fn=None, scope='logits')

            # Reduce over the class axis.  Without an explicit axis,
            # tf.argmax reduces over axis 0 (the batch), which yields one
            # index per class instead of one class index per example.
            prediction = tf.argmax(logits, axis=1, name='prediction')

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(
                    0, name='global_step', trainable=False,
                    collections=[
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        tf.GraphKeys.GLOBAL_STEP,
                    ])

                # Labels are fed as a float batch of shape
                # (batch, NUM_CLASSES).
                # NOTE(review): the loss below is a softmax cross-entropy, so
                # each row is presumably a one-hot / probability vector rather
                # than a multi-hot one -- confirm against the data pipeline.
                labels = tf.placeholder(
                    tf.float32, shape=(None, params.NUM_CLASSES),
                    name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=logits, labels=labels, name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)
                variable_summaries(loss)

                # We use the same optimizer and hyperparameters as used to
                # train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(
                    loss, global_step=global_step, name='train_op')

    return graph, prediction
def create_vggish_network(sess, config):
    """Define VGGish in the current graph and restore its weights.

    The model is defined in inference mode, the module-level
    ``vggish_params.EXAMPLE_HOP_SECONDS`` is overwritten with
    ``config.vggish_hop_size`` (a global side effect), and the checkpoint at
    ``config.vggish_model_checkpoint_path`` is loaded into ``sess``.

    Args:
        sess: Session whose graph receives the model and the restored
            variables.
        config: Object exposing ``vggish_hop_size`` and
            ``vggish_model_checkpoint_path``.

    Returns:
        A dict with the keys ``'features'`` (input tensor) and ``'embedding'``
        (output tensor).
    """
    vggish_slim.define_vggish_slim(training=False)

    # Overrides a module-level setting; this affects every later user of
    # vggish_params in the process.
    vggish_params.EXAMPLE_HOP_SECONDS = config.vggish_hop_size

    checkpoint_path = config.vggish_model_checkpoint_path
    vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint_path)

    tensor_names = (
        ('features', vggish_params.INPUT_TENSOR_NAME),
        ('embedding', vggish_params.OUTPUT_TENSOR_NAME),
    )
    return {
        key: sess.graph.get_tensor_by_name(tensor_name)
        for key, tensor_name in tensor_names
    }
def __init__(self, ckpt_fname, add_classifier=False):
    """Build the VGGish graph (optionally with a classifier) and restore it.

    Args:
        ckpt_fname: Checkpoint path given to ``tf.train.Saver().restore``.
        add_classifier: When true, a 100-unit fully connected layer and a
            527-way linear ``logits`` layer are stacked on the embeddings
            under the ``mymodel`` scope, and ``self.logits`` is set.  When
            false, ``self.logits`` is not created at all.
    """
    self.add_classifier = add_classifier

    # Fixed-size input placeholder: (num_time_samples, 96, 64).
    num_time_samples = 3
    input_shape = (num_time_samples, 96, 64)
    self.spec_ph = tf.placeholder(tf.float32, input_shape)
    self.embeddings = vggish_slim.define_vggish_slim(
        self.spec_ph, training=True)

    if add_classifier:
        hidden_units = 100
        class_count = 527
        with tf.variable_scope('mymodel'):
            hidden = slim.fully_connected(self.embeddings, hidden_units)
            self.logits = slim.fully_connected(
                hidden, class_count, activation_fn=None, scope='logits')

    # The saver is created after the whole graph exists, so it covers every
    # variable defined above.
    self.sess = tf.Session()
    saver = tf.train.Saver()
    saver.restore(self.sess, ckpt_fname)
count += 1 if count % 100 == 0: print("At File ", count, "/", N) print("Done!") print("Computing Tensorflow Embeddings...") # Prepare a postprocessor to munge the model embeddings. pproc = vggish_postprocess.Postprocessor(pca_params) output_sequences = [] with tf.Graph().as_default(), tf.Session() as sess: # Define the model in inference mode, load the checkpoint, and # locate input and output tensors. vggish_slim.define_vggish_slim(training=False) vggish_slim.load_vggish_slim_checkpoint(sess, checkpoint) features_tensor = sess.graph.get_tensor_by_name( vggish_params.INPUT_TENSOR_NAME) embedding_tensor = sess.graph.get_tensor_by_name( vggish_params.OUTPUT_TENSOR_NAME) count = 0 for batch in batches: # Run inference and postprocessing. [embedding_batch] = sess.run([embedding_tensor], feed_dict={features_tensor: batch}) postprocessed_batch = pproc.postprocess(embedding_batch) output_sequences.append(postprocessed_batch) count += 1 if count % 100 == 0:
def model(learning_rate=vggish_params.LEARNING_RATE, training=None):
    """Build VGGish plus a conv/FC classifier head, loss, and accuracy ops.

    The head (under the ``mymodel`` scope) is: 3x3 conv with 1024 filters,
    2x2 average pooling, flatten, a linear 512-unit layer, batch norm, a ReLU
    layer of ``vggish_params.EMBEDDING_SIZE`` units, batch norm, and a linear
    ``logits`` layer with ``params.NUM_CLASSES`` outputs.

    Args:
        learning_rate: Learning rate passed to ``tf.train.AdamOptimizer``.
        training: Passed to ``vggish_slim.define_vggish_slim``.  ``None``
            (the default) means "use ``FLAGS.train_vggish``", looked up when
            the function is called.  Previously the flag was read in the
            signature, i.e. once at import time, which forces flag access
            before the program's normal flag parsing has run.

    Returns:
        A tuple ``(graph, accuracy, softmax_prediction)`` where ``accuracy``
        is the mean of argmax(logits) == argmax(labels) over the batch and
        ``softmax_prediction`` is the ``tf.nn.top_k(..., k=5)`` result on the
        softmax of the logits.
    """
    if training is None:
        training = FLAGS.train_vggish

    graph = tf.Graph()
    with graph.as_default():
        # Define VGGish.
        embeddings = vggish_slim.define_vggish_slim(training)

        with tf.variable_scope("mymodel"):
            conv1 = slim.conv2d(embeddings, 1024, scope="conv1",
                                kernel_size=[3, 3], stride=1, padding='SAME')
            pool1 = slim.avg_pool2d(conv1, scope='pool1', kernel_size=[2, 2],
                                    stride=2, padding='SAME')
            pool1 = slim.flatten(pool1)

            fc1 = tf.contrib.layers.fully_connected(
                inputs=pool1, num_outputs=512, activation_fn=None,
                scope="fc1")
            # NOTE(review): `training` is not forwarded to the batch-norm
            # layers, so they use the layer's own default mode -- confirm
            # this is intended.
            bn1 = tf.layers.batch_normalization(fc1, 1, name="batch_norm_1")
            fc2 = tf.contrib.layers.fully_connected(
                inputs=bn1, num_outputs=vggish_params.EMBEDDING_SIZE,
                activation_fn=tf.nn.relu, scope="fc2")
            bn2 = tf.layers.batch_normalization(fc2, 1, name="batch_norm_2")

            # Linear classifier layer producing one logit per class.
            logits = tf.contrib.layers.fully_connected(
                bn2, params.NUM_CLASSES, activation_fn=None, scope='logits')

            prediction = tf.argmax(logits, axis=1, name='prediction')
            softmax_prediction = tf.nn.softmax(
                logits, axis=1, name="softmax_prediction")
            softmax_prediction = tf.nn.top_k(softmax_prediction, k=5)

            # Add training ops.
            with tf.variable_scope('train'):
                global_step = tf.Variable(
                    0, name='global_step', trainable=False,
                    collections=[
                        tf.GraphKeys.GLOBAL_VARIABLES,
                        tf.GraphKeys.GLOBAL_STEP,
                    ])

                # Labels are fed as a float batch of shape
                # (batch, NUM_CLASSES); accuracy below takes their argmax.
                labels = tf.placeholder(
                    tf.float32, shape=(None, params.NUM_CLASSES),
                    name='labels')

                # Cross-entropy label loss.
                xent = tf.nn.softmax_cross_entropy_with_logits(
                    logits=logits, labels=labels, name='xent')
                loss = tf.reduce_mean(xent, name='loss_op')
                tf.summary.scalar('loss', loss)
                variable_summaries(loss)

                # We use the same optimizer and hyperparameters as used to
                # train VGGish.
                optimizer = tf.train.AdamOptimizer(
                    learning_rate=learning_rate,
                    epsilon=vggish_params.ADAM_EPSILON)
                optimizer.minimize(
                    loss, global_step=global_step, name='train_op')

                with tf.variable_scope("accuracy"):
                    with tf.variable_scope("correct_prediction"):
                        correct_prediction = tf.equal(
                            prediction, tf.argmax(labels, 1))
                    with tf.variable_scope("accuracy"):
                        accuracy = tf.reduce_mean(
                            tf.cast(correct_prediction, "float"))
                tf.summary.scalar("accuracy", accuracy)

    return graph, accuracy, softmax_prediction
def make_extract_vggish_embedding(frame_duration, hop_duration,
                                  input_op_name='vggish/input_features',
                                  output_op_name='vggish/embedding',
                                  embedding_size=128, resources_dir=None):
    """
    Creates a coroutine generator for extracting and saving VGGish embeddings

    The coroutine keeps one TensorFlow graph/session alive for its whole
    lifetime so the model is loaded only once.  After priming it with
    ``next(gen)``, send it ``(audio_path, output_path)`` tuples; for each one
    it computes the postprocessed embeddings as ``float32`` and dumps them to
    ``output_path`` through ``gzip``.  Pairs whose ``output_path`` already
    exists are skipped, as are audio files for which
    ``vggish_input.wavfile_to_examples`` raises ``ValueError``.

    The input/output tensors and the postprocessor do not depend on the file
    being processed, so they are created once before the receive loop rather
    than once per file.  As a consequence, a bad tensor name or a missing PCA
    parameter file is reported when the coroutine is primed.

    Parameters
    ----------
    frame_duration
        Forwarded to the VGGish helpers as ``frame_win_sec``.
    hop_duration
        Forwarded to the VGGish helpers as ``frame_hop_sec``.
    input_op_name
        Name of the input op; ``':0'`` is appended to get the tensor name.
    output_op_name
        Name of the embedding op; ``':0'`` is appended to get the tensor name.
    embedding_size
        Forwarded to the VGGish helpers as ``embedding_size``.
    resources_dir
        Directory holding ``vggish_pca_params.npz`` and ``vggish_model.ckpt``.
        Defaults to ``vggish/resources`` next to this module.

    Returns
    -------
    coroutine
    """
    params = {
        'frame_win_sec': frame_duration,
        'frame_hop_sec': hop_duration,
        'embedding_size': embedding_size
    }

    if not resources_dir:
        resources_dir = os.path.join(os.path.dirname(__file__),
                                     'vggish/resources')

    pca_params_path = os.path.join(resources_dir, 'vggish_pca_params.npz')
    model_path = os.path.join(resources_dir, 'vggish_model.ckpt')

    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False, **params)
            vggish_slim.load_vggish_slim_checkpoint(sess, model_path,
                                                    **params)

            features_tensor = sess.graph.get_tensor_by_name(
                input_op_name + ':0')
            embedding_tensor = sess.graph.get_tensor_by_name(
                output_op_name + ':0')

            # Prepare a postprocessor to munge the model embeddings.
            pproc = vggish_postprocess.Postprocessor(pca_params_path,
                                                     **params)

            while True:
                # We use a coroutine to more easily keep open the Tensorflow
                # contexts without having to constantly reload the model
                audio_path, output_path = (yield)

                if os.path.exists(output_path):
                    continue

                try:
                    examples_batch = vggish_input.wavfile_to_examples(
                        audio_path, **params)
                except ValueError:
                    print("Error opening {}. Skipping...".format(audio_path))
                    continue

                # Run inference and postprocessing.
                [embedding_batch] = sess.run(
                    [embedding_tensor],
                    feed_dict={features_tensor: examples_batch})

                emb = pproc.postprocess(
                    embedding_batch, **params).astype(np.float32)

                with gzip.open(output_path, 'wb') as f:
                    emb.dump(f)

    except GeneratorExit:
        # Raised by close(); leaving the `with` above has already released
        # the session, so there is nothing more to do.
        pass
def main(_):
    """Train a small classifier on top of VGGish for a fixed number of steps.

    Builds VGGish (trainable per ``FLAGS.train_vggish``), adds a 100-unit
    fully connected layer and a linear ``logits`` layer under ``mymodel``,
    attaches a softmax cross-entropy loss and an Adam ``train_op`` under
    ``mymodel/train``, initializes all variables, overlays the pre-trained
    VGGish checkpoint from ``FLAGS.checkpoint``, and then runs
    ``FLAGS.num_batches`` training steps, printing the loss after each.

    Args:
        _: Unused positional argument.
    """
    with tf.Graph().as_default(), tf.Session() as sess:
        # VGGish backbone.
        vggish_embeddings = vggish_slim.define_vggish_slim(FLAGS.train_vggish)

        # Shallow classification head plus its training ops.
        with tf.variable_scope('mymodel'):
            hidden_units = 100
            hidden = slim.fully_connected(vggish_embeddings, hidden_units)

            # Linear layer with one logit per class.
            class_logits = slim.fully_connected(
                hidden, _NUM_CLASSES, activation_fn=None, scope='logits')
            # Only registers the named op in the graph; the handle is unused.
            tf.sigmoid(class_logits, name='prediction')

            with tf.variable_scope('train'):
                step_collections = [
                    tf.GraphKeys.GLOBAL_VARIABLES,
                    tf.GraphKeys.GLOBAL_STEP,
                ]
                step_var = tf.Variable(
                    0, name='global_step', trainable=False,
                    collections=step_collections)

                # Float label batch of shape (batch, _NUM_CLASSES).
                labels_ph = tf.placeholder(
                    tf.float32, shape=(None, _NUM_CLASSES), name='labels')

                # Softmax cross-entropy, averaged over the batch.
                per_example_xent = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=class_logits, labels=labels_ph, name='xent')
                mean_loss = tf.reduce_mean(per_example_xent, name='loss_op')
                tf.summary.scalar('loss', mean_loss)

                # Same optimizer and hyperparameters as used to train VGGish.
                adam = tf.train.AdamOptimizer(
                    learning_rate=vggish_params.LEARNING_RATE,
                    epsilon=vggish_params.ADAM_EPSILON)
                adam.minimize(mean_loss, global_step=step_var,
                              name='train_op')

        # Initialize everything first, then overwrite the VGGish variables
        # with the pre-trained checkpoint.
        sess.run(tf.global_variables_initializer())
        vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)

        # Fetch the tensors/ops needed by the loop by their graph names.
        graph = sess.graph
        features_tensor = graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        labels_tensor = graph.get_tensor_by_name('mymodel/train/labels:0')
        global_step_tensor = graph.get_tensor_by_name(
            'mymodel/train/global_step:0')
        loss_tensor = graph.get_tensor_by_name('mymodel/train/loss_op:0')
        train_op = graph.get_operation_by_name('mymodel/train/train_op')

        # The training loop.
        fetches = [global_step_tensor, loss_tensor, train_op]
        for _batch_index in range(FLAGS.num_batches):
            batch_features, batch_labels = _get_examples_batch()
            step_count, loss_value, _train_result = sess.run(
                fetches,
                feed_dict={
                    features_tensor: batch_features,
                    labels_tensor: batch_labels,
                })
            print('Step %d: loss %g' % (step_count, loss_value))
def main(_):
    """Run one audio file through VGGish and print/store its embeddings.

    If ``FLAGS.wav_file`` is set that file is used; otherwise a 5 second,
    1 kHz sine wave at 44.1 kHz is synthesized into an in-memory WAV.  The
    examples, the raw embeddings, the postprocessed embeddings and the
    resulting ``SequenceExample`` are printed.  When ``FLAGS.tfrecord_file``
    is set, the serialized ``SequenceExample`` is also written there.

    The record writer is closed in a ``finally`` block so the output file is
    released even if loading the checkpoint or running inference raises.

    Args:
        _: Unused positional argument.
    """
    # In this simple example, we run the examples from a single audio file
    # through the model. If none is provided, we generate a synthetic input.
    if FLAGS.wav_file:
        wav_file = FLAGS.wav_file
    else:
        # Write a WAV of a sine wav into an in-memory file object.
        num_secs = 5
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)
        # Convert to signed 16-bit samples.
        samples = np.clip(x * 32768, -32768, 32767).astype(np.int16)
        wav_file = six.BytesIO()
        wavfile.write(wav_file, sr, samples)
        wav_file.seek(0)

    examples_batch = vggish_input.wavfile_to_examples(wav_file)
    print(examples_batch)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(FLAGS.pca_params)

    # If needed, prepare a record writer to store the postprocessed
    # embeddings.
    writer = tf.python_io.TFRecordWriter(
        FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    try:
        with tf.Graph().as_default(), tf.Session() as sess:
            # Define the model in inference mode, load the checkpoint, and
            # locate input and output tensors.
            vggish_slim.define_vggish_slim(training=False)
            vggish_slim.load_vggish_slim_checkpoint(sess, FLAGS.checkpoint)
            features_tensor = sess.graph.get_tensor_by_name(
                vggish_params.INPUT_TENSOR_NAME)
            embedding_tensor = sess.graph.get_tensor_by_name(
                vggish_params.OUTPUT_TENSOR_NAME)

            # Run inference and postprocessing.
            [embedding_batch] = sess.run(
                [embedding_tensor],
                feed_dict={features_tensor: examples_batch})
            print(embedding_batch)
            postprocessed_batch = pproc.postprocess(embedding_batch)
            print(postprocessed_batch)

            # Write the postprocessed embeddings as a SequenceExample: one
            # bytes-valued feature per row of the postprocessed batch, all
            # stored under vggish_params.AUDIO_EMBEDDING_FEATURE_NAME.
            seq_example = tf.train.SequenceExample(
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                            tf.train.FeatureList(feature=[
                                tf.train.Feature(
                                    bytes_list=tf.train.BytesList(
                                        value=[embedding.tobytes()]))
                                for embedding in postprocessed_batch
                            ])
                    }))
            print(seq_example)
            if writer:
                writer.write(seq_example.SerializeToString())
    finally:
        if writer:
            writer.close()