def extract_vggish_embedding(audio_data, fs):
    examples_batch = vggish_input.waveform_to_examples(audio_data, fs)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(PCA_PARAMS_PATH)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    #writer = tf.python_io.TFRecordWriter(
    #    FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, MODEL_PATH)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run([embedding_tensor],
                                     feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)

        # Each row of the batch of embeddings corresponds to roughly a second of
        # audio (96 10 ms frames), and each row contains the 128 bytes of the
        # whitened, quantized embedding.
        return postprocessed_batch
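# A minimal usage sketch for extract_vggish_embedding() above. The WAV path and
# the soundfile-based loading are assumptions, not part of the original snippet,
# which only shows the extraction function itself.
import soundfile as sf

wav_data, fs = sf.read('example.wav', dtype='int16')   # hypothetical input file
audio_data = wav_data / 32768.0                        # int16 PCM -> [-1.0, +1.0]
embeddings = extract_vggish_embedding(audio_data, fs)  # one 128-byte row per ~1 s of audio
print(embeddings.shape)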
def extract(self, audio_path, sc_start, sc_end):
    wav_data, sr = sf.read(audio_path, dtype='int16')
    assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    sc_center = self.time_to_sample((sc_start + sc_end) / 2, sr, 1000.0)
    # print('Center is {} when sample_rate is {}'.format(sc_center, sr))
    data_length = len(samples)
    data_width = self.time_to_sample(self.num_secs, sr, 1.0)
    half_input_width = int(data_width / 2)
    if sc_center < half_input_width:
        pad_width = half_input_width - sc_center
        samples = np.pad(samples, [(pad_width, 0), (0, 0)],
                         mode='constant',
                         constant_values=0)
        sc_center += pad_width
    elif sc_center + half_input_width > data_length:
        pad_width = sc_center + half_input_width - data_length
        samples = np.pad(samples, [(0, pad_width), (0, 0)],
                         mode='constant',
                         constant_values=0)
    samples = samples[sc_center - half_input_width:sc_center + half_input_width]
    input_batch = vggish_input.waveform_to_examples(samples, sr)
    [embedding_batch] = self.sess.run(
        [self.embedding_tensor],
        feed_dict={self.features_tensor: input_batch})
    pproc = vggish_postprocess.Postprocessor(self.pca_path)
    postprocessed_batch = pproc.postprocess(embedding_batch)
    return postprocessed_batch
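# extract() above (and inference() further below) call a time_to_sample()
# helper that is not shown. A plausible sketch, assuming the third argument is
# the scale of the time value (1000.0 when the time is given in milliseconds,
# 1.0 when it is given in seconds):
def time_to_sample(time, sample_rate, time_scale):
    # Convert a timestamp into a sample index at the given sample rate.
    return int(round(time / time_scale * sample_rate))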
def _get_examples_batch():
    """Returns a shuffled batch of examples of all audio classes.

    Note that this is just a toy function because this is a simple demo
    intended to illustrate how the training code might work.

    Returns:
      a tuple (features, labels) where features is a NumPy array of shape
      [batch_size, num_frames, num_bands] where the batch_size is variable and
      each row is a log mel spectrogram patch of shape [num_frames, num_bands]
      suitable for feeding VGGish, while labels is a NumPy array of shape
      [batch_size, num_classes] where each row is a multi-hot label vector that
      provides the labels for corresponding rows in features.
    """
    # Make a waveform for each class.
    num_seconds = 5
    sr = 44100  # Sampling rate.
    t = np.linspace(0, num_seconds, int(num_seconds * sr))  # Time axis.
    # Random sine wave.
    freq = np.random.uniform(100, 1000)
    sine = np.sin(2 * np.pi * freq * t)
    # Random constant signal.
    magnitude = np.random.uniform(-1, 1)
    const = magnitude * t
    # White noise.
    noise = np.random.normal(-1, 1, size=t.shape)

    # Make examples of each signal and corresponding labels.
    # Sine is class index 0, Const class index 1, Noise class index 2.
    sine_examples = vggish_input.waveform_to_examples(sine, sr)
    sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0])
    const_examples = vggish_input.waveform_to_examples(const, sr)
    const_labels = np.array([[0, 1, 0]] * const_examples.shape[0])
    noise_examples = vggish_input.waveform_to_examples(noise, sr)
    noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0])

    # Shuffle (example, label) pairs across all classes.
    all_examples = np.concatenate((sine_examples, const_examples, noise_examples))
    all_labels = np.concatenate((sine_labels, const_labels, noise_labels))
    labeled_examples = list(zip(all_examples, all_labels))
    shuffle(labeled_examples)

    # Separate and return the features and labels.
    features = [example for (example, _) in labeled_examples]
    labels = [label for (_, label) in labeled_examples]
    return (features, labels)
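# A hedged sketch of how the batch above might be consumed inside a training
# loop; sess, train_op, loss_tensor, features_tensor and labels_tensor are
# assumed to come from the surrounding training graph and are not defined in
# this example.
(features, labels) = _get_examples_batch()
[loss, _] = sess.run([loss_tensor, train_op],
                     feed_dict={features_tensor: features,
                                labels_tensor: labels})
print('loss %g' % loss)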
def inference(hostport, work_dir, concurrency, num_tests):
    audio_path = 'test_DB/test_airport.wav'
    num_secs = 1
    sc_start = 0
    sc_end = 2000
    wav_data, sr = sf.read(audio_path, dtype='int16')
    assert wav_data.dtype == numpy.int16, 'Bad sample type: %r' % wav_data.dtype
    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]
    sc_center = time_to_sample((sc_start + sc_end) / 2, sr, 1000.0)
    # print('Center is {} when sample_rate is {}'.format(sc_center, sr))
    data_length = len(samples)
    data_width = time_to_sample(num_secs, sr, 1.0)
    half_input_width = int(data_width / 2)
    if sc_center < half_input_width:
        pad_width = half_input_width - sc_center
        samples = numpy.pad(samples, [(pad_width, 0), (0, 0)],
                            mode='constant',
                            constant_values=0)
        sc_center += pad_width
    elif sc_center + half_input_width > data_length:
        pad_width = sc_center + half_input_width - data_length
        samples = numpy.pad(samples, [(0, pad_width), (0, 0)],
                            mode='constant',
                            constant_values=0)
    samples = samples[sc_center - half_input_width:sc_center + half_input_width]
    audio_input = vggish_input.waveform_to_examples(samples, sr)
    print(audio_input.dtype)
    audio_input = audio_input.astype(numpy.float32)

    channel = grpc.insecure_channel(hostport)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    result_counter = _ResultCounter(num_tests, concurrency)
    for _ in range(num_tests):
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'vgg'
        request.model_spec.signature_name = 'prediction'
        print(audio_input.shape)
        request.inputs['input'].CopyFrom(
            tf.contrib.util.make_tensor_proto(audio_input, shape=audio_input.shape))
        result_counter.throttle()
        result_future = stub.Predict.future(request, 5.0)
        result_future.add_done_callback(
            _create_rpc_callback(None, result_counter))
    return result_counter.get_throughput()
def main():
    # Initialize the PyTorch model.
    device = 'cuda:0'
    pytorch_model = VGGish()
    pytorch_model.load_state_dict(torch.load('pytorch_vggish.pth'))
    pytorch_model = pytorch_model.to(device)

    # Generate a sample input (as in the AudioSet repo smoke test).
    num_secs = 3
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)

    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.waveform_to_examples(x, sr)
    input_batch = torch.from_numpy(input_batch).unsqueeze(dim=1)
    input_batch = input_batch.float().to(device)

    # Run the PyTorch model.
    pytorch_output = pytorch_model(input_batch)
    pytorch_output = pytorch_output.detach().cpu().numpy()
    print('Input Shape:', tuple(input_batch.shape))
    print('Output Shape:', tuple(pytorch_output.shape))

    expected_embedding_mean = 0.131
    expected_embedding_std = 0.238
    print('Computed Embedding Mean and Standard Deviation:',
          np.mean(pytorch_output), np.std(pytorch_output))
    print('Expected Embedding Mean and Standard Deviation:',
          expected_embedding_mean, expected_embedding_std)

    # Post-processing.
    post_processor = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
    postprocessed_output = post_processor.postprocess(pytorch_output)
    expected_postprocessed_mean = 123.0
    expected_postprocessed_std = 75.0
    print('Computed Post-processed Embedding Mean and Standard Deviation:',
          np.mean(postprocessed_output), np.std(postprocessed_output))
    print('Expected Post-processed Embedding Mean and Standard Deviation:',
          expected_postprocessed_mean, expected_postprocessed_std)
def load_input(filename, mono=True):
    """Extract input features.

    Parameters
    ----------
    filename : str

    Returns
    -------
    np.ndarray
        Log mel spectrogram examples (float32) suitable for feeding VGGish.
    """
    y, sr = psf.read(filename)
    if mono:
        y = librosa.to_mono(y.T)
    y = resampy.resample(y, sr, vggish_params.SAMPLE_RATE, filter='kaiser_fast')
    y /= np.max(np.abs(y))
    print('{} duration {} s'.format(filename, y.shape[0] / vggish_params.SAMPLE_RATE))
    return vggish_input.waveform_to_examples(
        y, vggish_params.SAMPLE_RATE).astype(np.float32)
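# A short usage sketch for load_input() above; the file name is hypothetical.
# With the default vggish_params, each returned example is a log mel
# spectrogram patch of shape [96, 64].
examples = load_input('example.wav', mono=True)  # hypothetical file
print(examples.shape, examples.dtype)            # (num_examples, 96, 64) float32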
def convert_waveform_to_embedding(self, waveform, sample_rate):
    samples = waveform / 32768.0  # Convert to [-1.0, +1.0]
    examples_batch = vggish_input.waveform_to_examples(samples, sample_rate)
    return self.convert_examples_to_embedding(examples_batch)
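# A hedged usage sketch for convert_waveform_to_embedding() above; "converter"
# stands for a hypothetical instance of the enclosing class, and the WAV path
# is an assumption. The division by 32768.0 implies the waveform is expected
# as raw int16 PCM samples.
import soundfile as sf

wav_data, sample_rate = sf.read('example.wav', dtype='int16')  # hypothetical file
embedding = converter.convert_waveform_to_embedding(wav_data, sample_rate)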
def main():
    with tf.Graph().as_default(), tf.Session() as sess:
        # -------------------
        # Step 1
        # -------------------
        # Load the model.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

        # Get all of the variables, and use this to construct a dictionary which
        # maps the name of each variable to its value.
        variables = tf.all_variables()
        variables = [x.name for x in variables]
        variable_values = sess.run(variables)
        variable_dict = dict(zip(variables, variable_values))

        # Create a new state dictionary which maps the TensorFlow version of the
        # weights to those in the new PyTorch model.
        pytorch_model = VGGish()
        pytorch_feature_dict = pytorch_model.features.state_dict()
        pytorch_fc_dict = pytorch_model.fc.state_dict()

        # -------------------
        # Step 2
        # -------------------
        # There is a weight and a bias variable for each layer. They are not
        # necessarily stored in the same format and order between the two
        # frameworks; in the TensorFlow model, the 12 variables for the
        # convolution layers come first, followed by the 6 for the FC layers.
        tf_feature_names = list(variable_dict.keys())[:-6]
        tf_fc_names = list(variable_dict.keys())[-6:]

        def to_pytorch_tensor(weights):
            if len(weights.shape) == 4:
                # Convolution kernels: TensorFlow stores them as
                # [height, width, in_channels, out_channels]; PyTorch expects
                # [out_channels, in_channels, height, width].
                tensor = torch.from_numpy(weights.transpose(3, 2, 0, 1)).float()
            else:
                tensor = torch.from_numpy(weights.T).float()
            return tensor

        # Convert the weights for the convolution layers.
        for tf_name, pytorch_name in zip(tf_feature_names, pytorch_feature_dict.keys()):
            print(f'Converting [{tf_name}] ----------> [feature.{pytorch_name}]')
            pytorch_feature_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # Convert the weights for the FC layers.
        for tf_name, pytorch_name in zip(tf_fc_names, pytorch_fc_dict.keys()):
            print(f'Converting [{tf_name}] ----------> [fc.{pytorch_name}]')
            pytorch_fc_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # -------------------
        # Step 3
        # -------------------
        # Load the new state dictionaries into the PyTorch model.
        pytorch_model.features.load_state_dict(pytorch_feature_dict)
        pytorch_model.fc.load_state_dict(pytorch_fc_dict)

        # -------------------
        # Step 4
        # -------------------
        # Generate a sample input (as in the AudioSet repo smoke test).
        num_secs = 3
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)

        # Produce a batch of log mel spectrogram examples.
        input_batch = vggish_input.waveform_to_examples(x, sr)

        # Run inference on the TensorFlow model.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [tf_output] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})

        # Run on the PyTorch model.
        pytorch_model = pytorch_model.to('cpu')
        pytorch_output = pytorch_model(
            torch.from_numpy(input_batch).unsqueeze(dim=1).float())
        pytorch_output = pytorch_output.detach().numpy()

        # -------------------
        # Step 5
        # -------------------
        # Compare the difference between the outputs.
        diff = np.linalg.norm(pytorch_output - tf_output) ** 2
        print(f'Distance between TensorFlow and PyTorch outputs: [{diff}]')
        assert diff < 1e-6

        # Run a smoke test.
        expected_embedding_mean = 0.131
        expected_embedding_std = 0.238

        # Verify the TF output.
        np.testing.assert_allclose(
            [np.mean(tf_output), np.std(tf_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # Verify the PyTorch output.
        np.testing.assert_allclose(
            [np.mean(pytorch_output), np.std(pytorch_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # -------------------
        # Step 6
        # -------------------
        print('Smoke test passed! Saving PyTorch weights to "pytorch_vggish.pth".')
        torch.save(pytorch_model.state_dict(), 'pytorch_vggish.pth')