Example #1
def extract_vggish_embedding(audio_data, fs):
    examples_batch = vggish_input.waveform_to_examples(audio_data, fs)

    # Prepare a postprocessor to munge the model embeddings.
    pproc = vggish_postprocess.Postprocessor(PCA_PARAMS_PATH)

    # If needed, prepare a record writer to store the postprocessed embeddings.
    #writer = tf.python_io.TFRecordWriter(
    #    FLAGS.tfrecord_file) if FLAGS.tfrecord_file else None

    with tf.Graph().as_default(), tf.Session() as sess:
        # Define the model in inference mode, load the checkpoint, and
        # locate input and output tensors.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, MODEL_PATH)
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)

        # Run inference and postprocessing.
        [embedding_batch] = sess.run(
            [embedding_tensor],
            feed_dict={features_tensor: examples_batch})
        postprocessed_batch = pproc.postprocess(embedding_batch)

        # Write the postprocessed embeddings as a SequenceExample, in a similar
        # format as the features released in AudioSet. Each row of the batch of
        # embeddings corresponds to roughly a second of audio (96 10ms frames), and
        # the rows are written as a sequence of bytes-valued features, where each
        # feature value contains the 128 bytes of the whitened quantized embedding.
        # (The actual write step is omitted in this example; see the sketch after
        # this function.)

    return postprocessed_batch
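
The comment above describes the AudioSet release format, but the actual write step is disabled in this example. A minimal sketch of what it could look like, using a hypothetical helper name, TensorFlow 1.x's tf.python_io.TFRecordWriter, and AUDIO_EMBEDDING_FEATURE_NAME from the AudioSet vggish_params module:

def write_embeddings_as_sequence_example(postprocessed_batch, tfrecord_path):
    # Each row of the batch becomes one bytes-valued feature holding the 128
    # quantized embedding bytes for roughly one second of audio.
    seq_example = tf.train.SequenceExample(
        feature_lists=tf.train.FeatureLists(
            feature_list={
                vggish_params.AUDIO_EMBEDDING_FEATURE_NAME:
                    tf.train.FeatureList(feature=[
                        tf.train.Feature(
                            bytes_list=tf.train.BytesList(
                                value=[embedding.tobytes()]))
                        for embedding in postprocessed_batch
                    ])
            }))
    writer = tf.python_io.TFRecordWriter(tfrecord_path)
    writer.write(seq_example.SerializeToString())
    writer.close()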

    def extract(self, audio_path, sc_start, sc_end):
        """Extracts a postprocessed VGGish embedding for a num_secs-second window
        centered on the [sc_start, sc_end] interval (given in milliseconds)."""
        wav_data, sr = sf.read(audio_path, dtype='int16')
        assert wav_data.dtype == np.int16, 'Bad sample type: %r' % wav_data.dtype
        samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]

        sc_center = self.time_to_sample((sc_start + sc_end) / 2, sr, 1000.0)
        # print('Center is {} when sample_rate is {}'.format(sc_center, sr))
        data_length = len(samples)
        data_width = self.time_to_sample(self.num_secs, sr, 1.0)
        half_input_width = int(data_width / 2)
        if sc_center < half_input_width:
            # Window runs past the start of the clip: zero-pad on the left.
            # (The pad spec assumes a 2-D (samples, channels) waveform array.)
            pad_width = half_input_width - sc_center
            samples = np.pad(samples, [(pad_width, 0), (0, 0)], mode='constant', constant_values=0)
            sc_center += pad_width
        elif sc_center + half_input_width > data_length:
            # Window runs past the end of the clip: zero-pad on the right.
            pad_width = sc_center + half_input_width - data_length
            samples = np.pad(samples, [(0, pad_width), (0, 0)], mode='constant', constant_values=0)
        # Crop a window of data_width samples centered on the sound event.
        samples = samples[sc_center - half_input_width: sc_center + half_input_width]
        input_batch = vggish_input.waveform_to_examples(samples, sr)
        [embedding_batch] = self.sess.run([self.embedding_tensor], feed_dict={self.features_tensor: input_batch})

        pproc = vggish_postprocess.Postprocessor(self.pca_path)
        postprocessed_batch = pproc.postprocess(embedding_batch)
        return postprocessed_batch
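
Both this method (as self.time_to_sample) and the inference example further below rely on a time_to_sample helper that is not shown. A plausible sketch of it, assuming `time` is expressed in units of 1/time_unit seconds (e.g. time_unit=1000.0 for milliseconds, 1.0 for seconds):

def time_to_sample(time, sample_rate, time_unit):
    # Convert a time value into an integer sample index at the given rate.
    return int(round(time * sample_rate / time_unit))
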
def _get_examples_batch():
  """Returns a shuffled batch of examples of all audio classes.

  Note that this is just a toy function because this is a simple demo intended
  to illustrate how the training code might work.

  Returns:
    a tuple (features, labels) where features is a NumPy array of shape
    [batch_size, num_frames, num_bands] where the batch_size is variable and
    each row is a log mel spectrogram patch of shape [num_frames, num_bands]
    suitable for feeding VGGish, while labels is a NumPy array of shape
    [batch_size, num_classes] where each row is a multi-hot label vector that
    provides the labels for corresponding rows in features.
  """
  # Make a waveform for each class.
  num_seconds = 5
  sr = 44100  # Sampling rate.
  t = np.linspace(0, num_seconds, int(num_seconds * sr))  # Time axis.
  # Random sine wave.
  freq = np.random.uniform(100, 1000)
  sine = np.sin(2 * np.pi * freq * t)
  # Random linear ramp (the "const" class in this demo).
  magnitude = np.random.uniform(-1, 1)
  const = magnitude * t
  # Gaussian noise (mean -1, std 1).
  noise = np.random.normal(-1, 1, size=t.shape)

  # Make examples of each signal and corresponding labels.
  # Sine is class index 0, Const class index 1, Noise class index 2.
  sine_examples = vggish_input.waveform_to_examples(sine, sr)
  sine_labels = np.array([[1, 0, 0]] * sine_examples.shape[0])
  const_examples = vggish_input.waveform_to_examples(const, sr)
  const_labels = np.array([[0, 1, 0]] * const_examples.shape[0])
  noise_examples = vggish_input.waveform_to_examples(noise, sr)
  noise_labels = np.array([[0, 0, 1]] * noise_examples.shape[0])

  # Shuffle (example, label) pairs across all classes.
  all_examples = np.concatenate((sine_examples, const_examples, noise_examples))
  all_labels = np.concatenate((sine_labels, const_labels, noise_labels))
  labeled_examples = list(zip(all_examples, all_labels))
  shuffle(labeled_examples)

  # Separate and return the features and labels, stacked back into NumPy
  # arrays so the shapes match the docstring.
  features = np.array([example for (example, _) in labeled_examples])
  labels = np.array([label for (_, label) in labeled_examples])
  return (features, labels)
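
A quick shape check of the toy batch (hypothetical usage; it assumes `from random import shuffle` is in scope and the default vggish_params of 96 frames by 64 mel bands per example):

features, labels = _get_examples_batch()
print(features.shape)  # (num_examples, 96, 64)
print(labels.shape)    # (num_examples, 3)
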
def inference(hostport, work_dir, concurrency, num_tests):

    audio_path = 'test_DB/test_airport.wav'
    num_secs = 1
    sc_start = 0
    sc_end = 2000

    wav_data, sr = sf.read(audio_path, dtype='int16')
    assert wav_data.dtype == numpy.int16, 'Bad sample type: %r' % wav_data.dtype
    samples = wav_data / 32768.0  # Convert to [-1.0, +1.0]

    sc_center = time_to_sample((sc_start + sc_end) / 2, sr, 1000.0)
    # print('Center is {} when sample_rate is {}'.format(sc_center, sr))
    data_length = len(samples)
    data_width = time_to_sample(num_secs, sr, 1.0)
    half_input_width = int(data_width / 2)
    if sc_center < half_input_width:
        # Window runs past the start of the clip: zero-pad on the left.
        pad_width = half_input_width - sc_center
        samples = numpy.pad(samples, [(pad_width, 0), (0, 0)],
                            mode='constant',
                            constant_values=0)
        sc_center += pad_width
    elif sc_center + half_input_width > data_length:
        # Window runs past the end of the clip: zero-pad on the right.
        pad_width = sc_center + half_input_width - data_length
        samples = numpy.pad(samples, [(0, pad_width), (0, 0)],
                            mode='constant',
                            constant_values=0)
    # Crop a window of data_width samples centered on the sound event.
    samples = samples[sc_center - half_input_width:sc_center + half_input_width]
    audio_input = vggish_input.waveform_to_examples(samples, sr)
    print(audio_input.dtype)
    audio_input = audio_input.astype(numpy.float32)
    # Open a gRPC channel to the serving host and issue asynchronous Predict
    # requests, throttled to at most `concurrency` outstanding RPCs.
    channel = grpc.insecure_channel(hostport)
    stub = prediction_service_pb2_grpc.PredictionServiceStub(channel)
    result_counter = _ResultCounter(num_tests, concurrency)
    for _ in range(num_tests):
        request = predict_pb2.PredictRequest()
        request.model_spec.name = 'vgg'
        request.model_spec.signature_name = 'prediction'
        print(audio_input.shape)
        request.inputs['input'].CopyFrom(
            tf.contrib.util.make_tensor_proto(audio_input,
                                              shape=audio_input.shape))
        result_counter.throttle()
        result_future = stub.Predict.future(request, 5.0)
        result_future.add_done_callback(
            _create_rpc_callback(None, result_counter))
    return result_counter.get_throughput()
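
The client above depends on _ResultCounter and _create_rpc_callback helpers that are not shown; they follow the asynchronous-client pattern from the TensorFlow Serving example clients. A hedged sketch of the callback factory, assuming a _ResultCounter with inc_error/inc_done/dec_active methods and a serving signature whose output tensor is keyed 'output':

def _create_rpc_callback(label, result_counter):
    """Returns a callback that collects the embedding and updates the counter."""
    def _callback(result_future):
        exception = result_future.exception()
        if exception:
            result_counter.inc_error()
            print(exception)
        else:
            embedding = numpy.array(
                result_future.result().outputs['output'].float_val)
            print('Received embedding with {} values'.format(embedding.size))
        result_counter.inc_done()
        result_counter.dec_active()
    return _callback
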
def main():
    # Initialize the PyTorch model.
    device = 'cuda:0'
    pytorch_model = VGGish()
    pytorch_model.load_state_dict(torch.load('pytorch_vggish.pth'))
    pytorch_model = pytorch_model.to(device)

    # Generate a sample input (as in the AudioSet repo smoke test).
    num_secs = 3
    freq = 1000
    sr = 44100
    t = np.linspace(0, num_secs, int(num_secs * sr))
    x = np.sin(2 * np.pi * freq * t)

    # Produce a batch of log mel spectrogram examples.
    input_batch = vggish_input.waveform_to_examples(x, sr)
    input_batch = torch.from_numpy(input_batch).unsqueeze(dim=1)
    input_batch = input_batch.float().to(device)

    # Run the PyTorch model.
    pytorch_output = pytorch_model(input_batch)
    pytorch_output = pytorch_output.detach().cpu().numpy()
    print('Input Shape:', tuple(input_batch.shape))
    print('Output Shape:', tuple(pytorch_output.shape))

    expected_embedding_mean = 0.131
    expected_embedding_std = 0.238
    print('Computed Embedding Mean and Standard Deviation:',
          np.mean(pytorch_output), np.std(pytorch_output))
    print('Expected Embedding Mean and Standard Deviation:',
          expected_embedding_mean, expected_embedding_std)

    # Post-processing.
    post_processor = vggish_postprocess.Postprocessor('vggish_pca_params.npz')
    postprocessed_output = post_processor.postprocess(pytorch_output)
    expected_postprocessed_mean = 123.0
    expected_postprocessed_std = 75.0
    print('Computed Post-processed Embedding Mean and Standard Deviation:',
          np.mean(postprocessed_output), np.std(postprocessed_output))
    print('Expected Post-processed Embedding Mean and Standard Deviation:',
          expected_postprocessed_mean, expected_postprocessed_std)
def load_input(filename, mono=True):
    """
    Extract input features
    Parameters
    ----------
    filename : str
    Yields
    -------
    dict of str: np.array
    """
    y, sr = psf.read(filename)
    if mono:
        y = librosa.to_mono(y.T)
    y = resampy.resample(y,
                         sr,
                         vggish_params.SAMPLE_RATE,
                         filter='kaiser_fast')
    # Peak-normalize to [-1.0, +1.0].
    y /= np.max(np.abs(y))

    print('{}: {:.2f} seconds of audio'.format(
        filename, y.shape[0] / vggish_params.SAMPLE_RATE))
    return vggish_input.waveform_to_examples(
        y, vggish_params.SAMPLE_RATE).astype(np.float32)
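
Hypothetical usage, assuming the default vggish_params (0.96-second examples of shape 96 x 64) and a placeholder 'example.wav' path:

examples = load_input('example.wav')
print(examples.shape)  # (num_examples, 96, 64)
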
    def convert_waveform_to_embedding(self, waveform, sample_rate):
        samples = waveform / 32768.0  # Convert to [-1.0, +1.0]
        examples_batch = vggish_input.waveform_to_examples(
            samples, sample_rate)
        return self.convert_examples_to_embedding(examples_batch)
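
    # The method above delegates to convert_examples_to_embedding, which is not
    # shown. A minimal sketch, assuming the class holds a live TF session and the
    # VGGish input/output tensors as attributes (self.sess, self.features_tensor,
    # self.embedding_tensor):
    def convert_examples_to_embedding(self, examples_batch):
        # Run VGGish inference on the batch of log mel spectrogram examples.
        [embedding_batch] = self.sess.run(
            [self.embedding_tensor],
            feed_dict={self.features_tensor: examples_batch})
        return embedding_batch
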
def main():
    with tf.Graph().as_default(), tf.Session() as sess:
        # -------------------
        # Step 1
        # -------------------
        # Load the model.
        vggish_slim.define_vggish_slim(training=False)
        vggish_slim.load_vggish_slim_checkpoint(sess, 'vggish_model.ckpt')

        # Get all of the variables, and use them to construct a dictionary that
        # maps each variable's name to its value.
        variables = tf.global_variables()
        variables = [x.name for x in variables]
        variable_values = sess.run(variables)
        variable_dict = dict(zip(variables, variable_values))

        # Create a new state dictionary which maps the TensorFlow weights to
        # those in the new PyTorch model.
        pytorch_model = VGGish()
        pytorch_feature_dict = pytorch_model.features.state_dict()
        pytorch_fc_dict = pytorch_model.fc.state_dict()

        # -------------------
        # Step 2
        # -------------------
        # There is a bias and a weight vector for each convolution and each fully
        # connected layer. The weights are not necessarily stored in the same format
        # and order between the two frameworks; in the TensorFlow checkpoint, the 12
        # vectors for the convolution layers come first, followed by the 6 vectors
        # for the fully connected layers.
        tf_feature_names = list(variable_dict.keys())[:-6]
        tf_fc_names = list(variable_dict.keys())[-6:]

        def to_pytorch_tensor(weights):
            if len(weights.shape) == 4:
                tensor = torch.from_numpy(weights.transpose(3, 2, 0,
                                                            1)).float()
            else:
                tensor = torch.from_numpy(weights.T).float()
            return tensor

        # Convert the weights for the convolution layers.
        for tf_name, pytorch_name in zip(tf_feature_names,
                                         pytorch_feature_dict.keys()):
            print(
                f'Converting [{tf_name}] ---------->  [feature.{pytorch_name}]'
            )
            pytorch_feature_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # Convert the weights for the FC layers.
        for tf_name, pytorch_name in zip(tf_fc_names, pytorch_fc_dict.keys()):
            print(f'Converting [{tf_name}] ---------->  [fc.{pytorch_name}]')
            pytorch_fc_dict[pytorch_name] = to_pytorch_tensor(
                variable_dict[tf_name])

        # -------------------
        # Step 3
        # -------------------
        # Load the new state dictionaries into the PyTorch model.
        pytorch_model.features.load_state_dict(pytorch_feature_dict)
        pytorch_model.fc.load_state_dict(pytorch_fc_dict)

        # -------------------
        # Step 4
        # -------------------
        # Generate a sample input (as in the AudioSet repo smoke test).
        num_secs = 3
        freq = 1000
        sr = 44100
        t = np.linspace(0, num_secs, int(num_secs * sr))
        x = np.sin(2 * np.pi * freq * t)

        # Produce a batch of log mel spectrogram examples.
        input_batch = vggish_input.waveform_to_examples(x, sr)

        # Run inference on the TensorFlow model.
        features_tensor = sess.graph.get_tensor_by_name(
            vggish_params.INPUT_TENSOR_NAME)
        embedding_tensor = sess.graph.get_tensor_by_name(
            vggish_params.OUTPUT_TENSOR_NAME)
        [tf_output] = sess.run([embedding_tensor],
                               feed_dict={features_tensor: input_batch})

        # Run on the PyTorch model.
        pytorch_model = pytorch_model.to('cpu')
        pytorch_output = pytorch_model(
            torch.from_numpy(input_batch).unsqueeze(dim=1).float())
        pytorch_output = pytorch_output.detach().numpy()

        # -------------------
        # Step 5
        # -------------------
        # Compare the difference between the outputs.
        diff = np.linalg.norm(pytorch_output - tf_output)**2
        print(f'Distance between TensorFlow and PyTorch outputs: [{diff}]')
        assert diff < 1e-6

        # Run a smoke test.
        expected_embedding_mean = 0.131
        expected_embedding_std = 0.238

        # Verify the TF output.
        np.testing.assert_allclose(
            [np.mean(tf_output), np.std(tf_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # Verify the PyTorch output.
        np.testing.assert_allclose(
            [np.mean(pytorch_output),
             np.std(pytorch_output)],
            [expected_embedding_mean, expected_embedding_std],
            rtol=0.001)

        # -------------------
        # Step 6
        # -------------------
        print(
            'Smoke test passed! Saving PyTorch weights to "pytorch_vggish.pth".'
        )
        torch.save(pytorch_model.state_dict(), 'pytorch_vggish.pth')
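
The conversion script and the PyTorch examples above assume a VGGish module whose `features` and `fc` children mirror the TensorFlow layer ordering (six convolution layers, then three fully connected layers ending in the 128-D embedding). A minimal sketch of such a module; the layer sizes follow the published VGGish architecture, but this exact class layout is an assumption:

import torch.nn as nn

class VGGish(nn.Module):
    """VGG-like network mapping a 1 x 96 x 64 log mel patch to a 128-D embedding."""

    def __init__(self):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(1, 64, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(256, 512, 3, padding=1), nn.ReLU(inplace=True),
            nn.Conv2d(512, 512, 3, padding=1), nn.ReLU(inplace=True),
            nn.MaxPool2d(2, 2))
        self.fc = nn.Sequential(
            nn.Linear(512 * 6 * 4, 4096), nn.ReLU(inplace=True),
            nn.Linear(4096, 4096), nn.ReLU(inplace=True),
            nn.Linear(4096, 128), nn.ReLU(inplace=True))

    def forward(self, x):
        x = self.features(x)
        # Flatten in NHWC order so converted TensorFlow FC weights line up.
        x = x.permute(0, 2, 3, 1).contiguous().view(x.size(0), -1)
        return self.fc(x)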