Example #1
def prepare_input(filename):
    from tensorflow.contrib.framework.python.ops import audio_ops
    from tensorflow.python.ops import io_ops

    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                            desired_channels=1,
                            desired_samples=16000,
                            name='decoded_sample_data')

        spectrum = audio_ops.audio_spectrogram(input=wav_decoder[0],
                                            window_size=640,
                                            stride=320,
                                            magnitude_squared=True,
                                            name='AudioSpectrogram')
        final = audio_ops.mfcc(spectrogram=spectrum, 
                               sample_rate=wav_decoder[1], 
                               upper_frequency_limit=4000.0, 
                               lower_frequency_limit=20.0, 
                               filterbank_channel_count=40, 
                               dct_coefficient_count=10, 
                               name='Mfcc')

        data = sess.run(final,
        feed_dict={wav_filename_placeholder: filename})
        print(f'Data shape: {data.shape}')

    return data
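A minimal usage sketch for the example above (the file name is hypothetical, and a module-level import tensorflow as tf is assumed): with 16000 samples, window_size=640 and stride=320 the spectrogram has 1 + (16000 - 640) // 320 = 49 frames, so the printed MFCC shape should be (1, 49, 10).

import tensorflow as tf  # TF 1.x, as assumed by the snippet above

mfcc_data = prepare_input('yes_16khz_mono.wav')  # hypothetical 1-second, 16 kHz WAV
assert mfcc_data.shape == (1, 49, 10)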
Example #2
def _make_spect(file_name):
    audio_binary = tf.read_file(file_name)
    waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)

    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=1024,
                                              stride=64)

    # Tensorflow spectrogram has time along y axis and frequencies along x axis
    # flip them
    spectrogram = tf.image.flip_left_right(spectrogram)
    spectrogram = tf.transpose(spectrogram, [0, 2, 1])
    spectrogram = tf.expand_dims(spectrogram, -1)  #add color channel
    spectrogram = tf.image.resize_bilinear(
        spectrogram, (spectrogram.shape[1], spectrogram.shape[1]))
    spectrogram = tf.squeeze(spectrogram, 0)
    spectrogram = spectrogram - mean_value

    one_hot = []

    for c in _classes:
        match = tf.strings.regex_full_match(file_name,
                                            ".*" + c + ".*",
                                            name="find_" + c)
        one_hot.append(match)

    one_hot = tf.cast(tf.stack(one_hot), tf.int32)

    return {"spectrogram": spectrogram, "label": one_hot}
Example #3
    def SpecWithARGS(self, inp, out):
        wav_file = tf.placeholder(tf.string)
        audio_binary = tf.read_file(wav_file)
        waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)
        spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                                  window_size=1024,
                                                  stride=64)
        brightness = tf.placeholder(tf.float32, shape=[])
        mul = tf.multiply(spectrogram, brightness)
        min_const = tf.constant(255.)
        minimum = tf.minimum(mul, min_const)
        expand_dims = tf.expand_dims(minimum, -1)
        resize = tf.image.resize_bilinear(expand_dims, [512, 512])
        squeeze = tf.squeeze(resize, 0)
        flip = tf.image.flip_left_right(squeeze)
        transpose = tf.image.transpose_image(flip)
        grayscale = tf.image.grayscale_to_rgb(transpose)
        cast = tf.cast(grayscale, tf.uint8)
        png = tf.image.encode_png(cast)
        with tf.Session() as sess:
            # Run the computation graph and save the png encoded image to a file
            image = sess.run(png,
                             feed_dict={
                                 wav_file:
                                 os.path.join(self.curworkdir, str(inp)),
                                 brightness: 100
                             })

            with open(os.path.join(self.curworkdir, str(out)), 'wb') as f:
                f.write(image)
Example #4
def wav_to_features(filenames_dataset, hparams, feature_count):
    dataset = filenames_dataset.map(lambda filename: io_ops.read_file(filename))
    dataset = dataset.map(lambda wav_loader: contrib_audio.decode_wav(wav_loader, desired_channels=1))
    dataset = dataset.map(lambda wav_decoder:
                                  (contrib_audio.audio_spectrogram(
                                      wav_decoder.audio,
                                      window_size=int(hparams.sample_rate * hparams.window_size_ms / 1000),
                                      stride=int(hparams.sample_rate * hparams.window_stride_ms / 1000),
                                      magnitude_squared=True), wav_decoder.sample_rate))
    dataset = dataset.map(lambda spectrogram, sample_rate: contrib_audio.mfcc(
        spectrogram, sample_rate,
        dct_coefficient_count=feature_count))
    dataset = dataset.map(lambda inputs: (
        inputs,
        tf.nn.moments(inputs, axes=[1])
    ))
    dataset = dataset.map(lambda inputs, moments: (
        tf.divide(tf.subtract(inputs, moments[0]), moments[1]),
        tf.shape(inputs)[1]
    ))
    dataset = dataset.map(lambda inputs, seq_len: (
        inputs[0],
        seq_len
    ))
    return dataset
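A hedged usage sketch for wav_to_features; the file names and hyper-parameter values below are made up, and the real hparams object comes from the surrounding project.

import tensorflow as tf
from collections import namedtuple

# Hypothetical hyper-parameters: 16 kHz audio, 25 ms windows, 10 ms hop.
HParams = namedtuple('HParams', ['sample_rate', 'window_size_ms', 'window_stride_ms'])
hparams = HParams(sample_rate=16000, window_size_ms=25.0, window_stride_ms=10.0)

filenames = tf.data.Dataset.from_tensor_slices(['a.wav', 'b.wav'])  # hypothetical paths
features = wav_to_features(filenames, hparams, feature_count=13)
mfcc, seq_len = features.make_one_shot_iterator().get_next()  # TF 1.x dataset API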
Example #5
    def _build_processing_graph(self):
        """Builds a TensorFlow graph to apply the input distortions.

            Creates a graph that loads a WAVE file, decodes it, scales the volume,
            shifts it in time, adds in background noise, calculates a spectrogram, and
            then builds an MFCC fingerprint from that.

            This must be called with an active TensorFlow session running, and it
            creates multiple placeholder inputs, and one output:

              - wav_filename_placeholder_: Filename of the WAV to load.
              - mfcc_: Output 2D fingerprint of processed audio.

            """
        with tf.name_scope('audio_processing'):
            desired_samples = self._model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)
            background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=self._model_settings['window_size_samples'],
                stride=self._model_settings['window_stride_samples'],
                magnitude_squared=True)
            self.mfcc_ = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=self._model_settings['dct_coefficient_count'])
Example #6
  def get_test_data(self, how_many, offset, model_settings, sess, features='mfcc'):

    candidates = self.data_index
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    desired_samples = model_settings['desired_samples']
    data = np.zeros((sample_count, model_settings['fingerprint_size']))

    wav_filename_placeholder = tf.placeholder(tf.string, [], name='wav_file_names')
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
      wav_decoder.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
      spectrogram,
      wav_decoder.sample_rate,
      dct_coefficient_count=model_settings['dct_coefficient_count'])


    for i in range(offset, offset + sample_count):
      input_dict = {wav_filename_placeholder: candidates[i]}
      if features == "spectrogram":
        data[i - offset, :] = sess.run(spectrogram, feed_dict=input_dict).flatten()
      elif features == "raw":
        data[i - offset, :] = sess.run(wav_decoder.audio, feed_dict=input_dict).flatten()
      else:
        data[i - offset, :] = sess.run(mfcc, feed_dict=input_dict).flatten()

    return data
Example #7
def mfcc_tensorflow(wavfile, _sr, frame_size, frame_shift, order=13):
    sess = tf.InteractiveSession()
    wav_filename_placeholder = tf.placeholder(tf.string, [])

    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    wav_data = wav_decoder.audio

    wav_sample_rate = sess.run(wav_decoder,
                               feed_dict={
                                   wav_filename_placeholder: wavfile
                               }).sample_rate
    check_sample_rate(wavfile, _sr, wav_sample_rate)

    spectrogram = contrib_audio.audio_spectrogram(wav_data,
                                                  window_size=frame_size,
                                                  stride=frame_shift,
                                                  magnitude_squared=True)

    mfcc_ = contrib_audio.mfcc(spectrogram,
                               wav_decoder.sample_rate,
                               dct_coefficient_count=order)

    mfcc_data = sess.run(mfcc_, feed_dict={wav_filename_placeholder: wavfile})

    return mfcc_data
Example #8
File: id.py  Project: cooledge/nn
def load_mfcc_file(sess, filename):
  filename_ph = tf.placeholder(tf.string)
  loader = io_ops.read_file(filename_ph)
  decoder = contrib_audio.decode_wav(loader, desired_channels=1, desired_samples=16000)
  spectrogram = contrib_audio.audio_spectrogram( decoder.audio, window_size=480, stride=160, magnitude_squared=True)
  mfcc = contrib_audio.mfcc( spectrogram, decoder.sample_rate, dct_coefficient_count=40)
  return sess.run(mfcc, feed_dict={filename_ph: filename})
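A minimal usage sketch for load_mfcc_file (the session setup and WAV path are hypothetical): with desired_samples=16000, window_size=480 and stride=160 there are 1 + (16000 - 480) // 160 = 98 frames, so the result should have shape (1, 98, 40).

import tensorflow as tf

with tf.Session() as sess:
    features = load_mfcc_file(sess, 'speech_sample.wav')  # hypothetical path
    print(features.shape)  # expected: (1, 98, 40)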
Example #9
def log_spec_tensorflow(wavfile, _sr, frame_size, frame_shift):
    sess = tf.InteractiveSession()
    wav_filename_placeholder = tf.placeholder(tf.string, [])

    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    wav_data = wav_decoder.audio

    wav_sample_rate = sess.run(wav_decoder,
                               feed_dict={
                                   wav_filename_placeholder: wavfile
                               }).sample_rate
    check_sample_rate(wavfile, _sr, wav_sample_rate)

    spectrogram = contrib_audio.audio_spectrogram(wav_data,
                                                  window_size=frame_size,
                                                  stride=frame_shift,
                                                  magnitude_squared=True)

    log_spectrogram = tf.log(spectrogram[0] + log_offset)

    log_spec_data = sess.run(log_spectrogram,
                             feed_dict={wav_filename_placeholder: wavfile})

    return np.transpose(log_spec_data)
Example #10
def audio_to_spectrogram(audio_contents,
                         width,
                         height,
                         channels=1,
                         window_size=1024,
                         stride=64,
                         brightness=100.):
    """Decode and build a spectrogram using a wav string tensor.

    Args:
      audio_contents: String tensor of the wav audio contents.
      width: Spectrogram width.
      height: Spectrogram height.
      channels: Audio channel count.
      window_size: Size of the spectrogram window.
      stride: Size of the spectrogram stride.
      brightness: Brightness of the spectrogram.

    Returns:
      0-D string Tensor with the image contents.
    """
    # Decode the wav mono into a 2D tensor with time in dimension 0
    # and channel along dimension 1
    waveform = audio_ops.decode_wav(audio_contents, desired_channels=channels)

    # Compute the spectrogram
    # FIXME: Seems like this is deprecated in tensorflow 2.0 and
    # the operation only works on CPU. Change this to tf.signal.stft
    # and  friends to take advantage of GPU kernels.
    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=window_size,
                                              stride=stride)

    # Adjust brightness
    brightness = tf.constant(brightness)

    # Normalize pixels
    mul = tf.multiply(spectrogram, brightness)
    min_const = tf.constant(255.)
    minimum = tf.minimum(mul, min_const)

    # Expand dims so we get the proper shape
    expand_dims = tf.expand_dims(minimum, -1)

    # Resize the spectrogram to input size of the model
    resize = tf.image.resize(expand_dims, [width, height])

    # Remove the trailing dimension
    squeeze = tf.squeeze(resize, 0)

    # Tensorflow spectrogram has time along y axis and frequencies along x axis
    # so we fix that
    flip_left_right = tf.image.flip_left_right(squeeze)
    transposed = tf.image.transpose(flip_left_right)

    # Cast to uint8 and encode as png
    cast = tf.cast(transposed, tf.uint8)

    # Encode tensor as a png image
    return tf.image.encode_png(cast)
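As the FIXME above notes, audio_ops.audio_spectrogram runs only on CPU and is effectively superseded in TensorFlow 2.x. A rough sketch of the equivalent magnitude spectrogram built with tf.signal is shown below; the function name is made up, and the exact windowing and scaling differ slightly from the contrib op, so treat it as a starting point rather than a drop-in replacement.

import tensorflow as tf

def audio_to_spectrogram_tf_signal(waveform_audio, window_size=1024, stride=64):
    # waveform_audio: [samples, 1] float32 tensor, as returned by decode_wav.
    signal = tf.squeeze(waveform_audio, axis=-1)   # -> [samples]
    stft = tf.signal.stft(signal,
                          frame_length=window_size,
                          frame_step=stride,
                          fft_length=window_size)
    magnitude = tf.abs(stft)                       # [frames, fft_bins]
    return tf.expand_dims(magnitude, 0)            # match the [1, frames, bins] layout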
Example #11
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        # wav_loader, desired_channels=1, desired_samples=desired_samples)
      wav_loader, desired_channels=1, desired_samples=16000)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    ######################  M F C C #################################
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #12
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list),
        sample_rate,
        clip_duration_ms,
        window_size_ms,
        window_stride_ms,
        dct_coefficient_count,
    )
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
Example #13
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #15
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

    return mfccs, tf.shape(mfccs)[0]
Example #16
def _load_sample(
        wav_filename,
        model_settings):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model: Name of the kind of model to generate.
    """
    wav_loader = io_ops.read_file(wav_filename)

    decoded_sample_data = contrib_audio.decode_wav(
        wav_loader,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')

    if model_settings['input_format'] == 'raw':
        print(decoded_sample_data.audio.shape)
        reshaped_input = tf.reshape(decoded_sample_data.audio, [
            -1, model_settings['desired_samples']
        ])
        print(reshaped_input.shape)
    else:
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)

        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            lower_frequency_limit=model_settings['lower_frequency_limit'],
            upper_frequency_limit=model_settings['upper_frequency_limit'],
            filterbank_channel_count=model_settings['filterbank_channel_count'],
            dct_coefficient_count=model_settings['dct_coefficient_count'])
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']

        reshaped_input = tf.reshape(fingerprint_input, [
            -1, fingerprint_time_size * fingerprint_frequency_size
        ])

    return reshaped_input
Example #17
    def _single_spectrogram(self, audio, window_size_samples,
                            window_stride_samples, magnitude_squared):
        # only accept single batch
        audio = tf.squeeze(audio, 0)

        spectrogram = contrib_audio.audio_spectrogram(
            audio,
            window_size=window_size_samples,
            stride=window_stride_samples,
            magnitude_squared=magnitude_squared)

        return spectrogram
Example #18
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  fingerprint_input = contrib_audio.mfcc(
      spectrogram,
      decoded_sample_data.sample_rate,
      dct_coefficient_count=dct_coefficient_count)
  fingerprint_frequency_size = model_settings['dct_coefficient_count']
  fingerprint_time_size = model_settings['spectrogram_length']
  reshaped_input = tf.reshape(fingerprint_input, [
      -1, fingerprint_time_size * fingerprint_frequency_size
  ])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Example #19
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
    window_size_samples = sample_rate * window_size_ms // 1000
    window_stride_samples = sample_rate * window_stride_ms // 1000
    spectrogram = contrib_audio.audio_spectrogram(
        audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(spectrogram,
                              sample_rate,
                              dct_coefficient_count=num_coefficients)
    return mfcc
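A usage sketch for AudioToMfcc with a synthetic input: at 16 kHz, a 30 ms window and 10 ms stride translate to 16000 * 30 // 1000 = 480 and 16000 * 10 // 1000 = 160 samples, giving 1 + (16000 - 480) // 160 = 98 frames.

import tensorflow as tf

# One second of silence, shaped [samples, channels] as audio_spectrogram expects.
audio = tf.zeros([16000, 1], dtype=tf.float32)
mfcc = AudioToMfcc(sample_rate=16000, audio=audio,
                   window_size_ms=30, window_stride_ms=10,
                   num_coefficients=40)  # -> shape [1, 98, 40]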
Example #20
  def __init__(self, sample_rate: int, dct_coef_count: int=-1):
    '''
    suppose the channel number is 1.
    '''
    assert sample_rate == 16_000
    if dct_coef_count == -1:
      dct_coef_count = DataGraphMFCC.max_mfcc_num
    else:
      assert dct_coef_count <= DataGraphMFCC.max_mfcc_num

    self._sample_rate = sample_rate
    samples_per_second = sample_rate / 1000
    window = int(DataGraphMFCC.window_duration * samples_per_second)
    stride = int(DataGraphMFCC.stride_duration * samples_per_second)

    self._graph = tf.Graph()
    with self._graph.as_default():
      self._in_wav_file = tf.placeholder(tf.string, [], name='wav_filename')
      self._in_frame_num = tf.placeholder(tf.int32, [])
      wav_loader = io_ops.read_file(self._in_wav_file)
      wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
      self._out_audio = tf.squeeze(wav_decoder.audio)
      self._out_sample_rate = wav_decoder.sample_rate

      self._in_audio = tf.placeholder(tf.float32, [None])
      in_audio = tf.expand_dims(self._in_audio, -1)

      audio_clamp = tf.clip_by_value(in_audio, -1.0, 1.0)
      spectrogram = contrib_audio.audio_spectrogram(
        audio_clamp,
        window_size=window,
        stride=stride,
        magnitude_squared=True)
      self._out_spectrogram = spectrogram

      feat_ts = contrib_audio.mfcc(
        spectrogram=spectrogram,
        sample_rate=sample_rate,
        dct_coefficient_count=dct_coef_count,
      )
      self._out_mfcc = feat_ts[0]
      self._out_real_mfcc_len = tf.shape(self._out_mfcc)[0]

      diff = tf.maximum(0, self._in_frame_num - self._out_real_mfcc_len)
      self._out_expanded_mfcc = tf.pad(
        self._out_mfcc,
        [[0, diff], [0, 0]],
      )[: self._in_frame_num]

    self._sess = tf.Session(graph=self._graph)
    print(f"DataGgraphMFCC graph is created!")
Example #21
    def build_graph(self):
        """ Graph to extract mfcc fingerprint given wav file

		  Here we add the necessary input & output tensors, to decode wav,
		  serialize mfcc fingerprint, restore from checkpoint etc.

		Returns:
		  input_wav_filename: A tensor containing wav filename as the input layer.
		  mfcc_fingerprint: The MFCC fingerprint tensor, that will be materialized later.
		"""

        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader,
            desired_channels=1,
            desired_samples=self.desired_samples)

        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)

        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [self.desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [self.desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=self.window_size_samples,
            stride=self.window_stride_samples,
            magnitude_squared=True)

        self.mfcc_fingerprint_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=self.opt.dct_coefficient_count)
Example #22
def wav_to_spectrogram(path):
    wav_file = tf.placeholder(tf.string)
    audio_binary = tf.read_file(wav_file)

    # Decode the wav mono into a 2D tensor with time in dimension 0
    # and channel along dimension 1
    waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)

    # Compute the spectrogram
    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=1024,
                                              stride=64)

    # Custom brightness
    brightness = tf.placeholder(tf.float32, shape=[])
    mul = tf.multiply(spectrogram, brightness)

    # Normalize pixels
    min_const = tf.constant(255.)
    minimum = tf.minimum(mul, min_const)

    # Expand dims so we get the proper shape
    expand_dims = tf.expand_dims(minimum, -1)

    # Remove the trailing dimension
    squeeze = tf.squeeze(expand_dims, 0)

    # Tensorflow spectrogram has time along y axis and frequencies along x axis
    flip = tf.image.flip_left_right(squeeze)
    transpose = tf.image.transpose_image(flip)

    # Convert image to 3 channels
    grayscale = tf.image.grayscale_to_rgb(transpose)

    # Cast to uint8 and encode as png
    cast = tf.cast(grayscale, tf.uint8)
    png = tf.image.encode_png(cast)

    with tf.Session() as sess:
        # Run the computation graph and save the png encoded image to a file
        image = sess.run(png, feed_dict={wav_file: path, brightness: 100})

        new_path = path[:-len('.wav')] if path.endswith('.wav') else path  # strip the '.wav' suffix
        with open(new_path + '.png', 'wb') as f:
            f.write(image)

    return
Example #23
 def wav_to_mfcc(self, raw_data):
     spectrogram = audio_ops.audio_spectrogram(
         raw_data,
         window_size=self.parameters['spectogram_window_size'],
         stride=self.parameters['spectogram_stride'],
         magnitude_squared=True)
     mfcc = audio_ops.mfcc(
         spectrogram,
         self.parameters['audio_sample_rate'],
         dct_coefficient_count=self.parameters['dtc_coefficient_count'])
     mfcc = tf.expand_dims(mfcc, -1)
     self.input_dimensions = (
         self.input_dimensions[0] / self.parameters['spectogram_stride'] -
         2, self.parameters['dtc_coefficient_count'], 1)
     mfcc = tf.squeeze(mfcc, 0)
     return mfcc
Example #24
def encode_data(audio_data, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        audio_data,
        window_size_samples,
        window_stride_samples
    )

    print(spectrogram.shape)

    mfcc = contrib_audio.mfcc(
        spectrogram,
        sample_rate=sample_rate,
        dct_coefficient_count=dct_coefficient_count
    )

    return mfcc
Example #25
def create_inference_graph_and_load_variables(sess, FLAGS):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output the trained model graph.
    """
    model_settings = data_utils.prepare_settings(FLAGS.num_classes,
                                                 FLAGS.sample_rate,
                                                 FLAGS.clip_duration_ms,
                                                 FLAGS.window_size_ms,
                                                 FLAGS.window_stride_ms,
                                                 FLAGS.dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size, fingerprint_frequency_size, 1],
        name="model_input")

    # Init model and load variables
    model = models.create_model(FLAGS)
    fw = framework.Framework(sess,
                             model,
                             None,
                             FLAGS,
                             input_tensor=reshaped_input)

    # Create an output to use for inference
    logits = tf.nn.softmax(model.get_raw_scores(), name='labels_softmax')
Example #26
    def prepare_processing_graph(self, model_settings):
        """
        Build a tensorflow graph
        
        creates a graph that loads a wave file, decodes it, scales the volume, shifts it in time
        calculates a spectrogram and builds MFCC fingerprint from that

        input:
        model_settings: info about model being trained
        """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])

        self.background_data_placeholder_ = tf.placeholder(
            np.float32, [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(np.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #27
 def prepare_processing_graph(self, model_settings):
     desired_samples = model_settings['desired_samples']
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
     wav_decoder = contrib_audio.decode_wav(wav_loader,
                                            desired_channels=1,
                                            desired_samples=desired_samples)
     # Allow the audio sample's volume to be adjusted.
     self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     self.foreground_volume_placeholder_)
     # Shift the sample's start position, and pad any gaps with zeros.
     self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
     self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
     padded_foreground = tf.pad(scaled_foreground,
                                self.time_shift_padding_placeholder_,
                                mode='CONSTANT')
     sliced_foreground = tf.slice(padded_foreground,
                                  self.time_shift_offset_placeholder_,
                                  [desired_samples, -1])
     # Mix in background noise.
     self.background_data_placeholder_ = tf.placeholder(
         tf.float32, [desired_samples, 1])
     self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
     background_mul = tf.multiply(self.background_data_placeholder_,
                                  self.background_volume_placeholder_)
     background_add = tf.add(background_mul, sliced_foreground)
     background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
     # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
     print('window_size_samples', model_settings['window_size_samples'])
     print('window_stride_samples', model_settings['window_stride_samples'])
     print('background_clamp', background_clamp)
     spectrogram = contrib_audio.audio_spectrogram(
         background_clamp,
         window_size=model_settings['window_size_samples'],
         stride=model_settings['window_stride_samples'],
         magnitude_squared=True)
     print('spectrogram', spectrogram)
     print('dct_coefficient_count', model_settings['dct_coefficient_count'])
     print('wav_decoder.sample_rate', wav_decoder.sample_rate)
     self.mfcc_ = contrib_audio.mfcc(
         spectrogram,
         wav_decoder.sample_rate,
         dct_coefficient_count=model_settings['dct_coefficient_count'])
     print('self.mfcc_', self.mfcc_)
Example #28
def get_mfcc_graph(model_settings):
    g = tf.Graph()
    with g.as_default():
        input_file_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                          name='wav_filename')
        wav_loader = io_ops.read_file(input_file_placeholder)
        wav_decoder = audio_ops.decode_wav(
            wav_loader,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'])
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrograms_power = audio_ops.audio_spectrogram(
            wav_decoder.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        USE_POWER = True
        if USE_POWER:
            # Warp the linear scale spectrograms into the mel-scale.
            num_spectrogram_bins = spectrograms_power.shape[-1].value
            lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
            linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
                upper_edge_hertz)
            mel_spectrograms = tf.tensordot(spectrograms_power,
                                            linear_to_mel_weight_matrix, 1)
            mel_spectrograms.set_shape(
                spectrograms_power.shape[:-1].concatenate(
                    linear_to_mel_weight_matrix.shape[-1:]))

            # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
            log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

            # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
            mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                log_mel_spectrograms)[
                    ..., :model_settings['dct_coefficient_count']]
            #output = tf.expand_dims(mfccs, axis=0)
            output = mfccs
        else:
            output = audio_ops.mfcc(
                spectrograms_power,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['dct_coefficient_count'])
    return g, input_file_placeholder, output, wav_decoder.audio
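A hedged usage sketch for get_mfcc_graph; the settings values and WAV path below are hypothetical.

import tensorflow as tf

settings = {
    'desired_samples': 16000,
    'window_size_samples': 480,
    'window_stride_samples': 160,
    'dct_coefficient_count': 40,
}
graph, wav_filename_ph, mfcc_output, audio = get_mfcc_graph(settings)
with tf.compat.v1.Session(graph=graph) as sess:
    features = sess.run(mfcc_output,
                        feed_dict={wav_filename_ph: 'speech_sample.wav'})  # hypothetical path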
Example #29
 def save_my_test_file(self, model_settings):
     desired_samples = model_settings['desired_samples']
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
     wav_decoder = contrib_audio.decode_wav(
       # wav_loader, desired_channels=1, desired_samples=desired_samples)
       wav_loader, desired_channels=1, desired_samples=16000)
     # Allow the audio sample's volume to be adjusted.
     self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     self.foreground_volume_placeholder_)
     # Shift the sample's start position, and pad any gaps with zeros.
     self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
     self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
     padded_foreground = tf.pad(
       scaled_foreground,
       self.time_shift_padding_placeholder_,
       mode='CONSTANT')
     sliced_foreground = tf.slice(padded_foreground,
                                  self.time_shift_offset_placeholder_,
                                  [desired_samples, -1])
     # Mix in background noise.
     self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                        [desired_samples, 1])
     self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
     background_mul = tf.multiply(self.background_data_placeholder_,
                                  self.background_volume_placeholder_)
     background_add = tf.add(background_mul, sliced_foreground)
     background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
     self.filename_ = tf.placeholder(tf.string)
     # save_wav_file(self.filename_, np.array(background_clamp), 16000)
     wav_encoder = contrib_audio.encode_wav(background_clamp,
                                            16000)
     self.wav_saver = io_ops.write_file(self.filename_, wav_encoder)
     # with tf.Session(graph=tf.Graph()) as sess:
     #   sess.run(wav_saver)
     spectrogram = contrib_audio.audio_spectrogram(
       background_clamp,
       window_size=model_settings['window_size_samples'],
       stride=model_settings['window_stride_samples'],
       magnitude_squared=True)
     self.test_mfcc_ = contrib_audio.mfcc(
       spectrogram,
       wav_decoder.sample_rate,
       dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #30
 def __init__(self,
              desired_samples=16000,
              window_size_samples=480,
              window_stride_samples=160):
     self.wav_filename_placeholder = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder)
     # already pads/crops
     wav_decoder = contrib_audio.decode_wav(wav_loader,
                                            desired_channels=1,
                                            desired_samples=desired_samples)
     spectrogram = contrib_audio.audio_spectrogram(
         wav_decoder.audio,
         window_size=window_size_samples,
         stride=window_stride_samples,
         magnitude_squared=True)
     self.mfcc = contrib_audio.mfcc(spectrogram,
                                    wav_decoder.sample_rate,
                                    dct_coefficient_count=40)
Example #31
def build_data_generator():

    # Build data generator pipeline
    desired_samples = model_settings['desired_samples']
    wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name="wav_filename_placeholder_")
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name="foreground_volume_placeholder_")
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    time_shift_padding_placeholder_ = tf.placeholder(
        tf.int32, [2, 2], name="time_shift_padding_placeholder_")
    time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    background_data_placeholder_ = tf.placeholder(tf.float32,
                                                  [desired_samples, 1])
    background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(background_data_placeholder_,
                                 background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    return (wav_filename_placeholder_, foreground_volume_placeholder_,
            time_shift_padding_placeholder_, time_shift_offset_placeholder_,
            background_data_placeholder_, background_volume_placeholder_, mfcc_)
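A usage sketch for build_data_generator; the WAV path, shift values and silent background buffer below are hypothetical, and desired_samples is assumed to be 16000.

import numpy as np
import tensorflow as tf

(wav_ph, fg_volume_ph, shift_padding_ph, shift_offset_ph,
 bg_data_ph, bg_volume_ph, mfcc_node) = build_data_generator()

with tf.Session() as sess:
    features = sess.run(mfcc_node, feed_dict={
        wav_ph: 'speech_sample.wav',                         # hypothetical path
        fg_volume_ph: 1.0,                                   # no volume scaling
        shift_padding_ph: [[0, 0], [0, 0]],                  # no time shift
        shift_offset_ph: [0, 0],
        bg_data_ph: np.zeros((16000, 1), dtype=np.float32),  # silent background
        bg_volume_ph: 0.0,
    })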
Example #32
 def prepare_processing_graph(self):
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
     wav_decoder = contrib_audio.decode_wav(wav_loader,
                                            desired_channels=1,
                                            desired_samples=desired_samples)
     spectrogram = contrib_audio.audio_spectrogram(
         wav_decoder.audio,
         window_size=window_size_samples,
         stride=window_stride_samples,
         magnitude_squared=True)
     print('spectrogram', spectrogram)
     print('dct_coefficient_count', dct_coefficient_count)
     print('wav_decoder.sample_rate', wav_decoder.sample_rate)
     self.mfcc_ = contrib_audio.mfcc(
         spectrogram,
         wav_decoder.sample_rate,
         dct_coefficient_count=dct_coefficient_count)
     print('self.mfcc_', self.mfcc_)
Example #33
def _spectrogram_function(features, labels):
    # decoding wav files
    audio_binary = tf.read_file(features)
    wav = audio_ops.decode_wav(audio_binary, desired_channels=1)

    # create the spectrogram
    spectrogram = audio_ops.audio_spectrogram(
        wav.audio,
        window_size=window_size,
        stride=stride,
        magnitude_squared=True
    )
    spectrogram = tf.log(tf.abs(spectrogram) + 0.01)
    spectrogram = tf.transpose(spectrogram, perm=[1, 2, 0])

    # transform the class_id into a one-hot encoded vector
    response = tf.one_hot(labels, 30)

    return [spectrogram, response]
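A hedged sketch of wiring _spectrogram_function into a tf.data pipeline; the file names and labels are hypothetical, and window_size / stride stand in for the module-level values the function relies on.

import tensorflow as tf

window_size, stride = 480, 160                # hypothetical module-level settings
filenames = ['yes/0001.wav', 'no/0001.wav']   # hypothetical paths
class_ids = [1, 0]

dataset = tf.data.Dataset.from_tensor_slices((tf.constant(filenames),
                                              tf.constant(class_ids)))
dataset = dataset.map(_spectrogram_function)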
Example #34
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    sample_rate = model_settings['sample_rate']
    window_size_ms = (model_settings['window_size_samples'] *
                      1000) / sample_rate
    window_step_ms = (model_settings['window_stride_samples'] *
                      1000) / sample_rate
    # The micro frontend op expects int16 PCM, so scale the [-1, 1] float
    # samples up to the int16 range before feeding them in.
    int16_input = tf.cast(
        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    # Rescale the frontend output into a range comparable to the other
    # feature types.
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
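A minimal usage sketch for the function above, assuming the same tf, models, and input_data imports as the surrounding code. The hyperparameter values, checkpoint path, and WAV path are placeholders, and restoring weights via models.load_variables_from_checkpoint is an assumption about the accompanying models module.

with tf.Session() as sess:
    create_inference_graph(
        wanted_words='yes,no',
        sample_rate=16000,
        clip_duration_ms=1000,
        clip_stride_ms=30,
        window_size_ms=30,
        window_stride_ms=10,
        feature_bin_count=40,
        model_architecture='conv',
        preprocess='mfcc')
    # Restore trained weights (checkpoint path is a placeholder).
    models.load_variables_from_checkpoint(
        sess, '/tmp/speech_commands_train/conv.ckpt-18000')
    # Feed raw WAV bytes to the named input node and read the softmax output.
    with open('/tmp/some_clip.wav', 'rb') as f:
        wav_data = f.read()
    predictions = sess.run('labels_softmax:0',
                           feed_dict={'wav_data:0': wav_data})
    print(predictions)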
Example #35
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many milliseconds of audio to analyze for the pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
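As a rough worked example of the fingerprint shape this graph flattens before the model, here is the usual speech_commands arithmetic under assumed defaults (16 kHz audio, 1 s clips, 30 ms windows, 10 ms stride, 40 MFCC coefficients). The exact formula lives in models.prepare_model_settings, so treat this as an approximation of that logic rather than the definitive implementation.

sample_rate = 16000
clip_duration_ms = 1000
window_size_ms = 30
window_stride_ms = 10
feature_bin_count = 40  # with 'mfcc', one feature per bin

desired_samples = int(sample_rate * clip_duration_ms / 1000)        # 16000
window_size_samples = int(sample_rate * window_size_ms / 1000)      # 480
window_stride_samples = int(sample_rate * window_stride_ms / 1000)  # 160
spectrogram_length = 1 + (desired_samples - window_size_samples) // window_stride_samples  # 98

# The flattened fingerprint fed into models.create_model.
fingerprint_size = spectrogram_length * feature_bin_count  # 3920
print(spectrogram_length, feature_bin_count, fingerprint_size)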
Example #36
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          scaled_foreground,
          self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
      # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
      spectrogram = contrib_audio.audio_spectrogram(
          background_clamp,
          window_size=model_settings['window_size_samples'],
          stride=model_settings['window_stride_samples'],
          magnitude_squared=True)
      tf.summary.image(
          'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
      # The number of buckets in each FFT row in the spectrogram will depend on
      # how many input samples there are in each window. This can be quite
      # large, with a 160 sample window producing 127 buckets for example. We
      # don't need this level of detail for classification, so we often want to
      # shrink them down to produce a smaller result. That's what this section
      # implements. One method is to use average pooling to merge adjacent
      # buckets, but a more sophisticated approach is to apply the MFCC
      # algorithm to shrink the representation.
      if model_settings['preprocess'] == 'average':
        self.output_ = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
      elif model_settings['preprocess'] == 'mfcc':
        self.output_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
        tf.summary.image(
            'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
      else:
        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                         ' "average")' % (model_settings['preprocess']))

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.summary.merge_all(scope='data')
      self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                   tf.get_default_graph())