Example #1
def prepare_input(filename):
    from tensorflow.contrib.framework.python.ops import audio_ops
    from tensorflow.python.ops import io_ops

    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                            desired_channels=1,
                            desired_samples=16000,
                            name='decoded_sample_data')

        spectrum = audio_ops.audio_spectrogram(input=wav_decoder[0],
                                            window_size=640,
                                            stride=320,
                                            magnitude_squared=True,
                                            name='AudioSpectrogram')
        final = audio_ops.mfcc(spectrogram=spectrum, 
                               sample_rate=wav_decoder[1], 
                               upper_frequency_limit=4000.0, 
                               lower_frequency_limit=20.0, 
                               filterbank_channel_count=40, 
                               dct_coefficient_count=10, 
                               name='Mfcc')

        data = sess.run(final,
                        feed_dict={wav_filename_placeholder: filename})
        print(f'Data shape: {data.shape}')

    return data
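A quick way to exercise the helper above is to call it on a one-second, 16 kHz mono WAV file. This is only a minimal sketch: the file path is a placeholder and TensorFlow 1.x behaviour (graph mode with tf.Session), as in the example itself, is assumed.

# Hypothetical usage of prepare_input(); 'yes.wav' is a placeholder path.
import tensorflow as tf  # TF 1.x, or tf.compat.v1 with v2 behaviour disabled

features = prepare_input('yes.wav')
# With 16000 samples, a 640-sample window and a 320-sample stride, the MFCC
# output has shape (1, num_frames, 10).
print(features.shape)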
Example #2
  def get_test_data(self, how_many, offset, model_settings, sess, features='mfcc'):

    candidates = self.data_index
    if how_many == -1:
      sample_count = len(candidates)
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))
    desired_samples = model_settings['desired_samples']
    data = np.zeros((sample_count, model_settings['fingerprint_size']))

    wav_filename_placeholder = tf.placeholder(tf.string, [], name='wav_file_names')
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(
      wav_loader, desired_channels=1, desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
      wav_decoder.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
      spectrogram,
      wav_decoder.sample_rate,
      dct_coefficient_count=model_settings['dct_coefficient_count'])


    for i in range(offset, offset + sample_count):
      input_dict = {wav_filename_placeholder: candidates[i]}
      if features == "spectrogram":
        data[i - offset, :] = sess.run(spectrogram, feed_dict=input_dict).flatten()
      elif features == "raw":
        data[i - offset, :] = sess.run(wav_decoder.audio, feed_dict=input_dict).flatten()
      else:
        data[i - offset, :] = sess.run(mfcc, feed_dict=input_dict).flatten()

    return data
Example #3
    def _build_processing_graph(self):
        """Builds a TensorFlow graph to apply the input distortions.

            Creates a graph that loads a WAVE file, decodes it, scales the volume,
            shifts it in time, adds in background noise, calculates a spectrogram, and
            then builds an MFCC fingerprint from that.

            This must be called with an active TensorFlow session running, and it
            creates multiple placeholder inputs, and one output:

              - wav_filename_placeholder_: Filename of the WAV to load.
              - mfcc_: Output 2D fingerprint of processed audio.

            """
        with tf.name_scope('audio_processing'):
            desired_samples = self._model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = contrib_audio.decode_wav(
                wav_loader,
                desired_channels=1,
                desired_samples=desired_samples)
            background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = contrib_audio.audio_spectrogram(
                background_clamp,
                window_size=self._model_settings['window_size_samples'],
                stride=self._model_settings['window_stride_samples'],
                magnitude_squared=True)
            self.mfcc_ = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=self._model_settings['dct_coefficient_count'])
Example #4
def mfcc_tensorflow(wavfile, _sr, frame_size, frame_shift, order=13):
    sess = tf.InteractiveSession()
    wav_filename_placeholder = tf.placeholder(tf.string, [])

    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    wav_data = wav_decoder.audio

    wav_sample_rate = sess.run(wav_decoder.sample_rate,
                               feed_dict={wav_filename_placeholder: wavfile})
    check_sample_rate(wavfile, _sr, wav_sample_rate)

    spectrogram = contrib_audio.audio_spectrogram(wav_data,
                                                  window_size=frame_size,
                                                  stride=frame_shift,
                                                  magnitude_squared=True)

    mfcc_ = contrib_audio.mfcc(spectrogram,
                               wav_decoder.sample_rate,
                               dct_coefficient_count=order)

    mfcc_data = sess.run(mfcc_, feed_dict={wav_filename_placeholder: wavfile})

    return mfcc_data
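For reference, a hedged usage sketch for mfcc_tensorflow(): the WAV path is a placeholder, check_sample_rate() is assumed to be defined elsewhere in the same module as the example implies, and frame_size/frame_shift are in samples.

# Hypothetical call; 480/160 samples correspond to a 30 ms window and a 10 ms
# shift at 16 kHz.
mfcc_features = mfcc_tensorflow('some_clip.wav', 16000,
                                frame_size=480, frame_shift=160, order=13)
print(mfcc_features.shape)  # (1, num_frames, 13)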
Example #5
File: id.py  Project: cooledge/nn
def load_mfcc_file(sess, filename):
  filename_ph = tf.placeholder(tf.string)
  loader = io_ops.read_file(filename_ph)
  decoder = contrib_audio.decode_wav(loader, desired_channels=1, desired_samples=16000)
  spectrogram = contrib_audio.audio_spectrogram(decoder.audio, window_size=480,
                                                stride=160, magnitude_squared=True)
  mfcc = contrib_audio.mfcc(spectrogram, decoder.sample_rate, dct_coefficient_count=40)
  return sess.run(mfcc, feed_dict={filename_ph: filename})
Example #6
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    # Note: desired_samples is hard-coded to 16000 here instead of using
    # model_settings['desired_samples'].
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=16000)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    ######################  M F C C #################################
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #7
def wav_to_features(filenames_dataset, hparams, feature_count):
    dataset = filenames_dataset.map(lambda filename: io_ops.read_file(filename))
    dataset = dataset.map(lambda wav_loader: contrib_audio.decode_wav(wav_loader, desired_channels=1))
    dataset = dataset.map(lambda wav_decoder:
                                  (contrib_audio.audio_spectrogram(
                                      wav_decoder.audio,
                                      window_size=int(hparams.sample_rate * hparams.window_size_ms / 1000),
                                      stride=int(hparams.sample_rate * hparams.window_stride_ms / 1000),
                                      magnitude_squared=True), wav_decoder.sample_rate))
    dataset = dataset.map(lambda spectrogram, sample_rate: contrib_audio.mfcc(
        spectrogram, sample_rate,
        dct_coefficient_count=feature_count))
    dataset = dataset.map(lambda inputs: (
        inputs,
        tf.nn.moments(inputs, axes=[1])
    ))
    dataset = dataset.map(lambda inputs, moments: (
        # Normalize by the mean and standard deviation (tf.nn.moments returns
        # the mean and the variance, so take the square root of the latter).
        tf.divide(tf.subtract(inputs, moments[0]), tf.sqrt(moments[1])),
        tf.shape(inputs)[1]
    ))
    dataset = dataset.map(lambda inputs, seq_len: (
        inputs[0],
        seq_len
    ))
    return dataset
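The pipeline above only needs a dataset of filenames plus an hparams object that exposes sample_rate, window_size_ms and window_stride_ms. A minimal sketch of how it might be driven under TF 1.x; the filenames and hparams values are assumptions.

# Hypothetical driver for wav_to_features().
from types import SimpleNamespace
import tensorflow as tf

hparams = SimpleNamespace(sample_rate=16000, window_size_ms=30.0,
                          window_stride_ms=10.0)
filenames = tf.data.Dataset.from_tensor_slices(['a.wav', 'b.wav'])
features = wav_to_features(filenames, hparams, feature_count=13)
inputs, seq_len = features.make_one_shot_iterator().get_next()
with tf.Session() as sess:
    batch_inputs, batch_len = sess.run([inputs, seq_len])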
Example #8
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list),
        sample_rate,
        clip_duration_ms,
        window_size_ms,
        window_stride_ms,
        dct_coefficient_count,
    )
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
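In the TensorFlow speech_commands tooling this function is typically followed by restoring a trained checkpoint and freezing the graph. The sketch below follows that pattern but is only illustrative: the checkpoint path, output path and argument values are placeholders, and models.load_variables_from_checkpoint is assumed to be available as in that example code.

# Hedged freeze sketch (TF 1.x graph-mode APIs).
import tensorflow as tf

sess = tf.InteractiveSession()
create_inference_graph('yes,no', 16000, 1000, 30, 30.0, 10.0, 40, 'conv')
models.load_variables_from_checkpoint(sess, '/tmp/speech_commands.ckpt')
frozen_graph_def = tf.graph_util.convert_variables_to_constants(
    sess, sess.graph_def, ['labels_softmax'])
tf.train.write_graph(frozen_graph_def, '/tmp', 'frozen_graph.pb', as_text=False)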
Example #9
  def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
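The docstring above lists the placeholders this graph expects; as a reminder of how they fit together, here is a hedged sketch of feeding them for a single clip with no augmentation. The audio_processor instance name, session, file path and model_settings are assumptions.

# Hypothetical single-clip evaluation of the processing graph.
import numpy as np
import tensorflow as tf

sess = tf.Session()
audio_processor.prepare_processing_graph(model_settings)
desired_samples = model_settings['desired_samples']
mfcc = sess.run(
    audio_processor.mfcc_,
    feed_dict={
        audio_processor.wav_filename_placeholder_: 'some_clip.wav',
        audio_processor.foreground_volume_placeholder_: 1.0,
        audio_processor.time_shift_padding_placeholder_: [[0, 0], [0, 0]],
        audio_processor.time_shift_offset_placeholder_: [0, 0],
        # Silence instead of real background noise.
        audio_processor.background_data_placeholder_:
            np.zeros((desired_samples, 1), dtype=np.float32),
        audio_processor.background_volume_placeholder_: 0.0,
    })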
Example #10
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate, dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])

    return mfccs, tf.shape(mfccs)[0]
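samples_to_mfccs() only takes already-decoded samples, so it is usually fed from a decode_wav node. A minimal, hedged wiring sketch; the Config fields and the imports mirror the assumptions of the example above.

# Hypothetical wiring of samples_to_mfccs() to a WAV-decoding subgraph.
import tensorflow as tf
from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
from tensorflow.python.ops import io_ops

wav_filename = tf.placeholder(tf.string, [])
decoded = contrib_audio.decode_wav(io_ops.read_file(wav_filename),
                                   desired_channels=1)
mfccs, mfcc_len = samples_to_mfccs(decoded.audio, decoded.sample_rate)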
Example #12
def _load_sample(
        wav_filename,
        model_settings):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model: Name of the kind of model to generate.
    """
    wav_loader = io_ops.read_file(wav_filename)

    decoded_sample_data = contrib_audio.decode_wav(
        wav_loader,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')

    if model_settings['input_format'] == 'raw':
        print(decoded_sample_data.audio.shape)
        reshaped_input = tf.reshape(decoded_sample_data.audio, [
            -1, model_settings['desired_samples']
        ])
        print(reshaped_input.shape)
    else:
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)

        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            lower_frequency_limit=model_settings['lower_frequency_limit'],
            upper_frequency_limit=model_settings['upper_frequency_limit'],
            filterbank_channel_count=model_settings['filterbank_channel_count'],
            dct_coefficient_count=model_settings['dct_coefficient_count'])
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']

        reshaped_input = tf.reshape(fingerprint_input, [
            -1, fingerprint_time_size * fingerprint_frequency_size
        ])

    return reshaped_input
Example #13
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    dct_coefficient_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, dct_coefficient_count)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)
  fingerprint_input = contrib_audio.mfcc(
      spectrogram,
      decoded_sample_data.sample_rate,
      dct_coefficient_count=dct_coefficient_count)
  fingerprint_frequency_size = model_settings['dct_coefficient_count']
  fingerprint_time_size = model_settings['spectrogram_length']
  reshaped_input = tf.reshape(fingerprint_input, [
      -1, fingerprint_time_size * fingerprint_frequency_size
  ])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Example #14
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
    window_size_samples = sample_rate * window_size_ms // 1000
    window_stride_samples = sample_rate * window_stride_ms // 1000
    spectrogram = contrib_audio.audio_spectrogram(
        audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(spectrogram,
                              sample_rate,
                              dct_coefficient_count=num_coefficients)
    return mfcc
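AudioToMfcc() converts the millisecond-based window parameters into sample counts itself, so the caller only supplies a decoded audio tensor. A hedged usage sketch with illustrative values:

# Hypothetical call; one second of 16 kHz mono audio, 30 ms windows with a
# 10 ms stride, 40 DCT coefficients.
import tensorflow as tf

audio = tf.placeholder(tf.float32, [16000, 1])
mfcc = AudioToMfcc(sample_rate=16000, audio=audio,
                   window_size_ms=30, window_stride_ms=10,
                   num_coefficients=40)
# mfcc is a tensor of shape (1, num_frames, 40).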
Example #15
  def __init__(self, sample_rate: int, dct_coef_count: int=-1):
    '''
    suppose the channel number is 1.
    '''
    assert sample_rate == 16_000
    if dct_coef_count == -1:
      dct_coef_count = DataGraphMFCC.max_mfcc_num
    else:
      assert dct_coef_count <= DataGraphMFCC.max_mfcc_num

    self._sample_rate = sample_rate
    samples_per_ms = sample_rate / 1000
    window = int(DataGraphMFCC.window_duration * samples_per_ms)
    stride = int(DataGraphMFCC.stride_duration * samples_per_ms)

    self._graph = tf.Graph()
    with self._graph.as_default():
      self._in_wav_file = tf.placeholder(tf.string, [], name='wav_filename')
      self._in_frame_num = tf.placeholder(tf.int32, [])
      wav_loader = io_ops.read_file(self._in_wav_file)
      wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
      self._out_audio = tf.squeeze(wav_decoder.audio)
      self._out_sample_rate = wav_decoder.sample_rate

      self._in_audio = tf.placeholder(tf.float32, [None])
      in_audio = tf.expand_dims(self._in_audio, -1)

      audio_clamp = tf.clip_by_value(in_audio, -1.0, 1.0)
      spectrogram = contrib_audio.audio_spectrogram(
        audio_clamp,
        window_size=window,
        stride=stride,
        magnitude_squared=True)
      self._out_spectrogram = spectrogram

      feat_ts = contrib_audio.mfcc(
        spectrogram=spectrogram,
        sample_rate=sample_rate,
        dct_coefficient_count=dct_coef_count,
      )
      self._out_mfcc = feat_ts[0]
      self._out_real_mfcc_len = tf.shape(self._out_mfcc)[0]

      diff = tf.maximum(0, self._in_frame_num - self._out_real_mfcc_len)
      self._out_expanded_mfcc = tf.pad(
        self._out_mfcc,
        [[0, diff], [0, 0]],
      )[: self._in_frame_num]

    self._sess = tf.Session(graph=self._graph)
    print(f"DataGgraphMFCC graph is created!")
Example #16
    def build_graph(self):
        """Graph to extract the MFCC fingerprint for a given WAV file.

        Here we add the necessary input & output tensors to decode the WAV
        and serialize the MFCC fingerprint.

        Returns:
          input_wav_filename: A tensor containing the WAV filename as the input layer.
          mfcc_fingerprint: The MFCC fingerprint tensor, which will be materialized later.
        """

        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader,
            desired_channels=1,
            desired_samples=self.desired_samples)

        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)

        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [self.desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [self.desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=self.window_size_samples,
            stride=self.window_stride_samples,
            magnitude_squared=True)

        self.mfcc_fingerprint_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=self.opt.dct_coefficient_count)
Example #17
    def _single_mfcc(self, audio, window_size_samples, window_stride_samples,
                     magnitude_squared, **kwargs):
        spectrogram = self._single_spectrogram(audio, window_size_samples,
                                               window_stride_samples,
                                               magnitude_squared)

        mfcc = contrib_audio.mfcc(
            spectrogram,
            kwargs["sample_rate_const"],
            upper_frequency_limit=kwargs["upper_edge_hertz"],
            lower_frequency_limit=kwargs["lower_edge_hertz"],
            filterbank_channel_count=kwargs["num_mel_bins"],
            dct_coefficient_count=kwargs["num_mfccs"],
        )

        return mfcc
Example #18
def encode_data(audio_data, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        audio_data,
        window_size_samples,
        window_stride_samples
    )

    print(spectrogram.shape)

    mfcc = contrib_audio.mfcc(
        spectrogram,
        sample_rate=sample_rate,
        dct_coefficient_count=dct_coefficient_count
    )

    return mfcc
Example #19
 def wav_to_mfcc(self, raw_data):
     spectrogram = audio_ops.audio_spectrogram(
         raw_data,
         window_size=self.parameters['spectogram_window_size'],
         stride=self.parameters['spectogram_stride'],
         magnitude_squared=True)
     mfcc = audio_ops.mfcc(
         spectrogram,
         self.parameters['audio_sample_rate'],
         dct_coefficient_count=self.parameters['dtc_coefficient_count'])
     mfcc = tf.expand_dims(mfcc, -1)
     self.input_dimensions = (
         self.input_dimensions[0] / self.parameters['spectogram_stride'] - 2,
         self.parameters['dtc_coefficient_count'],
         1)
     mfcc = tf.squeeze(mfcc, 0)
     return mfcc
Example #20
    def prepare_processing_graph(self, model_settings):
        """
        Build a tensorflow graph
        
        creates a graph that loads a wave file, decodes it, scales the volume, shifts it in time
        calculates a spectrogram and builds MFCC fingerprint from that

        input:
        model_settings: info about model being trained
        """
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(wav_loader,
                                               desired_channels=1,
                                               desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
        self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
        padded_foreground = tf.pad(scaled_foreground,
                                   self.time_shift_padding_placeholder_,
                                   mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])

        self.background_data_placeholder_ = tf.placeholder(
            np.float32, [desired_samples, 1])
        self.background_volume_placeholder_ = tf.placeholder(np.float32, [])
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #21
def create_inference_graph_and_load_variables(sess, FLAGS):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output the trained model graph.
    """
    model_settings = data_utils.prepare_settings(FLAGS.num_classes,
                                                 FLAGS.sample_rate,
                                                 FLAGS.clip_duration_ms,
                                                 FLAGS.window_size_ms,
                                                 FLAGS.window_stride_ms,
                                                 FLAGS.dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size, fingerprint_frequency_size, 1],
        name="model_input")

    # Init model and load variables
    model = models.create_model(FLAGS)
    fw = framework.Framework(sess,
                             model,
                             None,
                             FLAGS,
                             input_tensor=reshaped_input)

    # Create an output to use for inference
    logits = tf.nn.softmax(model.get_raw_scores(), name='labels_softmax')
Example #22
 def save_my_test_file(self, model_settings):
     desired_samples = model_settings['desired_samples']
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
     # Note: desired_samples is hard-coded to 16000 here instead of using
     # model_settings['desired_samples'].
     wav_decoder = contrib_audio.decode_wav(
       wav_loader, desired_channels=1, desired_samples=16000)
     # Allow the audio sample's volume to be adjusted.
     self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     self.foreground_volume_placeholder_)
     # Shift the sample's start position, and pad any gaps with zeros.
     self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
     self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
     padded_foreground = tf.pad(
       scaled_foreground,
       self.time_shift_padding_placeholder_,
       mode='CONSTANT')
     sliced_foreground = tf.slice(padded_foreground,
                                  self.time_shift_offset_placeholder_,
                                  [desired_samples, -1])
     # Mix in background noise.
     self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                        [desired_samples, 1])
     self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
     background_mul = tf.multiply(self.background_data_placeholder_,
                                  self.background_volume_placeholder_)
     background_add = tf.add(background_mul, sliced_foreground)
     background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
     self.filename_ = tf.placeholder(tf.string)
     # save_wav_file(self.filename_, np.array(background_clamp), 16000)
     wav_encoder = contrib_audio.encode_wav(background_clamp,
                                            16000)
     self.wav_saver = io_ops.write_file(self.filename_, wav_encoder)
     # with tf.Session(graph=tf.Graph()) as sess:
     #   sess.run(wav_saver)
     spectrogram = contrib_audio.audio_spectrogram(
       background_clamp,
       window_size=model_settings['window_size_samples'],
       stride=model_settings['window_stride_samples'],
       magnitude_squared=True)
     self.test_mfcc_ = contrib_audio.mfcc(
       spectrogram,
       wav_decoder.sample_rate,
       dct_coefficient_count=model_settings['dct_coefficient_count'])
Example #23
def get_mfcc_graph(model_settings):
    g = tf.Graph()
    with g.as_default():
        input_file_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                          name='wav_filename')
        wav_loader = io_ops.read_file(input_file_placeholder)
        wav_decoder = audio_ops.decode_wav(
            wav_loader,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'])
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrograms_power = audio_ops.audio_spectrogram(
            wav_decoder.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        USE_POWER = True
        if USE_POWER:
            # Warp the linear scale spectrograms into the mel-scale.
            num_spectrogram_bins = spectrograms_power.shape[-1].value
            lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
            linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
                upper_edge_hertz)
            mel_spectrograms = tf.tensordot(spectrograms_power,
                                            linear_to_mel_weight_matrix, 1)
            mel_spectrograms.set_shape(
                spectrograms_power.shape[:-1].concatenate(
                    linear_to_mel_weight_matrix.shape[-1:]))

            # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
            log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)

            # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
            mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                log_mel_spectrograms)[
                    ..., :model_settings['dct_coefficient_count']]
            #output = tf.expand_dims(mfccs, axis=0)
            output = mfccs
        else:
            output = audio_ops.mfcc(
                spectrograms_power,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['dct_coefficient_count'])
    return g, input_file_placeholder, output, wav_decoder.audio
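Since get_mfcc_graph() returns the graph together with its input placeholder and output tensors, running it is just a matter of opening a session on that graph. A hedged usage sketch; the model_settings values and the WAV path are placeholders.

# Hypothetical driver for get_mfcc_graph().
import tensorflow as tf

settings = {'desired_samples': 16000, 'window_size_samples': 480,
            'window_stride_samples': 160, 'dct_coefficient_count': 40}
graph, wav_filename, mfcc_out, audio_out = get_mfcc_graph(settings)
with tf.compat.v1.Session(graph=graph) as sess:
    mfcc, audio = sess.run([mfcc_out, audio_out],
                           feed_dict={wav_filename: 'some_clip.wav'})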
Example #24
File: train1.py  Project: EJHortala/books-2
 def prepare_processing_graph(self, model_settings):
     desired_samples = model_settings['desired_samples']
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
     wav_decoder = contrib_audio.decode_wav(wav_loader,
                                            desired_channels=1,
                                            desired_samples=desired_samples)
     # Allow the audio sample's volume to be adjusted.
     self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
     scaled_foreground = tf.multiply(wav_decoder.audio,
                                     self.foreground_volume_placeholder_)
     # Shift the sample's start position, and pad any gaps with zeros.
     self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
     self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
     padded_foreground = tf.pad(scaled_foreground,
                                self.time_shift_padding_placeholder_,
                                mode='CONSTANT')
     sliced_foreground = tf.slice(padded_foreground,
                                  self.time_shift_offset_placeholder_,
                                  [desired_samples, -1])
     # Mix in background noise.
     self.background_data_placeholder_ = tf.placeholder(
         tf.float32, [desired_samples, 1])
     self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
     background_mul = tf.multiply(self.background_data_placeholder_,
                                  self.background_volume_placeholder_)
     background_add = tf.add(background_mul, sliced_foreground)
     background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
     # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
     print('window_size_samples', model_settings['window_size_samples'])
     print('window_stride_samples', model_settings['window_stride_samples'])
     print('background_clamp', background_clamp)
     spectrogram = contrib_audio.audio_spectrogram(
         background_clamp,
         window_size=model_settings['window_size_samples'],
         stride=model_settings['window_stride_samples'],
         magnitude_squared=True)
     print('spectrogram', spectrogram)
     print('dct_coefficient_count', model_settings['dct_coefficient_count'])
     print('wav_decoder.sample_rate', wav_decoder.sample_rate)
     self.mfcc_ = contrib_audio.mfcc(
         spectrogram,
         wav_decoder.sample_rate,
         dct_coefficient_count=model_settings['dct_coefficient_count'])
     print('self.mfcc_', self.mfcc_)
Example #25
 def __init__(self,
              desired_samples=16000,
              window_size_samples=480,
              window_stride_samples=160):
     self.wav_filename_placeholder = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder)
     # already pads/crops
     wav_decoder = contrib_audio.decode_wav(wav_loader,
                                            desired_channels=1,
                                            desired_samples=desired_samples)
     spectrogram = contrib_audio.audio_spectrogram(
         wav_decoder.audio,
         window_size=window_size_samples,
         stride=window_stride_samples,
         magnitude_squared=True)
     self.mfcc = contrib_audio.mfcc(spectrogram,
                                    wav_decoder.sample_rate,
                                    dct_coefficient_count=40)
Example #26
def build_data_generator():

    # Build data generator pipeline
    desired_samples = model_settings['desired_samples']
    wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name="wav_filename_placeholder_")
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name="foreground_volume_placeholder_")
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    time_shift_padding_placeholder_ = tf.placeholder(
        tf.int32, [2, 2], name="time_shift_padding_placeholder_")
    time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    background_data_placeholder_ = tf.placeholder(tf.float32,
                                                  [desired_samples, 1])
    background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(background_data_placeholder_,
                                 background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    return (wav_filename_placeholder_, foreground_volume_placeholder_,
            time_shift_padding_placeholder_, time_shift_offset_placeholder_,
            background_data_placeholder_, background_volume_placeholder_,
            mfcc_)
Example #27
        def decode_audio(audio_str):

            wav_decoder = contrib_audio.decode_wav(
                audio_str,
                desired_channels=1,
                desired_samples=self.desired_samples)

            spectrogram = contrib_audio.audio_spectrogram(
                wav_decoder.audio,
                window_size=self.window_size_samples,
                stride=self.window_stride_samples,
                magnitude_squared=True)

            mfcc_fingerprint = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=self.dct_coefficient_count)

            return mfcc_fingerprint
Example #28
 def prepare_processing_graph(self):
     self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
     wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
     wav_decoder = contrib_audio.decode_wav(wav_loader,
                                            desired_channels=1,
                                            desired_samples=desired_samples)
     spectrogram = contrib_audio.audio_spectrogram(
         wav_decoder.audio,
         window_size=window_size_samples,
         stride=window_stride_samples,
         magnitude_squared=True)
     print('spectrogram', spectrogram)
     print('dct_coefficient_count', dct_coefficient_count)
     print('wav_decoder.sample_rate', wav_decoder.sample_rate)
     self.mfcc_ = contrib_audio.mfcc(
         spectrogram,
         wav_decoder.sample_rate,
         dct_coefficient_count=dct_coefficient_count)
     print('self.mfcc_', self.mfcc_)
Example #29
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):

    graph = tf.Graph()
    with graph.as_default():
        words_list = input_data.prepare_words_list(wanted_words.split(','))
        model_settings = models.prepare_model_settings(
            len(words_list), sample_rate, clip_duration_ms, window_size_ms,
            window_stride_ms, dct_coefficient_count)
        runtime_settings = {'clip_stride_ms': clip_stride_ms}

        wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
        decoded_sample_data = contrib_audio.decode_wav(
            wav_data_placeholder,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'],
            name='decoded_sample_data')
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            dct_coefficient_count=dct_coefficient_count)
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']
        reshaped_input = tf.reshape(
            fingerprint_input,
            [-1, fingerprint_time_size * fingerprint_frequency_size])

        logits = models.create_model(reshaped_input,
                                     model_settings,
                                     model_architecture,
                                     is_training=False,
                                     runtime_settings=runtime_settings)

        # Create an output to use for inference.
        tf.nn.softmax(logits, name='labels_softmax')
    return graph
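Because this variant returns the graph object, inference can be run against named nodes without touching the default graph. The sketch below is illustrative only: the argument values and WAV path are placeholders, and restoring the trained variables from a checkpoint (omitted here) is required before the scores are meaningful.

# Hedged usage of the returned inference graph.
import tensorflow as tf

graph = create_inference_graph('yes,no', 16000, 1000, 30, 30.0, 10.0, 40, 'conv')
with tf.Session(graph=graph) as sess:
    # A tf.train.Saver-based restore of the model variables would go here.
    with open('some_clip.wav', 'rb') as f:
        wav_data = f.read()
    scores = sess.run('labels_softmax:0', feed_dict={'wav_data:0': wav_data})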
Example #30
def create_decoder_graph():  # may need to pass in a session
    """Creates the input of the CNN model, based on this paper:
    https://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

    Returns:
      The input node and the output node.
    """

    words_list = prepare_words_list(FLAGS.wanted_words.split(','))
    model_settings = prepare_model_settings(len(words_list), FLAGS.sample_rate,
                                            FLAGS.clip_duration_ms,
                                            FLAGS.window_size_ms,
                                            FLAGS.window_stride_ms,
                                            FLAGS.dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    input_frequency_size = model_settings['dct_coefficient_count']
    input_time_size = model_settings['spectrogram_length']
    fingerprint_4d = tf.reshape(reshaped_input,
                                [-1, input_time_size, input_frequency_size, 1])
    return wav_data_placeholder, fingerprint_4d
Example #31
def process_audio(audio_data):
    with tf.Session() as sess:
        data_input = tf.compat.v1.placeholder(dtype=tf.float32,
                                              shape=(16000, 1),
                                              name=None)
        spectrum = audio_ops.audio_spectrogram(input=data_input,
                                               window_size=640,
                                               stride=320,
                                               magnitude_squared=True,
                                               name='AudioSpectrogram')
        final = audio_ops.mfcc(spectrogram=spectrum,
                               sample_rate=RATE,
                               upper_frequency_limit=4000.0,
                               lower_frequency_limit=20.0,
                               filterbank_channel_count=40,
                               dct_coefficient_count=10,
                               name='Mfcc')

        data_out = sess.run(final, feed_dict={data_input: audio_data})

    return data_out
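process_audio() expects one second of float samples shaped (16000, 1) in the [-1, 1] range, and references a module-level RATE constant. A minimal, hedged call with a silent buffer:

# Hypothetical call; real input would come from a decoded or recorded waveform.
import numpy as np

RATE = 16000  # assumed module-level constant used by process_audio()
silence = np.zeros((16000, 1), dtype=np.float32)
mfcc_features = process_audio(silence)
print(mfcc_features.shape)  # (1, num_frames, 10)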
Example #32
def Features(conf):
    sound = tf.placeholder(tf.float32, [None, None], name='wav')

    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        sound,
        window_size=conf['window_size_samples'],
        stride=conf['window_stride_samples'],
        magnitude_squared=True,
        name='spectogram')

    mfcc = contrib_audio.mfcc(
        spectrogram,
        conf['sample_rate'],
        dct_coefficient_count=conf['dct_coefficient_count'],
        name='mfcc')

    spect_norm = spectrogram / tf.reduce_sum(spectrogram, [1, 2])
    mfcc_norm = mfcc / tf.reduce_sum(mfcc, [1, 2])

    return sound, spectrogram, mfcc, spect_norm, mfcc_norm
Example #33
def wav2Input(filename, model_settings):

    #print('wav2Input ', filename)
    desired_samples = model_settings['desired_samples']

    foreground_volume = 1
    time_shift_padding = [[0, 0], [0, 0]]
    time_shift_offset = [0, 0]
    #background_data=0
    #background_volume=0

    wav_loader = io_ops.read_file(filename)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)

    #scaled_foreground = tf.multiply(wav_decoder.audio, foreground_volume)
    #padded_foreground = tf.pad(scaled_foreground, time_shift_padding, mode='CONSTANT')
    #sliced_foreground = tf.slice(padded_foreground, time_shift_offset, [desired_samples, -1])

    #background_mul = tf.multiply(background_data, background_volume)
    #background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)

    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])

    mfcc = tf.clip_by_value(mfcc, -10.0, 127.0)

    with tf.Session() as sess:
        mfcc = sess.run(mfcc)
    return mfcc
Example #34
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc' or 'average'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                    ' "average")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Example #35
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: Length of each audio clip to analyze, in milliseconds.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    sample_rate = model_settings['sample_rate']
    window_size_ms = (model_settings['window_size_samples'] *
                      1000) / sample_rate
    window_step_ms = (model_settings['window_stride_samples'] *
                      1000) / sample_rate
    int16_input = tf.cast(
        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
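    # Rescale the frontend output to a small float range before feeding the model.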
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
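
A similar hypothetical sketch shows how the finished graph can be exercised directly from Python by feeding raw WAV bytes into the 'wav_data' placeholder and fetching 'labels_softmax'; the words, settings, and file name are made up, and a real run would restore trained weights from a checkpoint rather than using the initializer shown here.

# Hypothetical sketch: run the inference graph on one WAV file.
import tensorflow as tf

with tf.Session() as sess:
    create_inference_graph('yes,no', 16000, 1000, 30, 30.0, 10.0,
                           40, 'conv', 'mfcc')
    sess.run(tf.global_variables_initializer())  # stand-in for checkpoint loading
    with open('sample.wav', 'rb') as f:          # hypothetical input file
        wav_bytes = f.read()
    scores = sess.run('labels_softmax:0', feed_dict={'wav_data:0': wav_bytes})
    print(scores)  # one probability per entry in words_list
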
Example #36
0
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running; it creates
    multiple placeholder inputs and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          scaled_foreground,
          self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
      # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
      spectrogram = contrib_audio.audio_spectrogram(
          background_clamp,
          window_size=model_settings['window_size_samples'],
          stride=model_settings['window_stride_samples'],
          magnitude_squared=True)
      tf.summary.image(
          'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
      # The number of buckets in each FFT row in the spectrogram will depend on
      # how many input samples there are in each window. This can be quite
      # large, with a 160 sample window producing 127 buckets for example. We
      # don't need this level of detail for classification, so we often want to
      # shrink them down to produce a smaller result. That's what this section
      # implements. One method is to use average pooling to merge adjacent
      # buckets, but a more sophisticated approach is to apply the MFCC
      # algorithm to shrink the representation.
      if model_settings['preprocess'] == 'average':
        self.output_ = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
      elif model_settings['preprocess'] == 'mfcc':
        self.output_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
        tf.summary.image(
            'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
      else:
        raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                         ' "average")' % (model_settings['preprocess']))

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.summary.merge_all(scope='data')
      self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                   tf.get_default_graph())
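
To illustrate how these placeholders fit together, the following hypothetical sketch feeds them through an AudioProcessor-like instance to produce one fingerprint with no time shift and a silent background; audio_processor, model_settings, and all concrete values are assumptions for illustration only.

# Hypothetical sketch: feed the placeholders created by prepare_processing_graph.
import numpy as np
import tensorflow as tf

sess = tf.Session()
audio_processor.prepare_processing_graph(model_settings, '/tmp/retrain_logs')
desired_samples = model_settings['desired_samples']
fingerprint, summary = sess.run(
    [audio_processor.output_, audio_processor.merged_summaries_],
    feed_dict={
        audio_processor.wav_filename_placeholder_: 'some_clip.wav',  # hypothetical path
        audio_processor.foreground_volume_placeholder_: 1.0,         # full volume
        audio_processor.time_shift_padding_placeholder_: [[0, 0], [0, 0]],  # no padding
        audio_processor.time_shift_offset_placeholder_: [0, 0],             # no shift
        audio_processor.background_data_placeholder_: np.zeros(
            (desired_samples, 1), dtype=np.float32),                 # silent background
        audio_processor.background_volume_placeholder_: 0.0,
    })
audio_processor.summary_writer_.add_summary(summary)
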