Example #1
def wavds2specds(ds_wav, Flags):
    """ Convert a dataset of waveforms into a dataset of spectrograms
  """
    specgrams = []
    labels = []

    for cnt, (wav, label) in enumerate(ds_wav, start=1):
        if wav.shape != (16000, ) or label.shape != ():
            print(
                f"In Loop Shape is wrong at {cnt}: {wav.shape}, {label.shape}")
        spec = frontend_op.audio_microfrontend(
            wav,
            sample_rate=Flags.sample_rate,
            window_size=Flags.window_size_ms,
            window_step=Flags.window_stride_ms,
            num_channels=Flags.dct_coefficient_count)
        spec = tf.cast(spec, 'float32') / 1000.0
        specgrams.append(spec)
        # label = keras.utils.to_categorical(label, num_classes)
        labels.append(label)
        if (cnt % 250) == 0:
            print(f"Converted {cnt} samples to spectrogram")
    print(f"Finished converting {cnt} samples to spectrogram.")
    ds_specs = tf.data.Dataset.from_tensor_slices((specgrams, labels))
    return ds_specs
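A minimal usage sketch for wavds2specds, assuming a TensorFlow build that ships the micro-frontend op; the Flags values and the silent toy dataset below are illustrative stand-ins, not taken from the original project:

import types

import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import (
    audio_microfrontend_op as frontend_op)

Flags = types.SimpleNamespace(sample_rate=16000,
                              window_size_ms=30,
                              window_stride_ms=20,
                              dct_coefficient_count=40)

# Toy stand-in: four silent one-second int16 clips with integer labels.
ds_wav = tf.data.Dataset.from_tensor_slices(
    (tf.zeros([4, 16000], tf.int16), tf.constant([0, 1, 2, 3], tf.int64)))

ds_spec = wavds2specds(ds_wav, Flags)
for spec, label in ds_spec.take(1):
    print(spec.shape, label.numpy())  # (49, 40) per clip with these settings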
Example #2
def Micro_process(sample_rate=16000,
                  window_size=480,
                  window_stride=320,
                  input_width=40):
    wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                        name="wav_name")
    wav_loader = io_ops.read_file(wav_filename_placeholder,
                                  name="reader_reader")
    wav_decoder = tf.audio.decode_wav(wav_loader,
                                      desired_channels=1,
                                      desired_samples=sample_rate,
                                      name="wav_decoder")
    #background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)

    window_size = (window_size * 1000) / sample_rate
    window_step = (window_stride * 1000) / sample_rate

    int16_input = tf.cast(tf.multiply(wav_decoder.audio, 32768), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(int16_input,
                                                     sample_rate=sample_rate,
                                                     window_size=window_size,
                                                     window_step=window_step,
                                                     num_channels=input_width,
                                                     out_scale=1,
                                                     out_type=tf.float32)
    mfcc = tf.multiply(micro_frontend, (10.0 / 256.0))

    return mfcc, wav_filename_placeholder
Example #3
def run_Micro_process(filename,
                      sess,
                      input_width=40,
                      window_size_samples=480,
                      window_stride_samples=320,
                      sample_rate=16000):
    wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                        name="wav_name")
    wav_loader = io_ops.read_file(wav_filename_placeholder,
                                  name="reader_reader")
    wav_decoder = tf.audio.decode_wav(wav_loader,
                                      desired_channels=1,
                                      desired_samples=sample_rate,
                                      name="wav_decoder")

    window_size = (window_size_samples * 1000) / sample_rate
    window_step = (window_stride_samples * 1000) / sample_rate

    int16_input = tf.cast(tf.multiply(wav_decoder.audio, 32768), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(int16_input,
                                                     sample_rate=sample_rate,
                                                     window_size=window_size,
                                                     window_step=window_step,
                                                     num_channels=input_width,
                                                     out_scale=1,
                                                     out_type=tf.float32)
    mfcc = tf.multiply(micro_frontend, (10.0 / 256.0))
    tf.compat.v1.summary.image('micro',
                               tf.expand_dims(tf.expand_dims(mfcc, -1), 0),
                               max_outputs=1)

    return sess.run(mfcc, feed_dict={
        wav_filename_placeholder: filename
    }).flatten()
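A sketch of driving run_Micro_process in TF1-style graph mode; 'yes.wav' is a hypothetical mono 16 kHz WAV path, and the imports mirror the ones the snippet itself relies on:

import tensorflow as tf
from tensorflow.python.ops import io_ops
from tensorflow.lite.experimental.microfrontend.python.ops import (
    audio_microfrontend_op as frontend_op)

tf.compat.v1.disable_eager_execution()
with tf.compat.v1.Session() as sess:
    feats = run_Micro_process('yes.wav', sess)   # hypothetical mono 16 kHz WAV
print(feats.shape)   # flattened (num_frames * 40,) feature vector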
Example #4
def prepare_tf_micro_spectrogram_computer():

    sample_rate = 16000
    # Step 1: Windowing
    window_size_samples = 480
    window_stride_samples = 320

    # Step 3: Mel-spec
    num_channels = 40
    lower_band_limit = 0.0
    upper_band_limit = 7999.0

    # Step 4: Smoothing
    min_signal_remaining = 0.05
    smoothing_bits = 10
    even_smoothing = 0.025
    odd_smoothing = 0.06

    # Step 5: PCAN auto gain control
    enable_pcan = True
    pcan_strength = 0.95
    pcan_offset = 80.0
    gain_bits = 21

    # Step 6: log-scaling
    scale_shift = 6
    enable_log = True

    tf.compat.v1.reset_default_graph()
    tf.compat.v1.disable_eager_execution()

    window_size_ms = window_size_samples * 1000 / sample_rate
    window_step_ms = window_stride_samples * 1000 / sample_rate

    with tf.compat.v1.get_default_graph().name_scope('data'):
        wav_signal_placeholder = tf.compat.v1.placeholder(
            tf.int16, shape=[16000], name='wav_signal_placeholder')
        micro_frontend = frontend_op.audio_microfrontend(
            wav_signal_placeholder,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=num_channels,
            upper_band_limit=upper_band_limit,
            lower_band_limit=lower_band_limit,
            min_signal_remaining=min_signal_remaining,
            smoothing_bits=smoothing_bits,
            even_smoothing=even_smoothing,
            odd_smoothing=odd_smoothing,
            enable_pcan=enable_pcan,
            pcan_strength=pcan_strength,
            pcan_offset=pcan_offset,
            gain_bits=gain_bits,
            enable_log=enable_log,
            scale_shift=scale_shift,
            out_scale=1,
            out_type=tf.float32)
        output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
    return wav_signal_placeholder, output_
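A sketch of evaluating the graph built by prepare_tf_micro_spectrogram_computer; the all-zero int16 waveform merely stands in for a real one-second clip:

import numpy as np
import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import (
    audio_microfrontend_op as frontend_op)

wav_in, spec_out = prepare_tf_micro_spectrogram_computer()
with tf.compat.v1.Session() as sess:
    spec = sess.run(spec_out, feed_dict={wav_in: np.zeros(16000, np.int16)})
print(spec.shape)   # (49, 40): 30 ms windows with a 20 ms step over one second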
Example #5
File: aww_data.py  Project: toanhvu/tiny
def spec_feats(sample_dict): 
  """Runs TFL microfrontend and returns spectrogram"""
  audio = sample_dict['audio']
  label = sample_dict['label']
  paddings = [[0, 16000-tf.shape(audio)[0]]]
  audio = tf.pad(audio, paddings)
  audio16 = tf.cast(audio, 'int16')    
  spec = frontend_op.audio_microfrontend(audio16, sample_rate=16000, window_size=40, 
                                           window_step=20, num_channels=40)
  return spec, label
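spec_feats is meant to be mapped over a tf.data pipeline whose elements are dicts with 'audio' and 'label' keys (for example the TensorFlow Datasets speech_commands loader); a toy sketch with silent clips:

import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import (
    audio_microfrontend_op as frontend_op)

ds = tf.data.Dataset.from_tensor_slices({
    'audio': tf.zeros([8, 16000], tf.int64),   # eight silent one-second PCM clips
    'label': tf.zeros([8], tf.int64),
})
ds = ds.map(spec_feats)
for spec, label in ds.take(1):
    print(spec.shape, label.numpy())   # (49, 40) with 40 ms windows and 20 ms steps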
Example #6
def get_features(model_settings):
    if model_settings['preprocess'] == 'micro':
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / model_settings['sample_rate']
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / model_settings['sample_rate']
        int16_input = tf.cast(tf.multiply(input_data, 32768), tf.int16)
        # print(int16_input.shape)

        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=model_settings['sample_rate'],
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        output = tf.multiply(micro_frontend, (10.0 / 256.0))
        return output

    elif model_settings['preprocess'] == 'mfcc':
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/AudioSpectrogram
        spectrogram = audio_ops.audio_spectrogram(
                  input_data,
                  window_size=model_settings['window_size_samples'],
                  stride=model_settings['window_stride_samples'],
                  magnitude_squared=True)
        output = audio_ops.mfcc(
                spectrogram,
                model_settings['sample_rate'],
                dct_coefficient_count=model_settings['fingerprint_width'])
        return output[0,:,:] #just return channel 0 as 2D tensor

    elif model_settings['preprocess'] == 'average':
        spectrogram = audio_ops.audio_spectrogram(
                  input_data,
                  window_size=model_settings['window_size_samples'],
                  stride=model_settings['window_stride_samples'],
                  magnitude_squared=True)
        output = tf.nn.pool(
                  input=tf.expand_dims(spectrogram, -1),
                  window_shape=[1, model_settings['average_window_width']],
                  strides=[1, model_settings['average_window_width']],
                  pooling_type='AVG',
                  padding='SAME')
        return output[0,:,:,0] #just return channel 0 as 2D tensor

    else:
        raise ValueError(f'Unknown preprocess mode: {model_settings["preprocess"]}')
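get_features reads the waveform from a variable named input_data in its enclosing scope. A hedged sketch of a compatible model_settings dict and input follows; the key names come from the branches above, while the values are only illustrative:

import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import (
    audio_microfrontend_op as frontend_op)

input_data = tf.zeros([16000, 1], tf.float32)   # [time, channels] audio in [-1, 1]

model_settings = {
    'preprocess': 'micro',            # or 'mfcc' / 'average'
    'sample_rate': 16000,
    'window_size_samples': 480,
    'window_stride_samples': 320,
    'fingerprint_width': 40,
    'average_window_width': 6,        # only read by the 'average' branch
}
features = get_features(model_settings)
print(features.shape)                 # (49, 40) for the 'micro' branch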
Example #7
def get_spectrogram(waveform):
    # Concatenate audio with padding so that all audio clips will be of the
    # same length (16000 samples)
    zero_padding = tf.zeros([wave_length_samps] - tf.shape(waveform),
                            dtype=tf.int16)
    waveform = tf.cast(0.5 * waveform * (i16max - i16min),
                       tf.int16)  # scale float [-1,+1]=>INT16
    equal_length = tf.concat([waveform, zero_padding], 0)
    ## Make sure these labels correspond to those used in micro_features_micro_features_generator.cpp
    spectrogram = frontend_op.audio_microfrontend(equal_length,
                                                  sample_rate=fsamp,
                                                  num_channels=num_channels,
                                                  window_size=window_size_ms,
                                                  window_step=window_step_ms)
    return spectrogram
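get_spectrogram relies on module-level constants; plausible definitions (assumed values, not taken from the original project) are:

fsamp = 16000                      # sample rate in Hz
wave_length_samps = 16000          # pad every clip to one second
i16max, i16min = 32767, -32768     # int16 range used to scale float audio to PCM
num_channels = 40                  # mel filterbank channels
window_size_ms = 30
window_step_ms = 20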
Example #8
 def testSimple(self):
   with self.test_session():
     audio = tf.constant(
         [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
         tf.int16)
     filterbanks = frontend_op.audio_microfrontend(
         audio,
         sample_rate=SAMPLE_RATE,
         window_size=WINDOW_SIZE,
         window_step=WINDOW_STEP,
         num_channels=NUM_CHANNELS,
         upper_band_limit=UPPER_BAND_LIMIT,
         lower_band_limit=LOWER_BAND_LIMIT,
         smoothing_bits=SMOOTHING_BITS,
         enable_pcan=True)
     self.assertAllEqual(filterbanks.eval(),
                         [[479, 425], [436, 378], [410, 350], [391, 325]])
Example #9
def to_micro_spectrogram(model_settings, audio):
    sample_rate = model_settings["sample_rate"]
    window_size_ms = (model_settings["window_size_samples"] *
                      1000) / sample_rate
    window_step_ms = (model_settings["window_stride_samples"] *
                      1000) / sample_rate
    int16_input = tf.cast(tf.multiply(audio, 32768), tf.int16)
    # https://git.io/Jkuux
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings["fingerprint_width"],
        out_scale=1,
        out_type=tf.float32,
    )
    output = tf.multiply(micro_frontend, (10.0 / 256.0))
    return output
Example #10
 def testSimpleFloatScaled(self):
     with self.test_session():
         audio = tf.constant([0, 32767, 0, -32768] *
                             ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
                             tf.int16)
         filterbanks = frontend_op.audio_microfrontend(
             audio,
             sample_rate=SAMPLE_RATE,
             window_size=WINDOW_SIZE,
             window_step=WINDOW_STEP,
             num_channels=NUM_CHANNELS,
             upper_band_limit=UPPER_BAND_LIMIT,
             lower_band_limit=LOWER_BAND_LIMIT,
             smoothing_bits=SMOOTHING_BITS,
             enable_pcan=True,
             out_scale=64,
             out_type=tf.float32)
         self.assertAllEqual(filterbanks.eval(),
                             [[7.484375, 6.640625], [6.8125, 5.90625],
                              [6.40625, 5.46875], [6.109375, 5.078125]])
Example #11
 def testSimpleFloatScaled(self):
   with self.test_session():
     audio = tf.constant(
         [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
         tf.int16)
     filterbanks = frontend_op.audio_microfrontend(
         audio,
         sample_rate=SAMPLE_RATE,
         window_size=WINDOW_SIZE,
         window_step=WINDOW_STEP,
         num_channels=NUM_CHANNELS,
         upper_band_limit=UPPER_BAND_LIMIT,
         lower_band_limit=LOWER_BAND_LIMIT,
         smoothing_bits=SMOOTHING_BITS,
         enable_pcan=True,
         out_scale=64,
         out_type=tf.float32)
     self.assertAllEqual(filterbanks.eval(),
                         [[7.484375, 6.640625], [6.8125, 5.90625],
                          [6.40625, 5.46875], [6.109375, 5.078125]])
Example #12
def to_micro_spectrogram(
    audio,
    sample_rate: int = 16000,
    window_size_ms: int = 30,
    window_step_ms: int = 20,
    feature_bin_count: int = 40,
):
    int16_input = tf.cast(tf.multiply(audio, 32768), tf.int16)
    # https://git.io/Jkuux
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=feature_bin_count,
        out_scale=1,
        out_type=tf.float32,
    )
    output = tf.multiply(micro_frontend, (10.0 / 256.0))
    return output
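In eager mode the function above can be called directly. With the default 30 ms window, 20 ms step and 40 bins, a one-second clip should come out as roughly a (49, 40) feature map; silence is used purely for illustration:

import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import (
    audio_microfrontend_op as frontend_op)

audio = tf.zeros([16000], tf.float32)    # float waveform in [-1, 1]
features = to_micro_spectrogram(audio)
print(features.shape)                    # expected: (49, 40)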
Example #13
 def testZeroPadding(self):
   with self.test_session():
     audio = tf.constant(
         [0, 32767, 0, -32768] * ((WINDOW_SIZE + 7 * WINDOW_STEP) // 4),
         tf.int16)
     filterbanks = frontend_op.audio_microfrontend(
         audio,
         sample_rate=SAMPLE_RATE,
         window_size=WINDOW_SIZE,
         window_step=WINDOW_STEP,
         num_channels=NUM_CHANNELS,
         upper_band_limit=UPPER_BAND_LIMIT,
         lower_band_limit=LOWER_BAND_LIMIT,
         smoothing_bits=SMOOTHING_BITS,
         enable_pcan=True,
         left_context=2,
         frame_stride=3,
         zero_padding=True)
     self.assertAllEqual(
         self.evaluate(filterbanks),
         [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
          [374, 308, 362, 292, 352, 275]])
Example #14
 def testZeroPadding(self):
     with self.test_session():
         audio = tf.constant([0, 32767, 0, -32768] *
                             ((WINDOW_SIZE + 7 * WINDOW_STEP) // 4),
                             tf.int16)
         filterbanks = frontend_op.audio_microfrontend(
             audio,
             sample_rate=SAMPLE_RATE,
             window_size=WINDOW_SIZE,
             window_step=WINDOW_STEP,
             num_channels=NUM_CHANNELS,
             upper_band_limit=UPPER_BAND_LIMIT,
             lower_band_limit=LOWER_BAND_LIMIT,
             smoothing_bits=SMOOTHING_BITS,
             enable_pcan=True,
             left_context=2,
             frame_stride=3,
             zero_padding=True)
         self.assertAllEqual(
             filterbanks.eval(),
             [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
              [374, 308, 362, 292, 352, 275]])
Example #15
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Returns:
    Input and output tensor objects.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                    name='wav_data')
    decoded_sample_data = tf.audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = audio_ops.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = audio_ops.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running TensorFlow'
                ' directly from Python, you need to build and run through Bazel, for'
                ' example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
            )
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(tf.multiply(decoded_sample_data.audio, 32767),
                              tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))

    elif preprocess == "rune":
        fingerprint_input = np.random.uniform(0, 26, 1960).astype(np.float32)

    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    softmax = tf.nn.softmax(logits, name='labels_softmax')

    return reshaped_input, softmax
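A hedged sketch of running inference with the returned tensors; it assumes the speech_commands models and input_data modules are importable, that 'tiny_conv' is a valid architecture name for them, and that 'yes.wav' stands for any mono 16 kHz clip (a real run would restore trained weights from a checkpoint rather than initialize randomly):

import tensorflow as tf

tf.compat.v1.disable_eager_execution()
fingerprint_in, softmax_out = create_inference_graph(
    wanted_words='yes,no',
    sample_rate=16000,
    clip_duration_ms=1000,
    clip_stride_ms=30,
    window_size_ms=30.0,
    window_stride_ms=20.0,
    feature_bin_count=40,
    model_architecture='tiny_conv',
    preprocess='micro')

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    with open('yes.wav', 'rb') as f:          # hypothetical WAV path
        wav_bytes = f.read()
    # The raw WAV bytes are fed through the placeholder created with name='wav_data'.
    scores = sess.run(softmax_out, feed_dict={'wav_data:0': wav_bytes})
print(scores)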
Example #16
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

        Creates a graph that loads a WAVE file, decodes it, scales the volume,
        shifts it in time, adds in background noise, calculates a spectrogram, and
        then builds an MFCC fingerprint from that.

        This must be called with an active TensorFlow session running, and it
        creates multiple placeholder inputs, and one output:

          - wav_filename_placeholder_: Filename of the WAV to load.
          - foreground_volume_placeholder_: How loud the main clip should be.
          - time_shift_padding_placeholder_: Where to pad the clip.
          - time_shift_offset_placeholder_: How much to move the clip in time.
          - background_data_placeholder_: PCM sample data for background noise.
          - background_volume_placeholder_: Loudness of mixed-in background.
          - output_: Output 2D fingerprint of processed audio.

        Args:
          model_settings: Information about the current model being trained.
          summaries_dir: Path to save training summary information to.

        Raises:
          ValueError: If the preprocessing mode isn't recognized.
          Exception: If the preprocessor wasn't compiled in.
        """
        with tf.compat.v1.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True)
            # remove summary
            # tf.compat.v1.summary.image(
            #     'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                # tf.compat.v1.summary.image('shrunk_spectrogram',
                #                            self.output_,
                #                            max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=model_settings['fingerprint_width'])
                # tf.compat.v1.summary.image(
                #     'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                # tf.compat.v1.summary.image(
                #     'micro',
                #     tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                #     max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", '
                    ' "average", or "micro")' % (model_settings['preprocess']))
Example #17
    def prepare_processing_graph(self, flags):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = flags.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)

            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # signal resampling to generate more training data
            # it will stretch or squeeze input signal proportionally to:
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])

            if self.foreground_resampling_placeholder_ != 1.0:
                image = tf.expand_dims(wav_decoder.audio, 0)
                image = tf.expand_dims(image, 2)
                shape = tf.shape(wav_decoder.audio)
                image_resized = tf.image.resize(
                    images=image,
                    size=(tf.cast((tf.cast(shape[0], tf.float32) *
                                   self.foreground_resampling_placeholder_),
                                  tf.int32), 1),
                    preserve_aspect_ratio=False)
                image_resized_cropped = tf.image.resize_with_crop_or_pad(
                    image_resized,
                    target_height=desired_samples,
                    target_width=1,
                )
                image_resized_cropped = tf.squeeze(image_resized_cropped,
                                                   axis=[0, 3])
                scaled_foreground = tf.multiply(
                    image_resized_cropped, self.foreground_volume_placeholder_)
            else:
                scaled_foreground = tf.multiply(
                    wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            if flags.preprocess == 'raw':
                # background_clamp dims: [time, channels]
                # remove channel dim
                self.output_ = tf.squeeze(background_clamp, axis=1)
            # below options are for backward compatibility with previous
            # version of hotword detection on microcontrollers
            # in this case audio feature extraction is done separately from
            # neural net and user will have to manage it.
            elif flags.preprocess == 'mfcc':
                # Run the spectrogram and MFCC ops to get a 2D audio: Short-time FFTs
                # background_clamp dims: [time, channels]
                spectrogram = audio_ops.audio_spectrogram(
                    background_clamp,
                    window_size=flags.window_size_samples,
                    stride=flags.window_stride_samples,
                    magnitude_squared=flags.fft_magnitude_squared)
                # spectrogram: [channels/batch, frames, fft_feature]

                # extract mfcc features from spectrogram by audio_ops.mfcc:
                # 1 Input is spectrogram frames.
                # 2 Weighted spectrogram into bands using a triangular mel filterbank
                # 3 Logarithmic scaling
                # 4 Discrete cosine transform (DCT), return lowest dct_coefficient_count
                mfcc = audio_ops.mfcc(
                    spectrogram=spectrogram,
                    sample_rate=flags.sample_rate,
                    upper_frequency_limit=flags.mel_upper_edge_hertz,
                    lower_frequency_limit=flags.mel_lower_edge_hertz,
                    filterbank_channel_count=flags.mel_num_bins,
                    dct_coefficient_count=flags.dct_num_features)
                # mfcc: [channels/batch, frames, dct_coefficient_count]
                # remove channel dim
                self.output_ = tf.squeeze(mfcc, axis=0)
            elif flags.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                int16_input = tf.cast(
                    tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16)
                # audio_microfrontend does:
                # 1. A slicing window function of raw audio
                # 2. Short-time FFTs
                # 3. Filterbank calculations
                # 4. Noise reduction
                # 5. PCAN Auto Gain Control
                # 6. Logarithmic scaling

                # int16_input dims: [time, channels]
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=flags.sample_rate,
                    window_size=flags.window_size_ms,
                    window_step=flags.window_stride_ms,
                    num_channels=flags.mel_num_bins,
                    upper_band_limit=flags.mel_upper_edge_hertz,
                    lower_band_limit=flags.mel_lower_edge_hertz,
                    out_scale=1,
                    out_type=tf.float32)
                # micro_frontend dims: [frames, num_channels]
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "raw", '
                    ' "mfcc", or "micro")' % (flags.preprocess))
Example #18
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
  """Creates an audio model with the nodes needed for inference.

  Uses the supplied arguments to create a model, and inserts the input and
  output nodes that are needed to use the graph for inference.

  Args:
    wanted_words: Comma-separated list of the words we're trying to recognize.
    sample_rate: How many samples per second are in the input audio files.
    clip_duration_ms: How many samples to analyze for the audio pattern.
    clip_stride_ms: How often to run recognition. Useful for models with cache.
    window_size_ms: Time slice duration to estimate frequencies from.
    window_stride_ms: How far apart time slices should be.
    feature_bin_count: Number of frequency bands to analyze.
    model_architecture: Name of the kind of model to generate.
    preprocess: How the spectrogram is processed to produce features, for
      example 'mfcc', 'average', or 'micro'.

  Raises:
    Exception: If the preprocessing mode isn't recognized.
  """

  words_list = input_data.prepare_words_list(wanted_words.split(','))
  model_settings = models.prepare_model_settings(
      len(words_list), sample_rate, clip_duration_ms, window_size_ms,
      window_stride_ms, feature_bin_count, preprocess)
  runtime_settings = {'clip_stride_ms': clip_stride_ms}

  wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
  decoded_sample_data = contrib_audio.decode_wav(
      wav_data_placeholder,
      desired_channels=1,
      desired_samples=model_settings['desired_samples'],
      name='decoded_sample_data')
  spectrogram = contrib_audio.audio_spectrogram(
      decoded_sample_data.audio,
      window_size=model_settings['window_size_samples'],
      stride=model_settings['window_stride_samples'],
      magnitude_squared=True)

  if preprocess == 'average':
    fingerprint_input = tf.nn.pool(
        tf.expand_dims(spectrogram, -1),
        window_shape=[1, model_settings['average_window_width']],
        strides=[1, model_settings['average_window_width']],
        pooling_type='AVG',
        padding='SAME')
  elif preprocess == 'mfcc':
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        sample_rate,
        dct_coefficient_count=model_settings['fingerprint_width'])
  elif preprocess == 'micro':
    if not frontend_op:
      raise Exception(
          'Micro frontend op is currently not available when running TensorFlow'
          ' directly from Python, you need to build and run through Bazel, for'
          ' example'
          ' `bazel run tensorflow/examples/speech_commands:freeze_graph`'
      )
    sample_rate = model_settings['sample_rate']
    window_size_ms = (model_settings['window_size_samples'] *
                      1000) / sample_rate
    window_step_ms = (model_settings['window_stride_samples'] *
                      1000) / sample_rate
    int16_input = tf.cast(
        tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings['fingerprint_width'],
        out_scale=1,
        out_type=tf.float32)
    fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
  else:
    raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (preprocess))

  fingerprint_size = model_settings['fingerprint_size']
  reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

  logits = models.create_model(
      reshaped_input, model_settings, model_architecture, is_training=False,
      runtime_settings=runtime_settings)

  # Create an output to use for inference.
  tf.nn.softmax(logits, name='labels_softmax')
Example #19
    def prepare_processing_graph(self, data_settings):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      data_settings: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.get_default_graph().name_scope('data'):
            desired_samples = data_settings.desired_samples
            self.wav_filename_placeholder_ = tf.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='foreground_volume')
            # signal resampling to generate more training data
            # it will stretch or squeeze input signal proportionally to:
            self.foreground_resampling_placeholder_ = tf.placeholder(
                tf.float32, [])

            if self.foreground_resampling_placeholder_ != 1.0:
                image = tf.expand_dims(wav_decoder.audio, 0)
                image = tf.expand_dims(image, 2)
                shape = tf.shape(wav_decoder.audio)
                image_resized = tf.image.resize(
                    images=image,
                    size=(tf.cast((tf.cast(shape[0], tf.float32) *
                                   self.foreground_resampling_placeholder_),
                                  tf.int32), 1),
                    preserve_aspect_ratio=False)
                image_resized_cropped = tf.image.resize_with_crop_or_pad(
                    image_resized,
                    target_height=desired_samples,
                    target_width=1,
                )
                image_resized_cropped = tf.squeeze(image_resized_cropped,
                                                   axis=[0, 3])
                scaled_foreground = tf.multiply(
                    image_resized_cropped, self.foreground_volume_placeholder_)
            else:
                scaled_foreground = tf.multiply(
                    wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            if data_settings.preprocess == 'raw':
                # return raw audio
                self.output_ = background_clamp
                tf.summary.image('input_audio',
                                 tf.expand_dims(
                                     tf.expand_dims(background_clamp, -1), -1),
                                 max_outputs=1)
            else:
                # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'
                spectrogram = audio_ops.audio_spectrogram(
                    background_clamp,
                    window_size=data_settings.window_size_samples,
                    stride=data_settings.window_stride_samples,
                    magnitude_squared=True)
                tf.summary.image('spectrogram',
                                 tf.expand_dims(spectrogram, -1),
                                 max_outputs=1)
                # The number of buckets in each FFT row in the spectrogram will depend
                # on how many input samples there are in each window. This can be quite
                # large, with a 160 sample window producing 127 buckets for example. We
                # don't need this level of detail for classification, so we often want
                # to shrink them down to produce a smaller result. That's what this
                # section implements. One method is to use average pooling to merge
                # adjacent buckets, but a more sophisticated approach is to apply the
                # MFCC algorithm to shrink the representation.
                if data_settings.preprocess == 'average':
                    self.output_ = tf.nn.pool(
                        input=tf.expand_dims(spectrogram, -1),
                        window_shape=[1, data_settings.average_window_width],
                        strides=[1, data_settings.average_window_width],
                        pooling_type='AVG',
                        padding='SAME')
                    tf.summary.image('shrunk_spectrogram',
                                     self.output_,
                                     max_outputs=1)
                elif data_settings.preprocess == 'mfcc':
                    self.output_ = audio_ops.mfcc(
                        spectrogram,
                        wav_decoder.sample_rate,
                        dct_coefficient_count=data_settings.fingerprint_width)
                    tf.summary.image('mfcc',
                                     tf.expand_dims(self.output_, -1),
                                     max_outputs=1)
                elif data_settings.preprocess == 'micro':
                    if not frontend_op:
                        raise Exception(
                            'Micro frontend op is currently not available when running'
                            ' TensorFlow directly from Python, you need to build and run'
                            ' through Bazel')
                    sample_rate = data_settings.sample_rate
                    window_size_ms = (data_settings.window_size_samples *
                                      1000) / sample_rate
                    window_step_ms = (data_settings.window_stride_samples *
                                      1000) / sample_rate
                    int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                          tf.int16)
                    micro_frontend = frontend_op.audio_microfrontend(
                        int16_input,
                        sample_rate=sample_rate,
                        window_size=window_size_ms,
                        window_step=window_step_ms,
                        num_channels=data_settings.fingerprint_width,
                        out_scale=1,
                        out_type=tf.float32)
                    self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                    tf.summary.image('micro',
                                     tf.expand_dims(
                                         tf.expand_dims(self.output_, -1), 0),
                                     max_outputs=1)
                else:
                    raise ValueError(
                        'Unknown preprocess mode "%s" (should be "mfcc", '
                        ' "average", or "micro")' % (data_settings.preprocess))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.summary.merge_all(scope='data')
            if data_settings.summaries_dir:
                self.summary_writer_ = tf.summary.FileWriter(
                    data_settings.summaries_dir + '/data',
                    tf.get_default_graph())
Example #20
    def prepare_processing_graph(self, model_settings, summaries_dir):
        """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
        with tf.compat.v1.get_default_graph().name_scope('data'):
            desired_samples = model_settings['desired_samples']
            self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
                tf.string, [], name='wav_filename')
            wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
            wav_decoder = tf.audio.decode_wav(wav_loader,
                                              desired_channels=1,
                                              desired_samples=desired_samples)
            # Allow the audio sample's volume to be adjusted.
            self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='foreground_volume')
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
            # Shift the sample's start position, and pad any gaps with zeros.
            self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2, 2], name='time_shift_padding')
            self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
                tf.int32, [2], name='time_shift_offset')
            padded_foreground = tf.pad(
                tensor=scaled_foreground,
                paddings=self.time_shift_padding_placeholder_,
                mode='CONSTANT')
            sliced_foreground = tf.slice(padded_foreground,
                                         self.time_shift_offset_placeholder_,
                                         [desired_samples, -1])
            # Mix in background noise.
            self.background_data_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [desired_samples, 1], name='background_data')
            self.background_volume_placeholder_ = tf.compat.v1.placeholder(
                tf.float32, [], name='background_volume')
            background_mul = tf.multiply(self.background_data_placeholder_,
                                         self.background_volume_placeholder_)
            background_add = tf.add(background_mul, sliced_foreground)
            background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

            # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.

            # spectrogram = audio_ops.audio_spectrogram(
            #     background_clamp,
            #     window_size=model_settings['window_size_samples'],
            #     stride=model_settings['window_stride_samples'],
            #     magnitude_squared=True)

            def periodic_hann_window(window_length, dtype):
                return 0.5 - 0.5 * tf.math.cos(2.0 * np.pi * tf.range(
                    tf.cast(window_length, dtype=dtype),
                    dtype=dtype) / tf.cast(window_length, dtype=dtype))

            signal_stft = tf.signal.stft(
                tf.transpose(background_clamp, [1, 0]),
                frame_length=model_settings['window_size_samples'],
                frame_step=model_settings['window_stride_samples'],
                window_fn=periodic_hann_window)
            signal_spectrograms = tf.abs(signal_stft)
            spectrogram = signal_spectrograms

            tf.compat.v1.summary.image('spectrogram',
                                       tf.expand_dims(spectrogram, -1),
                                       max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend on
            # how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want to
            # shrink them down to produce a smaller result. That's what this section
            # implements. One method is to use average pooling to merge adjacent
            # buckets, but a more sophisticated approach is to apply the MFCC
            # algorithm to shrink the representation.
            if model_settings['preprocess'] == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, model_settings['average_window_width']],
                    strides=[1, model_settings['average_window_width']],
                    pooling_type='AVG',
                    padding='SAME')
                tf.compat.v1.summary.image('shrunk_spectrogram',
                                           self.output_,
                                           max_outputs=1)
            elif model_settings['preprocess'] == 'fbank':
                # We just convert the data back to int16 wav format
                # and the actual filterbank processing is performed outside of tensorflow graph
                # in the get_data function
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                # def compute_fbs(int16_wav_input):
                #     fbs, energy = fbank(int16_wav_input, model_settings['sample_rate'],
                #                         nfilt=model_settings['fingerprint_width'],
                #                         winstep=model_settings['window_stride_samples'] / model_settings['sample_rate'],
                #                         winlen=model_settings['window_size_samples'] / model_settings['sample_rate'],
                #                         nfft=1024,
                #                         lowfreq=64)
                #     fbs = np.log(fbs)
                #     energy = np.log(energy)
                #     return np.concatenate([fbs, energy[:, None]], axis=1)
                #
                # log_fbs_with_energy = compute_fbs(int16_input)
                self.output_ = int16_input
                # tf.compat.v1.summary.image(
                #     'fbank', tf.expand_dims(self.output_, -1), max_outputs=1)
            elif model_settings['preprocess'] == 'mfcc':

                # signal_mfccs = audio_ops.mfcc(
                #     spectrogram,
                #     # tf.expand_dims(signal_spectrograms, 0),
                #     wav_decoder.sample_rate,
                #     dct_coefficient_count=model_settings['fingerprint_width'])
                #
                # self.output_ = signal_mfccs
                # print("OLD", signal_mfccs.shape)

                num_spectrogram_bins = signal_stft.shape[-1]

                num_mel_bins = num_mfccs = model_settings['fingerprint_width']
                lower_edge_hertz = 20.0
                upper_edge_hertz = 4000.0
                log_noise_floor = 1e-12
                linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                    num_mel_bins,
                    num_spectrogram_bins,
                    model_settings['sample_rate'],
                    # lower_edge_hertz, upper_edge_hertz  (not passed, so the
                    # op's defaults of 125 Hz / 3800 Hz are used instead)
                )
                mel_spectrograms = tf.tensordot(spectrogram,
                                                linear_to_mel_weight_matrix, 1)
                mel_spectrograms.set_shape(
                    mel_spectrograms.shape[:-1].concatenate(
                        linear_to_mel_weight_matrix.shape[-1:]))

                log_mel_spectrograms = tf.math.log(mel_spectrograms +
                                                   log_noise_floor)
                signal_mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                    log_mel_spectrograms)[..., :num_mfccs]
                # print("NEW", signal_mfccs.shape)

                self.output_ = signal_mfccs

                tf.compat.v1.summary.image('mfcc',
                                           tf.expand_dims(self.output_, -1),
                                           max_outputs=1)
            elif model_settings['preprocess'] == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = model_settings['sample_rate']
                window_size_ms = (model_settings['window_size_samples'] *
                                  1000) / sample_rate
                window_step_ms = (model_settings['window_stride_samples'] *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=model_settings['fingerprint_width'],
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.compat.v1.summary.image(
                    'micro',
                    tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                    max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc", "fbank", '
                    '"average", or "micro")' % (model_settings['preprocess']))

            # Merge all the summaries and write them out to /tmp/retrain_logs (by
            # default)
            self.merged_summaries_ = tf.compat.v1.summary.merge_all(
                scope='data')
            if summaries_dir:
                self.summary_writer_ = tf.compat.v1.summary.FileWriter(
                    summaries_dir + '/data', tf.compat.v1.get_default_graph())
Example #21
0
  def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
      desired_samples = model_settings['desired_samples']
      self.wav_filename_placeholder_ = tf.placeholder(
          tf.string, [], name='wav_filename')
      wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
      wav_decoder = contrib_audio.decode_wav(
          wav_loader, desired_channels=1, desired_samples=desired_samples)
      # Allow the audio sample's volume to be adjusted.
      self.foreground_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='foreground_volume')
      scaled_foreground = tf.multiply(wav_decoder.audio,
                                      self.foreground_volume_placeholder_)
      # Shift the sample's start position, and pad any gaps with zeros.
      self.time_shift_padding_placeholder_ = tf.placeholder(
          tf.int32, [2, 2], name='time_shift_padding')
      self.time_shift_offset_placeholder_ = tf.placeholder(
          tf.int32, [2], name='time_shift_offset')
      padded_foreground = tf.pad(
          scaled_foreground,
          self.time_shift_padding_placeholder_,
          mode='CONSTANT')
      sliced_foreground = tf.slice(padded_foreground,
                                   self.time_shift_offset_placeholder_,
                                   [desired_samples, -1])
      # Mix in background noise.
      self.background_data_placeholder_ = tf.placeholder(
          tf.float32, [desired_samples, 1], name='background_data')
      self.background_volume_placeholder_ = tf.placeholder(
          tf.float32, [], name='background_volume')
      background_mul = tf.multiply(self.background_data_placeholder_,
                                   self.background_volume_placeholder_)
      background_add = tf.add(background_mul, sliced_foreground)
      background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
      # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
      spectrogram = contrib_audio.audio_spectrogram(
          background_clamp,
          window_size=model_settings['window_size_samples'],
          stride=model_settings['window_stride_samples'],
          magnitude_squared=True)
      tf.summary.image(
          'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
      # The number of buckets in each FFT row in the spectrogram will depend on
      # how many input samples there are in each window. This can be quite
      # large, with a 160 sample window producing 127 buckets for example. We
      # don't need this level of detail for classification, so we often want to
      # shrink them down to produce a smaller result. That's what this section
      # implements. One method is to use average pooling to merge adjacent
      # buckets, but a more sophisticated approach is to apply the MFCC
      # algorithm to shrink the representation.
      if model_settings['preprocess'] == 'average':
        self.output_ = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
      elif model_settings['preprocess'] == 'mfcc':
        self.output_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
        tf.summary.image(
            'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
      elif model_settings['preprocess'] == 'micro':
        if not frontend_op:
          raise Exception(
              'Micro frontend op is currently not available when running'
              ' TensorFlow directly from Python, you need to build and run'
              ' through Bazel'
          )
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
        tf.summary.image(
            'micro',
            tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
            max_outputs=1)
      else:
        raise ValueError(
            'Unknown preprocess mode "%s" (should be "mfcc", '
            ' "average", or "micro")' % (model_settings['preprocess']))

      # Merge all the summaries and write them out to /tmp/retrain_logs (by
      # default)
      self.merged_summaries_ = tf.summary.merge_all(scope='data')
      if summaries_dir:
        self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                     tf.get_default_graph())
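As a usage sketch for the graph built by prepare_processing_graph above, the snippet below assumes TF 1.x-style graph execution, an AudioProcessor-like object `proc` on which the method is called, and illustrative values for `model_settings` and the WAV path; none of these names come from the original code.

import numpy as np
import tensorflow as tf

with tf.Session() as sess:
    # The docstring requires an active session while the graph is built.
    proc.prepare_processing_graph(model_settings, summaries_dir=None)
    desired_samples = model_settings['desired_samples']
    fingerprint = sess.run(
        proc.output_,
        feed_dict={
            proc.wav_filename_placeholder_: '/tmp/example.wav',  # placeholder path
            proc.foreground_volume_placeholder_: 1.0,  # full foreground volume
            proc.time_shift_padding_placeholder_: [[0, 0], [0, 0]],  # no padding
            proc.time_shift_offset_placeholder_: [0, 0],  # no time shift
            proc.background_data_placeholder_: np.zeros(
                (desired_samples, 1), dtype=np.float32),  # silent background
            proc.background_volume_placeholder_: 0.0,
        })
    # Output shape depends on the preprocess mode, e.g. roughly
    # (1, num_frames, fingerprint_width) for 'mfcc'.
    print(fingerprint.shape)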