import tensorflow as tf
from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op


def wavds2specds(ds_wav, Flags):
    """Convert a dataset of waveforms into a dataset of spectrograms."""
    specgrams = []
    labels = []
    for cnt, (wav, label) in enumerate(ds_wav):
        if wav.shape != (16000,) or label.shape != ():
            print(f"In loop: shape is wrong at {cnt}: {wav.shape}, {label.shape}")
        spec = frontend_op.audio_microfrontend(
            wav,
            sample_rate=Flags.sample_rate,
            window_size=Flags.window_size_ms,
            window_step=Flags.window_stride_ms,
            num_channels=Flags.dct_coefficient_count)
        spec = tf.cast(spec, 'float32') / 1000.0
        specgrams.append(spec)
        # label = keras.utils.to_categorical(label, num_classes)
        labels.append(label)
        if (cnt % 250) == 0:
            print(f"Converted {cnt} samples to spectrogram")
    print(f"Finished converting {cnt} samples to spectrogram.")
    ds_specs = tf.data.Dataset.from_tensor_slices((specgrams, labels))
    return ds_specs
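# A minimal usage sketch for wavds2specds (added here, not part of the
# original source). It assumes a tf.data.Dataset of (16000-sample int16
# waveform, scalar label) pairs and uses a SimpleNamespace as a stand-in for
# the Flags object; the attribute names simply mirror what the function reads.
from types import SimpleNamespace

flags = SimpleNamespace(sample_rate=16000,
                        window_size_ms=30,
                        window_stride_ms=20,
                        dct_coefficient_count=40)
dummy_wavs = tf.zeros([4, 16000], dtype=tf.int16)
dummy_labels = tf.zeros([4], dtype=tf.int64)
ds_wav = tf.data.Dataset.from_tensor_slices((dummy_wavs, dummy_labels))
ds_spec = wavds2specds(ds_wav, flags)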
def Micro_process(sample_rate=16000, window_size=480, window_stride=320, input_width=40):
    wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, [], name="wav_name")
    wav_loader = io_ops.read_file(wav_filename_placeholder, name="reader_reader")
    wav_decoder = tf.audio.decode_wav(wav_loader,
                                      desired_channels=1,
                                      desired_samples=sample_rate,
                                      name="wav_decoder")
    # background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
    window_size = (window_size * 1000) / sample_rate
    window_step = (window_stride * 1000) / sample_rate
    int16_input = tf.cast(tf.multiply(wav_decoder.audio, 32768), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(int16_input,
                                                     sample_rate=sample_rate,
                                                     window_size=window_size,
                                                     window_step=window_step,
                                                     num_channels=input_width,
                                                     out_scale=1,
                                                     out_type=tf.float32)
    mfcc = tf.multiply(micro_frontend, (10.0 / 256.0))
    return mfcc, wav_filename_placeholder
def run_Micro_process(filename, sess, input_width=40, window_size_samples=480,
                      window_stride_samples=320, sample_rate=16000):
    wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, [], name="wav_name")
    wav_loader = io_ops.read_file(wav_filename_placeholder, name="reader_reader")
    wav_decoder = tf.audio.decode_wav(wav_loader,
                                      desired_channels=1,
                                      desired_samples=sample_rate,
                                      name="wav_decoder")
    window_size = (window_size_samples * 1000) / sample_rate
    window_step = (window_stride_samples * 1000) / sample_rate
    int16_input = tf.cast(tf.multiply(wav_decoder.audio, 32768), tf.int16)
    micro_frontend = frontend_op.audio_microfrontend(int16_input,
                                                     sample_rate=sample_rate,
                                                     window_size=window_size,
                                                     window_step=window_step,
                                                     num_channels=input_width,
                                                     out_scale=1,
                                                     out_type=tf.float32)
    mfcc = tf.multiply(micro_frontend, (10.0 / 256.0))
    tf.compat.v1.summary.image('micro',
                               tf.expand_dims(tf.expand_dims(mfcc, -1), 0),
                               max_outputs=1)
    return sess.run(mfcc, feed_dict={wav_filename_placeholder: filename}).flatten()
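# A sketch (not in the original) of driving run_Micro_process with a TF1-style
# session; "some_keyword.wav" is a placeholder path for a 1-second, 16 kHz
# mono WAV, and io_ops/frontend_op are assumed to be imported as in the
# surrounding file.
tf.compat.v1.disable_eager_execution()
with tf.compat.v1.Session() as sess:
    features = run_Micro_process("some_keyword.wav", sess)
    print(features.shape)  # 49 frames x 40 channels, flattened to (1960,)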
def prepare_tf_micro_spectrogram_computer():
    sample_rate = 16000
    # Step 1: Windowing
    window_size_samples = 480
    window_stride_samples = 320
    # Step 3: Mel-spec
    num_channels = 40
    lower_band_limit = 0.0
    upper_band_limit = 7999.0
    # Step 4: Smoothing
    min_signal_remaining = 0.05
    smoothing_bits = 10
    even_smoothing = 0.025
    odd_smoothing = 0.06
    # Step 5: PCAN auto gain control
    enable_pcan = True
    pcan_strength = 0.95
    pcan_offset = 80.0
    gain_bits = 21
    # Step 6: log-scaling
    scale_shift = 6
    enable_log = True

    tf.compat.v1.reset_default_graph()
    tf.compat.v1.disable_eager_execution()
    window_size_ms = window_size_samples * 1000 / sample_rate
    window_step_ms = window_stride_samples * 1000 / sample_rate
    with tf.compat.v1.get_default_graph().name_scope('data'):
        wav_signal_placeholder = tf.compat.v1.placeholder(
            tf.int16, shape=16000, name='wav_signal_placeholder')
        micro_frontend = frontend_op.audio_microfrontend(
            wav_signal_placeholder,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=num_channels,
            upper_band_limit=upper_band_limit,
            lower_band_limit=lower_band_limit,
            min_signal_remaining=min_signal_remaining,
            smoothing_bits=smoothing_bits,
            even_smoothing=even_smoothing,
            odd_smoothing=odd_smoothing,
            enable_pcan=enable_pcan,
            pcan_strength=pcan_strength,
            pcan_offset=pcan_offset,
            gain_bits=gain_bits,
            enable_log=enable_log,
            scale_shift=scale_shift,
            out_scale=1,
            out_type=tf.float32)
        output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
    return wav_signal_placeholder, output_
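# A minimal usage sketch for prepare_tf_micro_spectrogram_computer (an
# addition, not part of the original source). The function builds a TF1 graph
# and disables eager execution itself, so it is driven through a session; the
# zero-valued waveform is only a stand-in for real 16 kHz int16 audio.
import numpy as np

wav_in, spec_out = prepare_tf_micro_spectrogram_computer()
with tf.compat.v1.Session() as sess:
    spec = sess.run(spec_out,
                    feed_dict={wav_in: np.zeros(16000, dtype=np.int16)})
    print(spec.shape)  # roughly (49, 40) with the settings above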
def spec_feats(sample_dict):
    """Runs the TFLite microfrontend and returns a spectrogram."""
    audio = sample_dict['audio']
    label = sample_dict['label']
    paddings = [[0, 16000 - tf.shape(audio)[0]]]
    audio = tf.pad(audio, paddings)
    audio16 = tf.cast(audio, 'int16')
    spec = frontend_op.audio_microfrontend(audio16,
                                           sample_rate=16000,
                                           window_size=40,
                                           window_step=20,
                                           num_channels=40)
    return spec, label
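# A sketch (not from the original) of mapping spec_feats over the TFDS
# 'speech_commands' split, whose examples are dicts with 'audio' and 'label'
# keys matching what spec_feats expects; using TFDS here is an assumption
# about where sample_dict comes from.
import tensorflow_datasets as tfds

ds = tfds.load('speech_commands', split='train')
ds_spec = ds.map(spec_feats, num_parallel_calls=tf.data.AUTOTUNE)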
def get_features(input_data, model_settings):
    if model_settings['preprocess'] == 'micro':
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / model_settings['sample_rate']
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / model_settings['sample_rate']
        int16_input = tf.cast(tf.multiply(input_data, 32768), tf.int16)
        # print(int16_input.shape)
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=model_settings['sample_rate'],
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        output = tf.multiply(micro_frontend, (10.0 / 256.0))
        return output
    elif model_settings['preprocess'] == 'mfcc':
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/AudioSpectrogram
        spectrogram = audio_ops.audio_spectrogram(
            input_data,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        output = audio_ops.mfcc(
            spectrogram,
            model_settings['sample_rate'],
            dct_coefficient_count=model_settings['fingerprint_width'])
        return output[0, :, :]  # just return channel 0 as a 2D tensor
    elif model_settings['preprocess'] == 'average':
        spectrogram = audio_ops.audio_spectrogram(
            input_data,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        output = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        return output[0, :, :, 0]  # just return channel 0 as a 2D tensor
    else:
        raise ValueError(f'Unknown model setting: {model_settings["preprocess"]}')
def get_spectrogram(waveform):
    # wave_length_samps, i16max, i16min, fsamp, num_channels, window_size_ms and
    # window_step_ms are constants defined elsewhere in the original script.
    # Concatenate audio with padding so that all audio clips will be of the
    # same length (16000 samples).
    zero_padding = tf.zeros([wave_length_samps] - tf.shape(waveform), dtype=tf.int16)
    # scale float [-1, +1] => int16
    waveform = tf.cast(0.5 * waveform * (i16max - i16min), tf.int16)
    equal_length = tf.concat([waveform, zero_padding], 0)
    # Make sure these settings correspond to those used in
    # micro_features_micro_features_generator.cpp
    spectrogram = frontend_op.audio_microfrontend(equal_length,
                                                  sample_rate=fsamp,
                                                  num_channels=num_channels,
                                                  window_size=window_size_ms,
                                                  window_step=window_step_ms)
    return spectrogram
def testSimple(self):
    with self.test_session():
        audio = tf.constant(
            [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
            tf.int16)
        filterbanks = frontend_op.audio_microfrontend(
            audio,
            sample_rate=SAMPLE_RATE,
            window_size=WINDOW_SIZE,
            window_step=WINDOW_STEP,
            num_channels=NUM_CHANNELS,
            upper_band_limit=UPPER_BAND_LIMIT,
            lower_band_limit=LOWER_BAND_LIMIT,
            smoothing_bits=SMOOTHING_BITS,
            enable_pcan=True)
        self.assertAllEqual(filterbanks.eval(),
                            [[479, 425], [436, 378], [410, 350], [391, 325]])
def to_micro_spectrogram(model_settings, audio):
    sample_rate = model_settings["sample_rate"]
    window_size_ms = (model_settings["window_size_samples"] * 1000) / sample_rate
    window_step_ms = (model_settings["window_stride_samples"] * 1000) / sample_rate
    int16_input = tf.cast(tf.multiply(audio, 32768), tf.int16)
    # https://git.io/Jkuux
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=model_settings["fingerprint_width"],
        out_scale=1,
        out_type=tf.float32,
    )
    output = tf.multiply(micro_frontend, (10.0 / 256.0))
    return output
def testSimpleFloatScaled(self):
    with self.test_session():
        audio = tf.constant(
            [0, 32767, 0, -32768] * ((WINDOW_SIZE + 4 * WINDOW_STEP) // 4),
            tf.int16)
        filterbanks = frontend_op.audio_microfrontend(
            audio,
            sample_rate=SAMPLE_RATE,
            window_size=WINDOW_SIZE,
            window_step=WINDOW_STEP,
            num_channels=NUM_CHANNELS,
            upper_band_limit=UPPER_BAND_LIMIT,
            lower_band_limit=LOWER_BAND_LIMIT,
            smoothing_bits=SMOOTHING_BITS,
            enable_pcan=True,
            out_scale=64,
            out_type=tf.float32)
        self.assertAllEqual(filterbanks.eval(),
                            [[7.484375, 6.640625], [6.8125, 5.90625],
                             [6.40625, 5.46875], [6.109375, 5.078125]])
def to_micro_spectrogram(
    audio,
    sample_rate: int = 16000,
    window_size_ms: int = 30,
    window_step_ms: int = 20,
    feature_bin_count: int = 40,
):
    int16_input = tf.cast(tf.multiply(audio, 32768), tf.int16)
    # https://git.io/Jkuux
    micro_frontend = frontend_op.audio_microfrontend(
        int16_input,
        sample_rate=sample_rate,
        window_size=window_size_ms,
        window_step=window_step_ms,
        num_channels=feature_bin_count,
        out_scale=1,
        out_type=tf.float32,
    )
    output = tf.multiply(micro_frontend, (10.0 / 256.0))
    return output
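# A small sketch (not from the original) showing to_micro_spectrogram applied
# to one second of float audio in [-1, 1]; the random waveform is just a
# stand-in for a decoded WAV clip.
waveform = tf.random.uniform([16000, 1], minval=-1.0, maxval=1.0)
features = to_micro_spectrogram(waveform)
print(features.shape)  # about (49, 40) for 30 ms windows with a 20 ms step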
def testZeroPadding(self):
    with self.test_session():
        audio = tf.constant(
            [0, 32767, 0, -32768] * ((WINDOW_SIZE + 7 * WINDOW_STEP) // 4),
            tf.int16)
        filterbanks = frontend_op.audio_microfrontend(
            audio,
            sample_rate=SAMPLE_RATE,
            window_size=WINDOW_SIZE,
            window_step=WINDOW_STEP,
            num_channels=NUM_CHANNELS,
            upper_band_limit=UPPER_BAND_LIMIT,
            lower_band_limit=LOWER_BAND_LIMIT,
            smoothing_bits=SMOOTHING_BITS,
            enable_pcan=True,
            left_context=2,
            frame_stride=3,
            zero_padding=True)
        self.assertAllEqual(
            self.evaluate(filterbanks),
            [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
             [374, 308, 362, 292, 352, 275]])
def testZeroPadding(self):
    with self.test_session():
        audio = tf.constant(
            [0, 32767, 0, -32768] * ((WINDOW_SIZE + 7 * WINDOW_STEP) // 4),
            tf.int16)
        filterbanks = frontend_op.audio_microfrontend(
            audio,
            sample_rate=SAMPLE_RATE,
            window_size=WINDOW_SIZE,
            window_step=WINDOW_STEP,
            num_channels=NUM_CHANNELS,
            upper_band_limit=UPPER_BAND_LIMIT,
            lower_band_limit=LOWER_BAND_LIMIT,
            smoothing_bits=SMOOTHING_BITS,
            enable_pcan=True,
            left_context=2,
            frame_stride=3,
            zero_padding=True)
        self.assertAllEqual(
            filterbanks.eval(),
            [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
             [374, 308, 362, 292, 352, 275]])
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc', 'average', or 'micro'.

    Returns:
      Input and output tensor objects.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.compat.v1.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = tf.audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = audio_ops.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = audio_ops.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running TensorFlow'
                ' directly from Python, you need to build and run through Bazel, for'
                ' example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(tf.multiply(decoded_sample_data.audio, 32767),
                              tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
    elif preprocess == "rune":
        fingerprint_input = np.random.uniform(0, 26, 1960).astype(np.float32)
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", "micro", or "rune")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    softmax = tf.nn.softmax(logits, name='labels_softmax')

    return reshaped_input, softmax
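# A sketch (not part of the original) of how create_inference_graph is
# typically driven from a freeze-style script; 'yes,no' and the 'conv'
# architecture are placeholder arguments, and input_data/models refer to the
# speech_commands modules imported by the surrounding file.
tf.compat.v1.reset_default_graph()
input_tensor, softmax_tensor = create_inference_graph(
    wanted_words='yes,no',
    sample_rate=16000,
    clip_duration_ms=1000,
    clip_stride_ms=30,
    window_size_ms=30.0,
    window_stride_ms=20.0,
    feature_bin_count=40,
    model_architecture='conv',
    preprocess='micro')
# From here a training checkpoint would be restored and the graph frozen.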
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.compat.v1.get_default_graph().name_scope('data'):
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = audio_ops.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        # remove summary
        # tf.compat.v1.summary.image(
        #     'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend on
        # how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want to
        # shrink them down to produce a smaller result. That's what this section
        # implements. One method is to use average pooling to merge adjacent
        # buckets, but a more sophisticated approach is to apply the MFCC
        # algorithm to shrink the representation.
        if model_settings['preprocess'] == 'average':
            self.output_ = tf.nn.pool(
                input=tf.expand_dims(spectrogram, -1),
                window_shape=[1, model_settings['average_window_width']],
                strides=[1, model_settings['average_window_width']],
                pooling_type='AVG',
                padding='SAME')
            # tf.compat.v1.summary.image('shrunk_spectrogram',
            #                            self.output_,
            #                            max_outputs=1)
        elif model_settings['preprocess'] == 'mfcc':
            self.output_ = audio_ops.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            # tf.compat.v1.summary.image(
            #     'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        elif model_settings['preprocess'] == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            sample_rate = model_settings['sample_rate']
            window_size_ms = (model_settings['window_size_samples'] *
                              1000) / sample_rate
            window_step_ms = (model_settings['window_stride_samples'] *
                              1000) / sample_rate
            int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=sample_rate,
                window_size=window_size_ms,
                window_step=window_step_ms,
                num_channels=model_settings['fingerprint_width'],
                out_scale=1,
                out_type=tf.float32)
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            # tf.compat.v1.summary.image(
            #     'micro',
            #     tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
            #     max_outputs=1)
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
                             ' "average", or "micro")' %
                             (model_settings['preprocess']))
def prepare_processing_graph(self, flags):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = flags.desired_samples
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        # Signal resampling to generate more training data.
        # It will stretch or squeeze the input signal proportionally to:
        self.foreground_resampling_placeholder_ = tf.placeholder(tf.float32, [])

        if self.foreground_resampling_placeholder_ != 1.0:
            image = tf.expand_dims(wav_decoder.audio, 0)
            image = tf.expand_dims(image, 2)
            shape = tf.shape(wav_decoder.audio)
            image_resized = tf.image.resize(
                images=image,
                size=(tf.cast((tf.cast(shape[0], tf.float32) *
                               self.foreground_resampling_placeholder_),
                              tf.int32), 1),
                preserve_aspect_ratio=False)
            image_resized_cropped = tf.image.resize_with_crop_or_pad(
                image_resized,
                target_height=desired_samples,
                target_width=1,
            )
            image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3])
            scaled_foreground = tf.multiply(image_resized_cropped,
                                            self.foreground_volume_placeholder_)
        else:
            scaled_foreground = tf.multiply(wav_decoder.audio,
                                            self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        if flags.preprocess == 'raw':
            # background_clamp dims: [time, channels]
            # remove channel dim
            self.output_ = tf.squeeze(background_clamp, axis=1)
        # The options below are kept for backward compatibility with the
        # previous version of hotword detection on microcontrollers; in that
        # case audio feature extraction is done separately from the neural net
        # and the user has to manage it.
        elif flags.preprocess == 'mfcc':
            # Run the spectrogram and MFCC ops to get 2D audio: short-time FFTs.
            # background_clamp dims: [time, channels]
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=flags.window_size_samples,
                stride=flags.window_stride_samples,
                magnitude_squared=flags.fft_magnitude_squared)
            # spectrogram: [channels/batch, frames, fft_feature]

            # Extract MFCC features from the spectrogram with audio_ops.mfcc:
            # 1. Input is spectrogram frames.
            # 2. Weight the spectrogram into bands using a triangular mel filterbank.
            # 3. Logarithmic scaling.
            # 4. Discrete cosine transform (DCT); return the lowest
            #    dct_coefficient_count coefficients.
            mfcc = audio_ops.mfcc(
                spectrogram=spectrogram,
                sample_rate=flags.sample_rate,
                upper_frequency_limit=flags.mel_upper_edge_hertz,
                lower_frequency_limit=flags.mel_lower_edge_hertz,
                filterbank_channel_count=flags.mel_num_bins,
                dct_coefficient_count=flags.dct_num_features)
            # mfcc: [channels/batch, frames, dct_coefficient_count]
            # remove channel dim
            self.output_ = tf.squeeze(mfcc, axis=0)
        elif flags.preprocess == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            int16_input = tf.cast(
                tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16)
            # audio_microfrontend does:
            # 1. A sliding window function over the raw audio.
            # 2. Short-time FFTs.
            # 3. Filterbank calculations.
            # 4. Noise reduction.
            # 5. PCAN auto gain control.
            # 6. Logarithmic scaling.

            # int16_input dims: [time, channels]
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=flags.sample_rate,
                window_size=flags.window_size_ms,
                window_step=flags.window_stride_ms,
                num_channels=flags.mel_num_bins,
                upper_band_limit=flags.mel_upper_edge_hertz,
                lower_band_limit=flags.mel_lower_edge_hertz,
                out_scale=1,
                out_type=tf.float32)
            # micro_frontend dims: [frames, num_channels]
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "raw", '
                             ' "mfcc", or "micro")' % (flags.preprocess))
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc', 'average', or 'micro'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running TensorFlow'
                ' directly from Python, you need to build and run through Bazel, for'
                ' example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(
            tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, data_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      data_settings: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = data_settings.desired_samples
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        # Signal resampling to generate more training data.
        # It will stretch or squeeze the input signal proportionally to:
        self.foreground_resampling_placeholder_ = tf.placeholder(tf.float32, [])

        if self.foreground_resampling_placeholder_ != 1.0:
            image = tf.expand_dims(wav_decoder.audio, 0)
            image = tf.expand_dims(image, 2)
            shape = tf.shape(wav_decoder.audio)
            image_resized = tf.image.resize(
                images=image,
                size=(tf.cast((tf.cast(shape[0], tf.float32) *
                               self.foreground_resampling_placeholder_),
                              tf.int32), 1),
                preserve_aspect_ratio=False)
            image_resized_cropped = tf.image.resize_with_crop_or_pad(
                image_resized,
                target_height=desired_samples,
                target_width=1,
            )
            image_resized_cropped = tf.squeeze(image_resized_cropped, axis=[0, 3])
            scaled_foreground = tf.multiply(image_resized_cropped,
                                            self.foreground_volume_placeholder_)
        else:
            scaled_foreground = tf.multiply(wav_decoder.audio,
                                            self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

        if data_settings.preprocess == 'raw':
            # Return raw audio.
            self.output_ = background_clamp
            tf.summary.image(
                'input_audio',
                tf.expand_dims(tf.expand_dims(background_clamp, -1), -1),
                max_outputs=1)
        else:
            # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'.
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=data_settings.window_size_samples,
                stride=data_settings.window_stride_samples,
                magnitude_squared=True)
            tf.summary.image(
                'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend
            # on how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want
            # to shrink them down to produce a smaller result. That's what this
            # section implements. One method is to use average pooling to merge
            # adjacent buckets, but a more sophisticated approach is to apply the
            # MFCC algorithm to shrink the representation.
            if data_settings.preprocess == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, data_settings.average_window_width],
                    strides=[1, data_settings.average_window_width],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
            elif data_settings.preprocess == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=data_settings.fingerprint_width)
                tf.summary.image(
                    'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
            elif data_settings.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when running'
                        ' TensorFlow directly from Python, you need to build and run'
                        ' through Bazel')
                sample_rate = data_settings.sample_rate
                window_size_ms = (data_settings.window_size_samples *
                                  1000) / sample_rate
                window_step_ms = (data_settings.window_stride_samples *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=data_settings.fingerprint_width,
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.summary.image(
                    'micro',
                    tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                    max_outputs=1)
            else:
                raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
                                 ' "average", or "micro")' %
                                 (data_settings.preprocess))
        # Merge all the summaries and write them out to /tmp/retrain_logs (by
        # default).
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        if data_settings.summaries_dir:
            self.summary_writer_ = tf.summary.FileWriter(
                data_settings.summaries_dir + '/data', tf.get_default_graph())
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.compat.v1.get_default_graph().name_scope('data'):
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        # spectrogram = audio_ops.audio_spectrogram(
        #     background_clamp,
        #     window_size=model_settings['window_size_samples'],
        #     stride=model_settings['window_stride_samples'],
        #     magnitude_squared=True)

        def periodic_hann_window(window_length, dtype):
            return 0.5 - 0.5 * tf.math.cos(
                2.0 * np.pi *
                tf.range(tf.cast(window_length, dtype=dtype), dtype=dtype) /
                tf.cast(window_length, dtype=dtype))

        signal_stft = tf.signal.stft(
            tf.transpose(background_clamp, [1, 0]),
            frame_length=model_settings['window_size_samples'],
            frame_step=model_settings['window_stride_samples'],
            window_fn=periodic_hann_window)
        signal_spectrograms = tf.abs(signal_stft)
        spectrogram = signal_spectrograms
        tf.compat.v1.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend on
        # how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want to
        # shrink them down to produce a smaller result. That's what this section
        # implements. One method is to use average pooling to merge adjacent
        # buckets, but a more sophisticated approach is to apply the MFCC
        # algorithm to shrink the representation.
        if model_settings['preprocess'] == 'average':
            self.output_ = tf.nn.pool(
                input=tf.expand_dims(spectrogram, -1),
                window_shape=[1, model_settings['average_window_width']],
                strides=[1, model_settings['average_window_width']],
                pooling_type='AVG',
                padding='SAME')
            tf.compat.v1.summary.image(
                'shrunk_spectrogram', self.output_, max_outputs=1)
        elif model_settings['preprocess'] == 'fbank':
            # We just convert the data back to int16 wav format; the actual
            # filterbank processing is performed outside of the TensorFlow
            # graph in the get_data function.
            int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
            # def compute_fbs(int16_wav_input):
            #     fbs, energy = fbank(
            #         int16_wav_input, model_settings['sample_rate'],
            #         nfilt=model_settings['fingerprint_width'],
            #         winstep=model_settings['window_stride_samples'] /
            #         model_settings['sample_rate'],
            #         winlen=model_settings['window_size_samples'] /
            #         model_settings['sample_rate'],
            #         nfft=1024,
            #         lowfreq=64)
            #     fbs = np.log(fbs)
            #     energy = np.log(energy)
            #     return np.concatenate([fbs, energy[:, None]], axis=1)
            #
            # log_fbs_with_energy = compute_fbs(int16_input)
            self.output_ = int16_input
            # tf.compat.v1.summary.image(
            #     'fbank', tf.expand_dims(self.output_, -1), max_outputs=1)
        elif model_settings['preprocess'] == 'mfcc':
            # signal_mfccs = audio_ops.mfcc(
            #     spectrogram,
            #     # tf.expand_dims(signal_spectrograms, 0),
            #     wav_decoder.sample_rate,
            #     dct_coefficient_count=model_settings['fingerprint_width'])
            # self.output_ = signal_mfccs
            # print("OLD", signal_mfccs.shape)
            num_spectrogram_bins = signal_stft.shape[-1]
            num_mel_bins = num_mfccs = model_settings['fingerprint_width']
            lower_edge_hertz = 20.0
            upper_edge_hertz = 4000.0
            log_noise_floor = 1e-12
            linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins,
                num_spectrogram_bins,
                model_settings['sample_rate'],
                # lower_edge_hertz, upper_edge_hertz
            )
            mel_spectrograms = tf.tensordot(spectrogram,
                                            linear_to_mel_weight_matrix, 1)
            mel_spectrograms.set_shape(mel_spectrograms.shape[:-1].concatenate(
                linear_to_mel_weight_matrix.shape[-1:]))
            log_mel_spectrograms = tf.math.log(mel_spectrograms + log_noise_floor)
            signal_mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                log_mel_spectrograms)[..., :num_mfccs]
            # print("NEW", signal_mfccs.shape)
            self.output_ = signal_mfccs
            tf.compat.v1.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        elif model_settings['preprocess'] == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            sample_rate = model_settings['sample_rate']
            window_size_ms = (model_settings['window_size_samples'] *
                              1000) / sample_rate
            window_step_ms = (model_settings['window_stride_samples'] *
                              1000) / sample_rate
            int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=sample_rate,
                window_size=window_size_ms,
                window_step=window_step_ms,
                num_channels=model_settings['fingerprint_width'],
                out_scale=1,
                out_type=tf.float32)
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            tf.compat.v1.summary.image(
                'micro',
                tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                max_outputs=1)
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
                             ' "average", "fbank", or "micro")' %
                             (model_settings['preprocess']))
        # Merge all the summaries and write them out to /tmp/retrain_logs (by
        # default).
        self.merged_summaries_ = tf.compat.v1.summary.merge_all(scope='data')
        if summaries_dir:
            self.summary_writer_ = tf.compat.v1.summary.FileWriter(
                summaries_dir + '/data', tf.compat.v1.get_default_graph())
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            scaled_foreground,
            self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        tf.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend on
        # how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want to
        # shrink them down to produce a smaller result. That's what this section
        # implements. One method is to use average pooling to merge adjacent
        # buckets, but a more sophisticated approach is to apply the MFCC
        # algorithm to shrink the representation.
        if model_settings['preprocess'] == 'average':
            self.output_ = tf.nn.pool(
                tf.expand_dims(spectrogram, -1),
                window_shape=[1, model_settings['average_window_width']],
                strides=[1, model_settings['average_window_width']],
                pooling_type='AVG',
                padding='SAME')
            tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
        elif model_settings['preprocess'] == 'mfcc':
            self.output_ = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            tf.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        elif model_settings['preprocess'] == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            sample_rate = model_settings['sample_rate']
            window_size_ms = (model_settings['window_size_samples'] *
                              1000) / sample_rate
            window_step_ms = (model_settings['window_stride_samples'] *
                              1000) / sample_rate
            int16_input = tf.cast(tf.multiply(background_clamp, 32768), tf.int16)
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=sample_rate,
                window_size=window_size_ms,
                window_step=window_step_ms,
                num_channels=model_settings['fingerprint_width'],
                out_scale=1,
                out_type=tf.float32)
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
            tf.summary.image(
                'micro',
                tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                max_outputs=1)
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "mfcc",'
                             ' "average", or "micro")' %
                             (model_settings['preprocess']))
        # Merge all the summaries and write them out to /tmp/retrain_logs (by
        # default).
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        if summaries_dir:
            self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                         tf.get_default_graph())