def prepare_input(filename):
    from tensorflow.contrib.framework.python.ops import audio_ops
    from tensorflow.python.ops import io_ops
    with tf.Session(graph=tf.Graph()) as sess:
        wav_filename_placeholder = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(wav_filename_placeholder)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                                          desired_channels=1,
                                          desired_samples=16000,
                                          name='decoded_sample_data')
        spectrum = audio_ops.audio_spectrogram(input=wav_decoder[0],
                                               window_size=640,
                                               stride=320,
                                               magnitude_squared=True,
                                               name='AudioSpectrogram')
        final = audio_ops.mfcc(spectrogram=spectrum,
                               sample_rate=wav_decoder[1],
                               upper_frequency_limit=4000.0,
                               lower_frequency_limit=20.0,
                               filterbank_channel_count=40,
                               dct_coefficient_count=10,
                               name='Mfcc')
        data = sess.run(final, feed_dict={wav_filename_placeholder: filename})
        print(f'Data shape: {data.shape}')
        return data
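# Hypothetical usage of prepare_input above; the wav path is an assumption and
# not part of the original snippet.
features = prepare_input('yes/0a7c2a8d_nohash_0.wav')  # hypothetical path
# With 16000 desired samples, window_size=640, stride=320 and 10 DCT
# coefficients, the printed shape should be roughly (1, 49, 10), since the
# spectrogram op emits 1 + (16000 - 640) // 320 = 49 frames.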
def _make_spect(file_name):
    audio_binary = tf.read_file(file_name)
    waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)
    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=1024,
                                              stride=64)
    # Tensorflow spectrograms have time along the y axis and frequencies along
    # the x axis, so flip them.
    spectrogram = tf.image.flip_left_right(spectrogram)
    spectrogram = tf.transpose(spectrogram, [0, 2, 1])
    spectrogram = tf.expand_dims(spectrogram, -1)  # add a color channel
    spectrogram = tf.image.resize_bilinear(
        spectrogram, (spectrogram.shape[1], spectrogram.shape[1]))
    spectrogram = tf.squeeze(spectrogram, 0)
    spectrogram = spectrogram - mean_value
    one_hot = []
    for c in _classes:
        match = tf.strings.regex_full_match(file_name, ".*" + c + ".*",
                                            name="find_" + c)
        one_hot.append(match)
    one_hot = tf.cast(tf.stack(one_hot), tf.int32)
    return {"spectrogram": spectrogram, "label": one_hot}
def SpecWithARGS(self, inp, out):
    wav_file = tf.placeholder(tf.string)
    audio_binary = tf.read_file(wav_file)
    waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)
    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=1024,
                                              stride=64)
    brightness = tf.placeholder(tf.float32, shape=[])
    mul = tf.multiply(spectrogram, brightness)
    min_const = tf.constant(255.)
    minimum = tf.minimum(mul, min_const)
    expand_dims = tf.expand_dims(minimum, -1)
    resize = tf.image.resize_bilinear(expand_dims, [512, 512])
    squeeze = tf.squeeze(resize, 0)
    flip = tf.image.flip_left_right(squeeze)
    transpose = tf.image.transpose_image(flip)
    grayscale = tf.image.grayscale_to_rgb(transpose)
    cast = tf.cast(grayscale, tf.uint8)
    png = tf.image.encode_png(cast)
    with tf.Session() as sess:
        # Run the computation graph and save the png encoded image to a file.
        image = sess.run(png, feed_dict={
            wav_file: os.path.join(self.curworkdir, str(inp)),
            brightness: 100
        })
        with open(os.path.join(self.curworkdir, str(out)), 'wb') as f:
            f.write(image)
def wav_to_features(filenames_dataset, hparams, feature_count):
    dataset = filenames_dataset.map(lambda filename: io_ops.read_file(filename))
    dataset = dataset.map(
        lambda wav_loader: contrib_audio.decode_wav(wav_loader,
                                                    desired_channels=1))
    dataset = dataset.map(
        lambda wav_decoder: (contrib_audio.audio_spectrogram(
            wav_decoder.audio,
            window_size=int(hparams.sample_rate * hparams.window_size_ms / 1000),
            stride=int(hparams.sample_rate * hparams.window_stride_ms / 1000),
            magnitude_squared=True), wav_decoder.sample_rate))
    dataset = dataset.map(
        lambda spectrogram, sample_rate: contrib_audio.mfcc(
            spectrogram, sample_rate, dct_coefficient_count=feature_count))
    dataset = dataset.map(lambda inputs: (inputs,
                                          tf.nn.moments(inputs, axes=[1])))
    dataset = dataset.map(
        lambda inputs, moments: (
            tf.divide(tf.subtract(inputs, moments[0]), moments[1]),
            tf.shape(inputs)[1]))
    dataset = dataset.map(lambda inputs, seq_len: (inputs[0], seq_len))
    return dataset
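# A minimal usage sketch for wav_to_features above. The file names, the
# _HParams stand-in and the feature count are assumptions for illustration;
# the original snippet does not show how its inputs are constructed.
class _HParams(object):  # stand-in for the real hparams object
    sample_rate = 16000
    window_size_ms = 25
    window_stride_ms = 10

filenames = tf.data.Dataset.from_tensor_slices(
    ['clip_000.wav', 'clip_001.wav'])  # hypothetical paths
features_dataset = wav_to_features(filenames, _HParams(), feature_count=13)
mfccs, seq_len = features_dataset.make_one_shot_iterator().get_next()
# Evaluate mfccs and seq_len inside a tf.Session to materialize the features.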
def _build_processing_graph(self):
    """Builds a TensorFlow graph that turns a WAV file into an MFCC fingerprint.

    Creates a graph that loads a WAVE file, decodes it, clamps the sample
    values, calculates a spectrogram, and then builds an MFCC fingerprint from
    that.

    This must be called with an active TensorFlow session running, and it
    creates one placeholder input and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - mfcc_: Output 2D fingerprint of processed audio.
    """
    with tf.name_scope('audio_processing'):
        desired_samples = self._model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=self._model_settings['window_size_samples'],
            stride=self._model_settings['window_stride_samples'],
            magnitude_squared=True)
        self.mfcc_ = contrib_audio.mfcc(
            spectrogram,
            wav_decoder.sample_rate,
            dct_coefficient_count=self._model_settings['dct_coefficient_count'])
def get_test_data(self, how_many, offset, model_settings, sess, features='mfcc'):
    candidates = self.data_index
    if how_many == -1:
        sample_count = len(candidates)
    else:
        sample_count = max(0, min(how_many, len(candidates) - offset))
    desired_samples = model_settings['desired_samples']
    data = np.zeros((sample_count, model_settings['fingerprint_size']))
    wav_filename_placeholder = tf.placeholder(tf.string, [],
                                              name='wav_file_names')
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    for i in range(offset, offset + sample_count):
        input_dict = {wav_filename_placeholder: candidates[i]}
        if features == "spectrogram":
            data[i - offset, :] = sess.run(spectrogram,
                                           feed_dict=input_dict).flatten()
        elif features == "raw":
            data[i - offset, :] = sess.run(wav_decoder.audio,
                                           feed_dict=input_dict).flatten()
        else:
            data[i - offset, :] = sess.run(mfcc, feed_dict=input_dict).flatten()
    return data
def mfcc_tensorflow(wavfile, _sr, frame_size, frame_shift, order=13):
    sess = tf.InteractiveSession()
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    wav_data = wav_decoder.audio
    wav_sample_rate = sess.run(
        wav_decoder, feed_dict={wav_filename_placeholder: wavfile}).sample_rate
    check_sample_rate(wavfile, _sr, wav_sample_rate)
    spectrogram = contrib_audio.audio_spectrogram(wav_data,
                                                  window_size=frame_size,
                                                  stride=frame_shift,
                                                  magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(spectrogram,
                               wav_decoder.sample_rate,
                               dct_coefficient_count=order)
    mfcc_data = sess.run(mfcc_, feed_dict={wav_filename_placeholder: wavfile})
    return mfcc_data
def load_mfcc_file(sess, filename):
    filename_ph = tf.placeholder(tf.string)
    loader = io_ops.read_file(filename_ph)
    decoder = contrib_audio.decode_wav(loader,
                                       desired_channels=1,
                                       desired_samples=16000)
    spectrogram = contrib_audio.audio_spectrogram(
        decoder.audio, window_size=480, stride=160, magnitude_squared=True)
    mfcc = contrib_audio.mfcc(
        spectrogram, decoder.sample_rate, dct_coefficient_count=40)
    return sess.run(mfcc, feed_dict={filename_ph: filename})
def log_spec_tensorflow(wavfile, _sr, frame_size, frame_shift):
    sess = tf.InteractiveSession()
    wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
    wav_data = wav_decoder.audio
    wav_sample_rate = sess.run(
        wav_decoder, feed_dict={wav_filename_placeholder: wavfile}).sample_rate
    check_sample_rate(wavfile, _sr, wav_sample_rate)
    spectrogram = contrib_audio.audio_spectrogram(wav_data,
                                                  window_size=frame_size,
                                                  stride=frame_shift,
                                                  magnitude_squared=True)
    log_spectrogram = tf.log(spectrogram[0] + log_offset)
    log_spec_data = sess.run(log_spectrogram,
                             feed_dict={wav_filename_placeholder: wavfile})
    return np.transpose(log_spec_data)
def audio_to_spectrogram(audio_contents,
                         width,
                         height,
                         channels=1,
                         window_size=1024,
                         stride=64,
                         brightness=100.):
    """Decode and build a spectrogram using a wav string tensor.

    Args:
      audio_contents: String tensor of the wav audio contents.
      width: Spectrogram width.
      height: Spectrogram height.
      channels: Audio channel count.
      window_size: Size of the spectrogram window.
      stride: Size of the spectrogram stride.
      brightness: Brightness of the spectrogram.

    Returns:
      0-D string Tensor with the image contents.
    """
    # Decode the wav mono into a 2D tensor with time in dimension 0
    # and channel along dimension 1.
    waveform = audio_ops.decode_wav(audio_contents, desired_channels=channels)

    # Compute the spectrogram.
    # FIXME: Seems like this is deprecated in tensorflow 2.0 and
    # the operation only works on CPU. Change this to tf.signal.stft
    # and friends to take advantage of GPU kernels.
    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=window_size,
                                              stride=stride)

    # Adjust brightness.
    brightness = tf.constant(brightness)

    # Normalize pixels.
    mul = tf.multiply(spectrogram, brightness)
    min_const = tf.constant(255.)
    minimum = tf.minimum(mul, min_const)

    # Expand dims so we get the proper shape.
    expand_dims = tf.expand_dims(minimum, -1)

    # Resize the spectrogram to the input size of the model.
    resize = tf.image.resize(expand_dims, [width, height])

    # Remove the leading batch dimension.
    squeeze = tf.squeeze(resize, 0)

    # Tensorflow spectrograms have time along the y axis and frequencies along
    # the x axis, so we fix that.
    flip_left_right = tf.image.flip_left_right(squeeze)
    transposed = tf.image.transpose(flip_left_right)

    # Cast to uint8 and encode as png.
    cast = tf.cast(transposed, tf.uint8)

    # Encode tensor as a png image.
    return tf.image.encode_png(cast)
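# The FIXME in audio_to_spectrogram above suggests moving to tf.signal. A rough
# sketch of that replacement, assuming mono audio shaped [samples, 1]; the
# windowing and scaling differ slightly from audio_ops.audio_spectrogram, so
# treat this as an approximation rather than a drop-in substitute.
def audio_to_stft_spectrogram(waveform_audio, window_size=1024, stride=64):
    signal = tf.squeeze(waveform_audio, axis=-1)  # [samples]
    stft = tf.signal.stft(signal,
                          frame_length=window_size,
                          frame_step=stride,
                          fft_length=window_size)
    magnitude = tf.abs(stft)             # [frames, window_size // 2 + 1]
    return tf.expand_dims(magnitude, 0)  # [1, frames, bins], like audio_spectrogram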
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        # wav_loader, desired_channels=1, desired_samples=desired_samples)
        wav_loader, desired_channels=1, desired_samples=16000)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    ###################### M F C C #################################
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count,
    )
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(reshaped_input, model_settings,
                                 model_architecture, is_training=False,
                                 runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - mfcc_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate,
                               dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    return mfccs, tf.shape(mfccs)[0]
def _load_sample(wav_filename, model_settings):
    """Loads a single WAV file and turns it into the model's input fingerprint.

    Decodes the audio and, depending on model_settings['input_format'], either
    reshapes the raw samples directly or computes a spectrogram and MFCC
    fingerprint before reshaping.

    Args:
      wav_filename: Path of the WAV file to load.
      model_settings: Information about the current model being trained.

    Returns:
      A 2D tensor containing the reshaped fingerprint input.
    """
    wav_loader = io_ops.read_file(wav_filename)
    decoded_sample_data = contrib_audio.decode_wav(
        wav_loader,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    if model_settings['input_format'] == 'raw':
        print(decoded_sample_data.audio.shape)
        reshaped_input = tf.reshape(decoded_sample_data.audio,
                                    [-1, model_settings['desired_samples']])
        print(reshaped_input.shape)
    else:
        spectrogram = contrib_audio.audio_spectrogram(
            decoded_sample_data.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            decoded_sample_data.sample_rate,
            lower_frequency_limit=model_settings['lower_frequency_limit'],
            upper_frequency_limit=model_settings['upper_frequency_limit'],
            filterbank_channel_count=model_settings['filterbank_channel_count'],
            dct_coefficient_count=model_settings['dct_coefficient_count'])
        fingerprint_frequency_size = model_settings['dct_coefficient_count']
        fingerprint_time_size = model_settings['spectrogram_length']
        reshaped_input = tf.reshape(
            fingerprint_input,
            [-1, fingerprint_time_size * fingerprint_frequency_size])
    return reshaped_input
def _single_spectrogram(self, audio, window_size_samples,
                        window_stride_samples, magnitude_squared):
    # Only accepts a single batch.
    audio = tf.squeeze(audio, 0)
    spectrogram = contrib_audio.audio_spectrogram(
        audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=magnitude_squared)
    return spectrogram
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           dct_coefficient_count, model_architecture):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size * fingerprint_frequency_size])

    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
    window_size_samples = sample_rate * window_size_ms // 1000
    window_stride_samples = sample_rate * window_stride_ms // 1000
    spectrogram = contrib_audio.audio_spectrogram(
        audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    mfcc = contrib_audio.mfcc(spectrogram, sample_rate,
                              dct_coefficient_count=num_coefficients)
    return mfcc
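# Worked example of the window arithmetic in AudioToMfcc above (the values are
# illustrative): at 16 kHz, a 30 ms window and a 10 ms stride become
#   window_size_samples = 16000 * 30 // 1000 = 480
#   window_stride_samples = 16000 * 10 // 1000 = 160
# which matches the 480/160 settings used by other snippets in this section.
example_mfcc = AudioToMfcc(sample_rate=16000,
                           audio=tf.zeros([16000, 1]),  # one second of silence
                           window_size_ms=30,
                           window_stride_ms=10,
                           num_coefficients=40)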
def __init__(self, sample_rate: int, dct_coef_count: int = -1):
    '''Assumes the audio has a single channel.'''
    assert sample_rate == 16_000
    if dct_coef_count == -1:
        dct_coef_count = DataGraphMFCC.max_mfcc_num
    else:
        assert dct_coef_count <= DataGraphMFCC.max_mfcc_num
    self._sample_rate = sample_rate
    samples_per_second = sample_rate / 1000
    window = int(DataGraphMFCC.window_duration * samples_per_second)
    stride = int(DataGraphMFCC.stride_duration * samples_per_second)

    self._graph = tf.Graph()
    with self._graph.as_default():
        self._in_wav_file = tf.placeholder(tf.string, [], name='wav_filename')
        self._in_frame_num = tf.placeholder(tf.int32, [])
        wav_loader = io_ops.read_file(self._in_wav_file)
        wav_decoder = contrib_audio.decode_wav(wav_loader, desired_channels=1)
        self._out_audio = tf.squeeze(wav_decoder.audio)
        self._out_sample_rate = wav_decoder.sample_rate

        self._in_audio = tf.placeholder(tf.float32, [None])
        in_audio = tf.expand_dims(self._in_audio, -1)
        audio_clamp = tf.clip_by_value(in_audio, -1.0, 1.0)
        spectrogram = contrib_audio.audio_spectrogram(
            audio_clamp,
            window_size=window,
            stride=stride,
            magnitude_squared=True)
        self._out_spectrogram = spectrogram
        feat_ts = contrib_audio.mfcc(
            spectrogram=spectrogram,
            sample_rate=sample_rate,
            dct_coefficient_count=dct_coef_count,
        )
        self._out_mfcc = feat_ts[0]
        self._out_real_mfcc_len = tf.shape(self._out_mfcc)[0]
        diff = tf.maximum(0, self._in_frame_num - self._out_real_mfcc_len)
        self._out_expanded_mfcc = tf.pad(
            self._out_mfcc,
            [[0, diff], [0, 0]],
        )[:self._in_frame_num]

    self._sess = tf.Session(graph=self._graph)
    print("DataGraphMFCC graph is created!")
def build_graph(self):
    """Graph to extract the MFCC fingerprint for a given wav file.

    Here we add the necessary input & output tensors to decode the wav,
    serialize the MFCC fingerprint, restore from checkpoint, etc.

    Returns:
      input_wav_filename: A tensor containing the wav filename as the input layer.
      mfcc_fingerprint: The MFCC fingerprint tensor, that will be materialized later.
    """
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        wav_loader, desired_channels=1, desired_samples=self.desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [self.desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [self.desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=self.window_size_samples,
        stride=self.window_stride_samples,
        magnitude_squared=True)
    self.mfcc_fingerprint_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=self.opt.dct_coefficient_count)
def wav_to_spectrogram(path):
    wav_file = tf.placeholder(tf.string)
    audio_binary = tf.read_file(wav_file)
    # Decode the wav mono into a 2D tensor with time in dimension 0
    # and channel along dimension 1.
    waveform = audio_ops.decode_wav(audio_binary, desired_channels=1)
    # Compute the spectrogram.
    spectrogram = audio_ops.audio_spectrogram(waveform.audio,
                                              window_size=1024,
                                              stride=64)
    # Custom brightness.
    brightness = tf.placeholder(tf.float32, shape=[])
    mul = tf.multiply(spectrogram, brightness)
    # Normalize pixels.
    min_const = tf.constant(255.)
    minimum = tf.minimum(mul, min_const)
    # Expand dims so we get the proper shape.
    expand_dims = tf.expand_dims(minimum, -1)
    # Remove the leading batch dimension.
    squeeze = tf.squeeze(expand_dims, 0)
    # Tensorflow spectrogram has time along the y axis and frequencies along
    # the x axis, so flip and transpose.
    flip = tf.image.flip_left_right(squeeze)
    transpose = tf.image.transpose_image(flip)
    # Convert the image to 3 channels.
    grayscale = tf.image.grayscale_to_rgb(transpose)
    # Cast to uint8 and encode as png.
    cast = tf.cast(grayscale, tf.uint8)
    png = tf.image.encode_png(cast)
    with tf.Session() as sess:
        # Run the computation graph and save the png encoded image to a file.
        image = sess.run(png, feed_dict={wav_file: path, brightness: 100})
        new_path = path[:-len('.wav')] if path.endswith('.wav') else path
        with open(new_path + '.png', 'wb') as f:
            f.write(image)
    return
def wav_to_mfcc(self, raw_data):
    spectrogram = audio_ops.audio_spectrogram(
        raw_data,
        window_size=self.parameters['spectogram_window_size'],
        stride=self.parameters['spectogram_stride'],
        magnitude_squared=True)
    mfcc = audio_ops.mfcc(
        spectrogram,
        self.parameters['audio_sample_rate'],
        dct_coefficient_count=self.parameters['dtc_coefficient_count'])
    mfcc = tf.expand_dims(mfcc, -1)
    self.input_dimensions = (
        self.input_dimensions[0] / self.parameters['spectogram_stride'] - 2,
        self.parameters['dtc_coefficient_count'],
        1)
    mfcc = tf.squeeze(mfcc, 0)
    return mfcc
def encode_data(audio_data, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        audio_data, window_size_samples, window_stride_samples)
    print(spectrogram.shape)
    mfcc = contrib_audio.mfcc(
        spectrogram,
        sample_rate=sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    return mfcc
def create_inference_graph_and_load_variables(sess, FLAGS):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes into the trained model graph.
    """
    model_settings = data_utils.prepare_settings(
        FLAGS.num_classes, FLAGS.sample_rate, FLAGS.clip_duration_ms,
        FLAGS.window_size_ms, FLAGS.window_stride_ms,
        FLAGS.dct_coefficient_count)
    runtime_settings = {'clip_stride_ms': FLAGS.clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    fingerprint_input = contrib_audio.mfcc(
        spectrogram,
        decoded_sample_data.sample_rate,
        dct_coefficient_count=FLAGS.dct_coefficient_count)
    fingerprint_frequency_size = model_settings['dct_coefficient_count']
    fingerprint_time_size = model_settings['spectrogram_length']
    reshaped_input = tf.reshape(
        fingerprint_input,
        [-1, fingerprint_time_size, fingerprint_frequency_size, 1],
        name="model_input")

    # Init model and load variables.
    model = models.create_model(FLAGS)
    fw = framework.Framework(sess, model, None, FLAGS,
                             input_tensor=reshaped_input)

    # Create an output to use for inference.
    logits = tf.nn.softmax(model.get_raw_scores(), name='labels_softmax')
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph that prepares training inputs.

    Creates a graph that loads a wave file, decodes it, scales the volume,
    shifts it in time, calculates a spectrogram and builds an MFCC fingerprint
    from that.

    Args:
      model_settings: Info about the model being trained.
    """
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        np.float32, [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(np.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def prepare_processing_graph(self, model_settings):
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               self.time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(
        tf.float32, [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    print('window_size_samples', model_settings['window_size_samples'])
    print('window_stride_samples', model_settings['window_stride_samples'])
    print('background_clamp', background_clamp)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    print('spectrogram', spectrogram)
    print('dct_coefficient_count', model_settings['dct_coefficient_count'])
    print('wav_decoder.sample_rate', wav_decoder.sample_rate)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    print('self.mfcc_', self.mfcc_)
def get_mfcc_graph(model_settings):
    g = tf.Graph()
    with g.as_default():
        input_file_placeholder = tf.compat.v1.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(input_file_placeholder)
        wav_decoder = audio_ops.decode_wav(
            wav_loader,
            desired_channels=1,
            desired_samples=model_settings['desired_samples'])
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrograms_power = audio_ops.audio_spectrogram(
            wav_decoder.audio,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)

        USE_POWER = True
        if USE_POWER:
            # Warp the linear scale spectrograms into the mel-scale.
            num_spectrogram_bins = spectrograms_power.shape[-1].value
            lower_edge_hertz, upper_edge_hertz, num_mel_bins = 20.0, 4000.0, 40
            linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
                num_mel_bins, num_spectrogram_bins, 16000.0, lower_edge_hertz,
                upper_edge_hertz)
            mel_spectrograms = tf.tensordot(spectrograms_power,
                                            linear_to_mel_weight_matrix, 1)
            mel_spectrograms.set_shape(
                spectrograms_power.shape[:-1].concatenate(
                    linear_to_mel_weight_matrix.shape[-1:]))
            # Compute a stabilized log to get log-magnitude mel-scale spectrograms.
            log_mel_spectrograms = tf.math.log(mel_spectrograms + 1e-6)
            # Compute MFCCs from log_mel_spectrograms and take the first NDCT.
            mfccs = tf.signal.mfccs_from_log_mel_spectrograms(
                log_mel_spectrograms)[
                    ..., :model_settings['dct_coefficient_count']]
            # output = tf.expand_dims(mfccs, axis=0)
            output = mfccs
        else:
            output = audio_ops.mfcc(
                spectrograms_power,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['dct_coefficient_count'])
    return g, input_file_placeholder, output, wav_decoder.audio
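# A minimal usage sketch for get_mfcc_graph above; the model_settings values
# and the wav path are assumptions chosen to match the 16 kHz defaults that
# appear elsewhere in this section.
example_settings = {
    'desired_samples': 16000,
    'window_size_samples': 480,
    'window_stride_samples': 160,
    'dct_coefficient_count': 40,
}
graph, filename_ph, mfcc_out, audio_out = get_mfcc_graph(example_settings)
with tf.compat.v1.Session(graph=graph) as sess:
    mfcc_values = sess.run(
        mfcc_out, feed_dict={filename_ph: 'example.wav'})  # hypothetical file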
def save_my_test_file(self, model_settings):
    desired_samples = model_settings['desired_samples']
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(
        # wav_loader, desired_channels=1, desired_samples=desired_samples)
        wav_loader, desired_channels=1, desired_samples=16000)
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    self.foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    self.time_shift_padding_placeholder_ = tf.placeholder(tf.int32, [2, 2])
    self.time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(
        scaled_foreground,
        self.time_shift_padding_placeholder_,
        mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 self.time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    self.background_data_placeholder_ = tf.placeholder(tf.float32,
                                                       [desired_samples, 1])
    self.background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(self.background_data_placeholder_,
                                 self.background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)

    self.filename_ = tf.placeholder(tf.string)
    # save_wav_file(self.filename_, np.array(background_clamp), 16000)
    wav_encoder = contrib_audio.encode_wav(background_clamp, 16000)
    self.wav_saver = io_ops.write_file(self.filename_, wav_encoder)
    # with tf.Session(graph=tf.Graph()) as sess:
    #     sess.run(wav_saver)
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    self.test_mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
def __init__(self, desired_samples=16000, window_size_samples=480,
             window_stride_samples=160):
    self.wav_filename_placeholder = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder)
    # decode_wav already pads/crops to desired_samples.
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    self.mfcc = contrib_audio.mfcc(spectrogram,
                                   wav_decoder.sample_rate,
                                   dct_coefficient_count=40)
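# Hypothetical usage of the extractor above; only __init__ is shown in the
# original snippet, so the class name MfccExtractor and the wav path are
# assumptions for illustration.
extractor = MfccExtractor()  # assumed wrapper class holding the __init__ above
with tf.Session() as sess:
    mfcc_values = sess.run(
        extractor.mfcc,
        feed_dict={extractor.wav_filename_placeholder: 'example.wav'})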
def build_data_generator():
    # Build data generator pipeline.
    desired_samples = model_settings['desired_samples']
    wav_filename_placeholder_ = tf.placeholder(
        tf.string, [], name="wav_filename_placeholder_")
    wav_loader = io_ops.read_file(wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    # Allow the audio sample's volume to be adjusted.
    foreground_volume_placeholder_ = tf.placeholder(
        tf.float32, [], name="foreground_volume_placeholder_")
    scaled_foreground = tf.multiply(wav_decoder.audio,
                                    foreground_volume_placeholder_)
    # Shift the sample's start position, and pad any gaps with zeros.
    time_shift_padding_placeholder_ = tf.placeholder(
        tf.int32, [2, 2], name="time_shift_padding_placeholder_")
    time_shift_offset_placeholder_ = tf.placeholder(tf.int32, [2])
    padded_foreground = tf.pad(scaled_foreground,
                               time_shift_padding_placeholder_,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground,
                                 time_shift_offset_placeholder_,
                                 [desired_samples, -1])
    # Mix in background noise.
    background_data_placeholder_ = tf.placeholder(tf.float32,
                                                  [desired_samples, 1])
    background_volume_placeholder_ = tf.placeholder(tf.float32, [])
    background_mul = tf.multiply(background_data_placeholder_,
                                 background_volume_placeholder_)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    spectrogram = contrib_audio.audio_spectrogram(
        background_clamp,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=model_settings['dct_coefficient_count'])
    return (wav_filename_placeholder_, foreground_volume_placeholder_,
            time_shift_padding_placeholder_, time_shift_offset_placeholder_,
            background_data_placeholder_, background_volume_placeholder_,
            mfcc_)
def prepare_processing_graph(self):
    self.wav_filename_placeholder_ = tf.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
    wav_decoder = contrib_audio.decode_wav(wav_loader,
                                           desired_channels=1,
                                           desired_samples=desired_samples)
    spectrogram = contrib_audio.audio_spectrogram(
        wav_decoder.audio,
        window_size=window_size_samples,
        stride=window_stride_samples,
        magnitude_squared=True)
    print('spectrogram', spectrogram)
    print('dct_coefficient_count', dct_coefficient_count)
    print('wav_decoder.sample_rate', wav_decoder.sample_rate)
    self.mfcc_ = contrib_audio.mfcc(
        spectrogram,
        wav_decoder.sample_rate,
        dct_coefficient_count=dct_coefficient_count)
    print('self.mfcc_', self.mfcc_)
def _spectrogram_function(features, labels):
    # Decode the wav file.
    audio_binary = tf.read_file(features)
    wav = audio_ops.decode_wav(audio_binary, desired_channels=1)
    # Create the spectrogram.
    spectrogram = audio_ops.audio_spectrogram(
        wav.audio,
        window_size=window_size,
        stride=stride,
        magnitude_squared=True)
    spectrogram = tf.log(tf.abs(spectrogram) + 0.01)
    spectrogram = tf.transpose(spectrogram, perm=[1, 2, 0])
    # Transform the class_id into a one-hot encoded vector.
    response = tf.one_hot(labels, 30)
    return [spectrogram, response]
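# Sketch of how _spectrogram_function above might be wired into tf.data; the
# file paths and label ids are hypothetical, and window_size/stride are assumed
# to be defined in the enclosing scope, as in the original snippet.
file_paths = tf.constant(['yes/0a.wav', 'no/0b.wav'])  # hypothetical paths
class_ids = tf.constant([0, 1])                        # hypothetical label ids
dataset = tf.data.Dataset.from_tensor_slices((file_paths, class_ids))
dataset = dataset.map(_spectrogram_function)
dataset = dataset.batch(32)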
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc', 'average', or 'micro'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running'
                ' TensorFlow directly from Python, you need to build and run'
                ' through Bazel, for example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] *
                          1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] *
                          1000) / sample_rate
        int16_input = tf.cast(
            tf.multiply(decoded_sample_data.audio, 32767), tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc' or 'average'.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}

    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = contrib_audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = contrib_audio.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)

    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = contrib_audio.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc" or'
                        ' "average")' % (preprocess))

    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])

    logits = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)

    # Create an output to use for inference.
    tf.nn.softmax(logits, name='labels_softmax')
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = contrib_audio.decode_wav(
            wav_loader, desired_channels=1, desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            scaled_foreground,
            self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = contrib_audio.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        tf.summary.image(
            'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend
        # on how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want
        # to shrink them down to produce a smaller result. That's what this
        # section implements. One method is to use average pooling to merge
        # adjacent buckets, but a more sophisticated approach is to apply the
        # MFCC algorithm to shrink the representation.
        if model_settings['preprocess'] == 'average':
            self.output_ = tf.nn.pool(
                tf.expand_dims(spectrogram, -1),
                window_shape=[1, model_settings['average_window_width']],
                strides=[1, model_settings['average_window_width']],
                pooling_type='AVG',
                padding='SAME')
            tf.summary.image('shrunk_spectrogram', self.output_, max_outputs=1)
        elif model_settings['preprocess'] == 'mfcc':
            self.output_ = contrib_audio.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
            tf.summary.image(
                'mfcc', tf.expand_dims(self.output_, -1), max_outputs=1)
        else:
            raise ValueError('Unknown preprocess mode "%s" (should be "mfcc" or'
                             ' "average")' % (model_settings['preprocess']))

        # Merge all the summaries and write them out to /tmp/retrain_logs (by
        # default).
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        self.summary_writer_ = tf.summary.FileWriter(summaries_dir + '/data',
                                                     tf.get_default_graph())