def get_features(input_data, model_settings):
    # NOTE: `input_data` was a free variable in the original; it is made an
    # explicit argument here so the function is self-contained.
    if model_settings['preprocess'] == 'micro':
        window_size_ms = (model_settings['window_size_samples'] * 1000) / model_settings['sample_rate']
        window_step_ms = (model_settings['window_stride_samples'] * 1000) / model_settings['sample_rate']
        int16_input = tf.cast(tf.multiply(input_data, 32768), tf.int16)
        # print(int16_input.shape)
        # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/experimental/microfrontend/python/ops/audio_microfrontend_op.py
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=model_settings['sample_rate'],
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        output = tf.multiply(micro_frontend, (10.0 / 256.0))
        return output
    elif model_settings['preprocess'] == 'mfcc':
        # https://www.tensorflow.org/api_docs/python/tf/raw_ops/AudioSpectrogram
        spectrogram = audio_ops.audio_spectrogram(
            input_data,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        output = audio_ops.mfcc(
            spectrogram,
            model_settings['sample_rate'],
            dct_coefficient_count=model_settings['fingerprint_width'])
        return output[0, :, :]  # just return channel 0 as a 2D tensor
    elif model_settings['preprocess'] == 'average':
        spectrogram = audio_ops.audio_spectrogram(
            input_data,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        output = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
        return output[0, :, :, 0]  # just return channel 0 as a 2D tensor
    else:
        raise ValueError(f'Unknown preprocess mode: {model_settings["preprocess"]}')
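# A hypothetical model_settings dict for get_features above; the key names
# follow the function, the values are illustrative (1 s of 16 kHz audio,
# 30 ms windows with a 20 ms stride).
example_model_settings = {
    'preprocess': 'mfcc',          # or 'average' / 'micro'
    'sample_rate': 16000,
    'window_size_samples': 480,
    'window_stride_samples': 320,
    'fingerprint_width': 40,       # MFCC / filterbank feature count
    'average_window_width': 6,     # only used by the 'average' branch
}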
def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride,
                   num_mfcc):
    """Calculates MFCC (Mel Frequency Cepstral Coefficient) features for a given audio signal.

    Args:
        audio_signal: Raw audio signal in range [-1, 1]
        audio_sample_rate: Sample rate of the signal
        window_size: Window size in samples for calculating the spectrogram
        window_stride: Window stride in samples
        num_mfcc: Number of MFCC features to return

    Returns:
        Calculated MFCC features
    """
    spectrogram = audio_ops.audio_spectrogram(input=audio_signal,
                                              window_size=window_size,
                                              stride=window_stride,
                                              magnitude_squared=True)
    mfcc_features = audio_ops.mfcc(spectrogram,
                                   audio_sample_rate,
                                   dct_coefficient_count=num_mfcc)
    # Note: the audio API changed after TF 2.3.
    # TODO: add another implementation (see the tf.signal sketch below).
    return mfcc_features
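# For the TODO above, a minimal sketch of an equivalent extractor built on the
# public tf.signal API (TF 2.x). The frame/mel parameters are illustrative
# defaults, not values taken from this file, and the result is close to but
# not bit-identical with the audio_ops.mfcc contrib op. Note that tf.signal
# expects audio shaped [..., samples] rather than the [samples, channels]
# layout used by audio_spectrogram.
import tensorflow as tf

def calculate_mfcc_tf_signal(audio_signal, sample_rate=16000,
                             frame_length=480, frame_step=160,
                             num_mfcc=13, num_mel_bins=40,
                             lower_hz=20.0, upper_hz=7600.0):
    # Short-time Fourier transform over the waveform.
    stfts = tf.signal.stft(audio_signal, frame_length=frame_length,
                           frame_step=frame_step)
    spectrograms = tf.abs(stfts)
    # Map linear-frequency bins onto a triangular mel filterbank.
    num_spectrogram_bins = stfts.shape[-1]
    mel_matrix = tf.signal.linear_to_mel_weight_matrix(
        num_mel_bins, num_spectrogram_bins, sample_rate, lower_hz, upper_hz)
    mel_spectrograms = tf.tensordot(spectrograms, mel_matrix, 1)
    # Log scaling, then DCT; keep the lowest num_mfcc coefficients.
    log_mel = tf.math.log(mel_spectrograms + 1e-6)
    return tf.signal.mfccs_from_log_mel_spectrograms(log_mel)[..., :num_mfcc]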
def run_mfcc(input_width=40, window_size_samples=480, window_stride_samples=320,
             sample_rate=16000):
    """Builds the graph ops to run MFCC extraction on a .wav file.

    Returns:
        A (mfcc, wav_filename_placeholder) tuple; feed a file path into the
        placeholder and evaluate the mfcc tensor in a session to get the
        features. With the defaults (one second of 16 kHz audio, 40
        coefficients over 49 frames) this flattens to 1960 values.
    """
    wav_filename_placeholder = tf.compat.v1.placeholder(tf.string, [])
    wav_loader = io_ops.read_file(wav_filename_placeholder)
    wav_decoder = tf.audio.decode_wav(wav_loader,
                                      desired_channels=1,
                                      desired_samples=sample_rate)
    # background_clamp = tf.clip_by_value(wav_decoder.audio, -1.0, 1.0)
    spectrogram = audio_ops.audio_spectrogram(wav_decoder.audio,
                                              window_size=window_size_samples,
                                              stride=window_stride_samples,
                                              magnitude_squared=True)
    mfcc = audio_ops.mfcc(spectrogram,
                          wav_decoder.sample_rate,
                          dct_coefficient_count=input_width)
    return mfcc, wav_filename_placeholder
def _mfcc_op(self, inputs):
    # MFCC implementation based on the TF custom op (supported by TFLite).
    # It reduces model size in comparison to _mfcc_tf.
    if (self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE or
            self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE):
        outputs = self.data_frame(inputs)
        # In streaming mode there is only one frame for FFT calculation,
        # so dims will be [batch=1, time=1, frame], but audio_spectrogram
        # requires 2D input data, so we remove the time dim.
        outputs = tf.squeeze(outputs, axis=1)
    else:
        outputs = inputs
        # outputs has dims [batch, time], but audio_spectrogram expects
        # [time, channels/batch], so transpose it.
        outputs = tf.transpose(outputs, [1, 0])
    # outputs: [time, channels/batch]
    outputs = audio_ops.audio_spectrogram(
        outputs,
        window_size=self.frame_size,
        stride=self.frame_step,
        magnitude_squared=self.params['fft_magnitude_squared'])
    # outputs: [channels/batch, frames, fft_feature]
    outputs = audio_ops.mfcc(
        outputs,
        self.params['sample_rate'],
        upper_frequency_limit=self.params['mel_upper_edge_hertz'],
        lower_frequency_limit=self.params['mel_lower_edge_hertz'],
        filterbank_channel_count=self.params['mel_num_bins'],
        dct_coefficient_count=self.params['dct_num_features'])
    # outputs: [channels/batch, frames, dct_coefficient_count]
    outputs = self.spec_augment(outputs)
    return outputs
def get_mfcc(waveform):
    # Run the spectrogram and MFCC ops to get 2D audio features: short-time FFTs.
    # waveform dims: [time, channels]
    sample_rate = 16000
    spectrogram = audio_ops.audio_spectrogram(waveform,
                                              window_size=320,
                                              stride=160)
    # spectrogram: [channels/batch, frames, fft_feature]
    # Extract MFCC features from the spectrogram with audio_ops.mfcc:
    # 1. Input is spectrogram frames.
    # 2. Weight the spectrogram into bands using a triangular mel filterbank.
    # 3. Logarithmic scaling.
    # 4. Discrete cosine transform (DCT); return the lowest dct_coefficient_count.
    mfccs = audio_ops.mfcc(spectrogram=spectrogram,
                           sample_rate=sample_rate,
                           upper_frequency_limit=7600,
                           lower_frequency_limit=60,
                           filterbank_channel_count=40,
                           dct_coefficient_count=20)
    # mfccs: [channels/batch, frames, dct_coefficient_count]
    # Remove the channel dim (the squeeze was missing despite the comment).
    return tf.squeeze(mfccs, axis=0)
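# An illustrative call, assuming one second of random mono 16 kHz audio; the
# 320/160 window geometry above yields 99 frames of 20 coefficients.
waveform = tf.random.uniform([16000, 1], minval=-1.0, maxval=1.0)
features = get_mfcc(waveform)  # shape: [99, 20] after the channel squeeze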
def samples_to_mfccs_orig(samples, sample_rate, train_phase=False):
    # tf.print('window_size: ', Config.audio_window_samples, ' stride: ', Config.audio_step_samples)
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    # Data augmentations
    if train_phase:
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spectrogram = augment_dropout(spectrogram,
                                          keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
        if FLAGS.augmentation_freq_and_time_masking:
            spectrogram = augment_freq_time_mask(
                spectrogram,
                frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spectrogram = augment_pitch_and_tempo(
                spectrogram,
                max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram,
                                           speed_std=FLAGS.augmentation_speed_up_std)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate,
                               dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    # tf.print('dct_count: ', Config.n_input)
    return mfccs, tf.shape(input=mfccs)[0]
def audio_to_features(audio, sample_rate, transcript=None, clock=0.0,
                      train_phase=False, augmentations=None, sample_id=None):
    if train_phase:
        # We need the lambdas to make TensorFlow happy.
        # pylint: disable=unnecessary-lambda
        tf.cond(tf.math.not_equal(sample_rate, FLAGS.audio_sample_rate),
                lambda: tf.print('WARNING: sample rate of sample', sample_id,
                                 '(', sample_rate, ') does not match '
                                 'FLAGS.audio_sample_rate. This can lead to '
                                 'incorrect results.'),
                lambda: tf.no_op(),
                name='matching_sample_rate')
    if train_phase and augmentations:
        audio = apply_graph_augmentations('signal', audio, augmentations,
                                          transcript=transcript, clock=clock)
    spectrogram = contrib_audio.audio_spectrogram(audio,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    if train_phase and augmentations:
        spectrogram = apply_graph_augmentations('spectrogram', spectrogram,
                                                augmentations,
                                                transcript=transcript,
                                                clock=clock)
    features = contrib_audio.mfcc(spectrogram=spectrogram,
                                  sample_rate=sample_rate,
                                  dct_coefficient_count=Config.n_input,
                                  upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    features = tf.reshape(features, [-1, Config.n_input])
    if train_phase and augmentations:
        features = apply_graph_augmentations('features', features, augmentations,
                                             transcript=transcript, clock=clock)
    return features, tf.shape(input=features)[0]
def get_deepspeech_mfccs(samples, sample_rate=16000):
    decoded = contrib_audio.decode_wav(samples, desired_channels=1)
    spectrogram = contrib_audio.audio_spectrogram(decoded.audio,
                                                  window_size=512,
                                                  stride=320,
                                                  magnitude_squared=True)
    return contrib_audio.mfcc(spectrogram=spectrogram,
                              sample_rate=decoded.sample_rate,
                              dct_coefficient_count=26,
                              upper_frequency_limit=sample_rate / 2)
def AudioToMfcc(sample_rate, audio, window_size_ms, window_stride_ms,
                num_coefficients):
    window_size_samples = sample_rate * window_size_ms // 1000
    window_stride_samples = sample_rate * window_stride_ms // 1000
    spectrogram = audio_ops.audio_spectrogram(audio,
                                              window_size=window_size_samples,
                                              stride=window_stride_samples,
                                              magnitude_squared=True)
    mfcc = audio_ops.mfcc(spectrogram, sample_rate,
                          dct_coefficient_count=num_coefficients)
    return mfcc
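# A hypothetical invocation of the helper above; the file path and window
# geometry are illustrative choices, not values from this file.
wav_bytes = tf.io.read_file('example.wav')  # illustrative path
decoded = tf.audio.decode_wav(wav_bytes, desired_channels=1)
features = AudioToMfcc(sample_rate=16000, audio=decoded.audio,
                       window_size_ms=30, window_stride_ms=10,
                       num_coefficients=13)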
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=Config.audio_window_samples,
        stride=Config.audio_step_samples,
        magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate,
                               dct_coefficient_count=Config.n_input)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    return mfccs, tf.shape(input=mfccs)[0]
def samples_to_mfccs(samples, sample_rate, train_phase=False, sample_id=None):
    if train_phase:
        # We need the lambdas to make TensorFlow happy.
        # pylint: disable=unnecessary-lambda
        tf.cond(tf.math.not_equal(sample_rate, FLAGS.audio_sample_rate),
                lambda: tf.print('WARNING: sample rate of sample', sample_id,
                                 '(', sample_rate, ') does not match '
                                 'FLAGS.audio_sample_rate. This can lead to '
                                 'incorrect results.'),
                lambda: tf.no_op(),
                name='matching_sample_rate')
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    # Data augmentations
    if train_phase:
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spectrogram = augment_dropout(spectrogram,
                                          keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
        # Sparse warp must come before freq/time masking.
        if FLAGS.augmentation_sparse_warp:
            spectrogram = augment_sparse_warp(
                spectrogram,
                time_warping_para=FLAGS.augmentation_sparse_warp_time_warping_para,
                interpolation_order=FLAGS.augmentation_sparse_warp_interpolation_order,
                regularization_weight=FLAGS.augmentation_sparse_warp_regularization_weight,
                num_boundary_points=FLAGS.augmentation_sparse_warp_num_boundary_points,
                num_control_points=FLAGS.augmentation_sparse_warp_num_control_points)
        if FLAGS.augmentation_freq_and_time_masking:
            spectrogram = augment_freq_time_mask(
                spectrogram,
                frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spectrogram = augment_pitch_and_tempo(
                spectrogram,
                max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram,
                                           speed_std=FLAGS.augmentation_speed_up_std)
    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=sample_rate,
                               dct_coefficient_count=Config.n_input,
                               upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    return mfccs, tf.shape(input=mfccs)[0]
def make_features(self, audio: np.ndarray) -> np.ndarray:
    """Uses the TensorFlow audio ops to extract MFCC features from the audio."""
    # Note: despite the ndarray annotation, `audio` is expected to expose a
    # decoded `.audio` field here (e.g. the result of a WAV decode).
    spectrogram = contrib_audio.audio_spectrogram(audio.audio,
                                                  window_size=self.window_size,
                                                  stride=self.window_step,
                                                  magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=self.sample_rate,
                               dct_coefficient_count=self.features_num,
                               upper_frequency_limit=self.sample_rate // 2)
    return self.standardize(mfccs[0]) if self.is_standardization else mfccs[0]
def samples_to_mfccs(samples, sample_rate):
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=512,
                                                  stride=320,
                                                  magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=sample_rate,
                               dct_coefficient_count=26,
                               upper_frequency_limit=4000)
    mfccs = tf.reshape(mfccs, [-1, 26])
    return mfccs, tf.shape(input=mfccs)[0]
def prepare_processing_graph(self, model_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that takes audio data via a placeholder, scales the
    volume, calculates per-channel spectrograms, and then builds an MFCC
    fingerprint from them.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - foreground_data_placeholder_: PCM sample data for the main clip.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - mfcc_: Output fingerprint of processed audio, stacked per channel.

    Args:
      model_settings: Information about the current model being trained.
    """
    desired_samples = model_settings['desired_samples']
    channel_count = model_settings['channel_count']
    sample_rate = model_settings['sample_rate']
    self.foreground_data_placeholder_ = tf.placeholder(
        tf.float32, [desired_samples, channel_count])
    # Allow the audio sample's volume to be adjusted.
    self.foreground_volume_placeholder_ = tf.placeholder(tf.float32, [])
    scaled_foreground = tf.multiply(self.foreground_data_placeholder_,
                                    self.foreground_volume_placeholder_)
    # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
    self.waveform_ = scaled_foreground
    spectrograms = []
    for ichannel in range(channel_count):
        spectrograms.append(
            audio_ops.audio_spectrogram(
                tf.slice(scaled_foreground, [0, ichannel], [-1, 1]),
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True))
    self.spectrogram_ = tf.stack(spectrograms, -1)
    mfccs = []
    for ichannel in range(channel_count):
        mfccs.append(
            audio_ops.mfcc(
                spectrograms[ichannel],
                sample_rate,
                upper_frequency_limit=model_settings['sample_rate'] // 2,
                filterbank_channel_count=model_settings['filterbank_channel_count'],
                dct_coefficient_count=model_settings['dct_coefficient_count']))
    self.mfcc_ = tf.stack(mfccs, -1)
def make_features(self, audio: np.ndarray) -> np.ndarray:
    """Uses the TensorFlow audio ops to extract MFCC features from the audio."""
    audio = audio[:, np.newaxis]
    spectrogram = contrib_audio.audio_spectrogram(audio,
                                                  window_size=self.window_size,
                                                  stride=self.window_step,
                                                  magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=self.sample_rate,
                               dct_coefficient_count=self.features_num,
                               upper_frequency_limit=8000)
    # Take the first channel only.
    return mfccs[0]
def samples_to_mfccs(samples, sample_rate):
    # 16000 = default sample rate
    # 32 = default feature extraction audio window length in milliseconds
    # (cast to int: the spectrogram op takes integer window/stride attrs,
    # and this arithmetic produces floats)
    audio_window_samples = int(16000 * (32 / 1000))
    # 20 = default feature extraction window step length in milliseconds
    audio_step_samples = int(16000 * (20 / 1000))
    spectrogram = contrib_audio.audio_spectrogram(
        samples,
        window_size=audio_window_samples,
        stride=audio_step_samples,
        magnitude_squared=True)
    mfccs = contrib_audio.mfcc(spectrogram, sample_rate,
                               dct_coefficient_count=n_input)
    mfccs = tf.reshape(mfccs, [-1, n_input])
    return mfccs, tf.shape(input=mfccs)[0]
def samples_to_mfccs(samples, sample_rate, train_phase=False):
    spectrogram = contrib_audio.audio_spectrogram(samples,
                                                  window_size=Config.audio_window_samples,
                                                  stride=Config.audio_step_samples,
                                                  magnitude_squared=True)
    # Data augmentations
    if train_phase:
        if FLAGS.augmentation_spec_dropout_keeprate < 1:
            spectrogram = augment_dropout(spectrogram,
                                          keep_prob=FLAGS.augmentation_spec_dropout_keeprate)
        # Sparse warp must come before freq/time masking.
        if FLAGS.augmentation_sparse_warp:
            spectrogram = augment_sparse_warp(
                spectrogram,
                time_warping_para=FLAGS.augmentation_sparse_warp_time_warping_para,
                interpolation_order=FLAGS.augmentation_sparse_warp_interpolation_order,
                regularization_weight=FLAGS.augmentation_sparse_warp_regularization_weight,
                num_boundary_points=FLAGS.augmentation_sparse_warp_num_boundary_points,
                num_control_points=FLAGS.augmentation_sparse_warp_num_control_points)
        if FLAGS.augmentation_freq_and_time_masking:
            spectrogram = augment_freq_time_mask(
                spectrogram,
                frequency_masking_para=FLAGS.augmentation_freq_and_time_masking_freq_mask_range,
                time_masking_para=FLAGS.augmentation_freq_and_time_masking_time_mask_range,
                frequency_mask_num=FLAGS.augmentation_freq_and_time_masking_number_freq_masks,
                time_mask_num=FLAGS.augmentation_freq_and_time_masking_number_time_masks)
        if FLAGS.augmentation_pitch_and_tempo_scaling:
            spectrogram = augment_pitch_and_tempo(
                spectrogram,
                max_tempo=FLAGS.augmentation_pitch_and_tempo_scaling_max_tempo,
                max_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_max_pitch,
                min_pitch=FLAGS.augmentation_pitch_and_tempo_scaling_min_pitch)
        if FLAGS.augmentation_speed_up_std > 0:
            spectrogram = augment_speed_up(spectrogram,
                                           speed_std=FLAGS.augmentation_speed_up_std)
    mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                               sample_rate=sample_rate,
                               dct_coefficient_count=Config.n_input,
                               upper_frequency_limit=FLAGS.audio_sample_rate / 2)
    mfccs = tf.reshape(mfccs, [-1, Config.n_input])
    return mfccs, tf.shape(input=mfccs)[0]
def callback(input_data, frame_count, time_info, flags):
    global samples
    # print("Got audio " + str(frame_count))
    new_samples = np.frombuffer(input_data, np.float32)
    samples = np.concatenate((samples, new_samples))
    samples = samples[-16000:]
    if len(samples) == 16000:
        start = time.perf_counter()
        # Normalise the samples (renamed from `max`, which shadows the builtin).
        normalised = samples - np.mean(samples)
        max_value = np.max(normalised)
        if max_value > 0:
            normalised = normalised / max_value
        # Create the spectrogram.
        spectrogram = audio_ops.audio_spectrogram(
            np.reshape(normalised, (16000, 1)),
            window_size=320,
            stride=160,
            magnitude_squared=True)
        # Reduce the number of frequency bins in the spectrogram to a more
        # sensible level.
        spectrogram = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, 6],
            strides=[1, 6],
            pooling_type='AVG',
            padding='SAME')
        # Remove the leading batch dimension.
        spectrogram = tf.squeeze(spectrogram, axis=0)
        spectrogram = np.log10(spectrogram + 1e-6)
        prediction = model.predict(np.reshape(spectrogram, (1, 99, 43, 1)))
        if prediction[0][0] > 0.9:
            print(f"{datetime.now().time()} - Here I am, brain the size of a planet.... {prediction[0][0]}")
        end = time.perf_counter()
        # print((end - start) * 1000)
    return input_data, pyaudio.paContinue
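# A sketch of the stream setup that would drive the callback above, assuming
# PyAudio delivering float32 mono audio at 16 kHz; the buffer size is an
# arbitrary choice, and `model` is assumed to be a Keras model loaded
# elsewhere.
import numpy as np
import pyaudio

samples = np.zeros(0, dtype=np.float32)  # rolling buffer consumed by callback

pa = pyaudio.PyAudio()
stream = pa.open(format=pyaudio.paFloat32,
                 channels=1,
                 rate=16000,
                 input=True,
                 frames_per_buffer=1600,  # 100 ms chunks
                 stream_callback=callback)
stream.start_stream()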
def get_spectrogram(filename, window_size_samples, window_stride_samples, sess):
    """Creates a spectrogram from PCM-encoded audio data.

    Args:
        filename: Path to the WAV file to load.
        window_size_samples: Spectrogram window size in samples.
        window_stride_samples: Spectrogram window stride in samples.
        sess: Current session being run.

    Returns:
        2-D spectrogram of the audio.
    """
    wav_data = load_wav_file(filename, sess)
    # print(wav_data.shape)
    wav_data_placeholder = tf.compat.v1.placeholder(tf.float32, [None, 1])
    spectrogram = audio_ops.audio_spectrogram(wav_data_placeholder,
                                              window_size=window_size_samples,
                                              stride=window_stride_samples,
                                              magnitude_squared=True)
    spectrogram = sess.run(
        spectrogram,
        feed_dict={wav_data_placeholder: np.reshape(wav_data, (-1, 1))})
    return spectrogram
def powspec_feat(samples, sr=8000, nfft=512, winlen=0.025, winstep=0.010,
                 lowfreq=0, highfreq=None, preemph=0.97):
    '''
    params:
        samples: [nsample, channels]
    returns:
        powspec: power spectrogram, shape [channels, nframe, nfft / 2 + 1]
    '''
    del nfft
    del lowfreq
    del highfreq
    del preemph
    # pylint: disable=no-member
    # Cast to int: the spectrogram op takes integer window/stride attrs,
    # and winlen/winstep are given in seconds.
    feat = audio_ops.audio_spectrogram(samples,
                                       window_size=int(winlen * sr),
                                       stride=int(winstep * sr),
                                       magnitude_squared=True)
    return feat
def calculate_mfcc(audio_signal, audio_sample_rate, window_size, window_stride,
                   num_mfcc):
    """Returns Mel Frequency Cepstral Coefficients (MFCC) for a given audio signal.

    Args:
        audio_signal: Raw audio signal in range [-1, 1]
        audio_sample_rate: Audio signal sample rate
        window_size: Window size in samples for calculating spectrogram
        window_stride: Window stride in samples for calculating spectrogram
        num_mfcc: The number of MFCC features wanted.

    Returns:
        Calculated MFCC features.
    """
    spectrogram = audio_ops.audio_spectrogram(input=audio_signal,
                                              window_size=window_size,
                                              stride=window_stride,
                                              magnitude_squared=True)
    mfcc_features = audio_ops.mfcc(spectrogram,
                                   audio_sample_rate,
                                   dct_coefficient_count=num_mfcc)
    return mfcc_features
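# A hypothetical invocation: one second of random mono audio at 16 kHz with a
# common 25 ms / 10 ms window geometry (400 / 160 samples); all values are
# illustrative.
waveform = tf.random.uniform([16000, 1], minval=-1.0, maxval=1.0)
features = calculate_mfcc(waveform, audio_sample_rate=16000,
                          window_size=400, window_stride=160, num_mfcc=13)
# features: [1 channel, 98 frames, 13 coefficients]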
def create_inference_graph(wanted_words, sample_rate, clip_duration_ms,
                           clip_stride_ms, window_size_ms, window_stride_ms,
                           feature_bin_count, model_architecture, preprocess):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      feature_bin_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
      preprocess: How the spectrogram is processed to produce features, for
        example 'mfcc', 'average', or 'micro'.

    Returns:
      Input and output tensor objects.

    Raises:
      Exception: If the preprocessing mode isn't recognized.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','))
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, clip_duration_ms, window_size_ms,
        window_stride_ms, feature_bin_count, preprocess)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}
    wav_data_placeholder = tf.compat.v1.placeholder(tf.string, [],
                                                    name='wav_data')
    decoded_sample_data = tf.audio.decode_wav(
        wav_data_placeholder,
        desired_channels=1,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrogram = audio_ops.audio_spectrogram(
        decoded_sample_data.audio,
        window_size=model_settings['window_size_samples'],
        stride=model_settings['window_stride_samples'],
        magnitude_squared=True)
    if preprocess == 'average':
        fingerprint_input = tf.nn.pool(
            input=tf.expand_dims(spectrogram, -1),
            window_shape=[1, model_settings['average_window_width']],
            strides=[1, model_settings['average_window_width']],
            pooling_type='AVG',
            padding='SAME')
    elif preprocess == 'mfcc':
        fingerprint_input = audio_ops.mfcc(
            spectrogram,
            sample_rate,
            dct_coefficient_count=model_settings['fingerprint_width'])
    elif preprocess == 'micro':
        if not frontend_op:
            raise Exception(
                'Micro frontend op is currently not available when running'
                ' TensorFlow directly from Python, you need to build and run'
                ' through Bazel, for example'
                ' `bazel run tensorflow/examples/speech_commands:freeze_graph`')
        sample_rate = model_settings['sample_rate']
        window_size_ms = (model_settings['window_size_samples'] * 1000) / sample_rate
        window_step_ms = (model_settings['window_stride_samples'] * 1000) / sample_rate
        int16_input = tf.cast(tf.multiply(decoded_sample_data.audio, 32767),
                              tf.int16)
        micro_frontend = frontend_op.audio_microfrontend(
            int16_input,
            sample_rate=sample_rate,
            window_size=window_size_ms,
            window_step=window_step_ms,
            num_channels=model_settings['fingerprint_width'],
            out_scale=1,
            out_type=tf.float32)
        fingerprint_input = tf.multiply(micro_frontend, (10.0 / 256.0))
    elif preprocess == "rune":
        fingerprint_input = np.random.uniform(0, 26, 1960).astype(np.float32)
    else:
        raise Exception('Unknown preprocess mode "%s" (should be "mfcc",'
                        ' "average", or "micro")' % (preprocess))
    fingerprint_size = model_settings['fingerprint_size']
    reshaped_input = tf.reshape(fingerprint_input, [-1, fingerprint_size])
    logits = models.create_model(reshaped_input,
                                 model_settings,
                                 model_architecture,
                                 is_training=False,
                                 runtime_settings=runtime_settings)
    # Create an output to use for inference.
    softmax = tf.nn.softmax(logits, name='labels_softmax')
    return reshaped_input, softmax
def prepare_processing_graph(self, model_settings, summaries_dir):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio.

    Args:
      model_settings: Information about the current model being trained.
      summaries_dir: Path to save training summary information to.

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.compat.v1.get_default_graph().name_scope('data'):
        desired_samples = model_settings['desired_samples']
        self.wav_filename_placeholder_ = tf.compat.v1.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                                          desired_channels=1,
                                          desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [], name='foreground_volume')
        scaled_foreground = tf.multiply(wav_decoder.audio,
                                        self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        # (The padding placeholder definition was missing from the code; it is
        # required by the tf.pad call below and is restored per the docstring.)
        self.time_shift_padding_placeholder_ = tf.compat.v1.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.compat.v1.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.compat.v1.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        # Run the spectrogram and MFCC ops to get a 2D 'fingerprint' of the audio.
        spectrogram = audio_ops.audio_spectrogram(
            background_clamp,
            window_size=model_settings['window_size_samples'],
            stride=model_settings['window_stride_samples'],
            magnitude_squared=True)
        # Summary image removed:
        # tf.compat.v1.summary.image(
        #     'spectrogram', tf.expand_dims(spectrogram, -1), max_outputs=1)
        # The number of buckets in each FFT row in the spectrogram will depend on
        # how many input samples there are in each window. This can be quite
        # large, with a 160 sample window producing 127 buckets for example. We
        # don't need this level of detail for classification, so we often want to
        # shrink them down to produce a smaller result. That's what this section
        # implements. One method is to use average pooling to merge adjacent
        # buckets, but a more sophisticated approach is to apply the MFCC
        # algorithm to shrink the representation.
        if model_settings['preprocess'] == 'average':
            self.output_ = tf.nn.pool(
                input=tf.expand_dims(spectrogram, -1),
                window_shape=[1, model_settings['average_window_width']],
                strides=[1, model_settings['average_window_width']],
                pooling_type='AVG',
                padding='SAME')
        elif model_settings['preprocess'] == 'mfcc':
            self.output_ = audio_ops.mfcc(
                spectrogram,
                wav_decoder.sample_rate,
                dct_coefficient_count=model_settings['fingerprint_width'])
        elif model_settings['preprocess'] == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            sample_rate = model_settings['sample_rate']
            window_size_ms = (model_settings['window_size_samples'] *
                              1000) / sample_rate
            window_step_ms = (model_settings['window_stride_samples'] *
                              1000) / sample_rate
            int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                  tf.int16)
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=sample_rate,
                window_size=window_size_ms,
                window_step=window_step_ms,
                num_channels=model_settings['fingerprint_width'],
                out_scale=1,
                out_type=tf.float32)
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
        else:
            raise ValueError(
                'Unknown preprocess mode "%s" (should be "mfcc",'
                ' "average", or "micro")' % (model_settings['preprocess']))
def preprocess(audio, label):
    # If we're time shifting, set up the offset for this sample.
    if time_shift > 0:
        time_shift_amount = tf.random.uniform([], -time_shift, time_shift,
                                              dtype=tf.int32)
    else:
        time_shift_amount = 0
    if time_shift_amount > 0:
        time_shift_padding = [[time_shift_amount, 0], [0, 0]]
        time_shift_offset = [0, 0]
    else:
        time_shift_padding = [[0, -time_shift_amount], [0, 0]]
        time_shift_offset = [-time_shift_amount, 0]
    # Choose a section of background noise to mix in.
    if use_background or label == SILENCE_INDEX:
        background_index = tf.random.uniform([], 0,
                                             self.background_data.shape[0],
                                             dtype=tf.int32)
        background_samples = self.background_data[background_index]
        background_offset = tf.random.uniform(
            [], 0, tf.shape(background_samples)[0] - desired_samples,
            dtype=tf.int32)
        background_clipped = background_samples[background_offset:(
            background_offset + desired_samples)]
        background_data = tf.reshape(background_clipped, [desired_samples, 1])
        if label == SILENCE_INDEX:
            background_volume = tf.random.uniform([], 0, 1)
        elif tf.random.uniform([], 0, 1) < background_frequency:
            background_volume = tf.random.uniform([], 0,
                                                  background_volume_range)
        else:
            background_volume = 0.0
    else:
        background_data = tf.zeros([desired_samples, 1])
        background_volume = 0.0
    # If we want silence, mute out the main sample but leave the background.
    foreground_volume = 0.0 if label == SILENCE_INDEX else 1.0
    # Allow the audio sample's volume to be adjusted.
    scaled_foreground = tf.multiply(audio, foreground_volume)
    # Shift the sample's start position, and pad any gaps with zeros.
    padded_foreground = tf.pad(tensor=scaled_foreground,
                               paddings=time_shift_padding,
                               mode='CONSTANT')
    sliced_foreground = tf.slice(padded_foreground, time_shift_offset,
                                 [desired_samples, -1])
    sliced_foreground.set_shape((sliced_foreground.shape[0], 1))
    # Mix in background noise.
    background_volume = tf.cast(background_volume, tf.float32)
    background_mul = tf.multiply(background_data, background_volume)
    background_add = tf.add(background_mul, sliced_foreground)
    background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
    spectrogram = audio_ops.audio_spectrogram(background_clamp,
                                              window_size=frame_length,
                                              stride=frame_step,
                                              magnitude_squared=True)
    x = audio_ops.mfcc(spectrogram,
                       sample_rate,
                       dct_coefficient_count=num_channels,
                       upper_frequency_limit=7500,
                       lower_frequency_limit=20)
    x = tf.reshape(x, (spectrogram_length, num_channels, 1))
    return x, label
def prepare_processing_graph(self, flags):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      flags: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = flags.desired_samples
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                                          desired_channels=1,
                                          desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        # Signal resampling to generate more training data: it will stretch
        # or squeeze the input signal proportionally to the placeholder value.
        self.foreground_resampling_placeholder_ = tf.placeholder(tf.float32, [])
        # NOTE: this is a Python-level comparison on a placeholder, evaluated
        # once at graph-construction time; it is always True, so the resize
        # path below is always built into the graph.
        if self.foreground_resampling_placeholder_ != 1.0:
            image = tf.expand_dims(wav_decoder.audio, 0)
            image = tf.expand_dims(image, 2)
            shape = tf.shape(wav_decoder.audio)
            image_resized = tf.image.resize(
                images=image,
                size=(tf.cast((tf.cast(shape[0], tf.float32) *
                               self.foreground_resampling_placeholder_),
                              tf.int32), 1),
                preserve_aspect_ratio=False)
            image_resized_cropped = tf.image.resize_with_crop_or_pad(
                image_resized,
                target_height=desired_samples,
                target_width=1,
            )
            image_resized_cropped = tf.squeeze(image_resized_cropped,
                                               axis=[0, 3])
            scaled_foreground = tf.multiply(
                image_resized_cropped, self.foreground_volume_placeholder_)
        else:
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        if flags.preprocess == 'raw':
            # background_clamp dims: [time, channels]; remove channel dim.
            self.output_ = tf.squeeze(background_clamp, axis=1)
        # The options below are for backward compatibility with the previous
        # version of hotword detection on microcontrollers; in this case audio
        # feature extraction is done separately from the neural net and the
        # user will have to manage it.
        elif flags.preprocess == 'mfcc':
            # Run the spectrogram and MFCC ops to get 2D audio features:
            # short-time FFTs. background_clamp dims: [time, channels].
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=flags.window_size_samples,
                stride=flags.window_stride_samples,
                magnitude_squared=flags.fft_magnitude_squared)
            # spectrogram: [channels/batch, frames, fft_feature]
            # Extract MFCC features from the spectrogram with audio_ops.mfcc:
            # 1. Input is spectrogram frames.
            # 2. Weight the spectrogram into bands using a triangular mel filterbank.
            # 3. Logarithmic scaling.
            # 4. Discrete cosine transform (DCT); return the lowest dct_coefficient_count.
            mfcc = audio_ops.mfcc(
                spectrogram=spectrogram,
                sample_rate=flags.sample_rate,
                upper_frequency_limit=flags.mel_upper_edge_hertz,
                lower_frequency_limit=flags.mel_lower_edge_hertz,
                filterbank_channel_count=flags.mel_num_bins,
                dct_coefficient_count=flags.dct_num_features)
            # mfcc: [channels/batch, frames, dct_coefficient_count]
            # Remove channel dim.
            self.output_ = tf.squeeze(mfcc, axis=0)
        elif flags.preprocess == 'micro':
            if not frontend_op:
                raise Exception(
                    'Micro frontend op is currently not available when running'
                    ' TensorFlow directly from Python, you need to build and run'
                    ' through Bazel')
            int16_input = tf.cast(
                tf.multiply(background_clamp, MAX_ABS_INT16), tf.int16)
            # audio_microfrontend does:
            # 1. A sliding window function over the raw audio.
            # 2. Short-time FFTs.
            # 3. Filterbank calculations.
            # 4. Noise reduction.
            # 5. PCAN auto gain control.
            # 6. Logarithmic scaling.
            # int16_input dims: [time, channels]
            micro_frontend = frontend_op.audio_microfrontend(
                int16_input,
                sample_rate=flags.sample_rate,
                window_size=flags.window_size_ms,
                window_step=flags.window_stride_ms,
                num_channels=flags.mel_num_bins,
                upper_band_limit=flags.mel_upper_edge_hertz,
                lower_band_limit=flags.mel_lower_edge_hertz,
                out_scale=1,
                out_type=tf.float32)
            # micro_frontend dims: [frames, num_channels]
            self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
        else:
            raise ValueError(
                'Unknown preprocess mode "%s" (should be "raw",'
                ' "mfcc", or "micro")' % (flags.preprocess))
def gen_spectrogram_onnx_test_model(model_path, window_count, window_size,
                                    stride, magnitude_squared=True):
    # Tensor sizes.
    input_length = window_size + (window_count - 1) * stride
    fft_length = int(2**np.ceil(np.log2(window_size)))
    input_shape = [1, input_length]
    spectrogram_length = int(fft_length / 2 + 1)
    spectrogram_shape = [window_count, spectrogram_length]

    # Generate random input data.
    np.random.seed(1)
    input_data = np.random.randn(*input_shape)

    # ----------------------- COMPUTE TensorFlow REFERENCE -----------------------
    # Define TensorFlow model.
    tf_input = tf.constant(input_data.reshape([input_length, 1]),
                           name='input', dtype=tf.float32)
    tf_spectrogram = audio_ops.audio_spectrogram(
        tf_input,
        window_size=window_size,
        stride=stride,
        magnitude_squared=magnitude_squared)

    # Run TensorFlow model and get reference output.
    with tf.Session() as sess:
        spectrogram_ref = sess.run(tf_spectrogram)
    spectrogram_ref = np.reshape(spectrogram_ref, spectrogram_shape)

    # ------------------------------ NODE DEFINITION -----------------------------
    # AudioSpectrogram node definition.
    spectrogram_node_def = onnx.helper.make_node(
        'AudioSpectrogram',
        name='audio_spectrogram',
        inputs=['input'],
        outputs=['spectrogram'],
        window_size=int(window_size),
        stride=int(stride),
        magnitude_squared=int(magnitude_squared))

    # Error node definition.
    err_node_def = onnx.helper.make_node(
        'Sub',
        name='error',
        inputs=['spectrogram', 'spectrogram_ref'],
        outputs=['spectrogram_err'])

    # ----------------------------- GRAPH DEFINITION -----------------------------
    graph_input = list()
    graph_init = list()
    graph_output = list()

    # Graph inputs.
    graph_input.append(
        helper.make_tensor_value_info('input', TensorProto.FLOAT, input_shape))
    graph_input.append(
        helper.make_tensor_value_info('spectrogram_ref', TensorProto.FLOAT,
                                      spectrogram_shape))

    # Graph initializers.
    graph_init.append(make_init('input', TensorProto.FLOAT, input_data))
    graph_init.append(
        make_init('spectrogram_ref', TensorProto.FLOAT, spectrogram_ref))

    # Graph outputs.
    graph_output.append(
        helper.make_tensor_value_info('spectrogram_err', TensorProto.FLOAT,
                                      spectrogram_shape))

    # Graph name.
    graph_name = 'audio_spectrogram_test'

    # Define graph (GraphProto).
    graph_def = helper.make_graph([spectrogram_node_def, err_node_def],
                                  graph_name,
                                  inputs=graph_input,
                                  outputs=graph_output)

    # Set initializers.
    graph_def.initializer.extend(graph_init)

    # ----------------------------- MODEL DEFINITION -----------------------------
    # Define model (ModelProto).
    model_def = helper.make_model(graph_def,
                                  producer_name='onnx-audio-spectrogram')

    # Write the model out as a text proto.
    with open(model_path, 'w') as f:
        f.write(str(model_def))
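# An illustrative smoke-test invocation of the generator above; the output
# path and window geometry are arbitrary choices, not values from this file.
gen_spectrogram_onnx_test_model('audio_spectrogram_test.onnxtxt',
                                window_count=9, window_size=512,
                                stride=256, magnitude_squared=True)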
def create_inference_graph(
        wanted_words, sample_rate, nchannels, clip_duration_ms, clip_stride_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, model_architecture,
        filter_counts, filter_sizes, final_filter_len, dropout_prob, batch_size,
        dilate_after_layer, stride_after_layer, connection_type,
        silence_percentage, unknown_percentage):
    """Creates an audio model with the nodes needed for inference.

    Uses the supplied arguments to create a model, and inserts the input and
    output nodes that are needed to use the graph for inference.

    Args:
      wanted_words: Comma-separated list of the words we're trying to recognize.
      sample_rate: How many samples per second are in the input audio files.
      clip_duration_ms: How many samples to analyze for the audio pattern.
      clip_stride_ms: How often to run recognition. Useful for models with cache.
      window_size_ms: Time slice duration to estimate frequencies from.
      window_stride_ms: How far apart time slices should be.
      dct_coefficient_count: Number of frequency bands to analyze.
      model_architecture: Name of the kind of model to generate.
    """
    words_list = input_data.prepare_words_list(wanted_words.split(','),
                                               silence_percentage,
                                               unknown_percentage)
    model_settings = models.prepare_model_settings(
        len(words_list), sample_rate, nchannels, clip_duration_ms,
        representation, window_size_ms, window_stride_ms, nwindows,
        dct_coefficient_count, filterbank_channel_count, filter_counts,
        filter_sizes, final_filter_len, dropout_prob, batch_size,
        dilate_after_layer, stride_after_layer, connection_type)
    runtime_settings = {'clip_stride_ms': clip_stride_ms}
    wav_data_placeholder = tf.placeholder(tf.string, [], name='wav_data')
    decoded_sample_data = audio_ops.decode_wav(
        wav_data_placeholder,
        desired_channels=nchannels,
        desired_samples=model_settings['desired_samples'],
        name='decoded_sample_data')
    spectrograms = []
    for ichannel in range(nchannels):
        # Slice out one channel per iteration (cf. the per-channel handling in
        # prepare_processing_graph); passing the full multi-channel audio here
        # would compute identical spectrograms on every pass.
        spectrograms.append(
            audio_ops.audio_spectrogram(
                tf.slice(decoded_sample_data.audio, [0, ichannel], [-1, 1]),
                window_size=model_settings['window_size_samples'],
                stride=model_settings['window_stride_samples'],
                magnitude_squared=True))
    spectrogram = tf.stack(spectrograms, -1)
    mfccs = []
    for ichannel in range(nchannels):
        mfccs.append(
            audio_ops.mfcc(spectrograms[ichannel],
                           decoded_sample_data.sample_rate,
                           upper_frequency_limit=sample_rate // 2,
                           filterbank_channel_count=filterbank_channel_count,
                           dct_coefficient_count=dct_coefficient_count))
    mfcc = tf.stack(mfccs, -1)
    if representation == 'waveform':
        fingerprint_input = decoded_sample_data.audio
    elif representation == 'spectrogram':
        fingerprint_input = spectrogram
    elif representation == 'mel-cepstrum':
        fingerprint_input = mfcc
    reshaped_input = tf.reshape(fingerprint_input,
                                [-1, model_settings['fingerprint_size']])
    hidden_layers, final = models.create_model(
        reshaped_input,
        model_settings,
        model_architecture,
        is_training=False,
        runtime_settings=runtime_settings)
    # Create an output to use for inference.
    for i in range(len(hidden_layers)):
        tf.identity(hidden_layers[i], name='hidden_layer' + str(i))
    tf.nn.softmax(final, name='output_layer')
def gen_mfcc_onnx_test_model(model_path, window_count, window_size, stride,
                             sample_rate, lower_frequency_limit,
                             upper_frequency_limit, filterbank_channel_count,
                             dct_coefficient_count):
    # Tensor sizes.
    input_length = window_size + (window_count - 1) * stride
    fft_length = int(2**np.ceil(np.log2(window_size)))
    input_shape = [1, input_length]
    spectrogram_length = int(fft_length / 2 + 1)
    spectrogram_shape = [window_count, spectrogram_length]
    coefficients_shape = [window_count, dct_coefficient_count]

    # Generate random input data.
    np.random.seed(1)
    input_data = np.random.randn(*input_shape)

    # ----------------------- COMPUTE TensorFlow REFERENCE -----------------------
    # Define TensorFlow model.
    tf_input = tf.constant(input_data.reshape([input_length, 1]),
                           name='input', dtype=tf.float32)
    tf_spectrogram = audio_ops.audio_spectrogram(tf_input,
                                                 window_size=window_size,
                                                 stride=stride,
                                                 magnitude_squared=True)
    tf_mfcc = audio_ops.mfcc(spectrogram=tf_spectrogram,
                             sample_rate=sample_rate,
                             upper_frequency_limit=upper_frequency_limit,
                             lower_frequency_limit=lower_frequency_limit,
                             filterbank_channel_count=filterbank_channel_count,
                             dct_coefficient_count=dct_coefficient_count)

    # Run TensorFlow model and get the spectrogram input.
    with tf.Session() as sess:
        spectrogram = sess.run(tf_spectrogram)
    spectrogram = np.reshape(spectrogram, spectrogram_shape)

    # Run TensorFlow model and get the reference output coefficients.
    with tf.Session() as sess:
        coefficients_ref = sess.run(tf_mfcc)
    coefficients_ref = np.reshape(coefficients_ref, coefficients_shape)

    # ------------------------------ NODE DEFINITION -----------------------------
    # MFCC node definition.
    mfcc_node_def = onnx.helper.make_node(
        'MFCC',
        name='mfcc',
        inputs=['spectrogram'],
        outputs=['coefficients'],
        sample_rate=float(sample_rate),
        lower_frequency_limit=float(lower_frequency_limit),
        upper_frequency_limit=float(upper_frequency_limit),
        filterbank_channel_count=int(filterbank_channel_count),
        dct_coefficient_count=int(dct_coefficient_count))

    # Error node definition.
    err_node_def = onnx.helper.make_node(
        'Sub',
        name='error',
        inputs=['coefficients', 'coefficients_ref'],
        outputs=['coefficients_err'])

    # ----------------------------- GRAPH DEFINITION -----------------------------
    graph_input = list()
    graph_init = list()
    graph_output = list()

    # Graph inputs.
    graph_input.append(
        helper.make_tensor_value_info('spectrogram', TensorProto.FLOAT,
                                      spectrogram_shape))
    graph_input.append(
        helper.make_tensor_value_info('coefficients_ref', TensorProto.FLOAT,
                                      coefficients_shape))

    # Graph initializers.
    graph_init.append(make_init('spectrogram', TensorProto.FLOAT, spectrogram))
    graph_init.append(
        make_init('coefficients_ref', TensorProto.FLOAT, coefficients_ref))

    # Graph outputs.
    graph_output.append(
        helper.make_tensor_value_info('coefficients_err', TensorProto.FLOAT,
                                      coefficients_shape))

    # Graph name.
    graph_name = 'mfcc_test'

    # Define graph (GraphProto).
    graph_def = helper.make_graph([mfcc_node_def, err_node_def],
                                  graph_name,
                                  inputs=graph_input,
                                  outputs=graph_output)

    # Set initializers.
    graph_def.initializer.extend(graph_init)

    # ----------------------------- MODEL DEFINITION -----------------------------
    # Define model (ModelProto).
    model_def = helper.make_model(graph_def, producer_name='onnx-mfcc')

    # Write the model out as a text proto.
    with open(model_path, 'w') as f:
        f.write(str(model_def))
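# A matching smoke-test invocation for the MFCC generator; all parameter
# values are illustrative.
gen_mfcc_onnx_test_model('mfcc_test.onnxtxt',
                         window_count=9, window_size=512, stride=256,
                         sample_rate=16000, lower_frequency_limit=20,
                         upper_frequency_limit=4000,
                         filterbank_channel_count=40,
                         dct_coefficient_count=13)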
def prepare_processing_graph(self, data_settings):
    """Builds a TensorFlow graph to apply the input distortions.

    Creates a graph that loads a WAVE file, decodes it, scales the volume,
    shifts it in time, adds in background noise, calculates a spectrogram, and
    then builds an MFCC fingerprint from that.

    This must be called with an active TensorFlow session running, and it
    creates multiple placeholder inputs, and one output:

      - wav_filename_placeholder_: Filename of the WAV to load.
      - foreground_volume_placeholder_: How loud the main clip should be.
      - foreground_resampling_placeholder_: Controls signal stretching/squeezing.
      - time_shift_padding_placeholder_: Where to pad the clip.
      - time_shift_offset_placeholder_: How much to move the clip in time.
      - background_data_placeholder_: PCM sample data for background noise.
      - background_volume_placeholder_: Loudness of mixed-in background.
      - output_: Output 2D fingerprint of processed audio or raw audio.

    Args:
      data_settings: data and model parameters, described at model_train.py

    Raises:
      ValueError: If the preprocessing mode isn't recognized.
      Exception: If the preprocessor wasn't compiled in.
    """
    with tf.get_default_graph().name_scope('data'):
        desired_samples = data_settings.desired_samples
        self.wav_filename_placeholder_ = tf.placeholder(
            tf.string, [], name='wav_filename')
        wav_loader = io_ops.read_file(self.wav_filename_placeholder_)
        wav_decoder = tf.audio.decode_wav(wav_loader,
                                          desired_channels=1,
                                          desired_samples=desired_samples)
        # Allow the audio sample's volume to be adjusted.
        self.foreground_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='foreground_volume')
        # Signal resampling to generate more training data: it will stretch
        # or squeeze the input signal proportionally to the placeholder value.
        self.foreground_resampling_placeholder_ = tf.placeholder(tf.float32, [])
        # NOTE: this is a Python-level comparison on a placeholder, evaluated
        # once at graph-construction time; it is always True, so the resize
        # path below is always built into the graph.
        if self.foreground_resampling_placeholder_ != 1.0:
            image = tf.expand_dims(wav_decoder.audio, 0)
            image = tf.expand_dims(image, 2)
            shape = tf.shape(wav_decoder.audio)
            image_resized = tf.image.resize(
                images=image,
                size=(tf.cast((tf.cast(shape[0], tf.float32) *
                               self.foreground_resampling_placeholder_),
                              tf.int32), 1),
                preserve_aspect_ratio=False)
            image_resized_cropped = tf.image.resize_with_crop_or_pad(
                image_resized,
                target_height=desired_samples,
                target_width=1,
            )
            image_resized_cropped = tf.squeeze(image_resized_cropped,
                                               axis=[0, 3])
            scaled_foreground = tf.multiply(
                image_resized_cropped, self.foreground_volume_placeholder_)
        else:
            scaled_foreground = tf.multiply(
                wav_decoder.audio, self.foreground_volume_placeholder_)
        # Shift the sample's start position, and pad any gaps with zeros.
        self.time_shift_padding_placeholder_ = tf.placeholder(
            tf.int32, [2, 2], name='time_shift_padding')
        self.time_shift_offset_placeholder_ = tf.placeholder(
            tf.int32, [2], name='time_shift_offset')
        padded_foreground = tf.pad(
            tensor=scaled_foreground,
            paddings=self.time_shift_padding_placeholder_,
            mode='CONSTANT')
        sliced_foreground = tf.slice(padded_foreground,
                                     self.time_shift_offset_placeholder_,
                                     [desired_samples, -1])
        # Mix in background noise.
        self.background_data_placeholder_ = tf.placeholder(
            tf.float32, [desired_samples, 1], name='background_data')
        self.background_volume_placeholder_ = tf.placeholder(
            tf.float32, [], name='background_volume')
        background_mul = tf.multiply(self.background_data_placeholder_,
                                     self.background_volume_placeholder_)
        background_add = tf.add(background_mul, sliced_foreground)
        background_clamp = tf.clip_by_value(background_add, -1.0, 1.0)
        if data_settings.preprocess == 'raw':
            # Return raw audio.
            self.output_ = background_clamp
            tf.summary.image('input_audio',
                             tf.expand_dims(tf.expand_dims(background_clamp, -1), -1),
                             max_outputs=1)
        else:
            # Run the spectrogram and MFCC ops to get a 2D audio 'fingerprint'.
            spectrogram = audio_ops.audio_spectrogram(
                background_clamp,
                window_size=data_settings.window_size_samples,
                stride=data_settings.window_stride_samples,
                magnitude_squared=True)
            tf.summary.image('spectrogram',
                             tf.expand_dims(spectrogram, -1),
                             max_outputs=1)
            # The number of buckets in each FFT row in the spectrogram will depend
            # on how many input samples there are in each window. This can be quite
            # large, with a 160 sample window producing 127 buckets for example. We
            # don't need this level of detail for classification, so we often want
            # to shrink them down to produce a smaller result. That's what this
            # section implements. One method is to use average pooling to merge
            # adjacent buckets, but a more sophisticated approach is to apply the
            # MFCC algorithm to shrink the representation.
            if data_settings.preprocess == 'average':
                self.output_ = tf.nn.pool(
                    input=tf.expand_dims(spectrogram, -1),
                    window_shape=[1, data_settings.average_window_width],
                    strides=[1, data_settings.average_window_width],
                    pooling_type='AVG',
                    padding='SAME')
                tf.summary.image('shrunk_spectrogram', self.output_,
                                 max_outputs=1)
            elif data_settings.preprocess == 'mfcc':
                self.output_ = audio_ops.mfcc(
                    spectrogram,
                    wav_decoder.sample_rate,
                    dct_coefficient_count=data_settings.fingerprint_width)
                tf.summary.image('mfcc',
                                 tf.expand_dims(self.output_, -1),
                                 max_outputs=1)
            elif data_settings.preprocess == 'micro':
                if not frontend_op:
                    raise Exception(
                        'Micro frontend op is currently not available when'
                        ' running TensorFlow directly from Python, you need to'
                        ' build and run through Bazel')
                sample_rate = data_settings.sample_rate
                window_size_ms = (data_settings.window_size_samples *
                                  1000) / sample_rate
                window_step_ms = (data_settings.window_stride_samples *
                                  1000) / sample_rate
                int16_input = tf.cast(tf.multiply(background_clamp, 32768),
                                      tf.int16)
                micro_frontend = frontend_op.audio_microfrontend(
                    int16_input,
                    sample_rate=sample_rate,
                    window_size=window_size_ms,
                    window_step=window_step_ms,
                    num_channels=data_settings.fingerprint_width,
                    out_scale=1,
                    out_type=tf.float32)
                self.output_ = tf.multiply(micro_frontend, (10.0 / 256.0))
                tf.summary.image('micro',
                                 tf.expand_dims(tf.expand_dims(self.output_, -1), 0),
                                 max_outputs=1)
            else:
                raise ValueError(
                    'Unknown preprocess mode "%s" (should be "mfcc",'
                    ' "average", or "micro")' % (data_settings.preprocess))
        # Merge all the summaries and write them out to /tmp/retrain_logs
        # (by default).
        self.merged_summaries_ = tf.summary.merge_all(scope='data')
        if data_settings.summaries_dir:
            self.summary_writer_ = tf.summary.FileWriter(
                data_settings.summaries_dir + '/data', tf.get_default_graph())
import numpy as np
import tensorflow.compat.v1 as tf
from tensorflow.python.ops import gen_audio_ops as contrib_audio

# Placeholders require graph mode, so eager execution must be disabled
# when running under TF 2.x (this line was commented out in the original).
tf.disable_eager_execution()

signal = tf.placeholder(tf.float32, [None], name='signal')
spectrogram = contrib_audio.audio_spectrogram(tf.expand_dims(signal, 1),
                                              window_size=512,
                                              stride=320,
                                              magnitude_squared=True)
mfccs = contrib_audio.mfcc(spectrogram=spectrogram,
                           sample_rate=16000,
                           dct_coefficient_count=26,
                           upper_frequency_limit=16000 / 2)
mfccs = tf.reshape(mfccs, [-1, 26])

sess = tf.Session()


def audio2mfcc(samples):
    return sess.run(mfccs, feed_dict={signal: samples})


if __name__ == '__main__':
    # `Audio` is an external helper (not imported here) that reads a WAV
    # file into a float array at the given sample rate.
    audio = Audio.read('test.wav', 16000)
    energy = np.abs(audio)
    silence_threshold = np.percentile(energy, 95)
    offsets = np.where(energy > silence_threshold)[0]
    # left_blank_duration_ms = (1000.0 * offsets[0]) // self.sample_rate
    # frame_id to duration (ms)