def setUp(self):
  super(FrameTestBase, self).setUp()

  self.frame_size = 7
  self.frame_step = 5
  self.inference_batch_size = 1

  # generate input signal
  np.random.seed(1)
  self.data_size = 33
  self.signal = np.random.rand(self.inference_batch_size, self.data_size)

  # non streaming frame extraction based on tf.signal.frame
  data_frame_tf = dataframe.DataFrame(
      mode=Modes.TRAINING,
      inference_batch_size=self.inference_batch_size,
      frame_size=self.frame_size,
      frame_step=self.frame_step)
  # it receives all data with size: data_size
  input1 = tf.keras.layers.Input(
      shape=(self.data_size,),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  output1 = data_frame_tf(inputs=input1)
  self.model_tf = tf.keras.models.Model(input1, output1)

  # generate frames for the whole signal (no streaming here)
  self.output_frames_tf = self.model_tf.predict(self.signal)
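
# A minimal standalone sketch (not part of the test) of the framing arithmetic
# that setUp relies on: tf.signal.frame with frame_length=7 and frame_step=5
# over 33 samples yields 1 + (33 - 7) // 5 == 6 frames. The variable names
# below are illustrative only.
import numpy as np
import tensorflow as tf

sketch_signal = np.random.rand(1, 33).astype(np.float32)
sketch_frames = tf.signal.frame(sketch_signal, frame_length=7, frame_step=5)
assert sketch_frames.shape == (1, 6, 7)  # 6 frames of 7 samples each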
def model(flags):
  """Fully connected layer based model on raw wav data.

  It is based on paper (with added pooling and raw audio data):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  if flags.preprocess != 'raw':
    raise ValueError('input audio has to be raw, but got %s' % flags.preprocess)

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = dataframe.DataFrame(
      frame_size=flags.window_size_samples,
      frame_step=flags.window_stride_samples)(input_audio)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm, etc.
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
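
# A hypothetical usage sketch for model(flags). It assumes `parse` evaluates a
# comma-separated hyperparameter string into a tuple (as the zip over units
# and activations above suggests); all flag values here are placeholders for
# a 16 kHz signal with 25 ms / 10 ms windows, not recommended settings.
from types import SimpleNamespace

sketch_flags = SimpleNamespace(
    preprocess='raw',
    desired_samples=16000,
    batch_size=1,
    window_size_samples=400,
    window_stride_samples=160,
    units1='64,128',
    act1="'relu','relu'",
    pool_size=2,
    strides=2,
    dropout1=0.1,
    units2='128,256',
    act2="'linear','relu'",
    label_count=12,
    return_softmax=True,
)
dnn_raw_model = model(sketch_flags)
dnn_raw_model.summary()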
def test_tf_non_streaming_vs_streaming_internal_state(self):
  # prepare streaming frame extraction model with internal state
  data_frame_stream = dataframe.DataFrame(
      mode=Modes.STREAM_INTERNAL_STATE_INFERENCE,
      inference_batch_size=self.inference_batch_size,
      frame_size=self.frame_size,
      frame_step=self.frame_step)
  # it receives input data incrementally with step: frame_step
  input2 = tf.keras.layers.Input(
      shape=(self.frame_step,),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  output2 = data_frame_stream(input2)
  model_stream = tf.keras.models.Model(input2, output2)

  # initialize internal state of data framer
  pre_state = self.signal[:, 0:data_frame_stream.frame_size -
                          data_frame_stream.frame_step]
  state_init = np.concatenate(
      (np.zeros(shape=(1, data_frame_stream.frame_step), dtype=np.float32),
       pre_state),
      axis=1)
  data_frame_stream.set_weights([state_init])

  start = self.frame_size - self.frame_step
  end = self.frame_size
  streamed_frames = []

  # run streaming frames extraction
  while end <= self.data_size:
    # next data update
    stream_update = self.signal[:, start:end]

    # get new frame from stream of data
    output_frame = model_stream.predict(stream_update)
    streamed_frames.append(output_frame)

    start = end
    end = start + self.frame_step

  # compare streaming vs non streaming frames extraction
  for i in range(0, len(self.output_frames_tf[0])):
    self.assertAllEqual(streamed_frames[i][0][0], self.output_frames_tf[0][i])
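
# A minimal numpy sketch (an assumption about the layer's behavior, not the
# library's implementation) of the ring-buffer update the streaming test above
# exercises: the layer keeps the last frame_size samples as internal state,
# shifts in frame_step new samples per call, and emits one frame per update.
import numpy as np

def stream_frame_update(state, new_samples, frame_step):
  """Shifts `frame_step` new samples into `state`; the new state is the frame."""
  state = np.concatenate((state[:, frame_step:], new_samples), axis=1)
  return state, state

sketch_state = np.zeros((1, 7), dtype=np.float32)  # frame_size = 7
sketch_chunk = np.ones((1, 5), dtype=np.float32)   # frame_step = 5
sketch_state, sketch_frame = stream_frame_update(
    sketch_state, sketch_chunk, frame_step=5)
assert sketch_frame.shape == (1, 7)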
def build(self, input_shape):
  super(SpeechFeatures, self).build(input_shape)

  self.data_frame = dataframe.DataFrame(
      mode=self.mode,
      inference_batch_size=self.inference_batch_size,
      frame_size=self.frame_size,
      frame_step=self.frame_step)

  if self.noise_scale != 0.0 and self.mode == modes.Modes.TRAINING:
    self.add_noise = tf.keras.layers.GaussianNoise(stddev=self.noise_scale)
  else:
    self.add_noise = tf.keras.layers.Lambda(lambda x: x)

  if self.params['preemph'] != 0.0:
    self.preemphasis = preemphasis.Preemphasis(preemph=self.params['preemph'])
  else:
    self.preemphasis = tf.keras.layers.Lambda(lambda x: x)

  if self.params['window_type'] is not None:
    self.windowing = windowing.Windowing(
        window_size=self.frame_size, window_type=self.params['window_type'])
  else:
    self.windowing = tf.keras.layers.Lambda(lambda x: x)

  # If use_tf_fft is False, we use the Real Discrete Fourier Transform (RDFT),
  # which is slower than RFFT. To increase RDFT efficiency we exploit a
  # property of the mel spectrum: we find the range of non-zero values in the
  # mel spectrum and restrict the RDFT to it, which speeds up computation.
  # If use_tf_fft is True, we use TF RFFT, which requires signal length
  # alignment, so we disable mel_non_zero_only.
  self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
      use_tf_fft=self.params['use_tf_fft'],
      magnitude_squared=self.params['fft_magnitude_squared'],
      num_mel_bins=self.params['mel_num_bins'],
      lower_edge_hertz=self.params['mel_lower_edge_hertz'],
      upper_edge_hertz=self.params['mel_upper_edge_hertz'],
      sample_rate=self.params['sample_rate'],
      mel_non_zero_only=self.params['mel_non_zero_only'])

  self.log_max = tf.keras.layers.Lambda(
      lambda x: tf.math.log(tf.math.maximum(x, self.params['log_epsilon'])))

  if self.params['dct_num_features'] != 0:
    self.dct = dct.DCT(num_features=self.params['dct_num_features'])
  else:
    self.dct = tf.keras.layers.Lambda(lambda x: x)

  self.normalizer = normalizer.Normalizer(mean=self.mean, stddev=self.stddev)

  # in any inference mode there is no need to add dynamic logic to the tf graph
  if self.params['use_spec_augment'] and self.mode == modes.Modes.TRAINING:
    self.spec_augment = spectrogram_augment.SpecAugment(
        time_masks_number=self.params['time_masks_number'],
        time_mask_max_size=self.params['time_mask_max_size'],
        frequency_masks_number=self.params['frequency_masks_number'],
        frequency_mask_max_size=self.params['frequency_mask_max_size'])
  else:
    self.spec_augment = tf.keras.layers.Lambda(lambda x: x)
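
# A hedged sketch of how the layers constructed in build() are likely chained
# in this layer's call(); the method body below is assumed for illustration,
# not copied from the library. Each optional stage degrades to an identity
# Lambda when its feature is disabled, so the chain is always safe to apply.
def call_sketch(self, inputs):
  outputs = self.data_frame(inputs)    # split wav into overlapping frames
  outputs = self.add_noise(outputs)    # Gaussian noise (training only)
  outputs = self.preemphasis(outputs)  # high-frequency pre-emphasis
  outputs = self.windowing(outputs)    # per-frame window, e.g. Hann
  outputs = self.mag_rdft_mel(outputs) # magnitude spectrum -> mel bins
  outputs = self.log_max(outputs)      # log compression with log_epsilon floor
  outputs = self.dct(outputs)          # optional MFCC-style DCT
  outputs = self.normalizer(outputs)   # mean/stddev normalization
  outputs = self.spec_augment(outputs) # SpecAugment masking (training only)
  return outputs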