def test__streaming_inference_external_state(self):
    # Streaming GRU in external-state mode: the recurrent state is an explicit
    # model input and output, so this test carries it from one call to the
    # next and checks each frame against the non-streaming reference output.
    frame_input = tf.keras.layers.Input(
        shape=(1, self.feature_size),
        batch_size=self.inference_batch_size,
        dtype=tf.float32)
    streaming_gru = gru.GRU(
        units=self.units, mode=Modes.STREAM_EXTERNAL_STATE_INFERENCE)
    frame_output = streaming_gru(frame_input)

    # The layer's state tensors are appended after the regular input/output.
    model_inputs = [frame_input] + streaming_gru.get_input_state()
    model_outputs = [frame_output] + streaming_gru.get_output_state()
    model_stream = tf.keras.Model(model_inputs, model_outputs)

    # Only weights are copied; the state is supplied at predict time.
    reference_weights = self.model_non_streamable.get_weights()
    model_stream.set_weights(reference_weights)

    # The recurrent state starts at zero.
    state = np.zeros((self.inference_batch_size, self.units))

    # Feed one time sample per call.
    for step in range(self.data_size):
        frame = np.expand_dims(self.signal[:, step, :], 1)
        predictions = model_stream.predict([frame, state])

        # Element 1 is the updated state, fed back in on the next step.
        state = predictions[1]

        # Element 0 is the layer output for this frame.
        self.assertAllClose(predictions[0][0][0], self.output_gru[0][step])
def model(flags):
    """Build the convolutional recurrent neural network (CRNN) model.

    The network is a sequence of Conv, RNN/GRU and FC layers. It is based on
    the paper
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
    https://arxiv.org/pdf/1703.05390.pdf
    and its topology is similar to
    "Hello Edge: Keyword Spotting on Microcontrollers"
    https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: the caller feeds raw audio and feature
        # extraction happens inside the graph.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # Add a trailing axis so the 2D convolutions below can consume the data.
    net = tf.keras.backend.expand_dims(net)

    # One streaming-wrapped Conv2D per entry of the parsed cnn_* flags.
    conv_settings = zip(
        parse(flags.cnn_filters),
        parse(flags.cnn_kernel_size),
        parse(flags.cnn_act),
        parse(flags.cnn_dilation_rate),
        parse(flags.cnn_strides))
    for n_filters, kernel, conv_act, dilation, stride in conv_settings:
        conv_cell = tf.keras.layers.Conv2D(
            filters=n_filters,
            kernel_size=kernel,
            activation=conv_act,
            dilation_rate=dilation,
            strides=stride)
        net = stream.Stream(cell=conv_cell)(net)

    # Conv output is [batch, time, feature, channels]; merge the last two axes
    # into [batch, time, feature * channels] so that GRU/RNN can process it.
    conv_shape = net.shape
    merged_features = conv_shape[2] * conv_shape[3]
    net = tf.keras.layers.Reshape((-1, merged_features))(net)

    # Recurrent stack.
    gru_settings = zip(parse(flags.gru_units), parse(flags.return_sequences))
    for gru_units, keep_sequences in gru_settings:
        recurrent = gru.GRU(
            units=gru_units,
            return_sequences=keep_sequences,
            stateful=flags.stateful)
        net = recurrent(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Fully connected stack followed by the label projection.
    for fc_units, fc_act in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=fc_units, activation=fc_act)(net)
    net = tf.keras.layers.Dense(units=flags.label_count)(net)

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)

    return tf.keras.Model(input_audio, net)
def setUp(self):
    # Shared fixture: a random input signal plus a non-streaming GRU model
    # whose full-sequence output is the reference for the streaming tests.
    super(GRUTest, self).setUp()
    test_utils.set_seed(123)

    # Dimensions of the random test signal.
    self.inference_batch_size = 1
    self.data_size = 32
    self.feature_size = 4
    signal_dims = (
        self.inference_batch_size, self.data_size, self.feature_size)
    self.signal = np.random.rand(*signal_dims)

    # Non-streaming reference model: it sees the whole sequence at once and
    # returns an output for every time step.
    full_sequence_input = tf.keras.layers.Input(
        shape=(self.data_size, self.feature_size),
        batch_size=self.inference_batch_size,
        dtype=tf.float32)
    self.units = 3
    reference_gru = gru.GRU(units=self.units, return_sequences=True)
    full_sequence_output = reference_gru(full_sequence_input)
    self.model_non_streamable = tf.keras.Model(
        full_sequence_input, full_sequence_output)

    # Reference output that the streaming tests compare against.
    self.output_gru = self.model_non_streamable.predict(self.signal)
def model(flags):
    """Build the Gated Recurrent Unit (GRU) model.

    It is based on the paper
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
    https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
    and its topology is similar to
    "Hello Edge: Keyword Spotting on Microcontrollers"
    https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(
        shape=input_shape, batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: the caller feeds raw audio and feature
        # extraction happens inside the graph.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # Recurrent stack: one GRU layer per entry of the parsed flags.
    gru_settings = zip(
        utils.parse(flags.gru_units), utils.parse(flags.return_sequences))
    for gru_units, keep_sequences in gru_settings:
        recurrent = gru.GRU(
            units=gru_units,
            return_sequences=keep_sequences,
            stateful=flags.stateful)
        net = recurrent(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    # Fully connected stack followed by the label projection.
    dense_settings = zip(utils.parse(flags.units1), utils.parse(flags.act1))
    for fc_units, fc_act in dense_settings:
        net = tf.keras.layers.Dense(units=fc_units, activation=fc_act)(net)
    net = tf.keras.layers.Dense(units=flags.label_count)(net)

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)

    return tf.keras.Model(input_audio, net)
def test_streaming_inference_internal_state(self):
    # Streaming GRU in internal-state mode: the model takes only the frame as
    # input, and each per-frame output is checked against the non-streaming
    # reference output.
    frame_input = tf.keras.layers.Input(
        shape=(1, self.feature_size),
        batch_size=self.inference_batch_size,
        dtype=tf.float32)
    streaming_gru = gru.GRU(
        units=self.units, mode=Modes.STREAM_INTERNAL_STATE_INFERENCE)
    model_stream = tf.keras.Model(frame_input, streaming_gru(frame_input))

    # The streaming model takes the reference weights followed by one extra
    # array: a zero-initialised state of shape (batch, units).
    stream_weights = list(self.model_non_streamable.get_weights())
    stream_weights.append(np.zeros((self.inference_batch_size, self.units)))
    model_stream.set_weights(stream_weights)

    # Feed one time sample per call and compare with the reference.
    for step in range(self.data_size):
        frame = np.expand_dims(self.signal[:, step, :], 1)
        frame_prediction = model_stream.predict(frame)
        self.assertAllClose(frame_prediction[0][0], self.output_gru[0][step])