def __init__(self, label_count, apply_quantization, **kwargs): super(ConvModel, self).__init__(**kwargs) # create layers self.input_quant = quantize_layer.QuantizeLayer( AllValuesQuantizer(num_bits=8, per_axis=False, symmetric=False, narrow_range=False)) self.conv1 = quantize.quantize_layer( tf.keras.layers.Conv2D(filters=2, kernel_size=[1, 3], padding='SAME')) self.bn1 = quantize.quantize_layer( tf.keras.layers.BatchNormalization()) self.relu1 = quantize.quantize_layer(tf.keras.layers.ReLU()) self.conv2 = ring_buffer.RingBuffer( quantize.quantize_layer( tf.keras.layers.Conv2D(filters=2, kernel_size=(3, 1), dilation_rate=1, strides=2, use_bias=False), apply_quantization, quantize.NoOpActivationConfig(['kernel'], ['activation'], False)), use_one_step=False, inference_batch_size=self.inference_batch_size, pad_time_dim='causal') self.bn2 = quantize.quantize_layer( tf.keras.layers.BatchNormalization(), default_8bit_quantize_configs.NoOpQuantizeConfig()) self.relu2 = quantize.quantize_layer(tf.keras.layers.ReLU()) self.flatten = ring_buffer.RingBuffer( quantize.quantize_layer(tf.keras.layers.Flatten(), apply_quantization), use_one_step=True, inference_batch_size=self.inference_batch_size) self.dense = quantize.quantize_layer( tf.keras.layers.Dense(label_count, activation='softmax', use_bias=False), apply_quantization)
def model(flags):
  """Builds a small-footprint CNN keyword-spotting model.

  Based on the paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  Model topology is similar with "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  x = input_audio

  # In 'raw' mode the model is self-contained: it computes speech features
  # internally, so the caller feeds raw audio only.
  if flags.preprocess == 'raw':
    x = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(x)

  # Optional 8-bit all-values quantization of the feature input.
  if flags.quantize:
    x = quantize_layer.QuantizeLayer(
        AllValuesQuantizer(
            num_bits=8, per_axis=False, symmetric=False,
            narrow_range=False))(x)

  # Add a trailing channel dimension for the 2D convolutions.
  x = tf.keras.backend.expand_dims(x)

  # One streaming conv -> batch-norm -> activation group per parsed entry.
  conv_params = zip(
      utils.parse(flags.cnn_filters),
      utils.parse(flags.cnn_kernel_size),
      utils.parse(flags.cnn_act),
      utils.parse(flags.cnn_dilation_rate),
      utils.parse(flags.cnn_strides))
  for filters, kernel_size, act, dilation_rate, strides in conv_params:
    conv_cell = quantize.quantize_layer(
        tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            dilation_rate=dilation_rate,
            activation='linear',
            strides=strides),
        flags.quantize,
        quantize.NoOpActivationConfig(['kernel'], ['activation'], False))
    # Stream wrapper keeps the conv causal in the time dimension.
    x = stream.Stream(
        cell=conv_cell, pad_time_dim='causal', use_one_step=False)(x)
    # NOTE(review): NoOpQuantizeConfig() sits in the second positional slot
    # here, where the conv call above passes flags.quantize — confirm against
    # quantize.quantize_layer's signature.
    x = quantize.quantize_layer(
        tf.keras.layers.BatchNormalization(),
        default_8bit_quantize_configs.NoOpQuantizeConfig())(x)
    x = quantize.quantize_layer(tf.keras.layers.Activation(act))(x)

  # Streaming flatten, then dropout before the dense head.
  x = stream.Stream(
      cell=quantize.quantize_layer(
          tf.keras.layers.Flatten(), apply_quantization=flags.quantize))(x)
  x = tf.keras.layers.Dropout(rate=flags.dropout1)(x)

  # Fully-connected layers from the parsed units2/act2 specs.
  for units, act in zip(utils.parse(flags.units2), utils.parse(flags.act2)):
    x = quantize.quantize_layer(
        tf.keras.layers.Dense(units=units, activation=act),
        apply_quantization=flags.quantize)(x)

  # Classification logits; softmax is optional so the caller can train
  # from logits if preferred.
  x = quantize.quantize_layer(
      tf.keras.layers.Dense(units=flags.label_count),
      apply_quantization=flags.quantize)(x)
  if flags.return_softmax:
    x = quantize.quantize_layer(
        tf.keras.layers.Activation('softmax'),
        apply_quantization=flags.quantize)(x)
  return tf.keras.Model(input_audio, x)