def to_streaming_inference(model_non_stream, flags, mode):
  """Convert non streaming trained model to inference modes.

  Args:
    model_non_stream: trained Keras model non streamable
    flags: settings with global data and model properties
    mode: it supports Non streaming inference, Streaming inference with
      internal states, Streaming inference with external states

  Returns:
    Keras inference model of inference_type
  """
  tf.keras.backend.set_learning_phase(0)
  input_data_shape = modes.get_input_data_shape(flags, mode)

  # get input data type and use it for input streaming type
  dtype = (model_non_stream.input[0].dtype
           if isinstance(model_non_stream.input, tuple)
           else model_non_stream.input.dtype)
  input_tensors = [
      tf.keras.layers.Input(
          shape=input_data_shape, batch_size=1, dtype=dtype,
          name='input_audio')
  ]
  quantize_stream_scope = quantize.quantize_scope()
  with quantize_stream_scope:
    model_inference = convert_to_inference_model(model_non_stream,
                                                 input_tensors, mode)
  return model_inference
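# Usage sketch (added for illustration; not part of the original module):
# shows how to_streaming_inference above could be invoked to obtain a
# streaming model with internal states. `trained_model` and `flags` are
# assumed to come from this repo's training pipeline.
def example_convert_to_streaming(trained_model, flags):
  return to_streaming_inference(
      trained_model, flags, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)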
def model(flags): """Convolutional recurrent neural network (CRNN) model. It is based on paper Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf Represented as sequence of Conv, RNN/GRU, FC layers. Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # expand dims for the next layer 2d conv net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = stream.Stream( cell=tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))(net) shape = net.shape # input net dimension: [batch, time, feature, channels] # reshape dimension: [batch, time, feature * channels] # so that GRU/RNN can process it net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net) for units, return_sequences in zip(parse(flags.gru_units), parse(flags.return_sequences)): net = gru.GRU(units=units, return_sequences=return_sequences, stateful=flags.stateful)(net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """Temporal Convolution ResNet model. It can be configured to reproduce model config as described in the paper below Temporal Convolution for Real-time Keyword Spotting on Mobile Devices https://arxiv.org/pdf/1904.03814.pdf Args: flags: data/model parameters Returns: Keras model for training """ tc_filters = parse(flags.tc_filters) repeat_tc_convs = parse(flags.repeat_tc_convs) kernel_sizes = parse(flags.kernel_sizes) pool_sizes = parse(flags.pool_sizes) dilations = parse(flags.dilations) residuals = parse(flags.residuals) if len( set((len(repeat_tc_convs), len(kernel_sizes), len(pool_sizes), len(dilations), len(residuals), len(tc_filters)))) != 1: raise ValueError('all input lists have to be the same length') input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # make it [batch, time, 1, feature] net = tf.keras.backend.expand_dims(net, axis=2) for filters, repeat, kernel_size, pool_size, dilation, residual in zip( tc_filters, repeat_tc_convs, kernel_sizes, pool_sizes, dilations, residuals): net = resnet_block(net, repeat, kernel_size, filters, dilation, residual, flags.padding_in_time, flags.dropout, flags.activation) if pool_size > 1: net = tf.keras.layers.MaxPooling2D((pool_size, 1))(net) net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """SVDF model. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) # for streaming mode it is better to use causal padding padding = 'causal' if flags.svdf_pad else 'valid' for i, (units1, memory_size, units2, dropout, activation) in enumerate( zip( utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size), utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout), utils.parse(flags.svdf_act))): net = svdf.Svdf( units1=units1, memory_size=memory_size, units2=units2, dropout=dropout, activation=activation, pad=padding, name='svdf_%d' % i)( net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip( utils.parse(flags.units2), utils.parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """Inception resnet model. It is based on paper: Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning https://arxiv.org/abs/1602.07261 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=-1) # [batch, time, feature, 1] for filters in utils.parse(flags.cnn_filters0): net = tf.keras.layers.SeparableConv2D(filters, (3, 3), padding='valid', use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(net) # [batch, time, feature, filters] for stride, scale, filters_branch0, filters_branch1 in zip( utils.parse(flags.strides), utils.parse(flags.scales), utils.parse(flags.filters_branch0), utils.parse(flags.filters_branch1)): net = inception_resnet_block(net, scale, filters_branch0, filters_branch1, bn_scale=flags.bn_scale) net = tf.keras.layers.MaxPooling2D(3, strides=stride, padding='valid')(net) # [batch, time, feature, filters] net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Fully connected layer based model. It is based on paper (with added pooling): SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) for units, activation in zip( utils.parse(flags.units1), utils.parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) # after flattening data in time, we can apply any layer: pooling, bi-lstm etc if flags.pool_size > 1: # add fake dim for compatibility with pooling net = tf.keras.backend.expand_dims(net, axis=-1) net = tf.keras.layers.MaxPool1D( pool_size=flags.pool_size, strides=flags.strides, data_format='channels_last')(net) # remove fake dim net = tf.keras.backend.squeeze(net, axis=-1) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip( utils.parse(flags.units2), utils.parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """CNN model. It is based on paper: Convolutional Neural Networks for Small-footprint Keyword Spotting http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size), utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate), utils.parse(flags.cnn_strides)): net = stream.Stream( cell=tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides))(net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(utils.parse(flags.units2), utils.parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """LSTM model. Similar model in papers: Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer) Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) for units, return_sequences, num_proj in zip( utils.parse(flags.lstm_units), utils.parse(flags.return_sequences), utils.parse(flags.num_proj)): net = lstm.LSTM( units=units, return_sequences=return_sequences, stateful=flags.stateful, use_peepholes=flags.use_peepholes, num_proj=num_proj)( net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip( utils.parse(flags.units1), utils.parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def to_streaming_inference(model_non_stream, flags, mode):
  """Convert non streaming trained model to inference modes.

  Args:
    model_non_stream: trained Keras model non streamable
    flags: settings with global data and model properties
    mode: it supports Non streaming inference, Streaming inference with
      internal states, Streaming inference with external states

  Returns:
    Keras inference model of inference_type
  """
  tf.keras.backend.set_learning_phase(0)
  input_data_shape = modes.get_input_data_shape(flags, mode)
  input_tensors = [
      tf.keras.layers.Input(
          shape=input_data_shape, batch_size=1, name='input_audio')
  ]
  model_inference = convert_to_inference_model(model_non_stream,
                                               input_tensors, mode)
  return model_inference
def model(flags): """Gated Recurrent Unit(GRU) model. It is based on paper Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting https://arxiv.org/pdf/1703.05390.pdf (with no conv layer) Hello Edge: Keyword Spotting on Microcontrollers https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) for units, return_sequences in zip(parse(flags.gru_units), parse(flags.return_sequences)): net = GRU(units=units, return_sequences=return_sequences, stateful=flags.stateful)(net) net = Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units1), parse(flags.act1)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Mobilenet model. It is based on paper: MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861 It is applied on sequence in time, so only 1D filters applied Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=2) # [batch, time, feature, 1] # it is convolutional block net = tf.keras.layers.Conv2D( filters=flags.cnn1_filters, kernel_size=utils.parse(flags.cnn1_kernel_size), padding='valid', use_bias=False, strides=utils.parse(flags.cnn1_strides))( net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.ReLU(6.)(net) # [batch, time, feature, filters] for kernel_size, strides, filters in zip( utils.parse(flags.ds_kernel_size), utils.parse(flags.ds_strides), utils.parse(flags.cnn_filters)): # it is depthwise convolutional block net = tf.keras.layers.DepthwiseConv2D( kernel_size, padding='same' if strides == (1, 1) else 'valid', depth_multiplier=1, strides=strides, use_bias=False)( net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.ReLU(6.,)(net) net = tf.keras.layers.Conv2D( filters=filters, kernel_size=(1, 1), padding='same', use_bias=False, strides=(1, 1))(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.ReLU(6.)(net) # [batch, time, feature, filters] net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) # [batch, label_count] return tf.keras.Model(input_audio, net)
def model(flags): """Inception model. It is based on paper: Rethinking the Inception Architecture for Computer Vision http://arxiv.org/abs/1512.00567 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=-1) # [batch, time, feature, 1] for filters in utils.parse(flags.cnn_filters0): net = tf.keras.layers.SeparableConv2D(filters, (3, 3), padding='valid', use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(net) # [batch, time, feature, filters] filters = utils.parse(flags.cnn_filters0)[-1] net = utils.conv2d_bn(net, filters, (3, 1), padding='valid', scale=flags.bn_scale) net = utils.conv2d_bn(net, filters, (1, 3), padding='valid', scale=flags.bn_scale) for stride, filters1, filters2 in zip(utils.parse(flags.cnn_strides), utils.parse(flags.cnn_filters1), utils.parse(flags.cnn_filters2)): if stride > 1: net = tf.keras.layers.MaxPooling2D((3, 3), strides=stride)(net) branch1 = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale) branch2 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale) branch2 = utils.conv2d_bn(branch2, filters1, (3, 1), scale=flags.bn_scale) branch2 = utils.conv2d_bn(branch2, filters2, (1, 3), scale=flags.bn_scale) branch3 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale) branch3 = utils.conv2d_bn(branch3, filters1, (3, 1), scale=flags.bn_scale) branch3 = utils.conv2d_bn(branch3, filters1, (1, 3), scale=flags.bn_scale) branch3 = utils.conv2d_bn(branch3, filters1, (3, 1), scale=flags.bn_scale) branch3 = utils.conv2d_bn(branch3, filters2, (1, 3), scale=flags.bn_scale) branch4 = tf.keras.layers.AveragePooling2D((3, 3), strides=(1, 1), padding='same')(net) branch4 = utils.conv2d_bn(branch4, filters2, (1, 1), scale=flags.bn_scale) net = tf.keras.layers.concatenate([branch1, branch2, branch3, branch4]) # [batch, time, feature, filters*4] net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters*4] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Temporal Convolution ResNet model. It is based on paper: Temporal Convolution for Real-time Keyword Spotting on Mobile Devices https://arxiv.org/pdf/1904.03814.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) time_size, feature_size = net.shape[1:3] channels = utils.parse(flags.channels) net = tf.keras.backend.expand_dims(net) if flags.debug_2d: conv_kernel = first_conv_kernel = (3, 3) else: net = tf.reshape( net, [-1, time_size, 1, feature_size]) # [batch, time, 1, feature] first_conv_kernel = (3, 1) conv_kernel = utils.parse(flags.kernel_size) net = tf.keras.layers.Conv2D(filters=channels[0], kernel_size=first_conv_kernel, strides=1, padding='same', activation='linear')(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) if utils.parse(flags.pool_size): net = tf.keras.layers.AveragePooling2D(pool_size=utils.parse( flags.pool_size), strides=flags.pool_stride)(net) channels = channels[1:] # residual blocks for n in channels: if n != net.shape[-1]: stride = 2 layer_in = tf.keras.layers.Conv2D(filters=n, kernel_size=1, strides=stride, padding='same', activation='linear')(net) layer_in = tf.keras.layers.BatchNormalization( momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(layer_in) layer_in = tf.keras.layers.Activation('relu')(layer_in) else: layer_in = net stride = 1 net = tf.keras.layers.Conv2D(filters=n, kernel_size=conv_kernel, strides=stride, padding='same', activation='linear')(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.Conv2D(filters=n, kernel_size=conv_kernel, strides=1, padding='same', activation='linear')(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) # residual connection net = tf.keras.layers.Add()([net, layer_in]) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.AveragePooling2D(pool_size=net.shape[1:3], strides=1)(net) net = tf.keras.layers.Dropout(rate=flags.dropout)(net) # fully connected layer net = tf.keras.layers.Conv2D(filters=flags.label_count, kernel_size=1, strides=1, padding='same', activation='linear')(net) net = tf.reshape(net, shape=(-1, net.shape[3])) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """Xception model. It is based on paper: Xception: Deep Learning with Depthwise Separable Convolutions https://arxiv.org/abs/1610.02357 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=-1) # [batch, time, feature, 1] # conv block for kernel_size, stride, filters in zip(parse(flags.cnn1_kernel_size), parse(flags.cnn1_strides), parse(flags.cnn1_filters)): net = tf.keras.layers.Conv2D(filters, kernel_size, strides=stride, use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) # [batch, time, feature, filters] # first residual block for filters in parse(flags.cnn2_filters): residual = tf.keras.layers.Conv2D(filters, (1, 1), strides=(2, 2), padding='same', use_bias=False)(net) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) net = tf.keras.layers.SeparableConv2D(filters, (3, 3), padding='same', use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2), padding='same')(net) net = tf.keras.layers.add([net, residual]) # [batch, time, feature, filters] # second residual block filters = parse(flags.cnn2_filters)[-1] for _ in range(flags.cnn3_blocks): residual = net net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.SeparableConv2D(filters, (3, 3), padding='same', use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.SeparableConv2D( filters, (3, 3), padding='same', use_bias=False, )(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.SeparableConv2D(filters, (3, 3), padding='same', use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.add([net, residual]) # [batch, time, feature, filters] net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) # [batch, label_count] return tf.keras.Model(input_audio, net)
def model(flags): """Inception model. It is based on paper: Rethinking the Inception Architecture for Computer Vision http://arxiv.org/abs/1512.00567 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=2) # [batch, time, 1, feature] for stride, filters, kernel_size in zip( utils.parse(flags.cnn1_strides), utils.parse(flags.cnn1_filters), utils.parse(flags.cnn1_kernel_sizes)): net = utils.conv2d_bn(net, filters, (kernel_size, 1), padding='valid', scale=flags.bn_scale) if stride > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net) for stride, filters1, filters2, kernel_size in zip( utils.parse(flags.cnn2_strides), utils.parse(flags.cnn2_filters1), utils.parse(flags.cnn2_filters2), utils.parse(flags.cnn2_kernel_sizes)): branch1 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale) branch2 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale) branch2 = utils.conv2d_bn(branch2, filters1, (kernel_size, 1), scale=flags.bn_scale) branch3 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale) branch3 = utils.conv2d_bn(branch3, filters1, (kernel_size, 1), scale=flags.bn_scale) branch3 = utils.conv2d_bn(branch3, filters1, (kernel_size, 1), scale=flags.bn_scale) net = tf.keras.layers.concatenate([branch1, branch2, branch3]) # [batch, time, 1, filters*4] net = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale) # [batch, time, 1, filters2] if stride > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net) net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters*4] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """CNN model. It is based on paper: Convolutional Neural Networks for Small-footprint Keyword Spotting http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) if flags.quantize: net = quantize_layer.QuantizeLayer( AllValuesQuantizer(num_bits=8, per_axis=False, symmetric=False, narrow_range=False))(net) net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size), utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate), utils.parse(flags.cnn_strides)): net = stream.Stream(cell=quantize.quantize_layer( tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, dilation_rate=dilation_rate, activation='linear', strides=strides), flags.quantize, quantize.NoOpActivationConfig(['kernel'], ['activation'], False)), pad_time_dim='causal', use_one_step=False)(net) net = quantize.quantize_layer( tf.keras.layers.BatchNormalization(), default_8bit_quantize_configs.NoOpQuantizeConfig())(net) net = quantize.quantize_layer( tf.keras.layers.Activation(activation))(net) net = stream.Stream(cell=quantize.quantize_layer( tf.keras.layers.Flatten(), apply_quantization=flags.quantize))(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(utils.parse(flags.units2), utils.parse(flags.act2)): net = quantize.quantize_layer(tf.keras.layers.Dense( units=units, activation=activation), apply_quantization=flags.quantize)(net) net = quantize.quantize_layer( tf.keras.layers.Dense(units=flags.label_count), apply_quantization=flags.quantize)(net) if flags.return_softmax: net = quantize.quantize_layer(tf.keras.layers.Activation('softmax'), apply_quantization=flags.quantize)(net) return tf.keras.Model(input_audio, net)
def model(flags): """Inception resnet model. It is based on paper: Inception-v4, Inception-ResNet and the Impact of Residual Connections on Learning https://arxiv.org/abs/1602.07261 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=2) # [batch, time, 1, feature] for filters, kernel_size, stride in zip( utils.parse(flags.cnn1_filters), utils.parse(flags.cnn1_kernel_sizes), utils.parse(flags.cnn1_strides)): net = utils.conv2d_bn( net, filters, (kernel_size, 1), scale=flags.bn_scale, padding='valid') if stride > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net) # [batch, time, 1, filters] for stride, scale, filters_branch0, filters_branch1, filters_branch2, kernel_size in zip( utils.parse(flags.cnn2_strides), utils.parse(flags.cnn2_scales), utils.parse(flags.cnn2_filters_branch0), utils.parse(flags.cnn2_filters_branch1), utils.parse(flags.cnn2_filters_branch2), utils.parse(flags.cnn2_kernel_sizes)): net = inception_resnet_block( net, scale, filters_branch0, filters_branch1, kernel_size, bn_scale=flags.bn_scale) net = utils.conv2d_bn( net, filters_branch2, (1, 1), scale=flags.bn_scale, padding='valid') if stride > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1), padding='valid')( net) # [batch, time, 1, filters] net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def get_data(self, how_many, offset, flags, background_frequency,
             background_volume_range, time_shift, mode, resample_offset,
             volume_augmentation_offset, sess):
  """Gather samples from the data set, applying transformations as needed.

  When the mode is 'training', a random selection of samples will be
  returned, otherwise the first N clips in the partition will be used.
  This ensures that validation always uses the same samples, reducing noise
  in the metrics.

  Args:
    how_many: Desired number of samples to return. -1 means the entire
      contents of this partition.
    offset: Where to start when fetching deterministically.
    flags: data and model parameters, described at model_train.py
    background_frequency: How many clips will have background noise,
      0.0 to 1.0.
    background_volume_range: How loud the background noise will be.
    time_shift: How much to randomly shift the clips by in time.
      It shifts audio data in range from -time_shift to time_shift.
    mode: Which partition to use, must be 'training', 'validation', or
      'testing'.
    resample_offset: resample input signal - stretch it or squeeze by 0..0.15
      If 0 - then no resampling.
    volume_augmentation_offset: it is used for raw audio volume control.
      During training the volume multiplier will be sampled from
      1.0 - volume_augmentation_offset ... 1.0 + volume_augmentation_offset
    sess: TensorFlow session that was active when processor was created.

  Returns:
    List of sample data for the transformed samples, and list of label indexes

  Raises:
    ValueError: If background samples are too short.
  """
  # Pick one of the partitions to choose samples from.
  candidates = self.data_index[mode]
  if how_many == -1:
    sample_count = len(candidates)
  else:
    if flags.pick_deterministically and mode == 'training':
      # it is a special case:
      sample_count = how_many
    else:
      sample_count = max(0, min(how_many, len(candidates) - offset))

  # Data and labels will be populated and returned.
  input_data_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  data = np.zeros((sample_count,) + input_data_shape)
  labels = np.zeros(sample_count)
  desired_samples = flags.desired_samples
  use_background = self.background_data and (mode == 'training')
  pick_deterministically = (mode != 'training') or flags.pick_deterministically

  # Use the processing graph we created earlier repeatedly to generate the
  # final output sample data we'll use in training.
  for i in xrange(offset, offset + sample_count):
    # Pick which audio sample to use.
    if how_many == -1 or pick_deterministically:
      # during inference offset is 0,
      # but during training offset can be 0 or
      # training_step * batch_size, so 'i' can go beyond array size
      sample_index = i % len(candidates)
    else:
      sample_index = np.random.randint(len(candidates))
    sample = candidates[sample_index]

    # If we're time shifting, set up the offset for this sample.
    if time_shift > 0:
      time_shift_amount = np.random.randint(-time_shift, time_shift)
    else:
      time_shift_amount = 0
    if time_shift_amount > 0:
      time_shift_padding = [[time_shift_amount, 0], [0, 0]]
      time_shift_offset = [0, 0]
    else:
      time_shift_padding = [[0, -time_shift_amount], [0, 0]]
      time_shift_offset = [-time_shift_amount, 0]

    resample = 1.0
    if mode == 'training' and resample_offset != 0.0:
      resample = np.random.uniform(
          low=resample - resample_offset, high=resample + resample_offset)

    input_dict = {
        self.wav_filename_placeholder_: sample['file'],
        self.time_shift_padding_placeholder_: time_shift_padding,
        self.time_shift_offset_placeholder_: time_shift_offset,
        self.foreground_resampling_placeholder_: resample,
    }

    # Choose a section of background noise to mix in.
    if use_background:
      background_index = np.random.randint(len(self.background_data))
      background_samples = self.background_data[background_index]
      if len(background_samples) <= flags.desired_samples:
        raise ValueError(
            'Background sample is too short! Need more than %d'
            ' samples but only %d were found' %
            (flags.desired_samples, len(background_samples)))
      background_offset = np.random.randint(
          0, len(background_samples) - flags.desired_samples)
      background_clipped = background_samples[background_offset:(
          background_offset + desired_samples)]
      background_reshaped = background_clipped.reshape([desired_samples, 1])
      if np.random.uniform(0, 1) < background_frequency:
        background_volume = np.random.uniform(0, background_volume_range)
      else:
        background_volume = 0
    else:
      background_reshaped = np.zeros([desired_samples, 1])
      background_volume = 0
    input_dict[self.background_data_placeholder_] = background_reshaped
    input_dict[self.background_volume_placeholder_] = background_volume

    # If we want silence, mute out the main sample but leave the background.
    if sample['label'] == SILENCE_LABEL:
      input_dict[self.foreground_volume_placeholder_] = 0
    else:
      foreground_volume = 1.0  # multiplier of audio signal
      # in training mode produce audio data with different volume
      if mode == 'training' and volume_augmentation_offset != 0.0:
        foreground_volume = np.random.uniform(
            low=foreground_volume - volume_augmentation_offset,
            high=foreground_volume + volume_augmentation_offset)
      input_dict[self.foreground_volume_placeholder_] = foreground_volume

    # Run the graph to produce the output audio.
    data_tensor = sess.run(self.output_, feed_dict=input_dict)
    data[i - offset, :] = data_tensor
    label_index = self.word_to_index[sample['label']]
    labels[i - offset] = label_index
  return data, labels
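# Usage sketch (added for illustration; hypothetical call site): pulling one
# augmented training batch with get_data above. `audio_processor` is an
# instance of the class defining get_data; the flag names
# (background_frequency, background_volume, resample, volume_resample) are
# assumptions based on the argument descriptions above, not verified here.
def example_get_training_batch(audio_processor, flags, time_shift_samples,
                               sess):
  return audio_processor.get_data(
      flags.batch_size, 0, flags, flags.background_frequency,
      flags.background_volume, time_shift_samples, 'training',
      flags.resample, flags.volume_resample, sess)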
def model(flags): """Xception model. It is based on papers: Xception: Deep Learning with Depthwise Separable Convolutions https://arxiv.org/abs/1610.02357 MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition https://arxiv.org/pdf/2004.08531 Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=2) # [batch, time, 1, feature] # conv block for kernel_size, filters in zip(utils.parse(flags.cnn1_kernel_sizes), utils.parse(flags.cnn1_filters)): net = tf.keras.layers.Conv2D(filters, (kernel_size, 1), use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) # [batch, time, 1, feature] if flags.stride1 > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(flags.stride1, 1), padding='valid')(net) net = block(net, utils.parse(flags.cnn2_kernel_sizes), utils.parse(flags.cnn2_filters), flags.dropout, flags.bn_scale) if flags.stride2 > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(flags.stride2, 1), padding='valid')(net) net = block(net, utils.parse(flags.cnn3_kernel_sizes), utils.parse(flags.cnn3_filters), flags.dropout, flags.bn_scale) if flags.stride3 > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(flags.stride3, 1), padding='valid')(net) net = block(net, utils.parse(flags.cnn4_kernel_sizes), utils.parse(flags.cnn4_filters), flags.dropout, flags.bn_scale) if flags.stride4 > 1: net = tf.keras.layers.MaxPooling2D((3, 1), strides=(flags.stride4, 1), padding='valid')(net) net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters] net = tf.keras.layers.Dropout(flags.dropout)(net) for units in utils.parse(flags.units2): net = tf.keras.layers.Dense(units=units, activation=None, use_bias=False)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.Dense(flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) # [batch, label_count] return tf.keras.Model(input_audio, net)
def main(_):
  # Update flags
  flags = model_flags.update_flags(FLAGS)

  if flags.train:
    # Create model folders where logs and model will be stored
    os.makedirs(flags.train_dir)
    os.mkdir(flags.summaries_dir)

    # Model training
    train.train(flags)
  else:
    if not os.path.isdir(flags.train_dir):
      raise ValueError('model is not trained, set "--train 1" and retrain it')

  # write all flags settings into json
  with open(os.path.join(flags.train_dir, 'flags.json'), 'wt') as f:
    json.dump(flags.__dict__, f)

  # convert to SavedModel
  test.convert_model_saved(flags, 'non_stream',
                           modes.Modes.NON_STREAM_INFERENCE)
  try:
    test.convert_model_saved(flags, 'stream_state_internal',
                             modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  except (ValueError, IndexError) as e:
    logging.info('FAILED to run TF streaming: %s', e)

  logging.info('run TF non streaming model accuracy evaluation')
  # with TF
  folder_name = 'tf'
  test.tf_non_stream_model_accuracy(flags, folder_name)

  # with TF.
  # We can apply non stream model on stream data, by running inference
  # every 200ms (for example), so that total latency will be similar with
  # streaming model which is executed every 20ms.
  # To measure the impact of sampling on model accuracy,
  # we introduce time_shift_ms during accuracy evaluation.
  # Convert milliseconds to samples:
  time_shift_samples = int(
      (flags.time_shift_ms * flags.sample_rate) / model_flags.MS_PER_SECOND)
  test.tf_non_stream_model_accuracy(
      flags, folder_name, time_shift_samples,
      accuracy_name='tf_non_stream_model_sampling_stream_accuracy.txt')

  name2opt = {
      '': None,
      'quantize_opt_for_size_': [tf.lite.Optimize.DEFAULT],
  }

  for opt_name, optimizations in name2opt.items():

    if (opt_name and flags.feature_type == 'mfcc_tf' and
        flags.preprocess == 'raw'):
      logging.info('feature type mfcc_tf needs quantization aware training '
                   'for quantization - it is not implemented')
      continue

    folder_name = opt_name + 'tflite_non_stream'
    file_name = 'non_stream.tflite'
    mode = modes.Modes.NON_STREAM_INFERENCE
    test.convert_model_tflite(flags, folder_name, mode, file_name,
                              optimizations=optimizations)
    test.tflite_non_stream_model_accuracy(flags, folder_name, file_name)

    # these models are using bi-rnn, so they are non streamable by default
    # also models using striding or pooling are not supported for streaming now
    non_streamable_models = {'att_mh_rnn', 'att_rnn', 'tc_resnet'}

    model_is_streamable = True
    if flags.model_name in non_streamable_models:
      model_is_streamable = False
    # below models can use striding in time dimension,
    # but this is currently unsupported
    elif flags.model_name == 'cnn':
      for strides in model_utils.parse(flags.cnn_strides):
        if strides[0] > 1:
          model_is_streamable = False
          break
    elif flags.model_name == 'ds_cnn':
      if model_utils.parse(flags.cnn1_strides)[0] > 1:
        model_is_streamable = False
      for strides in model_utils.parse(flags.dw2_strides):
        if strides[0] > 1:
          model_is_streamable = False
          break

    # set input data shape for testing inference in streaming mode
    flags.data_shape = modes.get_input_data_shape(
        flags, modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)

    # if model can be streamed, then run conversion/evaluation in streaming mode
    if model_is_streamable:

      # --------------- TF streaming model accuracy evaluation ---------------
      # Streaming model with external state evaluation using TF with state reset
      if not opt_name:
        logging.info(
            'run TF evaluation only without optimization/quantization')
        try:
          folder_name = 'tf'
          test.tf_stream_state_external_model_accuracy(
              flags, folder_name,
              accuracy_name='stream_state_external_model_accuracy_sub_set_reset1.txt',
              reset_state=True)  # with state reset between test sequences

          # Streaming (with external state) evaluation using TF no state reset
          test.tf_stream_state_external_model_accuracy(
              flags, folder_name,
              accuracy_name='stream_state_external_model_accuracy_sub_set_reset0.txt',
              reset_state=False)  # without state reset

          # Streaming (with internal state) evaluation using TF no state reset
          test.tf_stream_state_internal_model_accuracy(flags, folder_name)
        except (ValueError, IndexError) as e:
          logging.info('FAILED to run TF streaming: %s', e)

      logging.info('run TFlite streaming model accuracy evaluation')
      try:
        # convert model to TFlite
        folder_name = opt_name + 'tflite_stream_state_external'
        file_name = 'stream_state_external.tflite'
        mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
        test.convert_model_tflite(flags, folder_name, mode, file_name,
                                  optimizations=optimizations)

        # Streaming model accuracy evaluation with TFLite with state reset
        test.tflite_stream_state_external_model_accuracy(
            flags, folder_name, file_name,
            accuracy_name='tflite_stream_state_external_model_accuracy_reset1.txt',
            reset_state=True)

        # Streaming model accuracy evaluation with TFLite without state reset
        test.tflite_stream_state_external_model_accuracy(
            flags, folder_name, file_name,
            accuracy_name='tflite_stream_state_external_model_accuracy_reset0.txt',
            reset_state=False)
      except (ValueError, IndexError) as e:
        logging.info('FAILED to run TFLite streaming: %s', e)
def model(flags): """SVDF model with residual connections. This model is based on decomposition of a densely connected ops into low rank filters. It is based on paper END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf In addition we added residual connection Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) blocks_pool = parse(flags.blocks_pool) if len(blocks_pool) != 3: raise ValueError('number of pooling blocks has to be 3, but get: ', len(blocks_pool)) # for streaming mode it is better to use causal padding padding = 'causal' if flags.svdf_pad else 'valid' # first residual block number_of_blocks = len(parse(flags.block1_units1)) activations = [flags.activation] * number_of_blocks activations[-1] = 'linear' # last layer is linear residual = net for i, (units1, memory_size, activation) in enumerate( zip(parse(flags.block1_units1), parse(flags.block1_memory_size), activations)): # [batch, time, feature] net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=-1, dropout=flags.svdf_dropout, activation=activation, pad=padding, use_bias=flags.svdf_use_bias, use_batch_norm=flags.use_batch_norm, bn_scale=flags.bn_scale, name='svdf_1_%d' % i)(net) # number of channels in the last layer units1_last = parse(flags.block1_units1)[-1] # equivalent to 1x1 convolution residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) # residual connection net = tf.keras.layers.Add()([net, residual]) # [batch, time, feature] net = tf.keras.layers.Activation(flags.activation)(net) net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[0], padding='valid')(net) # second residual block number_of_blocks = len(parse(flags.block2_units1)) activations = [flags.activation] * number_of_blocks activations[-1] = 'linear' # last layer is linear residual = net for i, (units1, memory_size, activation) in enumerate( zip(parse(flags.block2_units1), parse(flags.block2_memory_size), activations)): # [batch, time, feature] net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=-1, dropout=flags.svdf_dropout, activation=activation, pad=padding, use_bias=flags.svdf_use_bias, use_batch_norm=flags.use_batch_norm, bn_scale=flags.bn_scale, name='svdf_2_%d' % i)(net) # number of channels in the last layer units1_last = parse(flags.block2_units1)[-1] # equivalent to 1x1 convolution residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) # residual connection net = tf.keras.layers.Add()([net, residual]) net = tf.keras.layers.Activation(flags.activation)(net) # [batch, time, feature] net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[1], padding='valid')(net) # third residual block number_of_blocks = len(parse(flags.block3_units1)) activations = [flags.activation] * number_of_blocks activations[-1] = 'linear' # last layer is linear residual = net for i, (units1, memory_size, activation) in enumerate( zip(parse(flags.block3_units1), parse(flags.block3_memory_size), activations)): net = svdf.Svdf(units1=units1, memory_size=memory_size, units2=-1, 
dropout=flags.svdf_dropout, activation=activation, pad=padding, use_bias=flags.svdf_use_bias, use_batch_norm=flags.use_batch_norm, bn_scale=flags.bn_scale, name='svdf_3_%d' % i)(net) # number of channels in the last layer units1_last = parse(flags.block3_units1)[-1] # equivalent to 1x1 convolution residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual) residual = tf.keras.layers.BatchNormalization( scale=flags.bn_scale)(residual) # residual connection net = tf.keras.layers.Add()([net, residual]) net = tf.keras.layers.Activation(flags.activation)(net) net = tf.keras.layers.MaxPool1D(3, strides=blocks_pool[2], padding='valid')(net) # [batch, time, feature] # convert all feature to one vector if flags.flatten: net = tf.keras.layers.Flatten()(net) else: net = tf.keras.layers.GlobalAveragePooling1D()(net) # [batch, feature] net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units in parse(flags.units2): net = tf.keras.layers.Dense(units=units, activation=flags.activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """Depthwise convolutional model. It is based on paper: MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications https://arxiv.org/abs/1704.04861 Model topology is similar with "Hello Edge: Keyword Spotting on Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) net = tf.keras.backend.expand_dims(net) net = stream.Stream(cell=tf.keras.layers.Conv2D( kernel_size=utils.parse(flags.cnn1_kernel_size), dilation_rate=utils.parse(flags.cnn1_dilation_rate), filters=flags.cnn1_filters, padding=flags.cnn1_padding, strides=utils.parse(flags.cnn1_strides)))(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip( utils.parse(flags.dw2_kernel_size), utils.parse(flags.dw2_act), utils.parse(flags.dw2_dilation_rate), utils.parse(flags.dw2_strides), utils.parse(flags.cnn2_filters), utils.parse(flags.cnn2_act)): net = stream.Stream( cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size, dilation_rate=dilation_rate, padding=flags.dw2_padding, strides=strides))(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation(dw2_act)(net) net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation(cnn2_act)(net) net = stream.Stream(cell=tf.keras.layers.AveragePooling2D( pool_size=(int(net.shape[1]), int(net.shape[2]))))(net) net = stream.Stream(cell=tf.keras.layers.Flatten())(net) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """BC-ResNet model. It is based on paper Broadcasted Residual Learning for Efficient Keyword Spotting https://arxiv.org/pdf/2106.04140.pdf Args: flags: data/model parameters Returns: Keras model for training Raises: ValueError: if any of input list has different length from any other; or if padding is not supported """ dropouts = utils.parse(flags.dropouts) filters = utils.parse(flags.filters) blocks_n = utils.parse(flags.blocks_n) strides = utils.parse(flags.strides) dilations = utils.parse(flags.dilations) for l in (dropouts, filters, strides, dilations): if len(blocks_n) != len(l): raise ValueError('all input lists have to be the same length ' 'but get %s and %s ' % (blocks_n, l)) input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # make it [batch, time, feature, 1] net = tf.keras.backend.expand_dims(net, axis=3) if flags.paddings == 'same': net = tf.keras.layers.Conv2D(filters=flags.first_filters, kernel_size=5, strides=(1, 2), padding='same')(net) else: net = stream.Stream(cell=tf.keras.layers.Conv2D( filters=flags.first_filters, kernel_size=5, strides=(1, 2), padding='valid'), use_one_step=True, pad_time_dim=flags.paddings, pad_freq_dim='same')(net) for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters, dilations, strides, dropouts): net = TransitionBlock(n_filters, dilation, stride, flags.paddings, dropout, sub_groups=flags.sub_groups)(net) for _ in range(n): net = NormalBlock(n_filters, dilation, 1, flags.paddings, dropout, sub_groups=flags.sub_groups)(net) if flags.paddings == 'same': net = tf.keras.layers.DepthwiseConv2D(kernel_size=5, padding='same')(net) else: net = stream.Stream(cell=tf.keras.layers.DepthwiseConv2D( kernel_size=5, padding='valid'), use_one_step=True, pad_time_dim=flags.paddings, pad_freq_dim='same')(net) # average out frequency dim net = tf.keras.backend.mean(net, axis=2, keepdims=True) net = tf.keras.layers.Conv2D(filters=flags.last_filters, kernel_size=1, use_bias=False)(net) # average out time dim if flags.paddings == 'same': net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net) else: net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D( keepdims=True))(net) net = tf.keras.layers.Conv2D(filters=flags.label_count, kernel_size=1, use_bias=False)(net) # 1 and 2 dims are equal to 1 net = tf.squeeze(net, [1, 2]) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """Mobilenet V2 model. It is based on paper: MobileNetV2: Inverted Residuals and Linear Bottlenecks https://arxiv.org/abs/1801.04381 It is applied on sequence in time, so only 1D filters applied Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # [batch, time, feature] net = tf.keras.backend.expand_dims(net, axis=2) # [batch, time, feature, 1] # it is conv_block net = tf.keras.layers.Conv2D(filters=flags.cnn1_filters, kernel_size=utils.parse( flags.cnn1_kernel_size), padding='valid', use_bias=False, strides=utils.parse(flags.cnn1_strides))(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.ReLU(6.)(net) # [batch, time, feature, filters] for kernel_size, stride, filters, expansion in zip( utils.parse(flags.ds_kernel_size), utils.parse(flags.cnn_strides), utils.parse(flags.cnn_filters), utils.parse(flags.cnn_expansions)): # it is Inverted ResNet block net_input = net in_channels = tf.keras.backend.int_shape(net_input)[-1] net = tf.keras.layers.Conv2D(expansion * in_channels, kernel_size=1, padding='same', use_bias=False, activation=None)(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.ReLU(6.)(net) # [batch, time, feature, filters] # depthwise net = tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size, strides=stride, activation=None, use_bias=False, padding='same')(net) net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net) net = tf.keras.layers.ReLU(6.)(net) # project net = tf.keras.layers.Conv2D(filters, kernel_size=1, padding='same', use_bias=False, activation=None)(net) net = tf.keras.layers.BatchNormalization()(net) if in_channels == filters and stride == (1, 1): net = tf.keras.layers.Add()([net_input, net]) # [batch, time, feature, filters] net = tf.keras.layers.GlobalAveragePooling2D()(net) # [batch, filters] net = tf.keras.layers.Dropout(flags.dropout)(net) net = tf.keras.layers.Dense(flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """MatchboxNet model. It is based on paper MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition https://arxiv.org/pdf/2004.08531.pdf Args: flags: data/model parameters Returns: Keras model for training Raises: ValueError: if any of input list has different length from any other; or if padding is not supported """ ds_filters = parse(flags.ds_filters) ds_repeat = parse(flags.ds_repeat) ds_kernel_size = parse(flags.ds_kernel_size) ds_stride = parse(flags.ds_stride) ds_dilation = parse(flags.ds_dilation) ds_residual = parse(flags.ds_residual) ds_pool = parse(flags.ds_pool) ds_padding = parse(flags.ds_padding) ds_filter_separable = parse(flags.ds_filter_separable) for l in (ds_repeat, ds_kernel_size, ds_stride, ds_dilation, ds_residual, ds_pool, ds_padding, ds_filter_separable): if len(ds_filters) != len(l): raise ValueError('all input lists have to be the same length') input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) # make it [batch, time, 1, feature] net = tf.keras.backend.expand_dims(net, axis=2) # encoder for filters, repeat, ksize, stride, sep, dilation, res, pool, pad in zip( ds_filters, ds_repeat, ds_kernel_size, ds_stride, ds_filter_separable, ds_dilation, ds_residual, ds_pool, ds_padding): net = resnet_block(net, repeat, ksize, filters, dilation, stride, sep, res, pad, flags.dropout, flags.activation, flags.ds_scale) if pool > 1: if flags.ds_max_pool: net = tf.keras.layers.MaxPooling2D(pool_size=(pool, 1), strides=(pool, 1))(net) else: net = tf.keras.layers.AveragePooling2D(pool_size=(pool, 1), strides=(pool, 1))(net) # decoder net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net) net = tf.keras.layers.Flatten()(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """BiRNN attention model. It is based on paper: A neural attention model for speech command recognition https://arxiv.org/pdf/1808.08929.pdf Depending on parameter rnn_type, model can be biLSTM or biGRU Args: flags: data/model parameters Returns: Keras model for training """ rnn_types = {'lstm': tf.keras.layers.LSTM, 'gru': tf.keras.layers.GRU} if flags.rnn_type not in rnn_types: ValueError('not supported RNN type ', flags.rnn_type) rnn = rnn_types[flags.rnn_type] input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) net = tf.keras.backend.expand_dims(net) for filters, kernel_size, activation, dilation_rate, strides in zip( parse(flags.cnn_filters), parse(flags.cnn_kernel_size), parse(flags.cnn_act), parse(flags.cnn_dilation_rate), parse(flags.cnn_strides)): net = tf.keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, activation=activation, dilation_rate=dilation_rate, strides=strides, padding='same')(net) net = tf.keras.layers.BatchNormalization()(net) shape = net.shape # input net dimension: [batch, time, feature, channels] # reshape dimension: [batch, time, feature * channels] # so that GRU/RNN can process it net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net) # dims: [batch, time, feature] for _ in range(flags.rnn_layers): net = tf.keras.layers.Bidirectional( rnn(flags.rnn_units, return_sequences=True, unroll=True))(net) feature_dim = net.shape[-1] middle = net.shape[1] // 2 # index of middle point of sequence # feature vector at middle point [batch, feature] mid_feature = net[:, middle, :] # apply one projection layer with the same dim as input feature query = tf.keras.layers.Dense(feature_dim)(mid_feature) # attention weights [batch, time] att_weights = tf.keras.layers.Dot(axes=[1, 2])([query, net]) att_weights = tf.keras.layers.Softmax(name='attSoftmax')(att_weights) # apply attention weights [batch, feature] net = tf.keras.layers.Dot(axes=[1, 1])([att_weights, net]) net = tf.keras.layers.Dropout(rate=flags.dropout1)(net) for units, activation in zip(parse(flags.units2), parse(flags.act2)): net = tf.keras.layers.Dense(units=units, activation=activation)(net) net = tf.keras.layers.Dense(units=flags.label_count)(net) return tf.keras.Model(input_audio, net)