import numpy as np
import tensorflow as tf

# Assumed imports, following the kws_streaming package layout:
from kws_streaming.layers.modes import Modes
from kws_streaming.layers.stream import Stream


def init(self, shape=(8, 2), flat_dim="time"):
  """Builds a non-streaming model and generates reference output data.

  Note: this is a method of a test harness class (the enclosing class is
  not shown here).
  """
  self.batch_size = 1

  # Input data placeholder.
  input_tf = tf.keras.layers.Input(
      shape=shape, batch_size=self.batch_size, name="inp1")

  # Input test data.
  self.inputs = np.random.uniform(size=(self.batch_size,) + shape)

  # Create a non-streamable trainable model.
  mode = Modes.TRAINING
  if flat_dim == "time":
    flat_tf = Stream(cell=tf.keras.layers.Flatten(), mode=mode)(input_tf)
  else:
    # This branch assumes a 3D `shape` (time, height, channels), so that
    # input_tf is 4D and its last two dimensions can be merged.
    flat_tf = tf.reshape(
        input_tf,
        (-1, input_tf.shape[1], input_tf.shape[2] * input_tf.shape[3]))
  # flat_tf = flatten.Flatten(mode=mode, flat_dim=flat_dim)(input_tf)
  self.model_train = tf.keras.Model(input_tf, flat_tf)
  self.model_train.summary()

  # Output data, generated by the non-streaming model.
  self.outputs = self.model_train.predict(self.inputs)
  return self.outputs
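# For context, a minimal sketch of how the helper above could be exercised
# standalone. The `_Harness` class is hypothetical (in the original suite
# `init` would live on a test-case class), and the sketch relies on `Stream`
# in `Modes.TRAINING` mode simply applying its wrapped cell.
class _Harness:
  pass

_Harness.init = init

h = _Harness()
out = h.init(shape=(8, 2), flat_dim="time")
# Flatten collapses the (8, 2) feature map into 16 values per example,
# so the reference output has shape (batch_size, 16) == (1, 16).
assert out.shape == (1, 16)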
def model(flags): """Temporal Convolution ResNet model. It is based on paper: Temporal Convolution for Real-time Keyword Spotting on Mobile Devices https://arxiv.org/pdf/1904.03814.pdf Args: flags: data/model parameters Returns: Keras model for training """ input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape( flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))(net) time_size, feature_size = net.shape[1:3] channels = utils.parse(flags.channels) net = tf.keras.backend.expand_dims(net) if flags.debug_2d: conv_kernel = first_conv_kernel = (3, 3) else: net = tf.reshape( net, [-1, time_size, 1, feature_size]) # [batch, time, 1, feature] first_conv_kernel = (3, 1) conv_kernel = utils.parse(flags.kernel_size) net = tf.keras.layers.Conv2D(filters=channels[0], kernel_size=first_conv_kernel, strides=1, padding='same', activation='linear')(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) if utils.parse(flags.pool_size): net = tf.keras.layers.AveragePooling2D(pool_size=utils.parse( flags.pool_size), strides=flags.pool_stride)(net) channels = channels[1:] # residual blocks for n in channels: if n != net.shape[-1]: stride = 2 layer_in = tf.keras.layers.Conv2D(filters=n, kernel_size=1, strides=stride, padding='same', activation='linear')(net) layer_in = tf.keras.layers.BatchNormalization( momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(layer_in) layer_in = tf.keras.layers.Activation('relu')(layer_in) else: layer_in = net stride = 1 net = tf.keras.layers.Conv2D(filters=n, kernel_size=conv_kernel, strides=stride, padding='same', activation='linear')(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.Conv2D(filters=n, kernel_size=conv_kernel, strides=1, padding='same', activation='linear')(net) net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum, center=flags.bn_center, scale=flags.bn_scale, renorm=flags.bn_renorm)(net) # residual connection net = tf.keras.layers.Add()([net, layer_in]) net = tf.keras.layers.Activation('relu')(net) net = tf.keras.layers.AveragePooling2D(pool_size=net.shape[1:3], strides=1)(net) net = tf.keras.layers.Dropout(rate=flags.dropout)(net) # fully connected layer net = tf.keras.layers.Conv2D(filters=flags.label_count, kernel_size=1, strides=1, padding='same', activation='linear')(net) net = tf.reshape(net, shape=(-1, net.shape[3])) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)
def model(flags): """MatchboxNet model. It is based on paper MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network Architecture for Speech Commands Recognition https://arxiv.org/pdf/2004.08531.pdf Args: flags: data/model parameters Returns: Keras model for training Raises: ValueError: if any of input list has different length from any other; or if padding is not supported """ ds_filters = parse(flags.ds_filters) ds_repeat = parse(flags.ds_repeat) ds_kernel_size = parse(flags.ds_kernel_size) ds_stride = parse(flags.ds_stride) ds_dilation = parse(flags.ds_dilation) ds_residual = parse(flags.ds_residual) for l in (ds_repeat, ds_kernel_size, ds_stride, ds_dilation, ds_residual): if len(ds_filters) != len(l): raise ValueError('all input lists have to be the same length') input_audio = tf.keras.layers.Input( shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING), batch_size=flags.batch_size) net = input_audio if flags.preprocess == 'raw': # it is a self contained model, user need to feed raw audio only net = speech_features.SpeechFeatures( speech_features.SpeechFeatures.get_params(flags))( net) time_size, feature_size = net.shape[1:3] net = tf.keras.backend.expand_dims(net) net = tf.reshape( net, [-1, time_size, 1, feature_size]) # [batch, time, 1, feature] # encoder for filters, repeat, kernel_size, stride, dilation, residual in zip( ds_filters, ds_repeat, ds_kernel_size, ds_stride, ds_dilation, ds_residual): net = resnet_block(net, repeat, kernel_size, filters, dilation, stride, residual, flags.padding, flags.dropout, flags.activation) # decoder net = stream.Stream( cell=tf.keras.layers.AveragePooling2D( pool_size=net.shape[1:3], strides=1))( net) net = tf.reshape(net, shape=(-1, net.shape[3])) net = tf.keras.layers.Dense(units=flags.label_count)(net) if flags.return_softmax: net = tf.keras.layers.Activation('softmax')(net) return tf.keras.Model(input_audio, net)