def inception_resnet_block(x, scale, filters_branch0, filters_branch1, kernel_size, activation='relu', bn_scale=False):
  """Builds one Inception-ResNet block on top of `x`.

  Reduced version of the block in keras/applications/inception_resnet_v2.py:
  only a single branching layout is supported.

  Arguments:
    x: input tensor.
    scale: multiplier applied to the residual branch output `r` before it is
      added to the shortcut, i.e. the block computes `x + scale * r`.
    filters_branch0: number of filters in branch0.
    filters_branch1: number of filters in branch1.
    kernel_size: kernel size of the convolutions in branch1.
    activation: activation applied at the end of the block (None disables it).
    bn_scale: use scale in the batch normalization layers.

  Returns:
    Output tensor for the block.
  """
  # Two parallel branches: a 1x1 conv, and a 1x1 conv followed by two
  # (kernel_size, 1) convs.
  branch_0 = utils.conv2d_bn(x, filters_branch0, 1, scale=bn_scale)
  branch_1 = utils.conv2d_bn(x, filters_branch0, 1, scale=bn_scale)
  for _ in range(2):
    branch_1 = utils.conv2d_bn(
        branch_1, filters_branch1, [kernel_size, 1], scale=bn_scale)
  mixed = tf.keras.layers.Concatenate()([branch_0, branch_1])
  # Project back to the input channel count so the residual add is shape-valid.
  up = utils.conv2d_bn(
      mixed,
      tf.keras.backend.int_shape(x)[-1],
      1,
      activation=None,
      scale=bn_scale,
      use_bias=True)
  # Scaled residual connection: x + scale * up.
  x = tf.keras.layers.Lambda(
      lambda inputs, scale: inputs[0] + inputs[1] * scale,
      output_shape=tf.keras.backend.int_shape(x)[1:],
      arguments={'scale': scale})([x, up])
  if activation is not None:
    x = tf.keras.layers.Activation(activation)(x)
  return x
def model(flags):
  """Inception resnet model.

  It is based on the paper:
  Inception-v4, Inception-ResNet and the Impact of Residual Connections
  on Learning https://arxiv.org/abs/1602.07261

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # Self-contained model: the user feeds raw audio and features are
    # computed inside the graph.
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # net: [batch, time, feature]

  # Insert a unit spatial axis so 2D convs act along time only.
  net = tf.keras.backend.expand_dims(net, axis=2)
  # net: [batch, time, 1, feature]

  cnn1_params = zip(
      utils.parse(flags.cnn1_filters),
      utils.parse(flags.cnn1_kernel_sizes),
      utils.parse(flags.cnn1_strides))
  for filters, kernel_size, stride in cnn1_params:
    net = utils.conv2d_bn(
        net, filters, (kernel_size, 1), scale=flags.bn_scale, padding='valid')
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)
  # net: [batch, time, 1, filters]

  cnn2_params = zip(
      utils.parse(flags.cnn2_strides),
      utils.parse(flags.cnn2_scales),
      utils.parse(flags.cnn2_filters_branch0),
      utils.parse(flags.cnn2_filters_branch1),
      utils.parse(flags.cnn2_filters_branch2),
      utils.parse(flags.cnn2_kernel_sizes))
  for (stride, scale, filters_branch0, filters_branch1, filters_branch2,
       kernel_size) in cnn2_params:
    net = inception_resnet_block(
        net,
        scale,
        filters_branch0,
        filters_branch1,
        kernel_size,
        bn_scale=flags.bn_scale)
    net = utils.conv2d_bn(
        net, filters_branch2, (1, 1), scale=flags.bn_scale, padding='valid')
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D(
          (3, 1), strides=(stride, 1), padding='valid')(net)
  # net: [batch, time, 1, filters]

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # net: [batch, filters]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
def model(flags):
  """Inception model.

  It is based on the paper:
  Rethinking the Inception Architecture for Computer Vision
  http://arxiv.org/abs/1512.00567

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # Self-contained model: the user feeds raw audio and features are
    # computed inside the graph.
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(net)
  # net: [batch, time, feature]

  # Insert a unit spatial axis so 2D convs act along time only.
  net = tf.keras.backend.expand_dims(net, axis=2)
  # net: [batch, time, 1, feature]

  cnn1_params = zip(
      utils.parse(flags.cnn1_strides),
      utils.parse(flags.cnn1_filters),
      utils.parse(flags.cnn1_kernel_sizes))
  for stride, filters, kernel_size in cnn1_params:
    net = utils.conv2d_bn(
        net, filters, (kernel_size, 1), padding='valid', scale=flags.bn_scale)
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)

  cnn2_params = zip(
      utils.parse(flags.cnn2_strides),
      utils.parse(flags.cnn2_filters1),
      utils.parse(flags.cnn2_filters2),
      utils.parse(flags.cnn2_kernel_sizes))
  for stride, filters1, filters2, kernel_size in cnn2_params:
    # Three parallel branches: a 1x1 conv followed by 0, 1 or 2 extra
    # (kernel_size, 1) convs respectively.
    branches = []
    for extra_convs in range(3):
      branch = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
      for _ in range(extra_convs):
        branch = utils.conv2d_bn(
            branch, filters1, (kernel_size, 1), scale=flags.bn_scale)
      branches.append(branch)
    net = tf.keras.layers.concatenate(branches)
    # net: [batch, time, 1, filters1 * 3]
    net = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale)
    # net: [batch, time, 1, filters2]
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # net: [batch, filters2]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)