def inception_resnet_block(x,
                           scale,
                           filters_branch0,
                           filters_branch1,
                           kernel_size,
                           activation='relu',
                           bn_scale=False):
    """Adds an Inception-ResNet block on top of `x`.

    Only one type of Inception-ResNet block is built here; it is a reduced
    version of the blocks in keras/applications/inception_resnet_v2.py.

    Arguments:
      x: input tensor.
      scale: scaling factor applied to the residual (i.e., the output of
        passing `x` through the inception module) before it is added to
        the shortcut branch. With `r` being the residual branch output,
        the block computes `x + scale * r`.
      filters_branch0: number of filters of the 1x1 convolution in branch 0;
        the 1x1 convolution that opens branch 1 uses the same number.
      filters_branch1: number of filters of the two `[kernel_size, 1]`
        convolutions in branch 1.
      kernel_size: first kernel dimension of the convolutions in branch 1.
      activation: activation applied to the block output; `None` skips it.
      bn_scale: use scale in batch normalization layer (forwarded as `scale`
        to every `utils.conv2d_bn` call).

    Returns:
      Output tensor for the block.
    """
    # Branch 0: a single 1x1 convolution.
    branch_0 = utils.conv2d_bn(x, filters_branch0, 1, scale=bn_scale)

    # Branch 1: a 1x1 convolution followed by two [kernel_size, 1] ones.
    branch_1 = utils.conv2d_bn(x, filters_branch0, 1, scale=bn_scale)
    for _ in range(2):
        branch_1 = utils.conv2d_bn(
            branch_1, filters_branch1, [kernel_size, 1], scale=bn_scale)

    mixed = tf.keras.layers.Concatenate()([branch_0, branch_1])

    # Project the concatenated branches back to the channel count of `x`
    # so that the residual can be added to the shortcut.
    input_shape = tf.keras.backend.int_shape(x)
    up = utils.conv2d_bn(
        mixed,
        input_shape[-1],
        1,
        activation=None,
        scale=bn_scale,
        use_bias=True)

    # Residual connection: shortcut + scale * residual.
    scaled_sum = tf.keras.layers.Lambda(
        lambda inputs, scale: inputs[0] + inputs[1] * scale,
        output_shape=input_shape[1:],
        arguments={'scale': scale})
    x = scaled_sum([x, up])

    if activation is None:
        return x
    return tf.keras.layers.Activation(activation)(x)
# Example no. 2
# 0
def model(flags):
    """Builds the Inception model.

    The architecture is based on the paper:
    Rethinking the Inception Architecture for Computer Vision
        http://arxiv.org/abs/1512.00567

    Args:
      flags: data/model parameters. The attributes read here are
        `batch_size`, `preprocess`, `cnn_filters0`, `cnn_strides`,
        `cnn_filters1`, `cnn_filters2`, `bn_scale`, `dropout` and
        `label_count`.

    Returns:
      Keras model for training. Its output comes from a Dense layer with
      `flags.label_count` units and no activation applied.
    """

    def conv_stack(tensor, layer_specs):
        # Applies utils.conv2d_bn once per (filters, kernel) pair, in order.
        for num_filters, kernel in layer_specs:
            tensor = utils.conv2d_bn(tensor,
                                     num_filters,
                                     kernel,
                                     scale=flags.bn_scale)
        return tensor

    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(shape=input_shape,
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: the user feeds raw audio only and the
        # speech feature extraction is part of the graph.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # [batch, time, feature] -> [batch, time, feature, 1]
    net = tf.keras.backend.expand_dims(net, axis=-1)

    # Stem: separable conv + batch norm + relu + max pooling per entry.
    for filters in utils.parse(flags.cnn_filters0):
        separable_conv = tf.keras.layers.SeparableConv2D(filters, (3, 3),
                                                         padding='valid',
                                                         use_bias=False)
        net = separable_conv(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.Activation('relu')(net)
        net = tf.keras.layers.MaxPooling2D((3, 3), strides=(2, 2))(net)
        # [batch, time, feature, filters]

    # Two 'valid' convolutions using the last filter count of the stem.
    filters = utils.parse(flags.cnn_filters0)[-1]
    for kernel in ((3, 1), (1, 3)):
        net = utils.conv2d_bn(net,
                              filters,
                              kernel,
                              padding='valid',
                              scale=flags.bn_scale)

    inception_params = zip(utils.parse(flags.cnn_strides),
                           utils.parse(flags.cnn_filters1),
                           utils.parse(flags.cnn_filters2))
    for stride, filters1, filters2 in inception_params:

        if stride > 1:
            net = tf.keras.layers.MaxPooling2D((3, 3), strides=stride)(net)

        # Branch 1: 1x1 convolution only.
        branch1 = conv_stack(net, [(filters2, (1, 1))])

        # Branch 2: 1x1 followed by a factorized 3x3 (3x1 then 1x3).
        branch2 = conv_stack(net, [
            (filters1, (1, 1)),
            (filters1, (3, 1)),
            (filters2, (1, 3)),
        ])

        # Branch 3: 1x1 followed by two factorized 3x3 convolutions.
        branch3 = conv_stack(net, [
            (filters1, (1, 1)),
            (filters1, (3, 1)),
            (filters1, (1, 3)),
            (filters1, (3, 1)),
            (filters2, (1, 3)),
        ])

        # Branch 4: average pooling followed by a 1x1 convolution.
        pooled = tf.keras.layers.AveragePooling2D((3, 3),
                                                  strides=(1, 1),
                                                  padding='same')(net)
        branch4 = conv_stack(pooled, [(filters2, (1, 1))])

        net = tf.keras.layers.concatenate(
            [branch1, branch2, branch3, branch4])
        # [batch, time, feature, filters2*4]

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    # [batch, channels]
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    net = tf.keras.layers.Dense(flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
# Example no. 3
# 0
def model(flags):
    """Builds the Inception model with convolutions along the time axis.

    The architecture is based on the paper:
    Rethinking the Inception Architecture for Computer Vision
        http://arxiv.org/abs/1512.00567

    Args:
      flags: data/model parameters. The attributes read here are
        `batch_size`, `preprocess`, `cnn1_strides`, `cnn1_filters`,
        `cnn1_kernel_sizes`, `cnn2_strides`, `cnn2_filters1`,
        `cnn2_filters2`, `cnn2_kernel_sizes`, `bn_scale`, `dropout`,
        `label_count` and `return_softmax`.

    Returns:
      Keras model for training. The output is taken from a Dense layer with
      `flags.label_count` units; a softmax activation is appended when
      `flags.return_softmax` is set.
    """
    input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
    input_audio = tf.keras.layers.Input(shape=input_shape,
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # Self-contained model: the user feeds raw audio only and the
        # speech feature extraction is part of the graph.
        feature_params = speech_features.SpeechFeatures.get_params(flags)
        net = speech_features.SpeechFeatures(feature_params)(net)

    # [batch, time, feature] -> [batch, time, 1, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)

    # Plain convolutional stem.
    stem_params = zip(utils.parse(flags.cnn1_strides),
                      utils.parse(flags.cnn1_filters),
                      utils.parse(flags.cnn1_kernel_sizes))
    for stride, filters, kernel_size in stem_params:
        net = utils.conv2d_bn(net,
                              filters, (kernel_size, 1),
                              padding='valid',
                              scale=flags.bn_scale)
        if stride > 1:
            pool = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))
            net = pool(net)

    # Inception blocks with three parallel branches each.
    inception_params = zip(utils.parse(flags.cnn2_strides),
                           utils.parse(flags.cnn2_filters1),
                           utils.parse(flags.cnn2_filters2),
                           utils.parse(flags.cnn2_kernel_sizes))
    for stride, filters1, filters2, kernel_size in inception_params:
        time_kernel = (kernel_size, 1)

        # Branch 1: 1x1 convolution only.
        branch1 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)

        # Branch 2: 1x1 convolution followed by one convolution in time.
        branch2 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
        branch2 = utils.conv2d_bn(branch2,
                                  filters1,
                                  time_kernel,
                                  scale=flags.bn_scale)

        # Branch 3: 1x1 convolution followed by two convolutions in time.
        branch3 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
        for _ in range(2):
            branch3 = utils.conv2d_bn(branch3,
                                      filters1,
                                      time_kernel,
                                      scale=flags.bn_scale)

        net = tf.keras.layers.concatenate([branch1, branch2, branch3])
        # [batch, time, 1, filters1*3]
        net = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale)
        # [batch, time, 1, filters2]

        if stride > 1:
            pool = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))
            net = pool(net)

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    # [batch, channels]
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    net = tf.keras.layers.Dense(flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
# Example no. 4
# 0
def model(flags):
  """Builds the Inception-ResNet model.

  The architecture is based on the paper:
  Inception-v4, Inception-ResNet and the Impact of
     Residual Connections on Learning https://arxiv.org/abs/1602.07261

  Args:
    flags: data/model parameters. The attributes read here are
      `batch_size`, `preprocess`, `cnn1_filters`, `cnn1_kernel_sizes`,
      `cnn1_strides`, `cnn2_strides`, `cnn2_scales`,
      `cnn2_filters_branch0`, `cnn2_filters_branch1`,
      `cnn2_filters_branch2`, `cnn2_kernel_sizes`, `bn_scale`, `dropout`,
      `label_count` and `return_softmax`.

  Returns:
    Keras model for training. The output is taken from a Dense layer with
    `flags.label_count` units; a softmax activation is appended when
    `flags.return_softmax` is set.
  """
  input_shape = modes.get_input_data_shape(flags, modes.Modes.TRAINING)
  input_audio = tf.keras.layers.Input(
      shape=input_shape, batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # Self-contained model: the user feeds raw audio only and the
    # speech feature extraction is part of the graph.
    feature_params = speech_features.SpeechFeatures.get_params(flags)
    net = speech_features.SpeechFeatures(feature_params)(net)

  # [batch, time, feature] -> [batch, time, 1, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)

  # Plain convolutional stem.
  stem_params = zip(
      utils.parse(flags.cnn1_filters),
      utils.parse(flags.cnn1_kernel_sizes),
      utils.parse(flags.cnn1_strides))
  for filters, kernel_size, stride in stem_params:
    net = utils.conv2d_bn(
        net, filters, (kernel_size, 1), scale=flags.bn_scale, padding='valid')
    if stride > 1:
      pool = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))
      net = pool(net)
    # [batch, time, 1, filters]

  # Inception-ResNet blocks, each followed by a 1x1 convolution with
  # filters_branch2 filters and an optional max pooling in time.
  block_params = zip(
      utils.parse(flags.cnn2_strides),
      utils.parse(flags.cnn2_scales),
      utils.parse(flags.cnn2_filters_branch0),
      utils.parse(flags.cnn2_filters_branch1),
      utils.parse(flags.cnn2_filters_branch2),
      utils.parse(flags.cnn2_kernel_sizes))
  for block in block_params:
    (stride, scale, filters_branch0, filters_branch1, filters_branch2,
     kernel_size) = block
    net = inception_resnet_block(
        net,
        scale,
        filters_branch0,
        filters_branch1,
        kernel_size,
        bn_scale=flags.bn_scale)
    net = utils.conv2d_bn(
        net, filters_branch2, (1, 1), scale=flags.bn_scale, padding='valid')
    if stride > 1:
      pool = tf.keras.layers.MaxPooling2D(
          (3, 1), strides=(stride, 1), padding='valid')
      net = pool(net)
    # [batch, time, 1, filters_branch2]

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, channels]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)