Example #1
def ds_tc_resnet_model_params(use_tf_fft=False):
  """Generate parameters for ds_tc_resnet model."""

  # model parameters
  model_name = 'ds_tc_resnet'
  params = model_params.HOTWORD_MODEL_PARAMS[model_name]
  params.causal_data_frame_padding = 1  # causal padding on DataFrame
  params.clip_duration_ms = 160
  params.use_tf_fft = use_tf_fft
  params.mel_non_zero_only = not use_tf_fft
  params.feature_type = 'mfcc_tf'
  params.window_size_ms = 5.0
  params.window_stride_ms = 2.0
  params.wanted_words = 'a,b,c'
  params.ds_padding = "'causal','causal','causal','causal'"
  params.ds_filters = '4,4,4,2'
  params.ds_repeat = '1,1,1,1'
  params.ds_residual = '0,1,1,1'  # no residuals on strided layers
  params.ds_kernel_size = '3,3,3,1'
  params.ds_dilation = '1,1,1,1'
  params.ds_stride = '2,1,1,1'  # streaming conv with stride
  params.ds_pool = '1,2,1,1'  # streaming conv with pool
  params.ds_filter_separable = '1,1,1,1'

  # convert ms to samples and compute labels count
  params = model_flags.update_flags(params)

  # compute total stride
  pools = model_utils.parse(params.ds_pool)
  strides = model_utils.parse(params.ds_stride)
  time_stride = [1]
  for pool in pools:
    if pool > 1:
      time_stride.append(pool)
  for stride in strides:
    if stride > 1:
      time_stride.append(stride)
  total_stride = np.prod(time_stride)
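  # with ds_pool = '1,2,1,1' and ds_stride = '2,1,1,1' above, total_stride = 2 * 2 = 4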

  # override input data shape for streaming model with stride/pool
  params.data_stride = total_stride
  params.data_shape = (total_stride * params.window_stride_samples,)

  # set desired number of frames in model
  frames_number = 16
  frames_per_call = total_stride
  frames_number = (frames_number // frames_per_call) * frames_per_call
  # number of input audio samples required to produce one output frame
  framing_stride = max(
      params.window_stride_samples,
      max(0, params.window_size_samples -
          params.window_stride_samples))
  signal_size = framing_stride * frames_number
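  # e.g. assuming the default 16 kHz sample rate: window_size_samples = 80 and
  # window_stride_samples = 32, so framing_stride = max(32, 48) = 48 and
  # signal_size = 48 * 16 = 768 samples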

  # desired number of samples in the input data to train non streaming model
  params.desired_samples = signal_size
  params.batch_size = 1
  return params
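
# A minimal usage sketch, mirroring Example #11 below: build the ds_tc_resnet
# model from these parameters and run non-streaming inference. The import path
# is an assumption based on the kws_streaming package layout.
from kws_streaming.models import ds_tc_resnet

params = ds_tc_resnet_model_params(use_tf_fft=False)
model = ds_tc_resnet.model(params)
model.summary()

# one random clip with the computed number of input samples
input_data = np.random.rand(params.batch_size, params.desired_samples)
non_stream_out = model.predict(input_data)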
Example #2
def model(flags):
    """Fully connected layer based model.

  It is based on paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    for units, activation in zip(utils.parse(flags.units1),
                                 utils.parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

    # after flattening data in time, we can apply any layer: pooling, bi-LSTM, etc.
    if flags.pool_size > 1:
        # add fake dim for compatibility with pooling
        net = tf.keras.backend.expand_dims(net, axis=-1)
        net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size,
                                        strides=flags.strides,
                                        data_format='channels_last')(net)
        # remove fake dim
        net = tf.keras.backend.squeeze(net, axis=-1)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units2),
                                 utils.parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
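
# A minimal sketch of instantiating this model for training, following the
# parameter-handling pattern of Examples #1 and #11 (model_params / model_flags
# as imported there); the 'dnn' key in HOTWORD_MODEL_PARAMS is an assumption
# about the parameter registry.
flags = model_params.HOTWORD_MODEL_PARAMS['dnn']
flags = model_flags.update_flags(flags)  # convert ms to samples, set label count
dnn_model = model(flags)
dnn_model.summary()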
Example #3
def model(flags):
    """Fully connected layer based model on raw wav data.

  It is based on paper (with added pooling and raw audio data):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    if flags.preprocess != 'raw':
        raise ValueError('input audio has to be raw, but got %s' % flags.preprocess)

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = data_frame.DataFrame(
        frame_size=flags.window_size_samples,
        frame_step=flags.window_stride_samples)(input_audio)

    for units, activation in zip(utils.parse(flags.units1),
                                 utils.parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

    # after flattening data in time, we can apply any layer: pooling, bi-LSTM, etc.
    if flags.pool_size > 1:
        # add fake dim for compatibility with pooling
        net = tf.keras.backend.expand_dims(net, axis=-1)
        net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size,
                                        strides=flags.strides,
                                        data_format='channels_last')(net)
        # remove fake dim
        net = tf.keras.backend.squeeze(net, axis=-1)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units2),
                                 utils.parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #4
def model(flags):
  """LSTM model.

  Similar model in papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  for units, return_sequences, num_proj in zip(
      utils.parse(flags.lstm_units), utils.parse(flags.return_sequences),
      utils.parse(flags.num_proj)):
    net = lstm.LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #5
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
          utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
          utils.parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % i)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
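
# A toy NumPy sketch of the low-rank decomposition mentioned in the docstring:
# an SVDF unit approximates a dense op over a window of past frames with a
# 1-D filter over features followed by a 1-D filter over time. Shapes are
# illustrative only and not tied to the library implementation.
import numpy as np

memory_size, input_dim, units = 4, 8, 3
x = np.random.rand(memory_size, input_dim)           # window of past feature frames
feature_filters = np.random.rand(input_dim, units)   # stage 1: filter over features
time_filters = np.random.rand(memory_size, units)    # stage 2: filter over time

projected = x @ feature_filters                       # [memory_size, units]
svdf_out = (projected * time_filters).sum(axis=0)     # [units]
# an unfactored dense op would instead need a [memory_size * input_dim, units] matrix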
Example #6
def model(flags):
    """Inception resnet model.

  It is based on paper:
  Inception-v4, Inception-ResNet and the Impact of
     Residual Connections on Learning https://arxiv.org/abs/1602.07261
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # [batch, time, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)
    # [batch, time, 1, feature]

    for filters, kernel_size, stride in zip(
            utils.parse(flags.cnn1_filters),
            utils.parse(flags.cnn1_kernel_sizes),
            utils.parse(flags.cnn1_strides)):
        net = utils.conv2d_bn(net,
                              filters, (kernel_size, 1),
                              scale=flags.bn_scale,
                              padding='valid')
        if stride > 1:
            net = tf.keras.layers.MaxPooling2D((3, 1),
                                               strides=(stride, 1))(net)
        # [batch, time, 1, filters]

    for stride, scale, filters_branch0, filters_branch1, filters_branch2, kernel_size in zip(
            utils.parse(flags.cnn2_strides), utils.parse(flags.cnn2_scales),
            utils.parse(flags.cnn2_filters_branch0),
            utils.parse(flags.cnn2_filters_branch1),
            utils.parse(flags.cnn2_filters_branch2),
            utils.parse(flags.cnn2_kernel_sizes)):
        net = inception_resnet_block(net,
                                     scale,
                                     filters_branch0,
                                     filters_branch1,
                                     kernel_size,
                                     bn_scale=flags.bn_scale)
        net = utils.conv2d_bn(net,
                              filters_branch2, (1, 1),
                              scale=flags.bn_scale,
                              padding='valid')
        if stride > 1:
            net = tf.keras.layers.MaxPooling2D((3, 1),
                                               strides=(stride, 1),
                                               padding='valid')(net)
        # [batch, time, 1, filters]

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    # [batch, filters]
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    net = tf.keras.layers.Dense(flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #7
def model(flags):
    """Mobilenet V2 model.

  It is based on paper:
  MobileNetV2: Inverted Residuals and Linear Bottlenecks
      https://arxiv.org/abs/1801.04381
  It is applied to a sequence in time, so only 1D filters are applied
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # [batch, time, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)
    # [batch, time, feature, 1]

    # it is conv_block
    net = tf.keras.layers.Conv2D(filters=flags.cnn1_filters,
                                 kernel_size=utils.parse(
                                     flags.cnn1_kernel_size),
                                 padding='valid',
                                 use_bias=False,
                                 strides=utils.parse(flags.cnn1_strides))(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)
    # [batch, time, feature, filters]

    for kernel_size, stride, filters, expansion in zip(
            utils.parse(flags.ds_kernel_size), utils.parse(flags.cnn_strides),
            utils.parse(flags.cnn_filters), utils.parse(flags.cnn_expansions)):
        # it is an inverted residual block
        net_input = net
        in_channels = tf.keras.backend.int_shape(net_input)[-1]
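        # expand: the pointwise 1x1 conv below grows channels by the expansion factor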

        net = tf.keras.layers.Conv2D(expansion * in_channels,
                                     kernel_size=1,
                                     padding='same',
                                     use_bias=False,
                                     activation=None)(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.ReLU(6.)(net)
        # [batch, time, feature, filters]

        # depthwise
        net = tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size,
                                              strides=stride,
                                              activation=None,
                                              use_bias=False,
                                              padding='same')(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.ReLU(6.)(net)

        # project
        net = tf.keras.layers.Conv2D(filters,
                                     kernel_size=1,
                                     padding='same',
                                     use_bias=False,
                                     activation=None)(net)
        net = tf.keras.layers.BatchNormalization()(net)
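        # the residual add below only applies when the block preserves shape
        # (same channel count and stride 1)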

        if in_channels == filters and stride == (1, 1):
            net = tf.keras.layers.Add()([net_input, net])

    # [batch, time, feature, filters]
    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    # [batch, filters]
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    net = tf.keras.layers.Dense(flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #8
def model(flags):
  """SVDF model with residual connections.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on paper
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  In addition, we added residual connections.
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  blocks_pool = utils.parse(flags.blocks_pool)
  if len(blocks_pool) != 3:
    raise ValueError('number of pooling blocks has to be 3, but got: %d' %
                     len(blocks_pool))

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  # first residual block
  number_of_blocks = len(utils.parse(flags.block1_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block1_units1),
          utils.parse(flags.block1_memory_size), activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_1_%d' % i)(
            net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block1_units1)[-1]

  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  # [batch, time, feature]
  net = tf.keras.layers.Activation(flags.activation)(net)
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[0], strides=blocks_pool[0], padding='valid')(
          net)

  # second residual block
  number_of_blocks = len(utils.parse(flags.block2_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block2_units1),
          utils.parse(flags.block2_memory_size), activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_2_%d' % i)(
            net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block2_units1)[-1]

  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  # [batch, time, feature]
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[1], strides=blocks_pool[1], padding='valid')(
          net)

  # third residual block
  number_of_blocks = len(utils.parse(flags.block3_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block3_units1),
          utils.parse(flags.block3_memory_size), activations)):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_3_%d' % i)(
            net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block3_units1)[-1]

  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[2], strides=blocks_pool[2], padding='valid')(
          net)
  # [batch, time, feature]

  # convert all feature to one vector
  if flags.flatten:
    net = stream.Stream(use_one_step=False, cell=tf.keras.layers.Flatten())(net)
  else:
    net = tf.keras.backend.expand_dims(net, axis=2)
    net = stream.Stream(
        use_one_step=False,
        cell=tf.keras.layers.AveragePooling2D(
            pool_size=(int(net.shape[1]), int(net.shape[2]))))(
                net)

  net = tf.keras.layers.Flatten()(net)

  # [batch, feature]
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units in utils.parse(flags.units2):
    net = tf.keras.layers.Dense(units=units, activation=flags.activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #9
def main(_):
    # Update flags
    flags = model_flags.update_flags(FLAGS)

    if flags.train:
        # Create model folders where logs and model will be stored
        os.makedirs(flags.train_dir)
        os.mkdir(flags.summaries_dir)

        # Model training
        train.train(flags)
    else:
        if not os.path.isdir(flags.train_dir):
            raise ValueError(
                'model is not trained; set "--train 1" and train it')

    # write all flags settings into json
    with open(os.path.join(flags.train_dir, 'flags.json'), 'wt') as f:
        json.dump(flags.__dict__, f)

    # convert to SavedModel
    test.convert_model_saved(flags, 'non_stream',
                             modes.Modes.NON_STREAM_INFERENCE)
    try:
        test.convert_model_saved(flags, 'stream_state_internal',
                                 modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
    except (ValueError, IndexError) as e:
        logging.info('FAILED to run TF streaming: %s', e)

    logging.info('run TF non streaming model accuracy evaluation')
    # with TF
    folder_name = 'tf'
    test.tf_non_stream_model_accuracy(flags, folder_name)

    # with TF.
    # We can apply the non-streaming model on streaming data by running
    # inference every 200ms (for example), so that the total latency is
    # similar to that of a streaming model executed every 20ms.
    # To measure the impact of sampling on model accuracy,
    # we introduce time_shift_ms during accuracy evaluation.
    # Convert milliseconds to samples:
    time_shift_samples = int(
        (flags.time_shift_ms * flags.sample_rate) / model_flags.MS_PER_SECOND)
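    # e.g. time_shift_ms = 100 at a 16 kHz sample rate gives 1600 samples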
    test.tf_non_stream_model_accuracy(
        flags,
        folder_name,
        time_shift_samples,
        accuracy_name='tf_non_stream_model_sampling_stream_accuracy.txt')

    name2opt = {
        '': None,
        'quantize_opt_for_size_': [tf.lite.Optimize.DEFAULT],
    }

    for opt_name, optimizations in name2opt.items():

        if (opt_name and flags.feature_type == 'mfcc_tf'
                and flags.preprocess == 'raw'):
            logging.info(
                'feature type mfcc_tf needs quantization aware training '
                'for quantization - it is not implemented')
            continue

        folder_name = opt_name + 'tflite_non_stream'
        file_name = 'non_stream.tflite'
        mode = modes.Modes.NON_STREAM_INFERENCE
        test.convert_model_tflite(flags,
                                  folder_name,
                                  mode,
                                  file_name,
                                  optimizations=optimizations)
        test.tflite_non_stream_model_accuracy(flags, folder_name, file_name)

        # these models use bi-RNNs, so they are not streamable by default;
        # models using striding or pooling are also not supported for streaming yet
        non_streamable_models = {'att_mh_rnn', 'att_rnn', 'tc_resnet'}

        model_is_streamable = True
        if flags.model_name in non_streamable_models:
            model_is_streamable = False
        # the models below can use striding in the time dimension,
        # but this is currently unsupported for streaming
        elif flags.model_name == 'cnn':
            for strides in model_utils.parse(flags.cnn_strides):
                if strides[0] > 1:
                    model_is_streamable = False
                    break
        elif flags.model_name == 'ds_cnn':
            if model_utils.parse(flags.cnn1_strides)[0] > 1:
                model_is_streamable = False
            for strides in model_utils.parse(flags.dw2_strides):
                if strides[0] > 1:
                    model_is_streamable = False
                    break

        # set input data shape for testing inference in streaming mode
        flags.data_shape = modes.get_input_data_shape(
            flags, modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE)

        # if the model can be streamed, run conversion/evaluation in streaming mode
        if model_is_streamable:
            # ---------------- TF streaming model accuracy evaluation ----------------
            # Streaming model with external state evaluation using TF with state reset
            if not opt_name:
                logging.info(
                    'run TF evaluation only, without optimization/quantization')
                try:
                    folder_name = 'tf'
                    test.tf_stream_state_external_model_accuracy(
                        flags,
                        folder_name,
                        accuracy_name=
                        'stream_state_external_model_accuracy_sub_set_reset1.txt',
                        reset_state=True
                    )  # with state reset between test sequences

                    # Streaming (with external state) evaluation using TF no state reset
                    test.tf_stream_state_external_model_accuracy(
                        flags,
                        folder_name,
                        accuracy_name=
                        'stream_state_external_model_accuracy_sub_set_reset0.txt',
                        reset_state=False)  # without state reset

                    # Streaming (with internal state) evaluation using TF no state reset
                    test.tf_stream_state_internal_model_accuracy(
                        flags, folder_name)
                except (ValueError, IndexError) as e:
                    logging.info('FAILED to run TF streaming: %s', e)

            logging.info('run TFlite streaming model accuracy evaluation')
            try:
                # convert model to TFlite
                folder_name = opt_name + 'tflite_stream_state_external'
                file_name = 'stream_state_external.tflite'
                mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
                test.convert_model_tflite(flags,
                                          folder_name,
                                          mode,
                                          file_name,
                                          optimizations=optimizations)

                # Streaming model accuracy evaluation with TFLite with state reset
                test.tflite_stream_state_external_model_accuracy(
                    flags,
                    folder_name,
                    file_name,
                    accuracy_name=
                    'tflite_stream_state_external_model_accuracy_reset1.txt',
                    reset_state=True)

                # Streaming model accuracy evaluation with TFLite without state reset
                test.tflite_stream_state_external_model_accuracy(
                    flags,
                    folder_name,
                    file_name,
                    accuracy_name=
                    'tflite_stream_state_external_model_accuracy_reset0.txt',
                    reset_state=False)
            except (ValueError, IndexError) as e:
                logging.info('FAILED to run TFLite streaming: %s', e)
Example #10
def model(flags):
    """BiRNN multihead attention model.

  It is based on paper:
  Attention Is All You Need
  https://papers.nips.cc/paper/7181-attention-is-all-you-need.pdf
  A neural attention model for speech command recognition
  https://arxiv.org/pdf/1808.08929.pdf

  Depending on the parameter rnn_type, the model can be a biLSTM or biGRU.

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    rnn_types = {'lstm': tf.keras.layers.LSTM, 'gru': tf.keras.layers.GRU}

    if flags.rnn_type not in rnn_types:
        raise ValueError('unsupported RNN type: %s' % flags.rnn_type)
    rnn = rnn_types[flags.rnn_type]

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    net = tf.keras.backend.expand_dims(net)

    for filters, kernel_size, activation, dilation_rate, strides in zip(
            utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
            utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
            utils.parse(flags.cnn_strides)):
        net = tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides,
            padding='same',
            kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
            bias_regularizer=tf.keras.regularizers.l2(
                flags.l2_weight_decay))(net)
        net = tf.keras.layers.BatchNormalization()(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    # dims: [batch, time, feature]
    for _ in range(flags.rnn_layers):
        net = tf.keras.layers.Bidirectional(
            rnn(flags.rnn_units,
                return_sequences=True,
                unroll=True,
                kernel_regularizer=tf.keras.regularizers.l2(
                    flags.l2_weight_decay),
                bias_regularizer=tf.keras.regularizers.l2(
                    flags.l2_weight_decay)))(net)
    feature_dim = net.shape[-1]
    middle = net.shape[1] // 2  # index of middle point of sequence

    # feature vector at middle point [batch, feature]
    mid_feature = net[:, middle, :]

    # prepare multihead attention
    multiheads = []
    for _ in range(flags.heads):
        # apply one projection layer with the same dim as input feature
        query = tf.keras.layers.Dense(
            feature_dim,
            kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
            bias_regularizer=tf.keras.regularizers.l2(
                flags.l2_weight_decay))(mid_feature)

        # attention weights [batch, time]
        att_weights = tf.keras.layers.Dot(axes=[1, 2])([query, net])
        att_weights = tf.keras.layers.Softmax()(att_weights)

        # apply attention weights [batch, feature]
        multiheads.append(tf.keras.layers.Dot(axes=[1, 1])([att_weights, net]))

    net = tf.keras.layers.concatenate(multiheads)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units2),
                                 utils.parse(flags.act2)):
        net = tf.keras.layers.Dense(
            units=units,
            activation=activation,
            kernel_regularizer=tf.keras.regularizers.l2(flags.l2_weight_decay),
            bias_regularizer=tf.keras.regularizers.l2(
                flags.l2_weight_decay))(net)
    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #11
    def init_model(self, use_tf_fft=False):

        config = tf1.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf1.Session(config=config)
        tf1.keras.backend.set_session(self.sess)
        test_utils.set_seed(123)
        tf.keras.backend.set_learning_phase(0)

        # model parameters
        model_name = 'ds_tc_resnet'
        self.params = model_params.HOTWORD_MODEL_PARAMS[model_name]
        self.params.causal_data_frame_padding = 1  # causal padding on DataFrame
        self.params.clip_duration_ms = 160
        self.params.use_tf_fft = use_tf_fft
        self.params.mel_non_zero_only = not use_tf_fft
        self.params.feature_type = 'mfcc_tf'
        self.params.window_size_ms = 5.0
        self.params.window_stride_ms = 2.0
        self.params.wanted_words = 'a,b,c'
        self.params.ds_padding = "'causal','causal','causal','causal'"
        self.params.ds_filters = '4,4,4,2'
        self.params.ds_repeat = '1,1,1,1'
        self.params.ds_residual = '0,1,1,1'  # no residuals on strided layers
        self.params.ds_kernel_size = '3,3,3,1'
        self.params.ds_dilation = '1,1,1,1'
        self.params.ds_stride = '2,1,1,1'  # streaming conv with stride
        self.params.ds_pool = '1,2,1,1'  # streaming conv with pool
        self.params.ds_filter_separable = '1,1,1,1'

        # convert ms to samples and compute labels count
        self.params = model_flags.update_flags(self.params)

        # compute total stride
        pools = model_utils.parse(self.params.ds_pool)
        strides = model_utils.parse(self.params.ds_stride)
        time_stride = [1]
        for pool in pools:
            if pool > 1:
                time_stride.append(pool)
        for stride in strides:
            if stride > 1:
                time_stride.append(stride)
        total_stride = np.prod(time_stride)

        # override input data shape for streaming model with stride/pool
        self.params.data_stride = total_stride
        self.params.data_shape = (total_stride *
                                  self.params.window_stride_samples, )

        # set desired number of frames in model
        frames_number = 16
        frames_per_call = total_stride
        frames_number = (frames_number // frames_per_call) * frames_per_call
        # number of input audio samples required to produce one output frame
        framing_stride = max(
            self.params.window_stride_samples,
            max(
                0, self.params.window_size_samples -
                self.params.window_stride_samples))
        signal_size = framing_stride * frames_number

        # desired number of samples in the input data to train non streaming model
        self.params.desired_samples = signal_size

        self.params.batch_size = 1
        self.model = ds_tc_resnet.model(self.params)
        self.model.summary()

        self.input_data = np.random.rand(self.params.batch_size,
                                         self.params.desired_samples)

        # run non streaming inference
        self.non_stream_out = self.model.predict(self.input_data)
Example #12
def model(flags):
    """Xception model.

  It is based on papers:
  Xception: Deep Learning with Depthwise Separable Convolutions
      https://arxiv.org/abs/1610.02357
  MatchboxNet: 1D Time-Channel Separable Convolutional
  Neural Network Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # [batch, time, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)
    # [batch, time, 1, feature]

    # conv block
    for kernel_size, filters in zip(utils.parse(flags.cnn1_kernel_sizes),
                                    utils.parse(flags.cnn1_filters)):
        net = tf.keras.layers.Conv2D(filters, (kernel_size, 1),
                                     use_bias=False)(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.Activation('relu')(net)
        # [batch, time, 1, feature]

    if flags.stride1 > 1:
        net = tf.keras.layers.MaxPooling2D((3, 1),
                                           strides=(flags.stride1, 1),
                                           padding='valid')(net)

    net = block(net, utils.parse(flags.cnn2_kernel_sizes),
                utils.parse(flags.cnn2_filters), flags.dropout, flags.bn_scale)
    if flags.stride2 > 1:
        net = tf.keras.layers.MaxPooling2D((3, 1),
                                           strides=(flags.stride2, 1),
                                           padding='valid')(net)

    net = block(net, utils.parse(flags.cnn3_kernel_sizes),
                utils.parse(flags.cnn3_filters), flags.dropout, flags.bn_scale)
    if flags.stride3 > 1:
        net = tf.keras.layers.MaxPooling2D((3, 1),
                                           strides=(flags.stride3, 1),
                                           padding='valid')(net)

    net = block(net, utils.parse(flags.cnn4_kernel_sizes),
                utils.parse(flags.cnn4_filters), flags.dropout, flags.bn_scale)
    if flags.stride4 > 1:
        net = tf.keras.layers.MaxPooling2D((3, 1),
                                           strides=(flags.stride4, 1),
                                           padding='valid')(net)

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    # [batch, filters]
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    for units in utils.parse(flags.units2):
        net = tf.keras.layers.Dense(units=units,
                                    activation=None,
                                    use_bias=False)(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.Activation('relu')(net)

    net = tf.keras.layers.Dense(flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    # [batch, label_count]
    return tf.keras.Model(input_audio, net)
Example #13
def model(flags):
    """CNN model.

  It is based on paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    if flags.quantize:
        net = quantize_layer.QuantizeLayer(
            AllValuesQuantizer(num_bits=8,
                               per_axis=False,
                               symmetric=False,
                               narrow_range=False))(net)

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
            utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
            utils.parse(flags.cnn_strides)):
        net = stream.Stream(cell=quantize.quantize_layer(
            tf.keras.layers.Conv2D(filters=filters,
                                   kernel_size=kernel_size,
                                   dilation_rate=dilation_rate,
                                   activation='linear',
                                   strides=strides), flags.quantize,
            quantize.NoOpActivationConfig(['kernel'], ['activation'], False)),
                            pad_time_dim='causal',
                            use_one_step=False)(net)
        net = quantize.quantize_layer(
            tf.keras.layers.BatchNormalization(),
            default_8bit_quantize_configs.NoOpQuantizeConfig())(net)
        net = quantize.quantize_layer(
            tf.keras.layers.Activation(activation))(net)

    net = stream.Stream(cell=quantize.quantize_layer(
        tf.keras.layers.Flatten(), apply_quantization=flags.quantize))(net)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units2),
                                 utils.parse(flags.act2)):
        net = quantize.quantize_layer(tf.keras.layers.Dense(
            units=units, activation=activation),
                                      apply_quantization=flags.quantize)(net)

    net = quantize.quantize_layer(
        tf.keras.layers.Dense(units=flags.label_count),
        apply_quantization=flags.quantize)(net)
    if flags.return_softmax:
        net = quantize.quantize_layer(tf.keras.layers.Activation('softmax'),
                                      apply_quantization=flags.quantize)(net)
    return tf.keras.Model(input_audio, net)
Example #14
def model(flags):
    """MatchboxNet model.

  It is based on paper
  MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network
  Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any input list has a different length than the others,
    or if padding is not supported
  """

    ds_filters = utils.parse(flags.ds_filters)
    ds_repeat = utils.parse(flags.ds_repeat)
    ds_kernel_size = utils.parse(flags.ds_kernel_size)
    ds_stride = utils.parse(flags.ds_stride)
    ds_dilation = utils.parse(flags.ds_dilation)
    ds_residual = utils.parse(flags.ds_residual)
    ds_pool = utils.parse(flags.ds_pool)
    ds_padding = utils.parse(flags.ds_padding)
    ds_filter_separable = utils.parse(flags.ds_filter_separable)

    for l in (ds_repeat, ds_kernel_size, ds_stride, ds_dilation, ds_residual,
              ds_pool, ds_padding, ds_filter_separable):
        if len(ds_filters) != len(l):
            raise ValueError('all input lists have to be the same length')

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # make it [batch, time, 1, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)

    # encoder
    for filters, repeat, ksize, stride, sep, dilation, res, pool, pad in zip(
            ds_filters, ds_repeat, ds_kernel_size, ds_stride,
            ds_filter_separable, ds_dilation, ds_residual, ds_pool,
            ds_padding):
        net = resnet_block(net, repeat, ksize, filters, dilation, stride, sep,
                           res, pad, flags.dropout, flags.activation,
                           flags.ds_scale, flags.data_stride <= 1)
        if pool > 1:
            if flags.ds_max_pool:
                net = tf.keras.layers.MaxPooling2D(pool_size=(pool, 1),
                                                   strides=(pool, 1))(net)
            else:
                net = tf.keras.layers.AveragePooling2D(pool_size=(pool, 1),
                                                       strides=(pool, 1))(net)

    # decoder
    net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net)

    net = tf.keras.layers.Flatten()(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #15
def model(flags):
    """Depthwise convolutional model.

  It is based on paper:
  MobileNets: Efficient Convolutional Neural Networks for
  Mobile Vision Applications https://arxiv.org/abs/1704.04861
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    net = tf.keras.backend.expand_dims(net)

    net = stream.Stream(cell=tf.keras.layers.Conv2D(
        kernel_size=utils.parse(flags.cnn1_kernel_size),
        dilation_rate=utils.parse(flags.cnn1_dilation_rate),
        filters=flags.cnn1_filters,
        padding=flags.cnn1_padding,
        strides=utils.parse(flags.cnn1_strides)))(net)
    net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                             center=flags.bn_center,
                                             scale=flags.bn_scale,
                                             renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation('relu')(net)

    for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip(
            utils.parse(flags.dw2_kernel_size), utils.parse(flags.dw2_act),
            utils.parse(flags.dw2_dilation_rate),
            utils.parse(flags.dw2_strides), utils.parse(flags.cnn2_filters),
            utils.parse(flags.cnn2_act)):
        net = stream.Stream(
            cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size,
                                                 dilation_rate=dilation_rate,
                                                 padding=flags.dw2_padding,
                                                 strides=strides))(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(dw2_act)(net)
        net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(cnn2_act)(net)

    net = stream.Stream(cell=tf.keras.layers.AveragePooling2D(
        pool_size=(int(net.shape[1]), int(net.shape[2]))))(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #16
def model(flags):
  """Inception model.

  It is based on paper:
  Rethinking the Inception Architecture for Computer Vision
      http://arxiv.org/abs/1512.00567
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  # [batch, time, feature]
  net = tf.keras.backend.expand_dims(net, axis=2)
  # [batch, time, 1, feature]

  for stride, filters, kernel_size in zip(
      utils.parse(flags.cnn1_strides),
      utils.parse(flags.cnn1_filters),
      utils.parse(flags.cnn1_kernel_sizes)):
    net = utils.conv2d_bn(
        net, filters, (kernel_size, 1), padding='valid', scale=flags.bn_scale)
    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)

  for stride, filters1, filters2, kernel_size in zip(
      utils.parse(flags.cnn2_strides), utils.parse(flags.cnn2_filters1),
      utils.parse(flags.cnn2_filters2), utils.parse(flags.cnn2_kernel_sizes)):

    branch1 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)

    branch2 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
    branch2 = utils.conv2d_bn(
        branch2, filters1, (kernel_size, 1), scale=flags.bn_scale)

    branch3 = utils.conv2d_bn(net, filters1, (1, 1), scale=flags.bn_scale)
    branch3 = utils.conv2d_bn(
        branch3, filters1, (kernel_size, 1), scale=flags.bn_scale)
    branch3 = utils.conv2d_bn(
        branch3, filters1, (kernel_size, 1), scale=flags.bn_scale)

    net = tf.keras.layers.concatenate([branch1, branch2, branch3])
    # [batch, time, 1, filters*4]
    net = utils.conv2d_bn(net, filters2, (1, 1), scale=flags.bn_scale)
    # [batch, time, 1, filters2]

    if stride > 1:
      net = tf.keras.layers.MaxPooling2D((3, 1), strides=(stride, 1))(net)

  net = tf.keras.layers.GlobalAveragePooling2D()(net)
  # [batch, filters*4]
  net = tf.keras.layers.Dropout(flags.dropout)(net)
  net = tf.keras.layers.Dense(flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #17
def model(flags):
    """Mobilenet model.

  It is based on paper:
  MobileNets: Efficient Convolutional Neural Networks for
     Mobile Vision Applications https://arxiv.org/abs/1704.04861
  It is applied to a sequence in time, so only 1D filters are applied
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # [batch, time, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)
    # [batch, time, feature, 1]

    # it is a convolutional block
    net = tf.keras.layers.Conv2D(filters=flags.cnn1_filters,
                                 kernel_size=utils.parse(
                                     flags.cnn1_kernel_size),
                                 padding='valid',
                                 use_bias=False,
                                 strides=utils.parse(flags.cnn1_strides))(net)
    net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
    net = tf.keras.layers.ReLU(6.)(net)
    # [batch, time, feature, filters]

    for kernel_size, strides, filters in zip(utils.parse(flags.ds_kernel_size),
                                             utils.parse(flags.ds_strides),
                                             utils.parse(flags.cnn_filters)):
        # it is a depthwise convolutional block
        net = tf.keras.layers.DepthwiseConv2D(
            kernel_size,
            padding='same' if strides == (1, 1) else 'valid',
            depth_multiplier=1,
            strides=strides,
            use_bias=False)(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.ReLU(6., )(net)

        net = tf.keras.layers.Conv2D(filters=filters,
                                     kernel_size=(1, 1),
                                     padding='same',
                                     use_bias=False,
                                     strides=(1, 1))(net)
        net = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(net)
        net = tf.keras.layers.ReLU(6.)(net)
        # [batch, time, feature, filters]

    net = tf.keras.layers.GlobalAveragePooling2D()(net)
    # [batch, filters]
    net = tf.keras.layers.Dropout(flags.dropout)(net)
    net = tf.keras.layers.Dense(flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    # [batch, label_count]
    return tf.keras.Model(input_audio, net)
Example #18
def model(flags):
    """BC-ResNet model.

  It is based on paper
  Broadcasted Residual Learning for Efficient Keyword Spotting
  https://arxiv.org/pdf/2106.04140.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any input list has a different length than the others,
    or if padding is not supported
  """

    dropouts = utils.parse(flags.dropouts)
    filters = utils.parse(flags.filters)
    blocks_n = utils.parse(flags.blocks_n)
    strides = utils.parse(flags.strides)
    dilations = utils.parse(flags.dilations)

    for l in (dropouts, filters, strides, dilations):
        if len(blocks_n) != len(l):
            raise ValueError('all input lists have to be the same length '
                             'but got %s and %s' % (blocks_n, l))

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # make it [batch, time, feature, 1]
    net = tf.keras.backend.expand_dims(net, axis=3)

    if flags.paddings == 'same':
        net = tf.keras.layers.Conv2D(filters=flags.first_filters,
                                     kernel_size=5,
                                     strides=(1, 2),
                                     padding='same')(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.Conv2D(
            filters=flags.first_filters,
            kernel_size=5,
            strides=(1, 2),
            padding='valid'),
                            use_one_step=True,
                            pad_time_dim=flags.paddings,
                            pad_freq_dim='same')(net)

    for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters,
                                                       dilations, strides,
                                                       dropouts):
        net = TransitionBlock(n_filters,
                              dilation,
                              stride,
                              flags.paddings,
                              dropout,
                              sub_groups=flags.sub_groups)(net)
        for _ in range(n):
            net = NormalBlock(n_filters,
                              dilation,
                              1,
                              flags.paddings,
                              dropout,
                              sub_groups=flags.sub_groups)(net)

    if flags.paddings == 'same':
        net = tf.keras.layers.DepthwiseConv2D(kernel_size=5,
                                              padding='same')(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=5, padding='valid'),
                            use_one_step=True,
                            pad_time_dim=flags.paddings,
                            pad_freq_dim='same')(net)

    # average out frequency dim
    net = tf.keras.backend.mean(net, axis=2, keepdims=True)

    net = tf.keras.layers.Conv2D(filters=flags.last_filters,
                                 kernel_size=1,
                                 use_bias=False)(net)

    # average out time dim
    if flags.paddings == 'same':
        net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D(
            keepdims=True))(net)

    net = tf.keras.layers.Conv2D(filters=flags.label_count,
                                 kernel_size=1,
                                 use_bias=False)(net)
    # dims 1 and 2 are equal to 1, so squeeze them out
    net = tf.squeeze(net, [1, 2])

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
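
The Transition/Normal blocks used above implement broadcasted residual learning: features are averaged over the frequency axis, processed by a temporal branch, and broadcast-added back onto the full 2D feature map. A minimal sketch of that pattern with plain tf.keras; shapes and filter counts are illustrative, and the real blocks additionally use dilation, dropout, sub_groups-based normalization and the streaming wrappers:

import tensorflow as tf

def broadcasted_residual(net, filters):
    # identity path keeps the full [batch, time, freq, filters] map
    identity = net
    # temporal branch: average over frequency, then convolve along time only
    branch = tf.keras.backend.mean(net, axis=2, keepdims=True)  # [batch, time, 1, filters]
    branch = tf.keras.layers.DepthwiseConv2D(kernel_size=(3, 1), padding='same')(branch)
    branch = tf.keras.layers.Conv2D(filters=filters, kernel_size=1, use_bias=False)(branch)
    branch = tf.keras.layers.BatchNormalization()(branch)
    # broadcast the [batch, time, 1, filters] branch back over the frequency dim
    return tf.keras.layers.ReLU()(identity + branch)

inputs = tf.keras.layers.Input(shape=(49, 20, 8))
outputs = broadcasted_residual(inputs, filters=8)
demo = tf.keras.Model(inputs, outputs)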
Example #19
def model(flags):
    """Temporal Convolution ResNet model.

  It is based on the paper:
  Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
  https://arxiv.org/pdf/1904.03814.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    time_size, feature_size = net.shape[1:3]

    channels = utils.parse(flags.channels)

    net = tf.keras.backend.expand_dims(net)

    if flags.debug_2d:
        conv_kernel = first_conv_kernel = (3, 3)
    else:
        net = tf.reshape(
            net, [-1, time_size, 1, feature_size])  # [batch, time, 1, feature]
        first_conv_kernel = (3, 1)
        conv_kernel = utils.parse(flags.kernel_size)

    net = tf.keras.layers.Conv2D(filters=channels[0],
                                 kernel_size=first_conv_kernel,
                                 strides=1,
                                 padding='same',
                                 activation='linear')(net)
    net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                             center=flags.bn_center,
                                             scale=flags.bn_scale,
                                             renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation('relu')(net)

    if utils.parse(flags.pool_size):
        net = tf.keras.layers.AveragePooling2D(pool_size=utils.parse(
            flags.pool_size),
                                               strides=flags.pool_stride)(net)

    channels = channels[1:]

    # residual blocks
    for n in channels:
        if n != net.shape[-1]:
            stride = 2
            layer_in = tf.keras.layers.Conv2D(filters=n,
                                              kernel_size=1,
                                              strides=stride,
                                              padding='same',
                                              activation='linear')(net)
            layer_in = tf.keras.layers.BatchNormalization(
                momentum=flags.bn_momentum,
                center=flags.bn_center,
                scale=flags.bn_scale,
                renorm=flags.bn_renorm)(layer_in)
            layer_in = tf.keras.layers.Activation('relu')(layer_in)
        else:
            layer_in = net
            stride = 1

        net = tf.keras.layers.Conv2D(filters=n,
                                     kernel_size=conv_kernel,
                                     strides=stride,
                                     padding='same',
                                     activation='linear')(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation('relu')(net)

        net = tf.keras.layers.Conv2D(filters=n,
                                     kernel_size=conv_kernel,
                                     strides=1,
                                     padding='same',
                                     activation='linear')(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)

        # residual connection
        net = tf.keras.layers.Add()([net, layer_in])
        net = tf.keras.layers.Activation('relu')(net)

    net = tf.keras.layers.AveragePooling2D(pool_size=net.shape[1:3],
                                           strides=1)(net)

    net = tf.keras.layers.Dropout(rate=flags.dropout)(net)

    # fully connected layer
    net = tf.keras.layers.Conv2D(filters=flags.label_count,
                                 kernel_size=1,
                                 strides=1,
                                 padding='same',
                                 activation='linear')(net)

    net = tf.reshape(net, shape=(-1, net.shape[3]))
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
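
The defining step in this model is the reshape before the first convolution: the feature axis is folded into the channel dimension, so 2D convolutions with a (3, 1) kernel slide along time only. A minimal sketch of that temporal-convolution trick with plain tf.keras; the 49x40 input shape and 16 filters are illustrative placeholders:

import tensorflow as tf

inputs = tf.keras.layers.Input(shape=(49, 40))  # [batch, time, feature]
net = tf.keras.backend.expand_dims(inputs)      # [batch, time, feature, 1]
net = tf.reshape(net, [-1, 49, 1, 40])          # [batch, time, 1, feature]
# a (3, 1) kernel convolves along time only; features act as input channels
net = tf.keras.layers.Conv2D(filters=16,
                             kernel_size=(3, 1),
                             strides=1,
                             padding='same')(net)
demo = tf.keras.Model(inputs, net)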
Example #20
def model(flags):
    """Convolutional recurrent neural network (CRNN) model.

  It is based on the paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  It is represented as a sequence of Conv, RNN/GRU, and FC layers.
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # expand dims for the next 2D conv layer
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
            utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
            utils.parse(flags.cnn_strides)):
        net = stream.Stream(
            cell=tf.keras.layers.Conv2D(filters=filters,
                                        kernel_size=kernel_size,
                                        activation=activation,
                                        dilation_rate=dilation_rate,
                                        strides=strides))(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    for units, return_sequences in zip(utils.parse(flags.gru_units),
                                       utils.parse(flags.return_sequences)):
        net = gru.GRU(units=units,
                      return_sequences=return_sequences,
                      stateful=flags.stateful)(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units1),
                                 utils.parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
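
The hand-off between the convolutional front end and the recurrent layers above happens in the Reshape that folds frequency and channels into a single vector per time step. A minimal sketch of that Conv -> Reshape -> GRU pipeline with plain tf.keras; shapes, filter counts and unit counts are illustrative, and the streaming-aware stream.Stream and gru.GRU wrappers are replaced with stock Keras layers:

import tensorflow as tf

inputs = tf.keras.layers.Input(shape=(49, 40))  # [batch, time, feature]
net = tf.keras.backend.expand_dims(inputs)      # [batch, time, feature, 1]
net = tf.keras.layers.Conv2D(filters=16,
                             kernel_size=(3, 3),
                             padding='same',
                             activation='relu')(net)
shape = net.shape  # [batch, time, feature, channels]
# fold frequency and channels into one vector per time step for the GRU
net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)
net = tf.keras.layers.GRU(units=64, return_sequences=False)(net)
net = tf.keras.layers.Dense(12, activation='softmax')(net)
demo = tf.keras.Model(inputs, net)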