Example #1
def model(flags):
    """Convolutional recurrent neural network (CRNN) model.

  It is based on the paper:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf
  The model is represented as a sequence of Conv, RNN/GRU, and FC layers.
  The model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # expand dims for the next layer 2d conv
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = stream.Stream(
            cell=tf.keras.layers.Conv2D(filters=filters,
                                        kernel_size=kernel_size,
                                        activation=activation,
                                        dilation_rate=dilation_rate,
                                        strides=strides))(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    for units, return_sequences in zip(parse(flags.gru_units),
                                       parse(flags.return_sequences)):
        net = gru.GRU(units=units,
                      return_sequences=return_sequences,
                      stateful=flags.stateful)(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
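The parse helper used throughout these examples (sometimes as utils.parse)
turns a comma-separated flag string into a Python list. A minimal stand-in
with the assumed behavior, not the library's exact implementation:

import ast

def parse(text):
    # Evaluate a comma-separated hyperparameter string into a list,
    # e.g. '16,32' -> [16, 32] and '(3,3),(5,3)' -> [(3, 3), (5, 3)].
    if not text:
        return []
    res = ast.literal_eval(text)
    return list(res) if isinstance(res, tuple) else [res]

assert parse('16,32') == [16, 32]
assert parse('(3,3),(5,3)') == [(3, 3), (5, 3)]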
Example #2
def model(flags):
    """CNN model.

  It is based on the paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  The model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
            utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
            utils.parse(flags.cnn_strides)):
        net = stream.Stream(
            cell=tf.keras.layers.Conv2D(filters=filters,
                                        kernel_size=kernel_size,
                                        activation=activation,
                                        dilation_rate=dilation_rate,
                                        strides=strides))(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units2),
                                 utils.parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #3
    def build(self, input_shape):
        super(Svdf, self).build(input_shape)

        if self.mode == modes.Modes.TRAINING:
            self.dropout1 = non_scaling_dropout.NonScalingDropout(self.dropout)
        else:
            self.dropout1 = tf.keras.layers.Lambda(lambda x, training: x)
        self.dense1 = tf.keras.layers.Dense(units=self.units1,
                                            use_bias=self.use_bias1)
        self.depth_cnn1 = stream.Stream(
            cell=tf.keras.layers.DepthwiseConv2D(kernel_size=(self.memory_size,
                                                              1),
                                                 strides=(1, 1),
                                                 padding='valid',
                                                 dilation_rate=(1, 1),
                                                 use_bias=self.use_bias),
            inference_batch_size=self.inference_batch_size,
            mode=self.mode,
            use_one_step=False,
            pad_time_dim=self.pad)
        if self.units2 > 0:
            self.dense2 = tf.keras.layers.Dense(units=self.units2,
                                                use_bias=True)
        else:
            self.dense2 = tf.keras.layers.Lambda(lambda x, training: x)

        if self.use_batch_norm:
            self.batch_norm = tf.keras.layers.BatchNormalization(
                scale=self.bn_scale)
        else:
            self.batch_norm = tf.keras.layers.Lambda(lambda x, training: x)
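The identity Lambda(lambda x, training: x) above is a drop-in no-op that
keeps the same call signature as the Dropout/Dense/BatchNormalization layers
it stands in for, so call sites can pass training= unconditionally. A quick
sanity check of the pattern (a sketch, not library code):

import tensorflow as tf

noop = tf.keras.layers.Lambda(lambda x, training: x)
x = tf.ones((1, 4))
# Keras forwards training= because the wrapped function declares it;
# the output is the unchanged input.
y = noop(x, training=True)
assert (y.numpy() == 1.0).all()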
Example #4
    def test_strided_conv_alignment(self):
        kernel_size = 4
        strides = 2
        inputs = tf.keras.layers.Input(shape=(None, 1))
        net = inputs
        net = stream.Stream(cell=tf.keras.layers.Conv1D(
            filters=1,
            kernel_size=kernel_size,
            strides=strides,
            padding='valid',
            kernel_initializer='ones'),
                            use_one_step=False,
                            pad_time_dim='causal')(net)
        model = tf.keras.Model(inputs=inputs, outputs=net)

        input_signal = np.arange(1, 5)  # [1, 2, 3, 4]
        # Sanity check for the test itself: We only care about the case when input
        # length is a multiple of strides. If not, streaming is not meaningful.
        assert len(input_signal) % strides == 0
        input_signal = input_signal[None, :, None]
        output_signal = model.predict(input_signal)
        outputs = output_signal[0, :, 0]

        # Make sure causal conv is right-aligned, so that the most recent samples
        # are never ignored. Thus we want:
        #           1  2  3  4
        # -> [0  0] 1  2  3  4  (padding)
        # ->           3    10  (conv with kernel of ones: 3=0+0+1+2, 10=1+2+3+4)
        # Note that this is different from tf.keras.layers.Conv1D(..., 'causal'),
        # which will pad 3 zeroes on the left and produce [1(=0+0+0+1), 6(=0+1+2+3)]
        # instead. The latter is less ideal, since it pads an extra zero and ignores
        # the last (and hence most recent) valid sample "4".
        self.assertAllEqual(outputs, [3, 10])
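For comparison, the built-in causal Conv1D behavior described in the comment
above can be reproduced directly; this sketch only confirms the [1, 6] result
quoted there:

import numpy as np
import tensorflow as tf

# Built-in causal padding adds kernel_size - 1 = 3 zeros on the left:
# [0 0 0] 1 2 3 4 -> stride-2 windows (0,0,0,1) and (0,1,2,3) -> [1, 6].
conv = tf.keras.layers.Conv1D(
    filters=1, kernel_size=4, strides=2, padding='causal',
    kernel_initializer='ones')
out = conv(np.arange(1., 5.)[None, :, None].astype(np.float32))
print(out.numpy()[0, :, 0])  # [1. 6.]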
Example #5
def conv_model_no_stream_wrapper(flags, conv_cell, cnn_filters, cnn_kernel_size,
                                 cnn_act, cnn_dilation_rate, cnn_strides,
                                 cnn_use_bias):
  """Toy example of convolutional model.

  It has the same model topology as conv_model() above, but without
  wrapping the conv cell in a Stream layer, so all streaming parameters
  are set manually.
  Args:
      flags: model and data settings
      conv_cell: cell for streaming, for example: tf.keras.layers.Conv1D
      cnn_filters: list of filters in conv layer
      cnn_kernel_size: list of kernel_size in conv layer
      cnn_act: list of activation functions in conv layer
      cnn_dilation_rate: list of dilation_rate in conv layer
      cnn_strides: list of strides in conv layer
      cnn_use_bias: list of use_bias in conv layer
  Returns:
    Keras model
  """

  if not all(
      len(cnn_filters) == len(l) for l in [
          cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate, cnn_strides,
          cnn_use_bias
      ]):
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio

  net = tf.keras.backend.expand_dims(net)

  for filters, kernel_size, activation, dilation_rate, strides, use_bias in zip(
      cnn_filters, cnn_kernel_size,
      cnn_act, cnn_dilation_rate,
      cnn_strides, cnn_use_bias):

    ring_buffer_size_in_time_dim = dilation_rate * (kernel_size - 1)
    net = stream.Stream(
        cell=tf.identity,
        ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim,
        use_one_step=False,
        pad_time_dim=None)(net)

    padding_size = ring_buffer_size_in_time_dim
    net = temporal_padding.TemporalPadding(
        padding='causal', padding_size=padding_size)(
            net)

    net = conv_cell(
        filters=filters,
        kernel_size=kernel_size,
        activation=activation,
        dilation_rate=dilation_rate,
        strides=strides,
        use_bias=use_bias,
        padding='valid')(net)  # padding has to be valid!

  return tf.keras.Model(input_audio, net)
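The ring buffer sizing above follows directly from the convolution's
receptive field; the arithmetic, worked for arbitrary illustrative values:

kernel_size, dilation_rate = 3, 2

# A 'valid' conv needs dilation_rate * (kernel_size - 1) past frames in
# addition to the current one, so that is the ring buffer size ...
ring_buffer_size_in_time_dim = dilation_rate * (kernel_size - 1)  # 4

# ... and the same amount of left (causal) padding keeps the output length
# equal to the input length.
padding_size = ring_buffer_size_in_time_dim  # 4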
Example #6
  def __init__(self,
               filters=8,
               dilation=1,
               stride=1,
               padding='same',
               dropout=0.5,
               use_one_step=True,
               sub_groups=5,
               **kwargs):
    super(TransitionBlock, self).__init__(**kwargs)
    self.filters = filters
    self.dilation = dilation
    self.stride = stride
    self.padding = padding
    self.dropout = dropout
    self.use_one_step = use_one_step
    self.sub_groups = sub_groups

    self.frequency_dw_conv = tf.keras.layers.DepthwiseConv2D(
        kernel_size=(1, 3),
        strides=self.stride,
        dilation_rate=self.dilation,
        padding='same',
        use_bias=False)
    if self.padding == 'same':
      self.temporal_dw_conv = tf.keras.layers.DepthwiseConv2D(
          kernel_size=(3, 1),
          strides=self.stride,
          dilation_rate=self.dilation,
          padding='same',
          use_bias=False)
    else:
      self.temporal_dw_conv = stream.Stream(
          cell=tf.keras.layers.DepthwiseConv2D(
              kernel_size=(3, 1),
              strides=self.stride,
              dilation_rate=self.dilation,
              padding='valid',
              use_bias=False),
          use_one_step=use_one_step,
          pad_time_dim=self.padding,
          pad_freq_dim='same')
    self.batch_norm1 = tf.keras.layers.BatchNormalization()
    self.batch_norm2 = tf.keras.layers.BatchNormalization()
    self.conv1x1_1 = tf.keras.layers.Conv2D(
        filters=self.filters,
        kernel_size=1,
        strides=1,
        padding='valid',
        use_bias=False)
    self.conv1x1_2 = tf.keras.layers.Conv2D(
        filters=self.filters,
        kernel_size=1,
        strides=1,
        padding='valid',
        use_bias=False)
    self.spatial_drop = tf.keras.layers.SpatialDropout2D(rate=self.dropout)
    self.spectral_norm = sub_spectral_normalization.SubSpectralNormalization(
        self.sub_groups)
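A hypothetical instantiation on a [batch, time, frequency, channels] feature
map (only __init__ is shown above, so this assumes the block's call method
applies the layers constructed here):

import tensorflow as tf

inputs = tf.keras.layers.Input(shape=(98, 20, 8), batch_size=1)
# A non-'same' padding such as 'causal' routes the temporal depthwise conv
# through the streamable Stream-wrapped branch of __init__.
net = TransitionBlock(
    filters=8, dilation=1, stride=1, padding='causal')(inputs)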
Example #7
def model(flags):
    """Temporal Convolution ResNet model.

  It can be configured to reproduce the model configuration described in:
  Temporal Convolution for Real-time Keyword Spotting on Mobile Devices
  https://arxiv.org/pdf/1904.03814.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    tc_filters = parse(flags.tc_filters)
    repeat_tc_convs = parse(flags.repeat_tc_convs)
    kernel_sizes = parse(flags.kernel_sizes)
    pool_sizes = parse(flags.pool_sizes)
    dilations = parse(flags.dilations)
    residuals = parse(flags.residuals)

    if len(
            set((len(repeat_tc_convs), len(kernel_sizes), len(pool_sizes),
                 len(dilations), len(residuals), len(tc_filters)))) != 1:
        raise ValueError('all input lists have to be the same length')

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # make it [batch, time, 1, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)

    for filters, repeat, kernel_size, pool_size, dilation, residual in zip(
            tc_filters, repeat_tc_convs, kernel_sizes, pool_sizes, dilations,
            residuals):
        net = resnet_block(net, repeat, kernel_size, filters, dilation,
                           residual, flags.padding_in_time, flags.dropout,
                           flags.activation)

        if pool_size > 1:
            net = tf.keras.layers.MaxPooling2D((pool_size, 1))(net)

    net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #8
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper:
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model, the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          utils.parse(flags.svdf_units1), utils.parse(flags.svdf_memory_size),
          utils.parse(flags.svdf_units2), utils.parse(flags.svdf_dropout),
          utils.parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=padding,
        name='svdf_%d' % i)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
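Hypothetical flag values for a three-layer SVDF stack, assuming utils.parse
reads comma-separated strings (flag names are from the code above; the
values are illustrative only):

flags.svdf_units1 = '256,256,256'      # projection units per layer
flags.svdf_memory_size = '4,10,10'     # time-filter length per layer
flags.svdf_units2 = '128,128,128'      # output units per layer
flags.svdf_dropout = '0.0,0.0,0.0'
flags.svdf_act = "'relu','relu','relu'"
flags.svdf_pad = 1                     # -> padding = 'causal' above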
Example #9
def model(flags):
  """Fully connected layer based model.

  It is based on the paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  The model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model, the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-LSTM, etc.
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
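The expand_dims / MaxPool1D / squeeze trick above can be checked in
isolation; a minimal sketch:

import numpy as np
import tensorflow as tf

x = tf.constant(np.arange(8., dtype=np.float32)[None, :])  # [1, 8]
x = tf.keras.backend.expand_dims(x, axis=-1)               # [1, 8, 1]
# MaxPool1D pools over the middle axis, so the fake trailing channel dim
# lets it pool over the flattened feature vector.
x = tf.keras.layers.MaxPool1D(pool_size=2, strides=2)(x)   # [1, 4, 1]
x = tf.keras.backend.squeeze(x, axis=-1)                   # [1, 4]
print(x.numpy())  # [[1. 3. 5. 7.]]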
Example #10
def conv_model(flags, conv_cell, cnn_filters, cnn_kernel_size, cnn_act,
               cnn_dilation_rate, cnn_strides, cnn_use_bias, **kwargs):
  """Toy example of convolutional model with Stream wrapper.

  It can be used for speech enhancement.
  Args:
      flags: model and data settings
      conv_cell: cell for streaming, for example: tf.keras.layers.Conv1D
      cnn_filters: list of filters in conv layer
      cnn_kernel_size: list of kernel_size in conv layer
      cnn_act: list of activation functions in conv layer
      cnn_dilation_rate: list of dilation_rate in conv layer
      cnn_strides: list of strides in conv layer
      cnn_use_bias: list of use_bias in conv layer
      **kwargs: Additional kwargs passed on to conv_cell.
  Returns:
    Keras model

  Raises:
    ValueError: if any input list has a different length than the others
  """

  if not all(
      len(cnn_filters) == len(l) for l in [
          cnn_filters, cnn_kernel_size, cnn_act, cnn_dilation_rate, cnn_strides,
          cnn_use_bias
      ]):
    raise ValueError('all input lists have to be the same length')

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio

  net = tf.keras.backend.expand_dims(net)

  for (filters, kernel_size, activation, dilation_rate, strides,
       use_bias) in zip(cnn_filters, cnn_kernel_size, cnn_act,
                        cnn_dilation_rate, cnn_strides, cnn_use_bias):

    net = stream.Stream(
        cell=conv_cell(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides,
            use_bias=use_bias,
            padding='valid',
            **kwargs),
        use_one_step=False,
        pad_time_dim='causal')(net)

  return tf.keras.Model(input_audio, net)
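A hypothetical invocation (the only flag attributes conv_model reads are
desired_samples and batch_size; the list values are illustrative and must
all have equal length, or the ValueError above fires):

import tensorflow as tf
from types import SimpleNamespace

flags = SimpleNamespace(desired_samples=16000, batch_size=1)
toy = conv_model(
    flags,
    conv_cell=tf.keras.layers.Conv1D,
    cnn_filters=[4, 4],
    cnn_kernel_size=[3, 3],
    cnn_act=['relu', 'relu'],
    cnn_dilation_rate=[1, 2],
    cnn_strides=[1, 1],
    cnn_use_bias=[True, True])
toy.summary()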
Example #11
def model(flags):
  """Fully connected layer based model on raw wav data.

  It is based on the paper (with added pooling and raw audio data):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  if flags.preprocess != 'raw':
    raise ValueError('input audio has to be raw, but got %s' % flags.preprocess)

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = data_frame.DataFrame(
      frame_size=flags.window_size_samples,
      frame_step=flags.window_stride_samples)(
          input_audio)

  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-LSTM, etc.
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(
            net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #12
def model(flags):
  """LSTM model.

  A similar model appears in the paper:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (but with no conv layer)
  The model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model, the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  for units, return_sequences, num_proj in zip(
      utils.parse(flags.lstm_units), utils.parse(flags.return_sequences),
      utils.parse(flags.num_proj)):
    net = lstm.LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(
            net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #13
  def test_average_pooling_stream(self):

    # prepare input data
    params = test_utils.Params([1])
    params.desired_samples = 5

    batch_size = 1
    time1 = params.desired_samples  # it is time dim (will not be averaged out)
    time2 = 3  # this dim will be averaged out and become 1
    feature = 16  # it is a feature dim

    # override data shape for streaming mode testing
    params.preprocess = 'custom'
    params.data_shape = (1, time2, feature)

    inp_audio = np.random.rand(batch_size, time1, time2, feature)
    inputs = tf.keras.layers.Input(
        shape=(time1, time2, feature), batch_size=batch_size)

    net = stream.Stream(
        cell=average_pooling2d.AveragePooling2D(
            kernel_size=(time1, time2),
            padding='valid'),
        use_one_step=False,
        pad_time_dim='causal')(inputs)

    model = tf.keras.Model(inputs, net)
    model.summary()

    # prepare streaming model
    model_stream = utils.to_streaming_inference(
        model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
    model_stream.summary()

    # run inference and compare streaming vs non streaming
    non_stream_out = model.predict(inp_audio)
    stream_out = test.run_stream_inference(params, model_stream, inp_audio)
    self.assertAllClose(stream_out, non_stream_out)

    net = tf.keras.layers.GlobalAveragePooling2D()(inputs)
    model_global = tf.keras.Model(inputs, net)
    model_global.summary()

    global_out = model_global.predict(inp_audio)
    # the last result in the streaming output has to be the same as the global average
    self.assertAllClose(stream_out[0, -1, 0, :], global_out[0, :])
Example #14
    def test_padding(self, padding):
        batch_size = 1
        time_dim = 3
        feature_dim = 3
        kernel_size = 3
        inputs = tf.keras.layers.Input(shape=(time_dim, feature_dim),
                                       batch_size=batch_size)

        # set it in training mode (in streaming mode padding is not applied)
        net = stream.Stream(mode=modes.Modes.TRAINING,
                            cell=tf.keras.layers.Lambda(lambda x: x),
                            ring_buffer_size_in_time_dim=kernel_size,
                            pad_time_dim=padding)(inputs)
        model = tf.keras.Model(inputs, net)

        test_utils.set_seed(1)
        input_signal = np.random.rand(batch_size, time_dim, feature_dim)
        outputs = model.predict(input_signal)
        self.assertAllEqual(
            outputs.shape,
            [batch_size, time_dim + kernel_size - 1, feature_dim])
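The expected shape in the assert comes from the Stream layer's training-time
padding; the arithmetic, assuming the layer pads
ring_buffer_size_in_time_dim - 1 frames in the time dim:

time_dim, kernel_size = 3, 3
pad_amount = kernel_size - 1           # 2 frames added in time
out_time = time_dim + kernel_size - 1  # 3 + 3 - 1 = 5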
Example #15
def model(flags):
  """SVDF model with residual connections.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper:
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  In addition, we added residual connections.
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # it is a self-contained model, the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  blocks_pool = utils.parse(flags.blocks_pool)
  if len(blocks_pool) != 3:
    raise ValueError('number of pooling blocks has to be 3, but got: ',
                     len(blocks_pool))

  # for streaming mode it is better to use causal padding
  padding = 'causal' if flags.svdf_pad else 'valid'

  # first residual block
  number_of_blocks = len(utils.parse(flags.block1_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block1_units1),
          utils.parse(flags.block1_memory_size), activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_1_%d' % i)(
            net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block1_units1)[-1]

  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  # [batch, time, feature]
  net = tf.keras.layers.Activation(flags.activation)(net)
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[0], strides=blocks_pool[0], padding='valid')(
          net)

  # second residual block
  number_of_blocks = len(utils.parse(flags.block2_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block2_units1),
          utils.parse(flags.block2_memory_size), activations)):
    # [batch, time, feature]
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_2_%d' % i)(
            net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block2_units1)[-1]

  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  # [batch, time, feature]
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[1], strides=blocks_pool[1], padding='valid')(
          net)

  # third residual block
  number_of_blocks = len(utils.parse(flags.block3_units1))
  activations = [flags.activation] * number_of_blocks
  activations[-1] = 'linear'  # last layer is linear
  residual = net
  for i, (units1, memory_size, activation) in enumerate(
      zip(
          utils.parse(flags.block3_units1),
          utils.parse(flags.block3_memory_size), activations)):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=-1,
        dropout=flags.svdf_dropout,
        activation=activation,
        pad=padding,
        use_bias=flags.svdf_use_bias,
        use_batch_norm=flags.use_batch_norm,
        bn_scale=flags.bn_scale,
        name='svdf_3_%d' % i)(
            net)

  # number of channels in the last layer
  units1_last = utils.parse(flags.block3_units1)[-1]

  # equivalent to 1x1 convolution
  residual = tf.keras.layers.Dense(units1_last, use_bias=False)(residual)
  residual = tf.keras.layers.BatchNormalization(scale=flags.bn_scale)(residual)

  # residual connection
  net = tf.keras.layers.Add()([net, residual])
  net = tf.keras.layers.Activation(flags.activation)(net)
  net = tf.keras.layers.MaxPool1D(
      blocks_pool[2], strides=blocks_pool[2], padding='valid')(
          net)
  # [batch, time, feature]

  # convert all feature to one vector
  if flags.flatten:
    net = stream.Stream(use_one_step=False, cell=tf.keras.layers.Flatten())(net)
  else:
    net = tf.keras.backend.expand_dims(net, axis=2)
    net = stream.Stream(
        use_one_step=False,
        cell=tf.keras.layers.AveragePooling2D(
            pool_size=(int(net.shape[1]), int(net.shape[2]))))(
                net)

  net = tf.keras.layers.Flatten()(net)

  # [batch, feature]
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units in utils.parse(flags.units2):
    net = tf.keras.layers.Dense(units=units, activation=flags.activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #16
def model(flags):
    """BC-ResNet model.

  It is based on the paper:
  Broadcasted Residual Learning for Efficient Keyword Spotting
  https://arxiv.org/pdf/2106.04140.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any input list has a different length than the others,
    or if the padding mode is not supported
  """

    dropouts = utils.parse(flags.dropouts)
    filters = utils.parse(flags.filters)
    blocks_n = utils.parse(flags.blocks_n)
    strides = utils.parse(flags.strides)
    dilations = utils.parse(flags.dilations)

    for l in (dropouts, filters, strides, dilations):
        if len(blocks_n) != len(l):
            raise ValueError('all input lists have to be the same length '
                             'but got %s and %s ' % (blocks_n, l))

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # make it [batch, time, feature, 1]
    net = tf.keras.backend.expand_dims(net, axis=3)

    if flags.paddings == 'same':
        net = tf.keras.layers.Conv2D(filters=flags.first_filters,
                                     kernel_size=5,
                                     strides=(1, 2),
                                     padding='same')(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.Conv2D(
            filters=flags.first_filters,
            kernel_size=5,
            strides=(1, 2),
            padding='valid'),
                            use_one_step=True,
                            pad_time_dim=flags.paddings,
                            pad_freq_dim='same')(net)

    for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters,
                                                       dilations, strides,
                                                       dropouts):
        net = TransitionBlock(n_filters,
                              dilation,
                              stride,
                              flags.paddings,
                              dropout,
                              sub_groups=flags.sub_groups)(net)
        for _ in range(n):
            net = NormalBlock(n_filters,
                              dilation,
                              1,
                              flags.paddings,
                              dropout,
                              sub_groups=flags.sub_groups)(net)

    if flags.paddings == 'same':
        net = tf.keras.layers.DepthwiseConv2D(kernel_size=5,
                                              padding='same')(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=5, padding='valid'),
                            use_one_step=True,
                            pad_time_dim=flags.paddings,
                            pad_freq_dim='same')(net)

    # average out frequency dim
    net = tf.keras.backend.mean(net, axis=2, keepdims=True)

    net = tf.keras.layers.Conv2D(filters=flags.last_filters,
                                 kernel_size=1,
                                 use_bias=False)(net)

    # average out time dim
    if flags.paddings == 'same':
        net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D(
            keepdims=True))(net)

    net = tf.keras.layers.Conv2D(filters=flags.label_count,
                                 kernel_size=1,
                                 use_bias=False)(net)
    # dims 1 and 2 are now of size 1
    net = tf.squeeze(net, [1, 2])

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
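Hypothetical BC-ResNet list flags matching the per-stage loop above (one
entry per stage; values are illustrative, loosely following the paper's
four-stage layout, and must all have the same length):

flags.blocks_n = '2,2,4,4'
flags.filters = '8,12,16,20'
flags.dilations = '(1,1),(2,1),(4,1),(8,1)'
flags.strides = '(1,1),(1,2),(1,2),(1,1)'
flags.dropouts = '0.1,0.1,0.1,0.1'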
Example #17
    def test_streaming_with_effective_tdim(self):
        time_size = 10
        feature_size = 3
        batch_size = 1

        time_dim = 1  # index of time dimensions
        ring_buffer_size_in_time_dim = 3  # effective size of aperture in time dim

        inputs = tf.keras.layers.Input(shape=(time_size, feature_size),
                                       batch_size=batch_size,
                                       name='inp_sequence')

        mode = modes.Modes.TRAINING

        # in streaming mode it will create a
        # ring buffer with time dim size ring_buffer_size_in_time_dim
        outputs = stream.Stream(
            cell=Sum(time_dim=time_dim),
            mode=mode,
            ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim)(inputs)
        model_train = tf.keras.Model(inputs, outputs)
        model_train.summary()

        mode = modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
        input_tensors = [
            tf.keras.layers.Input(
                shape=(
                    1,  # time dim is size 1 in streaming mode
                    feature_size,
                ),
                batch_size=batch_size,
                name='inp_stream')
        ]
        # convert non streaming model to streaming one
        model_stream = utils.convert_to_inference_model(
            model_train, input_tensors, mode)
        model_stream.summary()

        # the second input to the streaming model is its state, so we can use its shape
        input_state_np = np.zeros(model_stream.inputs[1].shape,
                                  dtype=np.float32)

        # input test data
        non_stream_input = np.random.randint(1,
                                             10,
                                             size=(batch_size, time_size,
                                                   feature_size))

        # run streaming inference
        # iterate over time dim sample by sample
        for i in range(input_state_np.shape[1]):
            input_stream_np = np.expand_dims(non_stream_input[0][i], 0)
            input_stream_np = np.expand_dims(input_stream_np, 1)
            input_stream_np = input_stream_np.astype(np.float32)
            output_stream_np, output_state_np = model_stream.predict(
                [input_stream_np, input_state_np])
            input_state_np = output_state_np  # update input state

            # emulate sliding window summation
            target = np.sum(
                non_stream_input[:,
                                 max(0, i - ring_buffer_size_in_time_dim):i +
                                 1],
                axis=time_dim)
            self.assertAllEqual(target, output_stream_np)

        # validate name tag of model's state
        expected_str = 'ExternalState'
        self.assertAllEqual(
            expected_str,
            model_stream.inputs[1].name.split('/')[-1][:len(expected_str)])
Example #18
def resnet_block(inputs, repeat_tc_conv, kernel_size, filters, dilation,
                 residual, padding_in_time, dropout, activation):
    """TC(time conv) Residual block.

  Args:
    inputs: input tensor
    repeat_tc_conv: number of repeating Conv1D in time
    kernel_size: kernel size of Conv1D in time dim
    filters: number of filters in Conv1D in time and 1x1 conv
    dilation: dilation in time dim for Conv1D
    residual: if True, a residual connection is added
    padding_in_time: can be 'same' or 'causal'
    dropout: dropout value
    activation: type of activation function (string)

  Returns:
    output tensor

  Raises:
    ValueError: if padding has invalid value
  """
    if residual and (padding_in_time not in ('same', 'causal')):
        raise ValueError('padding should be same or causal')

    net = inputs
    if residual:
        # 1x1 conv
        layer_res = tf.keras.layers.Conv2D(filters=filters,
                                           kernel_size=1,
                                           activation='linear')(net)
        layer_res = tf.keras.layers.BatchNormalization()(layer_res)

    for _ in range(repeat_tc_conv - 1):
        # 1D conv in time
        net = stream.Stream(cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=(kernel_size, 1),
            dilation_rate=(dilation, 1),
            padding='valid',
            activation='linear'),
                            pad_time_dim=padding_in_time)(net)
        net = tf.keras.layers.BatchNormalization()(net)
        net = tf.keras.layers.Activation(activation)(net)

    # 1D conv in time
    net = stream.Stream(cell=tf.keras.layers.Conv2D(filters=filters,
                                                    kernel_size=(kernel_size,
                                                                 1),
                                                    dilation_rate=(dilation,
                                                                   1),
                                                    padding='valid',
                                                    activation='linear'),
                        pad_time_dim=padding_in_time)(net)
    net = tf.keras.layers.BatchNormalization()(net)

    # residual connection
    if residual:
        net = tf.keras.layers.Add()([net, layer_res])

    net = tf.keras.layers.Activation(activation)(net)
    net = tf.keras.layers.Dropout(rate=dropout)(net)
    return net
Example #19
def resnet_block(inputs,
                 repeat,
                 kernel_size,
                 filters,
                 dilation,
                 stride,
                 residual=False,
                 padding='same',
                 dropout=0.0,
                 activation='relu'):
  """Residual block.

  It is based on the paper:
  Jasper: An End-to-End Convolutional Neural Acoustic Model
  https://arxiv.org/pdf/1904.03288.pdf

  Args:
    inputs: input tensor
    repeat: number of repeating DepthwiseConv1D and Conv1D block
    kernel_size: kernel size of DepthwiseConv1D in time dim
    filters: number of filters in DepthwiseConv1D and Conv1D
    dilation: dilation in time dim for DepthwiseConv1D
    stride: stride in time dim for DepthwiseConv1D
    residual: if True, a residual connection is added
    padding: can be 'same' or 'causal'
    dropout: dropout value
    activation: type of activation function (string)

  Returns:
    output tensor

  Raises:
    ValueError: if padding has an invalid value
  """
  if padding not in ('same', 'causal'):
    raise ValueError('padding should be same or causal')

  net = inputs
  for _ in range(repeat-1):
    # DepthwiseConv1D
    net = stream.Stream(
        cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=(kernel_size, 1),
            strides=(stride, 1),
            padding='valid',
            dilation_rate=(dilation, 1),
            use_bias=False),
        pad_time_dim=padding)(
            net)

    # Conv1D 1x1
    net = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters, kernel_size=1, use_bias=False, padding='valid'),
        pad_time_dim=padding)(
            net)

    net = tf.keras.layers.BatchNormalization()(net)
    net = tf.keras.layers.Activation(activation)(net)
    net = tf.keras.layers.Dropout(rate=dropout)(net)

  # DepthwiseConv1D
  net = stream.Stream(
      cell=tf.keras.layers.DepthwiseConv2D(
          kernel_size=(kernel_size, 1),
          strides=(stride, 1),
          padding='valid',
          dilation_rate=(dilation, 1),
          use_bias=False),
      pad_time_dim=padding)(
          net)

  # Conv1D 1x1
  net = stream.Stream(
      cell=tf.keras.layers.Conv2D(
          filters=filters, kernel_size=1, use_bias=False, padding='valid'),
      pad_time_dim=padding)(
          net)
  net = tf.keras.layers.BatchNormalization()(net)

  if residual:
    # Conv1D 1x1
    net_res = stream.Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters, kernel_size=1, use_bias=False, padding='valid'),
        pad_time_dim=padding)(
            inputs)
    net_res = tf.keras.layers.BatchNormalization()(net_res)

    net = tf.keras.layers.Add()([net, net_res])

  net = tf.keras.layers.Activation(activation)(net)
  net = tf.keras.layers.Dropout(rate=dropout)(net)
  return net
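A hypothetical call on spectrogram-like features, where the singleton third
axis makes the 2D convs act as 1D convs in time (shapes illustrative):

import tensorflow as tf

inputs = tf.keras.layers.Input(shape=(98, 1, 64), batch_size=1)
# 'causal' padding keeps the block streamable; residual=True adds the
# 1x1-conv shortcut defined above.
net = resnet_block(inputs, repeat=2, kernel_size=9, filters=64,
                   dilation=1, stride=1, residual=True, padding='causal')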
Example #20
def model(flags):
    """Depthwise convolutional model.

  It is based on the paper:
  MobileNets: Efficient Convolutional Neural Networks for
  Mobile Vision Applications https://arxiv.org/abs/1704.04861
  The model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    net = tf.keras.backend.expand_dims(net)

    net = stream.Stream(cell=tf.keras.layers.Conv2D(
        kernel_size=utils.parse(flags.cnn1_kernel_size),
        dilation_rate=utils.parse(flags.cnn1_dilation_rate),
        filters=flags.cnn1_filters,
        padding=flags.cnn1_padding,
        strides=utils.parse(flags.cnn1_strides)))(net)
    net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                             center=flags.bn_center,
                                             scale=flags.bn_scale,
                                             renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation('relu')(net)

    for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip(
            utils.parse(flags.dw2_kernel_size), utils.parse(flags.dw2_act),
            utils.parse(flags.dw2_dilation_rate),
            utils.parse(flags.dw2_strides), utils.parse(flags.cnn2_filters),
            utils.parse(flags.cnn2_act)):
        net = stream.Stream(
            cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size,
                                                 dilation_rate=dilation_rate,
                                                 padding=flags.dw2_padding,
                                                 strides=strides))(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(dw2_act)(net)
        net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(cnn2_act)(net)

    net = stream.Stream(cell=tf.keras.layers.AveragePooling2D(
        pool_size=(int(net.shape[1]), int(net.shape[2]))))(net)

    net = stream.Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #21
def model(flags):
    """CNN model.

  It is based on the paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
  The model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    if flags.quantize:
        net = quantize_layer.QuantizeLayer(
            AllValuesQuantizer(num_bits=8,
                               per_axis=False,
                               symmetric=False,
                               narrow_range=False))(net)

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            utils.parse(flags.cnn_filters), utils.parse(flags.cnn_kernel_size),
            utils.parse(flags.cnn_act), utils.parse(flags.cnn_dilation_rate),
            utils.parse(flags.cnn_strides)):
        net = stream.Stream(cell=quantize.quantize_layer(
            tf.keras.layers.Conv2D(filters=filters,
                                   kernel_size=kernel_size,
                                   dilation_rate=dilation_rate,
                                   activation='linear',
                                   strides=strides), flags.quantize,
            quantize.NoOpActivationConfig(['kernel'], ['activation'], False)),
                            pad_time_dim='causal',
                            use_one_step=False)(net)
        net = quantize.quantize_layer(
            tf.keras.layers.BatchNormalization(),
            default_8bit_quantize_configs.NoOpQuantizeConfig())(net)
        net = quantize.quantize_layer(
            tf.keras.layers.Activation(activation))(net)

    net = stream.Stream(cell=quantize.quantize_layer(
        tf.keras.layers.Flatten(), apply_quantization=flags.quantize))(net)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(utils.parse(flags.units2),
                                 utils.parse(flags.act2)):
        net = quantize.quantize_layer(tf.keras.layers.Dense(
            units=units, activation=activation),
                                      apply_quantization=flags.quantize)(net)

    net = quantize.quantize_layer(
        tf.keras.layers.Dense(units=flags.label_count),
        apply_quantization=flags.quantize)(net)
    if flags.return_softmax:
        net = quantize.quantize_layer(tf.keras.layers.Activation('softmax'),
                                      apply_quantization=flags.quantize)(net)
    return tf.keras.Model(input_audio, net)
Example #22
def resnet_block(inputs,
                 repeat,
                 kernel_size,
                 filters,
                 dilation,
                 stride,
                 filter_separable,
                 residual=False,
                 padding='same',
                 dropout=0.0,
                 activation='relu',
                 scale=True):
    """Residual block.

  It is based on the paper:
  Jasper: An End-to-End Convolutional Neural Acoustic Model
  https://arxiv.org/pdf/1904.03288.pdf

  Args:
    inputs: input tensor
    repeat: number of repeating DepthwiseConv1D and Conv1D block
    kernel_size: kernel size of DepthwiseConv1D in time dim
    filters: number of filters in DepthwiseConv1D and Conv1D
    dilation: dilation in time dim for DepthwiseConv1D
    stride: stride in time dim for DepthwiseConv1D
    filter_separable: use separable conv or standard conv
    residual: if True, a residual connection is added
    padding: can be 'same' or 'causal'
    dropout: dropout value
    activation: type of activation function (string)
    scale: apply scaling in the BatchNormalization layer

  Returns:
    output tensor

  Raises:
    ValueError: if padding has an invalid value
  """
    if residual and (padding not in ('same', 'causal')):
        raise ValueError('padding should be same or causal')

    net = inputs
    for _ in range(repeat - 1):
        if filter_separable:  # apply separable conv
            if kernel_size > 0:
                # DepthwiseConv1D
                net = stream.Stream(cell=tf.keras.layers.DepthwiseConv2D(
                    kernel_size=(kernel_size, 1),
                    strides=(stride, 1),
                    padding='valid',
                    dilation_rate=(dilation, 1),
                    use_bias=False),
                                    pad_time_dim=padding)(net)

            # Conv1D 1x1 - streamable by default
            net = tf.keras.layers.Conv2D(filters=filters,
                                         kernel_size=1,
                                         use_bias=False,
                                         padding='valid')(net)
        else:  # apply 1D conv in time
            net = stream.Stream(cell=tf.keras.layers.Conv2D(
                filters=filters,
                kernel_size=(kernel_size, 1),
                dilation_rate=(dilation, 1),
                padding='valid',
                activation='linear',
                use_bias=False),
                                pad_time_dim=padding)(net)

        net = tf.keras.layers.BatchNormalization(scale=scale)(net)
        net = tf.keras.layers.Activation(activation)(net)
        net = tf.keras.layers.Dropout(rate=dropout)(net)

    if filter_separable:  # apply separable conv
        if kernel_size > 0:
            # DepthwiseConv1D
            net = stream.Stream(cell=tf.keras.layers.DepthwiseConv2D(
                kernel_size=(kernel_size, 1),
                strides=(stride, 1),
                padding='valid',
                dilation_rate=(dilation, 1),
                use_bias=False),
                                pad_time_dim=padding)(net)

        # Conv1D 1x1 - streamable by default
        net = tf.keras.layers.Conv2D(filters=filters,
                                     kernel_size=1,
                                     use_bias=False,
                                     padding='valid')(net)
    else:  # apply 1D conv in time
        net = stream.Stream(cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=(kernel_size, 1),
            dilation_rate=(dilation, 1),
            padding='valid',
            activation='linear',
            use_bias=False),
                            pad_time_dim=padding)(net)

    net = tf.keras.layers.BatchNormalization(scale=scale)(net)

    if residual:
        # Conv1D 1x1 - streamable by default
        net_res = tf.keras.layers.Conv2D(filters=filters,
                                         kernel_size=1,
                                         use_bias=False,
                                         padding='valid')(inputs)
        net_res = tf.keras.layers.BatchNormalization(scale=scale)(net_res)

        net = tf.keras.layers.Add()([net, net_res])

    net = tf.keras.layers.Activation(activation)(net)
    net = tf.keras.layers.Dropout(rate=dropout)(net)
    return net
Example #23
def model(flags):
    """MatchboxNet model.

  It is based on the paper:
  MatchboxNet: 1D Time-Channel Separable Convolutional Neural Network
  Architecture for Speech Commands Recognition
  https://arxiv.org/pdf/2004.08531.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any input list has a different length than the others,
    or if the padding mode is not supported
  """

    ds_filters = parse(flags.ds_filters)
    ds_repeat = parse(flags.ds_repeat)
    ds_kernel_size = parse(flags.ds_kernel_size)
    ds_stride = parse(flags.ds_stride)
    ds_dilation = parse(flags.ds_dilation)
    ds_residual = parse(flags.ds_residual)
    ds_pool = parse(flags.ds_pool)
    ds_padding = parse(flags.ds_padding)
    ds_filter_separable = parse(flags.ds_filter_separable)

    for l in (ds_repeat, ds_kernel_size, ds_stride, ds_dilation, ds_residual,
              ds_pool, ds_padding, ds_filter_separable):
        if len(ds_filters) != len(l):
            raise ValueError('all input lists have to be the same length')

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model, the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # make it [batch, time, 1, feature]
    net = tf.keras.backend.expand_dims(net, axis=2)

    # encoder
    for filters, repeat, ksize, stride, sep, dilation, res, pool, pad in zip(
            ds_filters, ds_repeat, ds_kernel_size, ds_stride,
            ds_filter_separable, ds_dilation, ds_residual, ds_pool,
            ds_padding):
        net = resnet_block(net, repeat, ksize, filters, dilation, stride, sep,
                           res, pad, flags.dropout, flags.activation,
                           flags.ds_scale)
        if pool > 1:
            if flags.ds_max_pool:
                net = tf.keras.layers.MaxPooling2D(pool_size=(pool, 1),
                                                   strides=(pool, 1))(net)
            else:
                net = tf.keras.layers.AveragePooling2D(pool_size=(pool, 1),
                                                       strides=(pool, 1))(net)

    # decoder
    net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D())(net)

    net = tf.keras.layers.Flatten()(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
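The "time-channel separable" convolution in the paper's title factors a full convolution into a depthwise conv over time followed by a pointwise 1x1 conv over channels. A minimal sketch in plain Keras (kernel size and filter counts are illustrative assumptions):

import tensorflow as tf

x = tf.keras.layers.Input(shape=(100, 1, 64))  # [time, 1, channels]
# depthwise conv filters each channel independently along the time axis
dw = tf.keras.layers.DepthwiseConv2D(kernel_size=(13, 1), padding='same',
                                     use_bias=False)(x)
# pointwise 1x1 conv mixes information across channels
pw = tf.keras.layers.Conv2D(filters=128, kernel_size=1, use_bias=False)(dw)
separable = tf.keras.Model(x, pw)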
Example #24
def conv_model(flags,
               cnn_filters,
               cnn_kernel_size,
               cnn_act,
               cnn_use_bias,
               cnn_padding,
               dilation=1):
    """Toy convolutional model with sequence of convs with different paddings.

  It can be used for speech enhancement.

  Args:
      flags: model and data settings
      cnn_filters: list of filters in conv layer
      cnn_kernel_size: list of kernel_size in conv layer
      cnn_act: list of activation functions in conv layer
      cnn_use_bias: list of use_bias in conv layer
      cnn_padding: list of padding in conv layer
      dilation: dilation applied on all conv layers

  Returns:
    Keras model, sum delay, and sum shift

  Raises:
    ValueError: if any of the input lists has a different length from the
      others, or if padding is not in [same, causal]
  """

    if not all(
            len(cnn_filters) == len(l) for l in
        [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding]):
        raise ValueError('all input lists have to be the same length')

    # it is an example of a deep conv model for speech enhancement,
    # which can be trained in non-streaming mode and converted to streaming mode
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    net = input_audio

    sum_delay = 0
    sum_shift = 0
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, use_bias, padding in zip(
            cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
        time_buffer_size = dilation * (kernel_size - 1)

        if padding == 'same':
            # a delay is needed with 'same' padding in streaming mode
            delay_val = time_buffer_size // 2
            net = delay.Delay(delay=delay_val)(net)
            sum_delay += delay_val * 2
        elif padding == 'causal':
            sum_shift += kernel_size
        else:
            raise ValueError(f'wrong padding mode: {padding}')

        # a ring buffer in streaming mode and an identity op during training
        net = stream.Stream(cell=tf.keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            use_bias=use_bias,
            padding='valid'),
                            use_one_step=False,
                            pad_time_dim=padding)(net)

    return tf.keras.Model(input_audio, net), sum_delay, sum_shift
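The delay/shift bookkeeping in the loop above can be verified with plain arithmetic; this sketch uses illustrative kernel sizes and the default dilation=1:

kernel_sizes = [5, 3]
paddings = ['same', 'causal']
sum_delay = sum_shift = 0
for kernel_size, padding in zip(kernel_sizes, paddings):
    time_buffer_size = 1 * (kernel_size - 1)  # dilation * (kernel_size - 1)
    if padding == 'same':
        sum_delay += (time_buffer_size // 2) * 2  # delay_val counted twice
    else:  # 'causal'
        sum_shift += kernel_size
print(sum_delay, sum_shift)  # -> 4 3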
Example #25
def residual_model(flags, cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias,
                   cnn_padding):
    """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement.

  Args:
      flags: model and data settings
      cnn_filters: list of filters in conv layer
      cnn_kernel_size: list of kernel_size in conv layer
      cnn_act: list of activation functions in conv layer
      cnn_use_bias: list of use_bias in conv layer
      cnn_padding: list of padding in conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of the input lists has a different length from the
      others
  """

    if not all(
            len(cnn_filters) == len(l) for l in
        [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding]):
        raise ValueError('all input lists have to be the same length')

    # it is an example of a deep conv model for speech enhancement,
    # which can be trained in non-streaming mode and converted to streaming mode
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    net = input_audio

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, use_bias, padding in zip(
            cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):

        ring_buffer_size_in_time_dim = (kernel_size - 1)

        # a ring buffer in streaming mode and an identity op during training
        net = stream.Stream(
            cell=tf.identity,
            ring_buffer_size_in_time_dim=ring_buffer_size_in_time_dim,
            use_one_step=False,
            pad_time_dim=None)(net)

        # residual connection in streaming mode needs:
        # * the kernel size in the time dim of the conv layer
        # * the padding mode that was used to pad data in the time dim
        net_residual = residual.Residual(
            padding=padding,
            kernel_size_time=ring_buffer_size_in_time_dim + 1)(net)

        # it is easier to convert the model to streaming mode when the padding
        # function is decoupled from the conv layer
        net = temporal_padding.TemporalPadding(
            padding=padding, padding_size=ring_buffer_size_in_time_dim)(net)

        net = tf.keras.layers.Conv1D(filters=filters,
                                     kernel_size=kernel_size,
                                     activation=activation,
                                     use_bias=use_bias,
                                     padding='valid')(
                                         net)  # padding has to be valid!

        net = tf.keras.layers.Add()([net, net_residual])

    return tf.keras.Model(input_audio, net)
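Decoupling the padding from the conv layer works because explicit causal padding followed by a 'valid' Conv1D preserves the sequence length exactly as a built-in causal conv would. A minimal self-contained check in plain TF (shapes are illustrative assumptions):

import tensorflow as tf

x = tf.random.normal([1, 100, 8])  # [batch, time, channels]
kernel_size = 3
# pad (kernel_size - 1) past frames only, then convolve with 'valid'
padded = tf.pad(x, [[0, 0], [kernel_size - 1, 0], [0, 0]])
conv = tf.keras.layers.Conv1D(16, kernel_size, padding='valid')
print(conv(padded).shape)  # (1, 100, 16): time length is preserved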
Example #26
def residual_model(flags,
                   cnn_filters,
                   cnn_kernel_size,
                   cnn_act,
                   cnn_use_bias,
                   cnn_padding,
                   delay_also_in_non_streaming,
                   dilation=1):
    """Toy deep convolutional model with residual connections.

  It can be used for speech enhancement.

  Args:
      flags: model and data settings
      cnn_filters: list of filters in conv layer
      cnn_kernel_size: list of kernel_size in conv layer
      cnn_act: list of activation functions in conv layer
      cnn_use_bias: list of use_bias in conv layer
      cnn_padding: list of padding in conv layer
      delay_also_in_non_streaming: Whether to apply delay also in non-streaming.
      dilation: dilation applied on all conv layers

  Returns:
    Keras model and sum delay

  Raises:
    ValueError: if any of the input lists has a different length from the
      others, or if padding is not in [same, causal]
  """

    if not all(
            len(cnn_filters) == len(l) for l in
        [cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding]):
        raise ValueError('all input lists have to be the same length')

    # it is an example of a deep conv model for speech enhancement,
    # which can be trained in non-streaming mode and converted to streaming mode
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)
    net = input_audio

    sum_delay = 0
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, use_bias, padding in zip(
            cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_padding):
        time_buffer_size = dilation * (kernel_size - 1)

        if padding == 'causal':
            # the residual connection is simple with 'causal' padding
            net_residual = net

        elif padding == 'same':
            # the residual connection in streaming mode needs a delay with 'same' padding
            delay_val = time_buffer_size // 2
            net_residual = delay.Delay(
                delay=delay_val,
                also_in_non_streaming=delay_also_in_non_streaming)(net)
            sum_delay += delay_val

        else:
            raise ValueError(f'wrong padding mode: {padding}')

        # it is easier to convert the model to streaming mode when the padding
        # function is decoupled from the conv layer
        net = temporal_padding.TemporalPadding(
            padding='causal' if delay_also_in_non_streaming else padding,
            padding_size=time_buffer_size)(net)

        # a ring buffer in streaming mode and an identity op during training
        net = stream.Stream(cell=tf.identity,
                            ring_buffer_size_in_time_dim=time_buffer_size,
                            use_one_step=False,
                            pad_time_dim=None)(net)

        net = tf.keras.layers.Conv1D(filters=filters,
                                     kernel_size=kernel_size,
                                     activation=activation,
                                     use_bias=use_bias,
                                     padding='valid')(
                                         net)  # padding has to be valid!

        net = tf.keras.layers.Add()([net, net_residual])

    return tf.keras.Model(input_audio, net), sum_delay
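The delay on the skip branch compensates for the lookahead of a centered ('same') kernel: the conv output at frame t depends on (kernel_size - 1) // 2 future frames, so the residual input must be delayed by the same amount to stay time-aligned. Illustrative arithmetic for one layer:

kernel_size, dilation = 5, 1
time_buffer_size = dilation * (kernel_size - 1)  # 4 buffered frames
delay_val = time_buffer_size // 2  # skip branch is delayed by 2 frames
# the loop above then adds delay_val (2) to sum_delay for this layer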
Example #27
def transposed_conv_model(flags,
                          cnn_filters,
                          cnn_kernel_size,
                          cnn_act,
                          cnn_use_bias,
                          cnn_paddings,
                          trans_paddings):
  """Toy deep convolutional model with transposed convolutions.

  It can be used for speech enhancement.

  Args:
    flags: model and data settings
    cnn_filters: list of filters for conv layer
    cnn_kernel_size: list of kernel_size for conv layer
    cnn_act: list of activation functions for conv layer
    cnn_use_bias: list of use_bias for conv layer
    cnn_paddings: list of padding for conv layer
    trans_paddings: list of padding for transposed conv layer

  Returns:
    Keras model

  Raises:
    ValueError: if any of the input lists has a different length from the
      others, or if padding is not in [same, causal]
  """

  if not all(
      len(cnn_filters) == len(l) for l in [
          cnn_filters, cnn_kernel_size, cnn_act, cnn_use_bias, cnn_paddings,
          trans_paddings
      ]):
    raise ValueError('all input lists have to be the same length')

  # it is an example of a deep conv model for speech enhancement,
  # which can be trained in non-streaming mode and converted to streaming mode
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)
  net = input_audio

  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, use_bias, padding, trans_padding in zip(
      cnn_filters, cnn_kernel_size,
      cnn_act, cnn_use_bias, cnn_paddings, trans_paddings):
    time_buffer_size = kernel_size - 1

    net = tf.keras.backend.expand_dims(net, axis=-2)
    net = stream.Stream(
        cell=tf.keras.layers.Conv2DTranspose(
            filters=filters, kernel_size=(3, 1),
            strides=(2, 1), padding='valid'),
        pad_time_dim=trans_padding)(net)
    net = tf.keras.backend.squeeze(net, axis=-2)

    if padding == 'same':
      # the model looks into the future, so introduce a delay for streaming mode
      net = delay.Delay(delay=time_buffer_size // 2)(net)
    elif padding != 'causal':
      raise ValueError(f'wrong padding mode: {padding}')

    # a ring buffer in streaming mode and an identity op during training
    net = stream.Stream(
        cell=tf.keras.layers.Conv1D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            use_bias=use_bias,
            padding='valid'),
        use_one_step=False,
        pad_time_dim=padding)(net)

  return tf.keras.Model(input_audio, net)
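A quick output-length check for the transposed conv above: with 'valid' padding, kernel 3, and stride 2 along time, out_time = (in_time - 1) * stride + kernel. A minimal sketch (shapes are illustrative assumptions):

import tensorflow as tf

x = tf.random.normal([1, 50, 1, 8])  # [batch, time, 1, channels]
up = tf.keras.layers.Conv2DTranspose(filters=8, kernel_size=(3, 1),
                                     strides=(2, 1), padding='valid')(x)
print(up.shape)  # (1, 101, 1, 8): (50 - 1) * 2 + 3 = 101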