Example #1
def model(flags):
    """Convolutional recurrent neural network (CRNN) model.

    It is based on the paper:
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
    https://arxiv.org/pdf/1703.05390.pdf
    The model is a sequence of Conv, RNN/GRU, and FC layers.
    Its topology is similar to "Hello Edge: Keyword Spotting on
    Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # this is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # expand dims for the next 2D conv layer
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = Stream(cell=tf.keras.layers.Conv2D(filters=filters,
                                                 kernel_size=kernel_size,
                                                 activation=activation,
                                                 dilation_rate=dilation_rate,
                                                 strides=strides))(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    for units, return_sequences in zip(parse(flags.gru_units),
                                       parse(flags.return_sequences)):
        net = GRU(units=units,
                  return_sequences=return_sequences,
                  stateful=flags.stateful)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
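
All of the flag-driven examples here call a parse() helper on comma-separated flag strings. A minimal sketch of what such a helper could look like (the real one ships with the kws_streaming models package; this version and its exact behavior are assumptions for illustration):

import ast

def parse(text):
    """Parse a comma-separated flag string into a list of Python literals.

    E.g. '64,64' -> [64, 64] and "'relu','relu'" -> ['relu', 'relu'].
    """
    if not text:
        return []
    return list(ast.literal_eval('[' + text + ']'))
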
Example #2
def model(flags):
  """CNN model.

  It is based on the paper:
  Convolutional Neural Networks for Small-footprint Keyword Spotting
  http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(
          input_audio)

  net = tf.keras.backend.expand_dims(net)
  for filters, kernel_size, activation, dilation_rate, strides in zip(
      parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
      parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
      parse(flags.cnn_strides)):
    net = Stream(
        cell=tf.keras.layers.Conv2D(
            filters=filters,
            kernel_size=kernel_size,
            activation=activation,
            dilation_rate=dilation_rate,
            strides=strides))(
                net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
Example #3
def E2E_1stage_v2(input_shape=(16000, ), data_settings=None, dropout=0.2):
    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms)(X_input)

    X = svdf.Svdf(units1=256,
                  memory_size=8,
                  units2=64,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_1')(X)

    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=64,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_2')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=128,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_3')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=128,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_4')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=128,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_5')(X)
    X = svdf.Svdf(units1=256,
                  memory_size=10,
                  units2=-1,
                  dropout=dropout,
                  activation='relu',
                  pad=0,
                  name='svdf_6')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

    # Create model
    model = tf.keras.models.Model(inputs=X_input,
                                  outputs=X,
                                  name='E2E_1stage_v2')

    return model
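
A minimal usage sketch for this constructor. The data_settings object below is a hypothetical stand-in carrying just the fields the function reads (the real project passes its own settings class), and from_logits=True matches the plain Dense head above:

import types
import tensorflow as tf

data_settings = types.SimpleNamespace(
    window_size_ms=40.0, window_stride_ms=20.0, label_count=12)

model = E2E_1stage_v2(input_shape=(16000,), data_settings=data_settings)
# the head is a plain Dense layer, so the outputs are logits
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
model.summary()
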
Example #4
def init(self, shape=(8, 2), flat_dim="time"):
    self.batch_size = 1
    # input data placeholder
    input_tf = tf.keras.layers.Input(shape=shape,
                                     batch_size=self.batch_size,
                                     name="inp1")

    # input test data
    self.inputs = np.random.uniform(size=(self.batch_size,) + shape)

    # create a non-streaming trainable model
    mode = Modes.TRAINING
    if flat_dim == "time":
        flat_tf = Stream(cell=tf.keras.layers.Flatten(),
                         mode=mode)(input_tf)
    else:
        # this branch expects a rank-4 input: [batch, time, feature, channels]
        flat_tf = tf.reshape(
            input_tf,
            (-1, input_tf.shape[1], input_tf.shape[2] * input_tf.shape[3]))
    # flat_tf = flatten.Flatten(mode=mode, flat_dim=flat_dim)(input_tf)
    self.model_train = tf.keras.Model(input_tf, flat_tf)
    self.model_train.summary()

    # output data, generated by the non-streaming model
    self.outputs = self.model_train.predict(self.inputs)
    return self.outputs
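
A quick shape check of what the "time" branch produces, as a sketch independent of the surrounding test class:

import numpy as np
import tensorflow as tf

x = tf.keras.layers.Input(shape=(8, 2), batch_size=1)
y = tf.keras.layers.Flatten()(x)
m = tf.keras.Model(x, y)
out = m.predict(np.random.uniform(size=(1, 8, 2)))
assert out.shape == (1, 16)  # Flatten merges the time and feature dims
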
Example #5
def model(flags):
    """CNN model.

    It is based on the paper:
    Convolutional Neural Networks for Small-footprint Keyword Spotting
    http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf
    Model topology is similar to "Hello Edge: Keyword Spotting on
    Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # this is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = Stream(cell=tf.keras.layers.Conv2D(filters=filters,
                                                 kernel_size=kernel_size,
                                                 activation=activation,
                                                 dilation_rate=dilation_rate,
                                                 strides=strides))(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #6
def E2E_1stage_v9(input_shape=(16000,), data_settings=None, dropout=0.5):
    assert data_settings.wanted_words == 'on,off,up,down,zero,one,two,three,four,five,six,seven,eight,nine'
    assert data_settings.window_size_ms == 40.0
    assert data_settings.window_stride_ms == 20.0
    assert data_settings.dct_num_features == 40
    assert data_settings.mel_num_bins == 80
    assert data_settings.mel_upper_edge_hertz == 7000

    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)

    X = svdf.Svdf(
        units1=192, memory_size=4, units2=96, dropout=dropout,
        activation='relu', pad=0, name='svdf_1')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu', pad=0, name='svdf_2')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu', pad=0, name='svdf_3')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu', pad=0, name='svdf_4')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=96, dropout=dropout,
        activation='relu', pad=0, name='svdf_5')(X)
    X = svdf.Svdf(
        units1=192, memory_size=10, units2=-1, dropout=dropout,
        activation='relu', pad=0, name='svdf_6')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

    # Create model
    model = tf.keras.models.Model(inputs=X_input, outputs=X, name='E2E_1stage_v9')

    return model
Example #7
def model(flags):
  """Fully connected layer based model.

  It is based on the paper (with added pooling):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(
          input_audio)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)

  return tf.keras.Model(input_audio, net)
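
The pooling branch above needs a channels axis to satisfy MaxPool1D; a standalone sketch of the same expand/pool/squeeze dance (shapes are illustrative):

import tensorflow as tf

x = tf.zeros([1, 128])                        # [batch, flattened features]
x = tf.keras.backend.expand_dims(x, axis=-1)  # -> [1, 128, 1]: fake channel dim
x = tf.keras.layers.MaxPool1D(pool_size=2, strides=2)(x)  # -> [1, 64, 1]
x = tf.keras.backend.squeeze(x, axis=-1)      # -> [1, 64]
print(x.shape)
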
Example #8
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper:
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      feature_type=flags.feature_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(
          input_audio)

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          parse(flags.svdf_units1), parse(flags.svdf_memory_size),
          parse(flags.svdf_units2), parse(flags.svdf_dropout),
          parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % i)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
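
The layer stack above hinges on the SVDF idea: approximate a dense op applied over a window of frames by a rank-1 pair of filters, one over features and one over time. A rough sketch of that principle (an illustration only, not the project's Svdf layer; DepthwiseConv1D needs TF 2.9 or newer):

import tensorflow as tf

def svdf_block(x, units1, memory_size, activation='relu'):
    # x: [batch, time, feature]
    x = tf.keras.layers.Conv1D(units1, 1, use_bias=False)(x)   # feature-wise filter
    x = tf.keras.layers.ZeroPadding1D((memory_size - 1, 0))(x) # causal: pad past only
    x = tf.keras.layers.DepthwiseConv1D(kernel_size=memory_size)(x)  # per-channel time filter
    return tf.keras.layers.Activation(activation)(x)

inp = tf.keras.Input(shape=(49, 40))
out = svdf_block(inp, units1=256, memory_size=8)
print(tf.keras.Model(inp, out).output_shape)  # (None, 49, 256)
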
Example #9
def _get_conv2d_layer(self, mode, dilation_rate=(1, 1)):
    cell = tf.keras.layers.Conv2D(filters=self.filters,
                                  kernel_size=self.kernel_size,
                                  dilation_rate=dilation_rate,
                                  kernel_initializer='ones')
    return Stream(
        cell,
        mode=mode,
        inference_batch_size=self.batch_size,
        pad_time_dim='causal',
    )
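
pad_time_dim='causal' above keeps the convolution streamable: the time axis is padded on the left only, so an output frame never depends on future input. A sketch of that padding (names and shapes here are illustrative, not the Stream wrapper's internals):

import tensorflow as tf

def causal_pad_time(x, kernel_t, dilation_t=1):
    # x: [batch, time, feature, channels]; pad only the past side of time
    pad = (kernel_t - 1) * dilation_t
    return tf.pad(x, [[0, 0], [pad, 0], [0, 0], [0, 0]])

x = tf.zeros([1, 10, 4, 1])
print(causal_pad_time(x, kernel_t=3).shape)  # (1, 12, 4, 1)
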
Example #10
def keyword_marvin_v3_vl_0_4(input_shape=(16000,), data_settings=None, dropout=0.2):
    assert data_settings.window_size_ms == 30.0
    assert data_settings.window_stride_ms == 10.0
    assert data_settings.dct_num_features == 40
    assert data_settings.mel_num_bins == 80
    assert data_settings.background_volume == 0.4
    assert data_settings.mel_upper_edge_hertz == 7000
    assert data_settings.wanted_words == 'marvin'

    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)

    X = svdf.Svdf(
        units1=84, memory_size=12, units2=32, dropout=dropout,
        activation='relu', pad=0, name='svdf_1')(X)
    X = svdf.Svdf(
        units1=84, memory_size=12, units2=32, dropout=dropout,
        activation='relu', pad=0, name='svdf_2')(X)
    X = svdf.Svdf(
        units1=84, memory_size=12, units2=32, dropout=dropout,
        activation='relu', pad=0, name='svdf_3')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu', pad=0, name='svdf_4')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu', pad=0, name='svdf_5')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

    # Create model
    model = tf.keras.models.Model(inputs=X_input, outputs=X,
                                  name='keyword_marvin_v3_vl_0_4')

    return model
Example #11
def model(flags):
  """LSTM model.

  Similar models appear in the papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      frame_size_ms=flags.window_size_ms,
      frame_step_ms=flags.window_stride_ms,
      sample_rate=flags.sample_rate,
      use_tf_fft=flags.use_tf_fft,
      preemph=flags.preemph,
      window_type=flags.window_type,
      mel_num_bins=flags.mel_num_bins,
      mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
      mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
      mel_non_zero_only=flags.mel_non_zero_only,
      fft_magnitude_squared=flags.fft_magnitude_squared,
      dct_num_features=flags.dct_num_features)(
          input_audio)

  for units, return_sequences, num_proj in zip(
      parse(flags.lstm_units), parse(flags.return_sequences),
      parse(flags.num_proj)):
    net = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
Example #12
def model(flags):
    """CNN model.

    It is based on the paper:
    Convolutional Neural Networks for Small-footprint Keyword Spotting
    http://www.isca-speech.org/archive/interspeech_2015/papers/i15_1478.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(input_audio)

    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = Stream(cell=tf.keras.layers.Conv2D(filters=filters,
                                                 kernel_size=kernel_size,
                                                 activation=activation,
                                                 dilation_rate=dilation_rate,
                                                 strides=strides))(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
Example #13
def E2E_1stage_v7(input_shape=(16000,), data_settings=None, dropout=0.5):
    # unlike the asserting variants, this model overrides the feature settings
    data_settings.window_size_ms = 40.0
    data_settings.window_stride_ms = 20.0
    data_settings.dct_num_features = 40
    data_settings.mel_num_bins = 80
    data_settings.mel_upper_edge_hertz = 7000

    X_input = tf.keras.Input(input_shape)
    X = speech_features.SpeechFeatures(
        frame_size_ms=data_settings.window_size_ms,
        frame_step_ms=data_settings.window_stride_ms,
        mel_num_bins=data_settings.mel_num_bins,
        dct_num_features=data_settings.dct_num_features,
        mel_upper_edge_hertz=data_settings.mel_upper_edge_hertz)(X_input)

    X = svdf.Svdf(
        units1=224, memory_size=12, units2=56, dropout=dropout,
        activation='relu', pad=0, name='svdf_1')(X)
    X = svdf.Svdf(
        units1=224, memory_size=12, units2=56, dropout=dropout,
        activation='relu', pad=0, name='svdf_2')(X)
    X = svdf.Svdf(
        units1=224, memory_size=12, units2=56, dropout=dropout,
        activation='relu', pad=0, name='svdf_3')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu', pad=0, name='svdf_4')(X)
    X = svdf.Svdf(
        units1=32, memory_size=32, units2=-1, dropout=dropout,
        activation='relu', pad=0, name='svdf_5')(X)

    X = Stream(cell=tf.keras.layers.Flatten())(X)
    X = tf.keras.layers.Dropout(dropout)(X)
    X = tf.keras.layers.Dense(units=data_settings.label_count)(X)

    # Create model
    model = tf.keras.models.Model(inputs=X_input, outputs=X, name='E2E_1stage_v7')

    return model
Example #14
def model(flags):
    """Fully connected layer based model.

    It is based on the paper (with added pooling):
    SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
    https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf
    Model topology is similar to "Hello Edge: Keyword Spotting on
    Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # this is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)

    # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
    if flags.pool_size > 1:
        # add fake dim for compatibility with pooling
        net = tf.keras.backend.expand_dims(net, axis=-1)
        net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size,
                                        strides=flags.strides,
                                        data_format='channels_last')(net)
        # remove fake dim
        net = tf.keras.backend.squeeze(net, axis=-1)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #15
def model(flags):
    """SVDF model.

    This model is based on the decomposition of densely connected ops
    into low-rank filters.
    It is based on the paper:
    END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # this is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # for streaming mode it is better to use causal padding
    padding = 'causal' if flags.svdf_pad else 'valid'

    for i, (units1, memory_size, units2, dropout, activation) in enumerate(
            zip(parse(flags.svdf_units1), parse(flags.svdf_memory_size),
                parse(flags.svdf_units2), parse(flags.svdf_dropout),
                parse(flags.svdf_act))):
        net = svdf.Svdf(units1=units1,
                        memory_size=memory_size,
                        units2=units2,
                        dropout=dropout,
                        activation=activation,
                        pad=padding,
                        name='svdf_%d' % i)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
Example #16
def model(flags):
  """LSTM model.

  Similar models appear in the papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Model topology is similar to "Hello Edge: Keyword Spotting on
  Microcontrollers" https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=modes.get_input_data_shape(flags, modes.Modes.TRAINING),
      batch_size=flags.batch_size)
  net = input_audio

  if flags.preprocess == 'raw':
    # this is a self-contained model; the user only needs to feed raw audio
    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(
            net)

  for units, return_sequences, num_proj in zip(
      parse(flags.lstm_units), parse(flags.return_sequences),
      parse(flags.num_proj)):
    net = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
Example #17
def model(flags):
    """Fully connected layer based model on raw wav data.

    It is based on the paper (with added pooling and raw audio data):
    SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
    https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    if flags.preprocess != 'raw':
        raise ValueError('input audio has to be raw, but got %s' % flags.preprocess)

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = dataframe.DataFrame(
        frame_size=flags.window_size_samples,
        frame_step=flags.window_stride_samples)(input_audio)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)

    # after flattening data in time, we can apply any layer: pooling, bi-lstm etc
    if flags.pool_size > 1:
        # add fake dim for compatibility with pooling
        net = tf.keras.backend.expand_dims(net, axis=-1)
        net = tf.keras.layers.MaxPool1D(pool_size=flags.pool_size,
                                        strides=flags.strides,
                                        data_format='channels_last')(net)
        # remove fake dim
        net = tf.keras.backend.squeeze(net, axis=-1)

    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units2), parse(flags.act2)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
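
The DataFrame layer above chops raw audio into overlapping frames before the dense stack. A sketch of the non-streaming equivalent of that step using tf.signal.frame (the frame sizes are illustrative):

import tensorflow as tf

audio = tf.zeros([1, 16000])                # [batch, samples], 1 s at 16 kHz
frames = tf.signal.frame(audio,
                         frame_length=640,  # 40 ms window
                         frame_step=320)    # 20 ms hop
print(frames.shape)                         # (1, 49, 640)
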
Example #18
def model(flags):
    """LSTM model.

    It is based on the paper https://arxiv.org/pdf/1705.02411.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        frame_size_ms=flags.window_size_ms,
        frame_step_ms=flags.window_stride_ms,
        sample_rate=flags.sample_rate,
        use_tf_fft=flags.use_tf_fft,
        preemph=flags.preemph,
        window_type=flags.window_type,
        mel_num_bins=flags.mel_num_bins,
        mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
        mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
        mel_non_zero_only=flags.mel_non_zero_only,
        fft_magnitude_squared=flags.fft_magnitude_squared,
        dct_num_features=flags.dct_num_features)(input_audio)

    for units, return_sequences, num_proj in zip(parse(flags.lstm_units),
                                                 parse(flags.return_sequences),
                                                 parse(flags.num_proj)):
        net = LSTM(units=units,
                   return_sequences=return_sequences,
                   stateful=flags.stateful,
                   use_peepholes=flags.use_peepholes,
                   num_proj=num_proj)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
Example #19
def model(flags):
  """SVDF model.

  This model is based on the decomposition of densely connected ops
  into low-rank filters.
  It is based on the paper:
  END-TO-END STREAMING KEYWORD SPOTTING https://arxiv.org/pdf/1812.02802.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      speech_features.SpeechFeatures.get_params(flags))(
          input_audio)

  for i, (units1, memory_size, units2, dropout, activation) in enumerate(
      zip(
          parse(flags.svdf_units1), parse(flags.svdf_memory_size),
          parse(flags.svdf_units2), parse(flags.svdf_dropout),
          parse(flags.svdf_act))):
    net = svdf.Svdf(
        units1=units1,
        memory_size=memory_size,
        units2=units2,
        dropout=dropout,
        activation=activation,
        pad=flags.svdf_pad,
        name='svdf_%d' % i)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units2), parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
Example #20
def model(flags):
  """LSTM model.

  Similar models appear in the papers:
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = speech_features.SpeechFeatures(
      speech_features.SpeechFeatures.get_params(flags))(
          input_audio)

  for units, return_sequences, num_proj in zip(
      parse(flags.lstm_units), parse(flags.return_sequences),
      parse(flags.num_proj)):
    net = LSTM(
        units=units,
        return_sequences=return_sequences,
        stateful=flags.stateful,
        use_peepholes=flags.use_peepholes,
        num_proj=num_proj)(
            net)

  net = Stream(cell=tf.keras.layers.Flatten())(net)
  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(parse(flags.units1), parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  return tf.keras.Model(input_audio, net)
Example #21
def model(flags):
    """Gated Recurrent Unit(GRU) model.

  It is based on paper
  Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
  https://arxiv.org/pdf/1703.05390.pdf (with no conv layer)
  Hello Edge: Keyword Spotting on Microcontrollers
  https://arxiv.org/pdf/1711.07128.pdf
  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """
    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # this is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    for units, return_sequences in zip(parse(flags.gru_units),
                                       parse(flags.return_sequences)):
        net = GRU(units=units,
                  return_sequences=return_sequences,
                  stateful=flags.stateful)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
Example #22
def model(flags):
    """Depthwise convolutional model.

    It is based on the paper https://arxiv.org/abs/1704.04861

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        frame_size_ms=flags.window_size_ms,
        frame_step_ms=flags.window_stride_ms,
        sample_rate=flags.sample_rate,
        use_tf_fft=flags.use_tf_fft,
        preemph=flags.preemph,
        window_type=flags.window_type,
        mel_num_bins=flags.mel_num_bins,
        mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
        mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
        mel_non_zero_only=flags.mel_non_zero_only,
        fft_magnitude_squared=flags.fft_magnitude_squared,
        dct_num_features=flags.dct_num_features)(input_audio)

    net = tf.keras.backend.expand_dims(net)

    net = Stream(cell=tf.keras.layers.Conv2D(
        kernel_size=parse(flags.cnn1_kernel_size),
        dilation_rate=parse(flags.cnn1_dilation_rate),
        filters=flags.cnn1_filters,
        padding=flags.cnn1_padding,
        strides=parse(flags.cnn1_strides)))(net)
    net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                             center=flags.bn_center,
                                             scale=flags.bn_scale,
                                             renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation('relu')(net)

    for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip(
            parse(flags.dw2_kernel_size), parse(flags.dw2_act),
            parse(flags.dw2_dilation_rate), parse(flags.dw2_strides),
            parse(flags.cnn2_filters), parse(flags.cnn2_act)):
        net = Stream(
            cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size,
                                                 dilation_rate=dilation_rate,
                                                 padding=flags.dw2_padding,
                                                 strides=strides))(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(dw2_act)(net)
        net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(cnn2_act)(net)

    net = Stream(cell=tf.keras.layers.AveragePooling2D(
        pool_size=(int(net.shape[1]), int(net.shape[2]))))(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
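
The depthwise + 1x1 pointwise pairs in the loop above are what make this model cheap; a back-of-the-envelope comparison of weight counts against a standard convolution (the MobileNets argument):

def conv_params(k, c_in, c_out):
    standard = k * k * c_in * c_out          # one dense k x k convolution
    separable = k * k * c_in + c_in * c_out  # depthwise k x k + 1x1 pointwise
    return standard, separable

print(conv_params(3, 64, 64))  # (36864, 4672): roughly 8x fewer weights
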
Example #23
def model(flags):
    """Depthwise convolutional model.

    It is based on the papers:
    MobileNets: Efficient Convolutional Neural Networks for
    Mobile Vision Applications https://arxiv.org/abs/1704.04861
    Hello Edge: Keyword Spotting on Microcontrollers
    https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """

    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        speech_features.SpeechFeatures.get_params(flags))(input_audio)

    net = tf.keras.backend.expand_dims(net)

    net = Stream(cell=tf.keras.layers.Conv2D(
        kernel_size=parse(flags.cnn1_kernel_size),
        dilation_rate=parse(flags.cnn1_dilation_rate),
        filters=flags.cnn1_filters,
        padding=flags.cnn1_padding,
        strides=parse(flags.cnn1_strides)))(net)
    net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                             center=flags.bn_center,
                                             scale=flags.bn_scale,
                                             renorm=flags.bn_renorm)(net)
    net = tf.keras.layers.Activation('relu')(net)

    for kernel_size, dw2_act, dilation_rate, strides, filters, cnn2_act in zip(
            parse(flags.dw2_kernel_size), parse(flags.dw2_act),
            parse(flags.dw2_dilation_rate), parse(flags.dw2_strides),
            parse(flags.cnn2_filters), parse(flags.cnn2_act)):
        net = Stream(
            cell=tf.keras.layers.DepthwiseConv2D(kernel_size=kernel_size,
                                                 dilation_rate=dilation_rate,
                                                 padding=flags.dw2_padding,
                                                 strides=strides))(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(dw2_act)(net)
        net = tf.keras.layers.Conv2D(kernel_size=(1, 1), filters=filters)(net)
        net = tf.keras.layers.BatchNormalization(momentum=flags.bn_momentum,
                                                 center=flags.bn_center,
                                                 scale=flags.bn_scale,
                                                 renorm=flags.bn_renorm)(net)
        net = tf.keras.layers.Activation(cnn2_act)(net)

    net = Stream(cell=tf.keras.layers.AveragePooling2D(
        pool_size=(int(net.shape[1]), int(net.shape[2]))))(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
Example #24
def model(flags):
    """Convolutional recurrent neural network (CRNN) model.

    It is based on the paper:
    Convolutional Recurrent Neural Networks for Small-Footprint Keyword Spotting
    https://arxiv.org/pdf/1703.05390.pdf
    The model is a sequence of Conv, RNN/GRU, and FC layers.
    A similar topology appears in Hello Edge: Keyword Spotting on Microcontrollers
    https://arxiv.org/pdf/1711.07128.pdf

    Args:
      flags: data/model parameters

    Returns:
      Keras model for training
    """
    input_audio = tf.keras.layers.Input(shape=(flags.desired_samples, ),
                                        batch_size=flags.batch_size)

    net = speech_features.SpeechFeatures(
        frame_size_ms=flags.window_size_ms,
        frame_step_ms=flags.window_stride_ms,
        sample_rate=flags.sample_rate,
        use_tf_fft=flags.use_tf_fft,
        preemph=flags.preemph,
        window_type=flags.window_type,
        feature_type=flags.feature_type,
        mel_num_bins=flags.mel_num_bins,
        mel_lower_edge_hertz=flags.mel_lower_edge_hertz,
        mel_upper_edge_hertz=flags.mel_upper_edge_hertz,
        mel_non_zero_only=flags.mel_non_zero_only,
        fft_magnitude_squared=flags.fft_magnitude_squared,
        dct_num_features=flags.dct_num_features)(input_audio)

    # expand dims for the next 2D conv layer
    net = tf.keras.backend.expand_dims(net)
    for filters, kernel_size, activation, dilation_rate, strides in zip(
            parse(flags.cnn_filters), parse(flags.cnn_kernel_size),
            parse(flags.cnn_act), parse(flags.cnn_dilation_rate),
            parse(flags.cnn_strides)):
        net = Stream(cell=tf.keras.layers.Conv2D(filters=filters,
                                                 kernel_size=kernel_size,
                                                 activation=activation,
                                                 dilation_rate=dilation_rate,
                                                 strides=strides))(net)

    shape = net.shape
    # input net dimension: [batch, time, feature, channels]
    # reshape dimension: [batch, time, feature * channels]
    # so that GRU/RNN can process it
    net = tf.keras.layers.Reshape((-1, shape[2] * shape[3]))(net)

    for units, return_sequences in zip(parse(flags.gru_units),
                                       parse(flags.return_sequences)):
        net = GRU(units=units,
                  return_sequences=return_sequences,
                  stateful=flags.stateful)(net)

    net = Stream(cell=tf.keras.layers.Flatten())(net)
    net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

    for units, activation in zip(parse(flags.units1), parse(flags.act1)):
        net = tf.keras.layers.Dense(units=units, activation=activation)(net)

    net = tf.keras.layers.Dense(units=flags.label_count)(net)
    return tf.keras.Model(input_audio, net)
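
A standalone illustration of the reshape that bridges the conv stack and the GRU above: collapse [batch, time, feature, channels] into [batch, time, feature * channels], leaving time as the unspecified (-1) axis so the model stays streamable (shapes are illustrative):

import tensorflow as tf

x = tf.zeros([2, 98, 10, 16])                  # [batch, time, feature, channels]
y = tf.keras.layers.Reshape((-1, 10 * 16))(x)  # -> [2, 98, 160]
print(y.shape)
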