def _non_streaming(self, inputs):
        # Depthwise 1D convolution in non-streaming mode.
        # It is used for training or non-streaming inference.

        # pad input data
        inputs_pad = temporal_padding.TemporalPadding(
            padding=self.pad, padding_size=self.memory_size - 1)(inputs)

        # expand dimensionality for depthwise_conv2d
        # to [memory_size, 1, feature_dim, 1]
        time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1),
                                         -1)

        # run convolution
        depthwise_conv1d = tf.nn.depthwise_conv2d(
            tf.expand_dims(inputs_pad, -2),
            time_kernel_exp,
            strides=[1, 1, 1, 1],
            padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

        # [batch_size, time_steps, feature_dim]
        depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

        # [batch_size, time_steps, feature_dim]
        if self.use_bias:
            depthwise_conv1d = depthwise_conv1d + self.bias

        return depthwise_conv1d
    def _non_streaming(self, inputs):
        # Depthwise 1D convolution in non-streaming mode.
        # It is used for training or non-streaming inference.
        # Zero pad inputs from the left to make conv1d causal.
        # [batch_size, time_steps, feature_dim]
        if self.pad:
            inputs_pad = tf.keras.backend.temporal_padding(
                inputs, padding=(self.memory_size - 1, 0))
        else:
            inputs_pad = inputs

        # expand dimensionality for depthwise_conv2d
        # to [memory_size, 1, feature_dim, 1]
        time_kernel_exp = tf.expand_dims(tf.expand_dims(self.time_kernel, 1),
                                         -1)

        # run convolution
        depthwise_conv1d = tf.nn.depthwise_conv2d(
            tf.expand_dims(inputs_pad, -2),
            time_kernel_exp,
            strides=[1, 1, 1, 1],
            padding='VALID')  # [batch_size, time_steps, 1, feature_dim]

        # [batch_size, time_steps, feature_dim]
        depthwise_conv1d = tf.squeeze(depthwise_conv1d, [2])

        # [batch_size, time_steps, feature_dim]
        if self.use_bias:
            depthwise_conv1d = depthwise_conv1d + self.bias

        return depthwise_conv1d
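A minimal standalone sketch of the same trick (shapes and values are illustrative, not from the project): expanding the kernel to [memory_size, 1, feature_dim, 1] and the input to [batch, time, 1, feature_dim] makes tf.nn.depthwise_conv2d act as an independent causal 1D convolution per feature channel.

import tensorflow as tf

batch_size, time_steps, feature_dim, memory_size = 2, 16, 8, 4
inputs = tf.random.normal([batch_size, time_steps, feature_dim])
time_kernel = tf.random.normal([memory_size, feature_dim])

# causal left padding keeps the output length equal to time_steps
inputs_pad = tf.keras.backend.temporal_padding(
    inputs, padding=(memory_size - 1, 0))

# [memory_size, 1, feature_dim, 1]: one 1D filter per feature channel
kernel = tf.expand_dims(tf.expand_dims(time_kernel, 1), -1)

out = tf.nn.depthwise_conv2d(
    tf.expand_dims(inputs_pad, -2),  # [batch, time, 1, feature_dim]
    kernel,
    strides=[1, 1, 1, 1],
    padding='VALID')
out = tf.squeeze(out, [2])
print(out.shape)  # (2, 16, 8)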
Example #3
    def _mfcc_op(self, inputs):
        # MFCC implementation based on TF custom op (supported by TFLite)
        # It reduces model size in comparison to _mfcc_tf
        if (self.mode == modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE
                or self.mode == modes.Modes.STREAM_INTERNAL_STATE_INFERENCE):
            outputs = self.data_frame(inputs)
            # in streaming mode there is only one frame for FFT calculation
            # dims will be [batch=1, time=1, frame],
            # but audio_spectrogram requires 2D input data, so we remove the time dim
            outputs = tf.squeeze(outputs, axis=1)
        else:
            outputs = inputs

        # outputs has dims [batch, time]
        # but audio_spectrogram expects [time, channels/batch] so transpose it
        outputs = tf.transpose(outputs, [1, 0])

        # outputs: [time, channels/batch]
        outputs = audio_ops.audio_spectrogram(
            outputs,
            window_size=self.frame_size,
            stride=self.frame_step,
            magnitude_squared=self.params['fft_magnitude_squared'])
        # outputs: [channels/batch, frames, fft_feature]

        outputs = audio_ops.mfcc(
            outputs,
            self.params['sample_rate'],
            upper_frequency_limit=self.params['mel_upper_edge_hertz'],
            lower_frequency_limit=self.params['mel_lower_edge_hertz'],
            filterbank_channel_count=self.params['mel_num_bins'],
            dct_coefficient_count=self.params['dct_num_features'])
        # outputs: [channels/batch, frames, dct_coefficient_count]
        outputs = self.spec_augment(outputs)
        return outputs
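A hedged usage sketch of the two custom ops above; the import path and all parameter values are assumptions for illustration, not taken from the snippet.

import tensorflow as tf
from tensorflow.python.ops import gen_audio_ops as audio_ops  # assumed import

sample_rate, frame_size, frame_step = 16000, 400, 160
audio = tf.random.normal([sample_rate, 1])  # [time, channels/batch]

# [channels/batch, frames, fft_feature]
spectrogram = audio_ops.audio_spectrogram(
    audio,
    window_size=frame_size,
    stride=frame_step,
    magnitude_squared=True)

# [channels/batch, frames, dct_coefficient_count]
mfcc = audio_ops.mfcc(
    spectrogram,
    sample_rate,
    upper_frequency_limit=7600.0,
    lower_frequency_limit=20.0,
    filterbank_channel_count=40,
    dct_coefficient_count=20)
print(mfcc.shape)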
Example #4
def random_stretch_squeeze(inputs,
                           resample_offset,
                           seed=None):
  """Stretches and squeezes audio data in time dim.

  It can be useful for augmenting training data
  with random stretches and squeezes in the time dim,
  making the model more robust to variations in input audio
  sampling frequency and human speech frequency.

  Args:
    inputs: input tensor [batch_size, time]
    resample_offset: defines stretch squeeze range:
      1-resample_offset...1+resample_offset
    seed: random seed
  Returns:
    audio data stretched and squeezed in the time dim [batch_size, time]
  Raises:
    ValueError: if inputs.shape.rank != 2
  """
  if inputs.shape.rank != 2:
    raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

  inputs_shape = inputs.shape.as_list()
  batch_size = inputs_shape[0]
  sequence_length = inputs_shape[1]

  image = tf.expand_dims(inputs, 2)  # feature
  image = tf.expand_dims(image, 3)  # channels

  resample = 1.0  # when it is equal to 1 - no stretching or squeezing
  time_stretch_squeeze = tf.random.uniform(
      shape=[batch_size],
      minval=resample - resample_offset,
      maxval=resample + resample_offset,
      dtype=tf.float32,
      seed=seed)
  shape = tf.shape(inputs)
  outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
  for i in tf.range(batch_size):
    image_resized = tf.image.resize(
        images=image[i],
        size=(tf.cast((tf.cast(shape[1], tf.float32) * time_stretch_squeeze[i]),
                      tf.int32), 1),
        preserve_aspect_ratio=False)
    image_resized_cropped = tf.image.resize_with_crop_or_pad(
        image_resized,
        target_height=sequence_length,
        target_width=1,
    )

    outputs = outputs.write(i, image_resized_cropped)

  outputs = tf.squeeze(outputs.stack(), axis=[2, 3])
  outputs.set_shape(inputs_shape)
  return outputs
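A minimal usage sketch (batch size, clip length, and offset are illustrative):

audio = tf.random.normal([2, 16000])  # two clips, 1 second at 16 kHz
augmented = random_stretch_squeeze(audio, resample_offset=0.2, seed=1)
print(augmented.shape)  # (2, 16000)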
Example #5
def random_shift(inputs, time_shift, seed=None):
    """Shifts input data randomly in time dim.

  It can be useful for augmenting training data with random shifts in the time
  dim, making the model more robust to shifts in the input audio.

  Args:
    inputs: input tensor [batch_size, time]
    time_shift: defines time shift range: -time_shift...time_shift
      it is defined in samples
    seed: random seed
  Returns:
    audio data randomly shifted in the time dim [batch_size, time]
  Raises:
    ValueError: if inputs.shape.rank != 2
  """
    if inputs.shape.rank != 2:
        raise ValueError('inputs.shape.rank:%d must be 2' % inputs.shape.rank)

    inputs_shape = inputs.shape.as_list()
    batch_size = inputs_shape[0]
    sequence_length = inputs_shape[1]

    # the ops below process 2D arrays per example, so convert input to [batch, time, dummy]
    inputs = tf.expand_dims(inputs, 2)

    time_shift_amounts = tf.random.uniform(shape=[batch_size],
                                           minval=-time_shift,
                                           maxval=time_shift,
                                           dtype=tf.int32,
                                           seed=seed)

    outputs = tf.TensorArray(inputs.dtype, 0, dynamic_size=True)
    for i in tf.range(batch_size):
        time_shift_amount = time_shift_amounts[i]

        # pylint: disable=cell-var-from-loop
        time_shift_padding = tf.cond(time_shift_amount > 0,
                                     lambda: [[time_shift_amount, 0], [0, 0]],
                                     lambda: [[0, -time_shift_amount], [0, 0]])
        time_shift_offset = tf.cond(time_shift_amount > 0, lambda: [0, 0],
                                    lambda: [-time_shift_amount, 0])
        # pylint: enable=cell-var-from-loop

        padded = tf.pad(tensor=inputs[i],
                        paddings=time_shift_padding,
                        mode='CONSTANT')
        padded_sliced = tf.slice(padded, time_shift_offset,
                                 [sequence_length, -1])

        outputs = outputs.write(i, padded_sliced)

    # convert it back to [batch, time]
    outputs = tf.squeeze(outputs.stack(), axis=[2])
    outputs.set_shape(inputs_shape)
    return outputs
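A minimal usage sketch (values are illustrative):

audio = tf.random.normal([2, 16000])
shifted = random_shift(audio, time_shift=100, seed=1)  # up to +/-100 samples
print(shifted.shape)  # (2, 16000)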
Example #6
    def call(self, inputs, training=None):
        net = inputs

        # add fake dim [batch, time, 1, feature]
        net = tf.keras.backend.expand_dims(net, axis=2)

        net = self.dropout1(net, training=training)
        net = self.dense1(net)
        net = self.depth_cnn1(net)
        net = self.batch_norm(net, training=training)
        net = self.activation(net)
        net = self.dense2(net)

        # [batch, time, feature]
        net = tf.squeeze(net, [2])

        return net
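A standalone sketch of the fake-frequency-dim trick used in the call above (the layer and shapes are illustrative, not the block's actual layers):

x = tf.random.normal([2, 100, 64])            # [batch, time, feature]
x4 = tf.keras.backend.expand_dims(x, axis=2)  # [batch, time, 1, feature]
y = tf.keras.layers.Conv2D(64, (3, 1), padding='same')(x4)
y = tf.squeeze(y, [2])                        # back to [batch, time, feature]
print(y.shape)  # (2, 100, 64)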
Example #7
def model(flags):
    """BC-ResNet model.

  It is based on paper
  Broadcasted Residual Learning for Efficient Keyword Spotting
  https://arxiv.org/pdf/2106.04140.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training

  Raises:
    ValueError: if any of the input lists has a different length from the
      others, or if the padding mode is not supported
  """

    dropouts = utils.parse(flags.dropouts)
    filters = utils.parse(flags.filters)
    blocks_n = utils.parse(flags.blocks_n)
    strides = utils.parse(flags.strides)
    dilations = utils.parse(flags.dilations)

    for l in (dropouts, filters, strides, dilations):
        if len(blocks_n) != len(l):
            raise ValueError('all input lists have to be the same length '
                             'but got %s and %s ' % (blocks_n, l))

    input_audio = tf.keras.layers.Input(shape=modes.get_input_data_shape(
        flags, modes.Modes.TRAINING),
                                        batch_size=flags.batch_size)
    net = input_audio

    if flags.preprocess == 'raw':
        # it is a self-contained model; the user only needs to feed raw audio
        net = speech_features.SpeechFeatures(
            speech_features.SpeechFeatures.get_params(flags))(net)

    # make it [batch, time, feature, 1]
    net = tf.keras.backend.expand_dims(net, axis=3)

    if flags.paddings == 'same':
        net = tf.keras.layers.Conv2D(filters=flags.first_filters,
                                     kernel_size=5,
                                     strides=(1, 2),
                                     padding='same')(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.Conv2D(
            filters=flags.first_filters,
            kernel_size=5,
            strides=(1, 2),
            padding='valid'),
                            use_one_step=True,
                            pad_time_dim=flags.paddings,
                            pad_freq_dim='same')(net)

    for n, n_filters, dilation, stride, dropout in zip(blocks_n, filters,
                                                       dilations, strides,
                                                       dropouts):
        net = TransitionBlock(n_filters,
                              dilation,
                              stride,
                              flags.paddings,
                              dropout,
                              sub_groups=flags.sub_groups)(net)
        for _ in range(n):
            net = NormalBlock(n_filters,
                              dilation,
                              1,
                              flags.paddings,
                              dropout,
                              sub_groups=flags.sub_groups)(net)

    if flags.paddings == 'same':
        net = tf.keras.layers.DepthwiseConv2D(kernel_size=5,
                                              padding='same')(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.DepthwiseConv2D(
            kernel_size=5, padding='valid'),
                            use_one_step=True,
                            pad_time_dim=flags.paddings,
                            pad_freq_dim='same')(net)

    # average out frequency dim
    net = tf.keras.backend.mean(net, axis=2, keepdims=True)

    net = tf.keras.layers.Conv2D(filters=flags.last_filters,
                                 kernel_size=1,
                                 use_bias=False)(net)

    # average out time dim
    if flags.paddings == 'same':
        net = tf.keras.layers.GlobalAveragePooling2D(keepdims=True)(net)
    else:
        net = stream.Stream(cell=tf.keras.layers.GlobalAveragePooling2D(
            keepdims=True))(net)

    net = tf.keras.layers.Conv2D(filters=flags.label_count,
                                 kernel_size=1,
                                 use_bias=False)(net)
    # dims 1 and 2 are equal to 1, so squeeze them out
    net = tf.squeeze(net, [1, 2])

    if flags.return_softmax:
        net = tf.keras.layers.Activation('softmax')(net)
    return tf.keras.Model(input_audio, net)
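A hedged usage sketch, assuming flags was produced by the project's flag parser with BC-ResNet settings (flag values are not shown here and are outside this snippet):

keras_model = model(flags)
keras_model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=not flags.return_softmax))
keras_model.summary()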