Example #1
    def setUp(self):
        super(FrameTestBase, self).setUp()

        self.frame_size = 7
        self.frame_step = 5
        self.inference_batch_size = 1

        # generate input signal
        set_seed(1)
        self.data_size = 33
        self.signal = np.random.rand(self.inference_batch_size, self.data_size)

        # non streaming frame extraction based on tf.signal.frame
        data_frame_tf = data_frame.DataFrame(
            mode=Modes.TRAINING,
            inference_batch_size=self.inference_batch_size,
            frame_size=self.frame_size,
            frame_step=self.frame_step)
        # it receives all data with size: data_size
        input1 = tf.keras.layers.Input(shape=(self.data_size, ),
                                       batch_size=self.inference_batch_size,
                                       dtype=tf.float32)
        output1 = data_frame_tf(inputs=input1)
        self.model_tf = tf.keras.models.Model(input1, output1)

        # generate frames for the whole signal (no streaming here)
        self.output_frames_tf = self.model_tf.predict(self.signal)
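For reference, the non-streaming path above matches a direct tf.signal.frame call. A minimal standalone sketch (not part of the original test) with the same parameters:

import numpy as np
import tensorflow as tf

# Same parameters as in setUp above.
frame_size, frame_step, data_size = 7, 5, 33
signal = np.random.rand(1, data_size).astype(np.float32)

# tf.signal.frame slides a window of frame_size samples with stride frame_step.
frames = tf.signal.frame(signal, frame_length=frame_size, frame_step=frame_step)

# number of frames: (data_size - frame_size) // frame_step + 1 = 6
assert frames.shape == (1, 6, 7)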
Example #2
def model(flags):
  """Fully connected layer based model on raw wav data.

  It is based on the paper (with added pooling and raw audio data):
  SMALL-FOOTPRINT KEYWORD SPOTTING USING DEEP NEURAL NETWORKS
  https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/42537.pdf

  Args:
    flags: data/model parameters

  Returns:
    Keras model for training
  """

  if flags.preprocess != 'raw':
    raise ValueError('input audio has to be raw, but got %s' %
                     flags.preprocess)

  input_audio = tf.keras.layers.Input(
      shape=(flags.desired_samples,), batch_size=flags.batch_size)

  net = data_frame.DataFrame(
      frame_size=flags.window_size_samples,
      frame_step=flags.window_stride_samples)(
          input_audio)

  for units, activation in zip(
      utils.parse(flags.units1), utils.parse(flags.act1)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = stream.Stream(cell=tf.keras.layers.Flatten())(net)

  # after flattening data in time, we can apply any layer: pooling, bi-LSTM, etc.
  if flags.pool_size > 1:
    # add fake dim for compatibility with pooling
    net = tf.keras.backend.expand_dims(net, axis=-1)
    net = tf.keras.layers.MaxPool1D(
        pool_size=flags.pool_size,
        strides=flags.strides,
        data_format='channels_last')(
            net)
    # remove fake dim
    net = tf.keras.backend.squeeze(net, axis=-1)

  net = tf.keras.layers.Dropout(rate=flags.dropout1)(net)

  for units, activation in zip(
      utils.parse(flags.units2), utils.parse(flags.act2)):
    net = tf.keras.layers.Dense(units=units, activation=activation)(net)

  net = tf.keras.layers.Dense(units=flags.label_count)(net)
  if flags.return_softmax:
    net = tf.keras.layers.Activation('softmax')(net)
  return tf.keras.Model(input_audio, net)
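A sketch of how a flags object might be populated to instantiate this model. The field names come from the code above, but every value below is a hypothetical placeholder, the string format assumes utils.parse accepts comma-separated lists, and the snippet assumes the example's imports (data_frame, stream, utils) are available:

from types import SimpleNamespace

# Hypothetical flag values; real values come from the training config.
flags = SimpleNamespace(
    preprocess='raw',
    desired_samples=16000,       # 1 s of 16 kHz audio (assumed)
    batch_size=1,
    window_size_samples=400,     # 25 ms window (assumed)
    window_stride_samples=160,   # 10 ms stride (assumed)
    units1='64,128',             # two hidden layers before flattening
    act1="'relu','relu'",
    pool_size=2,
    strides=2,
    dropout1=0.1,
    units2='128',
    act2="'relu'",
    label_count=12,
    return_softmax=True)

keras_model = model(flags)  # model() as defined above
keras_model.summary()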
Example #3
    def test_tf_non_streaming_vs_streaming_internal_state(self):
        # prepare streaming frame extraction model with internal state
        data_frame_stream = data_frame.DataFrame(
            mode=modes.Modes.STREAM_INTERNAL_STATE_INFERENCE,
            inference_batch_size=self.inference_batch_size,
            frame_size=self.frame_size,
            frame_step=self.frame_step)
        # it receives input data incrementally with step: frame_step
        input2 = tf.keras.layers.Input(shape=(self.frame_step, ),
                                       batch_size=self.inference_batch_size,
                                       dtype=tf.float32)
        output2 = data_frame_stream(input2)
        model_stream = tf.keras.models.Model(input2, output2)

        # initialize internal state of data framer
        pre_state = self.signal[:, 0:data_frame_stream.frame_size -
                                data_frame_stream.frame_step]
        state_init = np.concatenate(
            (np.zeros(shape=(1, data_frame_stream.frame_step),
                      dtype=np.float32), pre_state),
            axis=1)
        data_frame_stream.set_weights([state_init])

        start = self.frame_size - self.frame_step
        end = self.frame_size
        streamed_frames = []

        # run streaming frame extraction
        while end <= self.data_size:

            # next data update
            stream_update = self.signal[:, start:end]

            # get new frame from stream of data
            output_frame = model_stream.predict(stream_update)
            streamed_frames.append(output_frame)

            start = end
            end = start + self.frame_step

        # compare streaming vs non-streaming frame extraction
        for i in range(0, len(self.output_frames_tf[0])):
            self.assertAllEqual(streamed_frames[i][0][0],
                                self.output_frames_tf[0][i])
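The same equivalence can be reproduced with a few lines of numpy. A minimal sketch of the ring-buffer update the streaming layer performs, primed the same way as state_init above:

import numpy as np

frame_size, frame_step = 7, 5
signal = np.arange(33, dtype=np.float32)

# Prime the buffer exactly as state_init above: zeros for the part that will
# be shifted out, then the first (frame_size - frame_step) samples.
state = np.concatenate([np.zeros(frame_step, np.float32),
                        signal[:frame_size - frame_step]])

frames = []
start = frame_size - frame_step
while start + frame_step <= signal.size:
  update = signal[start:start + frame_step]
  # drop the oldest frame_step samples and append the new update
  state = np.concatenate([state[frame_step:], update])
  frames.append(state.copy())
  start += frame_step

# frames[0] == signal[0:7], frames[1] == signal[5:12], ...,
# matching the non-streaming tf.signal.frame output.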
Example #4
  def build(self, input_shape):
    super(STFT, self).build(input_shape)

    self.data_frame = data_frame.DataFrame(
        mode=self.mode,
        inference_batch_size=self.inference_batch_size,
        frame_size=self.frame_size,
        frame_step=self.frame_step,
        use_one_step=False,
        padding=self.padding)

    if self.window_type:
      self.windowing = windowing.Windowing(
          window_size=self.frame_size, window_type=self.window_type)
    else:
      self.windowing = tf.keras.layers.Lambda(lambda x: x)

    self.rfft = tf.keras.layers.Lambda(
        lambda x: tf.signal.rfft(x, fft_length=[self.fft_size]))
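In the non-streaming case, the frame -> window -> rFFT chain assembled here is what tf.signal.stft computes in a single call. A minimal sketch with illustrative parameter values:

import tensorflow as tf

signals = tf.random.normal([1, 16000])   # (batch, samples), illustrative
spectra = tf.signal.stft(
    signals,
    frame_length=400,                    # frame_size (assumed value)
    frame_step=160,                      # frame_step (assumed value)
    fft_length=512,                      # fft_size (assumed value)
    window_fn=tf.signal.hann_window)
magnitude = tf.abs(spectra)              # (batch, frames, 512 // 2 + 1)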
Example #5
  def test_stream_framing(self, batch_frames, window_stride_samples):
    """Test DataFrame in streaming mode with different batch_frames and stride.

    Args:
        batch_frames: number of frames produced by one call in streaming mode
        window_stride_samples: stride of sliding window
    """

    # data parameters
    params = Params(
        batch_frames=batch_frames, window_stride_samples=window_stride_samples)

    # prepare input data
    input_audio = np.arange(params.desired_samples)
    input_audio = np.expand_dims(input_audio, 0)  # add batch dim

    # prepare non stream model
    padding = 'causal'
    inputs = tf.keras.Input(
        shape=(params.desired_samples,), batch_size=1, dtype=tf.float32)
    net = inputs
    net = data_frame.DataFrame(
        frame_size=params.window_size_samples,
        frame_step=params.window_stride_samples,
        use_one_step=False,
        padding=padding)(
            net)
    model = tf.keras.Model(inputs, net)
    model.summary()

    # prepare streaming model
    model_stream = utils.to_streaming_inference(
        model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
    model_stream.summary()

    # run inference
    non_stream_out = model.predict(input_audio)
    stream_out = test.run_stream_inference(params, model_stream, input_audio)
    self.assertAllClose(stream_out, non_stream_out)
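run_stream_inference comes from the test utilities; the general pattern it implements is to feed the signal chunk by chunk and stitch the per-chunk outputs back together. A hedged sketch of that pattern (a chunk size of window_stride_samples * batch_frames is an assumption, not taken from the original):

import numpy as np

def run_stream_sketch(model_stream, input_audio, chunk_size):
  """Feed audio in fixed-size chunks and concatenate per-chunk outputs."""
  outputs = []
  for start in range(0, input_audio.shape[1], chunk_size):
    chunk = input_audio[:, start:start + chunk_size]
    outputs.append(model_stream.predict(chunk))
  return np.concatenate(outputs, axis=1)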
Example #6
    def build(self, input_shape):
        super(SpeechFeatures, self).build(input_shape)

        self.data_frame = data_frame.DataFrame(
            mode=self.mode,
            inference_batch_size=self.inference_batch_size,
            frame_size=self.frame_size,
            frame_step=self.frame_step)

        if self.noise_scale != 0.0 and self.mode == modes.Modes.TRAINING:
            self.add_noise = tf.keras.layers.GaussianNoise(
                stddev=self.noise_scale)
        else:
            self.add_noise = tf.keras.layers.Lambda(lambda x: x)

        if self.params['preemph'] != 0.0:
            self.preemphasis = preemphasis.Preemphasis(
                preemph=self.params['preemph'])
        else:
            self.preemphasis = tf.keras.layers.Lambda(lambda x: x)

        if self.params['window_type'] is not None:
            self.windowing = windowing.Windowing(
                window_size=self.frame_size,
                window_type=self.params['window_type'])
        else:
            self.windowing = tf.keras.layers.Lambda(lambda x: x)

        # If use_tf_fft is False, we use the Real Discrete Fourier
        # Transformation (RDFT), which is slower than RFFT.
        # To increase RDFT efficiency we use properties of the mel spectrum:
        # we find the range of non-zero values in the mel spectrum
        # and compute the RDFT only over that range to speed up computations.
        # If use_tf_fft is True, we use TF RFFT, which requires
        # signal length alignment, so we disable mel_non_zero_only.
        self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
            use_tf_fft=self.params['use_tf_fft'],
            magnitude_squared=self.params['fft_magnitude_squared'],
            num_mel_bins=self.params['mel_num_bins'],
            lower_edge_hertz=self.params['mel_lower_edge_hertz'],
            upper_edge_hertz=self.params['mel_upper_edge_hertz'],
            sample_rate=self.params['sample_rate'],
            mel_non_zero_only=self.params['mel_non_zero_only'])

        self.log_max = tf.keras.layers.Lambda(lambda x: tf.math.log(
            tf.math.maximum(x, self.params['log_epsilon'])))

        if self.params['dct_num_features'] != 0:
            self.dct = dct.DCT(num_features=self.params['dct_num_features'])
        else:
            self.dct = tf.keras.layers.Lambda(lambda x: x)

        self.normalizer = normalizer.Normalizer(mean=self.mean,
                                                stddev=self.stddev)

        # In any inference mode there is no need for dynamic logic in the tf graph.
        if self.params[
                'use_spec_augment'] and self.mode == modes.Modes.TRAINING:
            self.spec_augment = spectrogram_augment.SpecAugment(
                time_masks_number=self.params['time_masks_number'],
                time_mask_max_size=self.params['time_mask_max_size'],
                frequency_masks_number=self.params['frequency_masks_number'],
                frequency_mask_max_size=self.params['frequency_mask_max_size'])
        else:
            self.spec_augment = tf.keras.layers.Lambda(lambda x: x)
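For the non-streaming case, the magnitude-RDFT-plus-mel stage can be approximated with stock tf.signal ops. A minimal sketch with illustrative parameter values (the assumed values stand in for the params dict entries above):

import tensorflow as tf

frames = tf.random.normal([1, 98, 400])       # (batch, frames, frame_size)
spectrum = tf.abs(tf.signal.rfft(frames, fft_length=[512]))
mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=40,                          # mel_num_bins (assumed value)
    num_spectrogram_bins=spectrum.shape[-1],  # 512 // 2 + 1 = 257
    sample_rate=16000,
    lower_edge_hertz=20.0,                    # mel_lower_edge_hertz (assumed)
    upper_edge_hertz=7600.0)                  # mel_upper_edge_hertz (assumed)
mel = tf.tensordot(spectrum, mel_matrix, 1)   # (batch, frames, 40)
log_mel = tf.math.log(tf.maximum(mel, 1e-12))  # log_epsilon analogue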
Example #7
    def build(self, input_shape):
        super(SpeechFeatures, self).build(input_shape)

        if self.params[
                'sp_time_shift_samples'] != 0.0 and self.mode == modes.Modes.TRAINING:
            self.rand_shift = random_shift.RandomShift(
                self.params['sp_time_shift_samples'])
        else:
            self.rand_shift = tf.keras.layers.Lambda(lambda x: x)

        if self.params[
                'sp_resample'] != 0.0 and self.mode == modes.Modes.TRAINING:
            self.rand_stretch_squeeze = random_stretch_squeeze.RandomStretchSqueeze(
                self.params['sp_resample'])
        else:
            self.rand_stretch_squeeze = tf.keras.layers.Lambda(lambda x: x)

        self.data_frame = data_frame.DataFrame(
            mode=self.mode,
            inference_batch_size=self.inference_batch_size,
            frame_size=self.frame_size,
            frame_step=self.frame_step,
            use_one_step=self.params['use_one_step'],
            padding=self.params['data_frame_padding'])

        if self.noise_scale != 0.0 and self.mode == modes.Modes.TRAINING:
            self.add_noise = tf.keras.layers.GaussianNoise(
                stddev=self.noise_scale)
        else:
            self.add_noise = tf.keras.layers.Lambda(lambda x: x)

        if self.params['preemph'] != 0.0:
            self.preemphasis = preemphasis.Preemphasis(
                preemph=self.params['preemph'])
        else:
            self.preemphasis = tf.keras.layers.Lambda(lambda x: x)

        # If True, the direct DFT, DCT and Hann window are replaced by tf
        # functions. This is useful for model quantization, because these
        # functions will not be quantized.
        use_tf_function = self.params['use_tf_fft']
        mel_non_zero_only = self.params['mel_non_zero_only']
        window_type = self.params['window_type']

        # set mel and window type for tf function compatibility
        if use_tf_function:
            mel_non_zero_only = False
            window_type = 'hann_tf'

        if window_type is not None:
            self.windowing = windowing.Windowing(window_size=self.frame_size,
                                                 window_type=window_type)
        else:
            self.windowing = tf.keras.layers.Lambda(lambda x: x)

        # If use_tf_fft is False, we use the Real Discrete Fourier
        # Transformation (RDFT), which is slower than RFFT.
        # To increase RDFT efficiency we use properties of the mel spectrum:
        # we find the range of non-zero values in the mel spectrum
        # and compute the RDFT only over that range to speed up computations.
        # If use_tf_fft is True, we use TF RFFT, which requires
        # signal length alignment, so we disable mel_non_zero_only.
        self.mag_rdft_mel = magnitude_rdft_mel.MagnitudeRDFTmel(
            use_tf_fft=use_tf_function,
            magnitude_squared=self.params['fft_magnitude_squared'],
            num_mel_bins=self.params['mel_num_bins'],
            lower_edge_hertz=self.params['mel_lower_edge_hertz'],
            upper_edge_hertz=self.params['mel_upper_edge_hertz'],
            sample_rate=self.params['sample_rate'],
            mel_non_zero_only=mel_non_zero_only)

        self.log_max = tf.keras.layers.Lambda(lambda x: tf.math.log(
            tf.math.maximum(x, self.params['log_epsilon'])))

        if self.params['dct_num_features'] != 0:
            self.dct = dct.DCT(num_features=self.params['dct_num_features'])
        else:
            self.dct = tf.keras.layers.Lambda(lambda x: x)

        self.normalizer = normalizer.Normalizer(mean=self.mean,
                                                stddev=self.stddev)

        # In any inference mode there is no need for dynamic logic in the tf graph.
        if self.params[
                'use_spec_augment'] and self.mode == modes.Modes.TRAINING:
            self.spec_augment = spectrogram_augment.SpecAugment(
                time_masks_number=self.params['time_masks_number'],
                time_mask_max_size=self.params['time_mask_max_size'],
                frequency_masks_number=self.params['frequency_masks_number'],
                frequency_mask_max_size=self.params['frequency_mask_max_size'])
        else:
            self.spec_augment = tf.keras.layers.Lambda(lambda x: x)

        if self.params['use_spec_cutout'] and self.mode == modes.Modes.TRAINING:
            self.spec_cutout = spectrogram_cutout.SpecCutout(
                masks_number=self.params['spec_cutout_masks_number'],
                time_mask_size=self.params['spec_cutout_time_mask_size'],
                frequency_mask_size=self.params[
                    'spec_cutout_frequency_mask_size'])
        else:
            self.spec_cutout = tf.keras.layers.Lambda(lambda x: x)
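A note on the recurring pattern in both build() methods above: every optional stage falls back to an identity Lambda, so the forward pass can chain all stages unconditionally instead of branching at call time. A minimal sketch of the idea (maybe_layer is a hypothetical helper, not from the original):

import tensorflow as tf

def maybe_layer(layer, enabled):
  """Return the layer when enabled, else a pass-through identity Lambda."""
  return layer if enabled else tf.keras.layers.Lambda(lambda x: x)

add_noise = maybe_layer(tf.keras.layers.GaussianNoise(stddev=0.01),
                        enabled=False)
dropout = maybe_layer(tf.keras.layers.Dropout(rate=0.1), enabled=True)

x = tf.random.normal([2, 8])
y = dropout(add_noise(x))  # add_noise is an identity here; no branching in call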