    def setUp(self):
        super(InverseSTFTTest, self).setUp()
        test_utils.set_seed(123)

        self.frame_size = 32
        self.frame_step = 8
        # layer definition
        inverse_stft_layer = inverse_stft.InverseSTFT(self.frame_size,
                                                      self.frame_step)

        # prepare input stft data
        input_audio = tf.random.uniform((1, 256), maxval=1.0)
        signal_stft_tf = tf.signal.stft(
            input_audio,
            inverse_stft_layer.frame_size,
            inverse_stft_layer.frame_step,
            inverse_stft_layer.fft_size,
            window_fn=inverse_stft_layer.synthesis_window_fn,
            pad_end=False)
        with tf1.Session() as sess:
            self.signal_stft = sess.run(signal_stft_tf)

        self.feature_size = self.signal_stft.shape[-1]

        # create istft model and run non stream inference
        input_tf = tf.keras.layers.Input(shape=self.signal_stft.shape[1:3],
                                         batch_size=1,
                                         dtype=tf.complex64)
        net = inverse_stft_layer(input_tf)
        model_non_stream = tf.keras.models.Model(input_tf, net)
        self.non_stream_out = model_non_stream.predict(self.signal_stft)
Example #2
    def test_masking(self):
        test_utils.set_seed(self.seed)
        spectrogram = np.ones(self.input_shape)
        inputs = tf.keras.layers.Input(shape=self.input_shape[1:],
                                       batch_size=self.input_shape[0],
                                       dtype=tf.float32)
        outputs = spectrogram_cutout.SpecCutout(masks_number=2,
                                                time_mask_size=4,
                                                frequency_mask_size=2,
                                                seed=self.seed)(inputs,
                                                                training=True)
        model = tf.keras.models.Model(inputs, outputs)
        prediction = model.predict(spectrogram)
        # confirm that each mask produces different rectangles for different batch indices
        target0 = np.array([[1., 1., 1., 1., 1.], [1., 1., 1., 1., 1.],
                            [0., 1., 1., 1., 1.], [0., 1., 1., 0., 0.],
                            [0., 1., 1., 0., 0.], [0., 1., 1., 0., 0.],
                            [1., 1., 1., 0., 0.]])
        self.assertAllEqual(prediction[0, :, :], target0)

        target1 = np.array([[1., 1., 1., 1., 1.], [0., 1., 1., 1., 1.],
                            [0., 1., 1., 1., 1.], [0., 1., 0., 0., 1.],
                            [0., 1., 0., 0., 1.], [1., 1., 0., 0., 1.],
                            [1., 1., 0., 0., 1.]])
        self.assertAllEqual(prediction[1, :, :], target1)
    def setUp(self):
        super(MagnitudeRDFTmelTest, self).setUp()
        test_utils.set_seed(123)

        self.signal_size = 100
        # input signal
        self.signal = np.random.rand(1, self.signal_size)

        # model parameters
        self.use_tf_fft = False
        self.magnitude_squared = False
        self.num_mel_bins = 40
        self.lower_edge_hertz = 20.0
        self.upper_edge_hertz = 4000.0
        self.sample_rate = 16000.0

        # build rdft mel model and run it
        input_signal = tf.keras.Input(shape=(self.signal_size, ), batch_size=1)
        mag_rdft = magnitude_rdft.MagnitudeRDFT(
            use_tf_fft=self.use_tf_fft,
            magnitude_squared=self.magnitude_squared)(input_signal)
        mel_spectr = mel_spectrogram.MelSpectrogram(
            use_tf=False,
            num_mel_bins=self.num_mel_bins,
            lower_edge_hertz=self.lower_edge_hertz,
            upper_edge_hertz=self.upper_edge_hertz,
            sample_rate=self.sample_rate)(mag_rdft)
        model_rdft_mel = tf.keras.Model(input_signal, mel_spectr)
        model_rdft_mel.summary()
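        # layers[2] is the MelSpectrogram layer; its mel_weight_matrix maps linear
        # magnitude bins to mel bins, so its last dimension is expected to equal
        # num_mel_bins (40 here)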
        self.shape_rdft_mel = model_rdft_mel.layers[2].mel_weight_matrix.shape
        self.rdft_mel_output = model_rdft_mel.predict(self.signal)
Example #4
    def test_transposed_conv(self):
        """Test transposed and standard conv model with 'same' padding."""
        test_utils.set_seed(123)

        # model and data parameters
        cnn_filters = [1, 1]
        cnn_kernel_size = [5, 3]
        cnn_act = ['linear', 'linear']
        cnn_use_bias = [False, False]
        cnn_paddings = ['same', 'same']
        trans_paddings = ['same', 'causal']
        params = test_utils.Params([1], clip_duration_ms=2)

        # prepare input data
        x = np.arange(params.desired_samples)
        inp_audio = x
        inp_audio = np.expand_dims(inp_audio, 0)

        # prepare non stream model
        model = transposed_conv_model(params, cnn_filters, cnn_kernel_size,
                                      cnn_act, cnn_use_bias, cnn_paddings,
                                      trans_paddings)
        # set random weights
        all_weights = []
        for w in model.get_weights():
            if isinstance(w, np.ndarray):
                shape = w.shape
                new_w = np.random.rand(*shape)
                all_weights.append(new_w)
            else:
                all_weights.append(True)
        model.set_weights(all_weights)
        model.summary()
        non_stream_out = model.predict(inp_audio)

        # prepare streaming model
        model_stream = utils.to_streaming_inference(
            model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
        model_stream.summary()
        stream_out = inference.run_stream_inference(params, model_stream,
                                                    inp_audio)

        # shift defines the index after which data in streaming mode becomes valid:
        # streaming mode uses ring buffers initialized with zeros, and several
        # cycles are needed before they are filled with real data.
        shift = 2
        # the total conv delay is (5//2) * 2 + 3//2 = 4 + 1 = 5 frames
        # (the k=3 s=2 transposed convs add no delay, whether 'same' or 'causal'),
        # and the explicit Delay layers add the same amount again, for 10 in total.
        total_delay = 10
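        # below, the non-streaming output is trimmed by shift at the start and by
        # total_delay at the end, while the streaming output drops its first
        # total_delay + shift samples, so both slices cover the same valid region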
        # normalize output data and compare them
        non_stream_out = non_stream_out[0, shift:-(total_delay), ]
        stream_out = stream_out[0, total_delay + shift:, ]

        self.assertAllClose(stream_out, non_stream_out)
    def test_tf_vs_tf_direct(self):
        # Compare TF implementation of Mel (based on FFT)
        # vs TF direct implementation (based on FT)
        feature_size = 257
        num_mel_bins = 80
        lower_edge_hertz = 125.0
        upper_edge_hertz = 7600.0
        sample_rate = 16000.0
        batch_size = 1

        test_utils.set_seed(1)

        # generate input data
        frame = np.random.rand(batch_size, feature_size)

        # prepare model with TF implementation of Mel based on FFT
        input1 = tf.keras.layers.Input(shape=(feature_size, ),
                                       batch_size=batch_size,
                                       dtype=tf.float32)
        mel_spectrum = mel_spectrogram.MelSpectrogram(
            mode=modes.Modes.NON_STREAM_INFERENCE,
            use_tf=True,
            num_mel_bins=num_mel_bins,
            lower_edge_hertz=lower_edge_hertz,
            upper_edge_hertz=upper_edge_hertz,
            sample_rate=sample_rate)
        output1 = mel_spectrum(input1)
        model_tf = tf.keras.models.Model(input1, output1)
        # generate mel
        output_tf = model_tf.predict(frame)

        # prepare model with TF implementation of Mel based on direct FT
        input2 = tf.keras.layers.Input(shape=(feature_size, ),
                                       batch_size=batch_size,
                                       dtype=tf.float32)
        mel_spectrum_direct = mel_spectrogram.MelSpectrogram(
            mode=modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE,
            use_tf=False,
            num_mel_bins=num_mel_bins,
            lower_edge_hertz=lower_edge_hertz,
            upper_edge_hertz=upper_edge_hertz,
            sample_rate=sample_rate)
        output2 = mel_spectrum_direct(input2)
        model_tf_direct = tf.keras.models.Model(input2, output2)
        # generate mel
        output_tf_direct = model_tf_direct.predict(frame)

        self.assertAllClose(output_tf, output_tf_direct, rtol=1e-5, atol=1e-4)
Example #6
    def setUp(self):
        super().setUp()

        self.inference_batch_size = 1
        self.params = Params()
        self.frame_size = int(
            round(self.params.sample_rate * self.params.window_size_ms /
                  1000.0))
        self.frame_step = int(
            round(self.params.sample_rate * self.params.window_stride_ms /
                  1000.0))
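        # as a worked example: assuming typical defaults of sample_rate=16000,
        # window_size_ms=25.0 and window_stride_ms=10.0 (Params() is defined
        # elsewhere), this gives frame_size=400 and frame_step=160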

        # generate input signal
        test_utils.set_seed(1)
        self.data_size = 1024
        self.signal = np.random.rand(self.inference_batch_size, self.data_size)
  def setUp(self):
    super(CounterTest, self).setUp()
    test_utils.set_seed(123)
    self.time_size = 30
    self.feature_size = 3
    self.max_counter = 11
    self.input_non_stream_np = np.random.randn(1, self.time_size,
                                               self.feature_size)

    inputs = tf.keras.layers.Input(
        shape=(
            self.time_size,
            self.feature_size,
        ), batch_size=1)
    net = counter.Counter(max_counter=self.max_counter)(inputs)
    self.model = tf.keras.Model(inputs, net)
Example #8
 def test_frequency_masking(self):
     test_utils.set_seed(self.seed)
     spectrogram = np.ones(self.input_shape)
     inputs = tf.keras.layers.Input(shape=self.input_shape[1:],
                                    batch_size=self.input_shape[0],
                                    dtype=tf.float32)
     outputs = spectrogram_augment.SpecAugment(time_masks_number=0,
                                               time_mask_max_size=3,
                                               frequency_masks_number=2,
                                               frequency_mask_max_size=3)(
                                                   inputs, training=True)
     model = tf.keras.models.Model(inputs, outputs)
     prediction = model.predict(spectrogram)
     target = np.array([[[1., 0., 0., 1., 0.], [1., 0., 0., 1., 0.],
                         [1., 0., 0., 1., 0.], [1., 0., 0., 1., 0.],
                         [1., 0., 0., 1., 0.]]])
     self.assertAllEqual(prediction, target)
Example #9
    def test_sequential_to_functional(self):
        # prepare input data
        test_utils.set_seed(1)
        batch_input_shape = (1, 4, 2, 2)
        input_data = np.random.rand(np.prod(batch_input_shape))
        input_data = np.reshape(input_data, batch_input_shape)

        # create sequential model
        inputs = tf.keras.Input(batch_input_shape=batch_input_shape)
        net = SequentialModel(2)(inputs)
        model = tf.keras.Model(inputs=inputs, outputs=net)
        model.summary()

        # convert keras sequential model to functional and compare them
        func_model = utils.sequential_to_functional(model)
        func_model.summary()
        self.assertAllClose(model.predict(input_data),
                            func_model.predict(input_data))
Example #10
    def setUp(self):
        super(GRUTest, self).setUp()
        test_utils.set_seed(123)

        # generate input signal
        self.inference_batch_size = 1
        self.data_size = 32
        self.feature_size = 4
        self.signal = np.random.rand(self.inference_batch_size, self.data_size,
                                     self.feature_size)
        # create non streamable model
        inputs = tf.keras.layers.Input(shape=(self.data_size,
                                              self.feature_size),
                                       batch_size=self.inference_batch_size,
                                       dtype=tf.float32)
        self.units = 3
        outputs = gru.GRU(units=self.units, return_sequences=True)(inputs)
        self.model_non_streamable = tf.keras.Model(inputs, outputs)
        self.output_gru = self.model_non_streamable.predict(self.signal)
Example #11
    def test_random_shift(self):
        test_utils.set_seed(self.seed)
        audio = np.ones(self.input_shape)
        inputs = tf.keras.layers.Input(shape=self.input_shape[1:],
                                       batch_size=self.input_shape[0],
                                       dtype=tf.float32)
        outputs = random_shift.RandomShift(time_shift=3,
                                           seed=self.seed)(inputs,
                                                           training=True)
        model = tf.keras.models.Model(inputs, outputs)
        prediction = model.predict(audio)

        # confirm that the audio sequence is shifted left
        target0 = np.array([1., 1., 1., 1., 0., 0., 0.])
        self.assertAllEqual(prediction[0, :], target0)

        # confirm that the audio sequence is shifted right
        target1 = np.array([0., 1., 1., 1., 1., 1., 1.])
        self.assertAllEqual(prediction[1, :], target1)
  def setUp(self):
    super(DsTcResnetTest, self).setUp()

    config = tf1.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf1.Session(config=config)
    tf1.keras.backend.set_session(self.sess)
    tf.keras.backend.set_learning_phase(0)

    test_utils.set_seed(123)
    self.params = utils.ds_tc_resnet_model_params(True)

    self.model = ds_tc_resnet.model(self.params)
    self.model.summary()

    self.input_data = np.random.rand(self.params.batch_size,
                                     self.params.desired_samples)

    # run non streaming inference
    self.non_stream_out = self.model.predict(self.input_data)
Example #13
    def setUp(self):
        super(STFTTest, self).setUp()
        test_utils.set_seed(123)

        self.frame_size = 40
        self.frame_step = 10
        # layer definition
        stft_layer = stft.STFT(self.frame_size,
                               self.frame_step,
                               mode=modes.Modes.TRAINING,
                               inference_batch_size=1,
                               padding='causal')

        if stft_layer.window_type == 'hann_tf':
            synthesis_window_fn = tf.signal.hann_window
        else:
            synthesis_window_fn = None

        # prepare input data
        self.input_signal = np.random.rand(1, 120)

        # prepare default tf stft
        padding_layer = temporal_padding.TemporalPadding(
            padding_size=stft_layer.frame_size - 1, padding=stft_layer.padding)
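        # padding by frame_size - 1 samples mirrors the causal framing inside the
        # STFT layer under test, so the reference tf.signal.stft below produces
        # the same number of frames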
        # pylint: disable=g-long-lambda
        stft_default_layer = tf.keras.layers.Lambda(
            lambda x: tf.signal.stft(x,
                                     stft_layer.frame_size,
                                     stft_layer.frame_step,
                                     fft_length=stft_layer.fft_size,
                                     window_fn=synthesis_window_fn,
                                     pad_end=False))
        # pylint: enable=g-long-lambda
        input_tf = tf.keras.layers.Input(shape=(self.input_signal.shape[1], ),
                                         batch_size=1)
        net = padding_layer(input_tf)
        net = stft_default_layer(net)

        model_stft = tf.keras.models.Model(input_tf, net)

        self.stft_out = model_stft.predict(self.input_signal)
Example #14
    def test_padding(self, padding):
        batch_size = 1
        time_dim = 3
        feature_dim = 3
        kernel_size = 3
        inputs = tf.keras.layers.Input(shape=(time_dim, feature_dim),
                                       batch_size=batch_size)

        # run the layer in training mode (in streaming mode padding is not applied)
        net = stream.Stream(mode=modes.Modes.TRAINING,
                            cell=tf.keras.layers.Lambda(lambda x: x),
                            ring_buffer_size_in_time_dim=kernel_size,
                            pad_time_dim=padding)(inputs)
        model = tf.keras.Model(inputs, net)

        test_utils.set_seed(1)
        input_signal = np.random.rand(batch_size, time_dim, feature_dim)
        outputs = model.predict(input_signal)
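        # in training mode the layer pads the time dimension by kernel_size - 1
        # frames, so with time_dim=3 and kernel_size=3 the output has 5 frames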
        self.assertAllEqual(
            outputs.shape,
            [batch_size, time_dim + kernel_size - 1, feature_dim])
Example #15
    def _set_params(self, use_peepholes):
        test_utils.set_seed(123)

        # generate input signal
        self.inference_batch_size = 1
        self.data_size = 32
        self.feature_size = 4
        self.signal = np.random.rand(self.inference_batch_size, self.data_size,
                                     self.feature_size)
        # create non streamable model
        inputs = tf.keras.layers.Input(shape=(self.data_size,
                                              self.feature_size),
                                       batch_size=self.inference_batch_size,
                                       dtype=tf.float32)
        self.units = 3
        self.num_proj = 4
        outputs = lstm.LSTM(units=self.units,
                            return_sequences=True,
                            use_peepholes=use_peepholes,
                            num_proj=self.num_proj)(inputs)
        self.model_non_streamable = tf.keras.Model(inputs, outputs)
        self.output_lstm = self.model_non_streamable.predict(self.signal)
  def test_random_stretch_squeeze(self):
    test_utils.set_seed(self.seed)
    audio = np.zeros(self.input_shape)
    audio[:, 2:5,] = 1
    inputs = tf.keras.layers.Input(
        shape=self.input_shape[1:],
        batch_size=self.input_shape[0],
        dtype=tf.float32)
    outputs = random_stretch_squeeze.RandomStretchSqueeze(
        resample_offset=0.5,
        seed=self.seed)(
            inputs, training=True)
    model = tf.keras.models.Model(inputs, outputs)
    prediction = model.predict(audio)

    # confirm that data are squeezed
    target0 = np.array([0., 0., 1., 1., 0., 0., 0.])
    self.assertAllClose(prediction[0, :], target0)

    # confirm that data are stretched
    target1 = np.array([0., 0.44444, 1., 1., 1., 0.44444, 0.])
    self.assertAllClose(prediction[1, :], target1, atol=1e-4)
    def test_ds_tc_resnet_stream_internal_tflite(self):
        """Test tflite streaming with internal state."""
        test_utils.set_seed(123)
        tf.keras.backend.set_learning_phase(0)
        params = utils.ds_tc_resnet_model_params(True)

        model = ds_tc_resnet.model(params)
        model.summary()

        input_data = np.random.rand(params.batch_size, params.desired_samples)

        # run non streaming inference
        non_stream_out = model.predict(input_data)

        tflite_streaming_model = utils.model_to_tflite(
            None, model, params, Modes.STREAM_INTERNAL_STATE_INFERENCE)

        interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
        interpreter.allocate_tensors()

        stream_out = inference.run_stream_inference_classification_tflite(
            params, interpreter, input_data, input_states=None)

        self.assertAllClose(stream_out, non_stream_out, atol=1e-5)
Example #18
 def setUp(self):
     super(DelayStreamTest, self).setUp()
     test_utils.set_seed(123)
Example #19
  def test_cnn_model_end_to_end(self):

    config = tf1.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf1.Session(config=config)
    tf1.keras.backend.set_session(sess)
    test_utils.set_seed(123)

    # data parameters
    num_time_bins = 12
    feature_size = 12

    # model params.
    total_stride = 2
    params = test_utils.Params([total_stride], 0)
    params.model_name = 'cnn'
    params.cnn_filters = '2'
    params.cnn_kernel_size = '(3,3)'
    params.cnn_act = "'relu'"
    params.cnn_dilation_rate = '(1,1)'
    params.cnn_strides = '(2,2)'
    params.dropout1 = 0.5
    params.units2 = ''
    params.act2 = ''

    params.label_count = 2
    params.return_softmax = True
    params.quantize = 1  # apply quantization aware training

    params.data_shape = (num_time_bins, feature_size)
    params.preprocess = 'custom'

    model = cnn.model(params)
    model.summary()

    # prepare training and testing data
    train_images, train_labels = test_utils.generate_data(
        img_size_y=num_time_bins, img_size_x=feature_size, n_samples=32)
    test_images = train_images
    test_labels = train_labels

    # create and train quantization aware model in non streaming mode
    model.compile(
        optimizer='adam',
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        metrics=['accuracy'])
    model.fit(
        train_images,
        train_labels,
        epochs=1,
        validation_data=(test_images, test_labels))
    model.summary()

    # one test image
    train_image = train_images[:1,]

    # run tf non streaming inference
    non_stream_output_tf = model.predict(train_image)

    # specify input data shape for streaming mode
    params.data_shape = (total_stride, feature_size)
    # TODO(rybakov) add params structure for model with no feature extractor

    # prepare tf streaming model and use it to generate representative_dataset
    with quantize.quantize_scope():
      stream_quantized_model = utils.to_streaming_inference(
          model, params, Modes.STREAM_EXTERNAL_STATE_INFERENCE)

    calibration_data = prepare_calibration_data(stream_quantized_model,
                                                total_stride, train_image)

    def representative_dataset(dtype):
      def _representative_dataset_gen():
        for i in range(len(calibration_data)):
          yield [
              calibration_data[i][0].astype(dtype),  # input audio packet
              calibration_data[i][1].astype(dtype),  # conv state
              calibration_data[i][2].astype(dtype)  # flatten state
          ]

      return _representative_dataset_gen

    # convert streaming quantization aware model to tflite
    # and apply post training quantization
    with quantize.quantize_scope():
      tflite_streaming_model = utils.model_to_tflite(
          sess, model, params,
          Modes.STREAM_EXTERNAL_STATE_INFERENCE,
          optimizations=[tf.lite.Optimize.DEFAULT],
          inference_type=tf.int8,
          experimental_new_quantizer=True,
          representative_dataset=representative_dataset(np.float32))

    # run tflite in streaming mode and compare output logits with tf
    interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
    interpreter.allocate_tensors()
    input_states = []
    for detail in interpreter.get_input_details():
      input_states.append(np.zeros(detail['shape'], dtype=np.float32))
    stream_out_tflite = inference.run_stream_inference_classification_tflite(
        params, interpreter, train_image, input_states)
    self.assertAllClose(stream_out_tflite, non_stream_output_tf, atol=0.001)
Example #20
 def setUp(self):
     super(AveragePooling2DTest, self).setUp()
     test_utils.set_seed(123)
 def setUp(self):
     super(Conv1DTransposeTest, self).setUp()
     test_utils.set_seed(123)
Example #22
  def setUp(self):
    super(DsTcResnetTest, self).setUp()

    config = tf1.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf1.Session(config=config)
    tf1.keras.backend.set_session(self.sess)
    test_utils.set_seed(123)
    tf.keras.backend.set_learning_phase(0)

    # model parameters
    model_name = 'ds_tc_resnet'
    self.params = model_params.HOTWORD_MODEL_PARAMS[model_name]
    self.params.clip_duration_ms = 160
    self.params.window_size_ms = 4.0
    self.params.window_stride_ms = 2.0
    self.params.wanted_words = 'a,b,c'
    self.params.ds_padding = "'causal','causal','causal'"
    self.params.ds_filters = '8,8,4'
    self.params.ds_repeat = '1,1,1'
    self.params.ds_residual = '0,1,1'  # residual cannot be applied with stride
    self.params.ds_kernel_size = '3,3,3'
    self.params.ds_stride = '2,1,1'  # streaming conv with stride
    self.params.ds_dilation = '1,1,1'
    self.params.ds_pool = '1,2,1'  # streaming conv with pool
    self.params.ds_filter_separable = '1,1,1'

    # convert ms to samples and compute labels count
    self.params = model_flags.update_flags(self.params)

    # compute total stride
    pools = utils.parse(self.params.ds_pool)
    strides = utils.parse(self.params.ds_stride)
    time_stride = [1]
    for pool in pools:
      if pool > 1:
        time_stride.append(pool)
    for stride in strides:
      if stride > 1:
        time_stride.append(stride)
    total_stride = np.prod(time_stride)
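    # with ds_stride='2,1,1' and ds_pool='1,2,1' above, time_stride is [1, 2, 2],
    # so total_stride = 4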

    # override input data shape for streaming model with stride/pool
    self.params.data_stride = total_stride
    self.params.data_frame_padding = 'causal'

    # set desired number of frames in model
    frames_number = 16
    frames_per_call = total_stride
    frames_number = (frames_number // frames_per_call) * frames_per_call
    # number of input audio samples required to produce one output frame
    framing_stride = max(
        self.params.window_stride_samples,
        max(0, self.params.window_size_samples -
            self.params.window_stride_samples))
    signal_size = framing_stride * frames_number
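    # as a worked example: assuming a 16 kHz sample rate in HOTWORD_MODEL_PARAMS,
    # window_size_ms=4.0 and window_stride_ms=2.0 give 64 and 32 samples, so
    # framing_stride = max(32, 64 - 32) = 32 and signal_size = 32 * 16 = 512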

    # desired number of samples in the input data to train non streaming model
    self.params.desired_samples = signal_size

    self.params.batch_size = 1
    self.model = ds_tc_resnet.model(self.params)
    self.model.summary()

    self.input_data = np.random.rand(self.params.batch_size,
                                     self.params.desired_samples)

    # run non streaming inference
    self.non_stream_out = self.model.predict(self.input_data)
Example #23
 def setUp(self):
   super(Conv2DTransposeTest, self).setUp()
   test_utils.set_seed(123)
   self.input_channels = 2