def setUp(self):
  super(InverseSTFTTest, self).setUp()
  test_utils.set_seed(123)
  self.frame_size = 32
  self.frame_step = 8
  # layer definition
  inverse_stft_layer = inverse_stft.InverseSTFT(self.frame_size,
                                                self.frame_step)

  # prepare input stft data
  input_audio = tf.random.uniform((1, 256), maxval=1.0)
  signal_stft_tf = tf.signal.stft(
      input_audio,
      inverse_stft_layer.frame_size,
      inverse_stft_layer.frame_step,
      inverse_stft_layer.fft_size,
      window_fn=inverse_stft_layer.synthesis_window_fn,
      pad_end=False)
  with tf1.Session() as sess:
    self.signal_stft = sess.run(signal_stft_tf)

  self.feature_size = self.signal_stft.shape[-1]

  # create istft model and run non stream inference
  input_tf = tf.keras.layers.Input(
      shape=self.signal_stft.shape[1:3], batch_size=1, dtype=tf.complex64)
  net = inverse_stft_layer(input_tf)
  model_non_stream = tf.keras.models.Model(input_tf, net)
  self.non_stream_out = model_non_stream.predict(self.signal_stft)

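# For reference, the overlap-add reconstruction that an inverse STFT performs,
# sketched with stock tf.signal ops. This is a minimal sketch assuming
# standard TensorFlow semantics; it is not the InverseSTFT layer under test,
# just the kind of baseline it approximates.
import tensorflow as tf

audio = tf.random.uniform((1, 256), maxval=1.0)
stfts = tf.signal.stft(audio, frame_length=32, frame_step=8, fft_length=32)
reconstructed = tf.signal.inverse_stft(
    stfts, frame_length=32, frame_step=8, fft_length=32,
    window_fn=tf.signal.inverse_stft_window_fn(frame_step=8))
# Apart from edge frames, `reconstructed` closely matches `audio`.
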
def test_masking(self):
  test_utils.set_seed(self.seed)
  spectrogram = np.ones(self.input_shape)
  inputs = tf.keras.layers.Input(
      shape=self.input_shape[1:],
      batch_size=self.input_shape[0],
      dtype=tf.float32)
  outputs = spectrogram_cutout.SpecCutout(
      masks_number=2,
      time_mask_size=4,
      frequency_mask_size=2,
      seed=self.seed)(inputs, training=True)
  model = tf.keras.models.Model(inputs, outputs)
  prediction = model.predict(spectrogram)

  # confirm that the masks cut different rectangles in each batch index
  target0 = np.array([[1., 1., 1., 1., 1.],
                      [1., 1., 1., 1., 1.],
                      [0., 1., 1., 1., 1.],
                      [0., 1., 1., 0., 0.],
                      [0., 1., 1., 0., 0.],
                      [0., 1., 1., 0., 0.],
                      [1., 1., 1., 0., 0.]])
  self.assertAllEqual(prediction[0, :, :], target0)

  target1 = np.array([[1., 1., 1., 1., 1.],
                      [0., 1., 1., 1., 1.],
                      [0., 1., 1., 1., 1.],
                      [0., 1., 0., 0., 1.],
                      [0., 1., 0., 0., 1.],
                      [1., 1., 0., 0., 1.],
                      [1., 1., 0., 0., 1.]])
  self.assertAllEqual(prediction[1, :, :], target1)

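# A minimal NumPy sketch of the cutout idea SpecCutout implements: zero out a
# random time x frequency rectangle. The mask-position sampling here is
# illustrative, not the layer's exact scheme.
import numpy as np

def cutout(spec, time_mask_size=4, frequency_mask_size=2, rng=None):
  rng = rng or np.random.default_rng(0)
  time_dim, freq_dim = spec.shape
  t0 = rng.integers(0, max(1, time_dim - time_mask_size + 1))
  f0 = rng.integers(0, max(1, freq_dim - frequency_mask_size + 1))
  out = spec.copy()
  out[t0:t0 + time_mask_size, f0:f0 + frequency_mask_size] = 0.0
  return out
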
def setUp(self):
  super(MagnitudeRDFTmelTest, self).setUp()
  test_utils.set_seed(123)

  self.signal_size = 100
  # input signal
  self.signal = np.random.rand(1, self.signal_size)

  # model parameters
  self.use_tf_fft = False
  self.magnitude_squared = False
  self.num_mel_bins = 40
  self.lower_edge_hertz = 20.0
  self.upper_edge_hertz = 4000.0
  self.sample_rate = 16000.0

  # build rdft mel model and run it
  input_signal = tf.keras.Input(shape=(self.signal_size,), batch_size=1)
  mag_rdft = magnitude_rdft.MagnitudeRDFT(
      use_tf_fft=self.use_tf_fft,
      magnitude_squared=self.magnitude_squared)(input_signal)
  mel_spectr = mel_spectrogram.MelSpectrogram(
      use_tf=False,
      num_mel_bins=self.num_mel_bins,
      lower_edge_hertz=self.lower_edge_hertz,
      upper_edge_hertz=self.upper_edge_hertz,
      sample_rate=self.sample_rate)(mag_rdft)
  model_rdft_mel = tf.keras.Model(input_signal, mel_spectr)
  model_rdft_mel.summary()
  self.shape_rdft_mel = model_rdft_mel.layers[2].mel_weight_matrix.shape
  self.rdft_mel_output = model_rdft_mel.predict(self.signal)

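# An RDFT layer computes the Fourier transform as a matrix multiply (a direct
# real DFT), which is convenient on targets without FFT support. A minimal
# NumPy sketch of that idea; the layer's exact matrix construction may differ.
import numpy as np

def magnitude_rdft(signal, fft_size=None):
  n = signal.shape[-1]
  fft_size = fft_size or n
  k = np.arange(fft_size // 2 + 1)[:, None]  # retained real-FFT bins
  t = np.arange(n)[None, :]
  real_dft = np.cos(2 * np.pi * k * t / fft_size)
  imag_dft = -np.sin(2 * np.pi * k * t / fft_size)
  real = signal @ real_dft.T
  imag = signal @ imag_dft.T
  return np.sqrt(real**2 + imag**2)  # matches np.abs(np.fft.rfft(signal, n))
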
def test_transposed_conv(self):
  """Test transposed and standard conv model with 'same' padding."""
  test_utils.set_seed(123)

  # model and data parameters
  cnn_filters = [1, 1]
  cnn_kernel_size = [5, 3]
  cnn_act = ['linear', 'linear']
  cnn_use_bias = [False, False]
  cnn_paddings = ['same', 'same']
  trans_paddings = ['same', 'causal']
  params = test_utils.Params([1], clip_duration_ms=2)

  # prepare input data
  x = np.arange(params.desired_samples)
  inp_audio = np.expand_dims(x, 0)

  # prepare non stream model
  model = transposed_conv_model(params, cnn_filters, cnn_kernel_size, cnn_act,
                                cnn_use_bias, cnn_paddings, trans_paddings)

  # set random weights
  all_weights = []
  for w in model.get_weights():
    if isinstance(w, np.ndarray):
      all_weights.append(np.random.rand(*w.shape))
    else:
      all_weights.append(True)
  model.set_weights(all_weights)
  model.summary()
  non_stream_out = model.predict(inp_audio)

  # prepare streaming model
  model_stream = utils.to_streaming_inference(
      model, params, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
  model_stream.summary()
  stream_out = inference.run_stream_inference(params, model_stream, inp_audio)

  # shift defines the index after which data in streaming mode become valid:
  # in streaming mode we use ring buffers initialized with zeros, and it takes
  # several cycles until they are filled with real data.
  shift = 2
  # The total conv delay is (5//2) * 2 + 3//2 = 5
  # (there is no delay from the k=3, s=2 transposed convs, 'same' or 'causal'),
  # and the explicit Delay layers add the same amount again.
  total_delay = 10
  # align the valid regions of both outputs and compare them
  non_stream_out = non_stream_out[0, shift:-total_delay, :]
  stream_out = stream_out[0, total_delay + shift:, :]
  self.assertAllClose(stream_out, non_stream_out)

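# The delay bookkeeping above, spelled out as a tiny self-check. This only
# restates the values from the comments (two 'same' convs with kernels 5 and
# 3, each delaying by kernel_size // 2, the first scaled by the x2 upsampling,
# mirrored once more by the explicit Delay layers):
conv_delay = (5 // 2) * 2 + 3 // 2  # = 5
assert conv_delay == 5
assert 2 * conv_delay == 10  # total_delay used in the test
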
def test_tf_vs_tf_direct(self):
  # Compare TF implementation of Mel (based on FFT)
  # vs TF direct implementation (based on FT).
  feature_size = 257
  num_mel_bins = 80
  lower_edge_hertz = 125.0
  upper_edge_hertz = 7600.0
  sample_rate = 16000.0
  batch_size = 1

  test_utils.set_seed(1)
  # generate input data
  frame = np.random.rand(batch_size, feature_size)

  # prepare model with TF implementation of Mel based on FFT
  input1 = tf.keras.layers.Input(
      shape=(feature_size,), batch_size=batch_size, dtype=tf.float32)
  mel_spectrum = mel_spectrogram.MelSpectrogram(
      mode=modes.Modes.NON_STREAM_INFERENCE,
      use_tf=True,
      num_mel_bins=num_mel_bins,
      lower_edge_hertz=lower_edge_hertz,
      upper_edge_hertz=upper_edge_hertz,
      sample_rate=sample_rate)
  output1 = mel_spectrum(input1)
  model_tf = tf.keras.models.Model(input1, output1)
  # generate mel
  output_tf = model_tf.predict(frame)

  # prepare model with TF implementation of Mel based on direct FT
  input2 = tf.keras.layers.Input(
      shape=(feature_size,), batch_size=batch_size, dtype=tf.float32)
  mel_spectrum_direct = mel_spectrogram.MelSpectrogram(
      mode=modes.Modes.STREAM_EXTERNAL_STATE_INFERENCE,
      use_tf=False,
      num_mel_bins=num_mel_bins,
      lower_edge_hertz=lower_edge_hertz,
      upper_edge_hertz=upper_edge_hertz,
      sample_rate=sample_rate)
  output2 = mel_spectrum_direct(input2)
  model_tf_direct = tf.keras.models.Model(input2, output2)
  # generate mel
  output_tf_direct = model_tf_direct.predict(frame)

  self.assertAllClose(output_tf, output_tf_direct, rtol=1e-5, atol=1e-4)

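# The FFT-based path above ultimately applies a linear-to-mel weight matrix to
# a magnitude spectrum. A minimal sketch with stock TensorFlow ops (assuming
# standard tf.signal semantics; the project's MelSpectrogram layer adds
# streaming modes on top of this):
import numpy as np
import tensorflow as tf

spectrum = tf.constant(np.random.rand(1, 257), dtype=tf.float32)
mel_matrix = tf.signal.linear_to_mel_weight_matrix(
    num_mel_bins=80,
    num_spectrogram_bins=257,
    sample_rate=16000.0,
    lower_edge_hertz=125.0,
    upper_edge_hertz=7600.0)
mel = tf.matmul(spectrum, mel_matrix)  # shape (1, 80)
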
def setUp(self):
  super().setUp()
  self.inference_batch_size = 1
  self.params = Params()
  self.frame_size = int(
      round(self.params.sample_rate * self.params.window_size_ms / 1000.0))
  self.frame_step = int(
      round(self.params.sample_rate * self.params.window_stride_ms / 1000.0))

  # generate input signal
  test_utils.set_seed(1)
  self.data_size = 1024
  self.signal = np.random.rand(self.inference_batch_size, self.data_size)

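# Worked example of the ms-to-samples conversion above. The 16 kHz / 25 ms /
# 10 ms values are illustrative assumptions, not necessarily what Params()
# holds:
sample_rate = 16000.0
window_size_ms = 25.0
window_stride_ms = 10.0
frame_size = int(round(sample_rate * window_size_ms / 1000.0))    # 400 samples
frame_step = int(round(sample_rate * window_stride_ms / 1000.0))  # 160 samples
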
def setUp(self):
  super(CounterTest, self).setUp()
  test_utils.set_seed(123)
  self.time_size = 30
  self.feature_size = 3
  self.max_counter = 11
  self.input_non_stream_np = np.random.randn(1, self.time_size,
                                             self.feature_size)
  inputs = tf.keras.layers.Input(
      shape=(self.time_size, self.feature_size), batch_size=1)
  net = counter.Counter(max_counter=self.max_counter)(inputs)
  self.model = tf.keras.Model(inputs, net)

def test_frequency_masking(self):
  test_utils.set_seed(self.seed)
  spectrogram = np.ones(self.input_shape)
  inputs = tf.keras.layers.Input(
      shape=self.input_shape[1:],
      batch_size=self.input_shape[0],
      dtype=tf.float32)
  outputs = spectrogram_augment.SpecAugment(
      time_masks_number=0,
      time_mask_max_size=3,
      frequency_masks_number=2,
      frequency_mask_max_size=3)(inputs, training=True)
  model = tf.keras.models.Model(inputs, outputs)
  prediction = model.predict(spectrogram)
  target = np.array([[[1., 0., 0., 1., 0.],
                      [1., 0., 0., 1., 0.],
                      [1., 0., 0., 1., 0.],
                      [1., 0., 0., 1., 0.],
                      [1., 0., 0., 1., 0.]]])
  self.assertAllEqual(prediction, target)

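# A minimal NumPy sketch of SpecAugment-style frequency masking: zero out
# whole frequency bands across all time steps (hence the identical rows in
# `target` above). The band sampling here is illustrative, not the layer's
# exact implementation.
import numpy as np

def frequency_mask(spec, masks_number=2, mask_max_size=3, rng=None):
  rng = rng or np.random.default_rng(0)
  out = spec.copy()
  freq_dim = spec.shape[-1]
  for _ in range(masks_number):
    size = rng.integers(1, mask_max_size + 1)
    f0 = rng.integers(0, max(1, freq_dim - size + 1))
    out[..., f0:f0 + size] = 0.0  # same band masked at every time step
  return out
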
def test_sequential_to_functional(self):
  # prepare input data
  test_utils.set_seed(1)
  batch_input_shape = (1, 4, 2, 2)
  input_data = np.random.rand(np.prod(batch_input_shape))
  input_data = np.reshape(input_data, batch_input_shape)

  # create sequential model
  inputs = tf.keras.Input(batch_input_shape=batch_input_shape)
  net = SequentialModel(2)(inputs)
  model = tf.keras.Model(inputs=inputs, outputs=net)
  model.summary()

  # convert keras sequential model to functional and compare them
  func_model = utils.sequential_to_functional(model)
  func_model.summary()
  self.assertAllClose(
      model.predict(input_data), func_model.predict(input_data))

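# The usual way to flatten a nested tf.keras.Sequential into a functional
# graph is to replay its layers on a symbolic input. A minimal sketch of that
# pattern; utils.sequential_to_functional may differ in details.
import tensorflow as tf

def sequential_to_functional_sketch(seq, batch_input_shape):
  inputs = tf.keras.Input(batch_input_shape=batch_input_shape)
  x = inputs
  for layer in seq.layers:  # re-invoke each layer to build a flat graph
    x = layer(x)
  return tf.keras.Model(inputs, x)
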
def setUp(self):
  super(GRUTest, self).setUp()
  test_utils.set_seed(123)

  # generate input signal
  self.inference_batch_size = 1
  self.data_size = 32
  self.feature_size = 4
  self.signal = np.random.rand(self.inference_batch_size, self.data_size,
                               self.feature_size)

  # create non streamable model
  inputs = tf.keras.layers.Input(
      shape=(self.data_size, self.feature_size),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  self.units = 3
  outputs = gru.GRU(units=self.units, return_sequences=True)(inputs)
  self.model_non_streamable = tf.keras.Model(inputs, outputs)
  self.output_gru = self.model_non_streamable.predict(self.signal)

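# Streaming an RNN means carrying its state across calls. A minimal sketch of
# the equivalence such tests rely on, using a stock Keras GRU (not the
# project's wrapper): running the full sequence at once matches stepping one
# frame at a time while threading the state through.
import numpy as np
import tensorflow as tf

gru_cell = tf.keras.layers.GRU(3, return_sequences=True, return_state=True)
x = np.random.rand(1, 32, 4).astype(np.float32)
full_out, _ = gru_cell(x)
state = tf.zeros((1, 3))
steps = []
for t in range(32):
  out, state = gru_cell(x[:, t:t + 1, :], initial_state=state)
  steps.append(out)
np.testing.assert_allclose(
    full_out.numpy(), tf.concat(steps, axis=1).numpy(), atol=1e-5)
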
def test_random_shift(self):
  test_utils.set_seed(self.seed)
  audio = np.ones(self.input_shape)
  inputs = tf.keras.layers.Input(
      shape=self.input_shape[1:],
      batch_size=self.input_shape[0],
      dtype=tf.float32)
  outputs = random_shift.RandomShift(
      time_shift=3, seed=self.seed)(inputs, training=True)
  model = tf.keras.models.Model(inputs, outputs)
  prediction = model.predict(audio)

  # confirm that audio sequence is shifted left
  target0 = np.array([1., 1., 1., 1., 0., 0., 0.])
  self.assertAllEqual(prediction[0, :], target0)

  # confirm that audio sequence is shifted right
  target1 = np.array([0., 1., 1., 1., 1., 1., 1.])
  self.assertAllEqual(prediction[1, :], target1)

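# A minimal NumPy sketch of a time shift with zero fill, the operation the
# targets above encode (the layer's shift sampling may differ):
import numpy as np

def shift_with_zero_fill(x, shift):
  out = np.zeros_like(x)
  if shift > 0:    # shift right
    out[shift:] = x[:-shift]
  elif shift < 0:  # shift left
    out[:shift] = x[-shift:]
  else:
    out[:] = x
  return out

shift_with_zero_fill(np.ones(7), -3)  # -> [1., 1., 1., 1., 0., 0., 0.]
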
def setUp(self):
  super(DsTcResnetTest, self).setUp()

  config = tf1.ConfigProto()
  config.gpu_options.allow_growth = True
  self.sess = tf1.Session(config=config)
  tf1.keras.backend.set_session(self.sess)
  tf.keras.backend.set_learning_phase(0)
  test_utils.set_seed(123)

  self.params = utils.ds_tc_resnet_model_params(True)

  self.model = ds_tc_resnet.model(self.params)
  self.model.summary()

  self.input_data = np.random.rand(self.params.batch_size,
                                   self.params.desired_samples)

  # run non streaming inference
  self.non_stream_out = self.model.predict(self.input_data)

def setUp(self):
  super(STFTTest, self).setUp()
  test_utils.set_seed(123)
  self.frame_size = 40
  self.frame_step = 10
  # layer definition
  stft_layer = stft.STFT(
      self.frame_size,
      self.frame_step,
      mode=modes.Modes.TRAINING,
      inference_batch_size=1,
      padding='causal')
  if stft_layer.window_type == 'hann_tf':
    synthesis_window_fn = tf.signal.hann_window
  else:
    synthesis_window_fn = None

  # prepare input data
  self.input_signal = np.random.rand(1, 120)

  # prepare default tf stft
  padding_layer = temporal_padding.TemporalPadding(
      padding_size=stft_layer.frame_size - 1, padding=stft_layer.padding)
  # pylint: disable=g-long-lambda
  stft_default_layer = tf.keras.layers.Lambda(
      lambda x: tf.signal.stft(
          x,
          stft_layer.frame_size,
          stft_layer.frame_step,
          fft_length=stft_layer.fft_size,
          window_fn=synthesis_window_fn,
          pad_end=False))
  # pylint: enable=g-long-lambda
  input_tf = tf.keras.layers.Input(
      shape=(self.input_signal.shape[1],), batch_size=1)
  net = padding_layer(input_tf)
  net = stft_default_layer(net)
  model_stft = tf.keras.models.Model(input_tf, net)

  self.stft_out = model_stft.predict(self.input_signal)

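# Causal temporal padding, sketched with stock ops: prepend frame_size - 1
# zeros so every STFT frame only looks at current and past samples. A minimal
# illustration of the padding semantics assumed above:
import tensorflow as tf

signal = tf.random.uniform((1, 120))
frame_size, frame_step = 40, 10
padded = tf.pad(signal, [[0, 0], [frame_size - 1, 0]])  # left-pad time dim
frames = tf.signal.stft(padded, frame_size, frame_step, pad_end=False)
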
def test_padding(self, padding):
  batch_size = 1
  time_dim = 3
  feature_dim = 3
  kernel_size = 3
  inputs = tf.keras.layers.Input(
      shape=(time_dim, feature_dim), batch_size=batch_size)

  # set it in train mode (in stream mode padding is not applied)
  net = stream.Stream(
      mode=modes.Modes.TRAINING,
      cell=tf.keras.layers.Lambda(lambda x: x),
      ring_buffer_size_in_time_dim=kernel_size,
      pad_time_dim=padding)(inputs)
  model = tf.keras.Model(inputs, net)

  test_utils.set_seed(1)
  input_signal = np.random.rand(batch_size, time_dim, feature_dim)
  outputs = model.predict(input_signal)
  self.assertAllEqual(outputs.shape,
                      [batch_size, time_dim + kernel_size - 1, feature_dim])

def _set_params(self, use_peepholes):
  test_utils.set_seed(123)

  # generate input signal
  self.inference_batch_size = 1
  self.data_size = 32
  self.feature_size = 4
  self.signal = np.random.rand(self.inference_batch_size, self.data_size,
                               self.feature_size)

  # create non streamable model
  inputs = tf.keras.layers.Input(
      shape=(self.data_size, self.feature_size),
      batch_size=self.inference_batch_size,
      dtype=tf.float32)
  self.units = 3
  self.num_proj = 4
  outputs = lstm.LSTM(
      units=self.units,
      return_sequences=True,
      use_peepholes=use_peepholes,
      num_proj=self.num_proj)(inputs)
  self.model_non_streamable = tf.keras.Model(inputs, outputs)
  self.output_lstm = self.model_non_streamable.predict(self.signal)

def test_random_stretch_squeeze(self):
  test_utils.set_seed(self.seed)
  audio = np.zeros(self.input_shape)
  audio[:, 2:5] = 1
  inputs = tf.keras.layers.Input(
      shape=self.input_shape[1:],
      batch_size=self.input_shape[0],
      dtype=tf.float32)
  outputs = random_stretch_squeeze.RandomStretchSqueeze(
      resample_offset=0.5, seed=self.seed)(inputs, training=True)
  model = tf.keras.models.Model(inputs, outputs)
  prediction = model.predict(audio)

  # confirm that data are squeezed
  target0 = np.array([0., 0., 1., 1., 0., 0., 0.])
  self.assertAllClose(prediction[0, :], target0)

  # confirm that data are stretched
  target1 = np.array([0., 0.44444, 1., 1., 1., 0.44444, 0.])
  self.assertAllClose(prediction[1, :], target1, atol=1e-4)

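# Stretching or squeezing a sequence is 1-D resampling. A minimal NumPy sketch
# of the idea, anchored at the start of the signal for simplicity; the layer's
# exact interpolation, anchoring, and padding may differ.
import numpy as np

def resample_1d(x, factor):
  n = len(x)
  # sample the original signal at n points spaced 1/factor apart:
  # factor > 1 stretches the content, factor < 1 squeezes it
  positions = np.linspace(0, (n - 1) / factor, n)
  return np.interp(positions, np.arange(n), x, left=0.0, right=0.0)
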
def test_ds_tc_resnet_stream_internal_tflite(self):
  """Test tflite streaming with internal state."""
  test_utils.set_seed(123)
  tf.keras.backend.set_learning_phase(0)
  params = utils.ds_tc_resnet_model_params(True)

  model = ds_tc_resnet.model(params)
  model.summary()

  input_data = np.random.rand(params.batch_size, params.desired_samples)

  # run non streaming inference
  non_stream_out = model.predict(input_data)

  tflite_streaming_model = utils.model_to_tflite(
      None, model, params, Modes.STREAM_INTERNAL_STATE_INFERENCE)

  interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
  interpreter.allocate_tensors()
  stream_out = inference.run_stream_inference_classification_tflite(
      params, interpreter, input_data, input_states=None)
  self.assertAllClose(stream_out, non_stream_out, atol=1e-5)

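# The streaming helper above boils down to invoking the TFLite interpreter
# once per audio packet. A minimal sketch with the stock tf.lite API (packet
# slicing is illustrative; with internal state the model keeps its own ring
# buffers between calls):
import numpy as np
import tensorflow as tf

def stream_tflite(interpreter, audio, packet_size):
  input_detail = interpreter.get_input_details()[0]
  output_detail = interpreter.get_output_details()[0]
  outputs = []
  for start in range(0, audio.shape[1], packet_size):
    packet = audio[:, start:start + packet_size].astype(np.float32)
    interpreter.set_tensor(input_detail['index'], packet)
    interpreter.invoke()
    outputs.append(interpreter.get_tensor(output_detail['index']))
  return outputs
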
def setUp(self):
  super(DelayStreamTest, self).setUp()
  test_utils.set_seed(123)

def test_cnn_model_end_to_end(self):

  config = tf1.ConfigProto()
  config.gpu_options.allow_growth = True
  sess = tf1.Session(config=config)
  tf1.keras.backend.set_session(sess)
  test_utils.set_seed(123)

  # data parameters
  num_time_bins = 12
  feature_size = 12

  # model params
  total_stride = 2
  params = test_utils.Params([total_stride], 0)
  params.model_name = 'cnn'
  params.cnn_filters = '2'
  params.cnn_kernel_size = '(3,3)'
  params.cnn_act = "'relu'"
  params.cnn_dilation_rate = '(1,1)'
  params.cnn_strides = '(2,2)'
  params.dropout1 = 0.5
  params.units2 = ''
  params.act2 = ''

  params.label_count = 2
  params.return_softmax = True
  params.quantize = 1  # apply quantization aware training

  params.data_shape = (num_time_bins, feature_size)
  params.preprocess = 'custom'

  model = cnn.model(params)
  model.summary()

  # prepare training and testing data
  train_images, train_labels = test_utils.generate_data(
      img_size_y=num_time_bins, img_size_x=feature_size, n_samples=32)
  test_images = train_images
  test_labels = train_labels

  # create and train quantization aware model in non streaming mode
  model.compile(
      optimizer='adam',
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=['accuracy'])
  model.fit(
      train_images,
      train_labels,
      epochs=1,
      validation_data=(test_images, test_labels))
  model.summary()

  # one test image
  train_image = train_images[:1]

  # run tf non streaming inference
  non_stream_output_tf = model.predict(train_image)

  # specify input data shape for streaming mode
  params.data_shape = (total_stride, feature_size)
  # TODO(rybakov) add params structure for model with no feature extractor

  # prepare tf streaming model and use it to generate representative_dataset
  with quantize.quantize_scope():
    stream_quantized_model = utils.to_streaming_inference(
        model, params, Modes.STREAM_EXTERNAL_STATE_INFERENCE)

  calibration_data = prepare_calibration_data(stream_quantized_model,
                                              total_stride, train_image)

  def representative_dataset(dtype):
    def _representative_dataset_gen():
      for i in range(len(calibration_data)):
        yield [
            calibration_data[i][0].astype(dtype),  # input audio packet
            calibration_data[i][1].astype(dtype),  # conv state
            calibration_data[i][2].astype(dtype)   # flatten state
        ]
    return _representative_dataset_gen

  # convert streaming quantization aware model to tflite
  # and apply post training quantization
  with quantize.quantize_scope():
    tflite_streaming_model = utils.model_to_tflite(
        sess,
        model,
        params,
        Modes.STREAM_EXTERNAL_STATE_INFERENCE,
        optimizations=[tf.lite.Optimize.DEFAULT],
        inference_type=tf.int8,
        experimental_new_quantizer=True,
        representative_dataset=representative_dataset(np.float32))

  # run tflite in streaming mode and compare output logits with tf
  interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
  interpreter.allocate_tensors()
  input_states = []
  for detail in interpreter.get_input_details():
    input_states.append(np.zeros(detail['shape'], dtype=np.float32))
  stream_out_tflite = inference.run_stream_inference_classification_tflite(
      params, interpreter, train_image, input_states)

  self.assertAllClose(stream_out_tflite, non_stream_output_tf, atol=0.001)

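# For context: a representative dataset is how stock TFLite post-training
# quantization calibrates activation ranges. A minimal sketch with the
# standard converter API; utils.model_to_tflite wraps a flow like this, plus
# the streaming-specific state inputs seen above.
import numpy as np
import tensorflow as tf

def convert_int8(model, calibration_inputs):
  converter = tf.lite.TFLiteConverter.from_keras_model(model)
  converter.optimizations = [tf.lite.Optimize.DEFAULT]
  def representative_dataset():
    for sample in calibration_inputs:  # iterate over float32 samples
      yield [sample.astype(np.float32)]
  converter.representative_dataset = representative_dataset
  return converter.convert()
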
def setUp(self):
  super(AveragePooling2DTest, self).setUp()
  test_utils.set_seed(123)

def setUp(self):
  super(Conv1DTransposeTest, self).setUp()
  test_utils.set_seed(123)

def setUp(self):
  super(DsTcResnetTest, self).setUp()

  config = tf1.ConfigProto()
  config.gpu_options.allow_growth = True
  self.sess = tf1.Session(config=config)
  tf1.keras.backend.set_session(self.sess)
  test_utils.set_seed(123)
  tf.keras.backend.set_learning_phase(0)

  # model parameters
  model_name = 'ds_tc_resnet'
  self.params = model_params.HOTWORD_MODEL_PARAMS[model_name]
  self.params.clip_duration_ms = 160
  self.params.window_size_ms = 4.0
  self.params.window_stride_ms = 2.0
  self.params.wanted_words = 'a,b,c'
  self.params.ds_padding = "'causal','causal','causal'"
  self.params.ds_filters = '8,8,4'
  self.params.ds_repeat = '1,1,1'
  self.params.ds_residual = '0,1,1'  # residual cannot be applied with stride
  self.params.ds_kernel_size = '3,3,3'
  self.params.ds_stride = '2,1,1'  # streaming conv with stride
  self.params.ds_dilation = '1,1,1'
  self.params.ds_pool = '1,2,1'  # streaming conv with pool
  self.params.ds_filter_separable = '1,1,1'

  # convert ms to samples and compute labels count
  self.params = model_flags.update_flags(self.params)

  # compute total stride
  pools = utils.parse(self.params.ds_pool)
  strides = utils.parse(self.params.ds_stride)
  time_stride = [1]
  for pool in pools:
    if pool > 1:
      time_stride.append(pool)
  for stride in strides:
    if stride > 1:
      time_stride.append(stride)
  total_stride = np.prod(time_stride)

  # override input data shape for streaming model with stride/pool
  self.params.data_stride = total_stride
  self.params.data_frame_padding = 'causal'

  # set desired number of frames in model
  frames_number = 16
  frames_per_call = total_stride
  frames_number = (frames_number // frames_per_call) * frames_per_call
  # number of input audio samples required to produce one output frame
  framing_stride = max(
      self.params.window_stride_samples,
      max(0, self.params.window_size_samples -
          self.params.window_stride_samples))
  signal_size = framing_stride * frames_number

  # desired number of samples in the input data to train non streaming model
  self.params.desired_samples = signal_size
  self.params.batch_size = 1

  self.model = ds_tc_resnet.model(self.params)
  self.model.summary()

  self.input_data = np.random.rand(self.params.batch_size,
                                   self.params.desired_samples)

  # run non streaming inference
  self.non_stream_out = self.model.predict(self.input_data)

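# The stride arithmetic above, worked through with the values set in this
# setUp (pool '1,2,1' contributes one factor of 2, stride '2,1,1' another):
import numpy as np

time_stride = [1, 2, 2]              # from pools > 1 and strides > 1
total_stride = np.prod(time_stride)  # = 4
frames_number = (16 // total_stride) * total_stride  # = 16, already aligned
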
def setUp(self):
  super(Conv2DTransposeTest, self).setUp()
  test_utils.set_seed(123)
  self.input_channels = 2
