def testModelEndToEnd(self, model_fn):
  # 1. Check whether quantized model graph can be constructed.
  model = model_fn(self)
  model = quantize.quantize_model(model)

  # 2. Sanity check to ensure basic training on random data works.
  x_train, y_train = self._create_test_data(model)
  model.compile(loss='mse', optimizer='sgd', metrics=['accuracy'])
  model.fit(x_train, y_train, epochs=100)

  x_test, y_test = self._create_test_data(model)
  y_tf = model.predict(x_test)

  # 3. Ensure conversion to TFLite works.
  _, tflite_file = tempfile.mkstemp('.tflite')
  print('TFLite File: ', tflite_file)
  with quantize.quantize_scope():
    utils.convert_keras_to_tflite(model, tflite_file)

  # 4. Verify input runs on converted model.
  y_tfl = self._execute_tflite(tflite_file, x_test, y_test)

  # 5. Verify results are the same in TF and TFL.
  # TODO(pulkitb): Temporarily raise tolerances since some rounding
  # changes in x86 kernels are causing values to differ by 'scale'.
  self.assertAllClose(y_tf, y_tfl, atol=1e-1, rtol=1e-1)
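The `_execute_tflite` helper referenced above is defined elsewhere in the test harness. A minimal sketch of what it might look like, assuming a single-input, single-output model converted with batch size 1; only the name and signature come from the call site, the body here is an assumption (`y_test` is unused in this sketch):

def _execute_tflite(self, tflite_file, x_test, y_test):
  # Standard TFLite interpreter invocation, one sample at a time.
  interpreter = tf.lite.Interpreter(model_path=tflite_file)
  interpreter.allocate_tensors()
  input_index = interpreter.get_input_details()[0]['index']
  output_index = interpreter.get_output_details()[0]['index']

  outputs = []
  for x in x_test:
    # Add the batch dimension the converted model expects.
    interpreter.set_tensor(
        input_index, np.expand_dims(x, axis=0).astype(np.float32))
    interpreter.invoke()
    outputs.append(interpreter.get_tensor(output_index))
  return np.concatenate(outputs)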
def measure_sparsity(model):
  # Note: the assert message must be a plain string; passing `log.info(...)`
  # there would log unconditionally and supply None as the message.
  assert quantize_base.SET_CUSTOM_TNH_FLAG, (
      "TFMOD needs to be modified with quantizer disabled for proper "
      "running")

  # Helper function uses `quantize_annotate_layer` to annotate every layer
  # with a `SparsityMeter` config so its sparsity can be measured.
  def add_sparsity_annotation(layer):
    quantize_config = SparsityMeter()
    log.info("**Sparsity Measure annotation added to layer {} with {}".format(
        layer.name, quantize_config))
    quantized_layer = quantize_annotate_layer(
        to_annotate=layer, quantize_config=quantize_config)
    return quantized_layer

  log.info("Annotating model {}".format(model.name))
  tf.keras.backend.clear_session()
  annotated_model = tf.keras.models.clone_model(
      model, clone_function=add_sparsity_annotation)

  with quantize_scope({
      'SparsityMeter': SparsityMeter,
      'ActivSparsityMeasure': ActivSparsityMeasure,
      'WeightsSparsityMeasure': WeightsSparsityMeasure
  }):
    # Use `quantize_apply` to actually make the model Sparsity Measure aware.
    quant_aware_model = quantize_apply(annotated_model)

  return quant_aware_model
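A hedged usage sketch: since the `SparsityMeter` configs are wired in as quantizers, plain forward passes through the returned model let them observe activations and weights. `model` and `x_sample` are assumed to exist; `SparsityMeter`, `ActivSparsityMeasure`, and `WeightsSparsityMeasure` are project-specific classes defined elsewhere.

# Hypothetical usage: run inference so the sparsity meters see real data.
meter_model = measure_sparsity(model)
meter_model.predict(x_sample)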
def testQuantizeApply_KeepTrainableWeightOrder(self):
  layer = self.CustomConvLayer(input_shape=(28, 28, 3))
  model = keras.Sequential([layer])

  def apply_quantization_to_dense(layer):
    if isinstance(layer, self.CustomConvLayer):
      return quantize_annotate_layer(
          layer, quantize_config=self.CustomConvQuantizeConfig())
    return layer

  annotated_model = tf.keras.models.clone_model(
      model,
      clone_function=apply_quantization_to_dense,
  )

  with quantize.quantize_scope({
      'CustomConvQuantizeConfig': self.CustomConvQuantizeConfig,
      'CustomConvLayer': self.CustomConvLayer
  }):
    quant_aware_model = quantize_apply(annotated_model)

  self._assert_weights_different_objects(
      model.trainable_weights, quant_aware_model.trainable_weights)
  self._assert_weights_equal_value(model.trainable_weights,
                                   quant_aware_model.trainable_weights)
def apply_quantization(model):
  # Helper function uses `quantize_annotate_layer` to annotate that only the
  # layer types listed in `kernelization_map` (currently Conv2D) should be
  # quantized.
  def add_quantize_annotation(layer):
    kernelization_map = [
        # tf.keras.layers.Dense,
        tf.keras.layers.Conv2D
    ]
    for layer_type in kernelization_map:
      if isinstance(layer, layer_type):
        quantize_config = SLCQuantizeConfig()
        log.info("**Kernelization annotation added to layer {} of type {} "
                 "with {}".format(layer.name, layer_type, quantize_config))
        quantized_layer = quantize_annotate_layer(
            to_annotate=layer, quantize_config=quantize_config)
        return quantized_layer
    log.info("**Kernelization annotation not added to layer {} of type {}"
             .format(layer.name, type(layer)))
    return layer

  # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
  # to the layers of the model.
  log.info("Annotating model {}".format(model.name))
  annotated_model = tf.keras.models.clone_model(
      model,
      clone_function=add_quantize_annotation,
  )

  with quantize_scope({
      'SLCQuantizeConfig': SLCQuantizeConfig,
      'SLCWeightGenerator': SLCWeightGenerator,
      'SLCRegularizer': SLCRegularizer
  }):
    # Use `quantize_apply` to actually make the model kernelization aware.
    quant_aware_model = quantize_apply(annotated_model)

  # Report the compression ratio achieved by kernelization.
  original_size = 0
  compressed_size = 0
  for layer in quant_aware_model.layers:
    try:
      original_size += layer.original_size
      if layer.compressed:
        compressed_size += layer.compressed_size
      else:
        compressed_size += layer.original_size
    except AttributeError:
      pass

  if original_size > 0:
    ratio = compressed_size * 100.0 / original_size
    log.info("Model original size: {}, compressed size: {}, ratio: {:.2f}%"
             .format(original_size, compressed_size, ratio))
  else:
    log.info("Model original size is zero; compressed size: {}".format(
        compressed_size))

  return quant_aware_model
def to_streaming_inference(model_non_stream, flags, mode):
  """Convert non streaming trained model to inference modes.

  Args:
    model_non_stream: trained Keras model non streamable
    flags: settings with global data and model properties
    mode: it supports Non streaming inference, Streaming inference with
      internal states, Streaming inference with external states

  Returns:
    Keras inference model of inference_type
  """
  tf.keras.backend.set_learning_phase(0)
  input_data_shape = modes.get_input_data_shape(flags, mode)

  # get input data type and use it for input streaming type
  dtype = (model_non_stream.input[0].dtype if isinstance(
      model_non_stream.input, tuple) else model_non_stream.input.dtype)
  input_tensors = [
      tf.keras.layers.Input(
          shape=input_data_shape, batch_size=1, dtype=dtype,
          name='input_audio')
  ]
  quantize_stream_scope = quantize.quantize_scope()
  with quantize_stream_scope:
    model_inference = convert_to_inference_model(model_non_stream,
                                                 input_tensors, mode)
  return model_inference
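A hedged usage sketch, assuming `Modes` comes from the same `modes` module used above and `flags` carries the trained model's settings:

# Convert a trained non-streaming model to streaming inference with
# internal states; `model` and `flags` are assumed to exist.
stream_model = to_streaming_inference(
    model, flags, modes.Modes.STREAM_INTERNAL_STATE_INFERENCE)
stream_model.summary()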
def testSerialization_TF1SavedModel(self):
  if not compat.is_v1_apis():
    return

  model = test_utils.build_simple_dense_model()
  quantized_model = quantize.quantize_model(model)
  self._train_model(quantized_model)

  saved_model_dir = tempfile.mkdtemp()
  with quantize.quantize_scope():
    tf.keras.experimental.export_saved_model(quantized_model,
                                             saved_model_dir)

  with quantize.quantize_scope():
    loaded_model = tf.keras.experimental.load_from_saved_model(
        saved_model_dir)

  self._assert_outputs_equal(quantized_model, loaded_model)
def _test_equivalent_to_tflite(self, model, is_tflite_quantized=False):
  _, keras_file = tempfile.mkstemp('.h5')
  _, tflite_file = tempfile.mkstemp('.tflite')

  model.compile(
      loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  model.fit(
      np.random.uniform(0, 1, size=[self.batch_size, 10, 10, 3]),
      np.random.uniform(0, 10, size=[self.batch_size, 8, 8, 2]),
      epochs=1,
      callbacks=[])

  # Prepare for inference.
  inp = np.random.uniform(0, 1, size=[self.batch_size, 10, 10, 3])
  inp = inp.astype(np.float32)

  # TensorFlow inference.
  tf_out = model.predict(inp)

  if is_tflite_quantized:
    scale, zero_point = self._compute_quantization_params(model)

    # TFLite input needs to be quantized.
    inp = inp * 255
    inp = inp.astype(np.uint8)

  # TensorFlow Lite inference.
  tf.keras.models.save_model(model, keras_file)
  with quantize.quantize_scope():
    utils.convert_keras_to_tflite(
        keras_file,
        tflite_file,
        custom_objects={'_ConvBatchNorm2D': _ConvBatchNorm2D},
        is_quantized=is_tflite_quantized)

  interpreter = tf.lite.Interpreter(model_path=tflite_file)
  interpreter.allocate_tensors()
  input_index = interpreter.get_input_details()[0]['index']
  output_index = interpreter.get_output_details()[0]['index']

  interpreter.set_tensor(input_index, inp)
  interpreter.invoke()
  tflite_out = interpreter.get_tensor(output_index)

  if is_tflite_quantized:
    # dequantize outputs
    tflite_out = [scale * (x - zero_point) for x in tflite_out]

    # Off by 1 in quantized output. Notably we cannot reduce this. There is
    # an existing mismatch between TensorFlow and TFLite (from
    # contrib.quantize days).
    self.assertAllClose(tf_out, tflite_out, atol=scale)
  else:
    # Taken from testFoldFusedBatchNorms from
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference_test.py#L230
    self.assertAllClose(tf_out, tflite_out, rtol=1e-04, atol=1e-06)
def testSerialization_KerasModel(self):
  model = test_utils.build_simple_dense_model()
  quantized_model = quantize.quantize_model(model)
  self._train_model(quantized_model)

  _, model_file = tempfile.mkstemp('.h5')
  tf.keras.models.save_model(quantized_model, model_file)

  with quantize.quantize_scope():
    loaded_model = tf.keras.models.load_model(model_file)

  self._assert_models_equal(quantized_model, loaded_model)
def testProductionModelConversionToTFLite(self):
  # small input shape to keep test running quickly.
  model = tf.keras.applications.mobilenet.MobileNet(
      weights=None, input_shape=(32, 32, 3))

  annotated = quantize_annotate(model)
  quantized_model = quantize_apply(annotated)

  # Use a '.tflite' suffix for the converted model (the original '.h5'
  # suffix was a typo; the file holds TFLite output, not Keras HDF5).
  _, tflite_file = tempfile.mkstemp('.tflite')
  with quantize.quantize_scope():
    utils.convert_keras_to_tflite(quantized_model, tflite_file)
def testTransformsConvBNPattern(self):
  model = Conv2DModel.get_nonfolded_batchnorm_model(
      model_type='functional')
  folded_model = Conv2DModel.get_folded_batchnorm_model(
      is_quantized=True)

  with quantize.quantize_scope():
    transformed_model, _ = ModelTransformer(
        model,
        [default_8bit_transforms.Conv2DBatchNormFold()]).transform()

  inputs = np.random.standard_normal(Conv2DModel.get_batched_input_shape())
  self.assertAllClose(
      transformed_model.predict(inputs), folded_model.predict(inputs))
def apply_quantization(model):
  # Note: the assert message must be a plain string; passing `log.info(...)`
  # there would log unconditionally and supply None as the message.
  assert quantize_base.SET_CUSTOM_TNH_FLAG, (
      "TFMOD needs to be modified with quantizer disabled for proper "
      "running")

  # Helper function uses `quantize_annotate_layer` to annotate that only the
  # layer types listed in `quantization_map` (currently Conv2D) should be
  # quantized.
  def add_quantize_annotation(layer):
    # create new layer to break link with old model
    layer = layer.__class__.from_config(layer.get_config())
    quantization_map = [
        # tf.keras.layers.Dense,
        tf.keras.layers.Conv2D
        # tf.keras.layers.Input: BFPInputQuantizerConfig()
    ]
    for layer_type in quantization_map:
      if isinstance(layer, layer_type):
        quantize_config = SLGQuantizeConfig()
        log.info("**SLG annotation added to layer {} of type {} with {}"
                 .format(layer.name, layer_type, quantize_config))
        quantized_layer = quantize_annotate_layer(
            to_annotate=layer, quantize_config=quantize_config)
        return quantized_layer
    log.info("**SLG annotation not added to layer {} of type {}".format(
        layer.name, type(layer)))
    return layer

  # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
  # to the layers of the model.
  log.info("Annotating model {}".format(model.name))
  tf.keras.backend.clear_session()
  annotated_model = tf.keras.models.clone_model(
      model,
      clone_function=add_quantize_annotation,
  )

  with quantize_scope({
      'SLGWeightGenerator': SLGWeightGenerator,
      'SLGQuantizeConfig': SLGQuantizeConfig,
  }):
    # Use `quantize_apply` to actually make the model quantization aware.
    quant_aware_model = quantize_apply(annotated_model)

  return quant_aware_model
def testTransformsDepthwiseConvBNReLUPattern(self):
  model = DepthwiseConv2DModel.get_nonfolded_batchnorm_model(
      post_bn_activation=keras.layers.ReLU(6.0), model_type='functional')
  folded_model = DepthwiseConv2DModel.get_folded_batchnorm_model(
      post_bn_activation=keras.layers.ReLU(6.0), is_quantized=True)

  with quantize.quantize_scope():
    transformed_model = ModelTransformer(
        model,
        [tflite_transforms.DepthwiseConv2DBatchNormReLU6Fold()]).transform()

  inputs = np.random.standard_normal(
      DepthwiseConv2DModel.get_batched_input_shape())
  self.assertAllClose(
      transformed_model.predict(inputs), folded_model.predict(inputs))
def testCustomWeightQuantizers_Run(self, quantizer_type):
  init_params = self._get_quant_params(quantizer_type)

  # Additional test that the same quantizer object can be shared
  # between QuantizeConfigs, though we don't explicitly promote this
  # anywhere in the documentation.
  quantizer = quantizer_type(**init_params)

  class DenseQuantizeConfig(QuantizeConfig):
    """Custom QuantizeConfig for Dense layer."""

    def get_weights_and_quantizers(self, layer):
      return [(layer.kernel, quantizer)]

    def get_activations_and_quantizers(self, layer):
      # Defaults.
      return [(layer.activation,
               MovingAverageQuantizer(
                   num_bits=8,
                   per_axis=False,
                   symmetric=False,
                   narrow_range=False))]

    def set_quantize_weights(self, layer, quantize_weights):
      layer.kernel = quantize_weights[0]

    def set_quantize_activations(self, layer, quantize_activations):
      return

    def get_output_quantizers(self, layer):
      return []

    def get_config(self):
      return {}

  annotated_model = tf.keras.Sequential([
      quantize.quantize_annotate_layer(
          l.Dense(8, input_shape=(10,)), DenseQuantizeConfig()),
      quantize.quantize_annotate_layer(
          l.Dense(5), DenseQuantizeConfig())
  ])

  with quantize.quantize_scope(
      {'DenseQuantizeConfig': DenseQuantizeConfig}):
    quant_model = quantize.quantize_apply(annotated_model)

  # Check no error happens.
  self._train_model(quant_model)
def testTransformsConvBNPatternPreservesWeights(self):
  # random_init prevents deterministic initialization from producing the
  # same weights in the transformed and non-transformed models.
  model = Conv2DModel.get_nonfolded_batchnorm_model(
      model_type='functional', random_init=True)

  with quantize.quantize_scope():
    transformed_model = ModelTransformer(
        model,
        [tflite_transforms.Conv2DBatchNormFold()]).transform()

  transformed_weights = transformed_model.get_weights()
  # Remove quantization related weights.
  del transformed_weights[3:8]

  self.assertEqual(len(transformed_weights), len(model.get_weights()))
  for i in range(len(transformed_weights)):
    self.assertAllEqual(transformed_weights[i], model.get_weights()[i])
def testQuantizesMnist(self):
  if not compat.is_v1_apis():
    return

  model = test_utils_mnist.sequential_model()
  x_train, y_train, x_test, y_test = test_utils_mnist.preprocessed_data()

  model.compile(
      loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  model.fit(x_train, y_train, batch_size=500)
  _, model_accuracy = model.evaluate(x_test, y_test, verbose=0)

  quantized_model = quantize.quantize_model(model)
  quantized_model.compile(
      loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  quantized_model.fit(x_train, y_train, batch_size=500)
  _, quantized_model_accuracy = quantized_model.evaluate(
      x_test, y_test, verbose=0)

  self.assertGreater(quantized_model_accuracy, 0.6)

  _, quantized_tflite_file = tempfile.mkstemp('.tflite')
  with quantize.quantize_scope():
    test_utils.convert_keras_to_tflite(
        model=quantized_model,
        output_path=quantized_tflite_file,
        is_quantized=True)
  quantized_model_tflite_accuracy = test_utils_mnist.eval_tflite(
      quantized_tflite_file)

  # Ensure accuracy for the quantized TF and TFLite models is similar to the
  # original model. There is no clear way to measure quantization error, but
  # for MNIST, results that differ by a lot likely indicate an error in
  # quantization.
  self.assertAllClose(
      model_accuracy, quantized_model_accuracy, rtol=0.2, atol=0.2)
  self.assertAllClose(
      quantized_model_accuracy, quantized_model_tflite_accuracy,
      rtol=0.2, atol=0.2)
def apply_quantization(model):
  # Helper function uses `quantize_annotate_layer` to annotate that only the
  # Dense and Conv2D layers should be quantized.
  def add_quantize_annotation(layer):
    # create new layer to break link with old model
    layer = layer.__class__.from_config(layer.get_config())
    quantization_map = {
        tf.keras.layers.Dense: BFPQuantizeConfig(),
        tf.keras.layers.Conv2D: BFPQuantizeConfig()
    }
    for layer_type, quantize_config in quantization_map.items():
      if isinstance(layer, layer_type):
        print("**Quantization annotation added to layer {} of type {} with {}"
              .format(layer.name, layer_type, quantize_config))
        quantized_layer = quantize_annotate_layer(
            to_annotate=layer, quantize_config=quantize_config)
        return quantized_layer
    print("**Quantization annotation not added to layer {} of type {}".format(
        layer.name, type(layer)))
    return layer

  # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
  # to the layers of the model.
  print("Annotating model {}".format(model.name))
  annotated_model = tf.keras.models.clone_model(
      model,
      clone_function=add_quantize_annotation,
  )

  with quantize_scope({
      'BFPQuantizeConfig': BFPQuantizeConfig,
      'BFPActivQuantizer': BFPActivQuantizer,
      'BFPWeightQuantizer': BFPWeightQuantizer,
      'BFPBiasQuantizer': BFPBiasQuantizer,
      'PolynomialDecay': PolynomialDecay
  }):
    # Use `quantize_apply` to actually make the model quantization aware.
    quant_aware_model = quantize_apply(annotated_model)

  return quant_aware_model
def testSerialization(self):
  model = test_utils.build_simple_dense_model()
  quantized_model = quantize_apply(quantize_annotate(model))
  quantized_model.compile(
      loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  quantized_model.fit(
      np.random.rand(20, 10),
      tf.keras.utils.to_categorical(np.random.randint(5, size=(20, 1)), 5),
      batch_size=20)

  _, model_file = tempfile.mkstemp('.h5')
  keras.models.save_model(quantized_model, model_file)

  with quantize.quantize_scope():
    loaded_model = keras.models.load_model(model_file)

  self._assert_models_equal(quantized_model, loaded_model)
def testQuantizeSingleLayer_ProducesFullIntegerModel_TF1(
    self, layer_type, kwargs):
  if not compat.is_v1_apis():
    return

  if 'input_shape' not in kwargs:
    kwargs['input_shape'] = (5,)

  layer = layer_type(**kwargs)
  model = tf.keras.Sequential([layer])
  quantized_model = quantize.quantize_model(model)

  with quantize.quantize_scope():
    test_utils.convert_keras_to_tflite(
        model=quantized_model,
        output_path=None,
        is_quantized=True,
        inference_type=tf.uint8,
        inference_input_type=tf.uint8,
        input_quant_params=(0., 1.))
def testModelEndToEnd(self, model_type):
  # 1. Check whether quantized model graph can be constructed.
  model = self._get_model(model_type)
  model = quantize.quantize_model(model)

  # 2. Sanity check to ensure basic training on random data works.
  x_train, y_train = self._create_test_data(model)
  model.compile(
      loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  model.fit(x_train, y_train)

  # 3. Ensure conversion to TFLite works.
  _, tflite_file = tempfile.mkstemp('.tflite')
  print('TFLite File: ', tflite_file)
  with quantize.quantize_scope():
    utils.convert_keras_to_tflite(model, tflite_file)

  # 4. Verify input runs on converted model.
  self._verify_tflite(tflite_file, x_train, y_train)
def testQuantizeSingleLayer_ProducesFullIntegerModel_TF2(
    self, layer_type, kwargs):
  # "FullInteger" in the sense that it ignores inputs and outputs.
  if compat.is_v1_apis():
    return

  if 'input_shape' not in kwargs:
    kwargs['input_shape'] = (5,)

  layer = layer_type(**kwargs)
  model = tf.keras.Sequential([layer])
  quantized_model = quantize.quantize_model(model)

  _, quantized_tflite_file = tempfile.mkstemp('.tflite')
  with quantize.quantize_scope():
    test_utils.convert_keras_to_tflite(
        model=quantized_model,
        output_path=quantized_tflite_file,
        is_quantized=True,
        input_quant_params=(0., 1.),
        experimental_new_converter=True)

  interpreter = tf.lite.Interpreter(model_path=quantized_tflite_file)
  interpreter.allocate_tensors()

  input_tensor_details = interpreter.get_input_details()
  self.assertEqual(input_tensor_details[0]['dtype'], np.float32)

  output_tensor_details = interpreter.get_output_details()
  self.assertEqual(output_tensor_details[0]['dtype'], np.float32)

  tensor_details = interpreter.get_tensor_details()
  float_tensor_details = [
      t for t in tensor_details if t['dtype'] == np.float32
  ]
  # Only the input and outputs are float. The rest are integer.
  #
  # TODO(tfmot): update this test to use the full-integer path when
  # available, so that float_tensor_details should be length 0.
  self.assertLen(float_tensor_details, 2)
def testConv2DBatchNormReLUQuantize(self, layer_type):
  model = self._get_model(layer_type, True)
  input_shape = self._get_input_shape(layer_type)

  with quantize.quantize_scope():
    transformed_model, updated_metadata = ModelTransformer(
        model,
        [tflite_transforms.Conv2DBatchNormReLUQuantize()],
    ).transform()

  conv_layer = transformed_model.layers[1]
  bn_layer = transformed_model.layers[2]

  self.assertIsInstance(
      conv_layer.activation, quantize_aware_activation.NoOpActivation)
  self.assertIsInstance(
      updated_metadata.get(bn_layer.name).get('quantize_provider'),
      tflite_quantize_providers.NoOpQuantizeProvider)

  inputs = np.random.standard_normal(input_shape)
  self.assertAllClose(
      transformed_model.predict(inputs), model.predict(inputs))
def testQuantizeSingleLayer_ProducesFullIntegerModel_TF1(
    self, layer_type, kwargs):
  if not compat.is_v1_apis():
    return

  if 'input_shape' not in kwargs:
    kwargs['input_shape'] = (5,)

  layer = layer_type(**kwargs)
  model = tf.keras.Sequential([layer])
  quantized_model = quantize.quantize_model(model)

  with quantize.quantize_scope():
    test_utils.convert_keras_to_tflite(
        model=quantized_model,
        output_path=None,
        is_quantized=True,
        inference_type=tf.uint8,
        inference_input_type=tf.uint8,
        input_quant_params=(0., 1.),
        # Set to False to throw errors when FakeQuants are
        # not placed everywhere to create a full-integer model. Errors
        # are not thrown when set to True.
        experimental_new_converter=False)
def test_cnn_model_end_to_end(self):
  config = tf1.ConfigProto()
  config.gpu_options.allow_growth = True
  sess = tf1.Session(config=config)
  tf1.keras.backend.set_session(sess)
  test_utils.set_seed(123)

  # data parameters
  num_time_bins = 12
  feature_size = 12

  # model params.
  total_stride = 2
  params = test_utils.Params([total_stride], 0)
  params.model_name = 'cnn'
  params.cnn_filters = '2'
  params.cnn_kernel_size = '(3,3)'
  params.cnn_act = "'relu'"
  params.cnn_dilation_rate = '(1,1)'
  params.cnn_strides = '(2,2)'
  params.dropout1 = 0.5
  params.units2 = ''
  params.act2 = ''

  params.label_count = 2
  params.return_softmax = True
  params.quantize = 1  # apply quantization aware training

  params.data_shape = (num_time_bins, feature_size)
  params.preprocess = 'custom'

  model = cnn.model(params)
  model.summary()

  # prepare training and testing data
  train_images, train_labels = test_utils.generate_data(
      img_size_y=num_time_bins, img_size_x=feature_size, n_samples=32)
  test_images = train_images
  test_labels = train_labels

  # create and train quantization aware model in non streaming mode
  model.compile(
      optimizer='adam',
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=['accuracy'])
  model.fit(
      train_images,
      train_labels,
      epochs=1,
      validation_data=(test_images, test_labels))
  model.summary()

  # one test image
  train_image = train_images[:1,]

  # run tf non streaming inference
  non_stream_output_tf = model.predict(train_image)

  # specify input data shape for streaming mode
  params.data_shape = (total_stride, feature_size)
  # TODO(rybakov) add params structure for model with no feature extractor

  # prepare tf streaming model and use it to generate representative_dataset
  with quantize.quantize_scope():
    stream_quantized_model = utils.to_streaming_inference(
        model, params, Modes.STREAM_EXTERNAL_STATE_INFERENCE)

  calibration_data = prepare_calibration_data(
      stream_quantized_model, total_stride, train_image)

  def representative_dataset(dtype):
    def _representative_dataset_gen():
      for i in range(len(calibration_data)):
        yield [
            calibration_data[i][0].astype(dtype),  # input audio packet
            calibration_data[i][1].astype(dtype),  # conv state
            calibration_data[i][2].astype(dtype)  # flatten state
        ]

    return _representative_dataset_gen

  # convert streaming quantization aware model to tflite
  # and apply post training quantization
  with quantize.quantize_scope():
    tflite_streaming_model = utils.model_to_tflite(
        sess,
        model,
        params,
        Modes.STREAM_EXTERNAL_STATE_INFERENCE,
        optimizations=[tf.lite.Optimize.DEFAULT],
        inference_type=tf.int8,
        experimental_new_quantizer=True,
        representative_dataset=representative_dataset(np.float32))

  # run tflite in streaming mode and compare output logits with tf
  interpreter = tf.lite.Interpreter(model_content=tflite_streaming_model)
  interpreter.allocate_tensors()
  input_states = []
  for detail in interpreter.get_input_details():
    input_states.append(np.zeros(detail['shape'], dtype=np.float32))

  stream_out_tflite = inference.run_stream_inference_classification_tflite(
      params, interpreter, train_image, input_states)

  self.assertAllClose(stream_out_tflite, non_stream_output_tf, atol=0.001)
def _test_equal_tf_and_tflite_outputs(self,
                                      tf_model,
                                      is_tflite_quantized=False):
  _, tflite_file = tempfile.mkstemp('.tflite')

  batched_input_shape = self._get_batched_input_shape()
  output_shape = self._get_output_shape()

  tf_model.compile(
      loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
  tf_model.fit(
      np.random.uniform(0, 1, size=batched_input_shape),
      np.random.uniform(0, 10, size=output_shape),
      epochs=1,
      callbacks=[])

  # Prepare for inference.
  inp = np.random.uniform(0, 1, size=batched_input_shape)
  inp = inp.astype(np.float32)

  if is_tflite_quantized:
    real_min = keras.backend.eval(tf_model.layers[-1]._activation_min_var)
    real_max = keras.backend.eval(tf_model.layers[-1]._activation_max_var)
    scale, zero_point = self._get_asymmetric_quant_params(
        real_min, real_max, -128.0, 127.0)

    # TFLite input needs to be quantized.
    real_input_min = 0.0
    real_input_max = 1.0
    inp_scale, inp_zp = self._get_asymmetric_quant_params(
        real_input_min, real_input_max, -128.0, 127.0)
    inp8 = np.round(inp / inp_scale + inp_zp)
    inp8 = inp8.astype(np.int8)

    # Dequant
    inp = (inp8.astype(np.float32) - inp_zp) * inp_scale

  # TensorFlow inference.
  tf_out = tf_model.predict(inp)

  # TensorFlow Lite inference.
  with quantize.quantize_scope():
    utils.convert_keras_to_tflite(
        tf_model,
        tflite_file,
        custom_objects={
            '_ConvBatchNorm2D': _ConvBatchNorm2D,
            '_DepthwiseConvBatchNorm2D': _DepthwiseConvBatchNorm2D,
        },
        is_quantized=is_tflite_quantized,
        inference_input_type=tf.lite.constants.INT8)

  interpreter = tf.lite.Interpreter(model_path=tflite_file)
  interpreter.allocate_tensors()
  input_index = interpreter.get_input_details()[0]['index']
  output_index = interpreter.get_output_details()[0]['index']

  if is_tflite_quantized:
    interpreter.set_tensor(input_index, inp8)
  else:
    interpreter.set_tensor(input_index, inp)
  interpreter.invoke()
  tflite_out = interpreter.get_tensor(output_index)

  if is_tflite_quantized:
    # dequantize outputs
    tflite_out = [scale * (x - zero_point) for x in tflite_out]

    # TODO(pulkitb): DConv quantized test somehow has a single value (0.065%
    # of total values) which falls off by 1 scale. Investigate further and
    # introduce stricter testing by removing atol=scale.
    self.assertAllClose(tf_out, tflite_out, atol=scale)
  else:
    # Taken from testFoldFusedBatchNorms from
    # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference_test.py#L230
    self.assertAllClose(tf_out, tflite_out, rtol=1e-04, atol=1e-06)
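The `_get_asymmetric_quant_params` helper is referenced but not shown. A minimal sketch of the standard asymmetric quantization parameter computation it presumably performs; the real helper's nudging logic may differ:

def _get_asymmetric_quant_params(self, real_min, real_max, qmin, qmax):
  # Zero must be exactly representable so zero-padding stays lossless.
  real_min = min(real_min, 0.0)
  real_max = max(real_max, 0.0)
  scale = (real_max - real_min) / (qmax - qmin)
  # Round the zero point and clamp it into the quantized range.
  zero_point = int(round(qmin - real_min / scale))
  zero_point = max(int(qmin), min(int(qmax), zero_point))
  return scale, zero_point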
def apply_quantization(model,
                       pruning_policy=None,
                       weight_precision=None,
                       activation_precision=None,
                       activation_margin=None):
  # assert quantize_base.SET_CUSTOM_TNH_FLAG, (
  #     "TFMOD needs to be modified with quantizer disabled for proper "
  #     "running")
  if weight_precision is not None:
    global _WEIGHTS_NUM_BITS  # need to declare when you want to change the value
    _WEIGHTS_NUM_BITS = weight_precision
  if activation_precision is not None:
    global _ACTIV_NUM_BITS
    _ACTIV_NUM_BITS = activation_precision
  if activation_margin is not None:
    global _ACTIV_MARGIN
    _ACTIV_MARGIN = activation_margin
  log.info("Weights num bits: {} - Activ num bits: {} - Activ margin: {}"
           .format(_WEIGHTS_NUM_BITS, _ACTIV_NUM_BITS, _ACTIV_MARGIN))

  # Helper function uses `quantize_annotate_layer` to annotate that only the
  # layer types listed in the module-level `quantization_map` should be
  # quantized.
  def add_quantize_annotation(layer):
    # create new layer to break link with old model
    try:
      layer = layer.__class__.from_config(layer.get_config())
    except Exception:
      pass
    for layer_type in quantization_map:
      if isinstance(layer, layer_type):
        if isinstance(pruning_policy, float) or pruning_policy is None:
          layer_pruning = pruning_policy
        elif isinstance(pruning_policy, dict):
          layer_pruning = pruning_policy[layer.name]
        else:
          raise ValueError(
              "Illegal layer pruning policy {}".format(pruning_policy))
        quantize_config = BFPQuantizeConfig(pruning_policy=layer_pruning)
        log.info("**Quantization annotation added to layer {} of type {} "
                 "with {}".format(layer.name, layer_type, quantize_config))
        quantized_layer = quantize_annotate_layer(
            to_annotate=layer, quantize_config=quantize_config)
        return quantized_layer
    log.info("**Quantization annotation not added to layer {} of type {}"
             .format(layer.name, type(layer)))
    return layer

  # Use `tf.keras.models.clone_model` to apply `add_quantize_annotation`
  # to the layers of the model.
  log.info("Annotating model {}".format(model.name))
  tf.keras.backend.clear_session()
  annotated_model = tf.keras.models.clone_model(
      model,
      clone_function=add_quantize_annotation,
  )

  with quantize_scope({
      'BFPQuantizeConfig': BFPQuantizeConfig,
      'BFPActivQuantizer': BFPActivQuantizer,
      'BFPWeightQuantizer': BFPWeightQuantizer,
      'BFPBiasQuantizer': BFPBiasQuantizer,
      'PolynomialDecay': PolynomialDecay
  }):
    # Use `quantize_apply` to actually make the model quantization aware.
    quant_aware_model = quantize_apply(annotated_model)

  # Copy the trained kernel (and bias, when present) from the original model
  # into the corresponding quantize-wrapped layers, since annotation cloned
  # the layers and reset their weights.
  for q_layer in quant_aware_model.layers:
    if isinstance(q_layer, QuantizeWrapper):
      for quant_type in quantization_map:
        if isinstance(q_layer.layer, quant_type):
          original_name = q_layer.name.replace("quant_", "")
          old_layer = model.get_layer(original_name)
          q_weights = q_layer.get_weights()
          orig_weights = old_layer.get_weights()
          q_weights[0] = orig_weights[0]  # kernel
          try:
            q_weights[1] = orig_weights[1]  # bias, if the layer has one
          except IndexError:
            pass
          q_layer.set_weights(q_weights)

  return quant_aware_model
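A hedged usage sketch of the per-layer pruning path; the layer names in the dict are hypothetical and must match layers in `model`, and the BFP classes are assumed to be defined elsewhere:

# Hypothetical usage: 6-bit weights plus per-layer pruning fractions.
quant_model = apply_quantization(
    model,
    pruning_policy={'conv2d': 0.5, 'conv2d_1': 0.75},
    weight_precision=6)
quant_model.compile(loss='categorical_crossentropy', optimizer='sgd',
                    metrics=['accuracy'])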
f.write(str(graph_def))

model.compile(
    loss=tf.keras.losses.categorical_crossentropy,
    optimizer=tf.keras.optimizers.Adadelta(),
    metrics=['accuracy'])
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    verbose=1,
    validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

# Export to Keras.
keras_file = '/tmp/quantized_mnist.h5'
tf.keras.models.save_model(model, keras_file)

# Convert to TFLite model.
with quantize.quantize_scope():
  converter = tf.lite.TFLiteConverter.from_keras_model_file(keras_file)
  converter.inference_type = tf.lite.constants.QUANTIZED_UINT8
  input_arrays = converter.get_input_arrays()
  converter.quantized_input_stats = {input_arrays[0]: (0., 255.)}  # mean, std_dev
  tflite_model = converter.convert()
  open('/tmp/quantized_mnist.tflite', 'wb').write(tflite_model)
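A hedged sketch of running the converted model. With `quantized_input_stats` of `(0., 255.)` (mean, std_dev), a uint8 input `q` represents the float value `q / 255`, so float images in [0, 1] become raw 0..255 pixels; `x_test` is assumed to hold float images scaled to [0, 1]:

# Hypothetical inference on the quantized TFLite model.
interpreter = tf.lite.Interpreter(model_path='/tmp/quantized_mnist.tflite')
interpreter.allocate_tensors()
input_index = interpreter.get_input_details()[0]['index']
output_index = interpreter.get_output_details()[0]['index']

sample = (x_test[:1] * 255).astype(np.uint8)  # [0, 1] floats -> uint8 pixels
interpreter.set_tensor(input_index, sample)
interpreter.invoke()
print('Predicted class:', np.argmax(interpreter.get_tensor(output_index)))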