def quantize_to_int(self, model_name, model_data_url, model_data_md5,
                    weight_bits, quantizable_op_type, weight_quantize_type,
                    generate_test_model, threshold_rate):
    # Download the fp32 model and locate it on disk.
    model_dir = self.download_model(model_name, model_data_url,
                                    model_data_md5)
    load_model_dir = os.path.join(model_dir, model_name)

    timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    save_model_dir = os.path.join(
        os.getcwd(), model_name + "_wq_" + str(weight_bits) + "_" + timestamp)

    # Quantize the weights to int8/16 and save the quantized model.
    weight_quant = WeightQuantization(model_dir=load_model_dir)
    weight_quant.quantize_weight_to_int(
        save_model_dir=save_model_dir,
        weight_bits=weight_bits,
        quantizable_op_type=quantizable_op_type,
        weight_quantize_type=weight_quantize_type,
        generate_test_model=generate_test_model,
        threshold_rate=threshold_rate)
    print("finish weight quantization for " + model_name + "\n")

    # Clean up the quantized model directory.
    try:
        os.system("rm -rf {}".format(save_model_dir))
    except Exception as e:
        print("Failed to delete {} due to {}".format(save_model_dir, str(e)))
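# A minimal sketch of driving WeightQuantization directly, outside the test
# harness above. The model path, the helper name, and the specific
# weight_quantize_type / threshold_rate values are assumptions for
# illustration; the keyword arguments mirror the call in quantize_to_int.
def _example_weight_quantize_to_int():
    # Assumes an fp32 inference model has been saved under "./mobilenet_v1".
    weight_quant = WeightQuantization(model_dir="./mobilenet_v1")
    weight_quant.quantize_weight_to_int(
        save_model_dir="./mobilenet_v1_wq_8",
        weight_bits=8,
        quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
        weight_quantize_type="channel_wise_abs_max",
        generate_test_model=False,
        threshold_rate=0.0)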
def quant_post_only_weight(model_dir,
                           save_model_dir,
                           model_filename=None,
                           params_filename=None,
                           save_model_filename=None,
                           save_params_filename=None,
                           quantizable_op_type=["conv2d", "mul"],
                           weight_bits=8,
                           generate_test_model=False):
    '''
    In order to reduce the size of the model, this API quantizes the weights
    of some ops from float32 to int8/16. In the inference stage, the quantized
    weights will be dequantized back to float32.

    Args:
        model_dir(str): The path of the fp32 model that will be quantized,
                    and the model and params files are under the path.
        save_model_dir(str): The path to save the quantized model.
        model_filename(str, optional): The name of the file used to load the
                    inference program. If it is None, the default filename
                    '__model__' will be used. Default is None.
        params_filename(str, optional): The name of the file used to load all
                    parameters. When all parameters were saved in a single
                    binary file, set it as the real filename. If parameters
                    were saved in separate files, set it as None. Default is
                    None.
        save_model_filename(str, optional): The name of the file to save the
                    inference program. If it is None, the default filename
                    '__model__' will be used. Default is None.
        save_params_filename(str, optional): The name of the file to save all
                    parameters. If it is None, parameters are saved in
                    separate files; otherwise, all parameters are saved in a
                    single binary file.
        quantizable_op_type(list[str], optional): The list of ops that will be
                    quantized, and the quantized ops should be contained in
                    ["conv2d", "depthwise_conv2d", "mul"]. Default is
                    ["conv2d", "mul"].
        weight_bits(int, optional): The bits for the quantized weights, and it
                    should be 8 or 16. Default is 8.
        generate_test_model(bool, optional): If generate_test_model is True,
                    it also saves a fake quantized model, in which the weights
                    are quantized and dequantized. We can use PaddlePaddle to
                    load the fake quantized model and test the accuracy on GPU
                    or CPU.
    '''

    weight_quant = WeightQuantization(
        model_dir=model_dir,
        model_filename=model_filename,
        params_filename=params_filename)

    weight_quant.quantize_weight_to_int(
        save_model_dir=save_model_dir,
        save_model_filename=save_model_filename,
        save_params_filename=save_params_filename,
        quantizable_op_type=quantizable_op_type,
        weight_bits=weight_bits,
        generate_test_model=generate_test_model)
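# A minimal usage sketch for quant_post_only_weight. The paths and the helper
# name are hypothetical; leaving model_filename/params_filename as None keeps
# the '__model__' / separate-parameter-files conventions from the docstring.
def _example_quant_post_only_weight():
    quant_post_only_weight(
        model_dir="./mobilenet_v1",
        save_model_dir="./mobilenet_v1_int8",
        quantizable_op_type=["conv2d", "mul"],
        weight_bits=8,
        generate_test_model=False)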
def convert_to_fp16(self, model_name, model_data_url, model_data_md5,
                    model_filename, params_filename):
    # Download the fp32 model and locate it on disk.
    model_dir = self.download_model(model_name, model_data_url,
                                    model_data_md5)
    load_model_dir = os.path.join(model_dir, model_name)

    timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime())
    save_model_dir = os.path.join(os.getcwd(),
                                  model_name + "_wq_fp16_" + timestamp)

    # Convert the weights from float32 to float16 and save the model.
    weight_quant = WeightQuantization(load_model_dir, model_filename,
                                      params_filename)
    weight_quant.convert_weight_to_fp16(save_model_dir)
    print("finish converting the data type of weights to fp16 for " +
          model_name)
    print("fp16 model saved in " + save_model_dir + "\n")

    # Run both models on the same input and check that the outputs match.
    input_data = np.ones([1, 3, 224, 224], dtype=np.float32)
    res_fp32 = self.run_models(load_model_dir, model_filename,
                               params_filename, input_data, False)
    res_fp16 = self.run_models(save_model_dir, model_filename,
                               params_filename, input_data, True)

    self.assertTrue(
        np.allclose(
            res_fp32, res_fp16, rtol=1e-5, atol=1e-08, equal_nan=True),
        msg='Failed to test the accuracy of the fp32 and fp16 model.')

    # Clean up the converted model directory.
    try:
        os.system("rm -rf {}".format(save_model_dir))
    except Exception as e:
        print("Failed to delete {} due to {}".format(save_model_dir, str(e)))
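# A minimal sketch of the fp16 conversion path exercised by the test above:
# load an fp32 inference model, rewrite its weights as float16, and save the
# result. The paths, filenames, and the helper name are hypothetical; the API
# calls mirror those in convert_to_fp16.
def _example_convert_weight_to_fp16():
    weight_quant = WeightQuantization("./mobilenet_v1", "model", "params")
    weight_quant.convert_weight_to_fp16("./mobilenet_v1_fp16")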
def quant_post_dynamic(model_dir,
                       save_model_dir,
                       model_filename=None,
                       params_filename=None,
                       save_model_filename=None,
                       save_params_filename=None,
                       quantizable_op_type=["conv2d", "mul"],
                       weight_bits=8,
                       generate_test_model=False):
    '''
    This function uses the dynamic post training quantization method to
    quantize the fp32 model. In detail, it quantizes the weights of some ops
    from float32 to int8/16. For the quantized model, there are two kinds of
    calculation methods in the inference stage: first, the quantized weights
    are dequantized back to float32 and the computation is applied in float32;
    second, the quantized scales of the inputs are collected at runtime and
    the computation is applied in int8.

    Args:
        model_dir(str): The path of the fp32 model that will be quantized,
                    and the model and params files are under the path.
        save_model_dir(str): The path to save the quantized model.
        model_filename(str, optional): The name of the file used to load the
                    inference program. If it is None, the default filename
                    '__model__' will be used. Default is None.
        params_filename(str, optional): The name of the file used to load all
                    parameters. When all parameters were saved in a single
                    binary file, set it as the real filename. If parameters
                    were saved in separate files, set it as None. Default is
                    None.
        save_model_filename(str, optional): The name of the file to save the
                    inference program. If it is None, the default filename
                    '__model__' will be used. Default is None.
        save_params_filename(str, optional): The name of the file to save all
                    parameters. If it is None, parameters are saved in
                    separate files; otherwise, all parameters are saved in a
                    single binary file.
        quantizable_op_type(list[str], optional): The list of ops that will be
                    quantized, and the quantized ops should be contained in
                    ["conv2d", "depthwise_conv2d", "mul"]. Default is
                    ["conv2d", "mul"].
        weight_bits(int, optional): The bits for the quantized weights, and it
                    should be 8 or 16. Default is 8.
        generate_test_model(bool, optional): If generate_test_model is True,
                    it also saves a fake quantized model, in which the weights
                    are quantized and dequantized. We can use PaddlePaddle to
                    load the fake quantized model and test the accuracy on GPU
                    or CPU.
    '''

    weight_quant = WeightQuantization(
        model_dir=model_dir,
        model_filename=model_filename,
        params_filename=params_filename)

    weight_quant.quantize_weight_to_int(
        save_model_dir=save_model_dir,
        save_model_filename=save_model_filename,
        save_params_filename=save_params_filename,
        quantizable_op_type=quantizable_op_type,
        weight_bits=weight_bits,
        generate_test_model=generate_test_model)
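# A minimal usage sketch for quant_post_dynamic. The paths and the helper name
# are hypothetical; weight_bits=16 and generate_test_model=True are chosen only
# to illustrate the 16-bit path and the fake quantized test model described in
# the docstring.
def _example_quant_post_dynamic():
    quant_post_dynamic(
        model_dir="./mobilenet_v1",
        save_model_dir="./mobilenet_v1_int16",
        quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
        weight_bits=16,
        generate_test_model=True)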