def _remove_duplicate_quantize_ops(model: ModelProto):
    quantize_ops_by_input = defaultdict(list)
    for node in model.graph.node:
        if node.op_type == "QuantizeLinear":
            quantize_ops_by_input[node.input[0]].append(node)

    for quantize_op_group in quantize_ops_by_input.values():
        if len(quantize_op_group) == 1:
            continue
        keep_node = quantize_op_group[0]
        remove_nodes = quantize_op_group[1:]
        for remove_node in remove_nodes:
            _replace_input_id_model(
                model, remove_node.output[0], keep_node.output[0]
            )
            remove_node_and_params_from_graph(model, remove_node)


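# Note on the math in _fold_conv_bn_bias below: BatchNormalization computes
# scale * (x - mean) / sqrt(var + epsilon) + bias. The multiplicative term
# scale / sqrt(var + epsilon) is expected to already be folded into the conv
# weights (see _fold_qat_conv_bns), so the only constant left to absorb into
# the conv bias is
#     bias - mean * scale / sqrt(var + epsilon)
# which is exactly the `folded_bias` computed below.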
def _fold_conv_bn_bias(model: ModelProto, conv_node: NodeProto, bn_node: NodeProto):
    # fold bias into conv from bn then delete bn node
    bn_params = get_batch_norm_params(model, bn_node)
    variance_term = 1 / numpy.sqrt(bn_params.var + bn_params.epsilon)
    folded_bias = (
        -1.0 * bn_params.mean * variance_term * bn_params.scale + bn_params.bias
    )
    folded_bias = folded_bias.astype(numpy.float32)

    bias_name = conv_node.name + ".bias"
    conv_node.input.append(bias_name)
    update_model_param(model, bias_name, folded_bias)

    # forward conv output to bn children
    swap_node_output(conv_node, bn_node.output[0])
    # remove bn from graph
    remove_node_and_params_from_graph(model, bn_node)


def delete_quant_node(model: ModelProto, node: NodeProto, keep_params: bool = False):
    """
    Deletes a QuantizeLinear or DequantizeLinear node and, optionally, its
    parameters from the model

    :param model: ONNX model to modify
    :param node: the QuantizeLinear or DequantizeLinear node to delete
    :param keep_params: set True to keep the scale and zero point initializers in
        the graph; only the node's references to them are removed
    """
    assert (
        node.op_type in _QUANTIZE_OP_NAMES
    ), "Op Type must be either QuantizeLinear or DequantizeLinear, found {}".format(
        node.op_type
    )
    if keep_params:
        del node.input[2]  # delete reference to zero point
        del node.input[1]  # delete reference to scale
    remove_node_and_params_from_graph(model, node)


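# Note on the fold below: assuming unsigned (uint8) activation quantization, a
# QuantizeLinear with a zero point of 0 already saturates negative inputs to 0,
# so QuantizeLinear(Relu(x)) == QuantizeLinear(x) and the Relu is redundant.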
def _fold_relu_quants(model: ModelProto):
    # delete relu nodes that feed directly into quantize nodes with a zero point of 0
    for relu_node in model.graph.node:
        if relu_node.op_type != "Relu":
            continue
        relu_children = get_node_output_nodes(model, relu_node)
        if not relu_children or any(
            node.op_type != "QuantizeLinear" for node in relu_children
        ):
            # skip if any child is not a quantize node
            continue
        quantize_params = [
            get_quantization_params(model, quant_node) for quant_node in relu_children
        ]
        if any(params.zero_point != 0 for params in quantize_params):
            # skip if activation zero point does not match relu threshold of 0
            continue
        # set all child input nodes to the relu node input
        for quant_node in relu_children:
            quant_node.input[0] = relu_node.input[0]
        # delete relu node
        remove_node_and_params_from_graph(model, relu_node)


def _fold_qat_conv_bns(model: ModelProto):
    # conv weight should already be folded in quantize linear
    # remove the Div that undoes the weight folding
    # fold bn into conv bias and remove bn node
    # (Conv -> Div -> BN) -> Conv
    for conv_node in model.graph.node:
        if conv_node.op_type != "Conv" or len(conv_node.input) > 2:
            # not a conv node, or conv node already has a bias
            continue
        div_node = _get_single_node_child(model, conv_node)
        if not div_node or div_node.op_type != "Div":
            continue
        bn_node = _get_single_node_child(model, div_node)
        if not bn_node or bn_node.op_type != "BatchNormalization":
            continue
        # forward conv output to div children
        swap_node_output(conv_node, div_node.output[0])
        # remove div from graph
        remove_node_and_params_from_graph(model, div_node)
        # fold bn into conv bias and remove bn
        _fold_conv_bn_bias(model, conv_node, bn_node)


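# The conversions below rely on the module-level helper _quantize_array, which is
# assumed (a sketch of its semantics, not the canonical implementation) to perform
# standard affine quantization, roughly
#     clip(round(array / scale) + zero_point, iinfo(dtype).min, iinfo(dtype).max)
# cast to the target integer dtype.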
def _convert_quantizable_gemm(
    model: ModelProto,
    gemm_node: NodeProto,
    input_quantize_node: NodeProto,
    weight_dequantize_node: NodeProto,
    weight_quantize_node: NodeProto,
    output_quantize_node: NodeProto,
):
    # Gemm -> (QLinearMatMul -> Add(bias))
    weight_quantize_params = get_quantization_params(
        model, weight_quantize_node, include_target=True
    )
    if weight_quantize_params.target is None:
        # weight initializer not included
        return

    gemm_attributes = get_node_attributes(gemm_node)
    if any(float(attribute) != 1.0 for attribute in gemm_attributes.values()):
        # can only handle Gemm operations without alpha/beta/transB set
        return

    # can fold the input/output quant ops if they are trivial
    fold_input_quant = input_quantize_node.op_type == "DequantizeLinear"
    fold_output_quant = output_quantize_node.op_type == "QuantizeLinear"

    # quantize weight
    quantized_weight = _quantize_array(
        weight_quantize_params.target,
        weight_quantize_params.scale,
        weight_quantize_params.zero_point,
    )
    quantized_weight = quantized_weight.transpose()  # Gemm has implicit transpose
    quantized_weight_name = "{}.weight_quantized".format(gemm_node.name)
    quantized_weight_initializer = numpy_helper.from_array(
        quantized_weight, name=quantized_weight_name
    )
    model.graph.initializer.append(quantized_weight_initializer)

    # get qmatmul inputs and outputs
    qmatmul_input = (
        input_quantize_node.input[0] if fold_input_quant else gemm_node.input[0]
    )
    qmatmul_inputs = [
        qmatmul_input,  # x
        input_quantize_node.input[1],  # x_scale
        input_quantize_node.input[2],  # x_zero_point
        quantized_weight_name,  # w
        weight_quantize_node.input[1],  # w_scale
        weight_quantize_node.input[2],  # w_zero_point
        output_quantize_node.input[1],  # y_scale
        output_quantize_node.input[2],  # y_zero_point
    ]
    qmatmul_output = (
        output_quantize_node.output[0] if fold_output_quant else gemm_node.output[0]
    )
    qmatmul_name = "{}_quant".format(gemm_node.name)

    # create qmatmul node and add it to graph
    qmatmul_node = onnx.helper.make_node(
        "QLinearMatMul",
        qmatmul_inputs,
        [qmatmul_output],
        qmatmul_name,
    )
    model.graph.node.append(qmatmul_node)

    # delete folded quantization ops
    delete_quant_node(model, weight_dequantize_node, keep_params=False)
    delete_quant_node(model, weight_quantize_node, keep_params=True)
    if (
        fold_input_quant
        and len(get_node_output_nodes(model, input_quantize_node)) <= 1
    ):
        # fold if this gemm is the only node that reads from this quant op
        delete_quant_node(model, input_quantize_node, keep_params=True)
    if fold_output_quant:
        delete_quant_node(model, output_quantize_node, keep_params=True)

    if len(gemm_node.input) > 2:
        # add bias term following FC in the graph
        qmatmul_child_node = get_node_output_nodes(model, qmatmul_node)
        assert (
            qmatmul_child_node
        ), "QLinearMatMul node must have an output in the graph"
        dequant_output_name = "{}_dequantized".format(qmatmul_name)
        if qmatmul_child_node[0].op_type == "DequantizeLinear":
            qmatmul_dequantize_node = qmatmul_child_node[0]
            # create hidden output layer for bias add
            add_output_name = qmatmul_dequantize_node.output[0]
            swap_node_output(qmatmul_dequantize_node, dequant_output_name)
        else:
            # inject dequantize op for matmul
            qmatmul_output_name = "{}_output".format(qmatmul_name)
            swap_node_output(qmatmul_node, qmatmul_output_name)
            qmatmul_dequantize_node = onnx.helper.make_node(
                "DequantizeLinear",
                [
                    qmatmul_output_name,  # input
                    output_quantize_node.input[1],  # scale
                    output_quantize_node.input[2],  # zero point
                ],
                [dequant_output_name],
                "{}_dequantize".format(qmatmul_name),
            )
            model.graph.node.append(qmatmul_dequantize_node)
            add_output_name = qmatmul_output  # original qmatmul output name

        # inject bias op for dequantized matmul output
        qmatmul_bias_add_node = onnx.helper.make_node(
            "Add",
            [
                qmatmul_dequantize_node.output[0],  # add input
                gemm_node.input[2],  # Gemm bias
            ],
            [add_output_name],
            "{}_bias_add".format(gemm_node.name),
        )
        model.graph.node.append(qmatmul_bias_add_node)

    # delete original Gemm node
    params_to_keep = [gemm_node.input[2]] if len(gemm_node.input) > 2 else []
    remove_node_and_params_from_graph(model, gemm_node, keep_params=params_to_keep)


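# Unlike QLinearMatMul above, which has no bias input (hence the injected
# DequantizeLinear + Add in the Gemm conversion), QLinearConv accepts an optional
# int32 bias quantized with scale = x_scale * w_scale and a zero point of 0, so
# the conversion below can pass the quantized bias to the op directly.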
def _convert_quantizable_conv(
    model: ModelProto,
    conv_node: NodeProto,
    input_quantize_node: NodeProto,
    weight_dequantize_node: NodeProto,
    weight_quantize_node: NodeProto,
    output_quantize_node: NodeProto,
):
    weight_quantize_params = get_quantization_params(
        model, weight_quantize_node, include_target=True
    )
    if weight_quantize_params.target is None:
        # weight initializer not included
        return

    # can fold the input/output quant ops if they are trivial
    fold_input_quant = input_quantize_node.op_type == "DequantizeLinear"
    fold_output_quant = output_quantize_node.op_type == "QuantizeLinear"

    # quantize weight
    quantized_weight = _quantize_array(
        weight_quantize_params.target,
        weight_quantize_params.scale,
        weight_quantize_params.zero_point,
        weight_quantize_params.zero_point.dtype,
    )
    quantized_weight_name = "{}.weight_quantized".format(conv_node.name)
    quantized_weight_initializer = numpy_helper.from_array(
        quantized_weight, name=quantized_weight_name
    )
    model.graph.initializer.append(quantized_weight_initializer)

    # get qconv inputs and outputs
    qconv_input = (
        input_quantize_node.input[0] if fold_input_quant else conv_node.input[0]
    )
    qconv_inputs = [
        qconv_input,  # x
        input_quantize_node.input[1],  # x_scale
        input_quantize_node.input[2],  # x_zero_point
        quantized_weight_name,  # w
        weight_quantize_node.input[1],  # w_scale
        weight_quantize_node.input[2],  # w_zero_point
        output_quantize_node.input[1],  # y_scale
        output_quantize_node.input[2],  # y_zero_point
    ]
    if len(conv_node.input) > 2:
        bias = get_init_by_name(model, conv_node.input[2])
        if bias is not None:
            # quantize bias and add it to the qconv inputs
            bias = numpy_helper.to_array(bias)
            input_quantize_params = get_quantization_params(
                model, input_quantize_node, include_target=False
            )
            bias_scale = input_quantize_params.scale * weight_quantize_params.scale
            quantized_bias = _quantize_array(bias, bias_scale, 0, numpy.int32)
            quantized_bias_name = "{}.bias_quantized".format(conv_node.name)
            quantized_bias_initializer = numpy_helper.from_array(
                quantized_bias, name=quantized_bias_name
            )
            model.graph.initializer.append(quantized_bias_initializer)
            qconv_inputs.append(quantized_bias_name)

    qconv_output = (
        output_quantize_node.output[0] if fold_output_quant else conv_node.output[0]
    )
    qconv_name = "{}_quant".format(conv_node.name)
    qconv_kwargs = {}
    for attribute in conv_node.attribute:
        qconv_kwargs.update(_attribute_to_kwarg(attribute))

    # create qconv node and add it to graph
    qconv_node = onnx.helper.make_node(
        "QLinearConv", qconv_inputs, [qconv_output], qconv_name, **qconv_kwargs
    )
    model.graph.node.append(qconv_node)

    # delete original conv and folded quantization ops
    remove_node_and_params_from_graph(model, conv_node)
    delete_quant_node(model, weight_dequantize_node, keep_params=False)
    delete_quant_node(model, weight_quantize_node, keep_params=True)
    if (
        fold_input_quant
        and len(get_node_output_nodes(model, input_quantize_node)) <= 1
    ):
        # fold if this conv is the only node that reads from this quant op
        delete_quant_node(model, input_quantize_node, keep_params=True)
    if fold_output_quant:
        delete_quant_node(model, output_quantize_node, keep_params=True)


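# The pass below emits QLinearAdd, which is not a standard ONNX op: it is defined
# in ONNX Runtime's "com.microsoft" contrib domain (hence the domain kwarg on the
# node), so the converted model targets runtimes that support that domain.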
def _convert_quantizable_matmul_and_add(model: ModelProto):
    """
    A pass for converting a MatMul with kernel and bias into a quantized
    representation

    | Starting with:
    |          INPUT         QuantizeLinear (with constant kernel)
    |            |               |
    |     QuantizeLinear     DequantizeLinear
    |            |               |
    |     DequantizeLinear   Transpose
    |                  |      |
    |                   MatMul
    |                     |
    |                    Add (with constant bias)
    |                     |
    |              QuantizeLinear
    |                     |
    |             DequantizeLinear
    |                     |
    |                  OUTPUT
    |
    | We end up converting to:
    |          INPUT
    |            |
    |     QuantizeLinear
    |            |
    |     QLinearMatMul (with constant kernel)
    |            |
    |     QLinearAdd (with constant bias)
    |            |
    |     DequantizeLinear
    |            |
    |          OUTPUT
    """
    conversion_count = 0
    matmul_nodes = [n for n in model.graph.node if n.op_type == "MatMul"]
    for matmul_node in matmul_nodes:
        graph = ONNXGraph(model)

        #############
        # Matching
        #############
        weight_transpose_node = graph.get_node_single_parent(matmul_node, 1)
        if not weight_transpose_node or weight_transpose_node.op_type != "Transpose":
            continue
        weight_dequantize_node = graph.get_node_single_parent(
            weight_transpose_node, 0
        )
        if (
            not weight_dequantize_node
            or weight_dequantize_node.op_type != "DequantizeLinear"
        ):
            continue
        weight_quantize_node = graph.get_node_single_parent(weight_dequantize_node, 0)
        if (
            not weight_quantize_node
            or weight_quantize_node.op_type != "QuantizeLinear"
        ):
            continue

        input_quantize_node = graph.get_node_single_parent(matmul_node, 0)
        if (
            not input_quantize_node
            or input_quantize_node.op_type not in _QUANTIZE_OP_NAMES
        ):
            continue

        bias_add_node = graph.get_node_single_child(matmul_node)
        if not bias_add_node or bias_add_node.op_type != "Add":
            continue
        output_quantize_node = graph.get_node_single_child(bias_add_node)
        if (
            not output_quantize_node
            or output_quantize_node.op_type not in _QUANTIZE_OP_NAMES
        ):
            continue

        input_quantize_params = get_quantization_params(
            model, input_quantize_node, include_target=False
        )
        weight_quantize_params = get_quantization_params(
            model, weight_quantize_node, include_target=True
        )
        if weight_quantize_params.target is None:
            # weight initializer not included
            continue
        if input_quantize_node.op_type != "DequantizeLinear":
            continue
        if output_quantize_node.op_type != "QuantizeLinear":
            continue
        bias_initializer = get_init_by_name(model, bias_add_node.input[1])
        if bias_initializer is None:
            continue

        _LOGGER.debug(
            f"Matched quantizable MatMul weight and bias: {matmul_node.name}"
        )

        #############
        # Conversion
        #############
        # quantize weight
        quantized_weight = _quantize_array(
            weight_quantize_params.target,
            weight_quantize_params.scale,
            weight_quantize_params.zero_point,
        )
        # pre-transpose the weight to absorb the removed Transpose node
        quantized_weight = quantized_weight.transpose()
        quantized_weight_name = "{}.weight_quantized".format(matmul_node.name)
        quantized_weight_initializer = numpy_helper.from_array(
            quantized_weight, name=quantized_weight_name
        )
        model.graph.initializer.append(quantized_weight_initializer)

        # QLinearMatMul
        # get qmatmul inputs and outputs
        qmatmul_input = input_quantize_node.input[0]
        qmatmul_inputs = [
            qmatmul_input,  # x
            input_quantize_node.input[1],  # x_scale
            input_quantize_node.input[2],  # x_zero_point
            quantized_weight_name,  # w
            weight_quantize_node.input[1],  # w_scale
            weight_quantize_node.input[2],  # w_zero_point
            output_quantize_node.input[1],  # y_scale
            output_quantize_node.input[2],  # y_zero_point
        ]
        qmatmul_output = matmul_node.output[0]
        qmatmul_name = "{}_quant".format(matmul_node.name)

        # create qmatmul node and add it to graph
        qmatmul_node = onnx.helper.make_node(
            "QLinearMatMul",
            qmatmul_inputs,
            [qmatmul_output],
            qmatmul_name,
        )
        model.graph.node.append(qmatmul_node)

        # QLinearAdd
        # quantize bias
        bias_initializer = numpy_helper.to_array(bias_initializer)
        bias_scale = input_quantize_params.scale * weight_quantize_params.scale
        bias_zero_point = 0
        quantized_bias = _quantize_array(bias_initializer, bias_scale, bias_zero_point)
        quantized_bias_name = "{}.bias_quantized".format(bias_add_node.name)
        quantized_bias_initializer = numpy_helper.from_array(
            quantized_bias, name=quantized_bias_name
        )
        model.graph.initializer.append(quantized_bias_initializer)
        quantized_bias_scale_name = "{}.scale".format(quantized_bias_name)
        model.graph.initializer.append(
            numpy_helper.from_array(
                numpy.asarray(bias_scale), name=quantized_bias_scale_name
            )
        )
        quantized_bias_zero_point_name = "{}.zero_point".format(quantized_bias_name)
        model.graph.initializer.append(
            numpy_helper.from_array(
                numpy.asarray(bias_zero_point, dtype=numpy.uint8),
                name=quantized_bias_zero_point_name,
            )
        )

        # get qadd inputs and outputs
        qadd_input = qmatmul_output
        qadd_inputs = [
            qadd_input,  # x
            output_quantize_node.input[1],  # x_scale
            output_quantize_node.input[2],  # x_zero_point
            quantized_bias_name,  # b
            quantized_bias_scale_name,  # b_scale
            quantized_bias_zero_point_name,  # b_zero_point
            output_quantize_node.input[1],  # y_scale
            output_quantize_node.input[2],  # y_zero_point
        ]
        qadd_output = output_quantize_node.output[0]
        qadd_name = "{}_quant".format(bias_add_node.name)
        kwargs = {"domain": "com.microsoft"}

        # create qlinearadd node and add it to graph
        qadd_node = onnx.helper.make_node(
            "QLinearAdd",
            qadd_inputs,
            [qadd_output],
            qadd_name,
            **kwargs,
        )
        model.graph.node.append(qadd_node)

        # Cleanup
        # delete folded quantization ops
        delete_quant_node(model, weight_dequantize_node, keep_params=False)
        delete_quant_node(model, weight_quantize_node, keep_params=True)
        remove_node_and_params_from_graph(model, weight_transpose_node)
        delete_quant_node(model, input_quantize_node, keep_params=True)
        delete_quant_node(model, output_quantize_node, keep_params=True)

        # delete original MatMul node
        remove_node_and_params_from_graph(model, matmul_node, keep_params=None)
        # delete original Add node
        remove_node_and_params_from_graph(model, bias_add_node, keep_params=None)

        conversion_count += 1

    if matmul_nodes:
        _LOGGER.info(
            f"Converted {conversion_count} quantizable MatMul ops with weight and bias "
            "to QLinearMatMul and QLinearAdd"
        )