def _do_bias_and_requantize(
    output, bias, input_scale, weight_scale, output_scale, output_zero_point, with_relu
):
    """Output processing for conv and linear"""
    # this is a vector for the per channel case
    requant_input_scale = _expr.const(_get_numpy(input_scale) * _get_numpy(weight_scale))
    # Torch does bias add and requantize scaling in fp32
    # refer to third_party/fbgemm/include/fbgemm/OutputProcessing-inl.h
    # Instead, we do bias add in int32 and use qnn requantize, which needs
    # integer input.
    # We observed no loss in accuracy doing it this way, and it is better
    # for tvm because bias quantization can be done at compile time.
    # In contrast, the torch way requires rounding of activation at runtime.

    if bias is not None:
        requantize_input = _op.nn.bias_add(output, bias)
    else:
        requantize_input = output

    requantized = relay.qnn.op.requantize(
        requantize_input,
        requant_input_scale,
        relay.const(0, "int32"),
        output_scale,
        output_zero_point,
        out_dtype="int32",
        axis=1,
    )
    clip_min = 0
    if with_relu:
        clip_min = _get_scalar(output_zero_point)

    clip = _op.tensor.clip(requantized, clip_min, 255.0)
    return _op.cast(clip, dtype="uint8")

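# A minimal NumPy sketch of the requantize arithmetic above, assuming
# per-tensor scales. The name `_requantize_reference` and its arguments are
# illustrative only and not part of the converter; note the rounding mode of
# relay.qnn.op.requantize (UPWARD by default) can differ from np.round.
def _requantize_reference(acc_int32, input_scale, weight_scale, output_scale, output_zp):
    import numpy as np

    # the int32 accumulator carries scale input_scale * weight_scale, zero point 0
    multiplier = input_scale * weight_scale / output_scale
    q = np.round(acc_int32 * multiplier) + output_zp
    return np.clip(q, 0, 255).astype("uint8")
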
def _impl(inputs, _):
    # refer to aten/src/ATen/native/quantized/cpu/qmul.cpp
    # math for calculating output scale and zp are already done
    # during _add_output_quant_params_to_scalar_op above
    assert len(inputs) == 6, "Input quant params not found in op inputs"
    other_val = inputs[1]  # scalar

    if other_val > 0.0:
        # only the scale changes
        return inputs[0]
    if other_val == 0.0:
        shape = infer_shape(inputs[0])
        return _op.full(_expr.const(0), shape, dtype="uint8")

    # negative scale case: map q to q_max + q_min - q (i.e. 255 - q for uint8).
    # The constant 255 wraps to -1 in int8, so the subtraction below computes
    # 255 - q via two's complement wraparound.
    q_min = 0
    q_max = 255
    bias = _expr.const(q_max + q_min, dtype="int8")
    int8 = bias - _op.cast(inputs[0], "int8")
    return _op.cast(int8, "uint8")

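# A hedged numeric check of the negative-scale branch above (illustrative
# only, not part of the converter): qmul.cpp maps each uint8 value to
# q_max + q_min - q while the zero point is flipped upstream, and the int8
# wraparound arithmetic reproduces exactly 255 - q.
def _negate_uint8_reference():
    import numpy as np

    q = np.arange(256, dtype="uint8")
    # mirrors the Relay expression: (-1 in int8) - cast(q, int8), cast to uint8
    wrapped = (np.int8(-1) - q.astype("int8")).astype("uint8")
    assert np.array_equal(wrapped, 255 - q.astype("int32"))
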
def _calculate_qparam(inp):
    # reference ATen/native/quantized/cpu/qlinear_dynamic.cpp
    # ChooseQuantizationParams function
    mn = _op.min(inp)
    mx = _op.max(inp)

    # Ensure that the interval contains 0
    mn = _op.minimum(mn, _op.const(0.0, dtype="float32"))
    mx = _op.maximum(mx, _op.const(0.0, dtype="float32"))

    qmax = 255

    # reduce_range became True in v1.6
    if is_version_greater_than("1.5.1"):
        qmax = 127

    scale = (mx - mn) / _expr.const(qmax, dtype="float32")

    zero_point_from_min = -(mn / scale)
    zero_point = _op.cast(_op.round(_op.clip(zero_point_from_min, 0.0, qmax)), "int32")

    return scale, zero_point

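# A NumPy sketch of the same ChooseQuantizationParams math, for reference.
# The name `_calculate_qparam_reference` is illustrative; this assumes the
# post-1.5.1 reduced range (qmax = 127) and a non-degenerate input range
# (scale > 0).
def _calculate_qparam_reference(arr):
    import numpy as np

    mn = min(float(arr.min()), 0.0)  # the interval must contain 0
    mx = max(float(arr.max()), 0.0)
    qmax = 127
    scale = (mx - mn) / qmax
    zero_point = int(np.round(np.clip(-mn / scale, 0.0, qmax)))
    return scale, zero_point
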
def quantized_relu(data, input_zero_point):
    # refer to aten/src/ATen/native/quantized/cpu/qrelu.cpp
    zp = _op.cast(input_zero_point, dtype="uint8")
    return _op.tensor.maximum(data, zp)

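# Why max(q, zp) implements ReLU: with dequantization x = s * (q - z) and
# s > 0, s * (max(q, z) - z) == max(s * (q - z), 0). A small numeric check
# (illustrative only, not part of the converter):
def _check_quantized_relu(s=0.1, z=10):
    import numpy as np

    q = np.arange(256, dtype="uint8")
    lhs = s * (np.maximum(q, z).astype("float32") - z)
    rhs = np.maximum(s * (q.astype("float32") - z), 0.0)
    assert np.allclose(lhs, rhs)
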
def apply_with_upcast(data, func):
    # do the computation in int32 to avoid uint8 overflow, then cast back
    inp = _op.cast(data, dtype="int32")
    out = func(inp)
    return _op.cast(out, "uint8")

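# Why the int32 upcast matters (a hedged sketch, not part of the converter):
# sums over uint8 values wrap around, while int32 accumulation stays exact.
def _upcast_motivation_example():
    import numpy as np

    window = np.array([200, 200, 200, 200], dtype="uint8")
    # a naive uint8 sum would wrap: (4 * 200) % 256 == 32, giving a wrong mean
    assert window.astype("int32").sum() // 4 == 200  # int32 accumulation is exact
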
def get_reverse_mode_result(e, d, t):
    assert isinstance(t, TensorType)
    return op.cast(e * d, "float32")

def quantized_adaptive_avg_2d(data, func_fp32):
    # this follows the tflite impl
    inp = _op.cast(data, dtype="int32")
    out = func_fp32(inp)
    return _op.cast(out, "uint8")

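# Example usage (a sketch): applying Relay's adaptive_avg_pool2d through the
# upcast wrapper; the function name and output_size below are illustrative.
def _adaptive_avg_pool_uint8_example(data):
    return quantized_adaptive_avg_2d(
        data, lambda x: _op.nn.adaptive_avg_pool2d(x, output_size=(1, 1))
    )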