def _impl(inputs, _):
    weight = inputs[1][0]
    weight_scale = inputs[1][1]
    weight_zero_point = inputs[1][2]
    output_scale = _expr.const(inputs[2])
    output_zero_point = _expr.const(inputs[3])
    assert len(inputs) == 6, "Input quant params not found in op inputs"
    # Manually added by add_input_quant_params_to_op_inputs above
    input_scale = _expr.const(inputs[4])
    input_zero_point = _expr.const(inputs[5])

    weight_shape = infer_shape(weight)
    dense = relay.qnn.op.dense(
        inputs[0],
        weight,
        input_zero_point,
        weight_zero_point,
        input_scale,
        weight_scale,
        units=weight_shape[0],
    )
    bias_var = inputs[1][3]

    return _do_bias_and_requantize(
        dense, bias_var, input_scale, weight_scale, output_scale, output_zero_point, with_relu
    )
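
# A minimal NumPy sketch of the requantize step that _do_bias_and_requantize
# performs in Relay (hedged; the helper's body is not shown here). The int32
# dense accumulator carries scale input_scale * weight_scale, so mapping it to
# the output qparams is q_out = clip(round(acc * s_in * s_w / s_out) + zp_out, 0, 255).
import numpy as np


def requantize_sketch(acc_int32, bias_int32, s_in, s_w, s_out, zp_out, with_relu=False):
    acc = acc_int32 + bias_int32  # bias assumed pre-quantized to scale s_in * s_w
    q = np.round(acc * (s_in * s_w) / s_out) + zp_out
    if with_relu:
        q = np.maximum(q, zp_out)  # relu in the quantized domain clamps at zp_out
    return np.clip(q, 0, 255).astype(np.uint8)
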
def _convert_tsm(inexpr, keras_layer, etab):
    # Temporal Shift Module: shift a fraction of the channels along the
    # temporal axis. The segment length (number of frames) is fixed at 10.
    nt, c, h, w = infer_shape(inexpr)
    n = int(nt / 10)
    fold_div = 3
    x = _op.transform.reshape(inexpr, [n, 10, c, h, w])
    fold = c // fold_div
    split_out = _op.split(x, (fold, fold * 2), axis=2)
    out1, out2, out3 = split_out[0], split_out[1], split_out[2]

    # Shift left: drop the first frame, pad with zeros at the end
    padding_1 = _op.zeros((n, 1, fold, h, w), dtype="float32")
    out1 = _op.split(out1, (1,), axis=1)[1]
    out1 = _op.concatenate([out1, padding_1], axis=1)

    # Shift right: drop the last frame, pad with zeros at the front
    padding_2 = _op.zeros((n, 1, fold, h, w), dtype="float32")
    out2 = _op.split(out2, (10 - 1,), axis=1)[0]
    out2 = _op.concatenate([padding_2, out2], axis=1)

    out = _op.concatenate([out1, out2, out3], axis=2)
    out = _op.reshape(out, (-1, c, h, w))
    return out
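
# A NumPy sketch of the shift implemented above with splits, zero padding and
# concats, assuming the same fixed window of 10 frames: the first channel fold
# is shifted one frame into the past, the second fold one frame into the
# future, and the remaining channels are left untouched.
import numpy as np


def tsm_sketch(x, seg=10):
    nt, c, h, w = x.shape
    n, fold = nt // seg, c // 3
    x = x.reshape(n, seg, c, h, w)
    out = np.zeros_like(x)
    out[:, :-1, :fold] = x[:, 1:, :fold]  # shift left: frame t takes fold from t+1
    out[:, 1:, fold : 2 * fold] = x[:, :-1, fold : 2 * fold]  # shift right: from t-1
    out[:, :, 2 * fold :] = x[:, :, 2 * fold :]  # no shift
    return out.reshape(nt, c, h, w)
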
def _impl(inputs, _):
    # Dynamic quantized linear: compute input qparams at runtime, quantize the
    # input to uint8, run qnn dense, then dequantize back to float.
    weight = inputs[1][0]
    weight_scale = inputs[1][1]
    weight_zero_point = inputs[1][2]

    inp = inputs[0]

    input_scale, input_zero_point = _calculate_qparam(inp)
    qinp = relay.qnn.op.quantize(inp, input_scale, input_zero_point, out_dtype="uint8")

    data_shape = infer_shape(inp)

    if len(data_shape) > 2:
        # Flatten leading dims so dense sees a 2D input
        qinp = _op.reverse_reshape(qinp, [-1, 0])

    weight_shape = infer_shape(weight)
    units = weight_shape[0]

    dense = relay.qnn.op.dense(
        qinp,
        weight,
        input_zero_point,
        weight_zero_point,
        input_scale,
        weight_scale,
        units=units,
    )
    bias_var = inputs[1][3]

    dequant_scale = input_scale * weight_scale
    dense_out = relay.qnn.op.dequantize(
        dense, dequant_scale, input_zero_point=relay.const(0, "int32"), axis=1
    )

    if len(data_shape) > 2:
        new_shape = list(data_shape[:-1])
        new_shape.append(units)
        dense_out = _op.reshape(dense_out, new_shape)

    if bias_var is not None:
        return dense_out + bias_var

    return dense_out
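
# _calculate_qparam is not shown here; a hedged NumPy sketch of what such a
# helper typically computes for dynamic uint8 quantization: an affine
# scale/zero point from the runtime min/max of the tensor (the real helper
# builds the equivalent Relay expressions).
import numpy as np


def calculate_qparam_sketch(x):
    mn, mx = min(float(x.min()), 0.0), max(float(x.max()), 0.0)  # range must cover 0.0
    scale = max((mx - mn) / 255.0, 1e-8)  # guard against an all-zero tensor
    zero_point = int(np.clip(np.round(-mn / scale), 0, 255))
    return scale, zero_point
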
def _impl(inputs, _):
    dim = len(infer_shape(inputs[0]))
    if dim > 1:
        axis = 1
    else:
        axis = 0

    return relay.qnn.op.quantize(
        inputs[0], _expr.const(inputs[1]), _expr.const(inputs[2]), out_dtype="uint8", axis=axis
    )
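
# Elementwise, the quantize op built above computes
# q = clip(round(x / scale) + zero_point, 0, 255); a NumPy equivalent for the
# scalar (per-tensor) case:
import numpy as np


def quantize_sketch(x, scale, zero_point):
    return np.clip(np.round(x / scale) + zero_point, 0, 255).astype(np.uint8)
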
def _impl(inputs, _):
    # Refer to aten/src/ATen/native/quantized/cpu/qconv.cpp
    # Supported in Torch 1.7 or newer
    conv_params = inputs[1]
    weight = conv_params[0]
    weight_scale = conv_params[1]
    weight_zero_point = conv_params[2]
    bias = conv_params[3]

    strides = conv_params[4]
    padding = conv_params[5]
    dilation = conv_params[6]
    groups = conv_params[7]
    output_padding = conv_params[8]

    output_scale = _expr.const(inputs[2])
    output_zero_point = _expr.const(inputs[3])

    assert len(inputs) == 6, "Input quant params not found in op inputs"

    # These are manually added by add_input_quant_params_to_op_inputs above
    # In torch, they are retrieved from QTensor data structure at runtime
    input_scale = _expr.const(inputs[4])
    input_zero_point = _expr.const(inputs[5])

    weight_shape = list(infer_shape(weight))

    # Swap I and O dims to match the shape relay expects for OIHW
    weight_shape[0], weight_shape[1] = weight_shape[1], weight_shape[0]

    kernel_size = (weight_shape[2], weight_shape[3])
    out_channels = weight_shape[0]

    conv_out = relay.qnn.op.conv2d_transpose(
        inputs[0],
        weight,
        input_zero_point,
        weight_zero_point,
        input_scale,
        weight_scale,
        kernel_size=kernel_size,
        dilation=dilation,
        strides=strides,
        padding=padding,
        groups=groups,
        channels=out_channels,
        output_padding=output_padding,
        out_dtype="int32",
        kernel_layout="OIHW",
    )

    return _do_bias_and_requantize(
        conv_out, bias, input_scale, weight_scale, output_scale, output_zero_point, with_relu
    )
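
# The swap above is pure shape bookkeeping: torch stores conv_transpose2d
# weights as (in_channels, out_channels // groups, kH, kW), so out_channels
# sits in position 1 of the stored shape. A shape-only illustration with
# hypothetical numbers:
def _transpose_weight_shape_example():
    weight_shape = [16, 32, 3, 3]  # (I, O, kH, kW) as stored by torch (hypothetical)
    weight_shape[0], weight_shape[1] = weight_shape[1], weight_shape[0]  # -> (O, I, kH, kW)
    return (weight_shape[2], weight_shape[3]), weight_shape[0]  # ((3, 3), 32)
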
def _impl(inputs, _):
    # refer to aten/src/ATen/native/quantized/cpu/qmul.cpp
    # math for calculating output scale and zp is already done
    # during _add_output_quant_params_to_scalar_op above
    assert len(inputs) == 6, "Input quant params not found in op inputs"
    other_val = inputs[1]  # scalar

    if other_val > 0.0:
        # only the scale changes, so the quantized values pass through
        return inputs[0]
    if other_val == 0.0:
        shape = infer_shape(inputs[0])
        return _op.full(_expr.const(0), shape, dtype="uint8")

    # negative scale case: flip the quantized values around the midpoint,
    # q' = (q_max + q_min) - q; the int8 arithmetic below relies on
    # two's-complement wraparound (255 as int8 is -1) to compute this
    q_min = 0
    q_max = 255
    bias = _expr.const(q_max + q_min, dtype="int8")
    int8 = bias - _op.cast(inputs[0], "int8")
    return _op.cast(int8, "uint8")
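
# Worked example of the negative-scale branch: the sign of the scalar is folded
# into the output scale, so only the quantized values need flipping around the
# uint8 range, q' = (q_max + q_min) - q = 255 - q. Using int16 here to sidestep
# the wraparound the Relay version deliberately exploits:
import numpy as np

q = np.array([0, 1, 128, 255], dtype=np.uint8)
flipped = (255 - q.astype(np.int16)).astype(np.uint8)
print(flipped)  # [255 254 127   0]
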
def _impl(inputs, _):
    # refer to src/ATen/native/quantized/cpu/qconv.cpp
    # inputs[0]: input tensor
    # inputs[1]: (weight, scale, zero_point, bias)
    # inputs[2-5]: stride, padding, dilation, groups
    # inputs[6]: output_scale
    # inputs[7]: output_zero_point
    # inputs[8]: input_scale (added manually by frontend)
    # inputs[9]: input_zero_point (added manually by frontend)
    weight = inputs[1][0]
    weight_scale = inputs[1][1]
    weight_zero_point = inputs[1][2]

    output_scale = _expr.const(inputs[6])
    output_zero_point = _expr.const(inputs[7])

    assert len(inputs) == 10, "Input quant params not found in op inputs"
    # These are manually added by add_input_quant_params_to_op_inputs above
    # In torch, they are retrieved from QTensor data structure at runtime
    input_scale = _expr.const(inputs[8])
    input_zero_point = _expr.const(inputs[9])

    strides = infer_shape(inputs[2])
    padding = infer_shape(inputs[3])
    dilation = infer_shape(inputs[4])
    groups = inputs[5]

    weight_shape = infer_shape(weight)
    kernel_size = (weight_shape[2], weight_shape[3])
    out_channels = weight_shape[0]

    if padding[0] != 0 or padding[1] != 0:
        # qnn.conv2d pads with zero, but the quantized value representing
        # 0.0 is the input zero point, so pad explicitly with it instead
        pad_val = _get_scalar(input_zero_point)
        inp = _op.nn.pad(
            inputs[0],
            pad_width=((0, 0), (0, 0), (padding[0], padding[0]), (padding[1], padding[1])),
            pad_value=float(pad_val),
        )
    else:
        inp = inputs[0]

    # padding is (0, 0) because we did an explicit pad op with
    # the pad value being the zero point above
    conv_out = relay.qnn.op.conv2d(
        inp,
        weight,
        input_zero_point,
        weight_zero_point,
        input_scale,
        weight_scale,
        kernel_size=kernel_size,
        dilation=dilation,
        strides=strides,
        padding=(0, 0),
        groups=groups,
        channels=out_channels,
    )
    bias_var = inputs[1][3]

    return _do_bias_and_requantize(
        conv_out, bias_var, input_scale, weight_scale, output_scale, output_zero_point, with_relu
    )
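
# A quick NumPy check of why padding with the input zero point is correct: in
# the affine scheme real = scale * (q - zero_point), the quantized value that
# represents 0.0 is exactly zero_point, so padding the uint8 tensor with zp is
# the same as zero-padding the float tensor.
import numpy as np

scale, zp = 0.05, 3
q = np.full((1, 1, 2, 2), 7, dtype=np.uint8)
padded = np.pad(q, ((0, 0), (0, 0), (1, 1), (1, 1)), constant_values=zp)
real = scale * (padded.astype(np.int32) - zp)
print(real[0, 0])  # border entries are exactly 0.0
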
def _convert_attention_mask(inexpr, keras_layer, etab):
    xsum = _op.reduce.sum(_op.reduce.sum(inexpr, axis=2, keepdims=True), axis=3, keepdims=True)
    xshape = infer_shape(inexpr)
    out = (
        inexpr
        / xsum
        * tvm.relay.expr.const(xshape[2], dtype="float32")
        * tvm.relay.expr.const(xshape[3], dtype="float32")
        * tvm.relay.expr.const(0.5, dtype="float32")
    )
    return out
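
# What the conversion above computes, in NumPy terms: normalize each sample by
# its spatial sum, then rescale by H * W * 0.5 so the mask values average 0.5.
import numpy as np


def attention_mask_sketch(x):  # x: (N, C, H, W)
    xsum = x.sum(axis=(2, 3), keepdims=True)
    _, _, h, w = x.shape
    return x / xsum * (h * w * 0.5)
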