def selu_compute(input_data):
    """Build the SELU compute expression for ``input_data``.

    The input is split into its non-positive and non-negative parts. The
    non-positive part goes through ``(exp(x) - 1) * SCALE_ALPHA_PRODUCT`` and
    the non-negative part through ``x * SCALE``; the two are then summed.

    Args:
        input_data (tvm.tensor.Tensor): Input tensor. "float16" and "float32"
            inputs are computed in float32, any other dtype in float16.

    Returns:
        tvm.tensor.Tensor, cast back to the input dtype when that dtype is
        "float16", "int8" or "int32".
    """
    dtype = input_data.dtype
    # Pick the working precision: float32 for float inputs, float16 otherwise.
    type_tmp = "float32" if dtype in ("float16", "float32") else "float16"
    input_data = topi.cast(input_data, type_tmp)

    def _scalar(value):
        """Wrap ``value`` as a constant of the working dtype."""
        return tvm.const(value, dtype=type_tmp)

    # A zero tensor of the same shape, used to separate the two halves.
    zeros = topi.multiply(input_data, _scalar(0))
    negative_part = topi.minimum(input_data, zeros)
    positive_part = topi.maximum(input_data, zeros)

    # Negative half: (exp(x) - 1) * SCALE_ALPHA_PRODUCT.
    exp_minus_one = topi.add(exp(negative_part), _scalar(SCALAR_NEGATIVE_ONE))
    negative_scaled = topi.multiply(exp_minus_one, _scalar(SCALE_ALPHA_PRODUCT))
    if dtype == "int8":
        negative_scaled = akg.lang.cce.ceil(negative_scaled)

    # Positive half: x * SCALE.
    positive_scaled = topi.multiply(positive_part, _scalar(SCALE))
    res = topi.add(negative_scaled, positive_scaled)

    # Restore the caller's dtype where the working dtype differs from it.
    if dtype in ("float16", "int8", "int32"):
        res = topi.cast(res, dtype)
    return res
def fused_bn_follow_relu_avgpool(data0, data1, data2, data3, data4, data5,
                                 layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    Fuse batch-norm follow-up, residual add, ReLU and a spatial mean.

    input:
        data0: tensor added to the batch-normalized result
        data1-5: arguments forwarded, in order, to fused_bn_follow
        layout: only 'NHWC' and 'NCHW' are supported; for 'NCHW' data0 and
                data5 are transposed to NHWC first
        out_dtype: dtype of the returned tensor
        target: not used in this function body

    output:
        mean over axes 1 and 2 of max(data0 + fused_bn_follow(data1..data5), 0),
        accumulated in float32 and cast to out_dtype
    """
    if layout not in ('NHWC', 'NCHW'):
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))
    if layout == 'NCHW':
        data0 = topi.transpose(data0, (0, 2, 3, 1))
        data5 = topi.transpose(data5, (0, 2, 3, 1))

    # Only the two spatial extents are needed for the averaging divisor.
    _, height, width, _ = data0.shape

    bn_out = topi.cast(fused_bn_follow(data1, data2, data3, data4, data5), data0.dtype)
    relu_out = topi.maximum(topi.add(data0, bn_out), 0)

    # Reduce in float32, then divide by the number of spatial positions.
    pooled = topi.sum(topi.cast(relu_out, 'float32'), axis=(1, 2))
    pooled = topi.divide(pooled, height * width)
    return topi.cast(pooled, out_dtype)
def fused_bn_follow_relu(data0, data1, data2, data3, data4,
                         layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    Apply fused_bn_follow to the five inputs and clamp the result at zero.

    input:
        data0-4: forwarded, in order, to fused_bn_follow
            data0: param0 beta
            data1: param1 gamma
            data2: param2 BNupdate: xi_variance
            data3: param6 BNreduce: xi_mean
            data4: param7 xi_conv2d, float16
        layout: only 'NHWC' and 'NCHW' are supported; for 'NCHW' data4 is
                transposed to NHWC before the computation and the result is
                transposed back
        out_dtype: dtype the batch-norm result is cast to before the ReLU
        target: not used in this function body

    output:
        ReLU: max(batch-normalized tensor, 0)
    """
    if layout not in ('NHWC', 'NCHW'):
        raise NotImplementedError(
            'Layout not supported {} '.format(layout))

    is_nchw = layout == 'NCHW'
    if is_nchw:
        data4 = topi.transpose(data4, (0, 2, 3, 1))

    normalized = topi.cast(fused_bn_follow(data0, data1, data2, data3, data4), out_dtype)
    output = topi.maximum(normalized, 0)

    # Return the tensor in the layout the caller supplied.
    return topi.transpose(output, (0, 3, 1, 2)) if is_nchw else output
def maximum(data1, data2, target=utils.CCE):
    """
    Compute the element-wise maximum of two tensors with auto-broadcasting.

    Args:
        data1: tvm.tensor.Tensor
        data2: tvm.tensor.Tensor
        target: target platform; checked with utils.check_supported_target.

    Returns:
        tvm.tensor.Tensor of maximum of two tensors.

    Supported Platforms:
        'Ascend', 'GPU', 'CPU'
    """
    utils.check_supported_target(target)
    lhs_shape = [dim.value for dim in data1.shape]
    rhs_shape = [dim.value for dim in data2.shape]
    utils.check_shape(lhs_shape)
    utils.check_shape(rhs_shape)
    utils.auto_broadcast_check(lhs_shape, rhs_shape)
    utils.elemwise_dtype_check(data1.dtype, data2.dtype)

    dtype = data1.dtype
    # On the CCE target, 8-bit integers take a round trip through float16.
    if target == utils.CCE and dtype in ("int8", "uint8"):
        widened = topi.maximum(Cast(data1, "float16"), Cast(data2, "float16"))
        return Cast(widened, dtype)
    return topi.maximum(data1, data2)
def fake_quant_with_min_max_args(input_data, min_=-6, max_=6, num_bits=8, narrow_range=False):
    """
    Fake-quantize a float32 tensor into an output tensor of the same type.

    output_data = floor(clamped_shifted * inv_nudged_scale + 0.5) * scale + nudged_min
    where nudged_min, nudged_max and scale come from nudge_min_max.

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_; max_] define the
            clamping range for input_data
        num_bits ([float, int]): Defaults to 8. Bit width of the quantization,
            forwarded to nudge_min_max
        narrow_range ([bool]): Defaults to False. Forwarded to nudge_min_max

    Returns:
        tvm.tensor.Tensor
    """
    utils.check_shape(get_shape(input_data))
    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits, narrow_range)

    # Broadcast the scalar bounds to the input shape through a zero tensor.
    zeros = tvm.compute(input_data.shape,
                        lambda *i: tvm.const(0, dtype="float32"),
                        name="zero_tensor")
    upper = topi.add(zeros, nudged_max)
    lower = topi.add(zeros, nudged_min)
    inv_scale = 1.00 / scale

    # Clamp the input into [nudged_min, nudged_max].
    clamped = topi.maximum(topi.minimum(input_data, upper), lower)

    # Quantize: shift to zero, scale to steps, round half up via floor(x + 0.5).
    shifted = topi.subtract(clamped, lower)
    steps = topi.add(topi.multiply(shifted, inv_scale), 0.5)
    quantized = akg.lang.ascend.cast_to(floor(steps), dtype)

    # Dequantize back into the original value range.
    return topi.add(topi.multiply(quantized, scale), lower)
def my_dsl(dtype, kernel_name, attrs):
    """Build a one-dimensional element-wise kernel for the instruction ``insn``.

    ``insn`` and ``insnType`` are not parameters; they are read from the
    enclosing scope and must be defined there before this function is called.

    Args:
        dtype: dtype of the placeholders ``A`` and ``B``.
        kernel_name: name passed to ``akg.build``.
        attrs: attributes passed to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        ValueError: if ``insn`` is not one of the supported instructions.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    # One builder per instruction. Each is a thunk so that only the requested
    # op is constructed.
    builders = {
        "add": lambda: topi.add(A, B),
        "sub": lambda: topi.subtract(A, B),
        "mul": lambda: topi.multiply(A, B),
        "div": lambda: topi.divide(A, B),
        "max": lambda: topi.maximum(A, B),
        "min": lambda: topi.minimum(A, B),
        "abs": lambda: tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C'),
        "exp": lambda: topi.exp(A),
        "log": lambda: topi.log(A),
        # The previous code overwrote the sqrt result with topi.log(A).
        "sqrt": lambda: topi.sqrt(A),
        "adds": lambda: A + tvm.const(2, dtype),
        "muls": lambda: A * tvm.const(2, dtype),
    }
    if insn not in builders:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("Unsupported insn {!r}; expected one of {}".format(
            insn, sorted(builders)))
    C = builders[insn]()

    s = tvm.create_schedule([C.op])
    # Binary instructions take both inputs; the others only read A.
    operands = [A, B, C] if insnType == "binary" else [A, C]
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        mod = akg.build(s, operands, "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Compute one ada_max update step.

    The update performed is:
        m   += (grad - m) * (1 - beta1)
        v    = max(beta2 * v, abs(grad))
        var -= lr * m / ((1 - beta1_power) * (v + epsilon))

    ``lr``, ``beta1``, ``beta1_power`` and ``beta2`` are tensors; element 0 of
    each is used as the scalar (``beta1``/``beta1_power`` after the
    "1 - x" transform). ``epsilon`` is a Python scalar wrapped as a float32
    constant. float16 inputs are computed in float32 and the three results are
    cast back to float16.

    Returns:
        Tuple of the updated (var, m, v).
    """
    inp_dtype = var.dtype
    is_fp16 = inp_dtype == 'float16'

    # cast to float32 for improved accuracy
    if is_fp16:
        var, m, v, lr, beta1_power, beta1, beta2, grad = (
            topi.cast(tensor, 'float32')
            for tensor in (var, m, v, lr, beta1_power, beta1, beta2, grad)
        )
    epsilon = tvm.const(epsilon, 'float32')

    def _one_minus(tensor):
        """Return 1 - tensor, built as tensor * (-1) followed by + 1."""
        negated = tvm.compute(tensor.shape,
                              lambda *i: tensor(*i) * neg_one_const("float32"))
        return tvm.compute(negated.shape,
                           lambda *i: negated(*i) + one_const("float32"))

    # m += (grad - m) * (1 - beta1)
    one_minus_beta1 = _one_minus(beta1)
    grad_minus_m = topi.subtract(grad, m)
    m_delta = tvm.compute(grad_minus_m.shape,
                          lambda *i: grad_minus_m(*i) * one_minus_beta1[0])
    m = topi.add(m, m_delta)

    # v = max(beta2 * v, abs(grad))
    decayed_v = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    v = topi.maximum(decayed_v, topi.abs(grad))

    # denominator = (1 - beta1_power) * (v + epsilon)
    v_plus_eps = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    one_minus_beta1_power = _one_minus(beta1_power)
    denominator = tvm.compute(v_plus_eps.shape,
                              lambda *i: v_plus_eps(*i) * one_minus_beta1_power[0])

    # numerator = lr * m; the division is done as a multiply by the reciprocal.
    numerator = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    step = topi.multiply(numerator, reciprocal(denominator))
    var = topi.subtract(var, step)

    if is_fp16:
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)
    return var, m, v
def _cmpare_value(input_data, nudged_min, nudged_max):
    """
    Arithmetic form of where((input_data<=nudged_max)&(input_data>=nudged_min),1,0).

    No comparison op is used. The two distances to the bounds are each offset
    by 2**(-126) and clamped at zero, their product is capped at 2**(-126),
    and the capped value is scaled by 2**62 * 2**62 * 2**2 = 2**126.

    Args:
        input_data (tvm.tensor.Tensor): Input data
        nudged_min (tvm.tensor.Tensor): Minimum value of comparison
        nudged_max (tvm.tensor.Tensor): Maximum value of comparison

    Returns:
        tvm.tensor.Tensor
    """
    tiny = tvm.const(2**(-126), dtype="float32")
    # (2**(-126))*(2**(62))*(2**(62))*(2**(2)) = 1, so multiplying the capped
    # value by these three factors maps 2**(-126) to 1.
    big = tvm.const(2**(62), dtype="float32")
    four = tvm.const(2**(2), dtype="float32")

    # Broadcast each constant to the input shape through a zero tensor.
    zeros = topi.multiply(input_data, 0)
    big_tensor = topi.add(zeros, big)
    tiny_tensor = topi.add(zeros, tiny)
    four_tensor = topi.add(zeros, four)

    # max(input - nudged_min + tiny, 0): zero when the offset distance is not positive.
    above_min = topi.maximum(
        topi.add(topi.subtract(input_data, nudged_min), tiny), zeros)
    # max(nudged_max - input + tiny, 0): zero when the offset distance is not positive.
    below_max = topi.maximum(
        topi.add(topi.subtract(nudged_max, input_data), tiny), zeros)

    # Cap the product at tiny, then rescale in three steps.
    flag = topi.minimum(topi.multiply(above_min, below_max), tiny_tensor)
    for factor in (big_tensor, big_tensor, four_tensor):
        flag = topi.multiply(flag, factor)
    return flag
def truncate_div_compute(input_x1, input_x2):
    """Compute input_x1 / input_x2, truncating toward zero for integer dtypes.

    For "int32", "int8" and "uint8" inputs the division is done in float32 and
    truncated as ceil(min(q, 0)) + floor(max(q, 0)). Other dtypes are divided
    directly. The result is cast to the dtype of ``input_x1``.
    """
    out_dtype = input_x1.dtype

    # Non-integer inputs need no truncation step.
    if out_dtype not in ("int32", "int8", "uint8"):
        return cast(topi.divide(input_x1, input_x2), out_dtype)

    zero = dc.zero_const("float32")
    dividend = cast(input_x1, "float32")
    divisor = cast(input_x2, "float32")
    quotient = topi.divide(dividend, divisor)

    # The negative half is rounded up and the positive half down, so their
    # sum is the quotient rounded toward zero.
    negative_half = ceil(topi.minimum(quotient, zero))
    positive_half = floor(topi.maximum(quotient, zero))
    truncated = cast(topi.add(negative_half, positive_half), "float32")
    return cast(truncated, out_dtype)
def less_compare_float32(data_x, data_y):
    """Return 1 where data_x is less than data_y and 0 elsewhere.

    No comparison op is used: (data_y - data_x) is clipped into
    [0, 2**(-126)] and the clipped value is multiplied by 2**126.
    """
    shape = get_shape(data_x)
    # 2**(-126) is the smallest value used as the "is positive" marker.
    tiny = akg.lang.ascend.broadcast(tvm.const(2**(-126), dtype="float32"), shape, "float32")
    zeros = akg.lang.ascend.broadcast(dc.zero_const("float32"), shape, "float32")

    difference = topi.subtract(data_y, data_x)
    res = topi.maximum(topi.minimum(difference, tiny), zeros)

    # Scale by 2**126 in three steps (2**62 * 2**62 * 2**2); the original
    # comment states that cce supports constants only up to 2**62.
    for exponent in (62, 62, 2):
        res = topi.multiply(res, tvm.const(2**exponent, dtype="float32"))
    return res
def fused_bn_double_follow_relu(data0, data1, data2, data3, data4,
                                data5, data6, data7, data8, data9,
                                layout='NHWC', out_dtype='float16', target=utils.CUDA):
    """
    Sum two fused_bn_follow results and clamp the sum at zero.

    input:
        data0-4: forwarded, in order, to fused_bn_follow for tensor 1
        data5-9: forwarded, in order, to fused_bn_follow for tensor 2
        layout: only 'NHWC' and 'NCHW' are supported; for 'NCHW' data4 and
                data9 are transposed to NHWC before the computation and the
                result is transposed back
        out_dtype: dtype both batch-norm results are cast to before the add
        target: not used in this function body

    output:
        ReLU: max(batch-normalized tensor1 + batch-normalized tensor2, 0)
    """
    if layout not in ('NHWC', 'NCHW'):
        raise NotImplementedError('Layout not supported {} '.format(layout))

    is_nchw = layout == 'NCHW'
    if is_nchw:
        data4 = topi.transpose(data4, (0, 2, 3, 1))
        data9 = topi.transpose(data9, (0, 2, 3, 1))

    first = fused_bn_follow(data0, data1, data2, data3, data4)
    second = fused_bn_follow(data5, data6, data7, data8, data9)
    first = topi.cast(first, out_dtype)
    second = topi.cast(second, out_dtype)

    output = topi.maximum(topi.add(first, second), 0)

    # Return the tensor in the layout the caller supplied.
    return topi.transpose(output, (0, 3, 1, 2)) if is_nchw else output
def maximum(data1, data2):
    """
    Compute the element-wise maximum of two tensors with auto-broadcasting.

    Both shapes are validated and checked for broadcast compatibility, and the
    two dtypes are checked with vc_util.elemwise_dtype_check, before the
    maximum is built.

    Args:
        data1: tvm.tensor.Tensor
        data2: tvm.tensor.Tensor

    Returns:
        tvm.tensor.Tensor of maximum of two tensors.
    """
    shapes = [[dim.value for dim in tensor.shape] for tensor in (data1, data2)]
    for shape in shapes:
        vc_util.check_shape(shape)
    vc_util.auto_broadcast_check(*shapes)
    vc_util.elemwise_dtype_check(data1.dtype, data2.dtype)
    return topi.maximum(data1, data2)
def fake_quant_with_min_max_vars_per_channel_compute(input_data, input_min, input_max,
                                                     num_bits=8, narrow_range=False):
    """Build the fake_quant_with_min_max_vars_per_channel compute expression.

    ``input_min`` and ``input_max`` are broadcast to the input shape and
    nudged by nudged_min_max_compute. The input is clamped to the nudged
    range, quantized with floor(x / scale + 0.5), dequantized again, and
    finally multiplied by the result of bool_both_zero_compute.

    Args:
        input_data (tvm.tensor.Tensor): Tensor to fake-quantize.
        input_min (tvm.tensor.Tensor): Minimum values, broadcast to the input shape.
        input_max (tvm.tensor.Tensor): Maximum values, broadcast to the input shape.
        num_bits (int): Defaults to 8. Forwarded to nudged_min_max_compute.
        narrow_range (bool): Defaults to False. Forwarded to nudged_min_max_compute.

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)

    # nudged_min_max_compute returns (nudged_min, nudged_max, scale) by index.
    nudged = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range)
    nudged_min, nudged_max, scale = nudged[0], nudged[1], nudged[2]

    # Clamp the input into [nudged_min, nudged_max] and shift it to start at zero.
    clamped = topi.maximum(topi.minimum(input_data, nudged_max), nudged_min)
    shifted = topi.subtract(clamped, nudged_min)

    # On the mini product the division is done as a multiply by the reciprocal.
    if product_is_mini():
        steps = mul(shifted, reciprocal(scale), target=utils.CCE)
    else:
        steps = Divide(shifted, scale, target=utils.CCE)

    # Round half up: floor(x + 0.5).
    floored = akg.lang.ascend.floor(topi.add(steps, dc.half_const(dtype)))
    # The mini product goes through float16 on the way to float32.
    if product_is_mini():
        floored = topi.cast(floored, "float16")
    floored = topi.cast(floored, "float32")

    # Dequantize back into the original value range.
    dequantized = topi.add(topi.multiply(floored, scale), nudged_min)

    # Multiply by the mask derived from the broadcast min and max tensors.
    both_zero_mask = bool_both_zero_compute(min_broadcast, max_broadcast)
    return topi.multiply(dequantized, both_zero_mask)
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """Compute one adagrad_da update step.

    Steps:
        1. grad_accum += grad
        2. grad_squared_accum += grad * grad
        3. tmp_val = Sign(grad_accum) * max(|grad_accum| - l1 * global_step, 0)
           if l1 > 0, else grad_accum
        4. x_value = -1 * lr * tmp_val
        5. y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
        6. var = x_value / y_value

    ``var`` is read only for its dtype. When that dtype is float16 the other
    tensor inputs are computed in float32 and the three results are cast back
    to float16.

    Returns:
        Tuple of (var_out, gradient_accum, gradient_squared_accum).
    """
    dtype = var.dtype
    is_fp16 = dtype == "float16"

    # cast to float32 for higher precision
    if is_fp16:
        gradient_accum, gradient_squared_accum, grad, lr, l1, l2 = (
            topi.cast(tensor, "float32")
            for tensor in (gradient_accum, gradient_squared_accum, grad, lr, l1, l2)
        )
    # global_step always ends up float32; the mini product goes through float16 first.
    if product_is_mini():
        global_step = topi.cast(global_step, "float16")
    global_step = topi.cast(global_step, "float32")

    # 1. grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2. grad_squared_accum += grad * grad
    gradient_squared_accum = topi.add(gradient_squared_accum, topi.multiply(grad, grad))

    # 3. Shrink |grad_accum| by l1 * global_step and clamp the result at zero.
    sign_val = Sign(gradient_accum)
    abs_val = topi.abs(gradient_accum)
    threshold = topi.multiply(global_step, l1)
    excess = topi.subtract(abs_val, threshold)
    shrunk = topi.multiply(sign_val, topi.maximum(excess, tvm.const(0, excess.dtype)))

    # Pick the shrunk value if l1 > 0, else the plain accumulator. On the mini
    # product the Select is done in float16 and cast back to float32.
    sel_l1, sel_shrunk, sel_accum = l1, shrunk, gradient_accum
    if product_is_mini():
        sel_l1 = topi.cast(sel_l1, "float16")
        sel_shrunk = topi.cast(sel_shrunk, "float16")
        sel_accum = topi.cast(sel_accum, "float16")
    tmp_val = akg.tvm.compute(
        sel_shrunk.shape,
        lambda *i: tvm.expr.Select(sel_l1[0] > 0, sel_shrunk(*i), sel_accum(*i)))
    if product_is_mini():
        tmp_val = topi.cast(tmp_val, "float32")

    # 4. x_value = -1 * lr * tmp_val
    x_value = topi.multiply(topi.multiply(lr, tvm.const(-1, "float32")), tmp_val)

    # 5. y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    y_value = topi.add(
        topi.multiply(topi.multiply(l2, global_step), lr),
        sqrt(gradient_squared_accum, target=utils.CCE))

    # 6. var = x_value / y_value; the mini product multiplies by the reciprocal.
    if product_is_mini():
        var_out = topi.multiply(x_value, reciprocal(y_value, target=utils.CCE))
    else:
        var_out = topi.divide(x_value, y_value)

    if is_fp16:
        var_out = akg.lang.ascend.cast_to(var_out, "float16")
        gradient_accum = akg.lang.ascend.cast_to(gradient_accum, "float16")
        gradient_squared_accum = akg.lang.ascend.cast_to(
            gradient_squared_accum, "float16")
    return var_out, gradient_accum, gradient_squared_accum