def _atan_compute(data):
    """
    Build the atan computation for ``data``.

    Two candidates are produced for the magnitude of the input with
    ``_do_atan_taylor``: one evaluated on ``|x|`` and one evaluated on
    ``|(|x| - 1) / (|x| + 1)|`` with ``CONST_PI_BY_FOUR`` added. The
    element-wise minimum of both is kept and multiplied by the sign of the input.

    Args:
        data (tvm.tensor.Tensor): Input tensor. float16 input is cast to float32
            for the computation.

    Returns:
        tvm.tensor.Tensor, cast back to float16 when the input was float16.
    """
    origin_dtype = data.dtype
    is_half = origin_dtype == "float16"
    if is_half:
        data = topi.cast(data, "float32")

    magnitude = topi.abs(data)
    one = dc.one_const(magnitude.dtype)

    numerator = topi.subtract(magnitude, one)
    denominator = topi.add(magnitude, one)
    reduced = topi.abs(topi.divide(numerator, denominator))

    # candidate for inputs whose magnitude is below one
    direct = _do_atan_taylor(magnitude)
    # candidate for inputs whose magnitude is above one
    shifted = topi.add(_do_atan_taylor(reduced), tvm.const(CONST_PI_BY_FOUR, reduced.dtype))
    res = topi.minimum(direct, shifted)

    # NOTE(review): on the mini product the sign is taken on a float16 copy and cast
    # back; presumably sign is not available for float32 there - confirm.
    sign_via_half = utils.product_is_mini() and data.dtype == "float32"
    sign_source = topi.cast(data, "float16") if sign_via_half else data
    sign_mask = topi.sign(sign_source)
    if sign_via_half:
        sign_mask = topi.cast(sign_mask, "float32")

    res = topi.multiply(res, sign_mask)
    return topi.cast(res, "float16") if is_half else res
def selu_compute(input_data):
    """
    Build the selu computation for ``input_data``.

    float16 and float32 inputs are computed in float32; any other dtype is
    computed in float16. The non-positive part of the input goes through
    ``(exp(x) + SCALAR_NEGATIVE_ONE) * SCALE_ALPHA_PRODUCT``, the non-negative
    part is multiplied by ``SCALE``, and both parts are summed.

    Args:
        input_data (tvm.tensor.Tensor): Input tensor.

    Returns:
        tvm.tensor.Tensor. The result is cast back to the input dtype for
        float16, int8 and int32 inputs.
    """
    origin_dtype = input_data.dtype
    compute_dtype = "float32" if origin_dtype in ("float16", "float32") else "float16"
    input_data = topi.cast(input_data, compute_dtype)

    def _scalar(value):
        # scalar constant in the dtype used for the computation
        return tvm.const(value, dtype=compute_dtype)

    # zero tensor of the input's shape, used to split the input by sign
    zeros = topi.multiply(input_data, _scalar(0))
    below_zero = topi.minimum(input_data, zeros)
    above_zero = topi.maximum(input_data, zeros)

    negative_part = topi.multiply(
        topi.add(exp(below_zero), _scalar(SCALAR_NEGATIVE_ONE)),
        _scalar(SCALE_ALPHA_PRODUCT))
    if origin_dtype == "int8":
        negative_part = akg.lang.cce.ceil(negative_part)

    positive_part = topi.multiply(above_zero, _scalar(SCALE))
    res = topi.add(negative_part, positive_part)

    # restore the original dtype
    if origin_dtype in ("float16", "int8", "int32"):
        res = topi.cast(res, origin_dtype)
    return res
def fake_quant_with_min_max_args(input_data, min_=-6, max_=6, num_bits=8, narrow_range=False):
    """
    Fake-quantize a float32 tensor into a float32 tensor.

    output = floor(clamped_shifted * inv_nudged_scale + 0.5) * scale + nudged_min
    where nudged_min, nudged_max and scale come from
    ``nudge_min_max(min_, max_, num_bits, narrow_range)`` and clamped_shifted is
    the input clamped to [nudged_min, nudged_max] minus nudged_min.

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32".
        min_ ([float, int]): Scalar, defaults to -6.
        max_ ([float, int]): Scalar, defaults to 6. [min_; max_] define the
            clamping range for input_data.
        num_bits ([float, int]): Defaults to 8. Bit width of the quantization,
            between 2 and 16.
        narrow_range ([bool]): True, quantized into the range [1; 2^num_bits - 1];
            False, quantized into the range [0; 2^num_bits - 1].

    Returns:
        tvm.tensor.Tensor
    """
    utils.check_shape(get_shape(input_data))
    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits, narrow_range)

    # scalar bounds are turned into tensors by adding them to a zero tensor
    zeros = tvm.compute(input_data.shape, lambda *_: tvm.const(0, dtype="float32"), name="zero_tensor")
    upper = topi.add(zeros, nudged_max)
    lower = topi.add(zeros, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # clamp the input into [nudged_min, nudged_max]
    clamped = topi.maximum(topi.minimum(input_data, upper), lower)

    # quantize: shift to zero, scale, round by floor(x + 0.5)
    shifted = topi.subtract(clamped, lower)
    quantized = floor(topi.add(topi.multiply(shifted, inv_nudged_scale), 0.5))
    quantized = akg.lang.ascend.cast_to(quantized, dtype)

    # dequantize
    return topi.add(topi.multiply(quantized, scale), lower)
def my_dsl(dtype, kernel_name, attrs):
    """
    Build a one-dimensional element-wise kernel for the instruction named by ``insn``.

    ``insn`` and ``insnType`` are not parameters of this function; they are
    read from the enclosing scope.

    Args:
        dtype: dtype of the placeholders and of the scalar constants.
        kernel_name: Name passed to ``akg.build``.
        attrs: Attributes passed to ``akg.build``.

    Returns:
        The module returned by ``akg.build``.

    Raises:
        ValueError: If ``insn`` is not one of the supported instruction names.
    """
    m = tvm.var("M")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)

    # One builder per instruction; only the selected one is evaluated, so no
    # unused tensors are created.
    builders = {
        "add": lambda: topi.add(A, B),
        "sub": lambda: topi.subtract(A, B),
        "mul": lambda: topi.multiply(A, B),
        "div": lambda: topi.divide(A, B),
        "max": lambda: topi.maximum(A, B),
        "min": lambda: topi.minimum(A, B),
        "abs": lambda: tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C'),
        "exp": lambda: topi.exp(A),
        "log": lambda: topi.log(A),
        # the previous code overwrote the sqrt result with topi.log(A)
        "sqrt": lambda: topi.sqrt(A),
        "adds": lambda: A + tvm.const(2, dtype),
        "muls": lambda: A * tvm.const(2, dtype),
    }
    if insn not in builders:
        # fail with a clear message instead of an unbound local further down
        raise ValueError("unsupported insn: {}".format(insn))
    C = builders[insn]()

    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        # binary instructions take both inputs, everything else only A
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def _asin_compute(data_input):
    """
    Build the asin computation for ``data_input``.

    The computation runs on the magnitude of the input in float32. A 0/1 mask
    derived from comparing the magnitude with ``BOUNDARY`` selects between two
    formulas:
        below the boundary: ``_taylor_compute(|x|)``
        otherwise:          ``HALF_PI - _taylor_compute(sqrt(1 - x^2), 1 - x^2)``
    The selected value is multiplied by the sign of the input.

    Args:
        data_input (tvm.tensor.Tensor): Input tensor. float16 input is cast to
            float32 for the computation.

    Returns:
        tvm.tensor.Tensor, cast back to float16 when the input was float16.
    """
    origin_dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    if origin_dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # work on the magnitude; the sign is multiplied back at the end
    data_sign = sign(data_input)
    magnitude = topi.multiply(data_input, data_sign)

    # mask for the first range (the original comment gives it as (0, 2^(-0.5))):
    # floor(min(|x|, boundary) - boundary) negated
    below_boundary = topi.subtract(topi.minimum(magnitude, boundary), boundary)
    below_floor = akg.lang.cce.floor(below_boundary)
    # the floor result is int32 according to the original comment, so it is cast
    # to float32; the mini product goes through float16 first
    if utils.product_is_mini():
        below_floor = topi.cast(below_floor, "float16")
    below_floor = topi.cast(below_floor, "float32")
    small_mask = topi.multiply(below_floor, neg_one_const("float32"))

    small_res = topi.multiply(_taylor_compute(magnitude), small_mask)

    # second range (the original comment gives it as (2^(-0.5), 1)):
    # mask is the complement of the first one
    large_mask = topi.subtract(one_const("float32"), small_mask)
    complement = topi.subtract(one_const("float32"), topi.multiply(magnitude, magnitude))
    complement_sqrt = _sqrt(complement)
    large_res = topi.multiply(_taylor_compute(complement_sqrt, complement), neg_one_const("float32"))
    large_res = topi.add(large_res, tvm.const(HALF_PI, "float32"))
    large_res = topi.multiply(large_res, large_mask)

    # merge both ranges and restore the sign
    res = topi.multiply(topi.add(small_res, large_res), data_sign)

    # restore dtype
    if origin_dtype == "float16":
        res = topi.cast(res, "float16")
    return res
def truncate_div_compute(input_x1, input_x2):
    """
    Build the truncate_div computation.

    For int32, int8 and uint8 inputs the quotient is computed in float32 and
    truncated as ``ceil(min(q, 0)) + floor(max(q, 0))``. Any other dtype is
    divided directly. The result is cast to the dtype of ``input_x1``.

    Args:
        input_x1 (tvm.tensor.Tensor): Dividend.
        input_x2 (tvm.tensor.Tensor): Divisor.

    Returns:
        tvm.tensor.Tensor with the dtype of ``input_x1``.
    """
    origin_dtype = input_x1.dtype
    if origin_dtype not in ("int32", "int8", "uint8"):
        return cast(topi.divide(input_x1, input_x2), origin_dtype)

    zero = dc.zero_const("float32")
    quotient = topi.divide(cast(input_x1, "float32"), cast(input_x2, "float32"))
    # negative quotients are rounded up and positive ones down
    negative_part = ceil(topi.minimum(quotient, zero))
    positive_part = floor(topi.maximum(quotient, zero))
    truncated = cast(topi.add(negative_part, positive_part), "float32")
    return cast(truncated, origin_dtype)
def less_compare_float32(data_x, data_y):
    """
    Element-wise "x less than y" flag built from arithmetic only.

    ``data_y - data_x`` is clamped into [0, 2**(-126)] and then multiplied by
    2**62, 2**62 and 2**2 (2**126 in total), so a difference of at least
    2**(-126) yields 1 and a non-positive difference yields 0.

    Args:
        data_x (tvm.tensor.Tensor): Left operand; its shape is used for the
            broadcast constants.
        data_y (tvm.tensor.Tensor): Right operand.

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(data_x)
    # 2**(-126), described by the original comment as the minimum float32 value
    tiny = akg.lang.ascend.broadcast(tvm.const(2**(-126), dtype="float32"), shape, "float32")
    zeros = akg.lang.ascend.broadcast(dc.zero_const("float32"), shape, "float32")

    res = topi.maximum(topi.minimum(topi.subtract(data_y, data_x), tiny), zeros)

    # the original comment states that cce only supports constants up to 2**62,
    # hence the scaling by 2**126 is split into three multiplications
    for factor in (2**62, 2**62, 2**2):
        res = topi.multiply(res, tvm.const(factor, dtype="float32"))
    return res
def fake_quant_with_min_max_vars_per_channel_compute(input_data, input_min, input_max, num_bits=8, narrow_range=False):
    """
    Build the fake_quant_with_min_max_vars_per_channel computation.

    ``input_min`` and ``input_max`` are broadcast to the shape of
    ``input_data`` and passed to ``nudged_min_max_compute``, whose result is
    indexed as (nudged_min, nudged_max, scale). The input is clamped into
    [nudged_min, nudged_max], quantized with ``floor((x - nudged_min) / scale + 0.5)``,
    dequantized, and finally multiplied by the result of ``bool_both_zero_compute``.

    Args:
        input_data (tvm.tensor.Tensor): Data to quantize.
        input_min (tvm.tensor.Tensor): Minimum values, broadcast to the input shape.
        input_max (tvm.tensor.Tensor): Maximum values, broadcast to the input shape.
        num_bits (int): Forwarded to ``nudged_min_max_compute``. Defaults to 8.
        narrow_range (bool): Forwarded to ``nudged_min_max_compute``. Defaults to False.

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)

    nudged = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range)
    nudged_min, nudged_max, scale = nudged[0], nudged[1], nudged[2]

    # clamp the input into [nudged_min, nudged_max]
    clamped = topi.maximum(topi.minimum(input_data, nudged_max), nudged_min)

    # quantize; the mini product multiplies by the reciprocal instead of dividing
    shifted = topi.subtract(clamped, nudged_min)
    if product_is_mini():
        steps = mul(shifted, reciprocal(scale), target=utils.CCE)
    else:
        steps = Divide(shifted, scale, target=utils.CCE)
    rounded = akg.lang.ascend.floor(topi.add(steps, dc.half_const(dtype)))
    # the mini product casts through float16 before the float32 cast
    if product_is_mini():
        rounded = topi.cast(rounded, "float16")
    rounded = topi.cast(rounded, "float32")

    # dequantize
    dequantized = topi.add(topi.multiply(rounded, scale), nudged_min)

    # mask computed from the broadcast min/max by bool_both_zero_compute
    both_zero_mask = bool_both_zero_compute(min_broadcast, max_broadcast)
    return topi.multiply(dequantized, both_zero_mask)
def _cmpare_value(input_data, nudged_min, nudged_max):
    """
    where((input_data <= nudged_max) & (input_data >= nudged_min), 1, 0)

    Built from arithmetic only: both distances to the bounds are offset by
    2**(-126) and clipped at zero, their product is capped at 2**(-126) and then
    scaled by 2**62 * 2**62 * 2**2, since
    (2**(-126)) * (2**62) * (2**62) * (2**2) = 1.

    Args:
        input_data (tvm.tensor.Tensor): Input data
        nudged_min (tvm.tensor.Tensor): Minimum value of comparison
        nudged_max (tvm.tensor.Tensor): Maximum value of comparison

    Returns:
        tvm.tensor.Tensor
    """
    tiny = tvm.const(2**(-126), dtype="float32")
    huge = tvm.const(2**(62), dtype="float32")
    four = tvm.const(2**(2), dtype="float32")

    # tensors of the input's shape holding zero and the three constants
    zeros = topi.multiply(input_data, 0)
    huge_tensor = topi.add(zeros, huge)
    tiny_tensor = topi.add(zeros, tiny)
    four_tensor = topi.add(zeros, four)

    # max(input_data - nudged_min + tiny, 0)
    above_min = topi.maximum(topi.add(topi.subtract(input_data, nudged_min), tiny), zeros)
    # max(nudged_max - input_data + tiny, 0)
    below_max = topi.maximum(topi.add(topi.subtract(nudged_max, input_data), tiny), zeros)

    flag = topi.minimum(topi.multiply(above_min, below_max), tiny_tensor)
    for factor in (huge_tensor, huge_tensor, four_tensor):
        flag = topi.multiply(flag, factor)
    return flag
def _do_atan_taylor(data):
    """
    Taylor algorithm for atan.

        if x > 0 and x < tan(pi/8):
            atan(x) = x - x^3/3 + x^5/5 - x^7/7 ...
        elif x > tan(pi/8) and x < tan(pi/4):
            atan(x) = atan(y) + atan((x-y)/(1+xy))

    Both forms are evaluated for every element, the second one with
    y = TAN_PI_BY_EIGHT and atan(y) = CONST_PI_BY_EIGHT, and the element-wise
    minimum of the two results is returned.

    Args:
        data (tvm.tensor.Tensor): Input data.

    Returns:
        A tvm.tensor.Tensor of atan(x).
    """
    dtype = data.dtype

    def _horner(order, squared):
        # polynomial in ``squared`` with coefficients ATAN_TAYLOR_COEF[0..order],
        # evaluated from the highest coefficient down
        acc = tvm.const(ATAN_TAYLOR_COEF[order], dtype)
        for idx in range(order - 1, -1, -1):
            acc = topi.multiply(acc, squared)
            acc = topi.add(acc, tvm.const(ATAN_TAYLOR_COEF[idx], dtype))
        return acc

    # |(x - y) / (1 + x * y)| with y = TAN_PI_BY_EIGHT
    tan_pi_by_eight = tvm.const(TAN_PI_BY_EIGHT, dtype)
    denominator = topi.add(topi.multiply(data, tvm.const(TAN_PI_BY_EIGHT, dtype)), dc.one_const(dtype))
    numerator = topi.subtract(data, tan_pi_by_eight)
    offset_x = topi.abs(topi.divide(numerator, denominator))

    # atan(y) + series evaluated at the offset argument
    shifted = _horner(CONST_ITERTOR, topi.multiply(offset_x, offset_x))
    shifted = topi.multiply(shifted, offset_x)
    shifted = topi.add(shifted, tvm.const(CONST_PI_BY_EIGHT, dtype))

    # series evaluated directly at x
    direct = _horner(CONST_ITERTOR2, topi.multiply(data, data))
    return topi.minimum(shifted, topi.multiply(direct, data))