def _bessel_i0e_compute(input_data):
    """bessel i0e compute"""
    shape_input = input_data.shape
    dtype_input = input_data.dtype

    # cast to float32 at the beginning if necessary
    if dtype_input == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, target=utils.CCE)

    # compute bessel_i0e for data in (-3.75, 3.75)
    # t = |x| / 3.75
    # I0e = e^-|x| * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6
    #       + 0.2659732t^8 + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, "float32"), shape_input)
    before_abs_data = minimum(abs_data, broad_const_limit)
    data = topi.multiply(before_abs_data, 1.0 / CONST_LIMIT)
    square_data = mul(data, data, target=utils.CCE)

    before_res = topi.multiply(square_data, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, square_data, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)
    exp_data = Exp(neg(before_abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_data, target=utils.CCE)

    # compute bessel_i0e for data in the other domain
    # t = 3.75 / |x|
    # I0e(x) = (1 / sqrt(|x|)) * (0.39894228 + 0.01328592t + 0.00225319t^2
    #          - 0.00157565t^3 + 0.00916281t^4 - 0.02057706t^5 + 0.02635537t^6
    #          - 0.01647633t^7 + 0.00392377t^8), |x| >= 3.75
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)
    rsqrt_data = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, rsqrt_data, target=utils.CCE)

    # the two branches cross at |x| == 3.75, so the elementwise minimum
    # selects the valid branch on both sides
    after_res = minimum(before_res, after_res, target=utils.CCE)

    # cast back to float16 at the end if necessary
    if dtype_input == "float16":
        after_res = Cast(after_res, "float16", target=utils.CCE)
    return after_res
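# --- Illustrative reference (not part of the kernel) --------------------------
# A plain-NumPy sketch of the piecewise I0e approximation above, assuming
# CONST_LIMIT == 3.75 and that ITR_BEFORE / ITR_AFTER hold the coefficients
# quoted in the comments. Useful for spot-checking values; the elementwise
# minimum picks the small-|x| branch below 3.75 and the large-|x| branch above.
import numpy as np

def _i0e_reference(x):
    ax = np.abs(np.asarray(x, dtype=np.float64))
    ax_small = np.minimum(ax, 3.75)               # clamp, as the kernel does
    t2 = (ax_small / 3.75) ** 2
    before = np.polyval([0.0045813, 0.0360768, 0.2659732, 1.2067492,
                         3.0899424, 3.5156229, 1.0], t2) * np.exp(-ax_small)
    with np.errstate(divide="ignore", invalid="ignore"):
        t = 3.75 / ax                             # inf at x == 0, discarded by the min
        after = np.polyval([0.00392377, -0.01647633, 0.02635537, -0.02057706,
                            0.00916281, -0.00157565, 0.00225319, 0.01328592,
                            0.39894228], t) / np.sqrt(ax)
    return np.minimum(before, after)

# e.g. _i0e_reference(1.0) ~= 0.4658, matching e^-1 * I0(1).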
def fused_gather_nd_reduce_sum_mul_unsorted_segment_sum(
        input1, input2, input3, input4, input5,
        axis=0, keepdims=False, num=0, target=utils.CUDA):
    item_get = gather_nd(input1, input2)
    sum_axis = reduce_sum(item_get, axis, keepdims, target)
    prod = mul(sum_axis, input3, target=target)
    res1 = unsorted_segment_sum(prod, input4, num, op_id=0)
    res2 = unsorted_segment_sum(prod, input5, num, op_id=1)
    return res1, res2
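# --- Illustrative reference (not part of the kernel) --------------------------
# The same fused dataflow written with NumPy, to make the pipeline explicit.
# Plain advanced indexing stands in for gather_nd and np.add.at for
# unsorted_segment_sum; the (N, rank) index layout and the last-axis reduction
# default are assumptions local to this sketch.
import numpy as np

def _fused_gather_nd_reduce_sum_mul_segment_sum_reference(
        params, indices, scale, seg_ids_a, seg_ids_b, axis=-1, num=1):
    gathered = params[tuple(indices.T)]        # gather_nd for (N, rank) indices
    summed = gathered.sum(axis=axis)           # reduce_sum with keepdims=False
    prod = summed * scale                      # elementwise mul
    out_a = np.zeros((num,) + prod.shape[1:], dtype=prod.dtype)
    out_b = np.zeros_like(out_a)
    np.add.at(out_a, seg_ids_a, prod)          # unsorted_segment_sum, op_id=0
    np.add.at(out_b, seg_ids_b, prod)          # unsorted_segment_sum, op_id=1
    return out_a, out_b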
def _bessel_i1e_compute(input_data):
    """bessel i1e compute"""
    shape = utils.get_shape(input_data)
    dtype = input_data.dtype

    # cast to float32 at the beginning if necessary
    if dtype == "float16":
        input_data = Cast(input_data, "float32", target=utils.CCE)

    abs_data = Abs(input_data, utils.CCE)
    # compute bessel_i1e for data in (-3.75, 3.75)
    before_res = _before_res_compute(abs_data)
    # compute bessel_i1e for data in the other domain
    after_res = _after_res_compute(abs_data)

    # vcmp_lt and vsel do not support fp32 on mini, so the select is done in
    # fp16 there; this could be simplified by methods such as "auto cast"
    if product_is_mini():
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(
                abs_data[indice].astype("float16") < akg.tvm.const(
                    CONST_LIMIT, "float16"),
                before_res[indice].astype("float16"),
                after_res[indice].astype("float16")))
        res = Cast(res, "float32", target=utils.CCE)
    else:
        res = akg.tvm.compute(
            shape,
            lambda *indice: akg.tvm.expr.Select(
                abs_data[indice] < CONST_LIMIT,
                before_res[indice], after_res[indice]))

    # restore the sign, since both branches are computed on |x|
    data_sign = Sign(input_data, target=utils.CCE)
    res = mul(res, data_sign, target=utils.CCE)

    # cast back to float16 at the end if necessary
    if dtype == "float16":
        res = Cast(res, "float16", target=utils.CCE)
    return res
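# --- Illustrative reference (not part of the kernel) --------------------------
# Plain-NumPy sketch of the full i1e pipeline, assuming CONST_LIMIT == 3.75 and
# that ITR_BEFORE / ITR_AFTER carry the coefficients quoted in the docstrings of
# the helpers below (_before_res_compute / _after_res_compute).
import numpy as np

def _i1e_reference(x):
    x = np.asarray(x, dtype=np.float64)
    ax = np.abs(x)
    t2 = (ax / 3.75) ** 2
    before = np.polyval([0.00032411, 0.00301532, 0.02658773, 0.15084934,
                         0.51498869, 0.87890594, 0.5], t2) * ax * np.exp(-ax)
    with np.errstate(divide="ignore", invalid="ignore"):
        t = 3.75 / ax
        after = np.polyval([-0.00420059, 0.01787654, -0.02895312, 0.02282967,
                            -0.01031555, 0.00163801, -0.00362018, -0.03988024,
                            0.39894228], t) / np.sqrt(ax)
    # select the branch at |x| == 3.75 and restore the sign, as the kernel does
    return np.where(ax < 3.75, before, after) * np.sign(x)

# e.g. _i1e_reference(1.0) ~= 0.2079, matching e^-1 * I1(1).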
def matmul_mul(x, y, c, b, out_dtype, left_format="zZ", right_format="nZ",
               out_format="zN", transpose_x=False, transpose_y=False,
               attrs=None, target="cce"):
    matmul_res, attrs = matmul(x, y, b, out_dtype, left_format, right_format,
                               out_format, transpose_x, transpose_y, attrs=attrs)
    res = mul(matmul_res, c, target=target)
    return res, attrs
def mul_mean(first_input, second_input, axis=None, keepdims=False, target="cce"):
    temp = mul(first_input, second_input, target=target)
    output, _ = mean(temp, axis, keepdims)
    return output
def mul_conv(data, fmap_shape, filter_shape, pad_, stride_, dilation_,
             use_bias=False, block_size=16, attrs=None, target="cce"):
    a1 = data[0]
    a2 = data[1]
    b = data[2]

    a = mul(a1, a2, target=target)
    if use_bias:
        conv_data = [a, b, data[3]]
    else:
        conv_data = [a, b]
    res = Conv(conv_data, fmap_shape, filter_shape, pad_, stride_, dilation_,
               use_bias, block_size, attrs)
    return res
def _after_res_compute(abs_data):
    """
    Compute bessel_i1e for abs value of data greater than or equal to 3.75.

    Algorithm:
        t = 3.75 / x
        I1(x) = (1 / sqrt(x)) * (0.39894228 - 0.03988024t - 0.00362018t^2
                + 0.00163801t^3 - 0.01031555t^4 + 0.02282967t^5
                - 0.02895312t^6 + 0.01787654t^7 - 0.00420059t^8)
    """
    broad_const_limit = akg.lang.ascend.broadcast(
        akg.tvm.const(CONST_LIMIT, abs_data.dtype), abs_data.shape)
    data = Divide(broad_const_limit, abs_data, target=utils.CCE)
    after_res = topi.multiply(data, ITR_AFTER[LEN_AFTER - 1])
    after_res = topi.add(after_res, ITR_AFTER[LEN_AFTER - 2])
    for iter_number in ITR_AFTER[LEN_AFTER - 3::-1]:
        after_res = mul(after_res, data, target=utils.CCE)
        after_res = topi.add(after_res, iter_number)
    abs_data_rsqrt = rsqrt(abs_data, target=utils.CCE)
    after_res = mul(after_res, abs_data_rsqrt, target=utils.CCE)
    return after_res
def _before_res_compute(abs_data):
    """
    Compute bessel_i1e for abs value of data less than or equal to 3.75.

    Algorithm:
        t = x / 3.75
        I1(x) = e^-|x| * x * (0.5 + 0.87890594t^2 + 0.51498869t^4
                + 0.15084934t^6 + 0.02658773t^8 + 0.00301532t^10
                + 0.00032411t^12)
    """
    data = topi.multiply(abs_data, 1.0 / CONST_LIMIT)
    data_square = mul(data, data, target=utils.CCE)

    before_res = topi.multiply(data_square, ITR_BEFORE[LEN_BEFORE - 1])
    before_res = topi.add(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for iter_number in ITR_BEFORE[LEN_BEFORE - 3::-1]:
        before_res = mul(before_res, data_square, target=utils.CCE)
        before_res = topi.add(before_res, iter_number)
    exp_value = Exp(neg(abs_data, target=utils.CCE), target=utils.CCE)
    before_res = mul(before_res, exp_value, target=utils.CCE)
    before_res = mul(before_res, abs_data, target=utils.CCE)
    return before_res
def sigmoid_cross_entropy_with_logits(labels=None, logits=None, target="cce"):
    ##
    # \brief Computes sigmoid cross entropy given `logits`.
    #
    # \f[
    # cost = labels * -log(sigmoid(logits)) + (1 - labels) * -log(1 - sigmoid(logits))
    # \f]
    # \param labels akg.tvm.Tensor of the same type and shape as `logits`.
    # \param logits akg.tvm.Tensor of type float16, float32.
    #
    # \return akg.tvm.Tensor of the same shape as `logits` with the componentwise logistic losses.
    ##
    if get_shape(logits) != get_shape(labels):
        raise ValueError(
            "logits and labels must have the same shape (%s vs %s)" %
            (get_shape(logits), get_shape(labels)))
    if logits.dtype != labels.dtype:
        raise ValueError(
            "logits and labels must have the same dtype (%s vs %s)" %
            (logits.dtype, labels.dtype))

    shape = logits.shape
    dtype = logits.dtype
    check_list = ["float16", "float32"]
    if dtype.lower() not in check_list:
        raise RuntimeError(
            "sigmoid_cross_entropy_with_logits only supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    # z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
    # = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
    # = max(x, 0) - x * z + log(1 + exp(-abs(x)))
    zero = akg.tvm.const(0, dtype=dtype)
    relu_logits = akg.tvm.compute(
        shape,
        lambda *indice: akg.tvm.expr.Select(
            logits(*indice) < zero, zero, logits(*indice)),
        name="relu_logits")
    neg_abs_logits = akg.tvm.compute(
        shape,
        lambda *indice: akg.tvm.expr.Select(
            logits(*indice) < zero, logits(*indice), logits(*indice) * -1),
        name="neg_abs_logits")
    sigmoid_logits = Exp(neg_abs_logits, target=target) + akg.tvm.const(1, dtype=dtype)
    ln_sigmoid_logits = log(sigmoid_logits, target=target)
    logits_mul_labels = mul(logits, labels, target=target)
    res = relu_logits - logits_mul_labels + ln_sigmoid_logits
    return res
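# --- Illustrative reference (not part of the kernel) --------------------------
# NumPy check of the identity rewritten above: the naive componentwise loss
#   z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
# equals the numerically stable form max(x, 0) - x * z + log(1 + exp(-|x|)),
# which is what the kernel computes. Names here are local to this sketch.
import numpy as np

def _sigmoid_ce_stable(logits, labels):
    return np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))

def _sigmoid_ce_naive(logits, labels):
    sig = 1.0 / (1.0 + np.exp(-logits))
    return labels * -np.log(sig) + (1 - labels) * -np.log(1 - sig)

# np.allclose(_sigmoid_ce_stable(x, z), _sigmoid_ce_naive(x, z)) holds for
# moderate x; the stable form also survives large |x|, where the naive one
# saturates log(0) or overflows exp.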
def mul_unsortedsegmentsum(input1, input2, ids_tensor, num_segments, target="cce"):
    import akg.tvm
    temp = mul(input1, input2, target=target)
    output = unsorted_segment_sum(temp, ids_tensor, num_segments, target=target)[0]
    output = akg.tvm.compute(output.shape, lambda *i: output(*i), "fused_mul_unsorted")
    return output
def fake_quant_with_min_max_vars_per_channel_compute(input_data, input_min,
                                                     input_max, num_bits=8,
                                                     narrow_range=False):
    """fake_quant_with_min_max_vars_per_channel compute implementation"""
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.ascend.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.ascend.broadcast(input_max, shape, dtype)

    # get nudged_min, nudged_max and scale from nudged_min_max_compute
    nudged_min_nudged_max = nudged_min_max_compute(
        min_broadcast, max_broadcast, num_bits, narrow_range)

    # clamp the input between nudged_min and nudged_max
    clamped_tmp = topi.minimum(input_data, nudged_min_nudged_max[1])
    clamped = topi.maximum(clamped_tmp, nudged_min_nudged_max[0])

    # calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_nudged_max[0])
    if product_is_mini():
        clamped_shifted_div_scale = mul(clamped_shifted,
                                        reciprocal(nudged_min_nudged_max[2]),
                                        target=utils.CCE)
    else:
        clamped_shifted_div_scale = Divide(clamped_shifted,
                                           nudged_min_nudged_max[2],
                                           target=utils.CCE)
    result_tmp = topi.add(clamped_shifted_div_scale, dc.half_const(dtype))
    floor_result_tmp = akg.lang.ascend.floor(result_tmp)
    if product_is_mini():
        floor_result_tmp = topi.cast(floor_result_tmp, "float16")
        floor_result_tmp = topi.cast(floor_result_tmp, "float32")
    scale_product = topi.multiply(floor_result_tmp, nudged_min_nudged_max[2])
    tmp_res = topi.add(scale_product, nudged_min_nudged_max[0])

    # get bool_both_zero_value from bool_both_zero_compute
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    res = topi.multiply(tmp_res, bool_both_zero_value)
    return res
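# --- Illustrative reference (not part of the kernel) --------------------------
# The quantize/dequantize core above in a few lines of NumPy, taking the nudged
# range and scale as given (see nudged_min_max_compute below). Illustrative only.
import numpy as np

def _fake_quant_reference(x, nudged_min, nudged_max, scale):
    clamped = np.clip(x, nudged_min, nudged_max)
    quantized = np.floor((clamped - nudged_min) / scale + 0.5)   # round half up
    return quantized * scale + nudged_min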
def fused_gather_gather_add_mul_max_exp_scatter_add(inp1, inp2, inp3, inp4,
                                                    axis, target=utils.CUDA):
    ndim = len(inp1.shape)
    axis = axis + ndim if axis < 0 else axis
    assert 0 <= axis < ndim

    gather_out1 = gather(inp1, inp2, axis, "1")
    gather_out2 = gather(inp1, inp2, axis, "2")
    add_out = Add(gather_out1, gather_out2, target=target)
    mul_out = mul(add_out, inp3, utils.CUDA)
    max_out = maximum(add_out, mul_out, utils.CUDA)
    exp_out = Exp(max_out, utils.CUDA)
    scatter_out = scatter_add(inp1, inp4, exp_out)
    return exp_out, scatter_out
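# --- Illustrative reference (not part of the kernel) --------------------------
# NumPy sketch of the fused dataflow above; np.take and np.add.at stand in for
# akg's gather / scatter_add, and 1-D indices along the leading axis are assumed.
import numpy as np

def _fused_gather_add_mul_max_exp_scatter_reference(inp1, inp2, inp3, inp4, axis=0):
    gathered = np.take(inp1, inp2, axis=axis)   # both gathers read the same slice
    add_out = gathered + gathered
    mul_out = add_out * inp3
    exp_out = np.exp(np.maximum(add_out, mul_out))
    scatter_out = inp1.copy()
    np.add.at(scatter_out, inp4, exp_out)       # scatter_add into a copy of inp1
    return exp_out, scatter_out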
def matmul_mul_transdata(x, y, c, b, out_dtype, left_format="zZ",
                         right_format="nZ", out_format="zN",
                         transpose_x=False, transpose_y=False,
                         attrs=None, target="cce"):
    matmul_res, attrs = matmul(x, y, b, out_dtype, left_format, right_format,
                               out_format, transpose_x, transpose_y, attrs=attrs)
    res = mul(matmul_res, c, target=target)

    if out_format == 'zN':
        n1, m1, m0, n0 = matmul_res.shape[-4:]
        new_shape = matmul_res.shape[:-4] + [m1 * m0, n1 * n0]
    elif out_format == 'zZ':
        m1, n1, m0, n0 = matmul_res.shape[-4:]
        new_shape = matmul_res.shape[:-4] + [m1 * m0, n1 * n0]

    func = akg.tvm.get_global_func("TransData")
    res = func([res], {"src_format": "FRACTAL_NZ",
                       "dst_format": "DefaultFormat",
                       "output_shape": new_shape})
    return res, attrs
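# --- Illustrative reference (not part of the kernel) --------------------------
# A NumPy sketch of what the "TransData" FRACTAL_NZ -> DefaultFormat call is
# expected to compute for the zN case. The block layout assumed here is
# [..., n1, m1, m0, n0] with default[m1*m0 + i, n1*n0 + j] taken from block
# (m1, n1); treat this as an assumption, not a spec of the TransData kernel.
import numpy as np

def _zn_to_default(frac):
    n1, m1, m0, n0 = frac.shape[-4:]
    # [..., n1, m1, m0, n0] -> [..., m1, m0, n1, n0] -> [..., m1*m0, n1*n0]
    moved = np.moveaxis(frac, -4, -2)
    return moved.reshape(frac.shape[:-4] + (m1 * m0, n1 * n0))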
def div_no_nan(data_x, data_y, target=utils.CCE):
    """
    Returns 0 if the denominator is zero, else, like Div.

    Args:
        data_x (tvm.tensor.Tensor): tensor with type int32/int8/uint8, float16/float32.
        data_y (tvm.tensor.Tensor): tensor with type int32/int8/uint8, float16/float32.
        target (str): backend target, default utils.CCE.

    Returns:
        tvm.tensor.Tensor.
    """
    dtype = data_x.dtype
    if dtype != data_y.dtype:
        raise TypeError("input dtype should be the same")
    utils.ops_dtype_check(dtype, [utils.DtypeForDavinci.ALL_FLOAT,
                                  utils.DtypeForDavinci.INT8,
                                  utils.DtypeForDavinci.UINT8,
                                  utils.DtypeForDavinci.INT32])

    utils.check_shape(data_x.shape)
    utils.check_shape(data_y.shape)
    utils.auto_broadcast_check(data_x, data_y)

    # dtype for vsel and vcmp
    if product_is_mini():
        compute_dtype = "float16"
    else:
        compute_dtype = "float32"

    # fp16 division returns 0 if y < 2^-12; fp32 division returns 0 if y < 2^-64
    min_val = tvm.const(2**(-12) if product_is_mini() else 2**(-64),
                        dtype=compute_dtype)

    tvm_one = tvm.const(1, dtype=compute_dtype)
    tvm_zero = tvm.const(0, dtype=compute_dtype)

    if not product_is_mini() and dtype == "float16":
        min_val = tvm.const(2**(-12), "float32")

    data_y_fp32 = akg.lang.ascend.cast_to(data_y, "float32")
    # clip to avoid overflow when y > 2^15 is cast from fp32 to fp16 on mini
    clip_y_fp32 = akg.topi.clip(data_y_fp32, -1.0, 1.0)
    abs_clip_y_fp32 = Abs(clip_y_fp32, target)
    y_cmp = akg.lang.ascend.cast_to(abs_clip_y_fp32, compute_dtype)

    is_zero = tvm.compute(data_y.shape,
                          lambda *i: tvm.expr.Select(
                              y_cmp(*i) < min_val, tvm_one, tvm_zero),
                          name="is_zero")

    # if fp32 y < 2^-24, cast(y, fp16) == 0; to find y in (2^-64, 2^-24):
    if product_is_mini() and dtype == "float32":
        is_zero = _refine_is_zero(is_zero, abs_clip_y_fp32)

    is_zero = akg.lang.ascend.cast_to(is_zero, "float32")
    not_zero = tvm.compute(data_y.shape,
                           lambda *i: (1 - is_zero(*i)).astype("float32"),
                           name="not_zero")

    # replace [x1 x2]/[y1 0] by [x1 0]/[y1 1]
    data_x = mul(akg.lang.ascend.cast_to(data_x, "float32"), not_zero, target=target)
    data_y = akg.lang.ascend.cast_to(data_y, "float32") + is_zero
    res = Divide(data_x, data_y, target=target)

    if dtype in ("int8", "uint8", "int32"):
        res = akg.lang.ascend.floor(res)
    res = akg.lang.ascend.cast_to(res, dtype)
    return res
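# --- Illustrative reference (not part of the kernel) --------------------------
# The observable behaviour of div_no_nan in NumPy: treat |y| below the hardware
# division threshold as zero and return 0 there instead of inf/nan. The kernel
# above additionally emulates the comparison in fp16 or fp32 depending on the chip.
import numpy as np

def _div_no_nan_reference(x, y, min_val=2.0**-64):
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    safe = np.abs(np.clip(y, -1.0, 1.0)) >= min_val
    return np.where(safe, x / np.where(safe, y, 1.0), 0.0)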
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range):
    """
    Calculate the maximum and minimum values of the quantization.

    Notes:
        Each channel scale[i] equals
        (max_broadcast[i] - min_broadcast[i]) / (quant_max - quant_min).
        Then compute nudged_zero_point:
            nudged_zero_point = floor(between_min_max_float + 0.5)
                                + less_quant_min_float + more_quant_max_float,
        where between_min_max_float is first calculated by:
            zero_point_from_min = quant_min_float - min_broadcast / scale,
        then between_min_max_float = zero_point_from_min wherever
        quant_min_float <= zero_point_from_min <= quant_max_float.
        Besides, less_quant_min_float equals quant_min where
        zero_point_from_min < quant_min_float, else 0; the same holds for
        more_quant_max_float. Finally, compute nudged_min and nudged_max from
        scale and nudged_zero_point:
            nudged_min = (quant_min - nudged_zero_point) * scale
            nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantified for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantified for each channel.
        num_bits (int): num_bits is the bitwidth of the quantization, range [2, 16].
        narrow_range (bool): if True, quantize into the range [1, 2^num_bits - 1],
                             else into the range [0, 2^num_bits - 1].

    Returns:
        nudged_min (tvm.tensor.Tensor): The same type and shape as min_broadcast.
        nudged_max (tvm.tensor.Tensor): The same type and shape as max_broadcast.
        scale (tvm.tensor.Tensor): The same type and shape as max_broadcast.
    """
    dtype = min_broadcast.dtype
    quant_min = 1 if narrow_range else 0
    quant_max = (2**num_bits) - 1

    # quant_min and quant_max need broadcasting because each channel is computed separately
    quant_min_float = topi.full(min_broadcast.shape, dtype, tvm.const(quant_min, dtype))
    quant_max_float = topi.full(min_broadcast.shape, dtype, tvm.const(quant_max, dtype))

    # calculate the max and min difference for each channel
    max_sub_min = topi.subtract(max_broadcast, min_broadcast)
    quant_max_sub_quant_min = topi.subtract(quant_max_float, quant_min_float)

    # compute scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
    # and min_div_scale = min_broadcast / scale
    if product_is_mini():
        scale = mul(max_sub_min, reciprocal(quant_max_sub_quant_min), target=utils.CCE)
        min_div_scale = Mul(min_broadcast, reciprocal(scale), target=utils.CCE)
    else:
        scale = Divide(max_sub_min, quant_max_sub_quant_min, target=utils.CCE)
        min_div_scale = Divide(min_broadcast, scale, target=utils.CCE)

    # zero_point_from_min = quant_min_float - min_broadcast / scale
    zero_point_from_min = topi.subtract(quant_min_float, min_div_scale)

    # if zero_point_from_min < quant_min_float, bool_less_quant_min_float = 1, else 0
    bool_less_quant_min_float = less_compare_float32(zero_point_from_min, quant_min_float)
    # if quant_max_float < zero_point_from_min, bool_more_quant_max_float = 1, else 0
    bool_more_quant_max_float = less_compare_float32(quant_max_float, zero_point_from_min)

    # according to the bool masks above, select the effective value
    less_quant_min_float = topi.multiply(quant_min_float, bool_less_quant_min_float)
    more_quant_max_float = topi.multiply(quant_max_float, bool_more_quant_max_float)

    # mark the values that are neither less than quant_min_float nor larger than quant_max_float
    tensor_one = topi.full(min_broadcast.shape, dtype, dc.one_const(dtype))
    bool_not_less_quant_min_float = topi.subtract(tensor_one, bool_less_quant_min_float)
    bool_not_more_quant_max_float = topi.subtract(tensor_one, bool_more_quant_max_float)
    bool_between_min_max = topi.multiply(bool_not_less_quant_min_float,
                                         bool_not_more_quant_max_float)
    between_min_max_float = topi.multiply(zero_point_from_min, bool_between_min_max)
    # add 0.5 to the values with min <= value <= max, then floor them
    between_min_max_add_half_one = topi.add(between_min_max_float, dc.half_const(dtype))
    between_min_max_round = akg.lang.ascend.floor(between_min_max_add_half_one)
    if product_is_mini():
        between_min_max_round = topi.cast(between_min_max_round, "float16")
        between_min_max_round = topi.cast(between_min_max_round, "float32")

    # calculate the maximum and minimum values of the quantization
    nudged_zero_point_tmp = topi.add(less_quant_min_float, more_quant_max_float)
    nudged_zero_point = topi.add(nudged_zero_point_tmp, between_min_max_round)

    nudged_min_tmp = topi.subtract(quant_min_float, nudged_zero_point)
    nudged_max_tmp = topi.subtract(quant_max_float, nudged_zero_point)
    nudged_min = topi.multiply(nudged_min_tmp, scale)
    nudged_max = topi.multiply(nudged_max_tmp, scale)
    res = [nudged_min, nudged_max, scale]

    return res
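# --- Illustrative reference (not part of the kernel) --------------------------
# The nudging math above restated in NumPy; clipping the rounded zero point is
# equivalent to the three-way select built from the less_compare_float32 masks.
import numpy as np

def _nudged_min_max_reference(min_v, max_v, num_bits=8, narrow_range=False):
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = 2.0 ** num_bits - 1.0
    scale = (max_v - min_v) / (quant_max - quant_min)
    zero_point_from_min = quant_min - min_v / scale
    nudged_zero_point = np.clip(np.floor(zero_point_from_min + 0.5),
                                quant_min, quant_max)
    nudged_min = (quant_min - nudged_zero_point) * scale
    nudged_max = (quant_max - nudged_zero_point) * scale
    return nudged_min, nudged_max, scale

# e.g. min_v=-0.1, max_v=1.0, num_bits=8 gives scale ~= 0.0043137 and nudges the
# range to about [-0.09922, 1.00078], so that zero lands exactly on a quantized step.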
def mul_ad(head, a, b):
    output = mul(a, b, target=utils.CCE)
    jacs_ = list(akg.differentiate(output, [a], head))
    return jacs_[0]
def mul(x, y, target=utils.CUDA):
    """Mul"""
    return math.mul(x, y, target)
def mul_sub_mutioutput(first_input, second_input, third_input, target="cce"):
    temp = mul(first_input, second_input, target=target)
    output = sub(temp, third_input, target=target)
    return [temp, output]