def _compute_mini(data_input, shape):
    """
    Compute arctanh with a log formula and a Taylor expansion.

    arctanh is odd, so arctanh(-abs(x)) = -arctanh(abs(x)).
    """
    data_abs = topi.abs(data_input)
    result_ln = _compute_log(data_abs)
    result_taylor = _compute_taylor(data_abs)

    data_abs = topi.cast(data_abs, "float16")
    data_input = topi.cast(data_input, "float16")
    result_taylor = topi.cast(result_taylor, "float16")
    result_ln = topi.cast(result_ln, "float16")
    # when |x| < 0.5 use the Taylor expansion; when 0.5 <= |x| < 1 use log()
    data_res = tvm.compute(shape,
                           lambda *i: akg.tvm.expr.Select(
                               data_abs(*i) < dc.half_const("float16"),
                               result_taylor(*i),
                               result_ln(*i)),
                           name="le")

    # arctanh is odd: arctanh(-abs(x)) = -arctanh(abs(x))
    data_res_neg = topi.multiply(data_res, dc.neg_one_const("float16"))
    data_res = tvm.compute(shape,
                           lambda *i: akg.tvm.expr.Select(
                               data_input(*i) < dc.zero_const("float16"),
                               data_res_neg(*i),
                               data_res(*i)),
                           name="neg")
    return data_res
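
# Illustrative sketch (not part of the original kernel, NumPy only): the
# piecewise arctanh strategy above. The Taylor order and the identity
# arctanh(t) = 0.5*ln((1+t)/(1-t)) are assumptions for illustration.
import numpy as np

def _arctanh_sketch(x, taylor_terms=7):
    t = np.abs(x)
    # Taylor branch for |x| < 0.5: t + t**3/3 + t**5/5 + ...
    taylor = sum(t ** (2 * k + 1) / (2 * k + 1) for k in range(taylor_terms))
    # Log branch for 0.5 <= |x| < 1.
    log_branch = 0.5 * np.log((1 + t) / (1 - t))
    res = np.where(t < 0.5, taylor, log_branch)
    # Odd symmetry: arctanh(-abs(x)) = -arctanh(abs(x)).
    return np.where(x < 0, -res, res)

# np.allclose(_arctanh_sketch(np.linspace(-0.9, 0.9, 7)),
#             np.arctanh(np.linspace(-0.9, 0.9, 7)), atol=1e-4)  # -> True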
def matrix_diag(data, out_shape):
    """
    Generate a batched tensor whose values on the diagonals are taken from `data`.

    Args:
        data (tvm.tensor.Tensor): A tensor of type float16, float32 or int32. Rank is L.
        out_shape (Union[list, tuple]): Output shape of length L + 1. The value of
            `out_shape[:-2]` should be equal to `data.shape[:-1]`; the last two
            dimensions describe the matrices that hold the diagonals.

    Returns:
        tvm.tensor.Tensor, has the same type as `data`, shape is `out_shape`.
    """
    dtype = data.dtype
    utils.ops_dtype_check(dtype, [utils.DtypeForDavinci.ALL_FLOAT,
                                  utils.DtypeForDavinci.INT32])

    shape = get_shape(data)
    utils.check_shape(data)
    utils.check_shape(out_shape, length=len(shape) + 1)
    if tuple(shape[:-1]) != tuple(out_shape[:-2]):
        raise RuntimeError("The value of out_shape[:-2] should be equal to data.shape[:-1]")

    res = akg.tvm.compute(out_shape,
                          lambda *i: akg.tvm.if_then_else(
                              akg.tvm.all(i[-1] == i[-2], i[-1] < shape[-1]),
                              data(*i[:-1]),
                              zero_const(dtype)),
                          name="diag")

    return res
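
# Illustrative sketch (assumption, NumPy only): the output matrix_diag is
# expected to produce. The guard j < data.shape[-1] mirrors the
# `i[-1] < shape[-1]` condition above.
import numpy as np

def _matrix_diag_sketch(data, out_shape):
    res = np.zeros(out_shape, dtype=data.dtype)
    for j in range(min(out_shape[-1], out_shape[-2], data.shape[-1])):
        res[..., j, j] = data[..., j]
    return res

# _matrix_diag_sketch(np.array([1., 2.], np.float32), (3, 3))
# -> [[1, 0, 0], [0, 2, 0], [0, 0, 0]]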
def _less_equal_compare_float32(data_x, data_y):
    """if x <= y, then return 1, else 0"""
    data_out = tvm.compute(
        data_x.shape,
        lambda *index: tvm.expr.Select(
            data_x(*index) <= data_y(*index),
            dc.one_const(data_x.dtype),
            dc.zero_const(data_x.dtype)))
    return data_out
def _atan2_compute(y, x):
    """compute for atan2"""
    const_pi_by_two = 1.5707963267948966192313216916398
    dtype = y.dtype
    if dtype == "float16":
        y = topi.cast(y, "float32")
        x = topi.cast(x, "float32")

    x_lt_zero_y_mask, y_ge_zero_mask = _init_atan2_mask(y, x)
    y_cmp_zero = topi.multiply(y_ge_zero_mask, tvm.const(const_pi_by_two, "float32"))
    res_x_lt_zero = topi.multiply(x_lt_zero_y_mask, dc.pi_const("float32"))

    # calculate atan(y/x) when x > 0
    if product_is_mini():
        x_rec = reciprocal(x, target=utils.CCE)
        res = topi.multiply(y, x_rec)
    else:
        res = topi.divide(y, x)
    res, _ = atan(res)

    if product_is_mini():
        tensor_zero = dc.zero_const("float16")
        x = topi.cast(x, "float16")
        y_cmp_zero = topi.cast(y_cmp_zero, "float16")
        res = topi.cast(res, "float16")
    else:
        tensor_zero = dc.zero_const("float32")

    res = tvm.compute(res.shape,
                      lambda *i: tvm.expr.Select(
                          x(*i) == tensor_zero, y_cmp_zero(*i), res(*i)),
                      name="res")

    if product_is_mini():
        res = topi.cast(res, "float32")

    res = topi.add(res, res_x_lt_zero)
    return topi.cast(res, dtype)
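
# Illustrative sketch (assumption, NumPy only) of the quadrant fix-up
# above, with the masks from _init_atan2_mask inlined: start from
# atan(y/x) and correct the x == 0 and x < 0 cases.
import numpy as np

def _atan2_sketch(y, x):
    sign_y = np.where(y >= 0, 1.0, -1.0)                 # y_ge_zero_mask
    with np.errstate(divide="ignore", invalid="ignore"):
        res = np.arctan(y / x)                           # valid for x > 0
    res = np.where(x == 0, sign_y * np.pi / 2, res)      # y_cmp_zero branch
    res = res + np.where(x < 0, sign_y, 0.0) * np.pi     # res_x_lt_zero branch
    return res

# Matches np.arctan2 away from the origin; like the kernel, it returns
# pi/2 rather than 0 at (y, x) = (0, 0).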
def bool_both_zero_compute(judged_min, judged_max):
    """If both input min and max are zero, the output data will be all
    zeros, so a judge tensor is needed."""
    dtype = judged_min.dtype
    tensor_zero = topi.full(judged_min.shape, dtype, dc.zero_const(dtype))
    min_abs = topi.abs(judged_min)
    max_abs = topi.abs(judged_max)
    min_max_replace = topi.add(min_abs, max_abs)
    # just check whether min and max are both zero; if true, return 0
    bool_min_max_product_less_zero = less_compare_float32(min_max_replace, tensor_zero)
    bool_min_max_product_more_zero = less_compare_float32(tensor_zero, min_max_replace)
    bool_both_zero = topi.add(bool_min_max_product_less_zero,
                              bool_min_max_product_more_zero)

    return bool_both_zero
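
# Illustrative sketch (assumption, NumPy only): since |min| + |max| >= 0,
# the two less-than comparisons above reduce to a single "is nonzero" test.
import numpy as np

def _bool_both_zero_sketch(judged_min, judged_max):
    s = np.abs(judged_min) + np.abs(judged_max)
    return np.where(s > 0, 1.0, 0.0).astype(judged_min.dtype)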
def reduce_all(data, axis=None, keepdims=False):
    """
    Computes the logical AND of the input tensor's elements.

    Args:
        data (tvm.tensor.Tensor): Tensor of type Boolean.
        axis (Union[None, int, list, tuple]): Specifies which axes to reduce. If None,
            all dimensions of the input tensor will be reduced and the shape of the
            output tensor will be (1,).
        keepdims (Union[None, bool]): If true, keep the reduced dimensions with length 1.

    Returns:
        tvm.tensor.Tensor of the same type as the input tensor data.
    """
    shape = [x.value for x in data.shape]
    vc_util.ops_dtype_check(data.dtype, vc_util.DtypeForDavinci.BOOL)
    vc_util.check_shape(shape)
    if axis is None and keepdims is False:
        raise ValueError("keepdims must be True when axis is None!")
    axis_new = ft_util.refine_reduce_axis(data, axis)

    # all(x) is computed as sum(1 - x) == 0: any False element makes the sum nonzero.
    xx1 = akg.tvm.compute(shape, lambda *indice: data(*indice).astype("float16"), name='xx1')
    xx = (-xx1 + dc.one_const("float16"))
    yy = akg.topi.sum(xx, axis=axis_new, keepdims=keepdims)

    o_shape = list(yy.shape)
    zz = akg.tvm.compute(o_shape, lambda *indice: yy(*indice).astype("bool"), name='zz')
    y1 = akg.tvm.compute(
        o_shape,
        lambda *indice: akg.tvm.expr.Select(
            zz(*indice), dc.zero_const("float16"), dc.one_const("float16")),
        name="y1")
    y = akg.tvm.compute(o_shape, lambda *indice: y1(*indice).astype("bool"), name='y')

    return y
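
# Illustrative sketch (assumption, NumPy only): the reduction trick used
# above -- all(x) == (sum(1 - x) == 0) -- since any False element makes
# the float16 sum nonzero.
import numpy as np

def _reduce_all_sketch(data, axis=None, keepdims=False):
    xx = 1.0 - data.astype(np.float16)
    yy = xx.sum(axis=axis, keepdims=keepdims)
    return yy == 0

# _reduce_all_sketch(np.array([[True, True], [True, False]]), axis=1)
# -> array([ True, False])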
def truncate_div_compute(input_x1, input_x2):
    """compute for truncate_div"""
    int_list = ("int32", "int8", "uint8")

    if input_x1.dtype in int_list:
        data_zero = dc.zero_const("float32")
        data_x_broad = cast(input_x1, "float32")
        data_y_broad = cast(input_x2, "float32")
        res_div = topi.divide(data_x_broad, data_y_broad)
        # truncate toward zero: ceil for the negative part, floor for the positive part
        res_min_int = ceil(topi.minimum(res_div, data_zero))
        res_max_int = floor(topi.maximum(res_div, data_zero))
        res_trunc = topi.add(res_min_int, res_max_int)
        res_trunc = cast(res_trunc, "float32")
    else:
        res_trunc = topi.divide(input_x1, input_x2)

    return cast(res_trunc, input_x1.dtype)
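
# Illustrative sketch (assumption, NumPy only): truncation toward zero via
# ceil(min(q, 0)) + floor(max(q, 0)), the combination used above for
# integer inputs.
import numpy as np

def _truncate_div_sketch(x1, x2):
    q = x1.astype(np.float32) / x2.astype(np.float32)
    trunc = np.ceil(np.minimum(q, 0)) + np.floor(np.maximum(q, 0))
    return trunc.astype(x1.dtype)

# _truncate_div_sketch(np.array([7, -7]), np.array([2, 2])) -> [3, -3]
# (floor division would give [3, -4])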
def _init_atan2_mask(data_y_, data_x_):
    """
    Compute masks for atan2.

    Args:
        data_y_ (tvm.tensor.Tensor): The y of atan2(y, x).
        data_x_ (tvm.tensor.Tensor): The x of atan2(y, x).

    Returns:
        mask (tvm.tensor.Tensor): The mask of x's and y's value.
    """
    is_cast_for_mini = utils.product_is_mini() and data_y_.dtype == "float32"

    # on mini, Select only supports float16
    if is_cast_for_mini:
        data_x = topi.cast(data_x_, "float16")
        data_y = topi.cast(data_y_, "float16")
    else:
        data_x = data_x_
        data_y = data_y_

    dtype_input = data_y.dtype

    tensor_one = dc.one_const(dtype_input)
    tensor_zero = dc.zero_const(dtype_input)
    tensor_neg_one = dc.neg_one_const(dtype_input)

    # y_ge_zero holds the sign of y: 1 where y >= 0, -1 otherwise
    y_ge_zero = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_y(*i) >= tensor_zero, tensor_one, tensor_neg_one),
        name="y_ge_zero")

    # x_lt_zero_y_mask is the sign of y where x < 0, and 0 elsewhere
    x_lt_zero_y_mask = tvm.compute(
        data_y.shape,
        lambda *i: tvm.expr.Select(
            data_x(*i) < tensor_zero, y_ge_zero(*i), tensor_zero),
        name="xlt0_y_mask")

    if is_cast_for_mini:
        x_lt_zero_y_mask = topi.cast(x_lt_zero_y_mask, "float32")
        y_ge_zero = topi.cast(y_ge_zero, "float32")

    return (x_lt_zero_y_mask, y_ge_zero)
def less_compare_float32(data_x, data_y):
    """if x is less than y, then return 1, else return 0"""
    shape_inputs = get_shape(data_x)
    # minimum positive normal float32: 2**(-126)
    data_min = akg.lang.ascend.broadcast(tvm.const(2**(-126), dtype="float32"),
                                         shape_inputs, "float32")
    data_zero = akg.lang.ascend.broadcast(dc.zero_const("float32"),
                                          shape_inputs, "float32")
    res_sub = topi.subtract(data_y, data_x)
    # clamp the difference into {0, 2**-126}
    res_min = topi.minimum(res_sub, data_min)
    res_max = topi.maximum(res_min, data_zero)

    # the scale factor needed is 2**126, but cce only supports constants
    # up to 2**62, so multiply by 2**62 twice and then by 2**2 (62 + 62 + 2 = 126)
    res_mul_first = topi.multiply(res_max, tvm.const(2**62, dtype="float32"))
    res_mul_second = topi.multiply(res_mul_first, tvm.const(2**62, dtype="float32"))
    res = topi.multiply(res_mul_second, tvm.const(2**2, dtype="float32"))

    return res
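
# Illustrative sketch (assumption, NumPy only) of the scaling trick above:
# clamp (y - x) into {0, 2**-126}, then scale by 2**62 * 2**62 * 2**2
# (= 2**126) to land exactly on {0.0, 1.0}.
import numpy as np

def _less_compare_sketch(x, y):
    diff = np.float32(y) - np.float32(x)
    clamped = np.maximum(np.minimum(diff, np.float32(2.0 ** -126)),
                         np.float32(0.0))
    return clamped * np.float32(2.0 ** 62) * np.float32(2.0 ** 62) * np.float32(4.0)

# _less_compare_sketch(1.0, 2.0) -> 1.0
# _less_compare_sketch(2.0, 1.0) -> 0.0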
def fused_minimum_or_maximum_grad(dz, x, y, grad_x, grad_y, op_type):
    """
    Gradient for the minimum or maximum operation between two input tensors `x` and `y`.

    Args:
        dz (tvm.tensor.Tensor): Type float16, float32, int32.
        x (tvm.tensor.Tensor): Type float16, float32, int32.
        y (tvm.tensor.Tensor): Type float16, float32, int32.
        grad_x (bool): Whether to calculate dx.
        grad_y (bool): Whether to calculate dy.
        op_type (str): The type of the op, "GE" for MaximumGrad or "LE" for MinimumGrad.

    Note:
        At least one of grad_x and grad_y must be True.

    Returns:
        dx, tvm.tensor.Tensor of the same type as the inputs, returned if grad_x is True.
        dy, tvm.tensor.Tensor of the same type as the inputs, returned if grad_y is True.
    """
    vc_util.check_shape(x)
    vc_util.check_shape(y)
    vc_util.check_shape(dz)
    vc_util.ops_dtype_check([x.dtype, y.dtype, dz.dtype],
                            [vc_util.DtypeForDavinci.ALL_FLOAT, vc_util.DtypeForDavinci.INT32])
    vc_util.broadcast_check(x, dz)
    vc_util.broadcast_check(y, dz)

    # check op types
    check_list = ["GE", "LE"]
    if op_type not in check_list:
        raise ValueError("FusedMinimumOrMaximumGrad only supports op types %s, but got %s"
                         % (",".join(check_list), op_type))

    if not grad_x and not grad_y:
        raise ValueError("At least one of grad_x and grad_y must be True.")

    x_shape = get_shape(x)
    y_shape = get_shape(y)
    dz_shape = get_shape(dz)
    ori_dtype = dz.dtype

    # get greater compute
    x = akg.lang.cce.broadcast(x, dz_shape)
    y = akg.lang.cce.broadcast(y, dz_shape)

    if utils.product_is_mini() and ori_dtype != "float16":
        x = cast(x, "float16")
        y = cast(y, "float16")
        dz = cast(dz, "float16")
    elif ori_dtype == "int32":
        x = cast(x, "float32")
        y = cast(y, "float32")
        dz = cast(dz, "float32")
    zero = zero_const(dz.dtype)

    # dz flows to dx where the comparison holds; dy takes the remainder
    if op_type == "LE":
        dx = tvm.compute(dz_shape, lambda *i: tvm.expr.Select((x(*i) <= y(*i)), dz(*i), zero), name='dx')
        dy = topi.subtract(dz, dx)
    elif op_type == "GE":
        dx = tvm.compute(dz_shape, lambda *i: tvm.expr.Select((x(*i) >= y(*i)), dz(*i), zero), name='dx')
        dy = topi.subtract(dz, dx)

    if dx.dtype == "float16":
        # cast to fp32 for higher precision of reduce_sum.
        if get_shape(dx) != x_shape:
            dx = cast(dx, "float32")
        if get_shape(dy) != y_shape:
            dy = cast(dy, "float32")

    dx = sum.sum_by_shape(dx, x_shape)
    dy = sum.sum_by_shape(dy, y_shape)

    if ori_dtype != dx.dtype:
        dx = cast(dx, ori_dtype)
    if ori_dtype != dy.dtype:
        dy = cast(dy, ori_dtype)

    attrs = get_default_attrs()
    if grad_x and grad_y:
        return dx, dy, attrs
    if grad_x:
        return dx, attrs
    return dy, attrs
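
# Illustrative sketch (assumption, NumPy only): the gradient routing for
# MinimumGrad ("LE"). dz flows to x where x <= y and dy takes the rest,
# so dx + dy == dz elementwise before any reduction; for MaximumGrad
# ("GE") the predicate flips to x >= y.
import numpy as np

def _minimum_grad_sketch(dz, x, y):
    dx = np.where(x <= y, dz, 0.0)
    dy = dz - dx
    return dx, dy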
def avgpool_with_img2col(data, kernel, stride, strategy):
    """
    Performs avgpool with img2col.

    Note:
        Only supports 5D format (NC1HWC0); pooling works on H and W.

    Args:
        data (tvm.tensor.Tensor): Tensor of type float16.
        kernel (Union[list, tuple]): Two int numbers for the pooling window's size.
        stride (Union[list, tuple]): Two int numbers for the window's stride.
        strategy (Union[str, list, tuple]): Padding, should be 'VALID', 'SAME' or an
            instance of list (four int numbers, as the 'CONSTANTS' strategy).
            Supported strategies are the same as avgpool.

    Returns:
        tvm.tensor.Tensor, result of average pooling.
    """
    shape = get_shape(data)
    dtype = data.dtype

    utils.davinci_format_check(shape, "NC1HWC0", dim=5)
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT16)
    utils.check_shape(kernel, 2, "Kernel")
    utils.check_shape(stride, 2, "Stride")

    kernel_h, kernel_w = kernel
    in_n, in_c1, _, _, in_c0 = shape

    [ph_h, ph_t, pw_h, pw_t], [out_h, out_w] = \
        cal_pad_shapes_by_strategy(shape, kernel, stride, strategy)
    pad = [ph_h, ph_t, pw_h, pw_t]
    pad_value = zero_const(dtype)

    # fmap img2col l1 -> ub in zZ format by fractal
    fmap_img2col_shp_ub = (in_n, in_c1, kernel_h, kernel_w, out_h, out_w, in_c0)
    fmap_img2col_ub = img2col(data, fmap_img2col_shp_ub, kernel_h, kernel_w,
                              pad, stride, pad_value, tag="")

    out_shape = (in_n, in_c1, out_h, out_w, in_c0)
    reduce_axis_h = akg.tvm.reduce_axis((0, kernel_h), name="reduce_h")
    reduce_axis_w = akg.tvm.reduce_axis((0, kernel_w), name="reduce_w")
    res_sum = akg.tvm.compute(
        out_shape,
        lambda n, c1, oh, ow, c0: akg.tvm.sum(
            fmap_img2col_ub[n, c1, reduce_axis_h, reduce_axis_w, oh, ow, c0],
            axis=[reduce_axis_h, reduce_axis_w]),
        name="pooling_avg")

    divisor = akg.tvm.const(kernel_h * kernel_w, dtype)
    output = akg.tvm.compute(out_shape, lambda *i: res_sum(*i) / divisor,
                             name="res_value")
    return output
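
# Illustrative sketch (assumption, NumPy only): the reduction performed
# above -- a windowed sum over (kernel_h, kernel_w) divided by
# kernel_h * kernel_w -- shown for a single 2D slice with 'VALID' padding.
import numpy as np

def _avgpool2d_sketch(x, kernel, stride):
    kh, kw = kernel
    sh, sw = stride
    out_h = (x.shape[0] - kh) // sh + 1
    out_w = (x.shape[1] - kw) // sw + 1
    out = np.empty((out_h, out_w), x.dtype)
    for oh in range(out_h):
        for ow in range(out_w):
            out[oh, ow] = x[oh * sh:oh * sh + kh,
                            ow * sw:ow * sw + kw].mean()
    return out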