def xdivy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xdivy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): gradient of xdivy with respect to the first input
        output_y2 (tvm.tensor.Tensor): gradient of xdivy with respect to the second input
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addepsmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        x1_addepsmin_rec = reciprocal(x1_addepsmin)
        not_zero_x1 = akg.lang.cce.vmul(x1, x1_addepsmin_rec)
        x2_rec = reciprocal(x2)
        partial_x1 = akg.lang.cce.vmul(not_zero_x1, x2_rec)
    else:
        not_zero_x1 = div(x1, x1_addepsmin)
        partial_x1 = div(not_zero_x1, x2)

    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    neg_one = tvm.const(-1, dtype="float32")
    neg_x1 = akg.lang.cce.vmuls(x1, neg_one)
    partial_x1pow = akg.lang.cce.vmul(partial_x1, partial_x1)
    partial_x2 = akg.lang.cce.vmul(neg_x1, partial_x1pow)
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
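
# A minimal NumPy reference sketch (not part of the kernel above), assuming
# xdivy(x1, x2) = 0 if x1 == 0 else x1 / x2, so that
#   d(xdivy)/dx1 = 1 / x2 (masked where x1 == 0) and d(xdivy)/dx2 = -x1 / x2**2,
# each multiplied by the incoming grad and reduced over the broadcast axes.
# The helper name is hypothetical and only meant for a side-by-side check.
def _xdivy_grad_reference(x1, x2, grad, rx, ry):
    """Hypothetical NumPy check for xdivy_grad_compute (illustrative only)."""
    import numpy as np
    x1, x2, grad = np.broadcast_arrays(np.asarray(x1, dtype="float32"),
                                       np.asarray(x2, dtype="float32"),
                                       np.asarray(grad, dtype="float32"))
    not_zero_x1 = (x1 != 0).astype("float32")
    partial_x1 = not_zero_x1 / x2
    partial_x2 = -x1 * partial_x1 * partial_x1
    dx1 = np.sum(partial_x1 * grad, axis=tuple(rx), keepdims=True)
    dx2 = np.sum(partial_x2 * grad, axis=tuple(ry), keepdims=True)
    return dx1, dx2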
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xlogy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): gradient of xlogy with respect to the first input
        output_y2 (tvm.tensor.Tensor): gradient of xlogy with respect to the second input
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]

    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addespmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        not_zero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_addespmin))
        log_x2 = tvm.compute(x2.shape,
                             lambda *i: (tvm.log(x2(*i).astype("float16"))).astype("float32"),
                             name="log_x2")
    else:
        not_zero_x1 = div(x1, x1_addespmin)
        log_x2 = akg.lang.cce.vlog(x2)

    partial_x1 = akg.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    partial_x2 = div(x1, x2) if not utils.product_is_mini() else \
        akg.lang.cce.vmul(x1, reciprocal(x2))
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
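
# A minimal NumPy reference sketch (not part of the kernel above), assuming
# xlogy(x1, x2) = 0 if x1 == 0 else x1 * log(x2), so that
#   d(xlogy)/dx1 = log(x2) (masked where x1 == 0) and d(xlogy)/dx2 = x1 / x2,
# each multiplied by the incoming grad and reduced over the broadcast axes.
# The helper name is hypothetical and only meant for a side-by-side check.
def _xlogy_grad_reference(x1, x2, grad, rx, ry):
    """Hypothetical NumPy check for xlogy_grad_compute (illustrative only)."""
    import numpy as np
    x1, x2, grad = np.broadcast_arrays(np.asarray(x1, dtype="float32"),
                                       np.asarray(x2, dtype="float32"),
                                       np.asarray(grad, dtype="float32"))
    not_zero_x1 = (x1 != 0).astype("float32")
    partial_x1 = not_zero_x1 * np.log(x2)
    partial_x2 = x1 / x2
    dx1 = np.sum(partial_x1 * grad, axis=tuple(rx), keepdims=True)
    dx2 = np.sum(partial_x2 * grad, axis=tuple(ry), keepdims=True)
    return dx1, dx2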
def _tan_2x_multi(input_x, times):
    """Calculate tan(x) by computing tan(x / 2^times) and applying the double angle formula `times` times."""
    # Calculate tan(x / 2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0 / (2.0**times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0 / (2.0**times))
        res = _tan_expand(input_x_divide)

    while times != 0:
        # using double angle formula: tan 2x = 2*tan x/(1 - tan x*tan x)
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)),
                                       tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1
    return res
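
# A minimal NumPy sketch (illustrative, not from the kernel) of the same range-reduction
# idea: recover tan(x) from tan(x / 2**times) by applying tan(2a) = 2*tan(a) / (1 - tan(a)^2)
# `times` times. np.tan stands in for the polynomial approximation _tan_expand is assumed to be.
def _tan_2x_multi_reference(x, times):
    """Hypothetical reference for the double-angle recursion."""
    import numpy as np
    t = np.tan(np.asarray(x, dtype="float64") / (2.0 ** times))
    for _ in range(times):
        t = 2.0 * t / (1.0 - t * t)  # tan(2a) = 2*tan(a) / (1 - tan(a)^2)
    return t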
def floordiv(data1, data2):
    """
    Calculate data1 / data2 with the result rounded down (floor).

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor of type int32.
    """
    vc_util.ops_dtype_check([data1.dtype, data2.dtype], vc_util.DtypeForDavinci.ALL_FLOAT)
    shape1 = [x.value for x in data1.shape]
    vc_util.check_shape(shape1)
    shape2 = [x.value for x in data2.shape]
    vc_util.check_shape(shape2)

    if utils.product_is_mini():
        rec = reciprocal(data2, high_precision=True)
        res = data1 * rec
    else:
        res = akg.topi.divide(data1, data2)
    res = akg.lang.cce.floor(res)
    return res
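
# A tiny NumPy illustration (not part of the kernel) of the floor semantics above:
# floor division rounds toward negative infinity rather than toward zero, e.g.
# for a = [7., -7.] and b = 2 it yields [3, -4], where truncation would give [3, -3].
def _floordiv_reference(a, b):
    """Hypothetical reference matching the intended floordiv semantics (assumption)."""
    import numpy as np
    return np.floor(np.asarray(a, dtype="float32") / np.asarray(b, dtype="float32")).astype("int32")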
def _div_ascend(data1, data2):
    """
    Calculates x/y, and returns an integer when inputs are all integers.

    When both arguments are integers, use integer division (also known as "floor division").
    When arguments are float numbers, use normal floating point division.

    Note:
        div supports broadcasting.

    Args:
        data1 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.
        data2 (tvm.tensor.Tensor): Tensor of type float16, float32, int32, int8 and uint8.

    Returns:
        tvm.tensor.Tensor, has the same type as data1 and data2.
    """
    utils.ops_dtype_check([data1.dtype, data2.dtype], utils.DtypeForDavinci.ALL_TYPES)
    utils.elemwise_dtype_check(data1.dtype, data2.dtype)
    dtype = data1.dtype

    shape1 = [x.value for x in data1.shape]
    shape2 = [x.value for x in data2.shape]
    utils.check_shape(shape1)
    utils.check_shape(shape2)

    utils.auto_broadcast_check(shape1, shape2)
    n_shape1, n_shape2, out_shape = produce_shapes(shape1, shape2)
    if n_shape1 != out_shape:
        input1_cast = akg.topi.broadcast_to(data1, out_shape)
    else:
        input1_cast = data1
    if n_shape2 != out_shape:
        input2_cast = akg.topi.broadcast_to(data2, out_shape)
    else:
        input2_cast = data2

    if dtype in ("int32", "int8", "uint8"):
        input1p = Cast(input1_cast, "float16", utils.CCE)
        input2p = Cast(input2_cast, "float16", utils.CCE)
    else:
        input1p = input1_cast
        input2p = input2_cast

    if product_is_mini():
        input2p_rec = reciprocal(input2p, target=utils.CCE)
        res = akg.topi.multiply(input1p, input2p_rec)
    else:
        res = akg.topi.divide(input1p, input2p)

    if dtype in ("int8", "uint8"):
        res = floor(res, utils.CCE)
        res = Cast(res, "float16", utils.CCE)
    if dtype in ("int32", "int8", "uint8"):
        res = Cast(res, dtype, utils.CCE)

    return res
def _apply_ada_max_compute(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Compute ada_max."""
    # cast to float32 for improved accuracy
    inp_dtype = var.dtype

    if inp_dtype == 'float16':
        var = topi.cast(var, 'float32')
        m = topi.cast(m, 'float32')
        v = topi.cast(v, 'float32')
        lr = topi.cast(lr, 'float32')
        beta1_power = topi.cast(beta1_power, 'float32')
        beta1 = topi.cast(beta1, 'float32')
        beta2 = topi.cast(beta2, 'float32')
        grad = topi.cast(grad, 'float32')
    epsilon = tvm.const(epsilon, 'float32')

    # m += (grad - m) * (1 - beta1)
    rhs = tvm.compute(beta1.shape, lambda *i: beta1(*i) * neg_one_const("float32"))
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) + one_const("float32"))
    lhs = topi.subtract(grad, m)
    rhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) * rhs[0])
    m = topi.add(m, rhs)

    # v = max(beta2*v, abs(grad))
    lhs = tvm.compute(v.shape, lambda *i: v(*i) * beta2[0])
    rhs = topi.abs(grad)
    v = topi.maximum(lhs, rhs)

    # var -= lr * m / ((1 - beta1_power) * (v + epsilon))
    # v + epsilon
    rhs = tvm.compute(v.shape, lambda *i: v(*i) + epsilon)
    # 1 - beta1_power
    lhs = tvm.compute(beta1_power.shape, lambda *i: beta1_power(*i) * neg_one_const("float32"))
    lhs = tvm.compute(lhs.shape, lambda *i: lhs(*i) + one_const("float32"))
    # (1 - beta1_power) * (v + epsilon)
    rhs = tvm.compute(rhs.shape, lambda *i: rhs(*i) * lhs[0])
    # lr * m
    lhs = tvm.compute(m.shape, lambda *i: m(*i) * lr[0])
    # lr * m / ((1 - beta1_power) * (v + epsilon))
    rhs = reciprocal(rhs)
    rhs = topi.multiply(lhs, rhs)
    var = topi.subtract(var, rhs)

    if inp_dtype == 'float16':
        var = topi.cast(var, inp_dtype)
        m = topi.cast(m, inp_dtype)
        v = topi.cast(v, inp_dtype)

    return var, m, v
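
# A minimal NumPy sketch (illustrative only) of the same AdaMax step written with scalar
# hyper-parameters; the kernel above works on scalar tensors indexed with [0] instead.
def _ada_max_reference(var, m, v, grad, lr, beta1, beta1_power, beta2, epsilon):
    """Hypothetical reference for _apply_ada_max_compute."""
    import numpy as np
    m = m + (grad - m) * (1.0 - beta1)
    v = np.maximum(beta2 * v, np.abs(grad))
    var = var - lr * m / ((1.0 - beta1_power) * (v + epsilon))
    return var, m, v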
def softsign_compute(input_features):
    """Compute for softsign."""
    dtype = input_features.dtype
    if dtype == "float16":
        input_features = akg.lang.cce.cast_to(input_features, "float32")

    data_abs = akg.lang.cce.vabs(input_features)
    data_add = akg.lang.cce.vadds(data_abs, SCALAR_ONE)
    data_rec = reciprocal(data_add)
    res = akg.lang.cce.vmul(input_features, data_rec)

    if dtype == "float16":
        res = akg.lang.cce.cast_to(res, "float16")

    return res
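
# Softsign is x / (1 + |x|); a one-line NumPy sketch (illustrative, not the kernel):
def _softsign_reference(x):
    """Hypothetical reference: softsign(x) = x / (1 + |x|)."""
    import numpy as np
    x = np.asarray(x, dtype="float32")
    return x / (1.0 + np.abs(x))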
def fake_quant_with_min_max_vars_per_channel_compute(input_data, input_min, input_max,
                                                     num_bits=8, narrow_range=False):
    """Compute implementation of fake_quant_with_min_max_vars_per_channel."""
    shape = get_shape(input_data.shape)
    dtype = input_data.dtype
    min_broadcast = akg.lang.cce.broadcast(input_min, shape, dtype)
    max_broadcast = akg.lang.cce.broadcast(input_max, shape, dtype)

    # get nudged_min and nudged_max by nudged_min_max_compute function
    nudged_min_nudged_max = nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range)

    # clamp the input between nudged_min and nudged_max
    clamped_tmp = topi.minimum(input_data, nudged_min_nudged_max[1])
    clamped = topi.maximum(clamped_tmp, nudged_min_nudged_max[0])

    # calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_nudged_max[0])
    if utils.product_is_mini():
        clamped_shifted_div_scale = mul(clamped_shifted, reciprocal(nudged_min_nudged_max[2]))
    else:
        clamped_shifted_div_scale = div(clamped_shifted, nudged_min_nudged_max[2])
    result_tmp = topi.add(clamped_shifted_div_scale, dc.half_const(dtype))
    floor_result_tmp = akg.lang.cce.floor(result_tmp)
    if utils.product_is_mini():
        floor_result_tmp = topi.cast(floor_result_tmp, "float16")
        floor_result_tmp = topi.cast(floor_result_tmp, "float32")

    scale_product = topi.multiply(floor_result_tmp, nudged_min_nudged_max[2])
    tmp_res = topi.add(scale_product, nudged_min_nudged_max[0])

    # get bool_both_zero_value by bool_both_zero_compute function
    bool_both_zero_value = bool_both_zero_compute(min_broadcast, max_broadcast)
    res = topi.multiply(tmp_res, bool_both_zero_value)

    return res
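
# A minimal NumPy sketch (illustrative only) of the fake-quant step performed above,
# given already-nudged bounds and scale (see nudged_min_max_compute below):
def _fake_quant_reference(x, nudged_min, nudged_max, scale):
    """Hypothetical reference: clamp, quantize to the grid, then dequantize."""
    import numpy as np
    clamped = np.clip(np.asarray(x, dtype="float32"), nudged_min, nudged_max)
    quantized = np.floor((clamped - nudged_min) / scale + 0.5)
    return quantized * scale + nudged_min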
def _atan2_compute(y, x):
    """Compute for atan2."""
    dtype = y.dtype
    if dtype == "float16":
        y = topi.cast(y, "float32")
        x = topi.cast(x, "float32")

    x_lt_zero_y_mask, y_ge_zero_mask = _init_atan2_mask(y, x)
    y_cmp_zero = topi.multiply(y_ge_zero_mask, tvm.const(CONST_PI_BY_TWO, "float32"))
    res_x_lt_zero = topi.multiply(x_lt_zero_y_mask, dc.pi_const("float32"))

    # calculate atan(y/x) when x > 0
    if utils.product_is_mini():
        x_rec = reciprocal(x)
        res = topi.multiply(y, x_rec)
    else:
        res = topi.divide(y, x)
    res, _ = atan(res)

    if utils.product_is_mini():
        tensor_zero = dc.zero_const("float16")
        x = topi.cast(x, "float16")
        y_cmp_zero = topi.cast(y_cmp_zero, "float16")
        res = topi.cast(res, "float16")
    else:
        tensor_zero = dc.zero_const("float32")

    res = tvm.compute(res.shape,
                      lambda *i: tvm.expr.Select(x(*i) == tensor_zero, y_cmp_zero(*i), res(*i)),
                      name="res")

    if utils.product_is_mini():
        res = topi.cast(res, "float32")

    res = topi.add(res, res_x_lt_zero)
    return topi.cast(res, dtype)
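
# A minimal NumPy sketch (illustrative only) of the quadrant correction above, assuming
# the masks from _init_atan2_mask encode the sign of y (+/-1) and the x < 0 region:
#   atan2(y, x) = atan(y / x)                 when x > 0
#               = atan(y / x) + pi * sign(y)  when x < 0
#               = pi/2 * sign(y)              when x == 0
def _atan2_reference(y, x):
    """Hypothetical reference built from np.arctan plus the corrections above."""
    import numpy as np
    y = np.asarray(y, dtype="float32")
    x = np.asarray(x, dtype="float32")
    base = np.arctan(y / np.where(x == 0, 1.0, x))  # safe divisor; x == 0 handled below
    sign_y = np.where(y >= 0, 1.0, -1.0)
    res = np.where(x == 0, sign_y * np.pi / 2, base)
    return res + np.where(x < 0, sign_y * np.pi, 0.0)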
def Reciprocal(x):
    """Reciprocal."""
    return reciprocal.reciprocal(x, high_precision=True)
def nudged_min_max_compute(min_broadcast, max_broadcast, num_bits, narrow_range):
    """
    Calculate the maximum and minimum values of the quantization.

    Notes:
        Each channel scale[i] equals (max_broadcast[i] - min_broadcast[i]) / (quant_max - quant_min).

        Then compute nudged_zero_point:
            nudged_zero_point = floor(between_min_max_float + 0.5) + less_quant_min_float + more_quant_max_float,
        where between_min_max_float is derived from
            zero_point_from_min = (quant_min_float - min_broadcast) / scale,
        and between_min_max_float equals zero_point_from_min when
        quant_min_float <= zero_point_from_min <= quant_max_float, else 0.
        Besides, less_quant_min_float equals quant_min when zero_point_from_min < quant_min_float,
        else 0; more_quant_max_float is defined analogously with quant_max.

        Finally, nudged_min and nudged_max are computed from scale and nudged_zero_point:
            nudged_min = (quant_min - nudged_zero_point) * scale
            nudged_max = (quant_max - nudged_zero_point) * scale

    Args:
        min_broadcast (tvm.tensor.Tensor): minimum value to be quantified for each channel.
        max_broadcast (tvm.tensor.Tensor): maximum value to be quantified for each channel.
        num_bits (int): bitwidth of the quantization, in range [2, 16].
        narrow_range (bool): if True, each channel is quantized into the range [1, 2^num_bits - 1],
                             else into the range [0, 2^num_bits - 1].

    Returns:
        nudged_min (tvm.tensor.Tensor): The same type and shape as min_broadcast.
        nudged_max (tvm.tensor.Tensor): The same type and shape as max_broadcast.
        scale (tvm.tensor.Tensor): The same type and shape as max_broadcast.
    """
    dtype = min_broadcast.dtype
    quant_min = 1 if narrow_range else 0
    quant_max = (2**num_bits) - 1

    # the computation is per channel, so quant_min and quant_max need to be broadcast.
    quant_min_float = topi.full(min_broadcast.shape, dtype, tvm.const(quant_min, dtype))
    quant_max_float = topi.full(min_broadcast.shape, dtype, tvm.const(quant_max, dtype))

    # calculate the max/min difference for each channel.
    max_sub_min = topi.subtract(max_broadcast, min_broadcast)
    quant_max_sub_quant_min = topi.subtract(quant_max_float, quant_min_float)

    # compute scale = (max_broadcast - min_broadcast) / (quant_max - quant_min)
    # and min_div_scale = min_broadcast / scale
    if utils.product_is_mini():
        scale = mul(max_sub_min, reciprocal(quant_max_sub_quant_min))
        min_div_scale = mul(min_broadcast, reciprocal(scale))
    else:
        scale = div(max_sub_min, quant_max_sub_quant_min)
        min_div_scale = div(min_broadcast, scale)

    # zero_point_from_min = quant_min_float - min_broadcast / scale
    zero_point_from_min = topi.subtract(quant_min_float, min_div_scale)

    # if zero_point_from_min < quant_min_float, bool_less_quant_min_float = 1 else 0
    bool_less_quant_min_float = less_compare_float32(zero_point_from_min, quant_min_float)
    # if quant_max_float < zero_point_from_min, bool_more_quant_max_float = 1 else 0
    bool_more_quant_max_float = less_compare_float32(quant_max_float, zero_point_from_min)

    # according to the above bool masks, select the effective values
    less_quant_min_float = topi.multiply(quant_min_float, bool_less_quant_min_float)
    more_quant_max_float = topi.multiply(quant_max_float, bool_more_quant_max_float)

    # mark the values that are neither less than quant_min_float nor larger than quant_max_float
    tensor_one = topi.full(min_broadcast.shape, dtype, dc.one_const(dtype))
    bool_not_less_quant_min_float = topi.subtract(tensor_one, bool_less_quant_min_float)
    bool_not_more_quant_max_float = topi.subtract(tensor_one, bool_more_quant_max_float)
    bool_between_min_max = topi.multiply(bool_not_less_quant_min_float, bool_not_more_quant_max_float)
    between_min_max_float = topi.multiply(zero_point_from_min, bool_between_min_max)

    # add 0.5 to the values with quant_min <= value <= quant_max, then floor them.
    between_min_max_add_half_one = topi.add(between_min_max_float, dc.half_const(dtype))
    between_min_max_round = akg.lang.cce.floor(between_min_max_add_half_one)
    if utils.product_is_mini():
        between_min_max_round = topi.cast(between_min_max_round, "float16")
        between_min_max_round = topi.cast(between_min_max_round, "float32")

    # calculate the maximum and minimum values of the quantization
    nudged_zero_point_tmp = topi.add(less_quant_min_float, more_quant_max_float)
    nudged_zero_point = topi.add(nudged_zero_point_tmp, between_min_max_round)

    nudged_min_tmp = topi.subtract(quant_min_float, nudged_zero_point)
    nudged_max_tmp = topi.subtract(quant_max_float, nudged_zero_point)
    nudged_min = topi.multiply(nudged_min_tmp, scale)
    nudged_max = topi.multiply(nudged_max_tmp, scale)
    res = [nudged_min, nudged_max, scale]

    return res
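
# A minimal NumPy sketch (illustrative only) of the per-channel nudging above, written
# with scalar min/max for readability; clipping the rounded zero point to
# [quant_min, quant_max] plays the role of the less/more/between masks in the kernel.
def _nudged_min_max_reference(input_min, input_max, num_bits=8, narrow_range=False):
    """Hypothetical reference for nudged_min_max_compute."""
    import numpy as np
    quant_min = 1.0 if narrow_range else 0.0
    quant_max = float(2 ** num_bits - 1)
    scale = (input_max - input_min) / (quant_max - quant_min)
    zero_point_from_min = quant_min - input_min / scale
    nudged_zero_point = np.clip(np.floor(zero_point_from_min + 0.5), quant_min, quant_max)
    nudged_min = (quant_min - nudged_zero_point) * scale
    nudged_max = (quant_max - nudged_zero_point) * scale
    return nudged_min, nudged_max, scale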
def reciprocal_auto(tensor):
    """Reciprocal with auto schedule."""
    return reciprocal.reciprocal(tensor)
def reciprocal_manual(tensor):
    """Reciprocal with manual schedule."""
    return reciprocal.reciprocal(tensor)
def _apply_adagrad_da_compute(var, gradient_accum, gradient_squared_accum,
                              grad, lr, l1, l2, global_step):
    """Compute adagrad_da."""
    dtype = var.dtype
    # cast to float32 for higher precision
    if dtype == "float16":
        gradient_accum = topi.cast(gradient_accum, "float32")
        gradient_squared_accum = topi.cast(gradient_squared_accum, "float32")
        grad = topi.cast(grad, "float32")
        lr = topi.cast(lr, "float32")
        l1 = topi.cast(l1, "float32")
        l2 = topi.cast(l2, "float32")
    if utils.product_is_mini():
        global_step = topi.cast(global_step, "float16")
        global_step = topi.cast(global_step, "float32")
    else:
        global_step = topi.cast(global_step, "float32")

    # 1. grad_accum += grad
    gradient_accum = topi.add(gradient_accum, grad)

    # 2. grad_squared_accum += grad * grad
    gs = topi.multiply(grad, grad)
    gradient_squared_accum = topi.add(gradient_squared_accum, gs)

    # 3. if l1 > 0: tmp_val = sign(grad_accum) * max(|grad_accum| - l1*global_step, 0)
    #    else: tmp_val = grad_accum
    sign_val = sign(gradient_accum)
    abs_val = topi.abs(gradient_accum)
    mul_val = topi.multiply(global_step, l1)
    sub_val = topi.subtract(abs_val, mul_val)
    max_val = topi.maximum(sub_val, tvm.const(0, sub_val.dtype))
    tmp_val = topi.multiply(sign_val, max_val)

    def select(l1, tmp_val, gradient_accum):
        """Returns tmp_val if l1 > 0 else gradient_accum."""
        if utils.product_is_mini():
            l1 = topi.cast(l1, "float16")
            tmp_val = topi.cast(tmp_val, "float16")
            gradient_accum = topi.cast(gradient_accum, "float16")
        tmp_val = akg.tvm.compute(tmp_val.shape,
                                  lambda *i: tvm.expr.Select(l1[0] > 0, tmp_val(*i), gradient_accum(*i)))
        return topi.cast(tmp_val, "float32") if utils.product_is_mini() else tmp_val

    tmp_val = select(l1, tmp_val, gradient_accum)

    # 4. x_value = -1 * lr * tmp_val
    x_value = topi.multiply(lr, tvm.const(-1, "float32"))
    x_value = topi.multiply(x_value, tmp_val)

    # 5. y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = topi.multiply(l2, global_step)
    pro_val = topi.multiply(pro_val, lr)
    sqrt_val = sqrt(gradient_squared_accum)
    y_value = topi.add(pro_val, sqrt_val)

    # 6. var = x_value / y_value
    if utils.product_is_mini():
        y_rec = reciprocal(y_value)
        var_out = topi.multiply(x_value, y_rec)
    else:
        var_out = topi.divide(x_value, y_value)

    if dtype == "float16":
        var_out = akg.lang.cce.cast_to(var_out, "float16")
        gradient_accum = akg.lang.cce.cast_to(gradient_accum, "float16")
        gradient_squared_accum = akg.lang.cce.cast_to(gradient_squared_accum, "float16")

    return var_out, gradient_accum, gradient_squared_accum
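
# A minimal NumPy sketch (illustrative only) of the same AdagradDA step with scalar
# hyper-parameters, mirroring the numbered stages in the kernel above.
def _adagrad_da_reference(var, grad_accum, grad_squared_accum, grad, lr, l1, l2, global_step):
    """Hypothetical reference for _apply_adagrad_da_compute."""
    import numpy as np
    grad_accum = grad_accum + grad
    grad_squared_accum = grad_squared_accum + grad * grad
    if l1 > 0:
        tmp_val = np.sign(grad_accum) * np.maximum(np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp_val = grad_accum
    x_value = -lr * tmp_val
    y_value = l2 * global_step * lr + np.sqrt(grad_squared_accum)
    var = x_value / y_value
    return var, grad_accum, grad_squared_accum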