def sin_compute(x):
    """Compute sine."""
    dtype = x.dtype
    shape = get_shape(x)

    # cast to float32 when the input type is float16
    if dtype == FLOAT_16:
        x = akg.lang.cce.cast_to(x, FLOAT_32)

    pai_multiple = akg.lang.cce.vmuls(x, 1 / PI)
    round_float = akg.lang.cce.cast_to(akg.lang.cce.round(pai_multiple), FLOAT_32)
    # adjust x to [-pi/2, pi/2]
    x = akg.lang.cce.vsub(x, akg.lang.cce.vmuls(round_float, PI))

    res = _sin(x)

    # if round is odd, the final result needs to be multiplied by -1.
    # Multiply by 1/2 before taking the ceil value
    ceil_value = akg.lang.cce.ceil(akg.lang.cce.vmuls(round_float, 1 / 2))
    # if odd, ceil*2 - round is 1; if even, the value is 0
    sub_value = akg.lang.cce.vsub(
        akg.lang.cce.vmuls(ceil_value, tvm.const(2, dtype)), round_float)
    tensor_one = akg.lang.cce.broadcast(tvm.const(1, FLOAT_32), shape)
    odd_tensor = akg.lang.cce.vsub(tensor_one, sub_value)
    even_tensor = akg.lang.cce.vsub(odd_tensor, tensor_one)
    odd_even_tensor = akg.lang.cce.vadd(odd_tensor, even_tensor)
    res = akg.lang.cce.vmul(res, odd_even_tensor)

    # cast the dtype back to float16
    if dtype == FLOAT_16:
        res = akg.lang.cce.cast_to(res, FLOAT_16)
    return res
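# Illustration (not part of the original kernel): a plain-Python sketch of the range-reduction
# identity used above, sin(x) = (-1)**k * sin(x - k*pi) with k = round(x / pi). The ceil/sub
# trick in sin_compute reconstructs the (-1)**k sign factor using only vector instructions.
import math

def _sin_reduction_sketch(x):
    k = round(x / math.pi)           # round(x / pi)
    r = x - k * math.pi              # r lies in [-pi/2, pi/2]
    return (-1) ** k * math.sin(r)   # flip the sign when k is odd

for _v in (-7.3, -1.0, 0.5, 3.9, 12.7):
    assert abs(_sin_reduction_sketch(_v) - math.sin(_v)) < 1e-12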
def _tan_2x_multi(input_x, times):
    """Calculate tan x by computing tan(x / 2^times) and applying the double angle formula `times` times."""
    # Calculate tan(x / 2^times)
    if input_x.dtype == FLOAT_16 and utils.product_is_mini():
        input_x_divide = topi.multiply(input_x, tvm.const(1.0 / (2.0 ** times), FLOAT_16))
        res = _tan_expand(input_x_divide)
    else:
        input_x_divide = topi.multiply(input_x, 1.0 / (2.0 ** times))
        res = _tan_expand(input_x_divide)

    while times != 0:
        # using the double angle formula: tan 2x = 2*tan x / (1 - tan x * tan x)
        if input_x.dtype == FLOAT_16 and utils.product_is_mini():
            res_numerator = topi.multiply(res, tvm.const(2.0, FLOAT_16))
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(
                topi.multiply(tanx_square, tvm.const(-1.0, FLOAT_16)),
                tvm.const(1.0, FLOAT_16))
        else:
            res_numerator = topi.multiply(res, 2.0)
            tanx_square = topi.multiply(res, res)
            res_denominator = topi.add(topi.multiply(tanx_square, -1.0), 1.0)

        if utils.product_is_mini():
            res = mul(res_numerator, reciprocal(res_denominator))
        else:
            res = div(res_numerator, res_denominator)
        times = times - 1
    return res
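# Illustration (not part of the original kernel): the same argument-halving idea in plain
# Python. math.tan on the shrunken argument stands in here for the module's _tan_expand series;
# repeated application of tan(2x) = 2*tan(x) / (1 - tan(x)**2) recovers tan(x).
import math

def _tan_by_doubling_sketch(x, times=4):
    t = math.tan(x / 2.0 ** times)    # accurate near zero, like a short Taylor series
    for _ in range(times):
        t = 2.0 * t / (1.0 - t * t)   # double angle formula
    return t

assert abs(_tan_by_doubling_sketch(0.7) - math.tan(0.7)) < 1e-9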
def csr_div(inputs, attrs):
    """Element-wise division of CSR sparse data by a dense tensor, with broadcasting."""
    row_idx, col_idx, sparse_data, dense = inputs
    shape = tuple(attrs["dense_shape"])
    feature_shape = get_shape(sparse_data.shape)[1:]
    assert dense.dtype == sparse_data.dtype, "data and weight must have the same dtype"

    num_rows = row_idx.shape[0] - 1
    dense_shape = get_shape(dense.shape)
    sparse_shape = get_shape(shape)
    broadcast_shape = get_broadcast_shape(dense_shape, sparse_shape)
    need_expand = tvm.const(len(dense_shape) < len(broadcast_shape))
    need_broadcast_first_dim = tvm.const(
        len(dense_shape) == len(broadcast_shape) and dense_shape[0] < broadcast_shape[0])
    need_broadcast_last_dim = tvm.const(
        len(dense_shape) == len(broadcast_shape) and dense_shape[1] < broadcast_shape[1])

    def gen_ir(dense, sparse_data, col_idx, row_idx, output):
        ib = tvm.ir_builder.create()
        ib.scope_attr("INFO", "csr_avg_row", int(sparse_data.shape[0]) // max(int(num_rows), 1))
        with ib.for_range(0, num_rows, name='i') as i:
            start = ib.load(row_idx, i)
            end = ib.load(row_idx, i + 1)
            with ib.for_range(0, end - start, name='j') as j:
                pos = start + j
                with ib.for_range_n(feature_shape, 'k') as k:
                    with ib.if_scope(pos < end):
                        col = ib.load(col_idx, pos)
                        store_loc = [pos] + k
                        val = ib.load(sparse_data, store_loc)
                        with ib.if_scope(need_expand):
                            ib.store(output, store_loc, val / ib.load(dense, [col] + k))
                        with ib.else_scope():
                            with ib.if_scope(need_broadcast_first_dim):
                                ib.store(output, store_loc, val / ib.load(dense, [0, col] + k))
                            with ib.else_scope():
                                with ib.if_scope(need_broadcast_last_dim):
                                    ib.store(output, store_loc, val / ib.load(dense, [i, 0] + k))
                                with ib.else_scope():
                                    ib.store(output, store_loc, val / ib.load(dense, [i, col] + k))
        return ib.get()

    output_name = "T_csr_div_" + dense.op.name + "_" + sparse_data.op.name
    out_buf = tvm.decl_buffer(sparse_data.shape, sparse_data.dtype, output_name)
    attrs = {"remove_self_dependence": True, "csr_op": True}
    return tvm.extern(
        [sparse_data.shape],
        [dense, sparse_data, col_idx, row_idx],
        lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], ins[3], outs[0]),
        dtype=sparse_data.dtype,
        out_buffers=[out_buf],
        name=output_name,
        attrs=attrs)
def tan_compute(input_x):
    """tan compute implementation"""
    dtype = input_x.dtype

    # cast to float32 when the type is float16 (cloud and mini) or int32 (cloud)
    if dtype == FLOAT_16 or dtype == FLOAT_32 or (dtype == INT_32 and not product_is_mini()):
        input_x = topi.cast(input_x, FLOAT_32)
        # adjust x to [-pi/2, pi/2] using x = x - round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_32)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_32)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_32)))
    # cast to float16 when the type is int32 on mini
    elif dtype == INT_32 and product_is_mini():
        input_x = topi.cast(input_x, FLOAT_16)
        # adjust x to [-pi/2, pi/2] using x = x - round(x/pi)*pi
        round_pi_div = akg.lang.ascend.round(
            topi.multiply(input_x, tvm.const(1.0 / PI, FLOAT_16)))
        round_pi_div = akg.lang.ascend.cast_to(round_pi_div, FLOAT_16)
        input_x = topi.subtract(
            input_x, topi.multiply(round_pi_div, tvm.const(PI, FLOAT_16)))

    res = _tan_2x_multi(input_x, TAN_2X_TIMES)
    # cast back to the original dtype
    res = topi.cast(res, dtype)
    return res
def _elu_mini_compute(exp_res, data, shape):
    """
    Do element-wise e^x - 1 computation in the mini scene.

    f(x) = e^x - 1,                        x <= TAYLOR_THRESHOLD or x >= 0
    f(x) = fifth-order Taylor expansion,   TAYLOR_THRESHOLD < x < 0

    Args:
        exp_res (tvm.tensor.Tensor): the tensor of e^x - 1, float16
        data (tvm.tensor.Tensor): input, float16
        shape (list): the shape of input

    Returns:
        tvm.tensor.Tensor
    """
    TAYLOR_THRESHOLD = -0.7
    input_right_border = tvm.const(0.0, "float16")
    right_border = tvm.compute(shape, lambda *i: input_right_border)

    taylor_res = _elu_taylor_compute(data)

    input_left_border = tvm.const(TAYLOR_THRESHOLD, "float16")
    left_border = tvm.compute(shape, lambda *i: input_left_border)

    exp_taylor_neg = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(data(*i) > left_border(*i), taylor_res(*i), exp_res(*i)),
        name="gt")
    exp_res = tvm.compute(
        shape,
        lambda *i: tvm.expr.Select(data(*i) < right_border(*i), exp_taylor_neg(*i), exp_res(*i)),
        name="lt")
    return exp_res
def fused_bn_update(input1, input2, input3, input4, dtype, c1, c2, c3, c4):
    """
    Fused operator.

    Args:
        input1 ~ input4: tvm.tensor.Tensor.
        dtype: dtype of Tensor.
        c1 ~ c4: const.

    Returns:
        Three outputs (tuple of tvm.tensor.Tensor).
    """
    const1 = tvm.const(c1, dtype)
    mul0 = topi.multiply(input2, const1)
    mul1 = topi.multiply(input1, const1)
    mul2 = topi.multiply(mul1, mul1)
    sigma2 = topi.subtract(mul0, mul2)
    const2 = tvm.const(c2, dtype)
    rsqrt_val = topi.rsqrt(topi.add(sigma2, const2))
    const3 = tvm.const(c3, dtype)
    mul3 = topi.multiply(sigma2, const3)
    sub1 = topi.subtract(input3, mul3)
    const4 = tvm.const(c4, dtype)
    data1 = topi.multiply(const4, sub1)
    sub2 = topi.subtract(input4, mul1)
    data2 = topi.multiply(const4, sub2)
    return (rsqrt_val, data1, data2)
def sigmoid_cross_entropy_with_logits_grad_compute(predict, target, dout):
    """sigmoid_cross_entropy_with_logits_grad compute implementation"""
    dtype = predict.dtype
    if dtype == "float16":
        predict = topi.cast(predict, "float32")
        target = topi.cast(target, "float32")
        dout = topi.cast(dout, "float32")

    # e^x
    val1 = exp(predict)
    # 1 + e^x
    val2 = topi.add(val1, tvm.const(SCALAR_ONE, dtype="float32"))
    # e^x / (1 + e^x)
    val3 = topi.divide(val1, val2)
    # -target
    val4 = topi.multiply(target, tvm.const(SCALAR_NEGTIVE_ONE, dtype="float32"))
    # e^x / (1 + e^x) - target
    val5 = topi.add(val3, val4)
    result = topi.multiply(val5, dout)

    if dtype == "float16":
        result = topi.cast(result, dtype)
    return result
def selu_compute(input_data):
    """selu compute implementation"""
    # if the input dtype is float16 or float32, compute in float32; otherwise compute in float16
    dtype = input_data.dtype
    if dtype == "float16" or dtype == "float32":
        input_data = topi.cast(input_data, "float32")
        type_tmp = "float32"
    else:
        input_data = topi.cast(input_data, "float16")
        type_tmp = "float16"

    # generate tensor_zero to compare against
    tensor_zero = topi.multiply(input_data, tvm.const(0, dtype=type_tmp))
    # generate negative_res and positive_res for the elements that are
    # less than 0 and greater than 0 respectively
    negative_res = topi.minimum(input_data, tensor_zero)
    positive_res = topi.maximum(input_data, tensor_zero)
    exp_res = exp(negative_res)
    sub_res = topi.add(exp_res, tvm.const(SCALAR_NEGATIVE_ONE, dtype=type_tmp))
    negative_muls_res = topi.multiply(sub_res, tvm.const(SCALE_ALPHA_PRODUCT, dtype=type_tmp))
    if dtype == "int8":
        negative_muls_res = akg.lang.cce.ceil(negative_muls_res)

    positive_muls_res = topi.multiply(positive_res, tvm.const(SCALE, dtype=type_tmp))
    res = topi.add(negative_muls_res, positive_muls_res)
    # cast back to the original dtype
    if dtype == "float16" or dtype == "int8" or dtype == "int32":
        res = topi.cast(res, dtype)
    return res
def _asin_grad_compute(x, dy):
    """Compute asin_grad."""
    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use Newton's method for a high-accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high-efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")
    return res
def ReLU6Grad(y_grad, x, target=utils.CUDA):
    """
    Computes gradients of Rectified Linear 6.

    Args:
        y_grad (tvm.tensor.Tensor): Tensor of type float16, float32, gradients backpropagated to the ReLU6 op.
        x (tvm.tensor.Tensor): Tensor of type float16/float32, inputs that were passed to the ReLU6 op, or its outputs.

    Returns:
        tvm.tensor.Tensor, has same type and shape as x.

    Supported Platforms:
        'GPU'
    """
    if target != utils.CUDA:
        raise RuntimeError("the target %s is not supported!" % target)

    shape = x.shape
    dtype = x.dtype

    zero = tvm.const(0, dtype)
    six = tvm.const(6, dtype)

    res0 = tvm.compute(shape, lambda *i: tvm.if_then_else(x(*i) >= zero, x(*i), zero))
    res6 = tvm.compute(shape, lambda *i: tvm.if_then_else(x(*i) >= six, zero, res0(*i)))
    res = tvm.compute(shape, lambda *i: tvm.if_then_else(res6(*i) == zero, zero, y_grad(*i)))
    return res
def xdivy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xdivy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): result of xdivy_grad
        output_y2 (tvm.tensor.Tensor): result of xdivy_grad
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]
    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addepsmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        x1_addepsmin_rec = reciprocal(x1_addepsmin)
        not_zero_x1 = akg.lang.cce.vmul(x1, x1_addepsmin_rec)
        x2_rec = reciprocal(x2)
        partial_x1 = akg.lang.cce.vmul(not_zero_x1, x2_rec)
    else:
        not_zero_x1 = div(x1, x1_addepsmin)
        partial_x1 = div(not_zero_x1, x2)

    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    neg_one = tvm.const(-1, dtype="float32")
    neg_x1 = akg.lang.cce.vmuls(x1, neg_one)
    partial_x1pow = akg.lang.cce.vmul(partial_x1, partial_x1)
    partial_x2 = akg.lang.cce.vmul(neg_x1, partial_x1pow)
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
def _update_m(m, beta, grad):
    """Update m_out = m * beta + grad * (1 - beta)."""
    m_beta = topi.multiply(m, beta)
    beta_neg = topi.multiply(beta, tvm.const(-1, beta.dtype))
    beta_1 = topi.add(beta_neg, tvm.const(1, beta_neg.dtype))
    grad_beta_gs = topi.multiply(grad, beta_1)
    m_out = topi.add(m_beta, grad_beta_gs)
    return m_out
def kernel_ir(dst, data):
    # Store 1 (in out_dtype, presumably captured from the enclosing scope) at dst[0]
    # whenever an element of data is positive.
    ib = tvm.ir_builder.create()
    with ib.for_range_n(data.shape, "ax") as i:
        zero = tvm.const(0, data.dtype)
        one = tvm.const(1, out_dtype)
        with ib.if_scope(ib.load(data, i) > zero):
            ib.store(dst, 0, one)
    return ib.get()
def _newton_iter(data, init_x):
    """Do element-wise Newton compute."""
    # Newton iteration: x(n+1) = x(n) * (3 - a * x(n)^2) / 2
    init_square = topi.multiply(init_x, init_x)
    newton_res = topi.multiply(init_square, data)
    newton_res = topi.multiply(newton_res, neg_one_const("float32"))
    newton_res = topi.add(newton_res, tvm.const(3, "float32"))
    newton_res = topi.multiply(newton_res, init_x)
    newton_res = topi.multiply(newton_res, tvm.const(0.5, "float32"))
    return newton_res
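# Illustration (not part of the original kernel): the iteration above is Newton's method on
# f(x) = 1/x**2 - a, i.e. x_{n+1} = x_n * (3 - a * x_n**2) / 2, which converges to 1/sqrt(a)
# for a reasonable starting guess.
import math

def _newton_rsqrt_sketch(a, x0, steps=5):
    x = x0
    for _ in range(steps):
        x = x * (3.0 - a * x * x) / 2.0
    return x

assert abs(_newton_rsqrt_sketch(5.0, 0.4) - 1.0 / math.sqrt(5.0)) < 1e-12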
def _elu_taylor_compute(data):
    """
    Calculate e^x - 1 using a fifth-order Taylor expansion.

    e^x     = 1 + x + (x^2 / 2!) + (x^3 / 3!) + (x^4 / 4!) + (x^5 / 5!)
    e^x - 1 =     x + (x^2 / 2!) + (x^3 / 3!) + (x^4 / 4!) + (x^5 / 5!)

    Args:
        data (tvm.tensor.Tensor): input

    Returns:
        tvm.tensor.Tensor
    """
    TAYLOR_SECOND_ORDER_PARAM = 1 / 2.0
    TAYLOR_THIRD_ORDER_PARAM = 1 / 6.0
    TAYLOR_FOURTH_ORDER_PARAM = 1 / 24.0
    TAYLOR_FIFTH_ORDER_PARAM = 1 / 120.0

    dtype = data.dtype
    if dtype == "float16":
        data = akg.lang.ascend.cast_to(data, "float32")

    # x^2 / 2!
    taylor_second_order_param = tvm.const(TAYLOR_SECOND_ORDER_PARAM, "float32")
    data_power_2 = akg.lang.ascend.vmul(data, data)
    data_power_2_div_2 = akg.lang.ascend.vmuls(data_power_2, taylor_second_order_param)

    # x^3 / 3!
    taylor_third_order_param = tvm.const(TAYLOR_THIRD_ORDER_PARAM, "float32")
    data_power_3 = akg.lang.ascend.vmul(data_power_2, data)
    data_power_3_div_6 = akg.lang.ascend.vmuls(data_power_3, taylor_third_order_param)

    # x^4 / 4!
    taylor_fourth_order_param = tvm.const(TAYLOR_FOURTH_ORDER_PARAM, "float32")
    data_power_4 = akg.lang.ascend.vmul(data_power_3, data)
    data_power_4_div_24 = akg.lang.ascend.vmuls(data_power_4, taylor_fourth_order_param)

    # x^5 / 5!
    taylor_fifth_order_param = tvm.const(TAYLOR_FIFTH_ORDER_PARAM, "float32")
    data_power_5 = akg.lang.ascend.vmul(data_power_4, data)
    data_power_5_div_120 = akg.lang.ascend.vmuls(data_power_5, taylor_fifth_order_param)

    res = akg.lang.ascend.vadd(data, data_power_2_div_2)
    res = akg.lang.ascend.vadd(res, data_power_3_div_6)
    res = akg.lang.ascend.vadd(res, data_power_4_div_24)
    res = akg.lang.ascend.vadd(res, data_power_5_div_120)

    if dtype == "float16":
        res = akg.lang.ascend.cast_to(res, "float16")
    return res
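# Illustration (not part of the original kernel): the same fifth-order polynomial in plain
# Python, checked against math.expm1 inside the interval where the mini path uses it
# (TAYLOR_THRESHOLD = -0.7 < x < 0).
import math

def _expm1_taylor5_sketch(x):
    # x + x^2/2! + x^3/3! + x^4/4! + x^5/5!, in Horner form
    return x * (1 + x * (1 / 2.0 + x * (1 / 6.0 + x * (1 / 24.0 + x / 120.0))))

for _v in (-0.7, -0.3, -0.05):
    assert abs(_expm1_taylor5_sketch(_v) - math.expm1(_v)) < 1e-3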
def _newton(start_value, num_to_vrsqrt):
    """Use Newton's method to calculate vrsqrt."""
    x0_square = topi.multiply(start_value, start_value)
    mul_res = topi.multiply(x0_square, num_to_vrsqrt)
    mul_res = topi.multiply(mul_res, tvm.const(-1, "float32"))
    head0_tmp = topi.add(mul_res, tvm.const(3, "float32"))
    head0 = topi.multiply(head0_tmp, start_value)
    newton_res = topi.multiply(head0, tvm.const(0.5, "float32"))
    return newton_res
def sgd_compute(parameters, gradient, learning_rate, accum, momentum, stat, dampening=0.0,
                weight_decay=0.0, nesterov=False):
    """sgd compute implementation"""
    dtype = parameters.dtype
    if dtype == "float16":
        parameters = topi.cast(parameters, "float32")
        accum = topi.cast(accum, "float32")
        learning_rate = topi.cast(learning_rate, "float32")
        gradient = topi.cast(gradient, "float32")
        momentum = topi.cast(momentum, "float32")
        stat = topi.cast(stat, "float32")

    # if weight_decay != 0.0, compute grad_delta to update the gradient
    if weight_decay != 0.0:
        parameters = topi.multiply(parameters, tvm.const(1.0, 'float32'))
        grad_delta = topi.multiply(parameters, weight_decay)
        gradient = topi.add(gradient, grad_delta)

    stat_mid = topi.multiply(stat, tvm.const(-1, "float32"))
    stat_act = topi.add(stat_mid, tvm.const(1, "float32"))

    dampening_t = topi.multiply(stat_act, dampening)

    # update accum
    accum_delta = tvm.compute(accum.shape, lambda *indice: accum(*indice) * momentum[0])
    gradient_damp = topi.multiply(gradient, dampening_t)
    accum_t = topi.add(accum_delta, gradient)
    if dampening != 0.0:
        accum_t = topi.subtract(accum_t, gradient_damp)

    # update parameters
    if nesterov:
        parameters_delta = tvm.compute(gradient.shape, lambda *indice: gradient(*indice) * learning_rate[0])
        parameters_delta_2 = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * momentum[0])
        parameters_delta_2 = tvm.compute(parameters_delta_2.shape,
                                         lambda *indice: parameters_delta_2(*indice) * learning_rate[0])
        parameters_delta = topi.add(parameters_delta, parameters_delta_2)
        parameters_t = topi.subtract(parameters, parameters_delta)
    else:
        parameters_delta = tvm.compute(accum_t.shape, lambda *indice: accum_t(*indice) * learning_rate[0])
        parameters_t = topi.subtract(parameters, parameters_delta)

    # update stat
    stat_t = topi.multiply(stat_act, tvm.const(NUM_ZERO, 'float32'))

    if dtype == "float16":
        parameters_t = topi.cast(parameters_t, "float16")
        accum_t = topi.cast(accum_t, "float16")
        stat_t = topi.cast(stat_t, "float16")
    return parameters_t, accum_t, stat_t
def my_dsl(dtype, kernel_name, attrs):
    # `insn` and `insnType` are expected to be defined in the enclosing scope.
    m = tvm.var("M")
    n = tvm.var("N")
    A = tvm.placeholder((m, ), name="A", dtype=dtype)
    B = tvm.placeholder((m, ), name="B", dtype=dtype)
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)

    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])

    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insnType == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
def select_compute(condition, x1, x2):
    """select compute implementation"""
    shape = get_shape(x1)
    con_shape = get_shape(condition)
    num_dtype = x1.dtype
    bool_dtype = condition.dtype

    if num_dtype in ("int8", "uint8"):
        x1_dtype = "float32"
        ones = akg.lang.cce.broadcast(tvm.const(VALUE_ONE, dtype="float32"), shape, output_dtype="float32")
        x1 = akg.lang.cce.cast_to(x1, "float32")
        x2 = akg.lang.cce.cast_to(x2, "float32")
    else:
        x1_dtype = num_dtype
        ones = akg.lang.cce.broadcast(tvm.const(VALUE_ONE, dtype=num_dtype), shape, output_dtype=num_dtype)

    if bool_dtype == "int8":
        if x1_dtype == "int32":
            condition_dtype = akg.lang.cce.ceil(condition)
        else:
            condition_dtype = akg.lang.cce.cast_to(condition, x1_dtype)
    else:
        if x1_dtype == "int32":
            condition_dtype = condition
        else:
            condition_dtype = akg.lang.cce.cast_to(condition, x1_dtype)

    if list(con_shape) != list(shape):
        condition_dtype = akg.lang.cce.broadcast(condition_dtype, shape)

    vinsn_support_dtype = ("float16", "float32")
    if utils.product_is_mini():
        vinsn_support_dtype = ("float16", )
    if num_dtype in vinsn_support_dtype:
        res = topi.where(condition_dtype, x1, x2)
    else:
        # For data types that are not supported by the vector instructions (vcmp and vsel),
        # if `topi.where` is used directly, the related instructions generated in the .cce file
        # are scalar instructions such as `cond ? x1 : x2`, which is very inefficient.
        # Therefore, an equivalent calculation method is adopted instead.
        condition_opp = akg.lang.cce.vsub(ones, condition_dtype)
        temp_x = akg.lang.cce.vmul(x1, condition_dtype)
        temp_y = akg.lang.cce.vmul(x2, condition_opp)
        res = akg.lang.cce.vadd(temp_x, temp_y)
    if num_dtype in ("int8", "uint8"):
        res = akg.lang.cce.cast_to(res, num_dtype)
    return res
def _asin_compute(data_input):
    """Compute asin"""
    dtype = data_input.dtype
    boundary = tvm.const(BOUNDARY, "float32")

    # Change dtype to float32
    if dtype == "float16":
        data_input = topi.cast(data_input, "float32")

    # Sign mask
    data_sign = sign(data_input)

    # All positive
    data1 = topi.multiply(data_input, data_sign)

    # x belongs to (0, 2^(-0.5))
    choice_1 = topi.minimum(data1, boundary)
    choice_1 = topi.subtract(choice_1, boundary)
    choice_1_floor = akg.lang.cce.floor(choice_1)
    # the dtype of choice_1_floor is int32, it needs to be cast to fp32.
    if utils.product_is_mini():
        choice_1_floor = topi.cast(choice_1_floor, "float16")
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    else:
        choice_1_floor = topi.cast(choice_1_floor, "float32")
    choice_1 = topi.multiply(choice_1_floor, neg_one_const("float32"))

    taylor1 = _taylor_compute(data1)
    res_1 = topi.multiply(taylor1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = topi.subtract(one_const("float32"), choice_1)
    data2 = topi.subtract(one_const("float32"), topi.multiply(data1, data1))
    data2_sqrt = _sqrt(data2)

    taylor2 = _taylor_compute(data2_sqrt, data2)
    res_2 = topi.multiply(taylor2, neg_one_const("float32"))
    res_2 = topi.add(res_2, tvm.const(HALF_PI, "float32"))
    res_2 = topi.multiply(res_2, choice_2)

    # Restore sign
    res_1 = topi.add(res_1, res_2)
    res_1 = topi.multiply(res_1, data_sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = topi.cast(res_1, "float16")
    return res_1
def _apply_adadelta_compute(var, accum, accum_update, grad, lr, rho, epsilon):
    """Compute apply_adadelta"""
    dtype = var.dtype
    if dtype == "float16":
        var = topi.cast(var, "float32")
        accum = topi.cast(accum, "float32")
        accum_update = topi.cast(accum_update, "float32")
        lr = topi.cast(lr, "float32")
        rho = topi.cast(rho, "float32")
        grad = topi.cast(grad, "float32")

    epsilon = tvm.const(epsilon, "float32")
    tensor_one = akg.lang.ascend.broadcast(tvm.const(1, "float32"), var.shape)
    tensor_rho = topi.broadcast_to(rho, var.shape)
    tensor_rho_gs = topi.subtract(tensor_one, tensor_rho)
    tensor_epsilon = akg.lang.ascend.broadcast(epsilon, var.shape)

    # accum = accum * rho + grad ** 2 * (1 - rho)
    rhs = topi.multiply(accum, tensor_rho)
    lhs = topi.multiply(grad, grad)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_res = akg.lang.ascend.vadd(lhs, rhs)

    # update = (accum_update + epsilon).sqrt * (accum + epsilon).rsqrt * grad
    rhs = topi.add(accum_update, tensor_epsilon)
    rhs = sqrt(rhs, target=utils.CCE)
    lhs = topi.add(accum_res, tensor_epsilon)
    lhs = rsqrt(lhs, target=utils.CCE)
    lhs = topi.multiply(grad, lhs)
    update = topi.multiply(lhs, rhs)

    # var -= update * lr
    var_res = topi.broadcast_to(lr, var.shape)
    var_res = topi.multiply(update, var_res)
    var_res = topi.subtract(var, var_res)

    # accum_update = rho * accum_update + (1 - rho) * update.square
    rhs = topi.multiply(accum_update, tensor_rho)
    lhs = topi.multiply(update, update)
    lhs = topi.multiply(lhs, tensor_rho_gs)
    accum_update_res = akg.lang.ascend.vadd(lhs, rhs)

    if dtype == "float16":
        var_res = topi.cast(var_res, "float16")
        accum_res = topi.cast(accum_res, "float16")
        accum_update_res = topi.cast(accum_update_res, "float16")

    return var_res, accum_res, accum_update_res
def _sinh_taylor_compute(x):
    """sinh(x) value is x * (1 + x^2*(1/3! + x^2*(1/5! + x^2/7!)))"""
    # `dtype` is expected to come from the enclosing scope.
    taylor_params = [
        tvm.const(0.1666666666666666666666666666666666, dtype),
        tvm.const(0.0083333333333333333333333333333333, dtype),
        tvm.const(0.0001984126984126984126984126984126, dtype)
    ]
    x_square = topi.multiply(x, x)
    sinh_taylor = tvm.compute(
        x.shape,
        lambda *indice: x(*indice) * (1 + x_square(*indice) *
                                      (taylor_params[0] + x_square(*indice) *
                                       (taylor_params[1] + x_square(*indice) * taylor_params[2]))),
        name="sinh_taylor")
    return sinh_taylor
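# Illustration (not part of the original kernel): the nested constants above are 1/3!, 1/5!
# and 1/7!, so the compute evaluates the truncated series sinh(x) ~ x + x^3/3! + x^5/5! + x^7/7!
# in Horner form; a plain-Python check against math.sinh for a small argument:
import math

def _sinh_taylor_sketch(x):
    x2 = x * x
    return x * (1 + x2 * (1 / 6.0 + x2 * (1 / 120.0 + x2 / 5040.0)))

assert abs(_sinh_taylor_sketch(0.5) - math.sinh(0.5)) < 1e-7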
def kernel_ir(data, dst):
    # `shape`, `axis`, `reverse` and `exclusive` are expected to come from the enclosing scope.
    ib = tvm.ir_builder.create()
    # axes before the cum-axis
    with ib.for_range_n(shape[:axis], "i0") as i0:
        # axes after the cum-axis
        with ib.for_range_n(shape[axis + 1:], "i1") as i1:
            idx_0 = i0 + [0] + i1 if not reverse else i0 + [shape[axis] - 1] + i1
            ib.store(dst, idx_0,
                     tvm.const(1, data.dtype) if exclusive else ib.load(data, idx_0))
            # iterate the cum-axis to accumulate the cumulative product (starting from 1)
            with ib.for_range(1, shape[axis], name="cum_idx") as m:
                idx_pre = i0 + [m - 1] + i1 if not reverse else i0 + [shape[axis] - m] + i1
                idx_cur = i0 + [m] + i1 if not reverse else i0 + [shape[axis] - 1 - m] + i1
                ib.store(dst, idx_cur,
                         ib.load(dst, idx_pre) * ib.load(data, idx_pre if exclusive else idx_cur))
    return ib.get()
def _atan_compute(data):
    """compute for atan"""
    dtype = data.dtype

    if dtype == "float16":
        data = topi.cast(data, "float32")

    abs_data = topi.abs(data)
    tensor_one = dc.one_const(abs_data.dtype)

    abs_data_sub_one = topi.subtract(abs_data, tensor_one)
    abs_data_add_one = topi.add(abs_data, tensor_one)
    abs_data2 = topi.abs(topi.divide(abs_data_sub_one, abs_data_add_one))

    # calculate data less than one
    res = _do_atan_taylor(abs_data)
    # calculate data greater than one
    res_mt_one = topi.add(_do_atan_taylor(abs_data2),
                          tvm.const(CONST_PI_BY_FOUR, abs_data2.dtype))
    res = topi.minimum(res, res_mt_one)

    if utils.product_is_mini() and data.dtype == "float32":
        sign_mask = topi.cast(topi.sign(topi.cast(data, "float16")), "float32")
    else:
        sign_mask = topi.sign(data)

    res = topi.multiply(res, sign_mask)

    if dtype == "float16":
        res = topi.cast(res, "float16")
    return res
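# Illustration (not part of the original kernel): abs_data2 above relies on the identity
# atan(x) = pi/4 + atan((x - 1) / (x + 1)) for x >= 0, which maps arguments of any size back
# into [-1, 1] where the Taylor expansion converges.
import math

for _v in (0.2, 1.0, 3.7, 42.0):
    assert abs(math.atan(_v) - (math.pi / 4 + math.atan((_v - 1.0) / (_v + 1.0)))) < 1e-12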
def _taylor_compute(data_x, x_square=None):
    """
    Compute arcsin(x) with the 15th-order Taylor expansion when 0 <= x <= BOUNDARY.

    asin(x) = x + 1/6*x^3 + 3/40*x^5 + 5/112*x^7 + ... + 13!!/(14!!*15)*x^15
    """
    if x_square is None:
        x_square = topi.multiply(data_x, data_x)

    res = topi.multiply(x_square, tvm.const(COEF[TAYLOR_COUNT], "float32"))
    for temp in reversed(range(TAYLOR_COUNT)):
        res = topi.add(res, tvm.const(COEF[temp], "float32"))
        if temp == 0:
            res = topi.multiply(res, data_x)
        else:
            res = topi.multiply(x_square, res)
    return res
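# Illustration (not part of the original kernel): the docstring series written out directly in
# plain Python, asin(x) ~ sum over n of (2n-1)!!/((2n)!! * (2n+1)) * x^(2n+1) truncated at x^15;
# the COEF table presumably holds these coefficients for the Horner loop above.
import math

def _asin_taylor15_sketch(x):
    total, coef = 0.0, 1.0                  # coef tracks (2n-1)!! / (2n)!!
    for n in range(8):                      # terms x^1 .. x^15
        total += coef / (2 * n + 1) * x ** (2 * n + 1)
        coef *= (2 * n + 1) / (2 * n + 2)
    return total

assert abs(_asin_taylor15_sketch(0.5) - math.asin(0.5)) < 1e-6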
def _update_var(decay_gm, alpha, lr, grad, var):
    """Update var_out = var - lr * (alpha + decay_gm) * grad."""
    decay_gm_alpha = topi.add(decay_gm, alpha)
    res = topi.multiply(decay_gm_alpha, lr)
    res = topi.multiply(res, grad)
    res_neg = topi.multiply(res, tvm.const(-1, res.dtype))
    var_out = topi.add(var, res_neg)
    return var_out
def _mean(data, axis, cof, shape):
    """Scale data by cof divided by the reduced element count, then sum over axis (i.e. cof * mean)."""
    size = 1
    for i, _ in enumerate(axis):
        size = size * shape[axis[i]]
    cof = cof / tvm.const(size, "float32")
    tmp = topi.multiply(data, cof)
    res = topi.sum(tmp, axis)
    return res
def xlogy_grad_compute(placeholders, shape_max, dtype, rx, ry):
    """
    Do element-wise xlogy_grad compute.

    Args:
        placeholders (Union[list, tuple]): the placeholders of data input
        shape_max (Union[list, tuple]): the shape of broadcast
        dtype (string): the type of data input
        rx (list): the reduction indices of data input with broadcast
        ry (list): the reduction indices for data input with broadcast

    Returns:
        output_y1 (tvm.tensor.Tensor): result of xlogy_grad
        output_y2 (tvm.tensor.Tensor): result of xlogy_grad
    """
    x1_ori = placeholders[0]
    x2_ori = placeholders[1]
    grad_ori = placeholders[2]
    if dtype == "float16":
        x1 = akg.lang.cce.cast_to(x1_ori, "float32")
        x2 = akg.lang.cce.cast_to(x2_ori, "float32")
        grad = akg.lang.cce.cast_to(grad_ori, "float32")
        x1 = akg.lang.cce.broadcast(x1, shape_max)
        x2 = akg.lang.cce.broadcast(x2, shape_max)
        grad = akg.lang.cce.broadcast(grad, shape_max)
    else:
        x1 = akg.lang.cce.broadcast(x1_ori, shape_max)
        x2 = akg.lang.cce.broadcast(x2_ori, shape_max)
        grad = akg.lang.cce.broadcast(grad_ori, shape_max)

    esp_min = tvm.const(1.18e-38, dtype="float32")
    x1_addespmin = akg.lang.cce.vadds(x1, esp_min)

    if utils.product_is_mini():
        not_zero_x1 = akg.lang.cce.vmul(x1, reciprocal(x1_addespmin))
        log_x2 = tvm.compute(
            x2.shape,
            lambda *i: (tvm.log(x2(*i).astype("float16"))).astype("float32"),
            name="log_x2")
    else:
        not_zero_x1 = div(x1, x1_addespmin)
        log_x2 = akg.lang.cce.vlog(x2)

    partial_x1 = akg.lang.cce.vmul(not_zero_x1, log_x2)
    partial_x1g = akg.lang.cce.vmul(partial_x1, grad)

    partial_x2 = div(x1, x2) if not utils.product_is_mini() else \
        akg.lang.cce.vmul(x1, reciprocal(x2))
    partial_x2g = akg.lang.cce.vmul(partial_x2, grad)

    output_y1 = akg.lang.cce.sum(partial_x1g, rx, keepdims=True)
    output_y2 = akg.lang.cce.sum(partial_x2g, ry, keepdims=True)

    if dtype == "float16":
        output_y1 = akg.lang.cce.cast_to(output_y1, "float16")
        output_y2 = akg.lang.cce.cast_to(output_y2, "float16")
    return output_y1, output_y2
def fake_quant_with_min_max_args(input_data, min_=-6, max_=6, num_bits=8, narrow_range=False):
    """
    Fake-quantize the 'input_data' tensor of type float32 to an 'output_data' tensor of the same type.

    output_data = (floor(clamped_shifted * inv_nudged_scale + 0.5)) * scale + nudged_min
    scale = (max - min) / (quant_max - quant_min)

    Args:
        input_data (tvm.tensor.Tensor): Tensor of dtype "float32"
        min_ ([float, int]): scalar, defaults to -6
        max_ ([float, int]): scalar, defaults to 6. [min_, max_] defines the clamping range for input_data
        num_bits ([float, int]): defaults to 8. num_bits is the bitwidth of the quantization, between 2 and 16
        narrow_range ([bool]):
            True: quantized into the quantization range [1, 2^num_bits - 1]
            False: quantized into the quantization range [0, 2^num_bits - 1]

    Returns:
        tvm.tensor.Tensor
    """
    shape = get_shape(input_data)
    utils.check_shape(shape)

    dtype = input_data.dtype
    utils.ops_dtype_check(dtype, utils.DtypeForDavinci.FLOAT32)

    nudged_min, nudged_max, scale = nudge_min_max(min_, max_, num_bits, narrow_range)

    zero_tensor = tvm.compute(input_data.shape,
                              lambda *i: tvm.const(0, dtype="float32"),
                              name="zero_tensor")
    nudged_max_tensor = topi.add(zero_tensor, nudged_max)
    nudged_min_tensor = topi.add(zero_tensor, nudged_min)
    inv_nudged_scale = 1.00 / scale

    # Transform the input between nudged_max and nudged_min
    clamped_vmin = topi.minimum(input_data, nudged_max_tensor)
    clamped = topi.maximum(clamped_vmin, nudged_min_tensor)

    # Calculate the quantized and dequantized results
    clamped_shifted = topi.subtract(clamped, nudged_min_tensor)
    vmul_shifted = topi.multiply(clamped_shifted, inv_nudged_scale)
    vadds_shifted = topi.add(vmul_shifted, 0.5)
    floor_vadds_shifted = floor(vadds_shifted)
    floor_cast = akg.lang.ascend.cast_to(floor_vadds_shifted, dtype)
    res_scale = topi.multiply(floor_cast, scale)
    res = topi.add(res_scale, nudged_min_tensor)

    return res
def TensorcoreConv(data, weight, stride=[1, 1], pad=[0, 0, 0, 0], dilation=[1, 1],
                   out_dtype="float32", name="out", target=utils.CUDA):
    """Convolution (data in NHWC layout, weight in OHWI layout) for tensor core targets."""
    batch, in_h, in_w, in_c = data.shape
    out_c, k_h, k_w, _ = weight.shape
    pad_top, pad_bottom, pad_left, pad_right = pad
    s_h, s_w = stride
    d_h, d_w = dilation
    k_h_d = (k_h - 1) * d_h + 1
    k_w_d = (k_w - 1) * d_w + 1
    o_h = (in_h + pad_top + pad_bottom - k_h_d) // s_h + 1
    o_w = (in_w + pad_left + pad_right - k_w_d) // s_w + 1

    has_pad = not (pad_left == 0 and pad_right == 0 and pad_top == 0 and pad_bottom == 0)

    if has_pad:
        data_pad = tvm.compute(
            (batch, in_h + pad_top + pad_bottom, in_w + pad_left + pad_right, in_c),
            lambda n, h, w, i: tvm.if_then_else(
                tvm.all(h >= pad_top, h - pad_bottom < in_h, w >= pad_left, w - pad_right < in_w),
                data[n, h - pad_top, w - pad_left, i],
                tvm.const(0.0, "float16"),
            ),
            name="Pad",
        )
    else:
        data_pad = data

    rc = tvm.reduce_axis((0, in_c), name="rc")
    rh = tvm.reduce_axis((0, k_h), name="rh")
    rw = tvm.reduce_axis((0, k_w), name="rw")

    if out_dtype == "float32":
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(
                data_pad[n, (h * s_h + rh * d_h), (w * s_w + rw * d_w), rc].astype("float32") *
                weight[o, rh, rw, rc].astype("float32"),
                axis=[rc, rh, rw]),
            name=name)
    else:
        out = tvm.compute(
            (batch, o_h, o_w, out_c),
            lambda n, h, w, o: tvm.sum(
                data_pad[n, (h * s_h + rh * d_h), (w * s_w + rw * d_w), rc] *
                weight[o, rh, rw, rc],
                axis=[rc, rh, rw]),
            name=name)

    return out
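# Illustration (not part of the original kernel): the output-size arithmetic above on concrete,
# arbitrarily chosen numbers, with the dilated kernel extent k_d = (k - 1) * d + 1 and
# o = (in + pad_before + pad_after - k_d) // s + 1.
_in_h, _pad_top, _pad_bottom, _k_h, _d_h, _s_h = 28, 1, 1, 3, 1, 1
_k_h_d = (_k_h - 1) * _d_h + 1                                   # 3
_o_h = (_in_h + _pad_top + _pad_bottom - _k_h_d) // _s_h + 1     # 28: "same" output for 3x3, stride 1
assert _o_h == 28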