def my_dsl(dtype, kernel_name, attrs, insn, insn_type="binary"):
    """Build a simple elementwise CCE kernel for the operation named by `insn`."""
    m = tvm.var("M")
    A = tvm.placeholder((m,), name="A", dtype=dtype)
    B = tvm.placeholder((m,), name="B", dtype=dtype)
    if insn == "add":
        C = topi.add(A, B)
    elif insn == "sub":
        C = topi.subtract(A, B)
    elif insn == "mul":
        C = topi.multiply(A, B)
    elif insn == "div":
        C = topi.divide(A, B)
    elif insn == "max":
        C = topi.maximum(A, B)
    elif insn == "min":
        C = topi.minimum(A, B)
    elif insn == "abs":
        C = tvm.compute(A.shape, lambda *index: tvm.abs(A(*index)), name='C')
    elif insn == "exp":
        C = topi.exp(A)
    elif insn == "log":
        C = topi.log(A)
    elif insn == "sqrt":
        C = topi.sqrt(A)
    elif insn == "adds":
        C = A + tvm.const(2, dtype)
    elif insn == "muls":
        C = A * tvm.const(2, dtype)
    else:
        raise ValueError("unsupported insn: %s" % insn)
    # C = tvm.compute((m, ), lambda i: A[i] + B[i], name="C")
    s = tvm.create_schedule([C.op])
    with akg.build_config(add_lower_pass=cce.debug_mode(0), dump_pass_ir=True):
        if insn_type == "binary":
            mod = akg.build(s, [A, B, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
        else:
            mod = akg.build(s, [A, C], "cce", name=kernel_name, attrs=attrs, polyhedral=True)
    return mod
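# Usage sketch (hypothetical, not from the original module): build an
# elementwise "add" kernel with my_dsl. The kernel name "vector_add_demo"
# and the empty attrs dict are placeholders; the module-level imports
# (akg, its bundled tvm/topi, and the cce helper providing debug_mode)
# are assumed to be available as in the surrounding file.
def _build_add_kernel_example(dtype="float16"):
    attrs = {}  # backend-specific build attributes, left empty for the sketch
    mod = my_dsl(dtype, "vector_add_demo", attrs, insn="add", insn_type="binary")
    return mod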
def _asin_grad_compute(x, dy):
    """Compute asin_grad."""
    dtype = x.dtype
    if dtype == "float16":
        x = topi.cast(x, "float32")
        dy = topi.cast(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - x^2
    data = topi.multiply(x, x)
    data = topi.multiply(data, tvm.const(-1, "float32"))
    num_to_vrsqrt = topi.add(data, tvm.const(1, "float32"))

    # step 2: calculate dy * (1 / sqrt(1 - x^2))
    if utils.product_is_mini():
        # mini: use newton's method for high accuracy result
        res = _vrsqrt_newton(num_to_vrsqrt)
        res = topi.multiply(res, dy)
    else:
        # cloud: use vdiv for high efficiency computation
        vsqrt_res = topi.sqrt(num_to_vrsqrt)
        res = topi.divide(dy, vsqrt_res)

    if dtype == "float16":
        res = topi.cast(res, "float16")

    return res
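# Reference only (hypothetical helper, not used by the kernel above): the
# derivative of asin is d/dx asin(x) = 1 / sqrt(1 - x^2), so the expected
# output of _asin_grad_compute is dy / sqrt(1 - x^2). A plain numpy version
# for host-side correctness checks:
def _asin_grad_expect(x_np, dy_np):
    """numpy reference for asin_grad: dy / sqrt(1 - x^2)."""
    import numpy as np
    return dy_np / np.sqrt(1.0 - x_np * x_np)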
def asinh(x, target=utils.CCE):
    r"""
    Compute asinh function.

    .. math:: asinh(x) = log(x+\sqrt{x*x+1})

    Args:
        x (tvm.tensor.Tensor): Tensor of type float16, float32.

    Returns:
        tvm.tensor.Tensor, has the same type and shape as x.

    Supported Platforms:
        'Ascend'
    """
    # check shape
    utils.check_shape(x)

    # check input tensor data_type
    utils.ops_dtype_check(x.dtype, utils.DtypeForDavinci.ALL_FLOAT)
    dtype = x.dtype

    # It is known that asinh(x) = log(x + sqrt(x*x+1)) and asinh(-x) = -asinh(x).
    # If x is a large negative number, (x + sqrt(x*x+1)) will be close to zero.
    # So, asinh(x) = sign(x) * log(|x| + sqrt(|x|*|x| + 1))
    compute_dtype = dtype
    if dtype == "float16":
        # To avoid overflow and improve accuracy, x is cast to float32
        compute_dtype = "float32"
        x = topi.cast(x, compute_dtype)

    x_abs = topi.abs(x)

    if product_is_mini():
        # sqrt(|x|*|x| + 1) = |x| * sqrt(1 + 1/(|x|*|x|))
        vsquare_add_one = topi.add(1, topi.divide(1, topi.multiply(x_abs, x_abs)))
        sqrt_compute_value = sqrt_mini_newton_iter_impl(vsquare_add_one)
        sqrt_value = topi.multiply(x_abs, sqrt_compute_value)
    else:
        x_abs_square_add_one = topi.add(topi.multiply(x_abs, x_abs), 1)
        sqrt_value = topi.sqrt(x_abs_square_add_one)

    x_add_sqrt = topi.add(x_abs, sqrt_value)

    if product_is_mini():
        log_value = log_compute_mini_impl(x_add_sqrt, target)
    else:
        log_value = topi.log(x_add_sqrt)

    res = topi.multiply(Sign(x, target), log_value)

    if res.dtype != dtype:
        res = topi.cast(res, dtype)

    if product_is_mini():
        attrs = {"enable_auto_inline": False}
        return res, attrs
    return res
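# Sanity-check sketch (hypothetical): the formulation above relies on the
# identity asinh(x) = sign(x) * log(|x| + sqrt(x*x + 1)), which agrees with
# numpy's arcsinh for finite inputs.
def _asinh_expect(x_np):
    """numpy reference for asinh via the sign/log reformulation."""
    import numpy as np
    return np.sign(x_np) * np.log(np.abs(x_np) + np.sqrt(x_np * x_np + 1.0))
# e.g. numpy.allclose(_asinh_expect(x), numpy.arcsinh(x)) is expected to hold.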
def _sinh_2x(sinh_x):
    """sinh(2x) = 2*sinh(x)*sqrt(sinh(x)^2+1)"""
    sinh_x_square = topi.multiply(sinh_x, sinh_x)
    sinh_x_square_add_one = topi.add(sinh_x_square, 1)
    sqrt_value = topi.sqrt(sinh_x_square_add_one)
    sinh_x_mul_sqrt_value = topi.multiply(sinh_x, sqrt_value)
    sinh_2x = topi.multiply(2, sinh_x_mul_sqrt_value)
    return sinh_2x
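# Identity behind _sinh_2x (hypothetical numpy check): cosh(x) = sqrt(sinh(x)^2 + 1),
# hence sinh(2x) = 2*sinh(x)*cosh(x) = 2*sinh(x)*sqrt(sinh(x)^2 + 1).
def _sinh_2x_expect(x_np):
    """numpy reference for the sinh double-angle identity."""
    import numpy as np
    s = np.sinh(x_np)
    return 2.0 * s * np.sqrt(s * s + 1.0)
# For moderate inputs, numpy.allclose(_sinh_2x_expect(x), numpy.sinh(2 * x)) should hold.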
def _sqrt(data):
    """Calculate sqrt using three Newton iterations (mini) or vsqrt (cloud)."""
    if utils.product_is_mini():
        data_sqrt = topi.rsqrt(data)
        data_sqrt = _newton_iter(data, data_sqrt)
        data_sqrt = _newton_iter(data, data_sqrt)
        data_sqrt = _newton_iter(data, data_sqrt)
        return topi.multiply(data, data_sqrt)
    else:
        return topi.sqrt(data)
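# Newton-iteration sketch (hypothetical, numpy): a common rsqrt refinement step
# is y <- y * (3 - x*y*y) / 2. Starting from a coarse rsqrt estimate, a few such
# steps recover precision, and sqrt(x) = x * rsqrt(x), which mirrors the mini
# branch above. The exact step used by _newton_iter may differ.
def _rsqrt_newton_ref(x_np, steps=3):
    """numpy reference: refine 1/sqrt(x) with Newton steps, then form sqrt(x)."""
    import numpy as np
    y = 1.0 / np.sqrt(x_np)  # stands in for the hardware's coarse rsqrt estimate
    for _ in range(steps):
        y = y * (3.0 - x_np * y * y) * 0.5
    return x_np * y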
def LambApplyOptimizerAssign(grad, input_v, input_m, input_param, beta_1, one_minus_beta_1,
                             beta_2, one_minus_beta_2, epsilon, steps, do_use_weight,
                             weight_decay_rate):
    # compute next_v
    square_grad = topi.multiply(grad, grad)

    # mul_3
    mul_3_result = topi.multiply(square_grad, one_minus_beta_2)

    # mul_2
    mul_2_result = topi.multiply(input_v, beta_2)

    # compute: next_v = (multiply(self.beta_2, v) + multiply(1.0 - self.beta_2, square(grad)))
    next_v = topi.add(mul_2_result, mul_3_result)

    # compute next_m
    mul_0_result = topi.multiply(input_m, beta_1)

    # mul_1
    mul_1_result = topi.multiply(grad, one_minus_beta_1)

    # compute: next_m = (multiply(self.beta_1, m) + multiply(1.0 - self.beta_1, grad))
    next_m = topi.add(mul_0_result, mul_1_result)

    const_one = akg.tvm.const(1.0, input_v.dtype)

    # compute: beta1_correction = (1 - self.beta_1 ** steps)
    beta_1_steps = pow_compute(beta_1, steps, grad)
    neg_beta_1_step = neg(beta_1_steps, utils.CCE)
    beta1_correction = topi.add(neg_beta_1_step, const_one)

    # compute: beta2_correction = (1 - self.beta_2 ** steps)
    beta_2_steps = pow_compute(beta_2, steps, grad)
    neg_beta_2_step = neg(beta_2_steps, utils.CCE)
    beta2_correction = topi.add(neg_beta_2_step, const_one)

    # compute: next_m_unbiased = next_m / beta1_correction
    next_m_unbiased = Divide(next_m, beta1_correction, utils.CCE)
    # compute: next_v_unbiased = next_v / beta2_correction
    next_v_unbiased = Divide(next_v, beta2_correction, utils.CCE)

    # compute update
    sqrt_next_v = topi.sqrt(next_v_unbiased)
    # add_2
    add_2_result = topi.add(sqrt_next_v, epsilon)
    # compute: update = next_m / (sqrt(next_v) + self.epsilon)
    update = Divide(next_m_unbiased, add_2_result, utils.CCE)

    # compute do_use_weight_decay
    do_use_weight_mul = topi.multiply(input_param, weight_decay_rate)
    do_use_weight_decay = topi.multiply(do_use_weight_mul, do_use_weight)
    update = topi.add(do_use_weight_decay, update)

    attrs = {'enable_auto_inline': False}
    dim_info, _ = lamb_apply_optimizer_assign_set_dim_func(grad)
    if dim_info != "":
        attrs["dim"] = dim_info

    return update, next_v, next_m, attrs
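# Plain numpy sketch (hypothetical) of the same update rule, for reference:
#   next_v = beta2 * v + (1 - beta2) * grad^2
#   next_m = beta1 * m + (1 - beta1) * grad
#   m_hat  = next_m / (1 - beta1^steps),  v_hat = next_v / (1 - beta2^steps)
#   update = m_hat / (sqrt(v_hat) + epsilon) + do_use_weight * weight_decay_rate * param
def _lamb_update_expect(grad, v, m, param, beta1, beta2, epsilon, steps,
                        do_use_weight, weight_decay_rate):
    """numpy reference mirroring LambApplyOptimizerAssign's outputs."""
    import numpy as np
    next_v = beta2 * v + (1.0 - beta2) * grad * grad
    next_m = beta1 * m + (1.0 - beta1) * grad
    m_hat = next_m / (1.0 - beta1 ** steps)
    v_hat = next_v / (1.0 - beta2 ** steps)
    update = m_hat / (np.sqrt(v_hat) + epsilon)
    update = update + do_use_weight * weight_decay_rate * param
    return update, next_v, next_m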