def _compute_process(input_list):
    var, m, lr, logbase, sign_decay, beta, grad = (
        input_list[0], input_list[1], input_list[2], input_list[3],
        input_list[4], input_list[5], input_list[6])

    m_t = _compute_m_t(m, beta, grad)
    sign_gm = te.lang.cce.vmul(sign(m_t), sign(grad))
    update = _compute_update(logbase, sign_decay, sign_gm, grad)
    var_t = _compute_var(var, lr, update)

    return var_t, m_t
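# Reference sketch (illustration only, not part of the operator): the helpers
# _compute_m_t, _compute_update and _compute_var are defined elsewhere, so the
# NumPy version below only illustrates the PowerSign-style update this pipeline
# appears to implement; the exact formulas are assumptions for illustration.
import numpy as np


def _power_sign_reference(var, m, lr, logbase, sign_decay, beta, grad):
    """Plain-NumPy sketch of a PowerSign-style update (assumed formulas)."""
    m_t = beta * m + (1.0 - beta) * grad                    # assumed moving average of grad
    sign_gm = np.sign(m_t) * np.sign(grad)                  # agreement between m_t and grad
    update = np.exp(logbase * sign_decay * sign_gm) * grad  # assumed update rule
    var_t = var - lr * update                               # gradient-descent style step
    return var_t, m_t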
def _compute_positive(prox_v, alpha_broad, l1_broad, l2_broad):
    """
    the operator's compute
    var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}

    Parameters:
    ----------
    prox_v: the value of prox_v
    alpha_broad: the value of alpha_broad
    l1_broad: the value of l1_broad
    l2_broad: the value of l2_broad

    Returns
    -------
    the value of var_res
    """
    prox_v_abs = te.lang.cce.vabs(prox_v)
    prox_v_sign = sign(prox_v)

    # 1 + alpha*l2
    alpha_l2 = te.lang.cce.vmul(alpha_broad, l2_broad)
    alpha_l2_1 = te.lang.cce.vadds(alpha_l2, tvm.const(CONST_ONE, "float32"))

    # max{|prox_v| - alpha*l1, 0}
    alpha_l1 = te.lang.cce.vmul(alpha_broad, l1_broad)
    alpha_l1_neg = te.lang.cce.vmuls(alpha_l1,
                                     tvm.const(CONST_ONE_NEG, "float32"))
    prox_v_l1 = te.lang.cce.vadd(prox_v_abs, alpha_l1_neg)
    max_value = te.lang.cce.vmax(
        prox_v_l1,
        te.lang.cce.broadcast(tvm.const(CONST_ZERO, "float32"), prox_v.shape))

    # sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
    res = te.lang.cce.vdiv(prox_v_sign, alpha_l2_1)
    var_res = te.lang.cce.vmul(res, max_value)

    return var_res
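# Reference sketch (illustration only): the same formula in plain NumPy,
# var = sign(prox_v) / (1 + alpha*l2) * max(|prox_v| - alpha*l1, 0).
# The helper name below is hypothetical.
import numpy as np


def _compute_positive_reference(prox_v, alpha, l1, l2):
    """Element-wise NumPy version of the l1 > 0 branch of the proximal update."""
    shrunk = np.maximum(np.abs(prox_v) - alpha * l1, 0.0)   # max{|prox_v| - alpha*l1, 0}
    return np.sign(prox_v) / (1.0 + alpha * l2) * shrunk    # sign(prox_v)/(1+alpha*l2) * shrunk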
def bessel_i1e_compute(x, y, kernel_name="bessel_i1e"):
    """
    Algorithm:
    I0 = 1 + ( (z/2) / (1!) )^2 + ((z/2)^2 / (2!))^2 + ... + ((z/2)^n / (n!))^2
    I0e = I0 / exp(x)
    I1e = I0e * z / (2*(k+1))
    u = 4 * v^2
    Ive = (1 - (u-1)/(8*z) + (u-1)*(u-9)/(2! * (8*z)^2)
           - (u-1)*(u-9)*(u-25)/(3! * (8*z)^3)) / sqrt(2*pi*z)

    Parameters
    ----------
    x: the placeholder of data input

    y: the dict of output

    kernel_name: cce kernel name, default value is "bessel_i1e"

    Returns
    -------
    A tensor. Has the same type as x.
    """
    shape_input = x.shape
    dtype_input = x.dtype

    # choose the compute dtype: cast float16 input to float32 when supported
    if dtype_input == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    abs_data = te.lang.cce.vabs(x)

    broad_const_limit = te.lang.cce.broadcast(tvm.const(CONST_LIMIT, x.dtype),
                                              shape_input)
    before_res = _before_res_compute(abs_data, broad_const_limit)
    after_res = _after_res_compute(abs_data, broad_const_limit)

    if abs_data.dtype == before_res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", abs_data.dtype):
        res = te.lang.cce.vcmpsel(abs_data, broad_const_limit, 'lt',
                                  before_res, after_res)
    else:
        select_index = te.lang.cce.vcmp(abs_data, broad_const_limit, 'lt')
        res = te.lang.cce.vsel(select_index, before_res, after_res)

    data_sign = util_compute.sign(x)
    res = te.lang.cce.vmul(res, data_sign)

    if dtype_input == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
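# Reference check (illustration only): SciPy exposes the exponentially scaled
# modified Bessel function of order one as scipy.special.i1e, which is the
# quantity this kernel approximates piecewise around CONST_LIMIT.
import numpy as np
from scipy.special import i1e


def _bessel_i1e_reference(x):
    """NumPy/SciPy reference for I1e(x) = I1(x) * exp(-|x|)."""
    return i1e(np.asarray(x, dtype=np.float64))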
def atan_compute(x, y, kernel_name="atan"):
    """
    Algorithm: atan

    ----------------------------------
    Parameters:

    x: Input data

    y : the dict of output

    kernel_name: cce kernel name, default value is "atan"

    ----------------------------------
    Returns:

    A Tensor of atan(x).
    """
    dtype = x.dtype
    shape = x.shape

    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    abs_data = te.lang.cce.vabs(x)
    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, x.dtype), shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # calculate data less than one
    res = _do_taylor(abs_data)

    # calculate data more than one
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
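# Reference sketch (illustration only) of the range reduction used above:
# for |x| > 1 the identity atan(|x|) = pi/4 + atan((|x| - 1) / (|x| + 1)) maps
# the argument back into a region where the Taylor series converges, and the
# sign of x is restored at the end. np.arctan stands in for _do_taylor here.
import numpy as np


def _atan_reference(x, taylor=np.arctan):
    """Sketch of the branch selection; taylor is a stand-in for _do_taylor."""
    ax = np.abs(x)
    reduced = np.abs((ax - 1.0) / (ax + 1.0))     # reduced argument in [0, 1)
    res_direct = taylor(ax)                       # direct series, valid for |x| <= 1
    res_reduced = np.pi / 4.0 + taylor(reduced)   # reduced series, valid for |x| > 1
    # taking the minimum picks the valid branch, mirroring the vmin above
    return np.sign(x) * np.minimum(res_direct, res_reduced)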
def _atan_compute(input_x):
    """
    Algorithm: atan

    ----------------------------------
    Parameters:

    input_x: Input data.

    ----------------------------------
    Returns:

    A Tensor of atan(x).
    """
    shape = input_x.shape
    dtype = input_x.dtype

    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")

    abs_data = te.lang.cce.vabs(input_x)
    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, input_x.dtype),
                                       shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # calculate data less than one
    res = _do_taylor(abs_data)

    # calculate data more than one
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(input_x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
def asinh_compute_cloud(input_x, output_y, kernel_name="asinh"):
    """
    algorithm: asinh(x) = log(x + sqrt(x^2 + 1))

    Parameters
    ----------
    input_x: the placeholder of data input

    output_y : the dict of output

    kernel_name : cce kernel name, default value is "asinh"

    Returns
    -------
    res : result of asinh
    """
    inp_dtype = input_x.dtype.lower()
    has_improve_precision = False
    if inp_dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vlog",
                                                    "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        has_improve_precision = True
        inp_dtype = "float32"

    data_abs = te.lang.cce.vabs(input_x)
    data_x_square = te.lang.cce.vmul(data_abs, data_abs)
    data_add = te.lang.cce.vadds(data_x_square, tvm.const(CONST_ONE, inp_dtype))
    data_s_1_sqrt = te.lang.cce.vsqrt(data_add)
    data_res = te.lang.cce.vadd(data_s_1_sqrt, data_abs)
    result = te.lang.cce.vlog(data_res)
    res = te.lang.cce.vmul(result, sign(input_x))

    if has_improve_precision:
        res = te.lang.cce.cast_to(res, "float16")

    return res
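# Reference sketch (illustration only) of the formula in the docstring:
# asinh(x) = sign(x) * log(|x| + sqrt(x^2 + 1)). Working on |x| keeps the
# logarithm argument >= 1; the sign is restored afterwards, as above.
import numpy as np


def _asinh_reference(x):
    """Plain-NumPy version of the asinh formula used above."""
    ax = np.abs(x)
    return np.sign(x) * np.log(ax + np.sqrt(ax * ax + 1.0))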
def apply_ftrl_d_compute(var, accum, linear, grad, lr, l1, l2, lr_power,
                         var_out, accum_out, linear_out,
                         kernel_name='apply_ftrl_d'):
    """
    Update '*var' according to the Ftrl-proximal algorithm.

    accum_new = accum + grad * grad
    linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
    x = l1 * linear.sign - linear
    y = accum_new^(-lr_power) / lr + 2 * l2
    var = x / y if |linear| > l1 else 0.0
    accum = accum_new

    Parameters:
    ----------
    var : mutable tensor var.

    accum: mutable tensor accum.

    linear : mutable tensor linear.

    grad : tensor grad.

    lr : scalar lr.

    l1 : scalar l1.

    l2 : scalar l2.

    lr_power : scalar lr_power.

    var_out : the dict of var output.

    accum_out : the dict of accum output.

    linear_out : the dict of linear output.

    kernel_name : cce kernel name, default value is "apply_ftrl_d" (optional).

    Returns:
    -------
    None
    """
    # cast to float32 for higher accuracy
    dtype = var.dtype
    has_improve_precision = False
    if dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vexp",
                                                    "float32"):
        var_tmp = te.lang.cce.cast_to(var, "float32")
        accum_tmp = te.lang.cce.cast_to(accum, "float32")
        linear_tmp = te.lang.cce.cast_to(linear, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        lr_power = te.lang.cce.cast_to(lr_power, "float32")
        has_improve_precision = True
    else:
        var_tmp = te.lang.cce.vadds(var, tvm.const(NUM_ZERO, dtype))
        accum_tmp = te.lang.cce.vadds(accum, tvm.const(NUM_ZERO, dtype))
        linear_tmp = te.lang.cce.vadds(linear, tvm.const(NUM_ZERO, dtype))

    # broadcast scalar to appropriate shape
    zero_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, var_tmp.dtype),
                                        var.shape)
    lr = te.lang.cce.broadcast(lr, var.shape)
    l1 = te.lang.cce.broadcast(l1, var.shape)
    l2 = te.lang.cce.broadcast(l2, var.shape)
    lr_power = te.lang.cce.broadcast(lr_power, var.shape)

    # 1.accum_new = accum + grad^2
    gs = te.lang.cce.vmul(grad, grad)
    accum_new = te.lang.cce.vadd(accum_tmp, gs)

    # 2.linear += grad - (accum_new^(-lr_power)-accum^(-lr_power))/lr*var
    lr_power = te.lang.cce.vmuls(lr_power, tvm.const(NUM_M_ONE, var_tmp.dtype))
    accum_new_p = _pow(accum_new, lr_power, zero_tensor)
    accum_p = _pow(accum_tmp, lr_power, zero_tensor)
    accum_p = te.lang.cce.vsub(accum_new_p, accum_p)
    accum_p = te.lang.cce.vdiv(accum_p, lr)
    accum_p = te.lang.cce.vmul(accum_p, var_tmp)
    accum_p = te.lang.cce.vsub(grad, accum_p)
    linear_t = te.lang.cce.vadd(linear_tmp, accum_p)

    # 3.x_res = l1*linear.sign()-linear
    x_res = sign(linear_t)
    x_res = te.lang.cce.vmul(x_res, l1)
    x_res = te.lang.cce.vsub(x_res, linear_t)

    # 4.y_res = accum_new^(-lr_power)/lr + 2*l2
    l2 = te.lang.cce.vmuls(l2, tvm.const(NUM_TWO, var_tmp.dtype))
    y_res = te.lang.cce.vdiv(accum_new_p, lr)
    y_res = te.lang.cce.vadd(y_res, l2)

    # 5.var = x_res / y_res if linear.abs > l1, else var = 0
    x_res = te.lang.cce.vdiv(x_res, y_res)
    linear_abs = te.lang.cce.vabs(linear_t)
    var_sel = te.lang.cce.vcmp(linear_abs, l1, 'gt')
    var_t = te.lang.cce.vsel(var_sel, x_res, zero_tensor)
    # result of vsel is fp16, should cast to fp32
    var_t = te.lang.cce.cast_to(var_t, "float32")

    if has_improve_precision:
        var_t = te.lang.cce.cast_to(var_t, "float16")
        accum_new = te.lang.cce.cast_to(accum_new, "float16")
        linear_t = te.lang.cce.cast_to(linear_t, "float16")

    # 6.var_output_data = var_t
    var_output_data = te.lang.cce.vadds(var_t,
                                        tvm.const(NUM_ZERO, var_t.dtype))
    accum_output_data = te.lang.cce.vadds(accum_new,
                                          tvm.const(NUM_ZERO, accum_new.dtype))
    linear_output_data = te.lang.cce.vadds(linear_t,
                                           tvm.const(NUM_ZERO, linear_t.dtype))

    def _compute(*index):
        return var_t(*index), accum_new(*index), linear_t(*index), \
               var_output_data(*index), accum_output_data(*index), \
               linear_output_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
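# Reference sketch (illustration only) of the FTRL-proximal update from the
# docstring above, with scalar hyper-parameters and no dtype handling:
import numpy as np


def _ftrl_reference(var, accum, linear, grad, lr, l1, l2, lr_power):
    """Plain-NumPy version of the update equations listed in the docstring."""
    accum_new = accum + grad * grad
    linear = linear + grad \
        - (accum_new ** (-lr_power) - accum ** (-lr_power)) / lr * var
    x = l1 * np.sign(linear) - linear
    y = accum_new ** (-lr_power) / lr + 2.0 * l2
    var = np.where(np.abs(linear) > l1, x / y, 0.0)
    return var, accum_new, linear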
def apply_ftrl_v2_d_compute(var, accum, linear, grad, lr, l1, l2, l2_shrinkage,
                            lr_power, var_out, accum_out, linear_out,
                            kernel_name='apply_ftrl_v2_d'):
    """
    Update '*var' according to the Ftrl-proximal algorithm.

    grad_with_shrinkage = grad + 2 * l2_shrinkage * var
    accum_new = accum + grad * grad
    linear += grad_with_shrinkage -
              (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
    x = l1 * linear.sign - linear
    y = accum_new^(-lr_power) / lr + 2 * l2
    var = x / y if |linear| > l1 else 0.0
    accum = accum_new

    Parameters:
    ----------
    var : mutable tensor var.

    accum: mutable tensor accum.

    linear : mutable tensor linear.

    grad : tensor grad.

    lr : scalar lr.

    l1 : scalar l1.

    l2 : scalar l2.

    l2_shrinkage: scalar l2_shrinkage.

    lr_power : scalar lr_power.

    var_out : the dict of output var.

    accum_out : the dict of output accum.

    linear_out : the dict of output linear.

    kernel_name : cce kernel name, default value is "apply_ftrl_v2_d".

    Returns:
    -------
    the value of var_new, accum_new, linear_new, output_data
    """
    dtype = var.dtype

    # cast to float32 for higher accuracy
    has_improve_precision = False
    if dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vexp",
                                                    "float32"):
        var_tmp = te.lang.cce.cast_to(var, "float32")
        accum_tmp = te.lang.cce.cast_to(accum, "float32")
        linear_tmp = te.lang.cce.cast_to(linear, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        l2_shrinkage = te.lang.cce.cast_to(l2_shrinkage, "float32")
        lr_power = te.lang.cce.cast_to(lr_power, "float32")
        has_improve_precision = True
    else:
        var_tmp = te.lang.cce.vadds(var, tvm.const(NUM_ZERO, "float32"))
        accum_tmp = te.lang.cce.vadds(accum, tvm.const(NUM_ZERO, "float32"))
        linear_tmp = te.lang.cce.vadds(linear, tvm.const(NUM_ZERO, "float32"))

    # 1.grad_with_shrinkage = grad + 2 * l2_shrinkage * var
    mul_value = te.lang.cce.vmuls(l2_shrinkage, tvm.const(NUM_TWO, "float32"))
    mul_value = te.lang.cce.broadcast(mul_value, var_tmp.shape)
    mul_value2 = te.lang.cce.vmul(mul_value, var_tmp)
    grad_with_shrinkage = te.lang.cce.vadd(grad, mul_value2)

    # 2.accum_new = accum + grad^2
    gs = te.lang.cce.vmul(grad, grad)
    accum_new = te.lang.cce.vadd(accum_tmp, gs)

    # 3.accum_pow_sub = accum_new^(-lr_power)-accum^(-lr_power)
    lr_power = te.lang.cce.vmuls(lr_power, tvm.const(NUM_M_ONE, "float32"))
    lr_power = te.lang.cce.broadcast(lr_power, var_tmp.shape)
    accum_new_pow = _pow(accum_new, lr_power)
    accum_pow = _pow(accum_tmp, lr_power)
    accum_pow_sub = te.lang.cce.vsub(accum_new_pow, accum_pow)

    # 4.linear += grad_with_shrinkage - accum_pow_sub / lr * var
    lr = te.lang.cce.broadcast(lr, var_tmp.shape)
    accum_pow_div = te.lang.cce.vdiv(accum_pow_sub, lr)
    accum_pow_mul = te.lang.cce.vmul(accum_pow_div, var_tmp)
    accum_pow = te.lang.cce.vsub(grad_with_shrinkage, accum_pow_mul)
    linear_new = te.lang.cce.vadd(linear_tmp, accum_pow)

    # 5.x_res = l1*linear.sign()-linear
    l1 = te.lang.cce.broadcast(l1, var_tmp.shape)
    x_res = sign(linear_new)
    x_res = te.lang.cce.vmul(x_res, l1)
    x_res = te.lang.cce.vsub(x_res, linear_new)

    # 6.y_res = accum_new^(-lr_power)/lr + 2*l2
    l2 = te.lang.cce.vmuls(l2, tvm.const(NUM_TWO, "float32"))
    l2 = te.lang.cce.broadcast(l2, var_tmp.shape)
    y_res = te.lang.cce.vdiv(accum_new_pow, lr)
    y_res = te.lang.cce.vadd(y_res, l2)

    # 7.var = x_res / y_res if linear.abs > l1, else var = 0
    x_res = te.lang.cce.vdiv(x_res, y_res)
    linear_abs = te.lang.cce.vabs(linear_new)
    zero_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, "float32"),
                                        var_tmp.shape)
    var_sel = te.lang.cce.vcmp(linear_abs, l1, 'gt')
    var_new = te.lang.cce.vsel(var_sel, x_res, zero_tensor)
    # dtype after vsel is float16 at mini
    var_new = te.lang.cce.cast_to(var_new, "float32")

    if has_improve_precision:
        var_new = te.lang.cce.cast_to(var_new, "float16")
        accum_new = te.lang.cce.cast_to(accum_new, "float16")
        linear_new = te.lang.cce.cast_to(linear_new, "float16")

    # 8.output_var = var_new
    output_data = te.lang.cce.vadds(var_new,
                                    tvm.const(NUM_ZERO, var_new.dtype))
    accum_out_data = te.lang.cce.vadds(accum_new,
                                       tvm.const(NUM_ZERO, accum_new.dtype))
    linear_out_data = te.lang.cce.vadds(linear_new,
                                        tvm.const(NUM_ZERO, linear_new.dtype))

    def _compute(*index):
        return var_new(*index), accum_new(*index), \
               linear_new(*index), output_data(*index), \
               accum_out_data(*index), linear_out_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
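# Reference sketch (illustration only): the only difference from the plain
# FTRL update sketched earlier is that the linear term accumulates the
# shrinkage-adjusted gradient grad + 2 * l2_shrinkage * var instead of grad.
import numpy as np


def _ftrl_v2_reference(var, accum, linear, grad, lr, l1, l2, l2_shrinkage,
                       lr_power):
    """Plain-NumPy version of the FTRL update with l2 shrinkage."""
    grad_shr = grad + 2.0 * l2_shrinkage * var
    accum_new = accum + grad * grad
    linear = linear + grad_shr \
        - (accum_new ** (-lr_power) - accum ** (-lr_power)) / lr * var
    x = l1 * np.sign(linear) - linear
    y = accum_new ** (-lr_power) / lr + 2.0 * l2
    var = np.where(np.abs(linear) > l1, x / y, 0.0)
    return var, accum_new, linear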
def apply_proximal_adagrad_d_compute(var, accum, lr, l1, l2, grad, var_out,
                                     accum_out, use_locking=False,
                                     kernel_name="apply_proximal_adagrad"):
    """
    the operator's compute
    accum += grad * grad
    learning_rate = lr_broad * rsqrt(accum)
    prox_v = var - grad * learning_rate
    if l1 > 0:
        var = sign(prox_v)/(1+learning_rate*l2) *
              max{|prox_v|-learning_rate*l1,0}
    else:
        var = prox_v / (1+l2*learning_rate)

    Parameters
    ----------
    var: dict
        input tensor contains shape and dtype attributes.
        only support float16, float32.
    accum: dict
        input tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    lr: dict
        input tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    l1: dict
        input tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    l2: dict
        input tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    grad: dict
        input tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    var_out: dict
        output tensor contains shape and dtype attributes.
        Must have the same type as 'var'.
    accum_out: dict
        output tensor contains shape and dtype attributes.
        Must have the same type as 'accum'.
    use_locking: bool
        default value is "False"
    kernel_name: str
        kernel name, default value is "apply_proximal_adagrad"

    Returns:
    the value of var_new, accum_out, output_data, output_accum_data
    """
    dtype = var.dtype
    has_improve_precision = False
    if dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vsqrt",
                                                    "float32"):
        var = te.lang.cce.cast_to(var, "float32")
        accum = te.lang.cce.cast_to(accum, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        has_improve_precision = True

    lr_broad = te.lang.cce.broadcast(lr, var.shape)
    l1_broad = te.lang.cce.broadcast(l1, var.shape)
    l2_broad = te.lang.cce.broadcast(l2, var.shape)

    grad_2 = te.lang.cce.vmul(grad, grad)
    accum_out = te.lang.cce.vadd(accum, grad_2)
    accum_sqrt = te.lang.cce.vsqrt(accum_out)
    learning_rate = te.lang.cce.vdiv(lr_broad, accum_sqrt)
    learning_rate_grad = te.lang.cce.vmul(grad, learning_rate)
    prox_v = te.lang.cce.vsub(var, learning_rate_grad)
    l2_lr = te.lang.cce.vmul(l2_broad, learning_rate)
    l2_lr_1 = te.lang.cce.vadds(l2_lr, tvm.const(CONST_ONE, "float32"))
    prox_v_abs = te.lang.cce.vabs(prox_v)
    prox_v_sign = sign(prox_v)
    learning_rate_l1 = te.lang.cce.vmul(learning_rate, l1_broad)
    prox_v_l1 = te.lang.cce.vsub(prox_v_abs, learning_rate_l1)
    max_value = te.lang.cce.vmax(
        prox_v_l1,
        te.lang.cce.broadcast(tvm.const(CONST_ZERO, "float32"), prox_v.shape))
    var_res = te.lang.cce.vmul(prox_v_sign, max_value)
    var_new = te.lang.cce.vdiv(var_res, l2_lr_1)

    output_data = te.lang.cce.vadds(var_new, tvm.const(CONST_ZERO, "float32"))
    output_accum_data = te.lang.cce.vadds(accum_out,
                                          tvm.const(CONST_ZERO, "float32"))

    if has_improve_precision:
        var_new = te.lang.cce.cast_to(var_new, "float16")
        accum_out = te.lang.cce.cast_to(accum_out, "float16")
        output_data = te.lang.cce.cast_to(output_data, "float16")
        output_accum_data = te.lang.cce.cast_to(output_accum_data, "float16")

    # this compute is for multi output
    def _compute(*index):
        return var_new(*index), accum_out(*index), output_data(*index), \
               output_accum_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
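# Reference sketch (illustration only) of the proximal Adagrad update from the
# docstring above; the l1 > 0 branch is the one the kernel actually computes.
import numpy as np


def _proximal_adagrad_reference(var, accum, lr, l1, l2, grad):
    """Plain-NumPy version of the update equations listed in the docstring."""
    accum = accum + grad * grad
    learning_rate = lr / np.sqrt(accum)
    prox_v = var - grad * learning_rate
    if l1 > 0:
        var = (np.sign(prox_v) / (1.0 + learning_rate * l2)
               * np.maximum(np.abs(prox_v) - learning_rate * l1, 0.0))
    else:
        var = prox_v / (1.0 + learning_rate * l2)
    return var, accum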
def apply_adagrad_da_d_compute(var, gradient_accumulator,
                               gradient_squared_accumulator, grad, lr, l1, l2,
                               global_step, var_out, gradient_accumulator_out,
                               gradient_squared_accumulator_out,
                               kernel_name='apply_adagrad_da_d'):
    """
    Update '*var' according to the proximal Adagrad (dual averaging) scheme.

    grad_accum += grad
    grad_squared_accum += grad * grad
    tmp_val = sign(grad_accum) * max{|grad_accum|-l1*global_step, 0}
              if l1 > 0 else grad_accum
    x_value = -1 * lr * tmp_val
    y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    var = x_value / y_value

    Parameters:
    ----------
    var : mutable tensor var.

    gradient_accumulator: mutable tensor gradient_accumulator.

    gradient_squared_accumulator : mutable tensor gradient_squared_accumulator.

    grad : tensor grad.

    lr : scalar lr.

    l1 : scalar l1.

    l2 : scalar l2.

    global_step : scalar global_step.

    var_out : the dict of output.

    gradient_accumulator_out : the dict of output.

    gradient_squared_accumulator_out : the dict of output.

    kernel_name : cce kernel name, default value is "apply_adagrad_da_d".

    Returns:
    -------
    None
    """
    # cast to float32 for higher accuracy
    dtype = var.dtype
    has_improve_precision = False
    cast_type = var.dtype
    if dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vsqrt",
                                                    "float32"):
        cast_type = "float32"
        has_improve_precision = True

    if dtype == "float16":
        if has_improve_precision:
            var_tmp = te.lang.cce.cast_to(var, "float32")
            var_tmp = te.lang.cce.vmuls(var_tmp,
                                        tvm.const(NUM_ZERO, "float32"))
            grad_accum_tmp = te.lang.cce.cast_to(gradient_accumulator,
                                                 "float32")
            grad_sq_accum_tmp = te.lang.cce.cast_to(
                gradient_squared_accumulator, "float32")
            grad = te.lang.cce.cast_to(grad, "float32")
            lr = te.lang.cce.cast_to(lr, "float32")
            l1 = te.lang.cce.cast_to(l1, "float32")
            l2 = te.lang.cce.cast_to(l2, "float32")
        else:
            var_tmp = te.lang.cce.vmuls(var, tvm.const(NUM_ZERO, "float16"))
            grad_accum_tmp = te.lang.cce.vadds(gradient_accumulator,
                                               tvm.const(NUM_ZERO, "float16"))
            grad_sq_accum_tmp = te.lang.cce.vadds(
                gradient_squared_accumulator, tvm.const(NUM_ZERO, "float16"))
    else:
        var_tmp = te.lang.cce.vmuls(var, tvm.const(NUM_ZERO, "float32"))
        grad_accum_tmp = te.lang.cce.vadds(gradient_accumulator,
                                           tvm.const(NUM_ZERO, "float32"))
        grad_sq_accum_tmp = te.lang.cce.vadds(gradient_squared_accumulator,
                                              tvm.const(NUM_ZERO, "float32"))

    global_step = te.lang.cce.cast_to(global_step, cast_type)

    # 1.grad_accum += grad
    gradient_accum_new = te.lang.cce.vadd(grad_accum_tmp, grad)

    # 2.grad_squared_accum += grad * grad
    gs = te.lang.cce.vmul(grad, grad)
    gradient_squared_accum_new = te.lang.cce.vadd(grad_sq_accum_tmp, gs)

    # 3.tmp_val = sign(grad_accum) * max{|grad_accum|-l1*global_step, 0}
    #   if l1 > 0 else grad_accum
    sign_val = sign(gradient_accum_new)
    abs_val = te.lang.cce.vabs(gradient_accum_new)

    mul_val = te.lang.cce.vmul(global_step, l1)
    mul_val = te.lang.cce.broadcast(mul_val, var_tmp.shape)
    sub_val = te.lang.cce.vsub(abs_val, mul_val)
    zero_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, cast_type),
                                        var_tmp.shape)
    max_val = te.lang.cce.vmax(sub_val, zero_tensor)
    tmp_val = te.lang.cce.vmul(sign_val, max_val)

    l1 = te.lang.cce.broadcast(l1, var_tmp.shape)
    l1_cmp = te.lang.cce.vcmp(l1, zero_tensor, "gt")
    tmp_val = te.lang.cce.vsel(l1_cmp, tmp_val, gradient_accum_new)

    # 4.x_value = -1 * lr * tmp_val
    x_value = te.lang.cce.vmuls(lr, tvm.const(NUM_M_ONE, cast_type))
    x_value = te.lang.cce.broadcast(x_value, var_tmp.shape)
    x_value = te.lang.cce.vmul(x_value, tmp_val)

    # 5.y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    pro_val = te.lang.cce.vmul(l2, global_step)
    pro_val = te.lang.cce.vmul(pro_val, lr)
    pro_val = te.lang.cce.broadcast(pro_val, var_tmp.shape)
    sqrt_val = te.lang.cce.vsqrt(gradient_squared_accum_new,
                                 priority_flag=1.0)
    y_value = te.lang.cce.vadd(pro_val, sqrt_val)

    # 6.var = x_value / y_value
    var_t = te.lang.cce.vdiv(x_value, y_value)
    var_new = te.lang.cce.vadd(var_t, var_tmp)

    if dtype == "float16" and has_improve_precision:
        var_new = te.lang.cce.cast_to(var_new, "float16")
        gradient_accum_new = te.lang.cce.cast_to(gradient_accum_new, "float16")
        gradient_squared_accum_new = te.lang.cce.cast_to(
            gradient_squared_accum_new, "float16")

    # 7.output_data = var_new
    output_data = te.lang.cce.vadds(var_new,
                                    tvm.const(NUM_ZERO, var_new.dtype))
    res1_data = te.lang.cce.vadds(gradient_accum_new,
                                  tvm.const(NUM_ZERO, var_new.dtype))
    res2_data = te.lang.cce.vadds(gradient_squared_accum_new,
                                  tvm.const(NUM_ZERO, var_new.dtype))

    def _compute(*index):
        return var_new(*index), gradient_accum_new(*index), \
               gradient_squared_accum_new(*index), output_data(*index), \
               res1_data(*index), res2_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
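# Reference sketch (illustration only) of the Adagrad-DA update from the
# docstring above, with scalar hyper-parameters and no dtype handling:
import numpy as np


def _adagrad_da_reference(var, grad_accum, grad_squared_accum, grad,
                          lr, l1, l2, global_step):
    """Plain-NumPy version of the update equations listed in the docstring."""
    grad_accum = grad_accum + grad
    grad_squared_accum = grad_squared_accum + grad * grad
    if l1 > 0:
        tmp_val = np.sign(grad_accum) * np.maximum(
            np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp_val = grad_accum
    x_value = -1.0 * lr * tmp_val
    y_value = l2 * global_step * lr + np.sqrt(grad_squared_accum)
    var = x_value / y_value
    return var, grad_accum, grad_squared_accum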
def asin_compute(x, y, kernel_name="asin"):
    """
    do element-wise asin compute

    asin(x) = | arcsin(sqrt(1-x^2)) - HALF_PI, x belongs to (-1, -2^(-0.5))
              | the 15th order taylor expansion, x belongs to (-2^(-0.5), 2^(-0.5))
              | HALF_PI - arcsin(sqrt(1-x^2)), x belongs to (2^(-0.5), 1)

    Parameters:
    ----------
    x: the placeholder of data input

    y : the dict of output

    kernel_name : cce kernel name, default value is "asin"

    Returns
    -------
    A Tensor. Has the same type as data_input.
    """
    shape = x.shape
    dtype = x.dtype

    # Change dtype to float32
    if dtype == "float16" and \
            api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    # Sign mask
    sign = util_compute.sign(x)

    # All positive
    x = te.lang.cce.vmul(x, sign)

    # x belongs to (0, 2^(-0.5))
    if api_check_support("te.lang.cce.vmins", x.dtype):
        choice_1 = te.lang.cce.vmins(x, tvm.const(BOUNDARY_1, x.dtype))
    else:
        boundary_mask1 = te.lang.cce.broadcast(tvm.const(BOUNDARY_1, x.dtype),
                                               shape)
        choice_1 = te.lang.cce.vmin(x, boundary_mask1)

    if api_check_support("te.lang.cce.vsubs", choice_1.dtype):
        choice_1 = te.lang.cce.vsubs(choice_1,
                                     tvm.const(BOUNDARY_1, choice_1.dtype))
    else:
        boundary_mask1 = te.lang.cce.broadcast(
            tvm.const(BOUNDARY_1, choice_1.dtype), shape)
        choice_1 = te.lang.cce.vsub(choice_1, boundary_mask1)

    choice_1 = te.lang.cce.vmuls(te.lang.cce.floor(choice_1), NEG_NUM_ONE)
    res_1 = _taylor_compute(x)
    res_1 = te.lang.cce.vmul(res_1, choice_1)

    # x belongs to (2^(-0.5), 1)
    choice_2 = te.lang.cce.vmuls(choice_1, tvm.const(NEG_NUM_ONE, x.dtype))
    choice_2 = te.lang.cce.vadds(choice_2, tvm.const(NUM_ONE, x.dtype))
    res_2 = te.lang.cce.vmul(x, x)
    res_2 = te.lang.cce.vmuls(res_2, tvm.const(NEG_NUM_ONE, x.dtype))
    res_2 = te.lang.cce.vadds(res_2, tvm.const(NUM_ONE, x.dtype))
    res_2_sqrt = te.lang.cce.vsqrt(res_2)
    res_2 = _taylor_compute(res_2_sqrt, res_2)
    res_2 = te.lang.cce.vmuls(res_2, tvm.const(NEG_NUM_ONE, x.dtype))
    res_2 = te.lang.cce.vadds(res_2, tvm.const(HALF_PI, x.dtype))
    res_2 = te.lang.cce.vmul(res_2, choice_2)

    # Restore sign
    res_1 = te.lang.cce.vadd(res_1, res_2)
    res_1 = te.lang.cce.vmul(res_1, sign)

    # Restore dtype
    if dtype == "float16":
        res_1 = te.lang.cce.cast_to(res_1, "float16")

    return res_1
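# Reference sketch (illustration only) of the branch selection used above:
# inside (-2^(-0.5), 2^(-0.5)) the Taylor expansion is applied directly, and
# outside it asin(x) = sign(x) * (pi/2 - asin(sqrt(1 - x^2))). np.arcsin
# stands in for the 15th-order Taylor helper _taylor_compute.
import numpy as np


def _asin_reference(x, taylor=np.arcsin):
    """Sketch of the piecewise asin evaluation."""
    sgn = np.sign(x)
    ax = np.abs(x)
    inner = taylor(ax)                                    # |x| <= 2^(-0.5)
    outer = np.pi / 2.0 - taylor(np.sqrt(1.0 - ax * ax))  # |x| >  2^(-0.5)
    return sgn * np.where(ax <= 2.0 ** -0.5, inner, outer)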