def apply_proximal_adagrad_d(var, accum, lr, l1, l2, grad, var_out, accum_out,
                             use_locking=False,
                             kernel_name="apply_proximal_adagrad_d"):
    """
    Update '*var' and '*accum' according to FOBOS with the Adagrad learning rate.

    Parameters
    ----------
    var: dict
        input tensor, contains shape and dtype attributes.
        Only supports float16, float32.
    accum: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    lr: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    l1: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    l2: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    grad: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    var_out: dict
        output tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    accum_out: dict
        output tensor, contains shape and dtype attributes.
        Must have the same type as 'accum'.
    use_locking: bool
        default value is False.
    kernel_name: str
        kernel name, default value is "apply_proximal_adagrad_d".

    Returns
    -------
    None
    """
    _check_shape_is_same(var, accum, grad)

    input_dict = (var, accum, lr, l1, l2, grad)
    args = ApplyOpConfig.TensorArgs(input_dict,
                                    apply_proximal_adagrad_d_compute,
                                    [var_out, accum_out], 15)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'lr', 'l1', 'l2', 'grad'),
                                    scalar=('lr', 'l1', 'l2'),
                                    reuse=('var', 'accum'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)


def apply_adadelta_d(var, accum, accum_update, lr, rho, epsilon, grad,
                     var_out, accum_out, accum_update_out,
                     kernel_name="apply_adadelta_d"):
    """
    Update '*var' according to the adadelta scheme.

    accum = rho * accum + (1 - rho) * grad ** 2
    update = (accum_update + epsilon).sqrt() * (accum + epsilon).rsqrt() * grad
    accum_update = rho * accum_update + (1 - rho) * update.square()
    var -= update * lr

    Parameters:
    ----------
    var: the dict of input var, only supports float16, float32
    accum: the dict of accum, only supports float16, float32
    accum_update: the dict of accum_update, only supports float16, float32
    lr: the dict of lr, only supports float16, float32
    rho: the dict of rho, only supports float16, float32
    epsilon: the dict of epsilon, only supports float16, float32
    grad: the dict of grad, only supports float16, float32
    var_out: the dict of var output data
    accum_out: the dict of accum output data
    accum_update_out: the dict of accum_update output data
    kernel_name: cce kernel name, default value is "apply_adadelta_d"

    Returns
    -------
    None
    """
    input_dict = (var, accum, accum_update, lr, rho, epsilon, grad)

    args = ApplyOpConfig.TensorArgs(input_dict, apply_adadelta_d_compute,
                                    [var_out, accum_out, accum_update_out], 16)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'accum_update',
                                         'lr', 'rho', 'epsilon', 'grad'),
                                    scalar=('lr', 'rho', 'epsilon'),
                                    reuse=('var', 'accum', 'accum_update'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)


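# A minimal NumPy sketch of the adadelta step documented above, for reference
# only: the helper name and the in-place ndarray updates are assumptions, not
# part of this module (the device computation lives in apply_adadelta_d_compute).
import numpy as np


def _adadelta_reference(var, accum, accum_update, lr, rho, epsilon, grad):
    # assumes float32 ndarrays for tensors and Python floats for scalars
    accum[:] = rho * accum + (1.0 - rho) * grad * grad
    update = np.sqrt(accum_update + epsilon) / np.sqrt(accum + epsilon) * grad
    accum_update[:] = rho * accum_update + (1.0 - rho) * update * update
    var[:] = var - lr * update
    return var, accum, accum_update

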
def apply_power_sign_d(var, m, lr, logbase, sign_decay, beta, grad,
                       var_out, m_out, kernel_name="apply_power_sign_d"):
    """
    Update '*var' according to the PowerSign update.

    Parameters:
    ----------
    var: dict of Variable, only supports float16, float32
    m: dict of input_grad, only supports float16, float32
    lr: dict of lr, only supports float16, float32
    logbase: dict of logbase, only supports float16, float32
    sign_decay: dict of sign_decay, only supports float16, float32
    beta: dict of beta, only supports float16, float32
    grad: dict of grad, only supports float16, float32
    var_out: dict of output, only supports float16, float32
    m_out: dict of output, only supports float16, float32
    kernel_name: cce kernel name, default value is "apply_power_sign_d"

    Algorithm:
    ----------
    m_t <- beta * m_{t-1} + (1 - beta) * grad
    update <- exp(logbase * sign_decay * sign(grad) * sign(m_t)) * grad
    variable <- variable - lr_t * update

    Returns
    -------
    None
    """
    input_dict = (var, m, lr, logbase, sign_decay, beta, grad)

    check_list = ('float16', 'float32')
    dtype = var.get('dtype')
    check_dtype(dtype, check_list, param_name="var")
    dtype = dtype.lower()

    args = ApplyOpConfig.TensorArgs(input_dict, apply_power_sign_d_compute,
                                    [var_out, m_out],
                                    6 if dtype == 'float32' else 10)
    name = ApplyOpConfig.TensorName(all=('var', 'm', 'lr', 'logbase',
                                         'sign_decay', 'beta', 'grad'),
                                    scalar=('lr', 'logbase', 'sign_decay', 'beta'),
                                    reuse=('m', 'var'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)


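# A minimal NumPy sketch of the PowerSign step in the docstring above; the
# helper name and ndarray/scalar types are illustrative assumptions, not part
# of this module (the device computation is apply_power_sign_d_compute).
import numpy as np


def _power_sign_reference(var, m, lr, logbase, sign_decay, beta, grad):
    # m_t <- beta * m_{t-1} + (1 - beta) * grad
    m[:] = beta * m + (1.0 - beta) * grad
    # update <- exp(logbase * sign_decay * sign(grad) * sign(m_t)) * grad
    update = np.exp(logbase * sign_decay * np.sign(grad) * np.sign(m)) * grad
    var[:] = var - lr * update
    return var, m

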
def apply_proximal_gradient_descent(var, alpha, l1, l2, delta, out,
                                    kernel_name="apply_proximal_gradient_descent"):
    """
    Update '*var' as the FOBOS algorithm with a fixed learning rate.

    prox_v = var - alpha * delta
    var = sign(prox_v) / (1 + alpha * l2) * max{|prox_v| - alpha * l1, 0}

    Parameters:
    ----------
    var: the dict of var, only supports float16, float32
    alpha: the dict of alpha, only supports float16, float32
    l1: the dict of l1, only supports float16, float32
    l2: the dict of l2, only supports float16, float32
    delta: the dict of delta, only supports float16, float32
    out: the dict of output, only supports float16, float32
    kernel_name: cce kernel name, default value is
        "apply_proximal_gradient_descent"

    Returns
    -------
    None
    """
    check_list = ('float16', 'float32')
    dtype = var.get('dtype')
    check_dtype(dtype, check_list, param_name="var")
    dtype = dtype.lower()

    input_dict = (var, alpha, l1, l2, delta)
    args = ApplyOpConfig.TensorArgs(input_dict,
                                    apply_proximal_gradient_descent_compute,
                                    out, 5 if dtype == 'float32' else 10)
    name = ApplyOpConfig.TensorName(all=('var', 'alpha', 'l1', 'l2', 'delta'),
                                    scalar=('alpha', 'l1', 'l2'),
                                    reuse=('var',))
    options = ApplyOpConfig.TensorOptions(build=set_bool_storage_config())

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


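# A minimal NumPy sketch of the FOBOS update above; the helper name and the
# ndarray/scalar types are illustrative assumptions only.
import numpy as np


def _proximal_gd_reference(var, alpha, l1, l2, delta):
    prox_v = var - alpha * delta
    var[:] = (np.sign(prox_v) / (1.0 + alpha * l2)
              * np.maximum(np.abs(prox_v) - alpha * l1, 0.0))
    return var

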
def apply_add_sign_d(var, m, lr, alpha, sign_decay, beta, grad,
                     var_out, m_out, kernel_name="apply_add_sign_d"):
    """
    Update '*var' according to the AddSign update.

    m_t <- beta * m_{t-1} + (1 - beta) * g
    update <- (alpha + sign_decay * sign(g) * sign(m)) * g
    variable <- variable - lr_t * update

    Parameters:
    ----------
    var: the dict of var, supports float16, float32
    m: the dict of m, supports float16, float32
    lr: the dict of lr, supports float16, float32
    alpha: the dict of alpha, supports float16, float32
    sign_decay: the dict of sign_decay, supports float16, float32
    beta: the dict of beta, supports float16, float32
    grad: the dict of grad, supports float16, float32
    var_out: the dict of var output data
    m_out: the dict of m output data
    kernel_name: cce kernel name, default value is "apply_add_sign_d"

    Returns
    -------
    None
    """
    input_dict = (var, m, lr, alpha, sign_decay, beta, grad)
    out = [var_out, m_out]

    args = ApplyOpConfig.TensorArgs(input_dict, apply_add_sign_d_compute, out, 10)
    name = ApplyOpConfig.TensorName(all=('var', 'm', 'lr', 'alpha',
                                         'sign_decay', 'beta', 'grad'),
                                    scalar=('lr', 'alpha', 'sign_decay', 'beta'),
                                    reuse=('var', 'm'))
    options = ApplyOpConfig.TensorOptions(build=set_bool_storage_config())

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


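# A minimal NumPy sketch of the AddSign step above; the helper is illustrative
# only and assumes float32 ndarrays for tensors and Python floats for scalars.
import numpy as np


def _add_sign_reference(var, m, lr, alpha, sign_decay, beta, grad):
    m[:] = beta * m + (1.0 - beta) * grad
    update = (alpha + sign_decay * np.sign(grad) * np.sign(m)) * grad
    var[:] = var - lr * update
    return var, m

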
def sgd(parameters, gradient, learning_rate, accum, momentum, stat, update,
        dampening, weight_decay, nesterov, kernel_name="sgd"):
    """
    Update '*parameters' according to the SGD algorithm.

    accum = accum * momentum + grad
    if nesterov is True:
        parameters -= grad * lr + accum * momentum * lr
    else:
        parameters -= accum * lr

    Parameters:
    ----------
    parameters: mutable tensor parameters.
    gradient: tensor grad.
    learning_rate: scalar lr.
    accum: mutable tensor accum.
    momentum: scalar momentum.
    stat: mutable tensor stat.
    update: out dict.
    dampening: (float, optional) dampening for momentum (default: 0).
    weight_decay: weight decay (L2 penalty) (default: 0).
    nesterov: bool. If True, use the Nesterov update; default value is False.
    kernel_name: cce kernel name, default value is "sgd" (optional).

    Returns
    -------
    None
    """
    if nesterov and dampening != 0:
        raise RuntimeError("Nesterov requires zero dampening!")
    if weight_decay < 0:
        raise RuntimeError("weight_decay must be >= 0.")

    input_dict = (parameters, gradient, learning_rate, accum, momentum, stat)
    args = ApplyOpConfig.TensorArgs(input_dict, sgd_compute, update,
                                    17 if nesterov else 9)
    name = ApplyOpConfig.TensorName(all=('parameters', 'gradient', 'learning_rate',
                                         'accum', 'momentum', 'stat'),
                                    scalar=('learning_rate', 'momentum'),
                                    reuse=('accum', 'parameters', 'stat'))
    options = ApplyOpConfig.TensorOptions(attrs=[dampening, weight_decay, nesterov])

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


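# A rough NumPy sketch of the SGD step as documented above; dampening and
# weight_decay handling are deliberately omitted here, and the helper name and
# ndarray/scalar types are illustrative assumptions, not part of this module.
import numpy as np


def _sgd_reference(parameters, gradient, learning_rate, accum, momentum,
                   nesterov=False):
    accum[:] = accum * momentum + gradient
    if nesterov:
        parameters[:] = parameters - (gradient * learning_rate
                                      + accum * momentum * learning_rate)
    else:
        parameters[:] = parameters - accum * learning_rate
    return parameters, accum

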
def apply_ftrl_d(var, accum, linear, grad, lr, l1, l2, lr_power,
                 var_out, accum_out, linear_out, kernel_name="apply_ftrl_d"):
    """
    Update '*var' according to the Ftrl-proximal algorithm.

    accum_new = accum + grad * grad
    linear += grad - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
    quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
    var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
    accum = accum_new

    Parameters:
    ----------
    var: the dict of mutable tensor var, only supports float16, float32
    accum: the dict of mutable tensor accum. Must have the same data type as `var`.
    linear: the dict of mutable tensor linear. Must have the same data type as `var`.
    grad: the dict of tensor grad. Must have the same data type as `var`.
    lr: the dict of scalar lr. Must have the same data type as `var`.
    l1: the dict of scalar l1. Must have the same data type as `var`.
    l2: the dict of scalar l2. Must have the same data type as `var`.
    lr_power: the dict of scalar lr_power. Must have the same data type as `var`.
    var_out: the dict of var output data.
    accum_out: the dict of accum output data.
    linear_out: the dict of linear output data.
    kernel_name: cce kernel name, default value is "apply_ftrl_d".

    Returns
    -------
    None
    """
    input_dict = (var, accum, linear, grad, lr, l1, l2, lr_power)
    out = [var_out, accum_out, linear_out]

    args = ApplyOpConfig.TensorArgs(input_dict, apply_ftrl_d_compute, out, 15)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'linear', 'grad',
                                         'lr', 'l1', 'l2', 'lr_power'),
                                    scalar=('lr', 'l1', 'l2', 'lr_power'),
                                    reuse=('var', 'accum', 'linear'))
    options = ApplyOpConfig.TensorOptions(build=set_bool_storage_config())

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


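# A rough NumPy sketch of the Ftrl-proximal step above; the helper name and
# ndarray/scalar types are illustrative assumptions, not part of this module
# (the device computation is apply_ftrl_d_compute).
import numpy as np


def _ftrl_reference(var, accum, linear, grad, lr, l1, l2, lr_power):
    accum_new = accum + grad * grad
    linear[:] = (linear + grad
                 - (accum_new ** (-lr_power) - accum ** (-lr_power)) / lr * var)
    quadratic = 1.0 / (accum_new ** lr_power * lr) + 2.0 * l2
    var[:] = np.where(np.abs(linear) > l1,
                      (np.sign(linear) * l1 - linear) / quadratic,
                      0.0)
    accum[:] = accum_new
    return var, accum, linear

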
def apply_momentum_d(var, accum, lr, grad, momentum, var_out, accum_out,
                     use_nesterov=False, kernel_name="apply_momentum_d"):
    """
    Update '*var' according to the ApplyMomentum algorithm.

    accum = accum * momentum + grad
    if use_nesterov is True:
        var -= grad * lr + accum * momentum * lr
    else:
        var -= accum * lr

    Parameters:
    ----------
    var: the dict of mutable tensor var, only supports float16, float32.
    accum: the dict of mutable tensor accum. Must have the same data type as `var`.
    lr: the dict of scalar lr. Must have the same data type as `var`.
    grad: the dict of tensor grad. Must have the same data type as `var`.
    momentum: the dict of scalar momentum. Must have the same data type as `var`.
    var_out: the dict of output var.
    accum_out: the dict of output accum.
    use_nesterov: bool. If True, use the Nesterov update; default value is False.
    kernel_name: cce kernel name, default value is "apply_momentum_d".

    Returns
    -------
    None
    """
    input_dict = (var, accum, lr, grad, momentum)

    args = ApplyOpConfig.TensorArgs(input_dict, apply_momentum_compute_d,
                                    [var_out, accum_out],
                                    8 if use_nesterov else 6)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'lr', 'grad', 'momentum'),
                                    scalar=('lr', 'momentum'),
                                    reuse=('accum', 'var'))
    options = ApplyOpConfig.TensorOptions(attrs=use_nesterov)

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


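# A minimal NumPy sketch of the ApplyMomentum step above; the helper name and
# ndarray/scalar types are illustrative assumptions only.
import numpy as np


def _momentum_reference(var, accum, lr, grad, momentum, use_nesterov=False):
    accum[:] = accum * momentum + grad
    if use_nesterov:
        var[:] = var - (grad * lr + accum * momentum * lr)
    else:
        var[:] = var - accum * lr
    return var, accum

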
def apply_ftrl_v2_d(var, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
                    var_out, accum_out, linear_out, use_locking=False,
                    kernel_name="apply_ftrl_v2_d"):
    """
    Update '*var' according to the Ftrl-proximal algorithm.

    grad_with_shrinkage = grad + 2 * l2_shrinkage * var
    accum_new = accum + grad * grad
    linear += grad_with_shrinkage - (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
    x = l1 * sign(linear) - linear
    y = accum_new^(-lr_power) / lr + 2 * l2
    var = x / y if |linear| > l1 else 0.0
    accum = accum_new

    Parameters:
    ----------
    var: the dict of mutable tensor var, only supports float16, float32
    accum: the dict of mutable tensor accum. Must have the same data type as `var`.
    linear: the dict of mutable tensor linear. Must have the same data type as `var`.
    grad: the dict of tensor grad. Must have the same data type as `var`.
    lr: the dict of scalar lr. Must have the same data type as `var`.
    l1: the dict of scalar l1. Must have the same data type as `var`.
    l2: the dict of scalar l2. Must have the same data type as `var`.
    l2_shrinkage: the dict of scalar l2_shrinkage. Must have the same data type as `var`.
    lr_power: the dict of scalar lr_power. Must have the same data type as `var`.
    var_out: the dict of output var.
    accum_out: the dict of output accum.
    linear_out: the dict of output linear.
    use_locking: optional attr, default value is False.
    kernel_name: cce kernel name, default value is "apply_ftrl_v2_d".

    Returns
    -------
    None
    """
    input_dict = (var, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power)

    args = ApplyOpConfig.TensorArgs(input_dict, apply_ftrl_v2_d_compute,
                                    [var_out, accum_out, linear_out], 15)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'linear', 'grad', 'lr',
                                         'l1', 'l2', 'l2_shrinkage', 'lr_power'),
                                    scalar=('lr', 'l1', 'l2', 'l2_shrinkage',
                                            'lr_power'),
                                    reuse=('var', 'accum', 'linear'))
    options = ApplyOpConfig.TensorOptions(build=set_bool_storage_config())

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


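# A rough NumPy sketch of the Ftrl-proximal step with l2 shrinkage above; the
# helper name and ndarray/scalar types are illustrative assumptions only.
import numpy as np


def _ftrl_v2_reference(var, accum, linear, grad, lr, l1, l2, l2_shrinkage,
                       lr_power):
    grad_with_shrinkage = grad + 2.0 * l2_shrinkage * var
    accum_new = accum + grad * grad
    linear[:] = (linear + grad_with_shrinkage
                 - (accum_new ** (-lr_power) - accum ** (-lr_power)) / lr * var)
    x = l1 * np.sign(linear) - linear
    y = accum_new ** (-lr_power) / lr + 2.0 * l2
    var[:] = np.where(np.abs(linear) > l1, x / y, 0.0)
    accum[:] = accum_new
    return var, accum, linear

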
def apply_adagrad_da_d(var, gradient_accumulator, gradient_squared_accumulator,
                       grad, lr, l1, l2, global_step, var_out,
                       gradient_accumulator_out, gradient_squared_accumulator_out,
                       use_locking=False, kernel_name='apply_adagrad_da_d'):
    """
    Update '*var' according to the proximal adagrad (AdagradDA) scheme.

    grad_accum += grad
    grad_squared_accum += grad * grad
    tmp_val = sign(grad_accum) * max{|grad_accum| - l1 * global_step, 0}
              if l1 > 0 else grad_accum
    x_value = -1 * lr * tmp_val
    y_value = l2 * global_step * lr + sqrt(grad_squared_accum)
    var = x_value / y_value

    Parameters:
    ----------
    var: the dict of mutable tensor var, only supports float16, float32
    gradient_accumulator: the dict of mutable tensor gradient_accumulator.
        Must have the same data type as `var`.
    gradient_squared_accumulator: the dict of mutable tensor
        gradient_squared_accumulator. Must have the same data type as `var`.
    grad: the dict of tensor grad. Must have the same data type as `var`.
    lr: the dict of scalar lr. Must have the same data type as `var`.
    l1: the dict of scalar l1. Must have the same data type as `var`.
    l2: the dict of scalar l2. Must have the same data type as `var`.
    global_step: the dict of scalar global_step, only supports int32.
    var_out: the dict of var output.
    gradient_accumulator_out: the dict of gradient_accumulator output.
    gradient_squared_accumulator_out: the dict of gradient_squared_accumulator output.
    use_locking: optional attr, default value is False.
    kernel_name: cce kernel name, default value is "apply_adagrad_da_d".

    Returns
    -------
    None
    """
    # check that all floating-point inputs share the same dtype
    stype_dict = (var, gradient_accumulator, gradient_squared_accumulator,
                  grad, lr, l1, l2)
    normalized_dtype_list = [None] * len(stype_dict)
    for i, d in enumerate(stype_dict):
        dtype = d.get('dtype')
        normalized_dtype_list[i] = dtype.lower()
    if any(elem != normalized_dtype_list[0] for elem in normalized_dtype_list):
        raise RuntimeError("All input data types must be the same")

    # check global_step dtype
    dtype = global_step.get("dtype").lower()
    check_dtype(dtype, ("int32",), param_name="global_step")

    input_dict = (var, gradient_accumulator, gradient_squared_accumulator,
                  grad, lr, l1, l2, global_step)
    args = ApplyOpConfig.TensorArgs(
        input_dict, apply_adagrad_da_d_compute,
        [var_out, gradient_accumulator_out, gradient_squared_accumulator_out],
        15)
    name = ApplyOpConfig.TensorName(
        all=('var', 'gradient_accumulator', 'gradient_squared_accumulator',
             'grad', 'lr', 'l1', 'l2', 'global_step'),
        scalar=('lr', 'l1', 'l2', 'global_step'),
        reuse=('var', 'gradient_accumulator', 'gradient_squared_accumulator'))
    options = ApplyOpConfig.TensorOptions(build=set_bool_storage_config(),
                                          dtype=('float16', 'float32', 'int32'))

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name,
                            same_flag=False)


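# A rough NumPy sketch of the AdagradDA step above; the helper name and
# ndarray/scalar types are illustrative assumptions (global_step is taken as a
# Python int here), not part of this module.
import numpy as np


def _adagrad_da_reference(var, grad_accum, grad_squared_accum, grad,
                          lr, l1, l2, global_step):
    grad_accum[:] = grad_accum + grad
    grad_squared_accum[:] = grad_squared_accum + grad * grad
    if l1 > 0:
        tmp_val = np.sign(grad_accum) * np.maximum(
            np.abs(grad_accum) - l1 * global_step, 0.0)
    else:
        tmp_val = grad_accum
    var[:] = (-lr * tmp_val) / (l2 * global_step * lr
                                + np.sqrt(grad_squared_accum))
    return var, grad_accum, grad_squared_accum

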
def apply_centered_rms_prop_d(var, mg, ms, mom, lr, rho, momentum, epsilon,
                              grad, var_out, mg_out, ms_out, mom_out,
                              kernel_name="apply_centered_rms_prop_d"):
    """
    Update '*var' according to the centered RMSProp algorithm.

    mean_square = decay * mean_square + (1 - decay) * gradient ** 2
    mean_grad = decay * mean_grad + (1 - decay) * gradient
    delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)

    mg_{t} <- rho * mg_{t-1} + (1 - rho) * grad
    ms_{t} <- rho * ms_{t-1} + (1 - rho) * grad * grad
    mom_{t} <- momentum * mom_{t-1} + lr * grad / sqrt(ms_{t} - mg_{t} * mg_{t} + epsilon)
    var_{t} <- var_{t-1} - mom_{t}

    Parameters:
    ----------
    var: dict of tensor var, includes shape and dtype;
        dtype supports float16 and float32.
    mg: dict of tensor mg (mean_grad), includes shape and dtype;
        dtype supports float16 and float32.
    ms: dict of tensor ms (mean_square), includes shape and dtype;
        dtype supports float16 and float32.
    mom: dict of tensor mom, includes shape and dtype;
        dtype supports float16 and float32.
    lr: dict of scalar lr (learning rate). Must have the same dtype as var.
    rho: dict of scalar rho (decay rate). Must have the same dtype as var.
    momentum: dict of scalar momentum. Must have the same dtype as var.
    epsilon: dict of scalar epsilon. Must have the same dtype as var.
    grad: dict of tensor grad. Must have the same dtype as var.
    var_out: the dict of var output, only supports float16, float32.
    mg_out: the dict of mg output, only supports float16, float32.
    ms_out: the dict of ms output, only supports float16, float32.
    mom_out: the dict of mom output, only supports float16, float32.
    kernel_name: cce kernel name, default value is "apply_centered_rms_prop_d".

    Returns
    -------
    None
    """
    input_dict = (var, mg, ms, mom, lr, rho, momentum, epsilon, grad)
    out = [var_out, mg_out, ms_out, mom_out]

    check_list = ('float16', 'float32')
    dtype = var.get('dtype')
    check_dtype(dtype, check_list, param_name="var")
    dtype = dtype.lower()

    args = ApplyOpConfig.TensorArgs(input_dict, apply_centered_rms_prop_d_compute,
                                    out, 6 if dtype == "float32" else 12)
    name = ApplyOpConfig.TensorName(all=('var', 'mg', 'ms', 'mom', 'lr', 'rho',
                                         'momentum', 'epsilon', 'grad'),
                                    scalar=('lr', 'rho', 'momentum', 'epsilon'),
                                    reuse=('mg', 'ms', 'mom', 'var'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)


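# A minimal NumPy sketch of the centered RMSProp step above; the helper name
# and ndarray/scalar types are illustrative assumptions only.
import numpy as np


def _centered_rms_prop_reference(var, mg, ms, mom, lr, rho, momentum, epsilon,
                                 grad):
    mg[:] = rho * mg + (1.0 - rho) * grad
    ms[:] = rho * ms + (1.0 - rho) * grad * grad
    mom[:] = momentum * mom + lr * grad / np.sqrt(ms - mg * mg + epsilon)
    var[:] = var - mom
    return var, mg, ms, mom

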
def apply_adam_d(var, m, v, beta1_power, beta2_power, lr, beta1, beta2, epsilon,
                 grad, var_out, m_out, v_out, use_locking=False,
                 use_nesterov=False, kernel_name="apply_adam_d"):
    """
    The operator's computation:

    lr_t = learning_rate * sqrt(1 - beta2_power) / (1 - beta1_power)
    m_t = m + (1 - beta1) * (grad - m)
    v_t = v + (1 - beta2) * (grad * grad - v)
    if use_nesterov is True:
        var_t = var - lr_t * (m_t * beta1 + (1 - beta1) * grad) / (epsilon + sqrt(v_t))
    else:
        var_t = var - lr_t * m_t / (epsilon + sqrt(v_t))

    Parameters:
    ----------
    var: dict
        input tensor, contains shape and dtype attributes.
        Only supports float16, float32.
    m: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    v: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    beta1_power: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    beta2_power: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    lr: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    beta1: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    beta2: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    epsilon: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    grad: dict
        input tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    var_out: dict
        output tensor, contains shape and dtype attributes.
        Must have the same type as 'var'.
    m_out: dict
        output tensor, contains shape and dtype attributes.
        Must have the same type as 'm'.
    v_out: dict
        output tensor, contains shape and dtype attributes.
        Must have the same type as 'v'.
    use_locking: bool
        default value is False.
    use_nesterov: bool
        default value is False.
    kernel_name: str
        kernel name, default value is "apply_adam_d".

    Returns
    -------
    None
    """
    input_dict = (var, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                  epsilon, grad)

    args = ApplyOpConfig.TensorArgs(input_dict, apply_adam_d_compute,
                                    [var_out, m_out, v_out], 15)
    name = ApplyOpConfig.TensorName(all=('var', 'm', 'v', 'beta1_power',
                                         'beta2_power', 'lr', 'beta1', 'beta2',
                                         'epsilon', 'grad'),
                                    scalar=('lr', 'beta1_power', 'beta2_power',
                                            'beta1', 'beta2', 'epsilon'),
                                    reuse=('m', 'v', 'var'))
    options = ApplyOpConfig.TensorOptions(attrs=use_nesterov)

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


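# A rough NumPy sketch of the Adam step above; the helper name and types are
# illustrative assumptions (beta1_power and beta2_power are taken as Python
# floats), not part of this module.
import numpy as np


def _adam_reference(var, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                    epsilon, grad, use_nesterov=False):
    lr_t = lr * np.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)
    m[:] = m + (1.0 - beta1) * (grad - m)
    v[:] = v + (1.0 - beta2) * (grad * grad - v)
    if use_nesterov:
        var[:] = var - lr_t * (m * beta1 + (1.0 - beta1) * grad) / (epsilon + np.sqrt(v))
    else:
        var[:] = var - lr_t * m / (epsilon + np.sqrt(v))
    return var, m, v

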
def apply_ada_max_d(var, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
                    var_out, m_out, v_out, kernel_name='apply_ada_max_d'):
    """
    Update '*var' according to the AdaMax algorithm.

    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
    v_t <- max(beta2 * v_{t-1}, abs(g))
    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)

    Parameters:
    ----------
    var: the dict of mutable tensor var. Must be one of the following data
        types: `float32`, `float16`.
    m: the dict of mutable tensor m. Must have the same data type as `var`.
    v: the dict of mutable tensor v. Must have the same data type as `var`.
    beta1_power: the dict of scalar beta1_power. Must have the same data type as `var`.
    lr: the dict of scalar lr. Must have the same data type as `var`.
    beta1: the dict of scalar beta1. Must have the same data type as `var`.
    beta2: the dict of scalar beta2. Must have the same data type as `var`.
    epsilon: the dict of scalar epsilon. Must have the same data type as `var`.
    grad: the dict of tensor grad. Must have the same data type as `var`.
    var_out: the dict of var output.
    m_out: the dict of m output.
    v_out: the dict of v output.
    kernel_name: cce kernel name, default value is "apply_ada_max_d" (optional).

    Returns
    -------
    None
    """
    input_dict = (var, m, v, beta1_power, lr, beta1, beta2, epsilon, grad)

    args = ApplyOpConfig.TensorArgs(input_dict, apply_ada_max_d_compute,
                                    [var_out, m_out, v_out], 14)
    name = ApplyOpConfig.TensorName(all=('var', 'm', 'v', 'beta1_power', 'lr',
                                         'beta1', 'beta2', 'epsilon', 'grad'),
                                    scalar=('lr', 'beta1_power', 'beta1',
                                            'beta2', 'epsilon'),
                                    reuse=('m', 'v', 'var'))

    common_apply_op_process(ApplyOpConfig(args, name), kernel_name)


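# A minimal NumPy sketch of the AdaMax step above; the helper name and types
# are illustrative assumptions ((1 - beta1^t) is supplied as 1 - beta1_power).
import numpy as np


def _ada_max_reference(var, m, v, beta1_power, lr, beta1, beta2, epsilon, grad):
    m[:] = beta1 * m + (1.0 - beta1) * grad
    v[:] = np.maximum(beta2 * v, np.abs(grad))
    var[:] = var - lr / (1.0 - beta1_power) * m / (v + epsilon)
    return var, m, v

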
def apply_keras_momentum_d(var, accum, lr, grad, momentum, out_var, out_accum,
                           use_locking=False, use_nesterov=False,
                           kernel_name="apply_keras_momentum_d"):
    """
    Update '*var' according to the momentum scheme.

    accum = accum * momentum - grad * lr
    if use_nesterov is True:
        var = var + accum * momentum - grad * lr
    else:
        var = var + accum

    Parameters
    ----------
    var: dict of tensor var, includes shape and dtype.
    accum: dict of tensor accum, includes shape and dtype.
    lr: dict of scalar lr (learning rate), includes shape and dtype.
    grad: dict of tensor grad, includes shape and dtype.
    momentum: dict of scalar momentum, includes shape and dtype.
    out_var: dict of updated var.
    out_accum: dict of updated accum.
    use_locking: bool, default value is False.
    use_nesterov: bool, default value is False; if True, var is updated using
        Nesterov momentum.
    kernel_name: kernel name, default value is "apply_keras_momentum_d".

    Returns
    -------
    None
    """
    input_dict = (var, accum, lr, grad, momentum)

    args = ApplyOpConfig.TensorArgs(input_dict, apply_keras_momentum_d_compute,
                                    [out_var, out_accum],
                                    6 if use_nesterov else 5)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'lr', 'grad', 'momentum'),
                                    scalar=('lr', 'momentum'),
                                    reuse=())
    options = ApplyOpConfig.TensorOptions(attrs=use_nesterov)

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name)


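# A minimal NumPy sketch of the Keras-style momentum step above; the helper
# name and ndarray/scalar types are illustrative assumptions only.
import numpy as np


def _keras_momentum_reference(var, accum, lr, grad, momentum, use_nesterov=False):
    accum[:] = accum * momentum - grad * lr
    if use_nesterov:
        var[:] = var + accum * momentum - grad * lr
    else:
        var[:] = var + accum
    return var, accum

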
def fused_mul_apply_momentum_extern(var, accum, lr, x1, momentum, x2, var_copy,
                                    out_fp32, out_fp16, out_accum,
                                    use_nesterov=False,
                                    kernel_name="fused_mul_apply_momentum"):
    """
    Update '*var' according to the ApplyMomentum algorithm.

    accum = accum * momentum + x1 * x2
    if use_nesterov is True:
        var -= (x1 * x2) * lr + accum * momentum * lr
    else:
        var -= accum * lr

    Parameters:
    ----------
    var: the dict of mutable tensor var. Dtype is float32.
    accum: the dict of mutable tensor accum.
    lr: the dict of scalar lr.
    x1: the dict of tensor x1 (grad).
    momentum: the dict of scalar momentum.
    x2: the dict of scalar x2.
    var_copy: the dict of mutable tensor var. Dtype is float16.
    out_fp32: the dict of output. Dtype is float32.
    out_fp16: the dict of output. Dtype is float16.
    out_accum: the dict of output. Dtype is the same as input accum.
    use_nesterov: bool. If True, use the Nesterov update; default value is False.
    kernel_name: cce kernel name, default value is "fused_mul_apply_momentum".

    Returns
    -------
    None
    """
    var_dtype = var.get("dtype")
    op_utils.check_dtype(var_dtype, ("float32",), param_name="var")
    var_copy_dtype = var_copy.get("dtype")
    op_utils.check_dtype(var_copy_dtype, ("float16",), param_name="var_copy")

    input_dict = (var, accum, lr, x1, momentum, x2, var_copy)
    outputs = [out_fp32, out_fp16, out_accum]

    args = ApplyOpConfig.TensorArgs(input_dict,
                                    _fused_mul_apply_momentum_extern_compute,
                                    outputs, 10 if use_nesterov else 8)
    name = ApplyOpConfig.TensorName(all=('var', 'accum', 'lr', 'x1',
                                         'momentum', 'x2', 'var_copy'),
                                    scalar=('lr', 'momentum', 'x2'),
                                    reuse=('accum', 'var', 'var_copy'))
    options = ApplyOpConfig.TensorOptions(attrs=use_nesterov)

    common_apply_op_process(ApplyOpConfig(args, name, options), kernel_name,
                            same_flag=False)


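# A rough NumPy sketch of the fused multiply + momentum step above, treating
# x1 * x2 as the effective gradient; the helper name, the ndarray/scalar types,
# and the omission of the float16 var_copy path are illustrative assumptions.
import numpy as np


def _fused_mul_momentum_reference(var, accum, lr, x1, momentum, x2,
                                  use_nesterov=False):
    grad = x1 * x2
    accum[:] = accum * momentum + grad
    if use_nesterov:
        var[:] = var - (grad * lr + accum * momentum * lr)
    else:
        var[:] = var - accum * lr
    return var, accum

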