def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False,
             use_nesterov=False, weight_decay=0.0, loss_scale=1.0):
    super(LazyAdam, self).__init__(learning_rate, params, weight_decay, loss_scale)
    _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
    validator.check_value_type("use_locking", use_locking, [bool], self.cls_name)
    validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name)

    self.beta1 = Tensor(beta1, mstype.float32)
    self.beta2 = Tensor(beta2, mstype.float32)
    self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
    self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
    self.eps = Tensor(eps, mstype.float32)
    self.use_nesterov = use_nesterov
    self.use_locking = use_locking
    self._is_device = True
    self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
    self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')

    self.hyper_map = C.HyperMap()
    self.opt = P.Adam(use_locking, use_nesterov)
    self.sparse_opt = P.FusedSparseLazyAdam(use_locking, use_nesterov)
    self.sparse_opt.add_prim_attr("primitive_target", "CPU")
    self._ps_pull = P.Pull()
    self._ps_push = P.Push("Adam", [0, 1, 2])
    self._ps_push.add_prim_attr("use_nesterov", use_nesterov)
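# Hedged usage sketch (not part of the original source): assuming this constructor belongs to
# the class exposed as mindspore.nn.LazyAdam, a typical training setup looks like the sketch
# below. nn.Dense is used purely as a placeholder network.
def _example_lazy_adam_usage():
    import mindspore.nn as nn
    from mindspore import Model

    net = nn.Dense(16, 10)  # placeholder; any Cell with trainable parameters works
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    optim = nn.LazyAdam(net.trainable_params(), learning_rate=1e-3)
    return Model(net, loss_fn=loss, optimizer=optim)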
def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0,
             use_locking=False, loss_scale=1.0, weight_decay=0.0):
    super(FTRL, self).__init__(learning_rate, params, weight_decay, loss_scale=loss_scale)
    if self.dynamic_lr or self.is_group_lr:
        raise ValueError('Dynamic learning rate or group learning rate is currently not supported.')
    _check_param(initial_accum, lr_power, l1, l2, use_locking, self.cls_name)
    self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
    self.linear = self.parameters.clone(prefix="linear", init='zeros')
    self.l1 = l1
    self.l2 = l2
    self.lr = learning_rate
    self.lr_power = lr_power
    if not self.is_group:
        self.decay_flags = tuple((lambda: True)() for x in self.parameters)
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyFtrl(use_locking=use_locking)
    self.use_locking = use_locking
    self.sparse_opt = P.SparseApplyFtrl(learning_rate, l1, l2, lr_power, use_locking=use_locking)
    self._ps_pull = P.Pull()
    self._ps_push = P.Push("Ftrl", [0, 1, 2])
    self._ps_push.add_prim_attr("init_accum", initial_accum)
    self._ps_push.add_prim_attr("lr", learning_rate)
    self._ps_push.add_prim_attr("l1", l1)
    self._ps_push.add_prim_attr("l2", l2)
    self._ps_push.add_prim_attr("lr_power", lr_power)
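# Hedged usage sketch (not part of the original source): assuming this constructor belongs to the
# class exposed as mindspore.nn.FTRL, construction mirrors the LazyAdam example above. Note that
# this FTRL rejects dynamic and group learning rates (see the check above), so learning_rate must
# be a plain float.
def _example_ftrl_usage():
    import mindspore.nn as nn
    from mindspore import Model

    net = nn.Dense(16, 10)  # placeholder; any Cell with trainable parameters works
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True)
    opt = nn.FTRL(net.trainable_params(), learning_rate=0.001, l1=0.0, l2=0.0)
    return Model(net, loss_fn=loss, optimizer=opt)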
def _run_opt_with_sparse(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient,
                         params, moment1, moment2, ps_parameter):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices()
    values = gradient.values()
    if ps_parameter:
        op_shape = P.Shape()
        _ps_pull = P.Pull()
        _ps_push = P.Push("Adam", [0, 1, 2])
        shapes = (op_shape(params), op_shape(moment1), op_shape(moment2),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, _ps_pull(_ps_push((beta1_power, beta2_power, lr, beta1, beta2,
                                                       eps, values, indices), shapes), params))
    else:
        success = F.depend(success, sparse_opt(params, moment1, moment2, beta1_power, beta2_power,
                                               lr, beta1, beta2, eps, values, indices))
    return success
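# NumPy sketch (not the MindSpore kernel): illustrates the "lazy" semantics assumed for the
# non-PS branch above -- only the rows named by `indices` have their moments and weights
# updated, unlike dense Adam, which touches every row each step. Bias correction via
# beta1_power/beta2_power is folded into the step size here for brevity; the real primitive
# takes them as explicit inputs.
import numpy as np

def _lazy_adam_rows_sketch(param, m, v, indices, values, lr, beta1, beta2, eps,
                           beta1_power, beta2_power):
    step = lr * np.sqrt(1.0 - beta2_power) / (1.0 - beta1_power)
    for i, g in zip(indices, values):
        m[i] = beta1 * m[i] + (1.0 - beta1) * g
        v[i] = beta2 * v[i] + (1.0 - beta2) * g * g
        param[i] -= step * m[i] / (np.sqrt(v[i]) + eps)
    return param, m, v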
def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0,
             use_locking=False, loss_scale=1.0, weight_decay=0.0):
    super(PSFTRL, self).__init__(learning_rate, params, loss_scale=loss_scale)
    if self.is_group:
        raise RuntimeError(f"The {self.cls_name} optimizer cannot support group setting.")
    _check_param(initial_accum, lr_power, l1, l2, use_locking, self.cls_name)
    self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
    self.linear = self.parameters.clone(prefix="linear", init='zeros')
    self.l1 = l1
    self.l2 = l2
    self.lr_power = lr_power
    self.weight_decay = weight_decay
    self.decay_tf = tuple((lambda: True)() for x in self.parameters)
    self.hyper_map = C.HyperMap()
    self.push = P.Push("Ftrl", [0, 1, 2])
    self.push.add_prim_attr("primitive_target", "CPU")
    self.pull = P.Pull()
    self.pull.add_prim_attr("primitive_target", "CPU")
def _tensor_run_opt_ext(opt, momentum, learning_rate, gradient, weight, moment, ps_parameter):
    """Apply momentum optimizer to the weight parameter using Tensor."""
    success = True
    if ps_parameter:
        op_shape = P.Shape()
        _ps_pull = P.Pull()
        _ps_push = P.Push("ApplyMomentum", [])
        shapes = (op_shape(learning_rate), op_shape(gradient), op_shape(momentum))
        success = F.depend(success, _ps_pull(_ps_push((learning_rate, gradient, momentum), shapes), weight))
    else:
        success = F.depend(success, opt(weight, moment, learning_rate, gradient, momentum))
    return success
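# NumPy sketch (not the MindSpore kernel): the non-PS branch above calls P.ApplyMomentum, whose
# per-parameter update is assumed here to follow the usual heavy-ball rule, with a Nesterov
# lookahead variant when use_nesterov is enabled.
import numpy as np

def _momentum_step_sketch(weight, accum, grad, lr, momentum, use_nesterov=False):
    accum = momentum * accum + grad
    if use_nesterov:
        weight = weight - lr * (grad + momentum * accum)
    else:
        weight = weight - lr * accum
    return weight, accum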
def _run_opt_with_one_number(opt, sparse_opt, beta1_power, beta2_power, beta1, beta2, eps, lr, gradient,
                             params, moment1, moment2, ps_parameter):
    """Apply adam optimizer to the weight parameter using Tensor."""
    success = True
    if ps_parameter:
        op_shape = P.Shape()
        _ps_pull = P.Pull()
        _ps_push = P.Push("Adam", [0, 1, 2])
        success = F.depend(success,
                           _ps_pull(_ps_push((beta1_power, beta2_power, lr, beta1, beta2, eps, gradient),
                                             (op_shape(params), op_shape(moment1), op_shape(moment2))),
                                    params))
    else:
        success = F.depend(success, opt(params, moment1, moment2, beta1_power, beta2_power, lr, beta1, beta2,
                                        eps, gradient))
    return success
def _tensor_run_opt_with_sparse(opt, spars_opt, l1, l2, lr_power, learning_rate, linear, gradient,
                                weight, moment, ps_parameter):
    """Apply sparse ftrl optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices()
    values = gradient.values()
    if ps_parameter:
        op_shape = P.Shape()
        _ps_pull = P.Pull()
        _ps_push = P.Push("Ftrl", [0, 1, 2])
        shapes = (op_shape(weight), op_shape(moment), op_shape(linear), op_shape(values), op_shape(indices))
        success = F.depend(success, _ps_pull(_ps_push((values, indices), shapes), weight))
    else:
        success = F.depend(success, spars_opt(weight, moment, linear, values, indices))
    return success
def _tensor_run_opt(opt, spars_opt, l1, l2, lr_power, learning_rate, linear, gradient, weight, moment,
                    ps_parameter):
    """Apply ftrl optimizer to the weight parameter."""
    success = True
    if ps_parameter:
        op_shape = P.Shape()
        _ps_pull = P.Pull()
        _ps_push = P.Push("Ftrl", [0, 1, 2])
        success = F.depend(success,
                           _ps_pull(_ps_push((gradient, learning_rate, l1, l2, lr_power),
                                             (op_shape(weight), op_shape(moment), op_shape(linear))),
                                    weight))
    else:
        success = F.depend(success, opt(weight, moment, linear, gradient, learning_rate, l1, l2, lr_power))
    return success
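# NumPy sketch (assumption: P.ApplyFtrl follows the standard FTRL-Proximal coordinate update, as in
# the TensorFlow primitive of the same name; this is not taken from the MindSpore source). `accum`
# corresponds to `moment` above and `linear` to the linear slot; with lr_power=-0.5 the power terms
# reduce to square roots.
import numpy as np

def _ftrl_step_sketch(weight, accum, linear, grad, lr, l1, l2, lr_power):
    accum_new = accum + grad * grad
    sigma = (np.power(accum_new, -lr_power) - np.power(accum, -lr_power)) / lr
    linear = linear + grad - sigma * weight
    quadratic = np.power(accum_new, -lr_power) / lr + 2.0 * l2
    weight = np.where(np.abs(linear) > l1,
                      (np.sign(linear) * l1 - linear) / quadratic,
                      0.0)
    return weight, accum_new, linear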