Example no. 1
    def __init__(self, network, config, sens=1000.0):
        super(TrainStepWrap, self).__init__()
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)

        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)
        self.optimizer_w = FTRL(learning_rate=config.ftrl_lr,
                                params=self.weights_w,
                                l1=5e-4,
                                l2=5e-4,
                                initial_accum=0.1,
                                loss_scale=sens)

        #self.optimizer_d = ProximalAdagrad(self.weights_d, learning_rate=config.adam_lr,loss_scale=sens)
        self.optimizer_d = Adam(self.weights_d,
                                learning_rate=config.adam_lr,
                                eps=1e-6,
                                loss_scale=sens)

        self.hyper_map = C.HyperMap()

        self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
        self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)

        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if parallel_mode in (ParallelMode.DATA_PARALLEL,
                             ParallelMode.HYBRID_PARALLEL):
            self.reducer_flag = True
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = context.get_auto_parallel_context("device_num")
            self.grad_reducer_w = DistributedGradReducer(
                self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(
                self.optimizer_d.parameters, mean, degree)
Example no. 2
    def __init__(self, network, optimizer, scale_update_cell=None):

        super(TransformerTrainOneStepWithLossScaleCell,
              self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.network.add_flags(defer_inline=True)
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.all_reduce = P.AllReduce()

        self.parallel_mode = _get_parallel_mode()
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ",
                             self.parallel_mode)
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = _get_gradients_mean()
            degree = _get_device_num()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.gpu_target = False
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_status = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()

        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
Example no. 3
    def __init__(self,
                 params,
                 decay_steps,
                 warmup_steps=0,
                 start_learning_rate=0.1,
                 end_learning_rate=0.0001,
                 power=1.0,
                 beta1=0.9,
                 beta2=0.999,
                 eps=1e-6,
                 weight_decay=0.0,
                 decay_filter=lambda x: 'LayerNorm' not in x.name and 'bias'
                 not in x.name):

        super(Lamb, self).__init__(start_learning_rate, params)
        _check_param_value(decay_steps, warmup_steps, start_learning_rate,
                           end_learning_rate, power, beta1, beta2, eps,
                           weight_decay, self.cls_name)

        # turn them to scalar when ME supports scalar/tensor mix operations
        self.global_step = Parameter(initializer(0, [1]), name="global_step")

        self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32))
        self.warmup_flag = False
        if warmup_steps > 0:
            self.warmup_flag = True
        self.decay_steps = Tensor(np.array([decay_steps]).astype(np.float32))
        self.start_learning_rate = Tensor(
            np.array([start_learning_rate]).astype(np.float32))
        self.end_learning_rate = Tensor(
            np.array([end_learning_rate]).astype(np.float32))
        self.diff_learning_rate = Tensor(
            np.array([start_learning_rate - end_learning_rate
                      ]).astype(np.float32))
        self.power = power
        self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
        self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
        self.eps = Tensor(np.array([eps]).astype(np.float32))
        self.weight_decay_tensor = Tensor(
            np.array([weight_decay]).astype(np.float32))
        self.params = self.parameters
        self.moments1 = self.params.clone(prefix="lamb_m", init='zeros')
        self.moments2 = self.params.clone(prefix="lamb_v", init='zeros')
        self.decay_flag = tuple(decay_filter(x) for x in self.params)

        self.hyper_map = C.HyperMap()
        self.min = P.Minimum()
        self.pow = P.Pow()
        self.greater = P.Greater()
        self.one = Tensor(np.array([1.0]).astype(np.float32))
        self.cast = P.Cast()
Example no. 4
 def __init__(self,
              params,
              accum=0.1,
              learning_rate=0.001,
              update_slots=True,
              loss_scale=1.0,
              weight_decay=0.0):
     super(Adagrad, self).__init__(learning_rate, params, weight_decay,
                                   loss_scale)
     _check_param_value(accum, update_slots, self.cls_name)
     self.accum = self.parameters.clone(prefix="accum", init=accum)
     self.hyper_map = C.HyperMap()
     self.update_slots = update_slots
     self.opt = P.ApplyAdagrad(update_slots=update_slots)
Example no. 5
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              weight_decay=0.0,
              loss_scale=1.0,
              num_hidden_layers=24,
              batch_size=12,
              damping=0.03,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and
              'bias' not in x.name.lower()):
     super(THOR, self).__init__(learning_rate, params, weight_decay,
                                loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32))
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.num_hidden_layers = num_hidden_layers
     self.sqrt = P.Sqrt()
     self.assign = P.Assign()
     self.cast = P.Cast()
     self.thor = True
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.one = Tensor(1, mstype.int32)
     self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                               requires_grad=False)
Example no. 6
 def __init__(self,
              params,
              decay_steps,
              learning_rate=0.001,
              end_learning_rate=0.0001,
              power=10.0,
              beta1=0.9,
              beta2=0.999,
              eps=1e-6,
              weight_decay=0.0,
              decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in
              x.name,
              warmup_steps=0):
     super(AdamWeightDecayDynamicLR, self).__init__(learning_rate, params)
     if self.is_group:
         raise RuntimeError(
             f"The {self.cls_name} optimizer cannot support group setting.")
     _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
     _check_learning_rate_value(learning_rate, end_learning_rate,
                                decay_steps, power, self.cls_name)
      # turn them to scalar when ME supports scalar/tensor mix operations
     self.global_step = Parameter(initializer(0, [1]))
     self.warmup_steps = Tensor(np.array([warmup_steps]).astype(np.float32))
     self.warmup_flag = False
     if warmup_steps > 0:
         self.warmup_flag = True
     self.decay_steps = Tensor(np.array([decay_steps]).astype(np.float32))
     self.end_learning_rate = Tensor(
         np.array([end_learning_rate]).astype(np.float32))
     self.diff_learning_rate = Tensor(
         np.array([learning_rate - end_learning_rate]).astype(np.float32))
     self.power = power
     self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
     self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
     self.eps = Tensor(np.array([eps]).astype(np.float32))
     self.weight_decay_tensor = Tensor(
         np.array([weight_decay]).astype(np.float32))
     self.params = self.parameters
     self.moments1 = self.params.clone(prefix="adam_m", init='zeros')
     self.moments2 = self.params.clone(prefix="adam_v", init='zeros')
     self.decay_flag = tuple(decay_filter(x) for x in self.params)
     self.hyper_map = C.HyperMap()
     self.min = P.Minimum()
     self.pow = P.Pow()
     self.greater = P.Greater()
     self.one = Tensor(np.array([1.0]).astype(np.float32))
     self.cast = P.Cast()
     self.start_learning_rate = Tensor(
         np.array([learning_rate]).astype(np.float32))
Example no. 7
    def __init__(self, network, sens=1024.0, host_device_mix=False, parameter_server=False):
        super(TrainStepWrap, self).__init__()
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
        self.network = network
        self.network.set_grad()
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)
        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)

        if (host_device_mix and is_auto_parallel) or parameter_server:
            self.optimizer_d = LazyAdam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
            self.optimizer_w.sparse_opt.add_prim_attr("primitive_target", "CPU")
            self.optimizer_d.sparse_opt.add_prim_attr("primitive_target", "CPU")
        else:
            self.optimizer_d = Adam(
                self.weights_d, learning_rate=3.5e-4, eps=1e-8, loss_scale=sens)
            self.optimizer_w = FTRL(learning_rate=5e-2, params=self.weights_w,
                                    l1=1e-8, l2=1e-8, initial_accum=1.0, loss_scale=sens)
        self.hyper_map = C.HyperMap()
        self.grad_w = C.GradOperation(get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation(get_by_list=True,
                                      sens_param=True)
        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                              ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = context.get_auto_parallel_context("device_num")
            self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, mean, degree)
Example no. 8
    def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0,
                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
        super(AdamWeightDecay, self).__init__(learning_rate, params)
        _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
        self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
        self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
        self.eps = Tensor(np.array([eps]).astype(np.float32))
        self.weight_decay_tensor = Tensor(np.array([weight_decay]).astype(np.float32))

        self.params = self.parameters
        self.moments1 = self.params.clone(prefix="adam_m", init='zeros')
        self.moments2 = self.params.clone(prefix="adam_v", init='zeros')
        self.decay_flag = tuple(decay_filter(x) for x in self.params)

        self.hyper_map = C.HyperMap()
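
For context, a minimal usage sketch of a constructor like the one above (assuming the stock mindspore.nn.AdamWeightDecay, whose keyword arguments mirror this example; the net is hypothetical):

import mindspore.nn as nn

# Hypothetical model; any Cell with trainable parameters would do.
net = nn.Dense(16, 4)
optimizer = nn.AdamWeightDecay(net.trainable_params(),
                               learning_rate=1e-3,
                               beta1=0.9,
                               beta2=0.999,
                               eps=1e-6,
                               weight_decay=0.01)
# The optimizer is then wrapped together with a loss into a train cell, e.g.
# nn.TrainOneStepCell(nn.WithLossCell(net, nn.MSELoss()), optimizer).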
Example no. 9
            def __init__(self, *args, **kwargs):
                super(DPOptimizer, self).__init__(*args, **kwargs)
                self._mech = mech
                self._tuple_add = _TupleAdd()
                self._hyper_map = C.HyperMap()
                self._micro_batches = Tensor(micro_batches, mstype.float32)

                self._mech_param_updater = None
                if self._mech is not None and self._mech._decay_policy is not None:
                    self._mech_param_updater = _MechanismsParamsUpdater(
                        decay_policy=self._mech._decay_policy,
                        decay_rate=self._mech._noise_decay_rate,
                        cur_noise_multiplier=self._mech._noise_multiplier,
                        init_noise_multiplier=self._mech._initial_noise_multiplier)
Example no. 10
    def __init__(self,
                 params,
                 learning_rate=1e-3,
                 beta1=0.9,
                 beta2=0.999,
                 eps=1e-8,
                 use_locking=False,
                 use_nesterov=False,
                 weight_decay=0.0,
                 loss_scale=1.0):
        super(Adam, self).__init__(learning_rate, params)
        _check_param_value(beta1, beta2, eps, weight_decay)
        validator.check_type("use_locking", use_locking, [bool])
        validator.check_type("use_nesterov", use_nesterov, [bool])
        validator.check_type("loss_scale", loss_scale, [float])
        validator.check_number_range("loss_scale", loss_scale, 1.0,
                                     float("inf"), Rel.INC_LEFT)

        self.dynamic_lr = False
        if isinstance(learning_rate, Iterable) or \
                (isinstance(learning_rate, Tensor) and learning_rate.dim() == 1):
            self.dynamic_lr = True
            self.gather = P.GatherV2()
            self.assignadd = P.AssignAdd()
            self.global_step = Parameter(initializer(0, [1], mstype.int32),
                                         name="global_step")
            self.axis = 0

        self.beta1 = Tensor(beta1, mstype.float32)
        self.beta2 = Tensor(beta2, mstype.float32)
        self.beta1_power = Parameter(initializer(1, [1], mstype.float32),
                                     name="beta1_power")
        self.beta2_power = Parameter(initializer(1, [1], mstype.float32),
                                     name="beta2_power")
        self.eps = eps

        self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
        self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')

        self.hyper_map = C.HyperMap()
        self.opt = P.Adam(use_locking, use_nesterov)
        self.weight_decay = weight_decay * loss_scale
        self.reciprocal_scale = 1.0 / loss_scale

        self.pow = P.Pow()
        self.sqrt = P.Sqrt()
        self.one = Tensor(np.array([1.0]).astype(np.float32))
        self.realdiv = P.RealDiv()
Example no. 11
 def __init__(self, parameters, mean=True, degree=None):
     super(DistributedGradReducer, self).__init__(auto_prefix=False)
     self.hyper_map = C.HyperMap()
     self.mul = P.Mul()
     if degree is None:
         self.degree = get_group_size()
     else:
         if not isinstance(degree, int) or degree <= 0:
             raise ValueError(
                 "Parameter 'degree' in DistributedGradReducer should large than 0 and be int"
             )
         self.degree = degree
     self.mean = mean
     self.allreduce_filter = tuple(x.layerwise_parallel is False
                                   for x in parameters)
     _init_optimizer_allreduce()
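
Conceptually, when mean is True the reducer all-reduces each gradient across devices and then scales it by 1.0 / degree (via the P.Mul created above). A plain NumPy sketch of that arithmetic, with no real communication and hypothetical per-device gradients:

import numpy as np

degree = 4  # assumed number of devices in the group
per_device_grads = [np.random.rand(3).astype(np.float32) for _ in range(degree)]
summed = np.sum(per_device_grads, axis=0)   # what an AllReduce(sum) would yield
averaged = summed * (1.0 / degree)          # the mean=True scaling step
print(averaged)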
Example no. 12
 def __init__(self,
              params,
              learning_rate=1e-3,
              beta1=0.9,
              beta2=0.999,
              eps=1e-6,
              weight_decay=0.0):
     super(AdamWeightDecayOp, self).__init__(learning_rate, params,
                                             weight_decay)
     _check_param_value(beta1, beta2, eps, self.cls_name)
     self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
     self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
     self.eps = Tensor(np.array([eps]).astype(np.float32))
     self.moments1 = self.parameters.clone(prefix="adam_m", init='zeros')
     self.moments2 = self.parameters.clone(prefix="adam_v", init='zeros')
     self.hyper_map = C.HyperMap()
Example no. 13
    def __init__(self, network, optimizer, sens=1.0):
        super(BertTrainCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.sens = sens
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.clip_type = gradient_cfg.clip_type
        self.clip_value = gradient_cfg.clip_value
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()

        self.saved_params = self.weights.clone(prefix='saved')
        self.length = len(self.weights)
        self.quant_embedding_list = []
        self.quant_weight_list = []
        for i, key in enumerate(self.saved_params):
            if 'embedding_lookup' in key.name and 'min' not in key.name and 'max' not in key.name:
                self.quant_embedding_list.append(i)
            elif 'weight' in key.name and 'dense_1' not in key.name:
                self.quant_weight_list.append(i)
        self.quant_embedding_list_length = len(self.quant_embedding_list)
        self.quant_weight_list_length = len(self.quant_weight_list)

        self.quantize_embedding = QuantizeWeightCell(
            num_bits=network.embedding_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
        self.quantize_weight = QuantizeWeightCell(
            num_bits=network.weight_bits,
            compute_type=network.compute_type,
            clip_value=network.weight_clip_value)
Example no. 14
    def __init__(self,
                 optimizer,
                 epsilon=1e-05,
                 coefficient=0.001,
                 use_clip=False,
                 lars_filter=lambda x: 'LayerNorm' not in x.name and 'bias'
                 not in x.name):
        super(LARS, self).__init__(0.0,
                                   [Parameter(Tensor(0.0), name="fake_param")])
        _check_param_value(optimizer, epsilon, coefficient, use_clip,
                           self.cls_name)
        self.opt = optimizer
        self.parameters = optimizer.parameters
        self.use_clip = use_clip
        self.lars_flag = tuple(lars_filter(x) for x in self.parameters)
        self.is_group = optimizer.is_group
        self.learning_rate = Parameter(Tensor(0.0, dtype=mstype.float32),
                                       name="fake_lr")
        self.decay_flags = optimizer.decay_flags
        self.reciprocal_scale = optimizer.reciprocal_scale
        self.need_scale = optimizer.need_scale
        self.hyper_map = C.HyperMap()
        self.lars = P.LARSUpdate(epsilon, coefficient, use_clip)
        self.cast = P.Cast()

        if use_clip:
            self.is_group_lr = optimizer.is_group_lr
            self.dynamic_lr = optimizer.dynamic_lr
            self.origin_learning_rate = optimizer.learning_rate
            self.global_step = optimizer.global_step
            if self.is_group_lr and self.dynamic_lr:
                raise ValueError('Grouped dynamic learning rate is currently not supported for the inputs optimizer ' \
                                 'of lars.')

        if self.is_group:
            self.weight_decay = tuple(
                map(lambda x: x / optimizer.loss_scale,
                    optimizer.weight_decay))
            optimizer.weight_decay = tuple(
                map(lambda x: 0.0, optimizer.weight_decay))
        else:
            self.weight_decay = optimizer.weight_decay / optimizer.loss_scale
            optimizer.weight_decay = 0.0

        optimizer.decay_flags = tuple(map(lambda x: False, self.decay_flags))
        optimizer.reciprocal_scale = 1.0
        optimizer.exec_weight_decay = False
Example no. 15
    def __init__(self, network, sens=1000.0):
        super(TrainStepWrap, self).__init__()
        self.network = network
        self.network.set_train()
        self.trainable_params = network.trainable_params()
        weights_w = []
        weights_d = []
        for params in self.trainable_params:
            if 'wide' in params.name:
                weights_w.append(params)
            else:
                weights_d.append(params)
        self.weights_w = ParameterTuple(weights_w)
        self.weights_d = ParameterTuple(weights_d)
        self.optimizer_w = FTRL(learning_rate=1e-2,
                                params=self.weights_w,
                                l1=1e-8,
                                l2=1e-8,
                                initial_accum=1.0)
        self.optimizer_d = Adam(self.weights_d,
                                learning_rate=3.5e-4,
                                eps=1e-8,
                                loss_scale=sens)
        self.hyper_map = C.HyperMap()
        self.grad_w = C.GradOperation('grad_w',
                                      get_by_list=True,
                                      sens_param=True)
        self.grad_d = C.GradOperation('grad_d',
                                      get_by_list=True,
                                      sens_param=True)
        self.sens = sens
        self.loss_net_w = IthOutputCell(network, output_index=0)
        self.loss_net_d = IthOutputCell(network, output_index=1)

        self.reducer_flag = False
        self.grad_reducer_w = None
        self.grad_reducer_d = None
        parallel_mode = _get_parallel_mode()
        self.reducer_flag = parallel_mode in (ParallelMode.DATA_PARALLEL,
                                              ParallelMode.HYBRID_PARALLEL)
        if self.reducer_flag:
            mean = _get_mirror_mean()
            degree = _get_device_num()
            self.grad_reducer_w = DistributedGradReducer(
                self.optimizer_w.parameters, mean, degree)
            self.grad_reducer_d = DistributedGradReducer(
                self.optimizer_d.parameters, mean, degree)
Example no. 16
    def __init__(self,
                 params,
                 learning_rate=0.1,
                 momentum=0.0,
                 dampening=0.0,
                 weight_decay=0.0,
                 nesterov=False,
                 loss_scale=1.0):

        super(SGD, self).__init__(learning_rate, params, weight_decay,
                                  loss_scale)

        if isinstance(momentum, int):
            momentum = float(momentum)
        if not isinstance(momentum, float):
            raise TypeError("momentum should be float number!")

        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError(
                "momentum should be at least 0.0, but got momentum {}".format(
                    momentum))

        if isinstance(dampening, int):
            dampening = float(dampening)
        if not isinstance(dampening, float):
            raise TypeError("dampening should be float number")

        if dampening < 0.0:
            raise ValueError(
                "dampening should be at least 0.0, but got dampening {}".
                format(dampening))
        self.dampening = dampening

        if isinstance(weight_decay, int):
            weight_decay = float(weight_decay)

        validator.check_value_type("nesterov", nesterov, [bool], self.cls_name)
        self.nesterov = nesterov

        self.opt = P.SGD(dampening, weight_decay, nesterov)

        self.momentum = Parameter(Tensor(momentum, mstype.float32),
                                  name="momentum")
        self.accum = self.parameters.clone(prefix="accum", init='zeros')
        self.stat = self.parameters.clone(prefix="stat", init='ones')
        self.hyper_map = C.HyperMap()
Example no. 17
 def __init__(self, params, config):
     super(GlobalNorm, self).__init__()
     self.norm = nn.Norm()
     self.hyper_map = C.HyperMap()
     self.config = config
     self.allreduce_filter = tuple(
         "projection.bias" not in x.name and "layernorm" not in x.name
         and "embedding_table" not in x.name for x in params)
     self.length = len(params)
     self.values = []
     self.group_size = get_group_size()
     for item in self.allreduce_filter:
         if item:
             self.values.append(1.0)
         else:
             self.values.append(self.group_size * 1.0)
     self.values = tuple(self.values)
Example no. 18
    def __init__(self, params, learning_rate, beta1=0.9, beta2=0.999, eps=1e-6, weight_decay=0.0):
        super(Lamb, self).__init__(learning_rate, params, weight_decay)
        _check_param_value(beta1, beta2, eps, self.cls_name)

        # turn them to scalar when ME supports scalar/tensor mix operations
        self.beta1 = Tensor(np.array([beta1]).astype(np.float32))
        self.beta2 = Tensor(np.array([beta2]).astype(np.float32))
        self.eps = Tensor(np.array([eps]).astype(np.float32))
        self.params = self.parameters
        self.moments1 = self.params.clone(prefix="lamb_m", init='zeros')
        self.moments2 = self.params.clone(prefix="lamb_v", init='zeros')

        if not self.dynamic_lr:
            self.global_step = Parameter(initializer(0, [1]), name='global_step')
            self.assignadd = P.AssignAdd()
        self.hyper_map = C.HyperMap()
        self.device_ascend = context.get_context("device_target") == "Ascend"
Example no. 19
    def __init__(self, network, optimizer, scale_update_cell=None, accumulation_steps=1, enable_global_norm=False):
        super(BertTrainAccumulateStepsWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.accumulation_steps = accumulation_steps
        self.enable_global_norm = enable_global_norm
        self.one = Tensor(np.array([1]).astype(np.int32))
        self.zero = Tensor(np.array([0]).astype(np.int32))
        self.local_step = Parameter(initializer(0, [1], mstype.int32), name="local_step")
        self.accu_grads = self.weights.clone(prefix="accu_grads", init='zeros')
        self.accu_overflow = Parameter(initializer(0, [1], mstype.int32), name="accu_overflow")
        self.loss = Parameter(initializer(0, [1], mstype.float32), name="accu_loss")

        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.overflow_reducer = F.identity
        if self.is_distributed:
            self.overflow_reducer = P.AllReduce()
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.logical_or = P.LogicalOr()
        self.not_equal = P.NotEqual()
        self.select = P.Select()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
Example no. 20
    def __init__(self, params, initial_accum=0.1, learning_rate=0.001, lr_power=-0.5, l1=0.0, l2=0.0,
                 use_locking=False, loss_scale=1.0, weight_decay=0.0):
        super(FTRL, self).__init__(learning_rate, params)

        _check_param(initial_accum, learning_rate, lr_power, l1, l2, use_locking, loss_scale, weight_decay,
                     self.cls_name)
        self.moments = self.parameters.clone(prefix="moments", init=initial_accum)
        self.linear = self.parameters.clone(prefix="linear", init='zeros')
        self.l1 = l1
        self.l2 = l2
        self.lr_power = lr_power
        self.reciprocal_scale = 1.0 / loss_scale
        self.weight_decay = weight_decay
        self.decay_tf = tuple((lambda:True)() for x in self.parameters)
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyFtrl(use_locking=use_locking)
        self.one = Tensor(1, mstype.int32)
Example no. 21
    def __init__(self, network, optimizer, scale_sense):
        super(DFCNNCTCTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.optimizer = optimizer

        if isinstance(scale_sense, nn.Cell):
            self.loss_scaling_manager = scale_sense
            self.scale_sense = Parameter(Tensor(scale_sense.get_loss_scale(),
                                                dtype=mstype.float32), name="scale_sense")
        elif isinstance(scale_sense, Tensor):
            if scale_sense.shape == (1,) or scale_sense.shape == ():
                self.scale_sense = Parameter(scale_sense, name='scale_sense')
            else:
                raise ValueError("The shape of scale_sense must be (1,) or (), but got {}".format(
                    scale_sense.shape))
        else:
            raise TypeError("The scale_sense must be Cell or Tensor, but got {}".format(
                type(scale_sense)))

        self.network.set_grad()
        self.weights = ParameterTuple(network.trainable_params())

        self.grad = C.GradOperation(get_by_list=True,
                                    sens_param=True)

        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode not in ParallelMode.MODE_LIST:
            raise ValueError("Parallel mode does not support: ", self.parallel_mode)
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)

        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
        self.hyper_map = C.HyperMap()
        self.less_equal = P.LessEqual()
        self.allreduce = P.AllReduce()
Example no. 22
 def __init__(self,
              network,
              optimizer,
              scale_update_cell=None,
              enable_global_norm=True,
              config=None):
     super(PANGUALPHATrainOneStepWithLossScaleCell,
           self).__init__(auto_prefix=False)
     self.network = network
     self.config = config
     self.network.add_flags(defer_inline=True)
     self.weights = optimizer.parameters
     self.optimizer = optimizer
     self.enable_global_norm = enable_global_norm
     self.grad = C.GradOperation(get_by_list=True, sens_param=True)
     self.reducer_flag = False
     self.allreduce = P.AllReduce()
     self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
     if self.parallel_mode in [
         ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
     ]:
         self.reducer_flag = True
     self.grad_reducer = F.identity
     self.degree = 1
     if self.reducer_flag:
         self.degree = get_group_size()
         self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                    False, self.degree)
     self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
     self.cast = P.Cast()
     self.alloc_status = P.NPUAllocFloatStatus()
     self.get_status = P.NPUGetFloatStatus()
     self.clear_before_grad = P.NPUClearFloatStatus()
     self.reduce_sum = P.ReduceSum(keep_dims=False)
     self.depend_parameter_use = P.ControlDepend(depend_mode=1)
     self.base = Tensor(1, mstype.float32)
     self.less_equal = P.LessEqual()
     self.hyper_map = C.HyperMap()
     self.loss_scale = None
     self.loss_scaling_manager = scale_update_cell
     if scale_update_cell:
         self.loss_scale = Parameter(Tensor(
             scale_update_cell.get_loss_scale(), dtype=mstype.float32),
             name="loss_scale")
     self.clip = ClipByGlobalNorm(self.weights, self.config, pipeline=False)
Example no. 23
 def __init__(self,
              params,
              learning_rate,
              momentum,
              weight_decay=0.0,
              loss_scale=1.0):
     super(Momentum, self).__init__(learning_rate, params, weight_decay,
                                    loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
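
As a rough end-to-end illustration of how a momentum optimizer like this is consumed by a training wrapper (a sketch assuming the standard mindspore.nn.Momentum, WithLossCell and TrainOneStepCell APIs; the net and data below are hypothetical):

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

net = nn.Dense(8, 2)  # hypothetical model
opt = nn.Momentum(net.trainable_params(), learning_rate=0.01, momentum=0.9)
train_net = nn.TrainOneStepCell(nn.WithLossCell(net, nn.MSELoss()), opt)
train_net.set_train()

data = Tensor(np.random.rand(4, 8).astype(np.float32))
label = Tensor(np.random.rand(4, 2).astype(np.float32))
loss = train_net(data, label)  # one ApplyMomentum update; returns the scalar loss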
Example no. 24
 def __init__(self,
              params,
              accum=0.1,
              learning_rate=0.001,
              l1=0.0,
              l2=0.0,
              use_locking=False,
              loss_scale=1.0,
              weight_decay=0.0):
     super(ProximalAdagrad, self).__init__(learning_rate, params,
                                           weight_decay, loss_scale)
     _check_param_value(accum, l1, l2, use_locking, self.cls_name)
     self.accum = self.parameters.clone(prefix="accum", init=accum)
     self.l1 = Tensor(l1, mstype.float32)
     self.l2 = Tensor(l2, mstype.float32)
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyProximalAdagrad(use_locking=use_locking)
     self.sparse_opt = P.FusedSparseProximalAdagrad(use_locking=use_locking)
Example no. 25
 def __init__(self,
              params,
              initial_accum=0.1,
              learning_rate=0.001,
              lr_power=-0.5,
              l1=0.0,
              l2=0.0,
              use_locking=False,
              loss_scale=1.0,
              weight_decay=0.0):
     super(FTRL, self).__init__(learning_rate,
                                params,
                                weight_decay,
                                loss_scale=loss_scale)
     if self.dynamic_lr or self.is_group_lr:
         raise ValueError(
             'Dynamic learning rate or group learning rate is currently not supported.'
         )
     _check_param(initial_accum, lr_power, l1, l2, use_locking,
                  self.cls_name)
     self.moments = self.parameters.clone(prefix="moments",
                                          init=initial_accum)
     self.linear = self.parameters.clone(prefix="linear", init='zeros')
     self.l1 = l1
     self.l2 = l2
     self.lr = learning_rate
     self.lr_power = lr_power
     if not self.is_group:
         self.decay_flags = tuple((lambda: True)() for x in self.parameters)
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyFtrl(use_locking=use_locking)
     self.use_locking = use_locking
     self.sparse_opt = P.SparseApplyFtrl(learning_rate,
                                         l1,
                                         l2,
                                         lr_power,
                                         use_locking=use_locking)
     self._ps_pull = P.Pull()
     self._ps_push = P.Push("Ftrl", [0, 1, 2])
     self._ps_push.add_prim_attr("init_accum", initial_accum)
     self._ps_push.add_prim_attr("lr", learning_rate)
     self._ps_push.add_prim_attr("l1", l1)
     self._ps_push.add_prim_attr("l2", l2)
     self._ps_push.add_prim_attr("lr_power", lr_power)
Example no. 26
    def __init__(self, network, optimizer, scale_update_cell=None):

        super(BertFinetuneCell, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_grad()
        self.weights = optimizer.parameters
        self.optimizer = optimizer
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("gradients_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.gpu_target = False
        if context.get_context("device_target") == "GPU":
            self.gpu_target = True
            self.float_status = P.FloatStatus()
            self.addn = P.AddN()
            self.reshape = P.Reshape()
        else:
            self.alloc_status = P.NPUAllocFloatStatus()
            self.get_status = P.NPUGetFloatStatus()
            self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(
                Tensor(scale_update_cell.get_loss_scale(),
                       dtype=mstype.float32))
Example no. 27
    def __init__(self,
                 params,
                 learning_rate=0.1,
                 decay=0.9,
                 momentum=0.0,
                 epsilon=1e-10,
                 use_locking=False,
                 centered=False,
                 loss_scale=1.0,
                 weight_decay=0.0,
                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in
                 x.name):
        super(RMSProp, self).__init__(learning_rate, params, weight_decay,
                                      loss_scale, decay_filter)

        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError(
                "momentum should be at least 0.0, but got momentum {}".format(
                    momentum))

        if decay < 0.0:
            raise ValueError(
                "decay should be at least 0.0, but got dampening {}".format(
                    decay))
        self.decay = decay
        self.epsilon = epsilon

        validator.check_value_type("use_locking", use_locking, [bool],
                                   self.cls_name)
        validator.check_value_type("centered", centered, [bool], self.cls_name)
        self.centered = centered
        if centered:
            self.opt = P.ApplyCenteredRMSProp(use_locking)
            self.mg = self.parameters.clone(prefix="mean_grad", init='zeros')
        else:
            self.opt = P.ApplyRMSProp(use_locking)

        self.momentum = momentum

        self.ms = self.parameters.clone(prefix="mean_square", init='zeros')
        self.moment = self.parameters.clone(prefix="moment", init='zeros')
        self.hyper_map = C.HyperMap()

        self.decay = decay
Example no. 28
    def __init__(self,
                 params,
                 learning_rate=1e-3,
                 beta1=0.9,
                 beta2=0.999,
                 eps=1e-8,
                 use_locking=False,
                 use_nesterov=False,
                 weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in
                 x.name):
        super(Adam, self).__init__(learning_rate, params, weight_decay,
                                   loss_scale, decay_filter)
        _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
        validator.check_value_type("use_locking", use_locking, [bool],
                                   self.cls_name)
        validator.check_value_type("use_nesterov", use_nesterov, [bool],
                                   self.cls_name)
        validator.check_value_type("loss_scale", loss_scale, [float],
                                   self.cls_name)
        validator.check_number_range("loss_scale", loss_scale, 1.0,
                                     float("inf"), Rel.INC_LEFT, self.cls_name)

        self.beta1 = Tensor(beta1, mstype.float32)
        self.beta2 = Tensor(beta2, mstype.float32)
        self.beta1_power = Parameter(initializer(1, [1], mstype.float32),
                                     name="beta1_power")
        self.beta2_power = Parameter(initializer(1, [1], mstype.float32),
                                     name="beta2_power")
        self.eps = eps

        self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
        self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')

        self.decay_tf = tuple(decay_filter(x) for x in self.parameters)
        self.hyper_map = C.HyperMap()
        self.opt = P.Adam(use_locking, use_nesterov)

        self.pow = P.Pow()
        self.sqrt = P.Sqrt()
        self.one = Tensor(np.array([1.0]).astype(np.float32))
        self.realdiv = P.RealDiv()
Example no. 29
    def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
                 weight_decay=0.0, loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
        super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
        Validator.check_value_type("momentum", momentum, [float], self.cls_name)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32))
        self.params = self.parameters
        self.use_nesterov = Validator.check_bool(use_nesterov)
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

        self.feature_map = [1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 196, 1.0 / 196, 1.0 / 196,
                            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                            1.0]
        self.feature_map_new = [x ** 0.5 for x in self.feature_map]
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.matmul = P.MatMul()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.assign = P.Assign()
        self.mul = P.Mul()

        mean = _get_gradients_mean()
        degree = _get_device_num()

        parameter_length = len(self.feature_map)
        self.grad_reducer_thorA = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.grad_reducer_thorG = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
        self.weight_decay = weight_decay
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.update_gradient = P.UpdateThorGradient(split_dim=128)
Example no. 30
 def __init__(self,
              parameter_length,
              split_indices,
              mean=True,
              degree=None):
     super(DistributedGradReducerThor, self).__init__(auto_prefix=False)
     self.hyper_map = C.HyperMap()
     self.mul = P.Mul()
     if degree is None:
         self.degree = get_group_size()
     else:
         if not isinstance(degree, int) or degree <= 0:
             raise ValueError(
                 "Parameter 'degree' in DistributedGradReducer should large than 0 and be int"
             )
         self.degree = degree
     self.mean = mean
     self.op_list = _init_allreduce_operators(parameter_length,
                                              split_indices)