Example #1
    def update(self):
        """
        Update the Exponential Moving Average. This method should only be
        called in the train program.
        """
        param_master_emas = []
        for param, tmp in self._params_tmps:
            with param.block.program._optimized_guard(
                [param, tmp]), name_scope('moving_average'):
                param_ema = self._ema_vars[param.name]
                if param.name + '.master' in self._ema_vars:
                    master_ema = self._ema_vars[param.name + '.master']
                    param_master_emas.append([param_ema, master_ema])
                else:
                    ema_t = param_ema * self._decay_var + param * (
                        1 - self._decay_var)
                    layers.assign(input=ema_t, output=param_ema)

        # for fp16 params: cast the fp32 master EMA back into the fp16 EMA variable
        for param_ema, master_ema in param_master_emas:
            default_main_program().global_block().append_op(
                type="cast",
                inputs={"X": master_ema},
                outputs={"Out": param_ema},
                attrs={
                    "in_dtype": master_ema.dtype,
                    "out_dtype": param_ema.dtype
                })
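
The loop above implements the standard EMA recurrence ema_t = decay * ema_{t-1} + (1 - decay) * param_t as graph ops, and handles fp16 parameters through their fp32 master copies. A minimal NumPy sketch of the same recurrence, outside of Paddle (the names here are illustrative):

import numpy as np

def ema_step(ema, param, decay=0.999):
    # keep `decay` of the running average and mix in `1 - decay` of the new value
    return decay * ema + (1.0 - decay) * param

ema = np.zeros(3)
for _ in range(100):
    param = np.random.randn(3)   # stands in for the freshly updated parameter
    ema = ema_step(ema, param)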
Example #2
    def apply_gradients(self, params_grads):
        # flatten the (param, grad) pairs so a single _optimized_guard covers them all
        flattened = []
        for p, g in params_grads:
            flattened.extend([p, g])
        with flattened[0].block.program._optimized_guard(
                flattened), name_scope("optimizer"):
            self._apply_gradients_impl(params_grads)
Example #3
    def _append_decoupled_weight_decay(self, block, param_and_grad):
        """
        Add decoupled weight decay op.
            parameter = parameter - parameter * coeff * lr
        Args:
            block: the block in which the variable is to be created
            param_and_grad: a (parameter, gradient) pair whose parameter
                should be decayed.
        Raises:
            Exception: if the types of coeff and parameter are not consistent.
        """
        if isinstance(param_and_grad, dict):
            param_and_grad = self._update_param_group(param_and_grad)
        param, grad = param_and_grad

        if self._apply_decay_param_fun is not None \
                and not self._apply_decay_param_fun(param.name):
            return

        if isinstance(self._learning_rate, float):
            learning_rate = self._learning_rate
        else:
            # NOTE: this function is called from _append_optimize_op(), because
            # _create_param_lr() must be called after
            # optimizer._create_global_learning_rate().
            learning_rate = self._create_param_lr(param_and_grad)

        with block.program._optimized_guard(
            [param, grad]), framework.name_scope('weight decay'):
            self._params_name.add(param.name)

            # If the coefficient has already been computed for this learning
            # rate, reuse the cached result.
            # NOTE(wangxi): in dygraph mode, apply_gradient runs every step,
            # so _lr_to_coeff must be cleared every step; this is done in
            # _create_optimization_pass.
            decay_coeff = self._lr_to_coeff.get(learning_rate, None)
            if decay_coeff is None:
                # NOTE(wangxi): for pipeline to set device:all
                with paddle.static.device_guard(None):
                    decay_coeff = 1.0 - learning_rate * self._coeff
                self._lr_to_coeff[learning_rate] = decay_coeff

            find_master = (self._multi_precision and
                           param.dtype == core.VarDesc.VarType.FP16)
            if find_master:
                master_weight = self._master_weights[param.name]
                scaled_param = master_weight * decay_coeff
                paddle.fluid.layers.assign(
                    input=scaled_param, output=master_weight)
            else:
                scaled_param = param * decay_coeff
                paddle.fluid.layers.assign(input=scaled_param, output=param)
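
Algebraically, the guarded block above multiplies the parameter (or its fp32 master weight) by (1 - lr * coeff) before the gradient step, i.e. the decoupled weight decay used by AdamW. A hedged NumPy sketch of that scaling, detached from the Paddle op graph (names are illustrative):

import numpy as np

def decoupled_weight_decay(param, lr, coeff):
    # parameter = parameter - parameter * coeff * lr  ==  parameter * (1 - lr * coeff)
    return param * (1.0 - lr * coeff)

param = np.random.randn(4).astype(np.float32)
param = decoupled_weight_decay(param, lr=1e-3, coeff=0.01)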
Example #4
    def __init__(self,
                 decay=0.999,
                 thres_steps=None,
                 zero_debias=False,
                 name=None):
        self._decay = decay
        self._thres_steps = thres_steps
        self._name = name if name is not None else ''
        self._decay_var = self._get_ema_decay()

        self._params_tmps = []
        for param in default_main_program().global_block().all_parameters():
            # only track parameters that have not opted out of model averaging
            if param.do_model_average != False:
                tmp = param.block.create_var(
                    name=unique_name.generate(".".join(
                        [self._name + param.name, 'ema_tmp'])),
                    dtype=param.dtype,
                    persistable=False,
                    stop_gradient=True)
                self._params_tmps.append((param, tmp))

        self._ema_vars = {}
        for param, tmp in self._params_tmps:
            with param.block.program._optimized_guard(
                [param, tmp]), name_scope('moving_average'):
                self._ema_vars[param.name] = self._create_ema_vars(param)

        # program that writes the EMA values into the parameters
        # (the original parameter values are saved to the tmp variables)
        self.apply_program = Program()
        block = self.apply_program.global_block()
        with program_guard(main_program=self.apply_program):
            decay_pow = self._get_decay_pow(block)
            for param, tmp in self._params_tmps:
                param = block._clone_variable(param)
                tmp = block._clone_variable(tmp)
                ema = block._clone_variable(self._ema_vars[param.name])
                layers.assign(input=param, output=tmp)
                # bias correction
                if zero_debias:
                    ema = ema / (1.0 - decay_pow)
                layers.assign(input=ema, output=param)

        # program that restores the original parameter values from the tmp variables
        self.restore_program = Program()
        block = self.restore_program.global_block()
        with program_guard(main_program=self.restore_program):
            for param, tmp in self._params_tmps:
                tmp = block._clone_variable(tmp)
                param = block._clone_variable(param)
                layers.assign(input=tmp, output=param)
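
When zero_debias is set, the apply program divides the EMA by (1 - decay**t), which removes the bias toward the zero initialization during the first steps. A small standalone sketch of that correction (assumed symbols, not the Paddle graph):

def debiased_ema(ema, decay, step):
    # undo the shrinkage caused by initializing the EMA at zero
    return ema / (1.0 - decay ** step)

# after 10 steps with decay=0.999 the raw EMA is scaled up by roughly 100x
print(debiased_ema(0.05, decay=0.999, step=10))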
Example #5
    def _create_param_lr(self, param_and_grad):
        """
        Create the learning rate variable for a parameter.
        """
        # create learning rate variable for every parameter
        param = param_and_grad[0]
        param_lr = param.optimize_attr['learning_rate']
        if type(param_lr) == Variable:
            return param_lr
        else:
            if param_lr == 1.0:
                return self._global_learning_rate()
            else:
                with fluid.default_main_program()._lr_schedule_guard(
                        is_with_opt=True), framework.name_scope(
                            'scale_with_param_lr'):
                    return self._global_learning_rate() * param_lr
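
Stripped of the guards, the method returns the global learning rate unchanged when the per-parameter multiplier is 1.0 and scales it otherwise. A plain-Python sketch of that selection logic (illustrative only, not the Paddle API):

def effective_lr(global_lr, param_lr=1.0):
    # param_lr == 1.0 means "use the global learning rate as-is";
    # any other multiplier scales it for this parameter only
    if param_lr == 1.0:
        return global_lr
    return global_lr * param_lr

assert effective_lr(0.1) == 0.1
assert abs(effective_lr(0.1, 2.0) - 0.2) < 1e-12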
Example #6
    def minimize(self,
                 loss,
                 startup_program=None,
                 parameter_list=None,
                 no_grad_set=None):
        params_grads = self.backward(loss=loss,
                                     startup_program=startup_program,
                                     parameter_list=parameter_list,
                                     no_grad_set=no_grad_set)
        scaled_params = self._scale_parameters(params_grads)
        for param, grad, scaled_param in scaled_params:
            with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
                updated_param = paddle.fluid.layers.elementwise_sub(
                    x=param, y=scaled_param)
                paddle.fluid.layers.assign(input=updated_param, output=param)

        optimize_ops = self.apply_optimize(loss=loss,
                                           params_grads=params_grads,
                                           startup_program=startup_program)
        return optimize_ops, params_grads
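
The order of operations above is: run backward, subtract the scaled parameter (weight decay), then apply the regular optimizer update. A NumPy sketch of that two-step update, with plain SGD standing in for apply_optimize (names are illustrative):

import numpy as np

def decay_then_step(param, grad, lr=0.01, coeff=1e-4):
    param = param - param * coeff   # weight decay, applied independently of the gradient
    param = param - lr * grad       # plain SGD stands in for apply_optimize
    return param

param = np.ones(3, dtype=np.float32)
grad = np.full(3, 0.5, dtype=np.float32)
param = decay_then_step(param, grad)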
Example #7
    def _scale_parameters(self, params_and_grads):
        """
        Adds weight decay ops.
            scaled_parameter = parameter * coeff

        Args:
            params_and_grads: a list of (parameter, gradient) pairs whose
                parameters should be decayed.
        Raises:
            Exception: if the types of coeff and parameter are not consistent.
        """
        if isinstance(self._coeff, float) and self._coeff == 0.0:
            # return an empty list so callers can safely iterate over the result
            return []

        scaled_params = []
        for param, grad in params_and_grads:
            # If no gradient then we don't need to do anything
            if grad is None:
                continue
            if self._apply_decay_param_fun is not None \
                    and not self._apply_decay_param_fun(param.name):
                continue

            if isinstance(self._coeff, float):
                # a Python float coeff is assumed to be consistent only with FP32 parameters
                assert param.dtype == paddle.fluid.core.VarDesc.VarType.FP32, \
                    "the type of coeff(float) and parameter(%s) is not consistent." % param.dtype
            else:
                assert self._coeff.dtype == param.dtype, \
                    "the type of coeff(%s) and parameter(%s) is not consistent." % (self._coeff.dtype, param.dtype)

            with param.block.program._optimized_guard(
                [param, grad]), framework.name_scope('weight decay'):
                assert param.name not in self._params_name
                scaled_params.append((param, grad, param * self._coeff))
                self._params_name.add(param.name)
        return scaled_params
Example #8
    def fp16_compression(param_and_grads):
        """
        Compress fp32 gradients to fp16 during allreduce.
        """
        op_maker = core.op_proto_and_checker_maker

        new_param_and_grads = []  # param, grad, is_cast
        # cast grad from fp32->fp16 before allreduce,
        for param, grad in param_and_grads:
            if grad is None or grad.dtype != core.VarDesc.VarType.FP32:
                new_param_and_grads.append((param, grad, False))
                continue

            op = grad.op
            block = grad.block
            var_attr = op.all_attrs()[op_maker.kOpRoleVarAttrName()]
            if param.name not in var_attr:
                new_param_and_grads.append((param, grad, False))
                continue

            # remove (param, grad) from op_role_var
            var_attr.remove(param.name)
            var_attr.remove(grad.name)
            if len(var_attr) > 1:
                op._set_attr(op_maker.kOpRoleVarAttrName(), var_attr)
            else:
                op._remove_attr(op_maker.kOpRoleVarAttrName())

            new_grad = block.create_var(
                name=unique_name.generate(grad.name + ".cast_fp16"),
                dtype=core.VarDesc.VarType.FP16,
                persistable=False,
                stop_gradient=True)

            with block.program._backward_role_guard():
                cast_op = block.append_op(type="cast",
                                          inputs={"X": grad},
                                          outputs={"Out": new_grad},
                                          attrs={
                                              "in_dtype":
                                              core.VarDesc.VarType.FP32,
                                              "out_dtype":
                                              core.VarDesc.VarType.FP16
                                          },
                                          stop_gradient=True)

                backward = op_maker.OpRole.Backward
                cast_op._set_attr(op_maker.kOpRoleAttrName(), backward)
                cast_op._set_attr(op_maker.kOpRoleVarAttrName(),
                                  [param.name, new_grad.name])
                new_grad.op = cast_op

            new_param_and_grads.append((param, new_grad, True))

        ret_param_and_grads = []
        # cast grad from fp16 -> fp32 after allreduce.
        # NOTE: fp16 compression is split into two for-loops; if they are not
        # separated, fused allreduce produces wrong results. This is likely a
        # problem in the fuse-allreduce pass and needs to be fixed in the future.
        for param, grad, cast in new_param_and_grads:
            if not cast:
                ret_param_and_grads.append((param, grad))
                continue

            block = grad.block
            new_grad = block.create_var(
                name=unique_name.generate(grad.name + ".cast_fp32"),
                dtype=core.VarDesc.VarType.FP32,
                persistable=False,
                stop_gradient=True)

            with block.program._optimized_guard(
                [param, grad]), framework.name_scope('fp16_allreduce'):
                cast_op = block.append_op(type="cast",
                                          inputs={"X": grad},
                                          outputs={"Out": new_grad},
                                          attrs={
                                              "in_dtype":
                                              core.VarDesc.VarType.FP16,
                                              "out_dtype":
                                              core.VarDesc.VarType.FP32
                                          },
                                          stop_gradient=True)
            ret_param_and_grads.append((param, new_grad))

        return ret_param_and_grads
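
The two loops above halve the allreduce payload by casting gradients to fp16 before communication and back to fp32 afterwards. A NumPy sketch of the round trip and the rounding error it introduces (no communication involved; names are illustrative):

import numpy as np

def fp16_roundtrip(grad_fp32):
    grad_fp16 = grad_fp32.astype(np.float16)   # what would travel over the wire
    return grad_fp16.astype(np.float32)        # restored after the allreduce

grad = (np.random.randn(1024) * 1e-3).astype(np.float32)
restored = fp16_roundtrip(grad)
print("max abs rounding error:", np.abs(grad - restored).max())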
Example #9
    def _create_optimization_pass(self, parameters_and_grads):
        """Add optimization operators to update gradients to tensors.

        Args:
          parameters_and_grads(list(tuple(Tensor, Tensor))):
            a list of (tensor, gradient) pair to update.

        Returns:
          return_op_list: a list of operators that will complete one step of
            optimization. This will include parameter update ops, global step
            update ops and any other custom ops required by subclasses to manage
            their internal state.
        """
        # This is a default implementation of create_optimization_pass that
        # can be shared by most optimizers. This implementation assumes that
        # the subclass will implement the _append_optimize_op method and the
        #  _initialize_tensors method. The subclass can extend the
        # _create_accumulators method if it needs to create accumulators
        # for parameters and extend _finish_update method to add custom ops.

        # Always called under program_guard, using the global block as the loss
        # block. But if the current block is inside control flow, append the
        # optimize op in the grad block of the current block.

        global_block = framework.default_main_program().global_block()
        target_block = global_block
        current_block = framework.default_main_program().current_block()
        if current_block.idx != global_block.idx:
            assert current_block.backward_block_idx != -1, \
                "current block is not global_block, but it doesn't have backward block."
            target_block = framework.default_main_program().blocks[
                current_block.backward_block_idx]

        start = len(target_block.ops)
        self.helper = LayerHelper(self.__class__.__name__)
        self._update_param_device_map(parameters_and_grads, target_block)
        self._create_accumulators(
            target_block,
            [p[0] for p in parameters_and_grads if p[0].trainable])
        self._create_global_learning_rate()

        if framework.in_dygraph_mode():
            for param_and_grad in parameters_and_grads:
                if param_and_grad[1] is None:
                    continue
                if param_and_grad[0].trainable is True:
                    self._append_optimize_op(target_block, param_and_grad)
        else:
            for param_and_grad in parameters_and_grads:
                if param_and_grad[1] is None:
                    continue
                with param_and_grad[0].block.program._optimized_guard(
                        param_and_grad), name_scope("optimizer"):
                    if param_and_grad[0].trainable is True:
                        device = self._get_device_for_param(param_and_grad[0]
                                                            .name)
                        with device_guard(device):
                            optimize_op = self._append_optimize_op(
                                target_block, param_and_grad)

        # Get custom finish ops for subclasses
        # FIXME: Need to fix this once we figure out how to handle dependencies
        self._finish_update(target_block, parameters_and_grads)

        end = len(target_block.ops)
        return target_block._slice_ops(start, end)
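
Without Paddle's block and device machinery, the pass reduces to: iterate over the (param, grad) pairs, skip missing gradients and frozen parameters, append one update op per parameter, then run the subclass finish hook. A hedged plain-Python skeleton of that control flow (the callables are illustrative stand-ins):

def create_optimization_pass(params_and_grads, append_optimize_op, finish_update):
    ops = []
    for param, grad in params_and_grads:
        if grad is None:                           # parameter received no gradient
            continue
        if not getattr(param, "trainable", True):  # frozen parameter, skip the update
            continue
        ops.append(append_optimize_op(param, grad))
    finish_update(params_and_grads)                # hook for custom bookkeeping ops
    return ops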