def _clip_grad(clip_type, clip_value, grad):
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
def _clip_grad(clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    dt = ops.dtype(grad)
    new_grad = nn.ClipByNorm()(grad, ops.cast(ops.tuple_to_array((clip_value,)), dt))
    return new_grad
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
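# --- Added usage sketch (not from the source): a minimal, hedged example of
# calling the function above directly on a single gradient tensor in PyNative
# mode. It assumes the same F/C/nn imports used in these snippets; the tensor
# values are purely illustrative.
import numpy as np
from mindspore import Tensor

example_grad = Tensor(np.array([0.5, -2.0, 3.0]).astype(np.float32))
clipped_by_value = _clip_grad(0, 1.0, example_grad)  # each element limited to [-1, 1]
clipped_by_norm = _clip_grad(1, 1.0, example_grad)   # whole tensor rescaled so its L2 norm <= 1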
def __init__(self):
    super(_ClipGradients, self).__init__()
    self.clip_by_norm = nn.ClipByNorm()
    self.dtype = P.DType()
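# --- Added sketch (an assumption, not shown in the excerpt above): a hedged
# version of how such a _ClipGradients cell's construct method could apply the
# saved ops, walking the gradient tuple and clipping each tensor either by
# value (clip_type == 0) or by its L2 norm (otherwise).
def construct(self, grads, clip_type, clip_value):
    if clip_type not in (0, 1):
        return grads
    new_grads = ()
    for grad in grads:
        dt = self.dtype(grad)
        if clip_type == 0:
            t = C.clip_by_value(grad,
                                F.cast(F.tuple_to_array((-clip_value,)), dt),
                                F.cast(F.tuple_to_array((clip_value,)), dt))
        else:
            t = self.clip_by_norm(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
        new_grads = new_grads + (t,)
    return new_grads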
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.train.parallel_utils import ParallelMode
from mindspore.communication.management import get_group_size
from mindspore import context
from .bert_model import BertModel

GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0

_nn_clip_by_norm = nn.ClipByNorm()
clip_grad = C.MultitypeFuncGraph("clip_grad")


# pylint: disable=consider-using-in
@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip gradients.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (tuple[Tensor]): Gradients.

    Outputs:
        tuple[Tensor], clipped gradients.
    """
    if clip_type not in (0, 1):
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        new_grad = C.clip_by_value(grad,
                                   F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = nn.ClipByNorm()(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
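# --- Added usage sketch (not from the source): how a registered MultitypeFuncGraph
# like clip_grad is typically applied across a whole tuple of gradients inside a
# training cell, using HyperMap with the clip type/value partially bound.
# ClipGradsCell is a hypothetical name for illustration; it relies on the imports,
# constants, and clip_grad registration in the snippet above.
class ClipGradsCell(nn.Cell):
    """Applies clip_grad to every gradient tensor in a tuple."""
    def __init__(self):
        super(ClipGradsCell, self).__init__()
        self.hyper_map = C.HyperMap()

    def construct(self, grads):
        # bind (clip_type, clip_value) once, then map over every gradient tensor
        return self.hyper_map(
            F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)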
def test_clip_by_norm():
    clip_by_norm = nn.ClipByNorm()
    x = Tensor(np.array([[-2, 0, 0], [0, 3, 4]]).astype(np.float32))
    clip_norm = Tensor(np.array([1]).astype(np.float32))
    clip_by_norm(x, clip_norm)
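# --- Added follow-up sketch (not from the source): the input above has a global
# L2 norm of sqrt(4 + 9 + 16) = sqrt(29) > 1, so assuming ClipByNorm's default
# reduction over all elements, every element should be rescaled by 1 / sqrt(29).
# A hedged sanity check of that expectation (imports may duplicate the test module's):
import numpy as np
import mindspore.nn as nn
from mindspore import Tensor

def test_clip_by_norm_values():
    x_np = np.array([[-2, 0, 0], [0, 3, 4]]).astype(np.float32)
    out = nn.ClipByNorm()(Tensor(x_np), Tensor(np.array([1]).astype(np.float32)))
    expected = x_np / np.sqrt(np.sum(x_np ** 2))  # scale by 1 / sqrt(29)
    assert np.allclose(out.asnumpy(), expected, rtol=1e-4, atol=1e-4)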
def __init__(self):
    super(Network, self).__init__()
    self.norm_value = Tensor(np.array([1]).astype(np.float32))
    self.clip = nn.ClipByNorm()
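# --- Added sketch (an assumption, not shown in the excerpt above): the snippet
# only shows the constructor of Network; a matching construct method would simply
# forward the input through ClipByNorm with the stored clip value, e.g.:
def construct(self, x):
    # rescale x so that its global L2 norm does not exceed self.norm_value
    return self.clip(x, self.norm_value)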