Example #1
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertFinetuneCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        # gradient op over the weight list, with an explicit sensitivity (scaled loss) input
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        # enable gradient all-reduce when running data-parallel or hybrid-parallel
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [
                ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
        ]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                       mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.clip_gradients = ClipGradients()
        self.cast = P.Cast()
        # NPU float-status ops used to detect overflow during mixed-precision training
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        # loss scale held as a Parameter and managed by the scale update cell, if provided
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(
                scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
Example #2
    def __init__(self, network):
        super(TrainStepWrapForAdam, self).__init__()
        self.network = network
        self.weights = ParameterTuple(network.get_parameters())
        self.optimizer = AdamWeightDecay(self.weights)
        self.clip_gradients = ClipGradients()
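
Both wrappers construct a ClipGradients cell, but the example cuts off before the construct method that applies it to the gradient tuple. As a framework-free sketch of the usual idea, clipping either element-wise by value or by global L2 norm (plain NumPy; the function name and the clip_type convention here are assumptions, not taken from the snippet):

import numpy as np

def clip_gradients(grads, clip_type=1, clip_value=1.0):
    """Clip a list of gradient arrays (illustrative only)."""
    if clip_type == 0:
        # clip each element into [-clip_value, clip_value]
        return [np.clip(g, -clip_value, clip_value) for g in grads]
    # rescale all gradients so their global L2 norm does not exceed clip_value
    global_norm = np.sqrt(sum(np.sum(np.square(g)) for g in grads))
    if global_norm <= clip_value:
        return list(grads)
    return [g * (clip_value / global_norm) for g in grads]

# toy usage: the global norm of [3, 4, 0.5] is about 5.02, so everything is scaled down
clipped = clip_gradients([np.array([3.0, 4.0]), np.array([0.5])], clip_type=1, clip_value=1.0)
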
Example #3
    }),
    ('Adam_2', {
        'block':
        set_train(
            TrainStepWrapForAdam(
                GetNextSentenceOutput(BertConfig(batch_size=1)))),
        'desc_inputs': [[128, 768], [128, 2]],
        'skip': ['backward']
    }),
    ('AdamWeightDecayDynamicLR', {
        'block': set_train(TrainStepWrapForAdamDynamicLr(NetForAdam())),
        'desc_inputs': [[1, 64]],
        'skip': ['backward']
    }),
    ('ClipGradients', {
        'block': TempC2Wrap(ClipGradients(), 1, 1.0),
        'desc_inputs': [tuple(convert(shp) for shp in [[1], [1], [1]])],
        'skip': ['backward', 'exec']
    }),
]
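
Each entry in the table above pairs a named block with its input shapes and an optional 'skip' list; the filters below drop entries whose skip list names the current test phase. A toy, self-contained illustration of that selection logic (the case names here are made up):

# hypothetical cases, mirroring the ('name', {...}) structure used above
cases = [
    ('CaseA', {'desc_inputs': [[1]]}),
    ('CaseB', {'desc_inputs': [[1]], 'skip': ['backward']}),
    ('CaseC', {'desc_inputs': [[1]], 'skip': ['backward', 'exec']}),
]
exec_cases = [c for c in cases if 'skip' not in c[1] or 'exec' not in c[1]['skip']]
# -> CaseA and CaseB remain; CaseC is excluded from the exec tests
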

test_case = functools.reduce(lambda x, y: x + y, [test_case_cell_ops])
# use -k to select certain test cases
# pytest  tests/python/ops/test_ops.py::test_backward -k LayerNorm

test_exec_case = filter(
    lambda x: 'skip' not in x[1] or 'exec' not in x[1]['skip'], test_case)
test_backward_exec_case = filter(
    lambda x: 'skip' not in x[1] or 'backward' not in x[1]['skip'] and
    'backward_exec' not in x[1]['skip'], test_case)
test_check_gradient_case = filter(