def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, decay_filter=lambda x: x.name not in []):
    """Build the THOR optimizer's state, operators and distributed reducers.

    Args:
        params: Forwarded to the base optimizer constructor.
        learning_rate: Forwarded to the base optimizer constructor.
        momentum: Wrapped in a float32 ``Parameter``; a negative float is rejected.
        matrix_A, matrix_G, A_inv_max, G_inv_max: Each is wrapped in a
            ``ParameterTuple`` and stored under the same name.
        weight_decay: Forwarded to the base constructor; ``self.weight_decay``
            is then overwritten with ``weight_decay * loss_scale``.
        loss_scale: Forwarded to the base constructor.
        decay_filter: Predicate called once per parameter to build
            ``self.decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a float below 0.0.
    """
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))

    # Momentum state.
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()

    # Second-order statistics supplied by the caller.
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)

    # Custom matmul kernels and generic tensor operators.
    self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
    self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
    self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
    self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()

    # Positions of parameters whose name contains "conv" or "end_point",
    # followed by the total parameter count as a closing entry.
    selected = [
        position
        for position, param in enumerate(self.params)
        if "conv" in param.name or "end_point" in param.name
    ]
    selected.append(len(self.params))
    self.weight_idx = selected

    # 54 per-layer scale factors, written as runs of repeated values.
    self.feature_map = (
        [1.0 / 12544]
        + [1.0 / 3136] * 11
        + [1.0 / 784] * 13
        + [1.0 / 196] * 19
        + [1.0 / 49] * 9
        + [1.0]
    )

    mean = _get_gradients_mean()
    degree = _get_device_num()
    parameter_length = len(self.feature_map)

    def _make_reducer(tag):
        # All four reducers differ only in the second element of the split spec.
        return DistributedGradReducerThor(parameter_length, ((27,), tag), mean, degree)

    self.grad_reducer_Amax = _make_reducer(2)
    self.grad_reducer_Gmax = _make_reducer(4)
    self.grad_reducer_A = _make_reducer(6)
    self.grad_reducer_G = _make_reducer(8)

    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    # One non-trainable scalar parameter per entry, named matrix_max0..matrix_max53.
    self.matrix_max_inv = tuple(
        Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(index), requires_grad=False)
        for index in range(54)
    )
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(map(decay_filter, self.parameters))
def __init__(self, network, optimizer, sens=1.0):
    """Wrap ``network`` and ``optimizer`` into a single training-step cell.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        sens: Stored unchanged as ``self.sens``.
    """
    super(TrainOneStepCellWithGradClip, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    # Operators kept on the cell for use outside this constructor.
    self.hyper_map = C.HyperMap()
    self.greater = P.Greater()
    self.select = P.Select()
    self.norm = nn.Norm(keep_dims=True)
    self.dtype = P.DType()
    self.cast = P.Cast()
    self.concat = P.Concat(axis=0)
    self.ten = Tensor(np.array([10.0]).astype(np.float32))

    # A gradient reducer is only created for data/hybrid parallel runs.
    distributed_modes = (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL)
    self.reducer_flag = _get_parallel_mode() in distributed_modes
    self.grad_reducer = None
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, grads_mean, device_count)
def __init__(self, network, optimizer, sens=1.0):
    """Wrap ``network`` and ``optimizer`` into a training-step cell.

    Besides the usual gradient and reducer members, the constructor creates
    ``_TupleAdd``/``_TupleMul`` helpers and initialises the privacy-related
    fields to their "disabled" values.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        sens: Stored unchanged as ``self.sens``.
    """
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    self._tuple_add = _TupleAdd()
    self._tuple_mul = _TupleMul()

    # Reducer exists only when running data- or hybrid-parallel.
    self.reducer_flag = _get_parallel_mode() in (ParallelMode.DATA_PARALLEL,
                                                 ParallelMode.HYBRID_PARALLEL)
    self.grad_reducer = None
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, grads_mean, device_count)

    self.do_privacy = False
    self.grad_mask_tup = ()  # tuple containing grad_mask(cell)
    self.de_weight_tup = ()  # tuple containing de_weight(cell)
    self._suppress_pri_ctrl = None
def __init__(self, network, total_steps=1, sens=16384.0):
    """Create a training-step wrapper that owns its own Adam optimizer.

    Args:
        network: Network to train; ``set_train()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        total_steps: Passed as the second argument of ``dynamic_lr``.
        sens: Stored as ``self.sens`` and used as the optimizer's ``loss_scale``.
    """
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    network.set_train()
    network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())

    schedule = dynamic_lr(0.01, total_steps, 5000)
    self.optimizer = nn.Adam(
        self.weights,
        learning_rate=schedule,
        beta1=0.9,
        beta2=0.999,
        eps=1e-8,
        loss_scale=sens,
    )
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    # Reducer exists only when running data- or hybrid-parallel.
    self.reducer_flag = _get_parallel_mode() in (ParallelMode.DATA_PARALLEL,
                                                 ParallelMode.HYBRID_PARALLEL)
    self.grad_reducer = None
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, grads_mean, device_count)
def __init__(self, network, optimizer, scale_update_cell=None):
    """Set up a loss-scaled training-step cell.

    Args:
        network: Network to train; ``add_flags(defer_inline=True)`` is called
            on it and its ``trainable_params()`` become ``self.weights``.
        optimizer: Stored as ``self.optimizer``; its ``parameters`` feed the
            distributed gradient reducer when one is created.
        scale_update_cell: Stored as ``self.loss_scaling_manager``. When truthy,
            its ``get_loss_scale()`` value initialises the ``loss_scale``
            parameter; otherwise ``self.loss_scale`` stays ``None``.
    """
    super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()

    # Float-status operators and helpers stored for use outside this constructor.
    self.alloc_status = NPUAllocFloatStatus()
    self.get_status = NPUGetFloatStatus()
    self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()

    # Query the parallel mode once and reuse it for every decision below
    # (the mode used to be fetched a second time into a shadowing local).
    self.parallel_mode = _get_parallel_mode()
    self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL)
    self.grad_reducer = None
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
def __init__(self, network, optimizer, sens=1.0):
    """Wrap ``network`` and ``optimizer`` with optional gradient accumulation.

    Args:
        network: Network to train; ``set_grad()`` is called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        sens: Stored unchanged as ``self.sens``.

    ``self.mean``/``self.degree`` are only set when a distributed reducer is
    created. ``self.max_accumulation_step`` is only set in data-parallel or
    stand-alone mode, and ``self.grad_accumulation`` only when the configured
    step count is greater than 1.
    """
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    self.grad_reducer = F.identity
    self.parallel_mode = _get_parallel_mode()
    self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        self.mean = _get_gradients_mean()
        self.degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.weights, self.mean, self.degree)

    # Accumulation is considered only for data-parallel / stand-alone runs,
    # and disabled again when the configured step count is 1 or less.
    self.use_grad_accumulation = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                                        ParallelMode.STAND_ALONE)
    if self.use_grad_accumulation:
        self.max_accumulation_step = get_auto_parallel_context("grad_accumulation_step")
        if self.max_accumulation_step <= 1:
            self.max_accumulation_step = 1
            self.use_grad_accumulation = False
    if self.use_grad_accumulation:
        self.grad_accumulation = GradientAccumulation(self.max_accumulation_step, self.optimizer)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0, loss_scale=1.0,
             num_hidden_layers=24, batch_size=12, damping=0.03,
             decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
    """Build the THOR optimizer's state, operators and gradient reducer.

    Args:
        params: Forwarded to the base optimizer constructor.
        learning_rate: Forwarded to the base optimizer constructor.
        momentum: Wrapped in a float32 ``Parameter`` named "momentum"; a
            negative float is rejected.
        matrix_A, matrix_G: Each is wrapped in a ``ParameterTuple``.
        weight_decay: Forwarded to the base constructor; ``self.weight_decay``
            is then overwritten with ``weight_decay * loss_scale``.
        loss_scale: Forwarded to the base constructor.
        num_hidden_layers, batch_size, damping: Stored unchanged on the instance.
        decay_filter: Predicate called once per parameter to build
            ``self.decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a float below 0.0.
    """
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    negative_momentum = isinstance(momentum, float) and momentum < 0.0
    if negative_momentum:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))

    # Momentum state.
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()

    # Second-order statistics supplied by the caller.
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)

    # Tensor operators.
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()

    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.num_hidden_layers = num_hidden_layers
    self.sqrt = P.Sqrt()
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(map(decay_filter, self.parameters))
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.one = Tensor(1, mstype.int32)
    # Non-trainable int32 counter, initialised to 0.
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)

    grads_mean = _get_gradients_mean()
    device_count = _get_device_num()
    self.grad_reducer_g = DistributedGradReducerThor(self.parameters, 3, grads_mean, device_count)
def __init__(self, network, optimizer, scale_update_cell=None):
    """Set up a loss-scaled training-step cell with gradient clipping.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        scale_update_cell: Stored as ``self.loss_scaling_manager``. When truthy,
            its ``get_loss_scale()`` value initialises ``self.loss_scale``;
            otherwise ``self.loss_scale`` stays ``None``.

    Raises:
        ValueError: If the current parallel mode is not in
            ``ParallelMode.MODE_LIST``.
    """
    super(TransformerTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.all_reduce = P.AllReduce()

    self.parallel_mode = _get_parallel_mode()
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        # Format the mode into the message; passing it as a second positional
        # argument made the rendered error text a tuple repr.
        raise ValueError("Parallel mode does not support: {}".format(self.parallel_mode))
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL]
    self.grad_reducer = None
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)

    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()

    # The float-status operators differ between the GPU target and others.
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_status = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()

    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32))
    self.add_flags(has_effect=True)
def __init__(self, network, optimizer, sens=1.0):
    """Wrap ``network`` and ``optimizer``, with freeze-optimizer support.

    Args:
        network: Network to train; ``set_grad()`` is called on it.
        optimizer: Stored as ``self.optimizer``. When it is an
            ``acc.FreezeOpt``, one ``_TrainFreezeCell`` is built per entry of
            ``optimizer.opts``; otherwise ``optimizer.parameters`` become
            ``self.weights``.
        sens: Stored unchanged as ``self.sens``.
    """
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.freeze = isinstance(optimizer, acc.FreezeOpt)
    self.optimizer = optimizer
    if not self.freeze:
        self.weights = self.optimizer.parameters
    self.train_strategy = getattr(self.optimizer, 'train_strategy', None)
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.grad_reducer = F.identity
    self.parallel_mode = _get_parallel_mode()
    self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL)
    self.use_grad_accumulation = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                                        ParallelMode.STAND_ALONE)

    # Always define the step count: the freeze cells below read it even when
    # accumulation is not applicable to the current parallel mode. 1 is the
    # same value the code settles on whenever accumulation is disabled.
    self.max_accumulation_step = 1
    if self.use_grad_accumulation:
        self.max_accumulation_step = get_auto_parallel_context("grad_accumulation_step")
        if self.max_accumulation_step <= 1:
            self.max_accumulation_step = 1
            self.use_grad_accumulation = False
    self.grad_accumulation = None
    if self.use_grad_accumulation:
        self.grad_accumulation = GradientAccumulation(self.max_accumulation_step, self.optimizer)

    if self.reducer_flag:
        self.mean = _get_gradients_mean()
        self.degree = _get_device_num()
        if self.freeze:
            # Materialise the reducers: a generator stored on the instance
            # would be exhausted after the zip below.
            self.grad_reducers = tuple(
                DistributedGradReducer(opt.parameters, self.mean, self.degree)
                for opt in self.optimizer.opts)
            # Pair each reducer with its sub-optimizer from ``opts`` (the same
            # sequence the reducers were built from), not the FreezeOpt itself.
            self.freeze_nets = tuple(
                _TrainFreezeCell(self.network, self.sens, self.grad, reducer,
                                 self.use_grad_accumulation, self.max_accumulation_step, opt)
                for reducer, opt in zip(self.grad_reducers, self.optimizer.opts))
        else:
            self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, self.mean, self.degree)
    elif self.freeze:
        # Non-distributed freeze path: every cell shares the identity reducer.
        self.freeze_nets = tuple(
            _TrainFreezeCell(self.network, self.sens, self.grad, self.grad_reducer,
                             self.use_grad_accumulation, self.max_accumulation_step, opt)
            for opt in self.optimizer.opts)
    self.step = Parameter(Tensor(0, dtype=mstype.int32))
def __init__(self, network, optimizer, scale_sense=None):
    """Set up a loss-scaled training-step cell.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        scale_sense: When a ``Cell``, it is kept as
            ``self.loss_scaling_manager`` and its ``get_loss_scale()`` value
            seeds the ``scale_sense`` parameter. When a ``Tensor``, it is
            wrapped in that parameter directly. Any other value leaves both
            attributes ``None``.
    """
    super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()

    # The float-status operators differ between the GPU target and others.
    on_gpu = context.get_context("device_target") == "GPU"
    if on_gpu:
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.depend_parameter_use = ControlDepend(depend_mode=1)
    self.allreduce = P.AllReduce()

    # Identity reducer unless running data- or hybrid-parallel.
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL]
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, grads_mean, device_count)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

    self.scale_sense = None
    self.loss_scaling_manager = None
    if isinstance(scale_sense, Cell):
        self.loss_scaling_manager = scale_sense
        initial_scale = Tensor(scale_sense.get_loss_scale(), dtype=mstype.float32)
        self.scale_sense = Parameter(initial_scale, name="scale_sense")
    if isinstance(scale_sense, Tensor):
        self.scale_sense = Parameter(scale_sense, name='scale_sense')
def __init__(self, network, optimizer, sens=1.0):
    """Wrap ``network`` and ``optimizer`` into a single training-step cell.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        sens: Stored unchanged as ``self.sens``.
    """
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    # Identity reducer unless running data- or hybrid-parallel.
    self.grad_reducer = F.identity
    self.parallel_mode = _get_parallel_mode()
    self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.weights, grads_mean, device_count)
def __init__(self, network, lr, eps, loss_scale=1000.0):
    """Create a training-step wrapper that owns its own Adam optimizer.

    Args:
        network: Network to train; ``set_train()`` is called on it and its
            ``trainable_params()`` become ``self.weights``.
        lr: Passed to ``Adam`` as ``learning_rate``.
        eps: Passed to ``Adam`` as ``eps``.
        loss_scale: Passed to ``Adam`` as ``loss_scale`` and stored as
            ``self.sens``.
    """
    super(TrainStepWrap, self).__init__(auto_prefix=False)
    self.network = network
    network.set_train()
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = Adam(
        self.weights,
        learning_rate=lr,
        eps=eps,
        loss_scale=loss_scale,
    )
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = loss_scale

    # Reducer exists only when running data- or hybrid-parallel.
    self.reducer_flag = _get_parallel_mode() in (ParallelMode.DATA_PARALLEL,
                                                 ParallelMode.HYBRID_PARALLEL)
    self.grad_reducer = None
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(self.optimizer.parameters, grads_mean, device_count)
def __init__(self, network, optimizer, sens=1.0):
    """Wrap ``network`` and ``optimizer`` into a single training-step cell.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        sens: Stored unchanged as ``self.sens``.
    """
    super(NASNetAMobileTrainOneStepWithClipGradient, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    self.sens = sens

    # Reducer exists only when running data- or hybrid-parallel.
    self.reducer_flag = _get_parallel_mode() in (ParallelMode.DATA_PARALLEL,
                                                 ParallelMode.HYBRID_PARALLEL)
    self.grad_reducer = None
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, grads_mean, device_count)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0, use_nesterov=False,
             decay_filter=lambda x: x.name not in []):
    """Build the GPU THOR optimizer's state, operators and reducers.

    Args:
        params: Forwarded to the base optimizer constructor.
        learning_rate: Forwarded to the base optimizer constructor.
        momentum: Type-checked against ``float`` via ``Validator``, then
            wrapped in a float32 ``Parameter``; a negative float is rejected.
        matrix_A, matrix_G, A_inv_max, G_inv_max: Each is wrapped in a
            ``ParameterTuple`` and stored under the same name.
        weight_decay: Forwarded to the base constructor and stored unchanged.
        loss_scale: Forwarded to the base constructor.
        use_nesterov: Passed through ``Validator.check_bool`` and on to
            ``P.ApplyMomentum``.
        decay_filter: Predicate called once per parameter to build
            ``self.decay_flags``.

    Raises:
        ValueError: If ``momentum`` is a float below 0.0.
    """
    super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
    Validator.check_value_type("momentum", momentum, [float], self.cls_name)
    negative_momentum = isinstance(momentum, float) and momentum < 0.0
    if negative_momentum:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))

    # Momentum state.
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.use_nesterov = Validator.check_bool(use_nesterov)
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

    # 54 per-layer scale factors, written as runs of repeated values,
    # plus their square roots.
    self.feature_map = (
        [1.0 / 12544]
        + [1.0 / 3136] * 11
        + [1.0 / 784] * 13
        + [1.0 / 196] * 19
        + [1.0 / 49] * 9
        + [1.0]
    )
    self.feature_map_new = [scale ** 0.5 for scale in self.feature_map]

    # Tensor operators and caller-supplied second-order statistics.
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.matmul = P.MatMul()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.assign = P.Assign()
    self.mul = P.Mul()

    grads_mean = _get_gradients_mean()
    device_count = _get_device_num()
    parameter_length = len(self.feature_map)
    # Both reducers are built from the same arguments.
    self.grad_reducer_thorA = DistributedGradReducerThor(
        parameter_length, ((parameter_length,), 0), grads_mean, device_count)
    self.grad_reducer_thorG = DistributedGradReducerThor(
        parameter_length, ((parameter_length,), 0), grads_mean, device_count)

    self.weight_decay = weight_decay
    self.decay_flags = tuple(map(decay_filter, self.parameters))
    self.update_gradient = P.UpdateThorGradient(split_dim=128)
def __init__(self, network, sens=1000.0):
    """Create a training-step wrapper with two optimizers over a split parameter set.

    Trainable parameters whose name contains 'wide' go to an ``FTRL``
    optimizer; all others go to an ``Adam`` optimizer.

    Args:
        network: Network to train; ``set_train()`` is called on it, and it is
            wrapped twice by ``IthOutputCell`` (output indices 0 and 1).
        sens: Stored as ``self.sens`` and used as the Adam ``loss_scale``.
    """
    super(TrainStepWrap, self).__init__()
    self.network = network
    network.set_train()
    self.trainable_params = network.trainable_params()

    # Partition by name, preserving the original parameter order in each group.
    wide_group = []
    deep_group = []
    for param in self.trainable_params:
        target = wide_group if 'wide' in param.name else deep_group
        target.append(param)
    self.weights_w = ParameterTuple(wide_group)
    self.weights_d = ParameterTuple(deep_group)

    self.optimizer_w = FTRL(learning_rate=1e-2, params=self.weights_w,
                            l1=1e-8, l2=1e-8, initial_accum=1.0)
    self.optimizer_d = Adam(self.weights_d, learning_rate=3.5e-4, eps=1e-8,
                            loss_scale=sens)
    self.hyper_map = C.HyperMap()
    self.grad_w = C.GradOperation(get_by_list=True, sens_param=True)
    self.grad_d = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.loss_net_w = IthOutputCell(network, output_index=0)
    self.loss_net_d = IthOutputCell(network, output_index=1)

    # One reducer per optimizer, only for data- or hybrid-parallel runs.
    self.grad_reducer_w = None
    self.grad_reducer_d = None
    self.reducer_flag = _get_parallel_mode() in (ParallelMode.DATA_PARALLEL,
                                                 ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer_w = DistributedGradReducer(self.optimizer_w.parameters, grads_mean, device_count)
        self.grad_reducer_d = DistributedGradReducer(self.optimizer_d.parameters, grads_mean, device_count)
def __init__(self, loss_netD, loss_netG, optimizerD, optimizerG, sens=1, auto_prefix=True):
    """Wrap two loss networks and their optimizers into one training-step cell.

    Args:
        loss_netD: First loss network; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        loss_netG: Second loss network; prepared the same way.
        optimizerD: Optimizer whose ``parameters`` become ``self.weights_D``.
        optimizerG: Optimizer whose ``parameters`` become ``self.weights_G``.
        sens: Stored unchanged as ``self.sens``.
        auto_prefix: Forwarded to the base cell constructor.
    """
    super(TrainOneStepCell, self).__init__(auto_prefix=auto_prefix)
    # loss network
    self.loss_netD = loss_netD
    loss_netD.set_grad()
    loss_netD.add_flags(defer_inline=True)

    self.loss_netG = loss_netG
    loss_netG.set_grad()
    loss_netG.add_flags(defer_inline=True)

    self.weights_G = optimizerG.parameters
    self.optimizerG = optimizerG
    self.weights_D = optimizerD.parameters
    self.optimizerD = optimizerD

    self.grad = ops.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    # Parallel processing: identity reducers unless data- or hybrid-parallel.
    self.grad_reducer_G = F.identity
    self.grad_reducer_D = F.identity
    self.parallel_mode = _get_parallel_mode()
    self.reducer_flag = self.parallel_mode in (ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL)
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer_G = DistributedGradReducer(self.weights_G, grads_mean, device_count)
        self.grad_reducer_D = DistributedGradReducer(self.weights_D, grads_mean, device_count)
def __init__(self, network, optimizer, norm_bound=1.0, sens=1.0, micro_batches=None,
             noise_mech=None, clip_mech=None):
    """Set up a training-step cell with differential-privacy members.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it.
        optimizer: Optimizer whose ``parameters`` become ``self.weights``.
        norm_bound: Stored as ``self._norm_bound``.
        sens: Stored unchanged as ``self.sens``.
        micro_batches: Required; used as the output count of ``P.Split`` and
            stored both raw and as a float32 tensor.
        noise_mech: Stored as ``self._noise_mech``. If it has a non-``None``
            ``_decay_policy``, a ``_MechanismsParamsUpdater`` is created.
        clip_mech: Stored as ``self._clip_mech``.

    Raises:
        ValueError: If ``micro_batches`` is ``None``.
    """
    super(_TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    network.set_grad()
    network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens

    # Reducer exists only when running data- or hybrid-parallel.
    self.reducer_flag = _get_parallel_mode() in (ParallelMode.DATA_PARALLEL,
                                                 ParallelMode.HYBRID_PARALLEL)
    self.grad_reducer = None
    if self.reducer_flag:
        grads_mean = _get_gradients_mean()
        device_count = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, grads_mean, device_count)

    # dp params
    if micro_batches is None:
        msg = 'micro_batches must give in differential privacy, but got value: {}'.format(micro_batches)
        LOGGER.error(TAG, msg)
        raise ValueError(msg)
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._tuple_add = _TupleAdd()
    self._add = P.Add()
    self._norm = nn.Norm()
    self._hyper_map = C.HyperMap()
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()
    self._micro_float = Tensor(micro_batches, mstype.float32)

    # The updater is only built for a noise mechanism with a decay policy.
    self._noise_mech_param_updater = None
    has_decay_policy = self._noise_mech is not None and self._noise_mech._decay_policy is not None
    if has_decay_policy:
        mech = self._noise_mech
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=mech._decay_policy,
            decay_rate=mech._noise_decay_rate,
            cur_noise_multiplier=mech._noise_multiplier,
            init_noise_multiplier=mech._initial_noise_multiplier)
def __init__(self, network, optimizer, scale_update_cell=None, micro_batches=None, norm_bound=1.0,
             noise_mech=None, clip_mech=None):
    """Set up a loss-scaled training-step cell with differential-privacy members.

    Args:
        network: Network to train; ``set_grad()`` and
            ``add_flags(defer_inline=True)`` are called on it, and its
            ``trainable_params()`` become ``self.weights``.
        optimizer: Stored as ``self.optimizer``; its ``parameters`` feed the
            distributed gradient reducer when one is created.
        scale_update_cell: Stored as ``self.loss_scaling_manager``. When truthy,
            its ``get_loss_scale()`` value initialises the ``loss_scale``
            parameter; otherwise ``self.loss_scale`` stays ``None``.
        micro_batches: Required despite the ``None`` default; used as the
            output count of ``P.Split`` and stored both raw and as a float32
            tensor.
        norm_bound: Stored as ``self._norm_bound``.
        noise_mech: Stored as ``self._noise_mech``. If it has a non-``None``
            ``_decay_policy``, a ``_MechanismsParamsUpdater`` is created.
        clip_mech: Stored as ``self._clip_mech``.

    Raises:
        ValueError: If ``micro_batches`` is ``None``.
    """
    super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()

    # The float-status operators differ between the GPU target and others.
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()

    # Identity reducer unless running data- or hybrid-parallel.
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [ParallelMode.DATA_PARALLEL,
                                               ParallelMode.HYBRID_PARALLEL]
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE

    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
    self.add_flags(has_effect=True)

    # dp params
    # Reject the None default explicitly, so the caller gets a clear message
    # instead of a failure from deep inside P.Split / Tensor construction.
    if micro_batches is None:
        raise ValueError(
            'micro_batches must give in differential privacy, but got value: {}'.format(micro_batches))
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._add = P.Add()
    self._norm = nn.Norm()
    self._tuple_add = _TupleAdd()
    self._hyper_map = C.HyperMap()
    self._micro_float = Tensor(micro_batches, mstype.float32)
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()

    # The updater is only built for a noise mechanism with a decay policy.
    self._noise_mech_param_updater = None
    if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=self._noise_mech._decay_policy,
            decay_rate=self._noise_mech._noise_decay_rate,
            cur_noise_multiplier=self._noise_mech._noise_multiplier,
            init_noise_multiplier=self._noise_mech._initial_noise_multiplier)