def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
             loss_scale=1.0, decay_filter=lambda x: x.name not in []):
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
    self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
    self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
    self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.weight_idx = []
    for i in range(len(self.params)):
        if "conv" in self.params[i].name or "end_point" in self.params[i].name:
            self.weight_idx.append(i)
    self.weight_idx.append(len(self.params))
    self.feature_map = [1.0 / 12544,
                        1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                        1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                        1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                        1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                        1.0 / 49, 1.0 / 49,
                        1.0]
    mean = _get_gradients_mean()
    degree = _get_device_num()
    parameter_length = len(self.feature_map)
    self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
    self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
    self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
    self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.matrix_max_inv = ()
    for i in range(54):
        self.matrix_max_inv = self.matrix_max_inv + (
            Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0, loss_scale=1.0,
             num_hidden_layers=24, batch_size=12, damping=0.03,
             decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.num_hidden_layers = num_hidden_layers
    self.sqrt = P.Sqrt()
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.one = Tensor(1, mstype.int32)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
def get_weight_bias(self):
    stdv = 1 / math.sqrt(self.hidden_s)
    gate_size = 4 * self.hidden_s
    w_list_value = []
    b_list_value = []
    for i in range(self.num_layers):
        b0 = np.zeros(gate_size, dtype=np.float16)
        w_shape = self.input_s if i == 0 else (self.num_directions * self.hidden_s)
        w_np = np.random.uniform(-stdv, stdv, (w_shape + self.hidden_s, gate_size)).astype(np.float16)
        w_list_value.append(Parameter(initializer(Tensor(w_np), [w_shape + self.hidden_s, gate_size]),
                                      name="weight_fw" + str(i)))
        if self.has_bias:
            b_np = np.random.uniform(-stdv, stdv, gate_size).astype(np.float16)
            b_list_value.append(Parameter(initializer(Tensor(b_np), [gate_size]), name="bias_fw" + str(i)))
        else:
            b_list_value.append(Parameter(initializer(Tensor(b0), [gate_size]), name="bias_fw" + str(i)))
        if self.bidirectional:
            w_bw_np = np.random.uniform(-stdv, stdv, (w_shape + self.hidden_s, gate_size)).astype(np.float16)
            w_list_value.append(Parameter(initializer(Tensor(w_bw_np), [w_shape + self.hidden_s, gate_size]),
                                          name="weight_bw" + str(i)))
            b_bw_np = np.random.uniform(-stdv, stdv, (4 * self.hidden_s)).astype(np.float16) if self.has_bias else b0
            b_list_value.append(Parameter(initializer(Tensor(b_bw_np), [gate_size]), name="bias_bw" + str(i)))
    w_list_value = ParameterTuple(w_list_value)
    b_list_value = ParameterTuple(b_list_value)
    return w_list_value, b_list_value
def __init__(self, seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
    super(Net, self).__init__()
    num_directions = 1
    if bidirectional:
        num_directions = 2

    input_np = np.array([[[0.6755, -1.6607, 0.1367], [0.4276, -0.7850, -0.3758]],
                         [[-0.6424, -0.6095, 0.6639], [0.7918, 0.4147, -0.5089]],
                         [[-1.5612, 0.0120, -0.7289], [-0.6656, -0.6626, -0.5883]],
                         [[-0.9667, -0.6296, -0.7310], [0.1026, -0.6821, -0.4387]],
                         [[-0.4710, 0.6558, -0.3144], [-0.8449, -0.2184, -0.1806]]]).astype(np.float32)
    self.x = Parameter(initializer(Tensor(input_np), [seq_len, batch_size, input_size]), name='x')

    self.hlist = []
    self.clist = []
    self.hlist.append(Parameter(initializer(
        Tensor(np.array([0.1, 0.1, 0.1, 0.1]).reshape((num_directions, batch_size, hidden_size)).astype(np.float32)),
        [num_directions, batch_size, hidden_size]), name='h'))
    self.clist.append(Parameter(initializer(
        Tensor(np.array([0.2, 0.2, 0.2, 0.2]).reshape((num_directions, batch_size, hidden_size)).astype(np.float32)),
        [num_directions, batch_size, hidden_size]), name='c'))
    self.h = ParameterTuple(tuple(self.hlist))
    self.c = ParameterTuple(tuple(self.clist))

    wih = np.array([[3.4021e-01, -4.6622e-01, 4.5117e-01],
                    [-6.4257e-02, -2.4807e-01, 1.3550e-02],  # i
                    [-3.2140e-01, 5.5578e-01, 6.3589e-01],
                    [1.6547e-01, -7.9030e-02, -2.0045e-01],
                    [-6.9863e-01, 5.9773e-01, -3.9062e-01],
                    [-3.0253e-01, -1.9464e-01, 7.0591e-01],
                    [-4.0835e-01, 3.6751e-01, 4.7989e-01],
                    [-5.6894e-01, -5.0359e-01, 4.7491e-01]]).astype(np.float32).reshape([1, -1])
    whh = np.array([[-0.4820, -0.2350],
                    [-0.1195, 0.0519],
                    [0.2162, -0.1178],
                    [0.6237, 0.0711],
                    [0.4511, -0.3961],
                    [-0.5962, 0.0906],
                    [0.1867, -0.1225],
                    [0.1831, 0.0850]]).astype(np.float32).reshape([1, -1])
    bih = np.zeros((1, 8)).astype(np.float32)
    w_np = np.concatenate((wih, whh, bih), axis=1).reshape([-1, 1, 1])
    self.w = Parameter(initializer(Tensor(w_np), w_np.shape), name='weight0')

    self.lstm = StackLSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
                          has_bias=has_bias, bidirectional=bidirectional, dropout=dropout)
    self.lstm.weight = ParameterTuple(tuple([self.w]))
def __init__(self, network, optimizer, scale_update_cell=None):
    super(TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    self.alloc_status = NPUAllocFloatStatus()
    self.get_status = NPUGetFloatStatus()
    self.clear_status = NPUClearFloatStatus()
    self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.reducer_flag = False
    self.less_equal = LessEqual()
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
def __init__(self, network, optimizer, scale_update_cell=None):
    super(BertSquadCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("mirror_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.cast = P.Cast()
    self.alloc_status = P.NPUAllocFloatStatus()
    self.get_status = P.NPUGetFloatStatus()
    self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
def test_grad_fv_and_insert_gradient_of():
    class FvAndInsertGradientNet(nn.Cell):
        def __init__(self):
            super(FvAndInsertGradientNet, self).__init__()
            self.gather = P.GatherV2()
            self.damping = Tensor(np.array([0.03, 0.03], np.float32))
            self.cov_step = Parameter(0, name="cov_step", requires_grad=False)
            self.freq = Tensor(278, ms.int32)
            self.getG = P.InsertGradientOf(self.save_gradient)
            self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

        def save_gradient(self, dout):
            self.cov_step = self.cov_step + self.freq
            return dout

        def construct(self, *inputs):
            # fv self.z from construct_wrapper
            x, = inputs
            self.z = x
            # insert_gradient_of
            self.gather(self.damping, self.cov_step, 0)
            out = self.getG(x)
            return out

    net = FvAndInsertGradientNet()
    input_data = Tensor(np.array([1.0], np.float32))
    # If grad_all_list is used, the generated graph will contain env_setitem;
    # since the gradient for the inputs is constant zero, it will depend on the result of grad.
    grad_net = grad_by_list(net, ParameterTuple(net.trainable_params()))
    print(grad_net(input_data))
def __init__(self, net):
    super(GradNet, self).__init__()
    self.weights = ParameterTuple(net.trainable_params())
    self.net = net
    grad_op = C.GradOperation(get_all=False, get_by_list=True, sens_param=True)
    sens = Tensor(np.ones([3, 4, 5]), dtype=mstype.float32)
    self.grad = Bprop(self.net, True, self.weights, grad_op, sens)
def test_insert_gradient_of():
    class InsertGradientNet(nn.Cell):
        def __init__(self):
            super(InsertGradientNet, self).__init__()
            self.gather = P.GatherV2()
            self.damping = Tensor(np.array([0.03, 0.03], np.float32))
            self.cov_step = Parameter(0, name="cov_step", requires_grad=False)
            self.freq = Tensor(278, ms.int32)
            self.getG = P.InsertGradientOf(self.save_gradient)

        def save_gradient(self, dout):
            self.cov_step = self.cov_step + self.freq
            return dout

        def construct(self, x):
            self.gather(self.damping, self.cov_step, 0)
            out = P.ReLU()(x)
            out = self.getG(out)
            out = self.getG(out)
            return out

    net = InsertGradientNet()
    input_data = np.array([[1.2, 2.1], [2.2, 3.2]]).astype(np.float32)
    grad_net = grad_all_list(net, ParameterTuple(net.trainable_params()))
    print(grad_net(Tensor(input_data)))
def __init__(self, network):
    super(GradByListNet, self).__init__()
    self.grad = C.GradOperation(get_all=True, sens_param=True, get_by_list=True)
    self.network = network
    self.params = ParameterTuple(network.trainable_params())
def __init__(self, network, optimizer):
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
def __init__(self, net):
    super(GradNet, self).__init__()
    self.weights = ParameterTuple(net.trainable_params())
    self.net = net
    grad_op = C.GradOperation(name='grad', get_all=True, get_by_list=False, sens_param=True)
    self.grad = Bprop(self.net, False, self.weights, grad_op)
def __init__(self, network, optimizer, sens=1.0):
    super(TransformerTrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode is not supported: {}".format(self.parallel_mode))
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
def __init__(self, network):
    super(Grad, self).__init__()
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
def __init__(self, net):
    super(NetGrad, self).__init__()
    self.grad_op = C.GradOperation('grad', get_by_list=True, sens_param=False)
    self.net = net
    self.weights = ParameterTuple(self.net.trainable_params())
def __init__(self, network):
    super(TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_train()
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = nn.Momentum(self.weights, 0.1, 0.9)
    self.hyper_map = C.HyperMap()
    self.grad = C.GradOperation(get_by_list=True)
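The snippet above only shows the constructor. As a hedged sketch (an assumption, not part of the original), a matching construct for this kind of one-step training cell typically wires the stored pieces together as follows; `data` and `label` are hypothetical input names.

# Hedged sketch (assumption): a typical construct body for the TrainOneStepCell-style
# cell initialized above; get_by_list=True without sens_param means the grad call
# takes only the network inputs.
def construct(self, data, label):
    weights = self.weights
    loss = self.network(data, label)                        # forward pass through the wrapped network
    grads = self.grad(self.network, weights)(data, label)   # gradients w.r.t. the ParameterTuple
    self.optimizer(grads)                                   # apply the Momentum update
    return loss

In graph mode, production cells usually tie the optimizer call to the returned loss (for example via a depend) so the parameter update cannot be pruned from the graph.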
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
             loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
    super(THOR_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
    Validator.check_value_type("momentum", momentum, [float], self.cls_name)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.use_nesterov = Validator.check_bool(use_nesterov)
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
    self.feature_map = [1.0 / 12544,
                        1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                        1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                        1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                        1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                        1.0 / 49, 1.0 / 49,
                        1.0]
    self.feature_map_new = [x ** 0.5 for x in self.feature_map]
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.matmul = P.MatMul()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.assign = P.Assign()
    self.mul = P.Mul()
    mean = _get_gradients_mean()
    degree = _get_device_num()
    parameter_length = len(self.feature_map)
    self.grad_reducer_thorA = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
    self.grad_reducer_thorG = DistributedGradReducerThor(parameter_length, ((parameter_length,), 0), mean, degree)
    self.weight_decay = weight_decay
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.update_gradient = P.UpdateThorGradient(split_dim=128)
def __init__(self, input_size, hidden_size, num_layers=1, has_bias=True, batch_first=False, dropout=0.0,
             bidirectional=False):
    super(StackLSTM, self).__init__()
    self.num_layers = num_layers
    self.batch_first = batch_first
    self.transpose = P.Transpose()

    # direction number
    num_directions = 2 if bidirectional else 1

    # input_size list
    input_size_list = [input_size]
    for i in range(num_layers - 1):
        input_size_list.append(hidden_size * num_directions)

    # layers
    layers = []
    for i in range(num_layers):
        layers.append(nn.LSTMCell(input_size=input_size_list[i],
                                  hidden_size=hidden_size,
                                  has_bias=has_bias,
                                  batch_first=batch_first,
                                  bidirectional=bidirectional,
                                  dropout=dropout))

    # weights
    weights = []
    for i in range(num_layers):
        # weight size
        weight_size = (input_size_list[i] + hidden_size) * num_directions * hidden_size * 4
        if has_bias:
            bias_size = num_directions * hidden_size * 4
            weight_size = weight_size + bias_size
        # numpy weight
        stdv = 1 / math.sqrt(hidden_size)
        w_np = np.random.uniform(-stdv, stdv, (weight_size, 1, 1)).astype(np.float32)
        # lstm weight
        weights.append(Parameter(initializer(Tensor(w_np), w_np.shape), name="weight" + str(i)))

    self.lstms = layers
    self.weight = ParameterTuple(tuple(weights))
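A small usage sketch (hypothetical sizes, not part of the original) that only exercises the constructor above and checks the flattened weight shapes it creates.

# Hedged usage sketch with hypothetical sizes; only the constructor shown above is used.
stack_lstm = StackLSTM(input_size=10, hidden_size=16, num_layers=2, has_bias=True,
                       batch_first=False, bidirectional=False)
# One flattened (weight_size, 1, 1) Parameter per layer:
#   layer 0: (10 + 16) * 1 * 16 * 4 + 1 * 16 * 4 = 1728 elements
#   layer 1: (16 + 16) * 1 * 16 * 4 + 1 * 16 * 4 = 2112 elements
print(len(stack_lstm.weight))      # 2
print(stack_lstm.weight[0].shape)  # (1728, 1, 1)
print(stack_lstm.weight[1].shape)  # (2112, 1, 1)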
def __init__(self, learning_rate, parameters, weight_decay=0.0, loss_scale=1.0,
             decay_filter=lambda x: 'beta' not in x.name and 'gamma' not in x.name):
    super(Optimizer, self).__init__()
    if isinstance(learning_rate, float):
        self.dynamic_lr = False
        self.gather = None
        self.assignadd = None
        self.global_step = None
        validator.check_number_range("learning rate", learning_rate, 0.0, float("inf"), Rel.INC_LEFT)
    else:
        self.dynamic_lr = True
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.global_step = Parameter(initializer(0, [1], mindspore.int32), name='global_step')
        if isinstance(learning_rate, Iterable):
            learning_rate = Tensor(np.array(list(learning_rate)).astype(np.float32))
        elif isinstance(learning_rate, Tensor):
            if learning_rate.dim() > 1:
                raise ValueError("Learning rate should be a 0 or 1 dim `Tensor`, "
                                 f"but got {learning_rate.dim()}.")
            if learning_rate.dim() == 1 and learning_rate.size() < 2:
                logger.warning("If you want to use a dynamic learning rate, please make sure that the number "
                               "of elements in the list, tuple or tensor passed is greater than 1.")
        else:
            raise TypeError("Learning rate should be float, Tensor or Iterable.")

    if loss_scale <= 0.0:
        raise ValueError("Loss scale should be greater than 0, but got {}".format(loss_scale))
    if weight_decay < 0.0:
        raise ValueError("Weight decay should be equal to or greater than 0, but got {}".format(weight_decay))

    self.learning_rate = Parameter(learning_rate, name="learning_rate")
    self.parameters = ParameterTuple(parameters)
    self.reciprocal_scale = 1.0 / loss_scale
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)

    if not self.parameters:
        raise ValueError("optimizer got an empty parameter list.")
def test_grad_refactor_13():
    class Net(nn.Cell):
        """ Net definition """

        def __init__(self):
            super(Net, self).__init__()
            self.z = Parameter(Tensor(np.ones([2]).astype(np.float32)), name='z')

        def construct(self, x, y):
            return x * self.z * y

    net = Net()
    weights = ParameterTuple(net.trainable_params())
    C.grad_by_list(net, weights)(Tensor(np.ones([2]).astype(np.float32)),
                                 Tensor(np.zeros([2]).astype(np.float32)))
def __init__(self, net):
    super(GradNet, self).__init__()
    self.weights = ParameterTuple(net.trainable_params())
    self.net = net
    self.sens = Parameter(Tensor(np.ones([3, 4, 5]), dtype=mstype.float32), name='sens', requires_grad=False)
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
def __init__(self, num_class, label, mask, l2_coeff, params):
    super(MaskedSoftMaxLoss, self).__init__()
    self.num_class = num_class
    self.label = label
    self.mask = mask
    self.softmax = P.SoftmaxCrossEntropyWithLogits()
    self.reduce_mean = P.ReduceMean()
    self.cast = P.Cast()
    self.l2_coeff = l2_coeff
    self.params = ParameterTuple(list(param for param in params if param.name[-4:] != 'bias'))
    self.reduce_sum = P.ReduceSum()
    self.num_params = len(self.params)
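A hedged sketch of how the non-bias ParameterTuple above is commonly consumed (an assumption, not shown in the original): an L2 penalty accumulated with the ReduceSum operator created in the constructor; the method name `l2_penalty` is hypothetical.

# Hedged sketch (assumption): L2 penalty over the non-bias parameters in self.params,
# scaled by self.l2_coeff; uses only attributes defined in the constructor above.
def l2_penalty(self):
    l2 = 0.0
    for param in self.params:
        l2 = l2 + self.reduce_sum(param * param)  # sum of squares for each parameter tensor
    return self.l2_coeff * l2 / 2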
def __init__(self, network, optimizer, sens=1.0):
    super(TrainOneStepWithLarsCell, self).__init__(auto_prefix=False)
    self.network = network
    self.slice_index, self.params_len, weights = get_net_trainable_reordered_params(self.network)
    self.weights = ParameterTuple(weights)
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.sens = Parameter(Tensor([sens], mstype.float32), name='sens', requires_grad=False)
    self.weight_decay = 1.0
    self.lars = P.Lars(epsilon=1.0, hyperpara=1.0)
def __init__(self, func, wrt_params, params, grad_op, sens=None):
    super(Bprop, self).__init__(auto_prefix=False)
    self.func = func
    self.wrt_params = wrt_params
    self.params = None
    if self.wrt_params and params:
        self.params = ParameterTuple(params)
    self.grad = grad_op
    self.with_sens = False
    self.sens = sens
    if sens:
        self.sens = Tensor(sens, dtype=mstype.float32)
        self.with_sens = True
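The Bprop helper above only stores its configuration. A hedged sketch of a matching construct (an assumption, not from the original) would dispatch the stored GradOperation according to the wrt_params and with_sens flags:

# Hedged sketch (assumption): call the stored grad_op with or without the parameter
# list and the sens value, based on the flags set in __init__ above.
def construct(self, *inputs):
    if self.wrt_params:
        if self.with_sens:
            return self.grad(self.func, self.params)(*inputs, self.sens)
        return self.grad(self.func, self.params)(*inputs)
    if self.with_sens:
        return self.grad(self.func)(*inputs, self.sens)
    return self.grad(self.func)(*inputs)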
def __init__(self, network, optimizer, scale_update_cell=None):
    super(TransformerTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.reducer_flag = False
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    if self.parallel_mode not in ParallelMode.MODE_LIST:
        raise ValueError("Parallel mode is not supported: {}".format(self.parallel_mode))
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = _get_gradients_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
    self.clip_gradients = ClipGradients()
    self.cast = P.Cast()
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.depend_parameter_use = P.ControlDepend(depend_mode=1)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = P.LessEqual()
    self.hyper_map = C.HyperMap()
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
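For context, a hedged sketch (not the original construct) of how the float-status operators created above are usually chained on the non-GPU path to detect overflow around the backward pass; `inputs` and `scaling_sens` are hypothetical names.

# Hedged sketch (assumption): overflow detection around the backward pass, using the
# operators initialized in the constructor above.
init = self.alloc_status()
self.clear_before_grad(init)                                  # reset the float status register
grads = self.grad(self.network, self.weights)(*inputs, self.cast(scaling_sens, mstype.float32))
self.get_status(init)                                         # read the status after backprop
flag_sum = self.reduce_sum(init, (0,))
if self.is_distributed:
    flag_sum = self.allreduce(flag_sum)                       # aggregate overflow flags across devices
cond = self.less_equal(self.base, flag_sum)                   # True means an overflow occurred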
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max, weight_decay=0.0,
             loss_scale=1.0, use_nesterov=False, decay_filter=lambda x: x.name not in []):
    super(SKFAC_GPU, self).__init__(learning_rate, params, weight_decay, loss_scale)
    Validator.check_value_type("momentum", momentum, [float], self.cls_name)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.use_nesterov = Validator.check_bool(use_nesterov)
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.matmul = P.MatMul()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.assign = P.Assign()
    self.mul = P.Mul()
    self.weight_decay = weight_decay
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
def TrainWrap(net, loss_fn=None, optimizer=None, weights=None):
    """ TrainWrap """
    if loss_fn is None:
        loss_fn = nn.SoftmaxCrossEntropyWithLogits(reduction='mean', sparse=True)
    loss_net = nn.WithLossCell(net, loss_fn)
    loss_net.set_train()
    if weights is None:
        weights = ParameterTuple(net.trainable_params())
    if optimizer is None:
        optimizer = nn.Adam(weights, learning_rate=0.003, beta1=0.9, beta2=0.999, eps=1e-5, use_locking=False,
                            use_nesterov=False, weight_decay=4e-5, loss_scale=1.0)
    train_net = nn.TrainOneStepCell(loss_net, optimizer)
    return train_net
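A minimal usage sketch (hypothetical network and shapes, not from the original): wrapping a small classifier with TrainWrap and running one training step.

# Hedged usage sketch with a hypothetical single-layer classifier.
net = nn.Dense(32, 10)
train_net = TrainWrap(net)  # default sparse softmax cross-entropy loss and Adam optimizer

data = Tensor(np.random.randn(16, 32).astype(np.float32))
label = Tensor(np.random.randint(0, 10, (16,)).astype(np.int32))
loss = train_net(data, label)  # one forward/backward/update step; returns the loss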
def test_load_grad():
    class LoadNet(nn.Cell):
        def __init__(self):
            super().__init__()
            self.z = Parameter(Tensor(np.array([1.0], np.float32)), name='z')

        def construct(self, x, y):
            x = x * y * self.z
            return x

    x = Tensor(np.array([2.0], np.float32))
    y = Tensor(np.array([3.0], np.float32))
    load_net = LoadNet()
    grad_net = grad_all_list(load_net, ParameterTuple(load_net.trainable_params()))
    print(grad_net(x, y))
def test_switch_layer_with_single_prim():
    class SwitchLayerCell(nn.Cell):
        def __init__(self):
            super(SwitchLayerCell, self).__init__()
            self.layers = (nn.ReLU(), nn.ReLU())
            self.z3 = Parameter(Tensor(np.full([128, 96], 0.6, dtype=np.float32)), name='z3')

        def construct(self, index, x):
            ret = self.layers[index](x) * self.z3
            return ret

    index = Tensor(0, dtype=mstype.int32)
    net = SwitchLayerCell()
    net(index, Tensor(np.full([128, 96], 0.6, dtype=np.float32)))
    C.grad_by_list(net, ParameterTuple(net.trainable_params()))(index,
                                                                Tensor(np.full([128, 96], 0.6, dtype=np.float32)))
    C.grad_all(net)(index, Tensor(np.full([128, 96], 0.6, dtype=np.float32)))
def __init__(self, network, optimizer, sens=1.0):
    super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation(get_by_list=True, sens_param=True)
    self.sens = sens
    self.reducer_flag = False
    self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
    if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
        self.reducer_flag = True
    self.grad_reducer = None
    if self.reducer_flag:
        mean = context.get_auto_parallel_context("gradients_mean")
        degree = get_group_size()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
    self.cast = P.Cast()
    self.hyper_map = C.HyperMap()