def __init__(self):
    """init function"""
    super(Rerank_Downstream, self).__init__()
    # Up-projection from the 4096-wide encoder output to 8192 features.
    self.dense_0 = nn.Dense(in_channels=4096, out_channels=8192, has_bias=True)
    self.relu_1 = nn.ReLU()
    # Layer-norm-style pipeline: mean, centring, squared deviation,
    # variance, epsilon, sqrt, normalise.
    self.reducemean_2 = P.ReduceMean(keep_dims=True)
    self.sub_3 = P.Sub()
    self.sub_4 = P.Sub()
    self.pow_5 = P.Pow()
    self.pow_5_input_weight = 2.0
    self.reducemean_6 = P.ReduceMean(keep_dims=True)
    self.add_7 = P.Add()
    self.add_7_bias = 9.999999960041972e-13
    self.sqrt_8 = P.Sqrt()
    self.div_9 = P.Div()
    # Learned per-feature scale and shift over the 8192-wide activations.
    hidden_shape = (8192,)
    self.mul_10 = P.Mul()
    self.mul_10_w = Parameter(
        Tensor(np.random.uniform(0, 1, hidden_shape).astype(np.float32)),
        name=None)
    self.add_11 = P.Add()
    self.add_11_bias = Parameter(
        Tensor(np.random.uniform(0, 1, hidden_shape).astype(np.float32)),
        name=None)
    # Two-way classification head.
    self.dense_12 = nn.Dense(in_channels=8192, out_channels=2, has_bias=True)
def __init__(self, batch_size, query_linear_bias, key_linear_bias,
             value_linear_bias):
    """init function"""
    super(MultiHeadAttn, self).__init__()
    self.batch_size = batch_size

    def _weight(shape):
        # Uniform[0, 1) initialised float32 weight parameter.
        return Parameter(
            Tensor(np.random.uniform(0, 1, shape).astype(np.float32)),
            name=None)

    # Shared primitive ops used by construct.
    self.matmul = nn.MatMul()
    self.add = P.Add()
    self.reshape = P.Reshape()
    self.transpose = P.Transpose()
    self.div = P.Div()
    self.softmax = nn.Softmax(axis=3)
    # Q/K/V projections: weights are created here, biases are shared
    # parameters passed in by the caller.
    self.query_linear_weight = _weight((4096, 4096))
    self.query_linear_bias = query_linear_bias
    self.key_linear_weight = _weight((4096, 4096))
    self.key_linear_bias = key_linear_bias
    self.value_linear_weight = _weight((4096, 4096))
    self.value_linear_bias = value_linear_bias
    # Split of the 4096-wide hidden into heads: (batch, 512, 64, 64).
    self.reshape_shape = (batch_size, 512, 64, 64)
    # Output projection weight and bias merging the heads back to 4096.
    self.w = _weight((64, 64, 4096))
    self.b = _weight((4096,))
def construct(self, data, label):
    """
    construct a compute flow.

    One differentially-private training step: split the batch into
    micro-batches, compute and clip each micro-batch gradient, average
    the loss, optionally add mechanism noise, scale/reduce the gradients
    and apply the optimizer.
    """
    weights = self.weights
    # Split inputs into micro-batches along axis 0.
    record_datas = self._split(data)
    record_labels = self._split(label)
    # First micro-batch: forward pass, sensitivity tensor, backward pass.
    loss = self.network(record_datas[0], record_labels[0])
    sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
    record_grad = self.grad(self.network, weights)(record_datas[0],
                                                   record_labels[0], sens)
    # Clip this micro-batch's gradients by global norm.
    record_grad = self._clip_by_global_norm(record_grad, GRADIENT_CLIP_TYPE,
                                            self._l2_norm)
    grads = record_grad
    total_loss = loss
    # Remaining micro-batches: accumulate clipped gradients and loss.
    for i in range(1, self._micro_batches):
        loss = self.network(record_datas[i], record_labels[i])
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        record_grad = self.grad(self.network, weights)(record_datas[i],
                                                       record_labels[i],
                                                       sens)
        record_grad = self._clip_by_global_norm(record_grad,
                                                GRADIENT_CLIP_TYPE,
                                                self._l2_norm)
        grads = self._tuple_add(grads, record_grad)
        total_loss = P.TensorAdd()(total_loss, loss)
    # Mean loss over the micro-batches.
    loss = P.Div()(total_loss, self._micro_float)
    if self._mech is not None:
        # Add DP mechanism noise to the accumulated gradients.
        grad_noise = self._hyper_map(self._mech, grads)
        grads = self._tuple_add(grads, grad_noise)
    # Scale gradients by the micro-batch count via _grad_scale
    # (presumably averaging — confirm against _grad_scale's definition).
    grads = self._hyper_map(F.partial(_grad_scale, self._micro_float), grads)
    if self.reducer_flag:
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
    return F.depend(loss, self.optimizer(grads))
def __init__(self, norm_bound=1.5, initial_noise_multiplier=5.0, alpha=6e-4,
             decay_policy='Step'):
    """
    Adaptive Gaussian noise mechanism state.

    Args:
        norm_bound (float): positive clipping bound, validated below.
        initial_noise_multiplier (float): positive starting multiplier.
        alpha (float): decay coefficient for the noise multiplier.
        decay_policy (str): decay schedule name (type-checked only here).
    """
    super(AdaGaussianRandom, self).__init__()
    # Validate, then keep the multiplier both as the frozen initial value
    # and as the mutable current value.
    initial_noise_multiplier = check_value_positive(
        'initial_noise_multiplier', initial_noise_multiplier)
    initial_noise_multiplier = Tensor(
        np.array(initial_noise_multiplier, np.float32))
    self._initial_noise_multiplier = Parameter(
        initial_noise_multiplier, name='initial_noise_multiplier')
    self._noise_multiplier = Parameter(initial_noise_multiplier,
                                       name='noise_multiplier')
    norm_bound = check_value_positive('norm_bound', norm_bound)
    self._norm_bound = Tensor(np.array(norm_bound, np.float32))
    alpha = check_param_type('alpha', alpha, float)
    self._alpha = Tensor(np.array(alpha, np.float32))
    self._decay_policy = check_param_type('decay_policy', decay_policy, str)
    self._mean = 0.0
    self._sub = P.Sub()
    self._mul = P.Mul()
    self._add = P.TensorAdd()
    self._div = P.Div()
    # NOTE: _update_stddev() reads attributes assigned above, so this call
    # must stay after them.
    self._stddev = self._update_stddev()
    self._dtype = mstype.float32
def __init__(self, dim, n_heads):
    super().__init__()
    # Number of heads (h) and width of each head (v = dim / h).
    self.n_heads = n_heads
    self.size_per_head = dim // n_heads
    # 1/sqrt(v) scaling applied to the attention scores.
    self.scores_mul = ms.Tensor(1.0 / np.sqrt(float(self.size_per_head)),
                                ms.float32)
    self.exones = P.Ones()((1, 1, n_heads, 1, 1), ms.int32)
    # Trailing shape (h, v) used to split the last axis into heads.
    self.reshape_tail = (self.n_heads, self.size_per_head)
    # Output projection without bias.
    self.output = Dense(dim, dim, has_bias=False)
    # Primitive ops used by construct.
    self.mul = P.Mul()
    self.div = P.Div()
    self.softmax = P.Softmax()
    self.bmm = P.BatchMatMul()
    self.bmmt = P.BatchMatMul(transpose_b=True)
    self.squeeze = P.Squeeze(-2)
    self.reducesum = P.ReduceSum(keep_dims=True)
    self.transpose = P.Transpose()
    self.trans_shape = (0, 1, 3, 2, 4)
def __init__(self, sparse=False, stra_list=None):
    super(SoftmaxCrossEntropyExpand, self).__init__()
    # Fall back to "no strategy" for every op unless a full 11-entry
    # parallel-strategy list is supplied (None and short lists are both
    # replaced by [None] * 11, matching the original two-step fallback).
    if stra_list is None or len(stra_list) < 11:
        stra_list = [None] * 11
    self.sparse = sparse
    # Softmax pieces with per-op parallel strategies.
    self.exp = P.Exp()
    self.reduce_sum = P.ReduceSum(keep_dims=True).set_strategy(
        strategy=stra_list[1])
    self.div = P.Div().set_strategy(strategy=stra_list[3])
    # One-hot encoding of sparse labels.
    self.onehot = P.OneHot().set_strategy(strategy=stra_list[2])
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)
    # Cross-entropy computation and reductions.
    self.log = P.Log().set_strategy(strategy=stra_list[4])
    self.sum_cross_entropy = P.ReduceSum(keep_dims=False).set_strategy(
        strategy=stra_list[5])
    self.mul = P.Mul().set_strategy(strategy=stra_list[6])
    self.mul2 = P.Mul().set_strategy(strategy=stra_list[7])
    self.cast = P.Cast()
    self.reduce_mean = P.ReduceMean(keep_dims=False).set_strategy(
        strategy=stra_list[8])
    self.reduce_max = P.ReduceMax(keep_dims=True).set_strategy(
        strategy=stra_list[9])
    self.sub = P.Sub().set_strategy(strategy=stra_list[10])
def __init__(self, rgb_range, rgb_mean, rgb_std=(1.0, 1.0, 1.0), sign=-1):
    """Construct the class MeanShift.

    :param rgb_range: range of tensor, usually 1.0 or 255.0
    :param rgb_mean: mean of rgb value
    :param rgb_std: std of rgb value
    :param sign: -1 for subtract, 1 for add
    """
    super(MeanShift, self).__init__()
    # 1x1 conv over the 3 RGB channels; its weight and bias are overwritten
    # below so the conv applies a fixed per-channel shift/scale.
    self.conv2d = nn.Conv2d(3,
                            3,
                            kernel_size=1,
                            stride=1,
                            padding=0,
                            has_bias=True,
                            group=1,
                            dilation=1,
                            pad_mode="pad")
    # Unique parameter names so several MeanShift instances can coexist.
    self.conv2d.update_parameters_name("conv2d_" + uuid.uuid1().hex[:8] + ".")
    std = Tensor(rgb_std, mindspore.float32)
    # Identity kernel, then divided per-channel by std.
    # NOTE(review): these assignments rebind conv2d.weight / conv2d.bias to
    # plain Tensors instead of Parameters — confirm that is intended (it
    # detaches them from training, consistent with requires_grad below).
    self.conv2d.weight = Tensor(
        np.eye(3).reshape(3, 3, 1, 1).astype(np.float32))
    self.reshape = P.Reshape()
    self.div = P.Div()
    self.conv2d.weight = self.div(self.conv2d.weight,
                                  self.reshape(std, (3, 1, 1, 1)))
    # Bias = sign * range * mean / std, so the conv computes
    # (x + sign * range * mean) / std.
    self.conv2d.bias = sign * rgb_range * Tensor(rgb_mean, mindspore.float32)
    self.conv2d.bias = self.div(self.conv2d.bias, std)
    # NOTE(review): this sets a plain attribute on the cell — verify it
    # actually excludes the parameters from training.
    self.requires_grad = False
def __init__(self):
    super(MultiHeadAttn, self).__init__()

    def _fp16_matmul():
        # nn.MatMul computed in float16; weights remain float32.
        m = nn.MatMul()
        m.to_float(mstype.float16)
        return m

    def _param(shape):
        # Uniform[0, 1) initialised float32 parameter.
        return Parameter(
            Tensor(np.random.uniform(0, 1, shape).astype(np.float32)),
            name=None)

    # Q/K/V projection matmuls with 768x768 weights.
    self.matmul_0 = _fp16_matmul()
    self.matmul_0_w = _param((768, 768))
    self.matmul_1 = _fp16_matmul()
    self.matmul_1_w = _param((768, 768))
    self.matmul_2 = _fp16_matmul()
    self.matmul_2_w = _param((768, 768))
    # Q/K/V bias adds.
    self.add_3 = P.Add()
    self.add_3_bias = _param((768,))
    self.add_4 = P.Add()
    self.add_4_bias = _param((768,))
    self.add_5 = P.Add()
    self.add_5_bias = _param((768,))
    # Split of the 768-wide hidden into 12 heads of 64: (batch, 448, 12, 64).
    head_split_shape = (BATCH_SIZE, 448, 12, 64)
    self.reshape_6 = P.Reshape()
    self.reshape_6_shape = head_split_shape
    self.reshape_7 = P.Reshape()
    self.reshape_7_shape = head_split_shape
    self.reshape_8 = P.Reshape()
    self.reshape_8_shape = head_split_shape
    self.transpose_9 = P.Transpose()
    self.transpose_10 = P.Transpose()
    self.transpose_11 = P.Transpose()
    # Scaled dot-product attention: scores divided by 8 == sqrt(64).
    self.matmul_12 = _fp16_matmul()
    self.div_13 = P.Div()
    self.div_13_w = 8.0
    self.add_14 = P.Add()
    self.softmax_15 = nn.Softmax(axis=3)
    self.matmul_16 = _fp16_matmul()
    # Merge heads back to (batch, 448, 768) and apply output projection.
    self.transpose_17 = P.Transpose()
    self.reshape_18 = P.Reshape()
    self.reshape_18_shape = (BATCH_SIZE, 448, 768)
    self.matmul_19 = _fp16_matmul()
    self.matmul_19_w = _param((768, 768))
    self.add_20 = P.Add()
    self.add_20_bias = _param((768,))
def __init__(self):
    """Ops/constants for GeLU — the values match the erf formulation
    0.5 * x * (1 + erf(x / sqrt(2))); wiring happens in construct."""
    super(GeLU, self).__init__()
    self.div = P.Div()
    self.div_w = 1.4142135381698608  # sqrt(2)
    self.erf = P.Erf()
    self.add = P.Add()
    self.add_bias = 1.0
    self.mul = P.Mul()
    self.mul_w = 0.5
def __init__(self):
    """Arithmetic/assign ops plus three scalar int32 parameters."""
    super().__init__()
    # Primitive ops.
    self.mul = P.Mul()
    self.add = P.Add()
    self.sub = P.Sub()
    self.div = P.Div()
    self.assign = P.Assign()
    # Scalar int32 parameters (a=5, b=2, c=20), mutable via Assign.
    self.param_a = Parameter(Tensor(5, mstype.int32), name='a')
    self.param_b = Parameter(Tensor(2, mstype.int32), name='b')
    self.param_c = Parameter(Tensor(20, mstype.int32), name='c')
def __init__(self):
    """Numbered ops/constants for GeLU — values match the erf formulation
    0.5 * x * (1 + erf(x / sqrt(2))); wiring happens in construct."""
    super(GeLU, self).__init__()
    self.div_0 = P.Div()
    self.div_0_w = 1.4142135381698608  # sqrt(2)
    self.erf_1 = P.Erf()
    self.add_2 = P.Add()
    self.add_2_bias = 1.0
    self.mul_3 = P.Mul()
    self.mul_4 = P.Mul()
    self.mul_4_w = 0.5
def __init__(self, layer_norm_weight, layer_norm_bias):
    """init function

    Args:
        layer_norm_weight: externally created scale (gamma-like) parameter.
        layer_norm_bias: externally created shift (beta-like) parameter.
    """
    super(LayerNorm, self).__init__()
    # Primitive ops for the mean / variance / normalise pipeline.
    self.reducemean = P.ReduceMean(keep_dims=True)
    self.sub = P.Sub()
    self.pow = P.Pow()
    self.add = P.Add()
    self.sqrt = P.Sqrt()
    self.div = P.Div()
    self.mul = P.Mul()
    # Affine parameters are passed in (shared) rather than created here.
    self.layer_norm_weight = layer_norm_weight
    self.layer_norm_bias = layer_norm_bias
def __init__(self):
    """ReLU plus arithmetic/assign ops and two float32 parameters."""
    super().__init__()
    self.relu = nn.ReLU()
    self.mul = P.Mul()
    self.add = P.Add()
    self.sub = P.Sub()
    self.div = P.Div()
    self.assign = P.Assign()
    # Length-1 float32 parameters initialised to 5 and 2.
    self.param_a = Parameter(Tensor(np.full((1,), 5, dtype=np.float32)),
                             name='a')
    self.param_b = Parameter(Tensor(np.full((1,), 2, dtype=np.float32)),
                             name='b')
def __init__(self, decay_policy, decay_rate, cur_noise_multiplier,
             init_noise_multiplier):
    """Hold the decay configuration and the primitive ops used to update
    the current noise multiplier in place."""
    super(_MechanismsParamsUpdater, self).__init__()
    # Decay configuration and multiplier parameters.
    self._decay_policy = decay_policy
    self._decay_rate = decay_rate
    self._cur_noise_multiplier = cur_noise_multiplier
    self._init_noise_multiplier = init_noise_multiplier
    # Ops and constants for the update rule.
    self._one = Tensor(1, mstype.float32)
    self._div = P.Div()
    self._add = P.TensorAdd()
    self._assign = P.Assign()
    self._sub = P.Sub()
    self._mul = P.Mul()
    self._exp = P.Exp()
def __init__(self, batch_size, labels, rnn_hidden_size, nb_layers, audio_conf,
             rnn_type='LSTM', bidirectional=True, device_target='GPU'):
    """DeepSpeech-style model: conv front-end, stacked RNN, FC head.

    Args:
        batch_size: fixed batch size forwarded to BatchRNN.
        labels: label alphabet; its length fixes the output class count.
        rnn_hidden_size: hidden width of each RNN layer.
        nb_layers: number of stacked RNN layers.
        audio_conf: config object with sample_rate and window_size.
        rnn_type: RNN flavour name, e.g. 'LSTM'.
        bidirectional: whether the RNNs are bidirectional.
        device_target: backend hint forwarded to BatchRNN.
    """
    super(DeepSpeechModel, self).__init__()
    self.batch_size = batch_size
    self.hidden_size = rnn_hidden_size
    self.hidden_layers = nb_layers
    self.rnn_type = rnn_type
    self.audio_conf = audio_conf
    self.labels = labels
    self.bidirectional = bidirectional
    self.reshape_op = P.Reshape()
    self.shape_op = P.Shape()
    self.transpose_op = P.Transpose()
    self.add = P.Add()
    self.div = P.Div()
    sample_rate = self.audio_conf.sample_rate
    window_size = self.audio_conf.window_size
    num_classes = len(self.labels)
    self.conv = MaskConv()
    # This is to calculate
    self.pre, self.stride = self.get_conv_num()
    # Based on above convolutions and spectrogram size using conv formula (W - F + 2P)/ S+1
    rnn_input_size = int(math.floor((sample_rate * window_size) / 2) + 1)
    # NOTE(review): on the next two lines math.floor wraps only the
    # numerator (unlike the line above); for positive sizes the outer
    # int() truncation yields the same value as flooring the whole
    # expression, but confirm this parenthesisation is intended.
    rnn_input_size = int(math.floor(rnn_input_size + 2 * 20 - 41) / 2 + 1)
    rnn_input_size = int(math.floor(rnn_input_size + 2 * 10 - 21) / 2 + 1)
    # 32 conv output channels are flattened into the RNN input width.
    rnn_input_size *= 32
    self.RNN = BatchRNN(batch_size=self.batch_size,
                        input_size=rnn_input_size,
                        num_layers=nb_layers,
                        hidden_size=rnn_hidden_size,
                        bidirectional=bidirectional,
                        batch_norm=False,
                        rnn_type=self.rnn_type,
                        device_target=device_target)
    fully_connected = nn.Dense(rnn_hidden_size, num_classes, has_bias=False)
    self.fc = SequenceWise(fully_connected)
def __init__(self, mul_7_w_shape, add_8_bias_shape):
    """init function

    Args:
        mul_7_w_shape: shape of the learned scale (gamma-like) parameter.
        add_8_bias_shape: shape of the learned shift (beta-like) parameter.
    """
    super(LayerNorm, self).__init__()

    def _param(shape):
        # Uniform[0, 1) initialised float32 parameter.
        return Parameter(
            Tensor(np.random.uniform(0, 1, shape).astype(np.float32)),
            name=None)

    # Mean / centring / squared-deviation / variance pipeline.
    self.reducemean_0 = P.ReduceMean(keep_dims=True)
    self.sub_1 = P.Sub()
    self.pow_2 = P.Pow()
    self.pow_2_input_weight = 2.0
    self.reducemean_3 = P.ReduceMean(keep_dims=True)
    # Numerical-stability epsilon added to the variance.
    self.add_4 = P.Add()
    self.add_4_bias = 9.999999960041972e-13
    self.sqrt_5 = P.Sqrt()
    self.div_6 = P.Div()
    # Learned affine transform.
    self.mul_7 = P.Mul()
    self.mul_7_w = _param(mul_7_w_shape)
    self.add_8 = P.Add()
    self.add_8_bias = _param(add_8_bias_shape)
def __init__(self, passthrough_w_0, passthrough_w_1):
    """init function

    Args:
        passthrough_w_0: externally created scale (gamma-like) parameter.
        passthrough_w_1: externally created shift (beta-like) parameter.
    """
    super(LayerNorm, self).__init__()
    # Mean / centring / squared-deviation / variance pipeline.
    self.reducemean_0 = P.ReduceMean(keep_dims=True)
    self.sub_1 = P.Sub()
    self.pow_2 = P.Pow()
    self.pow_2_input_weight = 2.0
    self.reducemean_3 = P.ReduceMean(keep_dims=True)
    # Numerical-stability epsilon added to the variance.
    self.add_4 = P.Add()
    self.add_4_bias = 9.999999960041972e-13
    self.sqrt_5 = P.Sqrt()
    self.div_6 = P.Div()
    # Affine parameters are supplied by the caller (shared weights).
    self.mul_7 = P.Mul()
    self.mul_7_w = passthrough_w_0
    self.add_8 = P.Add()
    self.add_8_bias = passthrough_w_1
def __init__(self, batch_size, passthrough_w_0, passthrough_w_1,
             passthrough_w_2):
    """init function"""
    super(MultiHeadAttn, self).__init__()
    self.batch_size = batch_size

    def _param(shape):
        # Uniform[0, 1) initialised float32 parameter.
        return Parameter(
            Tensor(np.random.uniform(0, 1, shape).astype(np.float32)),
            name=None)

    # Q/K/V projection matmuls with locally created 4096x4096 weights.
    self.matmul_0 = nn.MatMul()
    self.matmul_0_w = _param((4096, 4096))
    self.matmul_1 = nn.MatMul()
    self.matmul_1_w = _param((4096, 4096))
    self.matmul_2 = nn.MatMul()
    self.matmul_2_w = _param((4096, 4096))
    # Q/K/V biases are shared parameters passed in by the caller.
    self.add_3 = P.Add()
    self.add_3_bias = passthrough_w_0
    self.add_4 = P.Add()
    self.add_4_bias = passthrough_w_1
    self.add_5 = P.Add()
    self.add_5_bias = passthrough_w_2
    # Split of the 4096-wide hidden into 64 heads of 64: (batch, 512, 64, 64).
    head_split_shape = (batch_size, 512, 64, 64)
    self.reshape_6 = P.Reshape()
    self.reshape_6_shape = head_split_shape
    self.reshape_7 = P.Reshape()
    self.reshape_7_shape = head_split_shape
    self.reshape_8 = P.Reshape()
    self.reshape_8_shape = head_split_shape
    self.transpose_9 = P.Transpose()
    self.transpose_10 = P.Transpose()
    self.transpose_11 = P.Transpose()
    # Scaled dot-product attention: scores divided by 8 == sqrt(64).
    self.matmul_12 = nn.MatMul()
    self.div_13 = P.Div()
    self.div_13_w = 8.0
    self.add_14 = P.Add()
    self.softmax_15 = nn.Softmax(axis=3)
    self.matmul_16 = nn.MatMul()
    self.transpose_17 = P.Transpose()
    # Output projection.
    # NOTE(review): P.MatMul is a 2-D matmul but this weight is 3-D
    # (64, 64, 4096) — presumably reshaped in construct; confirm there.
    self.matmul_18 = P.MatMul()
    self.matmul_18_weight = _param((64, 64, 4096))
    self.add_19 = P.Add()
    self.add_19_bias = _param((4096,))
def __init__(self, sparse=False):
    """Softmax cross-entropy assembled from primitive ops.

    Args:
        sparse (bool): presumably toggles one-hot encoding of index labels
            in construct.
    """
    super(SoftmaxCrossEntropyExpand, self).__init__()
    self.sparse = sparse
    # Softmax pieces: shift by max, exponentiate, normalise.
    self.reduce_max = P.ReduceMax(keep_dims=True)
    self.sub = P.Sub()
    self.exp = P.Exp()
    self.reduce_sum = P.ReduceSum(keep_dims=True)
    self.div = P.Div()
    # One-hot encoding constants.
    self.onehot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)
    # Cross-entropy computation and reductions.
    self.log = P.Log()
    self.mul = P.Mul()
    self.mul2 = P.Mul()
    self.sum_cross_entropy = P.ReduceSum(keep_dims=False)
    self.cast = P.Cast()
    self.reduce_mean = P.ReduceMean(keep_dims=False)
def __init__(self):
    super(LayerNorm, self).__init__()
    # Mean / centring, with a cast to float32.
    self.reducemean = P.ReduceMean(keep_dims=True)
    self.sub = P.Sub()
    self.cast = P.Cast()
    self.cast_to = mstype.float32
    # Squared deviation and the numerical-stability epsilon.
    self.pow = P.Pow()
    self.pow_weight = 2.0
    self.add = P.Add()
    self.add_bias_0 = 9.999999960041972e-13
    self.sqrt = P.Sqrt()
    self.div = P.Div()
    # Learned 768-wide affine parameters.
    self.mul = P.Mul()
    self.mul_weight = Parameter(
        Tensor(np.random.uniform(0, 1, (768,)).astype(np.float32)),
        name=None)
    self.add_bias_1 = Parameter(
        Tensor(np.random.uniform(0, 1, (768,)).astype(np.float32)),
        name=None)
def __init__(self,
             num_bits=2,
             compute_type=mstype.float32,
             clip_value=1.0,
             per_channel=False):
    """init function

    Args:
        num_bits (int): bit width used for quantisation. Default: 2.
        compute_type: mindspore dtype used for the computation.
        clip_value (float): clipping bound applied through clamp.
        per_channel (bool): per-channel rather than per-layer quantisation.
    """
    # Fix: the parent constructor was never invoked, leaving base-class
    # state uninitialised (the parallel QuantizeWeightCell.__init__ in this
    # file does call it). A bare, argument-free super() call is
    # backward-compatible with any cooperative base class here.
    super().__init__()
    # Configuration.
    self.num_bits = num_bits
    self.compute_type = compute_type
    self.clip_value = clip_value
    self.per_channel = per_channel
    # Primitive helpers used by the quantisation math.
    self.clamp = C.clip_by_value
    self.abs = P.Abs()
    self.sum = P.ReduceSum()
    self.nelement = F.size
    self.div = P.Div()
    self.cast = P.Cast()
    self.max = P.ReduceMax()
    self.min = P.ReduceMin()
    self.floor = P.Floor()
def __init__(self,
             num_bits=8,
             compute_type=mstype.float32,
             clip_value=1.0,
             per_channel=False):
    """init function

    Args:
        num_bits (int): bit width used for quantisation. Default: 8.
        compute_type: mindspore dtype used for the computation.
        clip_value (float): clipping bound applied through clamp.
        per_channel (bool): per-channel rather than per-layer quantisation.
    """
    super(QuantizeWeightCell, self).__init__()
    # Configuration.
    self.num_bits = num_bits
    self.compute_type = compute_type
    self.clip_value = clip_value
    self.per_channel = per_channel
    # Primitive helpers used by the quantisation math.
    self.clamp = C.clip_by_value
    self.abs = P.Abs()
    self.sum = P.ReduceSum()
    self.nelement = F.size
    self.div = P.Div()
    self.cast = P.Cast()
    self.max = P.ReduceMax()
    self.min = P.ReduceMin()
    self.round = P.Round()
def __init__(self):
    super(LayerNorm, self).__init__()
    # Mean / centring, with a cast to float32 before the variance step.
    self.reducemean_0 = P.ReduceMean(keep_dims=True)
    self.sub_1 = P.Sub()
    self.cast_2 = P.Cast()
    self.cast_2_to = mstype.float32
    # Squared deviation, its mean, and the numerical-stability epsilon.
    self.pow_3 = P.Pow()
    self.pow_3_input_weight = 2.0
    self.reducemean_4 = P.ReduceMean(keep_dims=True)
    self.add_5 = P.Add()
    self.add_5_bias = 9.999999960041972e-13
    self.sqrt_6 = P.Sqrt()
    self.div_7 = P.Div()
    # Learned 768-wide affine parameters.
    self.mul_8 = P.Mul()
    self.mul_8_w = Parameter(
        Tensor(np.random.uniform(0, 1, (768,)).astype(np.float32)),
        name=None)
    self.add_9 = P.Add()
    self.add_9_bias = Parameter(
        Tensor(np.random.uniform(0, 1, (768,)).astype(np.float32)),
        name=None)
def __init__(self, bert_layer_norm_weight_shape, bert_layer_norm_bias_shape,
             eps=1e-12):
    """init function

    Args:
        bert_layer_norm_weight_shape: shape of the scale parameter.
        bert_layer_norm_bias_shape: shape of the shift parameter.
        eps (float): variance epsilon for numerical stability.
    """
    super(BertLayerNorm, self).__init__()

    def _param(shape):
        # Uniform[0, 1) initialised float32 parameter.
        return Parameter(
            Tensor(np.random.uniform(0, 1, shape).astype(np.float32)),
            name=None)

    # Primitive ops for the mean / variance / normalise pipeline.
    self.reducemean = P.ReduceMean(keep_dims=True)
    self.sub = P.Sub()
    self.pow = P.Pow()
    self.add = P.Add()
    self.sqrt = P.Sqrt()
    self.div = P.Div()
    self.mul = P.Mul()
    self.variance_epsilon = eps
    # Learned affine parameters.
    self.bert_layer_norm_weight = _param(bert_layer_norm_weight_shape)
    self.bert_layer_norm_bias = _param(bert_layer_norm_bias_shape)
def __init__(self, seq_len):
    """init function

    Args:
        seq_len (int): sequence length used for the head split/merge shapes.
    """
    super(MultiHeadAttn, self).__init__()

    def _param(shape):
        # Uniform[0, 1) initialised float32 parameter.
        return Parameter(
            Tensor(np.random.uniform(0, 1, shape).astype(np.float32)),
            name=None)

    # Single fp16 matmul shared by all projections.
    self.matmul = nn.MatMul()
    self.matmul.to_float(mstype.float16)
    # Q/K/V projection weights (768x768) and biases.
    self.query = _param((768, 768))
    self.key = _param((768, 768))
    self.value = _param((768, 768))
    self.add = P.Add()
    self.query_bias = _param((768,))
    self.key_bias = _param((768,))
    self.value_bias = _param((768,))
    # Head split (batch, seq, 12 heads, 64 per head) and merge back to 768.
    self.reshape = P.Reshape()
    self.to_shape_0 = (BATCH_SIZE, seq_len, 12, 64)
    self.transpose = P.Transpose()
    # Score scaling: divide by 8 == sqrt(64).
    self.div = P.Div()
    self.div_w = 8.0
    self.softmax = nn.Softmax(axis=3)
    self.to_shape_1 = (BATCH_SIZE, seq_len, 768)
    # Output projection.
    self.context_weight = _param((768, 768))
    self.context_bias = _param((768,))
def __init__(self, temperature=0.07, contrast_mode='all',
             base_temperature=0.07):
    """Supervised contrastive loss configuration.

    Args:
        temperature (float): temperature applied to similarity logits.
        contrast_mode (str): anchor selection mode (e.g. 'all').
        base_temperature (float): reference temperature for loss scaling.
    """
    super(SupConLoss, self).__init__()
    # Hyper-parameters.
    self.temperature = temperature
    self.contrast_mode = contrast_mode
    self.base_temperature = base_temperature
    # Primitive ops used by construct.
    self.normalize = P.L2Normalize(axis=2)
    self.eye = P.Eye()
    self.unbind = P.Unstack(axis=1)
    self.cat = P.Concat(axis=0)
    self.matmul = P.MatMul()
    self.div = P.Div()
    self.transpose = P.Transpose()
    self.maxes = P.ArgMaxWithValue(axis=1, keep_dims=True)
    self.tile = P.Tile()
    self.scatter = P.ScatterNd()
    self.oneslike = P.OnesLike()
    self.exp = P.Exp()
    self.sum = P.ReduceSum(keep_dims=True)
    self.log = P.Log()
    self.reshape = P.Reshape()
    self.mean = P.ReduceMean()
def __init__(self, norm_bound=1.0, initial_noise_multiplier=1.5,
             noise_decay_rate=6e-4, decay_policy='Time', seed=0):
    """
    Adaptive Gaussian noise mechanism.

    Args:
        norm_bound (float): positive clipping bound, validated below.
        initial_noise_multiplier (float): positive starting multiplier.
        noise_decay_rate (float): decay rate, must lie in (0.0, 1.0).
        decay_policy (str): 'Time' or 'Step'; anything else raises NameError.
        seed (int): seed for the Normal sampling op.
    """
    super(AdaGaussianRandom, self).__init__()
    # Validate scalars before wrapping them in tensors/parameters.
    norm_bound = check_value_positive('norm_bound', norm_bound)
    initial_noise_multiplier = check_value_positive(
        'initial_noise_multiplier', initial_noise_multiplier)
    self._norm_bound = Tensor(norm_bound, mstype.float32)
    initial_noise_multiplier = Tensor(initial_noise_multiplier,
                                      mstype.float32)
    # Keep both the frozen initial multiplier and the mutable current one.
    self._initial_noise_multiplier = Parameter(
        initial_noise_multiplier, name='initial_noise_multiplier')
    self._noise_multiplier = Parameter(initial_noise_multiplier,
                                       name='noise_multiplier')
    self._mean = Tensor(0, mstype.float32)
    noise_decay_rate = check_param_type('noise_decay_rate', noise_decay_rate,
                                        float)
    check_param_in_range('noise_decay_rate', noise_decay_rate, 0.0, 1.0)
    self._noise_decay_rate = Tensor(noise_decay_rate, mstype.float32)
    if decay_policy not in ['Time', 'Step']:
        raise NameError("The decay_policy must be in ['Time', 'Step'], but "
                        "get {}".format(decay_policy))
    self._decay_policy = decay_policy
    # Ops used when sampling noise and decaying the multiplier.
    self._sub = P.Sub()
    self._mul = P.Mul()
    self._add = P.TensorAdd()
    self._div = P.Div()
    self._dtype = mstype.float32
    self._normal = P.Normal(seed=seed)
    self._assign = P.Assign()
    self._one = Tensor(1, self._dtype)
def construct(self, data, label, sens=None):
    """
    construct a compute flow.

    One differentially-private training step with loss scaling: per
    micro-batch forward/backward, gradient clipping with clip-bound
    statistics (beta), optional mechanism noise, overflow detection and a
    conditional optimizer update.

    Args:
        data: input batch, split into micro-batches along axis 0.
        label: label batch, split the same way.
        sens: optional externally supplied loss-scale value; when None the
            cell's managed loss scale parameter is used.

    Returns:
        Tuple (loss, overflow condition, scaling sensitivity) with the
        optimizer update attached as a dependency.
    """
    init = False
    if not self.gpu_target:
        # init overflow buffer
        init = self.alloc_status()
        # clear overflow buffer
        self.clear_status(init)
    # Loss-scaling sensitivity: caller-supplied or the managed parameter.
    if sens is None:
        scaling_sens = self.loss_scale
    else:
        scaling_sens = sens
    # DP clip
    weights = self.weights
    record_datas = self._split(data)
    record_labels = self._split(label)
    # first index
    loss = self.network(record_datas[0], record_labels[0])
    scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens,
                                                     F.dtype(loss))
    record_grad = self.grad(self.network, weights)(record_datas[0],
                                                   record_labels[0],
                                                   scaling_sens_filled)
    # beta counts micro-batches whose gradient norm is already below the
    # clipping bound; used at the end to adapt the bound.
    beta = self._zero
    square_sum = self._zero
    for grad in record_grad:
        square_sum = self._add(square_sum,
                               self._reduce_sum(self._square_all(grad)))
    norm_grad = self._sqrt(square_sum)
    beta = self._add(beta,
                     self._cast(self._less(norm_grad, self._norm_bound),
                                mstype.float32))
    record_grad = self._clip_by_global_norm(record_grad, GRADIENT_CLIP_TYPE,
                                            self._norm_bound)
    grads = record_grad
    total_loss = loss
    # Remaining micro-batches: accumulate clipped gradients, beta and loss.
    for i in range(1, self._micro_batches):
        loss = self.network(record_datas[i], record_labels[i])
        scaling_sens_filled = C.ones_like(loss) * F.cast(scaling_sens,
                                                         F.dtype(loss))
        record_grad = self.grad(self.network, weights)(record_datas[i],
                                                       record_labels[i],
                                                       scaling_sens_filled)
        square_sum = self._zero
        for grad in record_grad:
            square_sum = self._add(square_sum,
                                   self._reduce_sum(self._square_all(grad)))
        norm_grad = self._sqrt(square_sum)
        beta = self._add(beta,
                         self._cast(self._less(norm_grad, self._norm_bound),
                                    mstype.float32))
        record_grad = self._clip_by_global_norm(record_grad,
                                                GRADIENT_CLIP_TYPE,
                                                self._norm_bound)
        grads = self._tuple_add(grads, record_grad)
        total_loss = P.TensorAdd()(total_loss, loss)
    loss = P.Div()(total_loss, self._micro_float)
    beta = self._div(beta, self._micro_batches)
    if self._noise_mech is not None:
        grad_noise_tuple = ()
        for grad_item in grads:
            # Fix: the mechanism is stored as self._noise_mech in __init__;
            # self._mech is never defined on this cell and would raise
            # AttributeError here.
            grad_noise = self._noise_mech(grad_item)
            grad_noise_tuple = grad_noise_tuple + (grad_noise,)
        grads = self._tuple_add(grads, grad_noise_tuple)
        grads = self._hyper_map(F.partial(_grad_scale, self._micro_float),
                                grads)
        # update mech parameters
        if self._noise_mech_param_updater is not None:
            multiplier = self._noise_mech_param_updater()
            loss = F.depend(loss, multiplier)
    grads = self.hyper_map(F.partial(_grad_scale, scaling_sens), grads)
    # apply grad reducer on grads
    grads = self.grad_reducer(grads)
    # get the overflow buffer
    if not self.gpu_target:
        self.get_status(init)
        # sum overflow buffer elements, 0:not overflow , >0:overflow
        flag_sum = self.reduce_sum(init, (0,))
    else:
        flag_sum = self.hyper_map(F.partial(_grad_overflow), grads)
        flag_sum = self.addn(flag_sum)
        # convert flag_sum to scalar
        flag_sum = self.reshape(flag_sum, (()))
    if self.is_distributed:
        # sum overflow flag over devices
        flag_reduce = self.allreduce(flag_sum)
        cond = self.less_equal(self.base, flag_reduce)
    else:
        cond = self.less_equal(self.base, flag_sum)
    overflow = cond
    if sens is None:
        overflow = self.loss_scaling_manager(self.loss_scale, cond)
    # if there is no overflow, do optimize
    if overflow:
        opt = False
    else:
        opt = self.optimizer(grads)
    ret = (loss, cond, scaling_sens)
    if self._clip_mech is not None:
        next_norm_bound = self._clip_mech(beta, self._norm_bound)
        # Fix: `P.assign` is not a valid primitive reference; use the
        # Assign op created in __init__ (self._assign) and rebind so the
        # update is not pruned from the graph.
        self._norm_bound = self._assign(self._norm_bound, next_norm_bound)
    return F.depend(ret, opt)
def __init__(self,
             network,
             optimizer,
             norm_bound=1.0,
             sens=1.0,
             micro_batches=None,
             noise_mech=None,
             clip_mech=None):
    """
    One-step DP training cell: wraps network + optimizer with micro-batch
    gradient clipping and optional noise/clip mechanisms.

    Args:
        network: the forward network (its grads are taken here).
        optimizer: optimizer whose parameters are updated.
        norm_bound (float): gradient clipping bound.
        sens (float): gradient sensitivity scale.
        micro_batches (int): number of micro-batches; required (ValueError
            if None).
        noise_mech: optional DP noise mechanism.
        clip_mech: optional adaptive clip mechanism.
    """
    super(_TrainOneStepCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = optimizer.parameters
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.sens = sens
    # Gradient reduction only in data/hybrid parallel modes.
    self.reducer_flag = False
    self.grad_reducer = None
    parallel_mode = _get_parallel_mode()
    if parallel_mode in (ParallelMode.DATA_PARALLEL,
                         ParallelMode.HYBRID_PARALLEL):
        self.reducer_flag = True
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                   mean, degree)
    # dp params
    if micro_batches is None:
        msg = 'micro_batches must give in differential privacy, but got value: {}'.format(
            micro_batches)
        LOGGER.error(TAG, msg)
        raise ValueError(msg)
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    # Split the batch into micro-batches along axis 0.
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._tuple_add = _TupleAdd()
    self._add = P.TensorAdd()
    self._norm = nn.Norm()
    self._hyper_map = C.HyperMap()
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()
    self._micro_float = Tensor(micro_batches, mstype.float32)
    # Optional updater that decays the noise multiplier each step.
    self._noise_mech_param_updater = None
    if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=self._noise_mech._decay_policy,
            decay_rate=self._noise_mech._noise_decay_rate,
            cur_noise_multiplier=self._noise_mech._noise_multiplier,
            init_noise_multiplier=self._noise_mech._initial_noise_multiplier)
def __init__(self,
             network,
             optimizer,
             scale_update_cell=None,
             micro_batches=None,
             norm_bound=1.0,
             noise_mech=None,
             clip_mech=None):
    """
    One-step DP training cell with dynamic loss scaling: wraps network +
    optimizer with overflow detection, micro-batch clipping and optional
    noise/clip mechanisms.

    Args:
        network: the forward network (its grads are taken here).
        optimizer: optimizer whose parameters are updated.
        scale_update_cell: loss-scale manager cell; when given, a managed
            loss_scale parameter is created.
        micro_batches (int): number of micro-batches.
        norm_bound (float): gradient clipping bound.
        noise_mech: optional DP noise mechanism.
        clip_mech: optional adaptive clip mechanism.
    """
    super(_TrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
    self.network = network
    self.network.set_grad()
    self.network.add_flags(defer_inline=True)
    self.weights = ParameterTuple(network.trainable_params())
    self.optimizer = optimizer
    self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
    self.hyper_map = C.HyperMap()
    # Overflow detection differs by backend: float-status ops on GPU,
    # NPU float-status buffers otherwise.
    if context.get_context("device_target") == "GPU":
        self.gpu_target = True
        self.float_status = P.FloatStatus()
        self.addn = P.AddN()
        self.reshape = P.Reshape()
    else:
        self.gpu_target = False
        self.alloc_status = NPUAllocFloatStatus()
        self.get_status = NPUGetFloatStatus()
        self.clear_status = NPUClearFloatStatus()
        self.reduce_sum = ReduceSum(keep_dims=False)
    self.base = Tensor(1, mstype.float32)
    self.less_equal = LessEqual()
    self.depend_parameter_use = ControlDepend(depend_mode=1)
    self.allreduce = P.AllReduce()
    self.parallel_mode = _get_parallel_mode()
    self.grad_reducer = F.identity
    self.reducer_flag = self.parallel_mode in [
        ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL
    ]
    if self.reducer_flag:
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer = DistributedGradReducer(optimizer.parameters,
                                                   mean, degree)
    self.is_distributed = self.parallel_mode != ParallelMode.STAND_ALONE
    # Managed loss scale (only when a scale-update cell is provided).
    self.loss_scale = None
    self.loss_scaling_manager = scale_update_cell
    if scale_update_cell:
        self.loss_scale = Parameter(Tensor(
            scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                    name="loss_scale")
    self.add_flags(has_effect=True)
    # dp params
    self._micro_batches = micro_batches
    self._norm_bound = norm_bound
    # Split the batch into micro-batches along axis 0.
    self._split = P.Split(0, self._micro_batches)
    self._clip_by_global_norm = _ClipGradients()
    self._noise_mech = noise_mech
    self._clip_mech = clip_mech
    self._add = P.TensorAdd()
    self._norm = nn.Norm()
    self._tuple_add = _TupleAdd()
    self._hyper_map = C.HyperMap()
    self._micro_float = Tensor(micro_batches, mstype.float32)
    self._zero = Tensor(0, mstype.float32)
    self._assign = P.Assign()
    self._div = P.Div()
    self._sqrt = P.Sqrt()
    self._reduce_sum = P.ReduceSum()
    self._square_all = P.Square()
    self._less = P.Less()
    self._cast = P.Cast()
    # Optional updater that decays the noise multiplier each step.
    self._noise_mech_param_updater = None
    if self._noise_mech is not None and self._noise_mech._decay_policy is not None:
        self._noise_mech_param_updater = _MechanismsParamsUpdater(
            decay_policy=self._noise_mech._decay_policy,
            decay_rate=self._noise_mech._noise_decay_rate,
            cur_noise_multiplier=self._noise_mech._noise_multiplier,
            init_noise_multiplier=self._noise_mech._initial_noise_multiplier)