def __init__(self, mean=None, sd=None, seed=0, dtype=mstype.float32, name="Normal"):
    """
    Constructor of normal distribution.
    """
    param = dict(locals())
    super(Normal, self).__init__(dtype, name, param)
    if mean is not None and sd is not None:
        self._mean_value = convert_to_batch(mean, self._broadcast_shape, dtype)
        self._sd_value = convert_to_batch(sd, self._broadcast_shape, dtype)
        check_greater_equal_zero(self._sd_value, "Standard deviation")
    else:
        self._mean_value = mean
        self._sd_value = sd

    # ops needed for the class
    self.exp = P.Exp()
    self.add = P.TensorAdd()
    self.mul = P.Mul()
    self.sq = P.Square()
    self.log = P.Log()
    self.sqrt = P.Sqrt()
    self.realdiv = P.RealDiv()
    self.expm1 = P.Expm1() if get_context('device_target') == 'Ascend' else self._expm1_by_step
    self.normal = P.Normal(seed=seed)
    self.shape = P.Shape()
    self.zeroslike = P.ZerosLike()
    self.const = P.ScalarToArray()
def __init__(self, mean=None, sd=None, seed=None, dtype=mstype.float32, name="Normal"):
    """
    Constructor of Normal.
    """
    param = dict(locals())
    param['param_dict'] = {'mean': mean, 'sd': sd}
    valid_dtype = mstype.float_type
    Validator.check_type(type(self).__name__, dtype, valid_dtype)
    super(Normal, self).__init__(seed, dtype, name, param)

    self._mean_value = self._add_parameter(mean, 'mean')
    self._sd_value = self._add_parameter(sd, 'sd')
    if self._sd_value is not None:
        check_greater_zero(self._sd_value, "Standard deviation")

    # ops needed for the class
    self.exp = exp_generic
    self.expm1 = expm1_generic
    self.log = log_generic
    self.erf = P.Erf()
    self.squeeze = P.Squeeze(0)
    self.cast = P.Cast()
    self.const = P.ScalarToArray()
    self.shape = P.Shape()
    self.sq = P.Square()
    self.sqrt = P.Sqrt()
def __init__(self): """init function""" super(Rerank_Downstream, self).__init__() self.dense_0 = nn.Dense(in_channels=4096, out_channels=8192, has_bias=True) self.relu_1 = nn.ReLU() self.reducemean_2 = P.ReduceMean(keep_dims=True) self.sub_3 = P.Sub() self.sub_4 = P.Sub() self.pow_5 = P.Pow() self.pow_5_input_weight = 2.0 self.reducemean_6 = P.ReduceMean(keep_dims=True) self.add_7 = P.Add() self.add_7_bias = 9.999999960041972e-13 self.sqrt_8 = P.Sqrt() self.div_9 = P.Div() self.mul_10 = P.Mul() self.mul_10_w = Parameter(Tensor( np.random.uniform(0, 1, (8192, )).astype(np.float32)), name=None) self.add_11 = P.Add() self.add_11_bias = Parameter(Tensor( np.random.uniform(0, 1, (8192, )).astype(np.float32)), name=None) self.dense_12 = nn.Dense(in_channels=8192, out_channels=2, has_bias=True)
def __init__(self, rate=None, seed=0, dtype=mstype.float32, name="Exponential"):
    """
    Constructor of Exponential distribution.
    """
    param = dict(locals())
    super(Exponential, self).__init__(dtype, name, param)
    if rate is not None:
        self._rate = cast_to_tensor(rate, mstype.float32)
        check_greater_zero(self._rate, "rate")
    else:
        self._rate = rate

    self.minval = np.finfo(np.float).tiny

    # ops needed for the class
    self.const = P.ScalarToArray()
    self.dtypeop = P.DType()
    self.exp = P.Exp()
    self.fill = P.Fill()
    self.less = P.Less()
    self.log = P.Log()
    self.select = P.Select()
    self.shape = P.Shape()
    self.sqrt = P.Sqrt()
    self.sq = P.Square()
    self.uniform = P.UniformReal(seed=seed)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, weight_decay=0.0, loss_scale=1.0,
             num_hidden_layers=24, batch_size=12, damping=0.03,
             decay_filter=lambda x: 'layernorm' not in x.name.lower() and 'bias' not in x.name.lower()):
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32), name="momentum")
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.matmul = P.MatMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.gather = P.GatherV2()
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.num_hidden_layers = num_hidden_layers
    self.sqrt = P.Sqrt()
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
    self.expand = P.ExpandDims()
    self.square = P.Square()
    self.inv = P.Inv()
    self.batch_size = batch_size
    self.damping = damping
    self.one = Tensor(1, mstype.int32)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), name="cov_step", requires_grad=False)
def __init__(self, mean=None, sd=None, seed=0, dtype=mstype.float32, name="Normal"):
    """
    Constructor of normal distribution.
    """
    param = dict(locals())
    valid_dtype = mstype.float_type
    check_type(dtype, valid_dtype, type(self).__name__)
    super(Normal, self).__init__(seed, dtype, name, param)
    self.parameter_type = dtype
    if mean is not None and sd is not None:
        self._mean_value = cast_to_tensor(mean, self.parameter_type)
        self._sd_value = cast_to_tensor(sd, self.parameter_type)
        check_greater_zero(self._sd_value, "Standard deviation")
    else:
        self._mean_value = mean
        self._sd_value = sd

    # ops needed for the class
    self.exp = exp_generic
    self.expm1 = expm1_generic
    self.log = log_generic
    self.erf = erf_generic
    self.squeeze = P.Squeeze(0)
    self.cast = P.Cast()
    self.const = P.ScalarToArray()
    self.fill = P.Fill()
    self.shape = P.Shape()
    self.sq = P.Square()
    self.sqrt = P.Sqrt()
    self.zeroslike = P.ZerosLike()
def __init__(self, low=None, high=None, seed=0, dtype=mstype.float32, name="Uniform"):
    """
    Constructor of Uniform distribution.
    """
    param = dict(locals())
    super(Uniform, self).__init__(dtype, name, param)
    if low is not None and high is not None:
        self._low = convert_to_batch(low, self._broadcast_shape, dtype)
        self._high = convert_to_batch(high, self._broadcast_shape, dtype)
        check_greater(self.low, self.high, "low value", "high value")
    else:
        self._low = low
        self._high = high

    # ops needed for the class
    self.const = P.ScalarToArray()
    self.dtypeop = P.DType()
    self.exp = P.Exp()
    self.fill = P.Fill()
    self.less = P.Less()
    self.lessequal = P.LessEqual()
    self.log = P.Log()
    self.logicaland = P.LogicalAnd()
    self.select = P.Select()
    self.shape = P.Shape()
    self.sq = P.Square()
    self.sqrt = P.Sqrt()
    self.uniform = P.UniformReal(seed=seed)
    self.zeroslike = P.ZerosLike()
def __init__(self, probs=None, seed=0, dtype=mstype.int32, name="Geometric"): """ Constructor of Geometric distribution. """ param = dict(locals()) super(Geometric, self).__init__(dtype, name, param) if probs is not None: self._probs = cast_to_tensor(probs, dtype=mstype.float32) check_prob(self._probs) else: self._probs = probs self.minval = np.finfo(np.float).tiny # ops needed for the class self.const = P.ScalarToArray() self.dtypeop = P.DType() self.fill = P.Fill() self.floor = P.Floor() self.issubclass = P.IsSubClass() self.less = P.Less() self.log = P.Log() self.pow = P.Pow() self.select = P.Select() self.shape = P.Shape() self.sq = P.Square() self.sqrt = P.Sqrt() self.uniform = P.UniformReal(seed=seed)
def __init__(self, loc, scale, seed=0, dtype=mstype.float32, name="Gumbel"):
    """
    Constructor of Gumbel distribution.
    """
    valid_dtype = mstype.float_type
    Validator.check_type_name("dtype", dtype, valid_dtype, type(self).__name__)
    gumbel_cdf = msb.GumbelCDF(loc, scale)
    super(Gumbel, self).__init__(
        distribution=msd.Uniform(0.0, 1.0, dtype=dtype),
        bijector=msb.Invert(gumbel_cdf),
        seed=seed, name=name)

    # overwrite default_parameters and parameter_names
    self._reset_parameters()
    self._loc = self._add_parameter(loc, 'loc')
    self._scale = self._add_parameter(scale, 'scale')
    self._gumbel_bijector = gumbel_cdf

    # ops needed for the class
    self.cast = P.Cast()
    self.const = P.ScalarToArray()
    self.exp = exp_generic
    self.expm1 = expm1_generic
    self.fill = P.Fill()
    self.lgamma = nn.LGamma()
    self.log = log_generic
    self.shape = P.Shape()
    self.sqrt = P.Sqrt()
def __init__(self, loc=None, scale=None, seed=0, dtype=mstype.float32, name="LogNormal"):
    """
    Constructor of LogNormal distribution.
    """
    super(LogNormal, self).__init__(distribution=msd.Normal(loc, scale, dtype=dtype),
                                    bijector=msb.Exp(),
                                    seed=seed, name=name)

    # overwrite default_parameters and parameter_names
    self._reset_parameters()
    self._loc = self._add_parameter(loc, 'loc')
    self._scale = self._add_parameter(scale, 'scale')

    self.log_2pi = np.log(2 * np.pi)

    # ops needed for the class
    self.dtypeop = P.DType()
    self.exp = exp_generic
    self.expm1 = P.Expm1()
    self.log = log_generic
    self.const = P.ScalarToArray()
    self.erf = P.Erf()
    self.fill = P.Fill()
    self.greater = P.Greater()
    self.select = P.Select()
    self.shape = P.Shape()
    self.sq = P.Square()
    self.sqrt = P.Sqrt()
    self.cast = P.Cast()
    self.squeeze = P.Squeeze(0)
def __init__(self, params, learning_rate, momentum, matrix_A, matrix_G, A_inv_max, G_inv_max,
             weight_decay=0.0, loss_scale=1.0,
             decay_filter=lambda x: x.name not in []):
    super(THOR, self).__init__(learning_rate, params, weight_decay, loss_scale)
    if isinstance(momentum, float) and momentum < 0.0:
        raise ValueError("momentum should be at least 0.0, but got momentum {}".format(momentum))
    self.momentum = Parameter(Tensor(momentum, mstype.float32))
    self.params = self.parameters
    self.moments = self.params.clone(prefix="moments", init='zeros')
    self.hyper_map = C.HyperMap()
    self.opt = P.ApplyMomentum()
    self.matrix_A = ParameterTuple(matrix_A)
    self.matrix_G = ParameterTuple(matrix_G)
    self.A_inv_max = ParameterTuple(A_inv_max)
    self.G_inv_max = ParameterTuple(G_inv_max)
    self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
    self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
    self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
    self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
    self.transpose = P.Transpose()
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.mul = P.Mul()
    self.weight_idx = []
    for i in range(len(self.params)):
        if "conv" in self.params[i].name or "end_point" in self.params[i].name:
            self.weight_idx.append(i)
    self.weight_idx.append(len(self.params))
    self.feature_map = [1.0 / 12544,
                        1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                        1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
                        1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                        1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
                        1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
                        1.0]
    mean = _get_gradients_mean()
    degree = _get_device_num()
    parameter_length = len(self.feature_map)
    self.grad_reducer_Amax = DistributedGradReducerThor(parameter_length, ((27,), 2), mean, degree)
    self.grad_reducer_Gmax = DistributedGradReducerThor(parameter_length, ((27,), 4), mean, degree)
    self.grad_reducer_A = DistributedGradReducerThor(parameter_length, ((27,), 6), mean, degree)
    self.grad_reducer_G = DistributedGradReducerThor(parameter_length, ((27,), 8), mean, degree)
    self.matrix_A_inv = ()
    self.matrix_G_inv = ()
    self.matrix_max_inv = ()

    for i in range(54):
        self.matrix_max_inv = self.matrix_max_inv + (
            Parameter(initializer(1, [1], mstype.float32), name="matrix_max" + str(i), requires_grad=False),)
    self.log = P.Log()
    self.exp = P.Exp()
    self.sqrt = P.Sqrt()
    self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
    self.assign = P.Assign()
    self.cast = P.Cast()
    self.thor = True
    self.weight_decay = weight_decay * loss_scale
    self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
def __init__(self, params, learning_rate=1e-3, beta1=0.9, beta2=0.999, eps=1e-8, use_locking=False,
             use_nesterov=False, weight_decay=0.0, loss_scale=1.0):
    super(Adam, self).__init__(learning_rate, params, weight_decay, loss_scale)
    _check_param_value(beta1, beta2, eps, weight_decay, self.cls_name)
    validator.check_value_type("use_locking", use_locking, [bool], self.cls_name)
    validator.check_value_type("use_nesterov", use_nesterov, [bool], self.cls_name)
    validator.check_value_type("loss_scale", loss_scale, [float], self.cls_name)
    validator.check_number_range("loss_scale", loss_scale, 1.0, float("inf"), Rel.INC_LEFT, self.cls_name)

    self.beta1 = Tensor(beta1, mstype.float32)
    self.beta2 = Tensor(beta2, mstype.float32)
    self.beta1_power = Parameter(initializer(1, [1], mstype.float32), name="beta1_power")
    self.beta2_power = Parameter(initializer(1, [1], mstype.float32), name="beta2_power")
    self.eps = eps

    self.moment1 = self.parameters.clone(prefix="moment1", init='zeros')
    self.moment2 = self.parameters.clone(prefix="moment2", init='zeros')

    self.hyper_map = C.HyperMap()
    self.opt = P.Adam(use_locking, use_nesterov)

    self.pow = P.Pow()
    self.sqrt = P.Sqrt()
    self.one = Tensor(np.array([1.0]).astype(np.float32))
    self.realdiv = P.RealDiv()
def __init__(self, probs=None, seed=0, dtype=mstype.int32, name="Bernoulli"): """ Constructor of Bernoulli distribution. """ param = dict(locals()) valid_dtype = mstype.int_type + mstype.uint_type + mstype.float_type check_type(dtype, valid_dtype, type(self).__name__) super(Bernoulli, self).__init__(seed, dtype, name, param) self.parameter_type = mstype.float32 if probs is not None: self._probs = cast_to_tensor(probs, mstype.float32) check_prob(self.probs) else: self._probs = probs # ops needed for the class self.exp = exp_generic self.log = log_generic self.erf = erf_generic self.squeeze = P.Squeeze(0) self.cast = P.Cast() self.const = P.ScalarToArray() self.dtypeop = P.DType() self.floor = P.Floor() self.fill = P.Fill() self.less = P.Less() self.shape = P.Shape() self.select = P.Select() self.sq = P.Square() self.sqrt = P.Sqrt() self.uniform = C.uniform
def __init__(self, loc=None, scale=None, seed=0, dtype=mstype.float32, name="LogNormal"):
    """
    Constructor of LogNormal distribution.
    """
    super(LogNormal, self).__init__(distribution=msd.Normal(loc, scale, dtype=dtype),
                                    bijector=msb.Exp(),
                                    dtype=dtype, seed=seed, name=name)

    self.log_2pi = np.log(2 * np.pi)

    # ops needed for the class
    self.exp = exp_generic
    self.expm1 = expm1_generic
    self.log = log_generic
    self.const = P.ScalarToArray()
    self.erf = P.Erf()
    self.fill = P.Fill()
    self.shape = P.Shape()
    self.sq = P.Square()
    self.sqrt = P.Sqrt()
    self.zeroslike = P.ZerosLike()
def __init__(self, probs=None, seed=0, dtype=mstype.int32, name="Bernoulli"): """ Constructor of Bernoulli distribution. """ param = dict(locals()) super(Bernoulli, self).__init__(dtype, name, param) if probs is not None: self._probs = cast_to_tensor(probs, dtype=mstype.float32) check_prob(self.probs) else: self._probs = probs self.seed = seed # ops needed for the class self.cast = P.Cast() self.const = P.ScalarToArray() self.dtypeop = P.DType() self.erf = P.Erf() self.fill = P.Fill() self.log = P.Log() self.less = P.Less() self.shape = P.Shape() self.select = P.Select() self.sq = P.Square() self.sqrt = P.Sqrt() self.uniform = P.UniformReal(seed=seed)
def __init__(self, rate=None, seed=0, dtype=mstype.float32, name="Exponential"):
    """
    Constructor of Exponential distribution.
    """
    param = dict(locals())
    valid_dtype = mstype.float_type
    check_type(dtype, valid_dtype, type(self).__name__)
    super(Exponential, self).__init__(seed, dtype, name, param)
    self.parameter_type = dtype
    if rate is not None:
        self._rate = cast_to_tensor(rate, self.parameter_type)
        check_greater_zero(self._rate, "rate")
    else:
        self._rate = rate

    self.minval = np.finfo(np.float).tiny

    # ops needed for the class
    self.exp = exp_generic
    self.log = log_generic
    self.squeeze = P.Squeeze(0)
    self.cast = P.Cast()
    self.const = P.ScalarToArray()
    self.dtypeop = P.DType()
    self.fill = P.Fill()
    self.less = P.Less()
    self.select = P.Select()
    self.shape = P.Shape()
    self.sqrt = P.Sqrt()
    self.sq = P.Square()
    self.uniform = C.uniform
def __init__(self, num_groups, num_channels, eps=1e-05, affine=True, gamma_init='ones', beta_init='zeros'):
    super(GroupNorm, self).__init__()
    self.num_groups = check_int_positive(num_groups)
    self.num_channels = check_int_positive(num_channels)
    if num_channels % num_groups != 0:
        raise ValueError("num_channels should be divided by num_groups")
    self.eps = check_typename('eps', eps, (float,))
    self.affine = check_bool(affine)

    gamma = initializer(gamma_init, [num_channels, 1, 1])
    beta = initializer(beta_init, [num_channels, 1, 1])
    if self.affine:
        self.gamma = Parameter(gamma, name='gamma')
        self.beta = Parameter(beta, name='beta')
    else:
        self.gamma = gamma
        self.beta = beta
    self.shape = F.shape
    self.reshape = F.reshape
    self.reduce_mean = P.ReduceMean(keep_dims=True)
    self.square = F.square
    self.reduce_sum = P.ReduceSum(keep_dims=True)
    self.sqrt = P.Sqrt()
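# Illustrative sketch, not part of the original sources: the ops registered above
# (ReduceMean, Square, ReduceSum, Sqrt) are the building blocks of standard group
# normalization. A NumPy reference for an NCHW input could look as follows; the
# helper name group_norm_reference and the NCHW layout are assumptions made here
# purely for illustration.
import numpy as np

def group_norm_reference(x, num_groups, gamma, beta, eps=1e-5):
    """NumPy sketch of GroupNorm for an (N, C, H, W) array; gamma/beta have shape (C, 1, 1)."""
    n, c, h, w = x.shape
    grouped = x.reshape(n, num_groups, -1)
    mean = grouped.mean(axis=2, keepdims=True)
    var = ((grouped - mean) ** 2).mean(axis=2, keepdims=True)
    normalized = (grouped - mean) / np.sqrt(var + eps)
    normalized = normalized.reshape(n, c, h, w)
    return normalized * gamma + beta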
def __init__(self, mean=None, sd=None, seed=0, dtype=mstype.float32, name="Normal"):
    """
    Constructor of normal distribution.
    """
    param = dict(locals())
    valid_dtype = mstype.float_type
    check_type(dtype, valid_dtype, "Normal")
    super(Normal, self).__init__(seed, dtype, name, param)
    if mean is not None and sd is not None:
        self._mean_value = convert_to_batch(mean, self.broadcast_shape, dtype)
        self._sd_value = convert_to_batch(sd, self.broadcast_shape, dtype)
        check_greater_equal_zero(self._sd_value, "Standard deviation")
    else:
        self._mean_value = mean
        self._sd_value = sd

    # ops needed for the class
    self.const = P.ScalarToArray()
    self.erf = P.Erf()
    self.exp = P.Exp()
    self.expm1 = self._expm1_by_step
    self.fill = P.Fill()
    self.log = P.Log()
    self.shape = P.Shape()
    self.sq = P.Square()
    self.sqrt = P.Sqrt()
    self.zeroslike = P.ZerosLike()
def _update_run_op_for_map_tensor(beta1, beta2, eps, lr, weight_decay_tensor, param, m, v, gradient, decay_flag):
    op_mul = P.Mul()
    op_square = P.Square()
    op_sqrt = P.Sqrt()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1,
                                            gradient_fp32)

    next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                                            op_square(gradient_fp32))

    update = next_m / (op_sqrt(next_v) + eps)
    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(lr, update)
    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))

    return next_v
def __init__(self, config, roi_layer, out_channels, featmap_strides, batch_size=1, finest_scale=56, mask=False):
    super(SingleRoIExtractor, self).__init__()
    cfg = config
    self.train_batch_size = batch_size
    self.out_channels = out_channels
    self.featmap_strides = featmap_strides
    self.num_levels = len(self.featmap_strides)
    self.out_size = roi_layer['mask_out_size'] if mask else roi_layer['out_size']
    self.mask = mask
    self.sample_num = roi_layer['sample_num']
    self.roi_layers = self.build_roi_layers(self.featmap_strides)
    self.roi_layers = L.CellList(self.roi_layers)
    self.sqrt = P.Sqrt()
    self.log = P.Log()
    self.finest_scale_ = finest_scale
    self.clamp = C.clip_by_value
    self.cast = P.Cast()
    self.equal = P.Equal()
    self.select = P.Select()

    _mode_16 = False
    self.dtype = np.float16 if _mode_16 else np.float32
    self.ms_dtype = mstype.float16 if _mode_16 else mstype.float32
    self.set_train_local(cfg, training=True)
def __init__(self, axis=(), keep_dims=False):
    super(Norm, self).__init__()
    self.axis = axis
    self.keep_dims = keep_dims
    self.reduce_sum = P.ReduceSum(True)
    self.sqrt = P.Sqrt()
    self.squeeze = P.Squeeze(self.axis)
def __init__(self,
             in_channels,
             out_channels,
             weight_init='normal',
             bias_init='zeros',
             damping=0.03,
             loss_scale=1,
             frequency=278,
             batch_size=32,
             has_bias=True,
             activation=None):
    super(Dense_Thor_GPU, self).__init__()
    self.in_channels = Validator.check_positive_int(in_channels)
    self.out_channels = Validator.check_positive_int(out_channels)
    self.has_bias = Validator.check_bool(has_bias)
    self.thor = True
    if isinstance(weight_init, Tensor):
        if weight_init.ndim != 2 or weight_init.shape[0] != out_channels or \
                weight_init.shape[1] != in_channels:
            raise ValueError("weight_init shape error")
    self.weight = Parameter(initializer(weight_init, [out_channels, in_channels]))

    if self.has_bias:
        if isinstance(bias_init, Tensor):
            if bias_init.ndim != 1 or bias_init.shape[0] != out_channels:
                raise ValueError("bias_init shape error")
        self.bias = Parameter(initializer(bias_init, [out_channels]))

    self.matmul = P.MatMul(transpose_b=True)
    self.bias_add = P.BiasAdd()

    self.activation = get_activation(activation)
    self.activation_flag = self.activation is not None
    split_dim = 128
    matrix_A_shape, matrix_G_shape = caculate_matmul_shape(self.in_channels, self.out_channels, split_dim)
    self.matrix_A_inv = Parameter(Tensor(np.zeros(matrix_A_shape).astype(np.float32)), requires_grad=False)
    self.matrix_G_inv = Parameter(Tensor(np.zeros(matrix_G_shape).astype(np.float32)), requires_grad=False)
    self.broadcast_to = P.BroadcastTo(matrix_A_shape)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)
    self.shape = P.Shape()
    self.reshape = P.Reshape()
    self.transpose = P.Transpose()
    self.mul = P.Mul()
    self.cube_matmul = P.MatMul(transpose_a=True)
    self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
    self.batch_size = Tensor(batch_size, mstype.float16)
    self.getG = P.InsertGradientOf(self.save_gradient)
    self.damping = Parameter(Tensor(damping), requires_grad=False)
    self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
    self.dampingG = Tensor(np.identity(out_channels), mstype.float32)
    self.cast = P.Cast()
    self.gather = P.Gather()
    self.freq = Tensor(frequency, mstype.int32)
    self.axis = 0
    self.add = P.Add()
    self.sqrt = P.Sqrt()
    self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
    self.vector_matmul = P.BatchMatMul(transpose_a=True)
def _update_run_op(beta1, beta2, eps, lr, weight_decay, param, m, v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimations. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimations. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Applies weight decay or not.
        optim_filter (bool): Applies parameter update or not.

    Returns:
        Tensor, the new value of v after updating.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta1,
                                                gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(op_cast(F.tuple_to_array((1.0,)), mstype.float32) - beta2,
                                                op_square(gradient_fp32))

        update = next_m / (eps + op_sqrt(next_v))
        if decay_flag:
            update = op_mul(weight_decay, param_fp32) + update

        update_with_lr = op_mul(lr, update)
        next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

        next_param = F.depend(next_param, F.assign(param, op_cast(next_param, F.dtype(param))))
        next_param = F.depend(next_param, F.assign(m, op_cast(next_m, F.dtype(m))))
        next_param = F.depend(next_param, F.assign(v, op_cast(next_v, F.dtype(v))))

        return op_cast(next_param, F.dtype(param))
    return gradient
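# Illustrative sketch, not part of the original sources: the fp32 math performed by
# _update_run_op above corresponds to an Adam step with additive (decoupled-style)
# weight decay. The NumPy helper below, adam_weight_decay_step, is a hypothetical
# name introduced only to make the arithmetic easy to check.
import numpy as np

def adam_weight_decay_step(param, m, v, grad, lr, beta1, beta2, eps, weight_decay):
    """One reference step mirroring the fp32 computation in _update_run_op (decay_flag=True)."""
    next_m = beta1 * m + (1.0 - beta1) * grad
    next_v = beta2 * v + (1.0 - beta2) * np.square(grad)
    update = next_m / (eps + np.sqrt(next_v))
    update = weight_decay * param + update          # the decay_flag branch
    next_param = param - lr * update
    return next_param, next_m, next_v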
def __init__(self, seed, dtype, name, param):
    """
    Constructor of distribution class.
    """
    super(Distribution, self).__init__()
    if seed is None:
        seed = 0
    validator.check_value_type('name', name, [str], type(self).__name__)
    validator.check_non_negative_int(seed, 'seed', name)

    self._name = name
    self._seed = seed
    self._dtype = cast_type_for_device(dtype)
    self._parameters = {}

    # parsing parameters
    for k in param.keys():
        if not(k == 'self' or k.startswith('_')):
            self._parameters[k] = param[k]

    # some attributes
    if 'distribution' in self.parameters.keys():
        self.parameter_type = self.parameters['distribution'].parameter_type
    else:
        self.parameter_type = set_param_type(self.parameters['param_dict'], dtype)
    self._broadcast_shape = self._calc_broadcast_shape()
    self._is_scalar_batch = self._check_is_scalar_batch()

    # set the function to call according to the derived class's attributes
    self._set_prob()
    self._set_log_prob()
    self._set_sd()
    self._set_var()
    self._set_cdf()
    self._set_survival()
    self._set_log_cdf()
    self._set_log_survival()
    self._set_cross_entropy()

    self.context_mode = context.get_context('mode')
    self.device_target = context.get_context('device_target')
    self.checktuple = CheckTuple()
    self.checktensor = CheckTensor()
    self.broadcast = broadcast_to

    # ops needed for the base class
    self.cast_base = P.Cast()
    self.dtype_base = P.DType()
    self.exp_base = exp_generic
    self.fill_base = P.Fill()
    self.log_base = log_generic
    self.sametypeshape_base = P.SameTypeShape()
    self.sq_base = P.Square()
    self.sqrt_base = P.Sqrt()
    self.shape_base = P.Shape()
def test_sqrt():
    """ test_sqrt """
    input_tensor = Tensor(np.array([[4, 4], [9, 9]]))
    sqrt = P.Sqrt()
    expect = np.array([[2, 2], [3, 3]])
    result = sqrt(input_tensor)
    assert np.all(result.asnumpy() == expect)
def sqrt(nptype):
    np.random.seed(0)
    x_np = np.random.rand(2, 3, 4, 4).astype(nptype)

    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
    output_ms = P.Sqrt()(Tensor(x_np))
    output_np = np.sqrt(x_np)
    assert np.allclose(output_ms.asnumpy(), output_np)
def __init__(self, axis=(), keep_dims=False):
    super(Norm, self).__init__()
    Validator.check_value_type("keep_dims", keep_dims, [bool], self.cls_name)
    self.axis = axis
    self.keep_dims = keep_dims
    self.reduce_sum = P.ReduceSum(True)
    self.sqrt = P.Sqrt()
    self.squeeze = P.Squeeze(self.axis)
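# Illustrative sketch, not part of the original sources: the two Norm cells above wire
# ReduceSum, Square (via F.square in their construct) and Sqrt into an L2 norm, i.e.
# sqrt(sum(x**2, axis)). A NumPy equivalent for comparison; l2_norm_reference is a
# hypothetical helper name, and mapping axis=() to "reduce all dimensions" is an
# assumption about the intended MindSpore default.
import numpy as np

def l2_norm_reference(x, axis=(), keep_dims=False):
    """NumPy counterpart of the L2 norm computed by the Norm cell."""
    reduce_axis = None if axis == () else axis
    return np.sqrt(np.sum(np.square(x), axis=reduce_axis, keepdims=keep_dims))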
def __init__(self,
             vocab_size,
             embedding_size,
             embedding_shape,
             use_one_hot_embeddings=False,
             initializer_range=0.02,
             batch_size=12,
             damping=0.03,
             loss_scale=1,
             frequency=100):
    super(Embedding_Thor, self).__init__()
    self.vocab_size = vocab_size
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.embedding_table = Parameter(initializer(TruncatedNormal(initializer_range),
                                                 [vocab_size, embedding_size]))
    self.thor = True
    self.expand = P.ExpandDims()
    self.shape_flat = (-1,)
    self.gather = P.Gather()
    self.one_hot = P.OneHot()
    self.on_value = Tensor(1.0, mstype.float32)
    self.off_value = Tensor(0.0, mstype.float32)
    self.array_mul = P.MatMul()
    self.reshape = P.Reshape()
    self.em_shape = tuple(embedding_shape)
    self.shape = P.Shape()
    self.loss_scale = Tensor(1 / loss_scale, mstype.float16)

    self.matrix_A_inv = Parameter(Tensor(np.zeros([vocab_size]).astype(np.float16)), requires_grad=False)
    self.matrix_G_inv = Parameter(Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16)),
                                  requires_grad=False)
    self.fake_G = Tensor(np.zeros([embedding_size, embedding_size]).astype(np.float16))
    self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32))
    self.dampingG = Tensor(np.identity(embedding_size), mstype.float32)
    self.cov_step = Parameter(initializer(0, [1], mstype.int32), requires_grad=False)
    self.freq = Tensor(frequency, mstype.int32)
    self.axis = 0
    self.damping = damping
    self.gather = P.Gather()
    self.sqrt = P.Sqrt()
    self.mul = P.Mul()
    self.cast = P.Cast()
    self.cube_matmul = P.CusMatMulCube(transpose_a=True)
    self.vector_matmul = P.CusBatchMatMul()
    self.cholesky = P.CusCholeskyTrsm()
    self.matrix_combine = P.CusMatrixCombine()
    self.reduce_sum = P.ReduceSum(keep_dims=False)
    self.inv = P.Inv()
    self.getG = P.InsertGradientOf(self.save_gradient)
    self.batch_size = batch_size
def __init__(self):
    super(Net, self).__init__()
    self.add = P.TensorAdd()
    self.sub = P.Sub()
    self.mul = P.Mul()
    self.div = P.RealDiv()
    self.sqrt = P.Sqrt()
    self.pow = P.Pow()
    self.neg = P.Neg()
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
                         beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2, eps, values, indices),
                                              shapes), param))
        return success

    if not target:
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        success = F.depend(success, F.assign(m, op_mul(beta1, m)))
        success = F.depend(success, F.assign(v, op_mul(beta2, v)))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(m,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))

        next_v = scatter_add(v,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m,
                                    op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)
            F.assign(m, m_temp / _scaler_ten)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)
        next_param = param - lr_t * param_update

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success