Example #1
 def __init__(self):
     super().__init__()
     self.op = P.Mul()
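
A minimal sketch of the forward pass for this cell; the construct method and its two inputs are assumptions, not part of the original fragment:

 def construct(self, x, y):
     # element-wise product of the two inputs
     return self.op(x, y)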
Example #2
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 field_size,
                 param_init='normal',
                 target='CPU',
                 slice_mode='batch_slice',
                 feature_num_list=None,
                 max_norm=None,
                 sparse=True,
                 operator='SUM'):
        super(MultiFieldEmbeddingLookup,
              self).__init__(vocab_size, embedding_size, param_init, target,
                             slice_mode, feature_num_list, max_norm, sparse)
        self.field_size = validator.check_value_type('field_size', field_size,
                                                     [int], self.cls_name)
        self.operator = operator

        self.mul = P.Mul()
        self.inf_mask_mul = P.Mul()
        self.bias_add = P.TensorAdd()
        self.inf_add = P.TensorAdd()
        self.merge_op = None
        self.count_op = P.UnsortedSegmentSum()
        self.abs = P.Abs()
        self.equal = P.Equal()
        self.add = P.TensorAdd()
        self.cast = P.Cast()
        self.div_no_nan = P.DivNoNan()
        self.expand = P.ExpandDims()
        self.max_mask_mul = P.Mul()
        self.max_no_equal = P.NotEqual()

        if operator == MultiFieldEmbeddingLookup.OPERATOR_SUM:
            self.merge_op = P.UnsortedSegmentSum()
        elif operator == MultiFieldEmbeddingLookup.OPERATOR_MAX:
            self.merge_op = P.UnsortedSegmentMax()
        elif operator == MultiFieldEmbeddingLookup.OPERATOR_MEAN:
            self.merge_op = P.UnsortedSegmentSum()
        else:
            raise ValueError(
                "The operator supports ['SUM', 'MAX', 'MEAN'], but found: " +
                str(operator))

        parallel_mode = _get_parallel_mode()
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                             ParallelMode.AUTO_PARALLEL)
        if slice_mode in ["table_row_slice", "batch_slice"
                          ] and is_auto_parallel:
            self.merge_op.shard(
                ((get_group_size(), 1, 1), (get_group_size(), 1)))
            self.expand.shard(((get_group_size(), ), ))
            self.bias_add.shard(((1, 1), (1, 1)))
            self.mul.shard(
                ((get_group_size(), 1, 1), (get_group_size(), 1, 1)))
            self.count_op.shard(((get_group_size(), 1), (get_group_size(), 1)))
            self.add.shard(((get_group_size(), ), (get_group_size(), )))
            self.div_no_nan.shard(
                ((get_group_size(), 1), (get_group_size(), 1)))
            self.max_mask_mul.shard(
                ((get_group_size(), 1), (get_group_size(), 1)))
            self.max_no_equal.shard(((1, ), ()))
            if operator == MultiFieldEmbeddingLookup.OPERATOR_MAX:
                self.equal.shard(((get_group_size(), 1, 1), ()))
                self.inf_mask_mul.shard(((get_group_size(), 1, 1), ()))
                self.merge_op.shard(
                    ((get_group_size(), 1), (get_group_size(), )))
                self.count_op.shard(
                    ((get_group_size(), ), (get_group_size(), )))
                self.inf_add.shard(
                    ((get_group_size(), 1, 1), (get_group_size(), 1, 1)))
        elif slice_mode == "table_column_slice" and is_auto_parallel:
            self.merge_op.shard(((1, 1, get_group_size()), (1, 1)))
            self.div_no_nan.shard(((1, get_group_size()), (1, 1)))
            self.bias_add.shard(((1, 1), (1, 1)))
            self.mul.shard(((1, 1, 1), (1, 1, get_group_size())))
            self.count_op.shard(((1, 1), (1, 1)))
            self.add.shard(((1, ), (1, )))
            self.max_mask_mul.shard(((1, get_group_size()), (1, 1)))
            self.expand.shard(((1, ), ))
            self.max_no_equal.shard(((1, ), ()))
            if operator == MultiFieldEmbeddingLookup.OPERATOR_MAX:
                self.equal.shard(((1, 1, 1), ()))
                self.inf_mask_mul.shard(((1, 1, 1), ()))
                self.merge_op.shard(((1, get_group_size()), (1, )))
                self.count_op.shard(((1, ), (1, )))
                self.inf_add.shard(((1, 1, get_group_size()), (1, 1, 1)))
        else:
            if is_auto_parallel:
                raise ValueError(
                    "slice_mode should be  ['table_row_slice', 'batch_slice' and \
                       'table_column_slice'], but get " + str(slice_mode))

        # Min value for fp32
        self.negative_inf_value = -3.402823466E+38
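
The layer above is MindSpore's MultiFieldEmbeddingLookup. A hedged usage sketch, assuming the standard (input_indices, input_values, field_ids) call signature; the shapes and values are made up for illustration:

import numpy as np
import mindspore.nn as nn
from mindspore import Tensor, dtype as mstype

# 32-word vocab, 8-dim embeddings, ids grouped into 2 fields and summed per field
net = nn.MultiFieldEmbeddingLookup(vocab_size=32, embedding_size=8, field_size=2,
                                   operator='SUM')
input_indices = Tensor(np.array([[1, 5, 7], [2, 4, 6]]), mstype.int32)
input_values = Tensor(np.ones((2, 3)), mstype.float32)   # weight of each id
field_ids = Tensor(np.array([[0, 1, 1], [0, 0, 1]]), mstype.int32)
out = net(input_indices, input_values, field_ids)         # shape (2, 2, 8)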
Example #3
from mindspore.ops import Primitive
from mindspore.ops import operations as P

Add = P.TensorAdd()
Mul = P.Mul()
Sqrt = P.Sqrt()
Square = P.Square()
make_tuple = Primitive('make_tuple')
tuple_getitem = Primitive('tuple_getitem')
LambNextRight = Primitive('LambNextRight')


class FnDict:
    def __init__(self):
        self.fnDict = {}

    def __call__(self, fn):
        self.fnDict[fn.__name__] = fn

    def __getitem__(self, name):
        return self.fnDict[name]
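
FnDict is a small registry used by these graph-pass tests: calling the instance as a decorator stores a function under its own name, and indexing retrieves it. A hedged usage sketch; the before function is illustrative only and is never executed in Python, it is handed to the graph compiler by name:

fns = FnDict()

@fns                      # stores the function; __call__ returns None on purpose
def before(x, y):
    return Add(Mul(x, y), Square(x))

graph_fn = fns['before']  # fetch the registered graph by name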
Example #4
 def __init__(self, mul_weight, strategy1=None, strategy2=None):
     super().__init__()
     self.mul = P.Mul().shard(strategy1)
     self.neg = P.Neg().shard(strategy2)
     self.mul_weight = Parameter(mul_weight, "w1")
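
A plausible forward pass for the cell above, assuming it multiplies the input by mul_weight and then negates the result (construct is not part of the original fragment):

 def construct(self, x):
     out = self.mul(x, self.mul_weight)   # sharded by strategy1
     return self.neg(out)                 # sharded by strategy2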
Example #5
    def __init__(self,
                 in_channel,
                 out_channel,
                 stride=1,
                 use_se=False,
                 se_block=False):
        super(ResidualBlock, self).__init__()
        self.stride = stride
        self.use_se = use_se
        self.se_block = se_block
        channel = out_channel // self.expansion
        self.conv1 = _conv1x1(in_channel,
                              channel,
                              stride=1,
                              use_se=self.use_se)
        self.bn1 = _bn(channel)
        if self.use_se and self.stride != 1:
            self.e2 = nn.SequentialCell([
                _conv3x3(channel, channel, stride=1, use_se=True),
                _bn(channel),
                nn.ReLU(),
                nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same')
            ])
        else:
            self.conv2 = _conv3x3(channel,
                                  channel,
                                  stride=stride,
                                  use_se=self.use_se)
            self.bn2 = _bn(channel)

        self.conv3 = _conv1x1(channel,
                              out_channel,
                              stride=1,
                              use_se=self.use_se)
        self.bn3 = _bn_last(out_channel)
        if self.se_block:
            self.se_global_pool = P.ReduceMean(keep_dims=False)
            self.se_dense_0 = _fc(out_channel,
                                  int(out_channel / 4),
                                  use_se=self.use_se)
            self.se_dense_1 = _fc(int(out_channel / 4),
                                  out_channel,
                                  use_se=self.use_se)
            self.se_sigmoid = nn.Sigmoid()
            self.se_mul = P.Mul()
        self.relu = nn.ReLU()

        self.down_sample = False

        if stride != 1 or in_channel != out_channel:
            self.down_sample = True
        self.down_sample_layer = None

        if self.down_sample:
            if self.use_se:
                if stride == 1:
                    self.down_sample_layer = nn.SequentialCell([
                        _conv1x1(in_channel,
                                 out_channel,
                                 stride,
                                 use_se=self.use_se),
                        _bn(out_channel)
                    ])
                else:
                    self.down_sample_layer = nn.SequentialCell([
                        nn.MaxPool2d(kernel_size=2, stride=2, pad_mode='same'),
                        _conv1x1(in_channel,
                                 out_channel,
                                 1,
                                 use_se=self.use_se),
                        _bn(out_channel)
                    ])
            else:
                self.down_sample_layer = nn.SequentialCell([
                    _conv1x1(in_channel,
                             out_channel,
                             stride,
                             use_se=self.use_se),
                    _bn(out_channel)
                ])
        self.add = F.tensor_add
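
The constructor above declares all the sub-layers of a ResNet bottleneck; the following construct is a sketch of the usual forward pass wired from those attributes. The SE branch and the pooling axes follow the common model-zoo pattern and should be read as assumptions:

    def construct(self, x):
        identity = x
        out = self.relu(self.bn1(self.conv1(x)))
        if self.use_se and self.stride != 1:
            out = self.e2(out)
        else:
            out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))
        if self.se_block:
            out_se = out
            out = self.se_global_pool(out, (2, 3))              # squeeze over H, W
            out = self.se_dense_1(self.relu(self.se_dense_0(out)))
            out = self.se_sigmoid(out)
            out = F.reshape(out, F.shape(out) + (1, 1))
            out = self.se_mul(out, out_se)                      # excite
        if self.down_sample:
            identity = self.down_sample_layer(identity)
        return self.relu(self.add(out, identity))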
Example #6
 def __init__(self, strategy1, strategy2, strategy3):
     super().__init__()
     self.mul1 = P.Mul().shard(strategy1)
     self.reduce_sum = P.ReduceSum(keep_dims=False).shard(strategy2)
     self.mul2 = P.Mul().shard(strategy3)
Example #7
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking,
                         use_nesterov, target, beta1_power, beta2_power, beta1,
                         beta2, eps, lr, gradient, param, m, v, ps_parameter,
                         cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr),
                  op_shape(beta1), op_shape(beta2), op_shape(eps),
                  op_shape(values), op_shape(indices))
        success = F.depend(
            success,
            pull(
                push((beta1_power, beta2_power, lr, beta1, beta2, eps, values,
                      indices), shapes), param))
        return success

    if not target:
        success = F.depend(
            success,
            sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                       eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        assign_m = F.assign(m, op_mul(beta1, m))
        assign_v = F.assign(v, op_mul(beta2, v))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(
            m, grad_indices,
            op_mul(F.tuple_to_array((1.0, )) - beta1, grad_value))

        next_v = scatter_add(
            v, grad_indices,
            op_mul(F.tuple_to_array((1.0, )) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            assign_m_nesterov = F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(
                m, op_mul(grad_indices, _scaler_one),
                op_mul(F.tuple_to_array((1.0, )) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)

            m_recover = F.assign(m, m_temp / _scaler_ten)

            F.control_depend(m_temp, assign_m_nesterov)
            F.control_depend(assign_m_nesterov, div_value)
            F.control_depend(param_update, m_recover)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)

        next_param = param - lr_t * param_update

        F.control_depend(assign_m, next_m)
        F.control_depend(assign_v, next_v)

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success
Example #8
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 data_format='NCHW',
                 has_bias=False,
                 weight_init='normal',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 bias_init='zeros'):
        self.thor = True
        ksizes = (1, kernel_size, kernel_size, 1)
        self.hw = kernel_size * kernel_size
        strides = (1, stride, stride, 1)
        kernel_size = twice(kernel_size)
        super(Conv2d_Thor, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            data_format,
            has_bias,
            weight_init,
            bias_init,
        )
        self.conv2d = P.Conv2D(out_channel=self.out_channels,
                               kernel_size=self.kernel_size,
                               mode=1,
                               pad_mode=self.pad_mode,
                               pad=self.padding,
                               stride=self.stride,
                               dilation=self.dilation,
                               group=self.group)

        self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.transpose02314 = P.CusTranspose02314()
        self.matrix_A_dim = self.in_channels * self.kernel_size[
            0] * self.kernel_size[1]
        self.matrix_G_dim = self.out_channels
        self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(
            self.matrix_A_dim, self.in_channels, True)
        self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape(
            self.matrix_G_dim, self.in_channels, False)
        self.matrix_A_device_temp_shape = (self.matrix_A_device_shape[0],
                                           self.matrix_A_device_shape[2],
                                           self.matrix_A_device_shape[1],
                                           self.matrix_A_device_shape[3])
        self.matrix_G_device_temp_shape = (self.matrix_G_device_shape[0],
                                           self.matrix_G_device_shape[2],
                                           self.matrix_G_device_shape[1],
                                           self.matrix_G_device_shape[3])
        self.matrix_A_inv = Parameter(Tensor(
            np.reshape(
                np.identity(self.matrix_A_device_dim).astype(np.float16),
                self.matrix_A_device_shape)),
                                      name='matrix_A_inv',
                                      requires_grad=False)
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   name="A_inv_max",
                                   requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.reshape(
                np.identity(self.matrix_G_device_dim).astype(np.float16),
                self.matrix_G_device_shape)),
                                      name="matrix_G_inv",
                                      requires_grad=False)

        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   name="G_inv_max",
                                   requires_grad=False)
        self.fake_G = Tensor(
            np.reshape(
                np.identity(self.matrix_G_device_dim).astype(np.float16),
                self.matrix_G_device_shape))
        self.fake_G_inv_max = Tensor(np.zeros([
            1,
        ]).astype(np.float32))

        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  name="cov_step",
                                  requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.vector_matmul = P.CusBatchMatMul()
        self.diag_block_dim = 128
        self.channels_slice_flag = False
        if self.in_channels % C0 != 0:
            self.channels_slice_flag = True

        self.padA_flag = False
        if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
            and self.matrix_A_dim > self.diag_block_dim:
            self.padA_flag = True
            pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
            self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
        self.device_shape_pad_flag = False
        if self.matrix_A_dim != self.matrix_A_device_dim:
            self.device_shape_pad_flag = True
            self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels),
                                           (0, 0), (0, C0 - self.in_channels)))
        self.slice = P.Slice()
        self.gather = P.GatherV2()
        self.freq = Tensor(frequency, mstype.int32)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.axis = 0

        dampingA_dim = self.matrix_A_dim
        if (self.matrix_A_dim % self.diag_block_dim
            ) != 0 and self.matrix_A_dim > self.diag_block_dim:
            dampingA_dim = (self.matrix_A_dim // self.diag_block_dim +
                            1) * self.diag_block_dim
        dampingG_dim = self.matrix_G_dim
        if (self.matrix_G_dim % self.diag_block_dim
            ) != 0 and self.matrix_G_dim > self.diag_block_dim:
            dampingG_dim = (self.matrix_G_dim // self.diag_block_dim +
                            1) * self.diag_block_dim

        self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32)
        self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32)
        self.fused_abs_max1 = P.CusFusedAbsMax1(
            [self.matrix_A_dim, self.matrix_A_dim])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
Example #9
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 has_bias=True,
                 activation=None):
        super(Dense_Thor, self).__init__()
        self.in_channels = check_int_positive(in_channels)
        self.out_channels = check_int_positive(out_channels)
        self.has_bias = check_bool(has_bias)
        self.thor = True
        if isinstance(weight_init, Tensor):
            if weight_init.dim() != 2 or weight_init.shape()[0] != out_channels or \
                    weight_init.shape()[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(initializer(weight_init,
                                            [out_channels, in_channels]),
                                name="weight")

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.dim() != 1 or bias_init.shape(
                )[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]),
                                  name="bias")

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None

        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([128, 128, 16, 16]).astype(np.float16)),
                                      name='matrix_A_inv',
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([63, 63, 16, 16]).astype(np.float16)),
                                      name="matrix_G_inv",
                                      requires_grad=False)
        self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  name="cov_step",
                                  requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.vector_matmul = P.CusBatchMatMul()
        self.pad = P.Pad(((0, 24), (0, 24)))
        self.pad1 = P.Pad(((0, 8), (0, 8)))
        self.slice = P.Slice()
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   name="A_inv_max",
                                   requires_grad=False)
        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   name="G_inv_max",
                                   requires_grad=False)
        self.fused_abs_max1 = P.CusFusedAbsMax1([1000, 1000])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.dampingA = Tensor(np.identity(2048), mstype.float32)
        self.dampingG = Tensor(np.identity(1024), mstype.float32)
        self.add = P.TensorAdd()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
Example #10
    def __init__(self, config):
        super(WideDeepModel, self).__init__()
        self.batch_size = config.batch_size
        parallel_mode = _get_parallel_mode()
        if parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                             ParallelMode.AUTO_PARALLEL):
            self.batch_size = self.batch_size * get_group_size()
        self.field_size = config.field_size
        self.vocab_size = config.vocab_size
        self.emb_dim = config.emb_dim
        self.deep_layer_dims_list = config.deep_layer_dim
        self.deep_layer_act = config.deep_layer_act
        self.init_args = config.init_args
        self.weight_init, self.bias_init = config.weight_bias_init
        self.weight_bias_init = config.weight_bias_init
        self.emb_init = config.emb_init
        self.drop_out = config.dropout_flag
        self.keep_prob = config.keep_prob
        self.deep_input_dims = self.field_size * self.emb_dim
        self.layer_dims = self.deep_layer_dims_list + [1]
        self.all_dim_list = [self.deep_input_dims] + self.layer_dims

        init_acts = [('Wide_w', [self.vocab_size, 1], self.emb_init),
                     ('V_l2', [self.vocab_size, self.emb_dim], self.emb_init),
                     ('Wide_b', [1], self.emb_init)]
        var_map = init_var_dict(self.init_args, init_acts)
        self.wide_w = var_map["Wide_w"]
        self.wide_b = var_map["Wide_b"]
        self.embedding_table = var_map["V_l2"]
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True)

        self.gather_v2 = P.GatherV2().shard(((1, 8), (1, 1)))
        self.gather_v2_1 = P.GatherV2()
        self.mul = P.Mul()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.reshape = P.Reshape()
        self.square = P.Square()
        self.shape = P.Shape()
        self.tile = P.Tile()
        self.concat = P.Concat(axis=1)
        self.cast = P.Cast()
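
A simplified sketch of the forward pass these operators compose in the Wide&Deep reference implementation; the input names id_hldr/wt_hldr, the gather split between the wide and deep parts, and the returned embedding table follow that model zoo code and are assumptions here, and several of the declared operators (square, tile, concat) are omitted:

    def construct(self, id_hldr, wt_hldr):
        """id_hldr: batch ids; wt_hldr: batch weights."""
        mask = self.reshape(wt_hldr, (self.batch_size, self.field_size, 1))
        # wide part: one scalar weight per id, masked and summed per sample
        wide_id_weight = self.gather_v2_1(self.wide_w, id_hldr, 0)
        wx = self.mul(wide_id_weight, mask)
        wide_out = self.reshape(self.reduce_sum(wx, 1) + self.wide_b, (-1, 1))
        # deep part: embeddings, masked, flattened, then the dense stack
        deep_id_embs = self.gather_v2(self.embedding_table, id_hldr, 0)
        vx = self.mul(deep_id_embs, mask)
        deep_in = self.reshape(vx, (-1, self.field_size * self.emb_dim))
        deep_in = self.dense_layer_1(deep_in)
        deep_in = self.dense_layer_2(deep_in)
        deep_in = self.dense_layer_3(deep_in)
        deep_in = self.dense_layer_4(deep_in)
        deep_out = self.dense_layer_5(deep_in)
        out = wide_out + deep_out
        return out, self.embedding_table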
Example #11
 def __init__(self, strategy0, strategy1):
     super().__init__()
     self.fc_nobias = P.MatMul(transpose_b=True).set_strategy(strategy0)
     self.reduce_sum = P.ReduceSum(
         keep_dims=False).set_strategy(strategy1)
     self.mul = P.Mul().set_strategy(strategy=((), ()))
Example #12
def _update_run_op(beta1, beta2, eps, global_step, lr, weight_decay, param, m,
                   v, gradient, decay_flag, optim_filter):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay (Number): Weight decay. Should be equal to or greater than 0.
        global_step (Tensor): Global step.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Specifies whether the parameter update applies weight decay.
        optim_filter (bool): Whether to apply the parameter update.

    Returns:
        Tensor, the updated parameter when `optim_filter` is True, otherwise the original gradient.
    """
    if optim_filter:
        op_mul = P.Mul()
        op_sqrt = P.Sqrt()
        op_rsqrt = P.Rsqrt()
        op_square = P.Square()
        op_cast = P.Cast()
        op_reshape = P.Reshape()
        op_shape = P.Shape()
        op_pow = P.Pow()
        op_norm = layer.Norm()
        op_select = P.Select()
        op_greater = P.Greater()
        op_fill = P.Fill()
        op_dtype = P.DType()

        param_fp32 = op_cast(param, mstype.float32)
        m_fp32 = op_cast(m, mstype.float32)
        v_fp32 = op_cast(v, mstype.float32)
        gradient_fp32 = op_cast(gradient, mstype.float32)

        next_m = op_mul(beta1, m_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta1, gradient_fp32)

        next_v = op_mul(beta2, v_fp32) + op_mul(
            op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))

        next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(
            beta1, op_cast(global_step + num_one, mstype.float32)))
        next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow(
            beta2, op_cast(global_step + num_one, mstype.float32)))
        w_norm = op_norm(param_fp32)
        g_norm = op_norm(gradient_fp32)

        g_norm_hat = op_norm(
            op_mul(next_mm, op_rsqrt(next_vv + eps)) +
            weight_decay * param_fp32)
        zeros = F.zeros_like(w_norm)
        ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
        trust_ratio = op_select(
            op_greater(w_norm, zeros),
            op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones),
            ones)
        tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
        trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
        update = next_mm / (op_sqrt(next_vv) + eps)

        if decay_flag:
            update = update + op_mul(weight_decay, param_fp32)

        update_with_lr = op_mul(op_mul(trust_ratio, lr), update)

        next_param = param_fp32 - op_reshape(update_with_lr,
                                             op_shape(param_fp32))

        next_param = F.depend(next_param, F.assign(param, next_param))
        next_param = F.depend(next_param, F.assign(m, next_m))
        next_param = F.depend(next_param, F.assign(v, next_v))

        return next_param
    return gradient
Example #13
    def __init__(self,
                 params,
                 learning_rate,
                 momentum,
                 matrix_A,
                 matrix_G,
                 A_inv_max,
                 G_inv_max,
                 weight_decay=0.0,
                 loss_scale=1.0,
                 batch_size=32.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay,
                                   loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError(
                "momentum should be at least 0.0, but got momentum {}".format(
                    momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32),
                                  name="momentum")
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[
                    i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [
            1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0
        ]
        mean = _get_mirror_mean()
        degree = _get_device_num()
        self.grad_reducer_Amax = DistributedGradReducerThor(
            self.parameters, 2, mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(
            self.parameters, 5, mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(
            self.parameters, 3, mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(
            self.parameters, 4, mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (Parameter(
                initializer(1, [1], mstype.float32),
                name="matrix_max" + str(i),
                requires_grad=False), )
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)

        self.conv_index = [
            0, 1, 2, 3, 6, 7, 8, 9, 12, 13, 14, 17, 18, 19, 22, 23, 24, 25, 28,
            29, 30, 33, 34, 35, 38, 39, 40, 43, 44, 45, 46, 49, 50, 51, 54, 55,
            56, 59, 60, 61, 64, 65, 66, 69, 70, 71, 74, 75, 76, 77, 80, 81, 82,
            85
        ]
        self.batch_size = batch_size
        self.bn_index = [
            3, 7, 10, 13, 17, 20, 23, 26, 30, 33, 36, 39, 42, 45, 49, 52
        ]
        self.bn_gradient_index = [
            -1, -1, -1, 4, -1, -1, -1, 10, -1, -1, 15, -1, -1, 20, -1, -1, -1,
            26, -1, -1, 31, -1, -1, 36, -1, -1, 41, -1, -1, -1, 47, -1, -1, 52,
            -1, -1, 57, -1, -1, 62, -1, -1, 67, -1, -1, 72, -1, -1, -1, 78, -1,
            -1, 83
        ]
Example #14
 def __init__(self, mul_size):
     super().__init__()
     self.mul_weight = Tensor(np.full(mul_size, 0.6, dtype=np.float32))
     self.mul = P.Mul()
Example #15
 def __init__(self):
     super().__init__()
     self.mul1 = P.Mul()
     self.reduce_mean = P.ReduceMean(keep_dims=False)
     self.reduce_sum = P.ReduceSum(keep_dims=False).add_prim_attr(
         "cross_batch", True)
Example #16
 def __init__(self, alpha=0.2):
     super(LeakyReLU, self).__init__()
     self.greater_equal = P.GreaterEqual()
     self.mul = P.Mul()
     self.alpha = alpha
Example #17
 def __init__(self, strategy1, strategy2, strategy3):
     super().__init__()
     self.mul = P.Mul().shard(strategy1)
     self.reduce_max = P.ReduceMax(keep_dims=False).shard(strategy2)
     self.add = P.TensorAdd().shard(strategy3)
Example #18
 def __init__(self, strategy1, strategy2):
     super().__init__()
     self.matmul = P.MatMul().shard(strategy1)
     self.mul = P.Mul().shard(strategy2)
Example #19
 def __init__(self):
     super().__init__()
     self.add = P.TensorAdd()
     self.sub = P.Sub()
     self.mul = P.Mul()
     self.div = P.RealDiv()
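
The fragment only registers the four arithmetic primitives; a minimal illustrative forward pass chaining them (construct and its inputs are assumptions):

 def construct(self, x, y):
     out = self.add(x, y)
     out = self.sub(out, y)
     out = self.mul(out, x)
     return self.div(out, y)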
Example #20
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None,
                 device_num_each_group=1):
        super(_BatchNorm, self).__init__()
        if num_features < 1:
            raise ValueError("num_features must be at least 1")

        if momentum < 0 or momentum > 1:
            raise ValueError(
                "momentum should be a number in range [0, 1], but got {}".
                format(momentum))

        self.use_batch_statistics = use_batch_statistics
        self.num_features = num_features
        self.eps = eps
        self.moving_mean = Parameter(initializer(moving_mean_init,
                                                 num_features),
                                     name="mean",
                                     requires_grad=False)
        self.moving_variance = Parameter(initializer(moving_var_init,
                                                     num_features),
                                         name="variance",
                                         requires_grad=False)
        self.gamma = Parameter(initializer(gamma_init, num_features),
                               name="gamma",
                               requires_grad=affine)
        self.beta = Parameter(initializer(beta_init, num_features),
                              name="beta",
                              requires_grad=affine)
        self.group = check_int_positive(device_num_each_group)
        self.is_global = False
        if self.group != 1:
            self.rank_id = get_rank()
            self.rank_size = get_group_size()
            self.device_list = [i for i in range(0, self.rank_size)]
            self.rank_list = self.list_group(self.device_list, self.group)
            self.rank_list_idx = len(self.rank_list)
            for i in range(self.rank_list_idx):
                if self.rank_id in self.rank_list[i] and self.group != 1:
                    self.is_global = True
                    management.create_group('group' + str(i),
                                            self.rank_list[i])
                    self.all_reduce = P.AllReduce(
                        P.ReduceOp.SUM,
                        'group' + str(i)).add_prim_attr('fusion', 1)
        self.shape = P.Shape()
        self.reduce_mean = P.ReduceMean(keep_dims=True)
        self.square = P.Square()
        self.sqrt = P.Sqrt()
        self.cast = P.Cast()
        self.dtype = P.DType()
        self.reshape = P.Reshape()
        self.is_ascend = context.get_context("device_target") == "Ascend"
        self.momentum = 1.0 - momentum
        if context.get_context("enable_ge"):
            self.is_ge_backend = True
        else:
            self.is_ge_backend = False

        if self.is_ge_backend or self.is_ascend:
            self.bn_train = P.BatchNorm(is_training=True, epsilon=self.eps)
        else:
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
        self.bn_infer = P.BatchNorm(is_training=False, epsilon=self.eps)

        data_parallel_strategy = ((1, ), (1, ))
        data_parallel_strategy_one = ((1, ), ())
        self.sub_mean = P.Sub().set_strategy(data_parallel_strategy)
        self.sub_var = P.Sub().set_strategy(data_parallel_strategy)
        self.mul_mean = P.Mul().set_strategy(data_parallel_strategy_one)
        self.mul_var = P.Mul().set_strategy(data_parallel_strategy_one)
        self.assign_sub_mean = P.AssignSub().set_strategy(
            data_parallel_strategy)
        self.assign_sub_var = P.AssignSub().set_strategy(
            data_parallel_strategy)
Example #21
 def __init__(self, strategy1, strategy2, strategy3):
     super().__init__()
     self.mul = P.Mul().shard(strategy1)
     self.mul2 = P.Mul().shard(strategy2)
     self.cast = P.Cast().shard(strategy3)
     self.cast2 = P.Cast().shard(strategy3)
Example #22
    def __init__(self,
                 from_tensor_width,
                 to_tensor_width,
                 from_seq_length,
                 to_seq_length,
                 num_attention_heads=1,
                 size_per_head=512,
                 query_act=None,
                 key_act=None,
                 value_act=None,
                 has_attention_mask=False,
                 attention_probs_dropout_prob=0.0,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 do_return_2d_tensor=False,
                 use_relative_positions=False,
                 compute_type=mstype.float32):

        super(BertAttention, self).__init__()
        self.from_seq_length = from_seq_length
        self.to_seq_length = to_seq_length
        self.num_attention_heads = num_attention_heads
        self.size_per_head = size_per_head
        self.has_attention_mask = has_attention_mask
        self.use_relative_positions = use_relative_positions

        self.scores_mul = 1.0 / math.sqrt(float(self.size_per_head))
        self.reshape = P.Reshape()
        self.shape_from_2d = (-1, from_tensor_width)
        self.shape_to_2d = (-1, to_tensor_width)
        weight = TruncatedNormal(initializer_range)
        units = num_attention_heads * size_per_head
        self.query_layer = nn.Dense(from_tensor_width,
                                    units,
                                    activation=query_act,
                                    weight_init=weight).to_float(compute_type)
        self.key_layer = nn.Dense(to_tensor_width,
                                  units,
                                  activation=key_act,
                                  weight_init=weight).to_float(compute_type)
        self.value_layer = nn.Dense(to_tensor_width,
                                    units,
                                    activation=value_act,
                                    weight_init=weight).to_float(compute_type)

        self.shape_from = (-1, from_seq_length, num_attention_heads,
                           size_per_head)
        self.shape_to = (-1, to_seq_length, num_attention_heads, size_per_head)

        self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
        self.multiply = P.Mul()
        self.transpose = P.Transpose()
        self.trans_shape = (0, 2, 1, 3)
        self.trans_shape_relative = (2, 0, 1, 3)
        self.trans_shape_position = (1, 2, 0, 3)
        self.multiply_data = -10000.0
        self.matmul = P.BatchMatMul()

        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1 - attention_probs_dropout_prob)

        if self.has_attention_mask:
            self.expand_dims = P.ExpandDims()
            self.sub = P.Sub()
            self.add = P.TensorAdd()
            self.cast = P.Cast()
            self.get_dtype = P.DType()
        if do_return_2d_tensor:
            self.shape_return = (-1, num_attention_heads * size_per_head)
        else:
            self.shape_return = (-1, from_seq_length,
                                 num_attention_heads * size_per_head)

        self.cast_compute_type = SaturateCast(dst_type=compute_type)
        if self.use_relative_positions:
            self._generate_relative_positions_embeddings = \
                RelaPosEmbeddingsGenerator(length=to_seq_length,
                                           depth=size_per_head,
                                           max_relative_position=16,
                                           initializer_range=initializer_range,
                                           use_one_hot_embeddings=use_one_hot_embeddings)
Example #23
 def __init__(self, strategy):
     super().__init__()
     self.reshape = P.Reshape()
     self.mul = P.Mul().set_strategy(strategy)
     self.relu = P.ReLU()
Example #24
 def __init__(self):
     super().__init__()
     self.norm1 = P.L2Normalize()
     self.norm2 = P.L2Normalize()
     self.mul1 = P.Mul()
     self.mul2 = P.Mul()
Example #25
 def __init__(self):
     super(Net, self).__init__()
     self.mul = P.Mul()
     self.relu = P.ReLU()
     self.wd = Parameter(Tensor(np.ones([8, 8, 8, 8]).astype(np.float32)), name="wide")
     self.wt = Parameter(Tensor(np.ones([8, 8, 8, 8]).astype(np.float32)), name="l")
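
One possible forward pass composing the operators and parameters declared above; this is a guess for illustration, the original fragment does not show construct:

 def construct(self, x):
     out = self.mul(x, self.wd)
     out = self.mul(out, self.wt)
     return self.relu(out)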
Example #26
 def __init__(self, strategy1, strategy2, strategy3):
     super().__init__()
     self.mul1 = P.Mul().shard(strategy1)
     self.arg_max_with_value = P.ArgMaxWithValue(keep_dims=False,
                                                 axis=-1).shard(strategy2)
     self.mul2 = P.Mul().shard(strategy3)
Example #27
 def __init__(self):
     super(MultiOutNet, self).__init__()
     self.add = P.Add()
     self.mul = P.Mul()
     self.sum = P.ReduceSum()
Example #28
 def __init__(self, strategy1, strategy2, strategy3):
     super().__init__()
     self.mul1 = P.Mul().shard(strategy1)
     self.arg_min_with_value = P.ArgMinWithValue(keep_dims=True,
                                                 axis=-1).shard(strategy2)
     self.relu = P.ReLU().shard(strategy3)
Example #29
 def __init__(self):
     super().__init__()
     self.mul1 = P.Mul()
     self.prelu = P.PReLU()
Example #30
 def __init__(self, alpha=0.2):
     super(LeakyReLU, self).__init__()
     validator.check_value_type('alpha', alpha, [float, int], self.cls_name)
     self.greater_equal = P.GreaterEqual()
     self.mul = P.Mul()
     self.alpha = alpha