Example #1
    def __init__(self, config):
        super(WideDeepModel, self).__init__()
        self.batch_size = config.batch_size
        host_device_mix = bool(config.host_device_mix)
        parameter_server = bool(config.parameter_server)
        parallel_mode = _get_parallel_mode()
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                             ParallelMode.AUTO_PARALLEL)
        if is_auto_parallel:
            self.batch_size = self.batch_size * get_group_size()
        self.field_size = config.field_size
        self.vocab_size = config.vocab_size
        self.emb_dim = config.emb_dim
        self.deep_layer_dims_list = config.deep_layer_dim
        self.deep_layer_act = config.deep_layer_act
        self.init_args = config.init_args
        self.weight_init, self.bias_init = config.weight_bias_init
        self.weight_bias_init = config.weight_bias_init
        self.emb_init = config.emb_init
        self.drop_out = config.dropout_flag
        self.keep_prob = config.keep_prob
        self.deep_input_dims = self.field_size * self.emb_dim
        self.layer_dims = self.deep_layer_dims_list + [1]
        self.all_dim_list = [self.deep_input_dims] + self.layer_dims

        init_acts = [('Wide_w', [self.vocab_size, 1], self.emb_init),
                     ('V_l2', [self.vocab_size, self.emb_dim], self.emb_init),
                     ('Wide_b', [1], self.emb_init)]
        var_map = init_var_dict(self.init_args, init_acts)
        self.wide_w = var_map["Wide_w"]
        self.wide_b = var_map["Wide_b"]
        self.embedding_table = var_map["V_l2"]
        if parameter_server:
            self.wide_w.set_param_ps()
            self.embedding_table.set_param_ps()
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        use_activation=False,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.wide_mul = P.Mul()
        self.deep_mul = P.Mul()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.reshape = P.Reshape()
        self.deep_reshape = P.Reshape()
        self.square = P.Square()
        self.shape = P.Shape()
        self.tile = P.Tile()
        self.concat = P.Concat(axis=1)
        self.cast = P.Cast()
        if is_auto_parallel and host_device_mix:
            self.dense_layer_1.dropout.dropout_do_mask.set_strategy(
                ((1, get_group_size()), ))
            self.dense_layer_1.matmul.set_strategy(
                ((1, get_group_size()), (get_group_size(), 1)))
            self.deep_embeddinglookup = nn.EmbeddingLookup()
            self.deep_embeddinglookup.embeddinglookup.set_strategy(
                ((1, get_group_size()), (1, 1)))
            self.wide_embeddinglookup = nn.EmbeddingLookup()
            self.wide_embeddinglookup.embeddinglookup.set_strategy(
                ((get_group_size(), 1), (1, 1)))
            self.deep_mul.set_strategy(((1, 1, get_group_size()), (1, 1, 1)))
            self.deep_reshape.add_prim_attr("skip_redistribution", True)
            self.reduce_sum.add_prim_attr("cross_batch", True)
        elif parameter_server:
            self.deep_embeddinglookup = nn.EmbeddingLookup()
            self.wide_embeddinglookup = nn.EmbeddingLookup()
        else:
            self.deep_embeddinglookup = nn.EmbeddingLookup(target='DEVICE')
            self.wide_embeddinglookup = nn.EmbeddingLookup(target='DEVICE')
Example #2
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 batch_size=32,
                 has_bias=True,
                 activation=None):
        super(Dense_Thor_GPU, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        self.thor = True
        if isinstance(weight_init, Tensor):
            if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \
                    weight_init.shape[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(
            initializer(weight_init, [out_channels, in_channels]))

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.dim() != 1 or bias_init.shape[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]))

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None
        split_dim = 128
        matrix_A_shape, matrix_G_shape = caculate_matmul_shape(
            self.in_channels, self.out_channels, split_dim)
        self.matrix_A_inv = Parameter(Tensor(
            np.zeros(matrix_A_shape).astype(np.float32)),
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros(matrix_G_shape).astype(np.float32)),
                                      requires_grad=False)
        self.broadcast_to = P.BroadcastTo(matrix_A_shape)
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  requires_grad=False)
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.mul = P.Mul()
        self.cube_matmul = P.MatMul(transpose_a=True)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.batch_size = Tensor(batch_size, mstype.float16)
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.damping = Parameter(Tensor(damping), requires_grad=False)
        self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
        self.dampingG = Tensor(np.identity(out_channels), mstype.float32)
        self.cast = P.Cast()
        self.gather = P.GatherV2()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.add = P.TensorAdd()
        self.sqrt = P.Sqrt()
        self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
        self.vector_matmul = P.BatchMatMul(transpose_a=True)
Example #3
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 batch_size=32,
                 has_bias=True,
                 activation=None):
        super(Dense_Thor, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        self.thor = True
        self.batch_size = batch_size
        if isinstance(weight_init, Tensor):
            if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \
                    weight_init.shape[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(
            initializer(weight_init, [out_channels, in_channels]))

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.dim() != 1 or bias_init.shape[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]))

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None

        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([128, 128, 16, 16]).astype(np.float16)),
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([63, 63, 16, 16]).astype(np.float16)),
                                      requires_grad=False)
        self.fake_G = Tensor(np.zeros([63, 63, 16, 16]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.vector_matmul = P.CusBatchMatMul()
        self.pad = P.Pad(((0, 23), (0, 23)))
        self.pad1 = P.Pad(((0, 7), (0, 7)))
        self.slice = P.Slice()
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   requires_grad=False)
        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   requires_grad=False)
        self.fused_abs_max1 = P.CusFusedAbsMax1([1001, 1001])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.dampingA = Tensor(np.identity(2048), mstype.float32)
        self.dampingG = Tensor(np.identity(1024), mstype.float32)
        self.add = P.TensorAdd()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
Example #4
    def construct(self, grid, prediction, pred_xy, pred_wh, y_true, gt_box,
                  input_shape):
        # prediction: raw output from the YOLO head
        # pred_xy: (sigmoid(xy) + grid) / grid_size
        # pred_wh: (exp(wh) * anchors) / input_shape
        # y_true: ground truth, already normalized
        # gt_box: [batch, maxboxes, xyhw], already normalized

        object_mask = y_true[:, :, :, :, 4:5]
        class_probs = y_true[:, :, :, :, 5:]
        true_boxes = y_true[:, :, :, :, :4]

        grid_shape = P.Shape()(prediction)[1:3]
        grid_shape = P.Cast()(F.tuple_to_array(grid_shape[::-1]), ms.float32)

        pred_boxes = self.concat((pred_xy, pred_wh))
        true_xy = y_true[:, :, :, :, :2] * grid_shape - grid
        true_wh = y_true[:, :, :, :, 2:4]
        true_wh = P.Select()(P.Equal()(true_wh, 0.0),
                             P.Fill()(P.DType()(true_wh), P.Shape()(true_wh), 1.0),
                             true_wh)
        true_wh = P.Log()(true_wh / self.anchors * input_shape)
        # box_loss_scale = 2 - w*h: larger boxes get a smaller weight, since small objects need more precise regression
        box_loss_scale = 2 - y_true[:, :, :, :, 2:3] * y_true[:, :, :, :, 3:4]

        gt_shape = P.Shape()(gt_box)
        gt_box = P.Reshape()(gt_box,
                             (gt_shape[0], 1, 1, 1, gt_shape[1], gt_shape[2]))

        # add one more dimension for broadcast
        iou = self.iou(P.ExpandDims()(pred_boxes, -2), gt_box)
        # gt_box is x, y, h, w after normalization
        # [batch, grid[0], grid[1], num_anchor, num_gt]
        best_iou = self.reduce_max(iou, -1)
        # [batch, grid[0], grid[1], num_anchor]

        # ignore_mask: positions whose best IOU is below the threshold
        ignore_mask = best_iou < self.ignore_threshold
        ignore_mask = P.Cast()(ignore_mask, ms.float32)
        ignore_mask = P.ExpandDims()(ignore_mask, -1)
        # Backpropagating through ignore_mask produces many MaximumGrad and MinimumGrad
        # ops and is time-consuming, so we stop its gradient.
        ignore_mask = F.stop_gradient(ignore_mask)

        confidence_loss = self.confidenceLoss(object_mask,
                                              prediction[:, :, :, :, 4:5],
                                              ignore_mask)
        class_loss = self.classLoss(object_mask, prediction[:, :, :, :, 5:],
                                    class_probs)

        object_mask_me = P.Reshape()(object_mask, (-1, 1))  # [8, 72, 72, 3, 1]
        box_loss_scale_me = P.Reshape()(box_loss_scale, (-1, 1))
        pred_boxes_me = xywh2x1y1x2y2(pred_boxes)
        pred_boxes_me = P.Reshape()(pred_boxes_me, (-1, 4))
        true_boxes_me = xywh2x1y1x2y2(true_boxes)
        true_boxes_me = P.Reshape()(true_boxes_me, (-1, 4))
        ciou = self.giou(pred_boxes_me, true_boxes_me)
        ciou_loss = object_mask_me * box_loss_scale_me * (1 - ciou)
        ciou_loss_me = self.reduce_sum(ciou_loss, ())
        loss = ciou_loss_me + confidence_loss + class_loss
        batch_size = P.Shape()(prediction)[0]
        return loss / batch_size
Example #5
def _update_run_op(beta1, beta2, eps, lr, weight_decay_tensor, global_step,
                   param, m, v, gradient, decay_flag):
    """
    Update parameters.

    Args:
        beta1 (Tensor): The exponential decay rate for the 1st moment estimates. Should be in range (0.0, 1.0).
        beta2 (Tensor): The exponential decay rate for the 2nd moment estimates. Should be in range (0.0, 1.0).
        eps (Tensor): Term added to the denominator to improve numerical stability. Should be greater than 0.
        lr (Tensor): Learning rate.
        weight_decay_tensor (Tensor): Weight decay. Should be equal to or greater than 0.
        global_step (Tensor): Global step.
        param (Tensor): Parameters.
        m (Tensor): m value of parameters.
        v (Tensor): v value of parameters.
        gradient (Tensor): Gradient of parameters.
        decay_flag (bool): Specifies whether the parameter is updated with weight decay.

    Returns:
        Tensor, the new value of v after updating.
    """
    op_mul = P.Mul()
    op_sqrt = P.Sqrt()
    op_rsqrt = P.Rsqrt()
    op_square = P.Square()
    op_cast = P.Cast()
    op_reshape = P.Reshape()
    op_shape = P.Shape()
    op_pow = P.Pow()
    op_norm = layer.Norm()
    op_select = P.Select()
    op_greater = P.Greater()
    op_fill = P.Fill()
    op_dtype = P.DType()

    param_fp32 = op_cast(param, mstype.float32)
    m_fp32 = op_cast(m, mstype.float32)
    v_fp32 = op_cast(v, mstype.float32)
    gradient_fp32 = op_cast(gradient, mstype.float32)

    next_m = op_mul(beta1, m_fp32) + op_mul(
        op_cast(num_one, mstype.float32) - beta1, gradient_fp32)

    next_v = op_mul(beta2, v_fp32) + op_mul(
        op_cast(num_one, mstype.float32) - beta2, op_square(gradient_fp32))

    next_mm = next_m / (op_cast(num_one, mstype.float32) - op_pow(
        beta1, op_cast(global_step + num_one, mstype.float32)))
    next_vv = next_v / (op_cast(num_one, mstype.float32) - op_pow(
        beta2, op_cast(global_step + num_one, mstype.float32)))
    w_norm = op_norm(param_fp32)
    g_norm = op_norm(gradient_fp32)

    g_norm_hat = op_norm(
        op_mul(next_mm, op_rsqrt(next_vv + eps)) +
        weight_decay_tensor * param_fp32)
    zeros = F.zeros_like(w_norm)
    ones = op_fill(op_dtype(w_norm), op_shape(w_norm), 1.0)
    trust_ratio = op_select(
        op_greater(w_norm, zeros),
        op_select(op_greater(g_norm, zeros), w_norm / g_norm_hat, ones), ones)
    tens = op_fill(op_dtype(trust_ratio), op_shape(trust_ratio), 10.0)
    trust_ratio = C.clip_by_value(trust_ratio, zeros, tens)
    update = next_mm / (op_sqrt(next_vv) + eps)

    if decay_flag:
        update = update + op_mul(weight_decay_tensor, param_fp32)

    update_with_lr = op_mul(op_mul(trust_ratio, lr), update)

    next_param = param_fp32 - op_reshape(update_with_lr, op_shape(param_fp32))

    next_v = F.depend(next_v, F.assign(param, next_param))
    next_v = F.depend(next_v, F.assign(m, next_m))
    next_v = F.depend(next_v, F.assign(v, next_v))

    return next_v
Example #6
    def __init__(self, num_features):
        super().__init__()
        self.reshape = P.Reshape()
        self.shape = P.Shape()
        self.bn2d = nn.BatchNorm2d(num_features, data_format="NCHW")
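    # A possible construct body, sketched as an assumption (the excerpt above
    # shows only __init__): expand the input to a 4-D NCHW layout so
    # nn.BatchNorm2d can run, then restore the original shape.
    def construct(self, x):
        x_shape = self.shape(x)
        x = self.reshape(x, x_shape + (1, 1))
        x = self.bn2d(x)
        return self.reshape(x, x_shape)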
Example #7
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 has_bias=False,
                 weight_init='normal',
                 bias_init='zeros'):
        kernel_size = twice(kernel_size)
        stride = twice(stride)
        dilation = twice(dilation)
        Validator.check_value_type('padding', padding, (int, tuple), self.cls_name)
        if isinstance(padding, tuple):
            Validator.check_equal_int(len(padding), 4, 'padding size', self.cls_name)
        # out_channels and in_channels are swapped:
        # Conv2DBackpropInput's out_channel refers to Conv2D's out_channel,
        # so Conv2dTranspose's out_channel maps to Conv2DBackpropInput's in_channel.
        super(Conv2dTranspose, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            has_bias,
            weight_init,
            bias_init,
            transposed=True)

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.shape = P.Shape()
        if pad_mode not in ('valid', 'same', 'pad'):
            raise ValueError('Attr \'pad_mode\' of \'Conv2dTranspose\' Op passed '
                             + str(pad_mode) + ', should be one of values in \'valid\', \'same\', \'pad\'.')
        self.is_valid = self.pad_mode == 'valid'
        self.is_same = self.pad_mode == 'same'
        self.is_pad = self.pad_mode == 'pad'
        if Validator.check_bool(has_bias):
            self.bias = Parameter(initializer(bias_init, [out_channels]), name='bias')

        # Conv2DBackpropInput's out_channel refers to Conv2D's out_channel, so pass in_channels here.
        self.conv2d_transpose = P.Conv2DBackpropInput(out_channel=in_channels,
                                                      kernel_size=kernel_size,
                                                      mode=1,
                                                      pad_mode=pad_mode,
                                                      pad=padding,
                                                      stride=stride,
                                                      dilation=dilation,
                                                      group=group)
        self.bias_add = P.BiasAdd()
        if isinstance(self.padding, int):
            self.padding_top, self.padding_bottom, self.padding_left, self.padding_right = (self.padding,) * 4
        else:
            self.padding_top, self.padding_bottom, self.padding_left, self.padding_right = self.padding
Example #8
     'block': P.Transpose(),
     'desc_const': [(0, 2, 1)],
     'desc_inputs': [[1, 2, 3]],
     'desc_bprop': [[1, 3, 2]]}),
 ('Transpose_dim4', {
     'block': P.Transpose(),
     'desc_const': [(0, 1, 2, 3)],
     'desc_inputs': [[1, 2, 3, 4]],
     'desc_bprop': [[1, 2, 4, 3]]}),
 ('AddN', {
     'block': NetForTupleInput(P.AddN()),
     'desc_inputs': [[2, 3, 3, 5], [2, 3, 3, 5]],
     'desc_bprop': [[2, 3, 3, 5]],
     'skip': ['backward']}),
 ('Shape', {
     'block': P.Shape(),
     'desc_inputs': [[3, 3, 2, 2]],
     'skip': ['backward']}),
 ('Reshape', {
     'block': P.Reshape(),
     'desc_const': [(64,)],
     'desc_inputs': [[64, 1]],
     'desc_bprop': [[64]]}),
 ('Cast', {
     'block': P.Cast(),
     'desc_const': [mstype.int32],
     'desc_inputs': [[2, 3, 4, 5]],
     'desc_bprop': [Tensor(np.ones((2, 3, 3, 5)).astype(np.int32))]}),
 ('ExpandDims', {
     'block': P.ExpandDims(),
     'desc_const': [0],
Example #9
def get_axis(x):
    shape_op = P.Shape()
    shape = shape_op(x)
    length = F.tuple_len(shape)
    perm = F.make_range(0, length)
    return perm
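A brief usage sketch, added as an assumption rather than part of the scraped source: in MindSpore loss cells, get_axis is typically called inside construct to build the axis tuple for a reduction over every dimension of a tensor, here with P.ReduceMean (assuming mindspore.nn as nn and the operations module as P, as in the surrounding examples).

class ReduceMeanAll(nn.Cell):
    """Illustrative cell: average a tensor over all of its axes using get_axis."""
    def __init__(self):
        super(ReduceMeanAll, self).__init__()
        self.reduce_mean = P.ReduceMean()

    def construct(self, x):
        # get_axis(x) returns the tuple (0, 1, ..., rank(x) - 1), so the
        # reduction runs over every axis of x.
        return self.reduce_mean(x, get_axis(x))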
Example #10
    def __init__(self,
                 params,
                 learning_rate,
                 momentum,
                 matrix_A,
                 matrix_G,
                 A_inv_max,
                 G_inv_max,
                 weight_decay=0.0,
                 loss_scale=1.0,
                 decay_filter=lambda x: x.name not in []):
        super(THOR, self).__init__(learning_rate, params, weight_decay,
                                   loss_scale)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError(
                "momentum should be at least 0.0, but got momentum {}".format(
                    momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32),
                                  name="momentum")
        self.params = self.parameters
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.cube_matmul_left = P.CusMatMulCubeFraczLeftCast()
        self.cube_matmul_left_fc = P.CusMatMulCubeDenseLeft()
        self.cube_matmul_right_fc = P.CusMatMulCubeDenseRight()
        self.cube_matmul_right_mul = P.CusMatMulCubeFraczRightMul()
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.weight_idx = []
        for i in range(len(self.params)):
            if "conv" in self.params[i].name or "end_point" in self.params[
                    i].name:
                self.weight_idx.append(i)
        self.weight_idx.append(len(self.params))
        self.feature_map = [
            1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0
        ]
        mean = _get_gradients_mean()
        degree = _get_device_num()
        parameter_length = len(self.feature_map)
        self.grad_reducer_Amax = DistributedGradReducerThor(
            parameter_length, ((27, ), 2), mean, degree)
        self.grad_reducer_Gmax = DistributedGradReducerThor(
            parameter_length, ((27, ), 4), mean, degree)
        self.grad_reducer_A = DistributedGradReducerThor(
            parameter_length, ((27, ), 6), mean, degree)
        self.grad_reducer_G = DistributedGradReducerThor(
            parameter_length, ((27, ), 8), mean, degree)
        self.matrix_A_inv = ()
        self.matrix_G_inv = ()
        self.matrix_max_inv = ()

        for i in range(54):
            self.matrix_max_inv = self.matrix_max_inv + (Parameter(
                initializer(1, [1], mstype.float32),
                name="matrix_max" + str(i),
                requires_grad=False), )
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
        self.assign = P.Assign()
        self.cast = P.Cast()
        self.thor = True
        self.weight_decay = weight_decay * loss_scale
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
Example #11
    def __init__(self,
                 params,
                 learning_rate,
                 momentum,
                 matrix_A,
                 matrix_G,
                 A_inv_max,
                 G_inv_max,
                 weight_decay=0.0,
                 loss_scale=1.0,
                 use_nesterov=False,
                 decay_filter=lambda x: x.name not in []):
        super(THOR_GPU, self).__init__(learning_rate, params, weight_decay,
                                       loss_scale)
        Validator.check_value_type("momentum", momentum, [float],
                                   self.cls_name)
        if isinstance(momentum, float) and momentum < 0.0:
            raise ValueError(
                "momentum should be at least 0.0, but got momentum {}".format(
                    momentum))
        self.momentum = Parameter(Tensor(momentum, mstype.float32),
                                  name="momentum")
        self.params = self.parameters
        self.use_nesterov = Validator.check_bool(use_nesterov)
        self.moments = self.params.clone(prefix="moments", init='zeros')
        self.hyper_map = C.HyperMap()
        self.opt = P.ApplyMomentum(use_nesterov=self.use_nesterov)

        self.feature_map = [
            1.0 / 12544, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136, 1.0 / 3136,
            1.0 / 3136, 1.0 / 3136, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 784,
            1.0 / 784, 1.0 / 784, 1.0 / 784, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196,
            1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 196, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49, 1.0 / 49,
            1.0 / 49, 1.0
        ]
        self.feature_map_new = [x**0.5 for x in self.feature_map]
        self.transpose = P.Transpose()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.matmul = P.MatMul()
        self.matrix_A = ParameterTuple(matrix_A)
        self.matrix_G = ParameterTuple(matrix_G)
        self.A_inv_max = ParameterTuple(A_inv_max)
        self.G_inv_max = ParameterTuple(G_inv_max)
        self.assign = P.Assign()
        self.mul = P.Mul()

        mean = _get_gradients_mean()
        degree = _get_device_num()

        parameter_length = len(self.feature_map)
        self.grad_reducer_thorA = DistributedGradReducerThor(
            parameter_length, ((parameter_length, ), 0), mean, degree)
        self.grad_reducer_thorG = DistributedGradReducerThor(
            parameter_length, ((parameter_length, ), 0), mean, degree)
        self.weight_decay = weight_decay
        self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
        self.update_gradient = P.UpdateThorGradient(split_dim=128)
Example #12
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common import dtype as mstype
from mindspore._checkparam import Validator as validator
from mindspore._checkparam import Rel
from mindspore.nn import Optimizer
from mindspore.nn import TrainOneStepCell, WithLossCell
from mindspore.nn.optim import Momentum
from mindspore.train import Model
from ....dataset_mock import MindData

context.set_context(mode=context.GRAPH_MODE, enable_sparse=True)

reduce_sum = P.ReduceSum()
unsorted_segment_sum = P.UnsortedSegmentSum()
transpose = P.Transpose()
shape_op = P.Shape()
reshape = P.Reshape()
size_op = P.Size()
invert_permutation = P.InvertPermutation()
logical_and = P.LogicalAnd()


def get_axis(x):
    shape = shape_op(x)
    length = F.tuple_len(shape)
    perm = F.make_range(0, length)
    return perm


class MSELoss(nn.Cell):
    def __init__(self):
Example #13
def _run_opt_with_sparse(opt, sparse_opt, push, pull, use_locking, use_nesterov, target, beta1_power,
                         beta2_power, beta1, beta2, eps, lr, gradient, param, m, v, ps_parameter, cache_enable):
    """Apply sparse adam optimizer to the weight parameter when the gradient is sparse."""
    success = True
    indices = gradient.indices
    values = gradient.values
    if ps_parameter and not cache_enable:
        op_shape = P.Shape()
        shapes = (op_shape(param), op_shape(m), op_shape(v),
                  op_shape(beta1_power), op_shape(beta2_power), op_shape(lr), op_shape(beta1),
                  op_shape(beta2), op_shape(eps), op_shape(values), op_shape(indices))
        success = F.depend(success, pull(push((beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices), shapes), param))
        return success

    if not target:
        success = F.depend(success, sparse_opt(param, m, v, beta1_power, beta2_power, lr, beta1, beta2,
                                               eps, values, indices))
    else:
        op_mul = P.Mul()
        op_square = P.Square()
        op_sqrt = P.Sqrt()
        scatter_add = P.ScatterAdd(use_locking)

        assign_m = F.assign(m, op_mul(beta1, m))
        assign_v = F.assign(v, op_mul(beta2, v))

        grad_indices = gradient.indices
        grad_value = gradient.values

        next_m = scatter_add(m,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))

        next_v = scatter_add(v,
                             grad_indices,
                             op_mul(F.tuple_to_array((1.0,)) - beta2, op_square(grad_value)))

        if use_nesterov:
            m_temp = next_m * _scaler_ten
            assign_m_nesterov = F.assign(m, op_mul(beta1, next_m))
            div_value = scatter_add(m,
                                    op_mul(grad_indices, _scaler_one),
                                    op_mul(F.tuple_to_array((1.0,)) - beta1, grad_value))
            param_update = div_value / (op_sqrt(next_v) + eps)

            m_recover = F.assign(m, m_temp / _scaler_ten)

            F.control_depend(m_temp, assign_m_nesterov)
            F.control_depend(assign_m_nesterov, div_value)
            F.control_depend(param_update, m_recover)
        else:
            param_update = next_m / (op_sqrt(next_v) + eps)

        lr_t = lr * op_sqrt(1 - beta2_power) / (1 - beta1_power)

        next_param = param - lr_t * param_update

        F.control_depend(assign_m, next_m)
        F.control_depend(assign_v, next_v)

        success = F.depend(success, F.assign(param, next_param))
        success = F.depend(success, F.assign(m, next_m))
        success = F.depend(success, F.assign(v, next_v))

    return success
Example #14
 def __init__(self,
              params,
              learning_rate,
              momentum,
              matrix_A,
              matrix_G,
              A_inv_max,
              G_inv_max,
              weight_decay=0.0,
              loss_scale=1.0,
              num_hidden_layers=24,
              batch_size=12,
              damping=0.03,
              frequency=10,
              decay_filter=lambda x: 'layernorm' not in x.name.lower() and
              'bias' not in x.name.lower()):
     super(THOR, self).__init__(learning_rate, params, weight_decay,
                                loss_scale)
     if isinstance(momentum, float) and momentum < 0.0:
         raise ValueError(
             "momentum should be at least 0.0, but got momentum {}".format(
                 momentum))
     self.momentum = Parameter(Tensor(momentum, mstype.float32),
                               name="momentum")
     self.params = self.parameters
     self.moments = self.params.clone(prefix="moments", init='zeros')
     self.hyper_map = C.HyperMap()
     self.opt = P.ApplyMomentum()
     self.matrix_A = ParameterTuple(matrix_A)
     self.matrix_G = ParameterTuple(matrix_G)
     self.A_inv_max = ParameterTuple(A_inv_max)
     self.G_inv_max = ParameterTuple(G_inv_max)
     self.matmul = P.MatMul()
     self.transpose = P.Transpose()
     self.shape = P.Shape()
     self.reshape = P.Reshape()
     self.mul = P.Mul()
     self.gather = P.GatherV2()
     self.matrix_A_inv = ()
     self.matrix_G_inv = ()
     self.matrix_max_inv = ()
     self.num_hidden_layers = num_hidden_layers
     fc_layer_num = num_hidden_layers * 6 + 5
     for i in range(fc_layer_num):
         self.matrix_max_inv = self.matrix_max_inv + (Parameter(
             initializer(1, [1], mstype.float32),
             name="matrix_max" + str(i),
             requires_grad=False), )
     self.log = P.Log()
     self.exp = P.Exp()
     self.sqrt = P.Sqrt()
     self.matrix_max_inv = ParameterTuple(self.matrix_max_inv)
     self.assign = P.Assign()
     self.cast = P.Cast()
     self.thor = True
     self.weight_decay = weight_decay * loss_scale
     self.decay_flags = tuple(decay_filter(x) for x in self.parameters)
     self.expand = P.ExpandDims()
     self.square = P.Square()
     self.inv = P.Inv()
     self.batch_size = batch_size
     self.damping = damping
     self.freq = Tensor(frequency, mstype.int32)
     self.one = Tensor(1, mstype.int32)
     self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                               name="cov_step",
                               requires_grad=False)
     mean = _get_mirror_mean()
     degree = _get_device_num()
     self.grad_reducer_g = DistributedGradReducerThor1(
         self.parameters, 3, mean, degree)
Example #15
    def __init__(self, config):
        super(AutoDisModel, self).__init__()

        self.batch_size = config.batch_size
        self.field_size = config.data_field_size
        self.vocab_size = config.data_vocab_size
        self.emb_dim = config.data_emb_dim
        self.deep_layer_dims_list, self.deep_layer_act = config.deep_layer_args
        self.init_args = config.init_args
        self.weight_bias_init = config.weight_bias_init
        self.keep_prob = config.keep_prob
        self.hash_size = config.hash_size
        self.split_index = config.split_index
        self.temperature = config.temperature
        init_acts = [('W_l2', [self.vocab_size, 1], 'normal', ms_type),
                     ('V_l2', [self.vocab_size,
                               self.emb_dim], 'normal', ms_type),
                     ('b', [1], 'normal', ms_type),
                     ('logits', [self.split_index,
                                 self.hash_size], 'random', ms_type),
                     ('autodis_embedding',
                      [self.split_index, self.hash_size,
                       self.emb_dim], 'random', ms_type_16)]
        var_map = init_var_dict(self.init_args, init_acts)
        self.fm_w = var_map["W_l2"]
        self.fm_b = var_map["b"]
        self.embedding_table = var_map["V_l2"]
        self.logits = var_map["logits"]
        self.autodis_embedding = var_map["autodis_embedding"]
        # Deep Layers
        self.deep_input_dims = self.field_size * self.emb_dim + 1
        self.all_dim_list = [self.deep_input_dims
                             ] + self.deep_layer_dims_list + [1]
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act, self.keep_prob)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act, self.keep_prob)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act, self.keep_prob)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act, self.keep_prob)
        # FM, linear Layers
        self.Gatherv2 = P.GatherV2()
        self.Mul = P.Mul()
        self.ReduceSum = P.ReduceSum(keep_dims=False)
        self.Reshape = P.Reshape()
        self.Square = P.Square()
        self.Shape = P.Shape()
        self.Tile = P.Tile()
        self.Concat = P.Concat(axis=1)
        self.Cast = P.Cast()

        # AutoDis
        self.Slice = P.Slice()
        self.BatchMatMul = P.BatchMatMul()
        self.ExpandDims = P.ExpandDims()
        self.Transpose = P.Transpose()
        self.SoftMax = P.Softmax()
Example #16
def _IgammacContinuedFraction(ax, x, a, enabled):
    """Helper function for computing Igammac using a continued fraction."""

    abs_x = P.Abs()
    logicaland = P.LogicalAnd()
    greater = P.Greater()
    less = P.Less()
    notequal = P.NotEqual()
    fill = P.Fill()
    shape = P.Shape()
    dtype = P.DType()
    select = P.Select()

    # If more data types are supported, this epsilon needs to be chosen per data type.
    epsilon = eps_fp32

    def cond(vals):
        enabled = vals[0]
        c = vals[5]
        return logicaland(less(c, 2000), enabled)

    def body(vals):
        enabled = vals[0]
        ans = vals[1]
        t = vals[2]
        y = vals[3]
        z = vals[4]
        c = vals[5]
        pkm1 = vals[6]
        qkm1 = vals[7]
        pkm2 = vals[8]
        qkm2 = vals[9]

        dpkm2_da = vals[10]
        dqkm2_da = vals[11]
        dpkm1_da = vals[12]
        dqkm1_da = vals[13]
        dans_da = vals[14]

        c = c + 1
        y = y + 1
        z = z + 2

        yc = y * c
        pk = pkm1 * z - pkm2 * yc
        qk = qkm1 * z - qkm2 * yc
        qk_is_nonzero = notequal(qk, 0)
        r = pk / qk

        t = select(qk_is_nonzero, abs_x((ans - r) / r),
                   fill(dtype(t), shape(t), 1))
        ans = select(qk_is_nonzero, r, ans)

        dpk_da = dpkm1_da * z - pkm1 - dpkm2_da * yc + pkm2 * c
        dqk_da = dqkm1_da * z - qkm1 - dqkm2_da * yc + qkm2 * c
        dans_da_new = select(qk_is_nonzero, (dpk_da - ans * dqk_da) / qk,
                             dans_da)
        grad_conditional = select(qk_is_nonzero, abs_x(dans_da_new - dans_da),
                                  fill(dtype(dans_da), shape(dans_da), 1))

        pkm2 = pkm1
        pkm1 = pk
        qkm2 = qkm1
        qkm1 = qk

        dpkm2_da = dpkm1_da
        dqkm2_da = dqkm1_da
        dpkm1_da = dpk_da
        dqkm1_da = dqk_da

        rescale = greater(abs_x(pk), 1 / epsilon)
        pkm2 = select(rescale, pkm2 * epsilon, pkm2)
        pkm1 = select(rescale, pkm1 * epsilon, pkm1)
        qkm2 = select(rescale, qkm2 * epsilon, qkm2)
        qkm1 = select(rescale, qkm1 * epsilon, qkm1)

        dpkm2_da = select(rescale, dpkm2_da * epsilon, dpkm2_da)
        dqkm2_da = select(rescale, dqkm2_da * epsilon, dqkm2_da)
        dpkm1_da = select(rescale, dpkm1_da * epsilon, dpkm1_da)
        dqkm1_da = select(rescale, dqkm1_da * epsilon, dqkm1_da)

        conditional = logicaland(enabled, greater(grad_conditional, epsilon))

        return (conditional,
                select(enabled, ans, vals[1]),
                select(enabled, t, vals[2]),
                select(enabled, y, vals[3]),
                select(enabled, z, vals[4]),
                c,
                select(enabled, pkm1, vals[6]),
                select(enabled, qkm1, vals[7]),
                select(enabled, pkm2, vals[8]),
                select(enabled, qkm2, vals[9]),
                select(enabled, dpkm2_da, vals[10]),
                select(enabled, dqkm2_da, vals[11]),
                select(enabled, dpkm1_da, vals[12]),
                select(enabled, dqkm1_da, vals[13]),
                select(enabled, dans_da_new, vals[14]))

    y = 1 - a
    z = x + y + 1
    c = fill(dtype(x), shape(x), 0)
    pkm2 = fill(dtype(x), shape(x), 1)
    qkm2 = x
    pkm1 = x + 1
    qkm1 = z * x
    ans = pkm1 / qkm1
    t = fill(dtype(x), shape(x), 1)
    dpkm2_da = fill(dtype(x), shape(x), 0)
    dqkm2_da = fill(dtype(x), shape(x), 0)
    dpkm1_da = fill(dtype(x), shape(x), 0)
    dqkm1_da = -x
    dans_da = (dpkm1_da - ans * dqkm1_da) / qkm1
    vals = (enabled, ans, t, y, z, c, pkm1, qkm1, pkm2, qkm2, dpkm2_da,
            dqkm2_da, dpkm1_da, dqkm1_da, dans_da)
    vals = _while_helper_func(cond, body, vals)
    ans = vals[1]
    return ans * ax
Example #17
    def __init__(self, config):
        super(WideDeepModel, self).__init__()
        self.batch_size = config.batch_size
        host_device_mix = bool(config.host_device_mix)
        parameter_server = bool(config.parameter_server)
        parallel_mode = context.get_auto_parallel_context("parallel_mode")
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL,
                                             ParallelMode.AUTO_PARALLEL)
        if is_auto_parallel:
            self.batch_size = self.batch_size * get_group_size()
        is_field_slice = config.field_slice
        sparse = config.sparse
        self.field_size = config.field_size
        self.vocab_size = config.vocab_size
        self.vocab_cache_size = config.vocab_cache_size
        self.emb_dim = config.emb_dim
        self.deep_layer_dims_list = config.deep_layer_dim
        self.deep_layer_act = config.deep_layer_act
        self.init_args = config.init_args
        self.weight_init, self.bias_init = config.weight_bias_init
        self.weight_bias_init = config.weight_bias_init
        self.emb_init = config.emb_init
        self.drop_out = config.dropout_flag
        self.keep_prob = config.keep_prob
        self.deep_input_dims = self.field_size * self.emb_dim
        self.layer_dims = self.deep_layer_dims_list + [1]
        self.all_dim_list = [self.deep_input_dims] + self.layer_dims

        init_acts = [('Wide_b', [1], self.emb_init)]
        var_map = init_var_dict(self.init_args, init_acts)
        self.wide_b = var_map["Wide_b"]
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        use_activation=False,
                                        convert_dtype=True,
                                        drop_out=config.dropout_flag)
        self.wide_mul = P.Mul()
        self.deep_mul = P.Mul()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.reshape = P.Reshape()
        self.deep_reshape = P.Reshape()
        self.square = P.Square()
        self.shape = P.Shape()
        self.tile = P.Tile()
        self.concat = P.Concat(axis=1)
        self.cast = P.Cast()
        self.unique = P.Unique().shard(((1, ), ))
        self.wide_gatherv2 = P.GatherV2()
        self.deep_gatherv2 = P.GatherV2()
        if is_auto_parallel and sparse and not is_field_slice:
            target = 'DEVICE'
            if host_device_mix:
                target = 'CPU'
            self.wide_embeddinglookup = nn.EmbeddingLookup(
                self.vocab_size,
                1,
                target=target,
                slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE)
            if config.deep_table_slice_mode == "column_slice":
                self.deep_embeddinglookup = nn.EmbeddingLookup(
                    self.vocab_size,
                    self.emb_dim,
                    target=target,
                    slice_mode=nn.EmbeddingLookup.TABLE_COLUMN_SLICE)
                self.dense_layer_1.dropout.dropout.shard(
                    ((1, get_group_size()), ))
                self.dense_layer_1.dropout.dropout_do_mask.shard(
                    ((1, get_group_size()), ))
                self.dense_layer_1.matmul.shard(
                    ((1, get_group_size()), (get_group_size(), 1)))
                self.dense_layer_1.matmul.add_prim_attr(
                    "field_size", self.field_size)
                self.deep_mul.shard(((1, 1, get_group_size()), (1, 1, 1)))
                self.deep_reshape.add_prim_attr("skip_redistribution", True)
            else:
                self.deep_embeddinglookup = nn.EmbeddingLookup(
                    self.vocab_size,
                    self.emb_dim,
                    target=target,
                    slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE)
            self.reduce_sum.add_prim_attr("cross_batch", True)
            self.embedding_table = self.deep_embeddinglookup.embedding_table
        elif is_auto_parallel and host_device_mix and is_field_slice and config.full_batch and config.manual_shape:
            manual_shapes = tuple((s[0] for s in config.manual_shape))
            self.deep_embeddinglookup = nn.EmbeddingLookup(
                self.vocab_size,
                self.emb_dim,
                slice_mode=nn.EmbeddingLookup.FIELD_SLICE,
                manual_shapes=manual_shapes)
            self.wide_embeddinglookup = nn.EmbeddingLookup(
                self.vocab_size,
                1,
                slice_mode=nn.EmbeddingLookup.FIELD_SLICE,
                manual_shapes=manual_shapes)
            self.deep_mul.shard(
                ((1, get_group_size(), 1), (1, get_group_size(), 1)))
            self.wide_mul.shard(
                ((1, get_group_size(), 1), (1, get_group_size(), 1)))
            self.reduce_sum.shard(((1, get_group_size(), 1), ))
            self.dense_layer_1.dropout.dropout.shard(((1, get_group_size()), ))
            self.dense_layer_1.dropout.dropout_do_mask.shard(
                ((1, get_group_size()), ))
            self.dense_layer_1.matmul.shard(
                ((1, get_group_size()), (get_group_size(), 1)))
            self.embedding_table = self.deep_embeddinglookup.embedding_table
        elif parameter_server:
            cache_enable = self.vocab_cache_size > 0
            target = 'DEVICE' if cache_enable else 'CPU'
            if not cache_enable:
                sparse = True
            if is_auto_parallel and config.full_batch and cache_enable:
                self.deep_embeddinglookup = nn.EmbeddingLookup(
                    self.vocab_size,
                    self.emb_dim,
                    target=target,
                    slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,
                    sparse=sparse,
                    vocab_cache_size=self.vocab_cache_size)
                self.wide_embeddinglookup = nn.EmbeddingLookup(
                    self.vocab_size,
                    1,
                    target=target,
                    slice_mode=nn.EmbeddingLookup.TABLE_ROW_SLICE,
                    sparse=sparse,
                    vocab_cache_size=self.vocab_cache_size)
            else:
                self.deep_embeddinglookup = nn.EmbeddingLookup(
                    self.vocab_size,
                    self.emb_dim,
                    target=target,
                    sparse=sparse,
                    vocab_cache_size=self.vocab_cache_size)
                self.wide_embeddinglookup = nn.EmbeddingLookup(
                    self.vocab_size,
                    1,
                    target=target,
                    sparse=sparse,
                    vocab_cache_size=self.vocab_cache_size)
            self.embedding_table = self.deep_embeddinglookup.embedding_table
            self.deep_embeddinglookup.embedding_table.set_param_ps()
            self.wide_embeddinglookup.embedding_table.set_param_ps()
        else:
            self.deep_embeddinglookup = nn.EmbeddingLookup(self.vocab_size,
                                                           self.emb_dim,
                                                           target='DEVICE',
                                                           sparse=sparse)
            self.wide_embeddinglookup = nn.EmbeddingLookup(self.vocab_size,
                                                           1,
                                                           target='DEVICE',
                                                           sparse=sparse)
            self.embedding_table = self.deep_embeddinglookup.embedding_table
Example #18
    def __init__(self,
                 in_channels,
                 out_channels,
                 weight_init='normal',
                 bias_init='zeros',
                 damping=0.03,
                 loss_scale=1,
                 frequency=100,
                 has_bias=False,
                 activation=None,
                 batch_size=12):
        super(Dense_Thor, self).__init__()
        self.in_channels = Validator.check_positive_int(in_channels)
        self.out_channels = Validator.check_positive_int(out_channels)
        self.has_bias = Validator.check_bool(has_bias)
        self.thor = True
        if isinstance(weight_init, Tensor):
            if weight_init.dim() != 2 or weight_init.shape[0] != out_channels or \
                    weight_init.shape[1] != in_channels:
                raise ValueError("weight_init shape error")

        self.weight = Parameter(initializer(weight_init,
                                            [out_channels, in_channels]),
                                name="weight")

        if self.has_bias:
            if isinstance(bias_init, Tensor):
                if bias_init.dim() != 1 or bias_init.shape[0] != out_channels:
                    raise ValueError("bias_init shape error")

            self.bias = Parameter(initializer(bias_init, [out_channels]),
                                  name="bias")

        self.matmul = P.MatMul(transpose_b=True)
        self.bias_add = P.BiasAdd()

        self.activation = get_activation(activation)
        self.activation_flag = self.activation is not None
        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([in_channels, in_channels]).astype(np.float16)),
                                      name='matrix_A_inv',
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([out_channels, out_channels]).astype(np.float16)),
                                      name="matrix_G_inv",
                                      requires_grad=False)
        self.fake_G = Tensor(
            np.zeros([out_channels, out_channels]).astype(np.float16))

        self.matmul = P.MatMul(transpose_b=True)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  name="cov_step",
                                  requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = damping
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.vector_matmul = P.CusBatchMatMul()
        self.gather = P.GatherV2()
        self.assignadd = P.AssignAdd()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.abs = P.Abs()
        self.reduce_max = P.ReduceMax(keep_dims=False)
        self.log = P.Log()
        self.exp = P.Exp()
        self.dampingA = Tensor(np.identity(in_channels), mstype.float32)
        self.dampingG = Tensor(np.identity(out_channels), mstype.float32)
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.batch_size = batch_size
Example #19
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 has_bias=False,
                 weight_init='normal',
                 bias_init='zeros'):

        Validator.check_value_type("kernel_size", kernel_size, [int], self.cls_name)
        Validator.check_value_type("stride", stride, [int], self.cls_name)
        Validator.check_value_type("padding", padding, [int], self.cls_name)
        Validator.check_value_type("dilation", dilation, [int], self.cls_name)
        Validator.check_int(kernel_size, 1, Rel.GE, 'kernel_size', self.cls_name)
        Validator.check_int(stride, 1, Rel.GE, 'stride', self.cls_name)
        Validator.check_non_negative_int(padding, 'padding', self.cls_name)
        Validator.check_int(dilation, 1, Rel.GE, 'dilation', self.cls_name)
        kernel_size = (1, kernel_size)
        stride = (1, stride)
        dilation = (1, dilation)
        get_shape = P.Shape()
        get_dtype = P.DType()
        if isinstance(weight_init, Tensor):
            weight_init_shape = get_shape(weight_init)
            Validator.check_equal_int(len(weight_init_shape), 3, 'weight_init_shape', self.cls_name)
            weight_init_dtype = get_dtype(weight_init)
            weight_init_value = weight_init.asnumpy()
            weight_init_value = np.expand_dims(weight_init_value, 2)
            weight_init = Tensor(weight_init_value, weight_init_dtype)

        super(Conv1d, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            has_bias,
            weight_init,
            bias_init)
        self.padding = (0, 0, padding, padding)
        self.conv2d = P.Conv2D(out_channel=self.out_channels,
                               kernel_size=self.kernel_size,
                               mode=1,
                               pad_mode=self.pad_mode,
                               pad=self.padding,
                               stride=self.stride,
                               dilation=self.dilation,
                               group=self.group)
        self.bias_add = P.BiasAdd()
        if pad_mode not in ('valid', 'same', 'pad'):
            raise ValueError('Attr \'pad_mode\' of \'Conv1d\' Op passed '
                             + str(pad_mode) + ', should be one of values in \'valid\', \'same\', \'pad\'.')
        self.expand_dims = P.ExpandDims()
        self.squeeze = P.Squeeze(2)
        self.shape = P.Shape()
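A minimal usage sketch (not part of the original listing; it assumes the public mindspore.nn.Conv1d interface that this __init__ matches). The layer takes (N, C_in, L) input, lifts it to (N, C_in, 1, L) with expand_dims, runs Conv2D, and squeezes axis 2 back out:

# Hedged usage sketch, assuming the standard mindspore.nn.Conv1d interface.
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor

net = nn.Conv1d(in_channels=120, out_channels=240, kernel_size=4, pad_mode='same')
x = Tensor(np.ones([1, 120, 640]), mstype.float32)   # (N, C_in, L)
print(net(x).shape)  # with pad_mode='same' and stride=1, expected (1, 240, 640)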
Example #20
    def __init__(
        self,
        vocab_size,
        embedding_size,
        embedding_shape,
        use_one_hot_embeddings=False,
        initializer_range=0.02,
        name='embedding_table',
        batch_size=12,
        damping=0.03,
        loss_scale=1,
        frequency=100,
    ):
        super(Embedding_Thor, self).__init__()
        self.vocab_size = vocab_size
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.embedding_table = Parameter(initializer(
            TruncatedNormal(initializer_range), [vocab_size, embedding_size]),
                                         name=name)
        self.thor = True
        self.expand = P.ExpandDims()
        self.shape_flat = (-1, )
        self.gather = P.GatherV2()
        self.one_hot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.array_mul = P.MatMul()
        self.reshape = P.Reshape()
        self.em_shape = tuple(embedding_shape)
        self.shape = P.Shape()
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)

        self.matrix_A_inv = Parameter(Tensor(
            np.zeros([vocab_size]).astype(np.float16)),
                                      name='matrix_A_inv',
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.zeros([embedding_size, embedding_size]).astype(np.float16)),
                                      name="matrix_G_inv",
                                      requires_grad=False)
        self.fake_G = Tensor(
            np.zeros([embedding_size, embedding_size]).astype(np.float16))
        self.dampingA = Tensor(np.ones([vocab_size]).astype(np.float32))
        self.dampingG = Tensor(np.identity(embedding_size), mstype.float32)
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  name="cov_step",
                                  requires_grad=False)
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.damping = damping
        self.gather = P.GatherV2()
        self.sqrt = P.Sqrt()
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.vector_matmul = P.CusBatchMatMul()
        self.cholesky = P.CusCholeskyTrsm()
        self.matrix_combine = P.CusMatrixCombine()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.inv = P.Inv()
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.batch_size = batch_size
Example #21
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 has_bias=False,
                 weight_init='normal',
                 bias_init='zeros'):
        Validator.check_value_type("kernel_size", kernel_size, [int], self.cls_name)
        Validator.check_value_type("stride", stride, [int], self.cls_name)
        Validator.check_value_type("padding", padding, [int], self.cls_name)
        Validator.check_value_type("dilation", dilation, [int], self.cls_name)
        Validator.check_int(kernel_size, 1, Rel.GE, 'kernel_size', self.cls_name)
        Validator.check_int(stride, 1, Rel.GE, 'stride', self.cls_name)
        Validator.check_non_negative_int(padding, 'padding', self.cls_name)
        Validator.check_int(dilation, 1, Rel.GE, 'dilation', self.cls_name)
        kernel_size = (1, kernel_size)
        stride = (1, stride)
        dilation = (1, dilation)
        get_shape = P.Shape()
        get_dtype = P.DType()
        if isinstance(weight_init, Tensor):
            weight_init_shape = get_shape(weight_init)
            Validator.check_equal_int(len(weight_init_shape), 3, 'weight_init_shape', self.cls_name)
            weight_init_dtype = get_dtype(weight_init)
            weight_init_value = weight_init.asnumpy()
            weight_init_value = np.expand_dims(weight_init_value, 2)
            weight_init = Tensor(weight_init_value, weight_init_dtype)
        # out_channels and in_channels swap.
        # cause Conv2DBackpropInput's out_channel refers to Conv2D's out_channel,
        # then Conv1dTranspose's out_channel refers to Conv2DBackpropInput's in_channel.
        super(Conv1dTranspose, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            has_bias,
            weight_init,
            bias_init,
            transposed=True)
        self.padding = (0, 0, padding, padding)
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.shape = P.Shape()
        if pad_mode not in ('valid', 'same', 'pad'):
            raise ValueError('Attr \'pad_mode\' of \'Conv1dTranspose\' Op passed '
                             + str(pad_mode) + ', should be one of values in \'valid\', \'same\', \'pad\'.')
        self.is_valid = self.pad_mode == 'valid'
        self.is_same = self.pad_mode == 'same'
        self.is_pad = self.pad_mode == 'pad'
        if Validator.check_bool(has_bias):
            self.bias = Parameter(initializer(bias_init, [out_channels]), name='bias')

        # cause Conv2DBackpropInput's out_channel refers to Conv2D's out_channel.
        self.conv2d_transpose = P.Conv2DBackpropInput(out_channel=in_channels,
                                                      kernel_size=kernel_size,
                                                      mode=1,
                                                      pad_mode=pad_mode,
                                                      pad=self.padding,
                                                      stride=stride,
                                                      dilation=dilation,
                                                      group=group)
        self.bias_add = P.BiasAdd()
        self.expand_dims = P.ExpandDims()
        self.squeeze = P.Squeeze(2)
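A similar hedged sketch for Conv1dTranspose (again assuming the public mindspore.nn.Conv1dTranspose interface; the printed shape is an expectation, not a guarantee). The in/out channel swap noted in the comments above is internal to Conv2DBackpropInput; callers still pass (N, C_in, L):

# Hedged usage sketch, assuming the standard mindspore.nn.Conv1dTranspose interface.
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor

net = nn.Conv1dTranspose(in_channels=3, out_channels=64, kernel_size=4, pad_mode='same')
x = Tensor(np.ones([1, 3, 50]), mstype.float32)      # (N, C_in, L)
print(net(x).shape)  # with pad_mode='same' and stride=1, expected (1, 64, 50)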
Example #22
    def __init__(self, vocab_size, embedding_size, param_init='normal',
                 target='CPU', slice_mode='batch_slice', manual_shapes=None,
                 max_norm=None, sparse=True, vocab_cache_size=0):
        super(EmbeddingLookup, self).__init__()
        validator.check_value_type('sparse', sparse, [bool], self.cls_name)
        self.target = target
        if target not in ('CPU', 'DEVICE'):
            raise ValueError('Attr \'target\' of \'EmbeddingLookup\' Op passed '
                             + str(target) + ', should be one of values in \'CPU\', \'DEVICE\'.')
        if not sparse and target == 'CPU':
            raise ValueError('When target is CPU, embedding_lookup must be sparse.')
        enable_ps = context.get_ps_context("enable_ps")
        if not enable_ps and vocab_cache_size > 0:
            logger.warning("The configuration of 'vocab_cache_size' is valid only in parameter server training mode, "
                           "current mode is not parameter server training mode, so it will be ignored.")
            vocab_cache_size = 0
        if sparse:
            self.gatherv2 = P.SparseGatherV2()
        else:
            self.gatherv2 = P.GatherV2()
        self.embeddinglookup = P.EmbeddingLookup().add_prim_attr('primitive_target', 'CPU')
        self.vocab_size = validator.check_positive_int(vocab_size, 'vocab_size')
        self.vocab_cache_size = validator.check_non_negative_int(vocab_cache_size, 'vocab_cache_size')
        self.embedding_size = validator.check_positive_int(embedding_size, 'embedding_size')
        parallel_mode = _get_parallel_mode()
        is_auto_parallel = parallel_mode in (ParallelMode.SEMI_AUTO_PARALLEL, ParallelMode.AUTO_PARALLEL)
        self.cache_enable = self.vocab_cache_size > 0
        if self.cache_enable:
            if is_auto_parallel:
                self.vocab_cache_size = self.vocab_cache_size * get_group_size()
            self.vocab_size = self.vocab_cache_size

        self.embedding_table = Parameter(initializer(param_init, [self.vocab_size, self.embedding_size]),
                                         name='embedding_table')
        if self.cache_enable:
            self.embedding_table.cache_enable = True
            _set_cache_enable(True)
            if _is_role_worker():
                _insert_hash_table_size(self.embedding_table.name, vocab_cache_size, embedding_size, vocab_size)
        self.forward_unique = False
        self.gather_revert = P.GatherV2()
        self.unique = P.Unique().shard(((1,),))
        self.reshape = P.Reshape()
        self.shape = P.Shape()
        indices_shape_size = 2
        if slice_mode == "field_slice" and is_auto_parallel:
            if not manual_shapes:
                raise ValueError("in slice field mode, the manual_shapes should not be none")
            if not isinstance(manual_shapes, tuple):
                raise TypeError("manual_shapes type must be tuple(int) cannot be {}!".format(type(manual_shapes)))
            for dim in manual_shapes:
                validator.check_positive_int(dim, 'manual shape dim', self.cls_name)
            self.gatherv2.add_prim_attr("manual_split", manual_shapes)
            self.embeddinglookup.add_prim_attr("manual_split", manual_shapes)
            self.gatherv2.shard(((get_group_size(), 1), (1, get_group_size())))
            self.embeddinglookup.shard(((get_group_size(), 1), (1, get_group_size())))
        elif slice_mode == "table_row_slice" and is_auto_parallel:
            if target == 'DEVICE' and not self.cache_enable:
                indices_shape_size = 1
                self.gather_revert.shard(((1, 1), (get_group_size(),)))
                self.forward_unique = True
            indices_strategy = (1,)*indices_shape_size
            self.gatherv2.shard(((get_group_size(), 1), indices_strategy))
            self.embeddinglookup.shard(((get_group_size(), 1), indices_strategy))
        elif slice_mode == "table_column_slice" and is_auto_parallel:
            if target == 'DEVICE':
                indices_shape_size = 1
                self.gather_revert.shard(((1, get_group_size()), (1,)))
                self.forward_unique = True
            indices_strategy = (1,)*indices_shape_size
            self.gatherv2.shard(((1, get_group_size()), indices_strategy))
            self.embeddinglookup.shard(((1, get_group_size()), indices_strategy))
        elif slice_mode == "batch_slice" and is_auto_parallel:
            indices_strategy = [get_group_size()]
            indices_strategy.extend([1]*(indices_shape_size - 1))
            indices_strategy = tuple(indices_strategy)
            self.gatherv2.shard(((1, 1), indices_strategy))
            self.embeddinglookup.shard(((1, 1), indices_strategy))
        else:
            if is_auto_parallel:
                raise ValueError("slice_mode should be one of the modes supported by nn.EmbeddingLookup, but got "
                                 + str(slice_mode))
        self.embedding_table.unique = self.forward_unique
        self.max_norm = max_norm
        if self.max_norm is not None:
            self.max_norm = validator.check_positive_float(self.max_norm, 'max_norm', self.cls_name)
            self.max_norm = Tensor(self.max_norm, dtype=mstype.float32)
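A minimal usage sketch (not from the original source; it relies on the defaults shown above, target='CPU' and sparse=True):

# Hedged usage sketch for an EmbeddingLookup cell like the one defined above.
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor

lookup = nn.EmbeddingLookup(vocab_size=4, embedding_size=2)
indices = Tensor(np.array([[1, 0], [3, 2]]), mstype.int32)
print(lookup(indices).shape)  # indices shape (2, 2) + embedding_size 2 -> (2, 2, 2)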
Example #23
    def __init__(self,
                 q_tensor_width,
                 k_tensor_width,
                 v_tensor_width,
                 hidden_width,
                 out_tensor_width,
                 num_attention_heads=1,
                 query_act=None,
                 key_act=None,
                 value_act=None,
                 out_act=None,
                 has_attention_mask=True,
                 attention_probs_dropout_prob=0.0,
                 use_one_hot_embeddings=False,
                 initializer_range=0.02,
                 do_return_2d_tensor=False,
                 compute_type=mstype.float16,
                 same_dim=True):
        super(MultiheadAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.size_per_head = int(hidden_width / num_attention_heads)
        self.has_attention_mask = has_attention_mask
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.initializer_range = initializer_range
        self.do_return_2d_tensor = do_return_2d_tensor
        self.same_dim = same_dim

        self.scores_mul = Tensor(
            [1.0 / math.sqrt(float(self.size_per_head))], dtype=compute_type)
        self.reshape = P.Reshape()
        self.shape_q_2d = (-1, q_tensor_width)
        self.shape_k_2d = (-1, k_tensor_width)
        self.shape_v_2d = (-1, v_tensor_width)
        self.hidden_width = int(hidden_width)
        if self.same_dim:
            self.in_proj_layer = Parameter(Tensor(np.random.rand(hidden_width * 3,
                                                                 q_tensor_width), dtype=mstype.float32), name="weight")
        else:
            self.query_layer = nn.Dense(q_tensor_width,
                                        hidden_width,
                                        activation=query_act,
                                        has_bias=False).to_float(compute_type)
            self.key_layer = nn.Dense(k_tensor_width,
                                      hidden_width,
                                      activation=key_act,
                                      has_bias=False).to_float(compute_type)
            self.value_layer = nn.Dense(q_tensor_width,
                                        hidden_width,
                                        activation=value_act,
                                        has_bias=False).to_float(compute_type)
        self.out_proj = nn.Dense(hidden_width,
                                 out_tensor_width,
                                 activation=out_act,
                                 has_bias=False).to_float(compute_type)

        self.matmul_trans_b = P.BatchMatMul(transpose_b=True)
        self.multiply = P.Mul()
        self.transpose = P.Transpose()
        self.trans_shape = (0, 2, 1, 3)
        self.trans_shape_relative = (2, 0, 1, 3)
        self.trans_shape_position = (1, 2, 0, 3)
        self.multiply_data = Tensor([-10000.0,], dtype=compute_type)
        self.matmul = P.BatchMatMul()

        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(1. - attention_probs_dropout_prob)
        self.use_dropout = attention_probs_dropout_prob > 0

        if self.has_attention_mask:
            self.expand_dims = P.ExpandDims()
            self.sub = P.Sub()
            self.add = P.TensorAdd()
            self.cast = P.Cast()
            self.get_dtype = P.DType()

        self.softmax_cast = P.Cast()
        self.matmul_dense = P.MatMul(transpose_b=True)
        self.split = P.Split(0, 3)
        self.equal = P.Equal()
        self.shape = P.Shape()
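The constants above encode standard scaled dot-product attention: scores_mul is 1/sqrt(size_per_head), and multiply_data is the -10000 bias added to masked positions. A plain NumPy sketch of that computation, illustrative only and not part of the class:

import numpy as np

def masked_attention_probs(q, k, mask, size_per_head):
    # q, k: (batch, heads, seq, size_per_head); mask: 1 for visible, 0 for masked.
    scores = np.matmul(q, np.swapaxes(k, -1, -2)) * (1.0 / np.sqrt(size_per_head))
    adder = (1.0 - mask) * -10000.0           # mirrors multiply_data above
    scores = scores + adder
    e = np.exp(scores - scores.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)  # softmax over the key axis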
Example #24
    def construct(self):
        weights = self.weights
        loss = self.network()
        sens = P.Fill()(P.DType()(loss), P.Shape()(loss), self.sens)
        grads = self.grad(self.network, weights)(sens)
        return F.depend(loss, self.optimizer(grads))
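This construct is the usual MindSpore train-one-step pattern; the attributes it reads (weights, network, optimizer, grad, sens) would be set in an __init__ that is not shown here. A hypothetical __init__ consistent with the call pattern above (names and defaults are assumptions, not the original code):

import mindspore.nn as nn
from mindspore import ParameterTuple
from mindspore.ops import composite as C

class TrainStepWrap(nn.Cell):
    # Hypothetical wrapper: only illustrates where the attributes used in the
    # construct above could come from.
    def __init__(self, network, optimizer, sens=1.0):
        super(TrainStepWrap, self).__init__(auto_prefix=False)
        self.network = network
        self.network.set_train()
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        # get_by_list returns gradients w.r.t. self.weights; sens_param lets
        # construct pass in the sensitivity tensor built with P.Fill.
        self.grad = C.GradOperation(get_by_list=True, sens_param=True)
        self.sens = sens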
Example #25
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 data_format='NCHW',
                 has_bias=False,
                 weight_init='normal',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 batch_size=32,
                 bias_init='zeros'):
        self.thor = True
        self.hw = kernel_size * kernel_size
        kernel_size = twice(kernel_size)
        super(Conv2d_Thor_GPU, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            data_format,
            has_bias,
            weight_init,
            bias_init,
        )
        self.conv2d = P.Conv2D(out_channel=self.out_channels,
                               kernel_size=self.kernel_size,
                               mode=1,
                               pad_mode=self.pad_mode,
                               pad=self.padding,
                               stride=self.stride,
                               dilation=self.dilation,
                               group=self.group)

        self.matrix_A_dim = self.in_channels * self.kernel_size[
            0] * self.kernel_size[1]
        self.matrix_G_dim = self.out_channels

        split_dim = 128
        matrix_A_shape, matrix_G_shape = caculate_matmul_shape(
            self.matrix_A_dim, self.matrix_G_dim, split_dim)
        self.matrix_A_inv = Parameter(np.zeros(matrix_A_shape).astype(
            np.float32),
                                      requires_grad=False)
        self.matrix_G_inv = Parameter(np.zeros(matrix_G_shape).astype(
            np.float32),
                                      requires_grad=False)
        self.broadcast_to = P.BroadcastTo(matrix_A_shape)
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  requires_grad=False)
        self.img2col = P.Im2Col(kernel_size=kernel_size,
                                stride=stride,
                                pad_mode="same")
        self.matmul = P.MatMul(transpose_b=True)
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.mul = P.Mul()
        self.getG = P.InsertGradientOf(self.save_gradient)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.batch_size = Tensor(batch_size, mstype.float16)
        self.transpose = P.Transpose()
        self.cast = P.Cast()
        self.gather = P.GatherV2()
        self.freq = Tensor(frequency, mstype.int32)
        self.axis = 0
        self.sqrt = P.Sqrt()
        self.reduce_mean = P.ReduceMean(keep_dims=False)
        self.damping = Parameter(Tensor(damping), requires_grad=False)
        self.dampingA = Tensor(np.identity(self.matrix_A_dim), mstype.float32)
        self.dampingG = Tensor(np.identity(self.matrix_G_dim), mstype.float32)
        self.cholesky = P.CholeskyTrsm(split_dim=split_dim)
        self.vector_matmul = P.BatchMatMul(transpose_a=True)
Example #26
    def __init__(self, config):
        super(DeepFMModel, self).__init__()

        self.batch_size = config.batch_size
        self.field_size = config.data_field_size
        self.vocab_size = config.data_vocab_size
        self.emb_dim = config.data_emb_dim
        self.deep_layer_dims_list, self.deep_layer_act = config.deep_layer_args
        self.init_args = config.init_args
        self.weight_bias_init = config.weight_bias_init
        self.keep_prob = config.keep_prob
        init_acts = [('W_l2', [self.vocab_size, 1], 'normal'),
                     ('V_l2', [self.vocab_size, self.emb_dim], 'normal')]
        var_map = init_var_dict(self.init_args, init_acts)
        self.fm_w = var_map["W_l2"]
        self.embedding_table = var_map["V_l2"]
        # Deep Layers
        self.deep_input_dims = self.field_size * self.emb_dim
        self.all_dim_list = [self.deep_input_dims
                             ] + self.deep_layer_dims_list + [1]
        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        self.keep_prob,
                                        convert_dtype=True)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        self.keep_prob,
                                        convert_dtype=True)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        self.keep_prob,
                                        convert_dtype=True)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        self.keep_prob,
                                        convert_dtype=True)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        self.keep_prob,
                                        convert_dtype=True,
                                        use_act=False)
        # FM, linear Layers
        self.Gatherv2 = P.GatherV2()
        self.Mul = P.Mul()
        self.ReduceSum = P.ReduceSum(keep_dims=False)
        self.Reshape = P.Reshape()
        self.Square = P.Square()
        self.Shape = P.Shape()
        self.Tile = P.Tile()
        self.Concat = P.Concat(axis=1)
        self.Cast = P.Cast()
Example #27
    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 stride=1,
                 pad_mode='same',
                 padding=0,
                 dilation=1,
                 group=1,
                 data_format='NCHW',
                 has_bias=False,
                 weight_init='normal',
                 damping=0.03,
                 loss_scale=1,
                 frequency=278,
                 batch_size=32,
                 bias_init='zeros'):
        self.thor = True
        ksizes = (1, kernel_size, kernel_size, 1)
        self.hw = kernel_size * kernel_size
        strides = (1, stride, stride, 1)
        kernel_size = twice(kernel_size)
        super(Conv2d_Thor, self).__init__(
            in_channels,
            out_channels,
            kernel_size,
            stride,
            pad_mode,
            padding,
            dilation,
            group,
            data_format,
            has_bias,
            weight_init,
            bias_init,
        )
        self.conv2d = P.Conv2D(out_channel=self.out_channels,
                               kernel_size=self.kernel_size,
                               mode=1,
                               pad_mode=self.pad_mode,
                               pad=self.padding,
                               stride=self.stride,
                               dilation=self.dilation,
                               group=self.group)
        self.batch_size = batch_size
        self.img2col = P.CusImg2Col(ksizes=ksizes, strides=strides)
        self.cube_matmul = P.CusMatMulCube(transpose_a=True)
        self.matrix_combine = P.CusMatrixCombine()
        self.cholesky = P.CusCholeskyTrsm()
        self.transpose02314 = P.CusTranspose02314()
        self.matrix_A_dim = self.in_channels * self.kernel_size[
            0] * self.kernel_size[1]
        self.matrix_G_dim = self.out_channels
        self.matrix_A_device_shape, self.matrix_A_device_dim = caculate_device_shape(
            self.matrix_A_dim, self.in_channels, True)
        self.matrix_G_device_shape, self.matrix_G_device_dim = caculate_device_shape(
            self.matrix_G_dim, self.in_channels, False)
        self.matrix_A_device_temp_shape = (self.matrix_A_device_shape[0],
                                           self.matrix_A_device_shape[2],
                                           self.matrix_A_device_shape[1],
                                           self.matrix_A_device_shape[3])
        self.matrix_G_device_temp_shape = (self.matrix_G_device_shape[0],
                                           self.matrix_G_device_shape[2],
                                           self.matrix_G_device_shape[1],
                                           self.matrix_G_device_shape[3])
        self.matrix_A_inv = Parameter(Tensor(
            np.reshape(
                np.identity(self.matrix_A_device_dim).astype(np.float16),
                self.matrix_A_device_shape)),
                                      requires_grad=False)
        self.A_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   requires_grad=False)
        self.matrix_G_inv = Parameter(Tensor(
            np.reshape(
                np.identity(self.matrix_G_device_dim).astype(np.float16),
                self.matrix_G_device_shape)),
                                      requires_grad=False)

        self.G_inv_max = Parameter(initializer(0, [1], mstype.float32),
                                   requires_grad=False)
        self.fake_G = Tensor(
            np.reshape(
                np.identity(self.matrix_G_device_dim).astype(np.float16),
                self.matrix_G_device_shape))

        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.transpose = P.Transpose()
        self.cov_step = Parameter(initializer(0, [1], mstype.int32),
                                  requires_grad=False)
        self.mul = P.Mul()
        self.cast = P.Cast()
        self.damping = Tensor(damping)
        self.vector_matmul = P.CusBatchMatMul()
        self.diag_block_dim = 128
        self.channels_slice_flag = False
        if self.in_channels % C0 != 0:
            self.channels_slice_flag = True

        self.padA_flag = False
        if (self.matrix_A_dim // self.diag_block_dim) * self.diag_block_dim != self.matrix_A_dim \
            and self.matrix_A_dim > self.diag_block_dim:
            self.padA_flag = True
            pad_dim = self.diag_block_dim - self.matrix_A_dim % self.diag_block_dim
            self.padA = P.Pad(((0, pad_dim), (0, pad_dim)))
        self.device_shape_pad_flag = False
        if self.matrix_A_dim != self.matrix_A_device_dim:
            self.device_shape_pad_flag = True
            self.device_shape_pad = P.Pad(((0, 0), (0, C0 - self.in_channels),
                                           (0, 0), (0, C0 - self.in_channels)))
        self.slice = P.Slice()
        self.gather = P.GatherV2()
        self.freq = Tensor(frequency, mstype.int32)
        self.loss_scale = Tensor(1 / loss_scale, mstype.float16)
        self.axis = 0

        dampingA_dim = self.matrix_A_dim
        if (self.matrix_A_dim % self.diag_block_dim
            ) != 0 and self.matrix_A_dim > self.diag_block_dim:
            dampingA_dim = (self.matrix_A_dim // self.diag_block_dim +
                            1) * self.diag_block_dim
        dampingG_dim = self.matrix_G_dim
        if (self.matrix_G_dim % self.diag_block_dim
            ) != 0 and self.matrix_G_dim > self.diag_block_dim:
            dampingG_dim = (self.matrix_G_dim // self.diag_block_dim +
                            1) * self.diag_block_dim

        self.dampingA = Tensor(np.identity(dampingA_dim), mstype.float32)
        self.dampingG = Tensor(np.identity(dampingG_dim), mstype.float32)
        self.fused_abs_max1 = P.CusFusedAbsMax1(
            [self.matrix_A_dim, self.matrix_A_dim])
        self.fused_abs_max2 = P.CusFusedAbsMax1()
        self.log = P.Log()
        self.exp = P.Exp()
        self.sqrt = P.Sqrt()
        self.getG = P.InsertGradientOf(self.save_gradient)
Example #28
    def __init__(self, config):
        super(WideDeepModel, self).__init__()
        emb_128_size = 650000
        emb64_single_size = 17300
        emb64_multi_size = 20900
        indicator_size = 16
        deep_dim_list = [1024, 1024, 1024, 1024, 1024]
        # deep_dropout=0.0
        wide_reg_coef = [0.0, 0.0]
        deep_reg_coef = [0.0, 0.0]
        wide_lr = 0.2
        deep_lr = 1.0

        self.input_emb_dim = config.input_emb_dim
        self.batch_size = config.batch_size
        self.deep_layer_act = config.deep_layers_act
        self.init_args = config.init_args
        self.weight_init, self.bias_init = config.weight_bias_init
        self.weight_bias_init = config.weight_bias_init
        self.emb_init = config.emb_init

        self.keep_prob = config.keep_prob
        self.layer_dims = deep_dim_list + [1]
        self.all_dim_list = [self.input_emb_dim] + self.layer_dims

        self.continue_field_size = 32
        self.emb_128_size = emb_128_size
        self.emb64_single_size = emb64_single_size
        self.emb64_multi_size = emb64_multi_size
        self.indicator_size = indicator_size

        self.wide_l1_coef, self.wide_l2_coef = wide_reg_coef
        self.deep_l1_coef, self.deep_l2_coef = deep_reg_coef
        self.wide_lr = wide_lr
        self.deep_lr = deep_lr

        init_acts_embedding_metrix = [
            ('emb128_embedding', [self.emb_128_size, 128], self.emb_init),
            ('emb64_single', [self.emb64_single_size, 64], self.emb_init),
            ('emb64_multi', [self.emb64_multi_size, 64], self.emb_init),
            ('emb64_indicator', [self.indicator_size, 64], self.emb_init)
        ]
        var_map = init_var_dict(self.init_args, init_acts_embedding_metrix)
        self.emb128_embedding = var_map["emb128_embedding"]
        self.emb64_single = var_map["emb64_single"]
        self.emb64_multi = var_map["emb64_multi"]
        self.emb64_indicator = var_map["emb64_indicator"]

        init_acts_wide_weight = [
            ('wide_continue_w', [self.continue_field_size], self.emb_init),
            ('wide_emb128_w', [self.emb_128_size], self.emb_init),
            ('wide_emb64_single_w', [self.emb64_single_size], self.emb_init),
            ('wide_emb64_multi_w', [self.emb64_multi_size], self.emb_init),
            ('wide_indicator_w', [self.indicator_size], self.emb_init),
            ('wide_bias', [1], self.emb_init)
        ]
        var_map = init_var_dict(self.init_args, init_acts_wide_weight)
        self.wide_continue_w = var_map["wide_continue_w"]
        self.wide_emb128_w = var_map["wide_emb128_w"]
        self.wide_emb64_single_w = var_map["wide_emb64_single_w"]
        self.wide_emb64_multi_w = var_map["wide_emb64_multi_w"]
        self.wide_indicator_w = var_map["wide_indicator_w"]
        self.wide_bias = var_map["wide_bias"]

        self.dense_layer_1 = DenseLayer(self.all_dim_list[0],
                                        self.all_dim_list[1],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        drop_out=config.dropout_flag,
                                        convert_dtype=True)
        self.dense_layer_2 = DenseLayer(self.all_dim_list[1],
                                        self.all_dim_list[2],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        drop_out=config.dropout_flag,
                                        convert_dtype=True)
        self.dense_layer_3 = DenseLayer(self.all_dim_list[2],
                                        self.all_dim_list[3],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        drop_out=config.dropout_flag,
                                        convert_dtype=True)
        self.dense_layer_4 = DenseLayer(self.all_dim_list[3],
                                        self.all_dim_list[4],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        drop_out=config.dropout_flag,
                                        convert_dtype=True)
        self.dense_layer_5 = DenseLayer(self.all_dim_list[4],
                                        self.all_dim_list[5],
                                        self.weight_bias_init,
                                        self.deep_layer_act,
                                        drop_out=config.dropout_flag,
                                        convert_dtype=True)

        self.deep_predict = DenseLayer(self.all_dim_list[5],
                                       self.all_dim_list[6],
                                       self.weight_bias_init,
                                       self.deep_layer_act,
                                       drop_out=config.dropout_flag,
                                       convert_dtype=True,
                                       use_activation=False)

        self.gather_v2 = P.GatherV2()
        self.mul = P.Mul()
        self.reduce_sum_false = P.ReduceSum(keep_dims=False)
        self.reduce_sum_true = P.ReduceSum(keep_dims=True)
        self.reshape = P.Reshape()
        self.square = P.Square()
        self.shape = P.Shape()
        self.tile = P.Tile()
        self.concat = P.Concat(axis=1)
        self.cast = P.Cast()
        self.reduceMean_false = P.ReduceMean(keep_dims=False)
        self.Concat = P.Concat(axis=1)
        self.BiasAdd = P.BiasAdd()
        self.expand_dims = P.ExpandDims()
        self.flatten = Flatten()
Example #29
    def __init__(self,
                 num_features,
                 eps=1e-5,
                 momentum=0.9,
                 affine=True,
                 gamma_init='ones',
                 beta_init='zeros',
                 moving_mean_init='zeros',
                 moving_var_init='ones',
                 use_batch_statistics=None,
                 device_num_each_group=1,
                 input_dims='2d',
                 data_format='NCHW'):
        super(_BatchNorm, self).__init__()
        validator.check_value_type('num_features', num_features, [int],
                                   self.cls_name)
        if num_features < 1:
            raise ValueError("num_features must be at least 1")

        if momentum < 0 or momentum > 1:
            raise ValueError(
                "momentum should be a number in range [0, 1], but got {}".
                format(momentum))
        self.format = validator.check_string(data_format, ['NCHW', 'NHWC'],
                                             'format', self.cls_name)
        if context.get_context(
                "device_target") != "GPU" and self.format == "NHWC":
            raise ValueError("NHWC format is only supported on the GPU target.")
        self.use_batch_statistics = use_batch_statistics
        self.num_features = num_features
        self.eps = eps
        self.input_dims = input_dims
        self.moving_mean = Parameter(initializer(moving_mean_init,
                                                 num_features),
                                     name="mean",
                                     requires_grad=False)
        self.moving_variance = Parameter(initializer(moving_var_init,
                                                     num_features),
                                         name="variance",
                                         requires_grad=False)
        self.gamma = Parameter(initializer(gamma_init, num_features),
                               name="gamma",
                               requires_grad=affine)
        self.beta = Parameter(initializer(beta_init, num_features),
                              name="beta",
                              requires_grad=affine)
        self.group = validator.check_positive_int(device_num_each_group)
        self.is_global = False
        if self.group != 1:
            self.rank_id = get_rank()
            self.rank_size = get_group_size()
            self.device_list = [i for i in range(0, self.rank_size)]
            self.rank_list = self.list_group(self.device_list, self.group)
            self.rank_list_idx = len(self.rank_list)
            for i in range(self.rank_list_idx):
                if self.rank_id in self.rank_list[i] and self.group != 1:
                    self.is_global = True
                    management.create_group('group' + str(i),
                                            self.rank_list[i])
                    self.all_reduce = P.AllReduce(
                        P.ReduceOp.SUM,
                        'group' + str(i)).add_prim_attr('fusion', 1)
        self.shape = P.Shape()
        self.reduce_mean = P.ReduceMean(keep_dims=True)
        self.square = P.Square()
        self.sqrt = P.Sqrt()
        self.cast = P.Cast()
        self.dtype = P.DType()
        self.reshape = P.Reshape()
        self._target = context.get_context("device_target")
        self.is_graph_mode = context.get_context("mode") == context.GRAPH_MODE
        self.momentum = 1.0 - momentum
        if context.get_context("enable_ge"):
            self.is_ge_backend = True
        else:
            self.is_ge_backend = False

        if self._target == "Ascend":
            self.bn_train = P.BatchNorm(is_training=True,
                                        epsilon=self.eps,
                                        momentum=self.momentum)
        if self._target == "GPU":
            self.bn_train = P.FusedBatchNormEx(mode=1,
                                               epsilon=self.eps,
                                               momentum=self.momentum,
                                               data_format=self.format)
        if self._target == "CPU":
            self.bn_train = P.FusedBatchNorm(mode=1,
                                             epsilon=self.eps,
                                             momentum=self.momentum)
        self.bn_infer = P.BatchNorm(is_training=False,
                                    epsilon=self.eps,
                                    data_format=self.format)
        self.enable_global_sync = self.is_global and (self.is_ge_backend or\
            (self.is_graph_mode and self._target == "Ascend"))

        data_parallel_strategy = ((1, ), (1, ))
        data_parallel_strategy_one = ((1, ), ())
        self.sub_mean = P.Sub().shard(data_parallel_strategy)
        self.sub_var = P.Sub().shard(data_parallel_strategy)
        self.mul_mean = P.Mul().shard(data_parallel_strategy_one)
        self.mul_var = P.Mul().shard(data_parallel_strategy_one)
        self.assign_sub_mean = P.AssignSub().shard(data_parallel_strategy)
        self.assign_sub_var = P.AssignSub().shard(data_parallel_strategy)
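_BatchNorm here is a base class and is normally exercised through public cells such as nn.BatchNorm2d. A minimal sketch under that assumption (standard public interface, shapes in NCHW):

# Hedged usage sketch via nn.BatchNorm2d, which builds on a base like the above.
import numpy as np
import mindspore.nn as nn
import mindspore.common.dtype as mstype
from mindspore import Tensor

bn = nn.BatchNorm2d(num_features=3, eps=1e-5, momentum=0.9)
x = Tensor(np.ones([1, 3, 2, 2]), mstype.float32)    # NCHW
print(bn(x).shape)  # batch norm preserves the input shape: (1, 3, 2, 2)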
Example #30
    def __init__(self, num_classes, is_training=True,
                 stem_filters=32, penultimate_filters=1056, filters_multiplier=2):
        super(NASNetAMobile, self).__init__()
        self.is_training = is_training
        self.stem_filters = stem_filters
        self.penultimate_filters = penultimate_filters
        self.filters_multiplier = filters_multiplier

        filters = self.penultimate_filters//24
        # 24 is default value for the architecture

        self.conv0 = nn.SequentialCell([
            nn.Conv2d(in_channels=3, out_channels=self.stem_filters, kernel_size=3, stride=2, pad_mode='pad', padding=0,
                      has_bias=False),
            nn.BatchNorm2d(num_features=self.stem_filters, eps=0.001, momentum=0.9, affine=True)
        ])

        self.cell_stem_0 = CellStem0(
            self.stem_filters, num_filters=filters//(filters_multiplier**2)
        )
        self.cell_stem_1 = CellStem1(
            self.stem_filters, num_filters=filters//filters_multiplier
        )

        self.cell_0 = FirstCell(
            in_channels_left=filters,
            out_channels_left=filters//2,  # 1, 0.5
            in_channels_right=2*filters,
            out_channels_right=filters
        )  # 2, 1
        self.cell_1 = NormalCell(
            in_channels_left=2*filters,
            out_channels_left=filters,  # 2, 1
            in_channels_right=6*filters,
            out_channels_right=filters
        )  # 6, 1
        self.cell_2 = NormalCell(
            in_channels_left=6*filters,
            out_channels_left=filters,  # 6, 1
            in_channels_right=6*filters,
            out_channels_right=filters
        )  # 6, 1
        self.cell_3 = NormalCell(
            in_channels_left=6*filters,
            out_channels_left=filters,  # 6, 1
            in_channels_right=6*filters,
            out_channels_right=filters
        )  # 6, 1

        self.reduction_cell_0 = ReductionCell0(
            in_channels_left=6*filters,
            out_channels_left=2*filters,  # 6, 2
            in_channels_right=6*filters,
            out_channels_right=2*filters
        )  # 6, 2

        self.cell_6 = FirstCell(
            in_channels_left=6*filters,
            out_channels_left=filters,  # 6, 1
            in_channels_right=8*filters,
            out_channels_right=2*filters
        )  # 8, 2
        self.cell_7 = NormalCell(
            in_channels_left=8*filters,
            out_channels_left=2*filters,  # 8, 2
            in_channels_right=12*filters,
            out_channels_right=2*filters
        )  # 12, 2
        self.cell_8 = NormalCell(
            in_channels_left=12*filters,
            out_channels_left=2*filters,  # 12, 2
            in_channels_right=12*filters,
            out_channels_right=2*filters
        )  # 12, 2
        self.cell_9 = NormalCell(
            in_channels_left=12*filters,
            out_channels_left=2*filters,  # 12, 2
            in_channels_right=12*filters,
            out_channels_right=2*filters
        )  # 12, 2

        if is_training:
            self.aux_logits = AuxLogits(in_channels=12*filters, out_channels=num_classes)

        self.reduction_cell_1 = ReductionCell1(
            in_channels_left=12*filters,
            out_channels_left=4*filters,  # 12, 4
            in_channels_right=12*filters,
            out_channels_right=4*filters
        )  # 12, 4

        self.cell_12 = FirstCell(
            in_channels_left=12*filters,
            out_channels_left=2*filters,  # 12, 2
            in_channels_right=16*filters,
            out_channels_right=4*filters
        )  # 16, 4
        self.cell_13 = NormalCell(
            in_channels_left=16*filters,
            out_channels_left=4*filters,  # 16, 4
            in_channels_right=24*filters,
            out_channels_right=4*filters
        )  # 24, 4
        self.cell_14 = NormalCell(
            in_channels_left=24*filters,
            out_channels_left=4*filters,  # 24, 4
            in_channels_right=24*filters,
            out_channels_right=4*filters
        )  # 24, 4
        self.cell_15 = NormalCell(
            in_channels_left=24*filters,
            out_channels_left=4*filters,  # 24, 4
            in_channels_right=24*filters,
            out_channels_right=4*filters
        )  # 24, 4

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(keep_prob=0.5)
        self.classifier = nn.Dense(in_channels=24*filters, out_channels=num_classes)
        self.shape = P.Shape()
        self.reshape = P.Reshape()
        self.avg_pool = nn.AvgPool2d(kernel_size=7, stride=1)
        self._initialize_weights()