Example #1
def _param_check(shape_x, dtype_x, axis, kernel_name):
    """check param

    Parameters
    ----------
    shape_x: list
        input shape
    dtype_x: str
        input dtype
    axis: int
        axis int num
    kernel_name: str
        kernel_name string

    Returns
    -------
    None
    """
    util.check_shape_rule(shape_x, max_dim=8)
    util.check_tensor_shape_size(shape_x)
    check_list = ("int32", "float32")
    util.check_dtype_rule(dtype_x.lower(), check_list)
    util.check_kernel_name(kernel_name)
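A minimal usage sketch for the helper above, assuming the util validators raise RuntimeError on any violation; the shape, dtype and kernel name below are illustrative values, not taken from the original source.

# Hedged usage sketch: validates an up-to-8-dimensional shape, a dtype from
# ("int32", "float32") and the kernel name, returning None on success.
_param_check([32, 64], "float32", axis=1, kernel_name="demo_kernel")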
def fake_quant_perchannel(x, min_val, max_val, y,
                          symmetric, narrow_range, num_bits, channel_axis,
                          kernel_name="fake_quant_perchannel"):
    """FakeQuantPerChannel"""
    x_shape = x.get("shape")
    x_shape_ = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ needs to change to 1.
    if channel_axis == 0 and x_shape_[0] != min_shape[0] and x_shape_[1] == min_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape_[channel_axis_])
    util.check_shape_rule(max_shape, 1, 1, x_shape_[channel_axis_])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    quant_min = 0
    quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis_] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = min_val.get("shape")
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_compute(input_data, min_data, max_data, y,
                                        quant_min, quant_max, symmetric, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
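A minimal usage sketch of the entry function above, assuming the TBE/TVM environment used by this example is available. The inputs follow the dict descriptor convention the function reads ("shape", "ori_shape", "format", "dtype"); the concrete shapes, settings and kernel name are illustrative assumptions, not taken from the original source.

# Hedged usage sketch: 8-bit, asymmetric, full-range per-channel fake quant
# over the channel axis of a 4D NCHW tensor (all shapes are assumptions).
x_desc = {"shape": (2, 16, 4, 4), "ori_shape": (2, 16, 4, 4),
          "format": "NCHW", "dtype": "float16"}
min_desc = {"shape": (16,), "ori_shape": (16,), "format": "NCHW", "dtype": "float16"}
max_desc = {"shape": (16,), "ori_shape": (16,), "format": "NCHW", "dtype": "float16"}
y_desc = dict(x_desc)

fake_quant_perchannel(x_desc, min_desc, max_desc, y_desc,
                      symmetric=False, narrow_range=False, num_bits=8,
                      channel_axis=1, kernel_name="fake_quant_perchannel_demo")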
Example #3
def fake_quant_with_min_max_grad(dout, x, min_val, max_val, dx,
                                 num_bits, quant_delay, symmetric, narrow_range,
                                 kernel_name="fake_quant_with_min_max_grad"):
    """FakeQuantWithMinMaxGrad"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", 'float16']
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    dout_data = tvm.placeholder(input_shape, name="dout", dtype=x_dtype)
    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res = fake_quant_with_min_max_grad_compute(dout_data, input_data, min_data, max_data, quant_min,
                                               quant_max, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_data, input_data, min_data, max_data, res]
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
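The quantization range used above follows directly from num_bits, symmetric and narrow_range. A self-contained sketch of that arithmetic (plain Python, no TBE dependencies; the helper name is introduced here for illustration only):

def quant_range(num_bits, symmetric, narrow_range):
    # mirrors the branch above: symmetric ranges are centred on zero,
    # asymmetric ranges start at zero; narrow_range drops the lowest code
    if symmetric:
        quant_min = -2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1
    return quant_min, quant_max

# with 8 bits: (0, 255) asymmetric, (-128, 127) symmetric, (-127, 127) symmetric + narrow_range
assert quant_range(8, False, False) == (0, 255)
assert quant_range(8, True, False) == (-128, 127)
assert quant_range(8, True, True) == (-127, 127)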
Example #4
    def check_param_common(self):
        """
        Check parameter

        Parameters
        ----------
        None

        Returns
        -------
        None
        """
        util.check_kernel_name(self.kernel_name)
        util.check_shape_rule(self.indices_shape)
        util.check_shape_rule(self.grad_shape)

        util.check_shape_size(self.indices_shape, SHAPE_SIZE_LIMIT)
        util.check_shape_size(self.grad_shape, SHAPE_SIZE_LIMIT)

        check_list_indices_dtype = ("int32", "int64")

        util.check_dtype_rule(self.indices_dtype, check_list_indices_dtype)
        util.check_dtype_rule(self.grad_dtype, ("float32",))

        if self.grad_shape[1:] != self.var_shape[1:]:
            raise RuntimeError(
                "grad's shape must be the same as var's shape"
                " except first dimension")

        if len(self.indices_shape) != 1:
            raise RuntimeError(
                "indices must be one-dimensioal")

        if self.grad_shape[0] != self.indices_shape[0]:
            raise RuntimeError("grad must be the same shape as indices in "
                               "first dimension")
def fake_learned_scale_quant_perlayer_grad_d_reduce(
        dout_alpha,
        dalpha,
        kernel_name="fake_learned_scale_quant_perlayer_grad_d_reduce"):
    """FakeLearnedScaleQuantPerLayerGradDReduce"""

    dout_alpha_shape = dout_alpha.get("shape")
    dout_alpha_dtype = dout_alpha.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(dout_alpha_shape)
    util.check_tensor_shape_size(dout_alpha_shape)

    check_list = ["float32", 'float16']
    dout_alpha_dtype = dout_alpha_dtype.lower()
    util.check_dtype_rule(dout_alpha_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, dout_alpha_shape[:]), )

    dout_alpha_data = tvm.placeholder(input_shape,
                                      name="dout_alpha",
                                      dtype=dout_alpha_dtype)
    res = fake_learned_scale_quant_perlayer_grad_d_reduce_compute(
        dout_alpha_data, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_alpha_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
def fake_quant_minmax_update(x, min_val, max_val, min_up, max_up,
                             ema, ema_decay, symmetric, narrow_range, training, num_bits,
                             kernel_name="fake_quant_minmax_update"):
    """FakeQuantPerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]),)
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    if symmetric:
        quant_min = 0 - 2 ** (num_bits - 1)
        quant_max = 2 ** (num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2 ** num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = fake_quant_minmax_update_compute(input_data, min_data, max_data,
                                                ema, ema_decay, quant_min, quant_max, training, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(sch, config)
Example #7
def minmax_update_perchannel(x,
                             min_val,
                             max_val,
                             min_up,
                             max_up,
                             ema,
                             ema_decay,
                             channel_axis,
                             kernel_name="minmax_update_perchannel"):
    """MinMaxUpdatePerChannel op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if channel_axis == 0:
        shape_c = min_val.get("ori_shape")
    else:
        shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = minmax_update_perchannel_compute(input_data, min_data, max_data,
                                                ema, ema_decay, channel_axis)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
def minmax_update_perlayer(x,
                           min_val,
                           max_val,
                           min_up,
                           max_up,
                           ema,
                           ema_decay,
                           kernel_name="minmax_update_perlayer"):
    """MinMaxUpdatePerLayer op"""
    input_shape = x.get("shape")
    input_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    min_shape = util.scalar2tensor_one(min_shape)
    max_shape = util.scalar2tensor_one(max_shape)
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(min_shape, 1, 1, 1)
    util.check_shape_rule(max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = input_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    input_shape = (functools_reduce(lambda x, y: x * y, input_shape[:]), )
    shape_min, _, _ = util.produce_shapes(min_shape, input_shape)

    input_data = tvm.placeholder(input_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_min, name="min_data", dtype=min_dtype)
    max_data = tvm.placeholder(shape_min, name="max_data", dtype=max_dtype)
    res_list = minmax_update_perlayer_compute(input_data, min_data, max_data,
                                              ema, ema_decay)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #9
def addcdiv(x1, x2, x3, y=None, alpha=1.0, kernel_name="addcdiv"):

    check_list = ("float16", "float32")

    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()

    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()

    shape_x3 = x3.get("shape")
    dtype_x3 = x3.get("dtype").lower()

    util.check_shape_rule(shape_x1)    # check the operator's shape: 1 to 8 dimensions are allowed
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)    # check the size of the first input's shape
    util.check_dtype_rule(dtype_x1, check_list)    # check the operator's input data type

    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x2, check_list)

    util.check_shape_rule(shape_x3)
    util.check_shape_size(shape_x3, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x3, check_list)

    if dtype_x1 != dtype_x2 or dtype_x1 != dtype_x3:
        raise RuntimeError("the type of x1, x2, x3 must be the same!")

    util.check_kernel_name(kernel_name)    # check the operator's kernel_name

    # take the maximum of each dimension across shape_x1, shape_x2 and shape_x3 as shape_max
    shape_x2, shape_x3, shape_max = broadcast_shapes(shape_x2, shape_x3)
    util.check_tensor_shape_size(shape_max)     # check shape_max
    shape_x1, _, shape_max = broadcast_shapes(shape_x1, shape_max)
    util.check_tensor_shape_size(shape_max)     # check shape_max
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)    # broadcast shape_x2 to shape_max
    shape_x3, _, _ = broadcast_shapes(shape_x3, shape_max)    # broadcast shape_x3 to shape_max

    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype_x1)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype_x2)
    data_x3 = tvm.placeholder(shape_x3, name="data_x3", dtype=dtype_x3)

    res = addcdiv_compute(data_x1, data_x2, data_x3, shape_max, alpha, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x1, data_x2, data_x3, res]}

    te.lang.cce.cce_build_code(schedule, config)
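A minimal usage sketch of addcdiv, assuming the same TBE environment as above; it only illustrates that x2 and x3 are broadcast against each other and then against x1 before the compute stage. Shapes, alpha and the kernel name are illustrative assumptions.

# Hedged usage sketch: (4, 1), (4, 8) and (1, 8) all broadcast to (4, 8).
x1_desc = {"shape": (4, 1), "dtype": "float32"}
x2_desc = {"shape": (4, 8), "dtype": "float32"}
x3_desc = {"shape": (1, 8), "dtype": "float32"}
addcdiv(x1_desc, x2_desc, x3_desc, alpha=0.5, kernel_name="addcdiv_demo")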
def check_param(x, grad, argmax, y, ksize, strides, padding, dtype, dilation,
                ceil_mode, kernel_name):
    """
    check that the parameters are valid; if any one is invalid, raise an error
    Parameters
    ----------
    x: dict,shape and datatype
    grad: dict,shape and datatype
    argmax: dict,shape and datatype
    y: dict,shape and datatype
    ksize: kernel or window size, minimum length is 4,
          just like [1, poolingWindowH, poolingWindowW, 1]
    strides: stride, minimum length is 4,
          just like [1, poolingStrideH, poolingStrideW, 1]
    padding: pad mode
    Returns
    -------
    None
    """
    y_shape = x.get("shape")
    y_dtype = x.get("dtype").lower()
    y_dtype_arg = y.get("dtype").lower()
    input_gard_shape = grad.get("shape")
    grad_dtype = grad.get("dtype").lower()
    argmax_shape = argmax.get("shape")
    argmax_dtype = argmax.get("dtype").lower()
    util.check_shape_rule(y_shape)
    util.check_shape_rule(input_gard_shape)
    util.check_shape_rule(argmax_shape)
    util.check_kernel_name(kernel_name)
    check_shape_5hd(y_shape)
    check_shape_5hd(input_gard_shape)
    util.check_tensor_shape_size(input_gard_shape)
    util.check_tensor_shape_size(argmax_shape)
    util.check_tensor_shape_size(y_shape)
    util.check_dtype_rule(grad_dtype, ("float16", "float32", "int32"))
    util.check_dtype_rule(argmax_dtype, ("uint16",))
    util.check_dtype_rule(y_dtype, ("float16", "float32", "int32"))

    if y_dtype != grad_dtype or y_dtype_arg != y_dtype:
        raise RuntimeError("The dtype of tensor must be same")

    if dtype != DT_INT32 and dtype != DT_INT64:
        raise RuntimeError(
            "The dtype of input max indice must be int32 or int64")

    check_output_dim_with_ksize_stride(padding, input_gard_shape, y_shape,
                                       ksize, strides, dilation, ceil_mode)
Example #11
File: conv3d.py  Project: gekowa/ascend-opp
def check_conv3d_dtype(fmp_dtype, w_dtype, res_dtype):
    """
    algorithm: check the input params of conv3d

    Parameters
    ----------

    fmp_dtype: the dtype of feature

    w_dtype: the dtype of filter

    res_dtype: the dtype of output

    Returns
    -------
    None
    """

    util.check_dtype_rule(fmp_dtype, ('float16', ))
    util.check_dtype_rule(w_dtype, ('float16', ))
    util.check_dtype_rule(res_dtype, ('float16', ))
Example #12
def conv_layer_fast_cce_para_check(shape_in, shape_w, in_dtype, w_dtype,
                                   res_dtype, padh, padw, strideh, stridew,
                                   bias, kernel_name):
    # conv shape check
    util.check_kernel_name(kernel_name)

    # conv data type check
    util.check_dtype_rule(in_dtype, ['float16'])
    util.check_dtype_rule(w_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float16'])

    if not isinstance(bias, bool):
        raise RuntimeError("bias dtype should be bool.")

    if isinstance(padh, list):
        if len(padh) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padh is a list." %
                               PAD_SHAPE_DIM)
        pad_top = padh[0]
        pad_bottom = padh[1]
    else:
        pad_top = padh
        pad_bottom = padh

    if isinstance(padw, list):
        if len(padw) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padw is a list." %
                               PAD_SHAPE_DIM)
        pad_left = padw[0]
        pad_right = padw[1]
    else:
        pad_left = padw
        pad_right = padw

    shape_in, shape_w = te.lang.cce.check_conv_shape(
        shape_in, shape_w, pad_top, pad_bottom, pad_left, pad_right, strideh,
        stridew, in_dtype, w_dtype, res_dtype)

    return shape_in, shape_w
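The padh/padw handling above accepts either a scalar (applied to both sides) or a two-element list (before/after) and rejects any other list length. A standalone sketch of just that normalisation, assuming PAD_SHAPE_DIM is 2 in this module (the constant itself is not shown here):

def _normalize_pad(pad, pad_shape_dim=2):
    # mirrors the padh/padw branches above; pad_shape_dim=2 is an assumption
    if isinstance(pad, list):
        if len(pad) != pad_shape_dim:
            raise RuntimeError("Dimension must be %d when pad is a list." % pad_shape_dim)
        return pad[0], pad[1]
    return pad, pad

assert _normalize_pad([1, 2]) == (1, 2)   # explicit before/after padding
assert _normalize_pad(3) == (3, 3)        # scalar pads both sides equally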
Example #13
def cheak(x, y1, y2, axis, kernel_name):
    """
    Function: Check parameters (e.g. shape, dtype, etc.).
    Modify : 2020-08-03
    """
    util.check_kernel_name(kernel_name)

    shape = y1.get("shape")
    dtype = y1.get("dtype").lower()
    util.check_dtype_rule(dtype, ("float16"))
    util.check_shape_rule(shape)

    shape = y2.get("shape")
    dtype = y2.get("dtype").lower()
    util.check_dtype_rule(dtype, ("int32"))
    util.check_shape_rule(shape)

    shape = x.get("shape")
    dtype = x.get("dtype").lower()
    util.check_dtype_rule(dtype, ("float16"))
    util.check_shape_rule(shape)

    if axis == -1:
        axis = len(shape) - 1

    if axis != len(shape) - 1:
        raise RuntimeError("Dim should take the last one.")

    allnum = functools_reduce(lambda x, y: x * y, shape)

    num = shape[axis]

    if num > MAX_NUM:
        raise RuntimeError("Num in dim is too big (>7040).")

    return shape, dtype, allnum, num
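A minimal usage sketch of the checker above (shapes and dtypes are illustrative assumptions): it normalises axis == -1 to the last dimension and returns the shape and dtype of x together with the total element count and the size of the reduced axis.

# Hedged usage sketch, assuming MAX_NUM and the util helpers of this module.
x_desc = {"shape": (32, 128), "dtype": "float16"}
y1_desc = {"shape": (32, 1), "dtype": "float16"}
y2_desc = {"shape": (32, 1), "dtype": "int32"}
shape, dtype, allnum, num = cheak(x_desc, y1_desc, y2_desc, axis=-1,
                                  kernel_name="cheak_demo")
# expected: shape == (32, 128), dtype == "float16", allnum == 4096, num == 128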
Example #14
def smooth_l1_loss_v2(predict,
                      label,
                      loss,
                      sigma=1.0,
                      reduction="mean",
                      kernel_name="smooth_l1_loss_v2"):
    """
    calculating data

    Parameters
    ----------
    predict : dict
        shape and dtype of input
    label : dict
        shape and dtype of input
    loss : dict
        shape and dtype of output,
        should be same shape and type as input
    sigma: float
        sigma, default value is 1
    reduction: str
        type of result, default value is "mean"
    kernel_name : str
        kernel name, default value is "smooth_l1_loss_v2"

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    check_list = ("float16", "float32")

    shape_predict = predict.get("shape")
    dtype_predict = predict.get("dtype").lower()
    util.check_dtype_rule(dtype_predict, check_list)

    shape_label = label.get("shape")
    dtype_label = label.get("dtype").lower()
    util.check_dtype_rule(dtype_label, check_list)

    shape_loss = loss.get("shape")
    dtype_loss = loss.get("dtype").lower()
    util.check_dtype_rule(dtype_loss, check_list)

    util.check_shape_rule(shape_predict)
    util.check_shape_rule(shape_label)
    util.check_shape_rule(shape_loss)

    util.compare_tensor_dict_key(predict, label, "shape")

    check_list_reduction = ("none", "mean", "sum")
    reduction_type = reduction.lower()

    util.check_dtype_rule(reduction_type, check_list_reduction)

    input_predict = tvm.placeholder(shape_predict,
                                    name="predict",
                                    dtype=dtype_predict)
    input_label = tvm.placeholder(shape_label, name="label", dtype=dtype_label)

    res = smooth_l1_loss_v2_compute(input_predict, input_label, sigma,
                                    reduction_type)

    # TODO:auto schedule
    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    # TODO:operator build
    config = {
        "name": kernel_name,
        "tensor_list": [input_predict, input_label, res]
    }

    te.lang.cce.cce_build_code(sch, config)
Example #15
def fake_quant_perchannel_grad(dout,
                               x,
                               min_val,
                               max_val,
                               dx,
                               symmetric,
                               narrow_range,
                               num_bits,
                               channel_axis,
                               kernel_name="fake_quant_perchannel_grad"):
    """FakeQuantPerChannelGrad"""
    x_shape = x.get("shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if symmetric:
        quant_min = 0 - 2**(num_bits - 1)
        quant_max = 2**(num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [1] * len(x_shape)
    shape_c[channel_axis] = min_val.get("ori_shape")[0]
    if x_format == "NC1HWC0" and channel_axis == 1:
        shape_c = min_val.get("shape")
    dout_data = tvm.placeholder(x_shape, name="dout", dtype=x_dtype)
    input_data = tvm.placeholder(x_shape, name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res = fake_quant_perchannel_grad_compute(dout_data, input_data, min_data,
                                             max_data, quant_min, quant_max,
                                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_data, input_data, min_data, max_data, res]
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #16
def addcmul(input_data, x1, x2, y, value=1.0, kernel_name="addcmul"):
    """
    algorithm: addcmul
    calculating data's addcmul, y = input_data + value * (x1 * x2)

    Parameters
    ----------
    input_data : dict
        shape and dtype of first input, only support float16, float32, int32, int8, uint8
    x1 : dict
        shape and dtype of second input, only support float16, float32, int32, int8, uint8
    x2 : dict
        shape and dtype of third input, only support float16, float32, int32, int8, uint8
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    value: float
        scaling coefficient, default value is 1.0
    kernel_name : str
        cce kernel name, default value is addcmul

    Returns
    -------
    None
    """
    shape_input = input_data.get("shape")
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_input = input_data.get("dtype").lower()
    dtype_x1 = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_input)
    util.check_shape_size(shape_input, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x1)
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_input, check_list)
    util.check_dtype_rule(dtype_x1, check_list)
    util.check_dtype_rule(dtype_x2, check_list)
    if dtype_input != dtype_x1 or dtype_input != dtype_x2:
        raise RuntimeError("the type of input_data, x1, x2 must be same")

    shape_x1, shape_x2, shape_max1 = broadcast_shapes(shape_x1, shape_x2)
    util.check_tensor_shape_size(shape_max1)
    shape_input, _, shape_max = broadcast_shapes(shape_input, shape_max1)
    util.check_tensor_shape_size(shape_max)
    shape_x1, _, _ = broadcast_shapes(shape_x1, shape_max)
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)

    data_input = tvm.placeholder(shape_input, dtype=dtype_input, name="data_input")
    data_x1 = tvm.placeholder(shape_x1, dtype=dtype_x1, name="data_x1")
    data_x2 = tvm.placeholder(shape_x2, dtype=dtype_x2, name="data_x2")
    res = addcmul_compute(data_input, data_x1, data_x2, shape_max, y, value, kernel_name="addcmul")

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [data_input, data_x1, data_x2, res]

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(schedule, config)
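The docstring formula y = input_data + value * (x1 * x2) can be checked with a tiny NumPy reference; the concrete values below are illustrative assumptions, and the TBE kernel additionally broadcasts all three inputs to shape_max as shown above.

import numpy as np

# plain NumPy reference of the addcmul formula, for illustration only
input_data = np.array([1.0, 2.0, 3.0], dtype=np.float32)
x1 = np.array([2.0, 2.0, 2.0], dtype=np.float32)
x2 = np.array([3.0, 4.0, 5.0], dtype=np.float32)
value = 0.5

y = input_data + value * (x1 * x2)
assert np.allclose(y, [4.0, 6.0, 8.0])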
Example #17
def check_conv3dbp_input_params(
        shape_filter,  # pylint:disable=R0913,R0914,R0915
        shape_out_backprop,
        input_sizes,
        strides,
        pads,
        dilations,
        filter_dtype,
        out_backprop_dtype,
        res_dtype,
        kernel_name):
    """
    The params check function of conv3d backprop input

    Parameters:
    -------------------------
    shape_filter : The shape of filter.
                   5-D with shape (depth, height, width, batch, channels)

    shape_out_backprop : The shape of gradients.
                         5-D with shape [batch, depth, height, width, channels]

    input_sizes : The shape of feature map.
                  5-D with shape [batch, depth, height, width, channels].

    strides : A list of ints. The stride of the sliding window.

    pads : A list of ints.

    dilations : An optional list of ints. Only support [1, 1, 1, 1, 1] now.

    filter_dtype : The dtype of filter data. Default value is float16.

    out_backprop_dtype : The dtype of gradients data. Default value is float16

    res_dtype : The dtype of result(De/Dx) data. Default value is float16.

    kernel_name : Cce kernel name.
                  Default value is "conv3d_backprop_intput_cce"

    Returns : All transformed params.


    """
    def _check_attr_range(attr_name, attr_value, attr_min, attr_max):
        if attr_value < attr_min or attr_value > attr_max:
            dict_args = {
                'errCode': 'E60011',
                'range': '[{},{}]'.format(attr_min, attr_max),
                'attr_name': attr_name,
                'value': str(attr_value)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    def _check_64bits_limitation(attr_name, attr_value, dtype=None):
        if dtype is None:
            bit_ratio = BIT_RATIO_DICT.get("float16")
        else:
            bit_ratio = BIT_RATIO_DICT.get(dtype)
        if attr_value * bit_ratio > DATA_SIZE_MAX:
            dict_args = {
                'errCode': 'E60020',
                'attr_name': attr_name,
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    def _check_l1_limitation():
        block_size = 16
        w_value = dedy_w * stride_w
        if fmap_w > block_size:
            h_value_max = filter_h_dilation + 1
        elif block_size % fmap_w == 0:
            h_value_max = filter_h_dilation + block_size // fmap_w - 1
        else:
            h_value_max = filter_h_dilation + block_size // fmap_w + 1

        a_l1_size = h_value_max * w_value * \
                    ((filter_d_dilation - 2)//stride_d + 2) * block_size * 2
        b_l1_size = filter_h_dilation * filter_w_dilation * \
                    filter_d_dilation * block_size * block_size * 2
        l1_size = get_soc_spec("L1_SIZE")
        if (a_l1_size + b_l1_size) > l1_size:
            dict_args = {'errCode': 'E60022'}
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    def _check_shape_error():
        fmap_h_padding = fmap_h + pad_up + pad_down
        fmap_w_padding = fmap_w + pad_left + pad_right
        fmap_d_padding = fmap_deep + pad_head + pad_tail

        if fmap_channel != filter_channel:
            dict_args = {
                'errCode': 'E60108',
                'reason': "Shape error: Fmap's C must be equal to Filter'C."
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if dedy_channel != filter_batch:
            dict_args = {
                'errCode': 'E60108',
                'reason': "Shape error: Dedy's C must be equal to Filter'N."
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if fmap_batch != dedy_batch:
            dict_args = {
                'errCode': 'E62503',
                'backprop_N': str(dedy_batch),
                'forward_shape': str(fmap_batch)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if filter_h_dilation > fmap_h_padding:
            dict_args = {
                'errCode': 'E62507',
                'dim': 'H',
                'filter_dila': str(filter_h_dilation),
                'input_pad': str(fmap_h_padding)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if filter_w_dilation > fmap_w_padding:
            dict_args = {
                'errCode': 'E62507',
                'dim': 'W',
                'filter_dila': str(filter_w_dilation),
                'input_pad': str(fmap_w_padding)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if filter_d_dilation > fmap_d_padding:
            dict_args = {
                'errCode': 'E62507',
                'dim': 'D',
                'filter_dila': str(filter_d_dilation),
                'input_pad': str(fmap_d_padding)
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if ((fmap_h - filter_h_dilation + pad_up + pad_down) // stride_h +
                1) != dedy_h:
            dict_args = {
                'errCode': 'E60024',
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if ((fmap_w - filter_w_dilation + pad_left + pad_right) // stride_w +
                1) != dedy_w:
            dict_args = {
                'errCode': 'E60025',
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))
        if ((fmap_deep - filter_d_dilation + pad_head + pad_tail) // stride_d +
                1) != dedy_deep:
            dict_args = {
                'errCode': 'E62508',
            }
            raise RuntimeError(dict_args,
                               err_mana.get_error_message(dict_args))

    # Base check, Mainly required by interface appearance
    # ===========================================================
    # util check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_filter, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(shape_out_backprop, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(input_sizes, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(strides, STRIDES_SHAPE_DIM, STRIDES_SHAPE_DIM,
                          DEFAULT_MAX_SHAPE_NUM)

    # pads check
    if isinstance(pads, (tuple, list)) and \
            len(pads) != CONV_BACKPROP_PAD_SHAPE_DIM:
        dict_args = {
            'errCode': 'E62501',
            'param_name': 'pads',
        }
        raise RuntimeError(dict_args, err_mana.get_error_message(dict_args))

    if isinstance(pads, str) and pads not in ['SAME', 'VALID']:
        dict_args = {
            'errCode': 'E60000',
            'param_name': 'pads',
            'expected_value': 'SAME or VALID',
            'input_value': str(pads),
        }
        raise RuntimeError(dict_args, err_mana.get_error_message(dict_args))
    # dilations check
    util.check_shape_rule(dilations, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    dilation_n, dilation_d, dilation_h, dilation_w, dilation_c = dilations
    if dilation_n != 1 or dilation_c != 1:
        dict_args = {
            'errCode': 'E60023',
            'dilation_n': str(dilation_n),
            'dilation_c': str(dilation_c),
        }
        raise RuntimeError(dict_args, err_mana.get_error_message(dict_args))

    # dtype check
    filter_dtype = filter_dtype.lower()
    out_backprop_dtype = out_backprop_dtype.lower()
    res_dtype = res_dtype.lower()
    util.check_dtype_rule(filter_dtype, ['float16'])
    util.check_dtype_rule(out_backprop_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float16'])

    # the relation limits between shape
    shape_filter = list(shape_filter)
    shape_out_backprop = list(shape_out_backprop)
    input_sizes = list(input_sizes)
    strides = list(strides)
    fmap_batch, fmap_deep, fmap_h, fmap_w, fmap_channel = input_sizes
    dedy_batch, dedy_deep, dedy_h, dedy_w, dedy_channel = shape_out_backprop
    filter_depth, filter_h, \
    filter_w, filter_channel, filter_batch = shape_filter
    _, stride_d, stride_h, stride_w, _ = strides

    filter_h_dilation = (filter_h - 1) * dilation_h + 1
    filter_w_dilation = (filter_w - 1) * dilation_w + 1
    filter_d_dilation = (filter_depth - 1) * dilation_d + 1

    if pads == 'SAME':
        pad_h = align(fmap_h, stride_h) - stride_h + filter_h - fmap_h
        pad_h = max(pad_h, 0)
        pad_up = pad_h // 2
        pad_down = pad_h - pad_up
        pad_w = align(fmap_w, stride_w) - stride_w + filter_w - fmap_w
        pad_w = max(pad_w, 0)
        pad_left = pad_w // 2
        pad_right = pad_w - pad_left
        pad_d = align(fmap_deep, stride_d)\
                - stride_d + filter_depth - fmap_deep
        pad_d = max(pad_d, 0)
        pad_head = pad_d // 2
        pad_tail = pad_d - pad_head

        pads = [pad_head, pad_tail, pad_up, pad_down, pad_left, pad_right]
    elif pads == "VALID":
        pads = PADDING_VAILD
    # pads compute
    pads = list(pads)
    pad_head, pad_tail, pad_up, pad_down, pad_left, pad_right = pads

    fmap_h_padding = fmap_h + pad_up + pad_down
    fmap_w_padding = fmap_w + pad_left + pad_right

    # special cases
    dey_hw_min, fmap_hw_min = DEDY_HW_MIN, FMAP_HW_MIN
    # limitation by chip:
    # if kernel h,w in [1,11] and fmap h/w after padding equals to filter h/w
    # load3d support h,w is 1
    if (1 <= filter_h <= 11) and (1 <= filter_w <= 11) \
            and (fmap_h_padding == filter_h or fmap_w_padding == filter_w):
        dey_hw_min = 1
        fmap_hw_min = 1
    _check_shape_error()
    _check_l1_limitation()

    # Dedy value limit
    _check_attr_range("Dedy's H after expands", dedy_h * stride_h, dey_hw_min,
                      DEDY_HW_MAX)
    _check_attr_range("Dedy's W after expands", dedy_w * stride_w, dey_hw_min,
                      DEDY_HW_MAX)

    # filter value limit
    _check_attr_range("filter's H", filter_h, FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range("filter's W", filter_w, FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range("filter's D", filter_depth, FILTER_HW_MIN, FILTER_D_MAX)

    _check_attr_range("filter H*W", filter_h * filter_w, FILTER_HW_MIN,
                      FILTER_HW_SIZE)

    _check_attr_range("filter H*W*D", filter_h * filter_w * filter_depth,
                      FILTER_HW_MIN, KHWD_COEFF)

    # Fmap value limit
    _check_attr_range("Fmap's H", fmap_h, fmap_hw_min, FMAP_HW_MAX)
    _check_attr_range("Fmap's W", fmap_w, fmap_hw_min, FMAP_HW_MAX)

    # stride value limit
    _check_attr_range("stride's H", stride_h, STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range("stride's W", stride_w, STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range("stride's H*W", stride_h * stride_w, STRIDE_HW_MIN,
                      STRIDE_SIZE_MAX)
    _check_attr_range("stride's H*W*D", stride_h * stride_w * stride_d,
                      STRIDE_HW_MIN, STRIDE_SIZE_HWD_MAX)

    # check shape size, 64 bits limitation
    # ===========================================================
    c0_size = cce_params.C0_SIZE
    fmap_size = fmap_batch * align(fmap_channel, c0_size) \
                * fmap_deep * fmap_h * fmap_w
    dedy_size = dedy_batch * align(dedy_channel, c0_size) \
                * dedy_deep * dedy_h * dedy_w
    filter_size = align(filter_batch, c0_size) * \
    align(filter_channel, c0_size) * filter_depth * filter_h * filter_w
    _check_64bits_limitation("input", fmap_size, dtype=res_dtype)
    _check_64bits_limitation("out_backprop",
                             dedy_size,
                             dtype=out_backprop_dtype)
    _check_64bits_limitation("filter", filter_size, dtype=filter_dtype)

    result = (shape_filter, shape_out_backprop, input_sizes, strides, pads,
              dilations, filter_dtype, out_backprop_dtype, res_dtype,
              kernel_name)
    return result
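The 'SAME' branch above derives the pads per spatial dimension from an alignment helper. A standalone sketch of that arithmetic for one dimension; the _align helper is re-implemented here as an assumption (the original module imports align), and the example numbers are illustrative.

def _align(value, factor):
    # assumed behaviour of align(): round value up to a multiple of factor
    return ((value + factor - 1) // factor) * factor

def same_pad_1d(fmap, stride, kernel):
    # mirrors the per-dimension 'SAME' computation in the function above
    pad = max(_align(fmap, stride) - stride + kernel - fmap, 0)
    before = pad // 2
    return before, pad - before

# e.g. fmap = 7, stride = 2, kernel = 3 -> total pad 2, split as (1, 1)
assert same_pad_1d(7, 2, 3) == (1, 1)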
def fake_learned_scale_quant_perchannel_grad_d(
        dout,
        input_x,
        alpha,
        quant_max,
        dx,
        dalpha,
        neg_trunc,
        channel_axis,
        kernel_name="fake_learned_scale_quant_perchannel_grad_d"):
    """FakeLearnedScaleQuantPerChannelGradD"""
    input_shape = input_x.get("shape")
    input_x_shape_ = input_x.get("ori_shape")
    input_x_format = input_x.get("format")
    input_dtype = input_x.get("dtype")
    alpha_shape = alpha.get("ori_shape")
    alpha_dtype = alpha.get("dtype")
    quant_max_shape = quant_max.get("ori_shape")
    quant_max_dtype = quant_max.get("dtype")
    # for Dense weight quant, 2d[co,ci] -> 4d[1,co,ci,1], channel_axis_ needs to change to 1.
    if channel_axis == 0 and input_x_shape_[0] != alpha_shape[
            0] and input_x_shape_[1] == alpha_shape[0]:
        channel_axis_ = 1
    else:
        channel_axis_ = channel_axis

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(input_shape)
    util.check_shape_rule(alpha_shape, 1, 1, input_x_shape_[channel_axis_])
    util.check_shape_rule(quant_max_shape, 1, 1, 1)
    util.check_tensor_shape_size(input_shape)
    util.check_tensor_shape_size(alpha_shape)
    util.check_tensor_shape_size(quant_max_shape)

    check_list = ["float32", "float16"]
    input_dtype = input_dtype.lower()
    alpha_dtype = alpha_dtype.lower()
    quant_max_dtype = quant_max_dtype.lower()
    util.check_dtype_rule(input_dtype, check_list)
    util.check_dtype_rule(alpha_dtype, check_list)
    util.check_dtype_rule(quant_max_dtype, check_list)

    shape_c = [1] * len(input_shape)
    shape_c[channel_axis_] = alpha.get("ori_shape")[0]
    if input_x_format == "NC1HWC0" and channel_axis_ == 1:
        shape_c = alpha.get("shape")

    dout_data = tvm.placeholder(input_shape, name="dout", dtype=input_dtype)
    input_data = tvm.placeholder(input_shape, name="x", dtype=input_dtype)
    alpha_data = tvm.placeholder(shape_c, name="alpha_data", dtype=alpha_dtype)
    quant_max_data = tvm.placeholder(quant_max_shape,
                                     name="quant_max_data",
                                     dtype=quant_max_dtype)
    res = fake_learned_scale_quant_perchannel_grad_d_compute(
        dout_data, input_data, alpha_data, quant_max_data, neg_trunc,
        kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    tensor_list = [dout_data, input_data, alpha_data, quant_max_data
                   ] + list(res)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
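The channel_axis remap at the top of this function (and of fake_quant_perchannel above) only fires for the Dense weight case, where a 2D weight [co, ci] has been reshaped to 4D [1, co, ci, 1]. A standalone sketch of that decision, with illustrative shapes:

def _dense_channel_axis(x_ori_shape, alpha_ori_shape, channel_axis):
    # mirrors the remap above: if axis 0 no longer matches the per-channel
    # parameter but axis 1 does, the channel axis has moved to 1
    if (channel_axis == 0 and x_ori_shape[0] != alpha_ori_shape[0]
            and x_ori_shape[1] == alpha_ori_shape[0]):
        return 1
    return channel_axis

assert _dense_channel_axis([1, 64, 128, 1], [64], 0) == 1   # reshaped Dense weight
assert _dense_channel_axis([64, 32, 3, 3], [64], 0) == 0    # ordinary conv weight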
Example #19
def batchnorm_fold(x, x_sum, x_square_sum, mean, variance,
                   y, batch_mean, batch_std, running_mean, running_std, mean_updated, variance_updated,
                   momentum=0.9, epsilon=1e-5, is_training=True, freeze_bn=0, data_format="NCHW",
                   kernel_name="batchnorm_fold"):
    """batchnorm_fold TBE op"""
    momentum = 1.0 - momentum
    util.check_kernel_name(kernel_name)
    data_format = data_format.upper()
    if data_format != "NCHW":
        raise RuntimeError("The data_format only support NCHW")

    shape_x = x.get("shape")
    shape_mean = mean.get("shape")
    shape_variance = variance.get("shape")
    dtype_x = x.get("dtype")
    dtype_mean = mean.get("dtype")
    dtype_variance = variance.get("dtype")
    for shape in (shape_x, shape_mean, shape_variance):
        util.check_shape_rule(shape)
        util.check_tensor_shape_size(shape)
    check_tuple = ("float16", "float32")
    for dtype in (dtype_x, dtype_mean, dtype_variance):
        util.check_dtype_rule(dtype.lower(), check_tuple)

    format_data = x.get("format").upper()
    if format_data not in ("NCHW", "NC1HWC0"):
        raise RuntimeError("Format of input only support 4D and 5HD")

    if format_data == "NC1HWC0":
        if len(shape_x) != 5:
            raise RuntimeError("batchnorm_fold only support shape 5D"
                               "when input format is NC1HWC0")
        shape_mean = (1, shape_x[1], 1, 1, shape_x[4])
    elif format_data == "NCHW":
        if len(shape_x) < 2 or len(shape_x) > 4:
            raise RuntimeError("batchnorm_fold only support shape 2D to 4D")
        if shape_x[1] != shape_mean[0]:
            raise RuntimeError("data_format is NCHW, shape_bias must"
                               "be equal to the second axis of shape_x")
        shape_mean = (1, shape_x[1],)
        for _ in range(2, len(shape_x)):
            shape_mean = shape_mean + (1,)

    x_input = tvm.placeholder(shape_x, name="x_input", dtype=dtype_x.lower())
    x_sum = tvm.placeholder(shape_mean, name="x_sum", dtype=dtype_x.lower())
    x_square_sum = tvm.placeholder(shape_mean, name="x_square_sum", dtype=dtype_x.lower())
    mean = tvm.placeholder(shape_mean, name="mean", dtype=dtype_mean.lower())
    variance = tvm.placeholder(shape_mean, name="variance", dtype=dtype_variance.lower())

    shape_x = te.lang.cce.util.shape_to_list(x_input.shape)
    num = shape_x[0] * shape_x[2] * shape_x[3]
    num_rec = 1.0 / num

    # compute the mean of x
    batch_mean = te.lang.cce.vmuls(x_sum, num_rec)

    # compute the variance of x
    variance_div = te.lang.cce.vmuls(x_square_sum, num_rec)
    mean_square = te.lang.cce.vmul(batch_mean, batch_mean)
    batch_var_biased = te.lang.cce.vsub(variance_div, mean_square)

    if num == 1:
        batch_var_scaler = 0.0
    else:
        batch_var_scaler = float(num) / (num - 1)
    batch_variance = te.lang.cce.vmuls(batch_var_biased, batch_var_scaler)
    batch_std = te.lang.cce.vsqrt(te.lang.cce.vadds(batch_variance, epsilon))

    factor = 1.0 - momentum
    factor_reverse = momentum
    mean_mul = te.lang.cce.vmuls(batch_mean, factor)
    mean_mul_rev = te.lang.cce.vmuls(mean, factor_reverse)
    mean_updated = te.lang.cce.vadd(mean_mul, mean_mul_rev)

    var_mul = te.lang.cce.vmuls(batch_variance, factor)
    var_mul_rev = te.lang.cce.vmuls(variance, factor_reverse)
    variance_updated = te.lang.cce.vadd(var_mul, var_mul_rev)

    y = te.lang.cce.vadds(x_input, 0.0)
    running_mean = te.lang.cce.vadds(mean, 0.0)
    running_std = te.lang.cce.vsqrt(te.lang.cce.vadds(variance, epsilon))
    res = [y, batch_mean, batch_std, running_mean, running_std, mean_updated, variance_updated]

    with tvm.target.cce():
        sch = generic.auto_schedule(res)
    config = {"name": kernel_name,
              "tensor_list": [x_input, x_sum, x_square_sum, mean, variance] + res}
    te.lang.cce.cce_build_code(sch, config)
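The running-statistics update above reduces to a convex combination once momentum has been flipped to 1.0 - momentum at the top of the function. A plain-Python sketch of that arithmetic with illustrative numbers:

# with the default momentum=0.9 the current batch statistic gets weight 0.9
momentum_arg = 0.9             # value passed to batchnorm_fold
momentum = 1.0 - momentum_arg  # flipped at the top of the function -> 0.1
factor = 1.0 - momentum        # weight of the current batch statistic -> 0.9
factor_reverse = momentum      # weight of the stored running statistic -> 0.1

running_mean, batch_mean = 0.0, 1.0
mean_updated = factor * batch_mean + factor_reverse * running_mean
assert abs(mean_updated - 0.9) < 1e-9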
Example #20
def correction_mul_grad(dout,
                        x,
                        batch_std,
                        running_std,
                        dx,
                        d_batch_std,
                        channel,
                        kernel_name="correction_mul_grad"):
    """CorrectionMulGrad op"""
    shape_dout = dout.get("shape")
    shape_x = dout.get("shape")

    dtype_dout = dout.get("dtype")
    dtype_x = x.get("dtype")
    dtype_batch_std = batch_std.get("dtype")
    dtype_running_std = running_std.get("dtype")

    inp_dtype_dout = dtype_dout.lower()
    inp_dtype_x = dtype_x.lower()
    inp_dtype_batch_std = dtype_batch_std.lower()
    inp_dtype_running_std = dtype_running_std.lower()

    util.check_dtype_rule(inp_dtype_dout, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_x, ("float16", "float32"))
    util.check_dtype_rule(inp_dtype_batch_std, ("float32", ))
    util.check_dtype_rule(inp_dtype_running_std, ("float32", ))
    util.compare_tensor_dict_key(dout, x, "dtype")
    util.compare_tensor_dict_key(dout, x, "shape")
    util.compare_tensor_dict_key(dx, x, "shape")
    util.compare_tensor_dict_key(batch_std, running_std, "shape")
    util.compare_tensor_dict_key(batch_std, d_batch_std, "shape")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)

    data_format = dout.get("format")
    ori_format = dout.get("format")
    if data_format.upper() not in ("NC1HWC0", "NCHW"):
        raise RuntimeError("Un supported data format {}".format(data_format))
    if data_format.upper() == "NCHW" and ori_format != "NCHW":
        raise RuntimeError("data_format(NCHW) must same as ori_format")

    shape_c = [1] * len(shape_x)
    shape_c[channel] = batch_std.get("ori_shape")[0]
    if data_format == "NC1HWC0" and channel == 1:
        shape_c = batch_std.get("shape")

    dout_t = tvm.placeholder(shape_dout, name="dout", dtype=inp_dtype_dout)
    x_t = tvm.placeholder(shape_x, name="x", dtype=inp_dtype_x)
    batch_std_t = tvm.placeholder(shape_c,
                                  name="batch_std",
                                  dtype=inp_dtype_batch_std)
    running_std_t = tvm.placeholder(shape_c,
                                    name="running_std",
                                    dtype=inp_dtype_running_std)
    res_list = correction_mul_grad_compute(dout_t, x_t, batch_std_t,
                                           running_std_t, channel, data_format,
                                           kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [dout_t, x_t, batch_std_t, running_std_t] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
Example #21
def check_conv3dbp_filter_params(shape_x, shape_out_backprop, filter_sizes,
                                 strides, pads, dilations, x_dtype,
                                 out_backprop_dtype, res_dtype, kernel_name):
    """
    The params check function of conv3d_backprop_filter

    Parameters:
    ----------
    shape_x : The shape of feature map,
              which is 5-D [batch, depth, channels, height, width].

    shape_out_backprop : The shape of gradients,
                         which is 5-D [batch, depth, channels, height, width].

    filter_sizes : The shape of filter.
                   which is 5-D [batch, depth, channels, height, width].

    strides : The stride of the sliding window. A list of ints.

    pads : "SAME"or"VALID",
           indicating the type of pads algorithm to use, or list.

    dilations : An optional list of ints. Default value is [1, 1, 1, 1].

    x_dtype : Feature map data dtype. Default value is float16.

    out_backprop_dtype : Gradients data dtype. Default value is float16.

    res_dtype : Result(De/Dw) data dtype. Default value is float32.

    kernel_name : Kernel name of cce.
                  Default value is "conv3d_backprop_filter_cce"

    Returns : All transformed params.
    ----------
    """
    def _check_attr_range_dw(name, value, attr_min=None, attr_max=None):
        if not attr_min and not attr_max:
            return
        if not attr_min:
            if value > attr_max:
                args_dict = {
                    'errCode': 'E60011',
                    'range': '(,{}]'.format(attr_max),
                    'attr_name': name,
                    'value': str(value)
                }
                raise RuntimeError(args_dict,
                                   err_mana.get_error_message(args_dict))
        elif not attr_max:
            if value < attr_min:
                args_dict = {
                    'errCode': 'E60011',
                    'range': '[{},)'.format(attr_min),
                    'attr_name': name,
                    'value': str(value)
                }
                raise RuntimeError(args_dict,
                                   err_mana.get_error_message(args_dict))
        elif value > attr_max or value < attr_min:
            args_dict = {
                'errCode': 'E60011',
                'range': '[{},{}]'.format(attr_min, attr_max),
                'attr_name': name,
                'value': str(value)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    def _check_64bits_limitation(attr_name, attr_value, dtype=None):
        if dtype:
            bit_ratio = BIT_RATIO_DICT.get(dtype)
        else:
            bit_ratio = BIT_RATIO_DICT.get("float16")
        if attr_value * bit_ratio > DATA_SIZE_MAX:
            args_dict = {'errCode': 'E60020', 'attr_name': attr_name}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    # First : Base check, Mainly required by interface appearance
    # ===========================================================
    # util check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(shape_out_backprop, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(filter_sizes, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(strides, STRIDES_SHAPE_DIM, STRIDES_SHAPE_DIM,
                          DEFAULT_MAX_SHAPE_NUM)

    def _check_attr_pads():
        # pads check
        if isinstance(pads, (tuple, list)) and \
                len(pads) != PADDING_SHAPE_DIM:
            args_dict = {'errCode': 'E62501', 'param_name': 'pads'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

        if isinstance(pads, str) and pads not in PADDING_SUPPORT:
            args_dict = {
                'errCode': 'E60021',
                'expected_pad_mode': '[{}]'.format(PADDING_SUPPORT),
                'actual_pad_mode': str(pads)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    _check_attr_pads()

    # dilations check
    util.check_shape_rule(dilations, CONV3D_BACKPROP_SHAPE_DIM,
                          CONV3D_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    dilation_n, dilation_d, dilation_c, dilation_h, dilation_w = dilations
    _check_attr_range_dw("dilations's H", dilation_h, DILATION_MIN,
                         DILATION_MAX)
    _check_attr_range_dw("dilations's W", dilation_w, DILATION_MIN,
                         DILATION_MAX)

    if dilation_n != 1 or dilation_c != 1:
        args_dict = {
            'errCode': 'E60023',
            'dilation_n': str(dilation_n),
            'dilation_c': str(dilation_c)
        }
        raise RuntimeError(args_dict, err_mana.get_error_message(args_dict))

    # dtype check
    x_dtype = x_dtype.lower()
    out_backprop_dtype = out_backprop_dtype.lower()
    res_dtype = res_dtype.lower()
    util.check_dtype_rule(x_dtype, ['float16'])
    util.check_dtype_rule(out_backprop_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float32', 'float16'])

    # Second : Further Check, Mainly required by SRS
    # ===========================================================
    # the relation limits between shape
    shape_x = list(shape_x)
    shape_out_backprop = list(shape_out_backprop)
    filter_sizes = list(filter_sizes)
    strides = list(strides)
    fmap_batch, fmap_d, fmap_channel, fmap_h, fmap_w = shape_x
    dedy_batch, dedy_d, dedy_channel, dedy_h, dedy_w = shape_out_backprop
    filter_batch, filter_d, filter_channel, filter_h, filter_w = filter_sizes
    stride_d, stride_h, stride_w = strides

    filter_d_dilation = (filter_d - 1) * dilation_d + 1
    filter_h_dilation = (filter_h - 1) * dilation_h + 1
    filter_w_dilation = (filter_w - 1) * dilation_w + 1

    # pads compute
    if pads == 'SAME':
        pad_d = \
            align(fmap_d, stride_d) - stride_d + filter_d_dilation - fmap_d
        pad_d = max(pad_d, 0)
        pad_front = pad_d // 2
        pad_back = pad_d - pad_front
        pad_w = \
            align(fmap_w, stride_w) - stride_w + filter_w_dilation - fmap_w
        pad_w = max(pad_w, 0)
        pad_left = pad_w // 2
        pad_right = pad_w - pad_left
        pad_h = \
            align(fmap_h, stride_h) - stride_h + filter_h_dilation - fmap_h
        pad_h = max(pad_h, 0)
        pad_up = pad_h // 2
        pad_down = pad_h - pad_up
        pads = [pad_front, pad_back, pad_up, pad_down, pad_left, pad_right]
    elif pads == "VALID":
        pads = PADDING_VAILD
    pads = list(pads)
    pad_front, pad_back, pad_up, pad_down, pad_left, pad_right = pads
    if pad_front >= filter_d_dilation or pad_back >= filter_d_dilation:
        args_dict = {
            'errCode': 'E60013',
            'depth_of_pad': '{}, {}'.format(pad_front, pad_back),
            'depth_of_filter': '{}'.format(filter_d_dilation)
        }
        raise RuntimeError(args_dict, err_mana.get_error_message(args_dict))
    if pad_up >= filter_h_dilation or pad_down >= filter_h_dilation:
        args_dict = {
            'errCode': 'E60016',
            'h_of_filter': '{}'.format(filter_h_dilation),
            'h_of_pad': '{}, {}'.format(pad_up, pad_down)
        }
        raise RuntimeError(args_dict, err_mana.get_error_message(args_dict))
    if pad_left >= filter_w_dilation or pad_right >= filter_w_dilation:
        args_dict = {
            'errCode': 'E60017',
            'w_of_filter': '{}'.format(filter_w_dilation),
            'w_of_pad': '{}, {}'.format(pad_left, pad_right)
        }
        raise RuntimeError(args_dict, err_mana.get_error_message(args_dict))

    fmap_w_padding = fmap_w + pad_left + pad_right
    fmap_h_padding = fmap_h + pad_up + pad_down

    # special cases
    fmap_hw_min, dedy_hw_min = FMAP_HW_MIN, DEDY_HW_MIN
    # limitation by chip:
    # if kernel h,w in [1,11] and fmap h/w after padding equals to filter h/w
    # load3d support h,w is 1
    if (1 <= filter_w <= 11) and (1 <= filter_h <= 11) and (1 <= filter_d <= 11)\
            and (fmap_w_padding == filter_w or fmap_h_padding == filter_h):
        fmap_hw_min = 1
        dedy_hw_min = 1

    # Dedy value limit
    _check_attr_range_dw("Dedy's H", dedy_h, dedy_hw_min, DEDY_HW_MAX)
    _check_attr_range_dw("Dedy's W", dedy_w, dedy_hw_min, DEDY_HW_MAX)

    # filter value limit
    _check_attr_range_dw("filter's H", filter_h, FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range_dw("filter's W", filter_w, FILTER_HW_MIN, FILTER_HW_MAX)

    # Fmap value limit
    _check_attr_range_dw("Fmap's H", fmap_h, fmap_hw_min, FMAP_HW_MAX)
    _check_attr_range_dw("Fmap's W", fmap_w, fmap_hw_min, FMAP_HW_MAX)

    # stride value limit
    _check_attr_range_dw("stride's H", stride_h, STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range_dw("stride's W", stride_w, STRIDE_HW_MIN, STRIDE_HW_MAX)

    def _check_axis_hw():
        if fmap_batch != dedy_batch:
            args_dict = {
                'errCode': 'E62503',
                'backprop_N': str(dedy_batch),
                'forward_shape': str(fmap_batch)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if dedy_channel != filter_batch:
            args_dict = {
                'errCode': 'E62504',
                'backprop_C': str(dedy_channel),
                'forward_shape': str(filter_batch)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if fmap_channel != filter_channel:
            args_dict = {
                'errCode': 'E60010',
                'channel_of_x': str(fmap_channel),
                'channel_of_filter': str(filter_channel)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if filter_w_dilation > fmap_w_padding:
            args_dict = {
                'errCode': 'E60015',
                'w_of_x': str(fmap_w_padding),
                'w_of_filter': str(filter_w_dilation)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if filter_h_dilation > fmap_h_padding:
            args_dict = {
                'errCode': 'E60014',
                'h_of_x': str(fmap_h_padding),
                'h_of_filter': str(filter_h_dilation)
            }
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

        # Third : value check, Mainly required by the convolution rule
        if ((fmap_w - filter_w_dilation + pad_left + pad_right) // stride_w +
                1) != dedy_w:
            args_dict = {'errCode': 'E60025'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))
        if ((fmap_h - filter_h_dilation + pad_up + pad_down) // stride_h +
                1) != dedy_h:
            args_dict = {'errCode': 'E60024'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    _check_axis_hw()

    def _min_l1_byte():
        # Fourth : L1 limitation, Mainly required by chip
        al1_min_byte = C0 * C0 * 2

        if dedy_w % C0 == 0:
            bl1_min_byte = filter_h_dilation * fmap_w * C0 * 2
        else:
            bl1_min_byte = (filter_h_dilation + stride_h) * fmap_w * C0 * 2

        l1_size = get_soc_spec("L1_SIZE")  # L1 size
        if (al1_min_byte + bl1_min_byte) > l1_size:
            args_dict = {'errCode': 'E60022'}
            raise RuntimeError(args_dict,
                               err_mana.get_error_message(args_dict))

    _min_l1_byte()
    # Fifth : check shape size, 64 bits limitation
    c0_size = cce_params.C0_SIZE
    fmap_size = fmap_batch * fmap_d * align(fmap_channel,
                                            c0_size) * fmap_h * fmap_w
    dedy_size = dedy_batch * dedy_d * align(dedy_channel,
                                            c0_size) * dedy_h * dedy_w
    filter_size = \
        align(filter_batch, c0_size) * filter_d * align(filter_channel, c0_size) \
        * filter_h * filter_w
    _check_64bits_limitation("fmap_size", fmap_size, dtype=x_dtype)
    _check_64bits_limitation("dedy_size", dedy_size, dtype=out_backprop_dtype)
    _check_64bits_limitation("filter_size", filter_size, dtype=res_dtype)

    result = (shape_x, shape_out_backprop, filter_sizes, strides, pads,
              dilations, x_dtype, out_backprop_dtype, res_dtype, kernel_name)
    return result
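
The SAME-pad and output-size arithmetic above is the core of the _check_axis_hw convolution-rule check. Below is a minimal standalone sketch of that arithmetic, with made-up shape values for illustration (not taken from any real model):

def _same_pad_1d(fmap, stride, kernel_dilated):
    # total SAME padding along one axis, split front/back as in the check above
    total = max((fmap + stride - 1) // stride * stride - stride
                + kernel_dilated - fmap, 0)
    return total // 2, total - total // 2

fmap_h, stride_h, filter_h, dilation_h = 28, 2, 3, 1
filter_h_dilation = (filter_h - 1) * dilation_h + 1
pad_up, pad_down = _same_pad_1d(fmap_h, stride_h, filter_h_dilation)
# the convolution rule enforced by the E60024/E60025 branches
dedy_h = (fmap_h - filter_h_dilation + pad_up + pad_down) // stride_h + 1
assert dedy_h == (fmap_h + stride_h - 1) // stride_h  # SAME => ceil(fmap / stride)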
Example #22
0
    def check_param(self, var_out):
        """
        Check parameter

        Parameters
        ----------
        var_out: dict
            data of input
            datatype supports float32, float16, int32, int8, uint8
        Returns
        -------
        None
        """
        var_out_shape = var_out.get("shape")
        var_out_dtype = var_out.get("dtype").lower()
        if var_out_dtype == "bool":
            var_out_dtype = "int8"
        util.check_kernel_name(self.kernel_name)
        util.check_shape_rule(self.var_shape)
        util.check_shape_rule(self.indices_shape)
        util.check_shape_rule(self.updates_shape)

        util.check_tensor_shape_size(self.var_shape)
        util.check_tensor_shape_size(self.indices_shape)
        util.check_tensor_shape_size(self.updates_shape)
        util.check_tensor_shape_size(var_out_shape)

        check_list_indices = ("int32",)
        util.check_dtype_rule(self.indices_dtype, check_list_indices)
        check_list_var = ("float16", "float32", "int32", "int8", "uint8")
        util.check_dtype_rule(self.var_dtype, check_list_var)
        util.check_dtype_rule(self.updates_dtype, check_list_var)
        util.check_dtype_rule(var_out_dtype, check_list_var)

        if self.updates_dtype != self.var_dtype or var_out_dtype != self.var_dtype:
            raise RuntimeError(
                "dtype of updates:{} and var_out:{} must be the same as var:{}".format(
                    self.updates_dtype, var_out_dtype, self.var_dtype))

        if var_out_shape != self.var_shape:
            raise RuntimeError(
                "var_out's shape:{} must be the same as var's shape:{}".format(var_out_shape, self.var_shape))

        # broadcasting updates to var is not supported currently
        if self.var_shape != self.updates_shape:
            raise RuntimeError(
                "var's shape:{} must be the same as updates's shape:{}".format(
                    self.var_shape, self.updates_shape))

        if self.axis >= len(self.updates_shape):
            raise RuntimeError("axis:{} must be in range of updates shape:{} (len:{})".format(
                self.axis, self.updates_shape, len(self.updates_shape)))

        # indices must be 1-D and non-empty
        if len(self.indices_shape) != 1:
            raise RuntimeError("indices_shape:{} len:{} must be 1".format(
                self.indices_shape, len(self.indices_shape)))

        if self.indices_shape[0] != self.updates_shape[self.axis]:
            raise RuntimeError("indices num:{} != updates.shape[axis({})]:{}".format(
                self.indices_shape[0], self.axis, self.updates_shape[self.axis]))

        # indices are not sliced into UB currently, so all of them must fit in UB at once
        if (self.indices_dtype_bytes_size * self.indices_num) > (self.ub_size_bytes * 8 // 10):
            raise RuntimeError("indices num:{} is larger than the ub budget:{}".format(
                self.indices_num, self.ub_size_bytes))
def fake_quant_min_max_per_channel_update(
        x,
        min_val,
        max_val,
        min_up,
        max_up,
        ema,
        ema_decay,
        symmetric,
        narrow_range,
        training,
        num_bits,
        channel_axis,
        kernel_name="fake_quant_min_max_per_channel_update"):
    """FakeQuantPerLayer op"""
    x_shape = x.get("ori_shape")
    x_format = x.get("format")
    x_dtype = x.get("dtype")
    min_shape = min_val.get("ori_shape")
    min_dtype = min_val.get("dtype")
    max_shape = max_val.get("ori_shape")
    max_dtype = max_val.get("dtype")

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(x_shape)
    util.check_shape_rule(min_shape, 1, 1, x_shape[channel_axis])
    util.check_shape_rule(max_shape, 1, 1, x_shape[channel_axis])
    util.check_tensor_shape_size(x_shape)
    util.check_tensor_shape_size(min_shape)
    util.check_tensor_shape_size(max_shape)

    check_list = ["float32", "float16"]
    x_dtype = x_dtype.lower()
    min_dtype = min_dtype.lower()
    max_dtype = max_dtype.lower()
    util.check_dtype_rule(x_dtype, check_list)
    util.check_dtype_rule(min_dtype, check_list)
    util.check_dtype_rule(max_dtype, check_list)

    if symmetric:
        quant_min = 0 - 2**(num_bits - 1)
        quant_max = 2**(num_bits - 1) - 1
    else:
        quant_min = 0
        quant_max = 2**num_bits - 1
    if narrow_range:
        quant_min = quant_min + 1

    shape_c = [min_val.get("shape")[1], min_val.get("shape")[-1]]
    input_data = tvm.placeholder(x.get("shape"), name="x", dtype=x_dtype)
    min_data = tvm.placeholder(shape_c, name="min_val", dtype=x_dtype)
    max_data = tvm.placeholder(shape_c, name="max_val", dtype=x_dtype)
    res_list = fake_quant_min_max_per_channel_update_compute(
        input_data, min_data, max_data, ema, ema_decay, quant_min, quant_max,
        training, channel_axis, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res_list)

    tensor_list = [input_data, min_data, max_data] + list(res_list)
    config = {
        "print_ir": False,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
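
The quantization range above depends only on num_bits, symmetric and narrow_range. A standalone sketch of that computation with a few representative settings (values are illustrative):

def quant_range(num_bits, symmetric, narrow_range):
    if symmetric:
        quant_min, quant_max = -2 ** (num_bits - 1), 2 ** (num_bits - 1) - 1
    else:
        quant_min, quant_max = 0, 2 ** num_bits - 1
    if narrow_range:
        quant_min += 1
    return quant_min, quant_max

print(quant_range(8, symmetric=False, narrow_range=False))  # (0, 255)
print(quant_range(8, symmetric=True, narrow_range=True))    # (-127, 127)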
Example #24
0
def conv_layer_cce_para_check(shape_in, shape_w, in_dtype, w_dtype, res_dtype,
                              padh, padw, strideh, stridew, quantize_config,
                              scale_sqrt, scale_q_dtype, offset_q_dtype,
                              scale_dq_dtype, scale_rq_dtype, offset_rq_dtype,
                              offset_w_dtype, offset_pad_dtype, bias,
                              kernel_name):
    # conv shape check
    util.check_kernel_name(kernel_name)

    # conv data type check
    util.check_dtype_rule(in_dtype, ['float16', 'int8', 'uint8'])
    util.check_dtype_rule(w_dtype, ['float16', 'int8', 'uint8'])
    res_dtype_list = ['float16', 'int8', 'uint8']
    if is_v200_version():
        res_dtype_list.append('int32')
    util.check_dtype_rule(res_dtype, res_dtype_list)
    util.check_dtype_rule(scale_q_dtype, ['float16'])
    util.check_dtype_rule(offset_q_dtype, ['float16'])
    util.check_dtype_rule(scale_dq_dtype, ['float16'])
    util.check_dtype_rule(scale_rq_dtype, ['float16'])
    util.check_dtype_rule(offset_rq_dtype, ['float16'])
    util.check_dtype_rule(offset_w_dtype, ['int32'])
    util.check_dtype_rule(offset_pad_dtype, ['uint8'])

    if not isinstance(bias, bool):
        raise RuntimeError("bias dtype should be bool.")

    if quantize_config[0] == 0:
        if is_v200_version():
            util.check_dtype_rule(in_dtype, ('int8', ))
            util.check_dtype_rule(w_dtype, ('int8', ))
            util.check_dtype_rule(res_dtype, ('int32', ))
        else:
            util.check_dtype_rule(in_dtype, ['float16'])
            util.check_dtype_rule(w_dtype, ['float16'])
            util.check_dtype_rule(res_dtype, ['float16'])

    if quantize_config[0] == 1:
        util.check_dtype_rule(w_dtype, ['int8'])
        if quantize_config[1] == 0:
            util.check_dtype_rule(in_dtype, ['int8', 'float16'])
            util.check_dtype_rule(res_dtype, ['int8', 'float16'])
        elif quantize_config[1] == 1:
            util.check_dtype_rule(in_dtype, ['uint8', 'float16'])
            util.check_dtype_rule(res_dtype, ['uint8', 'float16'])
        elif quantize_config[1] == 2:
            raise RuntimeError("All Offset mode quantize not support.")
        else:
            raise RuntimeError("Invalid quantize algorithm.")

    # quantize switch on
    if quantize_config[0] == 1:
        quantize_turn_on = True
        # quantize -> DeQuantize dataflow
        if in_dtype == 'float16' and w_dtype == 'int8' and res_dtype == 'float16':
            pass
        # DeQuantize dataflow
        elif (in_dtype in ['int8', 'uint8'] and w_dtype == 'int8'
              and res_dtype == 'float16'):
            pass
        # quantize -> ReQuantize dataflow
        elif (in_dtype == 'float16' and w_dtype == 'int8'
              and res_dtype in ['int8', 'uint8']):
            pass
        # ReQuantize dataflow
        elif (in_dtype in ['int8', 'uint8'] and w_dtype == 'int8'
              and res_dtype in ['int8', 'uint8']):
            pass
        else:
            raise RuntimeError("Not support in/out data type for quantize.")

        if quantize_config not in ([1, 0, 0], [1, 1, 0], [1, 0, 1], [1, 1, 1]):
            raise RuntimeError("Invalid Quantize Config.")

        if scale_sqrt not in ([0, 0, 0], [1, 0, 0], [0, 1, 0], [1, 1, 0],
                              [0, 0, 1], [1, 0, 1], [0, 1, 1], [1, 1, 1]):
            raise RuntimeError("Invalid Quantize Config.")

    # quantize switch off
    elif quantize_config[0] == 0:
        if quantize_config != [0, 0, 0]:
            raise RuntimeError("Invalid Quantize Config.")
        if scale_sqrt != [0, 0, 0]:
            raise RuntimeError("Invalid Quantize Config.")
    else:
        raise RuntimeError("Invalid Quantize Config.")

    if isinstance(padh, list):
        if len(padh) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padh is a list." %
                               PAD_SHAPE_DIM)
        pad_top = padh[0]
        pad_bottom = padh[1]
    else:
        pad_top = padh
        pad_bottom = padh

    if isinstance(padw, list):
        if len(padw) != PAD_SHAPE_DIM:
            raise RuntimeError("Dimension must be %d when padw is a list." %
                               PAD_SHAPE_DIM)
        pad_left = padw[0]
        pad_right = padw[1]
    else:
        pad_left = padw
        pad_right = padw

    shape_in, shape_w = te.lang.cce.check_conv_shape(shape_in, shape_w, pad_top, pad_bottom, \
                                                    pad_left, pad_right, strideh, \
                                                    stridew, in_dtype, w_dtype, res_dtype)

    return shape_in, shape_w
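
When the quantize switch is on, the check above only accepts four in/out dtype combinations, one per dataflow. A hedged plain-Python restatement of that table (dataflow names taken from the comments above):

def quant_dataflow(in_dtype, w_dtype, res_dtype):
    if w_dtype != 'int8':
        raise ValueError("quantized weights must be int8")
    if in_dtype == 'float16' and res_dtype == 'float16':
        return 'quantize -> dequantize'
    if in_dtype in ('int8', 'uint8') and res_dtype == 'float16':
        return 'dequantize'
    if in_dtype == 'float16' and res_dtype in ('int8', 'uint8'):
        return 'quantize -> requantize'
    if in_dtype in ('int8', 'uint8') and res_dtype in ('int8', 'uint8'):
        return 'requantize'
    raise ValueError("unsupported in/out data types for quantize")

print(quant_dataflow('float16', 'int8', 'uint8'))  # quantize -> requantize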
Example #25
0
def check_conv2dbp_filter_params(shape_x, shape_out_backprop, filter_sizes,
                                 strides, pads, dilations, x_dtype,
                                 out_backprop_dtype, res_dtype, kernel_name):
    """
    The params check function of conv2d_backprop_filter

    Parameters:
    ----------
    shape_x : The shape of feature map,
              which is 4-D [batch, channels, height, width].

    shape_out_backprop : The shape of gradients,
                         which is 4-D [batch, channels, height, width].

    filter_sizes : The shape of filter,
                   which is 4-D [batch, channels, height, width].

    strides : The stride of the sliding window. A list of ints.

    pads : "SAME" or "VALID",
           indicating the type of padding algorithm to use, or a list.

    dilations : An optional list of ints. Default value is [1, 1, 1, 1].

    x_dtype : Feature map data dtype. Default value is float16.

    out_backprop_dtype : Gradients data dtype. Default value is float16.

    res_dtype : Result(De/Dw) data dtype. Default value is float32.

    kernel_name : Kernel name of cce.
                  Default value is "conv2d_backprop_filter_cce"

    Returns : All transformed params.
    ----------
    """
    def _align(input_x, input_y):
        if input_y == 0:
            dict_args = {}
            dict_args['errCode'] = "E60108"
            dict_args['reason'] = "Division by zero"
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
        return (input_x + input_y - 1) // input_y * input_y

    def _check_attr_range_dw(name, value, attr_min=None, attr_max=None):
        if not attr_min and not attr_max:
            return
        if not attr_min:
            if (not isinstance(value, int)) or value > attr_max:
                dict_args = {}
                dict_args['errCode'] = "E64001"
                dict_args['range'] = "(, {}]".format(attr_max)
                dict_args['attr_name'] = name
                dict_args["value"] = str(value)
                raise RuntimeError(dict_args,
                                   err_man.get_error_message(dict_args))
        elif not attr_max:
            if (not isinstance(value, int)) or value < attr_min:
                dict_args = {}
                dict_args['errCode'] = "E64001"
                dict_args['range'] = "[{}, )".format(attr_min)
                dict_args['attr_name'] = name
                dict_args["value"] = str(value)
                raise RuntimeError(dict_args,
                                   err_man.get_error_message(dict_args))
        elif(not isinstance(value, int)) or value > attr_max \
                or value < attr_min:
            dict_args = {}
            dict_args['errCode'] = "E64001"
            dict_args['range'] = "[{},{}]".format(attr_min, attr_max)
            dict_args['attr_name'] = name
            dict_args["value"] = str(value)
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    def _check_64bits_limitation(attr_name, attr_value, dtype=None):
        if dtype:
            bit_ratio = BIT_RATIO_DICT.get(dtype)
        else:
            bit_ratio = BIT_RATIO_DICT.get("float16")
        if attr_value * bit_ratio > DATA_SIZE_MAX:
            dict_args = {}
            dict_args['errCode'] = "E60020"
            dict_args['attr_name'] = attr_name
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    def _is_conv1d_situation():
        if fmap_h_padding == 1 and filter_h_dilation == 1 and stride_h == 1:
            return True
        return False

    def _is_load3d_special():
        # limitation by chip:
        # Ascend910
        # load3d not support when only fmap w after padding equals to filter w
        if get_soc_spec("SOC_VERSION") == 'Ascend910' \
            and fmap_h_padding != filter_h \
            and fmap_w_padding == filter_w:
            return False
        # limitation by chip:
        # if kernel h,w in [1,11]
        # and fmap h/w after padding equals to filter h/w
        # load3d support h,w is 1
        if (1 <= filter_h <= 11) and (1 <= filter_w <= 11) \
            and (fmap_h_padding == filter_h or fmap_w_padding == filter_w):
            return True
        return False

    # First : Base check, Mainly required by interface appearance
    # ===========================================================
    # util check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(shape_out_backprop, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(filter_sizes, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    util.check_shape_rule(strides, STRIDES_SHAPE_DIM, STRIDES_SHAPE_DIM,
                          DEFAULT_MAX_SHAPE_NUM)

    def _check_attr_pads():
        # pads check
        if isinstance(pads, (tuple, list)) and \
                len(pads) != CONV_BACKPROP_SHAPE_DIM:
            dict_args = dict()
            dict_args["errCode"] = "E60107"
            dict_args["param_name"] = "pads"
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

        if isinstance(pads, str) and pads not in PADDING_SUPPORT:
            dict_args = {}
            dict_args['errCode'] = "E60021"
            dict_args['expected_pad_mode'] = str(PADDING_SUPPORT)
            dict_args['actual_pad_mode'] = str(pads)

            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    _check_attr_pads()

    # dilations check
    util.check_shape_rule(dilations, CONV_BACKPROP_SHAPE_DIM,
                          CONV_BACKPROP_SHAPE_DIM, DEFAULT_MAX_SHAPE_NUM)
    dilation_n, dilation_c, dilation_h, dilation_w = dilations
    _check_attr_range_dw("dilations's H", dilation_h, DILATION_MIN,
                         DILATION_MAX)
    _check_attr_range_dw("dilations's W", dilation_w, DILATION_MIN,
                         DILATION_MAX)
    if dilation_n != 1 or dilation_c != 1:
        dict_args = {}
        dict_args["errCode"] = "E60023"
        dict_args["dilation_n"] = str(dilation_n)
        dict_args["dilation_c"] = str(dilation_c)
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    # dtype check
    x_dtype = x_dtype.lower()
    out_backprop_dtype = out_backprop_dtype.lower()
    res_dtype = res_dtype.lower()
    util.check_dtype_rule(x_dtype, ['float16'])
    util.check_dtype_rule(out_backprop_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float32', 'float16'])

    # Second : Further Check, Mainly required by SRS
    # ===========================================================
    # the relation limits between shape
    shape_x = list(shape_x)
    shape_out_backprop = list(shape_out_backprop)
    filter_sizes = list(filter_sizes)
    strides = list(strides)
    fmap_batch, fmap_channel, fmap_h, fmap_w = shape_x
    dedy_batch, dedy_channel, dedy_h, dedy_w = shape_out_backprop
    filter_batch, filter_channel, filter_h, filter_w = filter_sizes
    stride_h, stride_w = strides

    filter_h_dilation = (filter_h - 1) * dilation_h + 1
    filter_w_dilation = (filter_w - 1) * dilation_w + 1

    # pads compute
    if pads == 'SAME':
        pad_w = _align(fmap_w, stride_w) - stride_w + \
                filter_w_dilation - fmap_w
        pad_w = max(pad_w, 0)
        pad_left = pad_w // 2
        pad_right = pad_w - pad_left
        pad_h = _align(fmap_h, stride_h) - stride_h + \
                filter_h_dilation - fmap_h
        pad_h = max(pad_h, 0)
        pad_up = pad_h // 2
        pad_down = pad_h - pad_up
        pads = [pad_up, pad_down, pad_left, pad_right]
    elif pads == "VALID":
        pads = PADDING_VAILD
    pads = list(pads)
    pad_up, pad_down, pad_left, pad_right = pads
    if pad_up >= filter_h_dilation or pad_down >= filter_h_dilation:
        dict_args = dict()
        dict_args["errCode"] = "E64005"
        dict_args["direction"] = 'H'
        dict_args["pads_dir"] = "pad_up and pad_down"
        dict_args["pads_value"] = "[{}, {}]".format(pad_up, pad_down)
        dict_args["filter_value"] = str(filter_h_dilation)
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
    if pad_left >= filter_w_dilation or pad_right >= filter_w_dilation:
        dict_args = dict()
        dict_args["errCode"] = "E64005"
        dict_args["direction"] = 'W'
        dict_args["pads_dir"] = "pad_left and pad_right"
        dict_args["pads_value"] = "[{}, {}]".format(pad_left, pad_right)
        dict_args["filter_value"] = str(filter_w_dilation)
        raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    fmap_w_padding = fmap_w + pad_left + pad_right
    fmap_h_padding = fmap_h + pad_up + pad_down
    # special cases
    fmap_hw_min, dedy_hw_min = FMAP_HW_MIN, DEDY_HW_MIN
    dedy_hw_max, fmap_hw_max = DEDY_HW_MAX, FMAP_HW_MAX

    # exchanging h and w will not change data layout in memory
    if fmap_w_padding == 1 and filter_w == 1 and dedy_w == 1:
        shape_x = (fmap_batch, fmap_channel, fmap_w, fmap_h)
        shape_out_backprop = (dedy_batch, dedy_channel, dedy_w, dedy_h)
        filter_sizes = (filter_batch, filter_channel, filter_w, filter_h)
        strides = stride_w, stride_h
        dilations = dilation_n, dilation_c, dilation_w, dilation_h
        fmap_h_padding, fmap_w_padding = fmap_w_padding, fmap_h_padding
        dedy_h, dedy_w = dedy_w, dedy_h
        fmap_h, fmap_w = fmap_w, fmap_h
        filter_h, filter_w = filter_w, filter_h
        filter_h_dilation, filter_w_dilation = filter_w_dilation,\
                                               filter_h_dilation
    # limitation by chip:
    # if kernel h,w in [1,11] and fmap h/w after padding equals to filter h/w
    # load3d support h,w is 1
    if _is_load3d_special():
        fmap_hw_min = 1
        dedy_hw_min = 1

    # if conv1d situation, make sure w is in [1,2**31-1]
    if _is_conv1d_situation():
        dedy_hw_min = 1
        fmap_hw_min = 1
        dedy_hw_max = CONV1D_MAX_W
        fmap_hw_max = CONV1D_MAX_W

    # Dedy value limit
    _check_attr_range_dw("Dedy's H", dedy_h, dedy_hw_min, dedy_hw_max)
    _check_attr_range_dw("Dedy's W", dedy_w, dedy_hw_min, dedy_hw_max)

    # filter value limit
    _check_attr_range_dw("filter's H", filter_h, FILTER_HW_MIN, FILTER_HW_MAX)
    _check_attr_range_dw("filter's W", filter_w, FILTER_HW_MIN, FILTER_HW_MAX)

    # Fmap value limit
    _check_attr_range_dw("Fmap's H", fmap_h, fmap_hw_min, fmap_hw_max)
    _check_attr_range_dw("Fmap's W", fmap_w, fmap_hw_min, fmap_hw_max)

    # stride value limit
    _check_attr_range_dw("stride's H", stride_h, STRIDE_HW_MIN, STRIDE_HW_MAX)
    _check_attr_range_dw("stride's W", stride_w, STRIDE_HW_MIN, STRIDE_HW_MAX)

    def _check_axis_hw():
        if fmap_batch != dedy_batch:
            dict_args = {}
            dict_args['errCode'] = "E64002"
            dict_args['param1'] = "Fmap's N"
            dict_args['param2'] = "Dedy's N"
            dict_args['actual_value'] = "{}, {}".\
                format(fmap_batch, dedy_batch)
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
        if dedy_channel != filter_batch:
            dict_args = {}
            dict_args['errCode'] = "E64002"
            dict_args['param1'] = "Dedy's C"
            dict_args['param2'] = "Filter's N"
            dict_args['actual_value'] = "{}, {}". \
                format(dedy_channel, filter_batch)
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
        if fmap_channel != filter_channel:
            dict_args = {}
            dict_args['errCode'] = "E64002"
            dict_args['param1'] = "Fmap's C"
            dict_args['param2'] = "Filter's C"
            dict_args['actual_value'] = "{}, {}". \
                format(fmap_channel, filter_channel)
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
        if filter_w_dilation > fmap_w_padding:
            dict_args = dict()
            dict_args["errCode"] = "E60015"
            dict_args["w_of_x"] = str(fmap_w_padding)
            dict_args["w_of_filter"] = str(filter_w_dilation)
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
        if filter_h_dilation > fmap_h_padding:
            dict_args = dict()
            dict_args["errCode"] = "E60014"
            dict_args["h_of_x"] = str(fmap_h_padding)
            dict_args["h_of_filter"] = str(filter_h_dilation)
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

        # Third : value check, Mainly required by the convolution rule
        if ((fmap_w - filter_w_dilation + pad_left + pad_right) // stride_w +
                1) != dedy_w:
            dict_args = {}
            dict_args["errCode"] = "E60025"
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))
        if ((fmap_h - filter_h_dilation + pad_up + pad_down) // stride_h +
                1) != dedy_h:
            dict_args = {}
            dict_args["errCode"] = "E60024"
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    _check_axis_hw()

    def _min_l1_byte():
        # Fourth : L1 limitation, Mainly required by chip
        al1_min_byte = C0 * C0 * 2
        if not _is_conv1d_situation():
            kl1_min = fmap_w
        else:
            kl1_min = (C0 - 1) * stride_w + filter_w_dilation
        if dedy_w % C0 == 0:
            bl1_min_byte = filter_h_dilation * kl1_min * C0 * 2
        else:
            bl1_min_byte = (filter_h_dilation + stride_h) * kl1_min * C0 * 2

        l1_size = get_soc_spec("L1_SIZE")  # L1 size
        if (al1_min_byte + bl1_min_byte) > l1_size:
            dict_args = {}
            dict_args["errCode"] = "E60026"
            raise RuntimeError(dict_args, err_man.get_error_message(dict_args))

    _min_l1_byte()
    # Fifth : check shape size, 64 bits limitation
    c0_size = cce_params.C0_SIZE
    fmap_size = fmap_batch * _align(fmap_channel, c0_size) * fmap_h * fmap_w
    dedy_size = dedy_batch * _align(dedy_channel, c0_size) * dedy_h * dedy_w
    filter_size = \
        _align(filter_batch, c0_size) * _align(filter_channel, c0_size) \
        * filter_h * filter_w
    _check_64bits_limitation("fmap_size", fmap_size, dtype=x_dtype)
    _check_64bits_limitation("dedy_size", dedy_size, dtype=out_backprop_dtype)
    _check_64bits_limitation("filter_size", filter_size, dtype=res_dtype)

    result = (shape_x, shape_out_backprop, filter_sizes, strides, pads,
              dilations, x_dtype, out_backprop_dtype, res_dtype, kernel_name)
    return result
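
_min_l1_byte above estimates the smallest L1 footprint a tiling can reach. A rough standalone restatement of that estimate, assuming C0 = 16 and 2-byte float16 elements; the shape values are illustrative:

C0 = 16

def min_l1_bytes(dedy_w, fmap_w, filter_h_dilation, stride_h, stride_w,
                 filter_w_dilation, conv1d=False):
    al1 = C0 * C0 * 2                                       # one C0 x C0 fp16 tile
    kl1 = (C0 - 1) * stride_w + filter_w_dilation if conv1d else fmap_w
    rows = filter_h_dilation if dedy_w % C0 == 0 else filter_h_dilation + stride_h
    return al1 + rows * kl1 * C0 * 2                        # fmap rows kept in L1

print(min_l1_bytes(dedy_w=64, fmap_w=64, filter_h_dilation=3,
                   stride_h=1, stride_w=1, filter_w_dilation=3))  # 6656 bytes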
Example #26
0
def decode_bbox(box_predictions,
                anchors,
                decoded_boxes,
                decode_clip,
                kernel_name="decode_bbox"):
    """
    calculating data

    Parameters
    ----------
    box_predictions : shape and dtype of input
    anchors : shape and dtype of input
    decoded_boxes : shape and dtype of output,
                    should be same shape and type as input
    decode_clip : clip value used when decoding, within [0, 10]
    kernel_name : kernel name, default value is "decode_bbox"
    Returns
    -------
    None
    """

    # check param & data
    shape_box_predictions = box_predictions.get("shape")
    shape_anchors = anchors.get("shape")
    shape_decoded_boxes = decoded_boxes.get("shape")
    util.check_kernel_name(kernel_name)
    format_box_predictions = box_predictions.get("format")
    format_anchors = anchors.get("format")
    format_decoded_boxes = decoded_boxes.get("format")
    check_format_shape(format_box_predictions, format_anchors,
                       format_decoded_boxes)
    util.check_shape_rule(shape_box_predictions, CONFIG_THREE, CONFIG_FOUR,
                          None)
    util.check_shape_rule(shape_anchors, CONFIG_THREE, CONFIG_FOUR, None)
    util.check_shape_rule(shape_decoded_boxes, CONFIG_TWO, CONFIG_TWO, None)
    util.check_shape_size(shape_box_predictions, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_anchors, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_decoded_boxes, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(box_predictions.get("dtype").lower(), ("float16", ))
    util.check_dtype_rule(anchors.get("dtype").lower(), ("float16", ))
    util.check_dtype_rule(decoded_boxes.get("dtype").lower(), ("float16", ))
    if shape_box_predictions != shape_anchors:
        raise RuntimeError("the input shapes of box_predictions and anchors "
                           "must be the same")
    if (reduce(lambda x, y: x * y, shape_box_predictions[:])) \
            != (reduce(lambda x, y: x * y, shape_decoded_boxes[:])):
        raise RuntimeError("the input size (box_predictions and anchors) "
                           "is not equal to the output size (decoded_boxes)")
    if (shape_box_predictions[-1] == CONFIG_FOUR
            and len(shape_box_predictions) == CONFIG_THREE):
        if shape_decoded_boxes[1] != CONFIG_FOUR:
            raise RuntimeError("the output shape_decoded_boxes must be 4")
    else:
        if (shape_box_predictions[0] == CONFIG_FOUR
                and len(shape_box_predictions) == CONFIG_FOUR):
            if shape_decoded_boxes[0] != CONFIG_FOUR:
                raise RuntimeError("the output shape_decoded_boxes must be 4")
        else:
            raise RuntimeError("the input shape not in {(4,C,H,W), (H,W,4)}")
    if not isinstance(decode_clip, (float, int)):
        raise RuntimeError("input param decode_clip should be a float or int")
    if decode_clip < 0 or decode_clip > 10:
        raise RuntimeError(
            "input param decode_clip can't be negative and should be in [0, 10]!")
    # init the tiling shape
    print("shape_box_predictions", shape_box_predictions)
    shape = TilingFunc(shape_box_predictions)
    # calculate the deocede_bbox
    tik_instance = tik.Tik(tik.Dprofile())
    data_tensor = InitTensor(tik_instance, shape)
    if shape.input_shape[-1] == CONFIG_FOUR \
            and len(shape.input_shape) == CONFIG_THREE:
        decode_bbox_compute(tik_instance, shape, data_tensor, decode_clip,
                            kernel_name)
    if shape.input_shape[0] == CONFIG_FOUR \
            and len(shape.input_shape) == CONFIG_FOUR:
        decode_bbox_compute_transpose(tik_instance, shape, data_tensor,
                                      decode_clip, kernel_name)
    return tik_instance
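
The shape branching above admits exactly two box layouts. A small hedged sketch of that rule with made-up shapes:

def accepted_box_layout(shape):
    if len(shape) == 3 and shape[-1] == 4:
        return "(H, W, 4)"
    if len(shape) == 4 and shape[0] == 4:
        return "(4, C, H, W)"
    raise ValueError("shape not in {(4,C,H,W), (H,W,4)}")

print(accepted_box_layout((38, 38, 4)))     # (H, W, 4)
print(accepted_box_layout((4, 3, 38, 38)))  # (4, C, H, W)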
Example #27
0
def select_v2(condition, x1, x2, y, kernel_name="select_v2"):
    """
      Selects elements from `x1` or `x2`, depending on `condition`.

      Parameters
      ----------
      condition: dict
          dict of condition, include keys(shape and dtype),
          only support bool
      x1: dict
          dict of x1, only support float16, float32, int32, int8, uint8
      x2: dict
          dict of x2, only support float16, float32, int32, int8, uint8
      y: dict
          dict of output
      kernel_name: str
          cce kernel name, default value is "select_v2"

      Returns
      -------
      None
      """
    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype")
    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype")
    bool_dtype = condition.get("dtype")
    con_shape = condition.get("shape")

    shape_x1, con_shape, shape_max_x1 = util.produce_shapes(
        shape_x1, con_shape)
    shape_x2, con_shape, shape_max_x2 = util.produce_shapes(
        shape_x2, con_shape)

    if shape_x1[-1] == 1 and shape_x2[-1] == 1 and con_shape[-1] == 1 \
            and shape_max_x1[-1] == 1:
        shape_x1 = shape_x1 if len(shape_x1) == 1 else shape_x1[:-1]
        shape_x2 = shape_x2 if len(shape_x2) == 1 else shape_x2[:-1]
        con_shape = con_shape if len(con_shape) == 1 else con_shape[:-1]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x1)
    util.check_tensor_shape_size(shape_x1)

    if shape_x1 == shape_x2 == con_shape:
        shape_x1 = (functools_reduce(lambda x, y: x * y, shape_x1[:]), )
        shape_x2 = (functools_reduce(lambda x, y: x * y, shape_x2[:]), )
        con_shape = (functools_reduce(lambda x, y: x * y, con_shape[:]), )

    dtype_x1 = dtype_x1.lower()
    dtype_x2 = dtype_x2.lower()
    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_x1, check_list)
    if dtype_x1 != dtype_x2:
        raise RuntimeError("Dtype of tensor x1 and x2 must be equal!")

    bool_dtype = bool_dtype.lower()
    bool_check_list = ("bool", "int8", "uint8")
    util.check_dtype_rule(bool_dtype, bool_check_list)

    condition = tvm.placeholder(con_shape, name="condition", dtype=bool_dtype)
    input_then = tvm.placeholder(shape_x1, name="input_then", dtype=dtype_x1)
    input_else = tvm.placeholder(shape_x2, name="input_else", dtype=dtype_x2)

    with tvm.target.cce():
        res = select_v2_compute(condition, input_then, input_else, y,
                                kernel_name)
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [condition, input_then, input_else, res],
        "bool_storage_as_1bit": False
    }
    te.lang.cce.cce_build_code(sch, config)
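
When condition, x1 and x2 already share a shape, the code above collapses them to 1-D before creating the placeholders. A minimal illustration of that flattening (the shape is made up):

from functools import reduce as functools_reduce

shape = (2, 3, 4)
flat_shape = (functools_reduce(lambda x, y: x * y, shape[:]), )
print(flat_shape)  # (24,)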
Example #28
0
def ascend_dequant_s16(x0,
                       deq_scale,
                       x1,
                       y,
                       relu_flag=False,
                       kernel_name='ascend_dequant_s16'):
    """
    int32 -> int16

    Parameters:
    ----------
    x0 : the dict of input

    deq_scale: the dict of dequant num

    x1 : the input of add tensor

    y : the dict of output.

    relu_flag : the relu mode; when True, apply relu to the result

    kernel_name : cce kernel name, default value is "ascend_dequant_s16"

    Returns:
    -------
    None
    """

    shape_x0 = x0.get("shape")
    format_x0 = x0.get("format")
    dtype_x0 = x0.get("dtype")

    shape_deq = deq_scale.get("shape")
    format_deq = deq_scale.get("format")
    dtype_deq = deq_scale.get("dtype")

    check_list = [("int32", ), ("uint64", ), ("int16", )]
    format_list = ["NC1HWC0", "FRACTAL_NZ"]
    util.check_dtype_rule(dtype_x0, check_list[0])
    util.check_dtype_rule(dtype_deq, check_list[1])

    if format_x0 not in format_list:
        raise RuntimeError("x0 only supports formats [NC1HWC0, FRACTAL_NZ]")

    if format_x0 == "NC1HWC0":
        if len(shape_x0) != 5:
            raise ValueError(
                "x0 shape must be of length 5 when format is NC1HWC0")

    if format_x0 == "FRACTAL_NZ":
        if len(shape_x0) < 4:
            raise RuntimeError(
                "x0 shape length must be >= 4 when format is FRACTAL_NZ")

    if len(shape_deq) != 5:
        raise ValueError("deq_scale shape must be of length 5")

    if format_deq != "NC1HWC0":
        raise ValueError("deq_scale only supports NC1HWC0")

    if shape_deq[0] != 1 or shape_deq[2] != 1 or shape_deq[3] != 1:
        raise RuntimeError("deq_scale shape must be 1 in n, h and w")

    if format_x0 == "NC1HWC0":
        # n, C1, H*W, C0
        shape_x0 = [
            shape_x0[0], shape_x0[1], shape_x0[2] * shape_x0[3], shape_x0[4]
        ]

    ori_shape_deq = deq_scale.get("ori_shape")
    attr = {"ori_shape": ori_shape_deq}
    input_x0 = tvm.placeholder(shape_x0, dtype_x0, "x0")
    input_deq = tvm.placeholder(shape_deq,
                                name="deq_scale",
                                dtype=dtype_deq,
                                attrs=attr)
    input_x1 = None
    if x1:
        shape_bias = x1.get("shape")
        input_x1 = tvm.placeholder(shape_bias, "int16", "x1")

    with tvm.target.cce():
        res = ascend_dequant_s16_compute(input_x0, input_deq, input_x1,
                                         relu_flag, kernel_name)
        generic.auto_schedule(res)
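
For NC1HWC0 inputs the function above fuses H and W so the placeholder becomes rank-4 [N, C1, H*W, C0]. A tiny sketch with an assumed 5-D shape:

shape_x0 = [2, 4, 7, 7, 16]          # N, C1, H, W, C0 (illustrative)
n, c1, h, w, c0 = shape_x0
print([n, c1, h * w, c0])            # [2, 4, 49, 16]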
def conv_layer_cce(shape_in,
                   shape_w,
                   in_dtype,
                   w_dtype,
                   res_dtype,
                   padh,
                   padw,
                   strideh,
                   stridew,
                   bias=0,
                   kernel_name="conv_layer_cce",
                   need_build=0,
                   need_print=0):
    """

    Parameters
    ----------
    shape_in : shape of data_in

    shape_w : shape of filter

    in_dtype : the feature map data type

    w_dtype : the weight data type

    res_dtype : the result data type

    padh: the padding shape in H

    padw: the padding shape in Weight

    strideh: the stride value in H

    stridew: the stride value in Weight

    quantizeConfig: quantize config table, default [0, 0, 0]
    quantizeConfig[0] - quantize function switch
                        0: quantize off
                        1: quantize on
    quantizeConfig[1] - QuantizeAlgorithm
                        0: non offset
                        1: half offset
                        2: all offset ( Not supported now )
    quantizeConfig[2] - QuantizeScaleType (for Dequantize/Requantize, quantize always scalar)
                        0: scalar
                        1: vector

    scaleSqrt: scale mode
    scaleSqrt[0] - Quantize scale mode
                   0: non sqrt
                   1: sqrt
    scaleSqrt[1] - DeQuantize scale mode
                   0: non sqrt
                   1: sqrt
    scaleSqrt[2] - ReQuantize scale mode
                   0: non sqrt
                   1: sqrt

    scaleQ_dtype: Quantize scale data type, default 'float16'

    offsetQ_dtype: Quantize offset data type, default 'float16'

    scaleDq_dtype: DeQuantize scale data type, default 'float16'

    scaleRq_dtype: ReQuantize scale data type, default 'float16'

    offsetRq_dtype: ReQuantize offset data type, default 'float16'

    offsetW_dtype: Weight offset data type, default 'int32'

    offsetPad_dtype: Quantize Cube offset data type, default 'uint8'

    bias: the tag for bias or not

    kernel_name : cce kernel name, default value is "conv_layer_cce"

    need_build : if need to build the CCEC kernel, default value is 0 (False)

    need_print : if need to print the IR, default value is 0 (False)

    Returns
    -------
    None

    """
    # for pylint, otherwise "Dangerous default value [] as argument"
    #    if quantizeConfig is None:
    #        quantizeConfig = [0, 0, 0]
    #    if scaleSqrt is None:
    #        scaleSqrt = [0, 0, 0]

    # conv shape check
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_in, CONV_SHAPE_DIM, CONV_SHAPE_DIM)
    util.check_shape_rule(shape_w, CONV_SHAPE_DIM, CONV_SHAPE_DIM)

    in_dtype = in_dtype.lower()
    w_dtype = w_dtype.lower()
    res_dtype = res_dtype.lower()
    #    scaleQ_dtype = scaleQ_dtype.lower()
    #    offsetQ_dtype = offsetQ_dtype.lower()
    #    scaleDq_dtype = scaleDq_dtype.lower()
    #    scaleRq_dtype = scaleRq_dtype.lower()
    #    offsetRq_dtype = offsetRq_dtype.lower()
    #    offsetW_dtype = offsetW_dtype.lower()
    #    offsetPad_dtype = offsetPad_dtype.lower()

    # conv data type check
    util.check_dtype_rule(in_dtype, ['float16', 'int8', 'uint8'])
    util.check_dtype_rule(w_dtype, ['float16', 'int8', 'uint8'])
    util.check_dtype_rule(res_dtype, ['float16', 'int8', 'uint8'])
    #    util.check_dtype_rule(scaleQ_dtype, ['float16'])
    #    util.check_dtype_rule(offsetQ_dtype, ['float16'])
    #    util.check_dtype_rule(scaleDq_dtype, ['float16'])
    #    util.check_dtype_rule(scaleRq_dtype, ['float16'])
    #    util.check_dtype_rule(offsetRq_dtype, ['float16'])
    #    util.check_dtype_rule(offsetW_dtype, ['int32'])
    #    util.check_dtype_rule(offsetPad_dtype, ['uint8'])

    #    if quantizeConfig[0] == 0:
    util.check_dtype_rule(in_dtype, ['float16'])
    util.check_dtype_rule(w_dtype, ['float16'])
    util.check_dtype_rule(res_dtype, ['float16'])

    #    if quantizeConfig[0] == 1:
    #        util.check_dtype_rule(w_dtype, ['int8'])

    shape_in = list(shape_in)
    shape_w = list(shape_w)

    #    shape_in, shape_w = te.lang.cce.check_conv_shape(shape_in, shape_w, padh, padw, strideh,
    #                                                     stridew, in_dtype, w_dtype, res_dtype)

    #    if shape_in[1]!=shape_w[1]:
    #        raise RuntimeError("shape_in[1] must equal to shape_w[1]")

    block_size_K = CUBE_MKN[in_dtype]['mac'][1]
    shape_in[1] = (
        (shape_in[1] + block_size_K - 1) // block_size_K) * block_size_K
    shape_w[1] = shape_in[1]

    hi = shape_in[2]
    wi = shape_in[3]
    hk = shape_w[2]
    wk = shape_w[3]
    h_out = 0
    w_out = 0
    #    print(hi)
    #    print(wi)
    #    print(hk)
    #    print(wk)
    #   print(strideh)
    #    print(stridew)
    #    print(padh)
    #    print(padw)
    if strideh != 0:
        h_out = (hi + (2 * padh) - hk) / strideh + 1  # calculated by hi and wi
    if stridew != 0:
        w_out = (wi + (2 * padw) - wk) / stridew + 1  # calculated by hi and wi

    if h_out <= 0:
        raise RuntimeError(
            "h_out must >0, h_out = (hi + (2 * padh) - hk) / strideh + 1")
    if w_out <= 0:
        raise RuntimeError(
            "w_out must >0, w_out = (wi + (2 * padw) - wk) / stridew + 1")

    if padh > hk:
        raise RuntimeError("kernel H must >= Pad H")

    if (shape_in[0] * w_out * h_out * hk * wk *
            CUBE_MKN[w_dtype]['mac'][1]) > (np.int64(2**31) - 1):
        raise RuntimeError("im2col shape exceed 32bit limitation")

    conv_check_rule(shape_in, shape_w, in_dtype, w_dtype, padh, padw, strideh,
                    stridew)

    if res_dtype in ['int8', 'uint8']:
        w_block_size_K = CUBE_MKN[w_dtype]['mac'][1]
        shape_w[0] = ((shape_w[0] + w_block_size_K - 1) //
                      w_block_size_K) * w_block_size_K
    else:
        w_block_size_N = CUBE_MKN[w_dtype]['mac'][2]
        shape_w[0] = ((shape_w[0] + w_block_size_N - 1) //
                      w_block_size_N) * w_block_size_N

    # padh, padw check
    if padh < PAD_MIN or padh > PAD_MAX:
        raise RuntimeError("padh must be in [0,255].")
    if padw < PAD_MIN or padw > PAD_MAX:
        raise RuntimeError("padw must be in [0,255].")

    # strideh, stridew check
    if strideh < STRIDE_MIN or strideh > STRIDE_MAX:
        raise RuntimeError("strideh must be in [1,63].")
    if stridew < STRIDE_MIN or stridew > STRIDE_MAX:
        raise RuntimeError("stridew must be in [1,63].")

    # filterH, filterW check
    if shape_w[2] < FILTER_HW_MIN or shape_w[2] > FILTER_HW_MAX:
        raise RuntimeError("filterh must be in [1,255].")
    if shape_w[3] < FILTER_HW_MIN or shape_w[3] > FILTER_HW_MAX:
        raise RuntimeError("filterw must be in [1,255].")

    # tiling check, filterH*inputC*inputW*sizeof(in_dtype) < half of(L1_BUFFER)
    SIZE_OF_L1_BUFFER = cce_product.getParams("L1_Buffer")  # bytes

    if (in_dtype == 'float16'):
        if (shape_w[2]) * (shape_in[1]) * (shape_in[3]) * SIZE_OF_FP16 > (
                SIZE_OF_L1_BUFFER / 2):
            raise RuntimeError("min cut is out of half of L1 memory.")

    if (in_dtype == 'int8' or in_dtype == 'uint8'):
        if (shape_w[2]) * (shape_in[1]) * (shape_in[3]) * SIZE_OF_8BIT > (
                SIZE_OF_L1_BUFFER / 2):
            raise RuntimeError("min cut is out of half of L1 memory.")

    # quantize switch on


#    if quantizeConfig[0] == 1:
#        quantizeTurnOn = True
# quantize -> DeQuantize dataflow
#        if (in_dtype == 'float16' and w_dtype == 'int8' and res_dtype == 'float16'):
#            isQuantize = True
#            isDeQuantize = True
#            isReQuantize = False
# DeQuantize dataflow
#        elif ((in_dtype == 'int8' or in_dtype == 'uint8') and w_dtype == 'int8' and res_dtype == 'float16'):
#            isQuantize = False
#            isDeQuantize = True
#            isReQuantize = False
# quantize -> ReQuantize dataflow
#        elif (in_dtype == 'float16' and w_dtype == 'int8' and (res_dtype == 'int8' or res_dtype == 'uint8')):
#            isQuantize = True
#            isDeQuantize = False
#            isReQuantize = True
# ReQuantize dataflow
#        elif ((in_dtype == 'int8' or in_dtype == 'uint8') and w_dtype == 'int8' and (res_dtype == 'int8' or res_dtype == 'uint8')):
#            isQuantize = False
#            isDeQuantize = False
#            isReQuantize = True
#        else:
#            raise RuntimeError("Not support in/out data type for quantize.")
# quantize switch off
#    elif quantizeConfig[0] == 0:
    quantizeTurnOn = False
    isQuantize = False
    isDeQuantize = False
    isReQuantize = False
    #    else:
    #        raise RuntimeError("Invalid Quantize Config.")

    # - - - # - - - # - - - - - - - # - - - - - - # - - - # - - - # - - - - #
    # 07    | 06    | 05      04    | 03          | 02    | 01    | 00      #
    # QSqrt | scale | offset        | ReQ         | DeQ   | Quan  | Switch  #
    # - - - # - - - # - - - # - - - # - - - - - - # - - - # - - - # - - - - #
    # 15    | 14    | 13    | 12    | 11          | 10    | 09    | 08      #
    # Null  | Null  | Null  | Null  |in_dsl_flag  | bias  | RqSqrt| DqSqrt  #
    # - - - # - - - # - - - # - - - # - - - # - - - # - - - # - - - - #
    # in_dsl_flag     #0: implement conv by ir directly, it's not preferred
    #                 #1: implement conv by dsl, it's the default way
    #    in_dsl_flag = 1  # 0 for old conv
    #    te.lang.cce.conv_param.tiling = tiling

    model_config = (1 if quantizeTurnOn else 0)     \
        | (1 if isQuantize else 0) << 1    \
        | (1 if isDeQuantize else 0) << 2  \
        | (1 if isReQuantize else 0) << 3  \
        | 0 << 4           \
        | 0 << 6           \
        | 0 << 7                \
        | 0 << 8                \
        | 0 << 9                \
        | (1 if bias else 0) << 10         \
        | 1 << 11

    with tvm.target.cce():
        Data = tvm.placeholder(shape_in, name='Fmap', dtype=in_dtype)
        Weight = tvm.placeholder(shape_w, name='Filter', dtype=w_dtype)

        # bias or fusion_bias(half offset)
        if bias or (model_config & 0x31 == 0x11):
            Bias = tvm.placeholder(
                (shape_w[0], ),
                name='Bias',
                dtype="int32" if quantizeTurnOn else "float16")
        # bias or fusion_bias(all offset)
        elif bias or (model_config & 0x31 == 0x21):
            Bias = tvm.placeholder(
                (shape_w[0], ),
                name='Bias',
                dtype="uint32" if quantizeTurnOn else "float16")

        # quantize on
        if quantizeTurnOn:
            QuantizeAlgorithm = quantizeConfig[1]
            if isQuantize:
                scaleQ = tvm.placeholder((CUBE_MKN[scaleQ_dtype]['mac'][1], ),
                                         name='scaleQ',
                                         dtype=scaleQ_dtype)
                if QuantizeAlgorithm == 1 or QuantizeAlgorithm == 2:
                    offsetQ = tvm.placeholder(
                        (CUBE_MKN[offsetQ_dtype]['mac'][1], ),
                        name='offsetQ',
                        dtype=offsetQ_dtype)

            if isDeQuantize:
                scaleDq_shape = (CUBE_MKN[scaleDq_dtype]['mac'][1],
                                 ) if quantizeConfig[2] == 0 else (
                                     shape_w[0], )
                scaleDq = tvm.placeholder(scaleDq_shape,
                                          name='scaleDq',
                                          dtype=scaleDq_dtype)

            if isReQuantize:
                scaleRq_shape = (CUBE_MKN[scaleRq_dtype]['mac'][1],
                                 ) if quantizeConfig[2] == 0 else (
                                     shape_w[0], )
                scaleRq = tvm.placeholder(scaleRq_shape,
                                          name='scaleRq',
                                          dtype=scaleRq_dtype)
                if QuantizeAlgorithm == 1 or QuantizeAlgorithm == 2:
                    offsetRq_shape = (CUBE_MKN[offsetRq_dtype]['mac'][1],
                                      ) if quantizeConfig[2] == 0 else (
                                          shape_w[0], )
                    offsetRq = tvm.placeholder(offsetRq_shape,
                                               name='offsetRq',
                                               dtype=offsetRq_dtype)
            # need offsetPad , for half offset and all offset
            if QuantizeAlgorithm == 1 or QuantizeAlgorithm == 2:
                offsetPad = tvm.placeholder(
                    (CUBE_MKN[offsetPad_dtype]['mac'][1], ),
                    name='offsetPad',
                    dtype=offsetPad_dtype)

            # non offset
            if QuantizeAlgorithm == 0:
                if bias:
                    if isQuantize:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleQ, scaleDq, res_dtype,
                                padh, padw, strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleQ, scaleRq, res_dtype,
                                padh, padw, strideh, stridew, model_config)

                    else:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleDq, res_dtype, padh,
                                padw, strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, Bias, scaleRq, res_dtype, padh,
                                padw, strideh, stridew, model_config)

                else:
                    if isQuantize:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleQ, scaleDq, res_dtype, padh,
                                padw, strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleQ, scaleRq, res_dtype, padh,
                                padw, strideh, stridew, model_config)

                    else:
                        if isDeQuantize:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleDq, res_dtype, padh, padw,
                                strideh, stridew, model_config)
                        else:
                            tensor_list = te.lang.cce.conv(
                                Data, Weight, scaleRq, res_dtype, padh, padw,
                                strideh, stridew, model_config)

            # half offset
            elif QuantizeAlgorithm == 1:
                if isQuantize:
                    if isDeQuantize:
                        tensor_list = te.lang.cce.conv(Data, Weight, Bias,
                                                       scaleQ, offsetQ,
                                                       scaleDq, offsetPad,
                                                       res_dtype, padh, padw,
                                                       strideh, stridew,
                                                       model_config)
                    else:
                        tensor_list = te.lang.cce.conv(Data, Weight, Bias,
                                                       scaleQ, offsetQ,
                                                       scaleRq, offsetRq,
                                                       offsetPad, res_dtype,
                                                       padh, padw, strideh,
                                                       stridew, model_config)

                else:
                    if isDeQuantize:
                        tensor_list = te.lang.cce.conv(Data, Weight, Bias,
                                                       scaleDq, offsetPad,
                                                       res_dtype, padh, padw,
                                                       strideh, stridew,
                                                       model_config)
                    else:
                        tensor_list = te.lang.cce.conv(Data, Weight, Bias,
                                                       scaleRq, offsetRq,
                                                       offsetPad, res_dtype,
                                                       padh, padw, strideh,
                                                       stridew, model_config)

            # all offset
            elif QuantizeAlgorithm == 2:
                raise RuntimeError("All Offset mode quantize not support.")
            else:
                raise RuntimeError("Invalid quantize algorithm.")
        # quantize off
        else:
            if bias:
                # Res = Data * Weight + Bias
                tensor_list = te.lang.cce.conv(Data, Weight, Bias, res_dtype,
                                               padh, padw, strideh, stridew,
                                               model_config)
            else:
                # Res = Data * Weight
                tensor_list = te.lang.cce.conv(Data, Weight, res_dtype, padh,
                                               padw, strideh, stridew,
                                               model_config)

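        # build the schedule from the final conv output tensor (the last element of tensor_list)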
        tensor_list = list(tensor_list)
        sch = generic.auto_schedule(tensor_list[-1])

    config = {
        "print_ir": need_print,
        "need_build": need_build,
        "name": kernel_name,
        "tensor_list": tensor_list
    }

    te.lang.cce.cce_build_code(sch, config)
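
# For reference, the QuantizeAlgorithm == 0 branches above differ only in whether
# Bias is passed and in which scales (scaleQ, then scaleDq or scaleRq) reach
# te.lang.cce.conv. A minimal equivalent sketch of that dispatch, assuming the
# enclosing split is on `bias` as in the quantize-off path and reusing the
# tensors defined above (this is an editorial sketch, not part of the original kernel):
#
#     conv_args = [Data, Weight] + ([Bias] if bias else [])
#     if isQuantize:
#         conv_args.append(scaleQ)
#     conv_args.append(scaleDq if isDeQuantize else scaleRq)
#     conv_args += [res_dtype, padh, padw, strideh, stridew, model_config]
#     tensor_list = te.lang.cce.conv(*conv_args)  # same positional call, built as a list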
Example #30
0
def ascend_requant(x,
                   req_scale,
                   y,
                   relu_flag=False,
                   kernel_name='ascend_requant'):
    """
    int32 -> int8

    Parameters:
    ----------
    x : the dict of input

    req_scale : the dict of requant scale

    y : the dict of output

    relu_flag : when True, apply relu to the requantized result

    kernel_name : cce kernel name, default value is "ascend_requant"

    Returns:
    -------
    None
    """

    shape_x = x.get("shape")
    format_x = x.get("format")

    shape_req = req_scale.get("shape")
    format_req = req_scale.get("format")

    dtype_x = x.get("dtype").lower()
    dtype_req = req_scale.get("dtype").lower()

    check_list = [("int32", ), ("uint64", )]
    format_list = ["NC1HWC0", "FRACTAL_NZ"]
    util.check_dtype_rule(dtype_x, check_list[0])
    util.check_dtype_rule(dtype_req, check_list[1])

    if format_x not in format_list:
        raise RuntimeError("x only support [NC1HWC0, FRACTAL_NZ]")

    if format_x == "NC1HWC0":
        if len(shape_x) != 5:
            raise ValueError("x shape must of length 5 when format is NC1HWC0")
    if format_x == "FRACTAL_NZ":
        if len(shape_x) < 4:
            raise RuntimeError(
                "x shape length must be >= 4 when format is FRACTAL_NZ")

    if len(shape_req) != 5:
        raise ValueError("req_scale shape must of length 5")

    if format_req != "NC1HWC0":
        raise ValueError("req_scale only support NC1HWC0")

    if shape_req[0] != 1 or shape_req[2] != 1 or shape_req[3] != 1:
        raise RuntimeError("req_scale shape must be 1 in the N, H and W dimensions")

    if format_x == "NC1HWC0":
        # n, C1, H*W, C0
        shape_x = [shape_x[0], shape_x[1], shape_x[2] * shape_x[3], shape_x[4]]

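    # attach req_scale's original shape to the placeholder as an attribute for the compute stage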
    ori_shape_req = req_scale.get("ori_shape")
    attr = {"ori_shape": ori_shape_req}
    input_x = tvm.placeholder(shape_x, name="x", dtype=dtype_x)
    input_req = tvm.placeholder(shape_req,
                                name="req_scale",
                                dtype=dtype_req,
                                attrs=attr)

    with tvm.target.cce():
        res = ascend_requant_compute(input_x, input_req, relu_flag,
                                     kernel_name)
        generic.auto_schedule(res)