Example #1
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False,
                 need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16, float32, int32, int8, uint8, bool

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]

    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
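
A minimal call sketch, assuming the te/tvm/util environment the example itself imports; the shapes and dtype here are illustrative assumptions, not taken from the original:

    # element-wise equal with broadcasting from (16, 1) to (16, 32);
    # only prints the lowered IR, no CCEC kernel is built
    custom_equal((16, 1), (16, 32), "float16", need_print=True)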
Example #2
def custom_logical_not(shape,
                       dtype,
                       kernel_name="cce_tf_logical_not",
                       need_build=False,
                       need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data

    dtype : the data type, support bool

    kernel_name : cce kernel name, default value is "cce_tf_logical_not"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "logical_not_cce ony supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():

        result = tvm.compute(
            shape,
            # logical NOT: emit False where the element is truthy, True otherwise
            lambda *i: tvm.select(data(*i), False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
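
A possible invocation with an assumed shape; the dtype must be "bool" per the check above:

    # logical NOT over a bool tensor, printing the IR only
    custom_logical_not((8, 128), "bool", need_print=True)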
Example #3
def custom_Exp(shape,
               dtype,
               gamma,
               alpha,
               beta,
               kernel_name="cce_exp",
               need_build=False,
               need_print=False):
    """
    calculate gamma ** (alpha * data + beta),
    computed as exp(log(gamma) * alpha * data) * (gamma ** beta)

    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support \
    float16, float32

    gamma : the data type must be the same as the dtype parameter
        base in gamma ** (alpha * data + beta)

    alpha : the data type must be the same as the dtype parameter
        scale in gamma ** (alpha * data + beta)

    beta : the data type must be the same as the dtype parameter
        shift in gamma ** (alpha * data + beta)

    kernel_name : cce kernel name, default value is "cce_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError(
            "caffe_exp_layer_cce only support %s while dtype is %s" %
            (",".join(supported_dtypes), dtype))

    if gamma != -1 and gamma <= 0:
        # the cc_device_exp_c API handles gamma == -1 as base e
        raise ValueError(
            "please ensure gamma is greater than 0, where gamma = %s" %
            str(gamma))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([gamma], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    # scale --> alpha, shift --> beta, base --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input, output], "cce", name=kernel_name)
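
A hypothetical call with an assumed shape; per the gamma check above, gamma == -1 selects base e, so this computes exp(1.0 * x + 0.0):

    custom_Exp((4, 256), "float16", gamma=-1, alpha=1.0, beta=0.0, need_build=True)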
Example #4
def custom_truncatemod(shape1, shape2, dtype, kernel_name="cce_tf_truncatemod",
                       need_build=False, need_print=False):
    """
    do element-wise truncatemod operation between two input tensors

    Parameters:
    ----------
    shape1 : shape of input data1

    shape2 : shape of input data2

    dtype : source data type, support float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_tf_truncatemod"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    max_dim = 8
    shape1_len = len(shape1)
    shape2_len = len(shape2)
    if shape1_len > max_dim or shape2_len > max_dim:
        raise RuntimeError(
            "mod_cce only supports up to %d dimensions while the shapes' "
            "dimensions are %d, %d" % (max_dim, shape1_len, shape2_len))
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape1)
    util.check_shape_rule(shape2)

    util.check_shape_size(shape1, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape2, SHAPE_SIZE_LIMIT)

    check_list = ["float16", "float32", "int32"]
    device_api_map = {"float16": "cc_device_truncatemod_float16",
                      "float32": "cc_device_truncatemod_float",
                      "int32": "cc_device_truncatemod_int32"}

    dtype = dtype.lower()
    if dtype not in check_list:
        raise RuntimeError(
            "tf_truncatemod_cce only support %s while dtype is %s" % (
                ",".join(check_list), dtype))

    shape1, shape2, shape_out = util.produce_shapes(shape1, shape2)
    util.check_shape_size(shape_out, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    device_api = device_api_map[inp_dtype]

    # block
    block_num = "block_num"
    block_idx = "block_idx"
    # x param
    v_xndim_cnt = tvm.const(len(shape1), "int32")
    p_xshape = util.create_param_ptr(shape1, "int32", "p_xshape")
    xpad_c0 = tvm.const(0, "int32")
    data_input_x = tvm.placeholder(shape1, name="data_input_x",
                                   dtype=inp_dtype)
    # y param
    v_yndim_cnt = tvm.const(len(shape2), "int32")
    p_yshape = util.create_param_ptr(shape2, "int32", "p_yshape")
    ypad_c0 = tvm.const(0, "int32")
    data_input_y = tvm.placeholder(shape2, name="data_input_y",
                                   dtype=inp_dtype)
    # output
    v_out_ndim_cnt = tvm.const(len(shape_out), "int32")
    p_out_shape = util.create_param_ptr(shape_out, "int32", "p_out_shape")
    out_padc0 = tvm.const(0, "int32")

    output = tvm.extern(shape_out,
                        [p_xshape, data_input_x, p_yshape, data_input_y,
                         p_out_shape], lambda ins, outs:
                        tvm.call_extern("int32_t", device_api,
                                        block_num,
                                        block_idx,
                                        v_xndim_cnt,
                                        ins[0].access_ptr("r"),  # shape x
                                        xpad_c0,
                                        ins[1].access_ptr("r"),  # input x
                                        v_yndim_cnt,
                                        ins[2].access_ptr("r"),  # shape y
                                        ypad_c0,
                                        ins[3].access_ptr("r"),  # input y
                                        v_out_ndim_cnt,
                                        ins[4].access_ptr("r"),  # shape out
                                        out_padc0,
                                        outs[0].access_ptr("w")),
                        name="output", dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    # print IR
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [data_input_x, data_input_y, output],
                            simple_mode=True))
            # Compile to generate the cce file
    if need_build:
        with build_config:
            tvm.build(schedule, [data_input_x, data_input_y, output], "cce",
                      name=kernel_name)
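
An illustrative call with equal, assumed shapes:

    # element-wise truncatemod between two float32 tensors
    custom_truncatemod((4, 64), (4, 64), "float32", need_build=True)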
Example #5
def custom_round(shape,
                 dtype,
                 kernel_name="cce_round",
                 need_build=False,
                 need_print=False):
    """
    do the round operation; the calculation data type is float16, float32 or int32
    
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype

    kernel_name : cce kernel name, default value is "cce_round"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
        
    """
    check_list = ["float16", "float32", "int32"]
    device_api_map = {
        "float16": "cc_device_round_float16",
        "float32": "cc_device_round_float",
        "int32": "cc_device_round_int32"
    }

    max_dim = 8
    shape_len = len(shape)
    if shape_len > max_dim:
        raise RuntimeError(
            "round_cce only support up to %d dimensions while the shape's dimension is %d"
            % (max_dim, shape_len))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in check_list):
        raise RuntimeError("round_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)
    device_api = device_api_map[inp_dtype]

    block_num = "block_num"
    block_idx = "block_idx"
    v_ndim = tvm.const(len(shape), "int32")
    padC0 = tvm.const(0, "int32")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_ndim,
            ins[1].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
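
A minimal sketch with an assumed shape:

    # round each element of a float16 tensor via the AICPU device API
    custom_round((2, 1024), "float16", need_print=True)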
Example #6
def custom_pow(shape,
               shape_y,
               dtype,
               kernel_name="cce_tf_pow",
               need_build=False,
               need_print=False):
    """
    calculate x ** y; the calculation data type is float16, float32 or int32.
    When x < 0, the output is a meaningless value.
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support
    float16, float32, int32

    kernel_name : cce kernel name, default value is "tf_pow_cce"

    need_buid : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32", "int32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not dtype.lower() in supported_dtypes:
        raise RuntimeError("tf_pow_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_lhs = tvm.placeholder(shape, name="data_lhs", dtype=inp_dtype)
    data_rhs = tvm.placeholder(shape, name="data_rhs", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    pad_c0 = 0
    p_scale = util.create_param_ptr([0], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([0], inp_dtype, "p_power")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_lhs, data_rhs, p_scale, p_shift, p_power, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[2].access_ptr("r"),  # scale
            ins[3].access_ptr("r"),  # shift
            ins[4].access_ptr("r"),  # power
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[0].access_ptr("r"),  # input x
            v_ndim,
            v_ndim,
            ins[5].access_ptr("r"),  # shape
            pad_c0,
            ins[1].access_ptr("r"),  # input y
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    schedule = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [data_lhs, data_rhs, output],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [data_lhs, data_rhs, output],
                      "cce",
                      name=kernel_name)
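
An assumed invocation; note that both placeholders are created from `shape`, so shape_y is expected to match it:

    # element-wise x ** y on two float32 tensors of the same shape
    custom_pow((8, 16), (8, 16), "float32", need_build=True)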
Example #7
def custom_Upsample(shape,
                    dtype,
                    scale,
                    data_format="channels_last",
                    kernel_name="cce_darknet_upsample",
                    need_build=False,
                    need_print=False):
    """
    Parameters
    ----------
    shape: input tensor's shape

    dtype: input tensor's dtype, support float16, float32, int32, int8, uint8

    scale: the upsampling factors

    data_format: "channels_last" or "channels_first"

    kernel_name : kernel name, default value is "MyUpsample"

    need_buid : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    """
    TODO:
    Please refer to the TE DSL Manual, And code here with TE DSL.
    """
    inp_dtype = dtype.lower()
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if inp_dtype not in check_list:
        raise RuntimeError("upsample only support %s while dtype is %s" %
                           (",".join(check_list), dtype))

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    size = (scale, scale)

    shape_size = len(shape)
    if not (shape_size == 4 or shape_size == 5):
        raise RuntimeError(
            "upsample only support 4D or 5D while len(shape):%d" % len(shape))

    input_tensor = tvm.placeholder(shape, name="input_tensor", dtype=inp_dtype)

    res = None
    if shape_size == 5:
        # shape_size == 5: special 5D format (N, C1, H, W, C0)
        output_shape = (shape[0], shape[1], shape[2] * size[0],
                        shape[3] * size[1], shape[4])
        res = tvm.compute(
            output_shape, lambda n, c0, h, w, c: input_tensor[n, c0, h // size[
                0], w // size[1], c])
    else:
        if data_format == "channels_last":
            output_shape = (shape[0], shape[1] * size[0], shape[2] * size[1],
                            shape[3])
            res = tvm.compute(
                output_shape, lambda n, h, w, c: input_tensor[n, h // size[0],
                                                              w // size[1], c])
        elif data_format == "channels_first":
            output_shape = (shape[0], shape[1], shape[2] * size[0],
                            shape[3] * size[1])
            res = tvm.compute(
                output_shape, lambda n, c, h, w: input_tensor[n, c, h // size[
                    0], w // size[1]])
        else:
            raise RuntimeError(
                "upsample only support channels_last|channels_first "
                "while input type %s" % data_format)

    schedule = tvm.create_schedule(res.op)
    if need_print:
        with build_config:
            print(tvm.lower(schedule, [input_tensor, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(schedule, [input_tensor, res], "cce", name=kernel_name)
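
A sketch with an assumed NHWC input; scale 2 doubles H and W:

    # (1, 32, 32, 16) -> (1, 64, 64, 16)
    custom_Upsample((1, 32, 32, 16), "float16", 2, data_format="channels_last", need_print=True)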
Example #8
def custom_expm1(shape,
                 dtype,
                 kernel_name="cce_tf_expm1",
                 need_build=False,
                 need_print=False):
    """
    algorithm: expm1

    calculating data's expm1, y = (e ** x) - 1; dtype is float16 or float32.

    Parameters
    ----------
    shape : shape of data.

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32.

    kernel_name : cce kernel name, default value is "cce_tf_expm1".

    need_build : if need to build CCEC kernel, default value is False.

    need_print : if need to print the ir, default value is False.

    Returns
    -------
    None

    """

    # [aicpu] int32_t cc_device_exp(uint32_t blockNum, uint32_t blockIdx, int32_t dataType, const void *scale, const void *shift,
    # const void *base, int32_t dimCnt, int32_t *shape, uint32_t padC0, const void *x, void *y);

    supported_dtypes = ["float16", "float32"]

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("tf_expm1_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    # step 1. calculate y = e ** x via the AICPU device API
    device_api = "DeviceExp"
    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output_exp = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output_exp",
        dtype=inp_dtype)

    offset = tvm.const((-1), dtype=inp_dtype)

    # step 2. calculate y = e ** x - 1 with tvm
    output = tvm.compute(
        shape,
        lambda *indice: output_exp(*indice) + offset.astype(inp_dtype),
        name="output")

    # step 3. schedule the computation by tvm
    s = tvm.create_schedule(output.op)

    # step 4. build by tvm
    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
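
A hypothetical call with an assumed shape:

    # expm1(x) = exp(x) - 1 over a float32 tensor
    custom_expm1((4, 512), "float32", need_build=True)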
Example #9
def SpatialTransformer(input_shape, out_shape, dtype="float32", kernel_name="SpatialTransformer", need_build=True, need_print=False):
    """Spatial Transformer Layer
    
    Implements a spatial transformer layer as described in [1]_.
    Based on [2]_.
    
    Parameters
    ----------
    input_shape :
        the shape of the input tensor,
        [num_batch, height, width, num_channels]

    out_shape : tuple of two ints
        the height and width of the output tensor,
        [out_height, out_width]
        
    dtype: data type    
        
    kernel_name : kernel name, default value is "SpatialTransformer"

    need_build : if need to build CCEC kernel, default value is True

    need_print : if need to print the ir, default value is False
    
    Returns
    -------
    None

    References
    ----------
    .. [1]  Spatial Transformer Networks
            Max Jaderberg, Karen Simonyan, Andrew Zisserman, Koray Kavukcuoglu
    .. [2]  https://github.com/tensorflow/models/tree/master/research/transformer
    """
    
    def _meshgrid(height, width):
		
        y0 = tvm.compute((height,), lambda i: -1 + i * 2.0 / (height - 1), name = 'y0')
        x0 = tvm.compute((width,), lambda i: -1 + i * 2.0 / (width - 1), name = 'x0')
        
        y = tvm.compute((height * width,), lambda i: y0[i // width], name = 'y')
        x = tvm.compute((height * width,), lambda i: x0[i % width], name = 'x')
        
        y = topi.reshape(y, (1, height * width))
        x = topi.reshape(x, (1, height * width))
        ones = tvm.compute((1, height * width), lambda i,j:1, name = 'ones')
         
        grid = tvm.compute((3, height * width),lambda i,j: 0.5 * (i - 1) * (i - 2) * x[0,j] + i * (2 - i) * y[0,j] + 0.5 * i * (i-1) * ones[0,j], name = 'grid')
        
        #grid = topi.concatenate((x,y,ones),0) #can not use topi.concatenate
        return grid       

    def _interpolate(im, im_shape, x, y, out_size, dtype):
        
        num_batch = im_shape[0]
        height = im_shape[1]
        width = im_shape[2]
        channels = im_shape[3]
            
        out_height = out_size[0]
        out_width = out_size[1]
        max_y = int(im_shape[1] - 1)
        max_x = int(im_shape[2] - 1)
               
        #[-1,1] -> [0, width-1]
        x = topi.multiply(topi.add(x, tvm.const(1, dtype=dtype)), width / tvm.const(2, dtype=dtype))
        y = topi.multiply(topi.add(y, tvm.const(1, dtype=dtype)), height / tvm.const(2, dtype=dtype))
            
        # do sampling
        dim3 = out_height * out_width * num_batch
            
        x0 = topi.cast(topi.floor(x), 'int32')  
        y0 = topi.cast(topi.floor(y), 'int32')
        x1 = topi.add(x0,tvm.const(1, dtype="int32"))
        y1 = topi.add(y0,tvm.const(1, dtype="int32"))

        x0 = topi.clip(x0, 0, max_x)
        x1 = topi.clip(x1, 0, max_x)
        y0 = topi.clip(y0, 0, max_y)
        y1 = topi.clip(y1, 0, max_y)

        dim2 = width
        dim1 = width * height

        base = tvm.compute((dim3,),lambda i:(i // (out_height * out_width)) * width * height, name = 'base')        
        base_y0 = topi.add(base, topi.multiply(y0, dim2))
        base_y1 = topi.add(base, topi.multiply(y1, dim2))

        idx_a = topi.add(base_y0, x0)
        idx_b = topi.add(base_y1, x0)
        idx_c = topi.add(base_y0, x1)
        idx_d = topi.add(base_y1, x1)
                
        im_flat = topi.reshape(im, (num_batch * height * width, channels))
        im_flat = topi.cast(im_flat, dtype)
        
        Ia = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_a[i], j], name = 'Ia')       
        Ib = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_b[i], j], name = 'Ib') 
        Ic = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_c[i], j], name = 'Ic')
        Id = tvm.compute((dim3, channels),lambda i,j: im_flat[idx_d[i], j], name = 'Id')
            
        x0_f = topi.cast(x0, dtype)
        x1_f = topi.cast(x1, dtype)
        y0_f = topi.cast(y0, dtype)
        y1_f = topi.cast(y1, dtype)
        wa = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y1_f, y)), 1)
        wb = topi.expand_dims(topi.multiply(topi.subtract(x1_f, x), topi.subtract(y, y0_f)), 1)
        wc = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y1_f, y)), 1)
        wd = topi.expand_dims(topi.multiply(topi.subtract(x, x0_f), topi.subtract(y, y0_f)), 1)
 
        output = topi.add(topi.add(topi.add(topi.multiply(wa, Ia), topi.multiply(wb, Ib)),topi.multiply(wc, Ic)), topi.multiply(wd, Id))
        
        return output

    def _transform(theta, input_dim, out_size, input_shape, dtype):
        
        num_batch = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        num_channels = input_shape[3]

        theta = topi.reshape(theta, (num_batch, 2, 3))
        theta = topi.cast(theta, dtype)

        out_height = out_size[0]
        out_width = out_size[1]
                
        grid = _meshgrid(out_height, out_width)       
        grid = topi.reshape(grid, (num_batch, 3, out_height*out_width))
        grid = topi.cast(grid, dtype=dtype)
        
        k = tvm.reduce_axis((0, 3), 'k')
        T_g = tvm.compute((num_batch, 2, out_height*out_width),lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis = k), name = 'T_g')
              
        x_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,0,k], name = 'x_s')
        y_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,1,k], name = 'y_s')
              
        x_s_flat = topi.reshape(x_s, (num_batch*out_height*out_width,))
        y_s_flat = topi.reshape(y_s, (num_batch*out_height*out_width,))
                      
        input_transformed = _interpolate(input_dim, input_shape, x_s_flat, y_s_flat, out_size, dtype)
        output = topi.reshape(input_transformed, [num_batch, out_height, out_width, num_channels])
        return output 
    
    num_batch = input_shape[0]
    input_height = input_shape[1]
    input_width = input_shape[2]
    channel = input_shape[3]
    
    U = tvm.placeholder((num_batch, input_height, input_width, channel), name="U", dtype=dtype)    
    theta = tvm.placeholder((num_batch, 6, 1, 1), dtype=dtype)    
    output = _transform(theta, U, out_shape, input_shape, dtype)       
    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [U, theta, output], simple_mode=True))
            
    if need_build:
        with build_config:
            tvm.build(s, [U, theta, output], "cce", name=kernel_name)
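
A minimal call sketch with assumed shapes (a single-channel 28x28 batch warped to 14x14); need_build defaults to True here:

    SpatialTransformer((1, 28, 28, 1), (14, 14), dtype="float32")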
Example #10
def custom_batch_matmul(shape_x,
                        shape_y,
                        dtype,
                        trans_a=False,
                        trans_b=False,
                        kernel_name="cce_tf_batch_matmul",
                        need_build=False,
                        need_print=False):
    """
    Multiplies slices of two tensors in batches (each slice can be viewed
    as an element of a batch); the output is of the same batch size.

    Each of the individual slices can optionally be transposed before
    multiplication by setting the trans_a or trans_b flag to True, which
    are by default False. The input tensors are 2-D or higher with the
    shape [..., r_x, c_x] and [..., r_y, c_y]. The output tensor is 2-D
    or higher with the shape [..., r_o, c_o], where
    r_o = c_x if trans_a else r_x
    c_o = r_y if trans_b else c_y

    Parameters
    ----------
    shape_x : shape of the first tensor x with rank > 1

    shape_y : shape of the second tensor y, with the same type and rank as x

    dtype : the data type, support int8, uint8,float16,float32,int32

    kernel_name : cce kernel name, default value is "cce_batch_matmul"

    trans_a : if True, shape_x is transposed before multiplication

    trans_b : if True, shape_y is transposed before multiplication

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    data_dtype = dtype.lower()
    check_list = ["int8", "uint8", "float16", "float32", "int32"]
    if data_dtype not in check_list:
        raise RuntimeError(
            "batch_matmul_cce ony supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    def transpose_tensor(shape, size):
        """Transpose the shape, e.g., the shape [..., r_x, c_x] is transposed
        to [..., c_x, r_x].

        Parameters
        ----------
        shape : shape of a tensor

        size : length of the shape

        Returns
        -------
        shape_ori : the transposed shape
        """
        shape_ori = ()
        if size == 1:
            shape_ori = shape_ori + shape
        elif size == 2:
            shape_ori = shape_ori + (shape[1], ) + (shape[0], )
        else:
            shape_ori = shape_ori + (shape[:(size - 2)]) + (
                shape[size - 1], ) + (shape[size - 2], )
        return shape_ori

    def check_matmul(shape_x, shape_y):
        """Check whether batch_matmul is supported or not.

        Parameters
        ----------
        shape_x : shape of the first tensor x

        shape_y : shape of the second tensor y with the same type and shape
        with x

        Returns
        -------
        None
        """
        len_x = len(shape_x)
        len_y = len(shape_y)
        if (len_x < 2) or (len_y < 2):
            raise RuntimeError("Only tensors of rank>=2 are supported!")
        if shape_x[len_x - 1] != shape_y[len_y - 2]:
            raise RuntimeError(
                "Invalid matrix multiplication for the inner 2 dimensions!")
        if (len_x == len_y) and (len_x > 2):
            for i in range(len_x - 2):
                if shape_x[i] != shape_y[i]:
                    raise RuntimeError("Outer dimensions do not match!")
            return
        elif (len_x == len_y) and (len_x == 2):
            return
        else:
            raise RuntimeError("The input tensors are not with the same rank!")

    def _compute(output_shape, x, y, K, trans_a, trans_b, *indices):
        """matmul compuation in terms of the output shape and the transposes

        Parameters
        ----------
        output_shape : the final output shape, e.g., shape_x = (2, 6),
            shape_y = (8, 2), trans_a = True, trans_b = True, then,
            output_shape = (6, 8).

        x : the first input tensor according to shape_x.

        y : the second input tensor according to shape_y.

        K : the number of the axis for sum, in the above example, K = 2.

        trans_a : if True, x needs to be transposed.

        trans_b : if True, y needs to be transposed.

        *indices : the output shape space for tvm.compute.

        Returns
        -------
        tvm.Tensor
        """
        n_len = len(output_shape)
        k = tvm.reduce_axis((0, K), 'k')
        if trans_a is True and trans_b is False:
            # For example, A: (6, 7, 8), B: (6, 7, 9), so the length is n = 3
            # C = A' * B : (6, 8, 9), A' means the transpose of A
            # indices means the space of (6, 8, 9), k = 7
            # x_indices = indices[:1]+(7, )+indices[1:2] = (6, 7, 8)
            # y_indices = indices[:1]+(7, )+indices[2:] = (6, 7, 9)
            x_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 2):
                                                                (n_len - 1)]
            y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):]
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        elif not trans_a and trans_b:
            # For example, A: (6, 7, 8), B: (6, 9, 8), C = A * B' : (6, 7, 9)
            # indices means the space of (6, 7, 9), n=3, k = 8
            # x_indices = indices[:2]+(8, ) = (6, 7, 8)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 9, 8)
            x_indices = indices[:(n_len - 1)] + (k, )
            y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, )
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        elif trans_a and trans_b:
            # For example, A: (6, 8, 10), B: (6, 12, 8), C = A' * B' : \
            # (6, 10, 12)
            # indices means the space of (6, 10, 12), n=3, k = 8
            # x_indices = indices[:1]+(8, )+indices[1:2] = (6, 8, 10)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 12, 8)
            x_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 2):
                                                                (n_len - 1)]
            y_indices = indices[:(n_len - 2)] + indices[(n_len - 1):] + (k, )
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)
        else:
            # For example, A: (6, 15, 16), B: (6, 16, 18), C = A * B : \
            # (6, 15, 18)
            # indices means the space of (6, 15, 18), n=3, k = 16
            # x_indices = indices[:2]+(16, ) = (6, 15, 16)
            # y_indices = indices[:1]+(16, )+indices[2:] = (6, 16, 18)
            x_indices = indices[:(n_len - 1)] + (k, )
            y_indices = indices[:(n_len - 2)] + (k, ) + indices[(n_len - 1):]
            return tvm.sum(x(*x_indices) * y(*y_indices), axis=k)

    def check_supportted_shape_size(shape_x, shape_y, limit, trans_a, trans_b):
        """
        check shape size for operator

        Parameters
        ----------
        shape_x, shape_y : shapes of the input data

        limit : limit of the product

        Returns
        -------
        None
        """
        # This function is used to check whether the shape is too large to \
        # cause a timeout.
        # shape_x = (a,b,c,d,e,k)  shape_y = (a,b,c,d,k,f)
        # t_1 : time consumed by each addition operation
        # t_2 : time consumed by each multiplication operation
        # t_all : time consumed by a complete calculation
        # t_all is approximately equal to (a*b*c*d)*(e*k*f)*(t_1+t_2)
        # As (t_1 + t_2) is a constant, so t_all is proportional to \
        # (a * b * c * d * e * k * f)

        len_x = len(shape_x)
        len_y = len(shape_y)
        if (len_x < 2) or (len_y < 2):
            raise RuntimeError("Only tensors of rank>=2 are supported!")

        shape_x = list(shape_x)
        shape_y = list(shape_y)

        tmp_shape_x = shape_x[:]
        if trans_a:
            tmp_shape_x = shape_x[:-2] + [shape_x[-1], shape_x[-2]]

        tmp_shape_y = shape_y[:]
        if trans_b:
            tmp_shape_y = shape_y[:-2] + [shape_y[-1], shape_y[-2]]

        union_shape = tmp_shape_x + [tmp_shape_y[-1]]

        union_size = reduce(lambda i, j: i * j, union_shape)

        if union_size > limit:
            raise RuntimeError("the shape is too large to calculate")

    if data_dtype in ["float16", "float32", "int32"]:
        type_shape_map = {
            'float16': SHAPE_SIZE_FP16_LIMIT,
            'float32': SHAPE_SIZE_FP32_LIMIT,
            'int32': SHAPE_SIZE_INT32_LIMIT
        }

        check_supportted_shape_size(shape_x, shape_y,
                                    type_shape_map[data_dtype], trans_a,
                                    trans_b)

    x_size = len(shape_x)
    y_size = len(shape_y)
    shape_a = shape_x
    shape_b = shape_y
    if trans_a is True:
        shape_x = transpose_tensor(shape_x, x_size)

    if trans_b is True:
        shape_y = transpose_tensor(shape_y, y_size)

    check_matmul(shape_x, shape_y)
    last_axis = shape_x[x_size - 1]

    x_temp = tvm.placeholder(shape_a, name="input_1", dtype=data_dtype)
    y_temp = tvm.placeholder(shape_b, name="input_2", dtype=data_dtype)

    # output shape
    output_shape = ()
    for i in range(x_size - 1):
        output_shape = output_shape + (shape_x[i], )
    output_shape = output_shape + (shape_y[x_size - 1], )
    result = tvm.compute(
        output_shape,
        lambda *indices: _compute(output_shape, x_temp, y_temp, last_axis,
                                  trans_a, trans_b, *indices),
        name="result")
    schedule = tvm.create_schedule(result.op)

    if need_print:
        with build_config:
            print(
                tvm.lower(schedule, [x_temp, y_temp, result],
                          simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(schedule, [x_temp, y_temp, result],
                      "cce",
                      name=kernel_name)
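
An illustrative batched call with assumed shapes:

    # (2, 6, 7, 8) x (2, 6, 8, 9) -> (2, 6, 7, 9), no transposes
    custom_batch_matmul((2, 6, 7, 8), (2, 6, 8, 9), "float16", need_print=True)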
Example #11
def custom_Reduction(shape,
                     dtype,
                     axis,
                     op,
                     coeff,
                     kernel_name="cce_reductionLayer",
                     need_build=False,
                     need_print=False):
    """
    Reduce a tensor on a certain axis, and scale output with coeff

    Parameters
    ----------
    shape : shape of data

    dtype : source data type, only support float16, float32, int8, uint8

    axis : the first axis to reduce, may be negative to index from the end
           (e.g., -1 for the last axis).
           If axis == 0, the output Blob always has the empty shape (count 1),
           performing reduction across the entire input.

    op : can only be one of "SUM, ASUM (sum of abs), SUMSQ (sum of sqr), MEAN"

    coeff : scale for output

    kernel_name : cce kernel name, default value is "cce_reductionLayer"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["float16", "float32", "int8", "uint8"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "reductionLayer_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    reduction_op = ("SUM", "ASUM", "SUMSQ", "MEAN")

    if not isinstance(axis, int):
        raise RuntimeError("type of axis value should be int")
    if op not in reduction_op:
        raise RuntimeError("op can only be one of SUM, ASUM, SUMSQ , MEAN")
    if not isinstance(coeff, int) and not isinstance(coeff, float):
        raise RuntimeError("coeff must be a value")
    axis_origin = axis
    shape_origin = shape
    axis = util.axis_check(len(shape), axis)
    util.check_reduce_shape_rule(shape)
    shape = list(shape)
    shape1 = shape[:axis] + [
        functools_reduce(lambda x, y: x * y, shape[axis:])
    ]
    shape1, axis = util.shape_refine(shape1, axis)
    if not axis:
        axis = [0]
        shape1 = [1] + shape1
    inp_dtype = dtype.lower()
    data = tvm.placeholder(shape1, name="data_input", dtype=inp_dtype)
    with tvm.target.cce():
        res = caffe_reduction_layer_compute([data], shape_origin, dtype,
                                            axis_origin, op, coeff,
                                            kernel_name, need_build,
                                            need_print)

    if op == "MEAN" and (inp_dtype == "int8" or inp_dtype == "uint8"):
        util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
        res = te.lang.cce.cast_to(res, inp_dtype)
        schedule = tvm.create_schedule(res.op)
        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, res], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, res], "cce", name=kernel_name)
    else:
        with tvm.target.cce():
            sch = generic.auto_schedule(res)

        config = {
            "print_ir": need_print,
            "need_build": need_build,
            "name": kernel_name,
            "tensor_list": [data, res]
        }
        te.lang.cce.cce_build_code(sch, config)
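
A hypothetical call with assumed arguments (it relies on caffe_reduction_layer_compute being importable, as the example itself does):

    # sum over axes 1.. of a (4, 8, 16) tensor, scaled by 0.5
    custom_Reduction((4, 8, 16), "float16", axis=1, op="SUM", coeff=0.5, need_print=True)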
Example #12
def custom_logical_and(shape_x,
                       shape_y,
                       dtype,
                       kernel_name="cce_tf_logical_and",
                       need_build=False,
                       need_print=False):
    """
    do element-wise logical-and operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input data1

    shape_y : shape of input data2

    dtype : source data type, support "bool"

    kernel_name : cce kernel name, default value is "cce_tf_logical_and"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["bool"]
    if not (dtype.lower() in check_list):
        raise RuntimeError(
            "logical_and_cce only support %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)
    data1 = tvm.placeholder(shape_x, dtype=inp_dtype, name="data1")
    data2 = tvm.placeholder(shape_y, dtype=inp_dtype, name="data2")

    with tvm.target.cce():
        data1_tmp1 = te.lang.cce.broadcast(data1, shape_max)
        data1_tmp2 = te.lang.cce.broadcast(data2, shape_max)

        min_value = tvm.const(0, dtype=inp_dtype)
        res = tvm.compute(
            shape_max,
            lambda *i: tvm.select(
                tvm.all(
                    tvm.any(
                        data1_tmp1(*i) > min_value,
                        data1_tmp1(*i) < -min_value),
                    tvm.any(
                        data1_tmp2(*i) > min_value,
                        data1_tmp2(*i) < -min_value)), True, False),
            name="res")

        sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [data1, data2, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [data1, data2, res], "cce", name=kernel_name)
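
A minimal sketch with assumed, broadcast-compatible shapes:

    custom_logical_and((16, 1), (16, 32), "bool", need_print=True)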
Example #13
def custom_Power(shape,
                 dtype,
                 gamma,
                 alpha,
                 beta,
                 kernel_name="cce_caffe_power",
                 need_build=False,
                 need_print=False):
    """
    calculate (alpha * data + beta) ** gamma, computed as exp(gamma * log(alpha * data + beta)).
    When alpha * data + beta < 0, the output is a meaningless value.
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    gamma : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    alpha : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    beta : the data type must be same with dtype parameter
        args in (alpha * data + beta) ** gamma

    kernel_name : string
        kernel name in the generated CCE kernel. default value is "cce_caffe_power"


    need_build : bool
        if need to build CCEC kernel

    need_print : bool
        if need to print Halide IR

    Returns
    -------
    None
        
    """
    supported_dtypes = ["float16", "float32"]
    device_api = "cc_device_pow"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("power_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim_x = len(shape)
    v_ndim_y = 0
    p_shape_y = 0
    p_input_y = "nullptr"
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0

    p_scale = util.create_param_ptr([alpha], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([beta], inp_dtype, "p_shift")
    p_power = util.create_param_ptr([gamma], inp_dtype, "p_power")
    p_shape_x = util.create_param_ptr(shape, "int32", "p_shape_x")

    # scale --> alpha, shift --> beta, power --> gamma
    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_power, p_shape_x],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # power
            v_ndim_x,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            v_ndim_y,
            v_ndim_y,
            p_shape_y,
            padC0,
            p_input_y,
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
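
An assumed invocation computing (1.0 * x + 0.0) ** 2.0:

    custom_Power((4, 256), "float16", gamma=2.0, alpha=1.0, beta=0.0, need_build=True)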
Example #14
def custom_tile(shape,
                multiples,
                dtype,
                kernel_name="cce_tile",
                need_build=False,
                need_print=False):
    """Operation and Schedule for tile, construct an array by repeating shape the number of times given by multiply_shape.

    Parameters
    ----------
    shape : shape of the input Tensor

    multiples : per-dimension repeat counts, given as a shape

    dtype :
        the data type. only support float16, float32, int32, int8, uint8

    kernel_name : cce kernel name, default value is "cce_tile"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
        None
    """
    check_list = ["float16", "float32", "int32", "int8", "uint8"]
    if not (dtype.lower() in check_list):
        raise RuntimeError("tile_cce only support %s while dtype is %s" %
                           (",".join(check_list), dtype))
    tensor_l = []

    inp_dtype = dtype.lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)
    tensor_l.append(tvm.placeholder(shape, name="shape", dtype=inp_dtype))

    for i in range(len(multiples)):
        if not isinstance(multiples[i], int):
            raise RuntimeError("InvalidArgumentError: Expected int value")
        if multiples[i] < 0:
            raise RuntimeError(
                "InvalidArgumentError: Expected int value or multiples[%d] >= 0, but got %d!"
                % (i, multiples[i]))

    tensor_l.append(
        tvm.placeholder(multiples, name="multiples", dtype=inp_dtype))

    out_tensor = compute_tile_cce(a_tuple=tensor_l)

    s = schedule_tile_cce(out_tensor)
    if need_print:
        with build_config:
            print(
                tvm.lower(s, [tensor_l[0], tensor_l[1], out_tensor],
                          simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(s, tensor_l + [out_tensor], "cce", name=kernel_name)
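
A sketch with assumed arguments; note that in this implementation `multiples` is also used as a placeholder shape:

    # tile a (4, 16) tensor by a factor of 2 along each axis
    custom_tile((4, 16), (2, 2), "float16", need_print=True)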
Example #15
def custom_exp(shape,
               dtype,
               kernel_name="cce_tf_exp",
               need_build=False,
               need_print=False):
    """
    algorithm: exp

    calculating data's exp, y = e ** x; dtype is float16 or float32.
    
    Parameters
    ----------
    shape : shape of data

    dtype : the data type, assume src_dtype equals dst_dtype, only support float16, float32

    kernel_name : cce kernel name, default value is "cce_tf_exp"

    need_build : if need to build CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    supported_dtypes = ["float16", "float32"]
    device_api = "DeviceExp"

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)
    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    if not (dtype.lower() in supported_dtypes):
        raise RuntimeError("tf_exp_cce only support %s while dtype is %s" %
                           (",".join(supported_dtypes), dtype))

    inp_dtype = dtype.lower()
    shape = util.shape_refine(shape)
    data_input = tvm.placeholder(shape, name="data_input", dtype=inp_dtype)

    v_datatype = util.get_device_api_dtype(inp_dtype)
    v_ndim = len(shape)
    block_num = "block_num"
    block_idx = "block_idx"
    padC0 = 0
    p_scale = util.create_param_ptr([1], inp_dtype, "p_scale")
    p_shift = util.create_param_ptr([0], inp_dtype, "p_shift")
    p_base = util.create_param_ptr([-1], inp_dtype, "p_base")
    p_shape = util.create_param_ptr(shape, "int32", "p_shape")

    output = tvm.extern(
        shape,
        [data_input, p_scale, p_shift, p_base, p_shape],
        lambda ins, outs: tvm.call_extern(
            "int32_t",
            device_api,
            block_num,
            block_idx,
            v_datatype,
            ins[1].access_ptr("r"),  # scale
            ins[2].access_ptr("r"),  # shift
            ins[3].access_ptr("r"),  # base
            v_ndim,
            ins[4].access_ptr("r"),  # shape
            padC0,
            ins[0].access_ptr("r"),  # input x
            outs[0].access_ptr("w")),
        name="output",
        dtype=inp_dtype)

    s = tvm.create_schedule(output.op)

    if need_print:
        with build_config:
            print(tvm.lower(s, [data_input, output], simple_mode=True))
    if need_build:
        with build_config:
            tvm.build(s, [data_input, output], "cce", name=kernel_name)
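
A minimal call sketch with an assumed shape:

    # e ** x over a float16 tensor, building the CCEC kernel
    custom_exp((2, 1024), "float16", need_build=True)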