Example #1
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    ori_shape_x = x.get("ori_shape")
    ori_shape_y = y.get("ori_shape")
    shape_x = util.scalar2tensor_one(shape_x)
    shape_y = util.scalar2tensor_one(shape_y)
    if format_pattern == 1:
        ori_shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
            ori_shape_x, shape_y, param_name_input1="x", param_name_input2="y")
        if shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == ori_shape_x[-1]:
            raise RuntimeError("the inputshape of y is illegal")

        if shape_y[-2] == 1 and shape_y[-1] == ori_shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]

        elif shape_y[-2] == ori_shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]

        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)

    elif format_pattern == 2:
        shape_x, ori_shape_y, shape_max = op_utils.broadcast_shapes(
            shape_x, ori_shape_y, param_name_input1="x", param_name_input2="y")
        if shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == ori_shape_y[-1]:
            raise RuntimeError("the inputshape of x is illegal")

        if shape_x[-2] == 1 and shape_x[-1] == ori_shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]

        elif shape_x[-2] == ori_shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]

        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
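
A minimal trace of the format_pattern == 1 branch above (FRACTAL_NZ x with an ND vector y), on hypothetical shapes: a (32, 64) matrix stored as FRACTAL_NZ with 16x16 fractals has the Nz shape [N1, M1, M0, N0] = [4, 2, 16, 16], and the assumed row vector [1, 64] matches ori_shape_x[-1]. The concrete values are illustrative, not taken from the original:

# Illustrative only: how the row-vector case reshapes shape_y so it can
# broadcast against a FRACTAL_NZ tensor. Real shapes come from broadcast_shapes.
shape_x = [4, 2, 16, 16]   # assumed Nz shape of x: [N1, M1, M0, N0]
shape_y = [1, 64]          # ND row vector; shape_y[-2] == 1

shape_y.append(1)
shape_y.append(1)          # [1, 64, 1, 1]
shape_y[-3] = 1            # [1, 1, 1, 1]
shape_y[-1] = shape_x[-1]  # [1, 1, 1, 16]
shape_y[-4] = shape_x[-4]  # [4, 1, 1, 16]

print(shape_y)             # [4, 1, 1, 16] broadcasts against [4, 2, 16, 16]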
Example #2
def _infer_shape(format_pattern, x, y):
    shape_x = x.get("shape")
    shape_y = y.get("shape")
    shape_x = scalar2tensor_one(shape_x)
    shape_y = scalar2tensor_one(shape_y)

    if format_pattern == 1:
        shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                       param_name_input1="input_x",
                                                       param_name_input2="input_y")

        if shape_y[-2] == 1 and shape_y[-1] == shape_x[-1]:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-3] = 1
            shape_y[-1] = shape_x[-1]
            shape_y[-4] = shape_x[-4]

        elif shape_y[-2] == shape_x[-2] and shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)
            shape_y[-4] = 1
            shape_y[-2] = shape_x[-2]
            shape_y[-3] = shape_x[-3]

        elif shape_y[-2] == shape_y[-1] == 1:
            shape_y.append(1)
            shape_y.append(1)

    elif format_pattern == 2:
        shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                       param_name_input1="input_x",
                                                       param_name_input2="input_y")
        if shape_x[-2] == 1 and shape_x[-1] == shape_y[-1]:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-3] = 1
            shape_x[-1] = shape_y[-1]
            shape_x[-4] = shape_y[-4]

        elif shape_x[-2] == shape_y[-2] and shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)
            shape_x[-4] = 1
            shape_x[-2] = shape_y[-2]
            shape_x[-3] = shape_y[-3]

        elif shape_x[-2] == shape_x[-1] == 1:
            shape_x.append(1)
            shape_x.append(1)

    return shape_x, shape_y
Example #3
def addcdiv(x1, x2, x3, y=None, alpha=1.0, kernel_name="addcdiv"):

    check_list = ("float16", "float32")

    shape_x1 = x1.get("shape")
    dtype_x1 = x1.get("dtype").lower()

    shape_x2 = x2.get("shape")
    dtype_x2 = x2.get("dtype").lower()

    shape_x3 = x3.get("shape")
    dtype_x3 = x3.get("dtype").lower()

    util.check_shape_rule(shape_x1)    # check the operator's shape: the number of dimensions must be >= 1 and <= 8
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)    # check the size of the operator's first input shape
    util.check_dtype_rule(dtype_x1, check_list)    # check the operator's input data type

    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x2, check_list)

    util.check_shape_rule(shape_x3)
    util.check_shape_size(shape_x3, SHAPE_SIZE_LIMIT)
    util.check_dtype_rule(dtype_x3, check_list)

    if dtype_x1 != dtype_x2 or dtype_x1 != dtype_x3:
        raise RuntimeError("the type of x1, x2, x3 must be the same!")

    util.check_kernel_name(kernel_name)    # check the operator's kernel_name

    # take the larger of each dimension across shape_x1, shape_x2 and shape_x3 as shape_max
    shape_x2, shape_x3, shape_max = broadcast_shapes(shape_x2, shape_x3)
    util.check_tensor_shape_size(shape_max)     # check shape_max
    shape_x1, _, shape_max = broadcast_shapes(shape_x1, shape_max)
    util.check_tensor_shape_size(shape_max)     # check shape_max
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)    # broadcast the shape of x2 to shape_max
    shape_x3, _, _ = broadcast_shapes(shape_x3, shape_max)    # broadcast the shape of x3 to shape_max

    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype_x1)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype_x2)
    data_x3 = tvm.placeholder(shape_x3, name="data_x3", dtype=dtype_x3)

    res = addcdiv_compute(data_x1, data_x2, data_x3, shape_max, alpha, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {"name": kernel_name,
              "tensor_list": [data_x1, data_x2, data_x3, res]}

    te.lang.cce.cce_build_code(schedule, config)
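
The two broadcast_shapes calls above compute shape_max in two steps: first x2 against x3, then x1 against that intermediate result. A sketch with NumPy's np.broadcast_shapes as a stand-in for the TBE helper, on hypothetical shapes:

import numpy as np

# Trace of the two-step broadcast used above, with illustrative shapes.
shape_x1, shape_x2, shape_x3 = (1, 5, 1), (1, 3), (2, 1, 3)

step1 = np.broadcast_shapes(shape_x2, shape_x3)    # (2, 1, 3)
shape_max = np.broadcast_shapes(shape_x1, step1)   # (2, 5, 3)
print(shape_max)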
Example #4
def maximum_compute(x1, x2, y, kernel_name="maximum"):
    """dynamic maximum compute

    Parameters:
    ----------
    x1: TVM tensor
        input_x tensor.
    x2: TVM tensor
        input_y tensor.
    y: dict
        shape and dtype of output.
    kernel_name: str
        cce kernel name, default value is "maximum".

    Returns
    -------
    res: TVM tensor
        output tensor, has the same shape and type as input tensor.
    """

    shape_x = te.lang.dynamic.shape_to_list(x1.shape)
    shape_y = te.lang.dynamic.shape_to_list(x2.shape)
    shape1, shape2, shape_max = broadcast_shapes(shape_x,
                                                 shape_y,
                                                 param_name_input1="x1",
                                                 param_name_input2="x2")

    data1 = te.lang.dynamic.broadcast(x1, shape_max)
    data2 = te.lang.dynamic.broadcast(x2, shape_max)

    res = te.lang.dynamic.vmax(data1, data2)

    return res
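
The compute follows the usual broadcast-then-elementwise recipe, which the mul, sub, add and div examples below repeat. A NumPy analogue with small illustrative inputs:

import numpy as np

# Broadcast both inputs to the common shape, then apply the elementwise op.
x1 = np.array([[1.0], [4.0]], dtype=np.float32)   # shape (2, 1)
x2 = np.array([2.0, 3.0], dtype=np.float32)       # shape (2,)

shape_max = np.broadcast_shapes(x1.shape, x2.shape)
res = np.maximum(np.broadcast_to(x1, shape_max),
                 np.broadcast_to(x2, shape_max))
print(res)  # [[2. 3.] [4. 4.]]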
Example #5
def mul_compute(input1, input2, output, kernel_name="mul"):
    """
    calculating data's mul, c = a * b

    Parameters
    ----------
    input1: TVM tensor
        the placeholder of first input data
    input2: TVM tensor
        the placeholder of second input data
    output: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is mul

    Returns
    -------
    res : output of the data's mul
    """
    x0_shape = te.lang.dynamic.shape_to_list(input1.shape)
    x1_shape = te.lang.dynamic.shape_to_list(input2.shape)
    x0_shape, x1_shape, y_shape = broadcast_shapes(x0_shape, x1_shape,
                                                   param_name_input1="input1",
                                                   param_name_input2="input2")
    input1 = te.lang.dynamic.broadcast(input1, y_shape)
    input2 = te.lang.dynamic.broadcast(input2, y_shape)
    res = te.lang.dynamic.vmul(input1, input2)

    return res
Example #6
def _mul_compute(input_x, input_y, output_data, kernel_name="mul"):
    """
    calculating element-wise mul

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of first input data
    input_y: TVM tensor
        the placeholder of second input data
    output_data: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is "mul"

    Returns
    -------
    output of the element-wise mul
    """
    shape_x = te.lang.cce.util.shape_to_list(input_x.shape)
    shape_y = te.lang.cce.util.shape_to_list(input_y.shape)

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")
    if shape_x != shape_y and len(shape_x) == 2 and len(shape_y) == 2:
        res = _mul_compute_ex(input_x, input_y, shape_x, shape_y, shape_max)
        if res is not None:
            return res
    input_x = te.lang.cce.broadcast(input_x, shape_max)
    input_y = te.lang.cce.broadcast(input_y, shape_max)
    res = te.lang.cce.vmul(input_x, input_y)

    return res
Example #7
def sub_compute(input_x, input_y, output_z, kernel_name="sub"):
    """
    calculating data's sub, c = a - b

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of first input data
    input_y: TVM tensor
        the placeholder of second input data
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is sub

    Returns
    -------
    res : output of the data's sub
    """
    shape_x = te.lang.dynamic.shape_to_list(input_x.shape)
    shape_y = te.lang.dynamic.shape_to_list(input_y.shape)

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")
    input_x = te.lang.dynamic.broadcast(input_x, shape_max)
    input_y = te.lang.dynamic.broadcast(input_y, shape_max)
    res = te.lang.dynamic.vsub(input_x, input_y)

    return res
Example #8
def _check_shape_compatibility(shape_in, shape_out):
    """
    Check if the shape of input tensor is compatible with output tensor.

    Parameters:
    ----------
    shape_in : shape of input tensor.

    shape_out : shape of output tensor.

    Returns:
    -------
    comp_shape_in : new shape_in compatible with shape_out.
    """

    try:
        comp_shape_in, comp_shape_out, shape_max = broadcast_shapes(
            shape_in,
            shape_out,
            param_name_input1="value",
            param_name_input2="dims")
        if comp_shape_out != shape_max:
            raise ValueError('shape_in is not compatible with shape_out.')
    except RuntimeError:
        raise ValueError('shape_in is not compatible with shape_out.')
    return comp_shape_in
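
A NumPy sketch of the same check, assuming np.broadcast_shapes as a stand-in for the TBE helper; unlike the original, this sketch returns the common shape rather than the padded input shape:

import numpy as np

def check_shape_compatibility(shape_in, shape_out):
    # shape_in must broadcast to exactly shape_out, not merely to some
    # larger common shape.
    try:
        shape_max = np.broadcast_shapes(tuple(shape_in), tuple(shape_out))
    except ValueError:
        raise ValueError('shape_in is not compatible with shape_out.')
    if tuple(shape_out) != shape_max:
        raise ValueError('shape_in is not compatible with shape_out.')
    return shape_max

print(check_shape_compatibility((3, 1), (3, 4)))  # (3, 4)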
Example #9
def add_compute(input_x, input_y, output_z, kernel_name="add"):
    """
    calculating data's add, c = a + b

    Parameters
    ----------
    input_x:
    left input, may be dict or tensor

    input_y:
    right input, may be dict or tensor

    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is add

    Returns
    -------
    res : output of the data's add
    """
    shape_x = te.lang.dynamic.shape_to_list(input_x.shape)
    shape_y = te.lang.dynamic.shape_to_list(input_y.shape)

    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")

    input_x = te.lang.dynamic.broadcast(input_x, shape_max)
    input_y = te.lang.dynamic.broadcast(input_y, shape_max)
    res = te.lang.dynamic.vadd(input_x, input_y)

    return res
Example #10
def logical_or_compute(x1, x2, y, kernel_name="logical_or"):
    """
    algorithm : logical_or_compute
    calculating the value of x1 OR x2 element-wise

    Parameters
    ----------
    x1 : the placeholders of x1

    x2 : the placeholders of x2

    y : the dict of y

    kernel_name : string, cce kernel name, default value is "logical_or"

    Returns
    -------
    result res
    """
    _, _, shape_max = broadcast_shapes(
        te.lang.cce.util.shape_to_list(x1.shape),
        te.lang.cce.util.shape_to_list(x2.shape),
        param_name_input1="x1",
        param_name_input2="x2")
    x1 = te.lang.cce.cast_to(x1, "float16")
    x2 = te.lang.cce.cast_to(x2, "float16")
    x1 = te.lang.cce.broadcast(x1, shape_max)
    x2 = te.lang.cce.broadcast(x2, shape_max)
    res = te.lang.cce.vmax(x1, x2)
    res = te.lang.cce.cast_to(res, "int8")

    return res
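
Since both inputs hold only 0 and 1, the OR reduces to an elementwise max after the float16 cast. A NumPy trace of that trick:

import numpy as np

# OR computed as max() over {0, 1} values, mirroring the compute above.
x1 = np.array([0, 1, 0, 1], dtype=np.int8)
x2 = np.array([0, 0, 1, 1], dtype=np.int8)
res = np.maximum(x1.astype(np.float16),
                 x2.astype(np.float16)).astype(np.int8)
print(res)  # [0 1 1 1]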
Example #11
def real_div_compute(x1, x2, y, kernel_name="real_div"):
    """
    calculating data's realdiv, c = a / b

    Parameters
    ----------
    x1: TVM tensor
        the placeholder of first input data
    x2: TVM tensor
        the placeholder of second input data
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is real_div

    Returns
    -------
    res : output of the data's divide
    """
    shape_x = te.lang.dynamic.shape_to_list(x1.shape)
    shape_y = te.lang.dynamic.shape_to_list(x2.shape)
    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y)
    data_x = te.lang.dynamic.broadcast(x1, shape_max)
    data_y = te.lang.dynamic.broadcast(x2, shape_max)
    res = te.lang.dynamic.vdiv(data_x, data_y)

    return res
Example #12
def logical_or(x1, x2, y, kernel_name="logical_or"):
    """
    algorithm : logical_or
    calculating the value of x1 OR x2 element-wise

    Parameters
    ----------
    x1 : the dict of x1,
         include shape and dtype,
         dtype support int8, the value only support 0, 1

    x2 : the dict of x2,
         include shape and dtype,
         dtype support int8, the value only support 0, 1

    y : the dict of y, include shape and dtype

    kernel_name : string, cce kernel name, default value is "logical_or"

    Returns
    -------
    None
    """

    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_x1 = x1.get("dtype")
    dtype_x2 = x2.get("dtype")
    if dtype_x1 == "bool" or dtype_x2 == "bool":
        dtype_x1 = "int8"
        dtype_x2 = "int8"

    check_shape(shape_x1, param_name="x1")
    check_shape(shape_x2, param_name="x2")

    check_tuple = ("int8", )
    check_dtype(dtype_x1, check_tuple, param_name="x1")
    check_dtype(dtype_x2, check_tuple, param_name="x2")

    shape_x1, shape_x2, shape_max = broadcast_shapes(shape_x1,
                                                     shape_x2,
                                                     param_name_input1="x1",
                                                     param_name_input2="x2")
    dtype = dtype_x1.lower()
    data_x1 = tvm.placeholder(shape_x1, name="data_x1", dtype=dtype)
    data_x2 = tvm.placeholder(shape_x2, name="data_x2", dtype=dtype)

    res = logical_or_compute(data_x1, data_x2, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    config = {
        "print_ir": False,
        "need_build": False,
        "name": kernel_name,
        "tensor_list": (data_x1, data_x2, res)
    }
    te.lang.cce.cce_build_code(schedule, config)
Example #13
def mul(x, y, output, kernel_name="mul"):
    """
    do element-wise mul operation between two input tensors

    Parameters:
    ----------
    x : dict.
        shape, dtype of input x
    y : dict.
        shape, dtype of input y
    output : dict.
        shape, dtype of output
    kernel_name : str.
        cce kernel name, default value is "mul"

    Returns
    -------
    None
    """
    # format_pattern = 1  Nz and vector
    # format_pattern = 2  vector and Nz
    # format_pattern = 0  Nz scalar  Nz Nz  ND ND
    format_pattern = _mul_check_format(x, y)
    shape_x, shape_y = _infer_shape(format_pattern, x, y)

    shape_x = util.scalar2tensor_one(shape_x)
    dtype_x = x.get("dtype").lower()
    shape_y = util.scalar2tensor_one(shape_y)
    dtype_y = y.get("dtype").lower()

    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_y, param_name="y")

    if dtype_x != dtype_y:
        raise RuntimeError("dtype of inputs should be consistent")
    dtype = dtype_x
    check_list = ("int32", "float16", "float32", "int16")
    op_utils.check_dtype(dtype, check_list, param_name="x")

    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if dtype_x == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but float32 is not supported on this platform")

    shape_x, shape_y, shape_max = op_utils.broadcast_shapes(
        shape_x, shape_y, param_name_input1="x", param_name_input2="y")

    shape_x, shape_y = op_utils.refine_shapes_for_broadcast(shape_x, shape_y)
    input_x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    input_y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    res = _mul_compute(input_x, input_y, output, kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {"name": kernel_name, "tensor_list": (input_x, input_y, res)}
    te.lang.cce.cce_build_code(sch, config)
Example #14
def dequantize_compute(x,
                       min_range,
                       max_range,
                       y,
                       mode="MIN_COMBINED",
                       kernel_name="dequantize"):
    """
    Computation for dequantizing the 'input' tensor into a float tensor.

    Parameters:
    ----------
    x: input data, dtype must be one of the following:
      only support `int8`, `uint8`, `int32`,

    min_range: input min_range, dtype must be `float32`.
      The minimum scalar value possibly produced for the input.

    max_range: input max_range, dtype must be `float32`.
      The maximum scalar value possibly produced for the input.

    y: the dict of output_data, dtype must be `float32`.

    mode: An optional `string` from: `"MIN_COMBINED", "MIN_FIRST", "SCALED"`.
      Defaults to `"MIN_COMBINED"`.

    kernel_name : cce kernel name, default value is "dequantize".

    Returns
    -------
    res : output of the dequantization's computation.
    """

    input_tensor = x

    shape_x = te.lang.cce.util.shape_to_list(x.shape)
    shape_range = te.lang.cce.util.shape_to_list(max_range.shape)

    shape_x, shape_range, shape_max = op_utils.broadcast_shapes(
        shape_x,
        shape_range,
        param_name_input1="x",
        param_name_input2="max_range")

    broadcast_min_range = te.lang.cce.broadcast(min_range, shape_max)
    broadcast_max_range = te.lang.cce.broadcast(max_range, shape_max)

    if mode == "MIN_COMBINED":
        res = _min_combined_mode_compute(input_tensor, broadcast_min_range,
                                         broadcast_max_range)

    elif mode == "MIN_FIRST":
        res = _min_first_mode_compute(input_tensor, broadcast_min_range,
                                      broadcast_max_range)

    elif mode == "SCALED":
        res = _scaled_mode_compute(input_tensor, broadcast_max_range)

    return res
Example #15
def atan2_compute(y, x, output_dict, kernel_name="atan2"):
    """
    Algorithm: atan2
    ----------------------------------
    Parameters:

        y: Input data y.

        x: Input data x.

        kernel_name: cce kernel name, default value is "atan2"
    ----------------------------------
    Returns:

        A Tensor of atan2(y, x).

    """

    shape_y = y.shape
    dtype_y = y.dtype
    shape_x = x.shape

    shape_y = te.lang.cce.util.shape_to_list(shape_y)
    shape_x = te.lang.cce.util.shape_to_list(shape_x)
    shape_y, shape_x, shape_broadcast = broadcast_shapes(shape_y, shape_x, param_name_input1="x1", param_name_input2="x2")
    y = te.lang.cce.broadcast(y, shape_broadcast)
    x = te.lang.cce.broadcast(x, shape_broadcast)

    if dtype_y == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        x = te.lang.cce.cast_to(x, "float32")

    mask = _init_atan2_mask(y, x)

    # calculate atan(y/x) for the x > 0 case
    res = te.lang.cce.vdiv(y, x)
    res = _atan_compute(res)

    y_cmp_zero = te.lang.cce.vmuls(mask[CONST_ONE],
                                   tvm.const(CONST_PI_BY_TWO, y.dtype))
    res_x_lt_zero = te.lang.cce.vmuls(mask[CONST_ZERO],
                                      tvm.const(CONST_PI, y.dtype))

    if x.dtype == res.dtype and api_check_support("te.lang.cce.vcmpsel", x.dtype):
        res = te.lang.cce.vcmpsel(x, tvm.const(CONST_ZERO, x.dtype), 'eq', y_cmp_zero, res)
    else:
        tensor_zero = te.lang.cce.broadcast(tvm.const(CONST_ZERO, x.dtype), shape_broadcast)
        x_equal_zero = te.lang.cce.vcmp(x, tensor_zero, 'eq')
        res = te.lang.cce.vsel(x_equal_zero, y_cmp_zero, res)

    res = te.lang.cce.vadd(res, res_x_lt_zero)

    if dtype_y == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
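
The helper _init_atan2_mask is not shown in this listing, but one plausible reading of the mask arithmetic is the standard quadrant correction: start from atan(y/x), add a signed pi when x < 0, and fall back to a signed pi/2 when x == 0. A scalar sketch of that correction:

import math

# Scalar sketch of the quadrant correction the masks implement.
def atan2_sketch(y, x):
    if x == 0.0:
        return math.copysign(math.pi / 2, y)
    res = math.atan(y / x)
    if x < 0.0:
        res += math.copysign(math.pi, y)
    return res

print(atan2_sketch(1.0, -1.0), math.atan2(1.0, -1.0))  # both ~2.3562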
Example #16
def atan2(x1, x2, y, kernel_name="atan2"):
    """
    Algorithm: arctan2
        arctan2(y, x) = arctan(y/x)
    ----------------------------------
    Parameters:

        x1: the dict of input data x1, only support float16, float32.

        x2: the dict of input data x2, only support float16, float32.

        y: the dict of output

        kernel_name: default value is "atan2".
    ----------------------------------
    Returns:
        None
    """

    y_shape = x1.get("shape")
    x_shape = x2.get("shape")

    y_dtype = x1.get("dtype")
    x_dtype = x2.get("dtype")

    check_shape(y_shape, param_name="x1")
    check_shape(x_shape, param_name="x2")

    shape_y, shape_x, shape_max = broadcast_shapes(
        y_shape, x_shape, param_name_input1="x1", param_name_input2="x2")

    check_list = ("float16", "float32")
    check_dtype(y_dtype, check_list, param_name="x1")
    check_dtype(x_dtype, check_list, param_name="x2")
    if y_dtype.lower() != x_dtype.lower():
        raise RuntimeError("The input tensor must have identical dtype!")
    shape_y, shape_x = refine_shapes_for_broadcast(shape_y, shape_x)
    input_y = tvm.placeholder(shape_y, dtype=y_dtype.lower(), name="input_y")
    input_x = tvm.placeholder(shape_x, dtype=x_dtype.lower(), name="input_x")

    res = atan2_compute(input_y, input_x, y, kernel_name)
    res = te.lang.cce.cast_to(res, x_dtype.lower())
    with tvm.target.cce():
        auto_sch = topi.generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": (input_y, input_x, res),
        "print_ir": False,
        "bool_storage_as_1bit": False
    }

    te.lang.cce.cce_build_code(auto_sch, config)
Example #17
def floor_div_compute(input_x, input_y, output_z, kernel_name='floor_div'):
    """
       floordiv compute
       calculating data's floordiv, res = floor(x / y)

       Parameters
       ----------
       input_x: TVM tensor
           the placeholder of input_x
       input_y: TVM tensor
           the placeholder of input_y
       output_z: dict
           dict of output
       kernel_name: str
           kernel name, default value is "floor_div"

       Returns
       -------
       res: TVM tensor
           the result of floordiv compute
    """
    dtype_x = input_x.dtype
    input_x_shape = te.lang.dynamic.shape_to_list(input_x.shape)
    input_y_shape = te.lang.dynamic.shape_to_list(input_y.shape)
    input_x_shape, input_y_shape, shape_broad = \
        broadcast_shapes(input_x_shape, input_y_shape,
                         param_name_input1="input_x",
                         param_name_input2="input_y")

    if dtype_x != "float16" and tbe_platform.cce_conf.api_check_support(
            "te.lang.dynamic.vdiv", "float32"):
        input_x = te.lang.dynamic.cast_to(input_x, 'float32')
        input_y = te.lang.dynamic.cast_to(input_y, 'float32')

        input_x = te.lang.dynamic.broadcast(input_x, shape_broad)
        input_y = te.lang.dynamic.broadcast(input_y, shape_broad)
    else:
        input_x = te.lang.dynamic.broadcast(input_x, shape_broad)
        input_y = te.lang.dynamic.broadcast(input_y, shape_broad)

    res = te.lang.dynamic.vdiv(input_x, input_y)

    if dtype_x != "float16" and tbe_platform.cce_conf.get_soc_spec(
            "SOC_VERSION") == "Ascend310":
        res = te.lang.dynamic.cast_to(res, "float16")

    res = te.lang.dynamic.floor(res)

    res = te.lang.dynamic.cast_to(res, dtype_x)

    return res
Example #18
def greater_equal_compute(input_x,
                          input_y,
                          output_z,
                          kernel_name="greater_equal"):
    """
    if x is greater than or equal to y, then return 1, else return 0.

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of input_x, has shape, dtype and range attributes
    input_y: TVM tensor
        the placeholder of input_y, has shape, dtype and range attributes
    output_z: dict
        dict info of output_z
    kernel_name: str
        cce kernel name, default value is "greater_equal"

    Returns
    -------
    res: TVM tensor
        the result of compute
    """
    shape_x = te.lang.dynamic.shape_to_list(input_x.shape)
    shape_y = te.lang.dynamic.shape_to_list(input_y.shape)
    shape_x, shape_y, shape_max = broadcast_shapes(shape_x,
                                                   shape_y,
                                                   param_name_input1="input_x",
                                                   param_name_input2="input_y")

    dtype_x = input_x.dtype
    if dtype_x in ("int8", "uint8"):
        input_x = te.lang.dynamic.cast_to(input_x, "float16")
        input_y = te.lang.dynamic.cast_to(input_y, "float16")
        dtype_x = "float16"

    input_x = te.lang.dynamic.broadcast(input_x, shape_max)
    input_y = te.lang.dynamic.broadcast(input_y, shape_max)

    if dtype_x == "float32":
        # minimun num of float32 2**(-126)
        data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype_x)
    elif dtype_x == "float16":
        # minimun num of float16 2**(-24)
        data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype_x)
    else:
        # minimun num of int32 1
        data_min = tvm.const(SCALAR_ONE, dtype=dtype_x)

    return _greater_equal_compare((input_x, input_y), shape_max, dtype_x,
                                  data_min)
Example #19
def div_compute(input_x, input_y, output_z, kernel_name="div"):
    """
    div compute
    calculating data's div, res = x / y

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of input_x
    input_y: TVM tensor
        the placeholder of input_y
    output_z: dict
        dict with keys(shape and dtype) of output
    kernel_name: str
        kernel name, default value is "div"

    Returns
    -------
    res: TVM tensor
        the result of div compute
    """
    x_shape = te.lang.dynamic.shape_to_list(input_x.shape)
    y_shape = te.lang.dynamic.shape_to_list(input_y.shape)
    x_shape, y_shape, z_shape = broadcast_shapes(x_shape,
                                                 y_shape,
                                                 param_name_input1="input_x",
                                                 param_name_input2="input_y")
    dtype_x = input_x.dtype
    int_list = ("int8", "uint8", "int32")
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.vdiv",
                                               "float32"):
        input_x = te.lang.dynamic.cast_to(input_x, "float32")
        input_y = te.lang.dynamic.cast_to(input_y, "float32")
    input_x = te.lang.dynamic.broadcast(input_x, z_shape)
    input_y = te.lang.dynamic.broadcast(input_y, z_shape)
    res = te.lang.dynamic.vdiv(input_x, input_y)

    if dtype_x in int_list:
        if tbe_platform.cce_conf.get_soc_spec("SOC_VERSION") == "Ascend310":
            res = te.lang.dynamic.cast_to(res, "float16")
        res = te.lang.dynamic.floor(res)

    res = te.lang.dynamic.cast_to(res, dtype_x)

    return res
Example #20
def less_compute(input_x, input_y, output_z, kernel_name="less"):
    """
    if x is less than y, then return 1, else return 0.

    Parameters:
    ----------
    input_x: TVM tensor
        the placeholder of first input data
    input_y: TVM tensor
        the placeholder of second input data
    output_z: dict
        shape and dtype of output, should be broadcast shape and type as input
    kernel_name: str
        cce kernel name, default value is less

    Returns
    -------
    the result
    """
    shape_x = te.lang.dynamic.shape_to_list(input_x.shape)
    shape_y = te.lang.dynamic.shape_to_list(input_y.shape)
    shape_x, shape_y, shape_max = broadcast_shapes(shape_x, shape_y, param_name_input1="input_x",
                                                   param_name_input2="input_y")

    dtype_x = input_x.dtype
    if dtype_x in ("uint8", "int8"):
        input_x = te.lang.dynamic.cast_to(input_x, "float16")
        input_y = te.lang.dynamic.cast_to(input_y, "float16")
        dtype_x = "float16"

    input_x = te.lang.dynamic.broadcast(input_x, shape_max)
    input_y = te.lang.dynamic.broadcast(input_y, shape_max)

    if dtype_x == "float32":
        # minimun num of float32 2**(-126)
        data_min = tvm.const(SCALAR_MIN_FP32, dtype=dtype_x)
    elif dtype_x == "float16":
        # minimun num of float16 2**(-24)
        data_min = tvm.const(SCALAR_MIN_FP16, dtype=dtype_x)
    else:
        # minimun num of int32 1
        data_min = tvm.const(SCALAR_ONE, dtype=dtype_x)

    return _less_compare((input_x, input_y), shape_max, dtype_x, data_min)
Example #21
def masked_fill_compute(x, mask, value, y, kernel_name="masked_fill"):
    '''
    calculating masked_fill
    :param x: TVM tensor
                   the output of previous layer
    :param mask: TVM tensor
                    mask dtype is bool
    :param value: scalar or TVM tensor
                    the value to fill in with
    :param kernel_name: str
                    kernel name, default value is "masked_fill"
    :return: y
            TVM tensor
    '''

    ori_dtype = x.dtype
    if x.dtype in ('int8', 'int32'):
        x = te.lang.cce.cast_to(x, 'float16')

    x_shape = te.lang.cce.util.shape_to_list(x.shape)
    mask_shape = te.lang.cce.util.shape_to_list(mask.shape)
    # compute the output shape
    x_shape, mask_shape, target_shape = op_utils.broadcast_shapes(
        x_shape, mask_shape)
    target_dtype = x.dtype
    mask = te.lang.cce.cast_to(mask, x.dtype)
    value = te.lang.cce.cast_to(value, x.dtype)

    mask = te.lang.cce.broadcast(mask, target_shape)
    tensor_ones = te.lang.cce.broadcast(tvm.const(1, target_dtype),
                                        target_shape)
    value = te.lang.cce.broadcast(value, target_shape)
    x = te.lang.cce.broadcast(x, target_shape)
    y = te.lang.cce.vcmpsel(mask, tensor_ones, 'eq', value, x)

    if y.dtype != ori_dtype:
        y = te.lang.cce.cast_to(y, ori_dtype)

    return y
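
The vcmpsel call above is an elementwise select: where mask equals 1 take value, elsewhere keep x. A NumPy analogue with hypothetical inputs:

import numpy as np

# Select value where mask == 1, keep x elsewhere.
x = np.array([1.0, 2.0, 3.0], dtype=np.float16)
mask = np.array([0.0, 1.0, 0.0], dtype=np.float16)
value = np.float16(9.0)

y = np.where(mask == 1.0, value, x)
print(y)  # [1. 9. 3.]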
Example #22
def floor_mod_compute(x1, x2, y, kernel_name="floor_mod"):
    """
    Compute remainder of division
    res = x1 - floor(x1 / x2) * x2

    Parameters
    ----------
    x1: TVM tensor
        input tensor has shape, dtype and range attributes
    x2: TVM tensor
        input tensor has shape, dtype and range attributes
    y: dict
        dict with keys(shape, dtype and range) of output
    kernel_name : str
        cce kernel name, default value is "floor_mod"

    Returns
    ------
    res: TVM tensor
        the calculation results
    """

    dtype = x1.dtype
    shape_x = te.lang.dynamic.shape_to_list(x1.shape)
    shape_y = te.lang.dynamic.shape_to_list(x2.shape)

    shape_x, shape_y, shape = broadcast_shapes(shape_x, shape_y,
                                               param_name_input1="x1",
                                               param_name_input2="x2")

    # calculate result, using float32 for better precision
    has_improve_precision = False
    input_x_fp32 = x1
    input_y_fp32 = x2
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.vdiv",
                                               "float32"):
        input_x_fp32 = te.lang.dynamic.cast_to(x1, "float32")
        input_y_fp32 = te.lang.dynamic.cast_to(x2, "float32")
        has_improve_precision = True

    input_x_fp32 = te.lang.dynamic.broadcast(input_x_fp32, shape)
    input_y_fp32 = te.lang.dynamic.broadcast(input_y_fp32, shape)

    res = te.lang.dynamic.vdiv(input_x_fp32, input_y_fp32)

    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.floor",
                                               res.dtype):
        res = te.lang.dynamic.floor(res)
    else:
        res = te.lang.dynamic.cast_to(res, "float16")
        res = te.lang.dynamic.floor(res)

    if dtype != "int32":
        if has_improve_precision:
            res = te.lang.dynamic.cast_to(res, "float32")
        else:
            res = te.lang.dynamic.cast_to(res, "float16")
        res = te.lang.dynamic.vmul(res, input_y_fp32)
        res = te.lang.dynamic.vsub(input_x_fp32, res)
        if has_improve_precision:
            res = te.lang.dynamic.cast_to(res, dtype)
    else:
        x2_broad = te.lang.dynamic.broadcast(x2, shape)
        x1_broad = te.lang.dynamic.broadcast(x1, shape)
        res = te.lang.dynamic.vmul(res, x2_broad)
        res = te.lang.dynamic.vsub(x1_broad, res)

    return res
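
The identity in the docstring can be checked on plain floats; note the result's sign follows the divisor, matching Python's own % operator:

import math

# res = x1 - floor(x1 / x2) * x2, checked against Python's modulo.
x1, x2 = -7.0, 3.0
res = x1 - math.floor(x1 / x2) * x2
print(res, x1 % x2)  # 2.0 2.0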
Example #23
def leaky_relu_grad_compute(g,
                            x,
                            y,
                            negative_slope=0,
                            kernel_name="leaky_relu_grad"):
    """
    calculate the backpropagation of leaky_relu operation
    y = gradients(x>0) or negative_slope*gradients(x<=0).

    Parameters
    ----------
    g : TVM tensor
        the placeholder of input g
    x : TVM tensor
        the placeholder of input x
    y : dict
        dict of output y, include keys(shape and dtype)
    negative_slope : float or int
        allow non-zero slope for negative inputs to speed up optimization
    kernel_name : str
        kernel name, default value is "leaky_relu_grad"

    Returns
    -------
    res: TVM tensor
        the result of leaky_relu_grad_compute
    """

    shape_list = broadcast_shapes(te.lang.dynamic.shape_to_list(g.shape),
                                  te.lang.dynamic.shape_to_list(x.shape))
    dtype = g.dtype
    g = te.lang.dynamic.broadcast(g, shape_list[2])
    x = te.lang.dynamic.broadcast(x, shape_list[2])

    if dtype == "float32":
        help_min = tvm.const(2**(-126), "float32")
        help_rec_one = tvm.const(2**38, "float32")
        help_rec_sec = tvm.const(2**44, "float32")
    elif dtype == "float16":
        help_min = tvm.const(2**(-24), "float16")
        help_rec_one = tvm.const(2**12, "float16")
        help_rec_sec = help_rec_one

    tmp_min_x = te.lang.dynamic.vmins(x, help_min)
    tmp_max_x = te.lang.dynamic.vmaxs(tmp_min_x,
                                      tvm.const(SCALAR_ZERO, "float32"))
    tmp_mul_x = te.lang.dynamic.vmuls(tmp_max_x, help_rec_one)

    if dtype == "float32":
        tmp_mul_x = te.lang.dynamic.vmuls(tmp_mul_x, help_rec_sec)

    result_tmp_right = te.lang.dynamic.vmuls(tmp_mul_x, help_rec_sec)

    result_sub = te.lang.dynamic.vadds(result_tmp_right,
                                       tvm.const(NEGATIVE_ONE, "float32"))
    result_abs = te.lang.dynamic.vabs(result_sub)
    result_tmp_left = te.lang.dynamic.vmuls(result_abs, negative_slope)

    result_tmp = te.lang.dynamic.vadd(result_tmp_left, result_tmp_right)

    res = te.lang.dynamic.vmul(g, result_tmp)
    return res
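
For float32, the constants multiply out to exactly one: help_min * help_rec_one * help_rec_sec**2 = 2**(-126) * 2**38 * 2**44 * 2**44 = 2**0, so clamping x into [0, 2**(-126)] and scaling back yields an exact 0/1 step function. A NumPy trace with illustrative inputs:

import numpy as np

# Float32 trace of the step-function trick above.
x = np.array([-2.0, 0.0, 3.0], dtype=np.float32)
g = np.ones(3, dtype=np.float32)
negative_slope = 0.1

step = np.maximum(np.minimum(x, np.float32(2.0**-126)), np.float32(0.0))
step = step * np.float32(2.0**38) * np.float32(2.0**44) * np.float32(2.0**44)
grad = np.float32(negative_slope) * np.abs(step - 1.0) + step
print(g * grad)  # [0.1 0.1 1. ]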
Example #24
def dequantize(x,
               min_range,
               max_range,
               y,
               mode="MIN_COMBINED",
               kernel_name="dequantize"):
    """
    Dequantize the 'input' tensor into a float tensor.

    [min_range, max_range] are scalar floats that specify the range for
    the 'input' data.

    The 'mode' attribute controls exactly which calculations are used
    to convert the quantized values to their float equivalents.


    In 'MIN_COMBINED' mode,
    each value of the tensor will undergo the following:

    ```
    if T == int8 or T == int32: in[i] += (range(T) + 1) / 2.0
    out[i] = min_range + (in[i] * (max_range - min_range) / range(T))
    ```
    here `range(T) = numeric_limits<T>::max() - numeric_limits<T>::min()`

    Note that if the quantized type is int8, the operation will additionally
    add 128 to each value prior to casting.


    If the mode is 'MIN_FIRST', then this approach is used:

    ```
    num_discrete_values = 1 << (# of bits in T)
    range_adjust = num_discrete_values / (num_discrete_values - 1)
    range = (range_max - range_min) * range_adjust
    range_scale = range / num_discrete_values
    if T == int32:
        result = range_min + ((input - numeric_limits<T>::min()) * range_scale)
    else if T == int8 or T == uint8:
        least_quantized = -round(min_range * ((1 << num_bits) - 1) /
                            (max_range - min_range))
        offset = min_range + least_quantized * 1.0 * (max_range - min_range) /
                            ((1 << num_bits) - 1)
        res_tmp = range_min + ((input - numeric_limits<T>::min()) * range_scale)
        result = res_tmp - offset
    ```


    In `SCALED` mode,

    ```
    m = input_max
    num_bits = sizeof(T) * 8
    if T == int8 or T == int32:
        [min_fixed, max_fixed] =
            [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1]
        s = (2.0 * m) / (max_fixed - min_fixed)
    if T == uint8:
        [min_fixed, max_fixed] = [0, (1 << num_bits) - 1]
        s = 1.0 * m / (max_fixed - min_fixed)
    result = input * s
    ```

    Parameters:
    ----------
    x: the dict of x, dtype must be one of the following:
      cloud version only supports `int8`, `uint8`, `int32`,
      mini version only supports `int8`, `uint8`.

    min_range: the dict of input_min_range, dtype must be `float32`.
      The minimum scalar value possibly produced for the input.

    max_range: the dict of input_max_range, dtype must be `float32`.
      The maximum scalar value possibly produced for the input.

    y: the dict of output_data, dtype must be `float32`.

    mode: An optional `string` from: `"MIN_COMBINED", "MIN_FIRST", "SCALED"`.
      Defaults to `"MIN_COMBINED"`.

    kernel_name : cce kernel name, default value is "dequantize"

    Returns
    -------
    None
    """
    shape_x = x.get("shape")
    shape_input_min_range = min_range.get("shape")
    shape_input_max_range = max_range.get("shape")
    shape_output_data = y.get("shape")
    if len(shape_input_min_range) != len(shape_input_max_range):
        raise RuntimeError("shape_input_min_range and shape_input_max_range"
                           " must be equal")
    if shape_output_data != shape_x:
        raise RuntimeError("shape_output_data and shape_x must be equal.")
    shape_range = shape_input_min_range
    op_utils.check_shape(shape_x, param_name="x")
    op_utils.check_shape(shape_range, param_name="min_range")

    dtype_x = x.get("dtype")
    dtype_input_min_range = min_range.get("dtype")
    dtype_input_max_range = max_range.get("dtype")
    dtype_output_data = y.get("dtype")
    dtype_x = dtype_x.lower()
    dtype_input_min_range = dtype_input_min_range.lower()
    dtype_input_max_range = dtype_input_max_range.lower()
    dtype_output_data = dtype_output_data.lower()
    check_list = ("int8", "uint8", "int32")
    s322f32_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.cast_to", "s322f32")
    if dtype_x == "int32" and not s322f32_support:
        raise RuntimeError("not support on the platform")
    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if not vmul_support:
        raise RuntimeError("not support on the platform")
    op_utils.check_dtype(dtype_x, check_list, param_name="x")
    op_utils.check_dtype(dtype_input_min_range, ("float32", ),
                         param_name="min_range")
    op_utils.check_dtype(dtype_input_max_range, ("float32", ),
                         param_name="max_range")
    op_utils.check_dtype(dtype_output_data, ("float32", ), param_name="y")

    if mode not in ("MIN_COMBINED", "MIN_FIRST", "SCALED"):
        raise RuntimeError(
            "mode only support MIN_COMBINED, MIN_FIRST, SCALED.")

    shape_x, shape_range, _ = op_utils.broadcast_shapes(
        shape_x,
        shape_range,
        param_name_input1="x",
        param_name_input2="min_range")

    shape_x, shape_range = op_utils.refine_shapes_for_broadcast(
        shape_x, shape_range)
    input_tensor = tvm.placeholder(shape_x, dtype=dtype_x, name="x")
    min_range = tvm.placeholder(shape_range,
                                dtype="float32",
                                name="input_min_range")
    max_range = tvm.placeholder(shape_range,
                                dtype="float32",
                                name="input_max_range")

    res = dequantize_compute(input_tensor, min_range, max_range, y, mode,
                             kernel_name)

    with tvm.target.cce():
        sch = generic.auto_schedule(res)

    config = {
        "name": kernel_name,
        "tensor_list": [input_tensor, min_range, max_range, res],
        "dummy_placeholder": True
    }
    te.lang.cce.cce_build_code(sch, config)
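
A worked MIN_COMBINED example on plain Python floats, assuming a uint8 input (so range(T) = 255 and no +128 shift) and an illustrative range of [-1.0, 1.0]:

# out[i] = min_range + (in[i] * (max_range - min_range) / range(T))
min_range, max_range = -1.0, 1.0
q = 128                                             # quantized input value
out = min_range + q * (max_range - min_range) / 255
print(out)                                          # 0.00392..., i.e. 1/255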
Example #25
def addcmul(input_data, x1, x2, y, value=1.0, kernel_name="addcmul"):
    """
    algorithm: addcmul
    calculating data's addcmul, y = input_data + value * (x1 * x2)

    Parameters
    ----------
    input_data : dict
        shape and dtype of first input, only support float16, float32, int32, int8, uint8
    x1 : dict
        shape and dtype of second input, only support float16, float32, int32, int8, uint8
    x2 : dict
        shape and dtype of third input, only support float16, float32, int32, int8, uint8
    y: dict
        shape and dtype of output, should be broadcast shape and type as input
    value: float
        scaling coefficient, default value is 1.0
    kernel_name : str
        cce kernel name, default value is addcmul

    Returns
    -------
    None
    """
    shape_input = input_data.get("shape")
    shape_x1 = x1.get("shape")
    shape_x2 = x2.get("shape")
    dtype_input = input_data.get("dtype").lower()
    dtype_x1 = x1.get("dtype").lower()
    dtype_x2 = x2.get("dtype").lower()

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_input)
    util.check_shape_size(shape_input, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x1)
    util.check_shape_size(shape_x1, SHAPE_SIZE_LIMIT)
    util.check_shape_rule(shape_x2)
    util.check_shape_size(shape_x2, SHAPE_SIZE_LIMIT)

    check_list = ("float16", "float32", "int32", "int8", "uint8")
    util.check_dtype_rule(dtype_input, check_list)
    util.check_dtype_rule(dtype_x1, check_list)
    util.check_dtype_rule(dtype_x2, check_list)
    if dtype_input != dtype_x1 or dtype_input != dtype_x2:
        raise RuntimeError("the type of input_data, x1, x2 must be same")

    shape_x1, shape_x2, shape_max1 = broadcast_shapes(shape_x1, shape_x2)
    util.check_tensor_shape_size(shape_max1)
    shape_input, _, shape_max = broadcast_shapes(shape_input, shape_max1)
    util.check_tensor_shape_size(shape_max)
    shape_x1, _, _ = broadcast_shapes(shape_x1, shape_max)
    shape_x2, _, _ = broadcast_shapes(shape_x2, shape_max)

    data_input = tvm.placeholder(shape_input, dtype=dtype_input, name="data_input")
    data_x1 = tvm.placeholder(shape_x1, dtype=dtype_x1, name="data_x1")
    data_x2 = tvm.placeholder(shape_x2, dtype=dtype_x2, name="data_x2")
    res = addcmul_compute(data_input, data_x1, data_x2, shape_max, y, value, kernel_name="addcmul")

    with tvm.target.cce():
        schedule = generic.auto_schedule(res)

    tensor_list = [data_input, data_x1, data_x2, res]

    config = {"print_ir": False,
              "name": kernel_name,
              "tensor_list": tensor_list}

    te.lang.cce.cce_build_code(schedule, config)
Example #26
def less_equal_compute(input_x, input_y, output_z, kernel_name="less_equal"):
    """
    compute for less_equal

    Parameters
    ----------
    input_x: TVM tensor
        the placeholder of input_x
    input_y: TVM tensor
        the placeholder of input_y
    output_z: dict
        dict info of output_z
    kernel_name: str
        cce kernel name, default value is "less_equal"

    Returns
    -------
    res: TVM tensor
        the result of compute
    """
    dtype_x = input_x.dtype
    shape_x = te.lang.dynamic.shape_to_list(input_x.shape)
    shape_y = te.lang.dynamic.shape_to_list(input_y.shape)
    shape_x, shape_y, shape_broadcast = broadcast_shapes(
        shape_x,
        shape_y,
        param_name_input1="input_x",
        param_name_input2="input_y")

    if dtype_x == "float32":
        scalar_min = tvm.const(SCALAR_MIN_FP32, dtype="float32")
        scalar_mul = tvm.const(SCALAR_MUL_FP32, dtype="float32")
        scalar_mul1 = tvm.const(SCALAR_MUL2_FP32, dtype="float32")
        scalar_neg_one = tvm.const(SCALAR_NEG_ONE, dtype="float32")
    else:
        scalar_min = tvm.const(SCALAR_MIN_FP16, dtype="float16")
        scalar_mul = tvm.const(SCALAR_MUL_FP16, dtype="float16")
        scalar_neg_one = tvm.const(SCALAR_NEG_ONE, dtype="float16")

    if dtype_x in ("int8", "uint8"):
        input_x = te.lang.dynamic.cast_to(input_x, "float16")
        input_y = te.lang.dynamic.cast_to(input_y, "float16")

    input_x = te.lang.dynamic.broadcast(input_x, shape_broadcast)
    input_y = te.lang.dynamic.broadcast(input_y, shape_broadcast)

    res_max = te.lang.dynamic.vmax(input_x, input_y)
    res_vsub = te.lang.dynamic.vsub(input_y, res_max)
    if tbe_platform.cce_conf.api_check_support("te.lang.dynamic.vabs",
                                               res_vsub.dtype):
        res_vabs = te.lang.dynamic.vabs(res_vsub)
    else:
        res_vsub = te.lang.dynamic.cast_to(res_vsub, "float32")
        res_vabs = te.lang.dynamic.vabs(res_vsub)

    res_min = te.lang.dynamic.vmins(res_vabs, scalar_min)
    res_vmul = te.lang.dynamic.vmuls(res_min, scalar_mul)
    res_vmul1 = te.lang.dynamic.vmuls(res_vmul, scalar_mul)

    if dtype_x == "float32":
        res_vmul2 = te.lang.dynamic.vmuls(res_vmul1, scalar_mul1)
        res_vsub1 = te.lang.dynamic.vadds(res_vmul2, scalar_neg_one)
        res_vabs1 = te.lang.dynamic.vabs(res_vsub1)
    else:
        res_vsub1 = te.lang.dynamic.vadds(res_vmul1, scalar_neg_one)
        res_vabs1 = te.lang.dynamic.vabs(res_vsub1)

    res = te.lang.dynamic.cast_to(res_vabs1, "int8", True)

    return res
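
A plain-NumPy trace of the float32 path above, assuming SCALAR_MIN_FP32 = 2**(-126) and multipliers whose product is 2**126 (the actual constants are defined outside this listing): |y - max(x, y)| is zero exactly where x <= y, the clamp-and-scale turns that into a 0/1 flag, and |flag - 1| inverts it into the final result:

import numpy as np

# less_equal via max/sub/abs/min/mul, mirroring the compute above.
x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
y = np.array([2.0, 2.0, 2.0], dtype=np.float32)

diff = np.abs(y - np.maximum(x, y))                  # 0 exactly where x <= y
clipped = np.minimum(diff, np.float32(2.0**-126))    # 0 or exactly 2**(-126)
flag = clipped * np.float32(2.0**62) * np.float32(2.0**62) * np.float32(4.0)
res = np.abs(flag - 1.0).astype(np.int8)             # 1 where x <= y
print(res)  # [1 1 0]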
Example #27
def masked_fill(x, mask, value, y, kernel_name="masked_fill"):
    '''
    :param x: dict
                    shape and dtype of tensor x input
    :param mask: dict
                    shape and dtype of tensor mask,
                    can be broadcast to the shape of x
    :param value: dict
                    shape and dtype of value
    :param y: dict
                    the output of masked_fill
    :param kernel_name: str
                      kernel name, default value is "masked_fill"
    :return: none
    '''

    x_shape = x.get("shape")
    x_dtype = x.get("dtype")
    x_dtype_lower = x_dtype.lower()

    mask_shape = mask.get("shape")
    mask_dtype = mask.get("dtype")

    value_shape = value.get("shape")
    value_dtype = value.get("dtype")
    value_dtype_lower = value_dtype.lower()

    # check dtype
    x_dtype_list = ("float16", "float32", "int8", "int32")
    op_utils.check_dtype(x_dtype, x_dtype_list)

    mask_dtype_list = ("bool", "int8")
    op_utils.check_dtype(mask_dtype, mask_dtype_list)

    if mask_dtype == "bool":
        mask_dtype = "int8"

    value_dtype_list = ("float16", "float32", "int8", "int32")
    op_utils.check_dtype(value_dtype, value_dtype_list)

    # check shape
    op_utils.check_shape(x_shape)
    op_utils.check_shape(mask_shape)
    op_utils.check_shape(value_shape)

    # check broadcast shape
    x_shape, mask_shape, out_shape = op_utils.broadcast_shapes(
        x_shape, mask_shape)
    op_utils.check_shape(out_shape)

    # check kernel_name
    util.check_kernel_name(kernel_name)

    pos_mask_shape = tuple(
        [1] * (len(x_shape) - len(mask_shape))) + tuple(mask_shape)
    data_x = tvm.placeholder(x_shape, dtype=x_dtype_lower, name="data_x")

    data_mask = tvm.placeholder(pos_mask_shape,
                                dtype=mask_dtype,
                                name="data_mask")

    data_value = tvm.placeholder(pos_mask_shape,
                                 dtype=value_dtype_lower,
                                 name="data_value")

    y = masked_fill_compute(data_x, data_mask, data_value, y, kernel_name)

    with tvm.target.cce():
        schedule = generic.auto_schedule(y)

    config = {
        "name": kernel_name,
        "tensor_list": [data_x, data_mask, data_value, y],
    }
    te.lang.cce.cce_build_code(schedule, config)