Example No. 1
def relu_compute(x, y, kernel_name="relu"):
    """
    Algorithm : relu(x) = max(x, 0)

    Parameters
    ----------
    x: the placeholder of data input

    y : the dict of output

    kernel_name : cce kernel name

    Returns
    -------
    res : result of relu
    """

    inp_dtype = x.dtype
    shape = x.shape
    compatible_dtype = x.dtype

    if inp_dtype == 'int8' and api_check_support('te.lang.cce.cast_to',
                                                 's82f16'):
        x = te.lang.cce.cast_to(x, 'float16')
        compatible_dtype = 'float16'
    if api_check_support('te.lang.cce.vrelu', compatible_dtype):
        data_res = te.lang.cce.vrelu(x)
    else:
        tensor_zero = te.lang.cce.broadcast(
            tvm.const(CONST_ZERO, compatible_dtype), shape)
        data_res = te.lang.cce.vmax(x, tensor_zero)

    data_res = te.lang.cce.cast_to(data_res, inp_dtype)

    return data_res
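
For reference, a minimal NumPy sketch of the fallback branch above (broadcast a
zero tensor, then take the element-wise max); the sample values are assumptions
for illustration, not part of the operator:

import numpy as np

# hypothetical sample input; the real operator works on TVM tensors
x = np.array([-2.0, -0.5, 0.0, 1.5], dtype=np.float32)

# fallback path: broadcast zero and take the element-wise maximum
tensor_zero = np.zeros_like(x)
data_res = np.maximum(x, tensor_zero)
print(data_res)  # [0.  0.  0.  1.5]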
Example No. 2
def atan2_compute(y, x, output_dict, kernel_name="atan2"):
    """
    Algorithm: atan2
    ----------------------------------
    Parameters:

        y: Input data y.

        x: Input data x.

        output_dict: the dict of output.

        kernel_name: cce kernel name, default value is "atan2"
    ----------------------------------
    Returns:

        A Tensor of atan2(y, x).

    """

    shape_y = y.shape
    dtype_y = y.dtype
    shape_x = x.shape

    shape_y = te.lang.cce.util.shape_to_list(shape_y)
    shape_x = te.lang.cce.util.shape_to_list(shape_x)
    shape_y, shape_x, shape_broadcast = broadcast_shapes(shape_y, shape_x, param_name_input1="x1", param_name_input2="x2")
    y = te.lang.cce.broadcast(y, shape_broadcast)
    x = te.lang.cce.broadcast(x, shape_broadcast)

    if dtype_y == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        x = te.lang.cce.cast_to(x, "float32")

    mask = _init_atan2_mask(y, x)

    # calculate atan(y/x) for the x > 0 case
    res = te.lang.cce.vdiv(y, x)
    res = _atan_compute(res)

    y_cmp_zero = te.lang.cce.vmuls(mask[CONST_ONE],
                                   tvm.const(CONST_PI_BY_TWO, y.dtype))
    res_x_lt_zero = te.lang.cce.vmuls(mask[CONST_ZERO],
                                      tvm.const(CONST_PI, y.dtype))

    if x.dtype == res.dtype and api_check_support("te.lang.cce.vcmpsel", x.dtype):
        res = te.lang.cce.vcmpsel(x, tvm.const(CONST_ZERO, x.dtype), 'eq', y_cmp_zero, res)
    else:
        tensor_zero = te.lang.cce.broadcast(tvm.const(CONST_ZERO, x.dtype), shape_broadcast)
        x_equal_zero = te.lang.cce.vcmp(x, tensor_zero, 'eq')
        res = te.lang.cce.vsel(x_equal_zero, y_cmp_zero, res)

    res = te.lang.cce.vadd(res, res_x_lt_zero)

    if dtype_y == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
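
A NumPy sketch of the quadrant correction this kernel performs; the mask
helpers not shown above are replaced by explicit np.where calls, and the
sample points are assumptions (the y == 0, x < 0 corner case is skipped):

import numpy as np

y = np.array([1.0, 1.0, -1.0, 2.0], dtype=np.float32)
x = np.array([2.0, -2.0, 0.0, 3.0], dtype=np.float32)

# base value: atan(y/x); errstate hides the deliberate division by zero
with np.errstate(divide="ignore"):
    res = np.arctan(y / x)
# x == 0: replace with +/- pi/2 depending on the sign of y
res = np.where(x == 0, np.sign(y) * np.pi / 2, res)
# x < 0: shift by +/- pi to land in the correct quadrant
res = res + np.where(x < 0, np.sign(y) * np.pi, 0.0)

assert np.allclose(res, np.arctan2(y, x))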
Example No. 3
def bessel_i1e_compute(x, y, kernel_name="bessel_i1e"):
    """
    Algorithm:
    I0 = 1 + ( (z/2) / (1!) )^2 + ((z/2)^2 / (2!))^2 + ... + ((z/2)^n / (n!)) ^2
    I0e = I0 / exp(x)
    I1e = I0e * z / (2*(k+1))
    u = 4 * v^2
    Ive = (1 - (u-1)/(8*z) + (u-1)*(u-9)/(2! * (8*z)^2) - (u-1)*(u-9)*(u-25)/(3!*(8*z)^3))
          /sqrt(2*pi*z)

    Parameters
    ----------
    x: the placeholder of data input

    y: the dict of output

    kernel_name: cce kernel name, default value is "bessel_i1e"

    Returns
    -------
    A tensor. Has the same type as x.
    """

    shape_input = x.shape
    dtype_input = x.dtype

    # choose the compute dtype at the beginning
    if dtype_input == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    abs_data = te.lang.cce.vabs(x)

    broad_const_limit = te.lang.cce.broadcast(tvm.const(CONST_LIMIT, x.dtype), shape_input)
    before_res = _before_res_compute(abs_data, broad_const_limit)
    after_res = _after_res_compute(abs_data, broad_const_limit)

    if abs_data.dtype == before_res.dtype and \
            api_check_support("te.lang.cce.vcmpsel", abs_data.dtype):
        res = te.lang.cce.vcmpsel(abs_data,
                                  broad_const_limit,
                                  'lt',
                                  before_res,
                                  after_res)
    else:
        select_index = te.lang.cce.vcmp(abs_data, broad_const_limit, 'lt')
        res = te.lang.cce.vsel(select_index, before_res, after_res)

    data_sign = util_compute.sign(x)
    res = te.lang.cce.vmul(res, data_sign)

    if dtype_input == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
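
Where SciPy is available, the kernel can be cross-checked against
scipy.special.i1e, the exponentially scaled Bessel function it implements;
this check and the sample points are assumptions for illustration:

import numpy as np
from scipy.special import i1e  # exponentially scaled modified Bessel I1

x = np.linspace(-5.0, 5.0, 11)
# the kernel evaluates the series on |x| and multiplies by sign(x);
# i1e is odd, so this reproduces i1e(x) directly
assert np.allclose(np.sign(x) * i1e(np.abs(x)), i1e(x))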
Example No. 4
def _accumulate_nv2_compute(x, y, num, kernel_name='accumulate_nv2'):
    """
    Process accumulate_nv2 operator.

    Parameters:
    ----------
    x : the list of input tensor.

    y : the dict of output.

    num : the size of input.

    kernel_name : cce kernel name, default value is "accumulate_nv2".

    Returns:
    -------
    result : result of accumulate.
    """

    dtype = x[0].dtype
    shape = x[0].shape
    length = len(x)

    result = x[0]
    # in order to improve the accuracy, convert float16 to float32
    if dtype == 'float16' and length > 1 and \
       api_check_support("te.lang.cce.vadd", "float32"):
        result = te.lang.cce.cast_to(result, 'float32')

    for i in range(1, length):
        rhs = x[i]
        if dtype == 'float16' and \
           api_check_support("te.lang.cce.vadd", "float32"):
            rhs = te.lang.cce.cast_to(x[i], 'float32')
        result = te.lang.cce.vadd(result, rhs)

    if length == 1:
        # te.lang.cce.vmuls supports float16 and float32; int8, uint8 and
        # int32 would be converted to float16, which truncates the data,
        # so use te.lang.cce.vmul instead.
        if dtype == "int32":
            value_one = tvm.const(NUM_ONE, dtype=dtype)
            value_one_tensor = te.lang.cce.broadcast(value_one, shape)
            result = te.lang.cce.vmul(result, value_one_tensor)
        else:
            result = te.lang.cce.vmuls(result, NUM_ONE)

    # in order to improve the accuracy, convert float32 back to float16
    if dtype == 'float16' and length > 1:
        result = te.lang.cce.cast_to(result, 'float16')

    return result
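
A minimal NumPy sketch of why the float16 inputs are accumulated in float32;
the sample data is an assumption for illustration:

import numpy as np

xs = [np.full(4, 0.1, dtype=np.float16) for _ in range(100)]

naive = xs[0]
for rhs in xs[1:]:
    naive = naive + rhs                    # every add rounds to float16

precise = xs[0].astype(np.float32)
for rhs in xs[1:]:
    precise = precise + rhs.astype(np.float32)
precise = precise.astype(np.float16)       # a single rounding at the end

# the float32 path avoids the per-step float16 rounding error
print(naive[0], precise[0])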
Example No. 5
def asinh_grad_compute(y, dy, output_res, kernel_name="cce_asinh_grad"):
    """
    do element-wise asinh_grad compute

    Parameters:
    ----------
    y : the placeholders of input y

    dy : the placeholders of input dy

    output_res : output dict

    kernel_name : cce kernel name, default value is "cce_asinh_grad"

    Return :
    -------
    dy * (1/cosh(y))
    """

    dtype = y.dtype
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")

    if api_check_support('te.lang.cce.vexp', 'float32'):
        # use the vexp/vdiv api for efficient computation
        # cosh(y) = (e^y + e^-y) / 2 = (e^(2y) + 1) / (2e^y)
        exp_pos = te.lang.cce.vexp(y)
        res = te.lang.cce.vmul(exp_pos, exp_pos)
        res = te.lang.cce.vadds(res, tvm.const(NUM_ONE, y.dtype))
        data_dy1 = te.lang.cce.vmuls(dy, tvm.const(NUM_TWO, y.dtype))
        data_dy1 = te.lang.cce.vmul(data_dy1, exp_pos)
        res = te.lang.cce.vdiv(data_dy1, res)
    else:
        # use Taylor's method for a high-accuracy result
        y = te.lang.cce.vmuls(y, tvm.const(NUM_REPEAT, y.dtype))
        cosh_value_0 = _cosh_taylor_compute(y)
        # repeat 3 times
        cosh_value_1 = _cosh_repeat(cosh_value_0)
        cosh_value_2 = _cosh_repeat(cosh_value_1)
        cosh_value = _cosh_repeat(cosh_value_2)
        res = te.lang.cce.vrec(cosh_value)
        res = te.lang.cce.vmul(res, dy)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
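
A quick NumPy check of the identity used in the vexp branch,
dy / cosh(y) == 2 * dy * e^y / (e^(2y) + 1); the sample values are
assumptions for illustration:

import numpy as np

y = np.array([-1.0, 0.0, 0.5, 2.0])
dy = np.array([1.0, 2.0, -1.0, 0.5])

exp_pos = np.exp(y)
res = (2.0 * dy * exp_pos) / (exp_pos * exp_pos + 1.0)

assert np.allclose(res, dy / np.cosh(y))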
Example No. 6
def acosh_grad_compute(y, dy, z, kernel_name="acosh_grad"):
    """
    do acosh_grad compute
    Parameters:
    ----------------
    y: input tensor y
    dy: input tensor dy
    z: output dict
    kernel_name: cce kernel name, default value is "acosh_grad"
    return: dy * (1 / sinh(y))
    ----------------
    """

    dtype = y.dtype
    dtype_1 = dtype
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")
        dtype = "float32"

    data_y = te.lang.cce.vmuls(y, tvm.const(NUM_REPEAT, dtype))
    sinh_value_0 = _taylor_sinh_compute(data_y)
    sinh_value_1 = _sinh_repeat_with_sqrt(sinh_value_0)
    sinh_value_2 = _sinh_repeat_with_sqrt(sinh_value_1)
    data_sinh = _sinh_repeat_with_sqrt(sinh_value_2)

    res = te.lang.cce.vdiv(dy, data_sinh)

    if dtype_1 == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
Example No. 7
def acosh_compute(input_data, output_res, kernel_name="acosh"):
    """
    do element-wise acosh compute
    f(x) = log(x + sqrt(x^2 - 1)), defined for x >= 1

    Parameters:
    ----------
    input_data: the placeholder of data input

    output_res : the dict of output

    kernel_name : cce kernel name, default value is "acosh"

    Returns
    -------
    A Tensor. Has the same type as input_data.
    """
    data = input_data

    input_dtype = data.dtype.lower()
    if input_dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        data = te.lang.cce.cast_to(data, "float32")

    res = te.lang.cce.vmul(data, data)
    res = te.lang.cce.vadds(res, tvm.const(CONST_NEG_ONE, data.dtype))
    res = te.lang.cce.vsqrt(res, 1)
    res = te.lang.cce.vadd(res, data)
    res = te.lang.cce.vlog(res, 1)

    if input_dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
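
A minimal NumPy check of the formula, with hypothetical inputs in the valid
domain x >= 1:

import numpy as np

x = np.array([1.0, 1.5, 2.0, 10.0])
res = np.log(x + np.sqrt(x * x - 1.0))

assert np.allclose(res, np.arccosh(x))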
Example No. 8
def _less_compare_float32(data_x, data_y):
    """
    Compare data_x and data_y to determine whether data_x is less than data_y.
    If an element in data_x is less than the corresponding element in
    data_y, return 1, else return 0.

    cce constants cannot reach 2**126 directly, so 126 is split as
    62 + 62 + 2:
    (2**(-126)) * (2**(62)) * (2**(62)) * (2**(2)) = 1
    so min_value * max_value * max_value * factor_value = 1
    """
    shape_inputs = te.lang.cce.util.shape_to_list(data_x.shape)
    min_value = tvm.const(2 ** (-126), dtype=D_TYPE)
    max_value = tvm.const(2 ** 62, dtype=D_TYPE)
    factor_value = tvm.const(2 ** 2, dtype=D_TYPE)

    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmins(res_sub, min_value)
        res_max = te.lang.cce.vmaxs(res_min, tvm.const(0, dtype=D_TYPE))
    else:
        data_zero = te.lang.cce.vmuls(data_x, 0)
        min_value_tensor = te.lang.cce.vadds(data_zero, min_value)

        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmin(res_sub, min_value_tensor)
        res_max = te.lang.cce.vmax(res_min, data_zero)

    res_max_mul = te.lang.cce.vmuls(res_max, max_value)
    res_max_mul_max = te.lang.cce.vmuls(res_max_mul, max_value)
    res = te.lang.cce.vmuls(res_max_mul_max, factor_value)

    return res
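
The same scaling trick can be reproduced in plain float32 NumPy; the sample
values are assumptions for illustration:

import numpy as np

x = np.array([1.0, 3.0, 5.0], dtype=np.float32)
y = np.array([2.0, 3.0, 4.0], dtype=np.float32)

min_value = np.float32(2.0 ** -126)
res = np.minimum(y - x, min_value)      # clamp positive diffs to 2**-126
res = np.maximum(res, np.float32(0.0))  # zero out non-positive diffs
res = res * np.float32(2.0 ** 62)
res = res * np.float32(2.0 ** 62)
res = res * np.float32(2.0 ** 2)        # scale 2**-126 back up to exactly 1

print(res)  # [1. 0. 0.] -> 1.0 exactly where x < y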
Example No. 9
def acos_grad_compute(y, dy, z, kernel_name="acos_grad"):
    """
    do acos_grad compute with sqrt and div
    Parameters:
    ----------------
    y: input tensor y
    dy: input tensor dy
    z: output dict
    kernel_name: cce kernel name, default value is "acos_grad"
    return: dy * (-1 / (1 - y^2)^(1/2))
    ----------------
    """

    dtype = y.dtype
    dtype_1 = dtype
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")
        dtype = "float32"

    data1_square = te.lang.cce.vmul(y, y)
    data1_square = te.lang.cce.vmuls(data1_square,
                                     tvm.const(NUM_MINUS_ONE, dtype=dtype))
    data1_square = te.lang.cce.vadds(data1_square,
                                     tvm.const(NUM_ONE, dtype=dtype))

    data1_reciprocal = te.lang.cce.vsqrt(data1_square, 1)
    data1_reciprocal = te.lang.cce.vdiv(dy, data1_reciprocal)
    res = te.lang.cce.vmuls(data1_reciprocal,
                            tvm.const(NUM_MINUS_ONE, dtype=dtype))

    if dtype_1 == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
Example No. 10
def atanh_compute(x, y, kernel_name="atanh"):
    """
    Algorithm : atanh(x) = 0.5 * log((1 + x) / (1 - x)) if abs(x) < 1

    Parameters
    ----------
    x: the placeholder of data input

    y : the dict of output

    kernel_name : cce kernel name

    Returns
    -------
    res : result of atanh
    """

    inp_dtype = x.dtype
    shape = x.shape

    if inp_dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    data_res = _compute(x, shape)

    if inp_dtype == "float16":
        data_res = te.lang.cce.cast_to(data_res, "float16")
    else:
        data_res = te.lang.cce.cast_to(data_res, "float32")

    return data_res
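
A minimal NumPy check of the formula, with hypothetical inputs satisfying
abs(x) < 1:

import numpy as np

x = np.array([-0.9, -0.5, 0.0, 0.5, 0.9])
res = 0.5 * np.log((1.0 + x) / (1.0 - x))

assert np.allclose(res, np.arctanh(x))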
Example No. 11
def rsqrt_compute(x, y, kernel_name="rsqrt_cce"):
    """
    Algorithm : rsqrt(x) = 1 / sqrt(x)  where x > 0

    Parameters
    ----------
    x: the placeholder of data input

    y : the dict of output

    kernel_name : cce kernel name

    Returns
    -------
    res : result of rsqrt
    """

    inp_dtype = x.dtype

    if inp_dtype == "float16" and api_check_support("te.lang.cce.vadd",
                                                    "float32"):
        x = te.lang.cce.cast_to(x, "float32")

    data_res = _compute(x)

    if inp_dtype == "float16":
        data_res = te.lang.cce.cast_to(data_res, "float16")

    return data_res
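
A minimal NumPy sketch of the formula, with hypothetical inputs x > 0:

import numpy as np

x = np.array([0.25, 1.0, 4.0, 9.0])
res = 1.0 / np.sqrt(x)
print(res)  # 2.0, 1.0, 0.5, 0.333...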
Example No. 12
def elu_grad_compute(grads, activations, y, kernel_name="elu_grad"):
    """
    elu_grad_compute
    f(x) = vmul(add(min(activation, 0), 1), gradient)

    Parameters:
    ----------
    grads : the placeholder of gradient data

    activations : the placeholder of activation data

    y : the dict of output

    kernel_name : cce kernel name, default value is "elu_grad"

    Returns : A Tensor. Has the same type as data_gradient.
    -------
    """

    dtype = grads.dtype
    shape = grads.shape

    if dtype.lower() == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        grads = te.lang.cce.cast_to(grads, "float32")
        activations = te.lang.cce.cast_to(activations, "float32")

    if api_check_support("te.lang.cce.vmins", "float32"):
        min_res = te.lang.cce.vmins(activations, NUM_ZERO)
        add_res = te.lang.cce.vadds(min_res, NUM_ONE)
        res = te.lang.cce.vmul(add_res, grads)
    else:
        input_border = tvm.const(NUM_ZERO, grads.dtype)
        scalar_param_one = tvm.const(NUM_ONE, grads.dtype)
        tensor_input_border = te.lang.cce.broadcast(input_border, shape)
        tensor_scalar_param_one = te.lang.cce.broadcast(
            scalar_param_one, shape)

        min_res = te.lang.cce.vmin(activations, tensor_input_border)
        add_res = te.lang.cce.vadd(min_res, tensor_scalar_param_one)
        res = te.lang.cce.vmul(add_res, grads)

    if dtype.lower() == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
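
A minimal NumPy sketch of f(x) = (min(activations, 0) + 1) * grads; the
sample values are assumptions for illustration:

import numpy as np

grads = np.array([1.0, 2.0, 3.0], dtype=np.float32)
activations = np.array([-0.5, 0.0, 2.0], dtype=np.float32)

res = (np.minimum(activations, 0.0) + 1.0) * grads
print(res)  # [0.5 2.  3. ]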
Example No. 13
def asinh_compute_mini(input_x, output_y, kernel_name="asinh"):
    """
    algorithm: asinh(x) = log(x + sqrt(x^2 + 1))

    Parameters
    ----------
    input_x: the placeholder of data input

    output_y : the dict of output

    kernel_name : cce kernel name, default value is "asinh"

    Returns
    -------
    res : result of asinh

    """

    inp_dtype = input_x.dtype.lower()
    shape = input_x.shape
    has_improve_precision = False
    if inp_dtype == "float16" and \
            tbe_platform.cce_conf.api_check_support("te.lang.cce.vrec",
                                                    "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        has_improve_precision = True

    input_x1 = te.lang.cce.vabs(input_x)
    # add MIN_FP16 so an input of 0.0 does not produce an infinite reciprocal
    input_x1 = te.lang.cce.vadds(input_x1, MIN_FP16)
    data_1_x = te.lang.cce.vrec(input_x1)
    data_1_x_square = te.lang.cce.vmul(data_1_x, data_1_x)
    data_1_x_square = te.lang.cce.vadds(data_1_x_square,
                                        tvm.const(CONST_ONE, "float32"))
    data_s_1_sqrt = _newton_sqrt(data_1_x_square, inp_dtype)
    data_res = te.lang.cce.vmul(data_s_1_sqrt, input_x1)
    data_res = te.lang.cce.vadd(input_x1, data_res)
    result = _log_taylor(data_res, shape)
    res_neg = te.lang.cce.vmuls(result, tvm.const(CONST_NEG_ONE, inp_dtype))

    if input_x.dtype == result.dtype and api_check_support(
            "te.lang.cce.vcmpsel", input_x.dtype):
        res = te.lang.cce.vcmpsel(input_x, tvm.const(CONST_ZERO,
                                                     input_x.dtype), 'le',
                                  res_neg, result)
    else:
        const_zero_tensor = te.lang.cce.broadcast(
            tvm.const(CONST_ZERO, input_x.dtype), shape)
        compare_one = te.lang.cce.vcmp(input_x, const_zero_tensor, "le")
        res = te.lang.cce.vsel(compare_one, res_neg, result)

    if has_improve_precision:
        res = te.lang.cce.cast_to(res, "float16")
    else:
        res = te.lang.cce.cast_to(res, "float32")

    return res
Example No. 14
def _log_compute(data_x, res, shape):
    """
    when data > 2, use vlog directly
    when data > 32768, float16 would overflow, so use log(x/2.5) + log(2.5)

    Parameters
    ----------
    data_x: input tensor whose log we want to calculate

    res: partial result computed by the caller

    shape: shape of the input tensor

    Returns
    -------
    res : return of log

    """
    # if data > 2, use vlog
    if data_x.dtype == res.dtype and api_check_support("te.lang.cce.vcmpsel",
                                                       data_x.dtype):
        res = te.lang.cce.vcmpsel(data_x, tvm.const(CONST_TWO, data_x.dtype),
                                  'ge', te.lang.cce.vlog(data_x), res)
    else:
        threshold_3 = te.lang.cce.broadcast(tvm.const(CONST_TWO, "float32"),
                                            shape)
        index_3 = te.lang.cce.vcmp(data_x, threshold_3, 'ge')
        res = te.lang.cce.vsel(index_3, te.lang.cce.vlog(data_x), res)

    # if data > 32768, use log(x/2.5)+log(2.5)
    overflow_value = te.lang.cce.vmuls(data_x, CONST_FIVE_TWO)
    res_overflow = te.lang.cce.vadds(te.lang.cce.vlog(overflow_value),
                                     LOG_FIVE_TWO)
    if data_x.dtype == res.dtype and api_check_support("te.lang.cce.vcmpsel",
                                                       data_x.dtype):
        res = te.lang.cce.vcmpsel(data_x, tvm.const(FLOAT_16_MAX,
                                                    data_x.dtype), 'ge',
                                  res_overflow, res)
    else:
        float_16_max_tensor = te.lang.cce.broadcast(
            tvm.const(FLOAT_16_MAX, "float32"), shape)
        index_4 = te.lang.cce.vcmp(data_x, float_16_max_tensor, 'ge')
        res = te.lang.cce.vsel(index_4, res_overflow, res)
    res = te.lang.cce.cast_to(res, "float32")

    return res
Example No. 15
def relu_v2_compute(x, y, mask, kernel_name="relu_v2_cce"):
    """
    Algorithm : relu_v2(x) = (x, 1) when x > 0, else (0, 0)

    Parameters
    ----------
    x: the placeholder of data input

    y : the dict of output

    mask : the dict of output

    kernel_name : cce kernel name

    Returns
    -------
    res : result of relu_v2_res

    mask: result of relu_v2_mask
    """

    inp_dtype = x.dtype
    shape = x.shape
    compatible_dtype = x.dtype

    if inp_dtype == 'int8' and api_check_support('te.lang.cce.cast_to',
                                                 's82f16'):
        x = te.lang.cce.cast_to(x, 'float16')
        compatible_dtype = 'float16'
    if api_check_support('te.lang.cce.vrelu', compatible_dtype):
        data_res = te.lang.cce.vrelu(x)
    else:
        tensor_zero = te.lang.cce.broadcast(
            tvm.const(CONST_ZERO, compatible_dtype), shape)
        data_res = te.lang.cce.vmax(x, tensor_zero)

    data_res = te.lang.cce.cast_to(data_res, inp_dtype)
    mask = te.lang.cce.vcmp(x, CONST_ZERO, "gt", "bit")

    return data_res, mask
Example No. 16
def _less_equal_compare_float32(data_x, data_y):
    """
    if x is less than y or equal y, then return 1, else return 0.

    Parameters:
    ----------
    data_x : TVM tensor
        tensor x
    data_y : TVM tensor
        tensor y

    Returns
    -------
    the compare result
    """
    scalar_min_fp32 = tvm.const(2 ** (-126), dtype="float32")
    scalar_mul_fp32_first = tvm.const(2 ** (50), dtype="float32")
    scalar_mul_fp32_second = tvm.const(2 ** (26), dtype="float32")
    scalar_one_fp32 = tvm.const(1.0, dtype="float32")
    scalar_one_fp32_neg = scalar_one_fp32 * tvm.const(-1.0, dtype="float32")

    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        data_max = te.lang.cce.vmax(data_x, data_y)
        data_sub = te.lang.cce.vsub(data_y, data_max)
        data_abs = te.lang.cce.vabs(data_sub)
        data_min = te.lang.cce.vmins(data_abs, scalar_min_fp32)

        data_mul = te.lang.cce.vmuls(data_min, scalar_mul_fp32_first)
        data_mul_first = te.lang.cce.vmuls(data_mul, scalar_mul_fp32_first)
        data_mul_second = te.lang.cce.vmuls(data_mul_first, scalar_mul_fp32_second)

        data_sub_first = te.lang.cce.vadds(data_mul_second, scalar_one_fp32_neg)
        data_out = te.lang.cce.vabs(data_sub_first)
    else:
        tensor_zero = te.lang.cce.vmuls(data_x, 0)
        tensor_min_fp32 = te.lang.cce.vadds(tensor_zero, scalar_min_fp32)

        data_max = te.lang.cce.vmax(data_x, data_y)
        data_sub = te.lang.cce.vsub(data_y, data_max)
        data_abs = te.lang.cce.vabs(data_sub)
        data_min = te.lang.cce.vmin(data_abs, tensor_min_fp32)

        data_mul = te.lang.cce.vmuls(data_min, scalar_mul_fp32_first)
        data_mul_first = te.lang.cce.vmuls(data_mul, scalar_mul_fp32_first)
        data_mul_second = te.lang.cce.vmuls(data_mul_first, scalar_mul_fp32_second)

        data_sub_first = te.lang.cce.vadds(data_mul_second, scalar_one_fp32_neg)
        data_out = te.lang.cce.vabs(data_sub_first)

    return data_out
Example No. 17
def atan_compute(x, y, kernel_name="atan"):
    """
    Algorithm: atan

    ----------------------------------
    Parameters:

    x: Input data

    y : the dict of output

    kernel_name: cce kernel name, default value is "atan"

    ----------------------------------
    Returns:

        A Tensor of atan(x).

    """

    dtype = x.dtype
    shape = x.shape

    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")
    abs_data = te.lang.cce.vabs(x)

    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, x.dtype),
                                       shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # calculate the result for inputs less than one
    res = _do_taylor(abs_data)
    # calculate the result for inputs greater than one
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
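
The branch for inputs greater than one relies on the identity
atan(t) = pi/4 + atan((t - 1) / (t + 1)), valid for t > -1; a quick NumPy
check with assumed sample values:

import numpy as np

t = np.array([1.5, 3.0, 10.0])
lhs = np.arctan(t)
rhs = np.pi / 4 + np.arctan((t - 1.0) / (t + 1.0))

assert np.allclose(lhs, rhs)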
Example No. 18
def _between_nudged_min_max_compute(x, nudged_min, nudged_max):
    """
    Compare x with nudged_min and nudged_max.
    If the element in x is greater than nudged_min and less than nudged_max,
    then return 1, else return 0.

    cce constants cannot reach 2**126 directly, so 126 is split as
    62 + 62 + 2:
    (2**(-126)) * (2**(62)) * (2**(62)) * (2**(2)) = 1
    """
    shape_inputs = te.lang.cce.util.shape_to_list(x.shape)
    min_value = tvm.const(2 ** (-126), dtype=D_TYPE)
    max_value = tvm.const(2 ** 62, dtype=D_TYPE)
    factor_value = tvm.const(2 ** 2, dtype=D_TYPE)

    if api_check_support("te.lang.cce.vmaxs", x.dtype):
        sub_tensor_min = te.lang.cce.vsub(x, nudged_min)
        sub_min = te.lang.cce.vadds(sub_tensor_min, min_value)
        more_nudged_min_tensor = te.lang.cce.vmaxs(sub_min, tvm.const(0, dtype=D_TYPE))

        sub_tensor_max = te.lang.cce.vsub(nudged_max, x)
        sub_max = te.lang.cce.vadds(sub_tensor_max, min_value)
        less_nudged_max_tensor = te.lang.cce.vmaxs(sub_max, tvm.const(0, dtype=D_TYPE))

        between_nudged_tensor = te.lang.cce.vmul(more_nudged_min_tensor,
                                                 less_nudged_max_tensor)
        between_nudged_element = te.lang.cce.vmins(between_nudged_tensor, min_value)
    else:
        data_zero = te.lang.cce.vmuls(x, 0)
        min_value_tensor = te.lang.cce.vadds(data_zero, min_value)

        sub_tensor_min = te.lang.cce.vsub(x, nudged_min)
        sub_min = te.lang.cce.vadds(sub_tensor_min, min_value)
        more_nudged_min_tensor = te.lang.cce.vmax(sub_min, data_zero)

        sub_tensor_max = te.lang.cce.vsub(nudged_max, x)
        sub_max = te.lang.cce.vadds(sub_tensor_max, min_value)
        less_nudged_max_tensor = te.lang.cce.vmax(sub_max, data_zero)

        between_nudged_tensor = te.lang.cce.vmul(more_nudged_min_tensor,
                                                 less_nudged_max_tensor)
        between_nudged_element = te.lang.cce.vmin(between_nudged_tensor,
                                                  min_value_tensor)

    vmul_max_value = te.lang.cce.vmuls(between_nudged_element, max_value)
    vmul_factor_value = te.lang.cce.vmuls(vmul_max_value, max_value)
    between_nudged = te.lang.cce.vmuls(vmul_factor_value, factor_value)

    return between_nudged
Example No. 19
def approximate_equal_compute(input_x,
                              input_y,
                              output_z,
                              tolerance,
                              kernel_name="approximate_equal"):
    """
    algorithm: approximate_equal

    calculating abs(x-y) <= tolerance

    Parameters
    ----------
    input_x : the placeholders of input data
    input_y : the placeholders of input data
    tolerance: default 1e-5
    output_z: shape and dtype of output
    kernel_name: cce kernel name, default value is "approximate_equal"
    Returns
    -------
    res: 1 where abs(x - y) <= tolerance, else 0, as an int8 tensor
    """

    input_dtype = input_x.dtype
    if input_dtype == "float16" and api_check_support("te.lang.cce.vadd",
                                                      "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
        input_y = te.lang.cce.cast_to(input_y, "float32")

    res_vsub = te.lang.cce.vsub(input_x, input_y)
    res_vabs = te.lang.cce.vabs(res_vsub)

    res_vabs = te.lang.cce.cast_to(res_vabs, input_x.dtype)
    tol_tensor = te.lang.cce.broadcast(tvm.const(tolerance, input_x.dtype),
                                       input_x.shape)

    res_cmp = te.lang.cce.vcmp(res_vabs, tol_tensor, 'le')
    zero_rb_tensor = te.lang.cce.broadcast(tvm.const(NUM_ZERO, "float16"),
                                           input_x.shape)
    one_rb_tensor = te.lang.cce.broadcast(tvm.const(NUM_ONE, "float16"),
                                          input_x.shape)
    res = te.lang.cce.vsel(res_cmp, one_rb_tensor, zero_rb_tensor)

    res = te.lang.cce.cast_to(res, "int8")

    return res
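
A minimal NumPy sketch of the comparison, returning 0/1 as int8 like the
kernel does; the sample values and tolerance are assumptions:

import numpy as np

input_x = np.array([1.0, 2.0, 3.0], dtype=np.float32)
input_y = np.array([1.000001, 2.5, 3.0], dtype=np.float32)

res = (np.abs(input_x - input_y) <= 1e-5).astype(np.int8)
print(res)  # [1 0 1]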
Example No. 20
def _atan_compute(input_x):
    """
    Algorithm: atan

    ----------------------------------
    Parameters:

        input_x: Input data.

    ----------------------------------
    Returns:

        A Tensor of atan(x).

    """

    shape = input_x.shape
    dtype = input_x.dtype
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        input_x = te.lang.cce.cast_to(input_x, "float32")
    abs_data = te.lang.cce.vabs(input_x)

    tensor_one = te.lang.cce.broadcast(tvm.const(CONST_POS_ONE, input_x.dtype),
                                       shape)

    abs_data_sub_one = te.lang.cce.vsub(abs_data, tensor_one)
    abs_data_add_one = te.lang.cce.vadd(abs_data, tensor_one)
    abs_data2 = te.lang.cce.vdiv(abs_data_sub_one, abs_data_add_one)
    abs_data2 = te.lang.cce.vabs(abs_data2)

    # calculate the result for inputs less than one
    res = _do_taylor(abs_data)
    # calculate the result for inputs greater than one
    res_mt_one = _do_taylor(abs_data2)
    res_mt_one = te.lang.cce.vadds(res_mt_one, CONST_PI_BY_FOUR)

    res = te.lang.cce.vmin(res, res_mt_one)

    sign_mask = util_compute.sign(input_x)
    res = te.lang.cce.vmul(res, sign_mask)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")
    return res
Example No. 21
def _less_compare_float32(data_x, data_y):
    """
    if x is less than y, then return 1, else return 0.

    Parameters:
    ----------
    data_x : TVM tensor
        tensor x
    data_y : TVM tensor
        tensor y

    Returns
    -------
    the compare result
    """
    shape_inputs = te.lang.cce.util.shape_to_list(data_x.shape)
    # minimum normal float32 value is 2**(-126)
    min_value = tvm.const(2 ** (-126), dtype="float32")

    if api_check_support("te.lang.cce.vmaxs", data_x.dtype):
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmins(res_sub, min_value)
        res_max = te.lang.cce.vmaxs(res_min, tvm.const(0, dtype="float32"))
    else:
        data_zero = te.lang.cce.vmuls(data_x, 0)
        data_min = te.lang.cce.vadds(data_zero, min_value)
        res_sub = te.lang.cce.vsub(data_y, data_x)
        res_min = te.lang.cce.vmin(res_sub, data_min)
        res_max = te.lang.cce.vmax(res_min, data_zero)

    # cce constants cannot reach 2**126 directly,
    # so split 126 as 50 + 50 + 26 and scale up in three steps
    res_mul_first = te.lang.cce.vmuls(res_max,
                                      tvm.const(2 ** 50, dtype="float32"))
    res_mul_second = te.lang.cce.vmuls(res_mul_first,
                                       tvm.const(2 ** 50, dtype="float32"))
    res = te.lang.cce.vmuls(res_mul_second, tvm.const(2 ** 26, dtype="float32"))

    return res
Example No. 22
def add_n_compute(datas, output, tensor_num, kernel_name="add_n"):
    """
    calculating the element-wise sum of the inputs: z = a + b + c ...

    Parameters
    ----------
    datas : list of placeholders
        all input data
    output : dict
        dict of output
    tensor_num:
        nums of input
    kernel_name : string
        cce kernel name, default value is add_n

    Returns
    -------
    res : output of the data's add_n
    """

    data_type = datas[0].dtype
    has_convert_float32 = (data_type == "float16"
                           and api_check_support("te.lang.cce.vadd",
                                                 "float32"))

    first_data = datas[0] if not has_convert_float32 else \
        te.lang.cce.cast_to(datas[0], "float32")

    res = first_data
    for data_n in datas[1:]:
        temp_data = data_n if not has_convert_float32 else \
            te.lang.cce.cast_to(data_n, "float32")
        res = te.lang.cce.vadd(res, temp_data)

    if has_convert_float32:
        res = te.lang.cce.cast_to(res, "float16")
    return res
Example No. 23
def asin_grad_compute(y, dy, z, kernel_name="asin_grad"):
    """
    do element-wise asin_grad compute

    Parameters:
    ----------
    y : the placeholders of input y

    dy : the placeholders of input dy

    z : output dict

    kernel_name : cce kernel name, default value is "cce_asin_grad"

    return : dy * (1 / sqrt(1 - y^2))
    -------
    """

    dtype = y.dtype
    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")

    # step 1: calculate num_to_vrsqrt = 1 - y^2
    data = te.lang.cce.vmul(y, y)
    data = te.lang.cce.vmuls(data, tvm.const(NUM_MINUS_ONE, y.dtype))
    num_to_vrsqrt = te.lang.cce.vadds(data, tvm.const(NUM_ONE, y.dtype))

    # step 2: calculate dy * (1 / sqrt(1 - y^2))
    vsqrt_res = te.lang.cce.vsqrt(num_to_vrsqrt, 1)
    res = te.lang.cce.vdiv(dy, vsqrt_res)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
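
A minimal NumPy sketch of dy * (1 / sqrt(1 - y^2)), with hypothetical inputs
satisfying abs(y) < 1:

import numpy as np

y = np.array([-0.5, 0.0, 0.5])
dy = np.array([1.0, 2.0, 4.0])

res = dy / np.sqrt(1.0 - y * y)
print(res)  # [1.15470054 2.         4.61880215]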
Example No. 24
def atan_grad_compute(y, dy, z, kernel_name="atan_grad"):
    """
    Calculation for backward gradient

    Parameters:
    ----------
    y: the placeholder of input data
    dy: the placeholder of input dy
    z : dict of output
    kernel_name : cce kernel name, default value is atan_grad

    Algorithm :
    ----------
        res = 1/(1+y^2)*dy

    Returns
    ----------
    result res
    """

    scalar_one = tvm.const(CONST_ONE, "float32")
    dtype = y.dtype

    if dtype == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        y = te.lang.cce.cast_to(y, "float32")
        dy = te.lang.cce.cast_to(dy, "float32")

    data_square = te.lang.cce.vmul(y, y)
    sum_tmp = te.lang.cce.vadds(data_square, scalar_one)
    res = te.lang.cce.vdiv(dy, sum_tmp)

    if dtype == "float16":
        res = te.lang.cce.cast_to(res, "float16")

    return res
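
A minimal NumPy sketch of res = dy / (1 + y^2); the sample values are
assumptions for illustration:

import numpy as np

y = np.array([0.0, 1.0, 2.0])
dy = np.array([1.0, 4.0, 10.0])

res = dy / (1.0 + y * y)
print(res)  # [1. 2. 2.]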
Example No. 25
def _log_taylor(data_x, shape):
    """
    use taylor expansion to calculate log

    Parameters
    ----------
    data_x: input tensor whose log we want to calculate

    shape: shape of the input tensor

    Returns
    -------
    res :  return of log

    """
    data = te.lang.cce.vadds(data_x, tvm.const(CONST_NEG_ONE, "float32"))
    data_1 = te.lang.cce.vadds(
        data, tvm.const(CONST_NEG_ONE * CONST_LOG_THRESHOLD_1, "float32"))
    if api_check_support("te.lang.cce.vcmpsel", "float32"):
        data_sel = te.lang.cce.vcmpsel(
            data, tvm.const(CONST_LOG_THRESHOLD_1, data.dtype), 'ge',
            te.lang.cce.vmuls(data_1, tvm.const(CONST_DOT_SIX, "float32")),
            data)
        data_sel = te.lang.cce.cast_to(data_sel, "float32")
        data_2 = te.lang.cce.vadds(
            data_sel,
            tvm.const(CONST_NEG_ONE * CONST_LOG_THRESHOLD_2, "float32"))
        data_vmuls = te.lang.cce.vmuls(data_2,
                                       tvm.const(CONST_THREE_FOUR, "float32"))
        data_sel_1 = te.lang.cce.vcmpsel(
            data_sel, tvm.const(CONST_LOG_THRESHOLD_2, data_sel.dtype), 'ge',
            data_vmuls, data_sel)
        data_sel_1 = te.lang.cce.cast_to(data_sel_1, "float32")
        taylor = _taylor_compute(data_sel_1)
        # add log(4/3)
        res = te.lang.cce.vcmpsel(
            data_sel, tvm.const(CONST_LOG_THRESHOLD_2, data_sel.dtype), 'ge',
            te.lang.cce.vadds(taylor, tvm.const(LOG_FOUR_THREE, "float32")),
            taylor)
        res = te.lang.cce.cast_to(res, "float32")
        # add log(5/3)
        data = te.lang.cce.cast_to(data, "float32")
        res = te.lang.cce.vcmpsel(
            data, tvm.const(CONST_LOG_THRESHOLD_1, data.dtype), 'ge',
            te.lang.cce.vadds(taylor, tvm.const(LOG_FIVE_THREE, "float32")),
            res)
    else:
        threshold_1 = te.lang.cce.broadcast(
            tvm.const(CONST_LOG_THRESHOLD_1, "float32"), shape)
        index_1 = te.lang.cce.vcmp(data, threshold_1, 'ge')
        data_sel = te.lang.cce.vsel(
            index_1,
            te.lang.cce.vmuls(data_1, tvm.const(CONST_DOT_SIX, "float32")),
            data)
        data_sel = te.lang.cce.cast_to(data_sel, "float32")

        threshold_2 = te.lang.cce.broadcast(
            tvm.const(CONST_LOG_THRESHOLD_2, "float32"), shape)
        index_2 = te.lang.cce.vcmp(data_sel, threshold_2, 'ge')
        data_2 = te.lang.cce.vadds(
            data_sel,
            tvm.const(CONST_NEG_ONE * CONST_LOG_THRESHOLD_2, "float32"))
        data_vmuls = te.lang.cce.vmuls(data_2,
                                       tvm.const(CONST_THREE_FOUR, "float32"))
        data_sel = te.lang.cce.vsel(index_2, data_vmuls, data_sel)
        data_sel = te.lang.cce.cast_to(data_sel, "float32")
        taylor = _taylor_compute(data_sel)
        # add log(4/3)
        res = te.lang.cce.vsel(
            index_2,
            te.lang.cce.vadds(taylor, tvm.const(LOG_FOUR_THREE, "float32")),
            taylor)
        res = te.lang.cce.cast_to(res, "float32")
        # add log(5/3)
        res = te.lang.cce.vsel(
            index_1,
            te.lang.cce.vadds(taylor, tvm.const(LOG_FIVE_THREE, "float32")),
            res)

    res = te.lang.cce.cast_to(res, "float32")
    # finally, patch in the direct vlog branches for large inputs
    res = _log_compute(data_x, res, shape)

    return res
Example No. 26
def apply_centered_rms_prop_d_compute(var,
                                      mg,
                                      ms,
                                      mom,
                                      lr,
                                      rho,
                                      momentum,
                                      epsilon,
                                      grad,
                                      var_out,
                                      mg_out,
                                      ms_out,
                                      mom_out,
                                      kernel_name="apply_centered_rms_prop_d"):
    """
    Update '*var' according to the centered RMSProp algorithm.

    mean_square = decay * mean_square + (1-decay) * gradient ** 2
    mean_grad = decay * mean_grad + (1-decay) * gradient
    Delta = learning_rate*gradient/sqrt(mean_square+epsilon-mean_grad**2)
    mg_{t} <- rho * mg_{t-1} + (1-rho) * grad
    ms_{t} <- rho * ms_{t-1} + (1-rho) * grad * grad
    mom_{t} <- momentum*mom_{t-1}+lr*grad/sqrt(ms_{t}-mg{t}*mg{t}+epsilon)
    var_{t} <- var_{t-1} - mom_{t}

    Parameters:
    ----------
    var: dict of tensor var, include shape and dtype,
        dtype support float16 and float32.

    mg: dict of tensor mg(mean_grad), include shape and dtype,
        dtype support float16 and float32.

    ms: dict of tensor ms(mean_square), include shape and dtype,
        dtype support float16 and float32.

    mom: dict of tensor mom, include shape and dtype,
        dtype support float16 and float32.

    lr: dict of scalar lr(learning rate). Must have the same dtype as var.

    rho: dict of scalar rho(decay rate). Must have the same dtype as var.

    momentum: dict of scalar momentum. Must have the same dtype as var.

    epsilon: dict of scalar epsilon. Must have the same dtype as var.

    grad: dict of tensor grad. Must have the same dtype as var.

    var_out, mg_out, ms_out, mom_out: dicts of the outputs.

    kernel_name : cce kernel name, default value is "apply_centered_rms_prop_d".

    Returns
    -------
    None
    """

    inp_dtype = var.dtype
    if inp_dtype == "float16" and api_check_support("te.lang.cce.vadd",
                                                    "float32"):
        var = te.lang.cce.cast_to(var, "float32")
        mg = te.lang.cce.cast_to(mg, "float32")
        ms = te.lang.cce.cast_to(ms, "float32")
        mom = te.lang.cce.cast_to(mom, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        rho = te.lang.cce.cast_to(rho, "float32")
        momentum = te.lang.cce.cast_to(momentum, "float32")
        epsilon = te.lang.cce.cast_to(epsilon, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")

    tensor_one_rho = tvm.compute(
        rho.shape,
        lambda *indices: rho(*indices) * tvm.const(NUM_ONE_NA, rho.dtype),
        tag='elewise_single_VS_mul')
    tensor_one_rho = tvm.compute(tensor_one_rho.shape,
                                 lambda *indices: tensor_one_rho(*indices) +
                                 tvm.const(NUM_ONE, tensor_one_rho.dtype),
                                 tag='elewise_single_VS_add')

    mg_rho = tvm.compute(mg.shape,
                         lambda *indices: mg(*indices) * rho[0],
                         tag='elewise_single_VS_mul')
    rhs = tvm.compute(grad.shape,
                      lambda *indices: grad(*indices) * tensor_one_rho[0],
                      tag='elewise_single_VS_mul')
    out_mg = te.lang.cce.vadd(mg_rho, rhs)

    ms_rho = tvm.compute(ms.shape,
                         lambda *indices: ms(*indices) * rho[0],
                         tag='elewise_single_VS_mul')
    rhs = te.lang.cce.vmul(grad, grad)
    rhs = tvm.compute(rhs.shape,
                      lambda *indices: rhs(*indices) * tensor_one_rho[0],
                      tag='elewise_single_VS_mul')
    out_ms = te.lang.cce.vadd(ms_rho, rhs)

    lhs_mom = tvm.compute(mom.shape,
                          lambda *indices: mom(*indices) * momentum[0],
                          tag='elewise_single_VS_mul')
    lr_grad = tvm.compute(grad.shape,
                          lambda *indices: grad(*indices) * lr[0],
                          tag='elewise_single_VS_mul')
    rhs = te.lang.cce.vmul(out_mg, out_mg)
    rhs = te.lang.cce.vsub(out_ms, rhs)
    rhs_eps = tvm.compute(rhs.shape,
                          lambda *indices: rhs(*indices) + epsilon[0],
                          tag='elewise_single_VS_add')
    rhs_eps = te.lang.cce.vsqrt(rhs_eps)
    rhs_eps = te.lang.cce.vdiv(lr_grad, rhs_eps)
    out_mom = te.lang.cce.vadd(lhs_mom, rhs_eps)

    out_var = te.lang.cce.vsub(var, out_mom)

    if inp_dtype == "float16":
        out_var = te.lang.cce.cast_to(out_var, "float16")
        out_mg = te.lang.cce.cast_to(out_mg, "float16")
        out_ms = te.lang.cce.cast_to(out_ms, "float16")
        out_mom = te.lang.cce.cast_to(out_mom, "float16")

    mg_output_data = te.lang.cce.vadds(out_mg, NUM_ZERO)
    ms_output_data = te.lang.cce.vadds(out_ms, NUM_ZERO)
    mom_output_data = te.lang.cce.vadds(out_mom, NUM_ZERO)

    # this compute is for multi output
    def _compute(*index):
        return out_mg(*index), out_ms(*index), out_mom(*index), out_var(
            *index), out_var(*index), mg_output_data(*index), ms_output_data(
                *index), mom_output_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
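
A minimal NumPy sketch of the update rules from the docstring; all sample
values are hypothetical:

import numpy as np

rho, lr, momentum, epsilon = 0.9, 0.01, 0.9, 1e-10
var = np.array([1.0, 2.0])
mg = np.zeros(2)
ms = np.zeros(2)
mom = np.zeros(2)
grad = np.array([0.1, -0.2])

mg = rho * mg + (1.0 - rho) * grad
ms = rho * ms + (1.0 - rho) * grad * grad
mom = momentum * mom + lr * grad / np.sqrt(ms - mg * mg + epsilon)
var = var - mom
print(var, mg, ms, mom)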
Example No. 27
def fake_quant_with_min_max_args_compute(x,
                                         y,
                                         min=-6,
                                         max=6,
                                         num_bits=8,
                                         narrow_range=False,
                                         kernel_name="fake_quant_with_min_"
                                         "max_args"):
    """
    Fake-quantize the 'x' tensor of type float32 into a 'y' tensor of the
    same type, calculating:
    y = floor(clamped_shifted * inv_nudged_scale + 0.5) * scale + nudged_min
    scale = (max - min) / (quant_max - quant_min)

    Parameters
    ----------
    x: TVM tensor
        the placeholder of input data,type is float32
    y: dict
        the dict of output data
    min: scalar float or int
        Defaults to -6
    max: scalar float or int
        Defaults to 6
        [min; max] define the clamping range for the x data
    num_bits: int
        Defaults to 8. num_bits is the bitwidth of the quantization,
        between 2 and 16
    narrow_range: bool
        True or False. If None, narrow_range=False.
        If True, x values are quantized into the quantization range
        [1; 2^num_bits - 1];
        if False, x values are quantized into the quantization range
        [0; 2^num_bits - 1]
    kernel_name: str
        cce kernel name, default value is "fake_quant_with_min_max_args"

    Returns
    -------
    res: TVM tensor
        the result of fake_quant_with_min_max_args_compute
    """
    shape_x = te.lang.cce.util.shape_to_list(x.shape)
    output_dtype = x.dtype

    nudged_min, nudged_max, scale = _nudge_min_max(min, max, num_bits,
                                                   narrow_range)

    if api_check_support("te.lang.cce.vmaxs", x.dtype):
        nudged_min_neg = nudged_min * (-1.0)
        inv_nudged_scale = 1.00 / scale

        # Transform the input between nudged_max and nudged_min
        clamped_vmin = te.lang.cce.vmins(x, nudged_max)
        clamped = te.lang.cce.vmaxs(clamped_vmin, nudged_min)

        # Calculate the quantized and dequantized results
        clamped_shifted = te.lang.cce.vadds(clamped, nudged_min_neg)
        vmul_shifted = te.lang.cce.vmuls(clamped_shifted, inv_nudged_scale)
        vadds_shifted = te.lang.cce.vadds(vmul_shifted,
                                          tvm.const(0.5, dtype="float32"))

        floor_vadds_shifted = te.lang.cce.floor(vadds_shifted)
        floor_cast = te.lang.cce.cast_to(floor_vadds_shifted, output_dtype)
        res_scale = te.lang.cce.vmuls(floor_cast, scale)
        res = te.lang.cce.vadds(res_scale, nudged_min)

    else:
        zero_tensor = te.lang.cce.vmuls(x, 0)
        nudged_max_tensor = te.lang.cce.vadds(zero_tensor, nudged_max)
        nudged_min_tensor = te.lang.cce.vadds(zero_tensor, nudged_min)
        inv_nudged_scale = 1.00 / scale
        inv_nudged_scale_const = tvm.const(inv_nudged_scale,
                                           dtype=output_dtype)

        # Transform the input between nudged_max and nudged_min
        clamped_vmin = te.lang.cce.vmin(x, nudged_max_tensor)
        clamped = te.lang.cce.vmax(clamped_vmin, nudged_min_tensor)

        # Calculate the quantized and dequantized results
        clamped_shifted = te.lang.cce.vsub(clamped, nudged_min_tensor)
        vmul_shifted = te.lang.cce.vmuls(clamped_shifted,
                                         inv_nudged_scale_const)
        vadds_shifted = te.lang.cce.vadds(vmul_shifted,
                                          tvm.const(0.5, dtype="float32"))
        floor_vadds_shifted = te.lang.cce.floor(vadds_shifted)
        floor_cast = te.lang.cce.cast_to(floor_vadds_shifted, output_dtype)
        res_scale = te.lang.cce.vmuls(floor_cast, scale)
        res = te.lang.cce.vadd(res_scale, nudged_min_tensor)

    return res
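
A simplified NumPy sketch of the quantize/dequantize rounding step;
nudged_min, nudged_max and scale normally come from _nudge_min_max, so the
values below are hypothetical stand-ins:

import numpy as np

nudged_min, nudged_max, scale = 0.0, 6.0, 6.0 / 255.0

x = np.array([-1.0, 0.1, 3.0, 7.0], dtype=np.float32)
clamped = np.clip(x, nudged_min, nudged_max)
clamped_shifted = clamped - nudged_min
res = np.floor(clamped_shifted / scale + 0.5) * scale + nudged_min
print(res)  # inputs snapped onto the 256-level quantization grid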
Example No. 28
def bessel_i0e_compute(x, y, kernel_name="bessel_i0e"):
    """
    Algorithm:
    I0 = 1 + ( (z/2) / (1!) )^2 + ((z/2)^2 / (2!))^2 + ... + ((z/2)^n / (n!)) ^2
    I0e = I0 / exp(x)

    t = x / 3.75
    I0e(x) = e^(-|x|) * (1 + 3.5156229t^2 + 3.0899424t^4 + 1.2067492t^6
             + 0.2659732t^8 + 0.0360768t^10 + 0.0045813t^12), |x| <= 3.75
    I0e(x) = (1 / sqrt(|x|)) * (0.39894228 + 0.01328592t^-1 + 0.00225319t^-2
             - 0.00157565t^-3 + 0.00916281t^-4 - 0.02057706t^-5
             + 0.02635537t^-6 - 0.01647633t^-7 + 0.00392377t^-8), |x| >= 3.75

    Parameters
    ----------
    x: the placeholder of data input

    y : the dict of output

    kernel_name : cce kernel name, default value is "bessel_i0e"

    Returns
    -------
    A tensor. Has the same type as x.

    """

    shape_input = x.shape
    dtype_input = x.dtype

    # choose the compute dtype at the beginning
    if dtype_input == "float16" and \
       api_check_support("te.lang.cce.vadd", "float32"):
        x = te.lang.cce.cast_to(x, "float32")
    abs_data = te.lang.cce.vabs(x)

    # compute bessel_i0e for data in (-3.75, 3.75)
    broad_const_limit = te.lang.cce.broadcast(tvm.const(CONST_LIMIT, x.dtype),
                                              shape_input)
    before_abs_data = te.lang.cce.vmin(abs_data, broad_const_limit)
    data = te.lang.cce.vdiv(before_abs_data, broad_const_limit)
    square_data = te.lang.cce.vmul(data, data)

    before_res = te.lang.cce.vmuls(square_data, tvm.const(ITR_BEFORE[LEN_BEFORE - 1]))
    before_res = te.lang.cce.vadds(before_res, ITR_BEFORE[LEN_BEFORE - 2])
    for index in reversed(range(LEN_BEFORE - 2)):
        before_res = te.lang.cce.vmul(before_res, square_data)
        before_res = te.lang.cce.vadds(before_res, ITR_BEFORE[index])

    exp_data = te.lang.cce.vexp(before_abs_data)
    before_res = te.lang.cce.vdiv(before_res, exp_data)

    # compute bessel_i0e for data in other domain
    data = te.lang.cce.vdiv(broad_const_limit, abs_data)

    after_res = te.lang.cce.vmuls(data, tvm.const(ITR_AFTER[LEN_AFTER - 1]))
    after_res = te.lang.cce.vadds(after_res, ITR_AFTER[LEN_AFTER - 2])
    for index in reversed(range(LEN_AFTER - 2)):
        after_res = te.lang.cce.vmul(after_res, data)
        after_res = te.lang.cce.vadds(after_res, ITR_AFTER[index])

    sqrt_data = te.lang.cce.vsqrt(abs_data, 1)

    after_res = te.lang.cce.vdiv(after_res, sqrt_data)
    after_res = te.lang.cce.vmin(before_res, after_res)

    # cast back to the original dtype at the end
    if dtype_input == "float16":
        after_res = te.lang.cce.cast_to(after_res, "float16")

    return after_res
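
Where SciPy is available, the result can be cross-checked against
scipy.special.i0e, the exponentially scaled Bessel function this kernel
implements; the check and sample points are assumptions for illustration:

import numpy as np
from scipy.special import i0e  # exponentially scaled modified Bessel I0

x = np.linspace(-10.0, 10.0, 9)
# i0e is even in x, which is why this kernel needs no sign fix-up
assert np.allclose(i0e(x), i0e(-x))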
Example No. 29
    def __init__(self,
                 x,
                 cont,
                 w_xh_x_static,
                 h_0,
                 w_xh,
                 bias_h,
                 w_hh,
                 w_ho,
                 bias_o,
                 o_t,
                 h_t,
                 expose_hidden=False,
                 num_output=0,
                 kernel_name="basicrnn_cell",
                 impl_mode="high_performance"):
        """
        Init BasicRNNCell base parameters

        Parameters
        ----------
        x: dict
            data of input
        cont: dict
            data of cont
        w_xh_x_static: dict
            data of w_xh_x_static
        h_0: dict
            data of h_0
        w_xh: dict
            data of w_xh
        w_hh: dict
            data of w_hh
        w_ho: dict
            data of w_ho
        bias_h: dict
            data of bias_h
        bias_o: dict
            data of bias_o
        o_t: dict
            data of o_t
        h_t: dict
            data of h_t
        expose_hidden: bool
            if expose hidden state
        num_output: int
            number of output
        kernel_name: str
            the name of the operator
        impl_mode: str
            impl mode

        Returns
        -------
        None
        """
        self.kernel_name = kernel_name
        self.impl_mode = impl_mode
        self.tensor_list1 = {}
        self.tensor_list2 = {}
        self.emit_cmd = {}
        self.scope_list = {}
        self.tanh_ht_tensor = None
        self.tanh_ot_tensor = None
        self.expose_hidden = expose_hidden
        self.num_output = num_output

        self.has_static = True
        if w_xh_x_static is None:
            self.has_static = False

        dtypes = {
            "x": x.get("dtype").lower(),
            "w_xh": w_xh.get("dtype").lower(),
            "w_ho": w_ho.get("dtype").lower(),
            "bias_h": bias_h.get("dtype").lower(),
            "bias_o": bias_o.get("dtype").lower(),
            "o_t": o_t.get("dtype").lower(),
            "h_t": h_t.get("dtype").lower()
        }

        shapes = {
            "x": x.get("shape"),
            "w_xh": w_xh.get("shape"),
            "w_ho": w_ho.get("shape"),
            "bias_h": (math.ceil(float(bias_h.get("shape")[0]) / 16), 16),
            "bias_o": (math.ceil(float(bias_o.get("shape")[0]) / 16), 16),
            "o_t": o_t.get("shape"),
            "h_t": h_t.get("shape")
        }

        datas = {
            "x":
            tvm.placeholder(shapes["x"], name="x", dtype=dtypes["x"]),
            "w_xh":
            tvm.placeholder(shapes["w_xh"], name="w_xh", dtype=dtypes["w_xh"]),
            "w_ho":
            tvm.placeholder(shapes["w_ho"], name="w_ho", dtype=dtypes["w_ho"]),
            "bias_h":
            tvm.placeholder(shapes["bias_h"],
                            name="bias_h",
                            dtype=dtypes["bias_h"]),
            "bias_o":
            tvm.placeholder(shapes["bias_o"],
                            name="bias_o",
                            dtype=dtypes["bias_o"])
        }

        dims = {
            "batch_dim": shapes["x"][1],
            "input_dim": shapes["x"][0],
            "hidden_dim": shapes["w_xh"][1]
        }

        if self.has_static:
            dtypes["w_xh_x_static"] = w_xh_x_static.get("dtype").lower()
            shapes["w_xh_x_static"] = w_xh_x_static.get("shape")
            datas["w_xh_x_static"] = tvm.placeholder(
                shapes["w_xh_x_static"],
                name="w_xh_x_static",
                dtype=dtypes["w_xh_x_static"])

        if self.expose_hidden:
            dtypes["h_0"] = h_0.get("dtype").lower()
            dtypes["cont"] = cont.get("dtype").lower()
            dtypes["w_hh"] = w_hh.get("dtype").lower()

            shapes["cont"] = (math.ceil(float(cont.get("shape")[0]) / 16), 16)
            shapes["h_0"] = h_0.get("shape")
            shapes["w_hh"] = w_hh.get("shape")

            datas["cont"] = tvm.placeholder(shapes["cont"],
                                            name="cont",
                                            dtype=dtypes["cont"])
            datas["h_0"] = tvm.placeholder(shapes["h_0"],
                                           name="h_0",
                                           dtype=dtypes["h_0"])
            datas["w_hh"] = tvm.placeholder(shapes["w_hh"],
                                            name="w_hh",
                                            dtype=dtypes["w_hh"])

        self.check_input_parameters(dtypes, shapes, dims)

        self.shapes = shapes
        self.dtypes = dtypes
        self.datas = datas
        self.dims = dims

        self.device = "mini"
        if not cce_conf.api_check_support("te.lang.cce.vadd", "float32"):
            self.device = "hisi_es"
Example No. 30
def apply_momentum_compute_d(var,
                             accum,
                             lr,
                             grad,
                             momentum,
                             var_out,
                             accum_out,
                             use_nesterov,
                             kernel_name='apply_momentum_d'):
    """
    Update '*var' according to the ApplyMomentum algorithm.

    accum = accum * momentum + grad
    if use_nesterov is True:
        var -= grad * lr + accum * momentum * lr
    else:
        var -= accum * lr

    Parameters:
    ----------
    var : mutable tensor var.

    accum: mutable tensor accum.

    lr : scalar lr.

    grad : tensor grad.

    momentum : scalar momentum.

    var_out : the dict of output var.

    accum_out : the dict of output accum.

    use_nesterov: bool. If True, use the Nesterov momentum update;
        default value is False.

    kernel_name : cce kernel name, default value is "apply_momentum_d".

    Returns:
    -------
    None
    """

    # cast to float32 for higher accuracy
    dtype = var.dtype
    if dtype == "float16" and api_check_support("te.lang.cce.vadd", "float32"):
        var = te.lang.cce.cast_to(var, "float32")
        accum = te.lang.cce.cast_to(accum, "float32")
        lr = te.lang.cce.cast_to(lr, "float32")
        grad = te.lang.cce.cast_to(grad, "float32")
        momentum = te.lang.cce.cast_to(momentum, "float32")

    # update accum
    accum_delta = tvm.compute(accum.shape,
                              lambda *indice: accum(*indice) * momentum[0],
                              tag='elewise_single_VS_mul')
    accum_t = te.lang.cce.vadd(accum_delta, grad)

    # update var
    if use_nesterov:
        var_delta = tvm.compute(grad.shape,
                                lambda *indice: grad(*indice) * lr[0],
                                tag='elewise_single_VS_mul')
        var_delta_2 = tvm.compute(
            accum_t.shape,
            lambda *indice: accum_t(*indice) * momentum[0],
            tag='elewise_single_VS_mul')
        var_delta_2 = tvm.compute(var_delta_2.shape,
                                  lambda *indice: var_delta_2(*indice) * lr[0],
                                  tag='elewise_single_VS_mul')
        var_delta = te.lang.cce.vadd(var_delta, var_delta_2)
        var_t = te.lang.cce.vsub(var, var_delta)
    else:
        var_delta = tvm.compute(accum_t.shape,
                                lambda *indice: accum_t(*indice) * lr[0],
                                tag='elewise_single_VS_mul')
        var_t = te.lang.cce.vsub(var, var_delta)

    if dtype == "float16":
        var_t = te.lang.cce.cast_to(var_t, "float16")
        accum_t = te.lang.cce.cast_to(accum_t, "float16")

    var_out_data = te.lang.cce.vadds(var_t, tvm.const(NUM_ZERO, var_t.dtype))
    accum_out_data = te.lang.cce.vadds(accum_t,
                                       tvm.const(NUM_ZERO, accum_t.dtype))

    def _compute(*index):
        return accum_t(*index), var_t(*index), var_out_data(*index), \
               accum_out_data(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
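
A minimal NumPy sketch of both update variants from the docstring; the
sample values are hypothetical:

import numpy as np

var = np.array([1.0, 2.0])
accum = np.zeros(2)
grad = np.array([0.5, -0.5])
lr, momentum, use_nesterov = 0.1, 0.9, True

accum = accum * momentum + grad
if use_nesterov:
    var -= grad * lr + accum * momentum * lr
else:
    var -= accum * lr
print(var, accum)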