Example #1
def _scalar_dequant_v100_v2(x_l0c, deq_ub, align_shape, x_shape, relu_flag,
                            sqrt_mode):
    """
    dequant for scale in v100

    """
    res = tvm.compute(
        align_shape,
        lambda i, j, k, l:
        (x_l0c(i, j, k, l).astype("float16") * deq_ub(0, 0, 0, 0)),
        name='dequant_to_fp16')

    if sqrt_mode:
        res = tvm.compute(x_shape,
                          lambda i, j, k, l:
                          (res(i, j, k, l) * deq_ub(0, 0, 0, 0)),
                          name='dequant_sqrt')

    if relu_flag:
        res = tvm.compute(x_shape,
                          lambda *indices: tvm.relu(res(*indices)),
                          name="dequant_relu")

    res = tvm.compute(x_shape,
                      lambda *indice: res(*indice),
                      name="res",
                      tag='dequant_res',
                      attrs={
                          'sqrt_mode': sqrt_mode,
                          'relu_mode': relu_flag,
                          'is_scalar': 1
                      })
    return res
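Taken together, the stages above compute y = fp16(x) * deq, optionally multiplied by deq a second time (sqrt mode) and clamped by ReLU. A minimal NumPy sketch of the same arithmetic, for illustration only (the scalar deq value and input values are made up):

import numpy as np

def scalar_dequant_reference(x_s32, deq, relu_flag=False, sqrt_mode=False):
    # fp16(x) * deq, optionally * deq again (sqrt mode), optionally ReLU
    res = x_s32.astype(np.float16) * np.float16(deq)
    if sqrt_mode:
        res = res * np.float16(deq)
    if relu_flag:
        res = np.maximum(res, np.float16(0))
    return res

x = np.array([[-8, 4], [16, -2]], dtype=np.int32)
print(scalar_dequant_reference(x, 0.5, relu_flag=True))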
Example #2
def _transform(theta, input_dim, out_size, input_shape, dtype):
    """
    apply the batch of 2x3 affine transforms in theta to the input feature
    map; _meshgrid and _interpolate are helpers defined alongside this
    function
    """
    num_batch = input_shape[0]
    height = input_shape[1]
    width = input_shape[2]
    num_channels = input_shape[3]

    theta = topi.reshape(theta, (num_batch, 2, 3))
    theta = topi.cast(theta, dtype)

    out_height = out_size[0]
    out_width = out_size[1]

    grid = _meshgrid(out_height, out_width)
    grid = topi.reshape(grid, (num_batch, 3, out_height * out_width))
    grid = topi.cast(grid, dtype=dtype)

    k = tvm.reduce_axis((0, 3), 'k')
    T_g = tvm.compute(
        (num_batch, 2, out_height * out_width),
        lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis=k),
        name='T_g')

    x_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, k: T_g[i, 0, k],
                      name='x_s')
    y_s = tvm.compute((num_batch, 1, out_height * out_width),
                      lambda i, j, k: T_g[i, 1, k],
                      name='y_s')

    x_s_flat = topi.reshape(x_s, (num_batch * out_height * out_width,))
    y_s_flat = topi.reshape(y_s, (num_batch * out_height * out_width,))

    input_transformed = _interpolate(input_dim, input_shape, x_s_flat,
                                     y_s_flat, out_size, dtype)
    output = topi.reshape(input_transformed,
                          [num_batch, out_height, out_width, num_channels])
    return output
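The core step of _transform is the batched matmul T_g = theta x grid, which maps the output sampling grid through each 2x3 affine matrix before interpolation. A NumPy sketch of that step alone (the x/y/ones row order of the grid is an assumption about _meshgrid):

import numpy as np

def affine_grid_reference(theta, out_height, out_width):
    # homogeneous sampling grid with rows x, y, 1 (assumed _meshgrid layout)
    ys, xs = np.meshgrid(np.linspace(-1.0, 1.0, out_height),
                         np.linspace(-1.0, 1.0, out_width), indexing="ij")
    grid = np.stack([xs.ravel(), ys.ravel(),
                     np.ones(out_height * out_width)])
    theta = theta.reshape(-1, 2, 3)
    # T_g[b] = theta[b] @ grid; row 0 gives x_s, row 1 gives y_s
    t_g = np.einsum("byk,kx->byx", theta, grid)
    return t_g[:, 0, :], t_g[:, 1, :]

x_s, y_s = affine_grid_reference(
    np.array([1.0, 0.0, 0.0, 0.0, 1.0, 0.0], dtype=np.float32), 4, 4)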
Example #3
def _dequant_v200_v2(x_l0c, deq_ub, align_shape, x_shape, relu_flag,
                     tensor_flag):
    """
    dequant for vector in v200

    """
    if tensor_flag:
        res_f16 = tvm.compute(
            align_shape,
            lambda i, j, k, l: tvm.vdeq_cast(x_l0c(i, j, k, l),
                                             deq_ub(0, j, 0, l),
                                             dtype="float16",
                                             do_relu=relu_flag),
            name='dequant_to_fp16',
            tag="dequant_vector")

    else:
        res_f16 = tvm.compute(
            align_shape,
            lambda i, j, k, l: tvm.deq_cast(
                x_l0c(i, j, k, l), deq_ub(0, 0, 0, 0), dtype="float16"),
            name='dequant_to_fp16',
            tag="dequant_scale")
    is_scalar = 1
    if tensor_flag:
        is_scalar = 0
    res = tvm.compute(x_shape,
                      lambda *indice: res_f16(*indice),
                      name='res',
                      tag="dequant_res",
                      attrs={'is_scalar': is_scalar})

    return res
Example #4
def _scalar_dequant_v100(x, x_shape, align_shape, deq_scale, relu_flag,
                         sqrt_mode):
    """
    dequant for scale in v100

    """
    res_f16 = tvm.compute(
        align_shape,
        lambda i, j, k, l:
        (x(i, j, k, l).astype("float16") * deq_scale(0, 0, 0, 0, 0)),
        name='dequant1',
        tag="dequant1_scale")

    res = tvm.compute(x_shape,
                      lambda *indice: res_f16(*indice),
                      name='dequant_remove_pad',
                      tag="dequant_remove_pad")

    if relu_flag:
        res = tvm.compute(x_shape,
                          lambda *indices: tvm.relu(res(*indices)),
                          name="dequant_relu",
                          tag="dequant_relu")
    if sqrt_mode:
        res = tvm.compute(
            x_shape,
            lambda i, j, k, l: (res(i, j, k, l) * deq_scale(0, 0, 0, 0, 0)),
            name='dequant2',
            tag='dequant2_scale',
        )

    return res
Example #5
def _vector_depthwise_fused_v200(x, x_shape, align_shape, deq_scale,
                                 relu_flag):
    """
    depthwise dequant for vector in v200

    """
    res_f16 = tvm.compute(
        align_shape,
        lambda i, j, a, k, l: tvm.vdeq_cast(x(i, j // 2, j % 2, k, l),
                                            deq_scale(0, j, 0, 0, l),
                                            dtype="float16",
                                            do_relu=relu_flag),
        name='dequant1',
        tag="dequant1_vector",
        attrs={"relu_flag": relu_flag})

    align_shape[3] = x_shape[3].value

    res = tvm.compute(align_shape,
                      lambda *indice: res_f16(*indice),
                      name='dequant_remove_pad',
                      tag="dequant_remove_pad",
                      attrs={"sqrt_flag": 0})

    return res
Example #6
def _unpack_compute_copy(input_place, y, num, axis, kernel_name="unpack"):
    """
    unpack a tensor into `num` tensors along axis dimension.

    Parameters
    ----------
    input_place: TVM tensor
        the tensor of input.
    y: tuple or list
        the list of output tensor.
    num : int.
        the length of the dim axis.
    axis: int.
        the axis to unpack along.
    kernel_name : str.
        cce kernel name, default value is "unpack".

    Returns
    -------
    gm2ub_tensor_list: list
        the list of gm2ub tensors, tensor type is TVM tensor.
    ub2gm_tensor_list: list
        the list of ub2gm tensors, tensor type is TVM tensor.
    virtual_node:
        the tensors of virtual output node, tensor type is TVM tensor.
    """
    input_shape = te.lang.cce.util.shape_to_list(input_place.shape)
    output_shape = input_shape
    for index, _ in enumerate(output_shape):
        output_shape[index] = output_shape[index] if index != axis else 1

    offset = 0
    gm2ub_tensor_list = []
    ub2gm_tensor_list = []
    for i in range(num):
        gm2ub_tensor = tvm.compute(output_shape,
                                   lambda *index: input_place(*_index_offset(
                                       output_shape, axis, offset, *index)),
                                   name=''.join(['tensor', str(i)]))
        gm2ub_tensor_list.append(gm2ub_tensor)

        ub2gm_tensor = tvm.compute(output_shape,
                                   lambda *index: gm2ub_tensor(*index),
                                   name=''.join(['res', str(i)]))
        ub2gm_tensor_list.append(ub2gm_tensor)

        offset = offset + output_shape[axis]

    # create a virtual node
    def _add_compute(*index):
        virtual_tensor = ub2gm_tensor_list[0](*index)
        for ub2gm_tensor in ub2gm_tensor_list[1:]:
            virtual_tensor += ub2gm_tensor(*index)
        return virtual_tensor

    virtual_node = tvm.compute(output_shape,
                               lambda *index: _add_compute(*index),
                               name="virtual_node")

    return gm2ub_tensor_list, ub2gm_tensor_list, virtual_node
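For reference, the loop above produces one slice of extent 1 along axis per output; the virtual node only exists to give the schedule a single root. An illustrative NumPy equivalent of the slicing:

import numpy as np

def unpack_reference(data, num, axis):
    # one extent-1 slice along `axis` per output, mirroring _index_offset
    return [np.take(data, [i], axis=axis) for i in range(num)]

data = np.arange(24).reshape(2, 3, 4)
pieces = unpack_reference(data, num=3, axis=1)   # three arrays of shape (2, 1, 4)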
Example #7
def newton_iteration(shape, tensor_x_rec, tensor_x, symbol, iter_num):
    """
    the function of newton_iteration
    Parameters
    ----------
    shape: tensor shape
    tensor_x_rec: tensor
    tensor_x: tensor
    symbol: tensor symbol
    iter_num: number of Newton iterations

    Returns
    -------
    tensor_list: dict
    scope_list: dict
    emit_list: dict
    """
    dtype_c = tensor_x_rec.dtype
    num_two = tvm.const(2, dtype=dtype_c)
    neg_one = tvm.const(-1, dtype=dtype_c)
    tmp = tensor_x_rec

    tensor_list = {}
    scope_list = {}
    emit_list = {}
    tmp_mul = None
    tmp_neg = None
    tmp_add = None
    for index in range(0, iter_num):
        key = "tmp_mul_" + symbol + str(index)
        tmp_mul = tvm.compute(shape,
                              lambda *i: tensor_x(*i) * tmp(*i),
                              name=key)
        tensor_list[key] = tmp_mul
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_mul"

        key = "tmp_neg_" + symbol + str(index)
        tmp_neg = tvm.compute(shape,
                              lambda *i: tmp_mul(*i) * neg_one,
                              name=key)
        tensor_list[key] = tmp_neg
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_muls"

        key = "tmp_add_" + symbol + str(index)
        tmp_add = tvm.compute(shape,
                              lambda *i: tmp_neg(*i) + num_two,
                              name=key)
        tensor_list[key] = tmp_add
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_adds"

        key = "tmp_" + symbol + str(index)
        tmp = tvm.compute(shape, lambda *i: tmp_add(*i) * tmp(*i), name=key)
        tensor_list[key] = tmp
        scope_list[key] = cce.scope_ubuf
        emit_list[key] = "vector_mul"

    return tensor_list, scope_list, emit_list
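Each pass of the loop refines tmp with tmp = (2 - x * tmp) * tmp, the Newton-Raphson step for the reciprocal 1/x, so tensor_x_rec is assumed to hold an initial reciprocal estimate. A small NumPy sketch of the convergence (values are illustrative):

import numpy as np

def newton_reciprocal(x, x_rec, iter_num):
    tmp = x_rec
    for _ in range(iter_num):
        # mirrors tmp_mul, tmp_neg, tmp_add and tmp in the compute graph above
        tmp = (2.0 - x * tmp) * tmp
    return tmp

x = np.array([3.0, 7.0], dtype=np.float32)
print(newton_reciprocal(x, np.array([0.3, 0.1], dtype=np.float32), iter_num=3))
# approaches [1/3, 1/7]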
Example #8
def _compute_update(logbase, sign_decay, sign_gm, grad):
    vmul_tmp = tvm.compute(sign_gm.shape,
                           lambda *indice: sign_gm(*indice) * sign_decay[0],
                           tag='elewise_single_VS_mul')
    vmul_tmp = tvm.compute(vmul_tmp.shape,
                           lambda *indice: vmul_tmp(*indice) * logbase[0],
                           tag='elewise_single_VS_mul')
    exp_tmp = te.lang.cce.vexp(vmul_tmp)
    update = te.lang.cce.vmul(exp_tmp, grad)
    return update
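The two scalar multiplies feed vexp and vmul, i.e. update = exp(logbase * sign_decay * sign_gm) * grad, which looks like a PowerSign-style update. A NumPy sketch with made-up scalar values:

import numpy as np

def compute_update_reference(logbase, sign_decay, sign_gm, grad):
    # update = exp(logbase * sign_decay * sign_gm) * grad
    return np.exp(logbase * sign_decay * sign_gm) * grad

grad = np.array([0.5, -1.0], dtype=np.float32)
sign_gm = np.sign(grad * np.array([0.2, 0.3], dtype=np.float32))  # hypothetical sign(g * m)
print(compute_update_reference(np.log(2.0), 0.9, sign_gm, grad))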
Example #9
def apply_keras_momentum_d_compute(var,
                                   accum,
                                   lr,
                                   grad,
                                   momentum,
                                   out_var,
                                   out_accum,
                                   use_nesterov,
                                   kernel_name="apply_keras_momentum_d"):
    """
    the operator's compute
    :param var: weight, placeholder
    :param accum: accum, placeholder
    :param lr: learning rate, placeholder
    :param grad: gradient, placeholder
    :param momentum: nesterov momentum, placeholder
    :param out_var: updated var
    :param out_accum: updated accum
    :param use_nesterov: bool
    :return: out_var, out_accum
    """
    inp_dtype = var.dtype
    # check the instruction supports or not
    vmul_support = tbe_platform.cce_conf.api_check_support(
        "te.lang.cce.vmul", "float32")
    if inp_dtype == "float32" and not vmul_support:
        raise RuntimeError(
            "Input dtype is float32, but do not support on the platform")

    # update var and accum according to the momentum scheme
    # accum = accum * momentum - grad * lr
    accum_momen = tvm.compute(accum.shape,
                              lambda *indices: accum(*indices) * momentum[0],
                              tag='elewise_single_VS_mul')
    grad_lr = tvm.compute(grad.shape,
                          lambda *indices: grad(*indices) * lr[0],
                          tag='elewise_single_VS_mul')
    out_accum = te.lang.cce.vsub(accum_momen, grad_lr)

    # var = var + accum * momentum - grad * lr
    if use_nesterov is True:
        accum_momen2 = tvm.compute(
            accum.shape,
            lambda *indices: out_accum(*indices) * momentum[0],
            tag='elewise_single_VS_mul')
        add_var_am = te.lang.cce.vadd(var, accum_momen2)
        out_var = te.lang.cce.vsub(add_var_am, grad_lr)
    # var = var + accum
    else:
        out_var = te.lang.cce.vadd(var, out_accum)

    def _compute(*index):
        return out_var(*index), out_accum(*index)

    return tvm.compute(var.shape, _compute, name='outputs')
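A plain NumPy reference of the momentum scheme in the comments above, useful for sanity-checking the compute (lr and momentum are treated as scalars):

import numpy as np

def keras_momentum_reference(var, accum, lr, grad, momentum, use_nesterov):
    out_accum = accum * momentum - grad * lr
    if use_nesterov:
        out_var = var + out_accum * momentum - grad * lr
    else:
        out_var = var + out_accum
    return out_var, out_accum

var = np.array([1.0, 2.0], dtype=np.float32)
accum = np.zeros_like(var)
grad = np.array([0.5, -0.5], dtype=np.float32)
print(keras_momentum_reference(var, accum, 0.1, grad, 0.9, use_nesterov=True))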
Example #10
def avg_pool_compute1(x, y, ksize, strides,
                      padding="VALID", data_format="NHWC",
                      is_fused_compute=True,
                      kernel_name="avg_pool"):
    """
    describe compute
    return: tensor
    """
    # create window and stride for pooling2d
    if data_format in ("NHWC",):
        window = [ksize[1], ksize[2]]
        stride = [strides[1], strides[2]]
    else:
        window = [ksize[2], ksize[3]]
        stride = [strides[2], strides[3]]

    window = list(window)
    stride = list(stride)

    # l1 fusion and l2 fusion
    l1_fusion_type = x.op.attrs["L1_fusion_type"].value \
        if "L1_fusion_type" in x.op.attrs else -1
    fusion_params = get_fusion_params(x, y, is_fused_compute)
    in_select_read_flag = fusion_params.get("in_select_read_flag")
    in_valid_shape = fusion_params.get("in_valid_shape")
    in_slice_offset = fusion_params.get("in_slice_offset")

    if in_select_read_flag:
        select_tensor_in = tvm.compute(in_valid_shape,
                                       lambda n, c1, h, w, c0:
                                       x(n, c1, h + in_slice_offset[2], w, c0),
                                       name="tensor_read_select",
                                       attrs=x.op.attrs)
        res = te.lang.cce.pooling2d(select_tensor_in, window, stride, "AVG",
                                    padding, fusion_params=fusion_params)
    elif l1_fusion_type == 1:
        x.op.attrs["addr_type"].value = 1
        in_l1_flag = True
        fusion_params["in_l1_flag"] = in_l1_flag

        l1_width_fusion_in = tvm.compute(x.shape,
                                         lambda n, c1, h, w, c0:
                                         x(n, c1, h, w, c0),
                                         name="l1_width_fusion_tensor_in",
                                         attrs=x.op.attrs)
        res = te.lang.cce.pooling2d(l1_width_fusion_in, window, stride,
                                    "AVG", padding,
                                    fusion_params=fusion_params)
    else:
        res = te.lang.cce.pooling2d(x, window, stride, "AVG", padding,
                                    fusion_params=fusion_params)

    return res
Example #11
def _reform_by_vadds(input_tensor, input_shape, output_shape, offset_val,
                     nz_format_flag):
    """
    5 dim input tensor C0 change
    Parameters
    ----------
    input_tensor : input tensor

    input_shape : the shape of input tensor

    output_shape :the shape of output tensor

    offset_val : the val of offset

    nz_format_flag: the format of input tensor

    Returns
    -------
    res tensor
    """
    vadds_vector = tvm.compute(output_shape,
                               _reform_compute_generate(
                                   input_tensor, input_shape, output_shape,
                                   (True, offset_val, -1), nz_format_flag),
                               name='reform_by_vadds')

    return vadds_vector
Example #12
def write_select_compute(input_tensor, output_x, kernel_name="write_select"):
    """
    calculating data

    Parameters
    ----------
    input_tensor : TVM tensor
        the input tensor
    output_x : dict
        dict of output_x, include keys(shape and dtype)
    kernel_name : str
        kernel name, default value is "write_select"

    Returns
    -------
    output tensor
    """
    # input_shape = output_x.get("shape")
    input_shape = input_tensor.shape
    valid_shape = output_x.get("valid_shape")

    if len(valid_shape) != PARA_LIST_LEN:
        raise RuntimeError("the len of valid shape should be 5")

    _, _, h_valid, w_valid, c0_valid = valid_shape

    compute_name = "res_write_select" + "_" + str(NAME_INDEX[0])
    NAME_INDEX[0] += 1
    res = tvm.compute(input_shape,
                      lambda *indice: input_tensor(*indice),
                      name=compute_name,
                      attrs={"HWC0": h_valid * w_valid * c0_valid},
                      tag=WRITE_SELECT_TAG)

    return res
Example #13
def strided_write_compute(x, y, axis, stride, kernel_name='strided_write'):
    """
    write data to tensor by stride.

    Parameters:
    ----------
    x: placeholder of input tensor.

    y: dict of output tensor.

    axis: which axis to write data by stride.

    stride: data write stride.

    kernel_name: cce kernel name, default value is "strided_write".

    Returns:
    ----------
    output_y: result tensor.
    """
    shape_y = tuple(i.value for i in x.shape)
    output_y = tvm.compute(shape_y, lambda *indice: x(*indice),
                           name="strided_write",
                           attrs={"stride": stride},
                           tag=STRIDED_WRITE_TAG)
    return output_y
Example #14
def _format_transfer_nz(shape, x, c1_index):
    """
    C0 from 16 to 32 for FRACTAL_NZ
    """
    trans_shape = shape[:]
    trans_shape[c1_index] = trans_shape[c1_index] // 2
    trans_shape[-1] = trans_shape[-1] * 2
    res = tvm.compute(trans_shape,
                      _format_compute(x, trans_shape, c1_index),
                      name='data_transfer',
                      tag="requant_data_transfer")
    res = tvm.compute(trans_shape,
                      lambda *i: res[i],
                      name='res',
                      tag='requant_NZ')
    return res
Example #15
def _muti_output(var_out, m_out, output_data, m_output_data, shape):
    """
    this compute is for multi output

    Parameters:
    ----------
    var_out: the value of var_out
    m_out: the value of m_out
    output_data: the dict of output_data
    m_output_data: the dict of m_output_data
    shape: the shape of var

    Returns
    -------
    the new value of out_var and out_m
    the output
    """

    # this compute is for multi output
    def _compute(*index):
        return var_out(*index), m_out(*index), output_data(
            *index), m_output_data(*index)

    out_var, out_m, out_data, m_out_data = tvm.compute(shape,
                                                       _compute,
                                                       name="outputs")

    return out_var, out_m, out_data, m_out_data
Example #16
def _reform_by_vmuls(input_tensor, input_shape, output_shape, scale_val,
                     nz_format_flag):
    """
    5 dim input tensor C0 change
    Parameters
    ----------
    input_tensor : input tensor

    input_shape : the shape of input tensor

    output_shape :the shape of output tensor

    scale_val : the val of scale

    nz_format_flag: the format of input tensor

    Returns
    -------
    res tensor
    """
    vmuls_vector = tvm.compute(output_shape,
                               _reform_compute_generate(
                                   input_tensor, input_shape, output_shape,
                                   (False, -1, scale_val), nz_format_flag),
                               name='reform_by_vmuls')

    return vmuls_vector
Example #17
def strided_read_compute(x, y, axis, stride, kernel_name='strided_read'):
    """
    read data from tensor by stride.

    Parameters:
    ----------
    x: placeholder of input tensor.

    y: dict of output tensor.

    axis: which axis to read data by stride.

    stride: data read stride.

    kernel_name: cce kernel name, default value is "strided_read".

    Returns:
    ----------
    output_y: result tensor.
    """
    output_y = tvm.compute(y.get("shape"),
                           lambda batch_idx, c1_idx, h_idx, w_idx, c0_idx: x[
                               batch_idx, c1_idx, h_idx, w_idx, c0_idx],
                           name="strided_read",
                           tag=STRIDED_READ_TAG,
                           attrs=x.op.attrs)

    return output_y
Example #18
def custom_equal(shape_x, shape_y, dtype, kernel_name="cce_tf_equal", need_build=False,
                 need_print=False):
    """
    do element-wise equal operation between two input tensors

    Parameters:
    ----------
    shape_x : shape of input x

    shape_y : shape of input y

    dtype : source data type, support float16,float32,int32,int8,uint8

    kernel_name : cce kernel name, default value is "cce_tf_equal"

    need_build : whether to build the CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None
    """

    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape_x)
    util.check_shape_rule(shape_y)

    check_list = ["float16", "float32", "int32", "int8", "uint8", "bool"]

    dtype = dtype.lower()
    if not (dtype in check_list):
        raise RuntimeError(
            "tf_equal_cce only support %s while dtype is %s" % (",".join(check_list), dtype))

    util.check_shape_size(shape_x, SHAPE_SIZE_LIMIT)
    util.check_shape_size(shape_y, SHAPE_SIZE_LIMIT)

    shape_x, shape_y, shape_max = util.produce_shapes(shape_x, shape_y)

    util.check_shape_size(shape_max, SHAPE_SIZE_LIMIT)

    x = tvm.placeholder(shape_x, dtype=dtype, name="x")
    y = tvm.placeholder(shape_y, dtype=dtype, name="y")

    x_tmp = te.lang.cce.broadcast(x, shape_max)
    y_tmp = te.lang.cce.broadcast(y, shape_max)

    res = tvm.compute(shape_max, lambda *i: x_tmp(*i) == y_tmp(*i), name='res')

    sch = tvm.create_schedule(res.op)

    if need_print:
        with build_config:
            print(tvm.lower(sch, [x, y, res], simple_mode=True))

    if need_build:
        with build_config:
            tvm.build(sch, [x, y, res], "cce", name=kernel_name)
Example #19
def _vector_depthwise_fused_v100(x, x_shape, align_shape, deq_scale, relu_flag,
                                 sqrt_mode):
    """
    dequant for vector in v100

    """

    if relu_flag:
        res_f16 = tvm.compute(align_shape,
                              lambda i, j, a, k, l: tvm.relu(
                                  x(i, j // 2, j % 2, k, l).astype("float16") *
                                  deq_scale(0, j, 0, 0, l)),
                              name='dequant1',
                              tag="dequant1_vector",
                              attrs={"relu_flag": 1})

    else:
        res_f16 = tvm.compute(align_shape,
                              lambda i, j, a, k, l: x(i, j // 2, j % 2, k, l).
                              astype("float16") * deq_scale(0, j, a, 0, l),
                              name='dequant1',
                              tag="dequant1_vector",
                              attrs={"relu_flag": 0})

    align_shape[3] = x_shape[3].value

    if not sqrt_mode:
        res = tvm.compute(align_shape,
                          lambda *indice: res_f16(*indice),
                          name='dequant_remove_pad',
                          tag="dequant_remove_pad",
                          attrs={"sqrt_flag": 0})
    else:
        res_sqrt = tvm.compute(
            align_shape,
            lambda i, j, a, k, l:
            (res_f16(i, j, a, k, l) * deq_scale(0, j, a, 0, l)),
            name='dequant2',
            tag='dequant2_vector')

        res = tvm.compute(align_shape,
                          lambda *indice: res_sqrt(*indice),
                          name='dequant2_remove_pad',
                          tag="dequant2_remove_pad",
                          attrs={"sqrt_flag": 1})
    return res
Example #20
def _input_compute_generate(x, in_shape, read_shape, c1_dim, c1_index):
    """
    generate lambda func
    """
    x_shape = te.lang.cce.util.shape_to_list(x.shape)
    dtype = x.dtype
    x_slice_offset = _get_input_attr(x, "slice_offset", [], True)
    l1_fusion_flag = _get_input_attr(x, "l1_fusion_flag", -1, False)
    if not x_slice_offset:
        x_slice_offset = [0, 0, 0, 0, 0]

    if l1_fusion_flag != -1:
        x_w = x_shape[3]
        n_offset, _, h_offset, w_offset, _ = x_slice_offset
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(
                in_shape,
                lambda n, c1, m, c0: x(n + n_offset, c1, (m // x_w) + h_offset,
                                       (m % x_w) + w_offset, c0),
                name="input_ub",
                attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda n, c1, m, c0: tvm.select(
                    c1 <= in_shape[c1_index] - 1,
                    x(n + n_offset, c1, (m // x_w) + h_offset,
                      (m % x_w) + w_offset, c0), tvm.const(0, dtype=dtype)),
                name='input_ub',
                attrs={"c_out": c1_dim})
    else:
        if c1_dim % 2 == 0:
            input_ub = tvm.compute(in_shape,
                                   lambda *i: x(*i),
                                   name="input_ub",
                                   attrs={"c_out": c1_dim})
        else:
            input_ub = tvm.compute(
                read_shape,
                lambda *indice: tvm.select(
                    indice[c1_index] <= in_shape[c1_index] - 1, x(*indice),
                    tvm.const(0, dtype=dtype)),
                name='input_ub',
                attrs={"c_out": c1_dim})
    return input_ub
Example #21
def _scalar_dequant_v200(x, x_shape, align_shape, deq_scale):
    """
    dequant for scale in v200

    """
    res_f16 = tvm.compute(
        align_shape,
        lambda i, j, k, l: tvm.deq_cast(
            x(i, j, k, l), deq_scale(0, 0, 0, 0, 0), dtype="float16"),
        name='dequant',
        tag="dequant_scale")

    res = tvm.compute(x_shape,
                      lambda *indice: res_f16(*indice),
                      name='dequant_remove_pad',
                      tag="dequant_remove_pad")

    return res
Example #22
def _compute_m_t(m, beta, grad):
    beta_tmp = tvm.compute(m.shape,
                           lambda *indice: m(*indice) * beta[0],
                           tag='elewise_single_VS_mul')
    beta_na = tvm.compute(
        beta.shape,
        lambda *indice: beta(*indice) * tvm.const(CONST_ONE_NA, beta.dtype),
        tag='elewise_single_VS_mul')
    beta_na = tvm.compute(
        beta_na.shape,
        lambda *indice: beta_na(*indice) + tvm.const(CONST_ONE, beta_na.dtype),
        tag='elewise_single_VS_add')
    beta_sub_tmp = tvm.compute(grad.shape,
                               lambda *indice: grad(*indice) * beta_na[0],
                               tag='elewise_single_VS_mul')

    m_t = te.lang.cce.vadd(beta_tmp, beta_sub_tmp)
    return m_t
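Assuming CONST_ONE_NA == -1 and CONST_ONE == 1 (their definitions sit outside this snippet), the chain above is the running-average update m_t = m * beta + grad * (1 - beta). A NumPy sketch under that assumption:

import numpy as np

def compute_m_t_reference(m, beta, grad):
    # m_t = m * beta + grad * (1 - beta), with beta a scalar
    return m * beta + grad * (1.0 - beta)

m = np.array([0.4, -0.2], dtype=np.float32)
grad = np.array([1.0, 1.0], dtype=np.float32)
print(compute_m_t_reference(m, beta=0.9, grad=grad))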
Example #23
def _assign_sub_compute(data_var, data_value, out, kernel_name='assign_sub'):
    """
    assign_sub compute function

    Parameters
    ----------
    data_var : tvm.tensor
        tensor of var
    data_value : tvm.tensor
        tensor of value
    out : dict
        dict of out.
    kernel_name : str
        cce kernel name, default value is "assign_sub"

    Returns
    -------
    sch : tvm.schedule
        the compute schedule
    res : tvm.tensor
        tensor of result
    """

    shape = data_var.shape
    shape = [i.value for i in shape]
    data_var_ub = tvm.compute(shape,
                              lambda *i: data_var(*i),
                              name='data_var_ub')
    data_value_ub = tvm.compute(shape,
                                lambda *i: data_value(*i),
                                name='data_value_ub')
    if data_var.dtype == "int8" or data_var.dtype == "uint8":
        data_var_cast = tvm.compute(
            shape,
            lambda *i: data_var_ub(*i).astype("float16"),
            name="data_var_cast")
        data_value_cast = tvm.compute(
            shape,
            lambda *i: data_value_ub(*i).astype("float16"),
            name="data_value_cast")
    else:
        data_var_cast = data_var_ub
        data_value_cast = data_value_ub
    res_ub = tvm.compute(shape,
                         lambda *i: data_var_cast(*i) - data_value_cast(*i),
                         name='res_ub.local.UB')
    if data_var.dtype == "int8" or data_var.dtype == "uint8":
        res_ub_cast = tvm.compute(shape,
                                  lambda *i: res_ub(*i).astype(data_var.dtype),
                                  name="res_ub_cast")
    else:
        res_ub_cast = res_ub
    res = tvm.compute(shape, lambda *i: res_ub_cast(*i), name='res')
    schedule_list = (data_var_ub, data_value_ub, data_var_cast,
                     data_value_cast, res_ub, res_ub_cast)
    sch = _assign_sub_schedule(schedule_list, res, shape, data_var.dtype,
                               data_var)

    return sch, res
Example #24
def max_pool3d_compute(x,
                       y,
                       ksize,
                       strides,
                       padding="VALID",
                       data_format="NDHWC",
                       kernel_name="max_pool3d"):
    """
    describe compute
    return: tensor
    """
    shape = x.shape

    # copy gm to ub
    tensor_in_ub = tvm.compute(shape, lambda *i: x[i], name="tensor_in_ub")

    # vmax in W
    shape_w = (shape[0], shape[1], shape[2], shape[3] // 2, shape[4])
    tensor_w = tvm.compute(
        shape_w,
        lambda n, d, h, w, c: tvm.max(tensor_in_ub[n, d, h, 2 * w, c],
                                      tensor_in_ub[n, d, h, 2 * w + 1, c]),
        name='tensor_w')

    # vmax in H
    shape_h = (shape[0], shape[1], shape[2] // 2, shape[3] // 2, shape[4])
    tensor_h = tvm.compute(
        shape_h,
        lambda n, d, h, w, c: tvm.max(tensor_w[n, d, 2 * h, w, c], tensor_w[
            n, d, 2 * h + 1, w, c]),
        name='tensor_h')

    # vmax in D
    shape_d = (shape[0], shape[1] // 2, shape[2] // 2, shape[3] // 2, shape[4])
    tensor_d = tvm.compute(
        shape_d,
        lambda n, d, h, w, c: tvm.max(tensor_h[n, 2 * d, h, w, c], tensor_h[
            n, 2 * d + 1, h, w, c]),
        name='tensor_d')

    # copy ub to gm
    res = tvm.compute(shape_d, lambda *i: tensor_d[i], name='res')

    return res
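Note that this compute hard-codes a 2x2x2 window with stride 2 along D, H and W regardless of ksize, strides and padding. An illustrative NumPy equivalent of the three pairwise-max stages (NDHWC layout, even extents assumed):

import numpy as np

def max_pool3d_2x2x2_reference(x):
    # pairwise max in W, then H, then D, matching tensor_w / tensor_h / tensor_d
    x = np.maximum(x[:, :, :, 0::2, :], x[:, :, :, 1::2, :])
    x = np.maximum(x[:, :, 0::2, :, :], x[:, :, 1::2, :, :])
    x = np.maximum(x[:, 0::2, :, :, :], x[:, 1::2, :, :, :])
    return x

x = np.random.rand(1, 4, 4, 4, 16).astype(np.float16)
print(max_pool3d_2x2x2_reference(x).shape)   # (1, 2, 2, 2, 16)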
Example #25
def newton_iteration(shape, tensor_x_rec, tensor_x, symbol, tensor_list,
                     scope_list, operation_list):
    """
    the function of newton_iteration
    Parameters
    ----------
    shape : tensor shape
    tensor_x_rec : tensor holding the current reciprocal estimate
    tensor_x : tensor
    symbol : tensor symbol
    tensor_list, scope_list, operation_list : dicts updated in place with the
        intermediate tensors, their scopes and emit instructions
    Returns
    -------
    tensor_newton_mul2 : the estimate refined by one Newton step
    """
    dtype_c = tensor_x_rec.dtype
    const_num_neg_two = tvm.const(-2, dtype=dtype_c)
    const_num_neg_one = tvm.const(-1, dtype=dtype_c)

    tensor_newton_mul0 = tvm.compute(
        shape,
        lambda *i: tensor_x_rec(*i) * tensor_x(*i),
        name="tensor_newton_mul0_" + symbol)
    tensor_list["tensor_newton_mul0_" + symbol] = tensor_newton_mul0
    scope_list["tensor_newton_mul0_" + symbol] = cce.scope_ubuf
    operation_list["tensor_newton_mul0_" + symbol] = "vector_mul"
    tensor_newton_add = tvm.compute(
        shape,
        lambda *i: tensor_newton_mul0(*i) + const_num_neg_two,
        name="tensor_newton_add_" + symbol)
    tensor_list["tensor_newton_add_" + symbol] = tensor_newton_add
    scope_list["tensor_newton_add_" + symbol] = cce.scope_ubuf
    operation_list["tensor_newton_add_" + symbol] = "vector_add"
    tensor_newton_mul1 = tvm.compute(
        shape,
        lambda *i: tensor_newton_add(*i) * tensor_x_rec(*i),
        name="tensor_newton_mul1_" + symbol)
    tensor_list["tensor_newton_mul1_" + symbol] = tensor_newton_mul1
    scope_list["tensor_newton_mul1_" + symbol] = cce.scope_ubuf
    operation_list["tensor_newton_mul1_" + symbol] = "vector_mul"
    tensor_newton_mul2 = tvm.compute(
        shape,
        lambda *i: tensor_newton_mul1(*i) * const_num_neg_one,
        name="tensor_newton_mul2_" + symbol)
    return tensor_newton_mul2
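This variant folds the same Newton step used in Example #7 into -(x * y - 2) * y, which is algebraically identical to y * (2 - x * y). A one-line numerical check with made-up values:

import numpy as np

x, y = 5.0, 0.18
assert np.isclose(-(x * y - 2.0) * y, y * (2.0 - x * y))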
Example #26
def im2col_fractal_v2(A_im2col_shape, A, config, compute_dtype):
    """
    calculate im2col_fractal tensor
    Parameters
    ----------
    A_im2col_shape : shape of A_im2col

    A : feature map

    config: the config of cube

    compute_dtype: dtype of compute result
    -------
    Returns : A_im2col_fractal tensor
    """
    def _im2col_fractal_indices(indices, A):
        """
        calculate im2col_fractal tvm lambda function
        Parameters
        ----------
        indices : indices in lambda function

        A : feature map

        -------
        Returns : im2col_fractal tvm lambda function
        """
        block_size = config['mac'][1]
        block_size_M = config['mac'][0]
        n, hw, c1, kernel_h, kernel_w, c0 = A.shape
        batch_size, i1, j1, i0, j0 = indices
        n_index = batch_size

        hw_index = i1 * block_size_M + i0

        c1_index = (((j1 * block_size + j0) // c0.value) //
                    kernel_w.value) // kernel_h.value

        kh_index = (((j1 * block_size + j0) // c0.value) //
                    kernel_w.value) % kernel_h.value

        kw_index = ((j1 * block_size + j0) // c0.value) % kernel_w.value

        c0_index = (j1 * block_size + j0) % c0.value

        dtype = compute_dtype
        return tvm.select(
            tvm.any(hw_index < 0, hw_index > hw.value - 1),
            tvm.const(0.0, dtype),
            A(n_index, hw_index, c1_index, kh_index, kw_index, c0_index))

    return tvm.compute(A_im2col_shape,
                       lambda *indices: _im2col_fractal_indices(indices, A),
                       name='im2col_fractal',
                       tag='im2col_fractal')
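The column index j1 * block_size + j0 is decoded into (c1, kh, kw, c0) with c0 varying fastest, then kernel_w, then kernel_h, then c1. A small worked check of that index arithmetic with illustrative kernel and block sizes:

def decode_col_index(j1, j0, block_size=16, c0=16, kernel_h=3, kernel_w=3):
    # mirrors the division/modulo chain in _im2col_fractal_indices
    col = j1 * block_size + j0
    c1_index = col // c0 // kernel_w // kernel_h
    kh_index = col // c0 // kernel_w % kernel_h
    kw_index = col // c0 % kernel_w
    c0_index = col % c0
    return c1_index, kh_index, kw_index, c0_index

print(decode_col_index(j1=2, j0=5))   # -> (0, 0, 2, 5)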
Example #27
def custom_logical_not(shape,
                       dtype,
                       kernel_name="cce_tf_logical_not",
                       need_build=False,
                       need_print=False):
    """
    logical not for the input tensor

    Parameters
    ----------
    shape : input shape of data

    dtype : the data type, support bool

    kernel_name : cce kernel name, default value is "cce_logical_not"

    need_build : whether to build the CCEC kernel, default value is False

    need_print : if need to print the ir, default value is False

    Returns
    -------
    None

    """
    util.check_kernel_name(kernel_name)
    util.check_shape_rule(shape)

    check_list = ["bool"]
    if not dtype.lower() in check_list:
        raise RuntimeError(
            "logical_not_cce ony supports %s while dtype is %s" %
            (",".join(check_list), dtype))

    util.check_shape_size(shape, SHAPE_SIZE_LIMIT)

    inp_dtype = dtype.lower()

    data = tvm.placeholder(shape, name="data", dtype=inp_dtype)

    with tvm.target.cce():

        result = tvm.compute(
            shape,
            # `is True` is a Python identity test and is always False for a TVM
            # expression; an equality comparison builds the intended select
            lambda *i: tvm.select(data[i] == True, False, True),
            name="result")

        schedule = tvm.create_schedule(result.op)

        if need_print:
            with build_config:
                print(tvm.lower(schedule, [data, result], simple_mode=True))
        if need_build:
            with build_config:
                tvm.build(schedule, [data, result], "cce", name=kernel_name)
Example #28
def _compute_offset(in_tensor, in_shape, out_shape, attr_list, nz_format_flag):
    """
    the compute of offset
    Parameters
    ----------
    in_tensor : input tensor

    in_shape : the shape of input tensor

    out_shape :the shape of output tensor

    attr_list : the attr list

    nz_format_flag: the format of input tensor

    Returns
    -------
    res tensor
    """
    offset = attr_list[0]
    reform_flag = attr_list[1]
    scale = attr_list[2]
    if offset != 0 or scale == 1:
        offset_value = tvm.const(offset, "float16")
        if reform_flag:
            offset_ub = _reform_by_vadds(in_tensor, in_shape, out_shape,
                                         offset_value, nz_format_flag)
        else:
            offset_ub = tvm.compute(
                out_shape,
                lambda *indice: in_tensor(*indice) + offset_value,
                name="offset_ub")
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(offset_ub(*indice), "int8"),
            name='cast_i8_ub')
    else:
        cast_i8_ub = tvm.compute(
            out_shape,
            lambda *indice: topi.cast(in_tensor(*indice), "int8"),
            name='cast_i8_ub')
    return cast_i8_ub
Example #29
def _s32_to_s8_normal_compute(x, req_scale, align_shape, c1_index, tensor_flag,
                              relu_flag):
    """
    generate s32_to_s8 compute
    """
    if tensor_flag:
        res_ub = tvm.compute(align_shape,
                             _deq_cast_compute(x, req_scale, align_shape,
                                               c1_index, tensor_flag,
                                               relu_flag),
                             name='s32_to_s8',
                             tag="requant_vector")
    else:
        res_ub = tvm.compute(align_shape,
                             _deq_cast_compute(x, req_scale, align_shape,
                                               c1_index, tensor_flag,
                                               relu_flag),
                             name='s32_to_s8',
                             tag="requant_scale")
    return res_ub
Example #30
def apply_proximal_gradient_descent_compute(
        var,
        alpha,
        l1,
        l2,
        delta,
        out,
        kernel_name="apply_proximal_gradient_descent"):
    """
    the operator's compute
    prox_v = var - alpha * delta
    if l1 > 0:
        var = sign(prox_v) / (1 + alpha * l2) * max{|prox_v| - alpha * l1, 0}
    else:
        var = prox_v / (1 + alpha * l2)

    Parameters:
    ----------
    var: the dict of var, only support float16, float32
    alpha: the dict of alpha, only support float16, float32
    l1: the dict of l1, only support float16, float32
    l2: the dict of l2, only support float16, float32
    delta: the dict of delta, only support float16, float32
    out: the dict of output, only support float16, float32

    Returns
    the value of out_var
    output_data
    """
    dtype = var.dtype

    if dtype == "float16":
        var = te.lang.cce.cast_to(var, "float32")
        alpha = te.lang.cce.cast_to(alpha, "float32")
        l1 = te.lang.cce.cast_to(l1, "float32")
        l2 = te.lang.cce.cast_to(l2, "float32")
        delta = te.lang.cce.cast_to(delta, "float32")

    alpha_broad = te.lang.cce.broadcast(alpha, var.shape)
    l1_broad = te.lang.cce.broadcast(l1, var.shape)
    l2_broad = te.lang.cce.broadcast(l2, var.shape)

    var_out = _compute_process(var, alpha_broad, l1_broad, l2_broad, delta)

    if dtype == "float16":
        var_out = te.lang.cce.cast_to(var_out, "float16")
    else:
        var_out = te.lang.cce.cast_to(var_out, "float32")

    # this compute is for muti output
    def _compute(*index):
        return var_out(*index), var_out(*index)

    return tvm.compute(var.shape, _compute, name="outputs")
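A NumPy reference of the update rule in the docstring (alpha, l1 and l2 are scalars here; _compute_process is assumed to implement the same branches):

import numpy as np

def proximal_gd_reference(var, alpha, l1, l2, delta):
    prox_v = var - alpha * delta
    if l1 > 0:
        return np.sign(prox_v) / (1.0 + alpha * l2) * np.maximum(
            np.abs(prox_v) - alpha * l1, 0.0)
    return prox_v / (1.0 + alpha * l2)

var = np.array([0.8, -0.3], dtype=np.float32)
delta = np.array([1.0, -2.0], dtype=np.float32)
print(proximal_gd_reference(var, alpha=0.1, l1=0.05, l2=0.01, delta=delta))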