def _transform(theta, input_dim, out_size, input_shape, dtype):
        
        num_batch = input_shape[0]
        height = input_shape[1]
        width = input_shape[2]
        num_channels = input_shape[3]

        theta = topi.reshape(theta, (num_batch, 2, 3))
        theta = topi.cast(theta, dtype)

        out_height = out_size[0]
        out_width = out_size[1]
                
        grid = _meshgrid(out_height, out_width)       
        grid = topi.reshape(grid, (num_batch, 3, out_height*out_width))
        grid = topi.cast(grid, dtype=dtype)
        
        k = tvm.reduce_axis((0, 3), 'k')
        T_g = tvm.compute((num_batch, 2, out_height*out_width),lambda b, y, x: tvm.sum(theta[b, y, k] * grid[b, k, x], axis = k), name = 'T_g')
              
        x_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,0,k], name = 'x_s')
        y_s = tvm.compute((num_batch, 1, out_height*out_width), lambda i,j,k:T_g[i,1,k], name = 'y_s')
              
        x_s_flat = topi.reshape(x_s, (num_batch*out_height*out_width,))
        y_s_flat = topi.reshape(y_s, (num_batch*out_height*out_width,))
                      
        input_transformed = _interpolate(input_dim, input_shape, x_s_flat, y_s_flat, out_size, dtype)
        output = topi.reshape(input_transformed, [num_batch, out_height, out_width, num_channels])
        return output 
    def _compute(output_shape, x, y, K, trans_a, trans_b, *indices):
        """matmul compuation in terms of the output shape and the transposes

        Parameters
        ----------
        output_shape : the final output shape, e.g., shape_x = (2, 6),
            shape_y = (8, 2), trans_a = True, True_b = True, then, output_shape = (6, 8).

        x : the first input tensor according to shape_x.

        y : the second input tensor according to shape_y.

        K : the number of the axis for sum, in the above example, K = 2.

        trans_a : if True, x needs to be transposed.

        trans_b : if True, y needs to be transposed.

        *indices : the output shape space for tvm.compute.

        Returns
        -------
        tvm.Tensor
        """
        N = len(output_shape)
        k = tvm.reduce_axis((0, K), 'k')
        if trans_a == True and trans_b == False:
            # For example, A: (6, 7, 8), B: (6, 7, 9), so the length is N = 3
            # C = A' * B : (6, 8, 9), A' means the transpose of A
            # indices means the space of (6, 8, 9), k = 7
            # x_indices = indices[:1]+(7, )+indices[1:2] = (6, 7, 8)
            # y_indices = indices[:1]+(7, )+indices[2:] = (6, 7, 9)
            x_indices = indices[:(N-2)]+(k, )+indices[(N-2):(N-1)]
            y_indices = indices[:(N-2)]+(k, )+indices[(N-1):]
            return tvm.sum(x(*x_indices)*y(*y_indices), axis=k)
        elif trans_a == False and trans_b == True:
            # For example, A: (6, 7, 8), B: (6, 9, 8), C = A * B' : (6, 7, 9)
            # indices means the space of (6, 7, 9), N=3, k = 8
            # x_indices = indices[:2]+(8, ) = (6, 7, 8)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 9, 8)
            x_indices = indices[:(N-1)]+(k, )
            y_indices = indices[:(N-2)]+indices[(N-1):]+(k, )
            return tvm.sum(x(*x_indices)*y(*y_indices), axis=k)
        elif trans_a == True and trans_b == True:
            # For example, A: (6, 8, 10), B: (6, 12, 8), C = A' * B' : (6, 10, 12)
            # indices means the space of (6, 10, 12), N=3, k = 8
            # x_indices = indices[:1]+(8, )+indices[1:2] = (6, 8, 10)
            # y_indices = indices[:1]+indices[2:]+(8, ) = (6, 12, 8)
            x_indices = indices[:(N-2)]+(k, )+indices[(N-2):(N-1)]
            y_indices = indices[:(N-2)]+indices[(N-1):]+(k, )
            return tvm.sum(x(*x_indices)*y(*y_indices), axis=k)
        else:
            # For example, A: (6, 15, 16), B: (6, 16, 18), C = A * B : (6, 15, 18)
            # indices means the space of (6, 15, 18), N=3, k = 16
            # x_indices = indices[:2]+(16, ) = (6, 15, 16)
            # y_indices = indices[:1]+(16, )+indices[2:] = (6, 16, 18)
            x_indices = indices[:(N-1)]+(k, )
            y_indices = indices[:(N-2)]+(k, )+indices[(N-1):]
            return tvm.sum(x(*x_indices)*y(*y_indices), axis=k)
def _max_pool_grad_grad_with_argmax_compute(
        placeholders,
        x,
        argmax,
        grad,
        y,
        ksize,
        strides,
        padding="VALID",
        ori_format_x="NCHW",
        kernel_name="cce_max_pool_grad_grad_with_argmax"):
    """
    Computes second-order gradients of the maxpooling function.

    Parameters
    ----------
        x: dict
             Include info about ori_input,
             format, ori_format, shape, ori_shape, dtype.
        grad: dict
             Include info about grad of ori_input,
             format, ori_format, shape, ori_shape, dtype.
        argmax: dict
             Include info about ori_input,
             format, ori_format, shape, ori_shape, dtype.
        y: dict
             Include info about result of function,
             format, ori_format, shape, ori_shape, dtype.
        ksize: list or tuple
            The size of the window for each dimension of the input tensor.
        strides: list or tuple
            The stride of the sliding window of the input tensor.
        padding: str
            The type of padding algorithm to use.
            Only support "VALID" or "SAME"
        kernel_name: str
            Cce kernel name,
            default value is "cce_max_pool_grad_grad_with_argmax"
    Returns
    -------
        grad_in_l1:
            process of movement of grad from gm to l1.
        grad_im2col:
            process of vm tensor of grad on l1.
        grad_fractal:
            process of fractal of grad from l1 to ub.
        grad_fractal_transp:
            process of transposition of grad.
        argmax_ub:
            process of movement of argmax from gm to ub.
        tensor_zero_ub:
            process of movement of zero tensor from gm to ub.
        grad_grad_col:
            tensor after selection.
        grad_grad:
            tensor after reduce_sum.
        output_res:
            output of the calculation.
    """
    argmax_tensor = placeholders[1]
    grad_tensor = placeholders[2]

    (grad_n, grad_c1, grad_h, grad_w, grad_c0) = grad.get("shape")
    if ori_format_x == "NHWC":
        _, kernel_h, kernel_w, _ = ksize
        _, stride_h, stride_w, _ = strides
    else:
        _, _, kernel_h, kernel_w = ksize
        _, _, stride_h, stride_w = strides

    shape_max_pool_h, pad_top, pad_bottom = \
        common.tf_get_windowed_output_size_verbose(
            grad_h, kernel_h, stride_h, padding)

    shape_max_pool_w, pad_left, pad_right = \
        common.tf_get_windowed_output_size_verbose(
            grad_w, kernel_w, stride_w, padding)

    pad_list = (pad_top, pad_bottom, pad_left, pad_right)
    stride = (stride_h, stride_w)

    # howo must be multiple of 16
    howo = _ceil_to(shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE)

    # copy argmax from ub to gm
    shape_argmax_ub = (grad_n, grad_c1 * kernel_h * kernel_w,
                       howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    argmax_ub = tvm.compute(shape_argmax_ub,
                            lambda *i: argmax_tensor(*i),
                            name='argmax_ub')

    # load3d compute
    shape_grad = (grad_n, grad_c1, grad_h, grad_w, grad_c0)
    grad_in_l1 = tvm.compute(shape_grad,
                             lambda *i: grad_tensor[i],
                             name="grad_in_l1")
    # n howo c1 kh kw c0
    shape_grad_vm = (grad_n, shape_max_pool_h * shape_max_pool_w, grad_c1,
                     kernel_h, kernel_w, grad_c0)
    grad_im2col = common.img2col(
        grad_in_l1,
        shape_grad_vm,
        kernel_h,
        kernel_w,
        pad_list,
        stride,
    )
    # n hw c1 kh kw c0  ->  n c1 kh kw hw c0
    shape_fractal = (grad_n, howo // BLOCK_SIZE, grad_c1 * kernel_h * kernel_w,
                     BLOCK_SIZE, BLOCK_SIZE)
    grad_fractal = common.im2col_fractal(shape_fractal,
                                         grad_im2col,
                                         "ca",
                                         tag='')

    # (n, howo/16, c1khkw, 16, c0) -> (n, c1khkw, howo/16, 16, c0)
    shape_grad_fratical_transp = (grad_n, grad_c1 * kernel_h * kernel_w,
                                  howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    grad_fractal_transp = tvm.compute(
        shape_grad_fratical_transp,
        lambda i, j, k, l, m: grad_fractal[i, k, j, l, m],
        name='grad_fractal_transp')

    # declare a zero tensor, and move to ub for vsel
    dtype_tensor_zero = grad_tensor.dtype
    shape_tensor_zero = (BLOCK_SIZE, )
    tensor_zero_ub = tvm.compute(
        shape_tensor_zero,
        lambda *i: tvm.const(0, dtype=dtype_tensor_zero),
        name='tensor_zero_ub')

    # vsel compute
    shape_grad_grad_col = (grad_n, grad_c1 * kernel_h * kernel_w,
                           howo // BLOCK_SIZE, BLOCK_SIZE, BLOCK_SIZE)
    grad_grad_col = tvm.compute(
        shape_grad_grad_col,
        lambda i, j, k, l, m: tvm.select(argmax_ub[
            i, j, k, l, m], grad_fractal_transp[i, j, k, l, m], tensor_zero_ub[
                m]),
        name='grad_grad_col')

    # reduce_sum
    # (n, c1khkw, howo/16, 16, c0) -> (n, c1, howo/16, 16, c0)
    m = tvm.reduce_axis((0, kernel_h * kernel_w), "m")
    shape_grad_grad = (grad_n, grad_c1, howo // BLOCK_SIZE, BLOCK_SIZE,
                       BLOCK_SIZE)
    grad_grad = tvm.compute(
        shape_grad_grad,
        lambda i, j, n, p, q: tvm.sum(
            grad_grad_col[i, j * kernel_h * kernel_w + m, n, p, q], axis=[m]),
        name="grad_grad")

    extract_params = {}
    extract_params["padding_mode"] = padding
    extract_params["shape_max_pool_h"] = shape_max_pool_h
    extract_params["shape_max_pool_w"] = shape_max_pool_w
    extract_params["fmap_shape"] = shape_grad
    extract_params["ksizes"] = ksize
    extract_params["strides"] = strides
    extract_params["pad"] = pad_list
    extract_params["fmap_vm_shape"] = shape_grad_vm
    extract_params["fractal_shape"] = shape_fractal
    extract_params["HoWo"] = howo

    setfmatrix_dict = {
        "conv_kernel_h": kernel_h,
        "conv_kernel_w": kernel_w,
        "conv_padding_top": pad_top,
        "conv_padding_bottom": pad_bottom,
        "conv_padding_left": pad_left,
        "conv_padding_right": pad_right,
        "conv_stride_h": stride_h,
        "conv_stride_w": stride_w,
        "conv_fm_c": grad_c1 * grad_c0,
        "conv_fm_h": grad_h,
        "conv_fm_w": grad_w,
    }

    # UB to OUT
    output_res = tvm.compute(
        (grad_n, grad_c1, shape_max_pool_h * shape_max_pool_w, BLOCK_SIZE),
        lambda i, j, l, m: grad_grad[i, j, l // 16, l % 16, m],
        name="ub_to_out",
        attrs={
            'extract_params': extract_params,
            'setfmatrix_dict': setfmatrix_dict
        })

    return grad_in_l1, grad_im2col, grad_fractal, grad_fractal_transp, \
           argmax_ub, tensor_zero_ub, grad_grad_col, grad_grad, output_res
Exemplo n.º 4
0
def dynamic_lstm(input_x, weight, bias,
                 output_h, kernel_name="dynamic_lstm"):
    """
    x : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [FRACTAL_NZ]
    w : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [FRACTAL_ZN_LSTM]
    b : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [ND]
    output_h : dict
        A dict object, contains a Tensor 's type and
        shape and format, the type can be float32,
        the format can be [FRACTAL_NZ]
    """

    check_dtype(input_x, weight, bias, output_h)

    shape_x_input = input_x.get("shape")
    shape_w_input = weight.get("shape")
    shape_b_input = bias.get("shape")
    shape_output = output_h.get("shape")

    check(shape_x_input, shape_w_input, shape_b_input, shape_output)

    scan_one_num = 1
    t_size = shape_x_input[0] + scan_one_num
    m_size = shape_x_input[2]
    k_size = shape_w_input[0]
    n_size = shape_w_input[1]
    hidden_size = shape_output[1]
    block_size = n_size // hidden_size
    in_x = k_size - hidden_size

    shape_b = (1, k_size, block_size, hidden_size, 16, 16)
    shape_c = (1, block_size, hidden_size, m_size, 16, 16)
    shape_bias = (1, block_size, hidden_size, 1, 1, 16)
    shape_x = (t_size, in_x, m_size, 16, 16)
    shape_h = (1, k_size - in_x, m_size, 16, 16)
    shape_i = (1, hidden_size, m_size, 16, 16)
    shape_i_t = (t_size, hidden_size, m_size, 16, 16)

    core_num = cce.get_soc_spec("CORE_NUM")
    # one core use 4 int64 that is 32B align
    shape_sync = (4 * core_num,)

    k0_size = 16

    input_dtype = input_x.get("dtype")
    data_dtype = 'float16'
    sync_dtype = 'int64'

    # define placeholder
    input_x = tvm.placeholder(shape_x, dtype=input_dtype, name='input_x')
    weight = tvm.placeholder(shape_b, dtype=input_dtype, name='weight')
    bias = tvm.placeholder(shape_bias, name='bias', dtype=input_dtype)
    s_state_h = tvm.placeholder(shape_h, dtype=input_dtype, name='state_h')
    s_state_c = tvm.placeholder(shape_i, dtype=input_dtype, name='state_c')

    sync0 = tvm.placeholder(shape_sync, name="sync0", dtype='int64')

    # compute

    # weight need first to ub and cast to float16
    weight_ub = \
        tvm.compute(
            shape_b,
            lambda *indices: weight(*indices),
            name="weight_ub")

    weight_fp16 = \
        tvm.compute(shape_b,
                    lambda *indices: weight_ub(*indices).astype(data_dtype),
                    name="weight_fp16")

    # input and s_state_h need first to ub and cast to float16
    shape_a_z_bigz = (t_size, m_size, k_size, 16, 16)

    # input and s_start_h is Nz, need trans to zZ
    # so change axis 1 and 2
    a_ub = tvm.compute(shape_a_z_bigz,
                       lambda *indice:
                       tvm.select(indice[2] < in_x,
                                  input_x[indice[0],
                                          indice[2],
                                          indice[1],
                                          indice[3],
                                          indice[4]],
                                  s_state_h[0,
                                            indice[2] - in_x,
                                            indice[1],
                                            indice[3],
                                            indice[4]]
                                  ),
                       name="a_ub", tag="concat")

    shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16)

    a_ub_fp16 = \
        tvm.compute(shape_a_z_bigz_1,
                    lambda *indices: a_ub(*indices).astype(data_dtype),
                    name="a_ub_fp16")

    a_l1 = tvm.compute(shape_a_z_bigz_1,
                       lambda *indices: a_ub_fp16(*indices),
                       name='a_l1')
    b_l1 = tvm.compute(shape_b,
                       lambda *indices: weight_fp16(*indices),
                       name='b_l1')

    # shape_a_z_bigz_1 = (1, m_size, k_size, 16, 16)
    a_l0a = tvm.compute(shape_a_z_bigz, lambda *indices: a_l1(*indices), name="a_l0a")
    b_l0b = tvm.compute(shape_b, lambda *indices: b_l1(*indices), name="b_l0b")

    k1 = tvm.reduce_axis((0, k_size), name='k1')
    k0 = tvm.reduce_axis((0, k0_size), name='k0')

    c_l0c = tvm.compute(shape_c,
                        lambda t, nb_0, nb_1, mb, mp, np:
                        tvm.sum((a_l0a[t, mb, k1, mp, k0] * \
                                b_l0b[t, k1, nb_0, nb_1, np, k0]) \
                                .astype('float32'),
                                axis=[k1, k0]),
                        name='c_l0c')

    c_ub = tvm.compute(shape_c, lambda *indices: c_l0c(*indices), name="c_ub")

    bias_ub = tvm.compute(shape_bias,
                          lambda *indices: bias(*indices),
                          name='bias_ub')

    bias_bc_ub = te.lang.cce.broadcast(bias_ub, shape_c)
    c_ub_bias = te.lang.cce.vadd(c_ub, bias_bc_ub)

    # split matmul res
    i_t_index = 0
    j_t_index = 1
    f_t_index = 2
    o_t_index = 3
    i_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, i_t_index, i, j, k, l),
                    name="i_t")
    j_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, j_t_index, i, j, k, l),
                    name="j_t")
    f_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, f_t_index, i, j, k, l),
                    name="f_t")
    o_t = \
        tvm.compute(shape_i,
                    lambda t, i, j, k, l: c_ub_bias(t, o_t_index, i, j, k, l),
                    name="o_t")

    f_t_sigmoid = sigmoid_compute(f_t)
    i_t_sigmoid = sigmoid_compute(i_t)
    o_t_sigmoid = sigmoid_compute(o_t)
    j_t_tanh = tanh_compute(j_t)

    c_t_tmp1 = te.lang.cce.vmul(s_state_c, f_t_sigmoid)
    c_t_tmp2 = te.lang.cce.vmul(j_t_tanh, i_t_sigmoid)
    update_c = te.lang.cce.vadd(c_t_tmp1, c_t_tmp2)

    update_c_gm = tvm.compute(shape_i_t,
                              lambda t, i, j, k, l: update_c(0, i, j, k, l),
                              name="update_c_gm")

    c_t_tanh = tanh_compute(update_c)

    update_h = te.lang.cce.vmul(c_t_tanh, o_t_sigmoid)
    update_h_gm = tvm.compute(shape_i_t,
                              lambda t, i, j, k, l: update_h(0, i, j, k, l),
                              name="update_h_gm")

    update_hc_vn = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_c_gm(0, i, j, k, l) +\
                                  update_h_gm(t, i, j, k, l),
            name="update_hc_vn")

    update_c_gm_vn = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l),
            name="update_c_gm_vn")

    update_h_gm_vn = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_hc_vn(0, i, j, k, l),
            name="update_h_gm_vn")

    update_c_ub = \
        tvm.compute(
            shape_i,
            lambda t, i, j, k, l: update_c_gm_vn(t, i, j, k, l),
            name="update_c_ub")

    update_c_gm_2 = \
        tvm.compute(shape_i_t,
                    lambda t, i, j, k, l: update_c_ub(0, i, j, k, l),
                    name="update_c_gm_2")
    update_h_ub = \
        tvm.compute(
            shape_i,
            lambda t, i, j, k, l: update_h_gm_vn(t, i, j, k, l),
            name="update_h_ub")

    update_h_gm_2 = \
        tvm.compute(
            shape_i_t,
            lambda t, i, j, k, l: update_h_ub(0, i, j, k, l) +\
                                  update_c_gm_2(t, i, j, k, l),
            name="update_h_gm_2")

    update_h_gm_2_dummy = \
        tvm.compute(shape_i_t,
                    lambda t, i, j, k, l: update_h_gm_2(t, i, j, k, l),
                    name="update_h_gm_2_dummy")

    # state init
    init_shape = (1, hidden_size, m_size, 16, 16)

    s_state_h_ub = \
        tvm.compute(shape_h,
                    lambda *indices: tvm.const(0.0, dtype=input_dtype),
                    name='s_state_h_ub')
    s_state_c_ub = \
        tvm.compute(shape_i,
                    lambda *indices: tvm.const(0.0, dtype=input_dtype),
                    name='s_state_c_ub')

    s_init_h = \
        tvm.compute(
            init_shape,
            lambda _, i, j, k, l: s_state_h_ub[0, i, j, k, l],
            name="s_init_h")

    s_init_c = \
        tvm.compute(
            init_shape,
            lambda _, i, j, k, l: s_state_c_ub[0, i, j, k, l],
            name="s_init_c")

    # scan
    scan_h, scan_c = tvm.scan(
        [s_init_h, s_init_c],
        [update_h_ub, update_c_ub],
        [s_state_h, s_state_c],
        scan_update=[update_h_gm_2, update_h_gm_2_dummy],
        name="lstm_scan")

    # end compute

    # schedule
    s = tvm.create_schedule([scan_h.op, scan_c.op])

    new_build_list = [input_x, weight, bias, update_h_gm, update_c_gm,
                      sync0, update_h_gm_vn, update_c_gm_vn]

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """
        traverse tensors by Depth-First-Search
        """
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors = []
    gen_reversed_subgraph_list(update_h_gm, elewise_tensors)

    barrier_tensor = c_ub_bias
    elewise_before_barrier_tensors = [bias_bc_ub]

    # set scope
    s[a_l1].set_scope(cce.scope_cbuf)
    s[b_l1].set_scope(cce.scope_cbuf)
    s[a_l0a].set_scope(cce.scope_ca)
    s[b_l0b].set_scope(cce.scope_cb)
    s[c_l0c].set_scope(cce.scope_cc)
    s[c_ub].set_scope(cce.scope_ubuf)
    s[s_init_h].set_scope(cce.scope_ubuf)
    s[bias_ub].set_scope(cce.scope_ubuf)
    s[bias_bc_ub].set_scope(cce.scope_ubuf)
    s[scan_h].set_scope(cce.scope_ubuf)
    s[scan_c].set_scope(cce.scope_ubuf)
    s[update_h_ub].set_scope(cce.scope_ubuf)
    s[update_c_ub].set_scope(cce.scope_ubuf)
    s[s_state_h_ub].set_scope(cce.scope_ubuf)
    s[s_state_c_ub].set_scope(cce.scope_ubuf)

    s[weight_ub].set_scope(cce.scope_ubuf)
    s[weight_fp16].set_scope(cce.scope_ubuf)
    s[a_ub].set_scope(cce.scope_ubuf)
    s[a_ub_fp16].set_scope(cce.scope_ubuf)

    for tensor in elewise_tensors:
        s[tensor].set_scope(cce.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, j_t, f_t, o_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, \
    factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_lstm_tiling(m_size, k_size, n_size)
    l1_n_outer, l1_n_inner = \
        s[c_l0c].split(c_l0c.op.axis[2],
                       factor=factor_l1_n // block_size)

    l1_m_outer, l1_m_inner = \
        s[c_l0c].split(c_l0c.op.axis[3],
                       factor=factor_l1_m)
    l1_k_outer, l1_k_inner = \
        s[c_l0c].split(c_l0c.op.reduce_axis[0],
                       factor=factor_l1_k)

    l0_n_outer, l0_n_inner = s[c_l0c].split(l1_n_inner,
                                            factor=factor_l0_n)
    l0_m_outer, l0_m_inner = s[c_l0c].split(l1_m_inner,
                                            factor=factor_l0_m)
    l0_k_outer, l0_k_inner = s[c_l0c].split(l1_k_inner,
                                            factor=factor_l0_k)

    s[c_l0c].reorder(l1_n_outer, c_l0c.op.axis[1],
                     l1_m_outer, l1_k_outer,
                     l0_n_outer, l0_m_outer, l0_k_outer,
                     l0_n_inner, l0_m_inner, c_l0c.op.axis[3 + 1],
                     c_l0c.op.axis[4 + 1], l0_k_inner,
                     c_l0c.op.reduce_axis[1])

    s[weight_ub].compute_at(s[c_l0c], l1_k_outer)
    s[weight_fp16].compute_at(s[c_l0c], l1_k_outer)
    s[a_ub].compute_at(s[c_l0c], l1_k_outer)
    s[a_ub_fp16].compute_at(s[c_l0c], l1_k_outer)

    s[a_l0a].compute_at(s[c_l0c], l0_k_outer)
    s[b_l0b].compute_at(s[c_l0c], l0_k_outer)
    s[a_l1].compute_at(s[c_l0c], l1_k_outer)
    s[b_l1].compute_at(s[c_l0c], l1_k_outer)

    ub_n_outer, ub_n_inner = \
        s[c_ub].split(c_ub.op.axis[2],
                      factor=factor_l1_n // block_size)

    ub_m_outer, ub_m_inner = s[c_ub].split(c_ub.op.axis[3],
                                           factor=factor_l1_m)
    s[c_ub].reorder(ub_n_outer, c_ub.op.axis[1], ub_m_outer,
                    ub_n_inner, ub_m_inner, c_ub.op.axis[4],
                    c_ub.op.axis[5])

    s[c_l0c].compute_at(s[c_ub], ub_n_outer)

    # elewise compute_at
    barrier_outer, barrier_inner = \
        s[barrier_tensor].split(barrier_tensor.op.axis[2],
                                factor=factor_l1_n // block_size)

    s[barrier_tensor].reorder(
        barrier_tensor.op.axis[0], barrier_outer,
        barrier_tensor.op.axis[1], barrier_inner,
        barrier_tensor.op.axis[3],
        barrier_tensor.op.axis[4],
        barrier_tensor.op.axis[5])

    s[c_ub].compute_at(s[barrier_tensor], barrier_outer)
    s[bias_ub].compute_at(s[barrier_tensor], barrier_outer)

    for tensor in elewise_before_barrier_tensors:
        s[tensor].compute_at(s[barrier_tensor], barrier_outer)

    vn_outer, vn_inner = \
        s[update_hc_vn].split(update_hc_vn.op.axis[0 + 1],
                              factor=factor_l1_n // block_size)

    second_split_factor = \
        (hidden_size // (factor_l1_n // block_size)) // core_num

    vn_o_outer, vn_o_inner = \
        s[update_hc_vn].split(vn_outer,
                              factor=second_split_factor)

    s[barrier_tensor].compute_at(s[update_hc_vn], vn_o_inner)

    for tensor in elewise_tensors:
        if tensor not in elewise_before_barrier_tensors:
            s[tensor].compute_at(s[update_hc_vn], vn_o_inner)

    s[update_c_gm].compute_at(s[update_hc_vn], vn_o_inner)
    s[update_h_gm].compute_at(s[update_hc_vn], vn_o_inner)

    second_split_factor = hidden_size // core_num

    res_h_outer, res_h_inner = \
        s[update_h_gm_2].split(update_h_gm_2.op.axis[1],
                               factor=hidden_size)

    s[update_hc_vn].compute_at(s[update_h_gm_2], update_h_gm_2.op.axis[0])

    s[update_c_gm_vn].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_h_gm_vn].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_c_ub].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_c_gm_2].compute_at(s[update_h_gm_2], res_h_outer)
    s[update_h_ub].compute_at(s[update_h_gm_2], res_h_outer)

    s[update_h_gm_vn].bind_buffer(
        update_h_gm_vn.op.axis[0], 0,
        scan_h.op.scan_axis + res_h_outer)
    s[update_c_gm_vn].bind_buffer(
        update_c_gm_vn.op.axis[0], 0,
        scan_h.op.scan_axis + res_h_outer)

    # bind
    s[update_hc_vn].bind(vn_o_outer, tvm.thread_axis("blockIdx.x"))

    # multi core sync
    s[update_hc_vn].pragma(update_hc_vn.op.axis[0],
                          pragma_type="multicore_sync_wait_after",
                          pragma_value=sync0[0])
    s[update_hc_vn].pragma(update_hc_vn.op.axis[0],
                          pragma_type="multicore_sync_set_after",
                          pragma_value=sync0[0])

    # modify for extend
    s[input_x].bind_buffer(0, 0, scan_h.op.scan_axis)

    s[update_h_gm].buffer_tile((scan_h.op.scan_axis*1, 1),
                               (None, None), (None, None),
                               (None, None), (None, None))

    s[update_c_gm].buffer_tile((scan_h.op.scan_axis*1, 1),
                               (None, None), (None, None),
                               (None, None), (None, None))

    s[update_h_gm_2].buffer_tile((0, 1), (None, None), (None, None),
                                 (None, None), (None, None))
    s[update_c_gm_2].buffer_tile((0, 1), (None, None), (None, None),
                                 (None, None), (None, None))

    # buffer reuse
    s[update_h_gm].reused_by(update_h_gm_vn)
    s[update_c_gm].reused_by(update_c_gm_vn)

    # emit_insn
    s[a_l1].emit_insn(a_l1.op.axis[0], 'dma_copy')
    s[b_l1].emit_insn(b_l1.op.axis[0], 'dma_copy')
    s[a_l0a].emit_insn(a_l0a.op.axis[0], 'dma_copy')
    s[b_l0b].emit_insn(b_l0b.op.axis[0], 'dma_copy')

    s[weight_ub].emit_insn(weight_ub.op.axis[0], 'dma_copy')
    s[weight_fp16].emit_insn(weight_fp16.op.axis[0], 'vector_conv')

    s[a_ub].emit_insn(a_ub.op.axis[0], 'dma_copy')
    s[a_ub_fp16].emit_insn(a_ub_fp16.op.axis[0], 'vector_conv')

    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer, l0_k_outer]}
    s[c_l0c].emit_insn(l0_n_inner, 'mad', mad_dict)
    s[c_ub].emit_insn(ub_n_inner, 'dma_copy')

    s[s_init_h].emit_insn(s_init_h.op.axis[0], 'dma_copy')
    s[s_init_c].emit_insn(s_init_c.op.axis[0], 'dma_copy')
    s[bias_bc_ub].emit_insn(bias_bc_ub.op.axis[0], 'unified_broadcast')

    s[s_state_h_ub].emit_insn(s_state_h_ub.op.axis[0], 'broadcast')
    s[s_state_c_ub].emit_insn(s_state_c_ub.op.axis[0], 'broadcast')

    s[barrier_tensor].emit_insn(barrier_tensor.op.axis[1], 'vector_add')

    for tensor in elewise_tensors:
        if tensor != barrier_tensor:
            insn = get_emit_insn_map(tensor)
            s[tensor].emit_insn(tensor.op.axis[0], insn)

    s[bias_ub].emit_insn(bias_ub.op.axis[0], 'dma_copy')

    s[update_c_gm].emit_insn(s[update_c_gm].op.axis[1], 'dma_copy')
    s[update_h_gm].emit_insn(s[update_h_gm].op.axis[1], 'dma_copy')

    s[update_c_ub].emit_insn(update_c_ub.op.axis[1], 'dma_copy')
    s[update_h_ub].emit_insn(update_h_ub.op.axis[1], 'dma_copy')

    s[update_hc_vn].emit_insn(vn_inner, 'phony_insn')
    s[update_c_gm_vn].emit_insn(s[update_c_gm_vn].op.axis[0], 'phony_insn')
    s[update_h_gm_vn].emit_insn(s[update_h_gm_vn].op.axis[0], 'phony_insn')
    s[update_h_gm_2].emit_insn(res_h_inner, 'phony_insn')
    s[update_c_gm_2].emit_insn(s[update_c_gm_2].op.axis[0], 'phony_insn')
    s[update_h_gm_2_dummy].emit_insn(
        update_h_gm_2_dummy.op.axis[0], 'phony_insn')

    def _write_workspace_info(shape_list, dtype_list, sync_num, kernel_name):
        """
        modify json after build
        """
        def _write_code(wkspace_dict, fname):
            fname = os.path.realpath(fname)
            if fname.startswith(os.getcwd()):
                if os.path.exists(fname):
                    with open(fname, "r") as f:
                        load_dict = json.load(f)

                    load_dict.update(wkspace_dict)
                    with open(fname, "w") as f:
                        json.dump(load_dict, f,
                                  sort_keys=True, indent=4,
                                  separators=(',', ':'))

        def _get_data_width(ele):
            """
            get data width
            """
            m_sea = re.search(r'\d+', ele)
            if m_sea:
                return int(m_sea.group(0)) // 8
            return 0

        if not os.path.exists("kernel_meta"):
            os.mkdir("kernel_meta")
            os.chmod("kernel_meta", stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP)

        num = len(shape_list)
        wkspace_dict = {}
        if num:
            total_size = [functools_reduce(lambda x, y: x * y, list_i) for
                          list_i in shape_list]

            addr_type_list = []
            for i, element in enumerate(dtype_list):
                total_size[i] = total_size[i] * _get_data_width(element)
                addr_type_list.append(0)

            if not os.path.exists("kernel_meta"):
                os.mkdir("kernel_meta")
                os.chmod("kernel_meta",
                         stat.S_IRWXU + stat.S_IRGRP + stat.S_IXGRP)

            wkspace_dict["workspace"] = {"num": num,
                                         "size": total_size,
                                         "type": addr_type_list}

        if sync_num:
            parameters_list = \
                (len(new_build_list) - 2 - sync_num) * [0, ] + sync_num * [1, ]
            wkspace_dict["parameters"] = parameters_list

        if wkspace_dict:
            _write_code(wkspace_dict, "kernel_meta/" + kernel_name + ".json")

    with build_config:
        tvm.build(s, new_build_list, "cce", name=kernel_name)
        _write_workspace_info(
            [shape_i_t, shape_sync],
            [input_dtype, sync_dtype],
            1, kernel_name)
Exemplo n.º 5
0
    def basic_rnn_cell_compute(self):
        """
        calculating data

        Parameters
        ----------
        input_x : TVM tensor
            the placeholder of input_x
        output_y : dict
            dict of output_y, include keys(shape and dtype)
        kernel_name : str
            kernel name, default value is "basicrnn_cell"

        Returns
        -------
        output tensor
        """
        matmul_res_shape = (self.dims["hidden_dim"], self.dims["batch_dim"],
                            16, 16)
        # Tensor x from GM to L1, L0A
        l1_x = tvm.compute(
            (self.dims["batch_dim"], self.dims["input_dim"], 16, 16),
            lambda i0, i1, i2, i3: self.datas["x"][i1, i0, i2, i3],
            name='l1_x')
        self.tensor_list1["l1_x"] = l1_x
        self.emit_cmd["l1_x"] = "dma_copy"
        self.scope_list["l1_x"] = cce.scope_cbuf

        l0a_x = tvm.compute(l1_x.shape, lambda *i: l1_x(*i), name='l0a_x')
        self.tensor_list1["l0a_x"] = l0a_x
        self.emit_cmd["l0a_x"] = "dma_copy"
        self.scope_list["l0a_x"] = cce.scope_ca

        # Tensor w_xh from GM to L1, L0B
        l1_w_xh = tvm.compute(self.datas["w_xh"].shape,
                              lambda *i: self.datas["w_xh"](*i),
                              name='l1_w_xh')
        self.tensor_list1["l1_w_xh"] = l1_w_xh
        self.emit_cmd["l1_w_xh"] = "dma_copy"
        self.scope_list["l1_w_xh"] = cce.scope_cbuf

        l0b_w_xh = tvm.compute(l1_w_xh.shape,
                               lambda *i: l1_w_xh(*i),
                               name='l0b_w_xh')
        self.tensor_list1["l0b_w_xh"] = l0b_w_xh
        self.emit_cmd["l0b_w_xh"] = "dma_copy"
        self.scope_list["l0b_w_xh"] = cce.scope_cb

        # Copy bias from GM to UB
        ub_bias_h = tvm.compute(self.datas["bias_h"].shape,
                                lambda *i: self.datas["bias_h"](*i),
                                name='ub_bias_h')
        self.tensor_list1["ub_bias_h"] = ub_bias_h
        self.emit_cmd["ub_bias_h"] = "dma_copy"
        self.scope_list["ub_bias_h"] = cce.scope_ubuf
        if ub_bias_h.dtype == "float16" and self.device != "hisi_es":
            l0c_bias_h = tvm.compute(
                matmul_res_shape,
                lambda i0, i1, i2, i3: ub_bias_h[i0, i3].astype("float32"),
                name='l0c_bias_h')
        else:
            l0c_bias_h = tvm.compute(matmul_res_shape,
                                     lambda i0, i1, i2, i3: ub_bias_h[i0, i3],
                                     name='l0c_bias_h')
        self.tensor_list1["l0c_bias_h"] = l0c_bias_h
        self.emit_cmd["l0c_bias_h"] = "dma_copy"
        self.scope_list["l0c_bias_h"] = cce.scope_cc

        reduce_kb = tvm.reduce_axis((0, self.dims["input_dim"]),
                                    name='reduce_kb')
        reduce_kp = tvm.reduce_axis((0, 16), name='reduce_kp')

        if self.device == "hisi_es":
            l0c_wht_xt = tvm.compute(
                matmul_res_shape,
                lambda nb, mb, mp, np: tvm.sum(
                    (l0a_x[mb, reduce_kb, mp, reduce_kp] * l0b_w_xh[
                        reduce_kb, nb, np, reduce_kp]),
                    axis=[reduce_kb, reduce_kp]),
                name='l0c_wht_xt',
                attrs={'input_order': 'positive'})
        else:
            l0c_wht_xt = tvm.compute(
                matmul_res_shape,
                lambda nb, mb, mp, np: tvm.sum(
                    (l0a_x[mb, reduce_kb, mp, reduce_kp] * l0b_w_xh[
                        reduce_kb, nb, np, reduce_kp]).astype("float32"),
                    axis=[reduce_kb, reduce_kp]),
                name='l0c_wht_xt',
                attrs={'input_order': 'positive'})
        self.tensor_list1["l0c_wht_xt"] = l0c_wht_xt
        self.scope_list["l0c_wht_xt"] = cce.scope_cc

        # Matmul accumulation wht_xt + bias_h
        l0c_wht_xt_bias_h = tvm.compute(
            matmul_res_shape,
            lambda *i: l0c_bias_h(*i) + l0c_wht_xt(*i),
            name="l0c_wht_xt_bias_h")
        self.tensor_list1["l0c_wht_xt_bias_h"] = l0c_wht_xt_bias_h
        self.emit_cmd["l0c_wht_xt_bias_h"] = "phony_insn"
        self.scope_list["l0c_wht_xt_bias_h"] = cce.scope_cc

        # Move ht to UB
        ub_wht_xt_bias_h = tvm.compute(matmul_res_shape,
                                       lambda *i: l0c_wht_xt_bias_h(*i),
                                       name='ub_wht_xt_bias_h')
        self.tensor_list1["ub_wht_xt_bias_h"] = ub_wht_xt_bias_h
        self.emit_cmd["ub_wht_xt_bias_h"] = "dma_copy"
        self.scope_list["ub_wht_xt_bias_h"] = cce.scope_ubuf

        if self.expose_hidden:
            ub_ht_tmp1 = self.compute_h_0_whh(ub_wht_xt_bias_h)
        else:
            ub_ht_tmp1 = ub_wht_xt_bias_h

        if self.has_static:
            # Copy bias from GM to UB
            ub_w_xh_x_static = tvm.compute(
                matmul_res_shape,
                lambda *i: self.datas["w_xh_x_static"](*i),
                name='ub_w_xh_x_static')
            self.tensor_list1["ub_w_xh_x_static"] = ub_w_xh_x_static
            self.emit_cmd["ub_w_xh_x_static"] = "dma_copy"
            self.scope_list["ub_w_xh_x_static"] = cce.scope_ubuf

            if ub_w_xh_x_static.dtype == "float16" \
                and self.device != "hisi_es":
                ub_w_xh_x_static_fp32 = tvm.compute(
                    ub_w_xh_x_static.shape,
                    lambda *i: topi.cast(ub_w_xh_x_static(*i), "float32"),
                    name="ub_w_xh_x_static_fp32")
                self.tensor_list1[
                    "ub_w_xh_x_static_fp32"] = ub_w_xh_x_static_fp32
                self.emit_cmd["ub_w_xh_x_static_fp32"] = "vector_conv"
                self.scope_list["ub_w_xh_x_static_fp32"] = cce.scope_ubuf
            else:
                ub_w_xh_x_static_fp32 = ub_w_xh_x_static
            ub_ht_tmp2 = tvm.compute(
                matmul_res_shape,
                lambda *i: ub_ht_tmp1(*i) + ub_w_xh_x_static_fp32(*i),
                name="ub_ht_tmp2")
            self.tensor_list1["ub_ht_tmp2"] = ub_ht_tmp2
            self.emit_cmd["ub_ht_tmp2"] = "vector_add"
            self.scope_list["ub_ht_tmp2"] = cce.scope_ubuf
        else:
            ub_ht_tmp2 = ub_ht_tmp1

        tanh_ht_tensor, ht_tanh_op, ht_tanh_scope = \
            tanh_compute(ub_ht_tmp2.shape, ub_ht_tmp2, "ht", self.impl_mode)

        if self.dtypes["h_t"] == "float16" and self.device != "hisi_es":
            ub_ht_fp16 = tvm.compute(
                matmul_res_shape,
                lambda *i: topi.cast(tanh_ht_tensor["ub_tanh_ht"]
                                     (*i), "float16"),
                name='ub_ht_fp16')
            tanh_ht_tensor["ub_ht_fp16"] = ub_ht_fp16
            ht_tanh_op["ub_ht_fp16"] = "vector_conv"
            ht_tanh_scope["ub_ht_fp16"] = cce.scope_ubuf
            ub_ht = ub_ht_fp16
        else:
            ub_ht = tanh_ht_tensor["ub_tanh_ht"]

        self.tanh_ht_tensor = tanh_ht_tensor
        self.scope_list.update(ht_tanh_scope)
        self.tensor_list1.update(tanh_ht_tensor)
        self.emit_cmd.update(ht_tanh_op)

        gm_ht = tvm.compute(matmul_res_shape,
                            lambda *i: ub_ht(*i),
                            name='gm_ht')
        self.tensor_list1["gm_ht"] = gm_ht
        self.scope_list["gm_ht"] = cce.scope_gm

        # Tensor ht from GM to L1, L0A
        if gm_ht.dtype == "float32":
            ub_ht_new = tvm.compute(matmul_res_shape,
                                    lambda *i: gm_ht(*i),
                                    name='ub_ht_new')
            self.tensor_list2["ub_ht_new"] = ub_ht_new
            self.emit_cmd["ub_ht_new"] = "dma_copy"
            self.scope_list["ub_ht_new"] = cce.scope_ubuf
            ub_ht_fp16 = tvm.compute(
                ub_ht_new.shape,
                lambda *i: topi.cast(ub_ht_new(*i), "float16"),
                name="ub_ht_fp16")
            self.tensor_list2["ub_ht_fp16"] = ub_ht_fp16
            self.emit_cmd["ub_ht_fp16"] = "vector_conv"
            self.scope_list["ub_ht_fp16"] = cce.scope_ubuf
        else:
            ub_ht_fp16 = gm_ht

        l1_ht = tvm.compute(
            (self.dims["batch_dim"], self.dims["hidden_dim"], 16, 16),
            lambda i0, i1, i2, i3: ub_ht_fp16[i1, i0, i2, i3],
            name='l1_ht')
        self.tensor_list2["l1_ht"] = l1_ht
        self.emit_cmd["l1_ht"] = "dma_copy"
        self.scope_list["l1_ht"] = cce.scope_cbuf

        l0a_ht = tvm.compute(l1_ht.shape, lambda *i: l1_ht(*i), name='l0a_ht')
        self.tensor_list2["l0a_ht"] = l0a_ht
        self.emit_cmd["l0a_ht"] = "dma_copy"
        self.scope_list["l0a_ht"] = cce.scope_ca

        # Tensor w_ho from ub to L1, L0B
        l1_w_ho = tvm.compute(self.datas["w_ho"].shape,
                              lambda *i: self.datas["w_ho"](*i),
                              name='l1_w_ho')
        self.tensor_list2["l1_w_ho"] = l1_w_ho
        self.emit_cmd["l1_w_ho"] = "dma_copy"
        self.scope_list["l1_w_ho"] = cce.scope_cbuf
        l0b_w_ho = tvm.compute(l1_w_ho.shape,
                               lambda *i: l1_w_ho(*i),
                               name='l0b_w_ho')
        self.tensor_list2["l0b_w_ho"] = l0b_w_ho
        self.emit_cmd["l0b_w_ho"] = "dma_copy"
        self.scope_list["l0b_w_ho"] = cce.scope_cb

        # Copy bias from GM to UB
        ub_bias_o = tvm.compute(self.datas["bias_o"].shape,
                                lambda *i: self.datas["bias_o"](*i),
                                name='ub_bias_o')
        self.tensor_list2["ub_bias_o"] = ub_bias_o
        self.emit_cmd["ub_bias_o"] = "dma_copy"
        self.scope_list["ub_bias_o"] = cce.scope_ubuf
        if ub_bias_o.dtype == "float16" and self.device != "hisi_es":
            l0c_bias_o = tvm.compute(
                matmul_res_shape,
                lambda i0, i1, i2, i3: ub_bias_o[i0, i3].astype("float32"),
                name='l0c_bias_o')
        else:
            l0c_bias_o = tvm.compute(matmul_res_shape,
                                     lambda i0, i1, i2, i3: ub_bias_o[i0, i3],
                                     name='l0c_bias_o')
        self.tensor_list2["l0c_bias_o"] = l0c_bias_o
        self.emit_cmd["l0c_bias_o"] = "dma_copy"
        self.scope_list["l0c_bias_o"] = cce.scope_cc

        reduce_kb = tvm.reduce_axis((0, self.dims["hidden_dim"]),
                                    name='reduce_kb')
        reduce_kp = tvm.reduce_axis((0, 16), name='reduce_kp')

        if self.device == "hisi_es":
            l0c_who_ht = tvm.compute(
                matmul_res_shape,
                lambda nb, mb, mp, np: tvm.sum(
                    (l0a_ht[mb, reduce_kb, mp, reduce_kp] * l0b_w_ho[
                        reduce_kb, nb, np, reduce_kp]),
                    axis=[reduce_kb, reduce_kp]),
                name='l0c_who_ht',
                attrs={'input_order': 'positive'})
        else:
            l0c_who_ht = tvm.compute(
                matmul_res_shape,
                lambda nb, mb, mp, np: tvm.sum(
                    (l0a_ht[mb, reduce_kb, mp, reduce_kp] * l0b_w_ho[
                        reduce_kb, nb, np, reduce_kp]).astype("float32"),
                    axis=[reduce_kb, reduce_kp]),
                name='l0c_who_ht',
                attrs={'input_order': 'positive'})
        self.tensor_list2["l0c_who_ht"] = l0c_who_ht
        self.scope_list["l0c_who_ht"] = cce.scope_cc

        # Matmul accumulation whh_ht + bias_o
        l0c_who_ht_bias_o = tvm.compute(
            matmul_res_shape,
            lambda *i: l0c_bias_o(*i) + l0c_who_ht(*i),
            name="l0c_who_ht_bias_o")
        self.tensor_list2["l0c_who_ht_bias_o"] = l0c_who_ht_bias_o
        self.emit_cmd["l0c_who_ht_bias_o"] = "phony_insn"
        self.scope_list["l0c_who_ht_bias_o"] = cce.scope_cc

        # Move ub_whh_ht_bias_o to UB
        ub_who_ht_bias_o = tvm.compute(matmul_res_shape,
                                       lambda *i: l0c_who_ht_bias_o(*i),
                                       name='ub_who_ht_bias_o')
        self.tensor_list2["ub_who_ht_bias_o"] = ub_who_ht_bias_o
        self.emit_cmd["ub_who_ht_bias_o"] = "dma_copy"
        self.scope_list["ub_who_ht_bias_o"] = cce.scope_ubuf

        tanh_ot_tensor, tanh_ot_operator, tanh_ot_scope = \
            tanh_compute(ub_who_ht_bias_o.shape, ub_who_ht_bias_o, "ot",
                         self.impl_mode)

        if self.dtypes["o_t"] == "float16" and self.device != "hisi_es":
            ub_ot_fp16 = tvm.compute(
                matmul_res_shape,
                lambda *i: topi.cast(tanh_ot_tensor["ub_tanh_ot"]
                                     (*i), "float16"),
                name='ub_ot_fp16')
            tanh_ot_tensor["ub_ot_fp16"] = ub_ot_fp16
            tanh_ot_operator["ub_ot_fp16"] = "vector_conv"
            tanh_ot_scope["ub_ot_fp16"] = cce.scope_ubuf
            ub_ot = ub_ot_fp16
        else:
            ub_ot = tanh_ot_tensor["ub_tanh_ot"]

        self.tanh_ot_tensor = tanh_ot_tensor
        self.scope_list.update(tanh_ot_scope)
        self.tensor_list2.update(tanh_ot_tensor)
        self.emit_cmd.update(tanh_ot_operator)

        gm_ot = tvm.compute(matmul_res_shape,
                            lambda *i: ub_ot(*i),
                            name='gm_ot')
        self.tensor_list2["gm_ot"] = gm_ot
        self.emit_cmd["gm_ot"] = "dma_copy"
        self.scope_list["gm_ot"] = cce.scope_gm

        res_empty = tvm.compute(matmul_res_shape,
                                lambda *i: gm_ot(*i) * gm_ht(*i),
                                name='res_empty')
        self.tensor_list2["res_empty"] = res_empty
        self.emit_cmd["res_empty"] = "phony_insn"
        self.scope_list["res_empty"] = cce.scope_ubuf

        schedule_list = [res_empty.op]
        sch = self.basic_rnn_cell_schedule(schedule_list)
        if self.has_static:
            build_list = (self.datas["x"], self.datas["cont"],
                          self.datas["w_xh_x_static"], self.datas["h_0"],
                          self.datas["w_xh"], self.datas["bias_h"],
                          self.datas["w_hh"], self.datas["w_ho"],
                          self.datas["bias_o"], gm_ot, gm_ht)
        else:
            if self.expose_hidden:
                build_list = (self.datas["x"], self.datas["cont"],
                              self.datas["h_0"], self.datas["w_xh"],
                              self.datas["bias_h"], self.datas["w_hh"],
                              self.datas["w_ho"], self.datas["bias_o"], gm_ot,
                              gm_ht)
            else:
                build_list = (self.datas["x"], self.datas["w_xh"],
                              self.datas["bias_h"], self.datas["w_ho"],
                              self.datas["bias_o"], gm_ot, gm_ht)

        with build_config:
            tvm.build(sch, build_list, "cce", name=self.kernel_name)
Exemplo n.º 6
0
    def compute_h_0_whh(self, wht_xt_bias_h):
        """
        calculating h_0_whh

        Parameters
        ----------
        wht_xt_bias_h : TVM tensor

        Returns
        -------
        output tensor
        """
        matmul_res_shape = (self.dims["hidden_dim"], self.dims["batch_dim"],
                            16, 16)
        # Tensor h_0 from GM to L1, L0A
        h_0_fp16 = self.datas["h_0"]
        if self.dtypes["h_0"] == "float32":
            ub_h_0 = tvm.compute(
                (self.dims["hidden_dim"], self.dims["batch_dim"], 16, 16),
                lambda *i: self.datas["h_0"](*i),
                name='ub_h_0')
            self.tensor_list1["ub_h_0"] = ub_h_0
            self.emit_cmd["ub_h_0"] = "dma_copy"
            self.scope_list["ub_h_0"] = cce.scope_ubuf

            h_0_fp16 = tvm.compute(ub_h_0.shape,
                                   lambda *i: topi.cast(ub_h_0(*i), "float16"),
                                   name="h_0_fp16")
            self.tensor_list1["h_0_fp16"] = h_0_fp16
            self.emit_cmd["h_0_fp16"] = "vector_conv"
            self.scope_list["h_0_fp16"] = cce.scope_ubuf

        l1_h_0 = tvm.compute(
            (self.dims["batch_dim"], self.dims["hidden_dim"], 16, 16),
            lambda i0, i1, i2, i3: h_0_fp16[i1, i0, i2, i3],
            name='l1_h_0')
        self.tensor_list1["l1_h_0"] = l1_h_0
        self.emit_cmd["l1_h_0"] = "dma_copy"
        self.scope_list["l1_h_0"] = cce.scope_cbuf
        l0a_h_0 = tvm.compute(l1_h_0.shape,
                              lambda *i: l1_h_0(*i),
                              name='l0a_w_hh')
        self.tensor_list1["l0a_h_0"] = l0a_h_0
        self.emit_cmd["l0a_h_0"] = "dma_copy"
        self.scope_list["l0a_h_0"] = cce.scope_ca

        # Tensor w_hh from GM to L1, L0B
        l1_w_hh = tvm.compute(self.datas["w_hh"].shape,
                              lambda *i: self.datas["w_hh"](*i),
                              name='l1_w_hh')
        self.tensor_list1["l1_w_hh"] = l1_w_hh
        self.emit_cmd["l1_w_hh"] = "dma_copy"
        self.scope_list["l1_w_hh"] = cce.scope_cbuf
        l0b_w_hh = tvm.compute(l1_w_hh.shape,
                               lambda *i: l1_w_hh(*i),
                               name='l0b_h_0')
        self.tensor_list1["l0b_w_hh"] = l0b_w_hh
        self.emit_cmd["l0b_w_hh"] = "dma_copy"
        self.scope_list["l0b_w_hh"] = cce.scope_cb

        reduce_kb = tvm.reduce_axis((0, self.dims["hidden_dim"]),
                                    name='reduce_kb')
        reduce_kp = tvm.reduce_axis((0, 16), name='reduce_kp')
        if self.device == "hisi_es":
            l0c_whh_ht = tvm.compute(
                matmul_res_shape,
                lambda nb, mb, mp, np: tvm.sum(
                    (l0a_h_0[mb, reduce_kb, mp, reduce_kp] * l0b_w_hh[
                        reduce_kb, nb, np, reduce_kp]),
                    axis=[reduce_kb, reduce_kp]),
                name='l0c_whh_ht',
                attrs={'input_order': 'positive'})
        else:
            l0c_whh_ht = tvm.compute(
                matmul_res_shape,
                lambda nb, mb, mp, np: tvm.sum(
                    (l0a_h_0[mb, reduce_kb, mp, reduce_kp] * l0b_w_hh[
                        reduce_kb, nb, np, reduce_kp]).astype("float32"),
                    axis=[reduce_kb, reduce_kp]),
                name='l0c_whh_ht',
                attrs={'input_order': 'positive'})
        self.tensor_list1["l0c_whh_ht"] = l0c_whh_ht
        self.scope_list["l0c_whh_ht"] = cce.scope_cc

        # Move whh_ht to UB
        ub_whh_ht = tvm.compute(matmul_res_shape,
                                lambda *i: l0c_whh_ht(*i),
                                name='ub_whh_ht')
        self.tensor_list1["ub_whh_ht"] = ub_whh_ht
        self.emit_cmd["ub_whh_ht"] = "dma_copy"
        self.scope_list["ub_whh_ht"] = cce.scope_ubuf

        # Move cont to UB
        ub_cont = tvm.compute(self.datas["cont"].shape,
                              lambda *i: self.datas["cont"](*i),
                              name='ub_cont')
        self.tensor_list1["ub_cont"] = ub_cont
        self.emit_cmd["ub_cont"] = "dma_copy"
        self.scope_list["ub_cont"] = cce.scope_ubuf

        if ub_cont.dtype == "float16" and self.device != "hisi_es":
            ub_cont_fp32 = tvm.compute(
                ub_cont.shape,
                lambda *i: topi.cast(ub_cont(*i), "float32"),
                name="ub_cont_fp32")
            self.tensor_list1["ub_cont_fp32"] = ub_cont_fp32
            self.emit_cmd["ub_cont_fp32"] = "vector_conv"
            self.scope_list["ub_cont_fp32"] = cce.scope_ubuf
        else:
            ub_cont_fp32 = ub_cont
        ub_whh_ht_cont = tvm.compute(
            matmul_res_shape,
            lambda i0, i1, i2, i3: ub_whh_ht[i0, i1, i2, i3] * ub_cont_fp32[
                i1, i2],
            name='ub_whh_ht_cont')
        self.tensor_list1["ub_whh_ht_cont"] = ub_whh_ht_cont
        self.emit_cmd["ub_whh_ht_cont"] = "vector_mul"
        self.scope_list["ub_whh_ht_cont"] = cce.scope_ubuf

        # Matmul accumulation wht_xt_bias_h + whh_ht_cont
        ub_ht_tmp1 = tvm.compute(
            matmul_res_shape,
            lambda *i: wht_xt_bias_h(*i) + ub_whh_ht_cont(*i),
            name="ub_ht_tmp1")
        self.tensor_list1["ub_ht_tmp1"] = ub_ht_tmp1
        self.emit_cmd["ub_ht_tmp1"] = "vector_add"
        self.scope_list["ub_ht_tmp1"] = cce.scope_ubuf

        return ub_ht_tmp1
Exemplo n.º 7
0
def _dynamic_gru_inner(input_list, custom_list):
    input_x = input_list[0]
    weight1 = input_list[1]
    weight2 = input_list[2]
    bias1 = input_list[3]
    bias2 = input_list[4]
    s_init_h_gm = input_list[5]
    s_state_h_gm_last = input_list[6]

    is_gate_output = custom_list[0]
    is_first_round = custom_list[1]
    is_global_init = custom_list[2]

    input_dtype = 'float16'
    bias_dtype = bias1.dtype
    fp16_input_output = bias_dtype == 'float16'

    shape_x_input = input_x.shape
    shape_w1_input = weight1.shape
    w1_size = 2
    w2_size = 1
    t_size = shape_x_input[0].value
    m_size = shape_x_input[2].value
    k_size = shape_w1_input[1].value
    hidden_size = shape_w1_input[3].value
    in_x = k_size - hidden_size

    shape_b_1 = (1, k_size, w1_size, hidden_size, 16, 16)
    shape_b_2 = (1, k_size, w2_size, hidden_size, 16, 16)
    shape_c_1 = (1, w1_size, hidden_size, m_size, 16, 16)
    shape_c_2 = (1, w2_size, hidden_size, m_size, 16, 16)
    shape_bias_1 = (1, w1_size, hidden_size, 1, 1, 16)
    shape_bias_2 = (1, hidden_size, 1, 1, 16)
    shape_i = (1, hidden_size, m_size, 16, 16)
    shape_i_t = (t_size, hidden_size, m_size, 16, 16)
    k0_size = 16

    if is_first_round and not is_global_init:
        s_state_h = tvm.compute(
            shape_i,
            lambda *indices: tvm.const(0.0, dtype='float32'),
            name='s_state_h')
        s_state_h_fp16 = tvm.compute(
            shape_i,
            lambda *indices: s_state_h(*indices).astype('float16'),
            name="s_state_h_fp16")
    else:
        last_h = s_init_h_gm if is_first_round else s_state_h_gm_last
        if fp16_input_output:
            s_state_h_fp16 = tvm.compute(shape_i,
                                         lambda *indices: last_h(*indices),
                                         name='s_state_h_fp16')
            s_state_h = tvm.compute(
                shape_i,
                lambda *indices: s_state_h_fp16(*indices).astype('float32'),
                name="s_state_h")
        else:
            s_state_h = tvm.compute(shape_i,
                                    lambda *indices: last_h(*indices),
                                    name='s_state_h')
            s_state_h_fp16 = tvm.compute(
                shape_i,
                lambda *indices: s_state_h(*indices).astype('float16'),
                name="s_state_h_fp16")

    # compute
    # input and s_state_h need first to ub and cast to float16
    shape_a_z_bigz = (1, m_size, k_size, 16, 16)

    # input and s_start_h is Nz, need trans to zZ
    # so change axis 1 and 2
    a_l1_1 = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[
                3], indice[4]], s_state_h_fp16[0, indice[2] - in_x, indice[1],
                                               indice[3], indice[4]]),
        name="a_l1_1",
        tag="concat")
    b_l1_1 = tvm.compute(shape_b_1,
                         lambda *indices: weight1(*indices),
                         name='b_l1_1')
    a_l0a_1 = tvm.compute(shape_a_z_bigz,
                          lambda *indices: a_l1_1(*indices),
                          name="a_l0a_1")
    b_l0b_1 = tvm.compute(shape_b_1,
                          lambda *indices: b_l1_1(*indices),
                          name="b_l0b_1")
    k1_1 = tvm.reduce_axis((0, k_size), name='k1_1')
    k0_1 = tvm.reduce_axis((0, k0_size), name='k0_1')
    c_l0c_1 = tvm.compute(shape_c_1,
                          lambda t, nb_0, nb_1, mb, mp, np:
                          tvm.sum((a_l0a_1[t, mb, k1_1, mp, k0_1] * \
                                   b_l0b_1[t, k1_1, nb_0, nb_1, np, k0_1]) \
                                  .astype('float32'),
                                  axis=[k1_1, k0_1]),
                          name='c_l0c_1')
    c_ub_1 = tvm.compute(shape_c_1,
                         lambda *indices: c_l0c_1(*indices),
                         name="c_ub_1")
    bias_ub_1 = tvm.compute(shape_bias_1,
                            lambda *indices: bias1(*indices),
                            name='bias_ub_1')
    bias_ub_1_fp32 = bias_ub_1
    if fp16_input_output:
        bias_ub_1_fp32 = tvm.compute(
            shape_bias_1,
            lambda *indices: bias_ub_1(*indices).astype('float32'),
            name="bias_ub_1_fp32")
    bias_bc_ub_1 = tbe.broadcast(bias_ub_1_fp32, shape_c_1)
    c_ub_bias_1 = tbe.vadd(c_ub_1, bias_bc_ub_1)

    # split matmul res
    r_t_index = 0
    i_t_index = 1
    r_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias_1(t, r_t_index, i, j, k, l),
        name="r_t")
    i_t = tvm.compute(
        shape_i,
        lambda t, i, j, k, l: c_ub_bias_1(t, i_t_index, i, j, k, l),
        name="i_t")
    r_t_sigmoid = _sigmoid_compute(r_t)
    i_t_sigmoid = _sigmoid_compute(i_t)
    r_t_mid = r_t_sigmoid
    i_t_mid = i_t_sigmoid
    if is_gate_output:
        if fp16_input_output:
            r_t_sigmoid_fp16 = tvm.compute(
                shape_i,
                lambda *indices: r_t_sigmoid(*indices).astype('float16'),
                name="r_t_sigmoid_fp16")
            i_t_sigmoid_fp16 = tvm.compute(
                shape_i,
                lambda *indices: i_t_sigmoid(*indices).astype('float16'),
                name="i_t_sigmoid_fp16")

            r_t_gm = tvm.compute(shape_i,
                                 lambda *indices: r_t_sigmoid_fp16(*indices),
                                 name="r_t_gm")
            i_t_gm = tvm.compute(shape_i,
                                 lambda *indices: i_t_sigmoid_fp16(*indices),
                                 name="i_t_gm")

            r_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: r_t_gm(*indices),
                                      name="r_t_gm_back")
            i_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: i_t_gm(*indices),
                                      name="i_t_gm_back")

            r_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: r_t_gm_back(*indices).astype('float32'),
                name="r_t_gm_back_fp32")
            i_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: i_t_gm_back(*indices).astype('float32'),
                name="i_t_gm_back_fp32")

            r_t_mid = r_t_gm_back_fp32
            i_t_mid = i_t_gm_back_fp32
        else:
            r_t_gm = tvm.compute(shape_i,
                                 lambda *indices: r_t_sigmoid(*indices),
                                 name="r_t_gm")
            i_t_gm = tvm.compute(shape_i,
                                 lambda *indices: i_t_sigmoid(*indices),
                                 name="i_t_gm")

            r_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: r_t_gm(*indices),
                                      name="r_t_gm_back")
            i_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: i_t_gm(*indices),
                                      name="i_t_gm_back")

            r_t_mid = r_t_gm_back
            i_t_mid = i_t_gm_back
    r_t_h = tbe.vmul(r_t_mid, s_state_h)
    r_t_h_fp16 = \
        tvm.compute(shape_i,
                    lambda *indices: r_t_h(*indices).astype(input_dtype),
                    name="r_t_h_fp16")

    # second matmul
    a_l1_2 = tvm.compute(
        shape_a_z_bigz,
        lambda *indice: tvm.select(
            indice[2] < in_x, input_x[indice[0], indice[2], indice[1], indice[
                3], indice[4]], r_t_h_fp16[0, indice[2] - in_x, indice[1],
                                           indice[3], indice[4]]),
        name="a_l1_2",
        tag="concat")

    b_l1_2 = tvm.compute(shape_b_2,
                         lambda *indices: weight2(*indices),
                         name='b_l1_2')
    a_l0a_2 = tvm.compute(shape_a_z_bigz,
                          lambda *indices: a_l1_2(*indices),
                          name="a_l0a_2")
    b_l0b_2 = tvm.compute(shape_b_2,
                          lambda *indices: b_l1_2(*indices),
                          name="b_l0b_2")
    k1_2 = tvm.reduce_axis((0, k_size), name='k1_2')
    k0_2 = tvm.reduce_axis((0, k0_size), name='k0_2')
    c_l0c_2 = tvm.compute(shape_c_2,
                          lambda t, nb_0, nb_1, mb, mp, np:
                          tvm.sum((a_l0a_2[t, mb, k1_2, mp, k0_2] * \
                                   b_l0b_2[t, k1_2, nb_0, nb_1, np, k0_2]) \
                                  .astype('float32'),
                                  axis=[k1_2, k0_2]),
                          name='c_l0c_2')
    c_ub_2 = tvm.compute(shape_i,
                         lambda t, h, m, i, j: c_l0c_2(t, 0, h, m, i, j),
                         name="c_ub_2")
    bias_ub_2 = tvm.compute(shape_bias_2,
                            lambda t, h, m, i, j: bias2(t, h, m, i, j),
                            name='bias_ub_2')
    bias_ub_2_fp32 = bias_ub_2
    if fp16_input_output:
        bias_ub_2_fp32 = tvm.compute(
            shape_bias_2,
            lambda *indices: bias_ub_2(*indices).astype('float32'),
            name="bias_ub_2_fp32")
    bias_bc_ub_2 = tbe.broadcast(bias_ub_2_fp32, shape_i)
    c_ub_bias_2 = tbe.vadd(c_ub_2, bias_bc_ub_2)

    h_t_tanh = _tanh_compute(c_ub_bias_2)
    h_t_tanh_mid = h_t_tanh
    if is_gate_output:
        if fp16_input_output:
            h_t_tanh_fp16 = tvm.compute(
                shape_i,
                lambda *indices: h_t_tanh(*indices).astype('float16'),
                name="h_t_tanh_fp16")
            n_t_gm = tvm.compute(shape_i,
                                 lambda *indices: h_t_tanh_fp16(*indices),
                                 name="n_t_gm")
            n_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: n_t_gm(*indices),
                                      name="n_t_gm_back")
            n_t_gm_back_fp32 = tvm.compute(
                shape_i,
                lambda *indices: n_t_gm_back(*indices).astype('float32'),
                name="n_t_gm_back_fp32")
            h_t_tanh_mid = n_t_gm_back_fp32
        else:
            n_t_gm = tvm.compute(shape_i,
                                 lambda *indices: h_t_tanh(*indices),
                                 name="n_t_gm")
            n_t_gm_back = tvm.compute(shape_i,
                                      lambda *indices: n_t_gm(*indices),
                                      name="n_t_gm_back")
            h_t_tanh_mid = n_t_gm_back

    c_t_tmp1 = tbe.vsub(s_state_h, h_t_tanh_mid)
    c_t_tmp2 = tbe.vmul(c_t_tmp1, i_t_mid)
    update_h = tbe.vadd(c_t_tmp2, h_t_tanh_mid)
    update_h_ub = update_h
    if fp16_input_output:
        update_h_fp16 = tvm.compute(
            shape_i_t,
            lambda *indices: update_h(*indices).astype('float16'),
            name="update_h_fp16")
        update_h_ub = update_h_fp16
    update_y_gm = tvm.compute(shape_i_t,
                              lambda t, i, j, k, l: update_h_ub(0, i, j, k, l),
                              name="update_y_gm")
    update_y_gm_back = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm(0, i, j, k, l),
        name="update_y_gm_back")
    update_h_gm = tvm.compute(
        shape_i_t,
        lambda t, i, j, k, l: update_y_gm_back(0, i, j, k, l),
        name="update_h_gm")
    # end compute

    # schedule
    s = tvm.schedule.create_schedule([update_h_gm.op])

    def gen_reversed_subgraph_list(out_tensor, tensor_list):
        """
        traverse tensors by Depth-First-Search
        """
        if out_tensor is None:
            return
        stack = [out_tensor]
        visited_list = []
        while stack:
            cur_tensor = stack.pop()
            visited_list.append(cur_tensor)
            for in_tensor in cur_tensor.op.input_tensors:
                if in_tensor not in visited_list:
                    stack.append(in_tensor)
                    if "elewise" in in_tensor.op.tag or \
                            "broadcast" == in_tensor.op.tag:
                        if in_tensor not in tensor_list:
                            tensor_list.append(in_tensor)

    elewise_tensors_r_t_h_fp16 = []
    gen_reversed_subgraph_list(r_t_h_fp16, elewise_tensors_r_t_h_fp16)

    elewise_tensors = []
    tmp_tensors = []
    gen_reversed_subgraph_list(update_h_gm, tmp_tensors)
    for i in tmp_tensors:
        if i not in elewise_tensors_r_t_h_fp16:
            elewise_tensors.append(i)

    # set scope
    s[s_state_h].set_scope(tbe_platform.scope_ubuf)
    s[s_state_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_1].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_1].set_scope(tbe_platform.scope_ca)
    s[b_l0b_1].set_scope(tbe_platform.scope_cb)
    s[c_l0c_1].set_scope(tbe_platform.scope_cc)
    s[c_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_1].set_scope(tbe_platform.scope_ubuf)
    s[r_t_h_fp16].set_scope(tbe_platform.scope_ubuf)
    s[a_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[b_l1_2].set_scope(tbe_platform.scope_cbuf)
    s[a_l0a_2].set_scope(tbe_platform.scope_ca)
    s[b_l0b_2].set_scope(tbe_platform.scope_cb)
    s[c_l0c_2].set_scope(tbe_platform.scope_cc)
    s[c_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[bias_bc_ub_2].set_scope(tbe_platform.scope_ubuf)
    s[update_y_gm_back].set_scope(tbe_platform.scope_ubuf)
    if is_gate_output:
        s[r_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[i_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        s[n_t_gm_back].set_scope(tbe_platform.scope_ubuf)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[i_t_sigmoid_fp16].set_scope(tbe_platform.scope_ubuf)
            s[h_t_tanh_fp16].set_scope(tbe_platform.scope_ubuf)
            s[r_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[i_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
            s[n_t_gm_back_fp32].set_scope(tbe_platform.scope_ubuf)
    if fp16_input_output:
        s[bias_ub_1_fp32].set_scope(tbe_platform.scope_ubuf)
        s[bias_ub_2_fp32].set_scope(tbe_platform.scope_ubuf)
        s[update_h_fp16].set_scope(tbe_platform.scope_ubuf)

    # compute inline
    compute_inline_tensors = [i_t, r_t]
    for tensor in compute_inline_tensors:
        s[tensor].compute_inline()

    # matmul tiling
    factor_l1_m, factor_l1_n, factor_l1_k, factor_l0_m, factor_l0_n, factor_l0_k = \
        _get_tiling(m_size, k_size, hidden_size)

    l1_n_outer_1, l1_n_inner_1 = s[c_l0c_1].split(c_l0c_1.op.axis[2],
                                                  factor=factor_l1_n)
    l1_m_outer_1, l1_m_inner_1 = s[c_l0c_1].split(c_l0c_1.op.axis[3],
                                                  factor=factor_l1_m)
    l1_k_outer_1, l1_k_inner_1 = s[c_l0c_1].split(c_l0c_1.op.reduce_axis[0],
                                                  factor=factor_l1_k)
    l0_n_outer_1, l0_n_inner_1 = s[c_l0c_1].split(l1_n_inner_1,
                                                  factor=factor_l0_n)
    l0_m_outer_1, l0_m_inner_1 = s[c_l0c_1].split(l1_m_inner_1,
                                                  factor=factor_l0_m)
    l0_k_outer_1, l0_k_inner_1 = s[c_l0c_1].split(l1_k_inner_1,
                                                  factor=factor_l0_k)
    s[c_l0c_1].reorder(c_l0c_1.op.axis[0], l1_n_outer_1, l1_k_outer_1,
                       c_l0c_1.op.axis[1], l1_m_outer_1, l0_n_outer_1,
                       l0_m_outer_1, l0_k_outer_1, l0_n_inner_1, l0_m_inner_1,
                       c_l0c_1.op.axis[4], c_l0c_1.op.axis[5], l0_k_inner_1,
                       c_l0c_1.op.reduce_axis[1])
    s[a_l1_1].double_buffer()
    s[b_l1_1].double_buffer()
    s[a_l0a_1].double_buffer()
    s[b_l0b_1].double_buffer()
    s[c_l0c_1].double_buffer()
    s[c_ub_1].double_buffer()
    s[a_l1_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l1_1].compute_at(s[c_l0c_1], c_l0c_1.op.axis[1])
    s[a_l0a_1].compute_at(s[c_l0c_1], l1_k_outer_1)
    s[b_l0b_1].compute_at(s[c_l0c_1], l0_k_outer_1)

    c_ub_bias_1_outer, c_ub_bias_1_inner = s[c_ub_bias_1].split(
        c_ub_bias_1.op.axis[2], factor=factor_l1_n)
    s[c_ub_bias_1].reorder(c_ub_bias_1.op.axis[0], c_ub_bias_1_outer,
                           c_ub_bias_1.op.axis[1], c_ub_bias_1_inner,
                           c_ub_bias_1.op.axis[3], c_ub_bias_1.op.axis[4],
                           c_ub_bias_1.op.axis[5])
    s[c_l0c_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[bias_bc_ub_1].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    if fp16_input_output:
        s[bias_ub_1_fp32].compute_at(s[c_ub_bias_1], c_ub_bias_1_outer)
    s[c_ub_bias_1].emit_insn(c_ub_bias_1.op.axis[1], 'vector_add')

    r_t_h_fp16_outer, r_t_h_fp16_inner = s[r_t_h_fp16].split(
        r_t_h_fp16.op.axis[1], factor=factor_l1_n)
    for tensor in elewise_tensors_r_t_h_fp16:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        if tensor == c_ub_bias_1:
            continue
        s[tensor].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)
    if is_gate_output:
        s[r_t_gm].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        s[r_t_gm_back].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
        if fp16_input_output:
            s[r_t_sigmoid_fp16].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
            s[r_t_gm_back_fp32].compute_at(s[r_t_h_fp16], r_t_h_fp16_outer)
    s[r_t_h_fp16].emit_insn(r_t_h_fp16_inner, 'vector_conv')

    l1_n_outer_2, l1_n_inner_2 = s[c_l0c_2].split(c_l0c_2.op.axis[2],
                                                  factor=factor_l1_n)
    l1_m_outer_2, l1_m_inner_2 = s[c_l0c_2].split(c_l0c_2.op.axis[3],
                                                  factor=factor_l1_m)
    l1_k_outer_2, l1_k_inner_2 = s[c_l0c_2].split(c_l0c_2.op.reduce_axis[0],
                                                  factor=factor_l1_k)
    l0_n_outer_2, l0_n_inner_2 = s[c_l0c_2].split(l1_n_inner_2,
                                                  factor=factor_l0_n)
    l0_m_outer_2, l0_m_inner_2 = s[c_l0c_2].split(l1_m_inner_2,
                                                  factor=factor_l0_m)
    l0_k_outer_2, l0_k_inner_2 = s[c_l0c_2].split(l1_k_inner_2,
                                                  factor=factor_l0_k)
    s[c_l0c_2].reorder(c_l0c_2.op.axis[0], l1_n_outer_2, l1_k_outer_2,
                       c_l0c_2.op.axis[1], l1_m_outer_2, l0_n_outer_2,
                       l0_m_outer_2, l0_k_outer_2, l0_n_inner_2, l0_m_inner_2,
                       c_l0c_2.op.axis[4], c_l0c_2.op.axis[5], l0_k_inner_2,
                       c_l0c_2.op.reduce_axis[1])
    s[a_l1_2].double_buffer()
    s[b_l1_2].double_buffer()
    s[a_l0a_2].double_buffer()
    s[b_l0b_2].double_buffer()
    s[c_l0c_2].double_buffer()
    s[c_ub_2].double_buffer()
    s[a_l1_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l1_2].compute_at(s[c_l0c_2], c_l0c_2.op.axis[1])
    s[a_l0a_2].compute_at(s[c_l0c_2], l1_k_outer_2)
    s[b_l0b_2].compute_at(s[c_l0c_2], l0_k_outer_2)

    update_h_gm_outer, update_h_gm_inner = s[update_h_gm].split(
        update_h_gm.op.axis[1], factor=factor_l1_n)
    s[c_l0c_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[bias_bc_ub_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[c_ub_bias_2].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm].compute_at(s[update_h_gm], update_h_gm_outer)
    s[update_y_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
    if fp16_input_output:
        s[bias_ub_2_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
        s[update_h_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
    if is_gate_output:
        s[i_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[i_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm].compute_at(s[update_h_gm], update_h_gm_outer)
        s[n_t_gm_back].compute_at(s[update_h_gm], update_h_gm_outer)
        if fp16_input_output:
            s[i_t_sigmoid_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
            s[i_t_gm_back_fp32].compute_at(s[update_h_gm], update_h_gm_outer)
            s[h_t_tanh_fp16].compute_at(s[update_h_gm], update_h_gm_outer)
            s[n_t_gm_back_fp32].compute_at(s[update_h_gm], update_h_gm_outer)

    for tensor in elewise_tensors:
        s[tensor].set_scope(tbe_platform.scope_ubuf)
        s[tensor].compute_at(s[update_h_gm], update_h_gm_outer)
        insn = _get_emit_insn_map(tensor)
        s[tensor].emit_insn(tensor.op.axis[0], insn)

    # emit insn
    if is_first_round and not is_global_init:
        s[s_state_h].emit_insn(s_state_h.op.axis[0], 'broadcast')
        s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0], 'vector_conv')
    else:
        if fp16_input_output:
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0], 'dma_copy')
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'vector_conv')
        else:
            s[s_state_h].emit_insn(s_state_h.op.axis[0], 'dma_copy')
            s[s_state_h_fp16].emit_insn(s_state_h_fp16.op.axis[0],
                                        'vector_conv')

    s[a_l1_1].emit_insn(a_l1_1.op.axis[0], 'dma_copy')
    s[b_l1_1].emit_insn(b_l1_1.op.axis[0], 'dma_copy')
    s[a_l0a_1].emit_insn(a_l0a_1.op.axis[0], 'dma_copy')
    s[b_l0b_1].emit_insn(b_l0b_1.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_1, l0_k_outer_1]}
    s[c_l0c_1].emit_insn(l0_n_inner_1, 'mad', mad_dict)
    s[c_ub_1].emit_insn(c_ub_1.op.axis[0], 'dma_copy')
    s[bias_ub_1].emit_insn(bias_ub_1.op.axis[0], 'dma_copy')
    if fp16_input_output:
        s[bias_ub_1_fp32].emit_insn(bias_ub_1_fp32.op.axis[0], 'vector_conv')
        s[bias_ub_2_fp32].emit_insn(bias_ub_2_fp32.op.axis[0], 'vector_conv')
        s[update_h_fp16].emit_insn(update_h_fp16.op.axis[0], 'vector_conv')
    s[bias_bc_ub_1].emit_insn(bias_bc_ub_1.op.axis[0], 'unified_broadcast')
    s[a_l1_2].emit_insn(a_l1_2.op.axis[0], 'dma_copy')
    s[b_l1_2].emit_insn(b_l1_2.op.axis[0], 'dma_copy')
    s[a_l0a_2].emit_insn(a_l0a_2.op.axis[0], 'dma_copy')
    s[b_l0b_2].emit_insn(b_l0b_2.op.axis[0], 'dma_copy')
    mad_dict = {"mad_pattern": 0, "k_outer": [l1_k_outer_2, l0_k_outer_2]}
    s[c_l0c_2].emit_insn(l0_n_inner_2, 'mad', mad_dict)
    s[c_ub_2].emit_insn(c_ub_2.op.axis[0], 'dma_copy')
    s[bias_ub_2].emit_insn(bias_ub_2.op.axis[0], 'dma_copy')
    s[bias_bc_ub_2].emit_insn(bias_bc_ub_2.op.axis[0], 'unified_broadcast')
    s[update_y_gm].emit_insn(update_y_gm.op.axis[0], 'dma_copy')
    s[update_y_gm_back].emit_insn(update_y_gm_back.op.axis[0], 'phony_insn')
    s[update_y_gm_back].reused_by(update_h_ub)
    if is_gate_output:
        s[r_t_gm].emit_insn(r_t_gm.op.axis[0], 'dma_copy')
        s[i_t_gm].emit_insn(i_t_gm.op.axis[0], 'dma_copy')
        s[n_t_gm].emit_insn(n_t_gm.op.axis[0], 'dma_copy')
        s[r_t_gm_back].emit_insn(r_t_gm_back.op.axis[0], 'phony_insn')
        s[i_t_gm_back].emit_insn(i_t_gm_back.op.axis[0], 'phony_insn')
        s[n_t_gm_back].emit_insn(n_t_gm_back.op.axis[0], 'phony_insn')
        if fp16_input_output:
            s[r_t_sigmoid_fp16].emit_insn(r_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[i_t_sigmoid_fp16].emit_insn(i_t_sigmoid_fp16.op.axis[0],
                                          'vector_conv')
            s[h_t_tanh_fp16].emit_insn(h_t_tanh_fp16.op.axis[0], 'vector_conv')
            s[r_t_gm_back_fp32].emit_insn(r_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[i_t_gm_back_fp32].emit_insn(i_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[n_t_gm_back_fp32].emit_insn(n_t_gm_back_fp32.op.axis[0],
                                          'phony_insn')
            s[r_t_gm_back_fp32].reused_by(r_t_sigmoid)
            s[i_t_gm_back_fp32].reused_by(i_t_sigmoid)
            s[n_t_gm_back_fp32].reused_by(h_t_tanh)
            s[r_t_gm_back].reused_by(r_t_sigmoid_fp16)
            s[i_t_gm_back].reused_by(i_t_sigmoid_fp16)
            s[n_t_gm_back].reused_by(h_t_tanh_fp16)
        else:
            s[r_t_gm_back].reused_by(r_t_sigmoid)
            s[i_t_gm_back].reused_by(i_t_sigmoid)
            s[n_t_gm_back].reused_by(h_t_tanh)
    s[update_h_gm].emit_insn(update_h_gm_inner, 'dma_copy')

    output_list = [update_y_gm, update_h_gm]
    if is_gate_output:
        output_list.append(r_t_gm)
        output_list.append(i_t_gm)
        output_list.append(n_t_gm)
    return output_list, s
Exemplo n.º 8
0
def softmax_cross_entropy_with_logits_compute_ex(input_features, input_labels):
    """
    Computes softmax cross entropy cost.
    softmax = e^(x-max) / ∑(e^(x-max))
    log(softmax) = (x-max) - log(∑e^(x-max))
    cross_entropy = -∑(y * log⁡(softmax))

    Parameters
    # ----------
    input_features: TVM tensor
        input tensor contains shape and dtype attributes.
        source data type support "float16", "float32".
    input_labels: TVM tensor
        input tensor contains shape and dtype attributes.
        Must have the same type as 'input_features'.
    output_loss: dict
        data of output.
        Must have the same type as 'input_features'.
    output_backprop: dict
        data of output.
        Must have the same type as 'input_features'.
    kernel_name: str
        kernel name, default value is "softmax_cross_entropy_with_logits"

    Returns:
    res: TVM tensor
        output tensor. Has the same type as "input_features".
    """
    shape_features = te.lang.cce.util.shape_to_list(input_features.shape)
    shape_labels = te.lang.cce.util.shape_to_list(input_labels.shape)
    dtype = input_features.dtype.lower()

    if list(shape_features) != list(shape_labels):
        shape_features, shape_labels, shape_broadcast = \
            broadcast_shapes(shape_features, shape_labels, param_name_input1="input_features",
                             param_name_input2="input_labels")
        input_features = te.lang.cce.broadcast(input_features, shape_broadcast,
                                               dtype)
        input_labels = te.lang.cce.broadcast(input_labels, shape_broadcast,
                                             dtype)
    else:
        shape_broadcast = shape_features

    if dtype == "float16":
        input_features = te.lang.cce.cast_to(input_features, "float32")
        input_labels = te.lang.cce.cast_to(input_labels, "float32")

    with tvm.tag_scope("last_axis_reduce_max"):
        reduce_axis = tvm.reduce_axis((0, shape_broadcast[1]), name="rax0")
        data_max = tvm.compute(
            (shape_broadcast[0], 1),
            lambda upper, lower: tvm.max(input_features[upper, reduce_axis],
                                         axis=reduce_axis),
            name="last_axis_reduce_max")
    with tvm.tag_scope("elewise_binary_sub_scalar_L1"):
        data_sub = tvm.compute(input_features.shape,
                               lambda higher, lower: input_features[higher][
                                   lower] - data_max[higher][0],
                               name="manual_sub_0")
    data_exp = te.lang.cce.vexp(data_sub)
    data_sum = te.lang.cce.sum(data_exp, axis=-1, keepdims=True)
    with tvm.tag_scope("elewise_binary_div"):
        data_div = tvm.compute(data_exp.shape,
                               lambda higher, lower: data_exp[higher][lower] /
                               data_sum[higher][0],
                               name="manual_div_0")
    data_log_tmp = te.lang.cce.vlog(data_sum)
    with tvm.tag_scope("elewise_get_L1_workspace"):
        fake_buffer = tvm.compute(
            data_sub.shape,
            lambda higher, lower: tvm.const(0, "float32"),
            name="get_L1_workspace")
    with tvm.tag_scope("elewise_binary_sub"):
        data_log = tvm.compute(data_sub.shape,
                               lambda higher, lower: fake_buffer[higher][lower]
                               - data_log_tmp[higher][0],
                               name="manual_sub_1")
    data_mul = te.lang.cce.vmul(input_labels, data_log)
    with tvm.tag_scope("last_axis_reduce_sum_reuse"):
        reduce_axis = tvm.reduce_axis((0, shape_broadcast[1]), name="rax1")
        loss = tvm.compute(
            (shape_broadcast[0], 1),
            lambda upper, lower: tvm.sum(data_mul[upper, reduce_axis],
                                         axis=reduce_axis),
            name="last_axis_reduce_sum_reuse")
    loss = te.lang.cce.vmuls(loss, SCALAR_MINUS_ONE)
    backprop = te.lang.cce.vsub(data_div, input_labels)

    if dtype == "float16":
        loss = te.lang.cce.cast_to(loss, "float16")
        backprop = te.lang.cce.cast_to(backprop, "float16")

    res = [loss, backprop]

    return res
Exemplo n.º 9
0
def get_matmul_tensor(x, h, c, w, b, build_list, tensor_list, scope_list,
                      operation_list, is_hisi_es):
    shape_x = x.get("shape")
    shape_h = h.get("shape")
    shape_c = c.get("shape")
    dtype_x = x.get("dtype").lower()
    dtype_c = c.get("dtype").lower()
    dtype_b = b.get("dtype").lower()
    input_dim, batch_dim = shape_x[0:2]
    hidden_dim = shape_h[0]
    output_dim = hidden_dim
    shape_b = b.get("shape")
    shape_b = (shape_b[0] // 16, 16)
    shape_xh = (batch_dim, input_dim + hidden_dim, C0, C0)
    shape_w = w.get("shape")
    shape_w_split = list(shape_w)
    shape_w_split[1] = shape_w_split[1] // 4

    # Inputs in gm
    tensor_x = tvm.placeholder(shape_x, name='tensor_x', dtype=dtype_x)
    tensor_h = tvm.placeholder(shape_h, name='tensor_h', dtype=dtype_x)
    tensor_c = tvm.placeholder(shape_c, name='tensor_c', dtype=dtype_c)
    tensor_w = tvm.placeholder(shape_w, name='tensor_w', dtype=dtype_x)
    tensor_b = tvm.placeholder(shape_b, name='tensor_b', dtype=dtype_c)
    build_list["x"] = tensor_x
    build_list["h"] = tensor_h
    build_list["c"] = tensor_c
    build_list["w"] = tensor_w
    build_list["b"] = tensor_b

    symbol = ["it", "jt", "ft", "ot"]

    def _index_w(str_name, *index):
        if str_name == "it":
            return index[0], index[1], index[2], index[3]
        elif str_name == "jt":
            return index[0], index[1] + output_dim, index[2], index[3]
        elif str_name == "ft":
            return index[0], index[1] + output_dim * 2, index[2], index[3]
        return index[0], index[1] + output_dim * 3, index[2], index[3]

    def _index_bias(str_name):
        if str_name == "it":
            return 0
        elif str_name == "jt":
            return 1
        elif str_name == "ft":
            return 2
        return 3

    matmul_type = "float32"
    if is_hisi_es:
        matmul_type = "float16"

    for t in symbol:
        # caoncat x and h into 1 tensor,copy to L1
        tensor_xh_l1_tmp = tvm.compute(
            shape_xh,
            lambda *indice: tvm.select(
                indice[1] < input_dim, tensor_x[indice[1], indice[0], indice[
                    2], indice[3]], tensor_h[indice[1] - input_dim, indice[0],
                                             indice[2], indice[3]]),
            name="tensor_xh_l1_" + t,
            tag="concat")
        tensor_list["tensor_xh_l1_" + t] = tensor_xh_l1_tmp
        scope_list["tensor_xh_l1_" + t] = cce.scope_cbuf
        # optimazition: copy one time
        operation_list["tensor_xh_l1_" + t] = "dma_copy"

        # copy  xh  to L1
        tensor_xh_l0a_tmp = tvm.compute(shape_xh,
                                        lambda *i: tensor_xh_l1_tmp(*i),
                                        name='tensor_xh_l0a_' + t)
        tensor_list["tensor_xh_l0a_" + t] = tensor_xh_l0a_tmp
        scope_list["tensor_xh_l0a_" + t] = cce.scope_ca
        # optimazition: copy one time
        operation_list["tensor_xh_l0a_" + t] = "dma_copy"
        # copy w to L1 buf
        tensor_w_l1_tmp = tvm.compute(shape_w_split,
                                      lambda *i: tensor_w(*_index_w(t, *i)),
                                      name='tensor_w_l1_' + t)
        tensor_list["tensor_w_l1_" + t] = tensor_w_l1_tmp
        scope_list["tensor_w_l1_" + t] = cce.scope_cbuf
        operation_list["tensor_w_l1_" + t] = "dma_copy"

        # copy W from L1 to L0 B
        tensor_w_l0b_tmp = tvm.compute(shape_w_split,
                                       lambda *i: tensor_w_l1_tmp(*i),
                                       name='tensor_w_l0b_' + t)
        tensor_list["tensor_w_l0b_" + t] = tensor_w_l0b_tmp
        scope_list["tensor_w_l0b_" + t] = cce.scope_cb
        operation_list["tensor_w_l0b_" + t] = "dma_copy"

        # copy bias to ubuf ,split the
        tensor_b_ub_tmp = tvm.compute(
            shape_b,
            lambda i0, i1: tensor_b[_index_bias(t) * output_dim + i0, i1],
            name='tensor_b_ub_' + t)
        tensor_list["tensor_b_ub_" + t] = tensor_b_ub_tmp
        scope_list["tensor_b_ub_" + t] = cce.scope_ubuf
        operation_list["tensor_b_ub_" + t] = "dma_copy"

        #
        tensor_b_ub_true_tmp = tensor_b_ub_tmp
        if not is_hisi_es and dtype_b == "float16":
            tensor_b_ub_true_tmp = tvm.compute(
                shape_b,
                lambda *i: topi.cast(tensor_b_ub_tmp(*i), "float32"),
                name="tensor_b_ub_true_" + t)
            tensor_list["tensor_b_ub_true_" + t] = tensor_b_ub_true_tmp
            scope_list["tensor_b_ub_true_" + t] = cce.scope_ubuf
            operation_list["tensor_b_ub_true_" + t] = "vector_conv"

        # broadcast bias from [ouput_dim//16,16] to [output_dim//16,N//16,16,16]
        tensor_b_loc_tmp = tvm.compute(
            shape_h,
            lambda i0, i1, i2, i3: tensor_b_ub_true_tmp[i0, i3],
            name='tensor_b_loc_' + t)
        tensor_list["tensor_b_loc_" + t] = tensor_b_loc_tmp
        scope_list["tensor_b_loc_" + t] = cce.scope_cc
        operation_list["tensor_b_loc_" + t] = "dma_copy"
        # DO MATMUL
        reduce_kb = tvm.reduce_axis((0, input_dim + output_dim),
                                    name='reduce_kb_' + t)
        reduce_kp = tvm.reduce_axis((0, C0), name='reduce_kp_' + t)
        tensor_matmul_l0c_tmp = tvm.compute(
            shape_h,
            lambda nb, mb, mp, np: tvm.sum((tensor_xh_l0a_tmp[
                mb, reduce_kb, mp, reduce_kp] * tensor_w_l0b_tmp[
                    reduce_kb, nb, np, reduce_kp]).astype(matmul_type),
                                           axis=[reduce_kb, reduce_kp]),
            name='tensor_matmul_l0c_' + t,
            attrs={'input_order': 'positive'})
        tensor_list["tensor_matmul_l0c_" + t] = tensor_matmul_l0c_tmp
        scope_list["tensor_matmul_l0c_" + t] = cce.scope_cc
        # Matmul accumulation it + b_it
        tensor_matmul_result_l0c_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_b_loc_tmp(*i) + tensor_matmul_l0c_tmp(*i),
            name="tensor_matmul_result_l0c_" + t)
        tensor_list["tensor_matmul_result_l0c_" +
                    t] = tensor_matmul_result_l0c_tmp
        scope_list["tensor_matmul_result_l0c_" + t] = cce.scope_cc
        operation_list["tensor_matmul_result_l0c_" + t] = "phony_insn"

        # copy matmul result from l0c to ub
        gate_ub_tmp = tvm.compute(
            shape_h,
            lambda *i: tensor_list["tensor_matmul_result_l0c_" + t](*i),
            name=t + "_ub")
        tensor_list[t + "_ub"] = gate_ub_tmp
        scope_list[t + "_ub"] = cce.scope_ubuf
        operation_list[t + "_ub"] = "dma_copy"
Exemplo n.º 10
0
def avg_pool3d_compute(x,
                       y,
                       ksize,
                       strides,
                       pads,
                       data_format="NDHWC",
                       kernel_name="avg_pool3d"):
    """
    avg_pool3d compute

    Parameters
    ----------
    x: input tensor dict
    y: output tensor dict
    ksize: kernel size
    strides: strides
    padding: padding mode, str
    data_format: must be "NDHWC"
    kernel_name: kernel name

    Returns
    -------
    output tensor
    """
    shape = x.shape
    if len(ksize) == 5:
        a_size = (ksize[1] * ksize[2] * ksize[3])
        ksize_d = ksize[1]
    elif len(ksize) == 3:
        a_size = (ksize[0] * ksize[1] * ksize[2])
        ksize_d = ksize[0]
    else:
        a_size = ksize[0] * ksize[0] * ksize[0]
        ksize_d = ksize[0]

    if len(strides) == 5:
        stride_d = strides[1]
    else:
        stride_d = strides[0]

    # copy gm to ub
    tensor_in_ub = tvm.compute(shape, lambda *i: x[i], name="tensor_in_ub")

    tensor_in_ub_cast = tvm.compute(
        shape,
        lambda *i: tensor_in_ub(*i).astype("float32"),
        name="tensor_in_ub_cast")

    d_axis = tvm.reduce_axis((0, ksize_d), "d_sum")
    hw_axis = tvm.reduce_axis((0, shape[3]), "hw_sum")
    origin_d = shape[1]
    reduced_d = 1 + (origin_d - ksize_d) // stride_d
    shape_d_hw = (shape[0], reduced_d, shape[2], 1, shape[4])
    tensor_d_hw = tvm.compute(
        shape_d_hw,
        lambda n, d, c1, hw, c0: tvm.sum(tensor_in_ub_cast[
            n, d * stride_d + d_axis, c1, hw_axis, c0],
                                         axis=[d_axis, hw_axis]),
        name="tensor_d_hw")

    tensor_a = tvm.compute(
        shape_d_hw,
        lambda n, d, c1, hw, c0: tensor_d_hw[n, d, c1, hw, c0] * tvm.const(
            1.0 / a_size, dtype="float32"),
        name="tensor_a")

    res_cast = tvm.compute(shape_d_hw,
                           lambda *i: tensor_a(*i).astype("float16"),
                           name="res_cast")

    res = tvm.compute(shape_d_hw, lambda *i: res_cast[i], name='res')
    return res