Example #1
"""
This program is free software; you can redistribute it and/or modify
it under the terms of the Apache License Version 2.0. You may not use
this file except in compliance with the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
Apache License for more details at
http://www.apache.org/licenses/LICENSE-2.0
"""

from te import tik
from te.platform.cce_conf import te_set_l2_mode

te_set_l2_mode(1)

# Size of the Ascend 310 AI Core Unified Buffer (UB), in bytes
UB_SIZE = 240 * 1024

# Maximum batch size (N)
MAX_BATCH = 1

# Maximum number of channels (C)
MAX_CHANNEL = 1024

# Maximum width (W)
MAX_WIDTH = 32

# Maximum height (H)
MAX_HEIGHT = 32
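
These limits can be checked against the UB capacity with simple arithmetic. The sketch below is an illustration only (it assumes float16 data; the constants above do not fix a dtype):

# Illustrative capacity check, assuming float16 (2 bytes per element).
FP16_BYTES = 2
max_tensor_bytes = MAX_BATCH * MAX_CHANNEL * MAX_WIDTH * MAX_HEIGHT * FP16_BYTES
# 1 * 1024 * 32 * 32 * 2 = 2,097,152 bytes (2 MiB), far larger than the
# 240 KiB UB, so a tensor of this maximum shape must be processed tile by tile.
assert max_tensor_bytes > UB_SIZE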
Example #2
from te import tik
from te.platform.cce_conf import te_set_l2_mode

# Byte size of each data type, used when computing burst lengths in 32-byte units.
DTYPE_SIZE = {"int8": 1, "uint8": 1, "float16": 2, "float32": 4, "int32": 4}


def matmul_tik_compute(params, kernel_name):
    te_set_l2_mode(1)
    tik_instance = tik.Tik()
    if not isinstance(params, dict):
        params = params.__dict__
    m_size, k_size, n_size = params['M'], params['K'], params['N']
    data_type = params["data_type"]
    m_tiling_size = int(params["m_tiling_size"])
    n_tiling_size = int(params["n_tiling_size"])
    k_tiling_size = int(params['k_tiling_size'])

    m_cycle_times = params["m_cycle_times"]
    n_cycle_times = params["n_cycle_times"]
    k_cycle_times = params["k_cycle_times"]

    # Determine the output type
    if data_type == "float16":
        C_loc_out_type = "float32"
        K0 = 16
    else:
        C_loc_out_type = "int32"
        K0 = 32
    block_size = 16

    n_thread_num = params['n_thread_num']
    m_thread_num = params['m_thread_num']
    k_thread_num = params['k_thread_num']

    # Define the input and output tensors in global memory (GM).
    output_gm = tik_instance.Tensor(C_loc_out_type, (n_size // block_size,
                                                     m_size, block_size),
                                    name="C_gm", scope=tik.scope_gm)
    inputa_gm = tik_instance.Tensor(params["data_type"], (k_size // K0,
                                                          m_size, K0),
                                    name="A_gm", scope=tik.scope_gm)
    inputb_gm = tik_instance.Tensor(params["data_type"], (k_size // K0,
                                                          n_size, K0),
                                    name="B_gm", scope=tik.scope_gm)

    # Tiling is realized through the for_range() loop.
    with tik_instance.for_range(0, 2, block_num=2) as core_id:
        with tik_instance.for_range(0, n_cycle_times // 2,
                                    thread_num=n_thread_num) as n_idx:
            with tik_instance.for_range(0, m_cycle_times,
                                        thread_num=m_thread_num) as m_idx:
                dst_l0c = tik_instance.Tensor(C_loc_out_type,
                                              [n_tiling_size // 16,
                                               m_tiling_size, 16],
                                              name='dst_l0c',
                                              scope=tik.scope_cbuf_out)
                with tik_instance.for_range(0, k_cycle_times,
                                            thread_num=k_thread_num) as k_idx:
                    # Move the tiles of A and B for this iteration from GM to L1.
                    inputa_l1 = tik_instance.Tensor(params['data_type'],
                                               [k_tiling_size // K0,
                                                m_tiling_size, K0],
                                               name="A_tiling_l1",
                                               scope=tik.scope_cbuf)
                    tik_instance.data_move(inputa_l1,
                                           inputa_gm[k_idx *
                                                     k_tiling_size // K0,
                                           m_idx * m_tiling_size, :],
                                           0, k_tiling_size // K0, m_tiling_size,
                                           m_size - m_tiling_size, 0)
                    inputb_l1 = tik_instance.Tensor(params["data_type"],
                                               [k_tiling_size // K0,
                                                n_tiling_size, K0],
                                               name="B_tiling_l1",
                                               scope=tik.scope_cbuf)
                    if n_size - n_tiling_size > 65535:
                        with tik_instance.for_range(0, k_tiling_size // K0) \
                                as dma_k_idx:
                            tik_instance.data_move(inputb_l1[dma_k_idx, :, :],
                                                   inputb_gm[k_idx *
                                                             k_tiling_size //
                                                             K0 + dma_k_idx,
                                                   (core_id * n_cycle_times // 2 +
                                                    n_idx) * n_tiling_size, :],
                                                    0, 1, n_tiling_size, 0, 0)
                    else:
                        tik_instance.data_move(inputb_l1,
                                               inputb_gm[k_idx *
                                                         k_tiling_size // K0,
                                               (core_id * n_cycle_times // 2 +
                                                n_idx) * n_tiling_size, :], 0,
                                               k_tiling_size // K0,
                                               n_tiling_size,
                                               n_size - n_tiling_size, 0)
                    # Call the matmul API to perform the matrix multiplication;
                    # initialize L0C only on the first K iteration.
                    with tik_instance.if_scope(k_idx == 0):
                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1,
                                            m_tiling_size,
                                            k_tiling_size, n_tiling_size,
                                            init_l1out=True)
                    with tik_instance.else_scope():
                        tik_instance.matmul(dst_l0c, inputa_l1, inputb_l1,
                                            m_tiling_size,
                                            k_tiling_size, n_tiling_size,
                                            init_l1out=False)
                # Move the computation result from L0C to GM.
                tik_instance.fixpipe(
                    output_gm[n_tiling_size // 16 *
                              (core_id * n_cycle_times // 2 + n_idx),
                              m_idx * m_tiling_size, :], dst_l0c,
                    n_tiling_size // 16,
                    m_tiling_size * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
                    (m_size - m_tiling_size) * 16 *
                    DTYPE_SIZE[C_loc_out_type] // 32, 0)

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[inputa_gm, inputb_gm], outputs=[output_gm])
    return tik_instance
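
A hypothetical invocation of this kernel might look like the sketch below. The parameter values are illustrative only (they are not taken from the original sample) and assume M, K, N are exact multiples of the corresponding tiling size times cycle count:

# Hypothetical usage; values chosen so that the tiling covers the matrices exactly.
params = {
    "M": 256, "K": 256, "N": 256,
    "data_type": "float16",
    "m_tiling_size": 64, "n_tiling_size": 64, "k_tiling_size": 64,
    "m_cycle_times": 4, "n_cycle_times": 4, "k_cycle_times": 4,
    "m_thread_num": 1, "n_thread_num": 1, "k_thread_num": 1,
}
tik_instance = matmul_tik_compute(params, "matmul_tik")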
Example #3
import numpy as np

from te import tik
from te.platform.cce_conf import te_set_l2_mode

# Byte size of each data type, used when computing burst lengths in 32-byte units.
DTYPE_SIZE = {"int8": 1, "uint8": 1, "float16": 2, "float32": 4, "int32": 4}


def ceil_div(value, factor):
    """Round up integer division."""
    return (value + factor - 1) // factor


def conv2d_tik_compute(params):
    te_set_l2_mode(1)
    tik_instance = tik.Tik(tik.Dprofile(params["arch"], params["version"]),
                           err_msg_level=1)
    n, c1, h, w, c0 = params["fm_shape"]
    c1, kh, kw, cout, c0 = params["weight_shape"]
    stride_h, stride_w = params["stride_list"]
    dilation_h, dilation_w = params["dilation_list"]
    pad_top, pad_bot, pad_left, pad_right = params["pad_list"]
    kh_dilation = (kh - 1) * dilation_h + 1
    kw_dilation = (kw - 1) * dilation_w + 1
    ho = int(np.ceil((h + pad_top + pad_bot - kh_dilation + 1) / stride_h))
    wo = int(np.ceil((w + pad_right + pad_left - kw_dilation + 1) / stride_w))
    round_howo = ceil_div(ho * wo, 16) * 16

    fm_gm = tik_instance.Tensor(params['fm_dtype'], (n, c1, h, w, c0),
                                name='fm_gm',
                                scope=tik.scope_gm)
    weight_gm = tik_instance.Tensor(params['weight_type'],
                                    (c1, kh, kw, cout, c0),
                                    name='weight_gm',
                                    scope=tik.scope_gm)

    if params['dst_gm_type'] in ("int8", "uint8"):
        dst_gm = tik_instance.Tensor(params['dst_gm_type'],
                                     [n, cout // 32, ho, wo, 32],
                                     name='dst_gm',
                                     scope=tik.scope_gm)
    else:
        dst_gm = tik_instance.Tensor(params['dst_gm_type'],
                                     [n, cout // 16, ho, wo, 16],
                                     name='dst_gm',
                                     scope=tik.scope_gm)

    core_num = 2
    pre_core_cout = cout // core_num
    cout_iter_num = pre_core_cout // params["cout_split_factor"]
    Cin_blocks = c1

    with tik_instance.for_range(0, core_num, block_num=core_num) as cout_o:
        with tik_instance.for_range(0, cout_iter_num, thread_num=1) as cout_i:
            weight_L1 = tik_instance.Tensor(
                params['weight_type'],
                (Cin_blocks, kh, kw, params["cout_split_factor"], c0),
                name='weight_l1',
                scope=tik.scope_cbuf)
            tik_instance.data_move(
                weight_L1,
                weight_gm.flatten()[cout_o * pre_core_cout * c0 +
                                    params["cout_split_factor"] * cout_i * c0],
                0, Cin_blocks * kh * kw, params["cout_split_factor"],
                (cout - params["cout_split_factor"]), 0)

            with tik_instance.for_range(0, n, thread_num=2) as n_index:
                feature_map_l1 = tik_instance.Tensor(params['fm_dtype'],
                                                     (c1, h, w, c0),
                                                     name='feature_map_l1',
                                                     scope=tik.scope_cbuf)
                tik_instance.data_move(feature_map_l1,
                                       fm_gm[n_index, :, :, :, :], 0, 1,
                                       c1 * h * w, 0, 0)
                dst_l0c = tik_instance.Tensor(
                    params['dst_l0c_type'],
                    [params["cout_split_factor"] // 16, round_howo, 16],
                    name='dst_l0c',
                    scope=tik.scope_cbuf_out)

                tik_instance.conv2d(
                    dst_l0c, feature_map_l1, weight_L1, (c1, h, w, c0),
                    (Cin_blocks, kh, kw, params["cout_split_factor"], c0),
                    params['stride_list'], params['pad_list'],
                    params['dilation_list'], params['pad_value'])

                tik_instance.fixpipe(
                    dst_gm[n_index, (cout_o * pre_core_cout +
                                     params["cout_split_factor"] * cout_i) //
                           (32 // DTYPE_SIZE[params['dst_gm_type']]), 0, 0, 0],
                    dst_l0c,
                    params["cout_split_factor"] // 16,
                    ho * wo * 16 * DTYPE_SIZE[params['dst_l0c_type']] // 32,
                    0,
                    0,
                    extend_params={
                        "bias": None,
                        "quantize_params": params["quantize_params"]
                    })

    tik_instance.BuildCCE(kernel_name=params["kernel_name"],
                          inputs=[fm_gm, weight_gm],
                          outputs=[dst_gm])

    return tik_instance
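
A hypothetical invocation of this kernel might look like the sketch below. All shapes, dtypes, and the quantize_params value are illustrative assumptions, not values from the original sample:

# Hypothetical usage; a small float16 convolution on the Ascend 310 ("v100"/"mini").
params = {
    "arch": "v100", "version": "mini",
    "fm_shape": (2, 4, 28, 28, 16),      # N, C1, H, W, C0
    "weight_shape": (4, 3, 3, 32, 16),   # C1, KH, KW, Cout, C0
    "fm_dtype": "float16",
    "weight_type": "float16",
    "dst_l0c_type": "float32",
    "dst_gm_type": "float16",
    "stride_list": [1, 1],
    "dilation_list": [1, 1],
    "pad_list": [0, 0, 0, 0],
    "pad_value": 0,
    "cout_split_factor": 16,
    "quantize_params": {"mode": "fp322fp16", "mode_param": None},
    "kernel_name": "conv2d_tik",
}
tik_instance = conv2d_tik_compute(params)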
Example #4
from te import tik
from te.platform.cce_conf import te_set_l2_mode

# Byte size of each data type, used when computing burst lengths in 32-byte units.
DTYPE_SIZE = {"int8": 1, "uint8": 1, "float16": 2, "float32": 4, "int32": 4}


def matmul_tik_compute(params, kernel_name, new_ws=None, cnt=0):
    te_set_l2_mode(1)
    tik_instance = tik.Tik(tik.Dprofile('v100', 'mini'), err_msg_level=1)
    if not isinstance(params, dict):
        params = params.__dict__
    m, k, n = params['M'], params['K'], params['N']
    data_type = params["data_type"]
    m_tiling_size = int(params["m_tiling_size"])
    n_tiling_size = int(params["n_tiling_size"])
    k_tiling_size = int(params['k_tiling_size'])

    m_cycle_times = params["m_cycle_times"]
    n_cycle_times = params["n_cycle_times"]
    k_cycle_times = params["k_cycle_times"]

    if data_type == "float16":
        C_loc_out_type = "float32"
        K0 = 16
    else:
        C_loc_out_type = "int32"
        K0 = 32
    block_size = 16

    n_thread_num = params['n_thread_num']
    m_thread_num = params['m_thread_num']
    k_thread_num = params['k_thread_num']

    C_gm = tik_instance.Tensor(C_loc_out_type,
                               (n // block_size, m, block_size),
                               name="C_gm",
                               scope=tik.scope_gm)
    A_gm = tik_instance.Tensor(params["data_type"], (k // K0, m, K0),
                               name="A_gm",
                               scope=tik.scope_gm)
    B_gm = tik_instance.Tensor(params["data_type"], (k // K0, n, K0),
                               name="B_gm",
                               scope=tik.scope_gm)

    with tik_instance.for_range(0, 2, block_num=2) as core_id:
        with tik_instance.for_range(0,
                                    n_cycle_times // 2,
                                    thread_num=n_thread_num) as n_idx:
            with tik_instance.for_range(0,
                                        m_cycle_times,
                                        thread_num=m_thread_num) as m_idx:
                dst_l0c = tik_instance.Tensor(
                    C_loc_out_type, [n_tiling_size // 16, m_tiling_size, 16],
                    name='dst_l0c',
                    scope=tik.scope_cbuf_out)
                with tik_instance.for_range(0,
                                            k_cycle_times,
                                            thread_num=k_thread_num) as k_idx:
                    A_l1 = tik_instance.Tensor(
                        params['data_type'],
                        [k_tiling_size // K0, m_tiling_size, K0],
                        name="A_tiling_l1",
                        scope=tik.scope_cbuf)
                    tik_instance.data_move(
                        A_l1, A_gm[k_idx * k_tiling_size // K0,
                                   m_idx * m_tiling_size, :], 0,
                        k_tiling_size // K0, m_tiling_size, m - m_tiling_size,
                        0)
                    B_l1 = tik_instance.Tensor(
                        params["data_type"],
                        [k_tiling_size // K0, n_tiling_size, K0],
                        name="B_tiling_l1",
                        scope=tik.scope_cbuf)
                    if n - n_tiling_size > 65535:
                        with tik_instance.for_range(0, k_tiling_size //
                                                    K0) as dma_k_idx:
                            tik_instance.data_move(
                                B_l1[dma_k_idx, :, :],
                                B_gm[k_idx * k_tiling_size // K0 + dma_k_idx,
                                     (core_id * n_cycle_times // 2 + n_idx) *
                                     n_tiling_size, :], 0, 1, n_tiling_size, 0,
                                0)
                    else:
                        tik_instance.data_move(
                            B_l1, B_gm[k_idx * k_tiling_size // K0,
                                       (core_id * n_cycle_times // 2 + n_idx) *
                                       n_tiling_size, :], 0,
                            k_tiling_size // K0, n_tiling_size,
                            n - n_tiling_size, 0)
                    with tik_instance.if_scope(k_idx == 0):
                        tik_instance.matmul(dst_l0c,
                                            A_l1,
                                            B_l1,
                                            m_tiling_size,
                                            k_tiling_size,
                                            n_tiling_size,
                                            init_l1out=True)
                    with tik_instance.else_scope():
                        tik_instance.matmul(dst_l0c,
                                            A_l1,
                                            B_l1,
                                            m_tiling_size,
                                            k_tiling_size,
                                            n_tiling_size,
                                            init_l1out=False)
                tik_instance.fixpipe(
                    C_gm[n_tiling_size // 16 *
                         (core_id * n_cycle_times // 2 + n_idx),
                         m_idx * m_tiling_size, :], dst_l0c,
                    n_tiling_size // 16,
                    m_tiling_size * 16 * DTYPE_SIZE[C_loc_out_type] // 32,
                    (m - m_tiling_size) * 16 * DTYPE_SIZE[C_loc_out_type] //
                    32, 0)

    tik_instance.BuildCCE(kernel_name=kernel_name,
                          inputs=[A_gm, B_gm],
                          outputs=[C_gm])
    return tik_instance
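
The A_gm tensor in these matmul examples has shape (K // K0, M, K0). Assuming the usual K1MK0 fractal convention, i.e. A[m, k] maps to A_gm[k // K0, m, k % K0] (an assumption; the samples do not spell the layout out), a host-side row-major matrix can be repacked like this:

import numpy as np

def to_k1mk0(a, k0=16):
    # Pack a row-major (M, K) matrix into (K // K0, M, K0):
    # element a[m, k] ends up at [k // k0, m, k % k0].
    m, k = a.shape
    return a.reshape(m, k // k0, k0).transpose(1, 0, 2).copy()

a_host = np.random.rand(64, 128).astype("float16")
a_fractal = to_k1mk0(a_host)   # shape (8, 64, 16), matching A_gm's expected layout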