Example #1
def conv2d_direct_simd_compute(cfg, data, kernel, strides, padding, dilation,
                               out_dtype):
    """Compute function for Cortex-M7 SIMD implementation of conv2d."""
    assert isinstance(strides, int) or len(strides) == 2
    assert isinstance(dilation, int) or len(dilation) == 2

    if isinstance(strides, int):
        stride_h = stride_w = strides
    else:
        stride_h, stride_w = strides

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    batch_size, in_height, in_width, in_channels = data.shape
    kernel_h, kernel_w, out_channels, _ = kernel.shape

    # compute the output shape
    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    out_height = simplify(
        (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
    out_width = simplify(
        (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)

    pad_before = [0, pad_top, pad_left, 0]
    pad_after = [0, pad_down, pad_right, 0]
    padded_data = pad(data, pad_before, pad_after, name="padded_data")

    rc = te.reduce_axis((0, in_channels), name="rc")
    ry = te.reduce_axis((0, kernel_h), name="ry")
    rx = te.reduce_axis((0, kernel_w), name="rx")

    conv = te.compute(
        (batch_size, out_height, out_width, out_channels),
        lambda nn, yy, xx, ff: te.sum(
            padded_data[
                nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc
            ].astype(out_dtype)
            * kernel[ry, rx, ff, rc].astype(out_dtype),
            axis=[ry, rx, rc],
        ),
        name="conv2d",
        tag="conv2d_nhwc",
    )

    ###########################
    # Config Space Definition #
    ###########################
    n, oh, ow, co = (
        cfg.axis(batch_size.value),
        cfg.axis(out_height.value),
        cfg.axis(out_width.value),
        cfg.axis(out_channels.value),
    )
    kh, kw, ci = (
        cfg.reduce_axis(kernel_h.value),
        cfg.reduce_axis(kernel_w.value),
        cfg.reduce_axis(in_channels.value),
    )

    assert in_channels.value % 4 == 0
    owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2)
    cio, cii = cfg.define_split("tile_ci",
                                ci,
                                policy="factors",
                                num_outputs=2,
                                filter=lambda x: x.size[-1] % 4 == 0)
    coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2)

    cfg.define_reorder(
        "reorder_0_simd",
        [n, oh, owo, owi, coo, coi, kh, kw, cio, cii],
        policy="candidate",
        candidate=[
            [n, oh, kh, kw, owo, coo, cio, owi, coi, cii],
            [n, oh, kh, kw, coo, owo, cio, owi, coi, cii],
            [n, kh, kw, oh, owo, coo, cio, owi, coi, cii],
            [n, kh, kw, oh, coo, owo, cio, owi, coi, cii],
        ],
    )

    cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32])
    cfg.define_knob("unroll_explicit", [0, 1])

    return conv
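
A quick, self-contained check of the output-size formula used above (illustrative values, not part of the original operator):

# Hypothetical numbers: 32x32 input, 3x3 kernel, stride 1, dilation 1, padding 1.
in_height, kernel_h, dilation_h, stride_h = 32, 3, 1, 1
pad_top = pad_down = 1
dilated_kernel_h = (kernel_h - 1) * dilation_h + 1  # 3
out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1
print(out_height)  # 32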
Example #2
def test_tensor_core_batch_matmul():
    batch_size = 4
    n = 512
    m, l = n, n
    assert n % 32 == 0
    assert m % 8 == 0
    assert l % 16 == 0
    nn, mm, ll = n // 32, m // 8, l // 16
    A = te.placeholder((batch_size, nn, ll, 32, 16), name="A", dtype="float16")
    B = te.placeholder((batch_size, ll, mm, 16, 8), name="B", dtype="float16")
    k1 = te.reduce_axis((0, ll), name="k1")
    k2 = te.reduce_axis((0, 16), name="k2")
    C = te.compute(
        (batch_size, nn, mm, 32, 8),
        lambda b, i, j, ii, jj: te.sum(
            A[b, i, k1, ii, k2].astype("float") * B[b, k1, j, k2, jj].astype("float"), axis=[k1, k2]
        ),
        name="Fragment_C",
    )
    s = te.create_schedule(C.op)

    warp_size = 32
    kernel_size = 16
    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    chunk = 4

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    AS = s.cache_read(A, "shared", [C])
    BS = s.cache_read(B, "shared", [C])
    AF = s.cache_read(AS, "wmma.matrix_a", [C])
    BF = s.cache_read(BS, "wmma.matrix_b", [C])
    CF = s.cache_write(C, "wmma.accumulator")

    b, i, j, kernel_i, kernel_j = s[C].op.axis
    i, ii = s[C].split(i, factor=warp_row_tiles)
    block_i, i = s[C].split(i, factor=block_row_warps)
    j, jj = s[C].split(j, factor=warp_col_tiles)
    block_j, j = s[C].split(j, factor=block_col_warps)
    s[C].reorder(block_i, block_j, i, j, ii, jj, kernel_i, kernel_j)
    s[C].bind(b, block_z)
    s[C].bind(block_i, block_x)
    s[C].bind(block_j, block_y)
    s[C].bind(i, thread_y)
    s[C].bind(j, thread_z)

    s[CF].compute_at(s[C], j)
    b, warp_i, warp_j, _i, _j = s[CF].op.axis
    k, _k = CF.op.reduce_axis
    ko, ki = s[CF].split(k, factor=chunk)
    s[CF].reorder(ko, ki, warp_i, warp_j, _i, _j, _k)

    s[AF].compute_at(s[CF], ki)
    s[BF].compute_at(s[CF], ki)

    s[AS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = AS.op.axis
    tx, xo = s[AS].split(xo, nparts=block_row_warps)
    ty, yo = s[AS].split(yo, nparts=block_col_warps)
    t = s[AS].fuse(xi, yi)
    to, ti = s[AS].split(t, nparts=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(to, thread_x)

    s[BS].compute_at(s[CF], ko)
    b, xo, yo, xi, yi = BS.op.axis
    tx, xo = s[BS].split(xo, nparts=block_row_warps)
    ty, yo = s[BS].split(yo, nparts=block_col_warps)
    t = s[BS].fuse(xi, yi)
    to, ti = s[BS].split(t, nparts=warp_size)
    s[BS].bind(tx, thread_y)
    s[BS].bind(ty, thread_z)
    s[BS].bind(to, thread_x)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_a"))
    s[BF].tensorize(BF.op.axis[-2], intrin_wmma_load_matrix((32, 8, 16), "wmma.matrix_b"))
    s[C].tensorize(kernel_i, intrin_wmma_store_matrix((32, 8, 16)))
    s[CF].tensorize(_i, intrin_wmma_gemm((32, 8, 16)))

    func = tvm.build(s, [A, B, C], "cuda")

    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=(batch_size, nn, ll, 32, 16)).astype(A.dtype)
    b_np = np.random.uniform(size=(batch_size, ll, mm, 16, 8)).astype(B.dtype)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    c = tvm.nd.array(np.zeros((batch_size, nn, mm, 32, 8), dtype=C.dtype), dev)
    func(a, b, c)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("gemm with tensor core: %f ms" % (evaluator(a, b, c).mean * 1e3))

    if VERIFY:
        func(a, b, c)
        a_np = a_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        b_np = b_np.transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        c_np = c.asnumpy().transpose((0, 1, 3, 2, 4)).reshape(batch_size, n, n)
        np.testing.assert_allclose(
            c_np, np.matmul(a_np.astype(C.dtype), b_np.astype(C.dtype)), rtol=1e-4, atol=1e-4
        )
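
The verification works because the (batch, nn, ll, 32, 16) fragment layout is simply a blocked view of a (batch, n, n) matrix. A minimal NumPy sketch of that packing and unpacking, with assumed sizes:

import numpy as np

batch, n = 2, 64
dense = np.random.rand(batch, n, n).astype("float16")
packed = dense.reshape(batch, n // 32, 32, n // 16, 16).transpose(0, 1, 3, 2, 4)  # (2, 2, 4, 32, 16)
unpacked = packed.transpose(0, 1, 3, 2, 4).reshape(batch, n, n)
assert (dense == unpacked).all()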
Example #3
def intrin_wmma_gemm(shape):
    n, m, l = shape
    A = te.placeholder((n, l), name="A", dtype="float16")
    B = te.placeholder((l, m), name="B", dtype="float16")
    k = te.reduce_axis((0, l), name="k")
    C = te.compute(
        (n, m),
        lambda ii, jj: te.sum(A[ii, k].astype("float") * B[k, jj].astype("float"), axis=k),
        name="C",
    )
    BA = tvm.tir.decl_buffer(
        A.shape, A.dtype, name="BA", scope="wmma.matrix_a", data_alignment=32, offset_factor=n * l
    )
    BB = tvm.tir.decl_buffer(
        B.shape, B.dtype, name="BB", scope="wmma.matrix_b", data_alignment=32, offset_factor=l * m
    )
    BC = tvm.tir.decl_buffer(
        C.shape,
        C.dtype,
        name="BC",
        scope="wmma.accumulator",
        data_alignment=32,
        offset_factor=n * m,
    )

    def intrin_func(ins, outs):
        BA, BB = ins
        (BC,) = outs

        def init():
            ib = tvm.tir.ir_builder.create()
            ib.emit(
                tvm.tir.call_intrin(
                    "handle",
                    "tir.tvm_fill_fragment",
                    BC.data,
                    n,
                    m,
                    l,
                    BC.elem_offset // (n * m),
                    0.0,
                )
            )
            return ib.get()

        def update():
            ib = tvm.tir.ir_builder.create()
            ib.emit(
                tvm.tir.call_intrin(
                    "handle",
                    "tir.tvm_mma_sync",
                    BC.data,
                    BC.elem_offset // (n * m),
                    BA.data,
                    BA.elem_offset // (n * l),
                    BB.data,
                    BB.elem_offset // (l * m),
                    BC.data,
                    BC.elem_offset // (n * m),
                )
            )
            return ib.get()

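        # The tuple below maps to (body, reduce_reset, reduce_update) as expected by te.decl_tensor_intrin.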
        return update(), init(), update()

    return te.decl_tensor_intrin(C.op, intrin_func, binds={A: BA, B: BB, C: BC})
Example #4
    def _schedule(cfg, s, C):
        A, B = s[C].op.input_tensors
        batch, m_dim, k_dim = get_const_tuple(A.shape)
        batch, n_dim, k_dim = get_const_tuple(B.shape)
        out_dtype = C.dtype
        # inline astype fp16
        s[A].compute_inline()
        s[B].compute_inline()

        # Explicit memory access
        AS = s.cache_read(A, "shared", [C])
        BS = s.cache_read(B, "shared", [C])
        AF = s.cache_read(AS, "wmma.matrix_a", [C])
        BF = s.cache_read(BS, "wmma.matrix_b", [C])
        CF = s.cache_write(C, "wmma.accumulator")
        CS = s.cache_read(CF, "shared", [C])

        # fallback support
        target = tvm.target.Target.current()
        if cfg.is_fallback:
            ref_log = autotvm.tophub.load_reference_log(
                target.kind.name, target.model, "batch_matmul_tensorcore.cuda")
            cfg.fallback_with_reference_log(ref_log)

        # Deal with op fusion, such as bias/relu and slice after padding
        if C.op not in s.outputs and "injective" in s.outputs[0].tag:
            s[C].compute_inline()
            C = s.outputs[0].output(0)

        # create tuning space
        cfg.define_knob("block_row_warps", [1, 2, 4])
        cfg.define_knob("block_col_warps", [1, 2, 4])
        cfg.define_knob("warp_row_tiles", [1, 2, 4])
        cfg.define_knob("warp_col_tiles", [1, 2, 4])
        cfg.define_knob("chunk", [1, 2, 4, 8])
        cfg.define_knob("offset", [0, 8])
        cfg.define_knob("offsetCS", [0, 8])
        cfg.define_knob("vec", [1, 2, 4, 8])

        # Ensure that the default parameters are applicable when autotvm is not in use
        if m_dim % 32 == 0 and n_dim % 8 == 0:
            cfg.define_knob("wmma_m", [32, 16, 8])
        elif m_dim % 16 == 0 and n_dim % 16 == 0:
            cfg.define_knob("wmma_m", [16, 8, 32])
        elif m_dim % 8 == 0 and n_dim % 32 == 0:
            cfg.define_knob("wmma_m", [8, 16, 32])

        warp_size = 32
        wmma_k = 16
        block_row_warps = cfg["block_row_warps"].val
        block_col_warps = cfg["block_col_warps"].val
        warp_row_tiles = cfg["warp_row_tiles"].val
        warp_col_tiles = cfg["warp_col_tiles"].val
        chunk = cfg["chunk"].val
        offset = cfg["offset"].val
        offsetCS = cfg["offsetCS"].val
        wmma_m = cfg["wmma_m"].val
        vec = cfg["vec"].val

        if wmma_m == 16:
            wmma_n = 16
        elif wmma_m == 8:
            wmma_n = 32
        elif wmma_m == 32:
            wmma_n = 8
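        # Together with wmma_k = 16, these (wmma_m, wmma_n) pairs are the three fp16 WMMA
        # fragment shapes supported by Tensor Cores: 16x16x16, 8x32x16 and 32x8x16.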

        # Define the stride of intrin functions
        AS_align = chunk * wmma_k + offset
        BS_align = chunk * wmma_k + offset
        CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS
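        # The extra "offset"/"offsetCS" columns pad each shared-memory row so that consecutive
        # rows start at different banks, which helps avoid shared-memory bank conflicts.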
        AS_stride = [AS_align, 1]
        BS_stride = [BS_align, 1]
        AF_stride = [wmma_k, 1]
        BF_stride = [wmma_k, 1]
        CF_stride = [warp_col_tiles * wmma_n, 1]
        CS_stride = [CS_align, 1]

        block_x = te.thread_axis("blockIdx.x")
        block_y = te.thread_axis("blockIdx.y")
        block_z = te.thread_axis("blockIdx.z")
        thread_x = te.thread_axis("threadIdx.x")
        thread_y = te.thread_axis("threadIdx.y")
        thread_z = te.thread_axis("threadIdx.z")

        # Schedule for batch_matmul computation
        block_factor_m = wmma_m * warp_row_tiles * block_row_warps
        block_factor_n = wmma_n * warp_col_tiles * block_col_warps
        b, m, n = C.op.axis
        block_i, bc = s[C].split(m, factor=block_factor_m)
        block_j, oc = s[C].split(n, factor=block_factor_n)
        s[C].reorder(b, block_i, block_j, bc, oc)
        t = s[C].fuse(bc, oc)
        t, vi = s[C].split(t, factor=vec)
        t, tx = s[C].split(t, factor=warp_size)
        t, ty = s[C].split(t, factor=block_row_warps)
        t, tz = s[C].split(t, factor=block_col_warps)
        s[C].bind(block_i, block_x)
        s[C].bind(block_j, block_y)
        s[C].bind(b, block_z)
        s[C].bind(tz, thread_z)
        s[C].bind(ty, thread_y)
        s[C].bind(tx, thread_x)
        s[C].vectorize(vi)

        # Schedule for wmma store
        s[CS].compute_at(s[C], block_j)
        bs, bb, oo = CS.op.axis
        s[CS].storage_align(bb, CS_align - 1, CS_align)
        bb, bbi = s[CS].split(bb, factor=wmma_m)
        oo, ooi = s[CS].split(oo, factor=wmma_n)
        bb, bbii = s[CS].split(bb, factor=warp_row_tiles)
        oo, ooii = s[CS].split(oo, factor=warp_col_tiles)
        s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi)

        # Schedule for wmma computation
        s[CF].compute_at(s[CS], oo)
        bs, warp_i, warp_j = CF.op.axis
        warp_i, _ii = s[CF].split(warp_i, factor=wmma_m)
        warp_j, _jj = s[CF].split(warp_j, factor=wmma_n)
        (k, ) = CF.op.reduce_axis
        k, _k = s[CF].split(k, factor=wmma_k)
        ko, ki = s[CF].split(k, factor=chunk)
        s[CF].reorder(bs, ko, ki, warp_i, warp_j, _ii, _jj, _k)

        # Schedule for wmma_matrix_a load
        s[AF].compute_at(s[CF], ki)
        bs, b, i = AF.op.axis
        b, b_ii = s[AF].split(b, factor=wmma_m)
        i, i_jj = s[AF].split(i, factor=wmma_k)
        s[AF].reorder(bs, b, i, b_ii, i_jj)

        # Schedule for wmma_matrix_b load
        s[BF].compute_at(s[CF], ki)
        bs, o, i = BF.op.axis
        o, o_ii = s[BF].split(o, factor=wmma_n)
        i, i_ii = s[BF].split(i, factor=wmma_k)
        s[BF].reorder(bs, o, i, o_ii, i_ii)

        # Schedule for A's and B's shared memory load
        def shared_schedule(stage, strides):
            s[stage].compute_at(s[CF], ko)
            bs, xo, yo = stage.op.axis
            s[stage].storage_align(xo, strides - 1, strides)
            t = s[stage].fuse(xo, yo)
            t, vi = s[stage].split(t, factor=vec)
            t, tx = s[stage].split(t, factor=warp_size)
            t, ty = s[stage].split(t, factor=block_row_warps)
            _, tz = s[stage].split(t, factor=block_col_warps)
            s[stage].bind(ty, thread_y)
            s[stage].bind(tz, thread_z)
            s[stage].bind(tx, thread_x)
            s[stage].vectorize(vi)

        shared_schedule(AS, AS_align)
        shared_schedule(BS, BS_align)

        shape = (wmma_m, wmma_n, wmma_k)
        # TODO: add checking here, datatype casting may cause precision loss
        in_dtype = "float16"
        AL_gemm = te.placeholder((wmma_m, wmma_k),
                                 name="AL_gemm",
                                 dtype=in_dtype)
        BL_gemm = te.placeholder((wmma_n, wmma_k),
                                 name="BL_gemm",
                                 dtype=in_dtype)
        k_gemm = te.reduce_axis((0, wmma_k), name="k_gemm")
        CL_compute = te.compute(
            (wmma_m, wmma_n),
            lambda ii, jj: te.sum(
                AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].
                astype(out_dtype),
                axis=k_gemm,
            ),
            name="CL_compute",
        )

        # lower the computation loops down to TensorCore hardware intrinsics
        # by mapping the dense tensorcore to tensor intrinsics
        s[AF].tensorize(
            b_ii,
            intrin_wmma_load_matrix_A(
                AF_stride,
                AS_stride,
                shape,
                "row_major",
                (wmma_m, wmma_k),
                (wmma_m, wmma_k),
                "float16",
            ),
        )
        s[BF].tensorize(
            o_ii,
            intrin_wmma_load_matrix_W(
                BF_stride,
                BS_stride,
                shape,
                "col_major",
                (wmma_n, wmma_k),
                (wmma_n, wmma_k),
                "float16",
            ),
        )
        s[CF].tensorize(
            _ii,
            intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride,
                             BF_stride, CF_stride, shape),
        )
        s[CS].tensorize(
            bbi,
            intrin_wmma_store_matrix(CS_stride, CF_stride, shape, out_dtype,
                                     (wmma_m, wmma_n), (wmma_m, wmma_n)),
        )
Example #5
def conv2d_spatial_pack_nhwc(cfg, data, kernel, strides, padding, dilation, out_dtype):
    """Spatial pack compute for Conv2d NHWC"""
    out_dtype = out_dtype or data.dtype

    N, IH, IW, IC = get_const_tuple(data.shape)
    assert len(kernel.shape) == 4, "AlterOpLayout not enabled for NHWC yet"
    KH, KW, _, OC = get_const_tuple(kernel.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = \
        get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)

    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    data_pad = nn.pad(data, [0, pad_top, pad_left, 0], [0, pad_down, pad_right, 0])

    # ==================== define configuration space ====================
    n, oc, oh, ow = cfg.axis(N), cfg.axis(OC), cfg.axis(OH), cfg.axis(OW)
    ic, kh, kw = cfg.reduce_axis(IC), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    oco, oci = cfg.define_split('tile_co', oc, num_outputs=2)
    oho, ohi = cfg.define_split('tile_oh', oh, num_outputs=2)
    owo, owi = cfg.define_split('tile_ow', ow, num_outputs=2)

    cfg.define_reorder('reorder_conv',
                       [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci],
                       policy='candidate', candidate=[
                           [n, oho, owo, oco, kh, kw, ic, ohi, owi, oci],
                           [n, oho, owo, oco, ohi, kh, kw, ic, owi, oci],
                           [n, oho, owo, oco, ohi, kh, kw, owi, ic, oci],
                           [n, oho, owo, ohi, oco, kh, kw, owi, ic, oci]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [ohi, owi, oci], policy='try_unroll_vec')
    # ====================================================================

    OCI = cfg['tile_co'].size[-1]
    OHI = cfg['tile_oh'].size[-1]
    OWI = cfg['tile_ow'].size[-1]
    OCO = OC // OCI
    OHO = OH // OHI
    OWO = OW // OWI

    kvshape = (OCO, KH, KW, IC, OCI)
    ovshape = (N, OHO, OWO, OCO, OHI, OWI, OCI)
    oshape = (N, OH, OW, OC)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OHO, OWO, KH, KW, IC, OHI, OWI)
        data_vec = te.compute(dvshape, lambda n, oho, owo, kh, kw, ic, ohi, owi:
                              data_pad[n][(oho*OHI+ohi)*HSTR+kh*dilation_h]
                              [(owo*OWI+owi)*WSTR+kw*dilation_w][ic],
                              name='data_vec_undilated')
    else:
        dvshape = (N, OHO, OWO, KH + (OHI-1)*HSTR, KW + (OWI-1)*WSTR, IC)
        data_vec = te.compute(dvshape, lambda n, oho, owo, ohi, owi, ic:
                              data_pad[n][oho*OHI*HSTR+ohi][owo*OWI*WSTR+owi][ic],
                              name='data_vec')
    kernel_vec = te.compute(kvshape, lambda oco, kh, kw, ic, oci: \
                            kernel[kh][kw][ic][oco*OCI+oci],
                            name='kernel_vec')

    ic = te.reduce_axis((0, IC), name='ic')
    kh = te.reduce_axis((0, KH), name='kh')
    kw = te.reduce_axis((0, KW), name='kw')

    if dilation_h != 1 or dilation_w != 1:
        conv = te.compute(ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \
                          te.sum(data_vec[n, oho, owo, kh, kw, ohi, owi, ic].astype(out_dtype) *
                                 kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype),
                                 axis=[ic, kh, kw]), name='conv')
    else:
        conv = te.compute(
            ovshape, lambda n, oho, owo, oco, ohi, owi, oci: \
            te.sum(data_vec[n, oho, owo, ohi*HSTR+kh, owi*WSTR+kw, ic].astype(out_dtype) *
                   kernel_vec[oco, kh, kw, ic, oci].astype(out_dtype),
                   axis=[ic, kh, kw]), name='conv')

    idiv = tvm.tir.indexdiv
    imod = tvm.tir.indexmod
    output = te.compute(oshape, lambda n, oho, owo, oc:
                        conv[n][idiv(oho, OHI)][idiv(owo, OWI)][idiv(oc, OCI)]\
                        [imod(oho, OHI)][imod(owo, OWI)][imod(oc, OCI)],
                        name='output_unpack', tag='spatial_conv_output_NHWC')
    return output
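
The unpack step above is equivalent to a transpose plus reshape of the 7-D packed buffer. A small NumPy sketch with assumed tile sizes (not part of the operator):

import numpy as np

N, OHO, OWO, OCO, OHI, OWI, OCI = 1, 2, 2, 2, 4, 4, 8
packed = np.random.rand(N, OHO, OWO, OCO, OHI, OWI, OCI).astype("float32")
unpacked = packed.transpose(0, 1, 4, 2, 5, 3, 6).reshape(N, OHO * OHI, OWO * OWI, OCO * OCI)
# unpacked[n, oh, ow, oc] == packed[n, oh // OHI, ow // OWI, oc // OCI, oh % OHI, ow % OWI, oc % OCI]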
Example #6
    'dtype = "float32"\n'
    "a = numpy.random.rand(M, K).astype(dtype)\n"
    "b = numpy.random.rand(K, N).astype(dtype)\n",
    stmt="answer = numpy.dot(a, b)",
    number=np_repeat,
)
print("Numpy running time: %f" % (np_runing_time / np_repeat))

answer = numpy.dot(a.numpy(), b.numpy())

# Algorithm
k = te.reduce_axis((0, K), "k")
A = te.placeholder((M, K), name="A")
B = te.placeholder((K, N), name="B")
C = te.compute((M, N),
               lambda m, n: te.sum(A[m, k] * B[k, n], axis=k),
               name="C")

# Default schedule
s = te.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name="mmult")
assert func

c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
func(a, b, c)
tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)

evaluator = func.time_evaluator(func.entry_name, dev, number=1)
print("Baseline: %f" % evaluator(a, b, c).mean)

################################################################################################
Example #7
def group_conv2d_NCHWc_int8(
    cfg, data, kernel, stride, padding, dilation, groups, out_dtype="float32"
):
    """Group convolution operator for 'group_conv2d_NCHWc_int8'.

    Parameters
    ----------
    data : tvm.te.Tensor
        4-D with shape [batch, in_channel, in_height, in_width] or
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.te.Tensor
        4-D with shape [num_filter, in_channel // groups, filter_height, filter_width] or
        6-D with shape [num_filter_chunk, in_channel_chunk // groups, filter_height,
        filter_width, num_filter_block, in_channel_block]

    stride : int or a list/tuple of two ints
        Stride size, or [stride_height, stride_width]

    padding : int or str
        Padding size, or ['VALID', 'SAME']

    dilation : int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    groups : int
        number of groups

    out_dtype : str
        The output type. This is used for mixed precision.

    Returns
    -------
    Output : tvm.te.Tensor
        5-D with shape [batch, out_channel, out_height, out_width, out_channel_block]
    """
    ic_block_factor = 4
    oc_block_factor = 4

    pre_computed = len(kernel.shape) == 6
    if not pre_computed:
        batch, channels, height, width = get_const_tuple(data.shape)
        out_channels, in_channels, kernel_h, kernel_w = get_const_tuple(kernel.shape)

        assert channels % groups == 0, "input channels must divide group size"
        assert out_channels % groups == 0, "output channels must divide group size"
        assert (
            channels % ic_block_factor == 0
        ), "Number of input channels per group must divide {}".format(ic_block_factor)
        assert (
            out_channels % oc_block_factor == 0
        ), "Number of output channels per group must divide {}".format(oc_block_factor)

        packed_data = te.compute(
            (batch, channels // ic_block_factor, height, width, ic_block_factor),
            lambda n, c, h, w, vc: data[n, c * ic_block_factor + vc, h, w],
            name="packed_data",
        )
        packed_kernel = te.compute(
            (
                out_channels // oc_block_factor,
                in_channels // ic_block_factor,
                kernel_h,
                kernel_w,
                oc_block_factor,
                ic_block_factor,
            ),
            lambda oc_chunk, ic_chunk, kh, kw, oc_block, ic_block: kernel[
                oc_chunk * oc_block_factor + oc_block, ic_chunk * ic_block_factor + ic_block, kh, kw
            ],
            name="packed_kernel",
        )
    else:
        packed_data = data
        packed_kernel = kernel

    batch, ic_chunk, in_height, in_width, _ = get_const_tuple(packed_data.shape)
    oc_chunk, _, kernel_h, kernel_w, oc_block, ic_block = get_const_tuple(packed_kernel.shape)

    # TODO(kumasento): these assertions ensure that the number of groups
    # should be smaller or equal to the number of blocks, so that each
    # group will have at least one block.
    # Shall we pad the channels to avoid raising assertions?
    assert (
        groups <= oc_chunk
    ), "Number of groups {} should be less than " "output channel chunk size {}".format(
        groups, oc_chunk
    )
    assert (
        groups <= ic_chunk
    ), "Number of groups {} should be less than " "input channel chunk size {}".format(
        groups, ic_chunk
    )

    if isinstance(stride, int):
        stride_h = stride_w = stride
    else:
        stride_h, stride_w = stride

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    # pad the input data
    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
    pad_before = [0, 0, pad_top, pad_left, 0]
    pad_after = [0, 0, pad_down, pad_right, 0]
    pad_data = pad(packed_data, pad_before, pad_after, name="pad_data")

    # compute the output shape
    out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
    out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1

    oshape = (batch, oc_chunk, out_height, out_width, oc_block)

    icc = te.reduce_axis((0, ic_chunk // groups), name="ic_chunk")
    icb = te.reduce_axis((0, ic_block_factor), name="ic_block")
    kh = te.reduce_axis((0, kernel_h), name="kh")
    kw = te.reduce_axis((0, kernel_w), name="kw")

    # NOTE(kumasento): explanation of this snippet -
    # oc_chunk//groups and ic_chunk//groups give you the number of blocks,
    # i.e., chunk, per group.
    # occ is the ID of the output channel block, so that occ//(oc_chunk//groups)
    # produces the ID of the group.
    # Multiplying that result with ic_chunk//groups resulting in the ID
    # of the beginning block of the corresponding input group.
    # Adding the block offset (icc) will give you the exact block ID.
    #
    # Compared with a normal convolution, group convolution only sums
    # input channels from the group that an output channel resides in.
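    # For example (illustrative numbers): with groups=2, oc_chunk=8 and ic_chunk=8, output
    # block occ=5 belongs to group 5 // (8 // 2) = 1, whose input blocks start at
    # 1 * (8 // 2) = 4, so the reduction over icc covers input blocks 4..7 only.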
    conv = te.compute(
        oshape,
        lambda n, occ, oh, ow, ocb: te.sum(
            pad_data[
                n,
                occ // (oc_chunk // groups) * (ic_chunk // groups) + icc,
                oh * stride_h + kh * dilation_h,
                ow * stride_w + kw * dilation_w,
                icb,
            ].astype("int32")
            * packed_kernel[occ, icc, kh, kw, ocb, icb].astype("int32"),
            axis=[icc, kh, kw, icb],
        ),
    )

    # Type conversion
    output = te.compute(
        oshape, lambda *index: conv(*index).astype(out_dtype), tag="group_conv2d_NCHWc_int8"
    )

    num_flop = (
        batch
        * oc_chunk
        * oc_block
        * out_height
        * out_width
        * ic_chunk
        * ic_block
        * kernel_h
        * kernel_w
        * 2
        // groups
    )
    cfg.add_flop(num_flop)

    return output
Example #8
def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation,
                             out_dtype, num_tile):
    """compute define for Conv2d Spatial Pack with NCHW layout"""
    out_dtype = out_dtype or data.dtype
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(N, tvm.tir.Any):
        N = tvm.te.size_var("n")
    if not isinstance(IH, int) or not isinstance(IW, int):
        raise RuntimeError(
            "ARM winograd conv2d doesn't support dynamic input height or width."
        )

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        pre_packed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
        CO = CO * VC

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    data_pad = nn.pad(data, [0, 0, pad_top, pad_left],
                      [0, 0, pad_bottom, pad_right])

    # ==================== define configuration space ====================
    # TODO(@kevinthesun): Support tuning/optimization for dynamic shape.
    n_tuning_axis = N if isinstance(N, int) else 1
    n, co, oh, ow = cfg.axis(n_tuning_axis), cfg.axis(CO), cfg.axis(
        OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    if num_tile == 2:  # for arm cpu
        co, vc = cfg.define_split("tile_co", co, num_outputs=2)
        oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2)
        ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2)
    elif num_tile == 3:  # for mali gpu
        co, _, vc = cfg.define_split("tile_co", co, num_outputs=3)
        oh, _, vh = cfg.define_split("tile_oh", oh, num_outputs=3)
        ow, _, vw = cfg.define_split("tile_ow", ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder(
        "reorder_0",
        [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
        policy="candidate",
        candidate=[
            [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
            [n, co, oh, ow, ci, kh, kw, vc, vh, vw],
        ],
    )

    cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll")
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll_vec")

    # fallback support
    if cfg.is_fallback:
        if num_tile == 2:  # arm cpu
            ref_log = autotvm.tophub.load_reference_log(
                "arm_cpu", "rk3399", "conv2d_nchw_spatial_pack.arm_cpu")
            cfg.fallback_with_reference_log(ref_log)
        elif num_tile == 3:  # mali gpu
            ref_log = autotvm.tophub.load_reference_log(
                "mali", "rk3399", "conv2d_nchw_spatial_pack.mali")
            cfg.fallback_with_reference_log(ref_log)
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
        data_vec = te.compute(
            dvshape,
            lambda n, h, w, ci, kh, kw, vh, vw: data_pad[n][ci][
                (h * VH + vh) * HSTR + kh * dilation_h][
                    (w * VW + vw) * WSTR + kw * dilation_w],
            name="data_vec_undilated",
        )
    else:
        dvshape = (N, OH // VH, OW // VW, CI, VH * HSTR + KH - 1,
                   VW * WSTR + KW - 1)
        data_vec = te.compute(
            dvshape,
            lambda n, h, w, ci, vh, vw: data_pad[n][ci][h * VH * HSTR + vh][
                w * VW * WSTR + vw],
            name="data_vec",
        )

    if autotvm.GLOBAL_SCOPE.in_tuning:
        # use "kernel_autotvm" instead of "kernel" to avoid naming conflict with OpenCL keyword
        kernel_vec = tvm.te.placeholder(kvshape,
                                        kernel.dtype,
                                        name="kernel_autotvm")
    else:
        if pre_packed:
            kernel_vec = kernel
        else:
            kernel_vec = te.compute(
                kvshape,
                lambda co, ci, kh, kw, vc: kernel[co * VC + vc][ci][kh][kw],
                name="kernel_vec",
            )

    ci = te.reduce_axis((0, CI), name="ci")
    kh = te.reduce_axis((0, KH), name="kh")
    kw = te.reduce_axis((0, KW), name="kw")

    if dilation_h != 1 or dilation_w != 1:
        conv = te.compute(
            ovshape,
            lambda n, co, h, w, vh, vw, vc: te.sum(
                data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) *
                kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                axis=[ci, kh, kw],
            ),
            name="conv",
        )
    else:
        conv = te.compute(
            ovshape,
            lambda n, co, h, w, vh, vw, vc: te.sum(
                data_vec[n, h, w, ci, vh * HSTR + kh, vw * WSTR + kw].astype(
                    out_dtype) * kernel_vec[co, ci, kh, kw, vc].astype(
                        out_dtype),
                axis=[ci, kh, kw],
            ),
            name="conv",
        )

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    output = te.compute(
        oshape,
        lambda n, co, h, w: conv[n,
                                 idxdiv(co, VC),
                                 idxdiv(h, VH),
                                 idxdiv(w, VW),
                                 idxmod(h, VH),
                                 idxmod(w, VW),
                                 idxmod(co, VC), ],
        name="output_unpack",
        tag="spatial_conv2d_output",
    )
    return output
Example #9
def test_gemm():
    # graph
    nn = 2048
    n = te.var('n')
    n = tvm.runtime.convert(nn)
    m, l = n, n
    A = te.placeholder((l, n), name='A')
    B = te.placeholder((l, m), name='B')
    k = te.reduce_axis((0, l), name='k')
    C = te.compute((m, n),
                   lambda ii, jj: te.sum(A[k, jj] * B[k, ii], axis=k),
                   name='C')
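    # C[ii, jj] = sum_k B[k, ii] * A[k, jj], i.e. C = B^T @ A (here m == n == l == nn).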

    # schedule
    s = te.create_schedule(C.op)
    AA = s.cache_read(A, "shared", [C])
    BB = s.cache_read(B, "shared", [C])
    AL = s.cache_read(AA, "local", [C])
    BL = s.cache_read(BB, "local", [C])
    CC = s.cache_write(C, "local")

    scale = 8
    num_thread = 8
    block_factor = scale * num_thread
    block_x = te.thread_axis("blockIdx.x")
    thread_x = te.thread_axis((0, num_thread), "threadIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_y = te.thread_axis((0, num_thread), "threadIdx.y")
    thread_xz = te.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = te.thread_axis((0, 2), "vthread", name="vy")

    by, yi = s[C].split(C.op.axis[0], factor=block_factor)
    bx, xi = s[C].split(C.op.axis[1], factor=block_factor)
    s[C].bind(by, block_y)
    s[C].bind(bx, block_x)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_thread)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, thread_y)
    s[C].bind(tx, thread_x)
    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[CC].compute_at(s[C], tx)

    yo, xo = CC.op.axis
    ko, ki = s[CC].split(k, factor=8)
    kt, ki = s[CC].split(ki, factor=1)
    s[CC].reorder(ko, kt, ki, yo, xo)
    s[AA].compute_at(s[CC], ko)
    s[BB].compute_at(s[CC], ko)
    s[CC].unroll(kt)
    s[AL].compute_at(s[CC], kt)
    s[BL].compute_at(s[CC], kt)
    # Schedule for A's shared memory load
    ty, xi = s[AA].split(s[AA].op.axis[0], nparts=num_thread)
    _, xi = s[AA].split(s[AA].op.axis[1], factor=num_thread * 4)
    tx, xi = s[AA].split(xi, nparts=num_thread)
    s[AA].bind(ty, thread_y)
    s[AA].bind(tx, thread_x)
    s[AA].vectorize(xi)
    # Schedule for B's shared memory load
    ty, xi = s[BB].split(s[BB].op.axis[0], nparts=num_thread)
    _, xi = s[BB].split(s[BB].op.axis[1], factor=num_thread * 4)
    tx, xi = s[BB].split(xi, nparts=num_thread)
    s[BB].bind(ty, thread_y)
    s[BB].bind(tx, thread_x)
    s[BB].vectorize(xi)
    s[AA].double_buffer()
    s[BB].double_buffer()

    # correctness

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Device %s" % device)
        f = tvm.build(s, [A, B, C], device)
        # launch the kernel.
        n, m, l = nn, nn, nn
        a_np = np.random.uniform(size=(n, l)).astype(A.dtype)
        b_np = np.random.uniform(size=(m, l)).astype(B.dtype)
        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
        for i in range(2):
            f(a, b, c)
        tvm.testing.assert_allclose(c.asnumpy(),
                                    np.dot(b_np.T, a_np),
                                    rtol=1e-5)

        num_flops = 2 * nn * nn * nn
        num_runs = 10
        timer_f = f.time_evaluator(f.entry_name, ctx, number=num_runs)
        t = timer_f(a, b, c).mean
        GFLOPS = num_flops / (t * 1e3) / 1e6
        print("average time cost of %d runs = %g ms, %g GFLOPS." %
              (num_runs, t * 1e3, GFLOPS))

    for device in ["cuda", "opencl", "rocm", "nvptx", "vulkan"]:
        with tvm.transform.PassContext(
                config={
                    "tir.UnrollLoop": {
                        "auto_max_step": 128,
                        "explicit_unroll": device != "cuda"
                    }
                }):
            check_device(device)
Example #10
def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type):
    """
    Use integer ARM v8 instructions in order to produce a block c of 4x4 elements
    given two 4xK blocks a and b' (where b' is a Kx4 block transposed). The final
    result is c = a*b (where '*' indicates the matrix product)

    Every row of the matrix c is obtained (for uint8) by a sequence of

          umull -> uadalp -> umull2 -> uadalp

    The block size is constrained by the number of registers available in armv8. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Parameters
    ----------
    M: int
        rows of the matrix A
    N: int
        columns of the matrix B
    K: int
        columns of matrix A
    in_type: str, {'uint8', 'int8'}
    out_type: str, {'uint32', 'int32'}

    Returns
    -------
    intrin : TensorIntrin
        The ARM uint8/int8 TensorIntrin that can be used in tensorizing schedule
    """
    A = te.placeholder((K // 16, te.var("m"), 16), dtype=in_type, name='A')
    B = te.placeholder((K // 16, te.var("n"), 16), dtype=in_type, name='B')

    idxm = tvm.tir.indexmod

    k = te.reduce_axis((0, K), "k")

    C = te.compute(
        (te.var("m"), te.var("n")),
        lambda x, y: te.sum(
            A[k // 16, x, idxm(k, 16)].astype(out_type)
            * B[k // 16, y, idxm(k, 16)].astype(out_type),
            axis=k),
        name="C")

    a_buffer = tvm.tir.decl_buffer(A.shape,
                                   dtype=in_type,
                                   name="a_buffer",
                                   offset_factor=1,
                                   strides=[te.var('sa_1'),
                                            te.var('sa_2'), 1])

    b_buffer = tvm.tir.decl_buffer(B.shape,
                                   dtype=in_type,
                                   name="b_buffer",
                                   offset_factor=1,
                                   strides=[te.var('sb_1'),
                                            te.var('sb_2'), 1])

    c_buffer = tvm.tir.decl_buffer(C.shape,
                                   dtype=out_type,
                                   name="c_buffer",
                                   offset_factor=1,
                                   strides=[te.var('sc'), 1])

    def _intrin_func(ins, outs):
        def _instr():
            ib = tvm.tir.ir_builder.create()
            aa, bb = ins
            cc = outs[0]
            stepA = min(4, M)
            stepB = min(4, N)
            intrin_name = "gemm_quantized_{0}_{0}_int32_{1}_{2}".format(
                in_type, stepA, stepB)
            if unroll:
                intrin_name += ("_" + str(K))
            if interleave:
                intrin_name += "_interleaved"
            ib.emit(
                tvm.tir.call_extern("int32", intrin_name,
                                    outs[0].access_ptr("w"),
                                    a_buffer.access_ptr("r"),
                                    b_buffer.access_ptr("r"), K))
            return ib.get()

        # body, reset, update
        return _instr()

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(C.op,
                                 _intrin_func,
                                 binds={
                                     A: a_buffer,
                                     B: b_buffer,
                                     C: c_buffer
                                 },
                                 default_buffer_params=buffer_params)
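
In NumPy terms the intrinsic accumulates an (up to) 4x4 block as C = A @ B.T with 32-bit accumulation. An illustrative sketch with assumed sizes:

import numpy as np

K = 32
a = np.random.randint(0, 256, size=(4, K), dtype=np.uint8)
b = np.random.randint(0, 256, size=(4, K), dtype=np.uint8)
c = a.astype(np.uint32) @ b.astype(np.uint32).T  # c[x, y] = sum_k a[x, k] * b[y, k]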
Example #11
def dot_int8_int8_int32(int32_lanes, dtype='uint'):
    """
    Int8 dot product by every 4 elements using ARM v8.2 udot.
    This function takes two arrays of int8 datatype -- data[4] and
    kernel[int32_lanes][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[int32_lanes] of uint32 datatype.
    The pseudo code is as follows.

    .. code-block:: c

        void dot_int8_int8_int32(int8 data[4], int8 kernel[16][4], int32 output[16]){
            for (int i = 0; i < int32_lanes; i++){
                out[i] = 0;
                for (int k = 0; k < 4; k++){
                    out[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in a vector register and
    the data[4] is broadcasted to another vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Parameters
    ----------
    int32_lanes: int
        How many int32/uint32 to produce
    dtype: str, optional, {"uint", "int"}
        Whether it works on unsigned int or signed int

    Returns
    -------
    intrin : TensorIntrin
        The ARM uint8 TensorIntrin that can be used in tensorizing schedule
    """
    num_int8_elements = 4  # 4 int8 elements in int32

    data = te.placeholder((num_int8_elements, ),
                          dtype='%s8' % dtype,
                          name='data')
    kernel = te.placeholder((int32_lanes, num_int8_elements),
                            dtype='%s8' % dtype,
                            name='kernel')

    k = te.reduce_axis((0, num_int8_elements), name='k')
    C = te.compute(
        (int32_lanes,),
        lambda i: te.sum(
            data[k].astype('%s32' % dtype) * kernel[i, k].astype('%s32' % dtype),
            axis=k),
        name="C")

    a_buffer = tvm.tir.decl_buffer(data.shape,
                                   dtype='%s8' % dtype,
                                   name="a_buffer",
                                   offset_factor=1,
                                   strides=[1])
    b_buffer = tvm.tir.decl_buffer(kernel.shape,
                                   dtype='%s8' % dtype,
                                   name="b_buffer",
                                   offset_factor=1,
                                   strides=[te.var('s'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.tir.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(
                    0, tvm.tir.const(0, '%s32x%d' % (dtype, int32_lanes))))
                return ib.get()

            dtype_a = '%s8x%d' % (dtype, num_int8_elements)
            dtype_b = '%s8x%d' % (dtype, int32_lanes * num_int8_elements)
            dtype_c = '%s32x%d' % (dtype, int32_lanes)

            a_int8 = ins[0].vload([0], dtype_a)
            re_int32 = tvm.tir.call_intrin('%s32' % dtype, 'tir.reinterpret',
                                           a_int8)
            # broadcast a
            vec_ai32 = re_int32.astype(dtype_c)

            vec_a = tvm.tir.call_intrin(dtype_b, 'tir.reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], dtype_b)
            vec_c = outs[0].vload([0], dtype_c)

            inst = 'udot' if dtype == 'uint' else 'sdot'
            inst = 'llvm.aarch64.neon.%s.v%di32.v%di8' % (
                inst, int32_lanes, int32_lanes * num_int8_elements)
            vdot = tvm.tir.call_llvm_pure_intrin(dtype_c, inst,
                                                 tvm.tir.const(2, 'uint32'),
                                                 vec_c, vec_a, vec_b)
            ib.emit(outs[0].vstore(0, vdot))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(C.op,
                                 _intrin_func,
                                 binds={
                                     data: a_buffer,
                                     kernel: b_buffer
                                 },
                                 default_buffer_params=buffer_params)
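
The reference semantics in NumPy (illustrative sketch, uint8 case with int32_lanes = 4):

import numpy as np

int32_lanes = 4
data = np.random.randint(0, 256, size=(4,), dtype=np.uint8)
kernel = np.random.randint(0, 256, size=(int32_lanes, 4), dtype=np.uint8)
output = kernel.astype(np.uint32) @ data.astype(np.uint32)  # output[i] = sum_k kernel[i, k] * data[k]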
Example #12
def conv2d_NCHWc_int8(data,
                      kernel,
                      stride,
                      padding,
                      dilation,
                      layout,
                      out_layout,
                      out_dtype="int32",
                      n_elems=4):
    """Conv2D operator for nChw[x]c layout.

    Parameters
    ----------
    data : tvm.te.Tensor
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.te.Tensor
        7-D with shape
        [num_filter_chunk, in_channel_chunk, filter_height, filter_width, in_channel_block/4,
        num_filter_block, 4]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of 2 or 4 ints
        padding size, or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        Input data layout

    out_layout : str
        Output data layout

    out_dtype : str
        output data type

    n_elems : int
        number of int8 elements accumulated

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """

    # layout and out_layout are not used here,
    # we keep them for debug convenience when dumping autotvm workload
    HSTR, WSTR = stride if isinstance(stride,
                                      (tuple, list)) else (stride, stride)
    dilation_h, dilation_w = (dilation if isinstance(dilation,
                                                     (tuple, list)) else
                              (dilation, dilation))

    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn
    oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn, _ = get_const_tuple(
        kernel.shape)
    num_filter = oc_chunk * oc_bn
    groups = ic_chunk // ic_chunk_group

    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HPAD = pad_top + pad_down
    WPAD = pad_left + pad_right

    # output shape
    out_height = (ih + HPAD - dilated_kernel_h) // HSTR + 1
    out_width = (iw + WPAD - dilated_kernel_w) // WSTR + 1
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)
    pad_before = (0, 0, pad_top, pad_left, 0)
    pad_after = (0, 0, pad_down, pad_right, 0)

    # DOPAD
    DOPAD = HPAD != 0 or WPAD != 0
    if DOPAD:
        data_pad = pad(data, pad_before, pad_after, name="data_pad")
    else:
        data_pad = data

    ic = te.reduce_axis((0, in_channel), name="ic")
    kh = te.reduce_axis((0, kernel_height), name="kh")
    kw = te.reduce_axis((0, kernel_width), name="kw")

    if groups == 1:
        ic_outer = te.reduce_axis((0, in_channel // ic_bn), name="ic_outer")
        ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner")
        ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner")
        return te.compute(
            oshape,
            lambda n, oc_chunk, oh, ow, oc_block: te.sum(
                data_pad[
                    n,
                    ic_outer,
                    oh * HSTR + kh * dilation_h,
                    ow * WSTR + kw * dilation_w,
                    ic_f_inner * n_elems + ic_s_inner,
                ].astype(out_dtype)
                * kernel[oc_chunk, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(
                    out_dtype
                ),
                axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner],
            ),
            name="conv2d_NCHWc_int8",
            tag="conv2d_NCHWc_int8",
            attrs={"schedule_rule": "meta_schedule.conv2d_NCHWc_int8"},
        )
    # for int8 group conv support
    ic_chunk = in_channel // ic_bn
    ic_outer = te.reduce_axis((0, ic_chunk // groups), name="ic_outer")
    ic_f_inner = te.reduce_axis((0, ic_bn // n_elems), name="ic_f_inner")
    ic_s_inner = te.reduce_axis((0, n_elems), name="ic_s_inner")
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)
    return te.compute(
        oshape,
        lambda n, occ, oh, ow, oc_block: te.sum(
            data_pad[
                n,
                (occ * oc_bn // (oc_chunk * oc_bn // groups)) * (ic_chunk // groups) + ic_outer,
                oh * HSTR + kh,
                ow * WSTR + kw,
                ic_f_inner * n_elems + ic_s_inner,
            ].astype(out_dtype)
            * kernel[occ, ic_outer, kh, kw, ic_f_inner, oc_block, ic_s_inner].astype(out_dtype),
            axis=[kh, kw, ic_outer, ic_f_inner, ic_s_inner],
        ),
        name="conv2d_NCHWc_int8",
        tag="conv2d_NCHWc_int8",
        attrs={"schedule_rule": "meta_schedule.conv2d_NCHWc_int8"},
    )
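
One way to visualize the 7-D int8 kernel layout above is as a reshape/transpose of a plain OIHW kernel. A hedged NumPy sketch that produces a tensor of the matching shape (assumed block sizes; the exact packing order used in practice may differ):

import numpy as np

oc, ic, kh, kw, oc_bn, ic_bn, n_elems = 32, 16, 3, 3, 16, 8, 4
oihw = np.random.randint(-128, 128, size=(oc, ic, kh, kw), dtype=np.int8)
packed = oihw.reshape(oc // oc_bn, oc_bn, ic // ic_bn, ic_bn // n_elems, n_elems, kh, kw)
packed = packed.transpose(0, 2, 5, 6, 3, 1, 4)
print(packed.shape)  # (2, 2, 3, 3, 2, 16, 4)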
Example #13
def conv2d_NCHWc(data,
                 kernel,
                 stride,
                 padding,
                 dilation,
                 layout,
                 out_layout,
                 out_dtype="float32"):
    """Conv2D operator for nChw[x]c layout.

    Parameters
    ----------
    data : tvm.te.Tensor
        5-D with shape [batch, in_channel_chunk, in_height, in_width, in_channel_block]

    kernel : tvm.te.Tensor
        6-D with shape
        [num_filter_chunk, in_channel_chunk, filter_height, filter_width,
        in_channel_block, num_filter_block]

    stride : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]

    padding : int or a list/tuple of 2 or 4 ints
        padding size, or
        [pad_height, pad_width] for 2 ints, or
        [pad_top, pad_left, pad_bottom, pad_right] for 4 ints

    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]

    layout : str
        Input data layout

    out_layout : str
        Output data layout

    out_dtype : str
        output data type

    Returns
    -------
    output : tvm.te.Tensor
        5-D with shape [batch, out_channel_chunk, out_height, out_width, out_channel_block]
    """

    # layout and out_layout are not used here,
    # we keep them for debug convenience when dumping autotvm workload
    HSTR, WSTR = stride if isinstance(stride,
                                      (tuple, list)) else (stride, stride)
    dilation_h, dilation_w = (dilation if isinstance(dilation,
                                                     (tuple, list)) else
                              (dilation, dilation))

    n, ic_chunk, ih, iw, ic_bn = get_const_tuple(data.shape)
    in_channel = ic_chunk * ic_bn
    target = tvm.target.Target.current(allow_none=False)
    oc_chunk, ic_chunk_group, kernel_height, kernel_width, _, oc_bn = get_const_tuple(
        kernel.shape)
    num_filter = oc_chunk * oc_bn
    groups = ic_chunk // ic_chunk_group

    dilated_kernel_h = (kernel_height - 1) * dilation_h + 1
    dilated_kernel_w = (kernel_width - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HPAD = pad_top + pad_down
    WPAD = pad_left + pad_right

    # output shape
    out_height = (ih + HPAD - dilated_kernel_h) // HSTR + 1
    out_width = (iw + WPAD - dilated_kernel_w) // WSTR + 1
    oshape = (n, oc_chunk, out_height, out_width, oc_bn)
    pad_before = (0, 0, pad_top, pad_left, 0)
    pad_after = (0, 0, pad_down, pad_right, 0)

    # DOPAD
    DOPAD = HPAD != 0 or WPAD != 0
    if DOPAD:
        data_pad = pad(data, pad_before, pad_after, name="data_pad")
    else:
        data_pad = data

    ic = te.reduce_axis((0, in_channel), name="ic")
    kh = te.reduce_axis((0, kernel_height), name="kh")
    kw = te.reduce_axis((0, kernel_width), name="kw")

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    return te.compute(
        oshape,
        lambda n, oc_chunk, oh, ow, oc_block: te.sum(
            data_pad[
                n,
                idxdiv(ic, ic_bn),
                oh * HSTR + kh * dilation_h,
                ow * WSTR + kw * dilation_w,
                idxmod(ic, ic_bn),
            ].astype(out_dtype)
            * kernel[oc_chunk, idxdiv(ic, ic_bn), kh, kw, idxmod(ic, ic_bn), oc_block].astype(
                out_dtype
            ),
            axis=[ic, kh, kw],
        ),
        name="conv2d_NCHWc",
        tag="conv2d_NCHWc",
    )
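
The 5-D nChw[x]c input layout expected by these operators is a blocked view of plain NCHW data. A minimal NumPy sketch with an assumed ic_bn = 8:

import numpy as np

n, c, h, w, ic_bn = 1, 16, 14, 14, 8
nchw = np.random.rand(n, c, h, w).astype("float32")
nchwc = nchw.reshape(n, c // ic_bn, ic_bn, h, w).transpose(0, 1, 3, 4, 2)
print(nchwc.shape)  # (1, 2, 14, 14, 8)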
Example #14
def _conv2d_winograd_nhwc_impl(
    data,
    weight,
    strides,
    padding,
    dilation,
    out_dtype,
    tile_size,
    pre_computed=False,
    auto_scheduler_rewritten_layout="",
):
    """Conv2D Winograd implementation in NHWC layout.
    This is a clean version to be used by the auto-scheduler for both CPU and GPU.

    Parameters
    ----------
    data : tvm.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]
    weight : tvm.Tensor
        4-D with shape [filter_height, filter_width, in_channel, num_filter]
    strides : int or a list/tuple of two ints
        stride size, or [stride_height, stride_width]
    padding : int or a list/tuple of two ints
        padding size, or [pad_height, pad_width]
    dilation: int or a list/tuple of two ints
        dilation size, or [dilation_height, dilation_width]
    out_dtype : str, optional
        Specifies the output data type.
    tile_size : int
        The size of the tile to use for the Winograd filter
    pre_computed: bool = False
        Whether the kernel is precomputed
    auto_scheduler_rewritten_layout: str = ""
        The layout after auto-scheduler's layout rewrite pass.

    Returns
    -------
    output : tvm.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]
    """
    N, H, W, CI = get_const_tuple(data.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
    if not pre_computed:
        KH, KW, CI, CO = get_const_tuple(weight.shape)
    else:
        if auto_scheduler_rewritten_layout:
            H_CAT, W_CAT, CO, CI = get_const_tuple(
                auto_scheduler.get_shape_from_rewritten_layout(
                    auto_scheduler_rewritten_layout,
                    ["eps", "nu", "co", "ci"]))
            auto_scheduler.remove_index_check(weight)
        else:
            H_CAT, W_CAT, CO, CI = get_const_tuple(weight.shape)

        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1

    pad_t, pad_l, pad_b, pad_r = get_pad_tuple(padding, (KH, KW))
    HSTR, WSTR = (strides, strides) if isinstance(strides, int) else strides
    assert HSTR == 1 and WSTR == 1 and KH == 3 and KW == 3

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (H + pad_t + pad_b - KH) // HSTR + 1
    W = (W + pad_l + pad_r - KW) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    pad_extra = (nW - 1) * m + alpha - (H + pad_t + pad_b)
    data_pad = pad(
        data,
        (0, pad_t, pad_l, 0),
        (0, pad_b + pad_extra, pad_r + pad_extra, 0),
        name="data_pad",
        attrs={"schedule_rule": "None"},
    )

    if not pre_computed:
        r_kh = te.reduce_axis((0, KH), name="r_kh")
        r_kw = te.reduce_axis((0, KW), name="r_kw")
        kernel_pack = te.compute(
            (alpha, alpha, CO, CI),
            lambda eps, nu, co, ci: te.sum(weight[r_kh][r_kw][ci][co] * G[eps][
                r_kh] * G[nu][r_kw],
                                           axis=[r_kh, r_kw]),
            name="kernel_pack",
        )
        attrs = {}
    else:
        kernel_pack = weight
        attrs = {"layout_free_placeholders": [kernel_pack]}

    # pack data tile
    input_tile = te.compute(
        (alpha, alpha, P, CI),
        lambda eps, nu, p, ci: data_pad[p // (nH * nW)][(
            (p // nW) % nH) * m + eps][(p % nW) * m + nu][ci],
        name="input_tile",
        attrs={"schedule_rule": "None"},
    )

    # transform data
    target = tvm.target.Target.current(allow_none=True)
    if target is not None:
        target_kind = "meta_schedule.winograd_data_pack." + target.kind.name
    else:
        target_kind = "None"

    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    data_pack = te.compute(
        (alpha, alpha, P, CI),
        lambda eps, nu, p, ci: te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps]
                                      * B[r_b][nu],
                                      axis=[r_a, r_b]),
        name="data_pack",
        attrs={
            "auto_scheduler_simplify_const_tensor_indices":
            ["eps", "nu", "r_a", "r_b"],
            "schedule_rule":
            target_kind,
        },
        # the attrs are necessary hints for the auto-scheduler
    )

    # do batch gemm
    ci = te.reduce_axis((0, CI), name="ci")
    bgemm = te.compute(
        (alpha, alpha, P, CO),
        lambda eps, nu, p, co: te.sum(data_pack[eps][nu][p][ci] * kernel_pack[
            eps][nu][co][ci],
                                      axis=[ci]),
        name="bgemm",
        attrs=attrs,
    )

    if auto_scheduler_rewritten_layout:
        bgemm = auto_scheduler.rewrite_compute_body(
            bgemm, auto_scheduler_rewritten_layout)

    # inverse transform
    r_a = te.reduce_axis((0, alpha), "r_a")
    r_b = te.reduce_axis((0, alpha), "r_b")
    inverse = te.compute(
        (m, m, P, CO),
        lambda vh, vw, p, co: te.sum(
            bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name="inverse",
        attrs={
            "auto_scheduler_simplify_const_tensor_indices":
            ["vh", "vw", "r_a", "r_b"],
            "schedule_rule":
            "meta_schedule.winograd_inverse",
        },
        # the attrs are necessary hints for the auto-scheduler
    )

    # output
    output = te.compute(
        (N, H, W, CO),
        lambda n, h, w, co: inverse[h % m, w % m, n * nH * nW + (h // m) * nW +
                                    (w // m), co],
        name="conv2d_winograd",
    )

    return output
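As a quick check of the tiling arithmetic above (a minimal sketch with assumed sizes, not part of the original example): with m = tile_size and r = 3, each alpha x alpha transformed tile produces an m x m output tile, and P = N * nH * nW tiles cover the whole output.

N, H, W = 1, 56, 56       # assumed output height/width (stride 1)
m, r = 4, 3               # tile_size and the 3x3 kernel asserted above
alpha = m + r - 1         # transformed tile size: 6
nH, nW = (H + m - 1) // m, (W + m - 1) // m   # tiles per spatial dimension
P = N * nH * nW           # total number of tiles fed to the batched GEMM
print(alpha, nH, nW, P)   # 6 14 14 196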
Example #15
0
File: dense.py Project: zjppoet/tvm
def dense_pack(cfg, data, weight, bias=None, out_dtype=None):
    """Compute dense with transformed weight."""
    if out_dtype is None:
        out_dtype = data.dtype
    M, K = get_const_tuple(data.shape)  # batch, in_dim
    if len(weight.shape) == 3:
        N, _, packw_bn = get_const_tuple(weight.shape)  # out_dim
        N = N * packw_bn
    else:
        N, _ = get_const_tuple(weight.shape)  # out_dim
    # create tuning space
    cfg.define_split("tile_y",
                     32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M,
                     num_outputs=3)
    cfg.define_split("tile_x",
                     32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N,
                     num_outputs=3)
    cfg.define_split("tile_k",
                     32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K,
                     num_outputs=2)
    cfg.define_split(
        "tile_inner",
        32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M,
        num_outputs=2,
        filter=lambda y: y.size[-1] <= 16,
    )
    if cfg.is_fallback:
        _default_dense_pack_config(cfg, M, N, K)

    if len(weight.shape) == 2:
        packw_bn = cfg["tile_x"].size[-1]
        packw_shape = (N // packw_bn, K, packw_bn)
        if autotvm.GLOBAL_SCOPE.in_tuning:
            # Directly use modified data layout placeholder.
            packw = tvm.te.placeholder(packw_shape,
                                       weight.dtype,
                                       name="packed_weight")
        else:
            packw = te.compute(packw_shape,
                               lambda z, y, x: weight[z * packw_bn + x, y],
                               name="packed_weight")
    else:
        packw = weight

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod
    k = te.reduce_axis((0, K), name="k")
    C = te.compute(
        (M, N),
        lambda y, x: te.sum(
            data[y, k].astype(out_dtype) * packw[idxdiv(
                x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype),
            axis=k,
        ),
        tag="dense_pack",
    )
    if bias is not None:
        C = te.compute((M, N),
                       lambda i, j: C[i, j] + bias[j].astype(out_dtype),
                       tag=tag.BROADCAST)
    return C
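The packed-weight layout used above can be reproduced with a small NumPy sketch (sizes assumed for illustration): packw[z, y, x] holds weight[z * packw_bn + x, y], i.e. the output dimension is split into blocks of packw_bn and the reduction dimension moves to the middle.

import numpy as np

N, K, packw_bn = 8, 4, 2                       # assumed small sizes
weight = np.arange(N * K).reshape(N, K)        # original (N, K) weight
packw = weight.reshape(N // packw_bn, packw_bn, K).transpose(0, 2, 1)  # (N//bn, K, bn)
assert packw[1, 3, 1] == weight[1 * packw_bn + 1, 3]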
Example #16
0
import tvm
from tvm import te

n = 1024
dtype = "float32"
A = te.placeholder((n, n), dtype=dtype, name='A')
k = te.reduce_axis((0, n), name='k')
B = te.compute((n, ), lambda i: te.sum(A[i, k], axis=k), name='B')

s = te.create_schedule(B.op)

print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

BW = s.cache_write(B, "local")

print(tvm.lower(s, [A, B], simple_mode=True))
Example #17
0
    def _run(env, remote):
        # declare
        o = 4
        n = 1
        m = 4
        x = te.placeholder((o, n, env.BATCH, env.BLOCK_IN),
                           name="x",
                           dtype=env.inp_dtype)
        w = te.placeholder((m, n, env.BLOCK_OUT, env.BLOCK_IN),
                           name="w",
                           dtype=env.wgt_dtype)
        x_buf = te.compute((o, n, env.BATCH, env.BLOCK_IN), lambda *i: x(*i),
                           "x_buf")
        w_buf = te.compute((m, n, env.BLOCK_OUT, env.BLOCK_IN),
                           lambda *i: w(*i), "w_buf")
        ko = te.reduce_axis((0, n), name="ko")
        ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")
        y_gem = te.compute(
            (o, m, env.BATCH, env.BLOCK_OUT),
            lambda bo, co, bi, ci: te.sum(x_buf[bo, ko, bi, ki].astype(
                env.acc_dtype) * w_buf[co, ko, ci, ki].astype(env.acc_dtype),
                                          axis=[ko, ki]),
            name="y_gem")
        y_shf = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                           lambda *i: y_gem(*i) >> 8,
                           name="y_shf")
        y_max = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                           lambda *i: tvm.te.max(y_shf(*i), 0), "y_max")  # relu
        y_min = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                           lambda *i: tvm.te.min(y_max(*i),
                                                 (1 <<
                                                  (env.INP_WIDTH - 1)) - 1),
                           "y_min")  # clip to the int8 activation maximum
        y = te.compute((o, m, env.BATCH, env.BLOCK_OUT),
                       lambda *i: y_min(*i).astype(env.inp_dtype),
                       name="y")

        if not remote:
            return

        def verify(s, name=None):
            mod = vta.build(s, [x, w, y], "ext_dev", env.target_host)
            temp = util.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            ctx = remote.ext_dev(0)
            x_np = np.random.randint(-128,
                                     128,
                                     size=(o, n, env.BATCH,
                                           env.BLOCK_IN)).astype(x.dtype)
            w_np = np.random.randint(-128,
                                     128,
                                     size=(m, n, env.BLOCK_OUT,
                                           env.BLOCK_IN)).astype(w.dtype)
            y_np = np.zeros((o, m, env.BATCH, env.BLOCK_OUT)).astype(y.dtype)
            x_nd = tvm.nd.array(x_np, ctx)
            w_nd = tvm.nd.array(w_np, ctx)
            y_nd = tvm.nd.array(y_np, ctx)
            y_np = y_np.astype(env.acc_dtype)
            for b in range(o):
                for i in range(m):
                    for j in range(n):
                        y_np[b, i, :] += np.dot(
                            x_np[b, j, :].astype(env.acc_dtype),
                            w_np[i, j].T.astype(env.acc_dtype))
            y_np = np.right_shift(y_np, 8)
            y_np = np.clip(y_np, 0, (1 <<
                                     (env.INP_WIDTH - 1)) - 1).astype(y.dtype)

            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()

            f(x_nd, w_nd, y_nd)

            np.testing.assert_equal(y_np, y_nd.asnumpy())

            if env.TARGET in ["sim", "tsim"]:
                sim_stats = simulator.stats()
                print("GEMM schedule:{} execution statistics:".format(name))
                for k, v in sim_stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))

        def test_schedule1():
            # default schedule with no smt
            s = te.create_schedule(y.op)
            # set the scope of the SRAM buffers
            s[x_buf].set_scope(env.inp_scope)
            s[w_buf].set_scope(env.wgt_scope)
            s[y_gem].set_scope(env.acc_scope)
            s[y_shf].set_scope(env.acc_scope)
            s[y_max].set_scope(env.acc_scope)
            s[y_min].set_scope(env.acc_scope)
            # set pragmas for DMA transfer and ALU ops
            s[x_buf].compute_at(s[y_gem], ko)
            s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
            s[w_buf].compute_at(s[y_gem], ko)
            s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
            s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
            s[y_max].pragma(s[y_max].op.axis[0], env.alu)
            s[y_min].pragma(s[y_min].op.axis[0], env.alu)
            s[y].pragma(s[y].op.axis[0], env.dma_copy)
            # tensorization
            s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1],
                             s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki)
            s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
            verify(s, name="default")

        def test_smt():
            # test smt schedule
            s = te.create_schedule(y.op)
            s[x_buf].set_scope(env.inp_scope)
            s[w_buf].set_scope(env.wgt_scope)
            s[y_gem].set_scope(env.acc_scope)
            s[y_shf].set_scope(env.acc_scope)
            s[y_max].set_scope(env.acc_scope)
            s[y_min].set_scope(env.acc_scope)
            abo, aco, abi, aci = s[y].op.axis
            abo1, abo2 = s[y].split(abo, nparts=2)
            s[y].bind(abo1, te.thread_axis("cthread"))
            s[y_gem].compute_at(s[y], abo1)
            s[y_shf].compute_at(s[y], abo1)
            s[y_max].compute_at(s[y], abo1)
            s[y_min].compute_at(s[y], abo1)
            s[y_gem].reorder(ko, s[y_gem].op.axis[0], s[y_gem].op.axis[1],
                             s[y_gem].op.axis[2], s[y_gem].op.axis[3], ki)
            s[y_gem].tensorize(s[y_gem].op.axis[2], env.gemm)
            s[y_shf].pragma(s[y_shf].op.axis[0], env.alu)
            s[y_max].pragma(s[y_max].op.axis[0], env.alu)
            s[y_min].pragma(s[y_min].op.axis[0], env.alu)
            s[x_buf].compute_at(s[y_gem], ko)
            s[x_buf].pragma(s[x_buf].op.axis[0], env.dma_copy)
            s[w_buf].compute_at(s[y_gem], ko)
            s[w_buf].pragma(s[w_buf].op.axis[0], env.dma_copy)
            s[y].pragma(abo2, env.dma_copy)
            verify(s, name="smt")

        test_schedule1()
        test_smt()
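The y_shf/y_max/y_min stages above form a simple shift-and-clip requantization; a minimal NumPy sketch of that arithmetic, assuming INP_WIDTH is 8, is:

import numpy as np

acc = np.array([-5000, 300, 70000], dtype=np.int32)       # example int32 accumulator values
out = np.clip(acc >> 8, 0, (1 << 7) - 1).astype(np.int8)  # shift right by 8, clip to [0, 127]
print(out)  # [  0   1 127]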
Example #18
0
def dot_16x1x16_uint8_int8_int16():
    """
    Int8 dot product by every 2 elements using AVX512 Skylake instructions.
    This function takes two arrays of uint8 and int8 datatype -- data[2] and
    kernel[4][32][2] -- and computes a dot product of data[2] with every
    2 elements of kernels, resulting in output[4][32] of int16 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_uint8_int8_int16(uint8 data[2], int8 kernel[4][32][2],
                int16 output[4][32]){
            for (int i = 0; i < 4; i++){
                for (int j = 0; j < 32; j++){
                    output[i][j] = 0;
                    for (int k = 0; k < 2; k++){
                        output[i][j] += data[k] * kernel[i][j][k]
                    }
                }
            }
        }

    Physically, the kernel array sits in four AVX512 vector registers and
    the data[2] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int16_lanes = 4 * 32  # 4*32 int16 lanes in 4 AVX512 vector registers
    num_int8_elements = 2  # 2 int8 elements in int16
    data = te.placeholder((num_int8_elements, ), dtype="uint8", name="data")
    kernel = te.placeholder((int16_lanes, num_int8_elements),
                            dtype="int8",
                            name="kernel")
    k = te.reduce_axis((0, num_int8_elements), name="k")
    C = te.compute(
        (int16_lanes, ),
        lambda i: te.sum(
            data[k].astype("int16") * kernel[i, k].astype("int16"), axis=k),
        name="C",
    )

    a_buffer = tvm.tir.decl_buffer(data.shape,
                                   dtype="uint8",
                                   name="a_buffer",
                                   offset_factor=1,
                                   strides=[1])
    b_buffer = tvm.tir.decl_buffer(kernel.shape,
                                   dtype="int8",
                                   name="b_buffer",
                                   offset_factor=1)

    # strides=[te.var('ldw'), 1, 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.tir.ir_builder.create()
            if index == 1:
                for i in range(4):
                    ib.emit(outs[0].vstore([i * 32],
                                           tvm.tir.const(0, "int16x32")))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x2")
            re_int16 = tvm.tir.call_intrin("int16", "tir.reinterpret", a_int8)
            vec_ai16 = re_int16.astype("int16x32")
            vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret", vec_ai16)

            for i in range(4):
                vec_b = ins[1].vload([i * 32, 0], "int8x64")
                pair_reduction = tvm.tir.call_llvm_pure_intrin(
                    "int16x32",
                    "llvm.x86.avx512.pmaddubs.w.512",
                    tvm.tir.const(0, "uint32"),
                    vec_a,
                    vec_b,
                )
                if index == 0:
                    ib.emit(outs[0].vstore([i * 32], pair_reduction))
                else:
                    ib.emit(outs[0].vstore(
                        [i * 32],
                        pair_reduction + outs[0].vload([i * 32], "int16x32")))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(
        C.op,
        _intrin_func,
        binds={
            data: a_buffer,
            kernel: b_buffer
        },
        default_buffer_params=buffer_params,
    )
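For reference, the computation this intrinsic tensorizes can be written directly in NumPy (random inputs assumed). Note that pmaddubs.w saturates its int16 result while plain NumPy does not, so a wider accumulator is used here.

import numpy as np

lanes, k_elems = 4 * 32, 2
data = np.random.randint(0, 256, size=(k_elems,), dtype=np.uint8)
kernel = np.random.randint(-128, 128, size=(lanes, k_elems), dtype=np.int8)
# C[i] = sum_k data[k] * kernel[i, k]
ref = (data.astype(np.int32)[None, :] * kernel.astype(np.int32)).sum(axis=1)
print(ref.shape)  # (128,)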
Example #19
0
Apad = te.compute(
    (in_size + 2 * pad, in_size + 2 * pad, in_channel, batch),
    lambda yy, xx, cc, nn: tvm.tir.if_then_else(
        tvm.tir.all(yy >= pad, yy - pad < in_size, xx >= pad, xx - pad <
                    in_size), A[yy - pad, xx - pad, cc, nn],
        tvm.tir.const(0., "float32")),
    name='Apad')
# Create reduction variables
rc = te.reduce_axis((0, in_channel), name='rc')
ry = te.reduce_axis((0, kernel), name='ry')
rx = te.reduce_axis((0, kernel), name='rx')
# Compute the convolution
B = te.compute(
    (out_size, out_size, out_channel, batch),
    lambda yy, xx, ff, nn: te.sum(Apad[yy * stride + ry, xx * stride + rx, rc,
                                       nn] * W[ry, rx, rc, ff],
                                  axis=[ry, rx, rc]),
    name='B')

###############################################################################
# Memory Hierarchy
# ----------------
#
# We first specify the memory hierarchy for the buffers. The figure below shows
# the GPU memory hierarchy. One important difference from the CPU memory
# hierarchy is that the GPU provides a cache buffer called shared memory, which
# is managed by the programmer. Maximizing data reuse in shared memory is
# therefore critical for achieving high performance in GPU kernels.
#
# .. image:: https://github.com/dmlc/web-data/raw/master/tvm/tutorial/gpu_memory_hierarchy.png
#      :align: center
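To make the shared-memory idea concrete, here is a small self-contained sketch of the same staging pattern applied to a plain matmul (the sizes and buffer names are ours, not from the tutorial excerpt above): inputs are first cached in shared memory, then in per-thread local scope, and the result is accumulated locally before being written back.

import tvm
from tvm import te

n = 64
A = te.placeholder((n, n), name="A")
W = te.placeholder((n, n), name="W")
k = te.reduce_axis((0, n), name="k")
B = te.compute((n, n), lambda i, j: te.sum(A[i, k] * W[k, j], axis=k), name="B")

s = te.create_schedule(B.op)
AA = s.cache_read(A, "shared", [B])    # programmer-managed shared-memory copy of A
WW = s.cache_read(W, "shared", [B])    # and of W
AL = s.cache_read(AA, "local", [B])    # staged further into per-thread local scope
WL = s.cache_read(WW, "local", [B])
BL = s.cache_write(B, "local")         # accumulate locally, then copy out to B
print(tvm.lower(s, [A, W, B], simple_mode=True))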
Example #20
0
def dot_16x1x16_uint8_int8_int32_cascadelake():
    """
    Int8 dot product by every 4 elements using AVX512VNNI Cascade Lake instructions.
    This function takes two arrays of uint8 and int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_uint8_int8_int32_cascadelake(uint8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Cascade Lake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = 16  # 16 int32 lanes in AVX512
    num_int8_elements = 4  # 4 int8 elements in int32
    data = te.placeholder((num_int8_elements, ), dtype="uint8", name="data")
    kernel = te.placeholder((int32_lanes, num_int8_elements),
                            dtype="int8",
                            name="kernel")
    k = te.reduce_axis((0, num_int8_elements), name="k")
    C = te.compute(
        (int32_lanes, ),
        lambda i: te.sum(
            data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
        name="C",
    )

    a_buffer = tvm.tir.decl_buffer(data.shape,
                                   dtype="uint8",
                                   name="a_buffer",
                                   offset_factor=1,
                                   strides=[1])
    b_buffer = tvm.tir.decl_buffer(kernel.shape,
                                   dtype="int8",
                                   name="b_buffer",
                                   offset_factor=1,
                                   strides=[te.var("ldw"), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.tir.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.tir.const(0, "int32x16")))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
            vec_ai32 = re_int32.astype("int32x16")
            vec_b = ins[1].vload([0, 0], "int8x64")

            vnni_inst_name = "llvm.x86.avx512.vpdpbusd.512"
            llvm_id = tvm.target.codegen.llvm_lookup_intrinsic_id(
                vnni_inst_name)

            if llvm_id != 0:  # VNNI is available for current LLVM version
                vec_bi32 = tvm.tir.call_intrin("int32x16", "tir.reinterpret",
                                               vec_b)
                vec_zero = tvm.tir.const(0, "int32x16")
                quad_reduction = tvm.tir.call_llvm_pure_intrin(
                    "int32x16",
                    "llvm.x86.avx512.vpdpbusd.512",
                    tvm.tir.const(0, "uint32"),
                    vec_zero,
                    vec_ai32,
                    vec_bi32,
                )
            else:  # Fall back to the normal AVX512
                vec_a = tvm.tir.call_intrin("int8x64", "tir.reinterpret",
                                            vec_ai32)
                vec_one = tvm.tir.const(1, "int16x32")
                pair_reduction = tvm.tir.call_llvm_pure_intrin(
                    "int16x32",
                    "llvm.x86.avx512.pmaddubs.w.512",
                    tvm.tir.const(0, "uint32"),
                    vec_a,
                    vec_b,
                )
                quad_reduction = tvm.tir.call_llvm_pure_intrin(
                    "int32x16",
                    "llvm.x86.avx512.pmaddw.d.512",
                    tvm.tir.const(0, "uint32"),
                    pair_reduction,
                    vec_one,
                )

            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(
                    0, quad_reduction + outs[0].vload([0], "int32x16")))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(
        C.op,
        _intrin_func,
        binds={
            data: a_buffer,
            kernel: b_buffer
        },
        default_buffer_params=buffer_params,
    )
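A sketch of how such an intrinsic is typically applied (assuming the function above is in scope; the GEMM sizes and axis names below are ours): split the output and reduction axes so their inner extents match the 16x4 shape of the intrinsic, then tensorize the inner output axis.

import tvm
from tvm import te

M, N, K = 64, 64, 64
data = te.placeholder((M, K), dtype="uint8", name="data")
weight = te.placeholder((N, K), dtype="int8", name="weight")
k = te.reduce_axis((0, K), name="k")
C = te.compute((M, N),
               lambda i, j: te.sum(data[i, k].astype("int32") *
                                   weight[j, k].astype("int32"), axis=k),
               name="C")

s = te.create_schedule(C.op)
i, j = s[C].op.axis
jo, ji = s[C].split(j, factor=16)   # 16 int32 output lanes per intrinsic call
ko, ki = s[C].split(k, factor=4)    # 4 int8 elements reduced into each int32 lane
s[C].reorder(i, jo, ko, ji, ki)
s[C].tensorize(ji, dot_16x1x16_uint8_int8_int32_cascadelake())
print(tvm.lower(s, [data, weight, C], simple_mode=True))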
Example #21
0
from __future__ import absolute_import, print_function

import tvm
from tvm import te
import numpy as np

N, M, L = 1024, 512, 64
A = te.placeholder((N, L), name='A')
B = te.placeholder((M, L), name='B')
k = te.reduce_axis((0, L), name='k')
C = te.compute((N, M),
               lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
               name="C")
s = te.create_schedule(C.op)
print(tvm.lower(s, [A, B, C], simple_mode=True))

factor = 16
x, y = C.op.axis
(z, ) = C.op.reduce_axis
yo, yi = s[C].split(y, factor=factor)
s[C].reorder(x, yo, yi, z)
print(tvm.lower(s, [A, B, C], simple_mode=True))


def intrin_gemv(m, l):
    a = te.placeholder((l, ), name="a")
    b = te.placeholder((m, l), name="b")
    k = te.reduce_axis((0, l), name="k")
    c = te.compute((m, ), lambda i: te.sum(a[k] * b[i, k], axis=k), name="c")
    Ab = tvm.tir.decl_buffer(a.shape,
                             a.dtype,
Example #22
0
def dot_16x1x16_uint8_int8_int32_skylake():
    """
    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
    This function takes two arrays of uint8 and int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = get_simd_32bit_lanes()
    num_int8_elements = 4  # 4 int8 elements in int32
    data = te.placeholder((num_int8_elements, ), dtype="uint8", name="data")
    kernel = te.placeholder((int32_lanes, num_int8_elements),
                            dtype="int8",
                            name="kernel")
    k = te.reduce_axis((0, num_int8_elements), name="k")
    C = te.compute(
        (int32_lanes, ),
        lambda i: te.sum(
            data[k].astype("int32") * kernel[i, k].astype("int32"), axis=k),
        name="C",
    )

    a_buffer = tvm.tir.decl_buffer(data.shape,
                                   dtype="uint8",
                                   name="a_buffer",
                                   offset_factor=1,
                                   strides=[1])
    b_buffer = tvm.tir.decl_buffer(kernel.shape,
                                   dtype="int8",
                                   name="b_buffer",
                                   offset_factor=1,
                                   strides=[te.var("ldw"), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            # int_lx32 - intermediate dtype after pmaddubs: 16-bit x (2 * int32_lanes)
            # int_8xl  - input dtype to pmaddubs:            8-bit x (4 * int32_lanes)
            # int_32xl - output dtype after pmaddw:          32-bit x int32_lanes

            if int32_lanes == 4:
                int_lx32 = "int16x8"
                int_8xl = "int8x16"
                int_32xl = "int32x4"
                pmaddubs = "llvm.x86.ssse3.pmadd.ub.sw.128"
                pmaddw = "llvm.x86.sse2.pmadd.wd"
            elif int32_lanes == 8:
                int_lx32 = "int16x16"
                int_8xl = "int8x32"
                int_32xl = "int32x8"
                pmaddubs = "llvm.x86.avx2.pmadd.ub.sw"
                pmaddw = "llvm.x86.avx2.pmadd.wd"
            elif int32_lanes == 16:
                int_lx32 = "int16x32"
                int_8xl = "int8x64"
                int_32xl = "int32x16"
                pmaddubs = "llvm.x86.avx512.pmaddubs.w.512"
                pmaddw = "llvm.x86.avx512.pmaddw.d.512"

            ib = tvm.tir.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.tir.const(0, int_32xl)))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.tir.call_intrin("int32", "tir.reinterpret", a_int8)
            vec_ai32 = re_int32.astype(int_32xl)
            vec_a = tvm.tir.call_intrin(int_8xl, "tir.reinterpret", vec_ai32)
            vec_b = ins[1].vload([0, 0], int_8xl)
            vec_one = tvm.tir.const(1, int_lx32)
            pair_reduction = tvm.tir.call_llvm_pure_intrin(
                int_lx32,
                pmaddubs,
                tvm.tir.const(0, "uint32"),
                vec_a,
                vec_b,
            )
            quad_reduction = tvm.tir.call_llvm_pure_intrin(
                int_32xl,
                pmaddw,
                tvm.tir.const(0, "uint32"),
                pair_reduction,
                vec_one,
            )
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(
                    0, quad_reduction + outs[0].vload([0], int_32xl)))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(
        C.op,
        _intrin_func,
        binds={
            data: a_buffer,
            kernel: b_buffer
        },
        default_buffer_params=buffer_params,
    )
Example #23
0
def conv2d_spatial_pack_nchw(cfg, data, kernel, strides, padding, dilation,
                             out_dtype, num_tile):
    """compute define for Conv2d Spatial Pack with NCHW layout"""
    out_dtype = out_dtype or data.dtype
    N, CI, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        pre_packed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        CO, _, KH, KW, VC = get_const_tuple(kernel.shape)
        CO = CO * VC

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides, (tuple, list)) else (strides, strides)
    OH = (IH + pad_top + pad_bottom - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    data_pad = nn.pad(data, [0, 0, pad_top, pad_left], [0, 0, pad_bottom, pad_right])

    # ==================== define configuration space ====================
    n, co, oh, ow = cfg.axis(N), cfg.axis(CO), cfg.axis(OH), cfg.axis(OW)
    ci, kh, kw = cfg.reduce_axis(CI), cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    if num_tile == 2:     # for arm cpu
        co, vc = cfg.define_split('tile_co', co, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    elif num_tile == 3:   # for mali gpu
        co, _, vc = cfg.define_split('tile_co', co, num_outputs=3)
        oh, _, vh = cfg.define_split('tile_oh', oh, num_outputs=3)
        ow, _, vw = cfg.define_split('tile_ow', ow, num_outputs=3)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0",
                       [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                       policy='candidate', candidate=[
                           [n, co, oh, ow, ci, kh, kw, vh, vw, vc],
                           [n, co, oh, ow, ci, kh, kw, vc, vh, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')

    # fallback support
    if cfg.is_fallback:
        if num_tile == 2:     # arm cpu
            ref_log = autotvm.tophub.load_reference_log(
                'arm_cpu', 'rk3399', 'conv2d_nchw_spatial_pack.arm_cpu')
            cfg.fallback_with_reference_log(ref_log)
        elif num_tile == 3:  # mali gpu
            ref_log = autotvm.tophub.load_reference_log(
                'mali', 'rk3399', 'conv2d_nchw_spatial_pack.mali')
            cfg.fallback_with_reference_log(ref_log)
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    kvshape = (CO // VC, CI, KH, KW, VC)
    ovshape = (N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, CO, OH, OW)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OH // VH, OW // VW, CI, KH, KW, VH, VW)
        data_vec = te.compute(dvshape, lambda n, h, w, ci, kh, kw, vh, vw:
                              data_pad[n][ci][(h*VH+vh)*HSTR+kh*dilation_h]
                              [(w*VW+vw)*WSTR+kw*dilation_w],
                              name='data_vec_undilated')
    else:
        dvshape = (N, OH // VH, OW // VW, CI, VH*HSTR + KH-1, VW*WSTR + KW-1)
        data_vec = te.compute(dvshape, lambda n, h, w, ci, vh, vw:
                              data_pad[n][ci][h*VH*HSTR+vh][w*VW*WSTR+vw],
                              name='data_vec')

    if pre_packed:
        kernel_vec = kernel
    else:
        kernel_vec = te.compute(kvshape, lambda co, ci, kh, kw, vc:
                                kernel[co*VC+vc][ci][kh][kw],
                                name='kernel_vec')

    ci = te.reduce_axis((0, CI), name='ci')
    kh = te.reduce_axis((0, KH), name='kh')
    kw = te.reduce_axis((0, KW), name='kw')

    if dilation_h != 1 or dilation_w != 1:
        conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
                          te.sum(data_vec[n, h, w, ci, kh, kw, vh, vw].astype(out_dtype) *
                                 kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                                 axis=[ci, kh, kw]), name='conv')
    else:
        conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
                          te.sum(data_vec[n, h, w, ci, vh*HSTR+kh, vw*WSTR+kw].astype(out_dtype) *
                                 kernel_vec[co, ci, kh, kw, vc].astype(out_dtype),
                                 axis=[ci, kh, kw]), name='conv')

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    output = te.compute(oshape, lambda n, co, h, w:
                        conv[n,
                             idxdiv(co, VC), idxdiv(h, VH), idxdiv(w, VW),
                             idxmod(h, VH), idxmod(w, VW), idxmod(co, VC)],
                        name='output_unpack', tag='spatial_conv2d_output')
    return output
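The final unpack step above only rearranges indices. A small NumPy sketch (with assumed sizes) makes the layout relation explicit: output[n, co, h, w] equals conv[n, co // VC, h // VH, w // VW, h % VH, w % VW, co % VC].

import numpy as np

N, CO, OH, OW = 1, 8, 4, 4
VC, VH, VW = 4, 2, 2
conv = np.arange(N * CO * OH * OW, dtype=np.float32).reshape(
    N, CO // VC, OH // VH, OW // VW, VH, VW, VC)
output = conv.transpose(0, 1, 6, 2, 4, 3, 5).reshape(N, CO, OH, OW)
n, co, h, w = 0, 5, 3, 2
assert output[n, co, h, w] == conv[n, co // VC, h // VH, w // VW,
                                   h % VH, w % VW, co % VC]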
Example #24
0
def _decl_winograd(cfg, data, kernel, strides, padding, dilation, out_dtype,
                   tile_size):
    N, CI, IH, IW = get_const_tuple(data.shape)
    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        if dilation_h != 1 or dilation_w != 1:
            kernel = nn.dilate(kernel, (1, 1, dilation_h, dilation_w))
        pre_computed = False
        CO, _, KH, KW = get_const_tuple(kernel.shape)
    else:
        assert (dilation_h, dilation_w) == (1, 1), "Does not support dilation"
        pre_computed = True
        H_CAT, W_CAT, CO, CI, VC = get_const_tuple(kernel.shape)
        CO *= VC
        KH, KW = H_CAT - tile_size + 1, W_CAT - tile_size + 1
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    pt, pl, pb, pr = nn.get_pad_tuple(padding, (KH, KW))

    assert KH == 3 and KW == 3 and HSTR == 1 and WSTR == 1
    data_pad = nn.pad(data, (0, 0, pt, pl), (0, 0, pb, pr), name="data_pad")

    r = KW
    m = tile_size
    alpha = m + r - 1
    A, B, G = winograd_transform_matrices(m, r, out_dtype)

    H = (IH + pt + pb - 3) // HSTR + 1
    W = (IW + pl + pr - 3) // WSTR + 1
    nH, nW = (H + m - 1) // m, (W + m - 1) // m
    P = N * nH * nW

    ##### space definition begin #####
    tile_bna_candidates = [1, 2, 4, 8, 16]
    factors = get_factors(CO)
    cfg.define_knob('tile_bna',
                    [x for x in tile_bna_candidates if x in factors])
    cfg.define_knob('tile_bnb', [1, 2, 4, 8, 16])
    cfg.define_split('tile_t1', CI, num_outputs=2, max_factor=128)
    cfg.define_split('tile_t2', CO, num_outputs=2, max_factor=128)
    cfg.define_split('c_unroll', CI, num_outputs=2, max_factor=8)
    cfg.define_knob('yt', [1, 2, 4, 8, 16, 32])
    ##### space definition end #####

    if cfg.is_fallback:
        cfg['tile_bnb'].val = 4
        cfg['tile_bna'].val = 4
        while CO % cfg['tile_bna'].val != 0:
            cfg['tile_bna'].val //= 2
        cfg['yt'].val = 8
        cfg.fallback_split('tile_t1', [-1, 128])
        cfg.fallback_split('tile_t2', [-1, 128])
        cfg.fallback_split('c_unroll', [-1, 8])

    bna = cfg['tile_bna'].val
    bnb = cfg['tile_bnb'].val

    P_round = (P + bnb - 1) // bnb * bnb
    assert CO % bna == 0 and P_round % bnb == 0

    # pack input tile
    input_tile = te.compute(
        (CI, P_round // bnb, alpha, alpha, bnb), lambda ci, b, eps, nu, bb: \
        tvm.tir.if_then_else(
            b * bnb + bb < P,
            data_pad[(b*bnb+bb) // (nH*nW)][ci][(b*bnb+bb) // nW % nH * m + eps]
            [(b*bnb+bb) % nW * m + nu], tvm.tir.const(0, data_pad.dtype)), name='d')

    if autotvm.GLOBAL_SCOPE.in_tuning:
        VC = cfg['tile_k'].size[-1]
        kvshape = (KH + tile_size - 1, KW + tile_size - 1,
                   tvm.tir.indexdiv(CO, VC), CI, VC)
        U = tvm.te.placeholder(kvshape, kernel.dtype, name="U")
    else:
        # transform kernel
        if pre_computed:
            U = kernel
        else:
            r_kh = te.reduce_axis((0, KH), 'r_kh')
            r_kw = te.reduce_axis((0, KW), 'r_kw')
            U = te.compute(
                (alpha, alpha, CO // bna, CI, bna),
                lambda eps, nu, co, ci, vco: te.sum(kernel[co * bna + vco][ci][
                    r_kh][r_kw] * G[eps][r_kh] * G[nu][r_kw],
                                                    axis=[r_kh, r_kw]),
                name='U')

    # transform image
    r_a = te.reduce_axis((0, alpha), 'r_a')
    r_b = te.reduce_axis((0, alpha), 'r_b')
    V = te.compute((alpha, alpha, P_round // bnb, CI, bnb),
                   lambda eps, nu, p, ci, vp: te.sum(input_tile[ci][p][r_a][
                       r_b][vp] * B[r_a][eps] * B[r_b][nu],
                                                     axis=[r_a, r_b]),
                   name='V')

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    # batch gemm
    ci = te.reduce_axis((0, CI), name='c')
    M = te.compute(
        (alpha, alpha, CO, P_round),
        lambda eps, nu, co, p: te.sum(U[eps][nu][idxdiv(co, bna)][ci][idxmod(
            co, bna)] * V[eps][nu][idxdiv(p, bnb)][ci][idxmod(p, bnb)],
                                      axis=ci),
        name='M')

    r_a = te.reduce_axis((0, alpha), 'r_a')
    r_b = te.reduce_axis((0, alpha), 'r_b')
    Y = te.compute(
        (CO, P, m, m),
        lambda co, p, vh, vw: te.sum(
            M[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], axis=[r_a, r_b]),
        name='Y')

    # unpack output
    output = te.compute(
        (N, CO, H, W),
        lambda n, co, h, w: Y[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(
            w, m),
                              idxmod(h, m),
                              idxmod(w, m)]
        # The following hack term is used to make the padding in batch gemm ("M")
        # effective, otherwise the padding will be eliminated by bound inference.
        # Use `tvm.tir.Mul` instead of `*` to avoid issues in const folding.
        + tvm.tir.Mul(tvm.tir.const(0, out_dtype), M[alpha - 1][alpha - 1][
            CO - 1][P_round - 1]),
        name='output',
        tag='winograd_conv2d_output')

    # we have to manually assign effective GFLOP for winograd
    cfg.add_flop(2 * N * CO * H * W * KH * KW * CI)
    return output
Example #25
0
def batch_matmul_int8(cfg, x, y, out_shape=None, out_dtype=None):
    """Batch Matmul operator for int8 on CUDA.

    Parameters
    ----------
    cfg : ConfigSpace
        Autotvm tuning space config file.

    x : tvm.te.Tensor
        3-D with shape [batch, M, K] or [batch, K, M].

    y : tvm.te.Tensor
        3-D with shape [batch, K, N] or [batch, N, K].

    out_shape : List[Optional]
        Explicit intended output shape of the computation. Can be useful in cases
        with dynamic input shapes.

    out_dtype : Optional[str]
        Specifies the output data type for mixed precision batch matmul.

    Returns
    -------
    output : tvm.te.Tensor
        3-D with shape [batch, M, N]
    """
    if out_dtype is None:
        out_dtype = x.dtype

    x_shape = get_const_tuple(x.shape)
    y_shape = get_const_tuple(y.shape)
    assert len(x_shape) == 3 and len(
        y_shape) == 3, "only supports 3-dim batch_matmul"

    XB, M, XK = x.shape
    YB, N, YK = y.shape
    assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match"
    assert XK == YK, "shapes of x and y are inconsistent"

    nB = tvm.te.max(XB, YB)
    nK = ((XK + 3) // 4) * 4
    reduce_k = te.reduce_axis((0, nK), name="k")

    # pad for _dp4a vectorize
    pad_x = te.compute(
        (XB, M, nK),
        lambda b, i, j: tvm.te.if_then_else(
            j >= XK,
            tvm.runtime.convert(0).astype(x.dtype), x[b, i, j]),
    )
    pad_y = te.compute(
        (YB, N, nK),
        lambda b, i, j: tvm.te.if_then_else(
            j >= YK,
            tvm.runtime.convert(0).astype(y.dtype), y[b, i, j]),
    )

    out = te.compute(
        (nB, M, N),
        lambda b, i, j: te.sum(
            pad_x[b if XB != 1 else 0, i, reduce_k].astype(out_dtype) * pad_y[
                b if YB != 1 else 0, j, reduce_k].astype(out_dtype),
            axis=[reduce_k],
        ),
        tag="batch_matmul_int8",
    )
    cfg.add_flop(XB * M * N * nK * 2)
    return out
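A minimal NumPy reference for the semantics above (shapes assumed): the reduction length K is padded up to a multiple of 4 for the dp4a vectorization, and a batch dimension of 1 broadcasts against the other operand.

import numpy as np

XB, YB, M, N, K = 1, 3, 5, 6, 7
nK = ((K + 3) // 4) * 4                     # 8: reduction length padded for dp4a
x = np.random.randint(-128, 128, size=(XB, M, K)).astype("int8")
y = np.random.randint(-128, 128, size=(YB, N, K)).astype("int8")
pad_x = np.zeros((XB, M, nK), dtype=np.int32)
pad_x[..., :K] = x
pad_y = np.zeros((YB, N, nK), dtype=np.int32)
pad_y[..., :K] = y
out = np.einsum("bmk,bnk->bmn",
                np.broadcast_to(pad_x, (max(XB, YB), M, nK)),
                np.broadcast_to(pad_y, (max(XB, YB), N, nK)))
print(out.shape)  # (3, 5, 6)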
Example #26
0
def compute_depthwise_conv2d_nhwc(_, data, kernel, strides, padding, dilation,
                                  out_dtype):
    """TOPI compute callback for depthwise_conv2d nhwc

    Parameters
    ----------
    cfg: ConfigEntity
        The config for this template

    data : tvm.te.Tensor
        4-D with shape [batch, in_height, in_width, in_channel]

    kernel : tvm.te.Tensor
        4-D with shape [filter_height, filter_width, in_channel, channel_multiplier]

    strides : list of two ints
        [stride_height, stride_width]

    padding : list of two ints
        [pad_height, pad_width]

    dilation : list of two ints
        [dilation_height, dilation_width]

    out_dtype: str
        The output type. This is used for mixed precision.

    Returns
    -------
    output : tvm.te.Tensor
        4-D with shape [batch, out_height, out_width, out_channel]
    """

    out_dtype = out_dtype or data.dtype

    N, IH, IW, IC = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    KH, KW, IC, channel_multiplier = get_const_tuple(kernel.shape)

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)

    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1

    if pad_top or pad_left or pad_down or pad_right:
        data_pad = nn.pad(data, [0, pad_top, pad_left, 0],
                          [0, pad_down, pad_right, 0],
                          name="data_pad")
    else:
        data_pad = data

    output_shape = (N, OH, OW, IC * channel_multiplier)

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    reduce_h = te.reduce_axis((0, KH), name='reduce_h')
    reduce_w = te.reduce_axis((0, KW), name='reduce_w')

    out = te.compute(
        output_shape,
        lambda n, h, w, c: te.sum(
            data_pad[n, HSTR * h + dilation_h * reduce_h, w * WSTR + reduce_w *
                     dilation_w,
                     idxdiv(c, channel_multiplier)].astype(out_dtype) * kernel[
                         reduce_h, reduce_w,
                         idxdiv(c, channel_multiplier),
                         idxmod(c, channel_multiplier)].astype(out_dtype),
            axis=[reduce_h, reduce_w]),
        name='depthwise_conv2d_nhwc_output')
    return out
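The output-channel indexing above is a plain integer div/mod. As a tiny illustration (channel_multiplier assumed to be 2), output channel c reads input channel c // channel_multiplier and filter slice c % channel_multiplier:

channel_multiplier = 2   # assumed value for illustration
for c in range(6):       # 3 input channels x multiplier 2 = 6 output channels
    print("out_ch", c, "-> in_ch", c // channel_multiplier,
          "filter", c % channel_multiplier)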
Example #27
0
def test_tensor_core_batch_conv():
    # The sizes of inputs and filters
    batch_size = 32
    height = 14
    width = 14
    in_channels = 32
    out_channels = 64
    kernel_h = 3
    kernel_w = 3
    pad_h = 1
    pad_w = 1
    stride_h = 1
    stride_w = 1
    block_size = 16

    block_row_warps = 2
    block_col_warps = 4
    warp_row_tiles = 4
    warp_col_tiles = 2
    warp_size = 32
    chunk = 2

    # Input feature map: (N, H, W, IC, n, ic)
    data_shape = (
        batch_size // block_size,
        height,
        width,
        in_channels // block_size,
        block_size,
        block_size,
    )
    # Kernel: (H, W, IC, OC, ic, oc)
    kernel_shape = (
        kernel_h,
        kernel_w,
        in_channels // block_size,
        out_channels // block_size,
        block_size,
        block_size,
    )

    # Output feature map: (N, H, W, OC, n, oc)
    output_shape = (
        batch_size // block_size,
        height,
        width,
        out_channels // block_size,
        block_size,
        block_size,
    )

    assert batch_size % block_size == 0
    assert in_channels % block_size == 0
    assert out_channels % block_size == 0

    kh = te.reduce_axis((0, kernel_h), name="kh")
    kw = te.reduce_axis((0, kernel_w), name="kw")
    ic = te.reduce_axis((0, in_channels // block_size), name="ic")
    ii = te.reduce_axis((0, block_size), name="ii")

    # Algorithm
    A = te.placeholder(data_shape, name="A", dtype="float16")
    W = te.placeholder(kernel_shape, name="W", dtype="float16")
    Apad = te.compute(
        (
            batch_size // block_size,
            height + 2 * pad_h,
            width + 2 * pad_w,
            in_channels // block_size,
            block_size,
            block_size,
        ),
        lambda n, h, w, i, nn, ii: tvm.tir.if_then_else(
            tvm.tir.all(h >= pad_h, h - pad_h < height, w >= pad_w, w - pad_w < width),
            A[n, h - pad_h, w - pad_w, i, nn, ii],
            tvm.tir.const(0.0, "float16"),
        ),
        name="Apad",
    )
    Conv = te.compute(
        output_shape,
        lambda n, h, w, o, nn, oo: te.sum(
            Apad[n, h * stride_h + kh, w * stride_w + kw, ic, nn, ii].astype("float32")
            * W[kh, kw, ic, o, ii, oo].astype("float32"),
            axis=[ic, kh, kw, ii],
        ),
        name="Conv",
    )

    s = te.create_schedule(Conv.op)
    s[Apad].compute_inline()

    AS = s.cache_read(Apad, "shared", [Conv])
    WS = s.cache_read(W, "shared", [Conv])
    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
    ConvF = s.cache_write(Conv, "wmma.accumulator")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    nc, hc, wc, oc, nnc, ooc = Conv.op.axis
    block_k = s[Conv].fuse(hc, wc)
    s[Conv].bind(block_k, block_z)
    nc, nci = s[Conv].split(nc, factor=warp_row_tiles)
    block_i, nc = s[Conv].split(nc, factor=block_row_warps)
    oc, oci = s[Conv].split(oc, factor=warp_col_tiles)
    block_j, oc = s[Conv].split(oc, factor=block_col_warps)
    s[Conv].reorder(block_k, block_i, block_j, nc, oc, nci, oci, nnc, ooc)
    s[Conv].bind(block_i, block_x)
    s[Conv].bind(block_j, block_y)
    s[Conv].bind(nc, thread_y)
    s[Conv].bind(oc, thread_z)

    s[ConvF].compute_at(s[Conv], oc)
    n, h, w, o, nnf, oof = ConvF.op.axis
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(ko, kh, ki, kw, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], kw)
    s[WF].compute_at(s[ConvF], kw)

    s[WS].compute_at(s[ConvF], kh)
    s[AS].compute_at(s[ConvF], kh)

    n, h, w, i, nn, ii = AS.op.axis
    tx, xo = s[AS].split(n, nparts=block_row_warps)
    ty, yo = s[AS].split(xo, nparts=block_col_warps)
    t = s[AS].fuse(nn, ii)
    to, ti = s[AS].split(t, factor=warp_size)
    s[AS].bind(tx, thread_y)
    s[AS].bind(ty, thread_z)
    s[AS].bind(ti, thread_x)

    kh, kw, ic, o, ii, oo = WS.op.axis
    tx, xo = s[WS].split(o, nparts=block_row_warps)
    ty, yo = s[WS].split(xo, nparts=block_col_warps)
    t = s[WS].fuse(ii, oo)
    to, ti = s[WS].split(t, nparts=warp_size)
    s[WS].bind(tx, thread_y)
    s[WS].bind(ty, thread_z)
    s[WS].bind(to, thread_x)
    s[WS].vectorize(ti)

    s[AF].tensorize(AF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_a"))
    s[WF].tensorize(WF.op.axis[-2], intrin_wmma_load_matrix((16, 16, 16), "wmma.matrix_b"))
    s[Conv].tensorize(nnc, intrin_wmma_store_matrix((16, 16, 16)))
    s[ConvF].tensorize(nnf, intrin_wmma_gemm((16, 16, 16)))

    func = tvm.build(s, [A, W, Conv], "cuda")

    dev = tvm.gpu(0)
    a_np = np.random.uniform(size=data_shape).astype(A.dtype)
    w_np = np.random.uniform(size=kernel_shape).astype(W.dtype)
    a = tvm.nd.array(a_np, dev)
    w = tvm.nd.array(w_np, dev)
    c = tvm.nd.array(np.zeros(output_shape, dtype=Conv.dtype), dev)
    evaluator = func.time_evaluator(func.entry_name, dev, number=3)
    print("conv2d with tensor core: %f ms" % (evaluator(a, w, c).mean * 1e3))

    if VERIFY:
        func(a, w, c)
        a_np = a_np.transpose(0, 4, 1, 2, 3, 5).reshape(batch_size, height, width, in_channels)
        w_np = w_np.transpose(0, 1, 2, 4, 3, 5).reshape(
            kernel_h, kernel_w, in_channels, out_channels
        )
        c_np = (
            c.asnumpy()
            .transpose((0, 4, 1, 2, 3, 5))
            .reshape(batch_size, height, width, out_channels)
        )
        c_std = conv2d_nhwc_python(
            a_np.astype(Conv.dtype), w_np.astype(Conv.dtype), (stride_h, stride_w), (pad_h, pad_w)
        ).astype(Conv.dtype)
        np.testing.assert_allclose(c_np, c_std, rtol=1e-4, atol=1e-4)
Example #28
0
def _decl_spatial_pack(cfg, data, kernel, strides, padding, dilation,
                       out_dtype, num_tile):
    out_dtype = out_dtype or data.dtype

    N, C, IH, IW = get_const_tuple(data.shape)

    if isinstance(dilation, int):
        dilation_h = dilation_w = dilation
    else:
        dilation_h, dilation_w = dilation

    if len(kernel.shape) == 4:
        pre_packed = False
        C, M, KH, KW = get_const_tuple(kernel.shape)
    else:  # kernel tensor is pre packed
        pre_packed = True
        C, M, KH, KW, VC = get_const_tuple(kernel.shape)
        C = C * VC

    dilated_kernel_h = (KH - 1) * dilation_h + 1
    dilated_kernel_w = (KW - 1) * dilation_w + 1

    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
        padding, (dilated_kernel_h, dilated_kernel_w))
    HSTR, WSTR = strides if isinstance(strides,
                                       (tuple, list)) else (strides, strides)
    OH = (IH + pad_top + pad_down - dilated_kernel_h) // HSTR + 1
    OW = (IW + pad_left + pad_right - dilated_kernel_w) // WSTR + 1
    # pack data
    HPAD = pad_top + pad_down
    WPAD = pad_left + pad_right
    DOPAD = (HPAD != 0 or WPAD != 0)
    if DOPAD:
        data_pad = nn.pad(data, (0, 0, pad_top, pad_left),
                          (0, 0, pad_down, pad_right),
                          name="data_pad")
    else:
        data_pad = data

    # fallback support
    # Currently, Mali schedule doesn't use it like conv2d.
    if cfg.is_fallback:
        ref_log = autotvm.tophub.load_reference_log(
            'arm_cpu', 'rk3399', 'depthwise_conv2d_nchw_spatial_pack.arm_cpu')
        cfg.fallback_with_reference_log(ref_log)

    # ==================== define configuration space ====================
    n, c, oh, ow = cfg.axis(N), cfg.axis(C), cfg.axis(OH), cfg.axis(OW)
    kh, kw = cfg.reduce_axis(KH), cfg.reduce_axis(KW)

    # Currently, Mali schedule doesn't use it like conv2d.
    # Leave num_tile for possible future use of Mali schedule
    if num_tile == 2:  # for arm cpu
        co, vc = cfg.define_split('tile_co', c, num_outputs=2)
        oh, vh = cfg.define_split('tile_oh', oh, num_outputs=2)
        ow, vw = cfg.define_split('tile_ow', ow, num_outputs=2)
    else:
        raise RuntimeError("Invalid num_tile")

    cfg.define_reorder("reorder_0", [n, co, oh, ow, kh, kw, vh, vw, vc],
                       policy='candidate',
                       candidate=[[n, co, oh, ow, kh, kw, vh, vw, vc],
                                  [n, co, oh, ow, kh, kw, vc, vh, vw]])

    cfg.define_reorder("reorder_1", [n, co, oh, ow, vh, vw, vc],
                       policy='candidate',
                       candidate=[[n, co, oh, ow, vh, vw, vc],
                                  [n, co, oh, ow, vc, vh, vw],
                                  [n, co, oh, ow, vh, vc, vw]])

    cfg.define_annotate("ann_reduce", [kh, kw], policy='try_unroll')
    cfg.define_annotate("ann_spatial", [vh, vw, vc], policy='try_unroll_vec')
    # ====================================================================

    VC = cfg["tile_co"].size[-1]
    VH = cfg["tile_oh"].size[-1]
    VW = cfg["tile_ow"].size[-1]

    kvshape = (C // VC, M, KH, KW, VC)
    ovshape = (N, C * M // VC, OH // VH, OW // VW, VH, VW, VC)
    oshape = (N, C * M, OH, OW)

    if dilation_h != 1 or dilation_w != 1:
        # undilate input data
        dvshape = (N, OH // VH, OW // VW, C, KH, KW, VH, VW)
        data_vec = te.compute(
            dvshape,
            lambda n, h, w, c, kh, kw, vh, vw: data_pad[n][c][
                (h * VH + vh) * HSTR + kh * dilation_h][
                    (w * VW + vw) * WSTR + kw * dilation_w],
            name='data_vec_undilated')
    else:
        dvshape = (N, OH // VH, OW // VW, C, VH * HSTR + KH - 1,
                   VW * WSTR + KW - 1)
        data_vec = te.compute(dvshape,
                              lambda n, h, w, c, vh, vw: data_pad[n][c][
                                  h * VH * HSTR + vh][w * VW * WSTR + vw],
                              name='data_vec')

    if pre_packed:
        kernel_vec = kernel
    else:
        kernel_vec = te.compute(
            kvshape,
            lambda co, m, kh, kw, vc: kernel[co * VC + vc][m][kh][kw],
            name='kernel_vec')

    kh = te.reduce_axis((0, KH), name='kh')
    kw = te.reduce_axis((0, KW), name='kw')

    idxdiv = tvm.tir.indexdiv
    idxmod = tvm.tir.indexmod

    if dilation_h != 1 or dilation_w != 1:
        conv = te.compute(
            ovshape, lambda n, co, h, w, vh, vw, vc: \
            te.sum(data_vec[n, h, w, idxdiv(co * VC + vc, M), kh, kw, vh, vw]
                   .astype(out_dtype) *
                   kernel_vec[idxdiv(co, M), idxmod(co, M), kh, kw, vc].astype(out_dtype),
                   axis=[kh, kw]), name='depthwise_conv')
    else:
        conv = te.compute(ovshape, lambda n, co, h, w, vh, vw, vc: \
                          te.sum(data_vec[n, h, w, idxdiv((co * VC + vc), M), vh * HSTR + kh,
                                          vw * WSTR + kw].astype(out_dtype) *
                                 kernel_vec[idxdiv(co, M),
                                            idxmod(co, M),
                                            kh, kw, vc].astype(out_dtype),
                                 axis=[kh, kw]), name='depthwise_conv')

    output = te.compute(oshape,
                        lambda n, co, h, w: conv[n,
                                                 idxdiv(co, VC),
                                                 idxdiv(h, VH),
                                                 idxdiv(w, VW),
                                                 idxmod(h, VH),
                                                 idxmod(w, VW),
                                                 idxmod(co, VC)],
                        name='output_unpack',
                        tag='spatial_depthwise_conv2d_nchw_output')
    return output
Example #29
0
def test_tensorize_matmul():
    n = 1024
    m = n
    l = n
    A = te.placeholder((n, l), name='A')
    B = te.placeholder((m, l), name='B')
    k = te.reduce_axis((0, l), name='k')
    C = te.compute((n, m),
                   lambda i, j: te.sum(B[j, k] * A[i, k], axis=k),
                   name='C')

    def check(factor):
        s = te.create_schedule(C.op)
        x, y = C.op.axis
        yo, yi = s[C].split(y, factor=factor)
        gemv = intrin_gemv(factor, l)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.te.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.tir.ir_pass.Equal(
            tvm.tir.ir_pass.CanonicalSimplify(body[0]),
            tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor(factor, rfactor):
        s = te.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        s[C].reorder(yo, ro, yi, ri)
        gemv = intrin_gemv(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.te.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.tir.ir_pass.Equal(
            tvm.tir.ir_pass.CanonicalSimplify(body[0]),
            tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor_no_reset(factor, rfactor):
        s = te.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        s[C].reorder(yo, ro, yi, ri)
        gemv = intrin_gemv_no_reset(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.te.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.tir.ir_pass.Equal(
            tvm.tir.ir_pass.CanonicalSimplify(body[0]),
            tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    def check_rfactor_no_reset_multi_reduction(factor, rfactor):
        s = te.create_schedule(C.op)
        x, y = C.op.axis
        rk = C.op.reduce_axis[0]
        yo, yi = s[C].split(y, factor=factor)
        ro, ri = s[C].split(rk, factor=rfactor)
        roo, roi = s[C].split(ro, factor=2)
        s[C].reorder(yo, roo, roi, yi, ri)
        gemv = intrin_gemv_no_reset(factor, rfactor)
        s[C].tensorize(yi, gemv)
        s = s.normalize()
        dom_map = tvm.te.schedule.InferBound(s)
        finfer = tvm.get_global_func("test.op.InferTensorizeRegion")
        out_dom, in_dom = finfer(s[C], dom_map)
        assert tvm.tir.ir_pass.Equal(out_dom[x].extent, 1)
        assert tvm.tir.ir_pass.Equal(out_dom[y].extent, factor)
        assert tvm.tir.ir_pass.Equal(out_dom[y].min, yo * factor)
        fmatch = tvm.get_global_func("test.op.MatchTensorizeBody")
        body = fmatch(s[C], out_dom, in_dom, gemv)
        assert tvm.tir.ir_pass.Equal(
            tvm.tir.ir_pass.CanonicalSimplify(body[0]),
            tvm.tir.ir_pass.CanonicalSimplify(gemv.op.body[0]))
        stmt = tvm.te.schedule.ScheduleOps(s, dom_map)
        tvm.lower(s, [A, B, C])

    check(16)
    check_rfactor(16, 16)
    check_rfactor_no_reset(16, 16)
    check_rfactor_no_reset_multi_reduction(16, 16)
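intrin_gemv and intrin_gemv_no_reset used above are helpers defined elsewhere in the test file and are not shown in this example. A minimal sketch of how such a GEMV tensor intrinsic is typically declared with te.decl_tensor_intrin (the extern symbols gemv, fill_zero and gemv_add, as well as the buffer parameters, are illustrative assumptions rather than the exact helper):

import tvm
from tvm import te

def intrin_gemv_sketch(m, n):
    # Declares z[i] = sum_k w[i, k] * x[k] so it can replace the tensorized
    # inner loops of the matmul above.
    w = te.placeholder((m, n), name="w")
    x = te.placeholder((n,), name="x")
    k = te.reduce_axis((0, n), name="k")
    z = te.compute((m,), lambda i: te.sum(w[i, k] * x[k], axis=k), name="z")
    Wb = tvm.tir.decl_buffer(w.shape, w.dtype, name="W",
                             offset_factor=16, strides=[te.var("ldw"), 1])

    def intrin_func(ins, outs):
        ww, xx = ins
        zz = outs[0]
        ww_ptr = ww.access_ptr("r")
        xx_ptr = xx.access_ptr("r")
        zz_ptr = zz.access_ptr("w")
        # Full body, reduction init, and reduction update, each delegated to
        # a hypothetical external micro-kernel registered as a packed func.
        body = tvm.tir.call_packed("gemv", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        reset = tvm.tir.call_packed("fill_zero", zz_ptr, n)
        update = tvm.tir.call_packed("gemv_add", ww_ptr, xx_ptr, zz_ptr, n, ww.strides[0])
        return body, reset, update

    return te.decl_tensor_intrin(z.op, intrin_func, binds={w: Wb})

A _no_reset variant typically returns None in place of the reset stage so the accumulator is reused across the outer reduction split, which is what check_rfactor_no_reset exercises.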
Example #30
    def run_gemm_packed(env, remote, batch_size, channel, block):
        data_shape = (batch_size // env.BATCH, channel // env.BLOCK_IN,
                      env.BATCH, env.BLOCK_IN)
        weight_shape = (
            channel // env.BLOCK_OUT,
            channel // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        res_shape = (batch_size // env.BATCH, channel // env.BLOCK_OUT,
                     env.BATCH, env.BLOCK_OUT)
        # Count each multiply-accumulate (FMA) as two ops
        num_ops = 2 * channel * channel * batch_size

        ko = te.reduce_axis((0, channel // env.BLOCK_IN), name="ko")
        ki = te.reduce_axis((0, env.BLOCK_IN), name="ki")

        data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
        weight = te.placeholder(weight_shape,
                                name="weight",
                                dtype=env.wgt_dtype)
        data_buf = te.compute(data_shape, lambda *i: data(*i), "data_buf")
        weight_buf = te.compute(weight_shape, lambda *i: weight(*i),
                                "weight_buf")
        res_gem = te.compute(
            res_shape,
            lambda bo, co, bi, ci: te.sum(
                data_buf[bo, ko, bi, ki].astype(env.acc_dtype) * weight_buf[
                    co, ko, ci, ki].astype(env.acc_dtype),
                axis=[ko, ki],
            ),
            name="res_gem",
        )
        res_shf = te.compute(res_shape,
                             lambda *i: res_gem(*i) >> 8,
                             name="res_shf")
        res_max = te.compute(res_shape, lambda *i: tvm.te.max(res_shf(*i), 0),
                             "res_max")  # relu
        res_min = te.compute(res_shape,
                             lambda *i: tvm.te.min(res_max(*i),
                                                   (1 <<
                                                    (env.INP_WIDTH - 1)) - 1),
                             "res_min")  # clip to the INP_WIDTH output range
        res = te.compute(res_shape,
                         lambda *i: res_min(*i).astype(env.inp_dtype),
                         name="res")

        def verify(s):
            mod = vta.build(s, [data, weight, res],
                            "ext_dev",
                            env.target_host,
                            name="gemm")
            temp = utils.tempdir()
            mod.save(temp.relpath("gemm.o"))
            remote.upload(temp.relpath("gemm.o"))
            f = remote.load_module("gemm.o")
            # verify
            dev = remote.ext_dev(0)
            # Data in original format
            data_orig = np.random.randint(-128,
                                          128,
                                          size=(batch_size,
                                                channel)).astype(data.dtype)
            weight_orig = np.random.randint(
                -128, 128, size=(channel, channel)).astype(weight.dtype)
            data_packed = data_orig.reshape(batch_size // env.BATCH, env.BATCH,
                                            channel // env.BLOCK_IN,
                                            env.BLOCK_IN).transpose(
                                                (0, 2, 1, 3))
            weight_packed = weight_orig.reshape(channel // env.BLOCK_OUT,
                                                env.BLOCK_OUT,
                                                channel // env.BLOCK_IN,
                                                env.BLOCK_IN).transpose(
                                                    (0, 2, 1, 3))
            res_np = np.zeros(res_shape).astype(res.dtype)
            data_arr = tvm.nd.array(data_packed, dev)
            weight_arr = tvm.nd.array(weight_packed, dev)
            res_arr = tvm.nd.array(res_np, dev)
            res_ref = np.zeros(res_shape).astype(env.acc_dtype)
            for b in range(batch_size // env.BATCH):
                for i in range(channel // env.BLOCK_OUT):
                    for j in range(channel // env.BLOCK_IN):
                        res_ref[b, i, :] += np.dot(
                            data_packed[b, j, :].astype(env.acc_dtype),
                            weight_packed[i, j].T.astype(env.acc_dtype),
                        )
            res_ref = np.right_shift(res_ref, 8)
            res_ref = np.clip(res_ref, 0,
                              (1 << (env.INP_WIDTH - 1)) - 1).astype(res.dtype)
            time_f = f.time_evaluator("gemm", dev, number=20)
            if env.TARGET in ["sim", "tsim"]:
                simulator.clear_stats()
            cost = time_f(data_arr, weight_arr, res_arr)
            if env.TARGET in ["sim", "tsim"]:
                stats = simulator.stats()
                print("Execution statistics:")
                for k, v in stats.items():
                    print("\t{:<16}: {:>16}".format(k, v))
            res_unpack = res_arr.numpy().reshape(batch_size // env.BATCH,
                                                 channel // env.BLOCK_OUT,
                                                 env.BATCH, env.BLOCK_OUT)
            return cost

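        # Descriptive note (added): run_schedule builds one VTA schedule for
        # the packed GEMM above. The staging buffers are bound to the on-chip
        # scopes (inp/wgt/acc), the inner GEMM loops are tensorized with the
        # supplied gemm intrinsic, the shift/relu/clip stages are tagged with
        # the supplied ALU pragma, and loads/stores are tagged with the
        # supplied DMA-copy pragmas (the unit tests below swap real stages
        # for mocks to isolate each one). With `block` set, the output is
        # tiled first so each piece fits on-chip.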
        def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir):
            s = te.create_schedule(res.op)
            s[data_buf].set_scope(env.inp_scope)
            s[weight_buf].set_scope(env.wgt_scope)
            s[res_gem].set_scope(env.acc_scope)
            s[res_shf].set_scope(env.acc_scope)
            s[res_min].set_scope(env.acc_scope)
            s[res_max].set_scope(env.acc_scope)

            if block:
                bblock = block // env.BATCH
                iblock = block // env.BLOCK_IN
                oblock = block // env.BLOCK_OUT
                xbo, xco, xbi, xci = s[res].op.axis
                xb1, xco1, xb2, xco2 = s[res].tile(xbo, xco, bblock, oblock)
                store_pt = xb2

                s[res_gem].compute_at(s[res], xco1)
                s[res_shf].compute_at(s[res], xco1)
                s[res_min].compute_at(s[res], xco1)
                s[res_max].compute_at(s[res], xco1)

                xbo, xco, xbi, xci = s[res_gem].op.axis
                # Compute one line at a time
                ko1, ko2 = s[res_gem].split(ko, iblock)
                s[res_gem].reorder(ko1, ko2, xbo, xco, xbi, xci, ki)
                s[data_buf].compute_at(s[res_gem], ko1)
                s[weight_buf].compute_at(s[res_gem], ko1)
                # Use VTA instructions
                s[data_buf].pragma(s[data_buf].op.axis[0], load_inp)
                s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt)
                s[res_gem].tensorize(xbi, gemm)
                s[res_shf].pragma(s[res_shf].op.axis[0], alu)
                s[res_min].pragma(s[res_min].op.axis[0], alu)
                s[res_max].pragma(s[res_max].op.axis[0], alu)
                s[res].pragma(store_pt, store_out)
            else:
                xbo, xco, xbi, xci = s[res_gem].op.axis
                s[res_gem].reorder(ko, xbo, xco, xbi, xci, ki)
                # Use VTA instructions
                s[data_buf].pragma(s[data_buf].op.axis[0], load_inp)
                s[weight_buf].pragma(s[weight_buf].op.axis[0], load_wgt)
                s[res_gem].tensorize(xbi, gemm)
                s[res_shf].pragma(s[res_shf].op.axis[0], alu)
                s[res_min].pragma(s[res_min].op.axis[0], alu)
                s[res_max].pragma(s[res_max].op.axis[0], alu)
                s[res].pragma(s[res].op.axis[0], store_out)

            if print_ir:
                print(tvm.lower(s, [data, weight, res], simple_mode=True))
            return verify(s)

        def gemm_normal(print_ir):
            mock = env.mock
            print("----- GEMM GOPS End-to-End Test-------")

            def run_test(header, print_ir):
                cost = run_schedule(
                    env.dma_copy,
                    env.dma_copy,
                    env.gemm,
                    env.alu,
                    env.dma_copy,
                    print_ir,
                )
                gops = (num_ops / cost.mean) / float(10**9)
                print(header)
                print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

            with vta.build_config():
                run_test("NORMAL", print_ir)

        def gemm_unittest(print_ir):
            mock = env.mock
            print("----- GEMM Unit Test-------")

            def run_test(header, print_ir):
                cost = run_schedule(mock.dma_copy, mock.dma_copy, env.gemm,
                                    mock.alu, mock.dma_copy, print_ir)
                gops = (num_ops / cost.mean) / float(10**9)
                print(header)
                print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

            with vta.build_config():
                run_test("NORMAL", print_ir)

        def alu_unittest(print_ir):
            mock = env.mock
            print("----- ALU Unit Test-------")

            def run_test(header, print_ir):
                cost = run_schedule(mock.dma_copy, mock.dma_copy, mock.gemm,
                                    env.alu, mock.dma_copy, print_ir)
                gops = (num_ops / cost.mean) / float(10**9)
                print(header)
                print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops))

            with vta.build_config():
                run_test("NORMAL", print_ir)
            print("")

        def load_inp_unittest(print_ir):
            mock = env.mock
            print("----- LoadInp Unit Test-------")

            def run_test(header, print_ir):
                cost = run_schedule(env.dma_copy, mock.dma_copy, mock.gemm,
                                    mock.alu, mock.dma_copy, print_ir)
                gops = (num_ops / cost.mean) / float(10**9)
                bandwidth = (batch_size * channel * env.INP_WIDTH /
                             cost.mean) / float(10**9)
                print(header)
                print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" %
                      (cost.mean, gops, bandwidth))

            with vta.build_config():
                run_test("NORMAL", print_ir)
            print("")

        def load_wgt_unittest(print_ir):
            mock = env.mock
            print("----- LoadWgt Unit Test-------")

            def run_test(header, print_ir):
                cost = run_schedule(mock.dma_copy, env.dma_copy, mock.gemm,
                                    mock.alu, mock.dma_copy, print_ir)
                gops = (num_ops / cost.mean) / float(10**9)
                bandwidth = (channel * channel * env.WGT_WIDTH /
                             cost.mean) / float(10**9)
                print(header)
                print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" %
                      (cost.mean, gops, bandwidth))

            with vta.build_config():
                run_test("NORMAL", print_ir)
            print("")

        def store_out_unittest(print_ir):
            mock = env.mock
            print("----- StoreOut Unit Test-------")

            def run_test(header, print_ir):
                cost = run_schedule(mock.dma_copy, mock.dma_copy, mock.gemm,
                                    mock.alu, env.dma_copy, print_ir)
                gops = (num_ops / cost.mean) / float(10**9)
                bandwidth = (batch_size * channel * env.OUT_WIDTH /
                             cost.mean) / float(10**9)
                print(header)
                print("\tTime cost = %g sec/op, %g GOPS, bandwidth=%g Gbits" %
                      (cost.mean, gops, bandwidth))

            with vta.build_config():
                run_test("NORMAL", print_ir)
            print("")

        gemm_normal(False)
        gemm_unittest(False)
        alu_unittest(False)