def _alter_conv3d_layout(attrs, inputs, tinfos, out_type):
    target = tvm.target.Target.current(allow_none=False)
    dispatch_ctx = autotvm.task.DispatchContext.current

    _, outs = relay.backend.compile_engine.select_implementation(
        relay.op.get("nn.conv3d"), attrs, tinfos, out_type, target)
    workload = autotvm.task.get_workload(outs)
    if workload is None:
        # The best implementation is not an AutoTVM template,
        # we then assume it's not necessary to alter this op.
        return None
    cfg = dispatch_ctx.query(target, workload)
    if cfg.is_fallback:  # if is fallback, clear query cache and return None
        autotvm.task.clear_fallback_cache(target, workload)
        return None

    topi_tmpl = workload[0]
    new_attrs = {k: attrs[k] for k in attrs.keys()}

    strides = attrs.get_int_tuple("strides")
    padding = attrs.get_int_tuple("padding")
    dilation = attrs.get_int_tuple("dilation")
    groups = attrs.get_int('groups')
    data_layout = attrs["data_layout"]
    kernel_layout = attrs["kernel_layout"]
    data, kernel = tinfos
    out_dtype = out_type.dtype

    if topi_tmpl == "conv3d_ncdhw_winograd.cuda":
        if dilation != (1, 1, 1):
            logger.warning("Does not support weight pre-transform for dilated 3D convolution.")
            return None

        assert data_layout == "NCDHW" and kernel_layout == "OIDHW"
        N, CI, D, H, W = get_const_tuple(data.shape)
        CO, _, KD, KH, KW = get_const_tuple(kernel.shape)

        # Pre-compute weight transformation in winograd
        tile_size = _infer_tile_size(tinfos[0], tinfos[1])

        weight = relay.nn.contrib_conv3d_winograd_weight_transform(inputs[1], tile_size=tile_size)
        new_attrs['tile_size'] = tile_size
        new_attrs['channels'] = CO

        # Store the same config for the altered operators (workload)
        new_data = data
        # Check if depth is transformed or not
        if 2 < KD < 8 and KD == KH:
            new_weight = te.placeholder(
                (KD + tile_size - 1, KH + tile_size - 1, KW + tile_size - 1, CO, CI),
                dtype=kernel.dtype)
        else:
            new_weight = te.placeholder(
                (KH + tile_size - 1, KW + tile_size - 1, KD, CO, CI),
                dtype=kernel.dtype)
        new_workload = autotvm.task.args_to_workload(
            [new_data, new_weight, strides, padding, dilation, out_dtype],
            "conv3d_ncdhw_winograd_without_weight_transform.cuda")
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv3d_winograd_without_weight_transform(
            inputs[0], weight, **new_attrs)

    return None
Пример #2
0
def verify_conv2d_NCHWc_int8(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width),
                       name="A",
                       dtype="int8")
    W = te.placeholder((num_filter, in_channel, kernel, kernel),
                       name="W",
                       dtype="int8")
    bias = te.placeholder(
        (num_filter // oc_block_factor, 1, 1, oc_block_factor),
        name="bias",
        dtype="int8")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding).astype(dtype)

        # convert to NCHWc
        _, _, out_height, out_width = c_np.shape
        c_np = c_np.reshape(
            (batch, num_filter // oc_block_factor, oc_block_factor, out_height,
             out_width)).transpose(0, 1, 3, 4, 2)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.cuda.conv2d_NCHWc_int8(A, W, (stride, stride), padding,
                                            (dilation, dilation), "NCHW",
                                            dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.cuda.schedule_conv2d_NCHWc_int8([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    for target in ["cuda"]:
        check_target(target)
Пример #3
0
def verify_conv2d_nchw_int8(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width),
                       name="A",
                       dtype="int8")
    W = te.placeholder((num_filter, in_channel, kernel, kernel),
                       name="W",
                       dtype="int8")
    bias = te.placeholder((num_filter, 1, 1), name="bias", dtype="int8")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding).astype(dtype)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def verify_workload_padding():
        _, _, out_height, out_width = get_const_tuple(c_np.shape)
        wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype)

        # for testing functionality,
        # we choose arbitrary int32_lanes and num_int8_elements can divide the channel,
        # regardless of the performance.
        int32_lanes, num_int8_elements = num_filter, in_channel

        # check if tile_ow candidates are the factors of the right output weight.
        cfg = autotvm.get_config()
        fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes,
                                          num_int8_elements)
        ow_tile = np.prod(cfg["tile_ow"].size)

        tvm.testing.assert_allclose(ow_tile, out_width)

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        if target == "cuda" and not tvm.contrib.nvcc.have_int8(
                dev.compute_version):
            print("Skip because int8 intrinsics are not available")
            return

        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.cuda.conv2d_nchw_int8(A, W, (stride, stride), padding,
                                           (dilation, dilation), dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.cuda.schedule_conv2d_nchw_int8([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    verify_workload_padding()

    for target in ["cuda"]:
        check_target(target)
Пример #4
0
Файл: poc.py Проект: were/UNIT
import tvm
from tvm import te
from tensorizer.intrinsics import INTRINSICS
import numpy as np

n, m, k = 128, 768, 3072

a = te.placeholder((n, k), 'float16')
b = te.placeholder((k, m), 'float16')

block_k = 4

rv = te.reduce_axis((0, k // block_k), )


def compute(xo, yo, z, xi, yi):
    x = xo * 16 + xi
    y = yo * 16 + yi
    lhs = a[x, z * (k // block_k) + rv].astype('float32')
    rhs = b[rv + z * (k // block_k), y].astype('float32')
    return te.sum(lhs * rhs, axis=[rv])


c = te.compute((n // 16, m // 16, block_k, 16, 16), compute)

blkX = tvm.te.thread_axis('blockIdx.x')
blkY = tvm.te.thread_axis('blockIdx.y')
thrY = tvm.te.thread_axis('threadIdx.y')
thrX = tvm.te.thread_axis('threadIdx.x')

sch = te.create_schedule(c.op)
def test_dwarf_debug_information():
    nn = 1024
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n,), name='A')
    B = te.placeholder((n,), name='B')
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name='C')
    s = te.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)
    def check_llvm_object():
        if not tvm.runtime.enabled("llvm"):
            return
        if tvm.target.codegen.llvm_version_major() < 5:
            return
        if tvm.target.codegen.llvm_version_major() > 6:
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], "llvm")
        temp = util.tempdir()
        o_path = temp.relpath("temp.o")
        m.save(o_path)
        import re
        import shutil
        import subprocess
        import sys

        # Try the dwarfdump utility (OS X)
        if shutil.which("dwarfdump"):
            output = subprocess.check_output(["dwarfdump", o_path])
            assert re.search(r"""DW_AT_name\\t\("fadd1"\)""", str(output))
            assert re.search(r"""DW_AT_name\\t\("fadd2"\)""", str(output))

        # Try gobjdump (OS X)
        if shutil.which("gobjdump"):
            output = subprocess.check_output(["gobjdump", "--dwarf", o_path])
            assert re.search(r"""DW_AT_name.*fadd1""", str(output))
            assert re.search(r"""DW_AT_name.*fadd2""", str(output))

        # Try objdump (Linux) - Darwin objdump has different DWARF syntax.
        if shutil.which("objdump") and sys.platform != 'darwin':
            output = subprocess.check_output(["objdump", "--dwarf", o_path])
            assert re.search(r"""DW_AT_name.*fadd1""", str(output))
            assert re.search(r"""DW_AT_name.*fadd2""", str(output))

    def check_llvm_ir():
        if not tvm.runtime.enabled("llvm"):
            return
        if tvm.target.codegen.llvm_version_major() < 5:
            return
        if tvm.target.codegen.llvm_version_major() > 6:
            return
        # build two functions
        f2 = tvm.lower(s, [A, B, C], name="fadd1")
        f1 = tvm.lower(s, [A, B, C], name="fadd2")
        m = tvm.build([f1, f2], target="llvm -target=aarch64-linux-gnu")
        ll = m.get_source("ll")

        # On non-Darwin OS, don't explicitly specify DWARF version.
        import re
        assert not re.search(r""""Dwarf Version""""", ll)
        assert re.search(r"""llvm.dbg.value""", ll)

        # Try Darwin, require DWARF-2
        m = tvm.build([f1, f2],
                      target="llvm -target=x86_64-apple-darwin-macho")
        ll = m.get_source("ll")
        assert re.search(r"""i32 4, !"Dwarf Version", i32 2""", ll)
        assert re.search(r"""llvm.dbg.value""", ll)

    check_llvm_object()
    check_llvm_ir()
Пример #6
0
def make_matrix_mul(shapeA, transposeA, shapeB, transposeB, tgt, tgt_host,
                    func_name, dtype="float32"):
    """TODO: Your code here"""
    """Hint: use tvm.reduce_axis, tvm.sum"""
    """Hint: treat 4 cases of transposeA, transposeB separately"""
    """Hint: for tvm schedule, use split, reorder, vectorize, parallel"""
    """Hint: debug tvm schedule using tvm.lower"""
    A=te.placeholder(shapeA,dtype=dtype,name="A")
    B=te.placeholder(shapeB,dtype=dtype, name="B")
    def transpose(mat):
        return te.compute((mat.shape[1],mat.shape[0]),lambda i,j:mat[j][i])


    AA=A if not transposeA else transpose(A)
    BB=B if not transposeB else transpose(B)
    k=te.reduce_axis((0,AA.shape[1]),name="k")
    C=te.compute((AA.shape[0],BB.shape[1]),lambda i,j:te.sum(AA[i][k]*BB[k][j],axis =k))

    s=te.create_schedule(C.op)
    if tgt=="llvm":
        xo,yo,xi,yi=s[C].tile(C.op.axis[0],C.op.axis[1],32,32)
        k,=s[C].op.reduce_axis
        ko,ki=s[C].split(k,factor=4)
        s[C].reorder(xo,yo,ko,xi,yi,ki)
        # s[C].parallel(ki)
    if tgt=="cuda":
        if transposeA:
            xx1,xx2=s[AA].split(AA.op.axis[0],factor=32)
            s[AA].bind(xx1,te.thread_axis("blockIdx.x"))
            s[AA].bind(xx2,te.thread_axis("threadIdx.x"))
        if transposeB:
            yy1,yy2=s[BB].split(BB.op.axis[0],factor=32)
            s[BB].bind(yy1, te.thread_axis("blockIdx.y"))
            s[BB].bind(yy2, te.thread_axis("threadIdx.y"))

        x1,x2=s[C].split(C.op.axis[0],factor =32)
        y1,y2=s[C].split(C.op.axis[1],factor=32)
        # s[C].reorder(x1,y1,x2,y2)
        s[C].bind(x1,te.thread_axis("blockIdx.x"))
        s[C].bind(y1,te.thread_axis("blockIdx.y"))
        s[C].bind(x2,te.thread_axis("threadIdx.x"))
        s[C].bind(y2,te.thread_axis("threadIdx.y"))


    # bn = 32
    # CC = s.cache_write(C, 'global')
    # xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
    # s[CC].compute_at(s[C], yo)
    # xc, yc = s[CC].op.axis
    # k, = s[CC].op.reduce_axis
    # ko, ki = s[CC].split(k, factor=4)
    # s[CC].reorder(ko, xc, ki, yc)
    # s[CC].unroll(ki)
    # s[CC].vectorize(yc)
    # s[C].parallel(xo)


    # print(tvm.lower(s,[A,B,C],simple_mode=True))

    f=tvm.build(s,[A,B,C],tgt,tgt_host,name=func_name)
    return f
Пример #7
0
def make_matrix_softmax_cross_entropy(shape, tgt, tgt_host, func_name,
                                      dtype="float32"):
    """TODO: Your code here"""
    """Hint: output shape should be (1,)"""
    A_=te.placeholder(shape,dtype=dtype,name="A_")
    A=te.placeholder(shape,dtype=dtype,name="A")

    #desined by myself
    k = te.reduce_axis((0, A.shape[1]), name="k")
    A_max = te.compute((A.shape[0],), lambda i: te.max(A[i, k], axis=k))
    A_ex = te.compute(shape, lambda i, j: te.exp(A[i, j] - A_max[i]))
    k1 = te.reduce_axis((0, A.shape[1]), name="k1")
    A_ex_sum = te.compute((A.shape[0],), lambda i: te.sum(A_ex[i, k1], axis=k1))
    A_logsoftmax = te.compute(shape, lambda i, j: te.log(A_ex[i, j] / A_ex_sum[i]))

    k2=te.reduce_axis((0,shape[1]),name="k2")
    A_logsoftmax_sum=te.compute((shape[0],0),lambda i:te.sum(A_logsoftmax[i,k2]*A_[i,k2],axis=k2))
    k3=te.reduce_axis((0,shape[0]),name="k3")
    B=te.compute((1,),lambda i: te.sum(-A_logsoftmax_sum[k3],axis = k3))
    B1=te.compute((1,), lambda i: B[i] / shape[0])

    s=te.create_schedule(B1.op)
    if tgt=="cuda":
        #I'dont know why it can't work?
        s = te.create_schedule(B1.op)

        num_thread = 64
        block_x = te.thread_axis("blockIdx.x")
        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")

        s[A_ex].bind(A_ex.op.axis[0], block_x)
        s[A_max].bind(A_max.op.axis[0], block_x)

        k_ex_sum = A_ex_sum.op.reduce_axis[0]
        ko, ki = s[A_ex_sum].split(k_ex_sum, factor=num_thread)
        EF = s.rfactor(A_ex_sum, ki)
        s[A_ex_sum].bind(s[A_ex_sum].op.axis[0], block_x)
        s[A_ex_sum].bind(s[A_ex_sum].op.reduce_axis[0], thread_x)
        s[EF].compute_at(s[A_ex_sum], s[A_ex_sum].op.reduce_axis[0])
        s[A_ex_sum].set_store_predicate(thread_x.var.equal(0))

        tx, xi = s[A_logsoftmax].split(A_logsoftmax.op.axis[1], nparts=num_thread)
        s[A_logsoftmax].bind(A_logsoftmax.op.axis[0], block_x)
        s[A_logsoftmax].bind(tx, thread_x)

        k_logsoftmax_sum = A_logsoftmax_sum.op.reduce_axis[0]
        klso, klsi = s[A_logsoftmax_sum].split(k_logsoftmax_sum, factor=num_thread)
        lsEF = s.rfactor(A_logsoftmax_sum, klsi)
        s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.axis[0], block_x)
        s[A_logsoftmax_sum].bind(s[A_logsoftmax_sum].op.reduce_axis[0], thread_x)
        s[lsEF].compute_at(s[A_logsoftmax_sum], s[A_logsoftmax_sum].op.reduce_axis[0])
        s[A_logsoftmax_sum].set_store_predicate(thread_x.var.equal(0))

        k_B=B.op.reduce_axis[0]
        kbo,kbi=s[B].split(k_B,factor=num_thread)
        bEF=s.rfactor(B,kbi)
        s[B].bind(s[B].op.reduce_axis[0],thread_x)
        s[bEF].compute_at(s[B],s[B].op.reduce_axis[0])
        s[B].set_store_predicate(block_x.var.equal(0))

        s[B1].set_store_predicate(block_x.var.equal(0))


        print(tvm.lower(s, [A, A_,B1], simple_mode=True))


    f=tvm.build(s,[A,A_,B1],tgt,tgt_host,name=func_name)
    return f
Пример #8
0
def verify_resize3d(
    batch,
    in_channel,
    in_depth,
    in_height,
    in_width,
    out_depth,
    out_height,
    out_width,
    layout="NCDHW",
    coordinate_transformation_mode="half_pixel",
    method="trilinear",
):
    if layout == "NCDHW":
        A = te.placeholder((batch, in_channel, in_depth, in_height, in_width),
                           name="A",
                           dtype="float32")
        dtype = A.dtype
        out_shape = (batch, in_channel, out_depth, out_height, out_width)
        a_np = np.random.uniform(size=(batch, in_channel, in_depth, in_height,
                                       in_width)).astype(dtype)
    elif layout == "NDHWC":
        A = te.placeholder((batch, in_depth, in_height, in_width, in_channel),
                           name="A",
                           dtype="float32")
        dtype = A.dtype
        out_shape = (batch, out_depth, out_height, out_width, in_channel)
        a_np = np.random.uniform(size=(batch, in_depth, in_height, in_width,
                                       in_channel)).astype(dtype)
    else:
        raise NotImplementedError("Layout not supported {} ".format(layout))

    B = topi.image.resize3d(
        A,
        (out_depth, out_height, out_width),
        layout=layout,
        coordinate_transformation_mode=coordinate_transformation_mode,
        method=method,
    )

    if method == "trilinear":
        b_np = tvm.topi.testing.trilinear_resize3d_python(
            a_np, (out_depth, out_height, out_width), layout,
            coordinate_transformation_mode)
    else:
        scale_d = out_depth / in_depth
        scale_h = out_height / in_height
        scale_w = out_width / in_width
        b_np = tvm.topi.testing.upsampling3d_python(
            a_np, (scale_d, scale_h, scale_w), layout)

    def check_target(target, dev):
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            s = tvm.topi.testing.get_injective_schedule(target)(B)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
        f = tvm.build(s, [A, B], target)
        f(a, b)

        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)

    for target, dev in tvm.testing.enabled_targets():
        check_target(target, dev)
Пример #9
0
    def verify_crop_and_resize(
        image_shape,
        np_boxes,
        np_box_indices,
        np_crop_size,
        layout="NHWC",
        method="bilinear",
        extrapolation_value=0.0,
    ):

        images = te.placeholder(image_shape, name="images", dtype="float32")
        np_images = np.random.uniform(size=image_shape).astype("float32")
        boxes = te.placeholder(np_boxes.shape, name="boxes", dtype="float32")
        box_ind = te.placeholder(np_box_indices.shape,
                                 name="box_ind",
                                 dtype="int32")

        batch = len(np_box_indices)
        target_height, target_width = np_crop_size[0], np_crop_size[1]
        if layout == "NHWC":
            channel = image_shape[3]
            out_shape = (batch, target_height, target_width, channel)
        elif layout == "NCHW":
            channel = image_shape[1]
            out_shape = (batch, channel, target_height, target_width)
        else:
            raise NotImplementedError(
                "Layout {} is not supported.".format(layout))

        out = topi.image.crop_and_resize(
            images,
            boxes,
            box_ind,
            np_crop_size,
            layout=layout,
            method=method,
            extrapolation_value=extrapolation_value,
        )

        baseline_np = tvm.topi.testing.crop_and_resize_python(
            np_images, np_boxes, np_box_indices, np_crop_size, layout, method,
            extrapolation_value)

        def check_target(target, dev):
            print("Running on target: %s" % target)
            with tvm.target.Target(target):
                s = tvm.topi.testing.get_injective_schedule(target)(out)
            tvm_images = tvm.nd.array(np_images, dev)
            tvm_boxes = tvm.nd.array(np_boxes, dev)
            tvm_indices = tvm.nd.array(np_box_indices, dev)
            tvm_out = tvm.nd.array(np.zeros(out_shape, dtype="float32"), dev)
            f = tvm.build(s, [images, boxes, box_ind, out],
                          target,
                          name="crop_and_resize")
            f(tvm_images, tvm_boxes, tvm_indices, tvm_out)

            tvm.testing.assert_allclose(tvm_out.asnumpy(),
                                        baseline_np,
                                        rtol=1e-3,
                                        atol=1e-3)

        for target, dev in tvm.testing.enabled_targets():
            check_target(target, dev)
Пример #10
0
def verify_reduce_map_ele(in_shape,
                          axis,
                          keepdims,
                          type="sum",
                          dtype="float32"):
    # Build the logic and compile the function
    A = te.placeholder(shape=in_shape, name="A", dtype=dtype)
    A1 = topi.sqrt(topi.exp(A))
    out_dtype = dtype
    if type == "sum":
        B = topi.sum(A1, axis=axis, keepdims=keepdims)
    elif type == "all":
        B = topi.all(A, axis=axis, keepdims=keepdims)
    elif type == "any":
        B = topi.any(A, axis=axis, keepdims=keepdims)
    elif type == "max":
        B = topi.max(A1, axis=axis, keepdims=keepdims)
    elif type == "min":
        B = topi.min(A1, axis=axis, keepdims=keepdims)
    elif type == "argmax":
        B = topi.argmax(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    elif type == "argmin":
        B = topi.argmin(A1, axis=axis, keepdims=keepdims)
        out_dtype = "int32"
    else:
        raise NotImplementedError

    def check_device(device, dev):
        print("Running on target: %s" % device)
        with tvm.target.Target(device):
            s = tvm.topi.testing.get_reduce_schedule(device)(B)

        foo = tvm.build(s, [A, B], device, name=type)
        # Test
        if dtype == "bool":
            in_npy_map = in_npy = np.random.choice([True, False],
                                                   size=in_shape)
        else:
            in_npy = np.random.uniform(-1, 1, size=in_shape).astype(dtype)
            in_npy_map = np.sqrt(np.exp(in_npy)).astype(dtype)

        if type == "sum":
            out_npy = in_npy_map.sum(axis=axis, keepdims=keepdims)
        elif type == "all" and dtype == "bool":
            out_npy = in_npy_map.all(axis=axis, keepdims=keepdims)
        elif type == "any" and dtype == "bool":
            out_npy = in_npy_map.any(axis=axis, keepdims=keepdims)
        elif type == "max":
            out_npy = in_npy_map.max(axis=axis, keepdims=keepdims)
        elif type == "min":
            out_npy = in_npy_map.min(axis=axis, keepdims=keepdims)
        elif type == "argmax":
            out_npy = _my_npy_argmax(in_npy_map, axis=axis, keepdims=keepdims)
        elif type == "argmin":
            out_npy = _my_npy_argmin(in_npy_map, axis=axis, keepdims=keepdims)
        else:
            raise NotImplementedError
        data_tvm = tvm.nd.array(in_npy, device=dev)
        out_tvm = tvm.nd.empty(shape=out_npy.shape,
                               device=dev,
                               dtype=out_dtype)
        for _ in range(1):
            foo(data_tvm, out_tvm)
        if type == "argmax" or type == "argmin":
            out_tvm_indices = out_tvm.numpy()
            if keepdims:
                out_tvm_indices = np.take(out_tvm_indices,
                                          indices=0,
                                          axis=axis)
            if axis is None:
                out_tvm_val = in_npy_map.ravel()[out_tvm_indices]
            else:
                other_indices = tuple(
                    np.indices(in_shape[0:axis] + in_shape[(axis + 1):]))
                sel_indices = other_indices[0:axis] + (
                    out_tvm_indices, ) + other_indices[axis:]
                out_tvm_val = in_npy_map[sel_indices]
            if type == "argmax":
                tvm.testing.assert_allclose(out_tvm_val,
                                            in_npy_map.max(axis=axis), 1e-3,
                                            1e-3)
            elif type == "argmin":
                tvm.testing.assert_allclose(out_tvm_val,
                                            in_npy_map.min(axis=axis), 1e-3,
                                            1e-3)
        else:
            tvm.testing.assert_allclose(out_tvm.numpy(), out_npy, 1e-3, 1e-3)

    for device, dev in tvm.testing.enabled_targets():
        check_device(device, dev)
Пример #11
0
def dot_16x1x16_uint8_int8_int32_skylake():
    """
    Int8 dot product by every 4 elements using AVX512 Skylake instructions.
    This function takes two arrays of uint8 and int8 datatype -- data[4] and
    kernel[16][4] -- and computes a dot product of data[4] with every
    4 elements of kernels, resulting in output[16] of int32 datatype.
    The pseudo code is as follows.
    .. code-block:: c
        void dot_16x1x16_uint8_int8_int32(uint8 data[4], int8 kernel[16][4],
                int32 output[16]){
            for (int i = 0; i < 16; i++){
                output[i] = 0;
                for (int k = 0; k < 4; k++){
                    output[i] += data[k] * kernel[i][k]
                }
            }
        }

    Physically, the kernel array sits in an AVX512 vector register and
    the data[4] is broadcasted to another AVX512 vector register. This
    function returns a TensorIntrin that can be used to tensorize
    a schedule.

    Returns
    -------
    intrin : TensorIntrin
        The Skylake int8 TensorIntrin that can be used in tensorizing schedule
    """

    int32_lanes = 16  # 16 int32 lanes in AVX512
    num_int8_elements = 4  # 4 int8 elements in int32
    data = te.placeholder((num_int8_elements, ), dtype='uint8', name='data')
    kernel = te.placeholder((int32_lanes, num_int8_elements),
                            dtype='int8',
                            name='kernel')
    k = te.reduce_axis((0, num_int8_elements), name='k')
    C = te.compute(
        (int32_lanes, ),
        lambda i: te.sum(
            data[k].astype('int32') * kernel[i, k].astype('int32'), axis=k),
        name="C")

    a_buffer = tvm.tir.decl_buffer(data.shape,
                                   dtype='uint8',
                                   name="a_buffer",
                                   offset_factor=1,
                                   strides=[1])
    b_buffer = tvm.tir.decl_buffer(kernel.shape,
                                   dtype='int8',
                                   name="b_buffer",
                                   offset_factor=1,
                                   strides=[te.var('ldw'), 1])

    def _intrin_func(ins, outs):
        def _instr(index):
            ib = tvm.tir.ir_builder.create()
            if index == 1:
                ib.emit(outs[0].vstore(0, tvm.tir.const(0, 'int32x16')))
                return ib.get()

            a_int8 = ins[0].vload([0], "uint8x4")
            re_int32 = tvm.tir.call_intrin('int32', 'tir.reinterpret', a_int8)
            vec_ai32 = re_int32.astype('int32x16')
            vec_a = tvm.tir.call_intrin('int8x64', 'tir.reinterpret', vec_ai32)
            vec_b = ins[1].vload([0, 0], "int8x64")
            vec_one = tvm.tir.const(1, "int16x32")
            pair_reduction = tvm.tir.call_llvm_pure_intrin(
                'int16x32', 'llvm.x86.avx512.pmaddubs.w.512',
                tvm.tir.const(0, 'uint32'), vec_a, vec_b)
            quad_reduction = tvm.tir.call_llvm_pure_intrin(
                'int32x16', 'llvm.x86.avx512.pmaddw.d.512',
                tvm.tir.const(0, 'uint32'), pair_reduction, vec_one)
            if index == 0:
                ib.emit(outs[0].vstore(0, quad_reduction))
            else:
                ib.emit(outs[0].vstore(
                    0, quad_reduction + outs[0].vload([0], 'int32x16')))
            return ib.get()

        # body, reset, update
        return _instr(0), _instr(1), _instr(2)

    buffer_params = {"offset_factor": 1}
    return te.decl_tensor_intrin(C.op,
                                 _intrin_func,
                                 binds={
                                     data: a_buffer,
                                     kernel: b_buffer
                                 },
                                 default_buffer_params=buffer_params)
Пример #12
0
def verify_conv2d_nchw(
        batch,
        in_channel,
        in_size,
        num_filter,
        kernel,
        stride,
        padding,
        dilation=1,
        add_bias=False,
        add_relu=False,
        devices=['cuda', 'llvm -device=arm_cpu', 'opencl -device=mali']):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_channel, in_height, in_width), name='A')
    W = te.placeholder((num_filter, in_channel, kernel, kernel), name='W')
    bias = te.placeholder((num_filter, 1, 1), name='bias')

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_nchw.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.uniform(size=a_shape).astype(dtype)
        w_np = np.random.uniform(size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (1, 1, dilation, dilation))
        c_np = tvm.topi.testing.conv2d_nchw_python(a_np, dw_np, stride,
                                                   padding)
        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)
        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                device, _conv2d_nchw_winograd_implement)
            C = fcompute(A, W, stride, padding, dilation, dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = fschedule([C])

        a = tvm.nd.array(a_np, ctx)
        w = tvm.nd.array(w_np, ctx)
        b = tvm.nd.array(b_np, ctx)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         ctx)
        if add_bias:
            func = tvm.build(s, [A, W, bias, C],
                             device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                             (batch, in_channel, in_size, num_filter, kernel,
                              stride, padding_sum, dilation))
            func(a, w, b, c)
        else:
            func = tvm.build(s, [A, W, C],
                             device,
                             name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                             (batch, in_channel, in_size, num_filter, kernel,
                              stride, padding_sum, dilation))
            func(a, w, c)

        rtol = 1e-3
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)

    for device in devices:
        check_device(device)
Пример #13
0
  visitor is implemented.
- How a Schedule is lowered to either an IRModule class or a LLVM module. Otherwise,
  take a look at ``python/tvm/build_module.py`` to get some basics.

"""
import tvm
from tvm import te
import numpy as np

######################################################################
# We first write a very simple vector add and build it with the default schedule. Then, we use
# our customized lowering pass to manipulate the IR directly instead of using schedule primitives.
#

n = tvm.tir.const(128, "int32")
a = te.placeholder((n, ), name="a")
b = te.placeholder((n, ), name="b")
c = te.compute((n, ), lambda i: a[i] + b[i], name='c')

sch = te.create_schedule(c.op)
ir = tvm.lower(sch, [a, b, c])
print(ir)

######################################################################
# Writing a Pass
# --------------
# Essentially, an "IR transformation pass" is a function which maps a statement to a new statement.
# Thus, we define this vectorize function and implement it step by step.
#

######################################################################
Пример #14
0
def test_convolution_inference():
    BATCH = 8
    IH = 48
    IW = 48
    IC = 16
    OC = 16
    K = 3
    PAD = 1
    STRIDE = 1

    OH = (IH + 2 * PAD - K) + 1
    OW = (IW + 2 * PAD - K) + 1
    dshape = (BATCH, IC, IH, IW)
    kshape = (OC, IC, K, K)
    bshape = (OC, )
    oshape = (BATCH, OC, OH, OW)

    data = te.placeholder(dshape, name="data")
    kernel = te.placeholder(kshape, name="kernel")
    bias = te.placeholder(bshape, name="bias")

    def verify(target="llvm",
               algorithm=nnpack.ConvolutionAlgorithm.AUTO,
               with_bias=True):
        if not tvm.get_global_func(
                "tvm.contrib.nnpack.fully_connected_inference", True):
            pytest.skip("extern function is not available")
        if not nnpack.is_available():
            pytest.skip("nnpack is not available")

        ctx = tvm.cpu(0)
        output = nnpack.convolution_inference(
            data,
            kernel,
            bias if with_bias else None,
            [PAD, PAD, PAD, PAD],
            [STRIDE, STRIDE],
            algorithm=algorithm,
        )
        s = te.create_schedule(output.op)

        f = tvm.build(s, [data, kernel, bias, output], target)

        na = np.random.uniform(size=dshape).astype(data.dtype)
        nb = np.random.uniform(size=kshape).astype(kernel.dtype)
        nc = np.zeros(bshape, dtype=bias.dtype)
        ta = tvm.nd.array(na, ctx)
        tb = tvm.nd.array(nb, ctx)
        tc = tvm.nd.array(nc, ctx)
        td = tvm.nd.array(np.zeros(oshape, dtype=output.dtype), ctx)
        f(ta, tb, tc, td)
        nd = np_conv(np.reshape(na, (BATCH, IC, IH, IW)), nb, PAD,
                     STRIDE) + nc.reshape(1, bshape[0], 1, 1)
        tvm.testing.assert_allclose(td.asnumpy(),
                                    nd.reshape(BATCH, IC, IH, IW),
                                    rtol=1e-5)

    for algorithm in [
            nnpack.ConvolutionAlgorithm.AUTO,
            nnpack.ConvolutionAlgorithm.FFT_8x8,
            nnpack.ConvolutionAlgorithm.FFT_16x16,
            nnpack.ConvolutionAlgorithm.WT_8x8,
            nnpack.ConvolutionAlgorithm.IMPLICIT_GEMM,
            nnpack.ConvolutionAlgorithm.WT_8x8_FP16,
    ]:
        for with_bias in [True, False]:
            verify(algorithm=algorithm, with_bias=with_bias)
Пример #15
0
def test_dependent_output_shape():
    n, m, x = te.size_var('n'), te.size_var('m'), te.size_var('x')
    A = te.placeholder((n, m))
    B = te.compute((m, n//x), lambda i, j: A[i,j] , name='B')
    s = te.create_schedule(B.op)
    mod = tvm.build(s, [A, B, x])
Пример #16
0
def verify_resize(
    batch,
    in_channel,
    in_height,
    in_width,
    out_height,
    out_width,
    layout="NCHW",
    coord_trans="align_corners",
    method="bilinear",
):
    if layout == "NCHW":
        A = te.placeholder((batch, in_channel, in_height, in_width),
                           name="A",
                           dtype="float32")
        dtype = A.dtype
        out_shape = (batch, in_channel, out_height, out_width)
        a_np = np.random.uniform(size=(batch, in_channel, in_height,
                                       in_width)).astype(dtype)
    elif layout == "NHWC":
        A = te.placeholder((batch, in_height, in_width, in_channel),
                           name="A",
                           dtype="float32")
        dtype = A.dtype
        out_shape = (batch, out_height, out_width, in_channel)
        a_np = np.random.uniform(size=(batch, in_height, in_width,
                                       in_channel)).astype(dtype)
    else:
        raise NotImplementedError("Layout not supported {} ".format(layout))
    B = topi.image.resize(
        A,
        (out_height, out_width),
        layout=layout,
        coordinate_transformation_mode=coord_trans,
        method=method,
    )
    if method == "bilinear":
        b_np = tvm.topi.testing.bilinear_resize_python(a_np,
                                                       (out_height, out_width),
                                                       layout, coord_trans)
    else:
        # TODO: Nearest neighbor case doesn't do anything with coordinate transform mode, and also
        # nearest_neighbors and align_corners combination in topi doesn't match the output of this
        # function.
        scale_h = out_height / in_height
        scale_w = out_width / in_width
        b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w),
                                                  layout)

    def check_target(target, dev):
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            s = tvm.topi.testing.get_injective_schedule(target)(B)
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(out_shape, dtype=dtype), dev)
        f = tvm.build(s, [A, B], target)
        f(a, b)

        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-3, atol=1e-3)

    for target, dev in tvm.testing.enabled_targets():
        check_target(target, dev)
Пример #17
0
import tvm
from tvm import te
n = 1024 

dtype = "float32"

A = te.placeholder((n, n), dtype=dtype, name='A')
k = te.reduce_axis((0, n), name='k')
B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name='B')

s = te.create_schedule(B.op)

print(tvm.lower(s, [A, B], simple_mode=True))
print("---------cutting line---------")

AA = s.cache_read(A, "shared", [B])

print(tvm.lower(s, [A, B], simple_mode=True))
Пример #18
0
def verify_broadcast_binary_ele(lhs_shape, rhs_shape,
                                ftopi, fnumpy,
                                lhs_min=-100, lhs_max=100,
                                rhs_min=-100, rhs_max=100,
                                dtype="float32"):
    # Build the logic and compile the function
    A = (te.var("A", dtype=dtype) if lhs_shape is None
         else te.placeholder(shape=lhs_shape, name="A", dtype=dtype))
    B = (te.var("B", dtype=dtype) if rhs_shape is None
         else te.placeholder(shape=rhs_shape, name="B", dtype=dtype))
    C = ftopi(A, B)
    if isinstance(A, tvm.tir.PrimExpr) and isinstance(B, tvm.tir.PrimExpr):
        assert(isinstance(C, tvm.tir.PrimExpr))
        return

    def gen_operand(shape, low, high, ctx):
        if shape is None:
            npy = float(np.random.uniform(low=low, high=high))
            if dtype.startswith('int'):
                npy = int(npy)
            nd = npy
        else:
            npy = np.random.uniform(low=low, high=high,
                                    size=shape).astype(dtype)
            nd = tvm.nd.array(npy, ctx)
        return npy, nd

    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.testing.get_broadcast_schedule(device)(C)
        foo = tvm.build(s, [A, B, C], device, name="broadcast_binary" + "_" + ftopi.__name__)

        lhs_npy, lhs_nd = gen_operand(lhs_shape, lhs_min, lhs_max, ctx)
        rhs_npy, rhs_nd = gen_operand(rhs_shape, rhs_min, rhs_max, ctx)
        out_npy = fnumpy(lhs_npy, rhs_npy)

        if fnumpy == np.floor_divide:
            # avoid check too close to X.5 and X.0
            # FIXME: floor_divide(94.90735, 0.6731018) behaves as floor(div(94.90735, 0.6731018))
            # However the result is somehow incorrect - need to further investigate.
            # And looks like numpy's floor_div(a,b) is implemented different from floor(div(a,b))
            mask = np.logical_or(np.abs(np.abs(np.fmod(lhs_npy / rhs_npy, 1)) - 0.5) < 1e-6,
                                 np.abs(np.fmod(lhs_npy / rhs_npy, 1)) < 1e-6)
            if mask.any():
                lhs_npy = lhs_npy + mask * 1e-3  * rhs_npy
                lhs_npy = lhs_npy.astype(dtype)
                lhs_nd = tvm.nd.array(lhs_npy, ctx) if lhs_shape is not None else lhs_npy.item()
                out_npy = fnumpy(lhs_npy, rhs_npy)

        out_nd = tvm.nd.array(np.empty(out_npy.shape).astype(C.dtype), ctx)
        foo(lhs_nd, rhs_nd, out_nd)
        tvm.testing.assert_allclose(out_nd.asnumpy(), out_npy, rtol=1E-4, atol=1E-4)

    for target in get_all_backend():
        check_device(target)
    check_device("sdaccel")
Пример #19
0
def make_matrix_softmax(shape, tgt, tgt_host, func_name, dtype="float32"):

    """TODO: Your code here"""
    """Hint: use tvm.reduce_axis, tvm.sum, tvm.max, tvm.exp"""
    """Hint: do not reuse the same reduction axis j."""
    """Hint: implement the following version for better stability
        e_x = np.exp(x - np.max(x))
        softmax(x)= e_x / e_x.sum()
    """
    A=te.placeholder(shape,dtype = dtype, name="A")
    '''
    #desined by myself
    k=te.reduce_axis((0,A.shape[1]),name="k")
    A_max=te.compute((A.shape[0],),lambda i:te.max(A[i,k],axis=k))
    A_ex=te.compute(shape,lambda i,j:te.exp(A[i,j]-A_max[i]))
    k1=te.reduce_axis((0,A.shape[1]),name="k1")
    A_ex_sum=te.compute((A.shape[0],),lambda i:te.sum(A_ex[i,k1],axis = k1))
    B=te.compute(shape,lambda i,j:A_ex[i,j]/A_ex_sum[i])

    s=te.create_schedule(B.op)

    if tgt=="cuda":
        s[B].bind(B.op.axis[1],te.thread_axis("threadIdx.x"))
        s[A_ex_sum].bind(k1,te.thread_axis("threadIdx.x"))
        s[A_ex].bind(A_ex.op.axis[1],te.thread_axis("threadIdx.x"))
        s[A_max].bind(k,te.thread_axis("threadIdx.x"))
        # print (tvm.lower(s,[A,B],simple_mode=True))
    '''

    #use topi
    B=topi.nn.softmax(A,axis=1)
    if tgt=="llvm":
        s = te.create_schedule(B.op)
    elif tgt=="cuda":
        # s=topi.cuda.schedule_softmax(B)
        s=te.create_schedule(B.op)
        softmax = B
        expsum = softmax.op.input_tensors[1]
        exp = softmax.op.input_tensors[0]
        max_elem = s[exp].op.input_tensors[1]

        num_thread = 64
        block_x = te.thread_axis("blockIdx.x")
        thread_x = te.thread_axis((0, num_thread), "threadIdx.x")

        s[exp].bind(exp.op.axis[0], block_x)
        s[max_elem].bind(max_elem.op.axis[0], block_x)

        k = expsum.op.reduce_axis[0]
        ko, ki = s[expsum].split(k, factor=num_thread)
        EF = s.rfactor(expsum, ki)
        s[expsum].bind(s[expsum].op.axis[0], block_x)
        s[expsum].bind(s[expsum].op.reduce_axis[0], thread_x)
        s[EF].compute_at(s[expsum], s[expsum].op.reduce_axis[0])
        s[expsum].set_store_predicate(thread_x.var.equal(0))

        tx, xi = s[softmax].split(softmax.op.axis[1], nparts=num_thread)
        s[softmax].bind(softmax.op.axis[0], block_x)
        s[softmax].bind(tx, thread_x)


        print(tvm.lower(s, [A, B], simple_mode=True))
    else:
        s=None

    f=tvm.build(s,[A,B],tgt,tgt_host,name=func_name)
    return f
def schedule_nhwc_tensorcore_cuda(cfg, s, Conv):
    """Schedule tensorcore template"""
    kh, kw, ic = s[Conv].op.reduce_axis
    out_dtype = Conv.dtype
    trans_paddata, kernel = s[Conv].op.input_tensors
    in_dtype = trans_paddata.dtype
    batch, _, _, _ = get_const_tuple(Conv.shape)
    _, _, _, out_channels = get_const_tuple(kernel.shape)
    paddata = s[trans_paddata].op.input_tensors

    # inline the pad and dtype transform
    s[trans_paddata].compute_inline()
    s[kernel].compute_inline()
    s[paddata[0]].compute_inline()

    # Designate the memory hierarchy
    AS = s.cache_read(trans_paddata, "shared", [Conv])
    WS = s.cache_read(kernel, "shared", [Conv])
    AF = s.cache_read(AS, "wmma.matrix_a", [Conv])
    WF = s.cache_read(WS, "wmma.matrix_b", [Conv])
    ConvF = s.cache_write(Conv, "wmma.accumulator")

    if Conv.op in s.outputs:
        output = Conv
        ConvS = s.cache_read(ConvF, "shared", [Conv])
        OL = ConvS
    else:
        output = s.outputs[0].output(0)
        s[Conv].set_scope("shared")
        OL = Conv

    # Schedule for autotvm
    cfg.define_knob("block_row_warps", [1, 2, 4])
    cfg.define_knob("block_col_warps", [1, 2, 4])
    cfg.define_knob("warp_row_tiles", [1, 2, 4])
    cfg.define_knob("warp_col_tiles", [1, 2, 4])
    cfg.define_knob("chunk", [1, 2, 4, 8])
    cfg.define_knob("offset", [0, 8])
    cfg.define_knob("vector_width", [1, 2, 4, 8])

    if batch % 16 == 0 and out_channels % 16 == 0:
        cfg.define_knob("wmma_m", [16, 8, 32])
    elif batch % 8 == 0 and out_channels % 32 == 0:
        cfg.define_knob("wmma_m", [8, 16, 32])
    elif batch % 32 == 0 and out_channels % 8 == 0:
        cfg.define_knob("wmma_m", [32, 16, 8])

    # fallback support
    target = tvm.target.Target.current()
    if cfg.is_fallback:
        ref_log = autotvm.tophub.load_reference_log(
            target.kind.name, target.model, "conv2d_nhwc_tensorcore.cuda")
        cfg.fallback_with_reference_log(ref_log)

    block_row_warps = cfg["block_row_warps"].val
    block_col_warps = cfg["block_col_warps"].val
    warp_row_tiles = cfg["warp_row_tiles"].val
    warp_col_tiles = cfg["warp_col_tiles"].val
    chunk = cfg["chunk"].val
    offset = cfg["offset"].val
    wmma_m = cfg["wmma_m"].val
    vector_width = cfg["vector_width"].val

    wmma_k = 16
    if wmma_m == 16:
        wmma_n = 16
    elif wmma_m == 8:
        wmma_n = 32
    elif wmma_m == 32:
        wmma_n = 8

    warp_size = 32

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    block_z = te.thread_axis("blockIdx.z")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")
    thread_z = te.thread_axis("threadIdx.z")

    # Define the intrin strides
    def get_strides(extents):
        return [np.prod(extents[i:]).tolist() for i in range(len(extents))]

    AS_align = chunk * wmma_k + offset
    WS_align = warp_col_tiles * block_col_warps * wmma_n + offset
    block_factor_n = wmma_m * warp_row_tiles * block_row_warps
    block_factor_o = wmma_n * warp_col_tiles * block_col_warps
    CS_align = block_factor_o + offset
    AS_strides = get_strides([1, 1, AS_align, 1])
    AL_strides = get_strides([1, 1, wmma_k, 1])
    WS_strides = get_strides([WS_align, 1])
    WL_strides = get_strides([wmma_n * warp_col_tiles, 1])
    CL_strides = get_strides([1, 1, wmma_n * warp_col_tiles, 1])
    CS_strides = get_strides([1, 1, CS_align, 1])

    # Schedule for output
    nc, hc, wc, oc = output.op.axis
    block_k = s[output].fuse(hc, wc)
    s[output].bind(block_k, block_z)
    block_i, nc = s[output].split(nc, factor=block_factor_n)
    block_j, oc = s[output].split(oc, factor=block_factor_o)
    s[output].reorder(block_k, block_i, block_j, nc, oc)
    t = s[output].fuse(nc, oc)
    t, ti = s[output].split(t, factor=vector_width)
    t, tx = s[output].split(t, factor=warp_size)
    t, ty = s[output].split(t, factor=block_row_warps)
    t, tz = s[output].split(t, factor=block_col_warps)
    s[output].bind(block_i, block_x)
    s[output].bind(block_j, block_y)
    s[output].bind(tz, thread_z)
    s[output].bind(ty, thread_y)
    s[output].bind(tx, thread_x)
    s[output].vectorize(ti)

    # Schedule wmma store
    s[OL].compute_at(s[output], block_j)
    nc, hc, wc, oc = OL.op.axis
    s[OL].reorder(hc, wc, nc, oc)
    s[OL].storage_align(wc, CS_align - 1, CS_align)
    oc, ooc = s[OL].split(oc, factor=wmma_n)
    oc, oci = s[OL].split(oc, factor=warp_col_tiles)
    _, oc = s[OL].split(oc, factor=block_col_warps)
    nc, nnc = s[OL].split(nc, factor=wmma_m)
    nc, nci = s[OL].split(nc, factor=warp_row_tiles)
    _, nc = s[OL].split(nc, factor=block_row_warps)
    s[OL].reorder(nc, oc, nci, oci, nnc, ooc)
    s[OL].bind(nc, thread_y)
    s[OL].bind(oc, thread_z)

    # Schedule wmma computation
    s[ConvF].compute_at(s[OL], oc)
    n, h, w, o = ConvF.op.axis
    n, nnf = s[ConvF].split(n, factor=wmma_m)
    o, oof = s[ConvF].split(o, factor=wmma_n)
    ic, ii = s[ConvF].split(ic, factor=wmma_k)
    ko, ki = s[ConvF].split(ic, factor=chunk)
    s[ConvF].reorder(kh, kw, ko, ki, n, o, nnf, oof, ii)

    s[AF].compute_at(s[ConvF], ki)
    s[WF].compute_at(s[ConvF], ki)

    # Schedule wmma load
    n, h, w, i = AF.op.axis
    n, nn = s[AF].split(n, factor=wmma_m)
    i, ii = s[AF].split(i, factor=wmma_k)
    s[AF].reorder(n, i, nn, ii)

    kh, kw, i, o = WF.op.axis
    i, ii = s[WF].split(i, factor=wmma_k)
    o, oo = s[WF].split(o, factor=wmma_n)
    s[WF].reorder(o, i, oo)
    s[WF].reorder(i, o, ii, oo)

    s[WS].compute_at(s[ConvF], ko)
    s[AS].compute_at(s[ConvF], ko)

    # Schedule for data's share memory
    n, h, w, i = AS.op.axis
    s[AS].reorder(h, w, n, i)
    s[AS].storage_align(w, AS_align - 1, AS_align)
    t = s[AS].fuse(n, i)
    t, ti = s[AS].split(t, factor=vector_width)
    t, tx = s[AS].split(t, factor=warp_size)
    t, ty = s[AS].split(t, factor=block_row_warps)
    _, tz = s[AS].split(t, factor=block_col_warps)
    s[AS].bind(ty, thread_y)
    s[AS].bind(tz, thread_z)
    s[AS].bind(tx, thread_x)
    s[AS].vectorize(ti)

    # Schedule for kernel's share memory
    kh, kw, ic, o = WS.op.axis
    t = s[WS].fuse(ic, o)
    s[WS].storage_align(ic, WS_align - 1, WS_align)
    t, ti = s[WS].split(t, factor=vector_width)
    t, tx = s[WS].split(t, factor=warp_size)
    t, ty = s[WS].split(t, factor=block_row_warps)
    _, tz = s[WS].split(t, factor=block_col_warps)
    s[WS].bind(ty, thread_y)
    s[WS].bind(tz, thread_z)
    s[WS].bind(tx, thread_x)
    s[WS].vectorize(ti)

    shape = (wmma_m, wmma_n, wmma_k)

    # tensorize the wmma process
    AS_shape = (wmma_m, 1, 1, wmma_k)
    AL_shape = (wmma_m, 1, 1, wmma_k)
    WS_shape = (wmma_k, wmma_n)
    WL_shape = (wmma_k, wmma_n)
    CL_shape = (wmma_m, 1, 1, wmma_n)
    CS_shape = (wmma_m, 1, 1, wmma_n)

    AL_gemm = te.placeholder(AL_shape, name="A", dtype=in_dtype)
    WL_gemm = te.placeholder(WL_shape, name="B", dtype=in_dtype)
    k_gemm = te.reduce_axis((0, wmma_k), name="k")
    CL_compute = te.compute(
        CL_shape,
        lambda ii, t0, t1, jj: te.sum(
            AL_gemm[ii, t0, t1, k_gemm].astype(out_dtype) * WL_gemm[k_gemm, jj]
            .astype(out_dtype),
            axis=k_gemm,
        ),
        name="C",
    )

    s[AF].tensorize(
        nn,
        intrin_wmma_load_matrix_A(AL_strides, AS_strides, shape, "row_major",
                                  AS_shape, AL_shape, in_dtype),
    )
    s[WF].tensorize(
        ii,
        intrin_wmma_load_matrix_W(WL_strides, WS_strides, shape, "row_major",
                                  WS_shape, WL_shape, in_dtype),
    )
    s[OL].tensorize(
        nnc,
        intrin_wmma_store_matrix(CS_strides, CL_strides, shape, out_dtype,
                                 CL_shape, CS_shape))
    s[ConvF].tensorize(
        nnf,
        intrin_wmma_gemm(AL_gemm, WL_gemm, CL_compute, AL_strides, WL_strides,
                         CL_strides, shape),
    )

    N, OH, OW, CO = get_const_tuple(output.shape)
    KH, KW, CI, _ = get_const_tuple(kernel.shape)
    cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW)
Пример #21
0
def verify_conv1d_integration():
    batch_size = 1
    num_channel = 1
    num_filter = 1

    # Note: TVM doesn't have a separate op for 1D convolution, so we use conv2d instead.
    # We set height=1 to indicate that convolution is really 1D.
    stride = (1, 1)
    dilate = (1, 1)
    padding = (0, 0)

    kernel_size = (1, 3)
    input_window_size = (1, 10)
    inc_input_size = (1, 2)
    context_size = (1, 4)
    inc_output_size = (1, 2)
    output_window_size = (1, 8)

    num_iteration = 20
    buffer_axis = 3

    kernel_shape = (num_filter, num_channel, kernel_size[0], kernel_size[1])
    input_window_shape = (batch_size, num_channel, input_window_size[0],
                          input_window_size[1])
    inc_input_shape = (batch_size, num_channel, inc_input_size[0],
                       inc_input_size[1])
    inc_output_shape = (batch_size, num_filter, inc_output_size[0],
                        inc_output_size[1])
    context_shape = (batch_size, num_channel, context_size[0], context_size[1])
    output_window_shape = (batch_size, num_filter, output_window_size[0],
                           output_window_size[1])
    # Rule: Convolution of Tensor[context_shape] and Tensor[kernel_shape]
    #       produces Tensor[inc_input_shape]

    dtype = "float32"

    inc_input = te.placeholder(inc_input_shape, name="inc_input", dtype=dtype)
    input_window = te.placeholder(input_window_shape,
                                  name="input_window",
                                  dtype=dtype)
    context = te.placeholder(context_shape, name="context", dtype=dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=dtype)
    inc_output = te.placeholder(inc_input_shape,
                                name="inc_output",
                                dtype=dtype)
    output_window = te.placeholder(output_window_shape,
                                   name="output_window",
                                   dtype=dtype)

    # Use memoize, pickle the test data for next time use
    @memoize("topi.tests.test_fifo_buffer_conv1d_integration")
    def get_data():
        # Generate [num_iteration] slices of input
        inc_input_np = np.random.uniform(
            size=tuple([num_iteration] + list(inc_input_shape))).astype(dtype)
        input_window_np = np.zeros(input_window_shape, dtype=dtype)
        kernel_np = np.random.uniform(size=kernel_shape).astype(dtype)
        context_np = np.zeros(context_shape, dtype=dtype)
        output_window_np = np.zeros(output_window_shape, dtype=dtype)

        return (inc_input_np, input_window_np, kernel_np, context_np,
                output_window_np)

    # Get the test data
    inc_input_np, input_window_np, kernel_np, context_np, output_window_np = get_data(
    )

    def check_device(device, ctx):
        print("  Running on target: {}".format(device))

        conv2d_nchw, schedule_conv2d_nchw = tvm.topi.testing.get_conv2d_nchw_implement(
            device)

        with tvm.target.Target(device):
            out = topi.nn.fifo_buffer(inc_input, context, axis=buffer_axis)
            s = tvm.topi.testing.get_injective_schedule(device)([out])
            update_context = tvm.build(s, [inc_input, context, out],
                                       device,
                                       name="update_context")

            out = conv2d_nchw(context, kernel, stride, padding, dilate, dtype)
            s = schedule_conv2d_nchw([out])
            conv2d_inc = tvm.build(s, [context, kernel, out],
                                   device,
                                   name="conv2d_inc")

            out = topi.nn.fifo_buffer(inc_output,
                                      output_window,
                                      axis=buffer_axis)
            s = tvm.topi.testing.get_injective_schedule(device)([out])
            update_output_window = tvm.build(s,
                                             [inc_output, output_window, out],
                                             device,
                                             name="update_output_window")

            out = topi.nn.fifo_buffer(inc_input,
                                      input_window,
                                      axis=buffer_axis)
            s = tvm.topi.testing.get_injective_schedule(device)([out])
            update_input_window = tvm.build(s, [inc_input, input_window, out],
                                            device,
                                            name="update_input_window")

            out = conv2d_nchw(input_window, kernel, stride, padding, dilate,
                              dtype)
            s = schedule_conv2d_nchw([out])
            conv2d = tvm.build(s, [input_window, kernel, out],
                               device,
                               name="conv2d")

        input_window_tvm = tvm.nd.array(input_window_np, ctx=ctx)
        new_input_window_tvm = tvm.nd.empty(shape=input_window_shape,
                                            ctx=ctx,
                                            dtype=dtype)
        kernel_tvm = tvm.nd.array(kernel_np, ctx=ctx)
        context_tvm = tvm.nd.array(context_np, ctx=ctx)
        new_context_tvm = tvm.nd.empty(shape=context_shape,
                                       ctx=ctx,
                                       dtype=dtype)
        inc_output_tvm = tvm.nd.empty(shape=inc_output_shape,
                                      ctx=ctx,
                                      dtype=dtype)
        output_window_tvm = tvm.nd.array(output_window_np, ctx=ctx)
        new_output_window_tvm = tvm.nd.empty(shape=output_window_shape,
                                             ctx=ctx,
                                             dtype=dtype)
        output_window_ref_tvm = tvm.nd.empty(shape=output_window_shape,
                                             ctx=ctx,
                                             dtype=dtype)

        for i in range(num_iteration):
            # Take i-th slice of inc_input_np
            inc_input_tvm = tvm.nd.array(inc_input_np[i], ctx=ctx)

            # Compute new output window incrementally, using the FIFO buffer op
            update_context(inc_input_tvm, context_tvm, new_context_tvm)
            conv2d_inc(new_context_tvm, kernel_tvm, inc_output_tvm)
            update_output_window(inc_output_tvm, output_window_tvm,
                                 new_output_window_tvm)
            context_tvm = new_context_tvm
            output_window_tvm = new_output_window_tvm

            # Compute full input window, so that we have a baseline
            update_input_window(inc_input_tvm, input_window_tvm,
                                new_input_window_tvm)
            input_window_tvm = new_input_window_tvm
            conv2d(input_window_tvm, kernel_tvm, output_window_ref_tvm)
            # Incrementally updating the output window should be equivalent to computing it from
            # scratch using the input window
            tvm.testing.assert_allclose(output_window_tvm.asnumpy(),
                                        output_window_ref_tvm.asnumpy())

    for device, ctx in tvm.testing.enabled_targets():
        check_device(device, ctx)
Пример #22
0
def test_tile_nd():
    input = te.placeholder((12, 12), dtype="uint8", name="input")
    out = topi.nn.relu(input)
    sch = te.create_schedule([out.op])
    outer_iters, inner_iters = tile_nd(sch, out, (3, 4))
    assert tuple(sch[out].leaf_iter_vars) == (*outer_iters, *inner_iters)
    def check(start, end, dstart, dend, dtype, floor_div=False):
        div = tvm.te.floordiv if floor_div else tvm.tir.truncdiv
        mod = tvm.te.floormod if floor_div else tvm.tir.truncmod

        # A are dividends, B are divisors. Note that we add 1 to make include end in the range.
        A = te.placeholder((end - start + 1,), name="A", dtype=dtype)
        B = te.placeholder((dend - dstart + 1,), name="B", dtype=dtype)
        # We clip values with min and max so that simplifiers know the ranges of values
        clipa = lambda x: tvm.te.min(tvm.tir.const(end, dtype), tvm.te.max(tvm.tir.const(start, dtype), x))
        clipb = lambda x: tvm.te.min(tvm.tir.const(dend, dtype), tvm.te.max(tvm.tir.const(dstart, dtype), x))
        # If the range is just a single point, use the constant itself
        if start == end:
            clipa = lambda x: tvm.tir.const(start, dtype)
        if dstart == dend:
            clipb = lambda x: tvm.tir.const(dstart, dtype)
        # D are division results and M are modulo results
        [D, M] = te.compute((end - start + 1, dend - dstart + 1),
                             lambda i, j: (div(clipa(A[i]), clipb(B[j])),
                                          mod(clipa(A[i]), clipb(B[j]))))

        s = te.create_schedule([D.op, M.op])
        f = tvm.build(s, [A, B, D, M], "llvm")

        # Fill input arrays with values
        A_arr = tvm.nd.empty((end - start + 1,), dtype)
        B_arr = tvm.nd.empty((dend - dstart + 1,), dtype)
        A_arr.copyfrom(np.arange(start, end + 1, dtype=dtype))
        B_np = np.arange(dstart, dend + 1, dtype=dtype)
        # If the range of the divisor contains 0, replace it with 1 to avoid division by zero
        if dend >= 0 and dstart <= 0:
            B_np[-dstart] = 1
        B_arr.copyfrom(B_np)
        D_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype)
        M_arr = tvm.nd.empty((end - start + 1, dend - dstart + 1), dtype)

        # Run the function and convert the results to numpy
        f(A_arr, B_arr, D_arr, M_arr)
        D_arr = D_arr.asnumpy()
        M_arr = M_arr.asnumpy()

        # This helper just prints additional info on failure
        def _show_info():
            print("dtype: {}".format(dtype))
            print("dividend range: [{}, {}]".format(start, end))
            print("divisor range: [{}, {}]".format(dstart, dend))
            lowered = tvm.lower(s, [A, B, D, M], simple_mode=True)
            print("Lowered code:")
            print(lowered)

        # Check that the computed values are correct
        for i in range(start, end + 1):
            for j in range(dstart, dend + 1):
                if j == 0:
                    continue

                if floor_div:
                    dref = i // j
                    mref = i % j
                else:
                    dref = int(float(i) / j)
                    mref = int(math.fmod(i, j))

                if D_arr[i - start, j - dstart] != dref:
                    _show_info()
                    raise AssertionError("Incorrect division result: {}({}, {}) is {} "
                                         "but should be {}".format(div.__name__, i, j,
                                                                   D_arr[i - start, j - dstart],
                                                                   dref))
                if M_arr[i - start, j - dstart] != mref:
                    _show_info()
                    raise AssertionError("Incorrect modulo result: {}({}, {}) is {} "
                                         "but should be {}".format(mod.__name__, i, j,
                                                                   M_arr[i - start, j - dstart],
                                                                   mref))
Пример #24
0
def test_schedule_pragmas_for_const():
    input = te.placeholder((12, 12), dtype="uint8", name="input")
    const = te.compute((), lambda: 2)
    add = topi.add(input, const)
    sch = te.create_schedule([add.op])
    schedule_pragmas(sch)
Пример #25
0
def verify_conv2d_NHWC_gemm_int8(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size

    A = te.placeholder((batch, in_height, in_width, in_channel),
                       name="A",
                       dtype="int8")
    W = te.placeholder((kernel, kernel, in_channel, num_filter),
                       name="W",
                       dtype="int8")
    bias = te.placeholder((num_filter, ), name="bias", dtype="int8")

    a_shape = get_const_tuple(A.shape)
    w_shape = get_const_tuple(W.shape)
    bias_shape = get_const_tuple(bias.shape)
    dtype = A.dtype

    @memoize("topi.tests.test_topi_conv2d_int8.verify_conv2d_nchw")
    def get_ref_data():
        a_np = np.random.randint(low=-128, high=127,
                                 size=a_shape).astype(dtype)
        w_np = np.random.randint(low=-128, high=128,
                                 size=w_shape).astype(dtype)
        b_np = np.random.uniform(size=bias_shape).astype(dtype)
        dw_np = tvm.topi.testing.dilate_python(w_np,
                                               (dilation, dilation, 1, 1))
        c_np = tvm.topi.testing.conv2d_nhwc_python(a_np, dw_np, stride,
                                                   padding).astype(dtype)

        if add_bias:
            b_np = np.random.uniform(size=bias_shape).astype(dtype)
            c_np += b_np
        if add_relu:
            c_np = np.maximum(c_np, 0)

        return a_np, w_np, b_np, c_np

    a_np, w_np, b_np, c_np = get_ref_data()

    def check_target(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved(
                A, W, (stride, stride), padding, (dilation, dilation), dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C])

        a = tvm.nd.array(a_np, dev)
        w = tvm.nd.array(w_np, dev)
        b = tvm.nd.array(b_np, dev)
        c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype),
                         dev)
        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, b, c)
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func(a, w, c)
        tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)

    check_target("llvm")
Пример #26
0
def verify_non_max_suppression(
    np_data,
    np_valid_count,
    np_indices,
    np_result,
    np_indices_result,
    max_output_size,
    iou_threshold,
    force_suppress,
    top_k,
    coord_start,
    score_index,
    id_index,
):
    dshape = np_data.shape
    batch, num_anchors, _ = dshape
    indices_dshape = (batch, num_anchors)
    data = te.placeholder(dshape, name="data")
    valid_count = te.placeholder((batch, ), dtype="int32", name="valid_count")
    indices = te.placeholder((batch, num_anchors),
                             dtype="int32",
                             name="indices")

    def check_device(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            fcompute, fschedule = tvm.topi.testing.dispatch(
                target, _nms_implement)
            out = fcompute(
                data,
                valid_count,
                indices,
                max_output_size,
                iou_threshold,
                force_suppress,
                top_k,
                coord_start=coord_start,
                score_index=score_index,
                id_index=id_index,
                return_indices=False,
            )
            indices_out = fcompute(
                data,
                valid_count,
                indices,
                max_output_size,
                iou_threshold,
                force_suppress,
                top_k,
                coord_start=coord_start,
                score_index=score_index,
                id_index=id_index,
                return_indices=True,
            )
            s = fschedule(out)
            indices_s = fschedule(indices_out)

        tvm_data = tvm.nd.array(np_data, dev)
        tvm_valid_count = tvm.nd.array(np_valid_count, dev)
        tvm_indices = tvm.nd.array(np_indices, dev)

        tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), dev)
        f = tvm.build(s, [data, valid_count, indices, out], target)
        f(tvm_data, tvm_valid_count, tvm_indices, tvm_out)
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-4)

        tvm_indices_out = tvm.nd.array(np.zeros(indices_dshape, dtype="int32"),
                                       dev)
        f = tvm.build(indices_s, [data, valid_count, indices, indices_out[0]],
                      target)
        f(tvm_data, tvm_valid_count, tvm_indices, tvm_indices_out)
        tvm.testing.assert_allclose(tvm_indices_out.asnumpy(),
                                    np_indices_result,
                                    rtol=1e-4)

    for target in ["llvm", "cuda", "opencl", "nvptx"]:
        check_device(target)
Пример #27
0
def compile_conv2d_NHWC_gemm_int8_arm(
    batch,
    in_channel,
    in_size,
    num_filter,
    kernel,
    stride,
    padding,
    dilation=1,
    add_bias=False,
    add_relu=False,
):
    pad_top, pad_left, pad_bottom, pad_right = get_pad_tuple(
        padding, (kernel, kernel))
    padding_sum = pad_top + pad_left + pad_bottom + pad_right
    print("Workload: (%d, %d, %d, %d, %d, %d, %d, %d)" %
          (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum,
           dilation))

    in_height = in_width = in_size
    A = te.placeholder((batch, in_height, in_width, in_channel),
                       name="A",
                       dtype="int8")
    W = te.placeholder((kernel, kernel, in_channel, num_filter),
                       name="W",
                       dtype="int8")
    bias = te.placeholder((num_filter, ), name="bias", dtype="int8")
    dtype = "int32"
    devices = [
        (
            "llvm --device arm_cpu --mtriple aarch64-linux-gnu",
            topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
            topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
        ),
        (
            "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
            topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
            topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
        ),
        (
            "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod",
            topi.arm_cpu.compute_conv2d_NHWC_quantized_native,
            topi.arm_cpu.schedule_conv2d_NHWC_quantized_native,
        ),
        # TODO(giuseros) Need LLVM-11 in order to compile with +i8mm extension
        # (
        #   "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+i8mm",
        #   topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved,
        #   topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved,
        # ),
    ]

    for device_tuple in devices:
        target = device_tuple[0]
        compute = device_tuple[1]
        schedule = device_tuple[2]

        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Compiling on arm AArch64 target: %s" % target)
        with tvm.target.Target(target):
            assert is_aarch64_arm(), "AArch64 target not recognized"

            C = compute(A, W, (stride, stride), padding, (dilation, dilation),
                        dtype)
            if add_bias:
                C = topi.add(C, bias)
            if add_relu:
                C = topi.nn.relu(C)
            s = schedule([C])

        if add_bias:
            tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
            func = tvm.build(
                s,
                [A, W, bias, C],
                target,
                name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
        else:
            func = tvm.build(
                s,
                [A, W, C],
                target,
                name="relu_%d_%d_%d_%d_%d_%d_%d_%d" %
                (batch, in_channel, in_size, num_filter, kernel, stride,
                 padding_sum, dilation),
            )
Пример #28
0
def verify_multibox_prior(dshape,
                          sizes=(1, ),
                          ratios=(1, ),
                          steps=(-1, -1),
                          offsets=(0.5, 0.5),
                          clip=False):
    data = te.placeholder(dshape, name="data")

    dtype = data.dtype
    input_data = np.random.uniform(size=dshape).astype(dtype)

    in_height = data.shape[2].value
    in_width = data.shape[3].value
    num_sizes = len(sizes)
    num_ratios = len(ratios)
    size_ratio_concat = sizes + ratios
    steps_h = steps[0] if steps[0] > 0 else 1.0 / in_height
    steps_w = steps[1] if steps[1] > 0 else 1.0 / in_width
    offset_h = offsets[0]
    offset_w = offsets[1]

    oshape = (1, in_height * in_width * (num_sizes + num_ratios - 1), 4)
    np_out = np.zeros(oshape).astype(dtype)

    for i in range(in_height):
        center_h = (i + offset_h) * steps_h
        for j in range(in_width):
            center_w = (j + offset_w) * steps_w
            for k in range(num_sizes + num_ratios - 1):
                w = (size_ratio_concat[k] * in_height / in_width /
                     2.0 if k < num_sizes else size_ratio_concat[0] *
                     in_height / in_width *
                     math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                h = (size_ratio_concat[k] /
                     2.0 if k < num_sizes else size_ratio_concat[0] /
                     math.sqrt(size_ratio_concat[k + 1]) / 2.0)
                count = (i * in_width * (num_sizes + num_ratios - 1) + j *
                         (num_sizes + num_ratios - 1) + k)
                np_out[0][count][0] = center_w - w
                np_out[0][count][1] = center_h - h
                np_out[0][count][2] = center_w + w
                np_out[0][count][3] = center_h + h
    if clip:
        np_out = np.clip(np_out, 0, 1)

    def check_device(target):
        dev = tvm.device(target, 0)
        if not tvm.testing.device_enabled(target):
            print("Skip because %s is not enabled" % target)
            return
        print("Running on target: %s" % target)

        fcompute, fschedule = tvm.topi.testing.dispatch(
            target, _multibox_prior_implement)
        with tvm.target.Target(target):
            out = fcompute(data, sizes, ratios, steps, offsets, clip)
            s = fschedule(out)

        tvm_input_data = tvm.nd.array(input_data, dev)
        tvm_out = tvm.nd.array(np.zeros(oshape, dtype=dtype), dev)
        f = tvm.build(s, [data, out], target)
        f(tvm_input_data, tvm_out)
        tvm.testing.assert_allclose(tvm_out.asnumpy(), np_out, rtol=1e-3)

    for target in ["llvm", "opencl", "cuda"]:
        check_device(target)
Пример #29
0
import tvm
from tvm import te
from tensorizer.intrinsics import INTRINSICS
import numpy as np

n, m, k = 64, 192, 1024

a = te.placeholder((n, k), 'float16')
b = te.placeholder((m // 32, k // 32, 32, 32), 'float16')

block_k = 2

rv = te.reduce_axis((0, k), )


def compute(x, y):
    lhs = a[x, rv].astype('float32')
    rhs = b[y // 32, rv // 32, rv % 32, y % 32].astype('float32')
    return te.sum(lhs * rhs, axis=[rv])


c = te.compute((n, m), compute)

blkY = tvm.te.thread_axis('blockIdx.y')
blkX = tvm.te.thread_axis('blockIdx.x')
thrZ = tvm.te.thread_axis('threadIdx.z')
thrY = tvm.te.thread_axis('threadIdx.y')
thrX = tvm.te.thread_axis('threadIdx.x')

sch = te.create_schedule(c.op)
def test_llvm_add_pipeline():
    nn = 1024
    n = tvm.runtime.convert(nn)
    A = te.placeholder((n, ), name="A")
    B = te.placeholder((n, ), name="B")
    C = te.compute(A.shape, lambda *i: A(*i) + B(*i), name="C")
    s = te.create_schedule(C.op)
    xo, xi = s[C].split(C.op.axis[0], factor=4)
    s[C].parallel(xo)
    s[C].vectorize(xi)

    def verify_elf(path, e_machine):
        with open(path, "rb") as fi:
            arr = fi.read(20)
            assert struct.unpack("ccc", arr[1:4]) == (b"E", b"L", b"F")
            endian = struct.unpack("b", arr[0x5:0x6])[0]
            endian = "<" if endian == 1 else ">"
            assert struct.unpack(endian + "h", arr[0x12:0x14])[0] == e_machine

    def build_i386():
        temp = util.tempdir()
        target = "llvm -mtriple=i386-pc-linux-gnu"
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x03)

    def build_arm():
        target = "llvm -mtriple=armv7-none-linux-gnueabihf"
        if not tvm.runtime.enabled(target):
            print("Skip because %s is not enabled.." % target)
            return
        temp = util.tempdir()
        f = tvm.build(s, [A, B, C], target)
        path = temp.relpath("myadd.o")
        f.save(path)
        verify_elf(path, 0x28)
        asm_path = temp.relpath("myadd.asm")
        f.save(asm_path)
        # Do a RPC verification, launch kernel on Arm Board if available.
        host = os.environ.get("TVM_RPC_ARM_HOST", None)
        remote = None
        if host:
            port = int(os.environ["TVM_RPC_ARM_PORT"])
            try:
                remote = rpc.connect(host, port)
            except tvm.error.TVMError as e:
                pass

        if remote:
            remote.upload(path)
            farm = remote.load_module("myadd.o")
            ctx = remote.cpu(0)
            n = nn
            a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            b = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
            c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx)
            farm(a, b, c)
            tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy())
            print("Verification finish on remote..")

    build_i386()
    build_arm()