Example #1
File: intrin_math.py  Project: gwli/tvm
def my_cuda_mylog_rule(op):
    """CUDA lowering rule for log"""
    if op.dtype == "float32":
        return tvm.call_pure_extern("float32", "logf", op.args[0])
    elif op.dtype == "float64":
        return tvm.call_pure_extern("float64", "log", op.args[0])
    else:
        return op
Example #2
def my_cuda_log_rule(op):
    if op.dtype == "float32":
        # logf: the name of the extern function; it should be a CUDA intrinsic (device function)
        return tvm.call_pure_extern("float32", "logf", op.args[0])
    elif op.dtype == "float64":
        return tvm.call_pure_extern("float64", "log", op.args[0])
    else:
        return op
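Rules like the one above only take effect once they are registered for a target. Below is a minimal sketch of the registration step, assuming the pre-0.7 tvm API used throughout these examples; registering the rule for the CUDA "log" intrinsic is illustrative, not the exact upstream code.

import tvm

# Hypothetical hook-up: override how the unified tvm.log intrinsic is lowered
# when compiling for the "cuda" target.
tvm.register_intrin_rule("cuda", "log", my_cuda_log_rule, override=True)

# After registration, an ordinary tvm.log(...) in a compute definition is
# lowered to logf/log according to the rule above when built for CUDA.
n = tvm.var("n")
A = tvm.placeholder((n,), name="A")
B = tvm.compute(A.shape, lambda i: tvm.log(A[i]), name="B")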
Example #3
def my_cuda_mylog_rule(op):
    """CUDA lowering rule for log"""
    if op.dtype == "float32":
        return tvm.call_pure_extern("float32", "logf", op.args[0])
    elif op.dtype == "float64":
        return tvm.call_pure_extern("float64", "log", op.args[0])
    else:
        return op
Example #4
def cuda_atomic_add_rule(op):
    if op.dtype == "float32":
        return tvm.call_pure_extern("float32", "atomicAdd", op.args[0],
                                    op.args[1])
    if op.dtype == "float64":
        return tvm.call_pure_extern("float64", "atomicAdd", op.args[0],
                                    op.args[1])
    if op.dtype == "int32":
        return tvm.call_pure_extern("int32", "atomicAdd", op.args[0],
                                    op.args[1])
    raise RuntimeError("only support int32, float32 and float64")
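A sketch of how such a rule is typically wired up, assuming the old tvm.register_intrin_rule / tvm.call_pure_intrin API; the intrinsic name "atomic_add" and the helper below are assumptions, not the exact upstream code.

# Hypothetical registration of the lowering rule for an "atomic_add" intrinsic
# on the CUDA target.
tvm.register_intrin_rule("cuda", "atomic_add", cuda_atomic_add_rule, override=True)

def atomic_add(x, y):
    # Emit the unified intrinsic; the rule above lowers it to CUDA's atomicAdd.
    return tvm.call_pure_intrin(y.dtype, "atomic_add", x, y)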
Example #5
def my_cuda_math_rule(op):
    """Customized CUDA intrinsic lowering rule"""
    assert isinstance(op, tvm.tir.Call)
    if op.dtype == "float32":
        # call float function
        return tvm.call_pure_extern("float32", "%sf" % op.name, op.args[0])
    elif op.dtype == "float64":
        # call double function
        return tvm.call_pure_extern("float64", op.name, op.args[0])
    else:
        # cannot do translation, return self.
        return op
Example #6
File: intrin_math.py  Project: gwli/tvm
def my_cuda_math_rule(op):
    """Customized CUDA intrinsic lowering rule"""
    assert isinstance(op, tvm.expr.Call)
    if op.dtype == "float32":
        # call float function
        return tvm.call_pure_extern("float32", "%sf" % op.name, op.args[0])
    elif op.dtype == "float64":
        # call double function
        return tvm.call_pure_extern("float64", op.name, op.args[0])
    else:
        # cannot do translation, return self.
        return op
Example #7
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.placeholder((n,), name='B', dtype="%sx%d" % (dtype, lanes))
     C = tvm.placeholder((n,), name='C', dtype="int32")
     D = tvm.compute((n,),
                     lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i], B[i], C[i]), name='D')
     s = tvm.create_schedule(D.op)
     xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
     s[D].bind(xo, tvm.thread_axis("blockIdx.x"))
     s[D].bind(xi, tvm.thread_axis("threadIdx.x"))
     fun = tvm.build(s, [A, B, C, D], "cuda")
     np_a = np.random.randint(low=-128, high=127, size=(n,lanes))
     np_b = np.random.randint(low=-128, high=127, size=(n,lanes))
     np_c = np.random.randint(low=0, high=127, size=(n,))
     np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
     ctx = tvm.gpu(0)
     a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(np_a)
     b = tvm.nd.empty((n,), B.dtype, ctx).copyfrom(np_b)
     c = tvm.nd.empty((n,), C.dtype, ctx).copyfrom(np_c)
     d = tvm.nd.empty((n,), D.dtype, ctx)
     fun(a, b, c, d)
     tvm.testing.assert_allclose(d.asnumpy(), np_d)
Example #8
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = tvm.placeholder((n, ), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.placeholder((n, ), name='B', dtype="%sx%d" % (dtype, lanes))
     C = tvm.placeholder((n, ), name='C', dtype="int32")
     D = tvm.compute((n, ),
                     lambda i: tvm.call_pure_extern("int32", "__dp4a", A[i],
                                                    B[i], C[i]),
                     name='D')
     s = tvm.create_schedule(D.op)
     xo, xi = s[D].split(D.op.axis[0], factor=num_thread)
     # bx and tx are thread axes defined at module scope in the original test,
     # presumably bx = tvm.thread_axis("blockIdx.x") and tx = tvm.thread_axis("threadIdx.x").
     s[D].bind(xo, bx)
     s[D].bind(xi, tx)
     fun = tvm.build(s, [A, B, C, D], "cuda")
     np_a = np.random.randint(low=-128, high=127, size=(n, lanes))
     np_b = np.random.randint(low=-128, high=127, size=(n, lanes))
     np_c = np.random.randint(low=0, high=127, size=(n, ))
     np_d = [sum(x * y) + z for x, y, z in zip(np_a, np_b, np_c)]
     ctx = tvm.gpu(0)
     a = tvm.nd.empty((n, ), A.dtype, ctx).copyfrom(np_a)
     b = tvm.nd.empty((n, ), B.dtype, ctx).copyfrom(np_b)
     c = tvm.nd.empty((n, ), C.dtype, ctx).copyfrom(np_c)
     d = tvm.nd.empty((n, ), D.dtype, ctx)
     fun(a, b, c, d)
     tvm.testing.assert_allclose(d.asnumpy(), np_d)
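For reference, the enclosing test presumably drives this helper with a lane width of 4, since __dp4a consumes int8x4 operands; the sizes below are illustrative only.

# Illustrative invocation: 64 elements of int8x4.
check_cuda("int8", 64, 4)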
Example #9
File: gemm_int8.py  Project: zhiics/tvm
    def intrin_func(ins, outs):
        xx, yy = ins
        zz = outs[0]
        ib = tvm.ir_builder.create()

        dp4a = zz.vstore(0, tvm.call_pure_extern('int32', '__dp4a',
                                                 xx.vload(0, dtype='int8x4'),
                                                 yy.vload(0, dtype='int8x4'),
                                                 zz.vload(0)))
        ib.emit(dp4a)

        body = ib.get()
        return body, zz.vstore(0, 0), body
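The tuple returned here is (body, reset, update) for a tensorized reduction: the full dp4a statement, a statement that zeroes the accumulator, and the update step (which reuses the body). Below is a rough sketch of how such an intrin_func is declared as a tensor intrinsic, assuming the pre-0.7 tvm.decl_tensor_intrin API; the shapes, names, and the s[C]/ki schedule objects are illustrative, and the real gemm_int8.py additionally passes explicit buffer binds.

# Pattern that the intrinsic replaces: a 4-element int8 dot product
# accumulated into int32.
k = tvm.reduce_axis((0, 4), name='k')
x = tvm.placeholder((4,), name='x', dtype='int8')
y = tvm.placeholder((4,), name='y', dtype='int8')
z = tvm.compute((1,),
                lambda i: tvm.sum(x[k].astype('int32') * y[k].astype('int32'), axis=k),
                name='z')

# Declare the intrinsic: computations matching z's pattern can be replaced by
# the __dp4a body produced by intrin_func above.
dp4a = tvm.decl_tensor_intrin(z.op, intrin_func)

# In a schedule, the matching inner reduction would then be tensorized:
#     s[C].tensorize(ki, dp4a)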
Example #10
        def _instr(index):
            xx, yy = ins
            zz = outs[0]

            if index == 1:
                return zz.vstore(0, 0)

            ib = tvm.ir_builder.create()

            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)

            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y, prev_z)
            ib.emit(zz.vstore(0, new_z))

            return ib.get()
Example #11
        def _instr(index):
            xx, yy = ins
            zz = outs[0]

            if index == 1:
                return zz.vstore(0, 0)

            ib = tvm.ir_builder.create()

            vec_x = xx.vload(0, dtype='int8x4')
            vec_y = yy.vload(0, dtype='int8x4')
            prev_z = 0 if index == 0 else zz.vload(0)

            new_z = tvm.call_pure_extern('int32', '__dp4a', vec_x, vec_y,
                                         prev_z)
            ib.emit(zz.vstore(0, new_z))

            return ib.get()
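In the file these two snippets come from, _instr is nested inside an intrin_func(ins, outs) wrapper and called with indices 0, 1 and 2 to produce the body, reset and update statements of the tensor intrinsic. A minimal sketch of that wrapper, with the _instr body elided:

    def intrin_func(ins, outs):
        def _instr(index):
            # body exactly as in the snippet above (reads xx, yy from ins and zz from outs)
            ...

        # index 0 -> full body, index 1 -> reset the accumulator,
        # index 2 -> update step
        return _instr(0), _instr(1), _instr(2)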
Example #12
def lesson1():
  """
  The most straightforward way to call a target-specific function is via
  the extern function call construct in tvm.
  In the following example, we use :any:`tvm.call_pure_extern` to call
  the :code:`__expf` function, which is only available under CUDA.
  """

  n = tvm.var("n")
  A = tvm.placeholder((n,), name='A')
  B = tvm.compute(A.shape,
                  lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                  name="B")
  s = tvm.create_schedule(B.op)
  num_thread = 64
  bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
  s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
  s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
  f = tvm.build(s, [A, B], "cuda", name="myexp")
  print(f.imported_modules[0].get_source())
Example #13
def test_llvm_import():
    # extern "C" is necessary to get the correct signature
    cc_code = """
    extern "C" float my_add(float x, float y) {
      return x + y;
    }
    """
    n = 10
    A = tvm.placeholder((n, ), name='A')
    B = tvm.compute(
        (n, ),
        lambda *i: tvm.call_pure_extern("float32", "my_add", A(*i), 1.0),
        name='B')

    def check_llvm(use_file):
        if not tvm.module.enabled("llvm"):
            return
        if not clang.find_clang(required=False):
            print("skip because clang is not available")
            return
        temp = util.tempdir()
        ll_path = temp.relpath("temp.ll")
        ll_code = clang.create_llvm(cc_code, output=ll_path)
        s = tvm.create_schedule(B.op)
        if use_file:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path)
        else:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), a.asnumpy() + 1.0)

    check_llvm(use_file=True)
    check_llvm(use_file=False)
Example #14
def test_llvm_import():
    # extern "C" is necessary to get the correct signature
    cc_code = """
    extern "C" float my_add(float x, float y) {
      return x + y;
    }
    """
    n = 10
    A = tvm.placeholder((n,), name='A')
    B = tvm.compute((n,), lambda *i:
                    tvm.call_pure_extern("float32", "my_add", A(*i), 1.0),
                    name='B')
    def check_llvm(use_file):
        if not tvm.module.enabled("llvm"):
            return
        if not clang.find_clang(required=False):
            print("skip because clang is not available")
            return
        temp = util.tempdir()
        ll_path = temp.relpath("temp.ll")
        ll_code = clang.create_llvm(cc_code, output=ll_path)
        s = tvm.create_schedule(B.op)
        if use_file:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_path)
        else:
            s[B].pragma(s[B].op.axis[0], "import_llvm", ll_code)
        # BUILD and invoke the kernel.
        f = tvm.build(s, [A, B], "llvm")
        ctx = tvm.cpu(0)
        # launch the kernel.
        a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx)
        b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx)
        f(a, b)
        tvm.testing.assert_allclose(
            b.asnumpy(), a.asnumpy() + 1.0)
    check_llvm(use_file=True)
    check_llvm(use_file=False)
Example #15
File: TVM.py  Project: wayne9qiu/keops
# Declare Variable
A0 = tvm.placeholder((n, ), name='A0', dtype='float32')
A1 = tvm.placeholder((n, ), name='A1', dtype='float32')
A2 = tvm.placeholder((n, ), name='A2', dtype='float32')

B0 = tvm.placeholder((n, ), name='B0', dtype='float32')
B1 = tvm.placeholder((n, ), name='B1', dtype='float32')
B2 = tvm.placeholder((n, ), name='B2', dtype='float32')

D = tvm.placeholder((n, ), name='D', dtype='float32')

# j is the reduction axis over the second point set; in the original file it is
# presumably declared earlier, e.g. j = tvm.reduce_axis((0, n), name='j').
# Note that (a - b) * (b - a) = -(a - b)**2, so D_ij is minus the squared
# distance and K_ij = exp(-||a_i - b_j||^2) is a Gaussian kernel.
D_ij = lambda i: (A0[i] - B0[j]) * (B0[j] - A0[i]) \
               + (A1[i] - B1[j]) * (B1[j] - A1[i]) \
               + (A2[i] - B2[j]) * (B2[j] - A2[i])
K_ij = lambda i: tvm.call_pure_extern("float32", "__expf", D_ij(i))

# Gaussian convolution: C0[i] = sum_j K(a_i, b_j) * D[j]
C0 = tvm.compute((n, ), lambda i: tvm.sum(K_ij(i) * D[j], axis=j), name="C0")

# Scheduled the computation
s0 = tvm.create_schedule(C0.op)
bx, tx = s0[C0].split(C0.op.axis[0], factor=192)
s0[C0].bind(bx, tvm.thread_axis("blockIdx.x"))
s0[C0].bind(tx, tvm.thread_axis("threadIdx.x"))

# Actually build the binary
fconv0 = tvm.build(s0, [A0, A1, A2, B0, B1, B2, D, C0],
                   tgt,
                   target_host=tgt_host,
                   name="myconv0")
Example #16
import tvm
import numpy as np

# direct declare extern math call
n = tvm.var('n')
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern('float32', '__expf', A[i]),
                name='B')
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis('blockIdx.x'))
s[B].bind(tx, tvm.thread_axis('threadIdx.x'))
f = tvm.build(s, [A, B], 'cuda', name='my_exp')
# print(f.imported_modules[0].get_source())

# unified intrinsic call
n = tvm.var('n')
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name='B')
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis('blockIdx.x'))
s[B].bind(tx, tvm.thread_axis('threadIdx.x'))
f_cuda = tvm.build(s, [A, B], 'cuda', name='my_exp')
print(f_cuda.imported_modules[0].get_source())

fopencl = tvm.build(s, [A, B], "opencl", name="myexp")
Example #17
 def reduce_op(x, y):
     assert x.dtype == y.dtype, "Reducing elements that don't have the same data type: %s vs. %s" % (
         x.dtype, y.dtype)
     return tvm.call_pure_extern(x.dtype, name, x, y, *args[1:])
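One plausible way to use such a combiner is to wrap it in a commutative reducer so it can appear inside tvm.compute; this is only a guess at the surrounding context, assuming the old tvm.comm_reducer API and an identity value of zero.

# Hypothetical wrapper around reduce_op.
my_reduce = tvm.comm_reducer(reduce_op,
                             lambda t: tvm.const(0, dtype=t),
                             name="my_extern_reduce")

# Usage sketch: reduce the rows of A with the extern combiner.
#     k = tvm.reduce_axis((0, m), name='k')
#     B = tvm.compute((n,), lambda i: my_reduce(A[i, k], axis=k), name='B')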
Example #18
import tvm
import numpy as np

######################################################################
# Direct Declare Extern Math Call
# -------------------------------
# The most straightforward way to call a target-specific function is via
# the extern function call construct in tvm.
# In the following example, we use :any:`tvm.call_pure_extern` to call
# the :code:`__expf` function, which is only available under CUDA.
#
n = tvm.var("n")
A = tvm.placeholder((n, ), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "cuda", name="myexp")
print(f.imported_modules[0].get_source())

######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call into device-specific functions.
# However, this approach only works for the CUDA target with float types.
Example #19
File: intrin_math.py  Project: gwli/tvm
import tvm
import numpy as np

######################################################################
# Direct Declare Extern Math Call
# -------------------------------
# The most straightforward way to call a target-specific function is via
# the extern function call construct in tvm.
# In the following example, we use :any:`tvm.call_pure_extern` to call
# the :code:`__expf` function, which is only available under CUDA.
#
n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape,
                lambda i: tvm.call_pure_extern("float32", "__expf", A[i]),
                name="B")
s = tvm.create_schedule(B.op)
num_thread = 64
bx, tx = s[B].split(B.op.axis[0], factor=num_thread)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
f = tvm.build(s, [A, B], "cuda", name="myexp")
print(f.imported_modules[0].get_source())

######################################################################
# Unified Intrinsic Call
# ----------------------
# The above code verifies that a direct external call can be used to
# call into device-specific functions.
# However, this approach only works for the CUDA target with float types.
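The tutorial these last two examples are taken from continues by switching to the unified intrinsic call, which dispatches to the appropriate extern function per target (the same pattern already shown above where both f_cuda and fopencl are built); a minimal sketch of that continuation under the same pre-0.7 API:

# Unified intrinsic call: tvm.exp is lowered to __expf on CUDA and to the
# matching function on other targets, so one compute definition serves both.
n = tvm.var("n")
A = tvm.placeholder((n,), name='A')
B = tvm.compute(A.shape, lambda i: tvm.exp(A[i]), name="B")
s = tvm.create_schedule(B.op)
bx, tx = s[B].split(B.op.axis[0], factor=64)
s[B].bind(bx, tvm.thread_axis("blockIdx.x"))
s[B].bind(tx, tvm.thread_axis("threadIdx.x"))
fcuda = tvm.build(s, [A, B], "cuda", name="myexp")
fopencl = tvm.build(s, [A, B], "opencl", name="myexp")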