Example No. 1
def test_fp16_build():
    dtype = "float16"

    dev = tvm.cuda(0)
    if dtype == "float16" and not have_fp16(dev.compute_version):
        print("skip because gpu does not support fp16")
        return

    x = relay.var("x", dtype=dtype, shape=(4, 4))
    y = relay.var("y", dtype=dtype, shape=(4, 4))
    z = x + y
    func = relay.Function([x, y], z)
    X = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
    Y = tvm.nd.array(np.random.uniform(-1, 1, (4, 4)).astype(dtype), device=dev)
    params = {
        "x": X,
        "y": Y,
    }

    # build
    g_json, mmod, params = relay.build(func, "cuda", params=params)

    # test
    rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
    rt.load_params(runtime.save_param_dict(params))
    rt.run()
    out = rt.get_output(0)

    np.testing.assert_allclose(out.asnumpy(), X.asnumpy() + Y.asnumpy(), atol=1e-5, rtol=1e-5)
Example No. 2
    def check_cuda(dtype, m=32, n=32):
        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
            print("skip because cuda is not enabled..")
            return
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        a = te.placeholder((m, n), name="a", dtype=dtype)
        b = te.placeholder((m, n), name="b", dtype=dtype)
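        # Fused elementwise workload followed by a full reduction:
        # e = (a + b) + (a * b), g = sum(e), checked against the NumPy reference below.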
        c = a + b
        d = a * b
        e = topi.elemwise_sum([c, d])
        g = topi.sum(e)
        with tvm.target.cuda():
            sg = topi.cuda.schedule_reduce(g)
            ctx = tvm.gpu(0)
            func = tvm.build(sg, [a, b, g], 'cuda')
            a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
            b_np = np.random.uniform(size=(m, n)).astype(b.dtype)
            g_np = np.sum(np.add(a_np * b_np, a_np + b_np))
            a_nd = tvm.nd.array(a_np, ctx)
            b_nd = tvm.nd.array(b_np, ctx)
            g_nd = tvm.nd.array(np.zeros(g_np.shape, dtype=g_np.dtype), ctx)
            func(a_nd, b_nd, g_nd)
            tvm.testing.assert_allclose(g_nd.asnumpy(), g_np, rtol=1e-3)
Example No. 3
    def check_single_op(opfunc, ref, dtype):
        shape = (10, 4)
        dtype = dtype
        tp = relay.TensorType(shape)
        x = relay.var("x", tp, dtype=dtype)
        y = opfunc(x)
        # test printer
        assert ("{}(%x)".format(y.op.name)) in y.astext()
        # test type inference
        yy = run_infer_type(y)
        assert yy.checked_type == tp

        if ref is not None:
            data = np.random.rand(*shape).astype(dtype)
            ref_res = ref(data)
            func = relay.Function([x], y)
            for target, ctx in tvm.testing.enabled_targets():
                # use the graph executor by default for testing, as we need to
                # create the function explicitly to avoid constant folding.
                if dtype == 'float16' and target == 'cuda' and not have_fp16(
                        tvm.gpu(0).compute_version):
                    continue
                intrp = relay.create_executor("graph", ctx=ctx, target=target)
                op_res = intrp.evaluate(func)(data)
                np.testing.assert_allclose(op_res.asnumpy(),
                                           ref_res,
                                           rtol=0.01)
Example No. 4
    def check_cuda(dtype):
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        n, m = 16, 16
        A = te.placeholder((n, m,), name='A', dtype=dtype)
        B = te.compute((n, m,), lambda j, i: A[j, (i + 1) % m], name='B')

        cuda_target = tvm.target.create("cuda")
        assert cuda_target.thread_warp_size == 2 * m
        with cuda_target:
            s = te.create_schedule(B.op)
            tx = te.thread_axis("threadIdx.x")
            ty = te.thread_axis("threadIdx.y")
            bx = te.thread_axis("blockIdx.x")

            AA = s.cache_read(A, "warp", [B])
            y, x = B.op.axis
            z, y = s[B].split(y, nparts=2)
            s[B].bind(x, tx)
            s[B].bind(y, ty)
            s[B].bind(z, bx)
            s[AA].compute_at(s[B], y)
            _, x = AA.op.axis
            s[AA].bind(x, tx)

            ctx = tvm.gpu(0)
            func = tvm.build(s, [A, B], "cuda")
            A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype)
            B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype)
            A_nd = tvm.nd.array(A_np, ctx)
            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx)
            func(A_nd, B_nd)
            tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3)
Example No. 5
    def check_conversion(tgt, ctx):
        if not tvm.runtime.enabled(tgt):
            print("skip because {} is not enabled.".format(tgt))
            return
        elif tgt == "cuda" and ctx.exist and not have_fp16(
                ctx.compute_version):
            print("skip because gpu does not support fp16")
            return

        n = 10

        for (src, dst) in [('float32', 'float16'), ('float16', 'float32')]:
            x = relay.var("x", relay.TensorType((n, ), src))
            y = x.astype(dst)
            func = relay.Function([x], y)

            # init input
            X = tvm.nd.array(n * np.random.randn(n).astype(src) - n / 2)

            # build
            with relay.build_config(opt_level=1):
                g_json, mmod, params = relay.build(
                    tvm.IRModule.from_expr(func), tgt)

            # test
            rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx)
            rt.set_input("x", X)
            rt.run()
            out = rt.get_output(0)

            np.testing.assert_allclose(out.asnumpy(),
                                       X.asnumpy().astype(dst),
                                       atol=1e-5,
                                       rtol=1e-5)
Example No. 6
    def check(t0, t1, factor):
        if (t0 == "float16" or t1 == "float16") and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        # compute
        n = 128
        A = te.placeholder((n,), dtype=t0, name="A")
        B = te.placeholder((n,), dtype=t1, name="B")
        C = te.compute((n,), lambda i: A[i] + topi.cast(B[i], A.dtype), name="C")

        # schedule
        s = tvm.te.create_schedule(C.op)
        ob, ib = s[C].split(s[C].op.axis[0], factor=factor)
        s[C].vectorize(ib)
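        # `tx` is assumed to be a threadIdx.x thread axis defined in the enclosing test scope.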
        s[C].bind(ob, tx)
        func = tvm.build(s, [A, B, C], "cuda")

        # correctness
        dev = tvm.cuda(0)
        low, high = (0, 20) if t0.startswith("u") or t1.startswith("u") else (-10, 10)
        a_np = np.random.randint(low, high, size=n).astype(A.dtype)
        b_np = np.random.randint(low, high, size=n).astype(B.dtype)
        c_np = (a_np + b_np).astype(A.dtype)
        a_nd = tvm.nd.array(a_np, dev)
        b_nd = tvm.nd.array(b_np, dev)
        c_nd = tvm.nd.array(np.zeros(c_np.shape, dtype=c_np.dtype), dev)
        func(a_nd, b_nd, c_nd)
        tvm.testing.assert_allclose(c_nd.numpy(), c_np, rtol=1e-3)
Example No. 7
    def check_binary_op(opfunc, ref, dtype):
        # TODO(@jroesch): this piece of code improperly uses type variables.
        n = tvm.var("n")
        s1 = (5, n, 5)
        s2 = (n, 1)
        t1 = relay.TensorType(s1)
        t2 = relay.TensorType(s2)
        x = relay.var("x", t1, dtype=dtype)
        y = relay.var("y", t2, dtype=dtype)
        z = opfunc(x, y)
        # test printer
        assert ("{}(%x, %y)".format(z.op.name)) in z.astext()
        zz = run_infer_type(z)
        assert zz.checked_type == t1

        if ref is not None:
            t1 = relay.TensorType((5, 10, 5))
            t2 = relay.TensorType((5, 10, 5))
            x = relay.var("x", t1, dtype=dtype)
            y = relay.var("y", t2, dtype=dtype)
            z = opfunc(x, y)
            x_data = np.random.rand(5, 10, 5).astype(dtype)
            y_data = np.random.rand(5, 10, 5).astype(dtype)
            ref_res = ref(x_data, y_data)
            func = relay.Function([x, y], z)

            for target, ctx in ctx_list():
                # use the graph executor by default for testing, as we need to
                # create the function explicitly to avoid constant folding.
                if dtype ==  'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version):
                    continue
                intrp = relay.create_executor("graph", ctx=ctx, target=target)
                op_res = intrp.evaluate(func)(x_data, y_data)
                np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
Example No. 8
def test_bias_add():
    for dtype in ["float16", "float32"]:
        xshape = (10, 2, 3, 4)
        bshape = (2,)
        rtol = 1e-2 if dtype == "float16" else 1e-5
        x = relay.var("x", shape=xshape, dtype=dtype)
        bias = relay.var("bias", dtype=dtype)
        z = relay.nn.bias_add(x, bias)
        zz = run_infer_type(z)
        assert "axis=" not in zz.astext()
        assert zz.args[1].checked_type == relay.TensorType(bshape, dtype)

        func = relay.Function([x, bias], z)
        x_data = np.random.uniform(size=xshape).astype(dtype)
        y_data = np.random.uniform(size=bshape).astype(dtype)
        ref_res = x_data + y_data.reshape((2, 1, 1))
        for target, dev in tvm.testing.enabled_targets():
            if (
                dtype == "float16"
                and target == "cuda"
                and not have_fp16(tvm.cuda(0).compute_version)
            ):
                continue
            op_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
                x_data, y_data
            )
            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=rtol)
Example No. 9
    def run_test(tvm_intrin, np_func, dtype):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return
        # This set of intrinsics does not yet support fp16.
        skip_set = {
            tvm.tir.abs,
            tvm.tir.round,
            tvm.tir.tan,
            tvm.tir.atan,
            tvm.tir.tanh,
            tvm.tir.cosh,
            tvm.tir.sinh,
        }
        if dtype == "float16" and tvm_intrin in skip_set:
            print("Skip because '{0}' does not support fp16 yet".format(tvm_intrin.__name__))
            return

        n = 128
        A = te.placeholder((n,), dtype=dtype, name="A")
        B = te.compute((n,), lambda *i: tvm_intrin(A(*i)), name="B")
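        # `sched` is a scheduling helper defined in the enclosing test (not shown here).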
        s = sched(B)
        f = tvm.build(s, [A, B], "cuda")
        dev = tvm.cuda(0)
        a = tvm.nd.array(np.random.uniform(0, 1, size=n).astype(A.dtype), dev)
        b = tvm.nd.array(np.zeros(shape=(n,)).astype(A.dtype), dev)
        f(a, b)
        tvm.testing.assert_allclose(b.numpy(), np_func(a.numpy()), atol=1e-3, rtol=1e-3)
Example No. 10
    def check_cuda(dtype, n, l, padding, lanes):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        dev = tvm.cuda(0)
        A = tvm.te.placeholder((n, l), name="A", dtype=dtype)
        B = tvm.te.compute(
            (n // lanes, l + 2 * padding, lanes),
            lambda i, j, k: tvm.te.if_then_else(
                tvm.te.any(j < padding, j >= l + padding),
                tvm.runtime.convert(0).astype(dtype),
                A[i * lanes + k, j - padding],
            ),
            name="B",
        )
        s = te.create_schedule(B.op)
        block, thread, vectorize = s[B].op.axis
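        # `bx` and `tx` are blockIdx.x / threadIdx.x thread axes defined in the enclosing test scope.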
        s[B].bind(block, bx)
        s[B].bind(thread, tx)
        s[B].vectorize(vectorize)
        fun = tvm.build(s, [A, B], "cuda", name="vector_load_permute_pad")
        np_a = np.random.randint(low=-128, high=127, size=(n, l)).astype(A.dtype)
        a = tvm.nd.empty((n, l), A.dtype, dev).copyfrom(np_a)
        b = tvm.nd.empty((n // lanes, l + padding * 2, lanes), B.dtype, dev)
        fun(a, b)
        np_a_reshape = np_a.reshape(n // lanes, lanes, l).transpose(0, 2, 1)
        ref = np.pad(
            np_a_reshape, ((0, 0), (padding, padding), (0, 0)), mode="constant", constant_values=0
        )
        tvm.testing.assert_allclose(b.numpy(), ref)
Example No. 11
def test_fp16_conversion(target, dev):
    if target == "cuda" and not have_fp16(dev.compute_version):
        print("skip because gpu does not support fp16")
        return

    n = 10

    for (src, dst) in [("float32", "float16"), ("float16", "float32")]:
        x = relay.var("x", relay.TensorType((n,), src))
        y = x.astype(dst)
        func = relay.Function([x], y)

        # init input
        X = tvm.nd.array(n * np.random.randn(n).astype(src) - n / 2)

        # build
        with tvm.transform.PassContext(opt_level=1):
            g_json, mmod, params = relay.build(tvm.IRModule.from_expr(func), target)

        # test
        rt = tvm.contrib.graph_executor.create(g_json, mmod, dev)
        rt.set_input("x", X)
        rt.run()
        out = rt.get_output(0)

        np.testing.assert_allclose(out.asnumpy(), X.asnumpy().astype(dst), atol=1e-5, rtol=1e-5)
Example No. 12
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
         print("skip because gpu does not support fp16")
         return
     if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = tvm.placeholder((n, ), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.compute((n, ),
                     lambda i: A[i] + tvm.const(1, A.dtype),
                     name='B')
     s = tvm.create_schedule(B.op)
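     # `num_thread` is defined in the enclosing test scope.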
     xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
     s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
     s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
     fun = tvm.build(s, [A, B], "cuda")
     ctx = tvm.gpu(0)
     a = tvm.nd.empty((n, ), A.dtype,
                      ctx).copyfrom(np.random.uniform(size=(n, lanes)))
     c = tvm.nd.empty((n, ), B.dtype, ctx)
     fun(a, c)
     tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)
Example No. 13
def test_concatenate():
    for dtype in ["float16", "float32"]:
        n, t, d = te.size_var("n"), te.size_var("t"), 100
        x = relay.var("x", shape=(n, t, d))
        y = relay.var("y", shape=(n, t, d))
        z = relay.concatenate((x, y), axis=-1)
        assert "axis=" in z.astext()
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t, 200))

        x = relay.exp(x)
        z = relay.concatenate((x, y), axis=2)
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t, 200))

        z = relay.concatenate((x, y), axis=1)
        zz = run_infer_type(z)
        assert zz.checked_type == relay.TensorType((n, t + t, 100))

        # check shape mismatches (the following case is expected to raise tvm._ffi.base.TVMError)
        try:
            x = relay.var("p1", shape=(2, 5))
            y = relay.var("p2", shape=(2, 3))
            c = relay.concatenate([x, y], axis=0)
            func = relay.Function([x, y], c)
            zz = run_infer_type(func)
        except tvm._ffi.base.TVMError:
            pass
        else:
            assert False

        x = relay.var("x", shape=(10, 5), dtype=dtype)
        y = relay.var("y", shape=(10, 5), dtype=dtype)
        t = relay.var("z", shape=(), dtype=dtype)
        z = relay.concatenate((x, y), axis=1)
        z = relay.add(z, t)
        # Check result.
        func = relay.Function([x, y, t], z)
        x_data = np.random.rand(10, 5).astype(dtype)
        y_data = np.random.rand(10, 5).astype(dtype)
        t_data = np.random.uniform(size=()).astype(dtype)
        ref_res = np.concatenate((x_data, y_data), axis=1) + t_data

        for target, dev in tvm.testing.enabled_targets():
            if (
                dtype == "float16"
                and target == "cuda"
                and not have_fp16(tvm.cuda(0).compute_version)
            ):
                continue
            op_res1 = relay.create_executor("graph", device=dev, target=target).evaluate(func)(
                x_data, y_data, t_data
            )
            tvm.testing.assert_allclose(op_res1.numpy(), ref_res, rtol=0.01)
            op_res2 = relay.create_executor("debug", device=dev, target=target).evaluate(func)(
                x_data, y_data, t_data
            )
            tvm.testing.assert_allclose(op_res2.numpy(), ref_res, rtol=0.01)
Example No. 14
 def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis):
     x = relay.Var("x", relay.TensorType(dshape, dtype))
     func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis))
     for target, ctx in ctx_list():
         if dtype ==  'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version):
             continue
         data = np.random.uniform(size=dshape).astype(dtype)
         ref_res = data.reshape(oshape)
         intrp = relay.create_executor("graph", ctx=ctx, target=target)
         op_res = intrp.evaluate(func)(data)
         np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=0.01)
Example No. 15
 def verify_expand_dims(dshape, dtype, oshape, axis, num_newaxis):
     x = relay.Var("x", relay.TensorType(dshape, dtype))
     func = relay.Function([x], relay.expand_dims(x, axis, num_newaxis))
     for target, dev in tvm.testing.enabled_targets():
         if (dtype == "float16" and target == "cuda"
                 and not have_fp16(tvm.cuda(0).compute_version)):
             continue
         data = np.random.uniform(size=dshape).astype(dtype)
         ref_res = data.reshape(oshape)
         op_res = relay.create_executor("graph", device=dev,
                                        target=target).evaluate(func)(data)
         np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=0.01)
Example No. 16
    def check_target(target, dev):
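        # `dtype`, `A`, `B`, `a_np`, and `b_np` come from the enclosing test (not shown here).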
        if dtype == "float16" and target == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because %s does not have fp16 support" % target)
            return
        print("Running on target: %s" % target)
        with tvm.target.Target(target):
            s = tvm.topi.testing.get_elemwise_schedule(target)(B)

        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
        foo = tvm.build(s, [A, B], target, name="relu")
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example No. 17
    def check_cuda(dtype):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        n, m = 16, 16
        A = te.placeholder(
            (
                n,
                m,
            ),
            name="A",
            dtype=dtype,
        )
        B = te.compute(
            (
                n,
                m,
            ),
            lambda j, i: A[j, (i + 1) % m],
            name="B",
        )

        cuda_target = tvm.target.Target("cuda")
        assert cuda_target.thread_warp_size == 2 * m
        with cuda_target:
            s = te.create_schedule(B.op)
            tx = te.thread_axis("threadIdx.x")
            ty = te.thread_axis("threadIdx.y")
            bx = te.thread_axis("blockIdx.x")

            AA = s.cache_read(A, "warp", [B])
            y, x = B.op.axis
            z, y = s[B].split(y, nparts=2)
            s[B].bind(x, tx)
            s[B].bind(y, ty)
            s[B].bind(z, bx)
            s[AA].compute_at(s[B], y)
            _, x = AA.op.axis
            s[AA].bind(x, tx)

            dev = tvm.cuda(0)
            # building with the CSE pass disabled as otherwise it would do some commoning
            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
                func = tvm.build(s, [A, B], "cuda")
            A_np = np.array([list(range(i, m + i)) for i in range(n)], dtype=dtype)
            B_np = np.array([list(range(1 + i, m + i)) + [i] for i in range(n)], dtype=dtype)
            A_nd = tvm.nd.array(A_np, dev)
            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
            func(A_nd, B_nd)
            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
Example No. 18
 def check_device(device, ctx):
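     # `in_dtype`, `in_shape`, `generator`, and the tensors `A`/`B` come from the enclosing test (not shown here).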
     if in_dtype == "float16" and device == "cuda" and not have_fp16(ctx.compute_version):
         print("Skip because %s does not have fp16 support" % device)
         return
     print("Running on target: %s" % device)
     with tvm.target.Target(device):
         s = tvm.topi.testing.get_elemwise_schedule(device)(B)
     foo = tvm.build(s, [A, B], device, name="reinterpret")
     data_npy = generator(in_shape).astype(in_dtype)
     out_npy = data_npy.view(B.dtype)
     data_nd = tvm.nd.array(data_npy, ctx)
     out_nd = tvm.nd.array(np.empty(in_shape).astype(B.dtype), ctx)
     foo(data_nd, out_nd)
     np.testing.assert_equal(out_nd.asnumpy(), out_npy)
Example No. 19
    def test_unary_op(self, target, dev, relay_op, ref_func, supports_fp16,
                      dtype):
        target = tvm.target.Target(target)
        if dtype == "float16":
            if target.kind.name == "cuda":
                if not have_fp16(tvm.cuda(0).compute_version):
                    pytest.xfail(
                        "No float16 support on local cuda device (compute_version != 5.3 and < 6.0)"
                    )
            elif target.kind.name == "vulkan" and not target.attrs.get(
                    "supports_float16", False):
                pytest.xfail(
                    "No float16 support on vulkan target (supports_float16=False)"
                )
            elif not supports_fp16:
                pytest.xfail(
                    f"No float16 support on {target.kind.name} target")

        if target.kind.name == "vulkan" and relay_op in [
                tvm.relay.erf,
                tvm.relay.tan,
                tvm.relay.atan,
        ]:
            pytest.xfail(f"Vulkan runtime doesn't yet support {relay_op}")

        shape = (10, 4)
        dtype = dtype
        tp = relay.TensorType(shape, dtype=dtype)
        x = relay.var("x", type_annotation=tp)
        y = relay_op(x)
        # test printer
        assert ("{}(%x)".format(y.op.name)) in y.astext()
        # test type inference
        yy = run_infer_type(y)
        assert yy.checked_type == tp

        if ref_func is not None:
            data = np.random.rand(*shape).astype(dtype)
            ref_res = ref_func(data).astype(dtype)
            func = relay.Function([x], y)
            # use the graph executor by default for testing, as we need to
            # create the function explicitly to avoid constant folding.
            op_res = relay.create_executor("graph", device=dev,
                                           target=target).evaluate(func)(data)
            tolerance = 1e-2 if dtype == "float16" else 1e-5
            np.testing.assert_allclose(op_res.numpy(), ref_res, rtol=tolerance)
Example No. 20
    def check_cuda(dtype):
        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
            print("skip because cuda is not enabled..")
            return
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        m = 32
        A = te.placeholder((m, ), name='A', dtype=dtype)
        B = te.placeholder((m, ), name='B', dtype=dtype)
        C = te.compute((m, ),
                       lambda i: A[(i + 1) % m] + B[(i + 1) % m],
                       name='C')

        cuda_target = tvm.target.create("cuda")
        assert m <= cuda_target.thread_warp_size
        with cuda_target:
            s = te.create_schedule(C.op)
            tx = te.thread_axis("threadIdx.x")
            bx = te.thread_axis("blockIdx.x")

            AA = s.cache_read(A, "warp", [C])
            BB = s.cache_read(B, "warp", [C])
            xo, xi = s[C].split(C.op.axis[0], nparts=1)
            s[C].bind(xi, tx)
            s[C].bind(xo, bx)
            s[AA].compute_at(s[C], xo)
            s[BB].compute_at(s[C], xo)
            xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1)
            s[AA].bind(xo, bx)
            s[AA].bind(xi, tx)
            xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1)
            s[BB].bind(xo, bx)
            s[BB].bind(xi, tx)

            ctx = tvm.gpu(0)
            func = tvm.build(s, [A, B, C], "cuda")
            AB_np = np.array(list(range(m)), dtype=dtype)
            C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2
            A_nd = tvm.nd.array(AB_np, ctx)
            B_nd = tvm.nd.array(AB_np, ctx)
            C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), ctx)
            func(A_nd, B_nd, C_nd)
            tvm.testing.assert_allclose(C_nd.asnumpy(), C_np, rtol=1e-3)
Example No. 21
    def check_device(device):
        if not tvm.testing.device_enabled(device):
            print("Skip because %s is not enabled" % device)
            return
        if dtype == "float16" and device == "cuda" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return
        with tvm.target.Target(device):
            ctx = tvm.context(device, 0)
            A = te.placeholder((n, m), name="A", dtype=dtype)
            B = te.compute((n, m), lambda i, j: A[i, j] + tvm.tir.const(1, A.dtype), name="B")
            S = tvm.topi.testing.get_elemwise_schedule(device)(B)

            fun = tvm.build(S, [A, B], device)
            np_A = tvm.nd.empty((n, m), A.dtype, ctx).copyfrom(np.random.uniform(size=(n, m)))
            np_B = tvm.nd.empty((n, m), B.dtype, ctx)
            fun(np_A, np_B)
            tvm.testing.assert_allclose(np_B.asnumpy(), np_A.asnumpy() + 1, rtol=1e-5)
Example No. 22
 def check_cuda(dtype, n, lanes):
     if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
         print("Skip because gpu does not have fp16 support")
         return
     if dtype == "int8" and not have_int8(tvm.cuda(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
     B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
     s = te.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
     s[B].bind(xo, bx)
     s[B].bind(xi, tx)
     fun = tvm.build(s, [A, B], "cuda")
     dev = tvm.cuda(0)
     a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
     c = tvm.nd.empty((n,), B.dtype, dev)
     fun(a, c)
     tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)
Example No. 23
    def check_device(device):
        ctx = tvm.context(device, 0)
        if not ctx.exist:
            print("Skip because %s is not enabled" % device)
            return
        if dtype == "float16" and device == "cuda" and not have_fp16(
                tvm.gpu(0).compute_version):
            print("Skip because %s does not have fp16 support" % device)
            return
        print("Running on target: %s" % device)
        with tvm.target.create(device):
            s = topi.testing.get_elemwise_schedule(device)(B)

        a = tvm.nd.array(a_np, ctx)
        b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype),
                         ctx)
        foo = tvm.build(s, [A, B], device, name="relu")
        foo(a, b)
        tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example No. 24
def test_relu(target, dev, m, n, dtype):
    A = te.placeholder((m, n), name="A", dtype=dtype)
    B = topi.nn.relu(A)

    a_np = np.random.uniform(low=-1.0, high=1.0, size=get_const_tuple(A.shape)).astype(A.dtype)
    b_np = a_np * (a_np > 0)

    if dtype == "float16" and target == "cuda" and not have_fp16(tvm.cuda(0).compute_version):
        pytest.skip("Skip because %s does not have fp16 support" % target)

    print("Running on target: %s" % target)
    with tvm.target.Target(target):
        s = tvm.topi.testing.get_elemwise_schedule(target)(B)

    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev)
    foo = tvm.build(s, [A, B], target, name="relu")
    foo(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5)
Example No. 25
    def check_cuda(dtype):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        m = 128
        A = te.placeholder((m,), name="A", dtype=dtype)
        B = te.compute((m,), lambda i: A[i // 32 * 32 + (i + 1) % 32], name="B")

        cuda_target = tvm.target.Target("cuda")
        assert cuda_target.thread_warp_size == 32
        with cuda_target:
            s = te.create_schedule(B.op)
            AA = s.cache_read(A, "warp", [B])
            xo, xi = s[B].split(B.op.axis[0], 64)
            xi0, xi1 = s[B].split(xi, factor=32)
            tx = te.thread_axis("threadIdx.x")
            s[B].bind(xi1, tx)
            s[B].bind(xo, te.thread_axis("blockIdx.x"))
            s[AA].compute_at(s[B], xo)
            xo, xi = s[AA].split(s[AA].op.axis[0], 32)
            s[AA].bind(xi, tx)

            dev = tvm.cuda(0)
            # building with the CSE pass disabled as otherwise it would do some commoning
            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
                func = tvm.build(s, [A, B], "cuda")
            A_np = np.array(list(range(m)), dtype=dtype)
            B_np = np.array(
                list(range(1, 32))
                + [0]
                + list(range(33, 64))
                + [32]
                + list(range(65, 96))
                + [64]
                + list(range(97, 128))
                + [96],
                dtype=dtype,
            )
            A_nd = tvm.nd.array(A_np, dev)
            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), dev)
            func(A_nd, B_nd)
            tvm.testing.assert_allclose(B_nd.numpy(), B_np, rtol=1e-3)
Example No. 26
    def check_cuda(dtype):
        if dtype == "float16" and not have_fp16(tvm.cuda(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        m = 32
        A = te.placeholder((m,), name="A", dtype=dtype)
        B = te.placeholder((m,), name="B", dtype=dtype)
        C = te.compute((m,), lambda i: A[(i + 1) % m] + B[(i + 1) % m], name="C")

        cuda_target = tvm.target.Target("cuda")
        assert m <= cuda_target.thread_warp_size
        with cuda_target:
            s = te.create_schedule(C.op)
            tx = te.thread_axis("threadIdx.x")
            bx = te.thread_axis("blockIdx.x")

            AA = s.cache_read(A, "warp", [C])
            BB = s.cache_read(B, "warp", [C])
            xo, xi = s[C].split(C.op.axis[0], nparts=1)
            s[C].bind(xi, tx)
            s[C].bind(xo, bx)
            s[AA].compute_at(s[C], xo)
            s[BB].compute_at(s[C], xo)
            xo, xi = s[AA].split(s[AA].op.axis[0], nparts=1)
            s[AA].bind(xo, bx)
            s[AA].bind(xi, tx)
            xo, xi = s[BB].split(s[BB].op.axis[0], nparts=1)
            s[BB].bind(xo, bx)
            s[BB].bind(xi, tx)

            dev = tvm.cuda(0)
            # building with the CSE pass disabled as otherwise it would do some commoning
            with tvm.transform.PassContext(opt_level=3, disabled_pass=["tir.CommonSubexprElimTIR"]):
                func = tvm.build(s, [A, B, C], "cuda")
            AB_np = np.array(list(range(m)), dtype=dtype)
            C_np = np.array(list(range(1, m)) + [0], dtype=dtype) * 2
            A_nd = tvm.nd.array(AB_np, dev)
            B_nd = tvm.nd.array(AB_np, dev)
            C_nd = tvm.nd.array(np.zeros(C_np.shape, dtype=C_np.dtype), dev)
            func(A_nd, B_nd, C_nd)
            tvm.testing.assert_allclose(C_nd.numpy(), C_np, rtol=1e-3)
Example No. 27
    def check(device, dtype, m=32, n=32):
        if not tvm.testing.device_enabled(device):
            print("Skipping", device)
            return
        dev = tvm.device(device, 0)
        if dtype == "float16" and not have_fp16(dev.compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        a = tvm.te.placeholder((m, n), name="a", dtype=dtype)
        b = topi.sum(a)
        with tvm.target.Target(device):
            sb = tvm.te.create_schedule(b.op)
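            # Binding the reduce axis to threadIdx.x makes the sum a cross-thread
            # reduction performed cooperatively by the threads of one block.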
            i, _ = b.op.reduce_axis
            sb[b].bind(i, tvm.te.thread_axis("threadIdx.x"))
            func = tvm.build(sb, [a, b], device)
            a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
            b_np = np.sum(a_np)
            a_nd = tvm.nd.array(a_np, dev)
            b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), dev)
            func(a_nd, b_nd)
            tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
Example No. 28
def test_bias_add():
    for dtype in ['float16', 'float32']:
        xshape=(10, 2, 3, 4)
        bshape=(2,)
        rtol = 1e-2 if dtype == 'float16' else 1e-5
        x = relay.var("x", shape=xshape, dtype=dtype)
        bias = relay.var("bias", dtype=dtype)
        z = relay.nn.bias_add(x, bias)
        zz = run_infer_type(z)
        assert "axis=" not in zz.astext()
        assert zz.args[1].checked_type == relay.TensorType(bshape, dtype)

        func = relay.Function([x, bias], z)
        x_data = np.random.uniform(size=xshape).astype(dtype)
        y_data = np.random.uniform(size=bshape).astype(dtype)
        ref_res = x_data + y_data.reshape((2, 1, 1))
        for target, ctx in ctx_list():
            if dtype ==  'float16' and target == 'cuda' and not have_fp16(tvm.gpu(0).compute_version):
                continue
            intrp = relay.create_executor("graph", ctx=ctx, target=target)
            op_res = intrp.evaluate(func)(x_data, y_data)
            np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol)
Example No. 29
    def check_cuda(dtype):
        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
            print("skip because cuda is not enabled..")
            return
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        m = 128
        A = te.placeholder((m, ), name='A', dtype=dtype)
        B = te.compute((m, ),
                       lambda i: A[i // 32 * 32 + (i + 1) % 32],
                       name='B')

        cuda_target = tvm.target.create("cuda")
        assert cuda_target.thread_warp_size == 32
        with cuda_target:
            s = te.create_schedule(B.op)
            AA = s.cache_read(A, "warp", [B])
            xo, xi = s[B].split(B.op.axis[0], 64)
            xi0, xi1 = s[B].split(xi, factor=32)
            tx = te.thread_axis("threadIdx.x")
            s[B].bind(xi1, tx)
            s[B].bind(xo, te.thread_axis("blockIdx.x"))
            s[AA].compute_at(s[B], xo)
            xo, xi = s[AA].split(s[AA].op.axis[0], 32)
            s[AA].bind(xi, tx)

            ctx = tvm.gpu(0)
            func = tvm.build(s, [A, B], "cuda")
            A_np = np.array(list(range(m)), dtype=dtype)
            B_np = np.array(list(range(1, 32)) + [0] + list(range(33, 64)) +
                            [32] + list(range(65, 96)) + [64] +
                            list(range(97, 128)) + [96],
                            dtype=dtype)
            A_nd = tvm.nd.array(A_np, ctx)
            B_nd = tvm.nd.array(np.zeros(B_np.shape, dtype=B_np.dtype), ctx)
            func(A_nd, B_nd)
            tvm.testing.assert_allclose(B_nd.asnumpy(), B_np, rtol=1e-3)
Example No. 30
    def check_cuda(dtype, m=32, n=32):
        if not tvm.gpu(0).exist or not tvm.runtime.enabled("cuda"):
            print("skip because cuda is not enabled..")
            return
        if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
            print("Skip because gpu does not have fp16 support")
            return

        a = tvm.te.placeholder((m, n), name="a", dtype=dtype)
        b = topi.sum(a)
        with tvm.target.cuda():
            sb = tvm.te.create_schedule(b.op)
            i, _ = b.op.reduce_axis
            sb[b].bind(i, tvm.te.thread_axis("threadIdx.x"))
            ctx = tvm.gpu(0)
            func = tvm.build(sb, [a, b], 'cuda')
            a_np = np.random.uniform(size=(m, n)).astype(a.dtype)
            b_np = np.sum(a_np)
            a_nd = tvm.nd.array(a_np, ctx)
            b_nd = tvm.nd.array(np.zeros(b_np.shape, dtype=b_np.dtype), ctx)
            func(a_nd, b_nd)
            tvm.testing.assert_allclose(b_nd.asnumpy(), b_np, rtol=1e-3)
Example No. 31
 def check_cuda(dtype, n, lanes):
     if not tvm.gpu(0).exist or not tvm.module.enabled("cuda"):
         print("skip because cuda is not enabled..")
         return
     if dtype == "float16" and not have_fp16(tvm.gpu(0).compute_version):
         print("skip because gpu does not support fp16")
         return
     if dtype == "int8" and not have_int8(tvm.gpu(0).compute_version):
         print("skip because gpu does not support int8")
         return
     A = tvm.placeholder((n,), name='A', dtype="%sx%d" % (dtype, lanes))
     B = tvm.compute((n,), lambda i: A[i]+tvm.const(1, A.dtype), name='B')
     s = tvm.create_schedule(B.op)
     xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
     s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
     s[B].bind(xi, tvm.thread_axis("threadIdx.x"))
     fun = tvm.build(s, [A, B], "cuda")
     ctx = tvm.gpu(0)
     a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(
         np.random.uniform(size=(n, lanes)))
     c = tvm.nd.empty((n,), B.dtype, ctx)
     fun(a, c)
     tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + 1)