def test_rocm_cross_thread_reduction():
    if not tvm.rocm(0).exist or not tvm.module.enabled("rocm"):
        print("skip because rocm is not enabled..")
        return
    # based on the reduction tutorial
    n = tvm.var("n")
    m = tvm.var("m")
    A = tvm.placeholder((n, m), name='A')
    k = tvm.reduce_axis((0, m), "k")
    B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
    s[B].bind(xi, tvm.thread_axis("threadIdx.y"))
    tx = tvm.thread_axis("threadIdx.x")
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    s[B].set_store_predicate(tx.var.equal(0))
    frocm = tvm.build(s, [A, B], "rocm")
    nn = 128
    ctx = tvm.rocm(0)
    a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
    frocm(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)

def test_rocm_cross_thread_reduction():
    # based on the reduction tutorial
    n = te.size_var("n")
    m = te.size_var("m")
    A = te.placeholder((n, m), name="A")
    k = te.reduce_axis((0, m), "k")
    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    # bx, ty, tx are module-level thread axes
    # (te.thread_axis("blockIdx.x"/"threadIdx.y"/"threadIdx.x"))
    s[B].bind(xo, bx)
    s[B].bind(xi, ty)
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
    s[B].set_store_predicate(tx.var.equal(0))
    frocm = tvm.build(s, [A, B], "rocm")
    nn = 128
    ctx = tvm.rocm(0)
    a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
    frocm(a, b)
    tvm.testing.assert_allclose(b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)

def check_rocm(dtype, n):
    A = tvm.placeholder((n,), name='A', dtype=dtype)
    ctx = tvm.rocm(0)
    a_np = np.random.uniform(size=(n,)).astype(A.dtype)
    a = tvm.nd.empty((n,), A.dtype, ctx).copyfrom(a_np)
    b_np = a.asnumpy()
    tvm.testing.assert_allclose(a_np, b_np)
    tvm.testing.assert_allclose(a_np, a.asnumpy())

def enabled_ctx_list():
    ctx_list = [('cpu', tvm.cpu(0)),
                ('gpu', tvm.gpu(0)),
                ('cl', tvm.opencl(0)),
                ('metal', tvm.metal(0)),
                ('rocm', tvm.rocm(0)),
                ('vpi', tvm.vpi(0))]
    for k, v in ctx_list:
        assert tvm.context(k, 0) == v
    ctx_list = [x[1] for x in ctx_list if x[1].exist]
    return ctx_list

def enabled_ctx_list():
    ctx_list = [('cpu', tvm.cpu(0)),
                ('gpu', tvm.gpu(0)),
                ('cl', tvm.opencl(0)),
                ('metal', tvm.metal(0)),
                ('rocm', tvm.rocm(0)),
                ('vulkan', tvm.vulkan(0)),
                ('vpi', tvm.vpi(0))]
    for k, v in ctx_list:
        assert tvm.context(k, 0) == v
    ctx_list = [x[1] for x in ctx_list if x[1].exist]
    return ctx_list

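# Hedged usage sketch (not from the original file): a test can loop over
# whatever contexts `enabled_ctx_list` reports as present and run the same
# round-trip check on each one.
import numpy as np
import tvm

for ctx in enabled_ctx_list():
    x_np = np.arange(10, dtype="float32")
    x = tvm.nd.array(x_np, ctx)          # upload to the device
    tvm.testing.assert_allclose(x.asnumpy(), x_np)  # download and compare
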
def verify(target="rocm"):
    # s, A, B, C and the shapes n, l, m come from the enclosing rocblas matmul test
    if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
        print("skip because extern function is not available")
        return
    ctx = tvm.rocm(0)
    f = tvm.build(s, [A, B, C], target)
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)

def check_rocm(dtype, n, lanes):
    # num_thread, bx and tx come from the enclosing test (thread extent and
    # block/thread axes)
    A = te.placeholder((n,), name="A", dtype="%sx%d" % (dtype, lanes))
    B = te.compute((n,), lambda i: A[i] + tvm.tir.const(1, A.dtype), name="B")
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=num_thread)
    s[B].bind(xo, bx)
    s[B].bind(xi, tx)
    fun = tvm.build(s, [A, B], "rocm")
    dev = tvm.rocm(0)
    a = tvm.nd.empty((n,), A.dtype, dev).copyfrom(np.random.uniform(size=(n, lanes)))
    c = tvm.nd.empty((n,), B.dtype, dev)
    fun(a, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + 1)

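# Hedged usage sketch: the context below (thread extent, axes) and the chosen
# dtypes/lanes are assumptions, not taken from the original test, but show how
# `check_rocm` would typically be driven for a vectorized add.
from tvm import te

num_thread = 8
bx = te.thread_axis("blockIdx.x")
tx = te.thread_axis("threadIdx.x")

check_rocm("float32", 64, 2)   # float32x2 vector lanes
check_rocm("float16", 64, 2)   # float16x2 vector lanes
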
def verify(target="rocm"):
    if not tvm.module.enabled(target):
        print("skip because %s is not enabled..." % target)
        return
    if not tvm.get_global_func("tvm.contrib.rocblas.matmul", True):
        print("skip because extern function is not available")
        return
    ctx = tvm.rocm(0)
    f = tvm.build(s, [A, B, C], target)
    a = tvm.nd.array(np.random.uniform(size=(n, l)).astype(A.dtype), ctx)
    b = tvm.nd.array(np.random.uniform(size=(l, m)).astype(B.dtype), ctx)
    c = tvm.nd.array(np.zeros((n, m), dtype=C.dtype), ctx)
    f(a, b, c)
    tvm.testing.assert_allclose(
        c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5)

def verify():
    # X, W, Y, the tensor shapes and the conv parameters (strides, padding,
    # dilation) come from the enclosing test
    ctx = tvm.rocm(0)
    f = tvm.build(s, [X, W, Y], "rocm", target_host="llvm", name="conv2d")
    x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx)
    w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx)
    y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
    f(x, w, y)
    Y_ref = topi.nn.conv2d_nchw(X, W, (stride_h, stride_w), (pad_h, pad_w),
                                (dilation_h, dilation_w))
    with tvm.target.rocm():
        s_ref = topi.generic.schedule_conv2d_nchw([Y_ref])
    f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm")
    y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
    f_ref(x, w, y_ref)
    print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy())))
    tvm.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3)

def verify():
    ctx = tvm.rocm(0)
    f = tvm.build(s, [X, W, Y], "rocm", target_host="llvm", name="conv2d")
    x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), ctx)
    w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), ctx)
    y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
    f(x, w, y)
    Y_ref = topi.nn.conv2d_nchw(X, W, (stride_h, stride_w), (pad_h, pad_w))
    with tvm.target.rocm():
        s_ref = topi.generic.schedule_conv2d_nchw([Y_ref])
    f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm")
    y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), ctx)
    f_ref(x, w, y_ref)
    print("Max abs diff:", np.max(np.abs(y.asnumpy() - y_ref.asnumpy())))
    tvm.testing.assert_allclose(y.asnumpy(), y_ref.asnumpy(), atol=1e-3)

def verify_softmax(shape, axis, dtype="float32", log_softmax=False):
    miopen_op = miopen.log_softmax if log_softmax else miopen.softmax
    testing_op = (
        tvm.topi.testing.log_softmax_python
        if log_softmax
        else tvm.topi.testing.softmax_python
    )
    A = te.placeholder(shape, dtype=dtype, name="A")
    B = miopen_op(A, axis)
    s = te.create_schedule([B.op])
    dev = tvm.rocm(0)
    a_np = np.random.uniform(size=shape).astype(dtype)
    b_np = testing_op(a_np)
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(b_np, dev)
    f = tvm.build(s, [A, B], target="rocm --host=llvm", name="softmax")
    f(a, b)
    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-3)

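# Hedged usage sketch: typical invocations of `verify_softmax`; the shapes and
# axes here are illustrative assumptions, not taken from the original test.
verify_softmax((32, 10), -1)
verify_softmax((32, 10), -1, log_softmax=True)
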
def verify(target="rocm"):
    # s, A, B, C, the shapes, transa/transb, `lib` (the rocblas contrib module)
    # and the `get_numpy` reference helper come from the enclosing batch_matmul test
    if not tvm.testing.device_enabled(target):
        print("skip because %s is not enabled..." % target)
        return
    if not tvm.get_global_func(lib.__name__ + ".batch_matmul", True):
        print("skip because extern function is not available")
        return
    dev = tvm.rocm(0)
    f = tvm.build(s, [A, B, C], target)
    a = tvm.nd.array(np.random.uniform(size=ashape).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=bshape).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros((batch, m, n), dtype=C.dtype), dev)
    f(a, b, c)
    tvm.testing.assert_allclose(
        c.numpy(), get_numpy(a.numpy(), b.numpy(), transa, transb), rtol=1e-5)

def verify():
    dev = tvm.rocm(0)
    f = tvm.build(s, [X, W, Y], "rocm --host=llvm", name="conv2d")
    x = tvm.nd.array(np.random.uniform(-1, 1, xshape).astype(np.float32), dev)
    w = tvm.nd.array(np.random.uniform(-1, 1, wshape).astype(np.float32), dev)
    y = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev)
    f(x, w, y)
    Y_ref = topi.nn.conv2d_nchw(
        X, W, (stride_h, stride_w), (pad_h, pad_w), (dilation_h, dilation_w))
    s_ref = te.create_schedule(Y_ref.op)
    f_ref = tvm.build(s_ref, [X, W, Y_ref], "rocm --host=llvm")
    y_ref = tvm.nd.array(np.random.uniform(-1, 1, yshape).astype(np.float32), dev)
    f_ref(x, w, y_ref)
    print("Max abs diff:", np.max(np.abs(y.numpy() - y_ref.numpy())))
    tvm.testing.assert_allclose(y.numpy(), y_ref.numpy(), atol=1e-3)

def requires_gpu(*args):
    """Mark a test as requiring a GPU to run.

    Tests with this mark will not be run unless a gpu is present.

    Parameters
    ----------
    f : function
        Function to mark
    """
    _requires_gpu = [
        pytest.mark.skipif(
            not tvm.cuda().exist
            and not tvm.rocm().exist
            and not tvm.opencl().exist
            and not tvm.metal().exist
            and not tvm.vulkan().exist,
            reason="No GPU present",
        ),
        *uses_gpu(),
    ]
    return _compose(args, _requires_gpu)

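# Hedged usage sketch: `requires_gpu` is exposed as a pytest decorator
# (commonly reached as `tvm.testing.requires_gpu`); the test name and body
# below are illustrative only.
import tvm
import tvm.testing

@tvm.testing.requires_gpu
def test_gpu_only_path():
    # Runs only when at least one GPU runtime (CUDA, ROCm, OpenCL, Metal,
    # Vulkan) reports an existing device; otherwise pytest skips it.
    pass
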
def test_rocm_inf_nan():
    def check_inf_nan(ctx, n, value, dtype):
        A = tvm.placeholder((n,), name='A', dtype=dtype)
        inf_value = tvm.const(value, dtype=dtype)
        C = tvm.compute((n,), lambda i: inf_value, name='C')
        s = tvm.create_schedule(C.op)
        s[C].bind(s[C].op.axis[0], tx)  # tx: module-level threadIdx.x axis
        fun = tvm.build(s, [A, C], "rocm")
        a = tvm.nd.empty((n,), A.dtype, ctx)
        c = tvm.nd.empty((n,), A.dtype, ctx)
        # Only need to test compiling here
        fun(a, c)

    ctx = tvm.rocm(0)
    check_inf_nan(ctx, 1, -float('inf'), 'float32')
    check_inf_nan(ctx, 1, -float('inf'), 'float64')
    check_inf_nan(ctx, 1, float('inf'), 'float32')
    check_inf_nan(ctx, 1, float('inf'), 'float64')
    check_inf_nan(ctx, 1, float('nan'), 'float32')
    check_inf_nan(ctx, 1, float('nan'), 'float64')

def test_rocm_inf_nan():
    def check_inf_nan(dev, n, value, dtype):
        A = te.placeholder((n,), name="A", dtype=dtype)
        inf_value = tvm.tir.const(value, dtype=dtype)
        C = te.compute((n,), lambda i: inf_value, name="C")
        s = te.create_schedule(C.op)
        s[C].bind(s[C].op.axis[0], tx)
        fun = tvm.build(s, [A, C], "rocm")
        a = tvm.nd.empty((n,), A.dtype, dev)
        c = tvm.nd.empty((n,), A.dtype, dev)
        # Only need to test compiling here
        fun(a, c)

    dev = tvm.rocm(0)
    check_inf_nan(dev, 1, -float("inf"), "float32")
    check_inf_nan(dev, 1, -float("inf"), "float64")
    check_inf_nan(dev, 1, float("inf"), "float32")
    check_inf_nan(dev, 1, float("inf"), "float64")
    check_inf_nan(dev, 1, float("nan"), "float32")
    check_inf_nan(dev, 1, float("nan"), "float64")

# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
from tvm import te
import numpy as np
import unittest

tx = te.thread_axis("threadIdx.x")
ty = te.thread_axis("threadIdx.y")
bx = te.thread_axis("blockIdx.x")
by = te.thread_axis("blockIdx.y")


@unittest.skipIf(not tvm.rocm(0).exist or not tvm.runtime.enabled("rocm"),
                 "skip because rocm is not enabled..")
def test_rocm_cross_thread_reduction():
    # based on the reduction tutorial
    n = te.size_var("n")
    m = te.size_var("m")
    A = te.placeholder((n, m), name='A')
    k = te.reduce_axis((0, m), "k")
    B = te.compute((n,), lambda i: te.sum(A[i, k], axis=k), name="B")
    s = te.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B].bind(xo, bx)
    s[B].bind(xi, ty)
    s[B].bind(s[B].op.reduce_axis[0], tx)
    s[BF].compute_at(s[B], s[B].op.reduce_axis[0])

# inspect the best config
dispatch_context = autotvm.apply_history_best("conv2d.log")
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# apply history best from log file
with autotvm.apply_history_best('conv2d.log'):
    with tvm.target.create("rocm"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# check correctness
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

ctx = tvm.rocm()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print('Time cost of this operator: %f' % evaluator(a_tvm, w_tvm, c_tvm).mean)

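# Hedged sketch of the tuning step that would normally precede the block above
# and produce "conv2d.log"; the tuner choice, trial count, and runner settings
# are assumptions, not taken from this fragment.
measure_option = autotvm.measure_option(
    builder=autotvm.LocalBuilder(),
    runner=autotvm.LocalRunner(number=10, timeout=4))
tuner = autotvm.tuner.XGBTuner(task)
tuner.tune(n_trial=20,
           measure_option=measure_option,
           callbacks=[autotvm.callback.log_to_file("conv2d.log")])
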
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
import numpy as np
import unittest

tx = tvm.thread_axis("threadIdx.x")
ty = tvm.thread_axis("threadIdx.y")
bx = tvm.thread_axis("blockIdx.x")
by = tvm.thread_axis("blockIdx.y")


@unittest.skipIf(not tvm.rocm(0).exist or not tvm.module.enabled("rocm"),
                 "skip because rocm is not enabled..")
def test_rocm_cross_thread_reduction():
    # based on the reduction tutorial
    n = tvm.size_var("n")
    m = tvm.size_var("m")
    A = tvm.placeholder((n, m), name='A')
    k = tvm.reduce_axis((0, m), "k")
    B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B].bind(xo, bx)
    s[B].bind(xi, ty)
    s[B].bind(s[B].op.reduce_axis[0], tx)

#
xo, xi = s[B].split(s[B].op.axis[0], factor=32)
s[B].bind(xo, tvm.thread_axis("blockIdx.x"))
s[B].bind(xi, tvm.thread_axis("threadIdx.y"))
tx = tvm.thread_axis("threadIdx.x")
s[B].bind(s[B].op.reduce_axis[0], tx)
s[BF].compute_at(s[B], s[B].op.reduce_axis[0])
s[B].set_store_predicate(tx.var.equal(0))
frocm = tvm.build(s, [A, B], "rocm")
print(frocm.get_source())

######################################################################
# Verify the correctness of result kernel by comparing it to numpy.
#
nn = 128
ctx = tvm.rocm(0)
a = tvm.nd.array(np.random.uniform(size=(nn, nn)).astype(A.dtype), ctx)
b = tvm.nd.array(np.zeros(nn, dtype=B.dtype), ctx)
frocm(a, b)
tvm.testing.assert_allclose(
    b.asnumpy(), np.sum(a.asnumpy(), axis=1), rtol=1e-4)

######################################################################
# Describe Convolution via 2D Reduction
# -------------------------------------
# In TVM, we can describe convolution via 2D reduction in a simple way.
# Here is an example for 2D convolution with filter size = [3, 3] and strides = [1, 1].
#
n = tvm.var('n')
Input = tvm.placeholder((n, n), name='Input')
Filter = tvm.placeholder((3, 3), name='Filter')

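# Hedged continuation sketch (not part of the fragment above): the
# convolution-as-2D-reduction example typically finishes by reducing over the
# two 3x3 filter axes; the variable names di, dj and Output are assumptions.
di = tvm.reduce_axis((0, 3), name='di')
dj = tvm.reduce_axis((0, 3), name='dj')
Output = tvm.compute(
    (n - 2, n - 2),
    lambda i, j: tvm.sum(Input[i + di, j + dj] * Filter[di, dj], axis=[di, dj]),
    name='Output')
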
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import tvm
import numpy as np
import unittest

tx = tvm.thread_axis("threadIdx.x")
ty = tvm.thread_axis("threadIdx.y")
bx = tvm.thread_axis("blockIdx.x")
by = tvm.thread_axis("blockIdx.y")


@unittest.skipIf(not tvm.rocm(0).exist or not tvm.runtime.enabled("rocm"),
                 "skip because rocm is not enabled..")
def test_rocm_cross_thread_reduction():
    # based on the reduction tutorial
    n = tvm.size_var("n")
    m = tvm.size_var("m")
    A = tvm.placeholder((n, m), name='A')
    k = tvm.reduce_axis((0, m), "k")
    B = tvm.compute((n,), lambda i: tvm.sum(A[i, k], axis=k), name="B")
    s = tvm.create_schedule(B.op)
    ko, ki = s[B].split(B.op.reduce_axis[0], factor=16)
    BF = s.rfactor(B, ki)
    xo, xi = s[B].split(s[B].op.axis[0], factor=32)
    s[B].bind(xo, bx)
    s[B].bind(xi, ty)
    s[B].bind(s[B].op.reduce_axis[0], tx)