def check_device(device): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv2d_nhwc_implement) B = fcompute(A, W, stride, padding, dilation, dtype) s = fschedule([B]) dev = tvm.device(device, 0) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) func = tvm.build(s, [A, W, B], device) func(a, w, b) tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
def check_target(target, ir): C = te.extern( (n, ), [], lambda ins, outs: ir(outs[0]), name="collatz", dtype="int32", ) s = te.create_schedule(C.op) with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [C], target) dev = tvm.device(target, 0) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) func(c) ref = np.array([collatz_ref(i) for i in range(n)]) tvm.testing.assert_allclose(c.numpy(), ref)
def profile_and_build_vm( mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro", use_fast_math=False, ): mod = partition_for_cutlass(mod) mod, num_cutlass_partition = tune_cutlass_kernels(mod, sm, tmp_dir=tmp_dir) with tvm.transform.PassContext(opt_level=3): vm_exec = relay.vm.compile(mod, target="cuda", params=params) vm_exec = build_cutlass_kernels_vm( vm_exec, sm, tmp_dir, lib_path, vmcode_path, use_fast_math=use_fast_math ) dev = tvm.device("cuda", 0) return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
def test_out_of_bounds_llvm(index_a, index_b): n = te.size_var("n") A = te.placeholder((n, ), name="A") B = te.placeholder((n, ), name="B") C = te.compute(A.shape, lambda i: A[i + index_a] + B[i + index_b], name="C") s = te.create_schedule(C.op) tgt = "llvm" tgt_host = "llvm" stmt = tvm.lower(s, [A, B, C], simple_mode=True) print(stmt) fadd = tvm.build(s, [A, B, C], tgt, target_host=tgt_host, name="myadd") dev = tvm.device(tgt, 0) a = tvm.nd.array(np.random.uniform(size=1024).astype(A.dtype), dev) b = tvm.nd.array(np.random.uniform(size=1024).astype(B.dtype), dev) c = tvm.nd.array(np.zeros(1024, dtype=C.dtype), dev) fadd(a, b, c)
def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): if bgemm == "direct": fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv2d_nhwc_winograd_direct) elif bgemm == "tensorcore": fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv2d_nhwc_winograd_tensorcore) C = fcompute(A, W, stride, padding, dilation, "float32") if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=2e-3)
def test_dense_dense(): M, N, K = 128, 128, 128 data_shape = (M, K) weight_shape = (N, K) relay_mod = tvm.IRModule.from_expr( get_dense_dense(data_shape, weight_shape)) # print(relay.transform.InferType()(relay_mod)) data_np = np.random.randn(*data_shape).astype("float32") weight1_np = np.random.randn(*weight_shape).astype("float32") weight2_np = np.random.randn(*weight_shape).astype("float32") target = "llvm" params = {"weight1": weight1_np, "weight2": weight2_np} def schedule_fn(task, sch): if "nn_dense_nn_dense" in task.task_name: schedule_dense_dense(sch) return True return False database = apply_fixed_schedules(relay_mod, target, params, schedule_fn) with ApplyHistoryBest(database): with tvm.transform.PassContext( opt_level=3, config={"relay.backend.use_meta_schedule": True}, ): lib = relay.build(relay_mod, target=target, params=params) dev = tvm.device(target, 0) runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) runtime.set_input("data", data_np) runtime.run() out = runtime.get_output(0).numpy() ref = get_ref(data_np, weight1_np, weight2_np) tvm.testing.assert_allclose(out, ref, atol=1e-4, rtol=1e-4)
def check_device(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv2d_nchw_winograd_implement) C = fcompute(A, W, stride, padding, dilation, dtype) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) rtol = 1e-3 tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=rtol)
def test_nat_add(): mod = tvm.IRModule() p = Prelude(mod) p.mod.import_from_std("nat.rly") nat, z, s = p.mod.get_type("nat") add = p.mod.get_global_var("nat_add") dev = tvm.device("llvm", 0) intrp = create_executor(mod=mod, device=dev, target="llvm") assert mod[add].checked_type == relay.FuncType([nat(), nat()], nat()) assert count(p, intrp.evaluate(add(s(z()), s(z())))) == 2 expr = add(s(z()), s(z())) f = relay.GlobalVar("f") mod[f] = relay.Function([], expr) mod = transform.InferType()(mod) mod = transform.ToBasicBlockNormalForm()(mod) opt_expr = mod["f"] assert count(p, intrp.evaluate(opt_expr.body)) == 2 assert not Feature.fLet in detect_feature(mod[add]) check_basic_block_normal_form(opt_expr)
def check_target(target): if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) return dev = tvm.device(target, 0) print("Running on target: %s" % target) with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch( target, _argsort_implement) out = fcompute(data, axis=axis, is_ascend=is_ascend) s = fschedule(out) tvm_data = tvm.nd.array(np_data, dev) tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data_dtype), dev) f = tvm.build(s, [data, out], target) f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_indices.astype(data_dtype), rtol=1e0)
def test_check_and_update_host_consist_4(): """Test `check_and_update_host_consist` by using TVM Objects""" cuda_device_type = tvm.device("cuda").device_type target = {cuda_device_type: Target(target="cuda", host="llvm")} host = None target_1, host_1 = Target.check_and_update_host_consist(target, host) assert isinstance(target_1, dict) assert target_1[cuda_device_type].kind.name == "cuda" assert target_1[cuda_device_type].host.kind.name == "llvm" assert host_1 is None target = {cuda_device_type: Target(tvm.runtime.container.String("cuda"))} host = Target(tvm.runtime.container.String("llvm")) target = tvm.runtime.convert(target) assert isinstance(target, tvm.ir.container.Map) target_2, host_2 = Target.check_and_update_host_consist(target, host) assert isinstance(target_2, dict) assert target_2[cuda_device_type].kind.name == "cuda" assert host_2.kind.name == "llvm"
def build_and_run(inputs, func, target, target_host, *args, **kwargs): schedule, placeholders, binds = func(*args, **kwargs) func = tvm.build( schedule, placeholders, target=tvm.target.Target(target, host=target_host), binds=binds ) dev = tvm.device(target) tensors = [] for tensor in inputs: tensors.append(tvm.nd.array(tensor, dev)) tensors.append( tvm.nd.array( numpy.zeros([i.value for i in placeholders[-1].shape], dtype=placeholders[-1].dtype), dev, ) ) func(*tensors) return tensors[-1].asnumpy()
def check_device(device, host="llvm"): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return freduce = tvm.build( s, args=[A, B], target=tvm.target.Target(device, host), name="myreduce" ) # launch the kernel. n = 1028 m = 129 x = tvm.nd.array(np.random.uniform(size=(n, m)).astype(A.dtype), dev) y = tvm.nd.array(np.zeros(n, dtype=B.dtype), dev) freduce(x, y) npy = y.numpy() npy[:2] = 0 res = np_reducer(x.numpy(), axis=1) res[:2] = 0 tvm.testing.assert_allclose(npy, res, rtol=1e-4)
def test_double_splitting_with_indivisible_factors(): m = 48 dtype = "float32" A = te.placeholder((m,), name="A", dtype=dtype) C = te.compute((m,), lambda i: A[i], name="C") D = te.compute((m,), lambda i: C[i], name="D") s = te.create_schedule(D.op) co, ci = s[C].split(C.op.axis[0], factor=10) do, di = s[D].split(D.op.axis[0], 32) s[C].compute_at(s[D], do) target = "llvm" with tvm.transform.PassContext(config={"tir.LoopPartition": {"partition_const_loop": True}}): f = tvm.lower(s, [A, C, D], name="fadd1", simple_mode=False) func = tvm.build(f, target=target) top_produce = f["fadd1"].body assert not any(collect_visit(top_produce, lambda x: isinstance(x, tvm.tir.IfThenElse))) # check functional correctness of generated code dev = tvm.device(target, 0) a = tvm.nd.array( numpy.ones( m, ).astype(dtype), dev, ) c = tvm.nd.array( numpy.zeros( m, ).astype(dtype), dev, ) d = tvm.nd.array( numpy.zeros( m, ).astype(dtype), dev, ) func(a, c, d) tvm.testing.assert_allclose(c.numpy(), a.numpy(), rtol=1e-5) tvm.testing.assert_allclose(d.numpy(), a.numpy(), rtol=1e-5)
def profile_and_build(mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", use_fast_math=False): mod = partition_for_cutlass(mod) mod, num_cutlass_partition = tune_cutlass_kernels( mod, sm, profile_all=False, use_multiprocessing=False, tmp_dir=tmp_dir) with tvm.transform.PassContext(opt_level=3): lib = relay.build(mod, target="cuda", params=params) lib = build_cutlass_kernels(lib, sm, tmp_dir, lib_path, use_fast_math=use_fast_math) dev = tvm.device("cuda", 0) rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) return rt_mod, dev, num_cutlass_partition
def check_device(device): dev = tvm.device(device, 0) print("Running on target: %s" % device) with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch( device, _conv3d_ndhwc_tensorcore_implement) C = fcompute(A, W, stride, padding, dilation, 1, "float16") if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) s = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) if add_bias: func = tvm.build( s, [A, W, bias, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, b, c) else: func = tvm.build( s, [A, W, C], device, name="relu_%d_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) func(a, w, c) # Tensorcores are very inaccurate, with large shapes, the accumulation # error is high especially away from 1. We disable atol as it is very # large for these numbers that are far away from 1. tvm.testing.assert_allclose(c.numpy(), c_np, atol=1e200, rtol=0.01)
def _get_targets(): target_str = os.environ.get("TVM_TEST_TARGETS", "") if len(target_str) == 0: target_str = DEFAULT_TEST_TARGETS targets = set() for dev in target_str.split(";"): if len(dev) == 0: continue target_kind = dev.split()[0] if tvm.runtime.enabled(target_kind) and tvm.device(target_kind, 0).exist: targets.add(dev) if len(targets) == 0: logging.warning( "None of the following targets are supported by this build of TVM: %s." " Try setting TVM_TEST_TARGETS to a supported target. Defaulting to llvm.", target_str, ) return {"llvm"} return targets
def test_gemm_tensorcore(): """Test running gemm on tensorcore.""" dev = tvm.device("cuda", 0) a_np = np.random.uniform(size=(1024, 1024)).astype("float16") b_np = np.random.uniform(size=(1024, 1024)).astype("float16") c_np = np.dot(a_np.astype("float32"), b_np.T.astype("float32")) buff_a = tvm.nd.array(a_np, dev) buff_b = tvm.nd.array(b_np, dev) buff_c = tvm.nd.array(np.zeros((1024, 1024), dtype="float32"), dev) myfunc = tvm.build(tensorcore_gemm, target="cuda", name="dense") myfunc(buff_a, buff_b, buff_c) tvm.testing.assert_allclose(buff_c.numpy(), c_np, rtol=1e-3) evaluator = myfunc.time_evaluator(myfunc.entry_name, dev, number=100) time_elapsed = evaluator(buff_a, buff_b, buff_c).mean num_flops = 2 * 1024 * 1024 * 1024 gflops = num_flops / (time_elapsed * 1e3) / 1e6 print("gemm with tensor core: %f ms" % (time_elapsed * 1e3)) print("GFLOPS: %f" % gflops)
def check_device(device): if not tvm.testing.device_enabled(device): return dev = tvm.device(device, 0) s = te.create_schedule(D.op) for stage in [C, D]: xo, xi = s[stage].split(stage.op.axis[0], factor=4) s[stage].bind(xo, te.thread_axis("blockIdx.x")) s[stage].bind(xi, te.thread_axis("threadIdx.x")) f = tvm.build(s, [A, B, D], device) a_np = np.random.uniform(size=n).astype(A.dtype) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) d = tvm.nd.array(np.zeros(n, dtype=D.dtype), dev) f(a, b, d) np.testing.assert_equal( d.numpy(), np.logical_and(a.numpy() > b.numpy(), a.numpy() > 1).astype("float32"), )
def check_device(device): if not tvm.runtime.module.enabled(device): print("Skip because %s is not enabled" % device) return ctx = tvm.device(device, 0) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), ctx) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), ctx) with tvm.build_config(auto_unroll_max_step=128, unroll_explicit=device == 'rocm'): func1 = tvm.build(s1, [A, W, B], device) func1(a, w, b) tvm.testing.assert_allclose(b.asnumpy(), b_np, rtol=1e-5) func2 = tvm.build(s2, [A, W, C], device) func2(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5)
def check_device(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return fadd = tvm.build(s, [A, B, C], device, name="myadd") # launch the kernel. n = 1024 a = tvm.nd.array((np.random.uniform(size=n) * 256).astype(A.dtype), dev) b = tvm.nd.array((np.random.uniform(size=n) * 256).astype(B.dtype), dev) c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev) ftimer = fadd.time_evaluator(fadd.entry_name, dev, number=1) tcost = ftimer(a, b, c).mean tvm.testing.assert_allclose(c.asnumpy(), a.asnumpy() + b.asnumpy(), rtol=1e-6)
def __init__(self, model, train_loader, num_classes, criterion, lr: Union[float, Callable[[int], float]], # lr(epoch,) -> lr debug_mode=False, print_freq=1000, target='llvm', dtype='float64'): self.model = model self.train_loader = train_loader self.num_classes = num_classes self.criterion = criterion self.lr = lr if isinstance(lr, float) else lr(0) self._lr_func = lr if not isinstance(lr, float) else lambda epoch: lr self.debug_mode = debug_mode self.print_freq = print_freq self.target = target self.dtype = dtype self.ctx = tvm.device(target) self._build_func() self._allocate_buffers_for_endpoints() self._initialize_weights()
def test_opencl_ternary_expression(): def check_if_then_else(dev, n, dtype): A = te.placeholder((n, ), name="A", dtype=dtype) true_value = tvm.tir.const(1, dtype=dtype) false_value = tvm.tir.const(3, dtype=dtype) max_lhs = tvm.tir.const(2, dtype=dtype) max_rhs = tvm.tir.if_then_else(A[0] > 0, true_value, false_value) C = te.compute((n, ), lambda i: tvm.te.max(max_lhs, max_rhs), name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n, ), A.dtype, dev) c = tvm.nd.empty((n, ), A.dtype, dev) # Only need to test compiling here fun(a, c) def check_select(dev, n, dtype): A = te.placeholder((n, ), name="A", dtype=dtype) true_value = tvm.tir.const(1, dtype=dtype) false_value = tvm.tir.const(3, dtype=dtype) max_lhs = tvm.tir.const(2, dtype=dtype) max_rhs = tvm.tir.Select(A[0] > 0, true_value, false_value) C = te.compute((n, ), lambda i: tvm.te.max(max_lhs, max_rhs), name="C") s = te.create_schedule(C.op) s[C].bind(s[C].op.axis[0], te.thread_axis("threadIdx.x")) fun = tvm.build(s, [A, C], target) a = tvm.nd.empty((n, ), A.dtype, dev) c = tvm.nd.empty((n, ), A.dtype, dev) # Only need to test compiling here fun(a, c) dev = tvm.device(target, 0) check_if_then_else(dev, 1, "int8") check_if_then_else(dev, 1, "uint8") check_if_then_else(dev, 1, "int16") check_if_then_else(dev, 1, "uint16") check_select(dev, 1, "int8") check_select(dev, 1, "uint8") check_select(dev, 1, "int16") check_select(dev, 1, "uint16")
def check_target(target, dev): dev = tvm.device(target, 0) with tvm.target.Target(target): fcompute, fschedule = tvm.topi.testing.dispatch(target, _conv1d_transpose_ncw_implement) B = fcompute(A, W, stride, padding, A.dtype, output_padding) C = topi.nn.relu(B) s1 = fschedule([B]) s2 = fschedule([C]) a = tvm.nd.array(a_np, dev) w = tvm.nd.array(w_np, dev) b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=B.dtype), dev) c = tvm.nd.array(np.zeros(get_const_tuple(C.shape), dtype=C.dtype), dev) func1 = tvm.build(s1, [A, W, B], target) func2 = tvm.build(s2, [A, W, C], target) func1(a, w, b) func2(a, w, c) tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5) tvm.testing.assert_allclose(c.numpy(), c_np, rtol=1e-5)
def test_stable_sort_by_key(): size = 6 keys = te.placeholder((size, ), name="keys", dtype="int32") values = te.placeholder((size, ), name="values", dtype="int32") keys_out, values_out = stable_sort_by_key_thrust(keys, values) for target in ["cuda", "rocm"]: if not tvm.testing.device_enabled(target): print("Skip because %s is not enabled" % target) continue with tvm.target.Target(target + " -libs=thrust") as tgt: if not thrust_check_func[target]( tgt, "tvm.contrib.thrust.stable_sort_by_key"): print("skip because thrust is not enabled...") return dev = tvm.device(target, 0) s = te.create_schedule([keys_out.op, values_out.op]) f = tvm.build(s, [keys, values, keys_out, values_out], target) keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32) values_np = np.random.randint(0, 10, size=(size, )).astype(np.int32) keys_np_out = np.zeros(keys_np.shape, np.int32) values_np_out = np.zeros(values_np.shape, np.int32) keys_in = tvm.nd.array(keys_np, dev) values_in = tvm.nd.array(values_np, dev) keys_out = tvm.nd.array(keys_np_out, dev) values_out = tvm.nd.array(values_np_out, dev) f(keys_in, values_in, keys_out, values_out) ref_keys_out = np.sort(keys_np) ref_values_out = np.array( [values_np[i] for i in np.argsort(keys_np)]) tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5) tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5)
def check_graph_runtime(target, ref_res, device, func, params, config, opt_level, expected_index=None): with tvm.transform.PassContext(opt_level=opt_level, config=config): graph, lib, new_params = relay.build(func, target, params=params) contexts = [tvm.cpu(0), tvm.device(device)] graph_json = json.loads(graph) if "device_index" in graph_json["attrs"]: device_index = graph_json["attrs"]["device_index"][1] assert device_index == expected_index mod = graph_runtime.create(graph, lib, contexts) mod.set_input(**new_params) mod.run() res = mod.get_output(0).asnumpy() tvm.testing.assert_allclose(res, ref_res, rtol=1e-5, atol=1e-5)
def check_target(device): dev = tvm.device(device, 0) if not tvm.testing.device_enabled(device): print("skip because %s is not enabled.." % device) return fapi = tvm.lower(s, args=[A0, A1, B0, B1]) fargmax = tvm.build(fapi, target=device, name="argmax") np_idx = np.repeat(np.arange(nn, dtype="int32").reshape(1, nn), mm, axis=0) np_val = np.random.uniform(size=(mm, nn)).astype("float32") np_res = np.argmax(np_val, axis=1) nd_idx = tvm.nd.array(np_idx, dev) nd_val = tvm.nd.array(np_val, dev) nd_res0 = tvm.nd.array(np.zeros(mm, dtype="int32"), dev) nd_res1 = tvm.nd.array(np.zeros(mm, dtype="float32"), dev) fargmax(nd_idx, nd_val, nd_res0, nd_res1) tvm.testing.assert_allclose(np_res, nd_res0.numpy())
def check_target(target, ir): if not tvm.testing.device_enabled(target): return C = te.extern( shape, [], lambda ins, outs: ir(outs[0]), name="mandel_ir", dtype="float32", ) s = te.create_schedule(C.op) with tvm.transform.PassContext(opt_level=3): func = tvm.build(s, [C], target) dev = tvm.device(target, 0) c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), dev) func(c) tvm.testing.assert_allclose(c.numpy(), ref, rtol=1e-5, atol=1e-5)
def verify_with_input(sorted_sequence_np, values_np, right): sorted_sequence = te.placeholder(sorted_sequence_np.shape, dtype="float32") values = te.placeholder(values_np.shape, dtype="float32") out_dtype = "int32" implementations = get_implementations() fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) with tvm.target.Target(target): indices = fcompute(sorted_sequence, values, right, out_dtype) s = fschedule([indices]) func = tvm.build(s, [sorted_sequence, values, indices], target=target) dev = tvm.device(target, 0) a = tvm.nd.array(sorted_sequence_np, dev) b = tvm.nd.array(values_np, dev) c = tvm.nd.array(np.zeros(values_np.shape, dtype=indices.dtype), dev) func(a, b, c) ref = searchsorted_ref(sorted_sequence_np, values_np, right, out_dtype) np.testing.assert_equal(c.numpy(), ref)
def check_device(device): dev = tvm.device(device, 0) if device == "cuda" and not tvm.contrib.nvcc.have_int8( dev.compute_version): print("Skip because int8 intrinsics are not available") return print("Running on target: %s" % device) with tvm.target.Target(device): D = topi.cuda.dense_int8(A, B, C if use_bias else None, out_dtype) D = topi.nn.relu(D) s = topi.cuda.schedule_dense_int8([D]) a = tvm.nd.array(a_np, dev) b = tvm.nd.array(b_np, dev) c = tvm.nd.array(c_np, dev) d = tvm.nd.array(np.zeros(get_const_tuple(D.shape), dtype=out_dtype), dev) f = tvm.build(s, [A, B, C, D], device, name="dense") f(a, b, c, d) tvm.testing.assert_allclose(d.numpy(), d_np, rtol=1e-5)
def enabled_targets(): """Get all enabled targets with associated contexts. In most cases, you should use :py:func:`tvm.testing.parametrize_targets` instead of this function. In this context, enabled means that TVM was built with support for this target and the target name appears in the TVM_TEST_TARGETS environment variable. If TVM_TEST_TARGETS is not set, it defaults to variable DEFAULT_TEST_TARGETS in this module. If you use this function in a test, you **must** decorate the test with :py:func:`tvm.testing.uses_gpu` (otherwise it will never be run on the gpu). Returns ------- targets: list A list of pairs of all enabled devices and the associated context """ return [(tgt, tvm.device(tgt)) for tgt in _get_targets()]