def test_ctx():
    """Check context equality, both directly and through the internal test hook.

    First passes ``tvm.gpu(7)`` through a local Python function that
    returns ``tvm.cpu(0)``; then sends ``tvm.opencl(10)`` through
    ``tvm._api_internal._context_test`` and expects an equal context back.
    """

    def _expect_gpu7_return_cpu0(ctx):
        # The argument must compare equal to a freshly built gpu(7) context.
        assert tvm.gpu(7) == ctx
        return tvm.cpu(0)

    returned = _expect_gpu7_return_cpu0(tvm.gpu(7))
    assert returned == tvm.cpu(0)

    cl_ctx = tvm.opencl(10)
    echoed = tvm._api_internal._context_test(
        cl_ctx, cl_ctx.device_type, cl_ctx.device_id
    )
    assert echoed == tvm.opencl(10)
def test_ctx():
    """Check context equality, both directly and through the testing hook.

    First passes ``tvm.gpu(7)`` through a local Python function that
    returns ``tvm.cpu(0)``; then sends ``tvm.opencl(10)`` through
    ``tvm.testing.context_test`` and expects an equal context back.
    """

    def _expect_gpu7_return_cpu0(ctx):
        # The argument must compare equal to a freshly built gpu(7) context.
        assert tvm.gpu(7) == ctx
        return tvm.cpu(0)

    returned = _expect_gpu7_return_cpu0(tvm.gpu(7))
    assert returned == tvm.cpu(0)

    cl_ctx = tvm.opencl(10)
    echoed = tvm.testing.context_test(
        cl_ctx, cl_ctx.device_type, cl_ctx.device_id
    )
    assert echoed == tvm.opencl(10)
def test_device():
    """Check device equality, both directly and through the testing hook.

    First passes ``tvm.gpu(7)`` through a local Python function that
    returns ``tvm.cpu(0)``; then sends ``tvm.opencl(10)`` through
    ``tvm.testing.device_test`` and expects an equal device back.
    """

    def _expect_gpu7_return_cpu0(dev):
        # The argument must compare equal to a freshly built gpu(7) device.
        assert tvm.gpu(7) == dev
        return tvm.cpu(0)

    returned = _expect_gpu7_return_cpu0(tvm.gpu(7))
    assert returned == tvm.cpu(0)

    cl_dev = tvm.opencl(10)
    echoed = tvm.testing.device_test(
        cl_dev, cl_dev.device_type, cl_dev.device_id
    )
    assert echoed == tvm.opencl(10)
def build_run_compare(tvm_mod, params1, input_shape, dtype="float32", target="llvm"):
    """Build a Relay module, run it on an OpenCL device and check its outputs.

    The module is built with ``relay.build`` at ``opt_level=3``, executed
    through ``graph_runtime`` with random normal inputs, and every output is
    compared against the result of ``get_reference`` using
    ``rtol=1e-2, atol=1e-2``.

    When both ``TVM_TRACKER_HOST`` and ``TVM_TRACKER_PORT`` are set to
    non-empty values, the library is cross-compiled for
    ``arm64-linux-android``, uploaded through the RPC tracker (key
    ``"android"``) and run on ``remote.cl(0)``.  Otherwise it is built for the
    local host and run on ``tvm.opencl()``.

    Parameters
    ----------
    tvm_mod
        Module handed to ``relay.build`` and to ``get_reference``.
    params1
        Parameters handed to ``relay.build`` and to ``get_reference``.
    input_shape
        Either a dict mapping input name to shape, or a single shape that is
        bound to the input named ``"data"``.
    dtype : str
        Dtype the random inputs are cast to.
    target
        Target passed to ``relay.build``.

    Raises
    ------
    AssertionError
        If an output differs from the reference beyond the tolerance.
    """
    # Use .get so a missing variable selects the local path instead of raising
    # KeyError; indexing os.environ directly made the local branch unreachable
    # unless the variable was explicitly set to an empty string.
    rpc_tracker_host = os.environ.get("TVM_TRACKER_HOST")
    rpc_tracker_port = os.environ.get("TVM_TRACKER_PORT")
    run_on_host = not (rpc_tracker_host and rpc_tracker_port)

    if run_on_host:
        target_host = "llvm"
    else:
        target_host = "llvm -mtriple=arm64-linux-android"
        rpc_tracker_port = int(rpc_tracker_port)

    with relay.build_config(opt_level=3):
        graph, lib, params = relay.build(
            tvm_mod, target_host=target_host, target=target, params=params1
        )

    if run_on_host:
        ctx = tvm.opencl()
        m = graph_runtime.create(graph, lib, ctx)
    else:
        from tvm import rpc
        from tvm.contrib import utils, ndk

        rpc_key = "android"
        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
        remote = tracker.request(rpc_key, priority=0, session_timeout=600)

        # Export the compiled library with the NDK toolchain, push it to the
        # remote device and load it back as a remote module.
        temp = utils.tempdir()
        dso_binary = "dev_lib_cl.so"
        dso_binary_path = temp.relpath(dso_binary)
        ctx = remote.cl(0)
        lib.export_library(dso_binary_path, ndk.create_shared)
        remote.upload(dso_binary_path)
        rlib = remote.load_module(dso_binary)
        m = graph_runtime.create(graph, rlib, ctx)

    m.set_input(**params)

    # Keep the generated inputs so the reference run sees the same data.
    inputs = []
    if isinstance(input_shape, dict):
        for key in input_shape:
            inputs.append(np.random.normal(size=input_shape[key]).astype(dtype))
            m.set_input(key, inputs[-1])
    else:
        inputs.append(np.random.normal(size=input_shape).astype(dtype))
        m.set_input("data", inputs[-1])
    m.run()

    ref_outputs = get_reference(tvm_mod, params1, input_shape, inputs)
    for i, ref_output in enumerate(ref_outputs):
        output = m.get_output(i).asnumpy()
        np.testing.assert_allclose(output, ref_output, rtol=1e-2, atol=1e-2)
def enabled_ctx_list():
    """Return the device-0 contexts whose ``exist`` attribute is truthy.

    Candidates are cpu, gpu, cl, metal, rocm and vpi.  Before filtering, each
    one is checked to compare equal to ``tvm.context(name, 0)``.
    """
    named_contexts = [
        ("cpu", tvm.cpu(0)),
        ("gpu", tvm.gpu(0)),
        ("cl", tvm.opencl(0)),
        ("metal", tvm.metal(0)),
        ("rocm", tvm.rocm(0)),
        ("vpi", tvm.vpi(0)),
    ]
    # The string name and the dedicated constructor must agree.
    for name, ctx in named_contexts:
        assert tvm.context(name, 0) == ctx
    return [ctx for _, ctx in named_contexts if ctx.exist]
def enabled_ctx_list():
    """Return the device-0 contexts whose ``exist`` attribute is truthy.

    Candidates are cpu, gpu, cl, metal, rocm, vulkan and vpi.  Before
    filtering, each one is checked to compare equal to
    ``tvm.context(name, 0)``.
    """
    named_contexts = [
        ("cpu", tvm.cpu(0)),
        ("gpu", tvm.gpu(0)),
        ("cl", tvm.opencl(0)),
        ("metal", tvm.metal(0)),
        ("rocm", tvm.rocm(0)),
        ("vulkan", tvm.vulkan(0)),
        ("vpi", tvm.vpi(0)),
    ]
    # The string name and the dedicated constructor must agree.
    for name, ctx in named_contexts:
        assert tvm.context(name, 0) == ctx
    return [ctx for _, ctx in named_contexts if ctx.exist]
def requires_gpu(*args):
    """Mark a test as requiring a GPU to run.

    The test is skipped with reason "No GPU present" unless at least one of
    the CUDA, ROCm, OpenCL, Metal or Vulkan devices reports ``exist``.  The
    marks returned by ``uses_gpu()`` are applied as well.

    Parameters
    ----------
    *args
        Passed through unchanged to ``_compose`` together with the marks
        built here (presumably the function to mark -- confirm against
        ``_compose``).
    """
    # Same short-circuit order as checking each backend in turn.
    any_gpu_present = (
        tvm.cuda().exist
        or tvm.rocm().exist
        or tvm.opencl().exist
        or tvm.metal().exist
        or tvm.vulkan().exist
    )
    skip_without_gpu = pytest.mark.skipif(
        not any_gpu_present,
        reason="No GPU present",
    )
    _requires_gpu = [skip_without_gpu, *uses_gpu()]
    return _compose(args, _requires_gpu)
def build_run_compare(
    tvm_mod,
    params1,
    input_shape,
    dtype="float32",
    target="llvm",
    static_mem_scopes=None,
    gpu_preprocess=None,
    stat_file=None,
):
    """Build a Relay module, run it on an OpenCL device and check its outputs.

    The module (optionally transformed by ``gpu_preprocess``) is built with
    ``relay.build`` under ``PassContext(opt_level=3)``, executed through
    ``graph_runtime`` with random normal inputs, and every output is compared
    against ``get_cpu_reference`` using ``rtol=1e-1, atol=1e-1``.

    When both ``TVM_TRACKER_HOST`` and ``TVM_TRACKER_PORT`` are present in the
    environment, the library is cross-compiled for ``arm64-linux-android``,
    uploaded through the RPC tracker (key ``"android"``) and run on
    ``remote.cl(0)``.  Otherwise it is built for the local host and run on
    ``tvm.opencl()``.

    Parameters
    ----------
    tvm_mod
        Module to build; the untransformed module is also what the reference
        run receives.
    params1
        Parameters handed to ``relay.build`` and to ``get_cpu_reference``.
    input_shape
        Either a dict mapping input name to shape, or a single shape that is
        bound to the input named ``"data"``.
    dtype : str
        Dtype the random inputs are cast to.
    target
        Target passed to ``relay.build``.
    static_mem_scopes : list, optional
        Expected entries of ``attrs["storage_scope"][1]`` in the graph JSON.
        ``None`` or an empty list skips the element-wise comparison.
    gpu_preprocess : callable, optional
        Applied to ``tvm_mod`` before building when given.
    stat_file : optional
        When not ``None``, the build runs inside
        ``autotvm.apply_history_best(stat_file)``.

    Returns
    -------
    The graph returned by ``relay.build`` (parsed here with ``json.loads``).

    Raises
    ------
    AssertionError
        If the storage scopes or any output do not match expectations.
    """
    # None replaces the former mutable ``[]`` default; behaviour is the same
    # for every existing caller.
    if static_mem_scopes is None:
        static_mem_scopes = []

    run_on_host = not (
        "TVM_TRACKER_HOST" in os.environ and "TVM_TRACKER_PORT" in os.environ
    )
    if run_on_host:
        target_host = "llvm"
    else:
        rpc_tracker_host = os.environ["TVM_TRACKER_HOST"]
        rpc_tracker_port = int(os.environ["TVM_TRACKER_PORT"])
        target_host = "llvm -mtriple=arm64-linux-android"

    tvm_mod_nchwc = gpu_preprocess(tvm_mod) if gpu_preprocess else tvm_mod

    def _build():
        # Single build path shared by the tuned and untuned cases.
        with tvm.transform.PassContext(opt_level=3):
            return relay.build(
                tvm_mod_nchwc, target_host=target_host, target=target, params=params1
            )

    if stat_file is not None:
        with autotvm.apply_history_best(stat_file):
            graph, lib, params = _build()
    else:
        graph, lib, params = _build()

    # verification that storage_scope has expected textures scopes
    graph_attrs = json.loads(graph)["attrs"]
    if "storage_scope" in graph_attrs:
        actual_scopes = graph_attrs["storage_scope"][1]
        assert (
            len(static_mem_scopes) == len(actual_scopes)
            or len(static_mem_scopes) == 0
        )
        for expected_scope, actual_scope in zip(static_mem_scopes, actual_scopes):
            assert expected_scope == actual_scope
    else:
        assert len(static_mem_scopes) == 0

    if run_on_host:
        ctx = tvm.opencl()
        m = graph_runtime.create(graph, lib, ctx)
    else:
        from tvm import rpc
        from tvm.contrib import utils, ndk

        rpc_key = "android"
        tracker = rpc.connect_tracker(rpc_tracker_host, rpc_tracker_port)
        remote = tracker.request(rpc_key, priority=0, session_timeout=600)

        # Export the compiled library with the NDK toolchain, push it to the
        # remote device and load it back as a remote module.
        temp = utils.tempdir()
        dso_binary = "dev_lib_cl.so"
        dso_binary_path = temp.relpath(dso_binary)
        ctx = remote.cl(0)
        lib.export_library(dso_binary_path, ndk.create_shared)
        remote.upload(dso_binary_path)
        rlib = remote.load_module(dso_binary)
        m = graph_runtime.create(graph, rlib, ctx)

    m.set_input(**params)

    # Keep the generated inputs so the reference run sees the same data.
    inputs = []
    if isinstance(input_shape, dict):
        for key in input_shape:
            inputs.append(np.random.normal(size=input_shape[key]).astype(dtype))
            m.set_input(key, inputs[-1])
    else:
        inputs.append(np.random.normal(size=input_shape).astype(dtype))
        m.set_input("data", inputs[-1])
    m.run()

    ref_outputs = get_cpu_reference(tvm_mod, params1, input_shape, inputs)
    for i, ref_output in enumerate(ref_outputs):
        output = m.get_output(i).asnumpy()
        np.testing.assert_allclose(output, ref_output, rtol=1e-1, atol=1e-1)
    return graph
# Look up the configuration the dispatch context holds for this task's
# target and workload, and show it.
best_config = dispatch_context.query(task.target, task.workload)
print("\nBest config:")
print(best_config)

# Apply the best history record from the log file while building the
# operator for the OpenCL target.
with autotvm.apply_history_best("conv2d-lenet-1.log"):
    with tvm.target.Target("opencl"):
        s, arg_bufs = conv2d_no_batching(N, H, W, CO, CI, KH, KW, strides, padding)
        func = tvm.build(s, arg_bufs)

# Check correctness: random float32 input and weights, with the result of
# conv2d_nchw_python used as the expected output.
a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
c_np = conv2d_nchw_python(a_np, w_np, strides, padding)

# Place the operands and an empty output buffer on the OpenCL device.
ctx = tvm.opencl()
a_tvm = tvm.nd.array(a_np, ctx=ctx)
w_tvm = tvm.nd.array(w_np, ctx=ctx)
c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
func(a_tvm, w_tvm, c_tvm)

tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)

# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
# and the overhead of kernel launch. You can also use nvprof to validate the result.
# NOTE(review): nvprof is a CUDA profiler while this script targets OpenCL --
# confirm whether that hint still applies here.
evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
print("Time cost of this operator: %f" % evaluator(a_tvm, w_tvm, c_tvm).mean)

# Dump the source of the first imported module to a .cl file for inspection.
with open("conv2d-lenet-1.cl", "w") as fout:
    print(func.imported_modules[0].get_source(), file=fout)