def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug):
    """Test AutoTVM tuning of a conv2d operator for microTVM Zephyr."""
    if board != "qemu_x86":
        pytest.xfail(f"Autotune fails on {board}.")

    runtime = Runtime("crt", {"system-lib": True})
    model = test_utils.ZEPHYR_BOARDS[board]
    build_config = {"debug": tvm_debug}

    # Create a Relay model
    data_shape = (1, 3, 16, 16)
    weight_shape = (8, 3, 5, 5)
    data = relay.var("data", relay.TensorType(data_shape, "float32"))
    weight = relay.var("weight", relay.TensorType(weight_shape, "float32"))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        kernel_layout="OIHW",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    mod = tvm.IRModule.from_expr(f)
    mod = relay.transform.InferType()(mod)

    data_sample = np.random.rand(*data_shape).astype("float32")
    weight_sample = np.random.rand(*weight_shape).astype("float32")
    params = {mod["main"].params[1].name_hint: weight_sample}

    target = tvm.target.target.micro(model)
    pass_context = tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True})
    with pass_context:
        tasks = tvm.autotvm.task.extract_from_program(mod["main"], {}, target)
    assert len(tasks) > 0

    config_main_stack_size = None
    if test_utils.qemu_boards(board):
        config_main_stack_size = 1536

    project_options = {
        "zephyr_board": board,
        "west_cmd": west_cmd,
        "verbose": 1,
        "project_type": "host_driven",
    }
    if config_main_stack_size is not None:
        project_options["config_main_stack_size"] = config_main_stack_size

    module_loader = tvm.micro.AutoTvmModuleLoader(
        template_project_dir=test_utils.TEMPLATE_PROJECT_DIR,
        project_options=project_options,
    )

    timeout = 200
    builder = tvm.autotvm.LocalBuilder(
        timeout=timeout,
        n_parallel=1,
        build_kwargs={"build_option": {"tir.disable_vectorize": True}},
        do_fork=True,
        build_func=tvm.micro.autotvm_build_func,
        runtime=runtime,
    )
    runner = tvm.autotvm.LocalRunner(
        number=1, repeat=1, timeout=timeout, module_loader=module_loader
    )
    measure_option = tvm.autotvm.measure_option(builder=builder, runner=runner)

    log_path = pathlib.Path("zephyr_autotune.log")
    if log_path.exists():
        log_path.unlink()

    n_trial = 10
    for task in tasks:
        tuner = tvm.autotvm.tuner.GATuner(task)
        tuner.tune(
            n_trial=n_trial,
            measure_option=measure_option,
            callbacks=[
                tvm.autotvm.callback.log_to_file(str(log_path)),
                tvm.autotvm.callback.progress_bar(n_trial, si_prefix="M"),
            ],
            si_prefix="M",
        )
        assert tuner.best_flops > 0

    check_tune_log(log_path)

    # Build without tuning
    with pass_context:
        lowered = tvm.relay.build(mod, target=target, runtime=runtime, params=params)

    temp_dir = utils.tempdir()
    with _make_session(temp_dir, board, west_cmd, lowered, build_config) as session:
        graph_mod = tvm.micro.create_local_graph_executor(
            lowered.get_graph_json(), session.get_system_lib(), session.device
        )
        graph_mod.set_input(**lowered.get_params())
        graph_mod.run(data=data_sample)
        expected_output = graph_mod.get_output(0).numpy()
        del graph_mod

    # Build using autotune logs
    with tvm.autotvm.apply_history_best(str(log_path)):
        with pass_context:
            lowered_tuned = tvm.relay.build(mod, target=target, runtime=runtime, params=params)

    temp_dir = utils.tempdir()
    with _make_session(temp_dir, board, west_cmd, lowered_tuned, build_config) as session:
        graph_mod = tvm.micro.create_local_graph_executor(
            lowered_tuned.get_graph_json(), session.get_system_lib(), session.device
        )
        graph_mod.set_input(**lowered_tuned.get_params())
        graph_mod.run(data=data_sample)
        output = graph_mod.get_output(0).numpy()
        del graph_mod

    tvm.testing.assert_allclose(output, expected_output, rtol=1e-4, atol=1e-5)
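# The check_tune_log helper used above is not shown in this file. A minimal
# sketch, assuming each log line is a JSON record in the format written by
# tvm.autotvm.callback.log_to_file (the "result" entry holds the measured
# costs, with a large sentinel value on failure):


def check_tune_log(log_path):
    """Read the tuning log and check that each record reports a valid result."""
    import json

    with open(log_path) as f:
        lines = f.readlines()

    for line in lines:
        if len(line.strip()) > 0:
            tune_result = json.loads(line)
            assert tune_result["result"][0][0] < 1000000000.0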
def compile_and_run(
    mod, input_list, output_list, use_calculated_workspaces, params=None, workspace_byte_alignment=8
):
    """Compile the module to AOT C source, then build and run the generated test binary."""
    target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment}"
    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "

    # The calculated workspaces will not account for stack allocator tags used for debugging
    if not use_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod, target, target_host=target, params=params)

    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    tar_file = os.path.join(base_path, "test.tar")
    export_model_library_format(lib, tar_file)
    t = tarfile.open(tar_file)
    t.extractall(base_path)

    if use_calculated_workspaces:
        workspace_bytes = extract_main_workspace_sizebytes(base_path)
    else:
        workspace_bytes = 16384 * 1024

    for i in range(len(input_list)):
        create_header_file(f"input_data{i}", input_list[i], build_path)

    for i in range(len(output_list)):
        create_header_file(
            f"output_data{i}",
            np.zeros(output_list[i].shape, output_list[i].dtype),
            build_path,
        )
        create_header_file(f"expected_output_data{i}", output_list[i], build_path)

    create_main("test.c", input_list, output_list, build_path, workspace_bytes)

    # Verify that it compiles
    file_dir = os.path.dirname(os.path.abspath(__file__))
    makefile = os.path.join(file_dir, "aot_test.mk")
    make_cmd = (
        f"make CFLAGS='{cflags}' -f {makefile} build_dir={build_path}"
        f" TVM_ROOT={file_dir}/../../../.."
    )
    compile_log_path = os.path.join(build_path, "test_compile.log")
    ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path, False)
    assert ret == 0

    # Verify that it runs
    run_log_path = os.path.join(build_path, "test_run.log")
    ret = subprocess_with_stdout_and_log("./aot_test_runner", build_path, run_log_path, False)
    assert ret == 0
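# A hedged usage sketch of compile_and_run (the tiny Relay module and golden
# output below are illustrative placeholders, not part of the original test):


def example_compile_and_run_usage():
    x = relay.var("x", shape=(1, 8), dtype="float32")
    y = relay.var("y", shape=(1, 8), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x, y], relay.add(x, y)))
    mod = relay.transform.InferType()(mod)

    a = np.random.rand(1, 8).astype("float32")
    b = np.random.rand(1, 8).astype("float32")
    # The expected output doubles as the golden reference checked by the runner.
    compile_and_run(mod, input_list=[a, b], output_list=[a + b], use_calculated_workspaces=True)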
# TVM Compilation and Verification
# --------------------------------
# After specifying the schedule, we can compile it into a TVM function.
# We save the module so we can send it over RPC.
# We run the function and verify it against a numpy implementation to
# ensure correctness.

# This library facilitates 2D convolution testing
from tvm.topi.testing import conv2d_nchw_python

# Compile the TVM module
with vta.build_config(disabled_pass={"tir.CommonSubexprElimTIR"}):
    my_conv = vta.build(
        s, [data, kernel, res], tvm.target.Target("ext_dev", host=env.target_host), name="my_conv"
    )
temp = utils.tempdir()
my_conv.save(temp.relpath("conv2d.o"))
remote.upload(temp.relpath("conv2d.o"))
f = remote.load_module("conv2d.o")

# Get the remote device context
ctx = remote.ext_dev(0)

# Initialize the data and kernel arrays randomly in the int range
# of [-128, 128) in NCHW layout
data_np = np.random.randint(
    -128, 128, size=(batch_size, in_channels, height, width)
).astype(data.dtype)
kernel_np = np.random.randint(
    -128, 128, size=(out_channels, in_channels, kernel_h, kernel_w)
).astype(kernel.dtype)
def run_conv2d(env, remote, wl, target, check_correctness=True, print_ir=False, samples=4):
    # Workload assertions
    assert wl.hpad == wl.wpad

    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
        layout = "NCHW"
        conv2d_fcompute = topi.arm_cpu.conv2d_nchw_spatial_pack
        conv2d_fschedule = topi.arm_cpu.schedule_conv2d_nchw_spatial_pack
    elif "vta" in target.keys:
        data_pack = True
        layout = "NCHW%dn%dc" % (env.BATCH, env.BLOCK_IN)
        conv2d_fcompute = vta.top.conv2d_packed
        conv2d_fschedule = vta.top.schedule_conv2d_packed

    # Derive shapes depending upon packing
    a_shape = (wl.batch, wl.in_filter, wl.height, wl.width)
    w_shape = (wl.out_filter, wl.in_filter, wl.hkernel, wl.wkernel)
    b_shape = (wl.batch, wl.out_filter, 1, 1)
    if data_pack:
        data_shape = (
            wl.batch // env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            wl.height,
            wl.width,
            env.BATCH,
            env.BLOCK_IN,
        )
        kernel_shape = (
            wl.out_filter // env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        bias_shape = (
            wl.batch // env.BATCH,
            wl.out_filter // env.BLOCK_OUT,
            1,
            1,
            env.BATCH,
            env.BLOCK_OUT,
        )
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        bias_shape = b_shape
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)
    bias = te.placeholder(bias_shape, name="bias", dtype=env.acc_dtype)
    padding = relay.nn.get_pad_tuple2d((wl.hpad, wl.wpad))

    # Define base computation schedule
    with target:
        if data_pack:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), layout, env.acc_dtype
            )
        else:
            res = conv2d_fcompute(
                data, kernel, (wl.hstride, wl.wstride), padding, (1, 1), env.acc_dtype
            )
        res = topi.right_shift(res, 8)
        res = topi.add(res, bias)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = conv2d_fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, bias, res], simple_mode=True))

    # Derive number of ops
    fout_height = (wl.height + 2 * wl.hpad - wl.hkernel) // wl.hstride + 1
    fout_width = (wl.width + 2 * wl.wpad - wl.wkernel) // wl.wstride + 1
    num_ops = (
        2
        * wl.batch
        * fout_height
        * fout_width
        * wl.hkernel
        * wl.wkernel
        * wl.out_filter
        * wl.in_filter
    )

    # @memoize("vta.tests.test_benchmark_topi.conv2d.verify_nhwc")
    def get_ref_data():
        # derive min max for act, wgt, and bias types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        b_min, b_max = (
            0 - 1 << (env.INP_WIDTH + env.WGT_WIDTH - 2),
            1 << (env.INP_WIDTH + env.WGT_WIDTH - 2),
        )
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        b_np = np.random.randint(b_min, b_max, size=b_shape).astype(env.acc_dtype)
        r_np = tvm.topi.testing.conv2d_nchw_python(
            a_np.astype(env.acc_dtype),
            w_np.astype(env.acc_dtype),
            (wl.hstride, wl.wstride),
            wl.hpad,
        ).astype(env.acc_dtype)
        return a_np, w_np, b_np, r_np

    # Data in original format
    data_np, kernel_np, bias_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            wl.batch // env.BATCH,
            env.BATCH,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.height,
            wl.width,
        ).transpose((0, 2, 4, 5, 1, 3))
        kernel_np = kernel_np.reshape(
            wl.out_filter // env.BLOCK_OUT,
            env.BLOCK_OUT,
            wl.in_filter // env.BLOCK_IN,
            env.BLOCK_IN,
            wl.hkernel,
            wl.wkernel,
        ).transpose((0, 2, 4, 5, 1, 3))
        bias_np = bias_np.reshape(
            wl.batch // env.BATCH, wl.out_filter // env.BLOCK_OUT, 1, 1, env.BATCH, env.BLOCK_OUT
        )

    # Build
    if "vta" in target.keys:
        mod = vta.build(
            s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d"
        )
    else:
        mod = tvm.build(
            s, [data, kernel, bias, res], target=target, target_host=env.target_host, name="conv2d"
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("conv2d.o"))
    remote.upload(temp.relpath("conv2d.o"))
    f = remote.load_module("conv2d.o")
    dev = remote.device(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, dev)
    kernel_arr = tvm.nd.array(kernel_np, dev)
    bias_arr = tvm.nd.array(bias_np, dev)
    res_arr = tvm.nd.array(res_np, dev)
    time_f = f.time_evaluator("conv2d", dev, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, bias_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.transpose((0, 4, 1, 5, 2, 3)).reshape(
                wl.batch, wl.out_filter, fout_height, fout_width
            )
            bias_np = bias_np.transpose((0, 4, 1, 5, 2, 3)).reshape(wl.batch, wl.out_filter, 1, 1)
        res_ref = res_ref >> env.WGT_WIDTH
        res_ref += bias_np
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s CONV2D TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
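# my_clip, used by the schedules above and below, is not defined in this file.
# A sketch of the two-stage clip helper the VTA tests use elsewhere (assumed,
# not the authoritative definition):


def my_clip(x, a_min, a_max):
    """Unlike topi's current clip, put min and max into two stages."""
    const_min = tvm.tir.const(a_min, x.dtype)
    const_max = tvm.tir.const(a_max, x.dtype)
    x = te.compute(x.shape, lambda *i: tvm.te.min(x(*i), const_max), name="clipA")
    x = te.compute(x.shape, lambda *i: tvm.te.max(x(*i), const_min), name="clipB")
    return x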
def test_export_model_library_format_llvm():
    with utils.TempDirectory.set_keep_for_debug(True):
        target = tvm.target.target.micro("host")
        assert str(target)[:2] == "c "
        target = tvm.target.Target("llvm " + str(target)[2:])
        with tvm.transform.PassContext(opt_level=3):
            relay_mod = tvm.parser.fromtext(
                """
            #[version = "0.0.5"]
            def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[(1, 2), float32]) {
              %0 = cast(%a, dtype="float32") + %b * %c;
              %0
            }"""
            )
            factory = tvm.relay.build(
                relay_mod,
                target,
                target_host=target,
                mod_name="add",
                params={"c": numpy.array([[2.0, 4.0]], dtype="float32")},
            )

        temp_dir = utils.tempdir()
        mlf_tar_path = temp_dir.relpath("lib.tar")
        import tvm.micro as micro

        micro.export_model_library_format(factory, mlf_tar_path)
        tf = tarfile.open(mlf_tar_path)

        extract_dir = temp_dir.relpath("extract")
        os.mkdir(extract_dir)
        tf.extractall(extract_dir)

        with open(os.path.join(extract_dir, "metadata.json")) as json_f:
            metadata = json.load(json_f)
            assert metadata["version"] == 5
            assert metadata["model_name"] == "add"
            export_datetime = datetime.datetime.strptime(
                metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
            )
            assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
            assert metadata["target"] == {"1": str(target)}
            assert metadata["memory"]["sids"] == [
                {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
                {"storage_id": 1, "size_bytes": 8, "input_binding": "b"},
                {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"},
                {"storage_id": 3, "size_bytes": 8},
            ]
            assert metadata["memory"]["functions"]["main"] == [
                {
                    "constants_size_bytes": 8,
                    "device": 1,
                    "io_size_bytes": 18,
                    "workspace_size_bytes": 0,
                }
            ]
            assert metadata["memory"]["functions"]["operator_functions"][0]["workspace"] == [
                {"device": 1, "workspace_size_bytes": 0}
            ]
            assert (
                "fused_cast_multiply_add"
                in metadata["memory"]["functions"]["operator_functions"][0]["function_name"]
            )

        assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "add_lib0.o"))

        validate_graph_json(extract_dir, factory)

        with open(os.path.join(extract_dir, "src", "relay.txt")) as relay_f:
            assert relay_f.read() == str(relay_mod)

        with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f:
            params = tvm.relay.load_param_dict(params_f.read())
            assert "p0" in params
def tune_and_evaluate(tuning_opt):
    if env.TARGET != "sim":
        # Get remote from fleet node
        remote = autotvm.measure.request_remote(
            env.TARGET, tracker_host, tracker_port, timeout=10000
        )
        # Reconfigure the JIT runtime and FPGA.
        vta.reconfig_runtime(remote)
        vta.program_fpga(remote, bitstream=None)
    else:
        # In simulation mode, host the RPC server locally.
        remote = rpc.LocalSession()

    # Register VTA tuning tasks
    register_vta_tuning_tasks()

    # Perform task extraction on Relay program
    print("Extract tasks...")
    relay_prog, params = compile_network(env, target, network, start_pack, stop_pack)
    mod = tvm.IRModule.from_expr(relay_prog)
    tasks = autotvm.task.extract_from_program(
        mod,
        params=params,
        ops=(relay.op.get("nn.conv2d"),),
        target=target,
        target_host=env.target_host,
    )

    # Filter out non-packed conv2d tasks
    tasks = list(filter(lambda t: len(t.args[0][1]) > 4, tasks))

    # We should have extracted 10 convolution tasks
    assert len(tasks) == 10
    print("Extracted {} conv2d tasks:".format(len(tasks)))
    for tsk in tasks:
        inp = tsk.args[0][1]
        wgt = tsk.args[1][1]
        batch = inp[0] * inp[4]
        in_filter = inp[1] * inp[5]
        out_filter = wgt[0] * wgt[4]
        height, width = inp[2], inp[3]
        hkernel, wkernel = wgt[2], wgt[3]
        hstride, wstride = tsk.args[2][0], tsk.args[2][1]
        hpad, wpad = tsk.args[3][0], tsk.args[3][1]
        print(
            "({}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {})".format(
                batch,
                height,
                width,
                in_filter,
                out_filter,
                hkernel,
                wkernel,
                hpad,
                wpad,
                hstride,
                wstride,
            )
        )

    # We do not run the tuning in our webpage server since it takes too long.
    # Comment out the following line to run it yourself.
    return

    # Run tuning tasks
    print("Tuning...")
    tune_tasks(tasks, **tuning_opt)

    # Compile kernels with history best records
    with autotvm.tophub.context(target, extra_files=[log_file]):
        # Compile network
        print("Compile...")
        if target.device_name != "vta":
            with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
                lib = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )
        else:
            with vta.build_config(opt_level=3, disabled_pass={"AlterOpLayout"}):
                lib = relay.build(
                    relay_prog, target=target, params=params, target_host=env.target_host
                )

        # Export library
        print("Upload...")
        temp = utils.tempdir()
        lib.save(temp.relpath("graphlib.o"))
        remote.upload(temp.relpath("graphlib.o"))
        lib = remote.load_module("graphlib.o")

        # Generate the graph runtime
        ctx = remote.ext_dev(0) if device == "vta" else remote.cpu(0)
        m = graph_runtime.GraphModule(lib["default"](ctx))

        # Upload parameters to device
        image = tvm.nd.array((np.random.uniform(size=(1, 3, 224, 224))).astype("float32"))
        m.set_input("data", image)

        # Evaluate
        print("Evaluate inference time cost...")
        timer = m.module.time_evaluator("run", ctx, number=1, repeat=10)
        tcost = timer()
        prof_res = np.array(tcost.results) * 1000  # convert to milliseconds
        print(
            "Mean inference time (std dev): %.2f ms (%.2f ms)"
            % (np.mean(prof_res), np.std(prof_res))
        )
def verify_result(
    mod,
    map_inputs,
    out_shape,
    tol=1e-5,
    target="llvm",
    ctx=tvm.cpu(),
    params=None,
    dpu_target="DPUCADX8G",
    tvm_ops=0,
):
    """Check the result of the BYOC Vitis-AI flow against the reference."""
    lib = build_module(mod, target, params=params, dpu_target=dpu_target, tvm_ops=tvm_ops)
    lib.export_library("tvm_lib.so")

    rt_mod = graph_runtime.GraphModule(lib["default"](tvm.cpu()))
    for name, data in map_inputs.items():
        rt_mod.set_input(name, data)
    rt_mod.set_input(**params)

    # Run ten timed inferences
    for _ in range(10):
        start = time.time()
        rt_mod.run()
        end = time.time()
        inference_time = np.round(end - start, 2)
        print("Inference time: ", inference_time, " sec")

    out = rt_mod.get_output(0)

    # Get the top-1 result
    top1 = np.argmax(out.asnumpy())
    print("TVM prediction top-1: {}".format(top1))

    temp = utils.tempdir()
    export_rt_mod_file = temp.relpath("vitis_ai.rtmod")

    # Export the library for the aarch64 target
    # (alternative boards: "ultra96", "DPUCZDX8G-zcu104")
    tvm_target = tvm.target.arm_cpu("zcu104")
    lib_kwargs = {
        "fcompile": contrib.cc.create_shared,
        "cc": "/usr/aarch64-linux-gnu/bin/ld",
    }
    with tvm.transform.PassContext(
        opt_level=3,
        config={"relay.ext.vitis_ai.options.load_runtime_module": export_rt_mod_file},
    ):
        lib_arm = relay.build(mod, target=tvm_target, params=params)
    lib_arm.export_library("tvm_dpu_arm.so", **lib_kwargs)
def run_gemm(
    env,
    remote,
    target,
    batch_size,
    in_feat,
    out_feat,
    check_correctness=True,
    print_ir=True,
    samples=4,
):
    # Perform packing only if we are targeting the accelerator
    if "arm_cpu" in target.keys:
        data_pack = False
    elif "vta" in target.keys:
        data_pack = True

    # Derive shapes depending upon packing
    a_shape = (batch_size, in_feat)
    w_shape = (out_feat, in_feat)
    if data_pack:
        data_shape = (batch_size // env.BATCH, in_feat // env.BLOCK_IN, env.BATCH, env.BLOCK_IN)
        kernel_shape = (
            out_feat // env.BLOCK_OUT,
            in_feat // env.BLOCK_IN,
            env.BLOCK_OUT,
            env.BLOCK_IN,
        )
        fcompute = vta.top.dense_packed
        fschedule = vta.top.schedule_dense_packed
    else:
        data_shape = a_shape
        kernel_shape = w_shape
        fcompute = topi.x86.dense_nopack
        fschedule = topi.x86.schedule_dense_nopack
    data = te.placeholder(data_shape, name="data", dtype=env.inp_dtype)
    kernel = te.placeholder(kernel_shape, name="kernel", dtype=env.wgt_dtype)

    # Define base computation schedule
    with target:
        res = fcompute(data, kernel, None, env.acc_dtype)
        res = topi.right_shift(res, 8)
        res = my_clip(res, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res = topi.cast(res, env.out_dtype)
        # Derive base schedule
        s = fschedule([res])
        if print_ir:
            print(vta.lower(s, [data, kernel, res], simple_mode=True))

    # Derive number of ops
    num_ops = 2 * batch_size * in_feat * out_feat

    # @memoize("vta.tests.test_benchmark_topi.dense.verify")
    def get_ref_data():
        # derive min max for act, wgt types (max non inclusive)
        a_min, a_max = 0 - (1 << (env.INP_WIDTH - 1)), (1 << (env.INP_WIDTH - 1))
        w_min, w_max = 0 - (1 << (env.WGT_WIDTH - 1)), (1 << (env.WGT_WIDTH - 1))
        a_np = np.random.randint(a_min, a_max, size=a_shape).astype(data.dtype)
        w_np = np.random.randint(w_min, w_max, size=w_shape).astype(kernel.dtype)
        r_np = np.dot(a_np.astype(env.acc_dtype), w_np.T.astype(env.acc_dtype)).astype(
            env.acc_dtype
        )
        return a_np, w_np, r_np

    # Data in original format
    data_np, kernel_np, res_ref = get_ref_data()
    if data_pack:
        data_np = data_np.reshape(
            batch_size // env.BATCH, env.BATCH, in_feat // env.BLOCK_IN, env.BLOCK_IN
        ).transpose((0, 2, 1, 3))
        kernel_np = kernel_np.reshape(
            out_feat // env.BLOCK_OUT, env.BLOCK_OUT, in_feat // env.BLOCK_IN, env.BLOCK_IN
        ).transpose((0, 2, 1, 3))

    # Build
    if "vta" in target.keys:
        mod = vta.build(
            s, [data, kernel, res], target=target, target_host=env.target_host, name="dense"
        )
    else:
        mod = tvm.build(
            s, [data, kernel, res], target=target, target_host=env.target_host, name="dense"
        )
    temp = utils.tempdir()
    mod.save(temp.relpath("dense.o"))
    remote.upload(temp.relpath("dense.o"))
    f = remote.load_module("dense.o")
    ctx = remote.context(str(target))

    res_np = np.zeros(topi.utils.get_const_tuple(res.shape)).astype(res.dtype)
    data_arr = tvm.nd.array(data_np, ctx)
    kernel_arr = tvm.nd.array(kernel_np, ctx)
    res_arr = tvm.nd.array(res_np, ctx)
    time_f = f.time_evaluator("dense", ctx, number=samples)

    # In vta sim mode, collect simulator runtime statistics
    stats = {}
    cost = None
    if env.TARGET in ["sim", "tsim"]:
        # Check if we're in local RPC mode (allows us to rebuild the
        # runtime on the fly when varying the VTA designs)
        local_rpc = int(os.environ.get("VTA_LOCAL_SIM_RPC", "0"))
        if local_rpc:
            if env.TARGET == "sim":
                remote.get_function("vta.simulator.profiler_clear")()
            else:
                remote.get_function("vta.tsim.profiler_clear")()
            cost = time_f(data_arr, kernel_arr, res_arr)
            if env.TARGET == "sim":
                stats = json.loads(remote.get_function("vta.simulator.profiler_status")())
            else:
                stats = json.loads(remote.get_function("vta.tsim.profiler_status")())
        else:
            simulator.clear_stats()
            cost = time_f(data_arr, kernel_arr, res_arr)
            stats = simulator.stats()
    else:
        cost = time_f(data_arr, kernel_arr, res_arr)

    # Check correctness
    correct = False
    if check_correctness:
        res_orig = res_arr.asnumpy()
        if data_pack:
            res_orig = res_orig.reshape(batch_size, out_feat)
        res_ref = res_ref >> 8
        res_ref = np.clip(res_ref, 0, (1 << env.OUT_WIDTH - 1) - 1)
        res_ref = res_ref.astype(env.out_dtype)
        correct = np.allclose(res_orig, res_ref)

    gops = (num_ops / cost.mean) / float(10**9)
    status = "PASSED" if correct else "FAILED"
    if "arm_cpu" in target.keys:
        device = "CPU"
    elif "vta" in target.keys:
        device = "VTA"
    print("%s DENSE TEST %s: Time cost = %g sec/op, %g GOPS" % (device, status, cost.mean, gops))

    return correct, cost, stats
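# A hedged usage sketch of run_gemm in simulator mode (assumes env.TARGET is
# "sim" so a local RPC session suffices; shapes must be multiples of the VTA
# tensor intrinsic dimensions):


def example_run_gemm():
    import vta
    from tvm import rpc

    env = vta.get_env()
    remote = rpc.LocalSession()
    # 16x512 activations against a 512x512 weight matrix
    correct, cost, stats = run_gemm(
        env, remote, env.target, batch_size=16, in_feat=512, out_feat=512
    )
    assert correct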
def test_rpc_module():
    # Graph
    n = tvm.runtime.convert(1024)
    A = te.placeholder((n,), name="A")
    B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
    a_np = np.random.uniform(size=1024).astype(A.dtype)
    temp = utils.tempdir()

    # Establish remote connection with target hardware
    tracker = rpc.connect_tracker(tracker_host, tracker_port)
    remote = tracker.request(key, priority=0, session_timeout=60)

    # Compile the graph for the CPU target
    s = te.create_schedule(B.op)
    xo, xi = s[B].split(B.op.axis[0], factor=64)
    s[B].parallel(xi)
    s[B].pragma(xo, "parallel_launch_point")
    s[B].pragma(xi, "parallel_barrier_when_finish")
    f = tvm.build(s, [A, B], target, name="myadd_cpu")
    path_dso_cpu = temp.relpath("cpu_lib.so")
    f.export_library(path_dso_cpu, ndk.create_shared)

    # Execute the portable graph on the CPU target
    print("Run CPU test ...")
    dev = remote.cpu(0)
    remote.upload(path_dso_cpu)
    f2 = remote.load_module("cpu_lib.so")
    a = tvm.nd.array(a_np, dev)
    b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
    time_f = f2.time_evaluator(f2.entry_name, dev, number=10)
    cost = time_f(a, b).mean
    print("%g secs/op\n" % cost)
    np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    # Compile the graph for the OpenCL target
    if test_opencl:
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        # Build the dynamic lib.
        # If we don't want OpenCL and only use the CPU, just set the target to `target`.
        f = tvm.build(s, [A, B], tvm.target.Target("opencl", host=target), name="myadd")
        path_dso_cl = temp.relpath("dev_lib_cl.so")
        f.export_library(path_dso_cl, ndk.create_shared)

        print("Run GPU(OpenCL Flavor) test ...")
        dev = remote.cl(0)
        remote.upload(path_dso_cl)
        f1 = remote.load_module("dev_lib_cl.so")
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
        cost = time_f(a, b).mean
        print("%g secs/op\n" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)

    # Compile the graph for the Vulkan target
    if test_vulkan:
        s = te.create_schedule(B.op)
        xo, xi = s[B].split(B.op.axis[0], factor=64)
        s[B].bind(xi, te.thread_axis("threadIdx.x"))
        s[B].bind(xo, te.thread_axis("blockIdx.x"))
        # Build the dynamic lib.
        # If we don't want Vulkan and only use the CPU, just set the target to `target`.
        f = tvm.build(s, [A, B], tvm.target.Target("vulkan", host=target), name="myadd")
        path_dso_vulkan = temp.relpath("dev_lib_vulkan.so")
        f.export_library(path_dso_vulkan, ndk.create_shared)

        print("Run GPU(Vulkan Flavor) test ...")
        dev = remote.vulkan(0)
        remote.upload(path_dso_vulkan)
        f1 = remote.load_module("dev_lib_vulkan.so")
        a = tvm.nd.array(a_np, dev)
        b = tvm.nd.array(np.zeros(1024, dtype=A.dtype), dev)
        time_f = f1.time_evaluator(f1.entry_name, dev, number=10)
        cost = time_f(a, b).mean
        print("%g secs/op\n" % cost)
        np.testing.assert_equal(b.numpy(), a.numpy() + 1)
def compile_and_run_multiple_models(
    mod_map, input_list_map, output_list_map, target_options, param_map
):
    """Compile multiple modules into one AOT test binary and verify that it builds and runs."""
    target = f"c -runtime=c --link-params --executor=aot {target_options}"
    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    for mod_name, mod in mod_map.items():
        with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
            lib = tvm.relay.build(
                mod, target, target_host=target, params=param_map[mod_name], mod_name=mod_name
            )

        tar_file = os.path.join(base_path, "test.tar")
        export_model_library_format(lib, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

        input_list = input_list_map[mod_name]
        output_list = output_list_map[mod_name]

        for i in range(len(input_list)):
            create_header_file(
                f'{mangle_name(mod_name, "input_data")}{i}', input_list[i], build_path
            )

        for i in range(len(output_list)):
            create_header_file(
                f'{mangle_name(mod_name, "output_data")}{i}',
                np.zeros(output_list[i].shape, output_list[i].dtype),
                build_path,
            )
            create_header_file(
                f'{mangle_name(mod_name, "expected_output_data")}{i}', output_list[i], build_path
            )

    create_main("test.c", input_list_map, output_list_map, build_path, workspace_bytes=16384 * 1024)

    # Verify that it compiles
    file_dir = os.path.dirname(os.path.abspath(__file__))
    makefile = os.path.join(file_dir, "aot_test.mk")
    make_cmd = f"make -f {makefile} build_dir={build_path} TVM_ROOT={file_dir}/../../../.."
    compile_log_path = os.path.join(build_path, "test_compile.log")
    ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path, False)
    assert ret == 0

    # Verify that it runs
    run_log_path = os.path.join(build_path, "test_run.log")
    ret = subprocess_with_stdout_and_log("./aot_test_runner", build_path, run_log_path, False)
    assert ret == 0
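# mangle_name, used above to namespace the generated header symbols per
# module, is defined in the surrounding AOT test utilities. A sketch of the
# assumed behavior (the real helper routes through TVM's module-name
# mangling, so the exact prefix may differ):


def mangle_name(mod_name, name):
    """E.g. ("mod1", "input_data") -> "tvmgen_mod1_input_data"."""
    prefix = f"tvmgen_{mod_name}" if mod_name else "tvmgen"
    return f"{prefix}_{name}"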
def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=None, **kwargs):
    """Export the module and its imported device code into one library.

    This function only works on host LLVM modules. It will pack all the
    imported modules.

    Parameters
    ----------
    file_name : str
        The name of the shared library.

    fcompile : function(target, file_list, kwargs), optional
        Compilation function to use to create the dynamic library.
        If fcompile has attribute object_format, will compile host library
        to that format. Otherwise, will use default format "o".

    workspace_dir : str, optional
        The path to a directory used to create intermediary artifacts
        for the process of exporting the library.
        If this is not provided a temporary dir will be created.

    kwargs : dict, optional
        Additional arguments passed to fcompile

    Returns
    -------
    result of fcompile() : unknown, optional
        If the compilation function returns an artifact it would be returned
        via export_library, if any.
    """
    # NOTE: this function depends on contrib library features
    # which are only available when the TVM compiler is available.
    if _RUNTIME_ONLY:
        raise RuntimeError("Cannot call export_library in runtime only mode")

    # Extra dependencies during runtime.
    from pathlib import Path
    from tvm.contrib import cc as _cc, tar as _tar, utils as _utils

    if isinstance(file_name, Path):
        file_name = str(file_name)

    if self.type_key == "stackvm":
        if not file_name.endswith(".stackvm"):
            raise ValueError(
                "Module[%s]: can only be saved as stackvm format."
                "did you build with LLVM enabled?" % self.type_key
            )
        self.save(file_name)
        return

    modules = self._collect_dso_modules()
    if workspace_dir is None:
        temp = _utils.tempdir()
        workspace_dir = temp.temp_dir
    files = addons if addons else []
    is_system_lib = False
    has_c_module = False
    llvm_target_triple = None
    for index, module in enumerate(modules):
        if fcompile is not None and hasattr(fcompile, "object_format"):
            if module.type_key == "c":
                object_format = "c"
                has_c_module = True
            else:
                object_format = fcompile.object_format
        else:
            if module.type_key == "llvm":
                object_format = "o"
            else:
                assert module.type_key == "c"
                object_format = "c"
                if "cc" in kwargs:
                    if kwargs["cc"] == "nvcc":
                        object_format = "cu"
                has_c_module = True
        path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}")
        module.save(path_obj)
        files.append(path_obj)
        is_system_lib = (
            module.type_key == "llvm" and module.get_function("__tvm_is_system_module")()
        )
        llvm_target_triple = (
            module.type_key == "llvm" and module.get_function("_get_target_triple")()
        )
    if not fcompile:
        if file_name.endswith(".tar"):
            fcompile = _tar.tar
        else:
            fcompile = _cc.create_shared

    if llvm_target_triple is None and hasattr(fcompile, "get_target_triple"):
        llvm_target_triple = fcompile.get_target_triple()

    if getattr(fcompile, "need_system_lib", False) and not is_system_lib:
        raise ValueError("%s need --system-lib option" % str(fcompile))

    if self.imported_modules:
        if enabled("llvm") and llvm_target_triple:
            path_obj = os.path.join(workspace_dir, f"devc.{object_format}")
            m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib, llvm_target_triple)
            m.save(path_obj)
            files.append(path_obj)
        else:
            path_cc = os.path.join(workspace_dir, "devc.c")
            with open(path_cc, "w") as f:
                f.write(_ffi_api.ModulePackImportsToC(self, is_system_lib))
            files.append(path_cc)

    # The imports could contain a c module but the object format could be tar.
    # Thus, it would not recognize the following include paths as options,
    # which are there assuming a c compiler is the fcompile.
    if has_c_module and not file_name.endswith(".tar"):
        options = []
        if "options" in kwargs:
            opts = kwargs["options"]
            options = opts if isinstance(opts, (list, tuple)) else [opts]
        opts = options + ["-I" + path for path in find_include_path()]
        kwargs.update({"options": opts})

    return fcompile(file_name, files, **kwargs)
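# Typical usage of export_library, for reference; a minimal sketch using only
# stable TVM APIs:


def example_export_library():
    import tvm
    from tvm import te

    n = 16
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    mod = tvm.build(s, [A, B], target="llvm", name="add_one")
    # Packs the host module and any imported device code into one shared library.
    mod.export_library("add_one.so")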
def test_export_operator_model_library_format():
    import tvm.micro as micro

    target = tvm.target.target.micro("host")
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        A = tvm.te.placeholder((2,), dtype="int8")
        B = tvm.te.placeholder((1,), dtype="int8")
        C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C")
        sched = tvm.te.create_schedule(C.op)
        mod = tvm.build(
            sched,
            [A, B, C],
            tvm.target.Target(target, target),
            runtime=Runtime("crt", {"system-lib": True}),
            name="add",
        )

    temp_dir = utils.tempdir()
    mlf_tar_path = temp_dir.relpath("lib.tar")
    micro.export_model_library_format(mod, mlf_tar_path)

    tf = tarfile.open(mlf_tar_path)
    extract_dir = temp_dir.relpath("extract")
    os.mkdir(extract_dir)
    tf.extractall(extract_dir)

    with open(os.path.join(extract_dir, "metadata.json")) as json_f:
        metadata = json.load(json_f)
    assert metadata["version"] == 6
    assert metadata["model_name"] == "add"
    export_datetime = datetime.datetime.strptime(
        metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
    )
    assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
    assert metadata["target"] == [str(target)]

    assert metadata["memory"]["add"][0]["dtype"] == "int8"
    assert metadata["memory"]["add"][0]["shape"] == [2]
    assert metadata["memory"]["add"][0]["size_bytes"] == 2
    assert metadata["memory"]["add"][1]["dtype"] == "int8"
    assert metadata["memory"]["add"][1]["shape"] == [1]
    assert metadata["memory"]["add"][1]["size_bytes"] == 1
    assert metadata["memory"]["add"][2]["dtype"] == "int8"
    assert metadata["memory"]["add"][2]["shape"] == [2]
    assert metadata["memory"]["add"][2]["size_bytes"] == 2

    assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib0.c"))
    assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib1.c"))

    assert (
        len(mod.ir_module_by_target) == 1
    ), f"expect 1 ir_module_by_target: {mod.ir_module_by_target!r}"
    for target, ir_mod in mod.ir_module_by_target.items():
        assert int(tvm.runtime.ndarray.device(str(target)).device_type) == 1
        with open(os.path.join(extract_dir, "src", "tir-1.txt")) as tir_f:
            assert tir_f.read() == str(ir_mod)
def test_aot_executor(hexagon_launcher, hexagon_session):
    dtype = "float32"
    input_shape = (1, 128, 128, 3)
    w_shape = (5, 5, 3, 8)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight = relay.var("weight", relay.TensorType(w_shape, dtype))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")

    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp / dso_binary

    weight_data = np.random.rand(*w_shape).astype(dtype=dtype)
    input_data = np.random.rand(*input_shape).astype(dtype=dtype)

    params = {"weight": weight_data}
    inputs = {"data": input_data}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            params=params,
            target=tvm.target.Target(target_hexagon, host="c"),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {"unpacked-api": False, "interface-api": "c"}),
        )

    # Uncomment this once the workaround is not needed.
    # lowered.export_library(
    #     dso_binary_path, fcompile=hexagon.create_aot_shared, hexagon_arch="v68"
    # )
    lowered.export_library(
        dso_binary_path, fcompile=_workaround_create_aot_shared(), hexagon_arch="v68"
    )

    if hexagon_session is None:
        pytest.skip(msg="Skip hardware test, ANDROID_SERIAL_NUMBER is not set.")

    hexagon_launcher.upload(dso_binary_path, dso_binary)

    aot_mod = hexagon_launcher.get_aot_executor(dso_binary, hexagon_session)
    aot_mod.set_input(**inputs)
    aot_mod.run()
    hexagon_output = aot_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph"),
        )

    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
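# _workaround_create_aot_shared is referenced above but not defined in this
# file. A sketch of the assumed helper, which wraps hexagon.create_aot_shared
# (named in the commented-out call above) so extra linker flags can be
# injected through an environment variable:


def _workaround_create_aot_shared():
    """Return an fcompile hook that forwards extra Hexagon link flags, if any."""
    import os

    from tvm.contrib import hexagon

    extra_link_flags = os.environ.get("HEXAGON_SHARED_LINK_FLAGS")
    extra_options = str(extra_link_flags).split() if extra_link_flags else []
    return lambda so_name, files, hexagon_arch, options: hexagon.create_aot_shared(
        so_name, files, hexagon_arch, options=extra_options + options
    )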
def test_graph_executor_multiple_conv2d(hexagon_launcher, hexagon_session):
    dtype = "float32"
    input_shape = (1, 8, 8, 3)
    w1_shape = (5, 5, 3, 1)
    w2_shape = (5, 5, 1, 3)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
    y1 = relay.nn.conv2d(
        data,
        weight1,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    y2 = relay.nn.conv2d(
        y1,
        weight2,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight1, weight2], y2)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    runtime = Runtime("cpp")
    executor = Executor("graph")

    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp.relpath(dso_binary)

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
        )
        lowered.get_lib().save(dso_binary_path)

    if hexagon_session is None:
        pytest.skip(msg="Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    hexagon_launcher.upload(dso_binary_path, dso_binary)

    weight1_data = np.random.rand(*w1_shape).astype(dtype=dtype)
    weight2_data = np.random.rand(*w2_shape).astype(dtype=dtype)
    input_data = np.random.rand(*input_shape).astype(dtype=dtype)

    params = {"weight1": weight1_data, "weight2": weight2_data}
    inputs = {"data": input_data}

    graph_mod = hexagon_launcher.get_graph_executor(
        lowered.get_graph_json(), dso_binary, hexagon_session
    )
    graph_mod.set_input(**params)
    graph_mod.run(**inputs)
    hexagon_output = graph_mod.get_output(0).numpy()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from tvm import relay
from tvm.relay import testing
import tvm
from tvm import te
import tvm.testing
from tvm.contrib import utils

header_file_dir_path = utils.tempdir()


def gen_engine_header():
    code = r"""
        #ifndef _ENGINE_H_
        #define _ENGINE_H_
        #include <cstdint>
        #include <string>
        #include <sstream>
        #include <vector>
        class Engine {
        };

        #endif
        """
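    # The snippet above ends at the header text; presumably the function goes
    # on to write it into header_file_dir_path. A sketch of that assumed tail
    # (the "gcc_engine.h" file name is illustrative):
    header_file = header_file_dir_path.relpath("gcc_engine.h")
    with open(header_file, "w") as f:
        f.write(code)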
def test_mobilenet():
    temp = utils.tempdir()
    image, synset = prepare_input()
    model, params = get_model("mobilenetv2_1.0", image.shape)

    def run(mod, target):
        with relay.build_config(opt_level=3):
            lib = relay.build(mod, target=target, target_host=target_host, params=params)
        path_dso = temp.relpath("deploy.dylib")
        lib.export_library(path_dso, xcode.create_dylib, arch=arch, sdk=sdk)
        xcode.codesign(path_dso)

        # Start RPC test server that contains the compiled library.
        xcode.popen_test_rpc(proxy_host, proxy_port, key, destination=destination, libs=[path_dso])

        # Connect to the proxy
        remote = rpc.connect(proxy_host, proxy_port, key=key)

        if target == "metal":
            dev = remote.metal(0)
        else:
            dev = remote.cpu(0)
        lib = remote.load_module("deploy.dylib")
        m = graph_executor.GraphModule(lib["default"](dev))

        m.set_input("data", tvm.nd.array(image, dev))
        m.run()
        tvm_output = m.get_output(0)
        top1 = np.argmax(tvm_output.asnumpy()[0])
        print("TVM prediction top-1:", top1, synset[top1])

        # Evaluate
        ftimer = m.module.time_evaluator("run", dev, number=3, repeat=10)
        prof_res = np.array(ftimer().results) * 1000
        print("%-19s (%s)" % ("%.2f ms" % np.mean(prof_res), "%.2f ms" % np.std(prof_res)))

    def annotate(func, compiler):
        """An annotator for Core ML."""
        # Bind free variables to the constant values.
        bind_dict = {}
        for arg in func.params:
            name = arg.name_hint
            if name in params:
                bind_dict[arg] = relay.const(params[name])
        func = relay.bind(func, bind_dict)

        # Annotate the entire graph for Core ML
        mod = tvm.IRModule()
        mod["main"] = func
        seq = tvm.transform.Sequential(
            [
                transform.SimplifyInference(),
                transform.FoldConstant(),
                transform.FoldScaleAxis(),
                transform.AnnotateTarget(compiler),
                transform.MergeCompilerRegions(),
                transform.PartitionGraph(),
            ]
        )
        with relay.build_config(opt_level=3):
            mod = seq(mod)
        return mod

    # CPU
    run(model, target_host)
    # Metal
    run(model, "metal")
    # CoreML
    run(annotate(model, "coremlcompiler"), target_host)
def test_packed_global_variables():
    """Check packed global variables in codegen output."""
    dtype = "float32"
    ishape = (1, 32, 14, 14)
    wshape = (32, 32, 3, 3)
    interface_api = "packed"
    use_unpacked_api = False

    data0 = relay.var("data", shape=ishape, dtype=dtype)
    weight0 = relay.var("weight", shape=wshape, dtype=dtype)
    out = relay.nn.conv2d(data0, weight0, kernel_size=(3, 3), padding=(1, 1), groups=1)
    main_f = relay.Function([data0, weight0], out)
    mod = tvm.IRModule()
    mod["main"] = main_f
    mod = transform.InferType()(mod)

    i_data = np.random.uniform(0, 1, ishape).astype(dtype)
    w1_data = np.random.uniform(0, 1, wshape).astype(dtype)

    inputs = OrderedDict([("data", i_data), ("weight", w1_data)])

    output_list = generate_ref_data(mod, inputs)
    compiled_models_list = compile_models(
        models=AOTTestModel(module=mod, inputs=inputs, outputs=output_list),
        interface_api=interface_api,
        use_unpacked_api=use_unpacked_api,
        workspace_byte_alignment=8,
        enable_op_fusion=True,
        pass_config=AOT_DEFAULT_RUNNER.pass_config,
        use_runtime_executor=True,
        target=tvm.target.Target("c"),
    )
    compiled_model = compiled_models_list[0]

    tmp_path = utils.tempdir()
    base_path = tmp_path.temp_dir

    model = compiled_model.model
    tar_file = os.path.join(base_path, f"{model.name}.tar")
    export_model_library_format(compiled_model.executor_factory, tar_file)
    t = tarfile.open(tar_file)
    t.extractall(base_path)

    file_list = []
    for path in (pathlib.Path(base_path) / "codegen" / "host" / "src").iterdir():
        if path.is_file():
            file_list.append(path)
    assert len(file_list) > 0

    for path in file_list:
        with open(path, "r") as lib_f:
            lib1 = lib_f.readlines()

        tvmgen_names = []
        tvmgen_funcs = []
        for line in lib1:
            for item in line.split(" "):
                # Collect any name starting with tvmgen_default
                if item.startswith("tvmgen_default"):
                    tvmgen_names.append(item)
                    # Collect all function names starting with tvmgen_default
                    tvmgen_funcs += re.findall(r"(?<=).*(?=\()", item)

        # Check that no function name has a packed-variable counterpart
        for func in tvmgen_funcs:
            assert f"{func}_packed" not in tvmgen_names
def test_graph_executor(
    android_serial_number, tvm_tracker_host, tvm_tracker_port, adb_server_socket
):
    dtype = "float32"
    data = relay.var("data", relay.TensorType((1, 64, 64, 3), dtype))
    weight = relay.var("weight", relay.TensorType((5, 5, 3, 8), dtype))
    y = relay.nn.conv2d(
        data,
        weight,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight], y)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")
    runtime = Runtime("cpp")
    executor = Executor("graph")

    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp.relpath(dso_binary)

    weight_in = np.random.rand(5, 5, 3, 8).astype(dtype=dtype)
    data_in = np.random.rand(1, 64, 64, 3).astype(dtype=dtype)
    params = {"weight": weight_in}
    inputs = {"data": data_in}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_hexagon, host=target_hexagon),
            runtime=runtime,
            executor=executor,
        )
        lowered.get_lib().save(dso_binary_path)

    if not android_serial_number:
        pytest.skip(msg="Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    rpc_info = {
        "rpc_tracker_host": tvm_tracker_host,
        "rpc_tracker_port": tvm_tracker_port,
        "rpc_server_port": RPC_SERVER_PORT + 3,  # See note at the beginning of the file
        "adb_server_socket": adb_server_socket,
    }
    launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
    launcher.upload(dso_binary_path, dso_binary)
    launcher.start_server()

    with launcher.start_session() as sess:
        graph_mod = launcher.get_graph_executor(lowered.get_graph_json(), dso_binary, sess)
        graph_mod.set_input(**params)
        graph_mod.run(**inputs)
        hexagon_output = graph_mod.get_output(0).numpy()

    launcher.stop_server()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=runtime,
            executor=executor,
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
def _c_to_llvm(c_code: str) -> str:
    unique_filename = str(uuid.uuid4())
    temp = utils.tempdir()
    ll_path = temp.relpath(f"{unique_filename}.ll")
    ll_code = clang.create_llvm([c_code], output=ll_path)
    return ll_code
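# A hedged usage sketch of the helper above (the C snippet is illustrative):


def example_c_to_llvm_usage():
    ll_code = _c_to_llvm("float my_scale(float x) { return x * 2.0f; }")
    # clang.create_llvm returns the LLVM IR as text, so the symbol should appear in it.
    assert "my_scale" in ll_code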
def test_aot_executor_multiple_conv2d(
    tvm_tracker_host, tvm_tracker_port, android_serial_number, adb_server_socket
):
    dtype = "float32"
    input_shape = (1, 8, 8, 3)
    w1_shape = (5, 5, 3, 1)
    w2_shape = (5, 5, 1, 3)
    data = relay.var("data", relay.TensorType(input_shape, dtype))
    weight1 = relay.var("weight1", relay.TensorType(w1_shape, dtype))
    weight2 = relay.var("weight2", relay.TensorType(w2_shape, dtype))
    y1 = relay.nn.conv2d(
        data,
        weight1,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    y2 = relay.nn.conv2d(
        y1,
        weight2,
        padding=(2, 2),
        kernel_size=(5, 5),
        data_layout="NHWC",
        kernel_layout="HWIO",
        out_dtype="float32",
    )
    f = relay.Function([data, weight1, weight2], y2)
    relay_mod = tvm.IRModule.from_expr(f)
    relay_mod = relay.transform.InferType()(relay_mod)

    target_hexagon = tvm.target.hexagon("v68")

    temp = utils.tempdir()
    dso_binary = "test_binary.so"
    dso_binary_path = temp / dso_binary

    weight1_data = np.random.rand(*w1_shape).astype(dtype=dtype)
    weight2_data = np.random.rand(*w2_shape).astype(dtype=dtype)
    input_data = np.random.rand(*input_shape).astype(dtype=dtype)

    params = {"weight1": weight1_data, "weight2": weight2_data}
    inputs = {"data": input_data}

    with tvm.transform.PassContext(opt_level=3):
        lowered = tvm.relay.build(
            relay_mod,
            params=params,
            target=tvm.target.Target(target_hexagon, host="c"),
            runtime=Runtime("cpp"),
            executor=Executor("aot", {"unpacked-api": False, "interface-api": "c"}),
        )

    # Uncomment this once the workaround is not needed.
    # lowered.export_library(
    #     dso_binary_path, fcompile=hexagon.create_aot_shared, hexagon_arch="v68"
    # )
    lowered.export_library(
        dso_binary_path, fcompile=_workaround_create_aot_shared(), hexagon_arch="v68"
    )

    if not android_serial_number:
        pytest.skip(msg="Skip hardware test since ANDROID_SERIAL_NUMBER is not set.")

    rpc_info = {
        "rpc_tracker_host": tvm_tracker_host,
        "rpc_tracker_port": tvm_tracker_port,
        "rpc_server_port": RPC_SERVER_PORT + 6,  # See note at the beginning of the file
        "adb_server_socket": adb_server_socket,
    }
    launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
    launcher.upload(dso_binary_path, dso_binary)
    launcher.start_server()

    with launcher.start_session() as sess:
        aot_mod = launcher.get_aot_executor(dso_binary, sess)
        aot_mod.set_input(**inputs)
        aot_mod.run()
        hexagon_output = aot_mod.get_output(0).numpy()

    launcher.stop_server()

    target_llvm = tvm.target.Target("llvm")
    with tvm.transform.PassContext(opt_level=3):
        llvm_lowered = tvm.relay.build(
            relay_mod,
            tvm.target.Target(target_llvm, host=target_llvm),
            runtime=Runtime("cpp"),
            executor=Executor("graph"),
        )
    llvm_graph_mod = tvm.contrib.graph_executor.GraphModule(llvm_lowered["default"](tvm.cpu(0)))
    llvm_graph_mod.set_input(**params)
    llvm_graph_mod.run(**inputs)
    expected_output = llvm_graph_mod.get_output(0).numpy()

    tvm.testing.assert_allclose(hexagon_output, expected_output, rtol=1e-4, atol=1e-5)
def run_and_check(
    models: List[AOTCompiledTestModel],
    runner: AOTTestRunner,
    interface_api: str,
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    data_linkage: AOTDataLinkage = None,
):
    """
    This method uses the original test data and compiled runtime.Modules
    to run in the test runner to verify the results.
    """
    tmp_path = utils.tempdir()
    tmp_dir = tmp_path.temp_dir

    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
    # The calculated workspaces will not account for stack allocator tags used for debugging
    if debug_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    base_path = os.path.join(tmp_dir, "test")
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    include_path = os.path.join(base_path, "include")
    os.mkdir(include_path)

    # Copy the CRT config header into the include directory.
    crt_root = tvm.micro.get_standalone_crt_dir()
    shutil.copy2(
        os.path.join(crt_root, "template", "crt_config-template.h"),
        os.path.join(include_path, "crt_config.h"),
    )

    workspace_bytes = 0
    for compiled_model in models:
        model = compiled_model.model
        tar_file = os.path.join(base_path, f"{model.name}.tar")
        export_model_library_format(compiled_model.executor_factory, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

        workspace_bytes += model.extra_memory_in_bytes
        if interface_api == "packed":
            workspace_bytes += mlf_extract_workspace_size_bytes(tar_file)

        for key in model.inputs:
            sanitized_tensor_name = re.sub(r"\W", "_", key)
            create_header_file(
                f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
                model.inputs[key],
                include_path,
                data_linkage,
            )

        for i in range(len(model.outputs)):
            create_header_file(
                f'{mangle_name(model.name, "output_data")}{i}',
                np.zeros(model.outputs[i].shape, model.outputs[i].dtype),
                include_path,
                data_linkage,
            )
            create_header_file(
                f'{mangle_name(model.name, "expected_output_data")}{i}',
                model.outputs[i],
                include_path,
                data_linkage,
            )

    create_main(
        "test.c",
        models,
        build_path,
        runner.includes,
        runner.prologue,
        runner.epilogue,
        data_linkage,
        interface_api,
        workspace_bytes,
    )

    # Verify that it compiles
    file_dir = os.path.dirname(os.path.abspath(__file__))
    codegen_path = os.path.join(base_path, "codegen")
    makefile = os.path.join(file_dir, f"{runner.makefile}.mk")
    custom_params = " ".join(
        [f" {param}='{value}'" for param, value in runner.parameters.items()]
    )
    make_command = (
        f"make -f {makefile} build_dir={build_path}"
        + f" CFLAGS='{cflags}'"
        + f" TVM_ROOT={file_dir}/../../../.."
        + f" AOT_TEST_ROOT={file_dir}"
        + f" CODEGEN_ROOT={codegen_path}"
        + f" STANDALONE_CRT_DIR={tvm.micro.get_standalone_crt_dir()}"
        + custom_params
    )

    compile_log_path = os.path.join(build_path, "test_compile.log")
    compile_command = f"{make_command} aot_test_runner"
    ret = subprocess_log_output(compile_command, ".", compile_log_path)
    assert ret == 0

    # Verify that it runs
    run_log_path = os.path.join(build_path, "test_run.log")
    run_command = f"{make_command} run"
    ret = subprocess_log_output(run_command, build_path, run_log_path)
    assert ret == 0

    with open(run_log_path) as run_log:
        assert AOT_SUCCESS_TOKEN in run_log.read()
def __init__(self, package_path: str):
    self._tmp_dir = utils.tempdir()
    self.package_path = package_path
    self.import_package(self.package_path)
def run_and_check(
    models: List[AOTCompiledTestModel],
    runner: AOTTestRunner,
    interface_api: str,
    debug_calculated_workspaces=False,
    workspace_byte_alignment=8,
    data_linkage: AOTDataLinkage = None,
    test_dir: str = None,
    verbose: bool = False,
):
    """
    This method uses the original test data and compiled runtime.Modules
    to run in the test runner to verify the results.
    """
    base_path = test_dir
    if test_dir is None:
        tmp_path = utils.tempdir()
        tmp_dir = tmp_path.temp_dir
        base_path = os.path.join(tmp_dir, "test")

    cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
    # The calculated workspaces will not account for stack allocator tags used for debugging
    if debug_calculated_workspaces:
        cflags += "-DTVM_CRT_STACK_ALLOCATOR_ENABLE_LIFO_CHECK "

    base_path = os.path.abspath(base_path)
    build_path = os.path.join(base_path, "build")
    os.makedirs(build_path, exist_ok=True)

    include_path = os.path.join(base_path, "include")
    os.mkdir(include_path)

    # Copy the CRT config header into the include directory.
    crt_root = tvm.micro.get_standalone_crt_dir()
    shutil.copy2(
        os.path.join(crt_root, "template", "crt_config-template.h"),
        os.path.join(include_path, "crt_config.h"),
    )

    workspace_bytes = 0
    for compiled_model in models:
        model = compiled_model.model
        tar_file = os.path.join(base_path, f"{model.name}.tar")
        export_model_library_format(compiled_model.executor_factory, tar_file)
        t = tarfile.open(tar_file)
        t.extractall(base_path)

        # The C interface API does not need the compiler-generated workspace
        # to build the test application, because the workspace size is
        # codegen'd as a macro in tvmgen_<model_name>.h.
        if interface_api != "c":
            workspace_bytes += mlf_extract_workspace_size_bytes(tar_file)

        workspace_bytes += model.extra_memory_in_bytes
        for key in model.inputs:
            sanitized_tensor_name = re.sub(r"\W", "_", key)
            create_header_file(
                f'{mangle_name(model.name, "input_data")}_{sanitized_tensor_name}',
                model.inputs[key],
                include_path,
                data_linkage,
            )

        for key in model.outputs:
            sanitized_tensor_name = re.sub(r"\W", "_", key)
            create_header_file(
                f'{mangle_name(model.name, "output_data")}_{sanitized_tensor_name}',
                np.zeros(model.outputs[key].shape, model.outputs[key].dtype),
                include_path,
                data_linkage,
            )
            create_header_file(
                f'{mangle_name(model.name, "expected_output_data")}_{sanitized_tensor_name}',
                model.outputs[key],
                include_path,
                data_linkage,
            )

    use_usmp = runner.pass_config.get("tir.usmp.enable", False)
    # We only need the stack allocator if USMP is not used
    use_stack_allocator = not use_usmp

    create_main(
        "test.c",
        models,
        build_path,
        runner.includes,
        runner.prologue,
        runner.epilogue,
        data_linkage,
        interface_api,
        workspace_bytes,
        use_stack_allocator,
    )

    # Verify that it compiles
    file_dir = os.path.dirname(os.path.abspath(__file__))
    codegen_path = os.path.join(base_path, "codegen")
    makefile = os.path.join(file_dir, f"{runner.makefile}.mk")
    fvp_dir = "/opt/arm/FVP_Corstone_SSE-300/models/Linux64_GCC-6.4/"
    # TODO(@grant-arm): Remove once ci_cpu docker image has been updated to FVP_Corstone_SSE
    if not os.path.isdir(fvp_dir):
        fvp_dir = "/opt/arm/FVP_Corstone_SSE-300_Ethos-U55/models/Linux64_GCC-6.4/"
    custom_params = " ".join(
        [f" {param}='{value}'" for param, value in runner.parameters.items()]
    )
    make_command = (
        f"make -f {makefile} build_dir={build_path}"
        + f" CFLAGS='{cflags}'"
        + f" TVM_ROOT={file_dir}/../../../.."
        + f" AOT_TEST_ROOT={file_dir}"
        + f" CODEGEN_ROOT={codegen_path}"
        + f" STANDALONE_CRT_DIR={tvm.micro.get_standalone_crt_dir()}"
        + f" FVP_DIR={fvp_dir}"
        + custom_params
    )

    compile_log_path = os.path.join(build_path, "test_compile.log")
    compile_command = f"{make_command} aot_test_runner"
    if verbose:
        print("Compile command:\n", compile_command)
    ret = subprocess_log_output(compile_command, ".", compile_log_path)
    assert ret == 0

    # Verify that it runs
    run_log_path = os.path.join(build_path, "test_run.log")
    run_command = f"{make_command} run"
    if verbose:
        print("Run command:\n", run_command)
    ret = subprocess_log_output(run_command, build_path, run_log_path)
    assert ret == 0

    with open(run_log_path) as run_log:
        assert AOT_SUCCESS_TOKEN in run_log.read()
def test_c_link_params():
    temp_dir = utils.tempdir()
    for dtype in LINKABLE_DTYPES:
        mod, param_init = _make_mod_and_params(dtype)
        rand_input = _make_random_tensor(dtype, INPUT_SHAPE)
        main_func = mod["main"]
        target = "c --link-params"
        with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
            lib = tvm.relay.build(mod, target, params=param_init)
            assert set(lib.params.keys()) == {"p0", "p1"}  # NOTE: op folded

            src = lib.lib.get_source()
            lib.lib.save(temp_dir.relpath("test.c"), "c")
            c_dtype = _get_c_datatype(dtype)
            src_lines = src.split("\n")
            param = lib.params["p0"].asnumpy().reshape(np.prod(KERNEL_SHAPE))
            param_def = f"static const {c_dtype} __tvm_param__p0[{np.prod(param.shape)}] = {{"
            for i, line in enumerate(src_lines):
                if line == param_def:
                    i += 1
                    break
            else:
                assert False, f'did not find parameter definition "{param_def}":\n{src}'

            cursor = 0
            width = dtype_info(dtype).bits // 4 + 2
            if dtype.startswith("int"):
                width += 1  # Account for sign
            while "};" not in src_lines[i]:
                for match in HEX_NUM_RE.finditer(src_lines[i]):
                    assert match.group() == _format_c_value(dtype, width, param[cursor]), (
                        f'p0 byte {cursor}: want "{_format_c_value(dtype, width, param[cursor])}" got '
                        f'"{match.group(0)}"; full p0 follows:\n{src}'
                    )
                    cursor += 1
                i += 1
            assert cursor == np.prod(param.shape)

            # Need a unique name per library to avoid dlopen caching the lib load.
            lib_path = temp_dir.relpath(f"test-{dtype}-linked.so")
            lib["remove_params"]().export_library(lib_path)
            lib_mod = tvm.runtime.load_module(lib_path)

            # lib_mod = lib_factory['default']()
            graph = json.loads(lib.graph_json)
            for p in lib.params:
                _verify_linked_param(dtype, lib, lib_mod, graph, p)

            # Wrap in function to explicitly deallocate the runtime.
            def _run_linked(lib_mod):
                graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0)))
                graph_rt.set_input("rand_input", rand_input)  # NOTE: params not required.
                graph_rt.run()
                return graph_rt.get_output(0)

            linked_output = _run_linked(lib_mod)

        linked_params = lib.params
        with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
            lib = tvm.relay.build(mod, "c", params=param_init)
            _, _, params = lib
            # Need a unique name per library to avoid dlopen caching the lib load.
            lib_path = temp_dir.relpath(f"test-{dtype}-unlinked.so")
            lib.export_library(lib_path)
            lib_mod = tvm.runtime.load_module(lib_path)

            def _run_unlinked(lib_mod):
                graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0)))
                graph_rt.set_input("rand_input", rand_input, **params)
                graph_rt.run()
                return graph_rt.get_output(0)

            unlinked_output = _run_unlinked(lib_mod)

        if "int" in dtype:
            np.testing.assert_equal(unlinked_output.asnumpy(), linked_output.asnumpy())
        else:
            np.testing.assert_allclose(unlinked_output.asnumpy(), linked_output.asnumpy())
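# Hedged aside on the hex-literal width arithmetic used above: each nibble of
# the dtype takes one hex digit, plus two characters for the "0x" prefix, plus
# one more for a sign on signed ints. A self-contained check using NumPy's
# dtype introspection in place of the module-local dtype_info helper; the
# sample literals are illustrative of what _format_c_value is expected to emit.
def _example_hex_width():
    for dt, sample in [("float32", "0x3f800000"), ("int32", "+0x00000001")]:
        width = np.dtype(dt).itemsize * 8 // 4 + 2
        if dt.startswith("int"):
            width += 1  # sign character
        assert width == len(sample)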
def check_local(coreml_model):
    temp = utils.tempdir()
    compiled_model = xcode.compile_coreml(coreml_model, out_dir=temp.temp_dir)
    ctx = tvm.cpu(0)
    verify(coreml_model, compiled_model, ctx)
def test_export_byoc_c_module():
    """Test BYOC flow when it produces DSO-exportable modules.

    NOTE the general BYOC flow is not fully supported by Model Library Format right now.
    """
    x = tvm.relay.var("x", shape=(10, 10))
    w0 = tvm.relay.var("w0", shape=(10, 10))
    w1 = tvm.relay.var("w1", shape=(10, 10))
    w2 = tvm.relay.var("w2", shape=(10, 10))
    w3 = tvm.relay.var("w3", shape=(10, 10))
    w4 = tvm.relay.var("w4", shape=(10, 10))
    w5 = tvm.relay.var("w5", shape=(10, 10))
    w6 = tvm.relay.var("w6", shape=(10, 10))
    w7 = tvm.relay.var("w7", shape=(10, 10))

    # C compiler
    z0 = tvm.relay.add(x, w0)
    p0 = tvm.relay.subtract(z0, w1)
    q0 = tvm.relay.multiply(p0, w2)

    z1 = tvm.relay.add(x, w3)
    p1 = tvm.relay.subtract(z1, w4)
    q1 = tvm.relay.multiply(p1, w5)

    # Other parts on TVM
    z2 = tvm.relay.add(x, w6)
    q2 = tvm.relay.subtract(z2, w7)

    r = tvm.relay.concatenate((q0, q1, q2), axis=0)
    f = tvm.relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r)
    mod = tvm.IRModule()
    ann = byoc.CcompilerAnnotator()
    mod["main"] = ann.visit(f)
    mod = tvm.relay.transform.PartitionGraph("mod_name")(mod)
    mod = tvm.relay.transform.InferType()(mod)

    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        factory = tvm.relay.build(mod, tvm.target.target.micro("host"), runtime=Runtime("crt"))

    temp_dir = utils.tempdir()
    mlf_tar_path = temp_dir.relpath("lib.tar")

    from tvm import micro

    micro.export_model_library_format(factory, mlf_tar_path)

    with tarfile.open(mlf_tar_path, "r:*") as tf:
        tar_members = [ti.name for ti in tf.getmembers()]
        print("tar members", tar_members)
        assert "./metadata.json" in tar_members
        with tf.extractfile("./metadata.json") as f:
            metadata = json.load(f)
        main_md = metadata["memory"]["functions"]["main"]
        assert main_md == [
            {
                "constants_size_bytes": 0,
                "device": 1,
                "io_size_bytes": 4800,
                "workspace_size_bytes": 800,
            }
        ]
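# Hedged companion sketch: inspecting a Model Library Format archive once it
# has been extracted to disk. The layout below (metadata.json at the root,
# generated sources under codegen/) follows what the test above relies on;
# treat the exact tree as illustrative for the MLF version in use, and the
# `somewhere` argument as a placeholder directory.
def _example_inspect_mlf(mlf_tar_path, somewhere):
    with tarfile.open(mlf_tar_path, "r:*") as tf:
        tf.extractall(somewhere)
    with open(os.path.join(somewhere, "metadata.json")) as f:
        metadata = json.load(f)
    # Per-function workspace sizes live under memory/functions.
    print(list(metadata["memory"]["functions"].keys()))
    # Walk the generated host sources (assumed path within the archive).
    for root, _, fnames in os.walk(os.path.join(somewhere, "codegen")):
        for fname in fnames:
            print(os.path.join(root, fname))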
def _listen_loop(sock, port, rpc_key, tracker_addr, load_library, custom_addr):
    """Listening loop of the server."""

    def _accept_conn(listen_sock, tracker_conn, ping_period=2):
        """Accept connections from other places.

        Parameters
        ----------
        listen_sock: Socket
            The socket used by the listening process.

        tracker_conn : connection to the tracker
            Tracker connection

        ping_period : float, optional
            Ping the tracker every `ping_period` seconds if no connection is accepted.
        """
        old_keyset = set()
        # Report resource to the tracker
        if tracker_conn:
            matchkey = base.random_key(rpc_key + ":")
            base.sendjson(tracker_conn, [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr])
            assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS
        else:
            matchkey = rpc_key

        unmatch_period_count = 0
        unmatch_timeout = 4
        # Wait until we get a valid connection
        while True:
            if tracker_conn:
                trigger = select.select([listen_sock], [], [], ping_period)
                if listen_sock not in trigger[0]:
                    base.sendjson(tracker_conn, [TrackerCode.GET_PENDING_MATCHKEYS])
                    pending_keys = base.recvjson(tracker_conn)
                    old_keyset.add(matchkey)

                    # If the match key is not in the pending key set, it means
                    # the key has been acquired by a client but not yet used.
                    if matchkey not in pending_keys:
                        unmatch_period_count += 1
                    else:
                        unmatch_period_count = 0

                    # Regenerate the match key if it was acquired but not used for a while.
                    if unmatch_period_count * ping_period > unmatch_timeout + ping_period:
                        logger.info("no incoming connections, regenerate key ...")
                        matchkey = base.random_key(rpc_key + ":", old_keyset)
                        base.sendjson(
                            tracker_conn, [TrackerCode.PUT, rpc_key, (port, matchkey), custom_addr]
                        )
                        assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS
                        unmatch_period_count = 0
                    continue

            conn, addr = listen_sock.accept()
            magic = struct.unpack("<i", base.recvall(conn, 4))[0]
            if magic != base.RPC_MAGIC:
                conn.close()
                continue
            keylen = struct.unpack("<i", base.recvall(conn, 4))[0]
            key = py_str(base.recvall(conn, keylen))
            arr = key.split()
            expect_header = "client:" + matchkey
            server_key = "server:" + rpc_key
            if arr[0] != expect_header:
                conn.sendall(struct.pack("<i", base.RPC_CODE_MISMATCH))
                conn.close()
                logger.warning("mismatch key from %s", addr)
                continue

            conn.sendall(struct.pack("<i", base.RPC_CODE_SUCCESS))
            conn.sendall(struct.pack("<i", len(server_key)))
            conn.sendall(server_key.encode("utf-8"))
            return conn, addr, _parse_server_opt(arr[1:])

    # Server logic
    tracker_conn = None
    while True:
        try:
            # step 1: set up the tracker connection and report to the tracker
            if tracker_addr and tracker_conn is None:
                tracker_conn = base.connect_with_retry(tracker_addr)
                tracker_conn.sendall(struct.pack("<i", base.RPC_TRACKER_MAGIC))
                magic = struct.unpack("<i", base.recvall(tracker_conn, 4))[0]
                if magic != base.RPC_TRACKER_MAGIC:
                    raise RuntimeError("%s is not an RPC Tracker" % str(tracker_addr))
                # report status of the current queue
                cinfo = {"key": "server:" + rpc_key}
                base.sendjson(tracker_conn, [TrackerCode.UPDATE_INFO, cinfo])
                assert base.recvjson(tracker_conn) == TrackerCode.SUCCESS

            # step 2: wait for incoming connections
            conn, addr, opts = _accept_conn(sock, tracker_conn)
        except (socket.error, IOError):
            # retry when the tracker connection is dropped
            if tracker_conn:
                tracker_conn.close()
                tracker_conn = None
            continue
        except RuntimeError as exc:
            raise exc

        # step 3: serving
        work_path = utils.tempdir()
        logger.info("connection from %s", addr)
        server_proc = multiprocessing.Process(
            target=_serve_loop, args=(conn, addr, load_library, work_path)
        )
        server_proc.start()
        # close from our side.
        conn.close()
        # wait until the server process finishes or times out
        server_proc.join(opts.get("timeout", None))
        if server_proc.is_alive():
            logger.info("Timeout in RPC session, killing ...")
            # pylint: disable=import-outside-toplevel
            import psutil

            parent = psutil.Process(server_proc.pid)
            # terminate worker children
            for child in parent.children(recursive=True):
                child.terminate()
            # terminate the worker
            server_proc.terminate()
        work_path.remove()
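# Hedged sketch of the client side of the handshake _accept_conn() expects:
# send RPC_MAGIC, then a length-prefixed "client:<matchkey>" string, then read
# the response code followed by the server key. The constants and recvall
# helper mirror tvm.rpc.base as used above; host/port/matchkey are placeholders.
def _example_client_handshake(host, port, matchkey):
    conn = socket.create_connection((host, port))
    conn.sendall(struct.pack("<i", base.RPC_MAGIC))
    key = ("client:" + matchkey).encode("utf-8")
    conn.sendall(struct.pack("<i", len(key)))
    conn.sendall(key)
    code = struct.unpack("<i", base.recvall(conn, 4))[0]
    if code != base.RPC_CODE_SUCCESS:
        conn.close()
        raise RuntimeError("key mismatch rejected by server")
    keylen = struct.unpack("<i", base.recvall(conn, 4))[0]
    server_key = py_str(base.recvall(conn, keylen))
    return conn, server_key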
def test_gemm_gpu(N, times, bn, num_block, num_thread):
    assert bn <= N
    assert num_thread * num_thread * 16 <= N
    assert num_block * num_block * 2 <= N
    A = te.placeholder((N, N), name="A")
    B = te.placeholder((N, N), name="Btmp")
    k = te.reduce_axis((0, N), name="k")

    # Pack B into (N, N // bn, bn) so the innermost dimension can be vectorized.
    packedB = te.compute((N, N // bn, bn), lambda x, y, z: B[x, y * bn + z], name="B")

    C = te.compute(
        (N, N),
        lambda ii, jj: te.sum(A[ii, k] * packedB[k, jj // bn, jj % bn], axis=k),
        name="C",
    )

    s = te.create_schedule(C.op)
    CC = s.cache_write(C, "local")

    block_x = te.thread_axis("blockIdx.x")
    block_y = te.thread_axis("blockIdx.y")
    thread_x = te.thread_axis("threadIdx.x")
    thread_y = te.thread_axis("threadIdx.y")

    thread_xz = te.thread_axis((0, 2), "vthread", name="vx")
    thread_yz = te.thread_axis((0, 2), "vthread", name="vy")

    pby, pbi = s[packedB].split(packedB.op.axis[0], nparts=num_thread)
    pbx, pbj = s[packedB].split(packedB.op.axis[1], nparts=num_thread)
    s[packedB].bind(pby, thread_y)
    s[packedB].bind(pbx, thread_x)
    pbz, pbk = s[packedB].split(packedB.op.axis[2], factor=8)
    s[packedB].vectorize(pbk)

    by, yi = s[C].split(C.op.axis[0], nparts=num_block)
    bx, xi = s[C].split(C.op.axis[1], nparts=num_thread)

    s[C].bind(by, block_y)
    s[C].bind(bx, thread_y)
    s[C].reorder(by, bx, yi, xi)

    tyz, yi = s[C].split(yi, nparts=2)
    ty, yi = s[C].split(yi, nparts=num_block)
    txz, xi = s[C].split(xi, nparts=2)
    tx, xi = s[C].split(xi, nparts=num_thread)

    s[C].reorder(tyz, txz, ty, tx, yi, xi)
    s[C].bind(tyz, thread_yz)
    s[C].bind(txz, thread_xz)
    s[C].bind(ty, block_x)
    s[C].bind(tx, thread_x)

    xyi, xxi = s[C].split(xi, factor=8)
    s[C].reorder(tyz, txz, ty, tx, yi, xyi, xxi)
    s[C].vectorize(xxi)

    s[CC].compute_at(s[C], yi)
    yo, xo = CC.op.axis
    s[CC].reorder(k, yo, xo)
    xo, xi = s[CC].split(xo, factor=8)
    s[CC].vectorize(xi)

    ko, ki = s[CC].split(k, factor=2)
    s[CC].unroll(ki)

    print(tvm.lower(s, [A, B, C], simple_mode=True))

    f = tvm.build(s, [A, B, C], "opencl", target_host=target, name="gemm_gpu")
    temp = utils.tempdir()
    path_dso = temp.relpath("gemm_gpu.so")
    f.export_library(path_dso, ndk.create_shared)

    # connect to the proxy
    remote = rpc.connect(proxy_host, proxy_port, key=key)
    ctx = remote.cl(0)
    remote.upload(path_dso)
    f = remote.load_module("gemm_gpu.so")

    evaluate(f, ctx, N, times)
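# Hedged aside: the packedB layout above is just a reshape. Element B[x, j]
# lands at packedB[x, j // bn, j % bn], which NumPy can reproduce directly;
# a quick self-contained check of that equivalence:
def _example_array_packing(n=8, bn=4):
    b = np.arange(n * n, dtype="float32").reshape(n, n)
    packed = b.reshape(n, n // bn, bn)  # same layout te.compute builds above
    for j in range(n):
        np.testing.assert_equal(packed[:, j // bn, j % bn], b[:, j])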
def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=None, **kwargs):
    """
    Export the module and all imported modules into a single device library.

    This function only works on host LLVM modules. Other runtime::Module
    subclasses will also work with this API, but they must fully implement
    the module save and load mechanisms, including saving from streams and
    files. This will pack your non-shared library module into a single
    shared library which can later be loaded by TVM.

    Parameters
    ----------
    file_name : str
        The name of the shared library.

    fcompile : function(target, file_list, kwargs), optional
        The compilation function to use to create the final library object
        during export.

        For example, when fcompile=_cc.create_shared, or when it is not
        supplied but module is "llvm," this is used to link all produced
        artifacts into a final dynamic library.

        This behavior is controlled by the type of object exported.
        If fcompile has attribute object_format, will compile host library
        to that format. Otherwise, will use default format "o".

    workspace_dir : str, optional
        The path of the directory used to create the intermediate
        artifacts when exporting the module.
        If this is not provided a temporary dir will be created.

    kwargs : dict, optional
        Additional arguments passed to fcompile

    Returns
    -------
    result of fcompile() : unknown, optional
        If the compilation function returns an artifact it would be
        returned via export_library, if any.
    """
    # NOTE: this function depends on contrib library features
    # which are only available when the full TVM runtime is present.
    if _RUNTIME_ONLY:
        raise RuntimeError("Cannot call export_library in runtime only mode")
    # Extra dependencies during runtime.
    from pathlib import Path
    from tvm.contrib import cc as _cc, tar as _tar, utils as _utils

    if isinstance(file_name, Path):
        file_name = str(file_name)

    if self.type_key == "stackvm":
        if not file_name.endswith(".stackvm"):
            raise ValueError(
                "Module[%s]: can only be saved as stackvm format."
                "did you build with LLVM enabled?" % self.type_key
            )
        self.save(file_name)
        return

    modules = self._collect_dso_modules()
    if workspace_dir is None:
        temp = _utils.tempdir()
        workspace_dir = temp.temp_dir
    files = addons if addons else []
    is_system_lib = False
    has_c_module = False
    llvm_target_string = None
    for index, module in enumerate(modules):
        if fcompile is not None and hasattr(fcompile, "object_format"):
            if module.type_key == "c":
                assert module.format in [
                    "c",
                    "cc",
                    "cpp",
                    "cu",
                ], "The module.format needs to be either c, cc, cpp or cu."
                object_format = module.format
                has_c_module = True
            else:
                object_format = fcompile.object_format
        else:
            if module.type_key == "c":
                if len(module.format) > 0:
                    assert module.format in [
                        "c",
                        "cc",
                        "cpp",
                        "cu",
                    ], "The module.format needs to be either c, cc, cpp, or cu."
                    object_format = module.format
                else:
                    object_format = "c"
                if "cc" in kwargs:
                    if kwargs["cc"] == "nvcc":
                        object_format = "cu"
                has_c_module = True
            else:
                assert module.type_key == "llvm" or module.type_key == "static_library"
                object_format = "o"
        path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}")
        module.save(path_obj)
        files.append(path_obj)
        # Only LLVM modules carry the system-lib flag and target string; guard
        # the lookups so a later non-LLVM module does not overwrite them.
        if module.type_key == "llvm":
            is_system_lib = module.get_function("__tvm_is_system_module")()
            llvm_target_string = module.get_function("_get_target_string")()

    if not fcompile:
        if file_name.endswith(".tar"):
            fcompile = _tar.tar
        else:
            fcompile = _cc.create_shared

    if llvm_target_string is None and hasattr(fcompile, "get_target_triple"):
        triple = fcompile.get_target_triple()
        assert triple, "Target triple should not be empty"
        llvm_target_string = "llvm -mtriple " + triple

    if getattr(fcompile, "need_system_lib", False) and not is_system_lib:
        raise ValueError("%s need --system-lib option" % str(fcompile))

    if self.imported_modules:
        if enabled("llvm") and llvm_target_string:
            path_obj = os.path.join(workspace_dir, f"devc.{object_format}")
            m = _ffi_api.ModulePackImportsToLLVM(self, is_system_lib, llvm_target_string)
            m.save(path_obj)
            files.append(path_obj)
        else:
            path_cc = os.path.join(workspace_dir, "devc.c")
            with open(path_cc, "w") as f:
                f.write(_ffi_api.ModulePackImportsToC(self, is_system_lib))
            files.append(path_cc)

    # The imports could contain a c module, but the object format could be tar.
    # In that case the fcompile (e.g. tar) would not recognize the include-path
    # options below, which assume a C compiler as fcompile.
    if has_c_module and not file_name.endswith(".tar"):
        options = []
        if "options" in kwargs:
            opts = kwargs["options"]
            options = opts if isinstance(opts, (list, tuple)) else [opts]
        opts = options + ["-I" + path for path in find_include_path()]
        kwargs.update({"options": opts})

    return fcompile(file_name, files, **kwargs)
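# Hedged usage sketch for export_library(): build a trivial TE function, then
# export it both as a shared library (default fcompile: cc.create_shared) and
# as a plain .tar of object files (fcompile inferred from the ".tar" suffix).
# A minimal sketch, not a prescription for production packaging.
def _example_export_library():
    import tvm
    from tvm import te
    from tvm.contrib import utils

    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.compute((n,), lambda i: A[i] + 1.0, name="B")
    s = te.create_schedule(B.op)
    mod = tvm.build(s, [A, B], "llvm", name="add_one")

    temp = utils.tempdir()
    mod.export_library(temp.relpath("add_one.so"))   # linked via cc.create_shared
    mod.export_library(temp.relpath("add_one.tar"))  # tar of object files
    return tvm.runtime.load_module(temp.relpath("add_one.so"))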
def test_c_link_params(linkable_dtype):
    temp_dir = utils.tempdir()
    mod, param_init = _make_mod_and_params(linkable_dtype)
    rand_input = _make_random_tensor(linkable_dtype, INPUT_SHAPE)
    main_func = mod["main"]
    target = "c"
    executor = Executor("graph", {"link-params": True})
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod, target, executor=executor, params=param_init)
        assert len(lib.params.keys()) == 0  # NOTE: params became tir.constants

        src = lib.lib.get_source()
        lib.lib.save(temp_dir.relpath("test.c"), "c")
        c_dtype = _get_c_datatype(linkable_dtype)
        src_lines = src.split("\n")
        param = param_init[f"{linkable_dtype}_a"].reshape(np.prod(KERNEL_SHAPE))
        param_def = (
            rf"^static const {c_dtype} __attribute__\(\(section\(\".rodata.tvm\"\), "
            rf"aligned\(16\)\)\) constant_\d+\[{np.prod(param.shape)}\] = {{$"
        )
        for i, line in enumerate(src_lines):
            if re.match(param_def, line):
                i += 1
                break
        else:
            assert False, f'did not find parameter definition "{param_def}":\n{src}'

        cursor = 0
        width = dtype_info(linkable_dtype).bits // 4 + 2
        if linkable_dtype.startswith("int"):
            width += 1  # Account for sign
        while "};" not in src_lines[i]:
            for match in HEX_NUM_RE.finditer(src_lines[i]):
                cursor += 1
            i += 1
        assert cursor == np.prod(param.shape)

        # Need a unique name per library to avoid dlopen caching the lib load.
        lib_path = temp_dir.relpath(f"test-{linkable_dtype}-linked.so")
        lib["remove_params"]().export_library(lib_path)
        lib_mod = tvm.runtime.load_module(lib_path)

        # lib_mod = lib_factory['default']()
        graph = json.loads(lib.graph_json)
        for p in lib.params:
            _verify_linked_param(linkable_dtype, lib, lib_mod, graph, p)

        # Wrap in function to explicitly deallocate the runtime.
        def _run_linked(lib_mod):
            graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0)))
            graph_rt.set_input("rand_input", rand_input)  # NOTE: params not required.
            graph_rt.run()
            return graph_rt.get_output(0)

        linked_output = _run_linked(lib_mod)

    linked_params = lib.params
    with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
        lib = tvm.relay.build(mod, "c", params=param_init)
        _, _, params = lib
        # Need a unique name per library to avoid dlopen caching the lib load.
        lib_path = temp_dir.relpath(f"test-{linkable_dtype}-unlinked.so")
        lib.export_library(lib_path)
        lib_mod = tvm.runtime.load_module(lib_path)

        def _run_unlinked(lib_mod):
            graph_rt = tvm.contrib.graph_executor.GraphModule(lib_mod["default"](tvm.cpu(0)))
            graph_rt.set_input("rand_input", rand_input, **params)
            graph_rt.run()
            return graph_rt.get_output(0)

        unlinked_output = _run_unlinked(lib_mod)

    if "int" in linkable_dtype:
        np.testing.assert_equal(unlinked_output.numpy(), linked_output.numpy())
    else:
        np.testing.assert_allclose(unlinked_output.numpy(), linked_output.numpy())