def main():
    """Tune a named Relay workload with MetaSchedule and time it over RPC.

    All configuration is read from the module-level ``ARGS``. The function:

    1. prints the run description (``describe``) and the workload name,
    2. loads the Relay module, its parameters and the description of its
       input through ``get_network``,
    3. builds the input data for that input with ``generate_input_data``,
    4. tunes the module with ``ms.tune_relay`` on an ``RPCRunner`` while an
       ``ms.Profiler`` is active, then prints the profiler table,
    5. runs the tuned library through ``run_module_via_rpc`` with the
       continuation returned by ``create_timer(ARGS.backend)``.

    Nothing is returned.
    """
    describe()
    print(f"Workload: {ARGS.workload}")
    mod, params, (input_name, input_shape, input_dtype) = get_network(
        ARGS.workload,
        ARGS.input_shape,
        cache_dir=ARGS.cache_dir,
    )

    # ``ARGS.input_shape`` is forwarded to ``get_network`` above as the shape
    # itself, so it cannot also be iterated as a list of
    # ``{"name", "shape", "dtype"}`` records (``item["name"]`` on a shape
    # element would raise). Describe the input from what ``get_network``
    # returned instead.
    input_data = {input_name: generate_input_data(input_shape, input_dtype)}

    print(f" input_name : {input_name}")
    print(f" input_shape: {input_shape}")
    print(f" input_dtype: {input_dtype}")

    runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=ms.runner.EvaluatorConfig(
            number=ARGS.number,
            repeat=ARGS.repeat,
            min_repeat_ms=ARGS.min_repeat_ms,
            enable_cpu_cache_flush=ARGS.cpu_flush,
        ),
        alloc_repeat=1,
    )

    with ms.Profiler() as profiler:
        lib = ms.tune_relay(
            mod=mod,
            target=ARGS.target,
            config=ms.TuneConfig(
                strategy="evolutionary",
                num_trials_per_iter=64,
                # The per-task and global budgets share the same CLI value.
                max_trials_per_task=ARGS.num_trials,
                max_trials_global=ARGS.num_trials,
                adaptive_training=ARGS.adaptive_training,
            ),
            runner=runner,  # type: ignore
            work_dir=ARGS.work_dir,
            params=params,
            backend=ARGS.backend,
        )

    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
def test_cuda_tensor_core(model_name, input_shape):
    """Check auto-tensorized CUDA tensor-core tuning against an untuned build.

    The workload is converted with ``ToMixedPrecision``, tuned with the
    ``_DefaultCUDATensorCore`` schedule rules and postprocessors on a
    layout-converted copy of the module, and the tuned output is compared to
    the output of a plain ``relay.build`` at ``opt_level=0``.

    Parameters
    ----------
    model_name
        Workload name passed to ``relay_workload.get_network``; names starting
        with ``"bert"`` receive random integer inputs, all others receive
        random ``float32`` inputs.
    input_shape
        Shape of the single network input.
    """
    target = tvm.target.Target("nvidia/geforce-rtx-3070")
    dev = tvm.cuda()

    if model_name.startswith("bert"):
        host_input = np.random.randint(0, 30521, size=input_shape)  # embedding size
    else:
        host_input = np.random.randn(*input_shape).astype("float32")
    data = tvm.nd.array(host_input, dev)

    mod, params, (input_name, _, _) = relay_workload.get_network(model_name, input_shape)

    mixed_precision = tvm.transform.Sequential([relay.transform.ToMixedPrecision()])
    with tvm.transform.PassContext(opt_level=3):
        mod = mixed_precision(mod)

    def to_nhwc(relay_mod):
        # Rewrite conv2d to NHWC data / OHWI kernel layout before tuning.
        layout_pass = tvm.transform.Sequential(
            [relay.transform.ConvertLayout({"nn.conv2d": ["NHWC", "OHWI"]})]
        )
        with tvm.transform.PassContext(opt_level=3):
            return layout_pass(relay_mod)

    def run_graph(tensor, lib):
        # Execute ``lib`` once on ``dev`` and fetch its first output as numpy.
        executor = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
        executor.set_input(input_name, tensor)
        executor.run()
        return executor.get_output(0).numpy()

    with tempfile.TemporaryDirectory() as work_dir:
        with ms.Profiler() as profiler:
            tuned_lib: tvm.runtime.Module = ms.tune_relay(
                mod=to_nhwc(mod),
                params=params,
                target=target,
                config=ms.TuneConfig(
                    num_trials_per_iter=32,
                    max_trials_per_task=200,
                    max_trials_global=3000,
                ),
                sch_rules=ms.default_config._DefaultCUDATensorCore.schedule_rules,
                postprocs=ms.default_config._DefaultCUDATensorCore.postprocs,
                work_dir=work_dir,
            )
        print(profiler.table())

        # Reference build without MetaSchedule, used for the correctness check.
        with tvm.transform.PassContext(opt_level=0):
            reference_lib = relay.build(mod, target=target, params=params)

        actual_output = run_graph(data, tuned_lib)
        expected_output = run_graph(data, reference_lib)
        assert np.allclose(actual_output, expected_output, rtol=1e-2, atol=2e-2)
def main():
    """Tune an ONNX model with MetaSchedule and time the result over RPC.

    Reads every setting from the module-level ``ARGS``: the ONNX file is loaded
    and imported with ``from_onnx`` using the shapes listed in
    ``ARGS.input_shape``, input data is produced per listed input with
    ``generate_input_data``, the module is tuned with ``ms.tune_relay`` under
    an ``ms.Profiler``, and the tuned library is finally executed through
    ``run_module_via_rpc`` with the ``create_timer(ARGS.backend)``
    continuation. Nothing is returned.
    """
    describe()
    print(f"Workload: {ARGS.model_name}")

    onnx_model = onnx.load(ARGS.onnx_path)

    # Report each declared input while collecting the name -> shape mapping
    # that the ONNX importer is given.
    shape_dict = {}
    for spec in ARGS.input_shape:
        name, shape, dtype = spec["name"], spec["shape"], spec["dtype"]
        print(f" input_name : {name}")
        print(f" input_shape: {shape}")
        print(f" input_dtype: {dtype}")
        shape_dict[name] = shape

    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)

    input_data = {}
    for spec in ARGS.input_shape:
        input_data[spec["name"]] = generate_input_data(spec["shape"], spec["dtype"])

    evaluator_config = ms.runner.EvaluatorConfig(
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )
    runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=evaluator_config,
        alloc_repeat=1,
    )

    with ms.Profiler() as profiler:
        tune_config = ms.TuneConfig(
            strategy="evolutionary",
            num_trials_per_iter=64,
            max_trials_per_task=ARGS.num_trials,
            max_trials_global=ARGS.num_trials,
            adaptive_training=ARGS.adaptive_training,
        )
        lib = ms.tune_relay(
            mod=mod,
            target=ARGS.target,
            config=tune_config,
            runner=runner,  # type: ignore
            work_dir=ARGS.work_dir,
            params=params,
            backend=ARGS.backend,
        )

    print("Tuning Time:")
    print(profiler.table())

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=create_timer(ARGS.backend),
        backend=ARGS.backend,
    )
def test_meta_schedule_tune_relay(
    model_name: str,
    input_shape: List[int],
    target: str,
):
    """Tune a Relay workload with MetaSchedule and compare it to an untuned build.

    Parameters
    ----------
    model_name
        Workload name passed to ``get_network``; names starting with
        ``"bert"`` receive random integer inputs, all others receive random
        ``float32`` inputs.
    input_shape
        Shape of the single network input.
    target
        Target description; a value whose string form starts with ``"llvm"``
        runs on ``tvm.cpu()``, anything else on ``tvm.cuda()``.
    """
    if str(target).startswith("llvm"):
        dev = tvm.cpu()
    else:
        dev = tvm.cuda()

    if model_name.startswith("bert"):
        host_input = np.random.randint(0, 30521, size=input_shape)  # embedding size
    else:
        host_input = np.random.randn(*input_shape).astype("float32")
    data = tvm.nd.array(host_input, dev)

    mod, params, (input_name, _, _) = get_network(name=model_name, input_shape=input_shape)
    target = Target(target)

    def run_graph(tensor, lib):
        # Execute ``lib`` once on ``dev`` and fetch its first output as numpy.
        executor = graph_executor.GraphModule(lib["default"](dev))
        executor.set_input(input_name, tensor)
        executor.run()
        return executor.get_output(0).numpy()

    with tempfile.TemporaryDirectory() as work_dir:
        with ms.Profiler() as profiler:
            tuned_lib: tvm.runtime.Module = ms.tune_relay(
                mod=mod,
                params=params,
                target=target,
                config=ms.TuneConfig(
                    strategy="evolutionary",
                    num_trials_per_iter=32,
                    max_trials_per_task=20000,
                    max_trials_global=20000,
                ),
                work_dir=work_dir,
            )
        print(profiler.table())

        # Reference build without meta-schedule, used for the correctness check.
        with tvm.transform.PassContext(opt_level=0):
            reference_lib = relay.build(mod, target=target, params=params)

        actual_output = run_graph(data, tuned_lib)
        expected_output = run_graph(data, reference_lib)
        assert np.allclose(actual_output, expected_output, rtol=1e-4, atol=2e-4)
def main():
    """Tune a named Relay workload, then time it and profile it per layer over RPC.

    Configuration comes from the module-level ``ARGS``. After tuning with
    ``ms.tune_relay`` the function makes two ``run_module_via_rpc`` calls:
    the first passes the tuned library to ``f_timer`` (whole-graph
    ``time_evaluator`` timings, printed in milliseconds), the second passes
    only the runtime module to ``f_per_layer`` (debug executor
    ``run_individual`` results printed per graph node). Nothing is returned.
    """
    mod, params, (input_name, input_shape, input_dtype) = get_network(
        ARGS.workload,
        ARGS.input_shape,
        cache_dir=ARGS.cache_dir,
    )
    print(f"Workload: {ARGS.workload}")
    print(f" input_name: {input_name}")
    print(f" input_shape: {input_shape}")
    print(f" input_dtype: {input_dtype}")

    evaluator_config = ms.runner.EvaluatorConfig(
        number=3,
        repeat=1,
        min_repeat_ms=100,
        enable_cpu_cache_flush=False,
    )
    runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=evaluator_config,
        alloc_repeat=1,
        max_workers=ARGS.rpc_workers,
    )

    tune_config = ms.TuneConfig(
        strategy="evolutionary",
        num_trials_per_iter=64,
        max_trials_per_task=ARGS.num_trials,
        max_trials_global=ARGS.num_trials,
    )
    lib = ms.tune_relay(
        mod=mod,
        target=ARGS.target,
        config=tune_config,
        runner=runner,  # type: ignore
        work_dir=ARGS.work_dir,
        params=params,
    )

    # Unpack the pieces of the tuned artifact; ``graph`` is captured by the
    # per-layer continuation below.
    graph = lib.graph_json
    rt_mod = lib.lib
    params = lib.params

    # Floating-point inputs are drawn uniformly, every other dtype as integers.
    input_data = (
        np.random.uniform(size=input_shape).astype(input_dtype)
        if input_dtype.startswith("float")
        else np.random.randint(low=0, high=10000, size=input_shape, dtype=input_dtype)
    )

    def f_timer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        executor = GraphModule(rt_mod["default"](dev))
        executor.set_input(input_name, input_data)
        evaluate = executor.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        # Scale the reported results by 1000 before printing.
        timings = list(np.array(evaluate().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", timings)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=[input_data],
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel

        debugger = create(graph, rt_mod, dev)
        debugger.set_input(input_name, input_data)
        node_names = [node["name"] for node in json.loads(graph)["nodes"]]
        node_times = debugger.run_individual(number=10, repeat=1, min_repeat_ms=5000)
        print("|graph_nodes| = ", len(node_names))
        print("|graph_time| = ", len(node_times))
        # Keyed by node name, so a repeated name keeps only its last timing.
        time_by_node = {
            name: float(elapsed) for name, elapsed in zip(node_names, node_times)
        }
        for name, elapsed in time_by_node.items():
            print(f"{name} : {elapsed:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=[input_data],
        continuation=f_per_layer,
    )
def main():
    """Tune an ONNX model, then time it and profile it per layer over RPC.

    Configuration comes from the module-level ``ARGS``. The ONNX file is
    imported with ``from_onnx`` using the shapes in ``ARGS.input_shape`` and
    tuned with ``ms.tune_relay`` under an ``ms.Profiler``. Two
    ``run_module_via_rpc`` calls follow: the first passes the tuned library to
    ``f_timer`` (whole-graph ``time_evaluator`` timings), the second passes
    only the runtime module to ``f_per_layer`` (debug executor
    ``run_individual`` results printed per graph node). Nothing is returned.
    """
    describe()
    print(f"Workload: {ARGS.model_name}")

    onnx_model = onnx.load(ARGS.onnx_path)

    # Report each declared input while collecting the name -> shape mapping
    # that the ONNX importer is given.
    shape_dict = {}
    for spec in ARGS.input_shape:
        print(f" input_name: {spec['name']}")
        print(f" input_shape: {spec['shape']}")
        print(f" input_dtype: {spec['dtype']}")
        shape_dict[spec["name"]] = spec["shape"]

    mod, params = from_onnx(onnx_model, shape_dict, freeze_params=True)

    evaluator_config = ms.runner.EvaluatorConfig(
        number=ARGS.number,
        repeat=ARGS.repeat,
        min_repeat_ms=ARGS.min_repeat_ms,
        enable_cpu_cache_flush=ARGS.cpu_flush,
    )
    runner = ms.runner.RPCRunner(
        rpc_config=ARGS.rpc_config,
        evaluator_config=evaluator_config,
        alloc_repeat=1,
    )

    with ms.Profiler() as profiler:
        tune_config = ms.TuneConfig(
            strategy="evolutionary",
            num_trials_per_iter=64,
            max_trials_per_task=ARGS.num_trials,
            max_trials_global=ARGS.num_trials,
        )
        lib = ms.tune_relay(
            mod=mod,
            target=ARGS.target,
            config=tune_config,
            runner=runner,  # type: ignore
            work_dir=ARGS.work_dir,
            params=params,
        )

    print("Tuning Time:")
    print(profiler.table())

    # Unpack the pieces of the tuned artifact; ``graph`` is captured by the
    # per-layer continuation below.
    graph = lib.graph_json
    rt_mod = lib.lib
    params = lib.params

    # Floating-point inputs are drawn uniformly, every other dtype as integers.
    input_data = {}
    for spec in ARGS.input_shape:
        name, shape, dtype = spec["name"], spec["shape"], spec["dtype"]
        if dtype.startswith("float"):
            tensor = np.random.uniform(size=shape).astype(dtype)
        else:
            tensor = np.random.randint(low=0, high=10000, size=shape, dtype=dtype)
        input_data[name] = tensor

    def f_timer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.graph_executor import GraphModule

        # pylint: enable=import-outside-toplevel

        executor = GraphModule(rt_mod["default"](dev))
        for key, value in input_data.items():
            executor.set_input(key, value)
        evaluate = executor.module.time_evaluator(
            "run",
            dev,
            min_repeat_ms=500,
            repeat=3,
        )
        # Scale the reported results by 1000 before printing.
        timings = list(np.array(evaluate().results) * 1000.0)  # type: ignore
        print("Running time in time_evaluator: ", timings)

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=lib,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_timer,
    )

    def f_per_layer(rt_mod, dev, input_data):
        # pylint: disable=import-outside-toplevel
        from tvm.contrib.debugger.debug_executor import create

        # pylint: enable=import-outside-toplevel

        debugger = create(graph, rt_mod, dev)
        for key, value in input_data.items():
            debugger.set_input(key, value)
        node_names = [node["name"] for node in json.loads(graph)["nodes"]]
        node_times = debugger.run_individual(number=10, repeat=1, min_repeat_ms=5000)
        print("|graph_nodes| = ", len(node_names))
        print("|graph_time| = ", len(node_times))
        # Keyed by node name, so a repeated name keeps only its last timing.
        time_by_node = {
            name: float(elapsed) for name, elapsed in zip(node_names, node_times)
        }
        for name, elapsed in time_by_node.items():
            print(f"{name} : {elapsed:.3f}")

    run_module_via_rpc(
        rpc_config=ARGS.rpc_config,
        lib=rt_mod,
        dev_type=ARGS.target.kind.name,
        args=input_data,
        continuation=f_per_layer,
    )