def profile_and_build_vm(
    mod,
    params,
    sm,
    split_k_slices=None,
    tmp_dir="./tmp",
    lib_path="compile.so",
    vmcode_path="vmcode.ro",
    use_fast_math=False,
    use_3xtf32=True,
):
    """Partition *mod* for CUTLASS, tune the offloaded kernels, and compile
    for the Relay VM targeting CUDA.

    Parameters
    ----------
    mod : tvm.IRModule
        The Relay module to compile.
    params : dict
        Weight parameters passed to ``relay.vm.compile``.
    sm : int
        CUDA compute capability (e.g. 80) forwarded to the CUTLASS tuner.
    split_k_slices : list of int, optional
        Candidate split-K factors to profile. Defaults to ``[1]``
        (no split-K) when not given.
    tmp_dir : str
        Scratch directory used by the profiler and kernel builder.
    lib_path, vmcode_path : str
        Output file names for the compiled library and VM bytecode.
    use_fast_math : bool
        Enable fast-math when building the CUTLASS kernels.
    use_3xtf32 : bool
        Allow 3xTF32 tensor-op kernels during tuning.

    Returns
    -------
    (VirtualMachine, tvm.runtime.Device, int)
        The ready-to-run VM, the CUDA device, and the number of
        partitions offloaded to CUTLASS.
    """
    # Fix: the default used to be the mutable literal `[1]`, a single list
    # object shared across every call. Use a None sentinel instead.
    if split_k_slices is None:
        split_k_slices = [1]
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        split_k_slices=split_k_slices,
        use_3xtf32=use_3xtf32,
        profile_all_alignments=False,
        # Stop profiling at the first kernel that works, for speed.
        find_first_valid=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(
        vm_exec, sm, tmp_dir, lib_path, vmcode_path, use_fast_math=use_fast_math
    )
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
def profile_and_build(
    mod,
    params,
    sm,
    split_k_slices=None,
    tmp_dir="./tmp",
    lib_path="compile.so",
    use_fast_math=False,
    use_3xtf32=True,
):
    """Partition *mod* for CUTLASS, tune the offloaded kernels, and compile
    for the graph executor targeting CUDA.

    Parameters
    ----------
    mod : tvm.IRModule
        The Relay module to compile.
    params : dict
        Weight parameters passed to ``relay.build``.
    sm : int
        CUDA compute capability (e.g. 80) forwarded to the CUTLASS tuner.
    split_k_slices : list of int, optional
        Candidate split-K factors to profile. Defaults to ``[1]``
        (no split-K) when not given.
    tmp_dir : str
        Scratch directory used by the profiler and kernel builder.
    lib_path : str
        Output file name for the compiled library.
    use_fast_math : bool
        Enable fast-math when building the CUTLASS kernels.
    use_3xtf32 : bool
        Allow 3xTF32 tensor-op kernels during tuning.

    Returns
    -------
    (GraphModule, tvm.runtime.Device, int)
        The ready-to-run graph executor module, the CUDA device, and the
        number of partitions offloaded to CUTLASS.
    """
    # Fix: the default used to be the mutable literal `[1]`, a single list
    # object shared across every call. Use a None sentinel instead.
    if split_k_slices is None:
        split_k_slices = [1]
    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        use_3xtf32=use_3xtf32,
        split_k_slices=split_k_slices,
        profile_all_alignments=False,
        # Stop profiling at the first kernel that works, for speed;
        # profile candidates in parallel worker processes.
        find_first_valid=True,
        use_multiprocessing=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="cuda", params=params)
    lib = build_cutlass_kernels(lib, sm, tmp_dir, lib_path, use_fast_math=use_fast_math)
    dev = tvm.device("cuda", 0)
    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    return rt_mod, dev, num_cutlass_partition
def profile_and_build_vm(
    mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro"
):
    """Offload *mod* to CUTLASS where possible and compile it for the
    Relay VM on CUDA.

    Returns a triple of the instantiated ``VirtualMachine``, the CUDA
    device it runs on, and the number of CUTLASS partitions created.
    """
    partitioned = partition_for_cutlass(mod)
    tuned, num_cutlass_partition = tune_cutlass_kernels(partitioned, sm, tmp_dir=tmp_dir)
    # Compile to VM bytecode at full optimization, then attach the
    # generated CUTLASS kernels to the executable.
    with tvm.transform.PassContext(opt_level=3):
        executable = relay.vm.compile(tuned, target="cuda", params=params)
    executable = build_cutlass_kernels_vm(executable, sm, tmp_dir, lib_path, vmcode_path)
    cuda_dev = tvm.device("cuda", 0)
    return VirtualMachine(executable, cuda_dev), cuda_dev, num_cutlass_partition
def profile_and_build(mod, params, sm, tmp_dir="./tmp", lib_path="compile.so"):
    """Offload *mod* to CUTLASS where possible and compile it for the
    graph executor on CUDA.

    Returns a triple of the runtime ``GraphModule``, the CUDA device it
    runs on, and the number of CUTLASS partitions created.
    """
    partitioned = partition_for_cutlass(mod)
    tuned, num_cutlass_partition = tune_cutlass_kernels(
        partitioned,
        sm,
        profile_all=False,
        use_multiprocessing=False,
        tmp_dir=tmp_dir,
    )
    # Build the deployable library at full optimization, then attach the
    # generated CUTLASS kernels to it.
    with tvm.transform.PassContext(opt_level=3):
        compiled_lib = relay.build(tuned, target="cuda", params=params)
    compiled_lib = build_cutlass_kernels(compiled_lib, sm, tmp_dir, lib_path)
    cuda_dev = tvm.device("cuda", 0)
    runtime_mod = tvm.contrib.graph_executor.GraphModule(compiled_lib["default"](cuda_dev))
    return runtime_mod, cuda_dev, num_cutlass_partition
    # NOTE(review): this `return lib` is the tail of a function whose `def`
    # line is outside this chunk — presumably a custom build callback
    # (see `cutlass_build` used below); confirm against the full file.
    return lib


###########################################################
# Run the two subgraphs in pipeline with pipeline executor.
# ---------------------------------------------------------
# Set 'USE_PIPELINE_EXECUTOR' as ON, and set 'USE_CUTLASS' as ON in cmake.
from tvm.contrib import graph_executor, pipeline_executor, pipeline_executor_build

#########################################
# Create subgraph pipeline configuration.
# Associate a subgraph module with a target.
# Use CUTLASS BYOC to build the second subgraph module.
# NOTE(review): `subgraphs` is defined earlier in the file (not visible
# here); it presumably holds at least two Relay IRModules — confirm.
mod0, mod1 = subgraphs[0], subgraphs[1]
# Use cutlass as the codegen: mark mod1's supported ops for CUTLASS offload.
mod1 = partition_for_cutlass(mod1)

#################################################
# Get the pipeline executor configuration object.
pipe_config = pipeline_executor_build.PipelineConfig()

###########################################################################
# Set the compile target of the subgraph module.
# First stage (mod0) runs on the host CPU with the llvm target.
pipe_config[mod0].target = "llvm"
pipe_config[mod0].dev = tvm.cpu(0)

##############################################################
# Set the compile target of the second subgraph module as cuda.
# Second stage (mod1) runs on GPU 0; `cutlass_build` is used as the build
# function so the CUTLASS-partitioned ops are compiled, and the exported
# module is cross-compiled with nvcc.
pipe_config[mod1].target = "cuda"
pipe_config[mod1].dev = tvm.device("cuda", 0)
pipe_config[mod1].build_func = cutlass_build
pipe_config[mod1].export_cc = "nvcc"

# Create the pipeline by connecting the subgraph modules.
# The global input will be forwarded to the input interface of the first module named mod0