def profile_and_build_vm(
    mod,
    params,
    sm,
    split_k_slices=None,
    tmp_dir="./tmp",
    lib_path="compile.so",
    vmcode_path="vmcode.ro",
    use_fast_math=False,
    use_3xtf32=True,
):
    """Partition, tune, compile and load a module on the Relay VM with CUTLASS.

    The module is partitioned with ``partition_for_cutlass``, tuned with
    ``tune_cutlass_kernels``, compiled for the ``"cuda"`` target with
    ``relay.vm.compile`` under ``PassContext(opt_level=3)``, and finally
    passed through ``build_cutlass_kernels_vm``.

    Parameters
    ----------
    mod
        Module to partition and compile.
    params
        Forwarded to ``relay.vm.compile`` as ``params``.
    sm
        Forwarded to both ``tune_cutlass_kernels`` and
        ``build_cutlass_kernels_vm``.
        NOTE(review): presumably the CUDA compute capability -- confirm.
    split_k_slices
        Forwarded to ``tune_cutlass_kernels``. ``None`` (the default) is
        replaced by a fresh ``[1]`` on every call.
    tmp_dir
        Forwarded to both ``tune_cutlass_kernels`` and
        ``build_cutlass_kernels_vm``.
    lib_path, vmcode_path
        Forwarded positionally to ``build_cutlass_kernels_vm``.
    use_fast_math
        Forwarded to ``build_cutlass_kernels_vm``.
    use_3xtf32
        Forwarded to ``tune_cutlass_kernels``.

    Returns
    -------
    tuple
        ``(vm, dev, num_cutlass_partition)``: a ``VirtualMachine`` wrapping
        the built executable, the ``tvm.device("cuda", 0)`` it was created
        on, and the second value returned by ``tune_cutlass_kernels``.
    """
    # A list literal as a default would be shared by every call; build a new
    # one each time so a callee mutating it cannot affect later invocations.
    if split_k_slices is None:
        split_k_slices = [1]

    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        split_k_slices=split_k_slices,
        use_3xtf32=use_3xtf32,
        profile_all_alignments=False,
        find_first_valid=True,
        tmp_dir=tmp_dir,
    )

    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)

    vm_exec = build_cutlass_kernels_vm(
        vm_exec, sm, tmp_dir, lib_path, vmcode_path, use_fast_math=use_fast_math
    )
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
def profile_and_build_vm(
    mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro"
):
    """Build a CUTLASS-backed Relay VM executable and load it on CUDA device 0.

    Steps, in order: ``partition_for_cutlass`` -> ``tune_cutlass_kernels`` ->
    ``relay.vm.compile`` (target ``"cuda"``, ``opt_level=3``) ->
    ``build_cutlass_kernels_vm``.

    Parameters
    ----------
    mod
        Module to partition and compile.
    params
        Passed to ``relay.vm.compile`` as ``params``.
    sm
        Passed to ``tune_cutlass_kernels`` and ``build_cutlass_kernels_vm``.
        NOTE(review): presumably the CUDA compute capability -- confirm.
    tmp_dir
        Passed to ``tune_cutlass_kernels`` and ``build_cutlass_kernels_vm``.
    lib_path, vmcode_path
        Passed positionally to ``build_cutlass_kernels_vm``.

    Returns
    -------
    tuple
        ``(vm, dev, num_cutlass_partition)``: the ``VirtualMachine``, the
        device it was created on, and the second value returned by
        ``tune_cutlass_kernels``.
    """
    partitioned = partition_for_cutlass(mod)
    tuned, partition_count = tune_cutlass_kernels(partitioned, sm, tmp_dir=tmp_dir)

    with tvm.transform.PassContext(opt_level=3):
        compiled = relay.vm.compile(tuned, target="cuda", params=params)

    executable = build_cutlass_kernels_vm(compiled, sm, tmp_dir, lib_path, vmcode_path)
    device = tvm.device("cuda", 0)
    runtime = VirtualMachine(executable, device)
    return runtime, device, partition_count