Пример #1
0
def profile_and_build_vm(
    mod,
    params,
    sm,
    split_k_slices=None,
    tmp_dir="./tmp",
    lib_path="compile.so",
    vmcode_path="vmcode.ro",
    use_fast_math=False,
    use_3xtf32=True,
):
    """Partition, tune, and compile ``mod`` for the Relay VM with CUTLASS kernels.

    The module is partitioned with ``partition_for_cutlass``, tuned with
    ``tune_cutlass_kernels``, compiled for the ``"cuda"`` target at
    ``opt_level=3`` with ``relay.vm.compile``, and finally passed through
    ``build_cutlass_kernels_vm``.

    Args:
        mod: Module to partition and compile.
        params: Parameters forwarded to ``relay.vm.compile``.
        sm: Forwarded to the tuner and the kernel builder (presumably the
            CUDA compute capability of the target GPU -- confirm with the
            CUTLASS contrib documentation).
        split_k_slices: Forwarded to ``tune_cutlass_kernels``. ``None``
            (the default) is expanded to ``[1]``.
        tmp_dir: Directory handed to the tuner and the kernel builder.
        lib_path: Forwarded to ``build_cutlass_kernels_vm``.
        vmcode_path: Forwarded to ``build_cutlass_kernels_vm``.
        use_fast_math: Forwarded to ``build_cutlass_kernels_vm``.
        use_3xtf32: Forwarded to ``tune_cutlass_kernels``.

    Returns:
        A ``(vm, dev, num_cutlass_partition)`` tuple: the ``VirtualMachine``
        wrapping the built executable, the ``cuda:0`` device it was created
        on, and the partition count reported by ``tune_cutlass_kernels``.
    """
    # Build the default list per call: a literal ``[1]`` in the signature
    # would be a single object shared by every invocation and exposed to
    # whatever the tuner does with it.
    if split_k_slices is None:
        split_k_slices = [1]

    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        split_k_slices=split_k_slices,
        use_3xtf32=use_3xtf32,
        profile_all_alignments=False,
        find_first_valid=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        vm_exec = relay.vm.compile(mod, target="cuda", params=params)
    vm_exec = build_cutlass_kernels_vm(
        vm_exec,
        sm,
        tmp_dir,
        lib_path,
        vmcode_path,
        use_fast_math=use_fast_math,
    )
    dev = tvm.device("cuda", 0)
    return VirtualMachine(vm_exec, dev), dev, num_cutlass_partition
Пример #2
0
def profile_and_build(
    mod,
    params,
    sm,
    split_k_slices=None,
    tmp_dir="./tmp",
    lib_path="compile.so",
    use_fast_math=False,
    use_3xtf32=True,
):
    """Partition, tune, and build ``mod`` for the graph executor with CUTLASS kernels.

    The module is partitioned with ``partition_for_cutlass``, tuned with
    ``tune_cutlass_kernels`` (multiprocessing enabled), built for the
    ``"cuda"`` target at ``opt_level=3`` with ``relay.build``, and finally
    passed through ``build_cutlass_kernels``.

    Args:
        mod: Module to partition and build.
        params: Parameters forwarded to ``relay.build``.
        sm: Forwarded to the tuner and the kernel builder (presumably the
            CUDA compute capability of the target GPU -- confirm with the
            CUTLASS contrib documentation).
        split_k_slices: Forwarded to ``tune_cutlass_kernels``. ``None``
            (the default) is expanded to ``[1]``.
        tmp_dir: Directory handed to the tuner and the kernel builder.
        lib_path: Forwarded to ``build_cutlass_kernels``.
        use_fast_math: Forwarded to ``build_cutlass_kernels``.
        use_3xtf32: Forwarded to ``tune_cutlass_kernels``.

    Returns:
        A ``(rt_mod, dev, num_cutlass_partition)`` tuple: the ``GraphModule``
        created from the library's ``"default"`` entry, the ``cuda:0`` device
        it was created on, and the partition count reported by
        ``tune_cutlass_kernels``.
    """
    # Build the default list per call: a literal ``[1]`` in the signature
    # would be a single object shared by every invocation and exposed to
    # whatever the tuner does with it.
    if split_k_slices is None:
        split_k_slices = [1]

    mod = partition_for_cutlass(mod)
    mod, num_cutlass_partition = tune_cutlass_kernels(
        mod,
        sm,
        use_3xtf32=use_3xtf32,
        split_k_slices=split_k_slices,
        profile_all_alignments=False,
        find_first_valid=True,
        use_multiprocessing=True,
        tmp_dir=tmp_dir,
    )
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="cuda", params=params)
    lib = build_cutlass_kernels(
        lib,
        sm,
        tmp_dir,
        lib_path,
        use_fast_math=use_fast_math,
    )
    dev = tvm.device("cuda", 0)
    rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev))
    return rt_mod, dev, num_cutlass_partition
Пример #3
0
def profile_and_build_vm(
    mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro"
):
    """Partition, tune, and compile ``mod`` for the Relay VM with CUTLASS kernels.

    Runs ``partition_for_cutlass`` and ``tune_cutlass_kernels`` on the module,
    compiles it for the ``"cuda"`` target at ``opt_level=3``, and post-processes
    the executable with ``build_cutlass_kernels_vm``.

    Returns:
        A ``(vm, dev, num_cutlass_partition)`` tuple: the ``VirtualMachine``
        wrapping the built executable, the ``cuda:0`` device, and the
        partition count reported by ``tune_cutlass_kernels``.
    """
    partitioned = partition_for_cutlass(mod)
    tuned, num_cutlass_partition = tune_cutlass_kernels(
        partitioned, sm, tmp_dir=tmp_dir
    )

    with tvm.transform.PassContext(opt_level=3):
        compiled = relay.vm.compile(tuned, target="cuda", params=params)

    executable = build_cutlass_kernels_vm(
        compiled, sm, tmp_dir, lib_path, vmcode_path
    )

    dev = tvm.device("cuda", 0)
    vm = VirtualMachine(executable, dev)
    return vm, dev, num_cutlass_partition
Пример #4
0
def profile_and_build(mod, params, sm, tmp_dir="./tmp", lib_path="compile.so"):
    """Partition, tune, and build ``mod`` for the graph executor with CUTLASS kernels.

    Runs ``partition_for_cutlass`` and ``tune_cutlass_kernels`` (with
    ``profile_all`` and ``use_multiprocessing`` both disabled) on the module,
    builds it for the ``"cuda"`` target at ``opt_level=3``, and post-processes
    the library with ``build_cutlass_kernels``.

    Returns:
        A ``(rt_mod, dev, num_cutlass_partition)`` tuple: the ``GraphModule``
        created from the library's ``"default"`` entry, the ``cuda:0`` device,
        and the partition count reported by ``tune_cutlass_kernels``.
    """
    partitioned = partition_for_cutlass(mod)
    tuned, num_cutlass_partition = tune_cutlass_kernels(
        partitioned,
        sm,
        profile_all=False,
        use_multiprocessing=False,
        tmp_dir=tmp_dir,
    )

    with tvm.transform.PassContext(opt_level=3):
        built = relay.build(tuned, target="cuda", params=params)

    cutlass_lib = build_cutlass_kernels(built, sm, tmp_dir, lib_path)

    dev = tvm.device("cuda", 0)
    default_factory = cutlass_lib["default"]
    rt_mod = tvm.contrib.graph_executor.GraphModule(default_factory(dev))
    return rt_mod, dev, num_cutlass_partition