Example #1
    parser.add_argument(
        "--medium",
        default=None,
        help=
        "Path to model trained for algorithm 'medium'. If not given, ignore this algorithm.",
    )
    parser.add_argument(
        "--largeDB1",
        default=None,
        help=
        "Path to model trained for algorithm 'largeDB1'. If not given, ignore this algorithm.",
    )
    parser.add_argument(
        "--largeDB2",
        default=None,
        help=
        "Path to model trained for algorithm 'largeDB2'. If not given, ignore this algorithm.",
    )
    parser.add_argument(
        "-c",
        "--chunk_size",
        type=int,
        default=2000,
        help=
        "Chunk size for dispatching joblib jobs. If memory errors are experienced, reduce this number",
    )

    args = parser.parse_args()
    paths_to_models = dict()
    for algo in kernel_algorithm.keys():
        paths_to_models[algo] = args.__dict__[algo]
    main(args.params, args.njobs, args.baseline, paths_to_models,
         args.chunk_size)
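
The option names above are chosen to match the keys of kernel_algorithm, so the per-algorithm model paths can be collected generically from the parsed arguments. Below is a minimal, self-contained sketch of that mapping; kernel_algorithm is stubbed with placeholder values, and getattr is equivalent to the args.__dict__ lookup used above:

import argparse

kernel_algorithm = {"medium": None, "largeDB1": None, "largeDB2": None}  # stub

parser = argparse.ArgumentParser()
for algo in kernel_algorithm:
    parser.add_argument("--" + algo, default=None)

args = parser.parse_args(["--medium", "model_medium.p"])
paths_to_models = {algo: getattr(args, algo) for algo in kernel_algorithm}
print(paths_to_models)
# -> {'medium': 'model_medium.p', 'largeDB1': None, 'largeDB2': None}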
Example #2
def get_optimal_kernels(
    mnks_to_predict,
    njobs,
    chunk_size,
    paths_to_models,
    gpu_properties,
    autotuning_properties,
    top_k,
):
    # optimal_kernels_list is a list of dictionaries:
    # - keys: (m, n, k)
    # - values: Kernel object describing the best parameters
    # - number of elements in each dictionary = top_k
    # Each element of the list corresponds to the search for optimal kernels
    # for a given (m, n, k) and a given algorithm.
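    # (Hypothetical illustration of one list element:
    #    {(4, 4, 4): <Kernel object with the best predicted parameters
    #                 for algorithm 'medium' and m = n = k = 4>})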

    print("Getting optimal kernels")

    # ===============================================================================
    # Load predictive trees and feature list
    tree = dict()
    kernel_to_investigate = dict()
    for algo in kernel_algorithm.keys():
        path_to_model = paths_to_models[algo]
        if path_to_model is not None:
            print("Algorithm: {:<8}, loading model from: {}".format(
                algo, path_to_model))
            tree[algo] = dict()
            tree[algo]["file"] = path_to_model
            features, tree[algo]["tree"] = safe_pickle_load(tree[algo]["file"])
            tree[algo]["features"] = features.tolist()
            kernel_to_investigate[algo] = kernel_algorithm[algo]
        else:
            print("Algorithm: {:<8}, no model found.".format(algo))

    if len(kernel_to_investigate) == 0:
        print("No model found. Specify path to predictive models using ")
        sys.exit(1)

    # ===============================================================================
    # Build the list of (mnk, algorithm) pairs to compute:
    mnks_by_algo = list(product(mnks_to_predict, kernel_to_investigate.keys()))
    num_mnks_by_algo = len(mnks_by_algo)
    optimal_kernels_list = list()
    ckpt_folder_name = "predict_genpars_ckpt"

    if not os.path.exists(ckpt_folder_name):
        os.mkdir(ckpt_folder_name)
    print("Caching intermediate results to:", ckpt_folder_name)

    for i in range(0, num_mnks_by_algo, chunk_size):

        # Chunk up tasks
        start_chunk = i
        end_chunk = min(start_chunk + chunk_size, num_mnks_by_algo)
        print("Completed {:,} tasks out of {:,}".format(i, num_mnks_by_algo))

        # Create checkpoint file or load checkpointed data from it
        checkpoint_file_name = os.path.join(
            ckpt_folder_name,
            "chunk_{}-{}.json".format(start_chunk, end_chunk))
        if os.path.exists(checkpoint_file_name):
            with open(checkpoint_file_name, "r") as f:
                optimal_kernels_list__ = json.load(f)
                optimal_kernels_list_ = list()
                for idx, optker in enumerate(optimal_kernels_list__):
                    optimal_kernels_list_.append({})
                    for k, v in optker.items():
                        algo = v.pop("algorithm")
                        optimal_kernels_list_[idx][to_tuple(
                            k)] = kernel_algorithm[algo](**v)
            print("Read chunk {}-{}\n".format(start_chunk, end_chunk))

        else:

            if njobs == 1:

                # Ignore joblib and run serially:
                optimal_kernels_list_ = list()
                for mnk, algo in mnks_by_algo[start_chunk:end_chunk]:
                    gc.collect()
                    print("Find optimal kernels for mnk=", mnk, ", algo=",
                          algo)
                    optimal_kernels_list_.append(
                        find_optimal_kernel(
                            mnk,
                            algo,
                            tree[algo]["tree"],
                            tree[algo]["features"],
                            gpu_properties,
                            autotuning_properties,
                        ))
            else:

                # Run prediction tasks in parallel with joblib
                optimal_kernels_list_ = Parallel(n_jobs=njobs, verbose=2)(
                    delayed(find_optimal_kernel)(
                        mnk,
                        algo,
                        tree[algo]["tree"],
                        tree[algo]["features"],
                        gpu_properties,
                        autotuning_properties,
                    ) for mnk, algo in mnks_by_algo[start_chunk:end_chunk])

            optimal_kernels_list_ = remove_empty_entries(optimal_kernels_list_)
            with open(checkpoint_file_name, "w") as f:
                optimal_kernels_list__ = list()
                for idx, optker in enumerate(optimal_kernels_list_):
                    optimal_kernels_list__.append({})
                    for k, v in optker.items():
                        optimal_kernels_list__[idx][to_string(k)] = v.as_dict
                json.dump(optimal_kernels_list__, f)

        optimal_kernels_list += optimal_kernels_list_

    print("Finished gathering candidates for optimal parameter space")

    # Group optimal kernel candidates by (m,n,k) in a dictionary
    optimal_kernels_mnk_algo = dict()
    for optimal_kernel_mnk in optimal_kernels_list:
        for mnk, kernels_mnk in optimal_kernel_mnk.items():
            m, n, k = mnk
            optimal_kernels_mnk_algo.setdefault((m, n, k), []).append(
                kernels_mnk)
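    # (Hypothetical illustration: optimal_kernels_mnk_algo[(4, 4, 4)] may end
    # up holding one candidate Kernel per algorithm, e.g. one for 'medium'
    # and one for 'largeDB1'.)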

    # Find optimal kernel per mnk among the different algorithm possibilities
    optimal_kernels = dict()
    for mnk, candidate_kernels in optimal_kernels_mnk_algo.items():
        m, n, k = mnk
        optimal_kernel_mnk = sorted(candidate_kernels,
                                    key=lambda x: x.perf,
                                    reverse=True)[:top_k]
        optimal_kernels[(m, n, k)] = optimal_kernel_mnk[0]

    return optimal_kernels
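
The checkpointing above stores each chunk of results as JSON, which requires converting the (m, n, k) tuple keys to strings on write and back on read, since JSON object keys must be strings. Below is a minimal sketch of that round trip; the to_string/to_tuple helpers are stand-ins re-implemented purely for illustration (the real helpers are defined elsewhere in the tuning scripts):

import json
import os

def to_string(mnk):  # stand-in: (4, 4, 4) -> "4x4x4"
    return "x".join(str(x) for x in mnk)

def to_tuple(s):  # stand-in: "4x4x4" -> (4, 4, 4)
    return tuple(int(x) for x in s.split("x"))

ckpt = "chunk_0-2.json"
results = {(4, 4, 4): {"algorithm": "medium", "perf": 123.4}}  # hypothetical

with open(ckpt, "w") as f:  # write: tuple keys -> string keys
    json.dump({to_string(k): v for k, v in results.items()}, f)

with open(ckpt) as f:  # read: string keys -> tuple keys
    restored = {to_tuple(k): v for k, v in json.load(f).items()}
assert restored[(4, 4, 4)]["perf"] == 123.4
os.remove(ckpt)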
Example #3
def gen_benchmark(outdir, gpu_properties, autotuning_properties, compiler, m,
                  n, k):
    includes = []
    launcher_codes = []
    launchers = []
    kernel_descr = []
    indent = "  "
    file_extension = get_file_extension_from_compiler(compiler)

    # Get the kernel algorithms compatible with the given size:
    compatible_kernels = [
        kernel_algorithm[kernclass] for kernclass in kernel_algorithm.keys()
        if compatible_mnk(kernclass, m, n, k)
    ]

    # Get the parameter sets to measure for this (m,n,k)
    for kernclass in compatible_kernels:
        params = kernclass.promising_parameters(m, n, k, gpu_properties,
                                                autotuning_properties)
        if len(params) == 0:
            continue

        for p in params:
            kern = kernclass(**p, source="autotuning_candidate", perf=0)
            includes.append("../../kernels/" + kern.include)
            launcher_codes.append(kern.launcher_code(compiler))
            launchers.append("launch_" + kern.name)
            kernel_descr.append(kernclass.__name__ + format_params(p))

    print("Found %d parameter sets for %dx%dx%d" % (len(launchers), m, n, k))
    if len(launchers) == 0:
        return

    # Compose the "include" line of the benchmark code
    incl_output = '#include "../../kernels/smm_acc_common.h"\n'
    for i in set(includes):
        incl_output += '#include "%s"\n' % i
    incl_output += "\n\n"

    # Compose the benchmark code
    # The benchmark is broken down into:
    # - n_exe_files executables
    # - each executable is made up of n_obj_files object files
    # - each object file is made up of launchers_per_obj launchers
    # - each launcher launches 1 GPU kernel with a certain set of kernel parameters
    # The hipcc compiler is very slow -> generate a larger number of smaller executables.
    max_launchers_per_exe = 10000 if compiler == "nvcc" else 100
    launchers_per_obj = 100 if compiler == "nvcc" else 10
    n_exe_files = int(len(launcher_codes) / max_launchers_per_exe) + 1
    launchers_per_exe = int(len(launcher_codes) / n_exe_files) + 1
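    # (Hypothetical count: with 250 launchers under hipcc, n_exe_files
    # = int(250 / 100) + 1 = 3 and launchers_per_exe = int(250 / 3) + 1 = 84,
    # i.e. launcher chunks 0-84, 84-168 and 168-250.)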

    # Compose source code for each executable file
    for i in range(n_exe_files):
        chunk_a = i * launchers_per_exe
        chunk_b = min((i + 1) * launchers_per_exe, len(launcher_codes))
        n_obj_files = math.ceil((chunk_b - chunk_a) / launchers_per_obj)

        # Compose source code for each object file
        for j in range(n_obj_files):
            a = chunk_a + j * launchers_per_obj
            b = min(chunk_a + (j + 1) * launchers_per_obj, chunk_b)
            output = incl_output
            output += "\n\n".join(launcher_codes[a:b])
            fn = outdir + "/tune_%dx%dx%d_exe%d_part%d%s" % (
                m,
                n,
                k,
                i,
                j,
                file_extension,
            )
            writefile(fn, output)

        # Compose source code for "main" of executable file
        output = '#include "../../libsmm_acc_benchmark.h"\n\n'
        for j in range(chunk_b - chunk_a):
            output += ("int " + launchers[chunk_a + j] +
                       "(int *param_stack, int stack_size, ")
            if compiler == "nvcc":
                output += "cudaStream_t stream, "
            else:
                output += "hipStream_t stream, "
            output += ("int m_max, int n_max, int k_max," +
                       " double *a_data, double *b_data, double *c_data);\n")

        output += "\n"
        output += "int main(int argc, char** argv){\n"
        if compiler == "nvcc":
            output += (
                indent +
                "cudaError_t err = cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);\n"
            )
            output += indent + "if(err != cudaSuccess) return(-1);\n"
        else:  # i.e. compiler = hipcc
            output += (
                indent +
                "hipError_t err = hipDeviceSetSharedMemConfig(hipSharedMemBankSizeEightByte);\n"
            )
            output += indent + "if(err != hipSuccess) return(-1);\n"
        output += indent + "libsmm_acc_benchmark_t* handle;\n"
        output += indent + "KernelLauncher launchers[%d];\n" % (chunk_b -
                                                                chunk_a)
        output += indent + "char *kernel_descr[%d];\n" % (chunk_b - chunk_a)

        for j in range(chunk_b - chunk_a):
            output += indent + "launchers[%d]    = %s;\n" % (
                j, launchers[chunk_a + j])
            output += indent + 'kernel_descr[%d] = (char *) "%s";\n' % (
                j,
                kernel_descr[chunk_a + j],
            )
        output += indent + "libsmm_acc_benchmark_init(&handle, tune, %d, %d, %d);\n" % (
            m,
            n,
            k,
        )
        output += (
            indent +
            "int result = libsmm_acc_benchmark(handle, %d, %d, %d, %d, launchers, kernel_descr);\n"
            % (m, n, k, chunk_b - chunk_a))
        output += indent + "libsmm_acc_benchmark_finalize(handle);\n"
        output += indent + "return result;"
        output += "}\n"

        fn = outdir + "/tune_%dx%dx%d_exe%d_main%s" % (m, n, k, i,
                                                       file_extension)
        writefile(fn, output)
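
For reference, here is a minimal, self-contained sketch (not part of the original script) of the executable/object-file partitioning used above, with hypothetical launcher names standing in for the generated code:

import math

launchers = ["launch_kernel_%d" % i for i in range(250)]  # hypothetical
max_launchers_per_exe = 100  # hipcc settings, as above
launchers_per_obj = 10

n_exe_files = int(len(launchers) / max_launchers_per_exe) + 1
launchers_per_exe = int(len(launchers) / n_exe_files) + 1

for i in range(n_exe_files):
    chunk_a = i * launchers_per_exe
    chunk_b = min((i + 1) * launchers_per_exe, len(launchers))
    n_obj_files = math.ceil((chunk_b - chunk_a) / launchers_per_obj)
    print("exe %d: launchers %d-%d split over %d object files" %
          (i, chunk_a, chunk_b, n_obj_files))
# exe 0: launchers 0-84 split over 9 object files
# exe 1: launchers 84-168 split over 9 object files
# exe 2: launchers 168-250 split over 9 object files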