Пример #1
0
def test_complies_fortran_function_with_module():
    """Compile a Fortran function defined inside a module and run it.

    The kernel is compiled with gfortran through ``CFunctions`` and its
    result is expected to be close to 42.0. The ``my_fancy_module.mod`` file
    is deleted afterwards, whether or not the assertions passed.
    """
    kernel_string = """
    module my_fancy_module
    use iso_c_binding

    contains

    function my_test_function() result(time)
        use iso_c_binding
        real (c_float) :: time

        time = 42.0
    end function my_test_function

    end module my_fancy_module
    """
    kernel_name = "my_test_function"
    source = KernelSource(kernel_name, kernel_string, "C")
    instance = KernelInstance(kernel_name, source, kernel_string,
                              [], None, None, {}, [])

    try:
        with CFunctions(compiler="gfortran") as backend:
            compiled = backend.compile(instance)
            outcome = backend.run_kernel(compiled, [], (), ())

        assert np.isclose(outcome, 42.0)
    finally:
        #always remove the module file, even when the test fails
        util.delete_temp_file("my_fancy_module.mod")
Пример #2
0
def test_setup_device_targets_max(fake_results):
    """Check the generated header when "GFLOP/s" is used as the objective.

    Every result gets a "GFLOP/s" entry derived from its time. Results are
    stored and a device-targets header is generated twice: once for the
    fixture's problem size, and once for problem size 1000 after the
    (a=1, b=4) configuration has been made 100 time units slower. Both
    temporary files are deleted at the end.
    """
    results_filename = "temp_test_results_file.json"
    header_filename = "temp_test_header_file.h"
    kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results

    def store_and_generate(size):
        """Store the current results for ``size`` and return the new header text."""
        integration.store_results(results_filename, kernel_name, kernel_string,
                                  tune_params, size, results, env,
                                  top=3, objective="GFLOP/s")
        integration.create_device_targets(header_filename, results_filename,
                                          objective="GFLOP/s")
        with open(header_filename, 'r') as header:
            return header.read()

    #add GFLOP/s as metric
    for entry in results:
        entry['GFLOP/s'] = 1e5 / entry['time']

    try:
        header_text = store_and_generate(problem_size)
        for needle in ("TARGET_My_GPU", "#define a 1", "#define b 4"):
            assert needle in header_text

        #a second problem size for which a different configuration is the best one
        for entry in results:
            if entry['a'] == 1 and entry['b'] == 4:
                entry['time'] += 100
                entry['GFLOP/s'] = 1e5 / entry['time']

        header_text = store_and_generate(1000)
        expected = "\n".join(["TARGET_My_GPU", "#define a 1", "#define b 5"])
        assert expected in header_text

    finally:
        for temp_file in (results_filename, header_filename):
            util.delete_temp_file(temp_file)
Пример #3
0
def run_kernel(kernel_name, kernel_string, problem_size, arguments,
               params, grid_div_x=None, grid_div_y=None, grid_div_z=None,
               lang=None, device=0, platform=0, cmem_args=None, compiler_options=None,
               block_size_names=None, quiet=False):
    """Compile a single kernel instance, run it once and return its arguments.

    The keyword arguments are split into kernel options and device options
    based on the keys of ``_kernel_options`` and ``_device_options``. The
    arguments are checked and handed to the device interface, the kernel
    instance is created and compiled for ``params``, and the kernel is run.

    :returns: A list with one entry per element of ``arguments``: scalars are
        returned as passed in, every other argument is returned as a new
        array filled through ``memcpy_dtoh``.
    :rtype: list

    :raises Exception: when the kernel instance cannot be created, the kernel
        cannot be compiled, or running the kernel reports failure.
    """
    #snapshot of the call arguments, must be taken before other locals exist
    call_args = locals()
    kernel_options = Options([(key, call_args[key]) for key in _kernel_options.keys()])
    device_options = Options([(key, call_args[key]) for key in _device_options.keys()])

    #the device interface detects the language from the kernel string
    dev = core.DeviceInterface(kernel_string, iterations=1, **device_options)

    #validate the arguments and move them to the device
    util.check_argument_list(arguments)
    gpu_args = dev.ready_argument_list(arguments)

    instance = None
    try:
        instance = dev.create_kernel_instance(kernel_options, params, False)
        if instance is None:
            raise Exception("cannot create kernel instance, too many threads per block")

        func = dev.compile_kernel(instance, False)
        if func is None:
            raise Exception("cannot compile kernel, too much shared memory used")

        #constant memory arguments are added to the compiled module
        if cmem_args is not None:
            dev.copy_constant_memory_args(cmem_args)
    finally:
        #temp files are removed whether or not compilation succeeded
        if instance is not None:
            for temp_file in instance.temp_files.values():
                util.delete_temp_file(temp_file)

    launched = dev.run_kernel(func, gpu_args, instance)
    if not launched:
        raise Exception("runtime error occured, too many resources requested")

    #copy the non-scalar arguments back to the host
    results = []
    for index, host_arg in enumerate(arguments):
        if numpy.isscalar(host_arg):
            results.append(host_arg)
            continue
        host_copy = numpy.zeros_like(host_arg)
        results.append(host_copy)
        dev.memcpy_dtoh(host_copy, gpu_args[index])

    return results
Пример #4
0
    def compile_and_benchmark(self, gpu_args, params, kernel_options,
                              tuning_options):
        """ Compile and benchmark a kernel instance based on kernel strings and parameters

        :param gpu_args: Arguments passed on to the correctness check and the benchmark.
        :param params: Parameter values of the instance to compile.
        :param kernel_options: Options used to create the kernel instance; its
            ``cmem_args`` are copied to the compiled module when not None.
        :param tuning_options: Options providing ``verbose``, ``answer``,
            ``atol`` and ``verify``.

        :returns: The value returned by ``self.benchmark``, or None when the
            kernel instance could not be created or compiling returned None.

        :raises Exception: Any exception raised while compiling, checking or
            benchmarking is re-raised after the kernel string has been dumped
            to a temporary ``.c`` file. Temp files are kept in that case.
        """

        instance_string = util.get_instance_string(params)

        logging.debug('compile_and_benchmark ' + instance_string)
        mem_usage = round(
            resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024.0, 1)
        logging.debug('Memory usage : %2.2f MB', mem_usage)

        verbose = tuning_options.verbose

        instance = self.create_kernel_instance(kernel_options, params, verbose)
        if instance is None:
            return None

        #stays None when the kernel could not be compiled
        measured = None

        try:
            #compile the kernel
            func = self.compile_kernel(instance, verbose)

            #no early return on failure, so the temp files below still get cleaned up
            if func is not None:
                #add constant memory arguments to compiled module
                if kernel_options.cmem_args is not None:
                    self.dev.copy_constant_memory_args(kernel_options.cmem_args)

                #test kernel for correctness and benchmark
                if tuning_options.answer is not None:
                    self.check_kernel_correctness(func, gpu_args, instance,
                                                  tuning_options.answer,
                                                  tuning_options.atol,
                                                  tuning_options.verify, verbose)

                #benchmark
                measured = self.benchmark(func, gpu_args, instance, verbose)

        except Exception:
            #dump kernel_string to temp file
            temp_filename = util.get_temp_filename(suffix=".c")
            util.write_file(temp_filename, instance.kernel_string)
            #join all names with a space so that the first two are not fused together
            source_files = [temp_filename] + list(instance.temp_files.values())
            print("Error while compiling or benchmarking, see source files: " +
                  " ".join(source_files))
            raise

        #clean up any temporary files, if no error occured
        for v in instance.temp_files.values():
            util.delete_temp_file(v)

        return measured
Пример #5
0
    def compile_and_benchmark(self, gpu_args, params, kernel_options, tuning_options):
        """ Compile and benchmark a kernel instance based on kernel strings and parameters

        :param gpu_args: Arguments passed on to the output check and the benchmark.
        :param params: Parameter values of the instance to compile.
        :param kernel_options: Options used to create the kernel instance; its
            ``cmem_args`` and ``texmem_args`` are copied to the compiled module
            when not None.
        :param tuning_options: Options providing ``verbose``, ``answer``,
            ``atol``, ``verify`` and ``times``.

        :returns: The value returned by ``self.benchmark``, or None when the
            kernel instance could not be created or compiling returned None.

        :raises Exception: Any exception raised while compiling, checking or
            benchmarking is re-raised after the kernel string has been dumped
            to a temporary ``.c`` file. Temp files are kept in that case.
        """

        instance_string = util.get_instance_string(params)

        logging.debug('compile_and_benchmark ' + instance_string)
        mem_usage = round(resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024.0, 1)
        logging.debug('Memory usage : %2.2f MB', mem_usage)

        verbose = tuning_options.verbose

        instance = self.create_kernel_instance(kernel_options, params, verbose)
        if instance is None:
            return None

        #stays None when the kernel could not be compiled
        measured = None

        try:
            #compile the kernel
            func = self.compile_kernel(instance, verbose)

            #no early return on failure, so the temp files below still get cleaned up
            if func is not None:
                #add constant memory arguments to compiled module
                if kernel_options.cmem_args is not None:
                    self.dev.copy_constant_memory_args(kernel_options.cmem_args)
                #add texture memory arguments to compiled module
                if kernel_options.texmem_args is not None:
                    self.dev.copy_texture_memory_args(kernel_options.texmem_args)

                #test kernel for correctness and benchmark
                if tuning_options.answer is not None:
                    self.check_kernel_output(func, gpu_args, instance, tuning_options.answer, tuning_options.atol, tuning_options.verify, verbose)

                #benchmark
                measured = self.benchmark(func, gpu_args, instance, tuning_options.times, verbose)

        except Exception:
            #dump kernel_string to temp file
            temp_filename = util.get_temp_filename(suffix=".c")
            util.write_file(temp_filename, instance.kernel_string)
            #join all names with a space so that the first two are not fused together
            source_files = [temp_filename] + list(instance.temp_files.values())
            print("Error while compiling or benchmarking, see source files: " + " ".join(source_files))
            raise

        #clean up any temporary files, if no error occured
        for v in instance.temp_files.values():
            util.delete_temp_file(v)

        return measured
Пример #6
0
def test_PythonKernel_tuned(test_kernel):
    """Build a ``PythonKernel`` from a stored results file and call it.

    A results file with a single entry (the fixture's params plus a time of
    1.0) is written for a device called "bogus GPU". The kernel created from
    it is expected to pick up ``block_size_x == 384`` and to return ``a + b``
    as its first output. The results file is deleted at the end.
    """
    kernel_name, kernel_string, n, args, params = test_kernel
    c, a, b, n = args
    results_path = "test_results_file.json"

    #a single fake tuning result based on the fixture's parameters
    fake_result = params.copy()
    fake_result['time'] = 1.0
    fake_env = {"device_name": "bogus GPU"}

    try:
        integration.store_results(results_path, kernel_name, kernel_string,
                                  params, n, [fake_result], fake_env)

        #create a kernel using the stored results
        tuned_kernel = kernelbuilder.PythonKernel(kernel_name, kernel_string, n, args,
                                                  results_file=results_path)

        #the parameters should come from the results file
        assert tuned_kernel.params["block_size_x"] == 384

        #the kernel should also compute the right answer
        outputs = tuned_kernel(c, a, b, n)
        assert np.allclose(outputs[0], a+b)

    finally:
        util.delete_temp_file(results_path)
Пример #7
0
 def delete_temp_files(self):
     """Delete every file listed as a value in ``self.temp_files``"""
     tracked_files = self.temp_files
     for temp_path in tracked_files.values():
         util.delete_temp_file(temp_path)
Пример #8
0
def test_store_results(fake_results):
    """Exercise ``integration.store_results`` through several scenarios.

    Covers basic storing, adding a second problem size, adding a second
    device, and overwriting existing results using ``top=0.1``. The results
    file is deleted at the end.
    """
    filename = "temp_test_results_file.json"
    kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results

    def store(size, environment, top):
        """Store the current results for the given problem size and environment."""
        integration.store_results(filename, kernel_name, kernel_string,
                                  tune_params, size, results, environment,
                                  top=top)

    def load():
        """Return the data part of the results file."""
        _, data = integration._read_results_file(filename)
        return data

    def select(data, device, size):
        """Return the stored records for one device and problem size."""
        return [
            record for record in data
            if record["device_name"] == device and record["problem_size"] == size
        ]

    try:
        #test basic operation
        store(problem_size, env, 3)
        stored_data = load()
        assert len(select(stored_data, "My_GPU", "100")) == 3

        #test if results for a different problem_size values are added
        store(1000, env, 3)
        stored_data = load()
        assert len(select(stored_data, "My_GPU", "100")) == 3
        assert len(select(stored_data, "My_GPU", "1000")) == 3

        #test if results for a different GPU can be added
        store(problem_size, {"device_name": "Another GPU"}, 3)
        stored_data = load()
        assert len({record["device_name"] for record in stored_data}) == 2

        #test if overwriting results works
        for offset, result in enumerate(results):
            result["time"] = 50.0 + offset
        store(problem_size, env, 0.1)
        stored_data = load()

        my_gpu_100_data = select(stored_data, "My_GPU", "100")
        assert len(my_gpu_100_data) == 1
        assert my_gpu_100_data[0]["time"] < 100

    finally:
        util.delete_temp_file(filename)
Пример #9
0
def test_setup_device_targets(fake_results):
    """Check the header written by ``integration.create_device_targets``.

    Results are stored and the header is regenerated four times: for a single
    problem size, for a second problem size with a different best
    configuration, for a case where the expected configuration changes again
    after another time adjustment, and for a second device. Both temporary
    files are deleted at the end.
    """
    results_filename = "temp_test_results_file.json"
    header_filename = "temp_test_header_file.h"
    kernel_name, kernel_string, tune_params, problem_size, parameter_space, results, env = fake_results

    def store(size):
        """Store the current results and environment for the given problem size."""
        integration.store_results(results_filename, kernel_name, kernel_string,
                                  tune_params, size, results, env, top=3)

    def generate_header(**options):
        """Regenerate the header file and return its contents."""
        integration.create_device_targets(header_filename, results_filename, **options)
        with open(header_filename, 'r') as header:
            return header.read()

    def adjust_time(b_value, delta):
        """Add ``delta`` to the time of every result with a == 1 and b == b_value."""
        for entry in results:
            if entry['a'] == 1 and entry['b'] == b_value:
                entry['time'] += delta

    def target_block(target, b_value):
        """Build the consecutive header lines expected for one target."""
        return "\n".join([target, "#define a 1", "#define b %d" % b_value])

    try:
        store(problem_size)
        #results file
        #{'My_GPU': {'100': [{'a': 1, 'b': 4, 'time': 100.0}, {'a': 1, 'b': 5, 'time': 101.0}, {'a': 1, 'b': 6, 'time': 102.0}]}}

        output_str = generate_header()

        assert "#ifdef TARGET_My_GPU" in output_str
        assert "#define a 1" in output_str
        assert "#define b 4" in output_str

        #test output when more then one problem size is used, and best configuration is different
        adjust_time(4, 100)
        store(1000)
        output_str = generate_header(objective="time")
        assert target_block("TARGET_My_GPU", 5) in output_str

        #test output when more then one problem size is used, and best configuration depends on total time
        adjust_time(6, -3)
        store(1000)
        output_str = generate_header(objective="time")
        assert target_block("TARGET_My_GPU", 6) in output_str

        #test output when more then one GPU is used
        adjust_time(6, 3.1)
        env['device_name'] = "My_GPU2"
        store(1000)
        output_str = generate_header(objective="time")

        assert target_block("TARGET_My_GPU", 6) in output_str
        assert target_block("TARGET_My_GPU2", 5) in output_str
        assert target_block("#else /* default configuration */", 5) in output_str

    finally:
        util.delete_temp_file(results_filename)
        util.delete_temp_file(header_filename)
Пример #10
0
    def compile(self, kernel_name, kernel_string):
        """call the C compiler to compile the kernel, return the function

        The kernel string is written to a temporary source file, compiled to
        an object file, linked into a shared library and loaded. The temporary
        source, object and library files are deleted again before returning.

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The C code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: An ctypes function that can be called directly.
        :rtype: ctypes._FuncPtr

        :raises subprocess.CalledProcessError: when the compiler exits with a
            non-zero status.
        """
        logging.debug('compiling %s', kernel_name)

        #a library from an earlier call is still held, clean it up first
        if self.lib is not None:
            self.cleanup_lib()

        compiler_options = ["-fPIC"]

        #detect openmp
        if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
            logging.debug('set using_openmp to true')
            self.using_openmp = True
            if self.compiler == "pgfortran":
                compiler_options.append("-mp")
            else:
                compiler_options.append("-fopenmp")

        #detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
        if ("#include <cuda" in kernel_string) or ("__global__"
                                                   in kernel_string):
            if self.compiler == "g++" and self.nvcc_available:
                self.compiler = "nvcc"

        #select right suffix based on compiler
        suffix = ".cc"
        if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
            suffix = ".F90"
        if self.compiler == "nvcc":
            #turns ".cc" into ".cu"; host compiler flags are passed through -Xcompiler
            suffix = suffix[:-1] + "u"
            compiler_options = ["-Xcompiler=" + c for c in compiler_options]

        #true for ".cc" and ".cu", false for ".F90"
        if ".c" in suffix:
            if "extern \"C\"" not in kernel_string:
                kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

        #copy user specified compiler options to current list
        if self.compiler_options:
            compiler_options += self.compiler_options

        lib_args = []
        if "CL/cl.h" in kernel_string:
            lib_args = ["-lOpenCL"]

        logging.debug('using compiler %s', self.compiler)
        logging.debug('compiler_options %s', " ".join(compiler_options))
        logging.debug('lib_args %s', " ".join(lib_args))

        source_file = get_temp_filename(suffix=suffix)
        filename = ".".join(source_file.split(".")[:-1])

        #detect Fortran modules, the module name may contain digits after its first character
        match = re.search(r"\s*module\s+([a-zA-Z_][a-zA-Z0-9_]*)", kernel_string)
        if match:
            #build the symbol name that is looked up in the library for each compiler
            if self.compiler == "gfortran":
                kernel_name = "__" + match.group(1) + "_MOD_" + kernel_name
            elif self.compiler in ["ftn", "ifort"]:
                kernel_name = match.group(1) + "_mp_" + kernel_name + "_"
            elif self.compiler == "pgfortran":
                kernel_name = match.group(1) + "_" + kernel_name + "_"

        try:
            write_file(source_file, kernel_string)

            lib_extension = ".so"
            if platform.system() == "Darwin":
                lib_extension = ".dylib"

            subprocess.check_call([self.compiler, "-c", source_file] +
                                  compiler_options + ["-o", filename + ".o"])
            subprocess.check_call([self.compiler, filename + ".o"] +
                                  compiler_options +
                                  ["-shared", "-o", filename + lib_extension] +
                                  lib_args)

            self.lib = numpy.ctypeslib.load_library(filename, '.')
            func = getattr(self.lib, kernel_name)
            func.restype = C.c_float

        finally:
            #remove all files that may have been created, also when compiling failed
            delete_temp_file(source_file)
            delete_temp_file(filename + ".o")
            delete_temp_file(filename + ".so")
            delete_temp_file(filename + ".dylib")

        return func
Пример #11
0
    def compile(self, kernel_instance):
        """call the C compiler to compile the kernel, return the function

        The kernel string is written to a temporary source file, compiled to
        an object file, linked into a shared library and loaded. The temporary
        source, object and library files are deleted again before returning.

        :param kernel_instance: An object representing the specific instance of the tunable kernel
            in the parameter space.
        :type kernel_instance: kernel_tuner.core.KernelInstance

        :returns: An ctypes function that can be called directly.
        :rtype: ctypes._FuncPtr

        :raises subprocess.CalledProcessError: when the compiler exits with a
            non-zero status.
        """
        logging.debug('compiling %s', kernel_instance.name)

        kernel_string = kernel_instance.kernel_string
        kernel_name = kernel_instance.name

        #a library from an earlier call is still held, clean it up first
        if self.lib is not None:
            self.cleanup_lib()

        compiler_options = ["-fPIC"]

        #detect openmp
        if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
            logging.debug('set using_openmp to true')
            self.using_openmp = True
            if self.compiler == "pgfortran":
                compiler_options.append("-mp")
            else:
                compiler_options.append("-fopenmp")

        #if filename is known, use that one
        suffix = kernel_instance.kernel_source.get_user_suffix()

        #if code contains device code, suffix .cu is required
        device_code_signals = ["__global", "__syncthreads()", "threadIdx"]
        if any(snippet in kernel_string for snippet in device_code_signals):
            suffix = ".cu"

        #detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
        if (suffix == ".cu") or ("#include <cuda"
                                 in kernel_string) or ("cudaMemcpy"
                                                       in kernel_string):
            if self.compiler == "g++" and self.nvcc_available:
                self.compiler = "nvcc"

        if suffix is None:
            #select right suffix based on compiler
            suffix = ".cc"

            if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
                suffix = ".F90"

        if self.compiler == "nvcc":
            #host compiler flags are passed through -Xcompiler
            compiler_options = ["-Xcompiler=" + c for c in compiler_options]

        #this basically checks if we aren't compiling Fortran
        #at the moment any C, C++, or CUDA code is assumed to use extern "C" linkage
        if ".c" in suffix:
            if "extern \"C\"" not in kernel_string:
                kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

        #copy user specified compiler options to current list
        if self.compiler_options:
            compiler_options += self.compiler_options

        lib_args = []
        if "CL/cl.h" in kernel_string:
            lib_args = ["-lOpenCL"]

        logging.debug('using compiler %s', self.compiler)
        logging.debug('compiler_options %s', " ".join(compiler_options))
        logging.debug('lib_args %s', " ".join(lib_args))

        source_file = get_temp_filename(suffix=suffix)
        filename = ".".join(source_file.split(".")[:-1])

        #detect Fortran modules, the module name may contain digits after its first character
        match = re.search(r"\s*module\s+([a-zA-Z_][a-zA-Z0-9_]*)", kernel_string)
        if match:
            #build the symbol name that is looked up in the library for each compiler
            if self.compiler == "gfortran":
                kernel_name = "__" + match.group(1) + "_MOD_" + kernel_name
            elif self.compiler in ["ftn", "ifort"]:
                kernel_name = match.group(1) + "_mp_" + kernel_name + "_"
            elif self.compiler == "pgfortran":
                kernel_name = match.group(1) + "_" + kernel_name + "_"
        else:
            #for functions outside of modules
            if self.compiler in ["gfortran", "ftn", "ifort", "pgfortran"]:
                kernel_name = kernel_name + "_"

        try:
            write_file(source_file, kernel_string)

            lib_extension = ".so"
            if platform.system() == "Darwin":
                lib_extension = ".dylib"

            subprocess.check_call([self.compiler, "-c", source_file] +
                                  compiler_options + ["-o", filename + ".o"])
            subprocess.check_call([self.compiler, filename + ".o"] +
                                  compiler_options +
                                  ["-shared", "-o", filename + lib_extension] +
                                  lib_args)

            self.lib = numpy.ctypeslib.load_library(filename, '.')
            func = getattr(self.lib, kernel_name)
            func.restype = C.c_float

        finally:
            #remove all files that may have been created, also when compiling failed
            delete_temp_file(source_file)
            delete_temp_file(filename + ".o")
            delete_temp_file(filename + ".so")
            delete_temp_file(filename + ".dylib")

        return func
Пример #12
0
    def compile(self, kernel_name, kernel_string):
        """call the C compiler to compile the kernel, return the function

        The kernel string is written to a temporary source file, compiled to
        an object file, linked into a shared library and loaded. The temporary
        source, object and library files are deleted again before returning.

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The C code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: An ctypes function that can be called directly.
        :rtype: ctypes._FuncPtr

        :raises subprocess.CalledProcessError: when the compiler exits with a
            non-zero status.
        """
        #local import, only needed to select the shared library extension
        import platform

        logging.debug('compiling %s', kernel_name)

        #a library from an earlier call is still held, clean it up first
        if self.lib is not None:
            self.cleanup_lib()

        suffix = ".cc"

        if "extern \"C\"" not in kernel_string:
            kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

        compiler_options = ["-fPIC"]
        if "#include <omp.h>" in kernel_string:
            logging.debug('set using_openmp to true')
            self.using_openmp = True
            compiler_options.append("-fopenmp")

        #switch to nvcc when the code looks like CUDA and nvcc is available
        if ("#include <cuda" in kernel_string) or ("__global__" in kernel_string):
            if self.nvcc_available:
                self.compiler = "nvcc"

        if self.compiler == "nvcc":
            #turns ".cc" into ".cu"; host compiler flags are passed through -Xcompiler
            suffix = suffix[:-1] + "u"
            compiler_options = ["-Xcompiler=" + c for c in compiler_options]

        #copy user specified compiler options to current list
        if self.compiler_options:
            compiler_options += self.compiler_options

        lib_args = []
        if "CL/cl.h" in kernel_string:
            lib_args = ["-lOpenCL"]

        logging.debug('using compiler %s', self.compiler)
        logging.debug('compiler_options %s', " ".join(compiler_options))
        logging.debug('lib_args %s', " ".join(lib_args))

        source_file = get_temp_filename(suffix=suffix)
        filename = ".".join(source_file.split(".")[:-1])

        #load_library is called without an extension and looks for a
        #platform-specific one, which is .dylib rather than .so on macOS
        lib_extension = ".so"
        if platform.system() == "Darwin":
            lib_extension = ".dylib"

        try:
            write_file(source_file, kernel_string)

            subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename+".o"])
            subprocess.check_call([self.compiler, filename+".o"] + compiler_options + ["-shared", "-o", filename+lib_extension] + lib_args)

            self.lib = numpy.ctypeslib.load_library(filename, '.')

            func = getattr(self.lib, kernel_name)
            func.restype = C.c_float

        finally:
            #remove all files that may have been created, also when compiling failed
            delete_temp_file(source_file)
            delete_temp_file(filename+".o")
            delete_temp_file(filename+lib_extension)


        return func
Пример #13
0
    def compile(self, kernel_name, kernel_string):
        """call the C compiler to compile the kernel, return the function

        The kernel string is written to a temporary source file, compiled to
        an object file, linked into a shared library and loaded. The temporary
        source, object and library files are deleted again before returning.

        :param kernel_name: The name of the kernel to be compiled, used to lookup the
            function after compilation.
        :type kernel_name: string

        :param kernel_string: The C code that contains the function `kernel_name`
        :type kernel_string: string

        :returns: An ctypes function that can be called directly.
        :rtype: ctypes._FuncPtr

        :raises subprocess.CalledProcessError: when the compiler exits with a
            non-zero status.
        """
        logging.debug('compiling %s', kernel_name)

        #a library from an earlier call is still held, clean it up first
        if self.lib is not None:
            self.cleanup_lib()

        compiler_options = ["-fPIC"]

        #detect openmp
        if "#include <omp.h>" in kernel_string or "use omp_lib" in kernel_string:
            logging.debug('set using_openmp to true')
            self.using_openmp = True
            if self.compiler == "pgfortran":
                compiler_options.append("-mp")
            else:
                compiler_options.append("-fopenmp")

        #select right suffix based on compiler
        suffix = ".cc"

        #detect whether to use nvcc as default instead of g++, may overrule an explicitly passed g++
        if ("#include <cuda" in kernel_string) or ("cudaMemcpy" in kernel_string):
            if self.compiler == "g++" and self.nvcc_available:
                self.compiler = "nvcc"

        #if contains device code suffix .cu is required by nvcc
        if self.compiler == "nvcc" and "__global__" in kernel_string:
            suffix = ".cu"
        if self.compiler in ["gfortran", "pgfortran", "ftn", "ifort"]:
            suffix = ".F90"

        if self.compiler == "nvcc":
            #host compiler flags are passed through -Xcompiler
            compiler_options = ["-Xcompiler=" + c for c in compiler_options]

        #true for ".cc" and ".cu", false for ".F90"
        if ".c" in suffix:
            if "extern \"C\"" not in kernel_string:
                kernel_string = "extern \"C\" {\n" + kernel_string + "\n}"

        #copy user specified compiler options to current list
        if self.compiler_options:
            compiler_options += self.compiler_options

        lib_args = []
        if "CL/cl.h" in kernel_string:
            lib_args = ["-lOpenCL"]

        logging.debug('using compiler %s', self.compiler)
        logging.debug('compiler_options %s', " ".join(compiler_options))
        logging.debug('lib_args %s', " ".join(lib_args))

        source_file = get_temp_filename(suffix=suffix)
        filename = ".".join(source_file.split(".")[:-1])

        #detect Fortran modules, the module name may contain digits after its first character
        match = re.search(r"\s*module\s+([a-zA-Z_][a-zA-Z0-9_]*)", kernel_string)
        if match:
            #build the symbol name that is looked up in the library for each compiler
            if self.compiler == "gfortran":
                kernel_name = "__" + match.group(1) + "_MOD_" + kernel_name
            elif self.compiler in ["ftn", "ifort"]:
                kernel_name = match.group(1) + "_mp_" + kernel_name + "_"
            elif self.compiler == "pgfortran":
                kernel_name = match.group(1) + "_" + kernel_name + "_"


        try:
            write_file(source_file, kernel_string)

            lib_extension = ".so"
            if platform.system() == "Darwin":
                lib_extension = ".dylib"

            subprocess.check_call([self.compiler, "-c", source_file] + compiler_options + ["-o", filename + ".o"])
            subprocess.check_call([self.compiler, filename + ".o"] + compiler_options + ["-shared", "-o", filename + lib_extension] + lib_args)


            self.lib = numpy.ctypeslib.load_library(filename, '.')
            func = getattr(self.lib, kernel_name)
            func.restype = C.c_float

        finally:
            #remove all files that may have been created, also when compiling failed
            delete_temp_file(source_file)
            delete_temp_file(filename+".o")
            delete_temp_file(filename+".so")
            delete_temp_file(filename+".dylib")

        return func