def verify_single_dump(input_path, max_block_threads):
    print(input_path)
    kernel_name = path.basename(input_path).split("_", 1)[1]
    with open(path.join(input_path, "launch.txt"), "r") as launch_f:
        launch_lines = list(map(int, launch_f.readlines()))
    block = tuple(launch_lines[3:6])
    launch_block_size = block[0] * block[1] * block[2]
    if launch_block_size > max_block_threads:
        print(
            f"  Skipping, launch block size ({launch_block_size}) bigger "
            f"than maximum block size ({max_block_threads})"
        )
        return
    module = drv.module_from_file(path.join(input_path, "module.ptx"))
    kernel = module.get_function(kernel_name)
    pre_args = append_debug_buffer(parse_arguments(input_path, "pre"))
    kernel_pre_args, host_pre_args = zip(*pre_args)
    kernel(*kernel_pre_args, grid=tuple(launch_lines[:3]), block=block,
           shared=launch_lines[6])
    post_args = parse_arguments(input_path, "post")
    _, host_post_args = zip(*post_args)
    for idx, (pre_arg, post_arg) in enumerate(zip(host_pre_args, host_post_args)):
        if pre_arg is None:
            continue
        try:
            assert_array_equal_override(kernel_name, idx, pre_arg, post_arg)
        except Exception as e:
            print(f"{idx}: {e}")
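# A minimal driver sketch for verify_single_dump, assuming a hypothetical
# "dumps/" directory of "<prefix>_<kernel_name>" subdirectories and that a
# CUDA context already exists; none of this is in the original source.
import os
from os import path

import pycuda.autoinit
import pycuda.driver as drv

max_threads = pycuda.autoinit.device.get_attribute(
    drv.device_attribute.MAX_THREADS_PER_BLOCK)
for entry in sorted(os.listdir("dumps")):
    dump_dir = path.join("dumps", entry)
    if path.isdir(dump_dir):
        verify_single_dump(dump_dir, max_threads)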
def __init__(self, _gpu_num=0):
    if GPUDev.__instance is not None:
        raise Exception("The GPUDev class is a singleton!")
    else:
        GPUDev.__instance = self

    from pycuda import driver as drv
    drv.init()
    self.id = _gpu_num
    self.dev = drv.Device(self.id)
    self.ctx = self.dev.make_context()

    this_dir = os.path.dirname(os.path.realpath(__file__)) + "/"
    if precision.num == 1:
        self.mod = drv.module_from_file(os.path.join(
            this_dir, '../gpu/cuda_kernels/kernels_single.cubin'))
    else:
        self.mod = drv.module_from_file(os.path.join(
            this_dir, '../gpu/cuda_kernels/kernels_double.cubin'))
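# Hedged usage sketch: GPUDev.__instance is assumed to be a class attribute
# initialised to None elsewhere in the class, and "kick" is a hypothetical
# kernel name standing in for whatever kernels_single/double.cubin export.
gpu_dev = GPUDev(_gpu_num=0)
kick_kernel = gpu_dev.mod.get_function("kick")  # hypothetical kernel name
gpu_dev.ctx.pop()  # release the context when finished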
def gfx_init(self):
    try:
        print 'compiling'
        self.prog = sh.compile_program_vfg('shad/balls')
        print 'compiled'
        self.loc_mmv = sh.get_loc(self.prog, 'modelview')
        self.loc_mp = sh.get_loc(self.prog, 'projection')
        self.l_color = sh.get_loc(self.prog, 'color')
        self.l_size = sh.get_loc(self.prog, 'ballsize')
    except ValueError as ve:
        print "Shader compilation failed: " + str(ve)
        sys.exit(0)

    # cuda init
    self.grid = (int(self.BOX), int(self.BOX))
    self.block = (1, 1, int(self.BOX))
    print 'CUDA: block %s , grid %s' % (str(self.block), str(self.grid))

    floatbytes = np.dtype(np.float32).itemsize
    self.gpos = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, self.gpos)
    glBufferData(GL_ARRAY_BUFFER, self.pos.nbytes, self.pos, GL_STREAM_DRAW)
    glBindBuffer(GL_ARRAY_BUFFER, 0)

    self.df1 = cuda_driver.mem_alloc(self.f.nbytes)
    self.df2 = cuda_driver.mem_alloc(self.f.nbytes)
    cuda_driver.memcpy_htod(self.df1, self.f)
    cuda_driver.memset_d32(self.df2, 0, self.NUM * self.Q)

    mod = cuda_driver.module_from_file('lbm_kernel.cubin')
    self.collision = mod.get_function("collision_step")
    self.collision.prepare("Piii")
    self.streaming = mod.get_function("streaming_step")
    self.streaming.prepare("PPiii")
    self.colors = mod.get_function("colors")
    self.colors.prepare("PPiii")
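# Hedged sketch of one LBM timestep using the prepared kernels above; the
# argument order (source buffer, destination buffer, lattice dimensions) is
# an assumption, since lbm_kernel.cu itself is not shown here.
def step(self):
    n = int(self.BOX)
    self.collision.prepared_call(self.grid, self.block,
                                 self.df1, n, n, n)             # "Piii"
    self.streaming.prepared_call(self.grid, self.block,
                                 self.df1, self.df2, n, n, n)   # "PPiii"
    self.df1, self.df2 = self.df2, self.df1  # ping-pong the buffers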
def cuda_init(self):
    mod = cuda_driver.module_from_file('solid_kernel.cubin')
    self.cerr = cuda_driver.mem_alloc(4)
    self.cut = mod.get_function("cut_x")
    self.cut.prepare("PPPifiiiiiffP")
    self.fill_v = mod.get_function("fill_v")
    self.fill_v.prepare("Piifff")
    self.fill_n = mod.get_function("fill_n")
    self.fill_n.prepare("Piifff")
def load_module_new(module_name, module_file, nvcc_options,
                    nvcc_include_dirs, cubin_cache_enable):
    cu_hexhash = hashlib.md5(bytearray(module_file, 'utf-8')).hexdigest()
    cache_path = "cubin_cache/" + str(module_name) + ".txt"
    cu_hexhash_from_file = ''
    if not os.path.exists(cache_path):
        with open(cache_path, 'w+') as cache_file:
            cache_file.write(cu_hexhash)
    else:
        with open(cache_path, 'r') as cache_file:
            cu_hexhash_from_file = cache_file.read()

    # "and" (not "&") so the isfile() check short-circuits correctly
    if (cu_hexhash_from_file == cu_hexhash and
            os.path.isfile("cubin/" + cu_hexhash_from_file + "_cubin.cubin") and
            cubin_cache_enable):
        print("Load cached %s kernel!" % module_name)
        return drv.module_from_file("cubin/" + cu_hexhash + "_cubin.cubin")

    if os.path.isfile("cubin/" + cu_hexhash_from_file + "_cubin.cubin"):
        os.remove("cubin/" + cu_hexhash_from_file + "_cubin.cubin")
    with open(cache_path, 'w') as cache_file:
        cache_file.write(cu_hexhash)
    print("Caching %s kernel!" % module_name)
    cubin = pycuda.compiler.compile(module_file, options=nvcc_options,
                                    include_dirs=nvcc_include_dirs,
                                    cache_dir=None)
    save_cubin(cubin, "cubin/" + cu_hexhash + "_cubin.cubin")
    return drv.module_from_file("cubin/" + cu_hexhash + "_cubin.cubin")
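# Hedged usage sketch for load_module_new: compile a trivial kernel source
# with caching enabled. The "cubin/" and "cubin_cache/" directories are
# assumed to already exist, since the function above does not create them.
kernel_src = """
__global__ void scale(float *x, float a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
"""
module = load_module_new("scale", kernel_src, nvcc_options=[],
                         nvcc_include_dirs=[], cubin_cache_enable=True)
scale = module.get_function("scale")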
def load_cuda():
    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            print(f"{ce}")
            raise  # without a cubin, module_from_file below would fail anyway
    return cuda.module_from_file(str(path))
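# Hedged usage sketch: load_cuda() compiles the module-level source string
# "mod" on first use and reuses the cached cubin afterwards; the kernel name
# is borrowed from the sibling loader further below, not guaranteed here.
module = load_cuda()
kernel = module.get_function("update_individuals")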
def get_kernel(kernel_name):
    kernel_spec = kernels[kernel_name]
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        if p[0:4] == "int ":
            sig += "I"
        elif p[0:6] == "float ":
            sig += "f"
        else:
            sig += "P"
    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func = module.get_function(kernel_name)
    func.prepare(sig)
    return func
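# Hedged usage sketch: assume a hypothetical "kernels" table entry "scale"
# whose parameter list yields the signature "IfP" (int, float, pointer);
# the launch shape below is likewise illustrative.
import numpy as np
import pycuda.driver as drv

scale = get_kernel("scale")
x_gpu = drv.to_device(np.ones(1024, dtype=np.float32))
scale.prepared_call((4, 1), (256, 1, 1),        # grid, block
                    np.int32(1024), np.float32(2.0), x_gpu)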
def init_cubin():
    from pycuda import driver
    from pycuda import gpuarray
    from pycuda import autoinit
    global context, kComputeMatMult

    device = driver.Device(gpu_id)
    context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD)
    stream = driver.Stream()
    mod = driver.module_from_file(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'mat_mult.cubin'))
    kComputeMatMult = mod.get_function("kComputeMatMult")
    kComputeMatMult.prepare([np.int32, np.int32, np.int32, 'P', 'P', 'P'])
    if context:
        context.pop()
    print 'init ok'
def load_cuda_code_individual():
    global _update_individuals_fn
    md5 = hashlib.md5()
    md5.update(mod.encode("utf-8"))
    filename = md5.hexdigest() + ".cubin"
    path = pathlib.Path(__file__).resolve().parent / filename
    if not path.exists():
        try:
            cubin = compiler.compile(mod, no_extern_c=True)
            with open(str(path), "wb") as handle:
                handle.write(cubin)
        except cuda.CompileError as ce:
            print(f"{ce}")
            raise  # without a cubin, module_from_file below would fail anyway
    _cuda_module = cuda.module_from_file(str(path))
    _update_individuals_fn = _cuda_module.get_function("update_individuals")
def __init__(self, kernel_set="fgemm_int64_wide32", locks=1024,
             calc_partials=True, bench=False):
    m = re.search(r'wide(\d+)', kernel_set)
    if m:
        self.width = int(m.group(1))
    else:
        raise ValueError("Invalid kernel_set")
    self.locks = locks
    self.module = drv.module_from_file("kernels/" + kernel_set + ".cubin")
    self.mode = 0 if calc_partials else 4

    self.fgemm = dict()
    for op in ("nt", "nn", "tn"):
        func = self.module.get_function(kernel_set + "_" + op)
        func.prepare("PPPIIIIIIHH")
        self.fgemm[op] = func

    fprop_conv = self.module.get_function("fprop_conv_float32_K64N64T64")
    fprop_conv.prepare("PPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
    self.fgemm["fprop_conv"] = fprop_conv

    bprop_conv = self.module.get_function("bprop_conv_float32_CRST64N64T64")
    bprop_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
    self.fgemm["bprop_conv"] = bprop_conv

    update_conv = self.module.get_function("update_conv_float32_CRST64K64T64")
    update_conv.prepare("PPPPIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIII")
    self.fgemm["update_conv"] = update_conv

    self.gpulock = drv.mem_alloc(locks * 4)
    drv.memset_d32(self.gpulock, 0, locks)

    self.bench = bench
    if bench:
        self.start = drv.Event()
        self.end = drv.Event()
def __init__(self, units, in_units, **kwargs):
    super(GPUMatMultLayer, self).__init__(**kwargs)
    gpu_id = 0
    device = driver.Device(gpu_id)
    self.context = device.make_context(flags=driver.ctx_flags.SCHED_YIELD)
    self.stream = driver.Stream()
    mod = driver.module_from_file(
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'mat_mult.cubin'))
    self.kComputeMatMult = mod.get_function("kComputeMatMult")
    self.kComputeMatMult.prepare(
        [np.int32, np.int32, np.int32, 'P', 'P', 'P'])
    self.feature_dim = in_units
    self.output_dim = units
    with self.name_scope():
        self.weight = self.params.get('weight', shape=(in_units, units))
        self.bias = self.params.get('bias', shape=(units,))
    if self.context:
        self.context.pop()
def get_kernel(kernel_name):
    kernel_spec = kernels[kernel_name]
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        else:
            sig += "I"
    module = drv.module_from_file(
        os.path.join(cubin_dir, kernel_name + ".cubin"))
    func = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    return func
def loadKernel():
    module = drv.module_from_file("ssgls.ptx")
    glsKernel = module.get_function("doGLS_kernel")
    return module, glsKernel
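# Hedged usage sketch: keep a reference to the module as well as the kernel
# so the module is not garbage-collected while the kernel is still in use.
# doGLS_kernel's real argument list is defined by ssgls.ptx and not shown
# here, so only a metadata query is demonstrated.
module, glsKernel = loadKernel()
print(glsKernel.num_regs)  # per-thread register usage of the loaded kernel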
def get_kernel(base_name, options=None):
    major, minor = _get_compute_capability()
    if major < 5:
        raise RuntimeError("sass kernels require Maxwell or greater class hardware")
    arch = "sm_%d%d" % (major, minor)

    libprefix = "PERL5LIB=%s" % maxas_dir
    maxas_i = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -i -w"]
    maxas_p = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -p"]

    kernel_spec = kernels[base_name]
    kernel_name = base_name

    # static options
    if "args" in kernel_spec:
        for pair in kernel_spec["args"].items():
            maxas_i.append("-D%s %s" % pair)
            maxas_p.append("-D%s %s" % pair)

    # dynamic options
    if options is not None:
        for opt in options:
            if type(opt) is tuple:
                maxas_i.append("-D%s %s" % opt)
                maxas_p.append("-D%s %s" % opt)
                kernel_name += "_%s%s" % opt
            else:
                maxas_i.append("-D%s 1" % opt)
                maxas_p.append("-D%s 1" % opt)
                kernel_name += "_%s" % opt

    maxas_i.insert(2, "-k " + kernel_name)

    sass_name = kernel_spec["sass"] + ".sass"
    cubin_name = kernel_name + ".cubin"
    cubin_dir = _get_cache_dir([arch, 'cubin'])

    ptx_version = "4.2" if major < 6 else "5.0"
    ptx_file = get_ptx_file(kernel_spec, kernel_name, arch, ptx_version)
    sass_file = os.path.join(sass_dir, sass_name)
    cubin_file = os.path.join(cubin_dir, cubin_name)

    if not os.path.exists(sass_file):
        raise RuntimeError("Missing: %s for kernel: %s" % (sass_name, kernel_name))

    ptx_mtime = os.path.getmtime(ptx_file)
    cubin_mtime = os.path.getmtime(cubin_file) if os.path.exists(cubin_file) else 0

    build_cubin = False
    if ptx_mtime > cubin_mtime:
        build_cubin = True

    includes = extract_includes(sass_name)
    for include, include_mtime in includes:
        if include_mtime > cubin_mtime:
            build_cubin = True
            break

    if build_cubin:
        # build the cubin and run maxas in the same command: we don't want a
        # generated cubin left unprocessed by maxas (e.g. if the user hits ^C
        # between the two steps)
        run_command(["ptxas -v -arch", arch, "-o", cubin_file, ptx_file, ";"] +
                    maxas_i + [sass_file, cubin_file])
        cubin_mtime = time.time()

    # output preprocessed and disassembled versions in debug mode
    if debug:
        pre_dir = _get_cache_dir([arch, 'pre'])
        dump_dir = _get_cache_dir([arch, 'dump'])
        pre_file = os.path.join(pre_dir, kernel_name + "_pre.sass")
        dump_file = os.path.join(dump_dir, kernel_name + "_dump.sass")
        pre_mtime = os.path.getmtime(pre_file) if os.path.exists(pre_file) else 0
        dump_mtime = os.path.getmtime(dump_file) if os.path.exists(dump_file) else 0

        for include, include_mtime in includes:
            if include_mtime > pre_mtime:
                run_command(maxas_p + [sass_file, pre_file])
                break

        if cubin_mtime > dump_mtime:
            run_command(["nvdisasm -c", cubin_file, ">", dump_file])

    # generate the function signature for pycuda
    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        elif ptype == 'unsigned':
            sig += "I"
        else:
            sig += "i"

    module = drv.module_from_file(cubin_file)
    func = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    func.name = kernel_name
    func.static_shared = eval(kernel_spec["share"])
    return func
def test_check_and_build_cu():
    ''' check_and_build(): cuda '''
    import yaml
    import os
    import pycuda.driver as cuda
    import atexit
    from build import check_and_make_parameter_header, check_and_build, clean

    code_type = 'cu'
    dpath = join(current_dpath, 'src')
    src_dir = {'f90': 'f90', 'c': 'c', 'cu': 'cuda', 'cl': 'opencl'}[code_type]
    build_dpath = join(dpath, src_dir, 'build')
    with open(join(dpath, 'build.yaml'), 'r') as f:
        build_dict = yaml.load(f)

    #
    # Remove previously generated files
    #
    ret, out, err = capture(clean)(code_type, dpath)
    equal(len(os.listdir(build_dpath)), 0)

    #
    # Make and compile the header file;
    # verify stdout and file existence
    #
    ret, out, err = capture(check_and_make_parameter_header)(code_type, dpath)
    ret, out, err = capture(check_and_build)(code_type, dpath)
    expect = '''
[compile] amb.cu using the PyCUDA build
[compile] apb.cu using the PyCUDA build
'''
    equal('\n' + out + '\n', expect)

    #
    # PyCUDA environment; load modules
    #
    cuda.init()
    device = cuda.Device(0)
    context = device.make_context()
    atexit.register(context.pop)
    lib_apb = cuda.module_from_file(join(build_dpath, 'apb.cubin'))
    lib_amb = cuda.module_from_file(join(build_dpath, 'amb.cubin'))
    apb = lib_apb.get_function('apb')
    amb = lib_amb.get_function('amb')

    #
    # setup
    #
    nx = 1000000
    a = np.random.rand(nx)
    b = np.random.rand(nx)
    c = np.random.rand(nx)
    c2 = c.copy()
    a_dev = cuda.to_device(a)
    b_dev = cuda.to_device(b)
    c_dev = cuda.to_device(c)
    c2_dev = cuda.to_device(c2)

    with open(join(dpath, 'apb.yaml'), 'r') as f:
        apb_dict = yaml.load(f)
    with open(join(dpath, 'amb.yaml'), 'r') as f:
        amb_dict = yaml.load(f)
    kk = apb_dict['kk']
    lll = apb_dict['lll']
    mm = amb_dict['section']['mm']
    ref = kk*a + lll*b + mm*c

    #
    # verify results
    #
    apb(np.int32(0), np.int32(nx), a_dev, b_dev, c_dev,
        block=(512, 1, 1), grid=(nx//512 + 1, 1))
    cuda.memcpy_dtoh(c, c_dev)
    aa_equal(ref, c, 14)

    amb(np.int32(0), np.int32(nx), a_dev, b_dev, c2_dev,
        block=(512, 1, 1), grid=(nx//512 + 1, 1))
    cuda.memcpy_dtoh(c2, c2_dev)
    aa_equal(ref, c2, 14)

    #
    # verify stdout after revision
    #
    ret, out, err = capture(check_and_build)(code_type, dpath)
    expect = '''
./cuda/build/amb.cu is up to date.
./cuda/build/apb.cu is up to date.
'''
    equal('\n' + out.replace(dpath, '.') + '\n', expect)

    #
    # verify stdout after partial revision
    #
    os.remove(join(build_dpath, 'amb.cu'))
    ret, out, err = capture(check_and_build)(code_type, dpath)
    expect = '''
[compile] amb.cu using the PyCUDA build
./cuda/build/apb.cu is up to date.
'''
    equal('\n' + out.replace(dpath, '.') + '\n', expect)
def setup(self):
    self.mod = cuda.module_from_file('diffusion_kernel.cubin')
    self.func = self.mod.get_function("temperature_update16x16")
#!/usr/bin/env python2.7
# gpu_trunc_norm.py
# Author: Nick Ulle

from __future__ import division
from math import ceil, sqrt

import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

# ----- PTX Module Setup
# Import the PTX module and set up its functions.
_gpu_module = cuda.module_from_file('bin/gpu_trunc_norm.ptx')
_gpu_trunc_norm = _gpu_module.get_function('gpu_trunc_norm')
_gpu_curand_init = _gpu_module.get_function('gpu_curand_init')
_gpu_curand_deinit = _gpu_module.get_function('gpu_curand_deinit')

# ----- Globals
# A pointer to the curand RNG states.
_rng_state = None

# The total number of threads per block.
# This should match NUM_RNG in gpu_trunc_norm.cu
_NUM_THREADS = 128

# ----- Functions
def gpu_trunc_norm(n, mean, sd, a, b):
    blocks = int(ceil(sqrt(n / _NUM_THREADS)))
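    # What follows is a hedged sketch of how the function might continue;
    # the original body is cut off above, and the kernel's argument order
    # (n, mean, sd, a, b, output) is an assumption, not taken from
    # gpu_trunc_norm.cu.
    grid = (blocks, blocks)
    out = np.empty(n, dtype=np.float32)
    _gpu_trunc_norm(np.int32(n),
                    cuda.In(np.asarray(mean, dtype=np.float32)),
                    cuda.In(np.asarray(sd, dtype=np.float32)),
                    cuda.In(np.asarray(a, dtype=np.float32)),
                    cuda.In(np.asarray(b, dtype=np.float32)),
                    cuda.Out(out),
                    block=(_NUM_THREADS, 1, 1), grid=grid)
    return out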
def calc_holo(scatterer, schema):
    # Extract the necessary components from schema
    imsize1 = uint32(schema.shape[0])
    imsize2 = uint32(schema.shape[1])
    pxsize = float64(schema.spacing[0])
    wavevec = float64(schema.optics.wavevec)
    med_wavelen = float64(schema.optics.med_wavelen)
    einc = schema.optics.polarization

    start = time()

    # Extract the necessary components from the scatterer
    # and calculate the scattering coefficients
    if isinstance(scatterer, Sphere):
        # Single-sphere case
        asbs = scat_coeffs(scatterer, schema.optics)
        num = uint32(1)
        sphLocation = scatterer.center
        x = float64(array([sphLocation[0]]))
        y = float64(array([sphLocation[1]]))
        z = float64(array([sphLocation[2]]))
        r = float64(array(scatterer.r))
        n = array(scatterer.n, dtype=complex)
    else:
        # Multi-sphere case
        asbs = scat_coeffs(scatterer.scatterers[0], schema.optics)
        num = uint32(len(scatterer.get_component_list()))
        sphLocation = scatterer.centers
        x = float64(array(sphLocation[:, 0]))
        y = float64(array(sphLocation[:, 1]))
        z = float64(array(sphLocation[:, 2]))
        r = float64(array(scatterer.r))
        n = array(scatterer.n, dtype=complex)

    npoints = uint32(imsize1 * imsize2)
    _, nstop = uint32(asbs.shape)

    # 2D array for storing the final hologram
    holo = float64(zeros([imsize1, imsize2]))

    #####################################################
    # Inputs to the kernel call:
    # Python var    Size             Py type    GPU type
    # npoints       1                np.uint32  unsigned int
    # asbs          2 x nstop        complex    dcmplx
    # nstop         1                np.uint32  unsigned int
    # imsize1       1                np.uint32  unsigned int
    # imsize2       1                np.uint32  unsigned int
    # pxsize        1                float64    double
    # x,y,z,r,n     1 x num          float64    double
    # wavevec       1                float64    double
    # med_wavelen   1                float64    double
    # num           1                np.uint32  unsigned int
    # einc          2                float64    double
    # holo          imsize x imsize  float64    double
    #####################################################

    # kernel_holo replaces asm_fullradial, calc_scat_field, and fieldstocart.
    # Uncomment to compile and cache the module:
    # kernel_holo = cuda_compile(kernel_holo_source, "kernel_holo")
    # Load from the cached file "precompiled.cubin":
    source_module = cu.module_from_file("precompiled.cubin")
    kernel_holo = source_module.get_function("kernel_holo")

    asbs_d = gpu.to_gpu(asbs.copy())
    einc_d = gpu.to_gpu(einc)
    holo_d = gpu.to_gpu(holo)
    x_d = gpu.to_gpu(x)
    y_d = gpu.to_gpu(y)
    z_d = gpu.to_gpu(z)
    r_d = gpu.to_gpu(r)
    n_d = gpu.to_gpu(n.copy())

    nblocks = 2**6
    blocksize = (nblocks, 1, 1)
    gridsize = (int((npoints / nblocks) + (npoints % nblocks)), 1)

    start_gpu_time = cu.Event()
    end_gpu_time = cu.Event()
    start_gpu_time.record()
    start_test = time()

    # assume the spheres are the same size, so asbs is passed just once
    kernel_holo(npoints, asbs_d, nstop, imsize2, imsize1, pxsize,
                x_d, y_d, z_d, r_d, n_d, wavevec, med_wavelen, num,
                einc_d, holo_d,
                block=blocksize, grid=gridsize)

    end_gpu_time.record()
    end_gpu_time.synchronize()
    gpu_time = start_gpu_time.time_till(end_gpu_time) * 1e-3

    holo = holo_d.get()
    stop = time()
    print "holo computation took %f sec. of which %f sec. were on the GPU" % \
        (stop - start, gpu_time)

    holo = Image(holo, optics=schema.optics, spacing=schema.spacing)
    return holo
# kernel compile and import
kernel = '''
__global__ void daxpy(int nx, double a, double *x, double *y) {
    int idx = blockIdx.x*blockDim.x + threadIdx.x;
    if (idx < nx) y[idx] = a*x[idx] + y[idx];
}
'''
from pycuda.compiler import compile

cubin = compile(kernel, cache_dir='./')
with open('daxpy.cubin', 'wb') as f:
    f.write(cubin)
mod = cuda.module_from_file('daxpy.cubin')
# alternatively, skip the file round-trip: mod = cuda.module_from_buffer(cubin)
daxpy = mod.get_function('daxpy')

dev = pycuda.autoinit.device
print(dev.compute_capability())

# setup
nx = 2**20

# allocation
a = np.random.rand()
x = np.random.rand(nx)
y = np.random.rand(nx)
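# A hedged continuation sketch: the snippet above stops at host allocation,
# so the transfer, launch, and check below are filled in under the obvious
# daxpy semantics (y = a*x + y); they are not part of the original.
x_gpu = cuda.to_device(x)
y_gpu = cuda.to_device(y)
block = (256, 1, 1)
grid = (nx // 256, 1)  # nx is a multiple of 256, so this covers every element
daxpy(np.int32(nx), np.float64(a), x_gpu, y_gpu, block=block, grid=grid)
y_ref = a * x + y
y_out = cuda.from_device(y_gpu, shape=y.shape, dtype=y.dtype)
print(np.allclose(y_ref, y_out))  # expect True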
# compiling the cubin and loading it up
if master:
    # the master could compile the cubin here, e.g.:
    # import subprocess
    # subprocess.check_call(['nvcc', '--cubin', '-arch', 'sm_20', 'kernel.cu'])
    for i in range(slaves):
        mpi.send(None, dest=i, tag=0xdeadbee)
else:
    import pycuda.driver as cuda
    cuda.init()
    device = cuda.Device(gpu_id)
    device.make_context()
    mpi.recv(source=master_rank, tag=0xdeadbee)
    mod = cuda.module_from_file('kernel.cubin')
    debug('msg received, cubin loaded')
    debug('# of devices: %d' % cuda.Device.count())
    debug('gpu_id: %d' % gpu_id)

mpi.Barrier()


def init_nucleus(out_ar):
    cos = np.cos

    def bcc(x, y, z):
        return 0.166666666666666666667 * (cos(x + y) + cos(x + z) + cos(y + z) +
                                          cos(x - y) + cos(x - z) + cos(y - z))
cu.init()
d = cu.Device(1)
ctx = d.make_context()
try:
    kernel_size = 3
    block_size = (16, 16, 1)
    #grid_size = calculate_grid_size((height, width), block_size)
    grid_size = (32, 32)

    I_gpu = cu.to_device(I.astype('float32'))
    J_gpu = cu.mem_alloc(J.nbytes)

    source = cu.module_from_file("sobel.cubin")
    kernel_naive = source.get_function("sobel_filter")
    kernel_naive.prepare(['P', 'P', 'Q', 'Q', 'Q', 'Q'])

    time = kernel_naive.prepared_timed_call(grid_size, block_size,
                                            I_gpu, J_gpu,
                                            height, width,
                                            kernel_size, 4)
    J1 = cu.from_device(J_gpu, shape=J.shape, dtype="float32")
    print("Time spent in kernel1: {}s".format(time() * 1e-3))
    print("L1 norm: {}".format(np.sum(np.sum(np.abs(J - J1)))))

    plt.imshow(J1)
    plt.show()
finally:
    ctx.pop()

print('\ndone')
import sys

from gpuDA.gpuDA import GpuDA
from set_boundary import set_boundary_values

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()

npz = 3
npy = 3
npx = 3

assert size == npx * npy * npz

mod = cuda.module_from_file('diffusion_kernel.cubin')
func = mod.get_function('temperature_update16x16')

# local sizes:
nx = 510
ny = 510
nz = 512

# global lengths:
lx = 0.3
ly = 0.3
lz = 0.3

# material properties:
alpha = 1e-5
def __init__(self, NGRID=16):
    self.mod = cuda.module_from_file("./cuda/graphene.cubin")
def gfx_init(self):
    try:
        print 'compiling'
        self.prog = sh.compile_program_vfg('shad/balls')
        print 'compiled'
        self.loc_mmv = sh.get_loc(self.prog, 'modelview')
        self.loc_mp = sh.get_loc(self.prog, 'projection')
        self.l_color = sh.get_loc(self.prog, 'color')
        self.l_size = sh.get_loc(self.prog, 'ballsize')
    except ValueError as ve:
        print "Shader compilation failed: " + str(ve)
        sys.exit(0)

    # cuda init
    self.grid = (int(self.NUM / 256) + 1, 1)
    self.block = (256, 1, 1)
    print 'CUDA: block %s , grid %s' % (str(self.block), str(self.grid))

    self.gpts = glGenBuffers(1)
    glBindBuffer(GL_ARRAY_BUFFER, self.gpts)
    glBufferData(GL_ARRAY_BUFFER, self.pts.nbytes, self.pts, GL_STREAM_DRAW)
    glBindBuffer(GL_ARRAY_BUFFER, 0)

    self.dprv = cuda_driver.mem_alloc(self.prv.nbytes)
    self.dvel = cuda_driver.mem_alloc(self.vel.nbytes)
    self.dacc = cuda_driver.mem_alloc(self.acc.nbytes)
    self.dfrs = cuda_driver.mem_alloc(self.frs.nbytes)
    self.dmas = cuda_driver.mem_alloc(self.mas.nbytes)
    self.ddns = cuda_driver.mem_alloc(self.dns.nbytes)

    cuda_driver.memcpy_htod(self.dprv, self.prv)
    cuda_driver.memcpy_htod(self.dvel, self.vel)
    cuda_driver.memcpy_htod(self.dacc, self.acc)
    cuda_driver.memcpy_htod(self.dfrs, self.frs)
    cuda_driver.memcpy_htod(self.dmas, self.mas)
    cuda_driver.memcpy_htod(self.ddns, self.dns)

    mod = cuda_driver.module_from_file('sph_kernel.cubin')
    self.update_pts = mod.get_function("update_pts")
    self.update_pts.prepare("PPPfi")
    self.update_vel = mod.get_function("update_vel")
    self.update_vel.prepare("PPPfi")
    self.update_dns = mod.get_function("update_dns")
    self.update_dns.prepare("PPPi")
    self.update_frs = mod.get_function("update_frs")
    self.update_frs.prepare("PPPPPi")
    self.collisions = mod.get_function("collisions")
    self.collisions.prepare("PPPfPfi")
def get_kernel(base_name, options=None):
    attributes = drv.Context.get_device().get_attributes()
    major = attributes[drv.device_attribute.COMPUTE_CAPABILITY_MAJOR]
    minor = attributes[drv.device_attribute.COMPUTE_CAPABILITY_MINOR]
    if major < 5:
        raise RuntimeError("sass kernels require Maxwell or greater class hardware")
    arch = "sm_%d%d" % (major, minor)

    libprefix = "PERL5LIB=%s" % maxas_dir
    maxas_i = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -i -w"]
    maxas_p = [libprefix, os.path.join(maxas_dir, "maxas.pl") + " -p"]

    kernel_spec = kernels[base_name]
    kernel_name = base_name

    if "args" in kernel_spec:
        for pair in kernel_spec["args"].items():
            maxas_i.append("-D%s %s" % pair)
            maxas_p.append("-D%s %s" % pair)

    if options is not None:
        for opt in options:
            if type(opt) is tuple:
                maxas_i.append("-D%s %s" % opt)
                maxas_p.append("-D%s %s" % opt)
                kernel_name += "_%s%s" % opt
            else:
                maxas_i.append("-D%s 1" % opt)
                maxas_p.append("-D%s 1" % opt)
                kernel_name += "_%s" % opt

    maxas_i.insert(2, "-k " + kernel_name)

    sass_name = kernel_spec["sass"] + ".sass"
    cubin_name = kernel_name + ".cubin"

    ptx_version = "4.2" if major < 6 else "5.0"
    ptx_file = get_ptx_file(kernel_spec, kernel_name, arch, ptx_version)
    sass_file = os.path.join(sass_dir, sass_name)
    cubin_file = os.path.join(cubin_dir, cubin_name)

    if not os.path.exists(sass_file):
        raise RuntimeError("Missing: %s for kernel: %s" % (sass_name, kernel_name))

    ptx_age = os.path.getmtime(ptx_file)
    cubin_age = os.path.getmtime(cubin_file) if os.path.exists(cubin_file) else 0

    if ptx_age > cubin_age:
        run_command(["ptxas -v -arch", arch, "-o", cubin_file, ptx_file])
        cubin_age = 0

    includes = extract_includes(sass_name)
    for include, include_age in includes:
        if include_age > cubin_age:
            run_command(maxas_i + [sass_file, cubin_file])
            cubin_age = include_age
            break

    if debug:
        pre_file = os.path.join(pre_dir, kernel_name + "_pre.sass")
        dump_file = os.path.join(dump_dir, kernel_name + "_dump.sass")
        pre_age = os.path.getmtime(pre_file) if os.path.exists(pre_file) else 0
        dump_age = os.path.getmtime(dump_file) if os.path.exists(dump_file) else 0

        for include, include_age in includes:
            if include_age > pre_age:
                run_command(maxas_p + [sass_file, pre_file])
                break

        if cubin_age > dump_age:
            run_command(["nvdisasm -raw", cubin_file, ">", dump_file])

    params = _params[kernel_spec["params"]]
    sig = ""
    for p in params:
        ptype, pname = _space_re.split(p)
        if ptype[-1] == '*':
            sig += "Q"
        elif ptype == 'float':
            sig += "f"
        elif ptype == 'unsigned':
            sig += "I"
        else:
            sig += "i"

    module = drv.module_from_file(os.path.join(cubin_dir, kernel_name + ".cubin"))
    func = module.get_function(kernel_name)
    func.prepare(sig)
    func.threads = kernel_spec["threads"]
    return func
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.driver import module_from_file
from pycuda.autoinit import context
import pycuda.gpuarray as gpuarray
import numpy as np
import time

### ------------------------------------------
### start timing the end-to-end processing
### ------------------------------------------

# load the precompiled cubin file
mod = module_from_file("knn_kernels.cubin")

# link to the kernel functions
lr_dist = mod.get_function('distKernel')
get_k = mod.get_function('getKLabels')
sort = mod.get_function('sort')

#------------------------------------------------------------------------------
# parameters
#------------------------------------------------------------------------------
neighbors = 3

# input data: open the file
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.driver import module_from_file
import pycuda.gpuarray as gpuarray
import numpy as np
import time

### ------------------------------------------
### start timing the end-to-end processing
### ------------------------------------------

# load the precompiled cubin file
mod = module_from_file("lr_kernels.cubin")

# link to the kernel functions
lr_MM = mod.get_function('MatrixMultiplyKernel')
lr_sig = mod.get_function('sub_sigKernel')
lr_dist = mod.get_function('distKernel')

#------------------------------------------------------------------------------
# parameters
#------------------------------------------------------------------------------
maxIter = 100
alpha = 0.001
tol = 0.0001
# number of threads that one block can handle
THREADS_PER_BLOCK = device.get_attribute(
    cuda.device_attribute.MAX_THREADS_PER_BLOCK)

# number of blocks required to handle all atoms
N_BLOCKS = int(np.ceil(float(atoms.get_number_of_atoms()) /
                       THREADS_PER_BLOCK))

# maximum number of threads per block
N_THREAD = int(np.ceil(float(atoms.get_number_of_atoms()) / N_BLOCKS))

path = os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe())))

# ------------------------
# a precompiled kernel is preferable
# ------------------------
kernel = path + '/Lennard-Jones_BpV.cubin'
if os.path.isfile(kernel):
    kernel = cuda.module_from_file(kernel)
else:
    # ------------------------
    # just-in-time compilation
    # ------------------------
    with open(path + '/Lennard-Jones_BpV.cu') as ifs:
        kernel_text = ifs.read()
    kernel = SourceModule(kernel_text)

energy = kernel.get_function('Lennard_Jones_BpV')

shared_mem = DIM * N_THREAD * np.float32().nbytes
cuda_grid = (int(ngrid), N_BLOCKS)
cuda_blocks = (N_THREAD, 1, 1)

energy(cuda.InOut(grid), ngrid,
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
import numpy as np
from pycuda.compiler import SourceModule

my_mod = drv.module_from_file("Isoparametric.cubin")

NumElements = 1000
NodesPerElement = 8

Elements = np.zeros((NodesPerElement, NumElements), dtype=np.int32)
id = 0
for i in xrange(NodesPerElement):
    for j in xrange(NumElements):
        Elements[i, j] = id
        id = id + 1
print Elements
N_BLOCKS = int(np.ceil(float(atoms.get_number_of_atoms()) / THREADS_PER_BLOCK))

# maximum number of threads per block
N_THREAD = int(np.ceil(float(atoms.get_number_of_atoms()) / N_BLOCKS))

path = os.path.dirname(os.path.realpath(inspect.getfile(inspect.currentframe())))

# ------------------------
# precompiled kernel (the just-in-time alternative would read
# distance_BpV.cu into a SourceModule, as in the Lennard-Jones variant)
# ------------------------
kernel = cuda.module_from_file(path + '/distance_BpV.cubin')
distance = kernel.get_function('distance_BpV')

shared_mem = DIM * N_THREAD * np.float32().nbytes
cuda_grid = (int(ngrid), N_BLOCKS)
cuda_blocks = (N_THREAD, 1, 1)

distance(cuda.InOut(grid), ngrid,
         cuda.In(pv), cuda.In(ndiv), cuda.In(spos), cuda.In(radii),
         natoms,
         block=cuda_blocks, grid=cuda_grid, shared=shared_mem)
def _get_module(path, clss, op, size=None):
    size = "" if size is None else "_" + size
    cubin = "{0}_{1}{2}.cubin".format(clss, op, size)
    return drv.module_from_file(os.path.join(path, cubin))
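# Hedged usage sketch for _get_module: the path and the class/op/size triple
# below are invented placeholders for whatever cubin naming scheme the real
# caller uses; they only need to match "{clss}_{op}{_size}.cubin".
module = _get_module("kernels", "sgemm", "nn", size="128x128")
# loads "kernels/sgemm_nn_128x128.cubin"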
import numpy
import matplotlib.pyplot as plt
from time import time
import pycuda.driver as cuda
import pycuda.autoinit

# Setup
nx, ny = 1200, 1000
tmax, tgap = 500, 100
c = numpy.ones((nx, ny), order='F') * 0.25
f = numpy.zeros_like(c, order='F')

c_gpu = cuda.to_device(c)
f_gpu = cuda.to_device(f)
g_gpu = cuda.to_device(f)

mod = cuda.module_from_file('core.cubin')
advance_src = mod.get_function('advance_src')
advance = mod.get_function('advance_smem')  # or the plain 'advance' variant

# Plot using matplotlib
plt.ion()
imag = plt.imshow(c.T, origin='lower', vmin=-0.2, vmax=0.2)
plt.colorbar()

# Main loop for the time evolution
inx, iny = numpy.int32(nx), numpy.int32(ny)
bs, gs = (256, 1, 1), (nx * ny // 256 + 1, 1)
t0 = time()
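# A hedged sketch of the time loop the setup above leads into; the argument
# lists for advance_src and advance are assumptions (the kernels live in
# core.cubin, whose source is not shown), so treat this as illustrative only.
for tstep in range(1, tmax + 1):
    advance_src(f_gpu, numpy.int32(tstep), block=(1, 1, 1), grid=(1, 1))
    advance(f_gpu, g_gpu, c_gpu, inx, iny, block=bs, grid=gs)
    f_gpu, g_gpu = g_gpu, f_gpu  # swap time levels

    if tstep % tgap == 0:
        print('tstep = %d (%.2f s)' % (tstep, time() - t0))
        imag.set_array(cuda.from_device(f_gpu, (nx, ny), c.dtype, order='F').T)
        plt.draw()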