def prepare_kernels( s ):
    mod = cuda.SourceModule( file('./gpu_core/dielectric.cu','r').read() )
    s.update_e = mod.get_function("update_e")
    s.update_h = mod.get_function("update_h")

    Db = ( s.tpb_main, 1, 1 )
    s.update_e.prepare( "iiPPPPPPPPP", block=Db, shared=s.ns_main )
    s.update_h.prepare( "iiPPPPPP", block=Db, shared=s.ns_main )
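# A hedged usage sketch (not part of the original file): with PyCUDA's
# prepare/prepared_call API, a prepared function is launched as
# prepared_call(grid, *args); "iiPPPPPPPPP" means two int arguments followed
# by nine device pointers. The grid size and field-pointer names below are
# hypothetical, not taken from the original source.
#
#   s.update_e.prepared_call( (s.bpg_main, 1),
#           s.nx, s.ny,
#           s.ex_gpu, s.ey_gpu, s.ez_gpu,
#           s.hx_gpu, s.hy_gpu, s.hz_gpu,
#           s.cex_gpu, s.cey_gpu, s.cez_gpu )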
if __name__ == '__main__':
    import pycuda.driver as drv
    import numpy
    import pycuda.autoinit

    mod = drv.SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)

    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(400).astype(numpy.float32)
    b = numpy.random.randn(400).astype(numpy.float32)
    dest = numpy.zeros_like(a)

    multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(400, 1, 1))

    print dest - a * b
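    # A gpuarray variant of the same multiply, sketched here because other
    # snippets in this collection use pycuda.gpuarray; gpuarray overloads *
    # element-wise, so no hand-written kernel is needed.
    from pycuda import gpuarray
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    print (a_gpu * b_gpu).get() - a * b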
psizEyf = cuda.mem_alloc( size_psiz )
psizEyb = cuda.mem_alloc( size_psiz )
psizHxf = cuda.mem_alloc( size_psiz )
psizHxb = cuda.mem_alloc( size_psiz )
psizHyf = cuda.mem_alloc( size_psiz )
psizHyb = cuda.mem_alloc( size_psiz )

'''
# Copy the arrays from host to device
cuda.memcpy_htod( devCEx, CEx )
cuda.memcpy_htod( devCEy, CEy )
cuda.memcpy_htod( devCEz, CEz )

# Get the module from the cuda files
mod_common = cuda.SourceModule( file('common.cu','r').read() )
mod_dielectric = cuda.SourceModule( file('dielectric.cu','r').read() )
mod_source = cuda.SourceModule( file('source.cu','r').read() )
#mod_cpml = cuda.SourceModule( file('cpml.cu','r').read().replace('NPMLp2',str(2*(Npml+1))).replace('NPMLp',str(Npml+1)).replace('NPML',str(Npml)) )
'''

# Get the global pointers from the module
rcmbE = mod_cpml.get_global("rcmbE")
rcmbH = mod_cpml.get_global("rcmbH")
rcmaE = mod_cpml.get_global("rcmaE")
rcmaH = mod_cpml.get_global("rcmaH")
#print rcmaE
#print bE

# Copy the arrays from host to constant memory
        for (zi=0; zi < nz; zi++) {
          const real QdotR = Qx[qxi]*x[xi] + Qy[qyi]*y[yi] + Qz[qzi]*z[zi];
          real cx,sx;
          sincos(QdotR,&sx,&cx);
          Re += density[densityidx]*cx;
          Im += density[densityidx]*sx;
          densityidx++;
        }
      }
    }
    result[idx].x = tex2D(tex, Re);
    result[idx].y = tex2D(tex, Im);
}
*/
"""

cuda_texture = cuda.SourceModule(kernel_source)

import sys
sys.exit()

cuda_texture_func = cuda_texture.get_function("cudaBorn")
texref = cuda_texture.get_texref("tex")

def cudoSMBA_form(cell, Q, lattice, beam):
    '''
    This module ties the cuda calculation of the SMBA to my class structures.
    Other tie-ins may easily be included here.
    '''
    form = (born(cell.unit, cell.value_list[0], cell.value_list[1],
                 cell.value_list[2], Q.q_list[0], Q.q_list[1], Q.q_list[2]))
    return form
import pycuda.driver as cuda
import numpy as np

def test_cuda():
    cuda.init()
    assert cuda.Device.count() >= 1
    dev = cuda.Device(0)
    ctx = dev.make_context()

    # Compiling a SourceModule needs an active context, so this must happen
    # after make_context(), not at import time.
    mod = cuda.SourceModule("""
    __global__ void doublify(float *a)
    {
      int idx = threadIdx.x + threadIdx.y*4;
      a[idx] *= 2;
    }
    """)

    a = np.random.randn(4, 4).astype(np.float32)
    a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
    cuda.memcpy_htod(a_gpu, a)

    func = mod.get_function("doublify")
    func(a_gpu, block=(4, 4, 1))

    a_doubled = np.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)
    assert np.allclose(a_doubled, 2 * a)
    ctx.pop()
mod = drv.SourceModule(""" __global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V = V_arr[i]; SCALAR ge = ge_arr[i]; SCALAR gi = gi_arr[i]; SCALAR V__tmp = (ge+gi-(V+0.049))/0.02; SCALAR ge__tmp = -ge/0.005; SCALAR gi__tmp = -gi/0.01; V_arr[i] = V+0.0001*V__tmp; ge_arr[i] = ge+0.0001*ge__tmp; gi_arr[i] = gi+0.0001*gi__tmp; } __global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; bool this_spiked = V[i]>-0.05; spiked[i] = this_spiked; if(this_spiked) // && i<N) // can leave out i<N check if N%blocksize=0 { unsigned int j = atomicInc(global_j, N); spikes[j] = i; } } __global__ void propagate(int *spikes, int numspikes, SCALAR *v, SCALAR *W, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR v_tmp = v[i]; for(int j=0; j<numspikes; j++) v_tmp += W[i+N*spikes[j]]; v[i] = v_tmp; } __global__ void reset(SCALAR *V, bool *spiked) { int i = blockIdx.x * blockDim.x + threadIdx.x; bool has_spiked = spiked[i]; V[i] = (V[i]*!has_spiked)+(-0.06)*has_spiked; // i.e. V[i]=-0.06 if spiked[i] } """.replace('SCALAR', precision))
def cuda_d_matern(matrixC_gpu, nx, ny, cmin, cmax, symm, diff_degree):
    blocksize = 16
    nbkmaxstandard = 10
    mod = cuda.SourceModule(s)
    prefac = 1.0

    if diff_degree < 0.0:
        return None
    if diff_degree >= 10.0:
        #TODO gaussian
        return None
    if diff_degree != 1.0:
        ga = scipy.special.gamma(diff_degree)
        prefac = pow(0.5, (diff_degree - 1.0)) / ga
    snu = sqrt(diff_degree) * 2.0
    fl = floor(diff_degree)  # fl = N
    rem = diff_degree - fl

    if ((matrixC_gpu != None) and (symm == True) and (nx == ny)):
        nb = nx / blocksize
        if ((cmin == 0) and (cmax == nx)):
            # Load cuda function
            cuda_fct = mod.get_function("d_fillMatrix_rkbesl_symmetric_full")
            if (fl + 1 > nbkmaxstandard):
                #TODO: Recompile function with larger NBKMAX and load module
                return None
            # Convert input parameters
            diff_degree = numpy.float64(diff_degree)
            snu = numpy.float64(snu)
            rem = numpy.float64(rem)
            fl = numpy.uint32(fl)
            prefac = numpy.float64(prefac)
            nx = numpy.uint32(nx)
            # Execute cuda function
            cuda_fct(matrixC_gpu, diff_degree, snu, rem, fl, fl, prefac, nx,
                     block=(blocksize, blocksize, 1), grid=(nb, nb))
            # Output
            #matrixC_cpu = numpy.ones(nx*nx, numpy.float64)
            #cuda.memcpy_dtoh(matrixC_cpu, matrixC_gpu)
            #print matrixC_cpu
            #return matrixC_gpu
        return matrixC_gpu
    return None
from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel
from skcuda.fft import fft, ifft, Plan
from skcuda.linalg import conj
# from skcuda.linalg import sqrt as cusqrt

cross_correlation_mod = driver.SourceModule("""
__global__ void update_scores_angles(float *scores, int *angles, float *ccc_map, int angleID)
{
    int idx = threadIdx.x + threadIdx.y*10 + threadIdx.z*100;

    if (scores[idx] < ccc_map[idx])
    {
        scores[idx] = ccc_map[idx];
        angles[idx] = angleID;
    }
}

// 'template' is a reserved word in C++, so the parameter is named templ here;
// the global index uses the standard blockDim/blockIdx built-ins
__global__ void insert_volume(float *padded_volume, float *templ, int sizePad, int sizeTem)
{
    int idx = threadIdx.x + threadIdx.y * blockDim.x + blockDim.x * blockDim.y * blockIdx.x;
    int pad_idx = (1 + 2 * idx) * sizePad;
    padded_volume[pad_idx] = templ[idx];
}
""")

update_scores_angles = cross_correlation_mod.get_function('update_scores_angles')


class TemplateMatchingPlan():
    def __init__(self, volume, template, gpu):
def get_kernel_initmem( s ):
    mod = cuda.SourceModule( file('./gpu_core/initmem.cu','r').read() )
    return mod.get_function("initmem")
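# Hedged usage sketch: the signature of "initmem" lives in
# ./gpu_core/initmem.cu and is not shown here. Assuming it fills n floats
# starting at a device pointer, a launch might look like
#
#   initmem = s.get_kernel_initmem()
#   initmem( numpy.int32(n), dev_ptr, block=(256,1,1), grid=(n/256 + 1, 1) )
#
# (argument order and block size are assumptions, not from the .cu file)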
def prepare_kernels( s ):
    mod = cuda.SourceModule( file('./gpu_core/source.cu','r').read() )
    s.update_src = mod.get_function("update_src")

    Db = ( s.tpb, 1, 1 )
    s.update_src.prepare( "iiiiP", block=Db )
def get_module( s ):
    s.mod = cuda.SourceModule(
            file('./gpu_core/cpml_non_kapa.cu','r').read()
            .replace('NPMLp2', str(2*(Npml+1)))
            .replace('NPMLp', str(Npml+1))
            .replace('NPML', str(Npml)) )
        u = math.exp(vDt)
        d = math.exp(-vDt)
        pu = (If - d) / (u - d)
        pd = 1.0 - pu
        puByDf = pu * Df
        pdByDf = pd * Df
        processed = [data.S, data.X, vDt, puByDf, pdByDf]
        #print processed
        h_OptionData.append(processed)
        comp = binomialOptionFromProcessed(processed)
        comps.append(comp)

    array = numpy.array(h_OptionData, dtype=numpy.float32)
    a_gpu = cuda.to_device(array)
    # One float32 result per option: allocate 4 bytes per entry, not 1
    b_gpu = cuda.mem_alloc(4 * len(h_OptionData))
    func = mod.get_function("binomialOptionsKernel")
    print "Loaded function..."
    func(a_gpu, b_gpu, block=(5, 1, 1), grid=(len(h_OptionData), 1))
    result = numpy.zeros((len(h_OptionData), 1), dtype=numpy.float32)
    cuda.memcpy_dtoh(result, b_gpu)
    return zip(result.tolist(), comps)

optionData = []
for i in range(100):
    optionData.append(OptionData(random.uniform(5.0, 30.0),
                                 random.uniform(1.0, 100.0),
                                 random.uniform(0.25, 10.0), 0.06, 0.10))

mod = cuda.SourceModule(binomial_kernel)
result = binomialOptionsGPU(optionData, mod)
for res, comp in result:
    print str(comp) + " : " + str(res[0])
import pycuda.driver as cuda
import pycuda.autoinit
import numpy

realrow = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0],
                      dtype=numpy.float32).reshape(1, 5)

mod_copy_texture = cuda.SourceModule("""
texture<float, 1> tex;
texture<float, 1> tex2;

__global__ void copy_texture_kernel(float * data)
{
    int ty=threadIdx.y;
    //data[ty] = tex1D(tex, (float)(ty));
    data[ty] = tex1D(tex, (float)(ty)/2.0f);
}
""")

copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
texref = mod_copy_texture.get_texref("tex")
tex2ref = mod_copy_texture.get_texref("tex2")

cuda.matrix_to_texref(realrow, texref, order="C")
texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(cuda.filter_mode.LINEAR)

gpu_output = numpy.zeros_like(realrow)
copy_texture_func(cuda.Out(gpu_output), block=(1, 1, 1), texrefs=[texref])

print "Output:"
print gpu_output
mod = drv.SourceModule(""" __global__ void stateupdate(SCALAR *V, SCALAR *ge, SCALAR *gi) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V__tmp = (ge[i]+gi[i]-(V[i]+0.049))/0.02; SCALAR ge__tmp = -ge[i]/0.005; SCALAR gi__tmp = -gi[i]/0.01; V[i] += 0.0001*V__tmp; ge[i] += 0.0001*ge__tmp; gi[i] += 0.0001*gi__tmp; } __device__ unsigned int irng(unsigned int n) { return ((n*1103515245+12345)/65536)%32768; } __device__ SCALAR srng(int i, int j) { // formula guarantees srng(i,i)=0 which is useful for weight matrices return (SCALAR)(irng(irng((unsigned int)i)^irng((unsigned int)j)))/32768.0; } __device__ bool weightfunc(int i, int j) { return srng(i,j)<SPARSENESS; } __global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; bool this_spiked = V[i]>-0.05; spiked[i] = this_spiked; if(this_spiked) // && i<N) // can leave out i<N check if N%blocksize=0 { unsigned int j = atomicInc(global_j, N); spikes[j] = i; } } __global__ void single_thread_threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N) { unsigned int j = 0; for(int i=0;i<N;i++) { if(V[i]>-0.05) { spiked[i] = true; spikes[j++] = i; } else { spiked[i] = false; } } *global_j = j; } __global__ void propagate(int *spikes, int numspikes, SCALAR *v, SCALAR W, int N, int offset) { int i = blockIdx.x * blockDim.x + threadIdx.x; for(int j=0; j<numspikes; j++) v[i] += weightfunc(i,spikes[j]+offset)*W; } __global__ void reset(SCALAR *V, bool *spiked) { int i = blockIdx.x * blockDim.x + threadIdx.x; V[i] = (V[i]*!spiked[i])+(-0.06)*spiked[i]; // i.e. V[i]=-0.06 if spiked[i] } """.replace('SCALAR', precision).replace('SPARSENESS', str(sparseness)))
mod = drv.SourceModule(""" __global__ void stateupdate(SCALAR *V, SCALAR *ge, SCALAR *gi) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V__tmp = (ge[i]+gi[i]-(V[i]+0.049))/0.02; SCALAR ge__tmp = -ge[i]/0.005; SCALAR gi__tmp = -gi[i]/0.01; V[i] += 0.0001*V__tmp; ge[i] += 0.0001*ge__tmp; gi[i] += 0.0001*gi__tmp; } __global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; bool this_spiked = V[i]>-0.05; spiked[i] = this_spiked; if(this_spiked) // && i<N) // can leave out i<N check if N%blocksize=0 { unsigned int j = atomicInc(global_j, N); spikes[j] = i; } } __global__ void single_thread_threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N) { unsigned int j = 0; for(int i=0;i<N;i++) { if(V[i]>-0.05) { spiked[i] = true; spikes[j++] = i; } else { spiked[i] = false; } } *global_j = j; } __global__ void propagate(int *spikes, int numspikes, SCALAR *v, SCALAR *W, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; for(int j=0; j<numspikes; j++) v[i] += W[i+N*spikes[j]]; } __global__ void propagate_spike(int spike, SCALAR *alldata, int *allj, int *rowind, SCALAR *V) { int i = blockIdx.x * blockDim.x + threadIdx.x; int startindex = rowind[spike]; if(i>=rowind[spike+1]-startindex) return; i += startindex; V[allj[i]] += alldata[i]; } __global__ void reset(SCALAR *V, bool *spiked) { int i = blockIdx.x * blockDim.x + threadIdx.x; V[i] = (V[i]*!spiked[i])+(-0.06)*spiked[i]; // i.e. V[i]=-0.06 if spiked[i] } """.replace('SCALAR', precision))
mod = drv.SourceModule(""" /*__global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V = V_arr[i]; SCALAR ge = ge_arr[i]; SCALAR gi = gi_arr[i]; // SCALAR V__tmp = (ge+gi-(V+0.049))/0.02; // SCALAR ge__tmp = -ge/0.005; // SCALAR gi__tmp = -gi/0.01; SCALAR V__tmp = (ge+gi-(V+0.049))*50; SCALAR ge__tmp = -ge*200; SCALAR gi__tmp = -gi*100; V_arr[i] = V+0.0001*V__tmp; ge_arr[i] = ge+0.0001*ge__tmp; gi_arr[i] = gi+0.0001*gi__tmp; }*/ __global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V = V_arr[i]; SCALAR ge = ge_arr[i]; SCALAR gi = gi_arr[i]; V_arr[i] = V+1; ge_arr[i] = ge+1; gi_arr[i] = gi+1; } __global__ void stateupdate_noglobal(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V = 0.;//V_arr[i]; SCALAR ge = 0.;//ge_arr[i]; SCALAR gi = 0.;//gi_arr[i]; SCALAR V__tmp = (ge+gi-(V+0.049))/0.02; SCALAR ge__tmp = -ge/0.005; SCALAR gi__tmp = -gi/0.01; V = V+0.0001*V__tmp; ge = ge+0.0001*ge__tmp; gi = gi+0.0001*gi__tmp; } """.replace('SCALAR', precision))
def cholesky_gpu(matrixA_gpu, matrixA_size, dtype, blocksize):
    if dtype_names[dtype] != 'float':
        raise NotImplementedError, 'Double precision not working yet.'

    # Compile a kernel for this dtype and blocksize, if it does not already exist.
    if cholesky_modules.has_key((dtype, blocksize)):
        mod = cholesky_modules[dtype, blocksize]
    else:
        s = templ_subs(cholesky_template, blocksize=blocksize, dtype=dtype_names[dtype])
        cholesky_sources[dtype, blocksize] = s
        mod = cholesky_modules[dtype, blocksize] = cuda.SourceModule(s)

    matrixA_size = numpy.uint32(matrixA_size)
    matrixBlocks = numpy.uint32(matrixA_size / blocksize)
    matrixRest = matrixA_size % blocksize

    if ((matrixA_gpu == None) or (matrixA_size == 0)):
        return None

    cuda_fct_topleft = mod.get_function("d_choldc_topleft")
    cuda_fct_strip = mod.get_function("d_choldc_strip")
    cuda_fct_diagupdate = mod.get_function("d_choldc_diagupdate")
    cuda_fct_loupdate = mod.get_function("d_choldc_loupdate")

    i = (int)(matrixBlocks)
    j = i
    while i > 2:
        logridx = 1
        logridy = i - 2
        stripgridx = i - 1
        cuda_fct_topleft(matrixA_gpu, matrixA_size, numpy.uint32(j - i),
                         block=(blocksize, blocksize, 1), grid=(1, 1))
        cuda_fct_strip(matrixA_gpu, matrixA_size, numpy.uint32(j - i),
                       block=(blocksize, blocksize, 1), grid=(stripgridx, 1))
        cuda_fct_diagupdate(matrixA_gpu, matrixA_size, numpy.uint32(j - i),
                            block=(blocksize, blocksize, 1), grid=(stripgridx, 1))
        cuda_fct_loupdate(matrixA_gpu, matrixA_size, matrixBlocks, numpy.uint32(j - i),
                          block=(blocksize, blocksize, 1), grid=(logridx, logridy))
        i = i - 1

    if (j > 1):
        cuda_fct_topleft(matrixA_gpu, matrixA_size, numpy.uint32(j - 2),
                         block=(blocksize, blocksize, 1), grid=(1, 1))
        cuda_fct_strip(matrixA_gpu, matrixA_size, numpy.uint32(j - 2),
                       block=(blocksize, blocksize, 1), grid=(1, 1))
        cuda_fct_diagupdate(matrixA_gpu, matrixA_size, numpy.uint32(j - 2),
                            block=(blocksize, blocksize, 1), grid=(1, 1))

    cuda_fct_topleft(matrixA_gpu, matrixA_size, numpy.uint32(j - 1),
                     block=(blocksize, blocksize, 1), grid=(1, 1))
    cuda.Context.synchronize()
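# Hedged call sketch (not from the original source); matrixA is a hypothetical
# n x n float32 host array with n a multiple of blocksize, and dtype_names is
# assumed to map numpy.float32 to 'float'.
#
#   matrixA_gpu = cuda.mem_alloc(matrixA.nbytes)
#   cuda.memcpy_htod(matrixA_gpu, matrixA)
#   cholesky_gpu(matrixA_gpu, n, numpy.float32, 16)
#   cuda.memcpy_dtoh(matrixA, matrixA_gpu)  # lower triangle now holds the factor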
def get_kernel_initmem( s ):
    fpath = '%s/core/initmem.cu' % base_dir
    mod = cuda.SourceModule( file( fpath,'r' ).read() )
    return mod.get_function("initmem")
if __name__ == '__main__':
    import pycuda.autoinit
    from pycuda import driver as drv
    import numpy
    from numpy import array

    def gpuarrstatus(z):
        return '(cpu_changed=' + str(z._cpu_data_changed) + \
               ', gpu_changed=' + str(z._gpu_data_changed) + ')'

    x = array([1, 2, 3, 4], dtype=numpy.float32)
    print '+ About to initialise GPUBufferedArray'
    y = GPUBufferedArray(x)
    print '- Initialised GPUBufferedArray', gpuarrstatus(y)
    print '+ About to set item on CPU array', gpuarrstatus(y)
    y[3] = 5
    print '- Set item on CPU array', gpuarrstatus(y)
    mod = drv.SourceModule('''
    __global__ void doubleit(float *y)
    {
        int i = threadIdx.x;
        y[i] *= 2.0;
    }
    ''')
    doubleit = mod.get_function("doubleit")
    print '+ About to call GPU function', gpuarrstatus(y)
    doubleit(y.gpu_array, block=(4, 1, 1))
    print '- Called GPU function', gpuarrstatus(y)
    print '+ About to print CPU array', gpuarrstatus(y)
    print y
    print '- Printed CPU array', gpuarrstatus(y)
        d_Result[gmemPos] = sum;
        //d_Result[gmemPos] = 128;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''

template = string.Template(template)
code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS,
                           KERNEL_W=KERNEL_W,
                           COLUMN_TILE_H=COLUMN_TILE_H,
                           COLUMN_TILE_W=COLUMN_TILE_W,
                           ROW_TILE_W=ROW_TILE_W,
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = cuda.SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment...
def iDivUp(a, b):
    # Round a / b to nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a / b + 1) if (a % b != 0) else (a / b)

def iDivDown(a, b):
    # Round a / b to nearest lower integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return a / b
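# Hedged launch sketch: in the NVIDIA convolutionSeparable example this code
# follows, iDivUp feeds the grid size for the row pass. DATA_W and DATA_H are
# hypothetical image dimensions, and the kernel argument list is an assumption
# rather than something shown above.
#
#   blockGridRows = (iDivUp(DATA_W, ROW_TILE_W), DATA_H)
#   threadBlockRows = (KERNEL_RADIUS_ALIGNED + ROW_TILE_W + KERNEL_RADIUS, 1, 1)
#   convolutionRowGPU(d_Result, d_Data, numpy.int32(DATA_W), numpy.int32(DATA_H),
#                     block=threadBlockRows,
#                     grid=tuple(int(g) for g in blockGridRows))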
def _cudaIsBound(data, truncate, ratio):
    bsize = __CUDA_BLOCK_SIZE
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    my_stream = cuda.Stream()
    cuda.init()
    assert cuda.Device.count() >= 1
    mass_scale_factor = 1.0 / (data['CellMass'].max())
    m = (data['CellMass'] * mass_scale_factor).astype('float32')
    assert (m.size > bsize)
    gsize = int(math.ceil(float(m.size) / bsize))
    assert (gsize > 16)

    # Now the tedious process of rescaling our values...
    length_scale_factor = data['dx'].max() / data['dx'].min()
    x = ((data['x'] - data['x'].min()) * length_scale_factor).astype('float32')
    y = ((data['y'] - data['y'].min()) * length_scale_factor).astype('float32')
    z = ((data['z'] - data['z'].min()) * length_scale_factor).astype('float32')
    p = na.zeros(z.shape, dtype='float32')

    x_gpu = cuda.mem_alloc(x.size * x.dtype.itemsize)
    y_gpu = cuda.mem_alloc(y.size * y.dtype.itemsize)
    z_gpu = cuda.mem_alloc(z.size * z.dtype.itemsize)
    m_gpu = cuda.mem_alloc(m.size * m.dtype.itemsize)
    p_gpu = cuda.mem_alloc(p.size * p.dtype.itemsize)
    for ag, a in [(x_gpu, x), (y_gpu, y), (z_gpu, z), (m_gpu, m), (p_gpu, p)]:
        cuda.memcpy_htod(ag, a)

    source = """
      extern __shared__ float array[];

      __global__ void isbound(float *x, float *y, float *z, float *m,
                              float *p, int *nelem)
      {
        /* My index in the array */
        int idx1 = blockIdx.x * blockDim.x + threadIdx.x;
        /* Note we are setting a start index */
        int idx2 = blockIdx.y * blockDim.x;
        int offset = threadIdx.x;

        /* Here we're just setting up convenience pointers to our shared array */
        float* x_data1 = (float*) array;
        float* y_data1 = (float*) &x_data1[blockDim.x];
        float* z_data1 = (float*) &y_data1[blockDim.x];
        float* m_data1 = (float*) &z_data1[blockDim.x];
        float* x_data2 = (float*) &m_data1[blockDim.x];
        float* y_data2 = (float*) &x_data2[blockDim.x];
        float* z_data2 = (float*) &y_data2[blockDim.x];
        float* m_data2 = (float*) &z_data2[blockDim.x];

        x_data1[offset] = x[idx1];
        y_data1[offset] = y[idx1];
        z_data1[offset] = z[idx1];
        m_data1[offset] = m[idx1];

        x_data2[offset] = x[idx2 + offset];
        y_data2[offset] = y[idx2 + offset];
        z_data2[offset] = z[idx2 + offset];
        m_data2[offset] = m[idx2 + offset];

        __syncthreads();

        float tx, ty, tz;
        float my_p = 0.0;
        if(idx1 < %(p)s) {
          for (int i = 0; i < blockDim.x; i++){
            if(i + idx2 < idx1 + 1) continue;
            tx = (x_data1[offset]-x_data2[i]);
            ty = (y_data1[offset]-y_data2[i]);
            tz = (z_data1[offset]-z_data2[i]);
            my_p += m_data1[offset]*m_data2[i] /
                    sqrt(tx*tx+ty*ty+tz*tz);
          }
        }
        p[idx1] += my_p;
        __syncthreads();
      }
    """
    mod = cuda.SourceModule(source % dict(p=m.size))
    func = mod.get_function('isbound')
    mylog.info("Running CUDA functions. May take a while. (%0.5e, %s)",
               x.size, gsize)
    import pycuda.tools as ct
    t1 = time.time()
    ret = func(x_gpu, y_gpu, z_gpu, m_gpu, p_gpu,
               shared=8 * bsize * m.dtype.itemsize,
               block=(bsize, 1, 1), grid=(gsize, gsize), time_kernel=True)
    cuda.memcpy_dtoh(p, p_gpu)
    p1 = p.sum()
    if na.any(na.isnan(p)):
        raise ValueError
    return p1 * (length_scale_factor / (mass_scale_factor**2.0))
hermite_eval = cuda.SourceModule(""" #define A_TERMS %(lenA)d #define TERMS %(termsAlpha)d #define CLUSTERS %(clusters)d #define POLY_TERMS %(polyTerms)d #define BLOCKSIZE %(blocksize)d #define SQRT_2 1.4142135623730951f #define SIGMA %(sigma)f #define LEN_ALPHA %(len_alpha)d #define NUM_TERMS %(num_terms)d #define OPTS3 %(opts3)d #define NUM_CLUSTERS %(num_clusters)d #define DEST_PER_THREAD 2 // slightly optimised evaluation -- do all calculations for source clusters // at once -- save on memory bandwidth __global__ void eval_hermite2(float *r, float *A, float *tx, float *ty, float *sb, float *alpha, float *H) { float result, x, y; int alpha1; float h1, h2; int i, k; float pre_mult, t_x, t_y; // shared memory __shared__ float shared_alpha[TERMS]; __shared__ float shared_A[A_TERMS]; __shared__ float shared_sb[CLUSTERS]; __shared__ float shared_H[POLY_TERMS]; //////////////////////////////// // Read vars into shared memory // WARNING: Each block needs more threads than (TERMS + A_TERMS + POLY_TERMS + CLUSTERS) // otherwise it won't work. //////////////////////////////// // select what each thread reads if (threadIdx.x < TERMS){ // shared_alpha case i = 0; k = 0; } else if (threadIdx.x < TERMS + A_TERMS) { // shared_A case i = 1; k = - TERMS; } else if (threadIdx.x < TERMS + A_TERMS + POLY_TERMS) { // shared_H case i = 2; k = - TERMS - A_TERMS; } else if (threadIdx.x < TERMS + A_TERMS + POLY_TERMS + CLUSTERS) { // shared_sb case i = 3; k = - TERMS - A_TERMS - POLY_TERMS; } else { // No read case i = 4; k = 0; } // diverge the threads to have independent reads switch (i){ case 0: shared_alpha[threadIdx.x + k] = alpha[threadIdx.x + k]; break; case 1: shared_A[threadIdx.x + k] = A[threadIdx.x + k]; break; case 2: shared_H[threadIdx.x + k] = H[threadIdx.x + k]; break; case 3: shared_sb[threadIdx.x + k] = sb[threadIdx.x + k]; break; default: break; } //__threadfence_block(); __syncthreads(); if (OPTS3 < threadIdx.x + BLOCKSIZE*blockIdx.x) { return; } t_x = tx[threadIdx.x + BLOCKSIZE*blockIdx.x]; t_y = ty[threadIdx.x + BLOCKSIZE*blockIdx.x]; result = 0.0; /////////////////////////////// // Main loop, flops: (NumClusters * (19 + LenAlpha/2 * (14 + 4 * NumTerms)) + 2) /////////////////////////////// // run through this code for each cluster center for (k=0; k < NUM_CLUSTERS; k++) { // distance operator x = (t_x - shared_sb[k*2+0]) / SQRT_2 / SIGMA; //x = (t_x - sb[k*2+0]) / SQRT_2 / SIGMA; y = (t_y - shared_sb[k*2+1]) / SQRT_2 / SIGMA; //y = (t_y - sb[k*2+1]) / SQRT_2 / SIGMA; pre_mult = exp(-(x*x))*exp(-(y*y)); // look at shared memory - all variables called in // poly_eval should be in shared memory for (i=0; i < LEN_ALPHA/2; i++) { alpha1 = shared_alpha[i*2]; // I avoid the inner loop and get a superb speedup, but it needs to be hardcoded // is it possible to do the same using MACROS? or generating this from python? 
            // ONLY USE p=5 here
            h1 = h2 = 0.0f;
            h1 = shared_H[NUM_TERMS*alpha1 + 0] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 1] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 2] + x*h1;
            /*
            h1 = shared_H[NUM_TERMS*alpha1 + 3] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 4] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 5] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 6] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 7] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 8] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 9] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 10] + x*h1;
            h1 = shared_H[NUM_TERMS*alpha1 + 11] + x*h1;
            */
            //result += alpha1;

            alpha1 = shared_alpha[i*2 + 1];
            h2 = shared_H[NUM_TERMS*alpha1 + 0] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 1] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 2] + y*h2;
            /*
            h2 = shared_H[NUM_TERMS*alpha1 + 3] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 4] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 5] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 6] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 7] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 8] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 9] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 10] + y*h2;
            h2 = shared_H[NUM_TERMS*alpha1 + 11] + y*h2;
            */

            result += shared_A[k*LEN_ALPHA/2+i]*pre_mult*h1*h2;
        }
    }
    r[threadIdx.x + BLOCKSIZE*blockIdx.x] += result;
}
""" % {'lenA': len(A_curr), 'termsAlpha': len(alpha), 'clusters': len(sb_curr),
       'polyTerms': len(H), 'blocksize': blocksize, 'sigma': delta,
       'len_alpha': len(alpha), 'num_terms': p, 'opts3': len(tx),
       'num_clusters': clusters_this_call},
    nvcc="nvcc", options=['-use_fast_math'], keep=False, no_extern_c=False)
N = 512 * 8
spikes = [i for i in range(N) if numpy.random.rand() < 0.1]
blocksize = 512
block = (blocksize, 1, 1)
grid = (N / blocksize, 1)
repeats = 1000
print 'N:', N
print 'numspikes:', len(spikes), 'proportion', float(len(spikes)) / N
print 'repeats:', repeats
print 'block:', block, 'grid:', grid

mod = drv.SourceModule("""
__global__ void propagate(int *spikes, int numspikes, float *v, float *W, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for(int j=0; j<numspikes; j++)
        v[i] += W[i+N*spikes[j]];
}
""")
propagate = mod.get_function("propagate")

W = numpy.random.randn(N, N)
W_gpu = gpuarray.to_gpu(numpy.array(W, dtype=numpy.float32))
v = gpuarray.to_gpu(numpy.zeros(N, dtype=numpy.float32))
v_pre = v.get()
gpu_spikes = drv.mem_alloc(4 * len(spikes))
# int32, not the platform int, so each spike index matches the 4 bytes allocated
spikes = numpy.array(spikes, dtype=numpy.int32)
drv.memcpy_htod(gpu_spikes, spikes)
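# Hedged timing sketch for the kernel above: launch repeatedly, synchronise
# once, then read the clock. drv.Context.synchronize() and the gpudata
# attribute are standard PyCUDA; everything else reuses names defined above.
#
#   start = time.time()
#   for _ in xrange(repeats):
#       propagate(gpu_spikes, numpy.int32(len(spikes)), v.gpudata,
#                 W_gpu.gpudata, numpy.int32(N), block=block, grid=grid)
#   drv.Context.synchronize()
#   print 'GPU time per call:', (time.time() - start) / repeats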
import pycuda.driver as drv
from pycuda.gpuarray import GPUArray
from pycuda import gpuarray
import bisect
import numpy, pylab, time, random
from scipy import weave

N = 1024

mod = drv.SourceModule('''
#include <chag/pp/compact.cuh>

__global__ void test()
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    i = i+1;
}
''',
    options=[r'-I"C:\Documents and Settings\goodman.CELERI\Bureau\source-20090929"'],
    no_extern_c=True)

#x = numpy.random.randn(N)
#xgpu = gpuarray.to_gpu(x)
#ygpu = gpuarray.to_gpu(numpy.zeros(x.shape))
#vec = gpuarray.to_gpu(numpy.zeros(1, dtype=int))
#z = x[x>=0.0]
#
#xgpu_start = int(xgpu.gpudata)
#xgpu_end = xgpu_start+N*8
mod = drv.SourceModule(""" __global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr) { int i = blockIdx.x * blockDim.x + threadIdx.x; SCALAR V = V_arr[i]; SCALAR ge = ge_arr[i]; SCALAR gi = gi_arr[i]; SCALAR V__tmp = (ge+gi-(V+0.049))/0.02; SCALAR ge__tmp = -ge/0.005; SCALAR gi__tmp = -gi/0.01; V_arr[i] = V+0.0001*V__tmp; ge_arr[i] = ge+0.0001*ge__tmp; gi_arr[i] = gi+0.0001*gi__tmp; } __global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N) { int i = blockIdx.x * blockDim.x + threadIdx.x; bool this_spiked = V[i]>-0.05; spiked[i] = this_spiked; if(this_spiked) { spikes[atomicInc(global_j, N)] = i; } } /////// TODO: next three functions are untested, but should be a non-blocking threshold function if N<=512*512 __global__ void threshold_blocksumcount(SCALAR *V, unsigned int *blocksumcount) { __shared__ unsigned short int partialsum[BLOCKSIZE]; int i = blockIdx.x * blockDim.x + threadIdx.x; int t = threadIdx.x; int b = blockIdx.x; bool this_spiked = V[i]>-0.05; partialsum[t] = (unsigned short int)this_spiked; for(unsigned int stride=blockDim.x/2; stride>=1; stride/=2) { __syncthreads(); if(t<stride) partialsum[t] += partialsum[t+stride]; } __syncthreads(); if(t==0) blocksumcount[b] = partialsum[t]; } __global__ void threshold_cumsum(unsigned int *blocksumcount, unsigned int *cumblocksumcount) { __shared__ unsigned int partialsum[BLOCKSIZE]; int t = threadIdx.x; partialsum[t] = blocksumcount[t]; for(unsigned int stride=1; stride<blockDim.x; stride*=2) { __syncthreads(); if(t>=stride) partialsum[t] += partialsum[t-stride]; } cumblocksumcount[t+1] = partialsum[t]; } __global__ void threshold_compact(SCALAR *V, int *spikes, unsigned int *blocksumcount, unsigned int *cumblocksumcount) { __shared__ unsigned short int partialsum[BLOCKSIZE]; int i = blockIdx.x * blockDim.x + threadIdx.x; int t = threadIdx.x; int b = blockIdx.x; // only compact those blocks with some spikes in (many will not in general, and this introduces no divergence) if(blocksumcount[b]>0) { bool this_spiked = V[i]>-0.05; partialsum[t] = (unsigned short int)this_spiked; for(unsigned int stride=1; stride<blockDim.x; stride*=2) { __syncthreads(); if(t>=stride) partialsum[t] += partialsum[t-stride]; } __syncthreads(); if(this_spiked) spikes[(unsigned int)partialsum[t]+cumblocksumcount[b]-1] = i; //spikes[i] = partialsum[t]+cumblocksumcount[b]; } } """.replace('SCALAR', precision).replace('BLOCKSIZE', str(blocksize)))
psizExf = cuda.mem_alloc(size_psiz)
psizExb = cuda.mem_alloc(size_psiz)
psizEyf = cuda.mem_alloc(size_psiz)
psizEyb = cuda.mem_alloc(size_psiz)
psizHxf = cuda.mem_alloc(size_psiz)
psizHxb = cuda.mem_alloc(size_psiz)
psizHyf = cuda.mem_alloc(size_psiz)
psizHyb = cuda.mem_alloc(size_psiz)

# Copy the arrays from host to device
cuda.memcpy_htod(devCEx, CEx)
cuda.memcpy_htod(devCEy, CEy)
cuda.memcpy_htod(devCEz, CEz)

# Get the module from the cuda files
mod_common = cuda.SourceModule(file('common.cu', 'r').read())
mod_dielectric = cuda.SourceModule(file('dielectric.cu', 'r').read())
mod_source = cuda.SourceModule(file('source.cu', 'r').read())
mod_cpml = cuda.SourceModule(
        file('cpml.cu', 'r').read()
        .replace('NPMLp2', str(2 * (Npml + 1)))
        .replace('NPMLp', str(Npml + 1))
        .replace('NPML', str(Npml)))

# Get the global pointers from the module
rcmbE = mod_cpml.get_global("rcmbE")
rcmbH = mod_cpml.get_global("rcmbH")
rcmaE = mod_cpml.get_global("rcmaE")
rcmaH = mod_cpml.get_global("rcmaH")
#print rcmaE
#print bE
def get_cuda_func(z_max, max_arr_size, total_image_count):
    raw = open('/home/john/python_scripts/cuda/cuda_code.c', 'rb').read()
    processed = raw % {'z_max': z_max,
                       'max': max_arr_size,
                       'total_image_count': total_image_count}
    mod = cuda.SourceModule(processed)
    return mod.get_function("modify")
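# Hedged usage sketch; judging by the inlined variant of this kernel elsewhere
# in this collection, "modify" takes three float arrays. The array names and
# the block/grid shape below are hypothetical placeholders.
#
#   modify = get_cuda_func(3, img1_arr.size, image_count)
#   modify(cuda.In(img1_arr), cuda.In(img2_arr), cuda.Out(out_arr),
#          block=(8, 8, z_max), grid=(gx, gy))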
mod = cuda.SourceModule(""" #define Z_MAX %(z_max)d __device__ float calc(float val_a, float val_b, int index) { /*if (index %% Z_MAX == 0) return 0.0;*/ return val_a * val_b / 256.0; //return min(255.0, val_a+val_b); //return max(0.0, val_a-val_b); } __global__ void modify(float *a, float *b, float *c) { int row_len = gridDim.x*blockDim.x*Z_MAX; int block_offset_x = blockDim.x*blockIdx.x*Z_MAX; int block_offset_y = blockDim.y*blockIdx.y*row_len; int i = threadIdx.x*Z_MAX + threadIdx.y*row_len + threadIdx.z + block_offset_x + block_offset_y; if (i > %(max)d) return; c[i] = calc(a[i], b[i], i); } __global__ void modify_linear(float *a, float *b, float *c) { int block_offset_x = blockDim.x*blockIdx.x; int i = threadIdx.x + block_offset_x; /*if (i > %(max)d) return;*/ c[i] = calc(a[i], b[i], i); } """ % {'z_max':3, 'max':2896782})#{'z_max':d, 'max':img1_arr.size}) if LINEAR:
import pycuda.autoinit as autoinit
import pycuda.driver as drv
from pycuda.gpuarray import GPUArray
from pycuda import gpuarray
import numpy, pylab, time

N = 1000000
x0 = 3.2
block = (512, 1, 1)
grid = (int(N / 512) + 1, 1)

mod = drv.SourceModule("""
__global__ void threshold(float *x, float x0, int *J, unsigned int *global_j, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // bounds check first, so x[i] is never read past the end of the array
    if(i<N && x[i]>x0){
        unsigned int j = atomicInc(global_j, N);
        J[j] = i;
    }
}
""")
threshold = mod.get_function("threshold")

v = gpuarray.to_gpu(numpy.array(numpy.random.randn(N), dtype=numpy.float32))
J = drv.mem_alloc(4 * N)
global_j = drv.mem_alloc(4)
# int32 to match the 4 bytes per index allocated for J
Jret = numpy.zeros(N, dtype=numpy.int32)
jret = numpy.zeros(1, dtype=numpy.uint32)
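# Hedged sketch of a launch plus readback for the kernel above: zero the
# counter, launch, then copy back the count and the compacted indices.
#
#   drv.memcpy_htod(global_j, numpy.zeros(1, dtype=numpy.uint32))
#   threshold(v.gpudata, numpy.float32(x0), J, global_j, numpy.int32(N),
#             block=block, grid=grid)
#   drv.memcpy_dtoh(jret, global_j)
#   drv.memcpy_dtoh(Jret, J)
#   found = Jret[:jret[0]]   # indices i with v[i] > x0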