def setVariables(self):
    n, m, r = self.n, self.m, self.rank

    # compile the matrix separations and G update functions for CUDA
    G_size = m * r
    FTF_size = r**2
    max_threads = tools.DeviceData().max_threads
    self.block_G = int(np.min([G_size, max_threads]))
    self.grid_G = np.int(np.ceil(G_size / np.float32(self.block_G)))
    self.block_FTF = int(np.min([FTF_size, max_threads]))
    self.grid_FTF = np.int(np.ceil(FTF_size / np.float32(self.block_FTF)))
    mod_msepXTF = compiler.SourceModule(matrix_separation_code % G_size)
    mod_msepFTF = compiler.SourceModule(matrix_separation_code % FTF_size)
    mod_Gupdate = compiler.SourceModule(G_update_code % G_size)
    self.matrix_separationXTF = mod_msepXTF.get_function("matrix_separation")
    self.matrix_separationFTF = mod_msepFTF.get_function("matrix_separation")
    self.G_ew_update = mod_Gupdate.get_function("G_ew_update")

    # allocate the matrices on the GPU
    self.G_gpu = gpuarray.to_gpu(self.G)
    self.F_gpu = gpuarray.empty((n, r), np.float32)
    self.X_gpu = gpuarray.to_gpu(self.X)
    self.GTG_gpu = gpuarray.empty((r, r), np.float32)
    self.GTGinv_gpu = gpuarray.empty((r, r), np.float32)
    self.XG_gpu = gpuarray.empty((n, r), np.float32)
    self.XTF_gpu = gpuarray.empty((m, r), np.float32)
    self.FTF_gpu = gpuarray.empty((r, r), np.float32)
    self.XTFpos_gpu = gpuarray.empty((m, r), np.float32)
    self.XTFneg_gpu = gpuarray.empty((m, r), np.float32)
    self.FTFpos_gpu = gpuarray.empty((r, r), np.float32)
    self.FTFneg_gpu = gpuarray.empty((r, r), np.float32)
    self.GFTFneg_gpu = gpuarray.empty((m, r), np.float32)
    self.GFTFpos_gpu = gpuarray.empty((m, r), np.float32)
def setVariables(self):
    # compile the update functions for H and W, as elementwise matrix
    # multiplication is not in skcuda
    H_size = self.rank * self.m
    W_size = self.n * self.rank
    max_threads = tools.DeviceData().max_threads
    self.block_H = int(np.min([H_size, max_threads]))
    self.block_W = int(np.min([W_size, max_threads]))
    self.grid_H = np.int(np.ceil(H_size / np.float32(self.block_H)))
    self.grid_W = np.int(np.ceil(W_size / np.float32(self.block_W)))
    mod_H = compiler.SourceModule(update_kernel_code % H_size)
    mod_W = compiler.SourceModule(update_kernel_code % W_size)
    self.update_H = mod_H.get_function("ew_md")
    self.update_W = mod_W.get_function("ew_md")

    # allocate the matrices on the GPU
    self.H_gpu = gpuarray.to_gpu(self.H)
    self.W_gpu = gpuarray.to_gpu(self.W)
    self.X_gpu = gpuarray.to_gpu(self.X)
    self.WTW_gpu = gpuarray.empty((self.rank, self.rank), np.float32)
    self.WTWH_gpu = gpuarray.empty(self.H.shape, np.float32)
    self.WTX_gpu = gpuarray.empty(self.H.shape, np.float32)
    self.XHT_gpu = gpuarray.empty(self.W.shape, np.float32)
    self.WH_gpu = gpuarray.empty(self.X.shape, np.float32)
    self.WHHT_gpu = gpuarray.empty(self.W.shape, np.float32)
def __init__(self, x=1, y=1, z=1, a=0, b=255, GPU=True):
    self.output = np.array(x * y * z * [0]).astype(np.float64)
    # reshape returns a new array, so the result must be assigned back
    if x == 1:
        self.output = self.output.reshape(y)
    elif z == 1:
        self.output = self.output.reshape(x, y)
    else:
        self.output = self.output.reshape(x, y, z)
    self.x = x
    self.y = y
    self.z = z
    self.a = a
    self.b = b
    self.GPU = GPU

    if GPU:
        kernel_code = normalize1DTemplate % {
            'YDIM': self.y,
            'MIN': self.a,
            'MAX': self.b
        }
        module = compiler.SourceModule(kernel_code)
        self.normalize1DKernel = module.get_function("normalize1DKernel")

        kernel_code = normalize2DTemplate % {
            'XDIM': self.x,
            'YDIM': self.y,
            'MIN': self.a,
            'MAX': self.b
        }
        module = compiler.SourceModule(kernel_code)
        self.normalize2DKernel = module.get_function("normalize2DKernel")
def histogram(light):
    grid_gpu_template = """
    __global__ void grid(int *values, int size, int *temp_grid)
    {
        unsigned int id = threadIdx.x;
        int i, bin;
        for (i = id; i < size; i += blockDim.x) {
            bin = values[i];
            if (values[i] == %(interv)s) {
                values[i] = %(interv)s - 1;
                bin = %(interv)s - 1;  // keep the write in range
            }
            temp_grid[id * %(interv)s + bin] += 1;
        }
    }
    """

    reduction_gpu_template = """
    __global__ void reduction(int *temp_grid, int *his)
    {
        unsigned int id = blockIdx.x * blockDim.x + threadIdx.x;
        if (id < %(interv)s) {
            for (int i = 0; i < %(max_number_of_threads)s; i++) {
                his[id] += temp_grid[id + %(interv)s * i];
            }
        }
    }
    """

    number_of_points = len(light)
    max_number_of_threads = 1024
    interv = 15626
    # integer division so the grid dimension stays an int
    blocks = interv // max_number_of_threads
    if interv % max_number_of_threads != 0:
        blocks += 1

    grid_gpu = grid_gpu_template % {
        'interv': interv,
    }
    mod_grid = compiler.SourceModule(grid_gpu)
    grid = mod_grid.get_function("grid")

    reduction_gpu = reduction_gpu_template % {
        'interv': interv,
        'max_number_of_threads': max_number_of_threads,
    }
    mod_redt = compiler.SourceModule(reduction_gpu)
    redt = mod_redt.get_function("reduction")

    values_gpu = gpuarray.to_gpu(light)
    temp_grid_gpu = gpuarray.zeros((max_number_of_threads, interv), dtype=np.int32)
    hist = np.zeros(interv, dtype=np.int32)
    hist_gpu = gpuarray.to_gpu(hist)

    grid(values_gpu, np.int32(number_of_points), temp_grid_gpu,
         grid=(1, 1), block=(max_number_of_threads, 1, 1))
    redt(temp_grid_gpu, hist_gpu,
         grid=(blocks, 1), block=(max_number_of_threads, 1, 1))

    hist = hist_gpu.get()
    return hist
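# A minimal usage sketch for histogram() above, on synthetic data. The helper
# name and the data are assumptions for illustration; the kernel expects 32-bit
# integer values in [0, 15626) and pycuda must already be initialised (e.g. via
# pycuda.autoinit).
def _histogram_usage_example():
    light = np.random.randint(0, 15626, size=1000000).astype(np.int32)
    hist = histogram(light)
    # every input value falls into exactly one bin
    assert hist.sum() == light.size
    return hist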
def _compile_at_runtime(self, step_code, parameters):
    # set beta to 1: repeats are pointless as simulation is deterministic
    self._beta = 1

    fc = open(
        os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'cuLsoda_all.cu'), 'r')
    _sourceFromFile_ = fc.read()

    _isize_ = "#define ISIZE " + repr(20 + self._speciesNumber) + "\n"
    _rsize_ = "#define RSIZE " + repr(
        22 + self._speciesNumber * max(16, self._speciesNumber + 9)) + "\n"
    _textures_ = "texture<float, 2, cudaReadModeElementType> param_tex;\n"
    _common_block_ = "__device__ struct cuLsodaCommonBlock common[" + repr(
        1 * 1) + "];\n"
    _code_ = (_isize_ + _rsize_ + _textures_ + step_code + _sourceFromFile_ +
              _common_block_ + self._lsoda_source_)

    if self._dump:
        of = open("full_ode_code.cu", "w")
        print >> of, _code_

    # dummy compile to determine optimal blockSize and gridSize
    compiled = compiler.SourceModule(_code_, nvcc="nvcc", options=[],
                                     no_extern_c=True, keep=False)
    blocks, threads = self._get_optimal_gpu_param(
        parameters, compiled.get_function("cuLsoda"))
    blocks = self._MAXBLOCKSPERDEVICE

    # real compile
    _common_block_ = "__device__ struct cuLsodaCommonBlock common[" + repr(
        blocks * threads) + "];\n"
    _code_ = (_isize_ + _rsize_ + _textures_ + step_code + _sourceFromFile_ +
              _common_block_ + self._lsoda_source_)

    if self._dump:
        of = open("full_ode_code.cu", "w")
        print >> of, _code_

    compiled = compiler.SourceModule(_code_, nvcc="nvcc", options=[],
                                     no_extern_c=True, keep=False)

    self._param_tex = compiled.get_texref("param_tex")
    lsoda_kernel = compiled.get_function("cuLsoda")
    return compiled, lsoda_kernel
def genCks(allValidSpikVec, MATRIX_SIZE, TILE_WIDTH, configVec_str, spikTransMatFile):
    # using all generated valid spiking vector files, 'feed' the files to the
    # CUDA C kernels to evaluate (1)
    for spikVec in allValidSpikVec[0]:
        # string concatenation of the configVec, Ck-1, from
        # configVec = ['2', '2', '1', '0', '0', ...] to configVec = 211 <string>
        Ck_1_str = configVec_str
        # write into total list of Cks
        #allGenCk = addTotalCk( allGenCk, Ck_1_str )

        # form the filenames of the Cks and the Sks
        Ck = 'c_' + Ck_1_str + '_' + spikVec
        Ck_1 = 'c_' + Ck_1_str
        Sk = 's_' + spikVec

        # import the vectors/matrix as numpy ND arrays
        Ck_1 = toNumpyArr(Ck_1, MATRIX_SIZE)
        Sk = toNumpyArr(Sk, MATRIX_SIZE)
        M = toNumpyArr(spikTransMatFile, MATRIX_SIZE)

        # allocate memory in the GPU
        Ck_1gpu = gpuarray.to_gpu(Ck_1)
        Skgpu = gpuarray.to_gpu(Sk)
        Mgpu = gpuarray.to_gpu(M)
        SkMgpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.int32)
        Ckgpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.int32)

        # get kernel code from the templates by specifying the constants
        # MATRIX_SIZE and TILE_WIDTH
        matmul_kernel = matmul_kernel_temp % {'MATRIX_SIZE': MATRIX_SIZE,
                                              'TILE_WIDTH': TILE_WIDTH}
        matadd_kernel = matadd_kernel_temp % {'MATRIX_SIZE': MATRIX_SIZE,
                                              'TILE_WIDTH': TILE_WIDTH}

        # compile the kernel code
        mulmod = compiler.SourceModule(matmul_kernel)
        addmod = compiler.SourceModule(matadd_kernel)
        matrixmul = mulmod.get_function("MatrixMulKernel")
        matrixadd = addmod.get_function("MatrixAddKernel")

        # create PyCUDA events to record the kernel execution time
        startTime = driver.Event()
        endTime = driver.Event()

        startTime.record()  # start the timer
        matrixmul(Skgpu, Mgpu, SkMgpu,
                  block=(TILE_WIDTH, TILE_WIDTH, 1),
                  grid=(MATRIX_SIZE / TILE_WIDTH, MATRIX_SIZE / TILE_WIDTH))
        matrixadd(Ck_1gpu, SkMgpu, Ckgpu,
                  block=(TILE_WIDTH, TILE_WIDTH, 1),
                  grid=(MATRIX_SIZE / TILE_WIDTH, MATRIX_SIZE / TILE_WIDTH))
        endTime.record()  # record the end time
        endTime.synchronize()  # synchronize end of threads

        simTime = startTime.time_till(endTime) * 1e-3
        print " Kernel call exec time is ", simTime

        # write the resulting ND array into a file
        NDarrToFile(Ck, Ckgpu)
def genCks(allValidSpikVec, MATRIX_SIZE, configVec_str, spikTransMatFile):
    # using all generated valid spiking vector files, 'feed' the files to the
    # CUDA C program to evaluate (1), e.g.
    # os.popen('./snp-v12.26.10.1 c_211 s0 M 5 c_211_s0') given the generated spik vecs
    for spikVec in allValidSpikVec[0]:
        # string concatenation of the configVec, Ck-1, from
        # configVec = ['2', '2', '1', '0', '0', ...] to configVec = 211 <string>
        Ck_1_str = configVec_str
        # write into total list of Cks
        #allGenCk = addTotalCk( allGenCk, Ck_1_str )

        # form the filenames of the Cks and the Sks
        Ck = 'c_' + Ck_1_str + '_' + spikVec
        Ck_1 = 'c_' + Ck_1_str
        Sk = 's_' + spikVec

        # import the vectors/matrix as numpy ND arrays
        Ck_1 = toNumpyArr(Ck_1, MATRIX_SIZE)
        Sk = toNumpyArr(Sk, MATRIX_SIZE)
        M = toNumpyArr(spikTransMatFile, MATRIX_SIZE)

        # allocate memory in the GPU
        Ck_1gpu = gpuarray.to_gpu(Ck_1)
        Skgpu = gpuarray.to_gpu(Sk)
        Mgpu = gpuarray.to_gpu(M)
        SkMgpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.int32)
        Ckgpu = gpuarray.empty((MATRIX_SIZE, MATRIX_SIZE), np.int32)

        # get kernel code from the templates by specifying the constant MATRIX_SIZE
        matmul_kernel = matmul_kernel_temp % {'MATRIX_SIZE': MATRIX_SIZE}
        matadd_kernel = matadd_kernel_temp % {'MATRIX_SIZE': MATRIX_SIZE}

        # compile the kernel code
        mulmod = compiler.SourceModule(matmul_kernel)
        addmod = compiler.SourceModule(matadd_kernel)
        matrixmul = mulmod.get_function("MatrixMulKernel")
        matrixadd = addmod.get_function("MatrixAddKernel")

        # call the kernel functions: one block of MATRIX_SIZE x MATRIX_SIZE threads
        matrixmul(Skgpu, Mgpu, SkMgpu, block=(MATRIX_SIZE, MATRIX_SIZE, 1))
        matrixadd(Ck_1gpu, SkMgpu, Ckgpu, block=(MATRIX_SIZE, MATRIX_SIZE, 1))

        # write the resulting ND array into a file
        NDarrToFile(Ck, Ckgpu)
def prepare_kernels(files, templates, constvars, blockvars={}):
    """
    Compile and prepare CUDA kernel functions

    Args:
        files : list of cuda source file handles
        templates : list of tuples describing the kernels
        constvars : dict of readonly variables
        blockvars : dict of blockvars whose description will be included
                    in the preamble as preprocessor macros

    Returns:
        kernels : dict of executable CUDA kernels
    """
    preamble, constvars = prepare_vars(constvars, blockvars)
    kernels_code = preamble
    for f in files:
        kernels_code += f.read().decode("utf-8")

    mod = compiler.SourceModule(kernels_code)
    kernels = {}
    for d in templates:
        kernels[d[0]] = prepare_kernelfun(mod, *d)

    for name, val in constvars.items():
        const_ptr, size_in_bytes = mod.get_global(name)
        pycuda.driver.memcpy_htod(const_ptr, val)
        # WARNING: The gpudata argument in gpuarray.GPUArray usually requires a
        # pycuda.driver.DeviceAllocation and const_ptr is an int generated from
        # casting a CUdeviceptr to an int.
        # However, since DeviceAllocation is a simple wrapper around CUdeviceptr
        # (that gives a CUdeviceptr when cast to an int), it works like this.
        constvars[name] = gpuarray.GPUArray(val.shape, val.dtype,
                                            gpudata=const_ptr)

    return kernels
def blur(source_array, standard_deviation, filter_width):
    result_array = np.empty_like(source_array)
    red_channel = source_array[:, :, 0].copy()
    green_channel = source_array[:, :, 1].copy()
    blue_channel = source_array[:, :, 2].copy()

    height, width = source_array.shape[:2]
    dim_grid_x = math.ceil(width / BLOCK_SIZE)
    dim_grid_y = math.ceil(height / BLOCK_SIZE)

    gaussian_kernel = create_gaussian_kernel(filter_width, standard_deviation)

    mod = compiler.SourceModule(open('./blur.cu').read())
    gaussian_blur = mod.get_function('gaussian_blur')

    for channel in (red_channel, green_channel, blue_channel):
        gaussian_blur(driver.In(channel),
                      driver.Out(channel),
                      np.uint32(width),
                      np.uint32(height),
                      driver.In(gaussian_kernel),
                      np.uint32(filter_width),
                      block=(BLOCK_SIZE, BLOCK_SIZE, 1),
                      grid=(dim_grid_x, dim_grid_y))

    result_array[:, :, 0] = red_channel
    result_array[:, :, 1] = green_channel
    result_array[:, :, 2] = blue_channel
    return result_array
def kde_gauss_cuda1d(x, nbins1D):
    nsmpl = len(x)
    nbins = nbins1D
    sigmax = np.std(x)
    bandwidth = 1.06 * sigmax * nsmpl**(-1. / 5.)
    xi = linspace(x.min(), x.max(), nbins1D)

    x_gpu = gpuarray.to_gpu(x.astype(np.float32))
    xi_gpu = gpuarray.to_gpu(xi.astype(np.float32))
    pdf_gpu = gpuarray.zeros(nbins, np.float32)
    b_s = 16

    # get the kernel code from the template
    kernel_code = kernel_code_template_cov_1d % {
        'DATA_SIZE': nsmpl,
        'BAND_W': bandwidth,
    }

    # compile the kernel code
    mod = compiler.SourceModule(kernel_code)

    # get the kernel function from the compiled module
    cuda_gauss_kde = mod.get_function("gauss_kde1d")

    # call the kernel on the card
    cuda_gauss_kde(
        # inputs
        xi_gpu,
        x_gpu,
        # output
        pdf_gpu,
        grid=(nbins // b_s, 1),
        # blocks of b_s threads each
        block=(b_s, 1, 1),
    )

    return xi, pdf_gpu.get()  # *(xi[1]-xi[0])
def componentes_principales_panchromartic(r_s, g_s, b_s, q, size, block_size):
    nb1_temp = []
    nb2_temp = []
    nb3_temp = []

    kernel_code = kernel_componentes_principales_pancromatica % {
        'BLOCK_SIZE': BLOCK_SIZE,
    }
    mod = compiler.SourceModule(kernel_code)
    kernel = mod.get_function("componentesPrincipalesPancromatica")

    s1_gpu = gpuarray.zeros((block_size, block_size), np.float32)
    s2_gpu = gpuarray.zeros((block_size, block_size), np.float32)
    s3_gpu = gpuarray.zeros((block_size, block_size), np.float32)
    Rs_gpu_t = gpuarray.to_gpu(r_s)
    Gs_gpu_t = gpuarray.to_gpu(g_s)
    Bs_gpu_t = gpuarray.to_gpu(b_s)
    q_gpu = gpuarray.to_gpu(q)

    for i in range(len(r_s)):
        kernel(
            # inputs
            Rs_gpu_t[i],
            Gs_gpu_t[i],
            Bs_gpu_t[i],
            q_gpu,
            # outputs
            s1_gpu,
            s2_gpu,
            s3_gpu,
            # block of multiple threads
            block=(block_size, block_size, 1),
        )
        nb1_temp.append(s1_gpu.get())
        nb2_temp.append(s2_gpu.get())
        nb3_temp.append(s3_gpu.get())

    nb1 = stack_values(nb1_temp, g_s, size, block_size)
    nb2 = stack_values(nb2_temp, g_s, size, block_size)
    nb3 = stack_values(nb3_temp, g_s, size, block_size)

    return nb1, nb2, nb3
def col_update(self, itr, A, X_device, P, sin, cos, iterBlock):
    self.A_device = gpuarray.to_gpu(A)
    self.X_device = gpuarray.to_gpu(X_device)
    self.dev_sin = gpuarray.to_gpu(sin)
    self.dev_cos = gpuarray.to_gpu(cos)
    self.iterBlock_device = gpuarray.to_gpu(iterBlock)

    if (P % 2 == 0):
        grid_size = P / 2
    else:
        grid_size = P / 2 + 1

    mod2 = compiler.SourceModule(self.col_update_kernel_code)
    col_update_code = mod2.get_function("kernel_col_update")
    col_update_code(itr, self.A_device, self.X_device, P,
                    self.device_eigenvectors, self.dev_sin, self.dev_cos,
                    self.iterBlock_device,
                    block=(np.int(P), np.int(P), 1),
                    grid=(np.int(grid_size), np.int(grid_size), 1))

    return self.device_eigenvectors.get()
def dotp(a_cpu, b_cpu):
    print(a_cpu.shape, b_cpu.shape)

    # transfer host (CPU) memory to device (GPU) memory
    a_gpu = gpuarray.to_gpu(a_cpu)
    b_gpu = gpuarray.to_gpu(b_cpu)

    # create empty gpu array for the result (C = A * B)
    c_gpu = gpuarray.empty((1, 1), np.float32)

    MATRIX_SIZE = len(a_cpu)

    # get the kernel code from the template
    # by specifying the constant MATRIX_SIZE
    kernel_code = kernel_code_template % {'MATRIX_SIZE': MATRIX_SIZE}

    # compile the kernel code
    mod = compiler.SourceModule(kernel_code)

    # get the kernel function from the compiled module
    matrixmul = mod.get_function("MatrixMulKernel")

    # call the kernel on the card
    matrixmul(
        # inputs
        a_gpu,
        b_gpu,
        # output
        c_gpu,
        # (only one) block of MATRIX_SIZE x MATRIX_SIZE threads
        block=(MATRIX_SIZE, MATRIX_SIZE, 1),
    )

    return c_gpu.get()
def MatMul(self, A, rA, cA, B, rB, cB):
    self.C_gpu = gpuarray.empty((A.shape[0], B.shape[1]), dtype=np.float32)
    self.A_gpu = gpuarray.to_gpu(A)
    self.B_gpu = gpuarray.to_gpu(B)

    mod = compiler.SourceModule(self.mul_kernel_code)
    dev_mul = mod.get_function("kernel_MatMul")

    grid_x = np.int(np.ceil(cB * 1.0 / 16))
    grid_y = np.int(np.ceil(rA * 1.0 / 16))

    # pass B's column count (cB), not cA, as the last dimension argument
    dev_mul(self.A_gpu, rA, cA,
            self.B_gpu, rB, cB,
            self.C_gpu,
            block=(16, 16, 1),
            grid=(grid_x, grid_y, 1))

    return self.C_gpu.get()
def initKernels(self, *args, **kwargs):
    super(tanhLayer, self).initKernels(*args, **kwargs)

    m = self.A.shape[0]
    n = self.A.shape[1]

    kernel_code = cudaModules.forwardTemplate % {'NROWS': m, 'NCOLS': n}
    module = compiler.SourceModule(kernel_code)
    self.kernels.forwardKernel = module.get_function("forwardKernel")
def compile_kernels(srcFile, kernelNames, srcParams=None):
    """ Load the GPU kernels from the specified CUDA C file """
    import pycuda.compiler as nvcc

    # Read the src file into a string
    custr = ""
    with io.open(srcFile, 'r') as file:
        for l in file:
            custr += l

    # Replace consts in cu file
    if srcParams is not None:
        custr = custr % srcParams

    # Compile the CUDA kernel
    cu_kernel_source_module = nvcc.SourceModule(custr)

    # Load the kernels into a dictionary
    kernels = {}
    for name in kernelNames:
        try:
            kernels[name] = cu_kernel_source_module.get_function(name)
        except:
            log.error("Failed to find kernel function: %s", name)
            exit()

    return kernels
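# A minimal usage sketch for compile_kernels() above, under the assumption of a
# working CUDA setup. The kernel name "scale", the file name and the %(N)s
# placeholder are hypothetical; srcParams fills the placeholder before
# compilation, mirroring the `custr % srcParams` step in the function.
def _compile_kernels_usage_example():
    src = """
    __global__ void scale(float *x, float factor)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < %(N)s)
            x[i] *= factor;
    }
    """
    # write a tiny .cu file just for illustration
    with open("scale_example.cu", "w") as f:
        f.write(src)
    kernels = compile_kernels("scale_example.cu", ["scale"], srcParams={"N": 256})
    x_gpu = gpuarray.to_gpu(np.arange(256, dtype=np.float32))
    kernels["scale"](x_gpu, np.float32(2.0), block=(256, 1, 1), grid=(1, 1))
    return x_gpu.get()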
def parallel_transpose(self):
    # return: the transpose of input matrix
    # TODO:
    # Memory copy to device
    # Function call and measuring time here
    mod = compiler.SourceModule(self.kernel_code)  # compile the kernel
    transpose = mod.get_function("transpose")  # get the function

    start = time.time()  # start timer
    transpose(
        # input
        self.a_gpu,
        # output
        self.b_gpu,
        np.float32(self.a_cpu.shape[1]),
        np.float32(self.a_cpu.shape[0]),
        # block size
        block=(16, 16, 1))  # kernel call
    self.times_gpu = time.time() - start  # get time

    # Memory copy to host
    self.b_cpu = self.b_gpu.get()  # copy result to host variable

    # Return output and measured time
    return self.b_cpu, self.times_gpu
def matrix_mul_naive(self):
    TILE_WIDTH = self.TILE_WIDTH
    self.kernel_code = self.kernel_code_template % {
        'TILE_WIDTH': TILE_WIDTH
    }

    n = int(np.ceil(self.b_cpu.shape[1] / TILE_WIDTH))
    m = int(np.ceil(self.a_cpu.shape[0] / TILE_WIDTH))

    mod = compiler.SourceModule(self.kernel_code)
    matmul = mod.get_function("MatrixMulKernel_naive")

    start = time.time()
    matmul(self.a_gpu, self.b_gpu, self.c_gpu,
           np.int32(self.a_cpu.shape[0]),
           np.int32(self.a_cpu.shape[1]),
           np.int32(self.b_cpu.shape[1]),
           block=(TILE_WIDTH, TILE_WIDTH, 1),
           grid=(n, m, 1))
    times_gpu_ = time.time() - start

    self.c_cpu = self.c_gpu.get()
    return self.c_cpu, times_gpu_
def _get_func(self):
    """ """
    kernel_template = """
    #define Ws_W $Ws_W
    #define Ws_H $Ws_H

    __global__ void grad_Ws(double *d, double *x, double *grad){
        const size_t tx = threadIdx.x;
        const size_t ty = threadIdx.y;

        __shared__ double sd[Ws_H];
        __shared__ double sx[Ws_W];

        if (tx < Ws_W && ty < Ws_H){
            sx[threadIdx.x] = x[threadIdx.x];
            sd[threadIdx.y] = d[threadIdx.y];
            __syncthreads();

            grad[ty * Ws_W + tx] += sd[ty] * sx[tx];
        }
    }
    """
    kernel_template = string.Template(kernel_template)
    kernel_code = kernel_template.substitute(Ws_W=self.params['Ws_w'],
                                             Ws_H=self.params['Ws_h'])
    module = compiler.SourceModule(kernel_code)
    return module.get_function('grad_Ws')
def rgb2gray(image, height, width, channels=3):
    """ Convert an RGB image to grayscale """
    # Allocate the host arrays of the required sizes
    a_cpu = np.array(image).astype(np.float32)
    b_cpu = np.zeros((height, width)).astype(np.float32)

    # Allocate the device memory needed for processing
    a_gpu = cuda.mem_alloc(a_cpu.nbytes)
    b_gpu = cuda.mem_alloc(b_cpu.nbytes)

    # Copy the input data from the CPU to the GPU
    cuda.memcpy_htod(a_gpu, a_cpu)

    # Fill the kernel template with the required values
    kernel_code = kernel_code_template % {
        'width': str(width),
        'height': str(height),
        'channels': str(channels)
    }

    # Compile the kernel
    mod = compiler.SourceModule(kernel_code)
    matrixmul = mod.get_function('rgb2gray')

    # Launch the kernel
    matrixmul(b_gpu, a_gpu, block=(6, 36, 1), grid=(100, 8, 1))

    # Copy the results processed by the kernel back to the CPU
    cuda.memcpy_dtoh(b_cpu, b_gpu)
    return b_cpu
def col_update(self, iter, A, X, P, sin, cos, iterBlock):
    self.A_device = gpuarray.to_gpu(A)
    self.dev_sin = gpuarray.to_gpu(sin)
    self.dev_cos = gpuarray.to_gpu(cos)
    self.iterBlock_device = gpuarray.to_gpu(iterBlock)
    self.X_device = gpuarray.to_gpu(X)
    # gpuarray.empty needs an explicit dtype
    self.device_eigenvectors = gpuarray.empty((P, P), np.float32)

    if (P % 2 == 0):
        grid_size = P / 2
    else:
        grid_size = P / 2 + 1

    mod2 = compiler.SourceModule(col_update_kernel_code)
    # the kernel name must be passed as a string
    col_update_code = mod2.get_function("kernel_col_update")
    col_update_code(iter, self.A_device, self.X_device, P,
                    self.device_eigenvectors, self.dev_sin, self.dev_cos,
                    self.iterBlock_device,
                    block=(int(P), int(P), 1),
                    grid=(int(grid_size), int(grid_size)))

    return self.device_eigenvectors.get()
def __init__(self, idata):
    # idata: an array of lower characters.
    # TODO:
    # Declare host variables
    # Device memory allocation
    # Kernel code
    # -- initialize the device
    self.a_cpu = idata
    self.L = len(idata)

    import pycuda.autoinit

    self.kernel_code = """
    __global__ void CapCharKernel(char *a, char *b)
    {
        // 1D Thread ID (assuming that only *one* block will be executed)
        int tx = threadIdx.x;
        b[tx] = a[tx] - 32;
    }
    """

    # compile the kernel code
    mod = compiler.SourceModule(self.kernel_code)

    # get the kernel function from the compiled module
    self.capchar = mod.get_function("CapCharKernel")

    self.b_gpu = gpuarray.empty(self.L, 'S1')
def compute_params(self, A, P, itr, iterblock):
    self.A_gpu = gpuarray.to_gpu(A)
    self.iterBlock_device = gpuarray.to_gpu(iterblock)
    self.dev_sin = gpuarray.empty((P, P), np.float32)
    self.dev_cos = gpuarray.empty((P, P), np.float32)

    if (P % 2 == 0):
        grid_size = np.int(P / 2)
    else:
        grid_size = np.int(P / 2 + 1)

    mod = compiler.SourceModule(self.compute_params_kernel_code)
    compute_params_code = mod.get_function("kernel_compute_params")
    compute_params_code(self.A_gpu, P, itr, self.dev_sin, self.dev_cos,
                        self.iterBlock_device,
                        block=(grid_size, grid_size, 1))  # block size?

    dc = self.dev_cos.get()
    ds = self.dev_sin.get()
    self.A_gpu.get()
    self.iterBlock_device.get()
    return ds, dc
def runAdd_parallel(self):
    # return: an array containing capitalized characters from idata and running time.
    # TODO:
    # Memory copy to device
    input_gpu = gpuarray.to_gpu(self.a)  # allocate for input

    # Function call and measuring time here
    kernel_code = self.capitalize_kernel  # get kernel code from template
    mod = compiler.SourceModule(kernel_code)  # compile the kernel code
    capitalize = mod.get_function("Capitalize")  # get the kernel function

    start = datetime.datetime.now()
    capitalize(input_gpu, self.output_gpu,
               block=(1024, 1, 1),
               grid=(self.num_blocks + 1, 1, 1))
    total_time = datetime.datetime.now() - start

    # Memory copy to host
    result = self.output_gpu.get()

    # Return output and measured time
    return result, total_time
def __init__(self, shape, code, *params):
    """Return a cuda function that will execute on a grid.

    Input arguments:
    shape -- the size of the grid.
    code -- the CUDA source code to be executed at every cell.
    *params -- (type, name) tuples of the input parameters.

    Output arguments:
    wrapped_fun -- a function that accepts a list of pycuda.gpuarray.GPUArray
        objects as well as pycuda.driver.Function.__call__ keyword arguments.
    """
    # Initialize parameters.
    self.shape = shape  # Size of the simulation.
    self.block_shapes, self.grid_shapes = _get_shapes(shape)

    # Get the template and render it using jinja2.
    template = jinja_env.get_template('traverse.cu')
    cuda_source = template.render(params=params,
                                  dims=self.shape,
                                  loop_code=code,
                                  flat_tag='_f')

    # Dump the rendered source for debugging.
    f = open('/tmp/code', 'w')
    f.write(cuda_source)

    # Compile the code into a callable cuda function.
    mod = compiler.SourceModule(cuda_source)
    self.fun = mod.get_function('traverse')
def componentes_principales_original(r_s, g_s, b_s, q, size, block_size):
    cp1_temp = []
    cp2_temp = []
    cp3_temp = []

    kernel_code = kernel_componentes_principales_original % {
        'BLOCK_SIZE': BLOCK_SIZE,
    }
    mod = compiler.SourceModule(kernel_code)
    kernel = mod.get_function("componentesPrincipalesOriginal")

    s1_gpu = gpuarray.zeros((block_size, block_size), np.float32)
    s2_gpu = gpuarray.zeros((block_size, block_size), np.float32)
    s3_gpu = gpuarray.zeros((block_size, block_size), np.float32)
    q_gpu = gpuarray.to_gpu(q)
    Rs_gpu_t = gpuarray.to_gpu(r_s)
    Gs_gpu_t = gpuarray.to_gpu(g_s)
    Bs_gpu_t = gpuarray.to_gpu(b_s)

    for i in range(len(r_s)):
        kernel(
            # inputs
            Rs_gpu_t[i],
            Gs_gpu_t[i],
            Bs_gpu_t[i],
            q_gpu,
            # outputs
            s1_gpu,
            s2_gpu,
            s3_gpu,
            # block of multiple threads
            block=(block_size, block_size, 1),
        )
        cp1_temp.append(s1_gpu.get())
        cp2_temp.append(s2_gpu.get())
        cp3_temp.append(s3_gpu.get())

    cp1 = stack_values(cp1_temp, r_s, size, block_size)
    cp2 = stack_values(cp2_temp, r_s, size, block_size)
    cp3 = stack_values(cp3_temp, r_s, size, block_size)

    return cp1, cp2, cp3
def generate_forward_euler_code(self):
    eqs = self.eqs
    M = len(eqs._diffeq_names)
    all_variables = eqs._eq_names + eqs._diffeq_names + eqs._alias.keys() + ['t']

    clines = '__global__ void stateupdate(int N, SCALAR t, SCALAR *S)\n'
    clines += '{\n'
    clines += '    int i = blockIdx.x * blockDim.x + threadIdx.x;\n'
    clines += '    if(i>=N) return;\n'
    for j, name in enumerate(eqs._diffeq_names):
        clines += '    int _index_' + name + ' = i+' + str(j) + '*N;\n'
    for j, name in enumerate(eqs._diffeq_names):
        # clines += '    SCALAR &' + name + ' = S[i+' + str(j) + '*N];\n'
        clines += '    SCALAR ' + name + ' = S[_index_' + name + '];\n'
    for j, name in enumerate(eqs._diffeq_names):
        namespace = eqs._namespace[name]
        expr = optimiser.freeze(eqs._string[name], all_variables, namespace)
        expr = rewrite_to_c_expression(expr)
        print expr
        if name in eqs._diffeq_names_nonzero:
            clines += '    SCALAR ' + name + '__tmp = ' + expr + ';\n'
    for name in eqs._diffeq_names_nonzero:
        # clines += '    ' + name + ' += ' + str(self.clock_dt) + '*' + name + '__tmp;\n'
        clines += '    S[_index_' + name + '] = ' + name + '+' + str(self.clock_dt) + '*' + name + '__tmp;\n'
    clines += '}\n'

    clines = clines.replace('SCALAR', self.precision)

    self.gpu_mod = compiler.SourceModule(clines)
    self.gpu_func = self.gpu_mod.get_function("stateupdate")
    return clines
def genModulo():
    """
    Calculates corners of objects across all frames

    Returns:
        GPU kernel
    """
    # Setup Kernel
    kernels = compiler.SourceModule("""
    #include <stdio.h>

    __global__ void FindObj(int* a, int* frameOrigin, int* c, int img_height, int img_width)
    {
        // Setup Indexing
        int tx = threadIdx.x;
        int ty = threadIdx.y;
        int row_o = blockIdx.y*blockDim.y + ty;
        int col_o = blockIdx.x*blockDim.x + tx;

        // printf("%i %i \\n", tx, ty);
        // printf("row_o:%i col_o:%i \\n", row_o, col_o);
        // printf("blockIdx.y:%i blockDim.y:%i \\n", blockIdx.y, blockDim.y);
        // printf("blockIdx.x:%i blockDim.x:%i \\n", blockIdx.x, blockDim.x);

        int pixelColor = a[row_o*img_width + col_o];

        // Gather all criteria for corner
        bool row_plus1 = false;
        bool row_plus2 = false;
        bool col_plus1 = false;
        bool col_plus2 = false;
        bool row_minus1 = true;
        bool col_minus1 = true;

        // Next rows in same column have same color and are in bounds
        if(((row_o + 1) % img_height) < img_height && a[(row_o + 1)*img_width + col_o] == pixelColor){row_plus1 = true;}
        if(((row_o + 2) % img_height) < img_height && a[(row_o + 2)*img_width + col_o] == pixelColor){row_plus2 = true;}

        // Next cols in same row have same color and are in bounds
        if((col_o + 1) < img_width && a[row_o*img_width + col_o + 1] == pixelColor){col_plus1 = true;}
        if((col_o + 2) < img_width && a[row_o*img_width + col_o + 2] == pixelColor){col_plus2 = true;}

        // Confirm not in the middle of an edge and on a vertex
        if((row_o - 1) >= 0 && a[(row_o - 1)*img_width + col_o] == pixelColor){row_minus1 = false;}
        if((col_o - 1) >= 0 && a[row_o*img_width + col_o - 1] == pixelColor){col_minus1 = false;}

        //printf("row: %i col: %i pixelColor: %i, row_plus1: %i, row_plus2: %i, col_plus1: %i, col_plus2: %i, row_minus1: %i, col_minus1: %i \\n", row_o, col_o, pixelColor, row_plus1, row_plus2, col_plus1, col_plus2, row_minus1, col_minus1);

        // If vertex, do all calculations for global array
        if(row_plus1 && row_plus2 && col_plus1 && col_plus2 && row_minus1 && col_minus1){
            int frame = (int) ((row_o*img_width + col_o)/(img_width*img_height-1));
            //printf("Found corner. row_o:%i col_o:%i pixelColor: %i, frame: %i\\n", row_o, col_o, pixelColor, frame);
            frameOrigin[frame*2] = col_o;
            frameOrigin[frame*2+1] = row_o % img_height;
        }

        // Check all criteria for validity
        // Modulo absolute thread location by frame size and store x val/y val to according locations in global array
        // After all global array values set, compare set of 2 x/y pairs to determine movement and set movement indicator in other global array
        // Return final global array
    }
    """)
    return kernels
def compile_kernel(self):
    """Compile the file containing the CUDA kernels."""
    t0 = time()
    kernel_code = open(self.kernel, 'r').read()
    self.mod = compiler.SourceModule(kernel_code)
    print('GPU - Kern : %f' % (time() - t0))
def __init__(self, n, nrhs, coeffs):
    '''
    Parameters
    ----------
    n: The size of the tridiagonal system.
    nrhs: The number of right hand sides.
    coeffs: A list of coefficients that make up the tridiagonal matrix:
        (b1, c1, ai, bi, ci, an, bn)
    '''
    self.n = n
    self.nrhs = nrhs
    self.coeffs = coeffs

    # check that system_size is a power of 2:
    assert np.int(np.log2(self.n)) == np.log2(self.n)

    # compute coefficients a, b, etc.,
    a, b, c, k1, k2, b_first, k1_first, k1_last = _precompute_coefficients(
        self.n, self.coeffs)

    # copy coefficients to buffers:
    self.a_d = gpuarray.to_gpu(a)
    self.b_d = gpuarray.to_gpu(b)
    self.c_d = gpuarray.to_gpu(c)
    self.k1_d = gpuarray.to_gpu(k1)
    self.k2_d = gpuarray.to_gpu(k2)
    self.b_first_d = gpuarray.to_gpu(b_first)
    self.k1_first_d = gpuarray.to_gpu(k1_first)
    self.k1_last_d = gpuarray.to_gpu(k1_last)

    tpl = jinja2.Template(kernel_template)
    # integer division so the rendered shared-memory size is an int
    rendered_kernel = tpl.render(n=self.n, shared_size=self.n // 2)
    module = compiler.SourceModule(rendered_kernel, options=['-O2'])
    self.cyclic_reduction = module.get_function('sharedMemCyclicReduction')
    self.cyclic_reduction.prepare('PPPPPPPPPddddd')