def test_multiple_2d_textures(self):
    mod = SourceModule("""
    texture<float, 2, cudaReadModeElementType> mtx_tex;
    texture<float, 2, cudaReadModeElementType> mtx2_tex;

    __global__ void copy_texture(float *dest)
    {
      int row = threadIdx.x;
      int col = threadIdx.y;
      int w = blockDim.y;
      dest[row*w+col] =
          tex2D(mtx_tex, row, col)
          + tex2D(mtx2_tex, row, col);
    }
    """)

    copy_texture = mod.get_function("copy_texture")
    mtx_tex = mod.get_texref("mtx_tex")
    mtx2_tex = mod.get_texref("mtx2_tex")

    shape = (3, 4)
    a = np.random.randn(*shape).astype(np.float32)
    b = np.random.randn(*shape).astype(np.float32)
    drv.matrix_to_texref(a, mtx_tex, order="F")
    drv.matrix_to_texref(b, mtx2_tex, order="F")

    dest = np.zeros(shape, dtype=np.float32)
    copy_texture(drv.Out(dest),
                 block=shape + (1, ),
                 texrefs=[mtx_tex, mtx2_tex])
    assert la.norm(dest - a - b) < 1e-6
def create_2d_texture(a, module, variable, point_sampling=False):
    a = numpy.ascontiguousarray(a)
    out_texref = module.get_texref(variable)
    cuda.matrix_to_texref(a, out_texref, order='C')
    if point_sampling:
        out_texref.set_filter_mode(cuda.filter_mode.POINT)
    else:
        out_texref.set_filter_mode(cuda.filter_mode.LINEAR)
    return out_texref
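# A minimal usage sketch (not part of the original helper): it pairs
# create_2d_texture with a kernel that reads the texture back out. The kernel
# source, the texture name "in_tex" and the 8x8 test data are illustrative
# assumptions; only the PyCUDA calls themselves (SourceModule, get_texref,
# matrix_to_texref, texrefs=...) are standard API. Texture references are a
# legacy CUDA feature, so this needs a toolkit that still supports them.
import numpy
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2> in_tex;

__global__ void sample(float *dest, int w)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // With order='C', texel (x, y) holds a[y, x].
    dest[y * w + x] = tex2D(in_tex, x, y);
}
""")

a = numpy.random.rand(8, 8).astype(numpy.float32)
texref = create_2d_texture(a, mod, "in_tex", point_sampling=True)

out = numpy.zeros_like(a)
mod.get_function("sample")(cuda.Out(out), numpy.int32(8),
                           block=(8, 8, 1), grid=(1, 1), texrefs=[texref])
assert numpy.allclose(out, a)  # point sampling at integer coordinates reproduces the input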
def copy_texture_memory_args(self, texmem_args):
    """adds texture memory arguments to the most recently compiled module

    :param texmem_args: A dictionary containing the data to be passed to the
        device texture memory. See tune_kernel().
    :type texmem_args: dict
    """
    filter_mode_map = {
        'point': drv.filter_mode.POINT,
        'linear': drv.filter_mode.LINEAR
    }
    address_mode_map = {
        'border': drv.address_mode.BORDER,
        'clamp': drv.address_mode.CLAMP,
        'mirror': drv.address_mode.MIRROR,
        'wrap': drv.address_mode.WRAP
    }

    logging.debug('copy_texture_memory_args called')
    logging.debug('current module: ' + str(self.current_module))
    self.texrefs = []
    for k, v in texmem_args.items():
        tex = self.current_module.get_texref(k)
        self.texrefs.append(tex)

        logging.debug('copying to texture: ' + str(k))
        if not isinstance(v, dict):
            data = v
        else:
            data = v['array']
        logging.debug('texture to be copied: ')
        logging.debug(data.nbytes)
        logging.debug(data.dtype)
        logging.debug(data.flags)

        drv.matrix_to_texref(data, tex, order="C")

        if isinstance(v, dict):
            if 'address_mode' in v and v['address_mode'] is not None:
                # address_mode is set per axis
                amode = v['address_mode']
                if not isinstance(amode, list):
                    amode = [amode] * data.ndim
                for i, m in enumerate(amode):
                    try:
                        if m is not None:
                            tex.set_address_mode(i, address_mode_map[m])
                    except KeyError:
                        raise ValueError('Unknown address mode: ' + m)
            if 'filter_mode' in v and v['filter_mode'] is not None:
                fmode = v['filter_mode']
                try:
                    tex.set_filter_mode(filter_mode_map[fmode])
                except KeyError:
                    raise ValueError('Unknown filter mode: ' + fmode)
            if 'normalized_coordinates' in v and v['normalized_coordinates']:
                tex.set_flags(tex.get_flags() | drv.TRSF_NORMALIZED_COORDINATES)
def wrap_cuda_convolution(img, kernel) -> np.ndarray:
    # check kernel and get radius (both dimensions must be odd)
    kernel_width, kernel_height = np.int32(np.shape(kernel))
    assert kernel_height % 2 and kernel_width % 2, "Kernel shape does not consist of odd numbers!"
    assert kernel_height == kernel_width, "Kernel is not a square!"

    # cast input to float32
    img_in = img.astype(np.float32)
    kernel_cpu = kernel.astype(np.float32)

    # pass data to cuda texture
    cuda.matrix_to_texref(img_in, TEX_IMG, order='C')
    cuda.matrix_to_texref(kernel_cpu, TEX_KERNEL, order='C')

    # setup output
    img_out = np.zeros_like(img, dtype=np.float32)

    # setup grid
    img_height, img_width = np.shape(img_in)
    blocksize = 32
    grid = (int(np.ceil(img_width / blocksize)),
            int(np.ceil(img_height / blocksize)), 1)

    kernel_radius = kernel_width // 2
    CUDA_CONVOLUTION(np.int32(img_width),
                     np.int32(img_height),
                     np.int32(kernel_radius),
                     cuda.Out(img_out),
                     texrefs=[TEX_IMG, TEX_KERNEL],
                     block=(blocksize, blocksize, 1),
                     grid=grid)
    return img_out
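# wrap_cuda_convolution() relies on module-level names (TEX_IMG, TEX_KERNEL,
# CUDA_CONVOLUTION) that are not shown. A hedged sketch of how they might be
# created follows; the kernel body and the texture names in the CUDA source are
# assumptions, and only the launch signature (width, height, radius, output
# pointer) is taken from the call above.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

_mod = SourceModule("""
texture<float, 2> tex_img;
texture<float, 2> tex_kernel;

__global__ void convolve(int width, int height, int radius, float *dest)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height)
        return;

    float acc = 0.0f;
    for (int ky = -radius; ky <= radius; ky++) {
        for (int kx = -radius; kx <= radius; kx++) {
            // reads outside the image are clamped by the default address mode
            acc += tex2D(tex_img, x + kx, y + ky)
                 * tex2D(tex_kernel, kx + radius, ky + radius);
        }
    }
    dest[y * width + x] = acc;
}
""")

CUDA_CONVOLUTION = _mod.get_function("convolve")
TEX_IMG = _mod.get_texref("tex_img")
TEX_KERNEL = _mod.get_texref("tex_kernel")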
def rotate_image(a, resize=1.5, angle=20., interpolation="linear", blocks=(16, 16, 1)):
    """
    Rotates the array. The new array has the new size and centers the
    picture in the middle.

    a             - array (2-dim)
    resize        - new_image w/old_image w
    angle         - degrees to rotate the image
    interpolation - "linear" or None
    blocks        - given to the kernel when run

    returns: a new array with dtype=uint8 containing the rotated image
    """
    angle = angle / 180. * pi

    # Convert this image to float. Unsigned int texture gave
    # strange results for me. This conversion is slow though :(
    a = a.astype("float32")

    # Calculate the dimensions of the new image
    calc_x = lambda x_y: (x_y[0] * a.shape[1] / 2. * cos(angle) - x_y[1] * a.shape[0] / 2. * sin(angle))
    calc_y = lambda x_y1: (x_y1[0] * a.shape[1] / 2. * sin(angle) + x_y1[1] * a.shape[0] / 2. * cos(angle))

    xs = [calc_x(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]]
    ys = [calc_y(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]]

    new_image_dim = (
        int(numpy.ceil(max(ys) - min(ys)) * resize),
        int(numpy.ceil(max(xs) - min(xs)) * resize),
    )

    # Now generate the cuda texture
    cuda.matrix_to_texref(a, texref, order="C")

    # We could set the next if we wanted to address the image
    # in normalized coordinates ( 0 <= coordinate < 1.)
    # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

    if interpolation == "linear":
        texref.set_filter_mode(cuda.filter_mode.LINEAR)

    # Calculate the gridsize. This is entirely given by the size of our image.
    gridx = new_image_dim[0] // blocks[0] if \
        new_image_dim[0] % blocks[0] == 0 else new_image_dim[0] // blocks[0] + 1
    gridy = new_image_dim[1] // blocks[1] if \
        new_image_dim[1] % blocks[1] == 0 else new_image_dim[1] // blocks[1] + 1

    # Get the output image
    output = numpy.zeros(new_image_dim, dtype="uint8")

    # Call the kernel
    copy_texture_func(
        numpy.float32(resize), numpy.float32(angle),
        numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]),
        numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]),
        cuda.Out(output), texrefs=[texref], block=blocks, grid=(gridx, gridy))

    return output
def rotate_image(a, resize=1.5, angle=180., interpolation="linear", blocks=(16, 16, 1)):
    """
    Rotates the array. The new array has the new size and centers the
    picture in the middle.

    a             - array (2-dim)
    resize        - new_image w/old_image w
    angle         - degrees to rotate the image
    interpolation - "linear" or None
    blocks        - given to the kernel when run

    returns: a new array with dtype=uint8 containing the rotated image
    """
    angle = angle / 180. * pi

    # Convert this image to float. Unsigned int texture gave
    # strange results for me. This conversion is slow though :(
    a = a.astype("float32")

    # Calculate the dimensions of the new image (Python 2 tuple-parameter lambdas)
    calc_x = lambda (x, y): (x * a.shape[1] / 2. * cos(angle) - y * a.shape[0] / 2. * sin(angle))
    calc_y = lambda (x, y): (x * a.shape[1] / 2. * sin(angle) + y * a.shape[0] / 2. * cos(angle))

    xs = [calc_x(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]]
    ys = [calc_y(p) for p in [(-1., -1.), (1., -1.), (1., 1.), (-1., 1.)]]

    new_image_dim = (
        int(numpy.ceil(max(ys) - min(ys)) * resize),
        int(numpy.ceil(max(xs) - min(xs)) * resize),
    )

    # Now generate the cuda texture
    cuda.matrix_to_texref(a, texref, order="C")

    # We could set the next if we wanted to address the image
    # in normalized coordinates ( 0 <= coordinate < 1.)
    # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

    if interpolation == "linear":
        texref.set_filter_mode(cuda.filter_mode.LINEAR)

    # Calculate the gridsize. This is entirely given by the size of our image.
    gridx = new_image_dim[0] / blocks[0] if \
        new_image_dim[0] % blocks[0] == 0 else new_image_dim[0] / blocks[0] + 1
    gridy = new_image_dim[1] / blocks[1] if \
        new_image_dim[1] % blocks[1] == 0 else new_image_dim[1] / blocks[1] + 1

    # Get the output image
    output = numpy.zeros(new_image_dim, dtype="uint8")

    # Call the kernel
    copy_texture_func(
        numpy.float32(resize), numpy.float32(angle),
        numpy.uint16(a.shape[1]), numpy.uint16(a.shape[0]),
        numpy.uint16(new_image_dim[1]), numpy.uint16(new_image_dim[0]),
        cuda.Out(output), texrefs=[texref], block=blocks, grid=(gridx, gridy))

    return output
def doit(img):
    width, height = (img.shape[0], img.shape[1])

    cuda.matrix_to_texref(img, texref, order="C")
    texref.set_filter_mode(cuda.filter_mode.LINEAR)

    gpu_output = np.zeros((width/2, height/2), dtype=np.float32)
    gridsize = (width / blocksize[0], height / blocksize[1])

    downsample_func(cuda.Out(gpu_output),
                    np.int32(width), np.int32(width/2),
                    block=blocksize, grid=gridsize,
                    texrefs=[texref])

    # gpu_output = gpu_output.transpose()
    return gpu_output
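# doit() above depends on module-level `texref`, `downsample_func` and
# `blocksize` objects that are not shown. The sketch below gives one plausible
# definition, assuming a 2x box downsample implemented through the texture's
# LINEAR filter; the kernel name and body are illustrative, and only the launch
# signature (output, input width, output width) mirrors the call above. It also
# assumes the launch grid does not exceed the output height.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

blocksize = (16, 16, 1)  # assumed block shape

_mod = SourceModule("""
texture<float, 2> tex;

__global__ void downsample(float *dest, int in_w, int out_w)
{
    // in_w is kept only to match the host-side call signature
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= out_w)
        return;
    // With LINEAR filtering, sampling halfway between four texel centres
    // returns their average, i.e. a 2x2 box filter of the input.
    dest[y * out_w + x] = tex2D(tex, 2 * x + 1.0f, 2 * y + 1.0f);
}
""")

downsample_func = _mod.get_function("downsample")
texref = _mod.get_texref("tex")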
def kernel(self):
    density, x, y, z, Qx, Qy, Qz, result = self.born_args
    nx, ny, nz, nqx, nqy, nqz = [numpy.int32(len(v))
                                 for v in x, y, z, Qx, Qy, Qz]
    cx, cy, cz, cQx, cQy, cQz, cdensity = [gpuarray.to_gpu(v)
                                           for v in x, y, z, Qx, Qy, Qz, density]
    cframe = cuda.mem_alloc(result[0].nbytes)

    cuda.matrix_to_texref(nx, texref, order="C")
    texref.set_filter_mode(cuda.filter_mode.LINEAR)

    n = int(1 * nqy * nqz)

    print 'fn in kernel'
    while True:
        try:
            qxi = numpy.int32(self.work_queue.get(block=False))
        except Queue.Empty:
            break

        print "%d of %d on %d\n" % (qxi, nqx, self.gpu),

        cuda_texture_func(nx, ny, nz, nqx, nqy, nqz,
                          cdensity, cx, cy, cz, cQx, cQy, cQz,
                          qxi, cuda.Out(result), texrefs=[texref])

        #self.cudaBorn(nx,ny,nz,nqx,nqy,nqz,
        #              cdensity,cx,cy,cz,cQx,cQy,cQz,qxi,cframe,
        #              **cuda_partition(n))
        ## Delay fetching result until the kernel is complete
        #cuda_sync()
        ## Fetch result back to the CPU
        #cuda.memcpy_dtoh(result[qxi], cframe)

        print "%d %s\n" % (qxi, ctemp.get())

    del cx, cy, cz, cQx, cQy, cQz, cdensity, cframe
def cuda_interpolate(self, channel, m, size_result):
    cols = size_result[0]
    rows = size_result[1]

    kernel_code = """
    texture<float, 2> tex;

    __global__ void interpolation(float *dest, float *m0, float *m1)
    {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        int idy = threadIdx.y + blockDim.y * blockIdx.y;

        if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
            dest[%(NDIM)s * idx + idy] = tex2D(tex, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
        }
    }
    """
    kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}

    mod = SourceModule(kernel_code)

    interpolation = mod.get_function("interpolation")
    texref = mod.get_texref("tex")

    channel = channel.astype("float32")
    drv.matrix_to_texref(channel, texref, order="F")
    texref.set_filter_mode(drv.filter_mode.LINEAR)

    bdim = (16, 16, 1)
    dx, mx = divmod(cols, bdim[0])
    dy, my = divmod(rows, bdim[1])
    gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

    dest = np.zeros((rows, cols)).astype("float32")
    m0 = (m[0, :] - 1).astype("float32")
    m1 = (m[1, :] - 1).astype("float32")

    interpolation(drv.Out(dest), drv.In(m0), drv.In(m1),
                  block=bdim, grid=gdim, texrefs=[texref])

    return dest.astype("uint8")
def gpu_calc(image, sigma_r, sigma_d):
    N, M = image.shape[0], image.shape[1]
    block_size = (16, 16, 1)
    grid_size = (int(np.ceil(N / block_size[0])),
                 int(np.ceil(M / block_size[1])))
    result = np.zeros((N, M), dtype=np.uint32)

    calc = mod.get_function("calc")

    start = time.time()
    tex = mod.get_texref("tex")
    driver.matrix_to_texref(image.astype(np.uint32), tex, order="C")
    calc(driver.Out(result), np.int32(N), np.int32(M),
         np.float32(sigma_d), np.float32(sigma_r),
         block=block_size, grid=grid_size, texrefs=[tex])
    driver.Context.synchronize()
    end = time.time()

    cv2.imwrite('img_gpu.bmp', result.astype(np.uint8))
    return end - start
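# gpu_calc() relies on a module-level `mod` whose kernel source is not shown.
# Below is a hedged sketch of what a matching module could look like: a plain
# bilateral filter whose launch signature (result, N, M, sigma_d, sigma_r)
# mirrors the call above. The kernel body, the 3*sigma_d window radius and the
# texture name are assumptions, not the original implementation.
import numpy as np
import pycuda.autoinit
import pycuda.driver as driver
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<unsigned int, 2> tex;

__global__ void calc(unsigned int *dest, int N, int M,
                     float sigma_d, float sigma_r)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;   // row index, bounded by N
    int j = blockIdx.y * blockDim.y + threadIdx.y;   // column index, bounded by M
    if (i >= N || j >= M)
        return;

    int radius = (int)ceilf(3.0f * sigma_d);
    float center = (float)tex2D(tex, j, i);          // order="C": tex2D(col, row)
    float num = 0.0f, den = 0.0f;

    for (int dy = -radius; dy <= radius; dy++) {
        for (int dx = -radius; dx <= radius; dx++) {
            // reads outside the image are clamped by the default address mode
            float neigh = (float)tex2D(tex, j + dx, i + dy);
            float w = expf(-(dx * dx + dy * dy) / (2.0f * sigma_d * sigma_d)
                           - (neigh - center) * (neigh - center)
                             / (2.0f * sigma_r * sigma_r));
            num += w * neigh;
            den += w;
        }
    }
    dest[i * M + j] = (unsigned int)(num / den);
}
""")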
def cuda_interpolate3D(self, img, m, size_result):
    cols = size_result[0]
    rows = size_result[1]

    kernel_code = """
    texture<float, 2> texR;
    texture<float, 2> texG;
    texture<float, 2> texB;

    __global__ void interpolation(float *dest, float *m0, float *m1)
    {
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        int idy = threadIdx.y + blockDim.y * blockIdx.y;

        if (( idx < %(NCOLS)s ) && ( idy < %(NDIM)s )) {
            dest[3*(%(NDIM)s * idx + idy)] = tex2D(texR, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
            dest[3*(%(NDIM)s * idx + idy) + 1] = tex2D(texG, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
            dest[3*(%(NDIM)s * idx + idy) + 2] = tex2D(texB, m0[%(NDIM)s * idy + idx], m1[%(NDIM)s * idy + idx]);
        }
    }
    """
    kernel_code = kernel_code % {'NCOLS': cols, 'NDIM': rows}

    mod = SourceModule(kernel_code)

    interpolation = mod.get_function("interpolation")
    texrefR = mod.get_texref("texR")
    texrefG = mod.get_texref("texG")
    texrefB = mod.get_texref("texB")

    img = img.astype("float32")
    drv.matrix_to_texref(img[:, :, 0], texrefR, order="F")
    texrefR.set_filter_mode(drv.filter_mode.LINEAR)
    drv.matrix_to_texref(img[:, :, 1], texrefG, order="F")
    texrefG.set_filter_mode(drv.filter_mode.LINEAR)
    drv.matrix_to_texref(img[:, :, 2], texrefB, order="F")
    texrefB.set_filter_mode(drv.filter_mode.LINEAR)

    bdim = (16, 16, 1)
    dx, mx = divmod(cols, bdim[0])
    dy, my = divmod(rows, bdim[1])
    gdim = ((dx + (mx > 0)) * bdim[0], (dy + (my > 0)) * bdim[1])

    dest = np.zeros((rows, cols, 3)).astype("float32")
    m0 = (m[0, :] - 1).astype("float32")
    m1 = (m[1, :] - 1).astype("float32")

    interpolation(drv.Out(dest), drv.In(m0), drv.In(m1),
                  block=bdim, grid=gdim, texrefs=[texrefR, texrefG, texrefB])

    return dest.astype("uint8")
def load_texture(self, name, arr):
    '''
    Loads an array into a texture with a name.
    Address by the name in the kernel code.
    '''
    tex = self.mod.get_texref(name)
    # x*y*z
    arr = arr.astype('float32')
    if len(arr.shape) == 3:
        carr = arr.copy('F')
        texarray = numpy3d_to_array(carr, 'F')
        tex.set_array(texarray)
    else:
        if len(arr.shape) == 1:
            arr = np.expand_dims(arr, 1)
        tex.set_flags(0)
        cuda.matrix_to_texref(arr, tex, order='F')
    tex.set_address_mode(0, cuda.address_mode.CLAMP)
    tex.set_address_mode(1, cuda.address_mode.CLAMP)
    tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    tex.set_filter_mode(cuda.filter_mode.LINEAR)
    self.textures[name] = tex
def trikmeans_gpu(data, clusters, iterations, return_times=0):
    # trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    #
    # kmeans using triangle inequality algorithm and cuda
    # input arguments are the data, initial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters)
    #
    # The return values are the updated clusters and labels for the data

    #---------------------------------------------------------------
    # get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    # set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 1

    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16))
    gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist

    # block and grid sizes for the init module
    threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16)
    blocksize_init = min(512, threads_desired)
    gridsize_init = 1 + (threads_desired - 1)/blocksize_init

    # block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    gridsize_step3 = gridsize_init

    # block and grid sizes for the step4 module
    for blocksize_step4_x in range(32, 512, 32):
        if blocksize_step4_x >= nClusters:
            break
    blocksize_step4_y = min(nDim, 512/blocksize_step4_x)
    gridsize_step4_x = 1 + (nClusters-1)/blocksize_step4_x
    gridsize_step4_y = 1 + (nDim-1)/blocksize_step4_y

    # block and grid sizes for the calc_movement module
    blocksize_calcm = blocksize_step4_x
    gridsize_calcm = gridsize_step4_x

    # block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init

    #---------------------------------------------------------------
    # prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    mod_ccdist = mods2.get_ccdist_module(nDim, nPts, nClusters, blocksize_ccdist,
                                         blocksize_init, blocksize_step4_x,
                                         blocksize_step4_y, blocksize_step56,
                                         useTextureForData)
    #mod_step56 = mods2.get_step56_module(nDim, nPts, nClusters, blocksize_step56)

    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2-t1

    #---------------------------------------------------------------
    # setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        gpu_data = gpuarray.to_gpu(data)

    gpu_clusters = gpuarray.to_gpu(clusters)
    gpu_assignments = gpuarray.zeros((nPts,), np.int32)              # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32)        # lower bounds on distance between
                                                                     # point and each cluster
    gpu_upper = gpuarray.zeros((nPts,), np.float32)                  # upper bounds on distance between
                                                                     # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32)  # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32)         # half distance to closest
    gpu_hdClosest.fill(1.0e10)  # set to large value // **TODO** get the actual float max
    gpu_badUpper = gpuarray.zeros((nPts,), np.int32)                 # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32)
    gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32)
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2-t1

    #---------------------------------------------------------------
    # do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.

    t1 = time.time()
    ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
           block=(blocksize_ccdist, 1, 1),
           grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2-t1

    t1 = time.time()
    calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1),
                   grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2-t1

    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
             gpu_lower, gpu_upper,
             block=(blocksize_init, 1, 1),
             grid=(gridsize_init, 1),
             texrefs=[texrefData])
    else:
        init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
             gpu_lower, gpu_upper,
             block=(blocksize_init, 1, 1),
             grid=(gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2-t1

    """
    print "data"
    print data
    print "gpu_dataout"
    print gpu_dataout
    return 1
    """

    for i in range(iterations):

        if i > 0:
            t1 = time.time()
            ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1),
                   grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2-t1

            t1 = time.time()
            calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                           block=(blocksize_ccdist, 1, 1),
                           grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2-t1

        """
        print "Just before step 3=========================================="
        print "gpu_clusters"
        print gpu_clusters
        print "gpu_ccdist"
        print gpu_ccdist
        print "gpu_hdClosest"
        print gpu_hdClosest
        print "gpu_assignments"
        print gpu_assignments
        print "gpu_lower"
        print gpu_lower
        print "gpu_upper"
        print gpu_upper
        print "gpu_badUpper"
        print gpu_badUpper
        """

        t1 = time.time()
        gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                  gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1),
                  grid=(gridsize_step3, 1),
                  texrefs=[texrefData])
        else:
            step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest,
                  gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper,
                  gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1),
                  grid=(gridsize_step3, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2-t1

        """
        print "gpu_cluster_changed"
        print gpu_cluster_changed.get()
        """

        """
        print "Just before step 4=========================================="
        print "gpu_assignments"
        print gpu_assignments
        print "gpu_lower"
        print gpu_lower
        print "gpu_upper"
        print gpu_upper
        print "gpu_badUpper"
        print gpu_badUpper
        """

        t1 = time.time()
        if useTextureForData:
            step4(gpu_clusters, gpu_clusters2, gpu_assignments,
                  gpu_cluster_movement, gpu_cluster_changed,
                  block=(blocksize_step4_x, blocksize_step4_y, 1),
                  grid=(gridsize_step4_x, gridsize_step4_y),
                  texrefs=[texrefData])
        else:
            step4(gpu_data, gpu_clusters, gpu_clusters2, gpu_assignments,
                  gpu_cluster_movement, gpu_cluster_changed,
                  block=(blocksize_step4_x, blocksize_step4_y, 1),
                  grid=(gridsize_step4_x, gridsize_step4_y))

        #"""
        calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement,
                      block=(blocksize_calcm, 1, 1),
                      grid=(gridsize_calcm, 1))
        #"""

        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2-t1

        """
        print "Just before step 5=========================================="
        print "gpu_cluste_movement"
        print gpu_cluster_movement
        print "gpu_clusters"
        print gpu_clusters2
        """

        t1 = time.time()
        #------------------------------------------------------------------
        if useTextureForData:
            step56(gpu_assignments, gpu_lower, gpu_upper,
                   gpu_cluster_movement, gpu_badUpper,
                   block=(blocksize_step56, 1, 1),
                   grid=(gridsize_step56, 1),
                   texrefs=[texrefData])
        else:
            step56(gpu_assignments, gpu_lower, gpu_upper,
                   gpu_cluster_movement, gpu_badUpper,
                   block=(blocksize_step56, 1, 1),
                   grid=(gridsize_step56, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2-t1
        #--------------------------------------------------------------

        """
        print "Just after step 6=========================================="
        print "gpu_lower"
        print gpu_lower
        print "gpu_upper"
        print gpu_upper
        print "gpu_badUpper"
        print gpu_badUpper
        """

        #if gpuarray.sum(gpu_cluster_movement).get() < 1.e-7:
            #print "No change in clusters!"
            #break

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
                gpu_clusters.get(), gpu_cluster_movement, \
                data_time, module_time, init_time, \
                ccdist_time/iterations, hdclosest_time/iterations, \
                step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
def kmeans_gpu(data, clusters, iterations, return_times=0):
    # kmeans_gpu(data, clusters, iterations) returns (clusters, labels)
    #
    # kmeans using standard algorithm and cuda
    # input arguments are the data, initial cluster values, and number of iterations to repeat
    # The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and
    # nPts = number of data points
    # The shape of clusters is (nDim, nClusters)
    #
    # The return values are the updated clusters and labels for the data

    #---------------------------------------------------------------
    # get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    # set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    usePageLockedMemory = 0
    if (nPts > 32768):
        useTextureForData = 0

    # block and grid sizes for the cluster_assign kernel
    threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) / 16)
    blocksize_assign = min(256, threads_desired)
    gridsize_assign = 1 + (threads_desired - 1) / blocksize_assign
    """
    print "\nblocksize_assign =", blocksize_assign
    print "gridsize_assign =", gridsize_assign
    """

    # block and grid sizes for the cluster_calc kernel
    blocksize_calc = 2
    while (blocksize_calc < min(512, nPts)):
        blocksize_calc *= 2
    maxblocks = 512
    seqcount_calc = 1 + (nPts - 1) / (blocksize_calc * maxblocks)
    gridsize_calc = 1 + (nPts - 1) / (seqcount_calc * blocksize_calc)
    """
    print "blocksize_calc =", blocksize_calc
    print "gridsize_calc =", gridsize_calc
    print "seqcount_calc =", seqcount_calc
    """

    blocksize_calc_part2 = 1
    while (blocksize_calc_part2 < gridsize_calc):
        blocksize_calc_part2 *= 2

    #---------------------------------------------------------------
    # prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    mod_cuda = kernels.get_cuda_module(nDim, nPts, nClusters,
                                       blocksize_calc, seqcount_calc,
                                       gridsize_calc, blocksize_calc_part2,
                                       useTextureForData, BOUNDS)

    cuda_assign = mod_cuda.get_function("assign")
    cuda_calc = mod_cuda.get_function("calc")
    cuda_calc_part2 = mod_cuda.get_function("calc_part2")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2 - t1

    #---------------------------------------------------------------
    # setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_cuda.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:, :] = data
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:, :] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)

    gpu_assignments = gpuarray.zeros((nPts, ), np.int32)
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_calc), np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_calc, ), np.int32)
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2 - t1

    #---------------------------------------------------------------
    # do calculations
    #---------------------------------------------------------------
    assign_time = 0.
    calc_time = 0.

    for i in range(iterations):
        # assign data to clusters
        t1 = time.time()
        if useTextureForData:
            cuda_assign(gpu_clusters, gpu_assignments,
                        block=(blocksize_assign, 1, 1),
                        grid=(gridsize_assign, 1),
                        texrefs=[texrefData])
        else:
            cuda_assign(gpu_data, gpu_clusters, gpu_assignments,
                        block=(blocksize_assign, 1, 1),
                        grid=(gridsize_assign, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        assign_time += t2 - t1

        # calculate new cluster centers
        t1 = time.time()
        if useTextureForData:
            cuda_calc(gpu_reduction_out, gpu_reduction_counts, gpu_assignments,
                      block=(blocksize_calc, 1, 1),
                      grid=(gridsize_calc, nDim),
                      texrefs=[texrefData])
        else:
            cuda_calc(gpu_data, gpu_reduction_out, gpu_reduction_counts,
                      gpu_assignments,
                      block=(blocksize_calc, 1, 1),
                      grid=(gridsize_calc, nDim))
        cuda_calc_part2(gpu_reduction_out, gpu_reduction_counts,
                        gpu_clusters2, gpu_clusters,
                        block=(blocksize_calc_part2, 1, 1),
                        grid=(1, nDim))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        calc_time += t2 - t1

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_assignments, gpu_clusters.get(), \
                data_time, module_time, assign_time/iterations, calc_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
def main():

    #Initialise InteractionMatrix
    def Delta(a, b):
        if a == b:
            return 1
        else:
            return 0

    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = (1 - i % 2) * Delta(i, j+1) + (i % 2) * Delta(i, j-1)

    #Initialise GPU (equivalent of autoinit)
    drv.init()
    assert drv.Device.count() >= 1
    dev = drv.Device(0)
    ctx = dev.make_context(0)

    #Convert GlobalParams to List
    GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32)
    count = 0
    for x in GlobalParamsDict.keys():
        GlobalParams[count] = GlobalParamsDict[x]
        count += 1

    #Convert FitnessParams to List
    FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32)
    count = 0
    for x in FitnessParamsDict.keys():
        FitnessParams[count] = FitnessParamsDict[x]
        count += 1

    #Convert GAParams to List
    GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32)
    count = 0
    for x in GAParamsDict.keys():
        GAParams[count] = GAParamsDict[x]
        count += 1

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', 'cuda'))

    # Load source code from file
    Source = env.get_template('kernel.cu')  #Template( file(KernelFile).read() )

    #Create dictionary argument for rendering
    RenderArgs = {"params_size": GlobalParams.nbytes,
                  "fitnessparams_size": FitnessParams.nbytes,
                  "gaparams_size": GAParams.nbytes,
                  "genome_bytelength": int(ByteLengthGenome),
                  "genome_bitlength": int(BitLengthGenome),
                  "ga_nr_threadsperblock": GA_NrThreadsPerBlock,
                  "textures": range(0, NrFitnessFunctionGrids),
                  "curandinit_nr_threadsperblock": CurandInit_NrThreadsPerBlock,
                  "with_mixed_crossover": WithMixedCrossover,
                  "with_bank_conflict": WithBankConflict,
                  "with_naive_roulette_wheel_selection": WithNaiveRouletteWheelSelection,
                  "with_assume_normalized_fitness_function_values": WithAssumeNormalizedFitnessFunctionValues,
                  "with_uniform_crossover": WithUniformCrossover,
                  "with_single_point_crossover": WithSinglePointCrossover,
                  "with_surefire_mutation": WithSurefireMutation,
                  "with_storeassembledgridsinglobalmemory": WithStoreAssembledGridsInGlobalMemory,
                  "ga_threaddimx": int(ThreadDim),
                  "glob_nr_tiletypes": int(NrTileTypes),
                  "glob_nr_edgetypes": int(NrEdgeTypes),
                  "glob_nr_tileorientations": int(NrTileOrientations),
                  "fit_dimgridx": int(DimGridX),
                  "fit_dimgridy": int(DimGridY),
                  "fit_nr_fitnessfunctiongrids": int(NrFitnessFunctionGrids),
                  "fit_nr_fourpermutations": int(NrFourPermutations),
                  "fit_assembly_redundancy": int(NrAssemblyRedundancy),
                  "fit_nr_threadsperblock": int(Fit_NrThreadsPerBlock),
                  "sort_threaddimx": int(Sort_ThreadDimX),
                  "glob_nr_genomes": int(NrGenomes),
                  "fit_dimthreadx": int(ThreadDimX),
                  "fit_dimthready": int(ThreadDimY),
                  "fit_dimsubgridx": int(SubgridDimX),
                  "fit_dimsubgridy": int(SubgridDimY),
                  "fit_nr_subgridsperbank": int(NrSubgridsPerBank),
                  "glob_bitlength_edgetype": int(EdgeTypeBitLength)}

    # Render source code
    RenderedSource = Source.render(RenderArgs)

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True,
                                      arch="compute_11", code="sm_11", cache_dir=None)

    #Allocate values on GPU
    Genomes_h = drv.mem_alloc(Genomes.nbytes)
    FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes)
    FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes)
    AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes)
    Mutexe_h = drv.mem_alloc(Mutexe.nbytes)
    ReductionList_h = drv.mem_alloc(ReductionList.nbytes)

    #Copy values to global memory
    drv.memcpy_htod(Genomes_h, Genomes)
    drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums)
    drv.memcpy_htod(FitnessValues_h, FitnessValues)
    drv.memcpy_htod(AssembledGrids_h, AssembledGrids)
    drv.memcpy_htod(Mutexe_h, Mutexe)

    #Copy values to constant / texture memory
    for id in range(0, NrFitnessFunctionGrids):
        FitnessFunctionGrids_h.append(KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d" % (id)))
        drv.matrix_to_texref(FitnessFunctionGrids[id], FitnessFunctionGrids_h[id], order="C")
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref(InteractionMatrix, InteractionMatrix_h, order="C")

    GlobalParams_h = KernelSourceModule.get_global("c_fParams")  # Constant memory address
    drv.memcpy_htod(GlobalParams_h[0], GlobalParams)
    FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams")  # Constant memory address
    drv.memcpy_htod(FitnessParams_h[0], FitnessParams)
    GAParams_h = KernelSourceModule.get_global("c_fGAParams")  # Constant memory address
    drv.memcpy_htod(GAParams_h[0], GAParams)
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations")  # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)
    FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst")
    FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst")

    #Set up curandStates
    curandState_bytesize = 40  # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper)
    CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes)

    #Compile kernels
    curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel")
    fitness_fnc = KernelSourceModule.get_function("FitnessKernel")
    sorting_fnc = KernelSourceModule.get_function("SortingKernel")
    ga_fnc = KernelSourceModule.get_function("GAKernel")

    #Initialise Curand
    curandinit_fnc(CurandStates_h,
                   block=(int(CurandInit_NrThreadsPerBlock), 1, 1),
                   grid=(int(CurandInit_NrBlocks), 1))

    #Build parameter lists for FitnessKernel and GAKernel
    FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h)
    SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h)
    GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h)

    #TEST ONLY
    return
    #TEST ONLY

    #Initialise CUDA timers
    start = drv.Event()
    stop = drv.Event()

    #execute kernels for specified number of generations
    start.record()
    for gen in range(0, GlobalParamsDict["NrGenerations"]):
        #print "Processing Generation: %d"%(gen)

        #fitness_fnc(*(FitnessKernelParams), block=fit_blocks, grid=fit_grid)

        #Launch CPU processing (should be asynchronous calls)

        sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids)  #Launch Sorting Kernel

        drv.memcpy_dtoh(ReductionList, ReductionList_h)  #Copy from Device to Host and finish sorting
        FitnessSumConst = ReductionList.sum()
        drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst)  #Copy from Host to Device constant memory
        drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes)  #Copy FitnessValues from Device to Device Const

        ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids)

        drv.memcpy_dtoh(Genomes, Genomes_h)  #Copy data from GPU
        drv.memcpy_dtoh(FitnessValues, FitnessValues_h)
        drv.memcpy_dtoh(AssembledGrids, AssembledGrids_h)

    stop.record()
    stop.synchronize()
    print "Total kernel time taken: %fs" % (start.time_till(stop) * 1e-3)
    print "Mean time per generation: %fs" % (start.time_till(stop) * 1e-3 / NrGenerations)

    pass
def mds(MATRIX_FILE, DIMENSIONS, N_ITERATIONS, IMAGES_EVERY, STATION_BLOCK_SIZE,
        N_NEARBY_STATIONS, DEBUG_OUTPUT, STATUS_FUNCTION, GRAPH_FUNCTION):

    print 'Loading matrix...'
    npz = np.load(MATRIX_FILE)
    station_coords = npz['station_coords']
    grid_dim = npz['grid_dim']
    matrix = npz['matrix'].astype(np.int32)

    # EVERYTHING SHOULD BE IN FLOAT32 for ease of debugging. even times.
    # Matrix and others should be textures, arrays, or in constant memory, to do cacheing.
    # As it is, I'm doing explicit cacheing.

    # force OD matrix symmetry for test
    # THIS was responsible for the coordinate drift!!!
    # need to symmetrize it before copy to device
    matrix = (matrix + matrix.T) / 2

    station_coords_int = station_coords.round().astype(np.int32)

    # to be removed when textures are working
    station_coords_gpu = gpuarray.to_gpu(station_coords_int)
    matrix_gpu = gpuarray.to_gpu(matrix)

    max_x, max_y = grid_dim
    n_gridpoints = int(max_x * max_y)
    n_stations = len(station_coords)

    cuda_grid_shape = (int(math.ceil(float(max_x) / CUDA_BLOCK_SHAPE[0])),
                       int(math.ceil(float(max_y) / CUDA_BLOCK_SHAPE[1])))

    print "\n----PARAMETERS----"
    print "Input file: ", MATRIX_FILE
    print "Number of stations: ", n_stations
    print "OD matrix shape: ", matrix.shape
    print "Station coords shape: ", station_coords_int.shape
    print "Station cache size: ", N_NEARBY_STATIONS
    print "Map dimensions: ", grid_dim
    print "Number of map points: ", n_gridpoints
    print "Target space dimensionality: ", DIMENSIONS
    print "CUDA block dimensions: ", CUDA_BLOCK_SHAPE
    print "CUDA grid dimensions: ", cuda_grid_shape

    assert station_coords.shape == (n_stations, 2)
    assert N_NEARBY_STATIONS <= n_stations

    #sys.exit()

    # Make and register custom color map for pylab graphs
    cdict = {'red':   ((0.0, 0.0, 0.0),
                       (0.2, 0.0, 0.0),
                       (0.4, 0.9, 0.9),
                       (1.0, 0.0, 0.0)),
             'green': ((0.0, 0.0, 0.1),
                       (0.05, 0.9, 0.9),
                       (0.1, 0.0, 0.0),
                       (0.4, 0.9, 0.9),
                       (0.6, 0.0, 0.0),
                       (1.0, 0.0, 0.0)),
             'blue':  ((0.0, 0.0, 0.0),
                       (0.05, 0.0, 0.0),
                       (0.2, 0.9, 0.9),
                       (0.3, 0.0, 0.0),
                       (1.0, 0.0, 0.0))}

    mymap = LinearSegmentedColormap('mymap', cdict)
    mymap.set_over((1.0, 0.0, 1.0))
    mymap.set_bad((0.0, 0.0, 0.0))
    pl.plt.register_cmap(cmap=mymap)

    # set up arrays for calculations

    coords_gpu = gpuarray.to_gpu(np.random.random((max_x, max_y, DIMENSIONS)).astype(np.float32))  # initialize coordinates to random values in range 0...1
    forces_gpu = gpuarray.zeros((int(max_x), int(max_y), DIMENSIONS), dtype=np.float32)            # 3D float32 accumulate forces over one timestep
    weights_gpu = gpuarray.zeros((int(max_x), int(max_y)), dtype=np.float32)                       # 2D float32 cell error accumulation
    errors_gpu = gpuarray.zeros((int(max_x), int(max_y)), dtype=np.float32)                        # 2D float32 cell error accumulation
    near_stations_gpu = gpuarray.zeros((cuda_grid_shape[0], cuda_grid_shape[1], N_NEARBY_STATIONS), dtype=np.int32)

    debug_gpu = gpuarray.zeros(n_gridpoints, dtype=np.int32)
    debug_img_gpu = gpuarray.zeros_like(errors_gpu)

    print "\n----COMPILATION----"
    # times could be merged into forces kernel, if done by pixel not station.
    # integrate kernel could be GPUArray operation; also helps clean up code by using GPUArrays.
    # DIM should be replaced by python script, so as not to define twice.
    src = open("unified_mds.cu").read()
    src = src.replace('N_NEARBY_STATIONS_PYTHON', str(N_NEARBY_STATIONS))
    src = src.replace('N_STATIONS_PYTHON', str(n_stations))
    src = src.replace('DIMENSIONS_PYTHON', str(DIMENSIONS))
    mod = SourceModule(src, options=["--ptxas-options=-v"])
    stations_kernel = mod.get_function("stations")
    forces_kernel = mod.get_function("forces")
    integrate_kernel = mod.get_function("integrate")

    matrix_texref = mod.get_texref('tex_matrix')
    station_coords_texref = mod.get_texref('tex_station_coords')
    near_stations_texref = mod.get_texref('tex_near_stations')
    #ts_coords_texref = mod.get_texref('tex_ts_coords') could be a 4-channel 2 dim texture, or 3 dim texture. or just 1D.

    cuda.matrix_to_texref(matrix, matrix_texref, order="F")  # copy directly to device with texref - made for 2D x 1channel textures
    cuda.matrix_to_texref(station_coords_int, station_coords_texref, order="F")  # fortran ordering, because we will be accessing with texND() instead of C-style indices
    near_stations_gpu.bind_to_texref_ext(near_stations_texref)

    # note, cuda.In and cuda.Out are from the perspective of the KERNEL not the host app!
    stations_kernel(near_stations_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
    autoinit.context.synchronize()

    #print "Near stations list:"
    #print near_stations_gpu

    print "\n----CALCULATION----"
    t_start = time.time()
    n_pass = 0
    while (n_pass < N_ITERATIONS):
        n_pass += 1
        # Pay attention to grid sizes when testing: if you don't run the integrator on the coordinates connected to stations,
        # they don't move... so the whole thing stabilizes in a couple of cycles.
        # Stations are worked on in blocks to avoid locking up the GPU with one giant kernel.

        for subset_low in range(0, n_stations, STATION_BLOCK_SIZE):
            subset_high = subset_low + STATION_BLOCK_SIZE
            if subset_high > n_stations:
                subset_high = n_stations

            sys.stdout.write("\rpass %03i / station %04i of %04i / total runtime %03.1f min " %
                             (n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0))
            sys.stdout.flush()

            STATUS_FUNCTION(n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0,
                            (time.time() - t_start) / n_pass + (subset_low / n_stations))

            # adding texrefs in kernel call seems to change nothing, leaving them out.
            # max_x and max_y could be #defined in kernel source, along with STATION_BLOCK_SIZE

            forces_kernel(np.int32(n_stations), np.int32(subset_low), np.int32(subset_high),
                          max_x, max_y,
                          coords_gpu, forces_gpu, weights_gpu, errors_gpu,
                          debug_gpu, debug_img_gpu,
                          block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
            autoinit.context.synchronize()
            # print coords_gpu, forces_gpu
            time.sleep(0.5)  # let the user interface catch up.

        integrate_kernel(max_x, max_y,
                         coords_gpu, forces_gpu, weights_gpu,
                         block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape)
        autoinit.context.synchronize()

        print IMAGES_EVERY
        if (IMAGES_EVERY > 0) and (n_pass % IMAGES_EVERY == 0):
            #print 'Kernel debug output:'
            #print debug_gpu

            velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis=2))
            pl.imshow(velocities.T, cmap=mymap, origin='bottom', vmin=0, vmax=100)
            pl.title('Velocity ( sec / timestep) - step %03d' % n_pass)
            pl.colorbar()
            pl.savefig('img/vel%03d.png' % n_pass)
            pl.close()

            pl.imshow((errors_gpu.get() / weights_gpu.get() / 60.0).T, cmap=mymap, origin='bottom', vmin=0, vmax=100)
            pl.title('Average absolute error (min) - step %03d' % n_pass)
            pl.colorbar()
            pl.savefig('img/err%03d.png' % n_pass)
            pl.close()

            pl.imshow((debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom', vmin=0, vmax=100)
            pl.title('Debugging Output - step %03d' % n_pass)
            pl.colorbar()
            pl.savefig('img/debug%03d.png' % n_pass)
            pl.close()

            GRAPH_FUNCTION('img/err%03d.png' % n_pass)
            #INTERFACE.update

        sys.stdout.write("/ avg pass time %02.1f sec" % ((time.time() - t_start) / n_pass, ))
        sys.stdout.flush()
def setup_texture_nparr(tex_ref, arr):
    cuda.matrix_to_texref(load_map(arr).astype(np.float32), tex_ref, order="C")
    tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
    tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
    tex_ref.set_filter_mode(cuda.filter_mode.POINT)
    tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
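# A small, self-contained illustration (not taken from the original code) of
# what the flags set in setup_texture_nparr mean on the kernel side: with
# TRSF_NORMALIZED_COORDINATES the texture is addressed with coordinates in
# [0, 1), and WRAP makes out-of-range lookups periodic. The kernel, texture
# name and 4x4 test array below are assumptions used only for the demonstration.
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2> map_tex;

__global__ void sample_periodic(float *dest, int w, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= w || y >= h)
        return;
    // Normalized coordinates: texel centres sit at (i + 0.5) / extent.
    // Adding 1.0 to u changes nothing because WRAP is periodic.
    float u = (x + 0.5f) / w + 1.0f;
    float v = (y + 0.5f) / h;
    dest[y * w + x] = tex2D(map_tex, u, v);
}
""")

tex_ref = mod.get_texref("map_tex")
arr = np.arange(16, dtype=np.float32).reshape(4, 4)
cuda.matrix_to_texref(arr, tex_ref, order="C")
tex_ref.set_address_mode(0, cuda.address_mode.WRAP)
tex_ref.set_address_mode(1, cuda.address_mode.WRAP)
tex_ref.set_filter_mode(cuda.filter_mode.POINT)
tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

out = np.zeros_like(arr)
mod.get_function("sample_periodic")(cuda.Out(out), np.int32(4), np.int32(4),
                                    block=(4, 4, 1), grid=(1, 1),
                                    texrefs=[tex_ref])
assert np.allclose(out, arr)  # wrapped, normalized lookups reproduce the input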
def main():

    #Create dictionary argument for rendering
    RenderArgs = {"safe_memory_mapping": 1,
                  "aligned_byte_length_genome": 8,
                  "bit_length_edge_type": 3,
                  "curand_nr_threads_per_block": 256,
                  "nr_tile_types": 4,
                  "nr_edge_types": 8,
                  "warpsize": 32,
                  "fit_dim_thread_x": 1,
                  "fit_dim_thread_y": 1,
                  "fit_dim_block_x": 1}

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))
    # Load source code from file
    Source = env.get_template('./alpha.cu')  #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render(RenderArgs)

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("TestEdgeSortKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")

    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros((8, 8)).astype(numpy.float32)

    def Delta(a, b):
        if a == b:
            return 1
        else:
            return 0

    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = (1 - i % 2) * Delta(i, j+1) + (i % 2) * Delta(i, j-1)

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref(InteractionMatrix, InteractionMatrix_h, order="C")
    print InteractionMatrix

    #a = numpy.random.randn(400).astype(numpy.uint8)
    #b = numpy.random.randn(400).astype(numpy.uint8)
    dest = numpy.arange(256).astype(numpy.uint8)
    for i in range(0, 256/8):
        dest[i*8 + 0] = 36
        dest[i*8 + 1] = 151
        dest[i*8 + 2] = 90
        dest[i*8 + 3] = 109
        dest[i*8 + 4] = 224
        dest[i*8 + 5] = 4
        dest[i*8 + 6] = 0
        dest[i*8 + 7] = 0

    dest_h = drv.mem_alloc(dest.nbytes)
    drv.memcpy_htod(dest_h, dest)
    print "before: "
    print dest

    curand = numpy.zeros(40*256).astype(numpy.uint8)
    curand_h = drv.mem_alloc(curand.nbytes)

    CurandKernel(curand_h, block=(32, 1, 1), grid=(1, 1))
    Kernel(dest_h, curand_h, block=(32, 1, 1), grid=(1, 1))

    drv.memcpy_dtoh(dest, dest_h)
    print "after: "
    print dest
def main():

    #FourPermutations set-up
    FourPermutations = numpy.array([[1, 2, 3, 4],
                                    [1, 2, 4, 3],
                                    [1, 3, 2, 4],
                                    [1, 3, 4, 2],
                                    [1, 4, 2, 3],
                                    [1, 4, 3, 2],
                                    [2, 1, 3, 4],
                                    [2, 1, 4, 3],
                                    [2, 3, 1, 4],
                                    [2, 3, 4, 1],
                                    [2, 4, 1, 3],
                                    [2, 4, 3, 1],
                                    [3, 2, 1, 4],
                                    [3, 2, 4, 1],
                                    [3, 1, 2, 4],
                                    [3, 1, 4, 2],
                                    [3, 4, 2, 1],
                                    [3, 4, 1, 2],
                                    [4, 2, 3, 1],
                                    [4, 2, 1, 3],
                                    [4, 3, 2, 1],
                                    [4, 3, 1, 2],
                                    [4, 1, 2, 3],
                                    [4, 1, 3, 2]]).astype(numpy.uint8)

    #Create dictionary argument for rendering
    RenderArgs = {"safe_memory_mapping": 1,
                  "aligned_byte_length_genome": 8,
                  "bit_length_edge_type": 3,
                  "curand_nr_threads_per_block": 256,
                  "nr_tile_types": 4,
                  "nr_edge_types": 8,
                  "warpsize": 32,
                  "fit_dim_thread_x": 1,
                  "fit_dim_thread_y": 1,
                  "fit_dim_block_x": 1,
                  "fit_dim_grid_x": 19,
                  "fit_dim_grid_y": 19,
                  "fit_nr_four_permutations": 24,
                  "fit_length_movelist": 244,
                  "fit_nr_redundancy_grid_depth": 2,
                  "fit_nr_redundancy_assemblies": 10,
                  "fit_tile_index_starting_tile": 0,
                  "glob_nr_tile_orientations": 4}

    # Set environment for template package Jinja2
    env = Environment(loader=PackageLoader('main', './'))
    # Load source code from file
    Source = env.get_template('./alpha.cu')  #Template( file(KernelFile).read() )
    # Render source code
    RenderedSource = Source.render(RenderArgs)

    # Save rendered source code to file
    f = open('./rendered.cu', 'w')
    f.write(RenderedSource)
    f.close()

    #Load source code into module
    KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20")
    Kernel = KernelSourceModule.get_function("TestAssemblyKernel")
    CurandKernel = KernelSourceModule.get_function("CurandInitKernel")

    #Initialise InteractionMatrix
    InteractionMatrix = numpy.zeros((8, 8)).astype(numpy.float32)

    def Delta(a, b):
        if a == b:
            return 1
        else:
            return 0

    for i in range(InteractionMatrix.shape[0]):
        for j in range(InteractionMatrix.shape[1]):
            InteractionMatrix[i][j] = (1 - i % 2) * Delta(i, j+1) + (i % 2) * Delta(i, j-1)

    #Set up our InteractionMatrix
    InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix")
    drv.matrix_to_texref(InteractionMatrix, InteractionMatrix_h, order="C")
    print InteractionMatrix

    #Set-up genomes
    dest = numpy.arange(256).astype(numpy.uint8)
    for i in range(0, 256/8):
        #dest[i*8 + 0] = int('0b00100101',2)  #CRASHES
        #dest[i*8 + 1] = int('0b00010000',2)  #CRASHES
        #dest[i*8 + 0] = int('0b00101000',2)
        #dest[i*8 + 1] = int('0b00000000',2)
        #dest[i*8 + 2] = int('0b00000000',2)
        #dest[i*8 + 3] = int('0b00000000',2)
        #dest[i*8 + 4] = int('0b00000000',2)
        #dest[i*8 + 5] = int('0b00000000',2)
        #dest[i*8 + 6] = int('0b00000000',2)
        #dest[i*8 + 7] = int('0b00000000',2)
        dest[i*8 + 0] = 36
        dest[i*8 + 1] = 151
        dest[i*8 + 2] = 90
        dest[i*8 + 3] = 109
        dest[i*8 + 4] = 224
        dest[i*8 + 5] = 4
        dest[i*8 + 6] = 0
        dest[i*8 + 7] = 0

    dest[0] = 40
    dest[1] = 0
    dest[2] = 0
    dest[3] = 0
    dest[4] = 0
    dest[5] = 0
    dest[6] = 0
    dest[7] = 0

    dest_h = drv.mem_alloc(dest.nbytes)
    drv.memcpy_htod(dest_h, dest)
    print "Genomes before: "
    print dest

    #Set-up grids
    grids = numpy.zeros((32, 19, 19)).astype(numpy.uint8)
    grids_h = drv.mem_alloc(grids.nbytes)
    drv.memcpy_htod(grids_h, grids)
    print "Grids:"
    print grids

    #Set-up fitness values
    fitness = numpy.zeros(256).astype(numpy.float32)
    fitness_h = drv.mem_alloc(fitness.nbytes)
    drv.memcpy_htod(fitness_h, fitness)
    print "Fitness values:"
    print fitness

    #Set-up curand
    curand = numpy.zeros(40*256).astype(numpy.uint8)
    curand_h = drv.mem_alloc(curand.nbytes)

    #Set-up four permutations
    FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations")  # Constant memory address
    drv.memcpy_htod(FourPermutations_h[0], FourPermutations)

    #Set-up timers
    #start = drv.Event()
    #stop = drv.Event()
    #start.record()

    #Call kernels
    CurandKernel(curand_h, block=(32, 1, 1), grid=(1, 1))
    Kernel(dest_h, fitness_h, grids_h, curand_h, block=(32, 1, 1), grid=(1, 1))
    #drv.Context.synchronize()

    #Clean-up timer
    #stop.synchronize()
    #print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3)
    #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations)
    #pass

    #Output
    drv.memcpy_dtoh(dest, dest_h)
    print "Genomes after: "
    print dest
    drv.memcpy_dtoh(fitness, fitness_h)
    print "Fitness after: "
    print fitness
    drv.memcpy_dtoh(grids, grids_h)
    print "Grids[0] after: "
    print grids[0]
    print "Grids[31] after:"
    print grids[31]
grid_size = (int(np.ceil(N / block_size[0])),
             int(np.ceil(M / block_size[1])))

start_cpu = time.time()
result = filt_cpu(image, sigma_r, sigma_d)
end_cpu = time.time()

result_gpu = np.zeros((N, M), dtype=np.uint32)
filt_gpu = mod.get_function("filt_gpu")

start_gpu = time.time()
tex = mod.get_texref("tex")
tex.set_filter_mode(drv.filter_mode.LINEAR)
tex.set_address_mode(0, drv.address_mode.MIRROR)
tex.set_address_mode(1, drv.address_mode.MIRROR)
drv.matrix_to_texref(image.astype(np.uint32), tex, order="C")
filt_gpu(drv.Out(result_gpu), np.int32(N), np.int32(M),
         np.float32(sigma_d), np.float32(sigma_r),
         block=block_size, grid=grid_size, texrefs=[tex])
drv.Context.synchronize()
end_gpu = time.time()

cv2.imwrite('res_gpu.bmp', result_gpu.astype(np.uint8))
cv2.imwrite('res_cpu.bmp', result)
print('CPU time {}'.format(end_cpu - start_cpu))
x_out = np.array([i for i in range(M2)] * N2)
y_out = np.array([i for i in range(N2)] * M2)

start = driver.Event()
stop = driver.Event()

# prepare the texture
print("Computing on the GPU...")
start.record()
prep_image = prepare_image(image)
tex = mod.get_texref("tex")
tex.set_filter_mode(driver.filter_mode.LINEAR)
tex.set_address_mode(0, driver.address_mode.CLAMP)
tex.set_address_mode(1, driver.address_mode.CLAMP)
driver.matrix_to_texref(prep_image, tex, order="C")
bilinear_interpolation_kernel(driver.Out(result), driver.In(x_out), driver.In(y_out),
                              np.int32(M1), np.int32(N1), np.int32(M2), np.int32(N2),
                              block=block, grid=grid, texrefs=[tex])
big_image = normalize_image(result, image.shape[2])
stop.record()
stop.synchronize()
gpu_time = stop.time_since(start)
def alm2lenmap_onGPU(lib_alm, unlalm, dx_gu, dy_gu, do_not_prefilter=False): """ Lens the input unl_CMB map on the GPU using the pyCUDA interface. dx dy displacement in grid units. (f.get_dx_ingridunits() e.g.) Can be path to arrays or arrays or memmap. Will probably crash for too large maps, with need to split the job. Works for 4096 x 4096 at least on my laptop. Cost dominated by texture setup. # FIXME : try get rid of texture Note that the first call might be substantially slower than subsequent calls, as it caches the fft and ifft plans for subsequent usage. :param unl_CMB: :param func: bicubic or bilinear :param normalized_tex: use a modified version of the GPU bicubic spline to account for periodicity of the map :return: """ if timed: ti = time.time() shape = lib_alm.ell_mat.shape rshape = (shape[0], shape[1] / 2 + 1) assert shape[0] == shape[1], shape assert IsPowerOfTwo(shape[0]), shape assert load_map(dx_gu).shape == shape, (load_map(dx_gu).shape, lib_alm.ell_mat.shape) assert load_map(dy_gu).shape == shape, (load_map(dy_gu).shape, lib_alm.ell_mat.shape) assert np.all(np.array(shape) % GPU_block[0] == 0), shape if shape[0] > 4096: print "--- Exercise caution, array shapes larger than 4096 have never been tested so far ---" GPU_grid = (shape[0] / GPU_block[0], shape[1] / GPU_block[1], 1) # Prefiltering forces the interpolant to pass through the samples and increase accuracy, but dominates the cost. rfft2_unlCMB_gpu = gpuarray.to_gpu( lib_alm.alm2rfft(unlalm / np.prod(shape)).astype(np.complex64)) coeffs_gpu = gpuarray.empty(lib_alm.ell_mat.shape, dtype=np.float32) plan, plan_inv = get_rfft_plans(shape) if not do_not_prefilter: # The prefilter makes sure the spline is exact at the nodes. # Uncomments this to put coeffs_gpu on pitched memory to allow later for 2D texture binding : # alloc,pitch = cuda.mem_alloc_pitch(shape[0] * 4,shape[1],4) # 4 bytes per float32 wx = (6. / (2. * np.cos( 2. * np.pi * Freq(np.arange(shape[0]), shape[0]) / shape[0]) + 4.)) wx_gpu = gpuarray.to_gpu(wx.astype(np.float32)) prefilter = CUDA_module.get_function("cf_outer_w") prefilter(rfft2_unlCMB_gpu, wx_gpu, np.int32(rshape[1]), np.int32(rshape[0]), block=GPU_block, grid=GPU_grid) del wx_gpu ifft(rfft2_unlCMB_gpu, coeffs_gpu, plan_inv, False) # Binding arrays to textures and getting lensing func. if texture_count == 0: lens_func = CUDA_module.get_function("bicubiclensKernel_notex") tex_refs = [] dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32)) dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32)) elif texture_count == 1: unl_CMB_tex = CUDA_module.get_texref("unl_CMB") tex_refs = [unl_CMB_tex] unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C")) del coeffs_gpu dx_gu = gpuarray.to_gpu(load_map(dx_gu).astype(np.float32)) dy_gu = gpuarray.to_gpu(load_map(dy_gu).astype(np.float32)) lens_func = CUDA_module.get_function( "bicubiclensKernel_normtex_singletex") elif texture_count == 3: unl_CMB_tex = CUDA_module.get_texref("unl_CMB") dx_tex = CUDA_module.get_texref("tex_dx") dy_tex = CUDA_module.get_texref("tex_dy") tex_refs = ([unl_CMB_tex, dx_tex, dy_tex]) unl_CMB_tex.set_array(cuda.gpuarray_to_array(coeffs_gpu, "C")) del coeffs_gpu cuda.matrix_to_texref(load_map(dx_gu).astype(np.float32), dx_tex, order="C") cuda.matrix_to_texref(load_map(dy_gu).astype(np.float32), dy_tex, order="C") lens_func = CUDA_module.get_function("bicubiclensKernel_normtex") else: tex_refs = [] lens_func = 0 assert 0 # Wraping, important for periodic boundary conditions. 
# Note that WRAP has no effect for unnormalized texture coordinates. for tex_ref in tex_refs: tex_ref.set_address_mode(0, cuda.address_mode.WRAP) tex_ref.set_address_mode(1, cuda.address_mode.WRAP) tex_ref.set_filter_mode(cuda.filter_mode.POINT) tex_ref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) if timed: t0 = time.time() len_CMB = np.empty(shape, dtype=np.float32) if texture_count == 0: lens_func(cuda.Out(len_CMB), coeffs_gpu, dx_gu, dy_gu, np.int32(shape[0]), block=GPU_block, grid=GPU_grid, texrefs=tex_refs) elif texture_count == 1: lens_func(cuda.Out(len_CMB), dx_gu, dy_gu, np.int32(shape[0]), block=GPU_block, grid=GPU_grid, texrefs=tex_refs) elif texture_count == 3: lens_func(cuda.Out(len_CMB), np.int32(shape[0]), block=GPU_block, grid=GPU_grid, texrefs=tex_refs) if timed: dt = time.time() - t0 t_tot = time.time() - ti print " GPU bicubic spline and transfer at %s Mpixel / sec, time %s sec" % ( np.prod(lib_alm.ell_mat.shape) / 1e6 / dt, dt) print " Total ex. time at %s Mpixel / sec, ex. time %s sec." % ( np.prod(shape) / 1e6 / t_tot, t_tot) return len_CMB.astype(np.float64)
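# The TRSF_NORMALIZED_COORDINATES flag above is what makes the WRAP address mode (and
# hence the periodic boundary handling) take effect: tex2D then expects coordinates in
# [0, 1). A reduced, stand-alone sketch of that configuration with an illustrative
# kernel ('sample'), not the lensing kernels themselves:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float, 2, cudaReadModeElementType> tex_wrap;

__global__ void sample(float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    if (i >= n || j >= n) return;
    // coordinates outside [0, 1) wrap around because of the WRAP address mode
    float u = (i + 0.5f) / n + 1.0f;   // +1.0f: one full period away
    float v = (j + 0.5f) / n;
    out[j * n + i] = tex2D(tex_wrap, u, v);
}
""")

n = 16
a = np.random.rand(n, n).astype(np.float32)
tex = mod.get_texref("tex_wrap")
cuda.matrix_to_texref(a, tex, order="C")
tex.set_address_mode(0, cuda.address_mode.WRAP)
tex.set_address_mode(1, cuda.address_mode.WRAP)
tex.set_filter_mode(cuda.filter_mode.POINT)
tex.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)

out = np.zeros((n, n), dtype=np.float32)
mod.get_function("sample")(cuda.Out(out), np.int32(n),
                           block=(16, 16, 1), grid=(1, 1), texrefs=[tex])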
IMG = 'input.bmp' img = cv2.imread(IMG, cv2.IMREAD_GRAYSCALE) M, N = img.shape sigma_d = 100 sigma_r = 10 block = (16, 16, 1) grid = (int(np.ceil(M / block[0])), int(np.ceil(N / block[1]))) start = driver.Event() stop = driver.Event() start.record() tex = mod.get_texref("tex") tex.set_filter_mode(driver.filter_mode.LINEAR) tex.set_address_mode(1, driver.address_mode.MIRROR) driver.matrix_to_texref(img.astype("int32"), tex, order="C") gpu_result = np.zeros((M, N), dtype=np.uint32) bilateral_interpolation(driver.Out(gpu_result), np.int32(M), np.int32(N), np.float32(sigma_d), np.float32(sigma_r), block=block, grid=grid, texrefs=[tex]) stop.record() stop.synchronize() gpu_time = stop.time_since(start) print(gpu_time) cv2.imwrite("output.bmp", gpu_result.astype("int8"))
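# A caveat on the write-back above: astype("int8") can wrap any value above 127 to a
# negative number before cv2.imwrite sees it. A minimal, general-purpose sketch of a
# safer conversion (a separate helper for illustration, not a change to the code above):
import numpy as np

def to_uint8(img):
    # clip into the displayable 0..255 range, then convert without wrap-around
    return np.clip(img, 0, 255).astype(np.uint8)

# e.g. cv2.imwrite("output.bmp", to_uint8(gpu_result))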
def interpolate_image_by_cuda(image: np.ndarray, scale: int = 2): import pycuda.autoinit cu_module = SourceModule(open("kernel.cu", "r").read()) interpolate = cu_module.get_function("interpolate") start = time.time() print('Getting color channels..') uint32_image = np.zeros((image.shape[0], image.shape[1]), dtype=np.uint32) for x in range(uint32_image.shape[0]): for y in range(uint32_image.shape[1]): for ch in range(image.shape[2]): uint32_image[x, y] += image[x, y, ch] << (8 * (image.shape[2] - ch - 1)) print('Copying texture..') cu_tex = cu_module.get_texref("tex") cu_tex.set_filter_mode(cuda.filter_mode.POINT) cu_tex.set_address_mode(0, cuda.address_mode.CLAMP) cu_tex.set_address_mode(1, cuda.address_mode.CLAMP) cuda.matrix_to_texref(uint32_image, cu_tex, order="C") print('Getting image enlarged shape..') enlarged_image_shape = get_image_enlarged_shape(image, scale) result = np.zeros((enlarged_image_shape[0], enlarged_image_shape[1]), dtype=np.uint32) block = (16, 16, 1) grid = (int(np.ceil(enlarged_image_shape[0] / block[0])), int(np.ceil(enlarged_image_shape[1] / block[1]))) print('Interpolating..') interpolate(cuda.Out(result), np.int32(image.shape[1]), np.int32(image.shape[0]), np.int32(enlarged_image_shape[1]), np.int32(enlarged_image_shape[0]), np.int32(image.shape[2]), block=block, grid=grid, texrefs=[cu_tex]) print('Combining channels into color points..') rgba_image = np.zeros( (enlarged_image_shape[0], enlarged_image_shape[1], image.shape[2]), dtype=np.uint32) for x in range(rgba_image.shape[0]): for y in range(rgba_image.shape[1]): output_x_y = result[x, y] for ch in range(rgba_image.shape[2]): rgba_image[x, y, rgba_image.shape[2] - ch - 1] = output_x_y % 256 output_x_y >>= 8 print('Clearing temporaries..') del result del uint32_image print( f'interpolate_image_by_cuda - calculation time: {time.time() - start:.5f} s' ) return rgba_image
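# The per-pixel Python loops above (packing channels into a uint32 and unpacking the
# result) dominate the runtime for large images. A sketch of the same packing done with
# vectorised NumPy shifts, under the same layout the loops assume (channel 0 in the most
# significant byte); pack_channels and unpack_channels are illustrative helpers:
import numpy as np

def pack_channels(image):
    h, w, c = image.shape
    packed = np.zeros((h, w), dtype=np.uint32)
    for ch in range(c):                               # loop over channels only, not pixels
        shift = np.uint32(8 * (c - ch - 1))
        packed |= image[:, :, ch].astype(np.uint32) << shift
    return packed

def unpack_channels(packed, c):
    h, w = packed.shape
    image = np.zeros((h, w, c), dtype=np.uint32)
    for ch in range(c):
        shift = np.uint32(8 * (c - ch - 1))
        image[:, :, ch] = (packed >> shift) & np.uint32(0xFF)
    return image

rgb = (np.random.rand(4, 4, 3) * 255).astype(np.uint32)
assert np.array_equal(unpack_channels(pack_channels(rgb), 3), rgb)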
def trikmeans_gpu(data, clusters, iterations, return_times = 0): """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels) K-means using triangle inequality algorithm and PyCuda Input arguments are the data, intial cluster values, and number of iterations to repeat. The shape of data is (nDim, nPts) where nDim = # of dimensions in the data and nPts = number of data points. The shape of clusters is (nDim, nClusters) The return values are the updated clusters and labels for the data """ #--------------------------------------------------------------- # get problem parameters #--------------------------------------------------------------- (nDim, nPts) = data.shape nClusters = clusters.shape[1] #--------------------------------------------------------------- # set calculation control variables #--------------------------------------------------------------- useTextureForData = 0 usePageLockedMemory = 0 if(nPts > 32768): useTextureForData = 0 # block and grid sizes for the ccdist kernel (also for hdclosest) blocksize_ccdist = min(512, 16*(1+(nClusters-1)/16)) gridsize_ccdist = 1 + (nClusters-1)/blocksize_ccdist #block and grid sizes for the init module threads_desired = 16*(1+(max(nPts, nDim*nClusters)-1)/16) #blocksize_init = min(512, threads_desired) blocksize_init = min(128, threads_desired) gridsize_init = 1 + (threads_desired - 1)/blocksize_init #block and grid sizes for the step3 module blocksize_step3 = blocksize_init if not useTextureForData: blocksize_step3 = min(256, blocksize_step3) gridsize_step3 = gridsize_init #block and grid sizes for the step4 module # Each block of threads will handle seqcount times the data # eg blocksize of 512 and seqcount of 4, each block reduces 4*512 = 2048 elements blocksize_step4 = 2 while(blocksize_step4 < min(512, nPts)): blocksize_step4 *= 2 maxblocks = 512 seqcount_step4 = 1 + (nPts-1)/(blocksize_step4*maxblocks) gridsize_step4 = 1 + (nPts-1)/(seqcount_step4*blocksize_step4) blocksize_step4part2 = 1 while(blocksize_step4part2 < gridsize_step4): blocksize_step4part2 *= 2 #block and grid sizes for the calc_movement module for blocksize_calcm in range(32, 512, 32): if blocksize_calcm >= nClusters: break; gridsize_calcm = 1 + (nClusters-1)/blocksize_calcm #block and grid sizes for the step56 module blocksize_step56 = blocksize_init gridsize_step56 = gridsize_init #--------------------------------------------------------------- # prepare source modules #--------------------------------------------------------------- t1 = time.time() mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters, blocksize_step4, seqcount_step4, gridsize_step4, blocksize_step4part2, useTextureForData) ccdist = mod_ccdist.get_function("ccdist") calc_hdclosest = mod_ccdist.get_function("calc_hdclosest") init = mod_ccdist.get_function("init") step3 = mod_ccdist.get_function("step3") step4 = mod_ccdist.get_function("step4") step4part2 = mod_ccdist.get_function("step4part2") calc_movement = mod_ccdist.get_function("calc_movement") step56 = mod_ccdist.get_function("step56") pycuda.autoinit.context.synchronize() t2 = time.time() module_time = t2-t1 #--------------------------------------------------------------- # setup data on GPU #--------------------------------------------------------------- t1 = time.time() data = np.array(data).astype(np.float32) clusters = np.array(clusters).astype(np.float32) if useTextureForData: # copy the data to the texture texrefData = mod_ccdist.get_texref("texData") cuda.matrix_to_texref(data, texrefData, order="F") else: if 
usePageLockedMemory: data_pl = cuda.pagelocked_empty_like(data) data_pl[:,:] = data; gpu_data = gpuarray.to_gpu(data_pl) else: gpu_data = gpuarray.to_gpu(data) if usePageLockedMemory: clusters_pl = cuda.pagelocked_empty_like(clusters) clusters_pl[:,:] = clusters gpu_clusters = gpuarray.to_gpu(clusters_pl) else: gpu_clusters = gpuarray.to_gpu(clusters) gpu_assignments = gpuarray.zeros((nPts,), np.int32) # cluster assignment gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32) # lower bounds on distance between # point and each cluster gpu_upper = gpuarray.zeros((nPts,), np.float32) # upper bounds on distance between # point and any cluster gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32) # cluster-cluster distances gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32) # half distance to closest gpu_hdClosest.fill(1.0e10) # set to large value // **TODO** get the acutal float max gpu_badUpper = gpuarray.zeros((nPts,), np.int32) # flag to indicate upper bound needs recalc gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32); gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32); gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32) gpu_cluster_changed.fill(1) gpu_reduction_out = gpuarray.zeros((nDim, nClusters*gridsize_step4), np.float32) gpu_reduction_counts = gpuarray.zeros((nClusters*gridsize_step4,), np.int32) pycuda.autoinit.context.synchronize() t2 = time.time() data_time = t2-t1 #--------------------------------------------------------------- # do calculations #--------------------------------------------------------------- ccdist_time = 0. hdclosest_time = 0. init_time = 0. step3_time = 0. step4_time = 0. step56_time = 0. t1 = time.time() ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() ccdist_time += t2-t1 t1 = time.time() calc_hdclosest(gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() hdclosest_time += t2-t1 t1 = time.time() if useTextureForData: init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, block = (blocksize_init, 1, 1), grid = (gridsize_init, 1), texrefs=[texrefData]) else: init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, block = (blocksize_init, 1, 1), grid = (gridsize_init, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() init_time += t2-t1 for i in range(iterations): if i>0: t1 = time.time() ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() ccdist_time += t2-t1 t1 = time.time() calc_hdclosest(gpu_ccdist, gpu_hdClosest, block = (blocksize_ccdist, 1, 1), grid = (gridsize_ccdist, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() hdclosest_time += t2-t1 t1 = time.time() if i > 0: gpu_cluster_changed.fill(0) if useTextureForData: step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed, block = (blocksize_step3, 1, 1), grid = (gridsize_step3, 1), texrefs=[texrefData]) else: step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed, block = (blocksize_step3, 1, 1), grid = (gridsize_step3, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step3_time += t2-t1 t1 = 
time.time() if useTextureForData: step4(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block = (blocksize_step4, 1, 1), grid = (gridsize_step4, nDim), texrefs=[texrefData]) else: step4(gpu_data, gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_assignments, block = (blocksize_step4, 1, 1), grid = (gridsize_step4, nDim)) step4part2(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts, gpu_clusters2, gpu_clusters, block = (blocksize_step4part2, 1, 1), grid = (1, nDim)) calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement, gpu_cluster_changed, block = (blocksize_calcm, 1, 1), grid = (gridsize_calcm, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step4_time += t2-t1 t1 = time.time() if useTextureForData: step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement, gpu_badUpper, block = (blocksize_step56, 1, 1), grid = (gridsize_step56, 1), texrefs=[texrefData]) else: step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement, gpu_badUpper, block = (blocksize_step56, 1, 1), grid = (gridsize_step56, 1)) pycuda.autoinit.context.synchronize() t2 = time.time() step56_time += t2-t1 # prepare for next iteration temp = gpu_clusters gpu_clusters = gpu_clusters2 gpu_clusters2 = temp if return_times: return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \ gpu_clusters.get(), gpu_cluster_movement, \ data_time, module_time, init_time, \ ccdist_time/iterations, hdclosest_time/iterations, \ step3_time/iterations, step4_time/iterations, step56_time/iterations else: return gpu_clusters.get(), gpu_assignments.get()
def main(): #FourPermutations set-up FourPermutations = numpy.array([ [1,2,3,4], [1,2,4,3], [1,3,2,4], [1,3,4,2], [1,4,2,3], [1,4,3,2], [2,1,3,4], [2,1,4,3], [2,3,1,4], [2,3,4,1], [2,4,1,3], [2,4,3,1], [3,2,1,4], [3,2,4,1], [3,1,2,4], [3,1,4,2], [3,4,2,1], [3,4,1,2], [4,2,3,1], [4,2,1,3], [4,3,2,1], [4,3,1,2], [4,1,2,3], [4,1,3,2],]).astype(numpy.uint8) BankSize = 8 # Do not go beyond 8! #Define constants DimGridX = 19 DimGridY = 19 #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" BlockDimX = 100 BlockDimY = 100 SearchSpaceSize = BlockDimX * BlockDimY * 32 #BlockDimX = 600 #BlockDimY = 600 FitnessValDim = SearchSpaceSize GenomeDim = SearchSpaceSize #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":4, "bit_length_edge_type":3, "curand_nr_threads_per_block":32, "nr_tile_types":2, "nr_edge_types":8, "warpsize":32, "fit_dim_thread_x":32*BankSize, "fit_dim_thread_y":1, "fit_dim_block_x":BlockDimX, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4, "banksize":BankSize, "curand_dim_block_x":BlockDimX } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("SearchSpaceKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes dest = numpy.arange(GenomeDim*4).astype(numpy.uint8) #for i in range(0, GenomeDim/4): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) # dest[i*4 + 0] = 40 # dest[i*4 + 1] = 0 # dest[i*4 + 2] = 0 # dest[i*4 + 3] = 0 dest_h = drv.mem_alloc(GenomeDim*4) #dest.nbytes) #drv.memcpy_htod(dest_h, dest) #print "Genomes before: " #print dest #Set-up grids #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up fitness values #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32) #fitness_h = 
drv.mem_alloc(fitness.nbytes) fitness_left = numpy.zeros(FitnessValDim).astype(numpy.float32) fitness_left_h = drv.mem_alloc(fitness_left.nbytes) fitness_bottom = numpy.zeros(FitnessValDim).astype(numpy.float32) fitness_bottom_h = drv.mem_alloc(fitness_bottom.nbytes) #drv.memcpy_htod(fitness_h, fitness) #print "Fitness values:" #print fitness #Set-up grids grids = numpy.zeros((10000*32, DimGridX, DimGridY)).astype(numpy.uint8) #TEST grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up curand #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8); #curand_h = drv.mem_alloc(curand.nbytes) curand_h = drv.mem_alloc(40*GenomeDim) #Set-up four permutations FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) #SearchSpace control #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" #Schedule kernel calls #MaxBlockDim = 100 OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*32) MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*32) BlockCounter=0 print "Will do that many kernels a ",BlockDimX,"x",BlockDimY,":", MaxBlockCycles for i in range(MaxBlockCycles): #Set-up timer start = drv.Event() stop = drv.Event() start.record() print "Start kernel:" #Call kernels CurandKernel(curand_h, block=(32,1,1), grid=(BlockDimX, BlockDimY)) print "Finished Curand kernel, starting main kernel..." Kernel(dest_h, grids_h, fitness_left_h, fitness_bottom_h, curand_h, block=(32*BankSize,1,1), grid=(BlockDimX,BlockDimY)) #End timer stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) #print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / NrGenerations) pass #Output #drv.memcpy_dtoh(dest, dest_h) #print "Genomes after: " #print dest[0:4] drv.memcpy_dtoh(fitness_left, fitness_left_h) print "FitnessLeft after: " print fitness_left#[1000000:1000500] drv.memcpy_dtoh(fitness_bottom, fitness_bottom_h) print "FitnessBottom after: " print fitness_bottom#[1000000:1000500] drv.memcpy_dtoh(grids, grids_h) print "Grids[0] after: " for i in range(0,5): print "Grid ",i,": " print grids[i]
def main(): #Initialise InteractionMatrix def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Initialise GPU (equivalent of autoinit) drv.init() assert drv.Device.count() >= 1 dev = drv.Device(0) ctx = dev.make_context(0) #Convert GlobalParams to List GlobalParams = np.zeros(len(GlobalParamsDict.values())).astype(np.float32) count = 0 for x in GlobalParamsDict.keys(): GlobalParams[count] = GlobalParamsDict[x] count += 1 #Convert FitnessParams to List FitnessParams = np.zeros(len(FitnessParamsDict.values())).astype(np.float32) count = 0 for x in FitnessParamsDict.keys(): FitnessParams[count] = FitnessParamsDict[x] count += 1 #Convert GAParams to List GAParams = np.zeros(len(GAParamsDict.values())).astype(np.float32) count = 0 for x in GAParamsDict.keys(): GAParams[count] = GAParamsDict[x] count += 1 # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main_discoverytime', './templates')) # Load source code from file Source = env.get_template('./kernel.cu') #Template( file(KernelFile).read() ) #Create dictionary argument for rendering RenderArgs= {"params_size":GlobalParams.nbytes,\ "fitnessparams_size":FitnessParams.nbytes,\ "gaparams_size":GAParams.nbytes,\ "genome_bytelength":int(ByteLengthGenome),\ "genome_bitlength":int(BitLengthGenome),\ "ga_nr_threadsperblock":GA_NrThreadsPerBlock,\ "textures":range( 0, NrFitnessFunctionGrids ),\ "curandinit_nr_threadsperblock":CurandInit_NrThreadsPerBlock,\ "with_mixed_crossover":WithMixedCrossover, "with_bank_conflict":WithBankConflict, "with_naive_roulette_wheel_selection":WithNaiveRouletteWheelSelection, "with_assume_normalized_fitness_function_values":WithAssumeNormalizedFitnessFunctionValues, "with_uniform_crossover":WithUniformCrossover, "with_single_point_crossover":WithSinglePointCrossover, "with_surefire_mutation":WithSurefireMutation, "with_storeassembledgridsinglobalmemory":WithStoreAssembledGridsInGlobalMemory, "ga_threaddimx":int(GA_ThreadDim), "glob_nr_tiletypes":int(NrTileTypes), "glob_nr_edgetypes":int(NrEdgeTypes), "glob_nr_tileorientations":int(NrTileOrientations), "fit_dimgridx":int(DimGridX), "fit_dimgridy":int(DimGridY), "fit_nr_fitnessfunctiongrids":int(NrFitnessFunctionGrids), "fit_nr_fourpermutations":int(NrFourPermutations), "fit_assembly_redundancy":int(NrAssemblyRedundancy), "fit_nr_threadsperblock":int(Fit_NrThreadsPerBlock), "sort_threaddimx":int(Sort_ThreadDimX), "glob_nr_genomes":int(NrGenomes), "fit_dimthreadx":int(ThreadDimX), "fit_dimthready":int(ThreadDimY), "fit_dimsubgridx":int(SubgridDimX), "fit_dimsubgridy":int(SubgridDimY), "fit_nr_subgridsperbank":int(NrSubgridsPerBank), "glob_bitlength_edgetype":int(EdgeTypeBitLength), "fitness_break_value":int(BitLengthGenome), # ADAPTED FOR DISCOVERY KERNEL "fitness_flag_index":int(NrGenomes) } # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, no_extern_c=True, arch="compute_20", code="sm_20", cache_dir=None) #Allocate values on GPU Genomes_h = drv.mem_alloc(Genomes.nbytes) FitnessPartialSums_h = drv.mem_alloc(FitnessPartialSums.nbytes) FitnessValues_h = drv.mem_alloc(FitnessValues.nbytes) AssembledGrids_h = drv.mem_alloc(AssembledGrids.nbytes) Mutexe_h = 
drv.mem_alloc(Mutexe.nbytes) #ReductionList_h = drv.mem_alloc(ReductionList.nbytes) #Copy values to global memory drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessPartialSums_h, FitnessPartialSums) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #Copy values to constant / texture memory for id in range(0, NrFitnessFunctionGrids): FitnessFunctionGrids_h.append( KernelSourceModule.get_texref("t_ucFitnessFunctionGrids%d"%(id)) ) drv.matrix_to_texref( FitnessFunctionGrids[id], FitnessFunctionGrids_h[id] , order="C") InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") GlobalParams_h = KernelSourceModule.get_global("c_fParams") # Constant memory address drv.memcpy_htod(GlobalParams_h[0], GlobalParams) FitnessParams_h = KernelSourceModule.get_global("c_fFitnessParams") # Constant memory address drv.memcpy_htod(FitnessParams_h[0], FitnessParams) GAParams_h = KernelSourceModule.get_global("c_fGAParams") # Constant memory address drv.memcpy_htod(GAParams_h[0], GAParams) FourPermutations_h = KernelSourceModule.get_global("c_ucFourPermutations") # Constant memory address drv.memcpy_htod(FourPermutations_h[0], FourPermutations) FitnessSumConst_h = KernelSourceModule.get_global("c_fFitnessSumConst") FitnessListConst_h = KernelSourceModule.get_global("c_fFitnessListConst") #Set up curandStates curandState_bytesize = 40 # This might be incorrect, depending on your compiler (info from Tomasz Rybak's pyCUDA cuRAND wrapper) CurandStates_h = drv.mem_alloc(curandState_bytesize * NrGenomes) #Compile kernels curandinit_fnc = KernelSourceModule.get_function("CurandInitKernel") #fitness_fnc = KernelSourceModule.get_function("FitnessKernel") sorting_fnc = KernelSourceModule.get_function("SortingKernel") ga_fnc = KernelSourceModule.get_function("GAKernel") #Initialise Curand curandinit_fnc(CurandStates_h, block=(int(CurandInit_NrThreadsPerBlock), 1, 1), grid=(int(CurandInit_NrBlocks), 1)) #Build parameter lists for FitnessKernel and GAKernel FitnessKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h, Mutexe_h); SortingKernelParams = (FitnessValues_h, FitnessPartialSums_h) GAKernelParams = (Genomes_h, FitnessValues_h, AssembledGrids_h, CurandStates_h); #TEST ONLY #return #ADAPTED #TEST ONLY #START ADAPTED print "GENOMES NOW:\n" print Genomes print ":::STARTING KERNEL EXECUTION:::" #STOP ADAPTED #Discovery time parameters min_fitness_value = BitLengthGenome # Want all bits set mutation_rate = -2.0 #normally: -2 #Define Numpy construct to sideways join arrays (glue columns together) #Taken from: http://stackoverflow.com/questions/5355744/numpy-joining-structured-arrays #def join_struct_arrays(arrays): # sizes = np.array([a.itemsize for a in arrays]) # offsets = np.r_[0, sizes.cumsum()] # n = len(arrays[0]) # joint = np.empty((n, offsets[-1]), dtype=np.int32) # for a, size, offset in zip(arrays, sizes, offsets): # joint[:,offset:offset+size] = a.view(np.int32).reshape(n,size) # dtype = sum((a.dtype.descr for a in arrays), []) # return joint.ravel().view(dtype) #Test join_struct_arrays: #a = np.array([[1, 2], [11, 22], [111, 222]]).astype(np.int32); #b = np.array([[3, 4], [33, 44], [333, 444]]).astype(np.int32); #c = np.array([[5, 6], [55, 66], [555, 666]]).astype(np.int32); #print "Test join_struct_arrays:" #print join_struct_arrays([a, b, c]) #FAILED #Set up PYTABLES #class GAGenome(IsDescription): 
#gen_id = Int32Col() #fitness_val = Float32Col() #genome = StringCol(mByteLengthGenome) #last_nr_mutations = Int32Col() # Contains the Nr of mutations genome underwent during this generation #mother_id = Int32Col() # Contains the crossover "mother" #father_id = Int32Col() # Contains the crossover "father" (empty if no crossing over) #assembledgrid = StringCol(DimGridX*DimGridY) # 16-character String #class GAGenerations(IsDescription): # nr_generations = Int32Col() # nr_genomes = Int32Col() # mutation_rate = Float32Col() # Contains the Nr of mutations genome underwent during this generation #from datetime import datetime #filename = "fujiama_"+str(NrGenomes)+"_"+str(RateMutation)+"_"+".h5" #print filename #h5file = openFile(filename, mode = "w", title = "GA FILE") #group = h5file.createGroup("/", 'fujiama_ga', 'Fujiama Genetic Algorithm output') #table = h5file.createTable(group, 'GaGenerations', GAGenerations, "Raw data") #atom = Atom.from_dtype(np.float32) #Initialise File I/O FILE = open("fujiamakernel_nrgen-" + str(NrGenomes) + "_adaptation.plot", "w") #ADAPTED FOR TESTING HISTOGRAM #TestValues = [13,24,26,31,32,14] #print np.histogram(TestValues, bins=[0, 24, 32])[0][1] #quit() #Initialise CUDA timers start = drv.Event() stop = drv.Event() while mutation_rate < 1: # normally: 1 #ds = h5file.createArray(f.root, 'ga_raw_'+str(mutation_rate), atom, x.shape) mutation_rate += 0.1 GAParams[0] = 10.0 ** mutation_rate drv.memcpy_htod(GAParams_h[0], GAParams) print "Mutation rate: ", GAParams[0] #ADAPTED: Initialise global memory (absolutely necessary!!) drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #execute kernels for specified number of generations start.record() biggest_fit = 0 reprange = 100 average_breakup = np.zeros((reprange)).astype(np.float32) for rep in range(0, reprange): breakup_generation = GlobalParamsDict["NrGenerations"] dontcount = 0 #ADAPTED: Initialise global memory (absolutely necessary!!) drv.memcpy_htod(Genomes_h, Genomes) drv.memcpy_htod(FitnessValues_h, FitnessValues) drv.memcpy_htod(AssembledGrids_h, AssembledGrids) drv.memcpy_htod(Mutexe_h, Mutexe) #execute kernels for specified number of generations start.record() for gen in range(0, GlobalParamsDict["NrGenerations"]): #print "Processing Generation: %d"%(gen) #Launch CPU processing (should be asynchroneous calls) sorting_fnc(*(SortingKernelParams), block=sorting_blocks, grid=sorting_grids) #Launch Sorting Kernel drv.memcpy_dtoh(FitnessPartialSums, FitnessPartialSums_h) #Copy from Device to Host and finish sorting FitnessSumConst = FitnessPartialSums.sum() drv.memcpy_htod(FitnessSumConst_h[0], FitnessSumConst) #Copy from Host to Device constant memory #drv.memcpy_dtod(FitnessListConst_h[0], FitnessValues_h, FitnessValues.nbytes) #Copy FitnessValues from Device to Device Const #TEST ga_fnc(*(GAKernelParams), block=ga_blocks, grid=ga_grids) #TEST #Note: Fitness Function is here integrated into GA kernel! drv.memcpy_dtoh(Genomes_res, Genomes_h) #Copy data from GPU drv.memcpy_dtoh(FitnessValues_res, FitnessValues_h) #drv.memcpy_dtoh(AssembledGrids_res, AssembledGrids_h) #Takes about as much time as the whole simulation! 
#print FitnessValues_res #maxxie = FitnessValues_res.max() #if maxxie > biggest_fit: # biggest_fit = maxxie #print "max fitness:", maxxie #if maxxie >= 25.0 and breakup_generation == -1: if np.histogram(FitnessValues_res, (0, 24, 32))[0][1] >= NrGenomes/2: breakup_generation = gen break # else: # breakup_generation = -1 #if FitnessValues[NrGenomes] == float(1): # breakup_generation = i # break #else: # breakup_generation = -1 #maxxie = FitnessValues_res.max() #if maxxie >= 30: # print "Max fitness value: ", FitnessValues_res.max() #ds[:] = FitnessValues #join_struct_arrays(Genomes, FitnessValues, AssembledGrids); #trow = table.row #trow['nr_generations'] = NrGenerations #trow['nr_genomes'] = NrGenomes #trow['mutation_rate'] = mutation_rate #trow.append() #trow.flush() stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Mean time per generation: %fs"%(start.time_till(stop)*1e-3 / breakup_generation) print "Discovery time (generations) for mutation rate %f: %d"%(GAParams[0], breakup_generation) #print "Max:", biggest_fit #TEST MODE #print "Printing all FitnessValues now:" #print FitnessValues_res #raw_input("Next redundancy with keystroke!..."); #if breakup_generation==0: # print FitnessValues_res # print "Genomes: " # print Genomes_res average_breakup[rep] = breakup_generation / reprange #if breakup_generation == -1: # dontcount = 1 # break #if dontcount == 1: # average_breakup.fill(20000) FILE.write( str(GAParams[0]) + " " + str(np.median(average_breakup)) + " " + str(np.std(average_breakup)) + "\n"); FILE.flush() #Clean-up pytables #h5file.close() #Clean up File I/O FILE.close()
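# The stopping test above leans on np.histogram with explicit bin edges: with edges
# (0, 24, 32) the second count is the number of genomes whose fitness lies in [24, 32]
# (the last bin is closed), so the run stops once at least half the population reaches
# that band. A tiny illustration of that counting step with toy numbers:
import numpy as np

fitness = np.array([3.0, 12.0, 24.0, 27.5, 31.0, 32.0], dtype=np.float32)
counts, edges = np.histogram(fitness, bins=(0, 24, 32))
print(counts)                      # [2 4]: two genomes in [0, 24), four in [24, 32]
nr_genomes = 6
if counts[1] >= nr_genomes / 2:
    print("stop: at least half the population is in the top fitness band")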
def main(): #Set up global timer tot_time = time.time() #Define constants BankSize = 8 # Do not go beyond 8! WarpSize = 32 #Do not change... DimGridX = 19 DimGridY = 19 BlockDimX = 256 BlockDimY = 256 SearchSpaceSize = 2**24 #BlockDimX * BlockDimY * 32 FitnessValDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize GenomeDim = BlockDimX*BlockDimY*WarpSize #SearchSpaceSize AlignedByteLengthGenome = 4 #Create dictionary argument for rendering RenderArgs= {"safe_memory_mapping":1, "aligned_byte_length_genome":AlignedByteLengthGenome, "bit_length_edge_type":3, "curand_nr_threads_per_block":32, "nr_tile_types":2, "nr_edge_types":8, "warpsize":WarpSize, "fit_dim_thread_x":32*BankSize, "fit_dim_thread_y":1, "fit_dim_block_x":BlockDimX, "fit_dim_grid_x":19, "fit_dim_grid_y":19, "fit_nr_four_permutations":24, "fit_length_movelist":244, "fit_nr_redundancy_grid_depth":2, "fit_nr_redundancy_assemblies":10, "fit_tile_index_starting_tile":0, "glob_nr_tile_orientations":4, "banksize":BankSize, "curand_dim_block_x":BlockDimX } # Set environment for template package Jinja2 env = Environment(loader=PackageLoader('main', './')) # Load source code from file Source = env.get_template('./alpha.cu') #Template( file(KernelFile).read() ) # Render source code RenderedSource = Source.render( RenderArgs ) # Save rendered source code to file f = open('./rendered.cu', 'w') f.write(RenderedSource) f.close() #Load source code into module KernelSourceModule = SourceModule(RenderedSource, options=None, arch="compute_20", code="sm_20") Kernel = KernelSourceModule.get_function("SearchSpaceKernel") CurandKernel = KernelSourceModule.get_function("CurandInitKernel") #Initialise InteractionMatrix InteractionMatrix = numpy.zeros( ( 8, 8) ).astype(numpy.float32) def Delta(a,b): if a==b: return 1 else: return 0 for i in range(InteractionMatrix.shape[0]): for j in range(InteractionMatrix.shape[1]): InteractionMatrix[i][j] = ( 1 - i % 2 ) * Delta( i, j+1 ) + ( i % 2 ) * Delta( i, j-1 ) #Set up our InteractionMatrix InteractionMatrix_h = KernelSourceModule.get_texref("t_ucInteractionMatrix") drv.matrix_to_texref( InteractionMatrix, InteractionMatrix_h , order="C") print InteractionMatrix #Set-up genomes #dest = numpy.arange(GenomeDim*4).astype(numpy.uint8) #for i in range(0, GenomeDim/4): #dest[i*8 + 0] = int('0b00100101',2) #CRASHES #dest[i*8 + 1] = int('0b00010000',2) #CRASHES #dest[i*8 + 0] = int('0b00101000',2) #dest[i*8 + 1] = int('0b00000000',2) #dest[i*8 + 2] = int('0b00000000',2) #dest[i*8 + 3] = int('0b00000000',2) #dest[i*8 + 4] = int('0b00000000',2) #dest[i*8 + 5] = int('0b00000000',2) #dest[i*8 + 6] = int('0b00000000',2) #dest[i*8 + 7] = int('0b00000000',2) # dest[i*4 + 0] = 40 # dest[i*4 + 1] = 0 # dest[i*4 + 2] = 0 # dest[i*4 + 3] = 0 dest_h = drv.mem_alloc(GenomeDim*AlignedByteLengthGenome) #dest.nbytes) #drv.memcpy_htod(dest_h, dest) #print "Genomes before: " #print dest #Set-up grids #grids = numpy.zeros((10000, DimGridX, DimGridY)).astype(numpy.uint8) #TEST #grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up fitness values #fitness = numpy.zeros(FitnessValDim).astype(numpy.float32) #fitness_h = drv.mem_alloc(fitness.nbytes) #fitness_size = numpy.zeros(FitnessValDim).astype(numpy.uint32) fitness_size = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, "C", 0) fitness_size_h = drv.mem_alloc(fitness_size.nbytes) #fitness_hash = numpy.zeros(FitnessValDim).astype(numpy.uint32) fitness_hash = drv.pagelocked_zeros((FitnessValDim), numpy.uint32, 
"C", 0) fitness_hash_h = drv.mem_alloc(fitness_hash.nbytes) #drv.memcpy_htod(fitness_h, fitness) #print "Fitness values:" #print fitness #Set-up grids #grids = numpy.zeros((GenomeDim, DimGridX, DimGridY)).astype(numpy.uint8) #TEST grids = drv.pagelocked_zeros((GenomeDim, DimGridX, DimGridY), numpy.uint8, "C", 0) grids_h = drv.mem_alloc(GenomeDim*DimGridX*DimGridY) #TEST #drv.memcpy_htod(grids_h, grids) #print "Grids:" #print grids #Set-up curand #curand = numpy.zeros(40*GenomeDim).astype(numpy.uint8); #curand_h = drv.mem_alloc(curand.nbytes) curand_h = drv.mem_alloc(40*GenomeDim) #SearchSpace control #SearchSpaceSize = 2**24 #BlockDimY = SearchSpaceSize / (2**16) #BlockDimX = SearchSpaceSize / (BlockDimY) #print "SearchSpaceSize: ", SearchSpaceSize, " (", BlockDimX, ", ", BlockDimY,")" #Schedule kernel calls #MaxBlockDim = 100 OffsetBlocks = (SearchSpaceSize) % (BlockDimX*BlockDimY*WarpSize) MaxBlockCycles = (SearchSpaceSize - OffsetBlocks)/(BlockDimX*BlockDimY*WarpSize) BlockCounter = 0 print "Will do that many kernels a ", BlockDimX,"x", BlockDimY,"x ", WarpSize, ":", MaxBlockCycles #quit() #SET UP PROCESSING histo = {} #INITIALISATION CurandKernel(curand_h, block=(WarpSize,1,1), grid=(BlockDimX, BlockDimY)) print "Finished Curand kernel, starting main kernel..." #FIRST GENERATION proc_time = time.time() print "Starting first generation..." start = drv.Event() stop = drv.Event() start.record() Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64(0), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY)) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "Copying..." drv.memcpy_dtoh(fitness_size, fitness_size_h) drv.memcpy_dtoh(fitness_hash, fitness_hash_h) drv.memcpy_dtoh(grids, grids_h) #INTERMEDIATE GENERATION for i in range(MaxBlockCycles-1): print "Starting generation: ", i+1 start = drv.Event() stop = drv.Event() start.record() Kernel(dest_h, grids_h, fitness_size_h, fitness_hash_h, curand_h, numpy.int64((i+1)*BlockDimX*BlockDimY*WarpSize), block=(WarpSize*BankSize,1,1), grid=(BlockDimX,BlockDimY)) "Processing..." for j in range(grids.shape[0]): # if (fitness_hash[j]!=33) and (fitness_hash[j]!=44) and (fitness_hash[j]!=22): if fitness_hash[j] in histo: histo[fitness_hash[j]] = (histo[fitness_hash[j]][0], histo[fitness_hash[j]][1]+1) else: histo[fitness_hash[j]] = (grids[j], 1) stop.record() stop.synchronize() print "Total kernel time taken: %fs"%(start.time_till(stop)*1e-3) print "This corresponds to %f polyomino classification a second."%((BlockDimX*BlockDimY*WarpSize)/(start.time_till(stop)*1e-3)) print "Copying..." drv.memcpy_dtoh(fitness_size, fitness_size_h) drv.memcpy_dtoh(fitness_hash, fitness_hash_h) drv.memcpy_dtoh(grids, grids_h) #FINAL PROCESSING "Processing..." for i in range(grids.shape[0]): if fitness_hash[i] in histo: histo[fitness_hash[i]] = (histo[fitness_hash[i]][0], histo[fitness_hash[i]][1]+1) else: histo[fitness_hash[i]] = (grids[i], 1) print "Done!" #TIMING RESULTS print "Total time including set-up: ", (time.time() - tot_time) print "Total Processing time: ", (time.time() - proc_time) #OUTPUT print histo
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if ((x >= width) || (y >= height)) { return; } data[x * width + y] = tex2D(tex, x, y); } """) downsample_func = downsample.get_function("downsample") texref = downsample.get_texref("tex") cuda.matrix_to_texref(img, texref, order="C") # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # texref.set_filter_mode(cuda.filter_mode.LINEAR) gpu_output = np.zeros_like(img, dtype=np.int16) blocksize = (16,16,1) gridsize = (width / blocksize[0], height / blocksize[1]) print 'blocksize', blocksize print 'gridsize', gridsize downsample_func(cuda.Out(gpu_output), np.int32(width), np.int32(height), block=blocksize, grid = gridsize, texrefs=[texref]) gpu_output = gpu_output.transpose()
def __init__(self, img_path): super(LFapplication, self).__init__() # # Load image data # base_path = os.path.splitext(img_path)[0] lenslet_path = base_path + '-lenslet.txt' optics_path = base_path + '-optics.txt' with open(lenslet_path, 'r') as f: tmp = eval(f.readline()) x_offset, y_offset, right_dx, right_dy, down_dx, down_dy = \ np.array(tmp, dtype=np.float32) with open(optics_path, 'r') as f: for line in f: name, val = line.strip().split() try: setattr(self, name, np.float32(val)) except: pass max_angle = math.atan(self.pitch / 2 / self.flen) # # Prepare image # im_pil = Image.open(img_path) if im_pil.mode == 'RGB': self.NCHANNELS = 3 w, h = im_pil.size im = np.zeros((h, w, 4), dtype=np.float32) im[:, :, :3] = np.array(im_pil).astype(np.float32) self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx), 3) else: self.NCHANNELS = 1 im = np.array(im_pil.getdata()).reshape(im_pil.size[::-1]).astype( np.float32) h, w = im.shape self.LF_dim = (ceil(h / down_dy), ceil(w / right_dx)) x_start = x_offset - int(x_offset / right_dx) * right_dx y_start = y_offset - int(y_offset / down_dy) * down_dy x_ratio = self.flen * right_dx / self.pitch y_ratio = self.flen * down_dy / self.pitch # # Generate the cuda kernel # mod_LFview = pycuda.compiler.SourceModule( _kernel_tpl.render(newiw=self.LF_dim[1], newih=self.LF_dim[0], oldiw=w, oldih=h, x_start=x_start, y_start=y_start, x_ratio=x_ratio, y_ratio=y_ratio, x_step=right_dx, y_step=down_dy, NCHANNELS=self.NCHANNELS)) self.LFview_func = mod_LFview.get_function("LFview_kernel") self.texref = mod_LFview.get_texref("tex") # # Now generate the cuda texture # if self.NCHANNELS == 3: cuda.bind_array_to_texref( cuda.make_multichannel_2d_array(im, order="C"), self.texref) else: cuda.matrix_to_texref(im, self.texref, order="C") # # We could set the next if we wanted to address the image # in normalized coordinates ( 0 <= coordinate < 1.) # texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) # self.texref.set_filter_mode(cuda.filter_mode.LINEAR) # # Prepare the traits # self.add_trait('X_angle', Range(-max_angle, max_angle, 0.0)) self.add_trait('Y_angle', Range(-max_angle, max_angle, 0.0)) self.plotdata = ArrayPlotData(LF_img=self.sampleLF()) self.LF_img = Plot(self.plotdata) if self.NCHANNELS == 3: self.LF_img.img_plot("LF_img") else: self.LF_img.img_plot("LF_img", colormap=gray)
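# For the RGB branch above, the image is padded to four float channels and bound through
# make_multichannel_2d_array because matrix_to_texref only handles single-channel 2D
# matrices. A reduced sketch of that binding with a throwaway float4 kernel
# (first_channel) standing in for LFview_kernel:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<float4, 2, cudaReadModeElementType> tex_rgba;

__global__ void first_channel(float *out, int w, int h)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= w || y >= h) return;
    float4 px = tex2D(tex_rgba, x, y);
    out[y * w + x] = px.x;              // read back just the first channel
}
""")

h, w = 32, 64
im = np.zeros((h, w, 4), dtype=np.float32)
im[:, :, :3] = np.random.rand(h, w, 3).astype(np.float32)   # RGB, alpha left at zero

texref = mod.get_texref("tex_rgba")
cuda.bind_array_to_texref(cuda.make_multichannel_2d_array(im, order="C"), texref)

out = np.zeros((h, w), dtype=np.float32)
mod.get_function("first_channel")(cuda.Out(out), np.int32(w), np.int32(h),
                                  block=(16, 16, 1), grid=(w // 16, h // 16),
                                  texrefs=[texref])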
def run (self) : cuda.init() self.cuda_dev = cuda.Device(0) self.cuda_context = self.cuda_dev.make_context() # print 'Loading matrix...' # npz = np.load(self.MATRIX_FILE) # station_coords = npz['station_coords'] # grid_dim = npz['grid_dim'] # matrix = npz['matrix'] station_coords = self.station_coords grid_dim = self.grid_dim matrix = self.matrix nearby_stations = self.nearby_stations # EVERYTHING SHOULD BE IN FLOAT32 for ease of debugging. even times. # Matrix and others should be textures, arrays, or in constant memory, to do cacheing. # make matrix symmetric before converting to int32. this avoids halving the pseudo-infinity value. matrix = (matrix + matrix.T) / 2 #print np.where(matrix == np.inf) matrix[matrix == np.inf] = 99999999 # nan == nan is False because any operation involving nan is False ! # must use specific isnan function. however, inf works like a normal number. matrix[np.isnan(matrix)] = 99999999 #matrix[matrix >= 60 * 60 * 3] = 0 matrix = matrix.astype(np.int32) #matrix += 60 * 5 #matrix = np.nan_to_num(matrix) print matrix # force OD matrix symmetry for test # THIS was responsible for the coordinate drift!!! # need to symmetrize it before copy to device # matrix = (matrix + matrix.T) / 2 # print matrix #print np.any(matrix == np.nan) #print np.any(matrix == np.inf) station_coords_int = station_coords.round().astype(np.int32) # to be removed when textures are working station_coords_gpu = gpuarray.to_gpu(station_coords_int) matrix_gpu = gpuarray.to_gpu(matrix) max_x, max_y = grid_dim n_gridpoints = int(max_x * max_y) n_stations = len(station_coords) cuda_grid_shape = ( int( math.ceil( float(max_x)/CUDA_BLOCK_SHAPE[0] ) ), int( math.ceil( float(max_y)/CUDA_BLOCK_SHAPE[1] ) ) ) print "\n----PARAMETERS----" print "Input file: ", self.MATRIX_FILE print "Number of stations: ", n_stations print "OD matrix shape: ", matrix.shape print "Station coords shape: ", station_coords_int.shape print "Station cache size: ", self.N_NEARBY_STATIONS print "Map dimensions: ", grid_dim print "Number of map points: ", n_gridpoints print "Target space dimensionality: ", self.DIMENSIONS print "CUDA block dimensions: ", CUDA_BLOCK_SHAPE print "CUDA grid dimensions: ", cuda_grid_shape assert station_coords.shape == (n_stations, 2) assert self.N_NEARBY_STATIONS <= n_stations #sys.exit() # Make and register custom color map for pylab graphs cdict = {'red': ((0.0, 0.0, 0.0), (0.2, 0.0, 0.0), (0.4, 0.9, 0.9), (1.0, 0.0, 0.0)), 'green': ((0.0, 0.0, 0.1), (0.05, 0.9, 0.9), (0.1, 0.0, 0.0), (0.4, 0.9, 0.9), (0.6, 0.0, 0.0), (1.0, 0.0, 0.0)), 'blue': ((0.0, 0.0, 0.0), (0.05, 0.0, 0.0), (0.2, 0.9, 0.9), (0.3, 0.0, 0.0), (1.0, 0.0, 0.0))} mymap = LinearSegmentedColormap('mymap', cdict) mymap.set_over( (1.0, 0.0, 1.0) ) mymap.set_bad ( (0.0, 0.0, 0.0) ) #pl.plt.register_cmap(cmap=mymap) # set up arrays for calculations coords_gpu = gpuarray.to_gpu(np.random.random( (max_x, max_y, self.DIMENSIONS) ).astype(np.float32)) # initialize coordinates to random values in range 0...1 forces_gpu = gpuarray.zeros( (int(max_x), int(max_y), self.DIMENSIONS), dtype=np.float32 ) # 3D float32 accumulate forces over one timestep weights_gpu = gpuarray.zeros( (int(max_x), int(max_y)), dtype=np.float32 ) # 2D float32 cell error accumulation errors_gpu = gpuarray.zeros( (int(max_x), int(max_y)), dtype=np.float32 ) # 2D float32 cell error accumulation #near_stations_gpu = gpuarray.zeros( (int(max_x), int(max_y), self.N_NEARBY_STATIONS, 2), dtype=np.int32) # instead of using synthetic distances, use the network distance near 
stations lists. # rather than copying the array over to the GPU then binding to external texref, # could just use matrix_to_texref, but this function only seems to understand 2d arrays near_stations_gpu = gpuarray.to_gpu( nearby_stations ) debug_gpu = gpuarray.zeros( n_gridpoints, dtype = np.int32 ) debug_img_gpu = gpuarray.zeros_like( errors_gpu ) print "\n----COMPILATION----" # times could be merged into forces kernel, if done by pixel not station. # integrate kernel could be GPUArray operation; also helps clean up code by using GPUArrays. # DIM should be replaced by python script, so as not to define twice. replacements = [( 'N_NEAR_STATIONS', self.N_NEARBY_STATIONS ), ( 'N_STATIONS', n_stations ), ( 'DIM', self.DIMENSIONS)] src = preprocess_cu('unified_mds_stochastic.cu', replacements) #print src mod = SourceModule(src, options=["--ptxas-options=-v"]) stations_kernel = mod.get_function("stations" ) forces_kernel = mod.get_function("forces" ) integrate_kernel = mod.get_function("integrate") matrix_texref = mod.get_texref('tex_matrix') station_coords_texref = mod.get_texref('tex_station_coords') near_stations_texref = mod.get_texref('tex_near_stations') #ts_coords_texref = mod.get_texref('tex_ts_coords') could be a 4-channel 2 dim texture, or 3 dim texture. or just 1D. cuda.matrix_to_texref(matrix, matrix_texref, order="F") # copy directly to device with texref - made for 2D x 1channel textures cuda.matrix_to_texref(station_coords_int, station_coords_texref, order="F") # fortran ordering, because we will be accessing with texND() instead of C-style indices # again, matrix_to_texref is not used here because that function only understands 2D arrays near_stations_gpu.bind_to_texref_ext(near_stations_texref) # note, cuda.In and cuda.Out are from the perspective of the KERNEL not the host app! # stations_kernel disabled since true network distances are now being used #stations_kernel(near_stations_gpu, max_x, max_y, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape) # autoinit.context.synchronize() #self.cuda_context.synchronize() #print "Near stations list:" #print near_stations_gpu #sys.exit() print "\n----CALCULATION----" t_start = time.time() n_pass = 0 active_cells = list(self.active_cells) # make a working list of map cells that are accessible print "N active cells:", len(active_cells) while (n_pass < self.N_ITERATIONS) : n_pass += 1 random.shuffle(active_cells) active_cells_gpu = gpuarray.to_gpu(np.array(active_cells).astype(np.int32)) # Pay attention to grid sizes when testing: if you don't run the integrator on the coordinates connected to stations, # they don't move... so the whole thing stabilizes in a couple of cycles. # Stations are worked on in blocks to avoid locking up the GPU with one giant kernel. # for subset_low in range(0, n_stations, self.STATION_BLOCK_SIZE) : for subset_low in range(1) : # changed to try integrating more often subset_high = subset_low + self.STATION_BLOCK_SIZE if subset_high > n_stations : subset_high = n_stations sys.stdout.write( "\rpass %03i / station %04i of %04i / total runtime %03.1f min " % (n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0) ) sys.stdout.flush() self.emit(QtCore.SIGNAL( 'outputProgress(int, int, int, float, float)' ), n_pass, subset_high, n_stations, (time.time() - t_start) / 60.0, (time.time() - t_start) / n_pass + (subset_low/n_stations) ) # adding texrefs in kernel call seems to change nothing, leaving them out. 
# max_x and max_y could be #defined in kernel source, along with STATION_BLOCK_SIZE forces_kernel(np.int32(n_stations), np.int32(subset_low), np.int32(subset_high), max_x, max_y, active_cells_gpu, coords_gpu, forces_gpu, weights_gpu, errors_gpu, debug_gpu, debug_img_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape) #autoinit.context.synchronize() self.cuda_context.synchronize() # show a sample of the results #print coords_gpu.get() [00:10,00:10] #print forces_gpu.get() [00:10,00:10] #print weights_gpu.get()[00:10,00:10] time.sleep(0.05) # let the OS GUI use the GPU for a bit. #pl.imshow( (debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom')#, vmin=0, vmax=100 ) #pl.title( 'Debugging Output - step %03d' %n_pass ) #pl.colorbar() #pl.savefig( 'img/debug%03d.png' % n_pass ) #pl.close() # why was this indented? shouldn't it be integrated only after all stations are taken into account? integrate_kernel(max_x, max_y, coords_gpu, forces_gpu, weights_gpu, block=CUDA_BLOCK_SHAPE, grid=cuda_grid_shape) self.cuda_context.synchronize() if (self.IMAGES_EVERY > 0) and (n_pass % self.IMAGES_EVERY == 0) : #print 'Kernel debug output:' #print debug_gpu # velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis = 2)) # png_f = open('img/vel%03d.png' % n_pass, 'wb') # png_w = png.Writer(max_x, max_y, greyscale = True, bitdepth=8) # png_w.write(png_f, velocities / 1200) # png_f.close() # np.set_printoptions(threshold=np.nan) # print velocities.astype(np.int32) # pl.imshow( velocities.T, origin='bottom') #, vmin=0, vmax=100 ) # pl.title( 'Velocity ( sec / timestep) - step %03d' % n_pass ) # pl.colorbar() # pl.savefig( 'img/vel%03d.png' % n_pass ) # plt.close() # # pl.imshow( (errors_gpu.get() / weights_gpu.get() / 60.0 ).T, cmap=mymap, origin='bottom') #, vmin=0, vmax=100 ) # pl.title( 'Average absolute error (min) - step %03d' %n_pass ) # pl.colorbar() # pl.savefig( 'img/err%03d.png' % n_pass ) # pl.close() # pl.imshow( (debug_img_gpu.get() / 60.0).T, cmap=mymap, origin='bottom') #, vmin=0, vmax=100 ) # pl.title( 'Debugging Output - step %03d' %n_pass ) # pl.colorbar() # pl.savefig( 'img/debug%03d.png' % n_pass ) # pl.close() #self.emit( QtCore.SIGNAL( 'outputImage(QString)' ), QtCore.QString('img/err%03d.png' % n_pass) ) #self.emit( QtCore.SIGNAL( 'outputImage(QImage)' ), numpy2qimage( (errors_gpu.get() / weights_gpu.get() / 60.0 / 30 * 255 ).astype(np.uint8) ) ) velocities = np.sqrt(np.sum(forces_gpu.get() ** 2, axis = 2)) velocities /= 15. # out of 15 sec range velocities *= 255 np.clip(velocities, 0, 255, velocities) velImage = numpy2qimage(velocities.astype(np.uint8)).transformed(QtGui.QMatrix().rotate(-90)) # errors = np.sqrt(errors_gpu.get() / weights_gpu.get()) e = np.sum(np.nan_to_num(errors_gpu.get())) / np.sum(np.nan_to_num(weights_gpu.get())) print "average error (sec) over all active cells:", e errors = errors_gpu.get() / weights_gpu.get() # average instead of RMS error errors /= 60. errors /= 15. # out of 15 min range errors *= 255 np.clip(errors, 0, 255, errors) errImage = numpy2qimage(errors.astype(np.uint8)).transformed(QtGui.QMatrix().rotate(-90)) self.emit( QtCore.SIGNAL( 'outputImage(QImage, QImage)' ), errImage, velImage ) velImage.save('img/vel%03d.png' % n_pass, 'png' ) errImage.save('img/err%03d.png' % n_pass, 'png' ) sys.stdout.write( "/ avg pass time %02.1f sec" % ( (time.time() - t_start) / n_pass, ) ) sys.stdout.flush() #end of main loop np.save('result.npy', coords_gpu.get())
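# The comment above about matrix_to_texref only understanding 2D arrays points at the
# alternative used for near_stations: keep the data in a GPUArray and attach it with
# bind_to_texref_ext, so no extra host-side copy is needed. A minimal sketch of that
# pattern with a 1D integer texture; tex_lookup and gather are illustrative names, not
# taken from unified_mds_stochastic.cu:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.compiler import SourceModule

mod = SourceModule("""
texture<int, 1, cudaReadModeElementType> tex_lookup;

__global__ void gather(int *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    out[i] = tex1Dfetch(tex_lookup, i);   // linear-memory texture fetch
}
""")

n = 256
table_gpu = gpuarray.to_gpu(np.arange(n, dtype=np.int32))
texref = mod.get_texref("tex_lookup")
table_gpu.bind_to_texref_ext(texref)      # bind device memory directly to the texref

out = np.zeros(n, dtype=np.int32)
mod.get_function("gather")(cuda.Out(out), np.int32(n),
                           block=(128, 1, 1), grid=(2, 1), texrefs=[texref])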
} i += blockDim.x * gridDim.x; } } """) ######## #get the kernel ######## copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel") ######### #Map the Kernel to texture object ######### texref = mod_copy_texture.get_texref("tex") cuda.matrix_to_texref(realrow , texref , order = "C") #texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES) #texref.set_filter_mode() A=5 gpu_output = np.zeros_like(realrow) tic() copy_texture_func(cuda.In(np.float32([M,N,A])),cuda.Out(gpu_output),block=(32,32, 1), grid=(M/32,N/32,1), texrefs=[texref]) print "time ",toc() print "Output" print gpu_output p.gray() p.subplot(1,2,1)
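# The grid above is computed as M/32 x N/32 with integer division, which can silently
# drop the final partial block when the image dimensions are not multiples of 32 (unless
# the kernel strides over the whole range itself); several of the other snippets round up
# with np.ceil instead. A one-function sketch of that safer grid computation (grid_for is
# an illustrative helper):
import numpy as np

def grid_for(width, height, block=(32, 32, 1)):
    # round up so partial blocks are still launched; the kernel then needs an
    # explicit bounds check to discard out-of-range threads
    return (int(np.ceil(width / float(block[0]))),
            int(np.ceil(height / float(block[1]))),
            1)

print(grid_for(500, 300))   # (16, 10, 1) rather than (15, 9, 1) from integer division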
def trikmeans_gpu(data, clusters, iterations, return_times=0):
    """trikmeans_gpu(data, clusters, iterations) returns (clusters, labels)

    K-means using the triangle inequality algorithm and PyCuda.

    Input arguments are the data, initial cluster values, and the number of
    iterations to repeat.  The shape of data is (nDim, nPts) where
    nDim = # of dimensions in the data and nPts = number of data points.
    The shape of clusters is (nDim, nClusters).

    The return values are the updated clusters and labels for the data.
    """

    #---------------------------------------------------------------
    # get problem parameters
    #---------------------------------------------------------------
    (nDim, nPts) = data.shape
    nClusters = clusters.shape[1]

    #---------------------------------------------------------------
    # set calculation control variables
    #---------------------------------------------------------------
    useTextureForData = 0
    usePageLockedMemory = 0
    if nPts > 32768:
        useTextureForData = 0

    # block and grid sizes for the ccdist kernel (also for hdclosest)
    blocksize_ccdist = min(512, 16 * (1 + (nClusters - 1) // 16))
    gridsize_ccdist = 1 + (nClusters - 1) // blocksize_ccdist

    # block and grid sizes for the init module
    threads_desired = 16 * (1 + (max(nPts, nDim * nClusters) - 1) // 16)
    #blocksize_init = min(512, threads_desired)
    blocksize_init = min(128, threads_desired)
    gridsize_init = 1 + (threads_desired - 1) // blocksize_init

    # block and grid sizes for the step3 module
    blocksize_step3 = blocksize_init
    if not useTextureForData:
        blocksize_step3 = min(256, blocksize_step3)
    gridsize_step3 = gridsize_init

    # block and grid sizes for the step4 module
    # Each block of threads will handle seqcount times the data,
    # e.g. a blocksize of 512 and seqcount of 4 means each block reduces 4*512 = 2048 elements
    blocksize_step4 = 2
    while blocksize_step4 < min(512, nPts):
        blocksize_step4 *= 2
    maxblocks = 512
    seqcount_step4 = 1 + (nPts - 1) // (blocksize_step4 * maxblocks)
    gridsize_step4 = 1 + (nPts - 1) // (seqcount_step4 * blocksize_step4)

    blocksize_step4part2 = 1
    while blocksize_step4part2 < gridsize_step4:
        blocksize_step4part2 *= 2

    """
    print("blocksize_step4 =", blocksize_step4)
    print("gridsize_step4 =", gridsize_step4)
    print("seqcount_step4 =", seqcount_step4)
    """

    # block and grid sizes for the calc_movement module
    for blocksize_calcm in range(32, 512, 32):
        if blocksize_calcm >= nClusters:
            break
    gridsize_calcm = 1 + (nClusters - 1) // blocksize_calcm

    # block and grid sizes for the step56 module
    blocksize_step56 = blocksize_init
    gridsize_step56 = gridsize_init

    #---------------------------------------------------------------
    # prepare source modules
    #---------------------------------------------------------------
    t1 = time.time()
    mod_ccdist = kernels.get_big_module(nDim, nPts, nClusters,
                                        blocksize_step4, seqcount_step4,
                                        gridsize_step4, blocksize_step4part2,
                                        useTextureForData, BOUNDS)

    ccdist = mod_ccdist.get_function("ccdist")
    calc_hdclosest = mod_ccdist.get_function("calc_hdclosest")
    init = mod_ccdist.get_function("init")
    step3 = mod_ccdist.get_function("step3")
    step4 = mod_ccdist.get_function("step4")
    step4part2 = mod_ccdist.get_function("step4part2")
    calc_movement = mod_ccdist.get_function("calc_movement")
    step56 = mod_ccdist.get_function("step56")
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    module_time = t2 - t1

    #---------------------------------------------------------------
    # setup data on GPU
    #---------------------------------------------------------------
    t1 = time.time()

    data = np.array(data).astype(np.float32)
    clusters = np.array(clusters).astype(np.float32)

    if useTextureForData:
        # copy the data to the texture
        texrefData = mod_ccdist.get_texref("texData")
        cuda.matrix_to_texref(data, texrefData, order="F")
    else:
        if usePageLockedMemory:
            data_pl = cuda.pagelocked_empty_like(data)
            data_pl[:, :] = data
            gpu_data = gpuarray.to_gpu(data_pl)
        else:
            gpu_data = gpuarray.to_gpu(data)

    if usePageLockedMemory:
        clusters_pl = cuda.pagelocked_empty_like(clusters)
        clusters_pl[:, :] = clusters
        gpu_clusters = gpuarray.to_gpu(clusters_pl)
    else:
        gpu_clusters = gpuarray.to_gpu(clusters)

    gpu_assignments = gpuarray.zeros((nPts,), np.int32)              # cluster assignment
    gpu_lower = gpuarray.zeros((nClusters, nPts), np.float32)        # lower bounds on distance between
                                                                     # point and each cluster
    gpu_upper = gpuarray.zeros((nPts,), np.float32)                  # upper bounds on distance between
                                                                     # point and any cluster
    gpu_ccdist = gpuarray.zeros((nClusters, nClusters), np.float32)  # cluster-cluster distances
    gpu_hdClosest = gpuarray.zeros((nClusters,), np.float32)         # half distance to closest
    gpu_hdClosest.fill(1.0e10)    # set to large value // **TODO** get the actual float max
    gpu_badUpper = gpuarray.zeros((nPts,), np.int32)                 # flag to indicate upper bound needs recalc
    gpu_clusters2 = gpuarray.zeros((nDim, nClusters), np.float32)
    gpu_cluster_movement = gpuarray.zeros((nClusters,), np.float32)

    gpu_cluster_changed = gpuarray.zeros((nClusters,), np.int32)
    gpu_cluster_changed.fill(1)

    gpu_reduction_out = gpuarray.zeros((nDim, nClusters * gridsize_step4), np.float32)
    gpu_reduction_counts = gpuarray.zeros((nClusters * gridsize_step4,), np.int32)

    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    data_time = t2 - t1

    #---------------------------------------------------------------
    # do calculations
    #---------------------------------------------------------------
    ccdist_time = 0.
    hdclosest_time = 0.
    init_time = 0.
    step3_time = 0.
    step4_time = 0.
    step56_time = 0.
    t1 = time.time()
    ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
           block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    ccdist_time += t2 - t1

    t1 = time.time()
    calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    hdclosest_time += t2 - t1

    t1 = time.time()
    if useTextureForData:
        init(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
             gpu_lower, gpu_upper,
             block=(blocksize_init, 1, 1), grid=(gridsize_init, 1),
             texrefs=[texrefData])
    else:
        init(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
             gpu_lower, gpu_upper,
             block=(blocksize_init, 1, 1), grid=(gridsize_init, 1))
    pycuda.autoinit.context.synchronize()
    t2 = time.time()
    init_time += t2 - t1

    for i in range(iterations):

        if i > 0:
            t1 = time.time()
            ccdist(gpu_clusters, gpu_ccdist, gpu_hdClosest,
                   block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            ccdist_time += t2 - t1

            t1 = time.time()
            calc_hdclosest(gpu_ccdist, gpu_hdClosest,
                           block=(blocksize_ccdist, 1, 1), grid=(gridsize_ccdist, 1))
            pycuda.autoinit.context.synchronize()
            t2 = time.time()
            hdclosest_time += t2 - t1

        t1 = time.time()
        if i > 0:
            gpu_cluster_changed.fill(0)
        if useTextureForData:
            step3(gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                  gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1), grid=(gridsize_step3, 1),
                  texrefs=[texrefData])
        else:
            step3(gpu_data, gpu_clusters, gpu_ccdist, gpu_hdClosest, gpu_assignments,
                  gpu_lower, gpu_upper, gpu_badUpper, gpu_cluster_changed,
                  block=(blocksize_step3, 1, 1), grid=(gridsize_step3, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step3_time += t2 - t1

        t1 = time.time()
        if useTextureForData:
            step4(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts,
                  gpu_assignments,
                  block=(blocksize_step4, 1, 1), grid=(gridsize_step4, nDim),
                  texrefs=[texrefData])
        else:
            step4(gpu_data, gpu_cluster_changed, gpu_reduction_out,
                  gpu_reduction_counts, gpu_assignments,
                  block=(blocksize_step4, 1, 1), grid=(gridsize_step4, nDim))

        step4part2(gpu_cluster_changed, gpu_reduction_out, gpu_reduction_counts,
                   gpu_clusters2, gpu_clusters,
                   block=(blocksize_step4part2, 1, 1), grid=(1, nDim))

        calc_movement(gpu_clusters, gpu_clusters2, gpu_cluster_movement,
                      gpu_cluster_changed,
                      block=(blocksize_calcm, 1, 1), grid=(gridsize_calcm, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step4_time += t2 - t1

        t1 = time.time()
        if useTextureForData:
            step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement,
                   gpu_badUpper,
                   block=(blocksize_step56, 1, 1), grid=(gridsize_step56, 1),
                   texrefs=[texrefData])
        else:
            step56(gpu_assignments, gpu_lower, gpu_upper, gpu_cluster_movement,
                   gpu_badUpper,
                   block=(blocksize_step56, 1, 1), grid=(gridsize_step56, 1))
        pycuda.autoinit.context.synchronize()
        t2 = time.time()
        step56_time += t2 - t1

        # prepare for next iteration
        temp = gpu_clusters
        gpu_clusters = gpu_clusters2
        gpu_clusters2 = temp

    if return_times:
        return gpu_ccdist, gpu_hdClosest, gpu_assignments, gpu_lower, gpu_upper, \
               gpu_clusters.get(), gpu_cluster_movement, \
               data_time, module_time, init_time, \
               ccdist_time/iterations, hdclosest_time/iterations, \
               step3_time/iterations, step4_time/iterations, step56_time/iterations
    else:
        return gpu_clusters.get(), gpu_assignments.get()
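
# Hedged usage sketch for trikmeans_gpu (assumes the kernels module and BOUNDS used
# above are importable, and that a CUDA context exists via pycuda.autoinit). Data is
# laid out as (nDim, nPts), and the first nClusters points seed the centers.
import numpy as np

nDim, nPts, nClusters = 4, 10000, 8
data = np.random.rand(nDim, nPts).astype(np.float32)
initial_clusters = data[:, :nClusters].copy()

clusters, labels = trikmeans_gpu(data, initial_clusters, iterations=10)
print("clusters shape:", clusters.shape)   # (nDim, nClusters)
print("labels shape:", labels.shape)       # (nPts,)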
import numpy

realrow = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=numpy.float32).reshape(1, 5)

mod_copy_texture = cuda.SourceModule("""
texture<float, 1> tex;
texture<float, 1> tex2;

__global__ void copy_texture_kernel(float *data)
{
    int ty = threadIdx.y;
    //data[ty] = tex1D(tex, (float)(ty));
    data[ty] = tex1D(tex, (float)(ty) / 2.0f);
}
""")

copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")

texref = mod_copy_texture.get_texref("tex")
tex2ref = mod_copy_texture.get_texref("tex2")

cuda.matrix_to_texref(realrow, texref, order="C")
texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(cuda.filter_mode.LINEAR)

gpu_output = numpy.zeros_like(realrow)
copy_texture_func(cuda.Out(gpu_output), block=(1, 1, 1), texrefs=[texref])

print("Output:")
print(gpu_output)
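
# Hedged CPU-side sketch of what the LINEAR + TRSF_NORMALIZED_COORDINATES combination
# above returns for a 1D fetch, following the filtering formula in the CUDA programming
# guide (the hardware actually uses a 9-bit fixed-point weight, ignored here). The
# helper name linear_tex1d is invented for illustration.
import numpy as np

def linear_tex1d(texels, x_norm):
    """Approximate tex1D with linear filtering, normalized coords, clamp addressing."""
    n = len(texels)
    x = x_norm * n - 0.5                     # scale to texel space, half-texel offset
    i = int(np.floor(x))
    frac = x - i
    lo = texels[min(max(i, 0), n - 1)]       # clamp at the borders
    hi = texels[min(max(i + 1, 0), n - 1)]
    return (1.0 - frac) * lo + frac * hi

row = np.array([1.0, 2.0, 3.0, 4.0, 5.0], dtype=np.float32)
print(linear_tex1d(row, 0.0 / 2.0))          # what thread ty = 0 fetches above
print(linear_tex1d(row, 0.25))               # blends texels 0 and 1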