def __init__(self, vol_bnds, voxel_size): # Define voxel volume parameters. self._vol_bnds = vol_bnds # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters self._voxel_size = voxel_size # in meters (determines volume discretization and resolution) self._trunc_margin = self._voxel_size * 5 # truncation on SDF # Adjust volume bounds. self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size).copy(order='C').astype( int) # ensure C-order contigous self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype( np.float32) # ensure C-order contigous print("Voxel volume size: {:d} x {:d} x {:d}".format( self._vol_dim[0], self._vol_dim[1], self._vol_dim[2])) # Initialize pointers to voxel volume in CPU memory. self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32) self._weight_vol_cpu = np.zeros(self._vol_dim).astype( np.float32 ) # for computing the cumulative moving average of observations per voxel self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) # Copy voxel volumes to GPU. if TSDF_GPU_MODE: self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes) cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu) self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes) cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu) self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes) cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu) # Cuda kernel function (C++) self._cuda_src_mod = SourceModule(""" __global__ void integrate(float * tsdf_vol, float * weight_vol, float * color_vol, float * vol_dim, float * vol_origin, float * cam_intr, float * cam_pose, float * other_params, float * color_im, float * depth_im) { // Get voxel index. int gpu_loop_idx = (int) other_params[0]; int max_threads_per_block = blockDim.x; int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x; int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block + block_idx * max_threads_per_block + threadIdx.x; int vol_dim_x = (int)vol_dim[0]; int vol_dim_y = (int)vol_dim[1]; int vol_dim_z = (int)vol_dim[2]; if (voxel_idx > vol_dim_x * vol_dim_y * vol_dim_z) return; // Get voxel grid coordinates. float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z))); float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z)); float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z); // Voxel grid coordinates to world coordinates. float voxel_size = other_params[1]; float pt_x = vol_origin[0] + voxel_x * voxel_size; float pt_y = vol_origin[1] + voxel_y * voxel_size; float pt_z = vol_origin[2] + voxel_z * voxel_size; // World coordinates to camera coordinates. float tmp_pt_x = pt_x - cam_pose[0*4+3]; float tmp_pt_y = pt_y - cam_pose[1*4+3]; float tmp_pt_z = pt_z - cam_pose[2*4+3]; float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z; float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z; float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z; // Camera coordinates to image pixels. int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]); int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]); // Skip if outside view frustum. int im_h = (int) other_params[2]; int im_w = (int) other_params[3]; if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0) return; // Skip invalid depth. float depth_value = depth_im[pixel_y*im_w+pixel_x]; if (depth_value == 0) return; // Integrate TSDF. float trunc_margin = other_params[4]; float depth_diff = depth_value-cam_pt_z; if (depth_diff < -trunc_margin) return; float dist = fmin(1.0f, depth_diff / trunc_margin); float w_old = weight_vol[voxel_idx]; float obs_weight = other_params[5]; float w_new = w_old + obs_weight; weight_vol[voxel_idx] = w_new; tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new; // Integrate color. float old_color = color_vol[voxel_idx]; float old_b = floorf(old_color / (256 * 256)); float old_g = floorf((old_color - old_b * 256 * 256) / 256); float old_r = old_color - old_b * 256 * 256 - old_g * 256; float new_color = color_im[pixel_y*im_w+pixel_x]; float new_b = floorf(new_color / (256 * 256)); float new_g = floorf((new_color - new_b * 256 * 256) / 256); float new_r = new_color - new_b * 256 * 256 - new_g * 256; new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f); new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f); new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f); color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r; }""") self._cuda_integrate = self._cuda_src_mod.get_function("integrate") # Determine block/grid size on GPU. gpu_dev = cuda.Device(0) self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK n_blocks = int( np.ceil( float(np.prod(self._vol_dim)) / float(self._max_gpu_threads_per_block))) grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks)))) grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))) grid_dim_z = min( gpu_dev.MAX_GRID_DIM_Z, int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y)))) self._max_gpu_grid_dim = np.array( [grid_dim_x, grid_dim_y, grid_dim_z]).astype(int) self._n_gpu_loops = int( np.ceil( float(np.prod(self._vol_dim)) / float( np.prod(self._max_gpu_grid_dim) * self._max_gpu_threads_per_block)))
if ndarray.ndim >= 2: cuda.TextureReference.set_address_mode( tex_ref, 1, address_mode) if ndarray.ndim >= 3: cuda.TextureReference.set_address_mode( tex_ref, 2, address_mode) cuda.TextureReference.set_filter_mode( tex_ref, filter_mode) tex_ref.set_flags(tex_ref.get_flags( ) & ~cuda.TRSF_NORMALIZED_COORDINATES & ~cuda.TRSF_READ_AS_INTEGER) with open(os.path.join(os.path.dirname(__file__), 'resize.cu')) as f: _read_data = f.read() _mod = SourceModule(_read_data) _tex_ref = _mod.get_texref('sourceTex') _kernel = _mod.get_function('resize') def resize_gpu(src_vol, dst_vol=None, dst_shape=None, scaling=None): if dst_shape is None: assert scaling dst_shape = [np.int(np.round(i * scaling)) for i in src_vol.shape] if dst_vol is None: dst_vol = gpuarray.GPUArray(dst_shape, np.float32) ndarray_to_float_tex(_tex_ref, src_vol)
def elmvis(Xraw, A, slowdown=10, report=5, maxtime=24 * 60 * 60, tol=0, batch=None, maxiter=None, maxupdate=None, maxstall=None, cossim=None, silent=False): """ELMVIS+ function running in GPU memory. """ X = Xraw / np.linalg.norm(Xraw, axis=1)[:, None] # unit-length version of X Xh = np.dot(A, X) # X_hat, predicted value of X N, d = X.shape I = np.arange(N) # index of samples # set default values if cossim is None: cossim = np.trace(X.T.dot(A).dot(X)) / N if maxiter is None: maxiter = N * N * N if maxupdate is None: maxupdate = N * N if maxstall is None: maxstall = N * N if not silent: print "original similarity: ", cossim # init GPU dt = X.dtype.type try: linalg.init() except ImportError as e: print e devA = gpuarray.to_gpu(A.astype(dt)) devX = gpuarray.to_gpu(X.astype(dt)) devXi1 = gpuarray.empty((d, ), dtype=dt) devXh = linalg.dot(devA, devX) devAi = gpuarray.empty((N, 2), dtype=dt) devDelta = gpuarray.empty((2, d), dtype=dt) result = gpuarray.empty((d, ), dtype=dt) # swap kernel kernel = """ __global__ void diff(%s *A, %s *Y, %s *AY, %s *result, long d, long N, long i1, long i2) { long j = blockDim.x * blockIdx.x + threadIdx.x; %s yi1 = Y[i1*d + j]; %s yi2 = Y[i2*d + j]; result[j] = (A[i1*N + i1] * (yi2 - yi1) + 2*AY[i1*d + j]) * (yi2 - yi1) + (A[i2*N + i2] * (yi1 - yi2) + 2*(AY[i2*d + j] + A[i2*N + i1]*(yi2 - yi1))) * (yi1 - yi2); } """ if dt is np.float64: kernel = kernel % ("double", "double", "double", "double", "double", "double") else: kernel = kernel % ("float", "float", "float", "float", "float", "float") mod_diff = SourceModule(kernel) dev_diff = mod_diff.get_function("diff") dev_diff.prepare("PPPPllll") block = result._block grid = (int(np.ceil(1.0 * result.shape[0] / block[0])), 1) t0 = tlast = time() stall = 0 iters = 0 updates = 0 updates_last = 0 iters_last = 0 ups_max = 0 while (iters < maxiter) and (stall < maxstall): iters += 1 stall += 1 # get two different random numbers i1, i2 = np.random.randint(0, N, size=2) while i1 == i2: i1, i2 = np.random.randint(0, N, size=2) dev_diff.prepared_call(grid, block, devA.gpudata, devX.gpudata, devXh.gpudata, result.gpudata, d, N, i1, i2) diff = np.sum(result.get()) if diff > tol: stall = 0 devAi[:, 0] = devA[:, i1] devAi[:, 1] = devA[:, i2] devDelta[0, :] = devX[i1, :] - devX[i2, :] devDelta[1, :] = devX[i2, :] - devX[i1, :] linalg.add_dot(devAi, devDelta, devXh, alpha=-1) tI = I[i1] I[i1] = I[i2] I[i2] = tI devXi1[:] = devX[i1, :] devX[i1] = devX[i2] devX[i2] = devXi1 cossim += diff / N updates += 1 if updates > maxupdate: break t = time() if t - tlast > report: ups = (updates - updates_last) * 1.0 / (t - tlast) ips = (iters - iters_last) * 1.0 / (t - tlast) if not silent: print "%d iters | %d updates | %.0f iters/s | %.0f updates/s | cos similarity = %.4f" % ( iters, updates, ips, ups, cossim) updates_last = updates iters_last = iters tlast = t ups_max = max(ups, ups_max) if ups < ups_max / slowdown: break if t - t0 > maxtime: break ips = iters * 1.0 / (time() - t0) ups = updates * 1.0 / (time() - t0) Xraw[:] = Xraw[I] cossim = np.trace(X.T.dot(A).dot(X)) / N if not silent: print "final similarity: ", cossim info = { 'cossim': cossim, 'iters': iters, 'updates': updates, 'ips': ips, 'ups': ups } return I, info
@author: tristan ''' import os import numpy as np from pycuda.compiler import SourceModule from gpu.gpuSimulation import useCachedKernels fluxCode = open( os.path.join(os.path.dirname(__file__), './fluxCalculations.cu'), 'r') try: # Put the kernel code into a SourceModule if useCachedKernels: fluxModule = SourceModule(fluxCode.read()) else: fluxModule = SourceModule(fluxCode.read(), cache_dir=False) fluxCode.close() # Create reference to the specific functions in the SourceModule FluxSolverFn = fluxModule.get_function("FluxSolver") BuildRFn = fluxModule.get_function("buildRValues") # Create callable functions def FluxSolver(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU, propSpeedsGPU, m, n, blockDims, gridDims): FluxSolverFn(FluxesGPU, UIntPtsGPU, BottomIntPtsGPU,
class ImageFilter: src_module = """ __global__ void grayscale_filter(unsigned char *red, unsigned char *green, unsigned char *blue, unsigned int height, unsigned int width) { unsigned int row = threadIdx.y + blockIdx.y * blockDim.y; unsigned int col = threadIdx.x + blockIdx.x * blockDim.x; unsigned int index = col + row * width; // If boundary hit don't keep going (i.e. grid * block exceed max) if ((row > height) || (col > width)) return; // luminosity method unsigned char intensity = static_cast<unsigned char>( red[index] * 0.3 + green[index] * 0.59 + blue[index] * 0.11 ); red[index] = intensity; green[index] = intensity; blue[index] = intensity; } """ # Could have multiple __global__ kernels in here! def __init__(self, image_array, dim_block=32): self.module = SourceModule(self.src_module) self.image_array = image_array self.dim_block = dim_block self.block_size = dim_block**2 # square block used i.e. (16, 16, 1) self.grid_size = None # last filter grid used # number of rows, number of columns, and pixel vector size - here its 4 for rgba self.height, self.width, self.pixel_dimension = self.image_array.shape self.image_size = self.height * self.width @property def grayscale(self): """ Convert Image to Grayscale: luminosity of -> (0.3 * R) + (0.59 * G) + (0.11 * B) """ # Copy dimension as to not write over address for future call. red = np.copy(self.image_array[:, :, 0]) green = np.copy(self.image_array[:, :, 1]) blue = np.copy(self.image_array[:, :, 2]) # Adjust grid to specified block size dim_grid_x = math.ceil(self.width / self.dim_block) dim_grid_y = math.ceil(self.height / self.dim_block) # Determine max grid max_grid_dim_x = pycuda.autoinit.device.get_attribute( cuda.device_attribute.MAX_GRID_DIM_X) max_grid_dim_y = pycuda.autoinit.device.get_attribute( cuda.device_attribute.MAX_GRID_DIM_Y) # If grid determined from block size exceeds max grid, we have issues. if (max_grid_dim_x * max_grid_dim_y) < (dim_grid_x * dim_grid_y): raise ValueError('ERROR :: Image demensions :: Grid exceeds max') # for easy sanity check tracking self.grid_size = dim_grid_x * dim_grid_y # Call specific function from CUDA kernel grayscale_filter = self.module.get_function('grayscale_filter') # Use grayscale function is specific grid, block, and array for color channels grayscale_filter(cuda.InOut(red), cuda.InOut(green), cuda.InOut(blue), np.uint32(self.height), np.uint32(self.width), block=(self.dim_block, self.dim_block, 1), grid=(dim_grid_x, dim_grid_y)) # Allocates array and it will not take the time to set the element values. grayscale_image_array = np.empty_like(self.image_array.copy()) grayscale_image_array[:, :, 0] = red grayscale_image_array[:, :, 1] = green grayscale_image_array[:, :, 2] = blue return grayscale_image_array
mcopy = cuda.Memcpy3D() mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1] mcopy.src_height = mcopy.height = ny mcopy.depth = nz memcopy(mcopy, set_c(f,(None,-1,-1)), cex_gpu) memcopy(mcopy, set_c(f,(-1,None,-1)), cey_gpu) memcopy(mcopy, set_c(f,(-1,-1,None)), cez_gpu) memcopy(mcopy, set_c(f,(None,0,0)), chx_gpu) memcopy(mcopy, set_c(f,(0,None,0)), chy_gpu) memcopy(mcopy, set_c(f,(0,0,None)), chz_gpu) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule(kernels) update_e = mod.get_function("update_e") update_h = mod.get_function("update_h") update_src = mod.get_function("update_src") tex = mod.get_texref("tex") tey = mod.get_texref("tey") tez = mod.get_texref("tez") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") tchx = mod.get_texref("tchx") tchy = mod.get_texref("tchy") tchz = mod.get_texref("tchz")
local_gpu_setup_kernel = pycuda.compiler.SourceModule( cuda_full_observables_production.cuda_full_observables_production_code, no_extern_c=True).get_function('setup_kernel') local_rng_states = drv.mem_alloc( np.int32(num_blocks * block_dim) * pycuda.characterize.sizeof( 'curandStateXORWOW', '#include <curand_kernel.h>')) local_gpu_setup_kernel(np.int32(int(num_blocks * block_dim)), local_rng_states, np.uint64(0), np.uint64(0), grid=(int(num_blocks), 1), block=(int(block_dim), 1, 1)) # get observables function gpu_observables_func = SourceModule( cuda_full_observables_production.cuda_full_observables_production_code, no_extern_c=True).get_function( 'gpu_full_observables_production_with_arrays') a_selected_sampler = [] for par_name in l_grid_parameters: a_selected_sampler.append(a_samples[:, d_parameter_to_index[par_name]]) a_selected_sampler = np.asarray(a_selected_sampler) print np.mean(a_selected_sampler, axis=1) print np.cov(a_selected_sampler) num_bins_s1_th2f = 70 num_bins_s2_th2f = 70 bin_edges_s1_th2 = np.linspace(3, 70, num_bins_s1_th2f + 1)
boundaryConditionsModule = SourceModule(""" __global__ void applyWallBoundaries(float *meshU, int m, int n) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; if (row < 2 && col < n) { meshU[row*n*3 + col*3] = meshU[(3-row)*n*3 + col*3]; meshU[row*n*3 + col*3 + 2] = -meshU[(3-row)*n*3 + col*3 + 2]; } else if (row > m-3 && row < m && col < n) { meshU[row*n*3 + col*3] = meshU[(2*m-5-row)*n*3 + col*3]; meshU[row*n*3 + col*3 + 2] = -meshU[(2*m-5-row)*n*3 + col*3 + 2]; } if (col < 2 && row < m) { meshU[row*n*3 + col*3] = meshU[row*n*3 + (3-col)*3]; meshU[row*n*3 + col*3 + 1] = -meshU[row*n*3 + (3-col)*3 + 1]; } else if (col > n-3 && col < n && row < m) { meshU[row*n*3 + col*3] = meshU[row*n*3 + (2*n-5-col)*3]; meshU[row*n*3 + col*3 + 1] = -meshU[row*n*3 + (2*n-5-col)*3 + 1]; } } __global__ void applyOpenBoundaries(float *meshU, int m, int n) { int row = blockIdx.y * blockDim.y + threadIdx.y; int col = blockIdx.x * blockDim.x + threadIdx.x; if (row < 2 && col < n) { meshU[row*n*3 + col*3] = meshU[2*n*3 + col*3]; meshU[row*n*3 + col*3 + 1] = meshU[2*n*3 + col*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[2*n*3 + col*3 + 2]; } else if (row > m-3 && row < m && col < n) { meshU[row*n*3 + col*3] = meshU[(m-3)*n*3 + col*3]; meshU[row*n*3 + col*3 + 1] = meshU[(m-3)*n*3 + col*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[(m-3)*n*3 + col*3 + 2]; } if (col < 2 && row < m) { meshU[row*n*3 + col*3] = meshU[row*n*3 + 2*3]; meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + 2*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + 2*3 + 2]; } else if (col > n-3 && col < n && row < m) { meshU[row*n*3 + col*3] = meshU[row*n*3 + (n-3)*3]; meshU[row*n*3 + col*3 + 1] = meshU[m*n*3 + (n-3)*3 + 1]; meshU[row*n*3 + col*3 + 2] = meshU[m*n*3 + (n-3)*3 + 2]; } } """)
def write_results(prediction, confidence, num_classes, cuda_code, nms_conf=0.2): #confidence == 0.5 #num_classes = 80 #prediction = prediction.numpy() #print(type(prediction[0,1,0])) #tmp =(prediction[:,:,4] > confidence).float().unsqueeze(2) #print((prediction[:,:,4] > confidence).shape) #shape threshhold from [1,10647] => [1,10647,1] #print(tmp.shape) threshhold = (prediction[:, :, 4] > confidence) #print(threshhold.shape) conf_mask = np.expand_dims(threshhold, axis=2) #print(conf_mask.shape) prediction = prediction * conf_mask #print(prediction) box_corner = np.empty_like(prediction) box_corner[:, :, 0] = (prediction[:, :, 0] - prediction[:, :, 2] / 2) box_corner[:, :, 1] = (prediction[:, :, 1] - prediction[:, :, 3] / 2) box_corner[:, :, 2] = (prediction[:, :, 0] + prediction[:, :, 2] / 2) box_corner[:, :, 3] = (prediction[:, :, 1] + prediction[:, :, 3] / 2) prediction[:, :, :4] = box_corner[:, :, :4] print(type(prediction)) batch_size = prediction.shape[0] write = False THETA = nms_conf for ind in range(batch_size): image_pred = prediction[ind] #image Tensor arr = image_pred[:, 5:5 + num_classes] #max_conf, max_conf_score = image_pred[:,5:5+num_classes].max(axis= 1) max_conf = np.max(arr, axis=1) max_conf_score = np.argmax(arr, axis=1) #print(max_conf) #print(max_conf_score) #print(max_conf.shape) #print(max_conf_score.shape) #break max_conf = np.expand_dims(max_conf, axis=1) max_conf_score = np.expand_dims(max_conf_score, axis=1) max_conf_score = np.asarray(max_conf_score, dtype=np.float32) seq = (image_pred[:, :5], max_conf, max_conf_score) #print(max_conf) #print(max_conf_score) #print(max_conf.shape) #print(max_conf_score.shape) #break image_pred = np.concatenate(seq, 1) #print(image_pred.shape) non_zero_ind = np.nonzero(image_pred[:, 4]) non_zero_ind = non_zero_ind[0] #print(non_zero_ind) image_pred_ = image_pred[np.squeeze(non_zero_ind), :] try: image_pred_ = image_pred[np.squeeze(non_zero_ind), :] #print(image_pred_) except: continue #print(image_pred_.shape) if image_pred_.shape[0] == 0: continue # #Get the various classes detected in the image img_classes = unique(image_pred_[:, -1]) # -1 index holds the class index #print(img_classes) for cls in img_classes: #perform NMS #get the detections with one particular class image_pred_reshaped = np.expand_dims(image_pred_[:, -1] == cls, axis=1) cls_mask = image_pred_ * image_pred_reshaped class_mask_ind = np.nonzero(cls_mask[:, -2]) class_mask_ind = np.squeeze(class_mask_ind) image_pred_class = image_pred_[class_mask_ind] print(image_pred_class) #sort the detections such that the entry with the maximum objectness #confidence is at the top conf_sort_index = np.sort(image_pred_class[:, 4])[::-1] boxes = image_pred_class[:, 0:4].copy() conf_scores = image_pred_class[:, 4].copy() #print(conf_sort_index) cuda_code = string.Template(cuda_code) cuda_code = cuda_code.substitute(THETA=THETA) modules = SourceModule(cuda_code) # python function will change array's value, so use .copy() cuda_start = time.time() cuda_results = cuda_nms(modules, boxes, conf_scores, nms_conf) cuda_end = time.time() print("CUDA results:", cuda_results) print("CUDA version takes {} seconds".format(cuda_end - cuda_start)) print(cuda_results) index = cuda_results[0] image_pred_class = np.expand_dims(image_pred_class[index], axis=0) print(image_pred_class) batch_ind = np.zeros((image_pred_class.shape[0], 1)) #print(tmp) #batch_ind = tmp.fill_(ind) #Repeat the batch_id for as many detections of the class cls in the image seq = batch_ind, image_pred_class if not write: output = np.concatenate(seq, 1) write = True else: out = np.concatenate(seq, 1) output = np.concatenate([output, out]) try: return output except: return 0
from time import time # this is a naive parallel prefix-sum kernel that uses shared memory naive_ker = SourceModule(""" __global__ void naive_prefix(double *vec, double *out) { __shared__ double sum_buf[1024]; int tid = threadIdx.x; sum_buf[tid] = vec[tid]; // begin parallel prefix sum algorithm int iter = 1; for (int i=0; i < 10; i++) { __syncthreads(); if (tid >= iter ) { sum_buf[tid] = sum_buf[tid] + sum_buf[tid - iter]; } iter *= 2; } __syncthreads(); out[tid] = sum_buf[tid]; __syncthreads(); } """) naive_gpu = naive_ker.get_function("naive_prefix")
mod = SourceModule(""" __device__ void subst(float *a, float *b) { a[0] = b[0]; a[1] = b[1]; } __device__ void cadd(float *a, float *b, float *c) { c[0] = a[0] + b[0]; c[1] = a[1] + b[1]; } __device__ void cmul(float *a, float *b, float *c) { c[0] = a[0] * b[0] - a[1] * b[1]; c[1] = a[0] * b[1] + a[1] * b[0]; } __global__ void manderblot(float *x, float *y, int *image, int N, int roop, float threshold) { int idx = blockDim.x*blockIdx.x + threadIdx.x; float z[2] = {0, 0}; float _z[2] = {0, 0}; int i; float c[2] = {x[idx % N], y[idx / N]}; for (i = 0; i < roop; ++i) { cmul(z,z,_z); cadd(_z, c, _z); if (_z[0] * _z[0] + _z[1] * _z[1] > threshold * threshold){ image[idx] = i; return; } subst(z, _z); } image[idx] = 0; return; } """)
def _get_lut_bprop_kernel(dtype, deterministic=False): """ Builds the bprop kernel for lookup table layers based on templated code. If the deterministic version is requested, an index buffer must be passed as an argument. This index buffer re-orders items in the input tensor so that word_ids are sorted. This is required since we need to be sure that each thread only updates weights for one word id. Arguments: dtype (np.dtype): The data which the kernel will operate on. deterministic (boolean): Builds the deterministic kernel when this is set to True. """ if not deterministic: code = r""" __global__ void lut_bprop( int* inputs, %(type)s* dW, %(type)s* errors, const int nin, const int embedding_dim, const int vocab_size, const int pad_idx) { const int tid = threadIdx.x; const int bid = blockIdx.x; int word_id = inputs[bid]; int error_row = bid * embedding_dim; int output_row = word_id * embedding_dim; if(word_id != pad_idx) { for(int i = tid; i < embedding_dim; i += blockDim.x) { atomicAdd(&dW[output_row + i], errors[error_row + i]); } } } """ code = code % {"type": _ew_types[dtype]["type"]} module = SourceModule(code, options=["--use_fast_math"]) kernel = module.get_function("lut_bprop") kernel.prepare("PPPIIIi") else: code = r""" __global__ void lut_bprop( int* inputs, int* index_buffer, %(type)s* dW, %(type)s* errors, const int nin, const int embedding_dim, const int vocab_size, const int pad_idx) { const int tid = threadIdx.x; const int bid = blockIdx.x; int index_position = bid; int index = index_buffer[index_position]; int word_id = inputs[index]; if((bid == 0 || word_id != inputs[index_buffer[bid - 1]]) && word_id != pad_idx) { int output_row = word_id * embedding_dim; do { int error_row = index * embedding_dim; for(int i = tid; i < embedding_dim; i += blockDim.x) { dW[output_row + i] += errors[error_row + i]; } index_position++; if(index_position == gridDim.x) { break; } index = index_buffer[index_position]; } while(inputs[index] == word_id); } } """ code = code % {"type": _ew_types[dtype]["type"]} module = SourceModule(code, options=["--use_fast_math"]) kernel = module.get_function("lut_bprop") kernel.prepare("PPPPIIIi") kernel.name = "lut_bprop" return kernel
def _get_sorting_kernel(kernel_id, block_size): """ Builds kernels used for sorting inputs. There are several kernels here corresponding to the steps in the algorithm. The algorithm works by determining the sorted position for each input item. This is done with a bucket sort algorithm, where each word_id is a bucket. The first step determines the size of each bucket (number of occurences of each word_id). Next, a prefix some is computed over the list of bucket sizes to find where each bucket will be placed in the output buffer. Finally, each thread places it's index into the correct sorted position based on the bucket start index (computed from the prefix sum) and that thread's offset into the bucket (which is taken from the output of the atomic add done in the first step.) Arguments: kernel_id (Integer): Which step to build the kernel for [0, 4] block_size (Integer): Number of threads per block for the prefix sum kernels. """ code = r""" #define THREADS %(threads)s #define STORE_BLOCKSUM %(store_blocksum)s __global__ void sort_inputs0( int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size, const int input_length) { const int tid = threadIdx.x + (blockDim.x * blockIdx.x); int word_id; if(tid < input_length) { word_id = inputs[tid]; offset_buffer[tid] = atomicAdd(&word_counts[word_id], 1); } } __device__ void scan(int* buffer, int* blocksum, int global_length) { const int tid = (threadIdx.x << 1) + 1; const int gid = ((threadIdx.x + (blockIdx.x * blockDim.x)) << 1) + 1; __shared__ int local_counts[THREADS * 2]; local_counts[tid] = buffer[gid]; local_counts[tid - 1] = buffer[gid - 1]; #pragma unroll for(int skip = 1; skip <= THREADS; skip <<= 1) { int mask = (skip << 1) - 1; if((tid & mask) == mask) { local_counts[tid] += local_counts[tid - skip]; } __syncthreads(); } if(tid == (THREADS * 2 - 1)) { #if STORE_BLOCKSUM blocksum[blockIdx.x] = local_counts[tid]; #endif local_counts[tid] = 0; } #pragma unroll for(int skip = THREADS; skip > 0; skip >>= 1) { int mask = (skip << 1) - 1; if((tid & mask) == mask) { int temp = local_counts[tid - skip]; local_counts[tid - skip] = local_counts[tid]; local_counts[tid] += temp; } __syncthreads(); } if(gid < global_length) { buffer[gid] = local_counts[tid]; buffer[gid - 1] = local_counts[tid - 1]; } } __global__ void sort_inputs1( int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size, const int input_length) { scan(word_counts, word_counts + vocab_size, vocab_size); } __global__ void sort_inputs2( int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size, const int input_length) { scan(word_counts + vocab_size, 0, blockDim.x); } __global__ void sort_inputs3( int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size, const int input_length) { const int gid = (threadIdx.x + (blockIdx.x * blockDim.x)) << 1; if(gid < vocab_size) { word_counts[gid] += word_counts[vocab_size + blockIdx.x]; word_counts[gid + 1] += word_counts[vocab_size + blockIdx.x]; } } __global__ void sort_inputs4( int* inputs, int* index_buffer, int* offset_buffer, int* word_counts, const int vocab_size, const int input_length) { const int tid = threadIdx.x + (blockDim.x * blockIdx.x); int word_id; if(tid < input_length) { word_id = inputs[tid]; int sorted_position = word_counts[word_id] + offset_buffer[tid]; index_buffer[sorted_position] = tid; } } """ code = code % { "threads": block_size, "store_blocksum": (1 if kernel_id == 1 else 0) } module = SourceModule(code, options=["--use_fast_math"]) function_name = "sort_inputs" + str(kernel_id) kernel = module.get_function(function_name) kernel.prepare("PPPPII") kernel.name = "sort_inputs" return kernel
#=================================================== #=== A = alpha*A + beta*B #=== input A, B, alpha, beta #=== output A #=================================================== mod = SourceModule \ ( """ #include<stdio.h> #include<math.h> #define INDEX(a, b, yshape) (a)*(yshape) + (b) __global__ void matrixAddition(float *A,float *B, float alpha, float beta, int ydim) { unsigned int idx = threadIdx.x+(blockIdx.x*(blockDim.x*blockDim.y)); unsigned int a = idx/ydim; unsigned int b = idx%ydim; A[INDEX(a, b, ydim)] = alpha*A[INDEX(a, b, ydim)]+beta*B[INDEX(a, b, ydim)]; } """ ) matrixAddition = mod.get_function("matrixAddition") def matAdd(A, B, alpha, beta): forme1 = A.shape
cf = np.ones_like(f)*0.5 ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device( set_c(cf,(None,-1,-1)) ) cey_gpu = cuda.to_device( set_c(cf,(-1,None,-1)) ) cez_gpu = cuda.to_device( set_c(cf,(-1,-1,None)) ) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('TPB',str(tpb)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) ) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") update_h.prepare("PPPPPP", block=(tpb,1,1)) update_e.prepare("PPPPPPPPP", block=(tpb,1,1)) update_src.prepare("fP", block=(nz,1,1)) # prepare for plot from matplotlib.pyplot import * ion() imsh = imshow(np.ones((nx,ny),'f').T, cmap=cm.hot, origin='lower', vmin=0, vmax=0.005) colorbar() # measure kernel execution time
//d_Result[gmemPos] = 128; smemPos += smemStride; gmemPos += gmemStride; } } ''' template = string.Template(template) code = template.substitute(KERNEL_RADIUS=KERNEL_RADIUS, KERNEL_W=KERNEL_W, COLUMN_TILE_H=COLUMN_TILE_H, COLUMN_TILE_W=COLUMN_TILE_W, ROW_TILE_W=ROW_TILE_W, KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED) module = SourceModule(code) convolutionRowGPU = module.get_function('convolutionRowGPU') convolutionColumnGPU = module.get_function('convolutionColumnGPU') d_Kernel_rows = module.get_global('d_Kernel_rows')[0] d_Kernel_columns = module.get_global('d_Kernel_columns')[0] # Helper functions for computing alignment... def iDivUp(a, b): # Round a / b to nearest higher integer value a = numpy.int32(a) b = numpy.int32(b) return (a / b + 1) if (a % b != 0) else (a / b) def iDivDown(a, b):
Sampler = SourceModule(""" #include <stdio.h> __device__ int cudarand(long long seed) { if (seed == 0) { seed = 1; } long long temp=(48271 * seed + 0) % 2147483647; return temp; } __global__ void rand_Negative_binomial(float *randomseed, int *target, int * matrix_scale,float *r, float *p) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx<matrix_scale[0]) { int seed = cudarand(randomseed[idx] * 2147483647.0); seed=cudarand(seed); int current_index = idx/matrix_scale[1]; int suc = 0.0; float fail = 0.0; int total_r = r[current_index]; float prob = p[current_index]; while(total_r>fail) { seed = cudarand(seed); float temp = seed/2147483647.0; if (temp<prob) { suc++; } else { fail++; } } target[idx]=suc; } } """)
Ts[i_12]+=An[5]/my_factorial; } } } """ try: Context.get_device() except: import pycuda.autoinit mod = SourceModule(krnl, no_extern_c=True) _gpu_expm = mod.get_function("expm") def gpu_expm(As, Ts_vectorized, p=12): N = len(As) if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 6: raise ValueError(Ts_vectorized.shape) threadsPerBlock = 512 nBlocks = int(np.ceil(float(N) / float(threadsPerBlock))) _gpu_expm(As.gpu, Ts_vectorized.gpu, np.int32(N), np.int32(p),
import pycuda.gpuarray as gpuarray s = cuda.Event() e = cuda.Event() s.record() code = """ __global__ void add_one(int n, int start, float *x) { int index = start + blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; for (int i = index; i < n; i += stride) x[i] += 1.0; } """ mod = SourceModule(code) add_one = mod.get_function("add_one") N = np.int32(1e8) nStreams = 2 streamSize = np.int32(N / nStreams) x = np.ones(N, dtype=np.float32) x_gpu = gpuarray.empty(N, dtype=np.float32) #cuda.memcpy_htod(x_gpu, x) stream = [] for i in range(nStreams): stream.append(cuda.Stream())
def build_sparse_transition_model_at_T(T, T_gpu, vel_data_gpu, params, bDimx, params_gpu, xs_gpu, ys_gpu, ac_angles, results, sumR_sa, save_file_for_each_a=False): gsize = int(params[0]) num_actions = int(params[1]) nrzns = int(params[2]) all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu = vel_data_gpu results_gpu_list = [] sumR_sa_gpu_list = [] for i in range(num_actions): results_gpu_list.append(cuda.mem_alloc(results.nbytes)) sumR_sa_gpu_list.append(cuda.mem_alloc(sumR_sa.nbytes)) for i in range(num_actions): cuda.memcpy_htod(results_gpu_list[i], results) cuda.memcpy_htod(sumR_sa_gpu_list[i], sumR_sa) print("alloted mem in inner func") # let one thread access a state centre. access coresponding velocities, run all actions # TODO: dt may not be int for a genral purpose code mod = SourceModule(""" __device__ int32_t get_thread_idx() // assigns idx to thread with which it accesses the flattened 3d vxrzns matrix // for a given T and a given action. // runs for both 2d and 3d grid // TODO: may have to change this considering cache locality { // here i, j, k refer to a general matrix M[i][j][k] int32_t i = threadIdx.x; int32_t j = blockIdx.y; int32_t k = blockIdx.x; int32_t idx = k + (j*gridDim.x) + (i*gridDim.x*gridDim.y)+ blockIdx.z*blockDim.x*gridDim.x*gridDim.y; return idx; } __device__ int32_t state1D_from_thread(int32_t T) { // j ~ blockIdx.x // i ~ blockIdx.y // The above three consitute a spatial state index from i and j of grid // last term is for including time index as well. return (blockIdx.x + (blockIdx.y*gridDim.x) + (T*gridDim.x*gridDim.y) ); } __device__ int32_t state1D_from_ij(int32_t* posid, int32_t T) { // posid = {i , j} // state id = j + i*dim(i) + T*dim(i)*dim(j) return (posid[1] + posid[0]*gridDim.x + (T*gridDim.x*gridDim.y) ) ; } __device__ bool is_edge_state(int32_t i, int32_t j) { // n = gsize -1 that is the last index of the domain assuming square domain int32_t n = gridDim.x - 1; if (i == 0 || i == n || j == 0 || j == n ) { return true; } else return false; } __device__ bool is_terminal(int32_t i, int32_t j, float* params) { int32_t i_term = params[8]; // terminal state indices int32_t j_term = params[9]; if(i == i_term && j == j_term) { return true; } else return false; } __device__ bool my_isnan(int s) { // By IEEE 754 rule, NaN is not equal to NaN return s != s; } __device__ void get_xypos_from_ij(int32_t i, int32_t j, float* xs, float* ys, float* x, float* y) { *x = xs[j]; *y = ys[gridDim.x - 1 - i]; return; } __device__ float get_angle_in_0_2pi(float theta) { float f_pi = 3.141592; if (theta < 0) { return theta + (2*f_pi); } else { return theta; } } __device__ float calculate_reward_const_dt(float* xs, float* ys, int32_t i_old, int32_t j_old, float xold, float yold, int32_t* newposids, float* params, float vnet_x, float vnet_y ) { // xold and yold are centre of old state (i_old, j_old) float dt = params[4]; float r1, r2, theta1, theta2, theta, h; float dt_new; float xnew, ynew; if (newposids[0] == i_old && newposids[1] == j_old) { dt_new = dt; } else { get_xypos_from_ij(newposids[0], newposids[1], xs, ys, &xnew, &ynew); //get centre of new states h = sqrtf((xnew - xold)*(xnew - xold) + (ynew - yold)*(ynew - yold)); r1 = h/(sqrtf((vnet_x*vnet_x) + (vnet_y*vnet_y))); theta1 = get_angle_in_0_2pi(atan2f(vnet_y, vnet_x)); theta2 = get_angle_in_0_2pi(atan2f(ynew - yold, xnew - xold)); theta = fabsf(theta1 -theta2); r2 = fabsf(sinf(theta)); dt_new = r1 + r2; if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[24] = r1; params[25] = r2; } } return -dt_new; } __device__ void move(float ac_angle, float vx, float vy, float* xs, float* ys, int32_t* posids, float* params, float* r ) { int32_t n = params[0] - 1; // gsize - 1 // int32_t num_actions = params[1]; // int32_t nrzns = params[2]; float F = params[3]; float dt = params[4]; float r_outbound = params[5]; float r_terminal = params[6]; float Dj = fabsf(xs[1] - xs[0]); float Di = fabsf(ys[1] - ys[0]); float r_step = 0; *r = 0; int32_t i0 = posids[0]; int32_t j0 = posids[1]; float vnetx = F*cosf(ac_angle) + vx; float vnety = F*sinf(ac_angle) + vy; float x, y; get_xypos_from_ij(i0, j0, xs, ys, &x, &y); // x, y stores centre coords of state i0,j0 float xnew = x + (vnetx * dt); float ynew = y + (vnety * dt); //checks TODO: remove checks once verified if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[12] = x; params[13] = y; params[14] = vnetx; params[15] = vnety; params[16] = xnew; params[17] = ynew; params[18] = ac_angle; } if (xnew > xs[n]) { xnew = xs[n]; *r += r_outbound; } else if (xnew < xs[0]) { xnew = xs[0]; *r += r_outbound; } if (ynew > ys[n]) { ynew = ys[n]; *r += r_outbound; } else if (ynew < ys[0]) { ynew = ys[0]; *r += r_outbound; } // TODO:xxDONE check logic wrt remainderf. remquof had issue int32_t xind, yind; //float remx = remquof((xnew - xs[0]), Dj, &xind); //float remy = remquof(-(ynew - ys[n]), Di, &yind); float remx = remainderf((xnew - xs[0]), Dj); float remy = remainderf(-(ynew - ys[n]), Di); xind = ((xnew - xs[0]) - remx)/Dj; yind = (-(ynew - ys[n]) - remy)/Di; if ((remx >= 0.5 * Dj) && (remy >= 0.5 * Di)) { xind += 1; yind += 1; } else if ((remx >= 0.5 * Dj && remy < 0.5 * Di)) { xind += 1; } else if ((remx < 0.5 * Dj && remy >= 0.5 * Di)) { yind += 1; } if (!(my_isnan(xind) || my_isnan(yind))) { posids[0] = yind; posids[1] = xind; if (is_edge_state(posids[0], posids[1])) //line 110 { *r += r_outbound; } if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[26] = 9999; } } r_step = calculate_reward_const_dt(xs, ys, i0, j0, x, y, posids, params, vnetx, vnety); //TODO: change back to normal when needed //r_step = -dt; *r += r_step; //TODO: numerical check remaining if (is_terminal(posids[0], posids[1], params)) { *r += r_terminal; } if (threadIdx.x == 0 && blockIdx.z == 0 && blockIdx.x == 1 && blockIdx.y == 1) { params[19] = xnew; params[20] = ynew; params[21] = yind; params[22] = xind; params[23] = *r; //params[17] = ynew; //params[18] = ac_angle; } } __device__ void extract_velocity(float* vx, float* vy, int32_t T, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi, float* params) { int32_t nrzns = params[2]; int32_t nmodes = params[7]; int32_t sp_uvi, str_uvi, sp_Yi, str_Yi; //startpoints and strides for accessing all_ui_mat, all_vi_mat and all_Yi float sum_x = 0; float sum_y = 0; float vx_mean, vy_mean; //thread index. also used to access resultant vxrzns[nrzns, gsize, gsize] int32_t idx = get_thread_idx(); //rzn index to identify which of the 5k rzn it is. used to access all_Yi. int32_t rzn_id = (blockIdx.z * blockDim.x) + threadIdx.x ; //mean_id is the index used to access the flattened all_u_mat[t,i,j]. int32_t mean_id = state1D_from_thread(T); //to access all_ui_mat and all_vi_mat str_uvi = gridDim.x * gridDim.y; sp_uvi = (T * nmodes * str_uvi) + (gridDim.x * blockIdx.y) + (blockIdx.x); // to access all_Yi sp_Yi = (T * nrzns * nmodes) + (rzn_id * nmodes); vx_mean = all_u_mat[mean_id]; for(int i = 0; i < nmodes; i++) { sum_x += all_ui_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i]; } vy_mean = all_v_mat[mean_id]; for(int i = 0; i < nmodes; i++) { sum_y += all_vi_mat[sp_uvi + (i*str_uvi)]*all_Yi[sp_Yi + i]; } *vx = vx_mean + sum_x; *vy = vy_mean + sum_y; return; } //test: changer from float* to float ac_angle __global__ void transition_calc(float* T_arr, float* all_u_mat, float* all_v_mat, float* all_ui_mat, float* all_vi_mat, float* all_Yi, float ac_angle, float* xs, float* ys, float* params, float* sumR_sa, float* results) // resutls directions- 1: along S2; 2: along S1; 3: along columns towards count { int32_t gsize = params[0]; // size of grid along 1 direction. ASSUMING square grid. int32_t num_actions = params[1]; int32_t nrzns = params[2]; float F = params[3]; float dt = params[4]; float r_outbound = params[5]; float r_terminal = params[6]; int32_t nmodes = params[7]; int32_t i_term = params[8]; // terminal state indices int32_t j_term = params[9]; int32_t nT = params[10]; int32_t is_stationary = params[11]; int32_t T = (int32_t)T_arr[0]; int32_t idx = get_thread_idx(); float vx, vy; if(idx < gridDim.x*gridDim.y*nrzns) { int32_t posids[2] = {blockIdx.y, blockIdx.x}; //static declaration of array of size 2 to hold i and j values of S1. int32_t sp_id; //sp_id is space_id. S1%(gsize*gsize) // Afer move() these will be overwritten by i and j values of S2 float r; // to store immediate reward extract_velocity(&vx, &vy, T, all_u_mat, all_v_mat, all_ui_mat, all_vi_mat, all_Yi, params); //move(*ac_angle, vx, vy, xs, ys, posids, params, &r); move(ac_angle, vx, vy, xs, ys, posids, params, &r); int32_t S1, S2; if (is_stationary == 1) { T = 0; S1 = state1D_from_thread(T); //get init state number corresponding to thread id S2 = state1D_from_ij(posids, T); //get successor state number corresponding to posid and next timestep T+1 } else { S1 = state1D_from_thread(T); //get init state number corresponding to thread id S2 = state1D_from_ij(posids, T+1); //get successor state number corresponding to posid and next timestep T+1 sp_id = S1%(gsize*gsize); } //writing to sumR_sa. this array will later be divided by num_rzns, to get the avg float a = atomicAdd(&sumR_sa[sp_id], r); //TODO: try reduction if this is slow overall results[idx] = S2; __syncthreads(); /*if (threadIdx.x == 0 && blockIdx.z == 0) { sumR_sa[S1] = sumR_sa[S1]/nrzns; //TODO: change name to R_sa from sumR_sa since were not storing sum anymore } */ }//if ends return; } """) # sumR_sa2 = np.empty_like(sumR_sa, dtype = np.float32) # cuda.memcpy_dtoh(sumR_sa2, sumR_sa_gpu) # print("sumR_sa",sumR_sa) # print("sumR_sa",sumR_sa2[0:10001]) # T = np.array(T64, dtype = np.float32) params2 = np.empty_like(params).astype(np.float32) func = mod.get_function("transition_calc") for i in range(num_actions): print('T', T, " call kernel for action: ",i) func(T_gpu, all_u_mat_gpu, all_v_mat_gpu, all_ui_mat_gpu, all_vi_mat_gpu, all_Yi_gpu, ac_angles[i], xs_gpu, ys_gpu, params_gpu, sumR_sa_gpu_list[i], results_gpu_list[i], block=(bDimx, 1, 1), grid=(gsize, gsize, (nrzns // bDimx) + 1)) if i == 0: cuda.memcpy_dtoh(params2, params_gpu) print("params check:",) print( '\nangle= ', params2[18], '\nx =' ,params2[12], '\ny =' ,params2[13] , '\nvnetx = ',params2[14], '\nvnety =', params2[15], '\nxnew =', params2[16], '\nynew =', params2[17], '\nxnewupd =', params2[19], '\nynewupd =', params2[20], '\nyind i=', params2[21], '\nxind j=', params2[22], '\nr- =', params2[23], '\nr1+ =', params2[24], '\nr2+ =', params2[25], '\nenter_isnan =', params2[26] ) results2_list = [] sum_Rsa2_list = [] for i in range(num_actions): results2_list.append(np.empty_like(results)) sum_Rsa2_list.append(np.empty_like(sumR_sa)) # SYNCHRONISATION - pycuda does it implicitly. for i in range(num_actions): cuda.memcpy_dtoh(results2_list[i], results_gpu_list[i]) cuda.memcpy_dtoh(sum_Rsa2_list[i], sumR_sa_gpu_list[i]) print("memcpy_dtoh for action: ", i) for i in range(num_actions): sum_Rsa2_list[i] = sum_Rsa2_list[i] / nrzns # print("sumR_sa2\n",sumR_sa2,"\n\n") # print("results_a0\n",results2_list[0].T[50::int(gsize**2)]) print("OK REACHED END OF cuda relevant CODE\n") # make a list of inputs, each elelment for an action. and run parallal get_coo_ for each action # if save_file_for_each_a is true then each file must be named appopriately. if save_file_for_each_a == True: f1 = 'COO_Highway2D_T' + str(T) + '_a' f3 = '_of_' + str(num_actions) + 'A.npy' inputs = [(results2_list[i], nrzns, T, f1 + str(i) + f3) for i in range(num_actions)] else: inputs = [(results2_list[i], nrzns, T, None) for i in range(num_actions)] # coo_list_a is a list of coo for each each action for the given timestep. with Pool(num_actions) as p: coo_list_a = p.starmap(get_COO_, inputs) # print("coo print\n", coo.T[4880:4900, :]) print("\n\n") # print("time taken by cuda compute and transfer\n", (t2 - t1) / 60) # print("time taken for post processing to coo on cpu\n",(t3 - t2) / 60) return coo_list_a, sum_Rsa2_list
ex_gpu = cuda.to_device(f) ey_gpu = cuda.to_device(f) ez_gpu = cuda.to_device(f) hx_gpu = cuda.to_device(f) hy_gpu = cuda.to_device(f) hz_gpu = cuda.to_device(f) cex_gpu = cuda.to_device(set_c(cf, (None, -1, -1))) cey_gpu = cuda.to_device(set_c(cf, (-1, None, -1))) cez_gpu = cuda.to_device(set_c(cf, (-1, -1, None))) # prepare kernels from pycuda.compiler import SourceModule mod = SourceModule( kernels.replace('TPB', str(tpb)).replace('nyz', str(ny * nz)).replace( 'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz))) update_h = mod.get_function("update_h") update_e = mod.get_function("update_e") update_src = mod.get_function("update_src") thx = mod.get_texref("thx") thy = mod.get_texref("thy") thz = mod.get_texref("thz") tcex = mod.get_texref("tcex") tcey = mod.get_texref("tcey") tcez = mod.get_texref("tcez") thx.set_address(hx_gpu, f.nbytes) thy.set_address(hy_gpu, f.nbytes) thz.set_address(hz_gpu, f.nbytes) tcex.set_address(cex_gpu, cf.nbytes)
mod = SourceModule(""" #include <math.h> // Updates matrix A by adding a scalar c multiplied by another matrix B (i.e. A -= c * B) // Only operates on the rows specified by batch: A[batch_j] -= lr * B[batch_i] // batch is assumed to 1 x p, A and B have q columns, c is a scalar __global__ void BatchMatSubtractInplaceKernel(const int p, const int q, const float c, float *A, const float *B, const int *batch) { int batch_index = blockIdx.y * blockDim.y + threadIdx.y; if (batch_index >= p) return; int row = batch[batch_index]; int col = blockIdx.x * blockDim.x + threadIdx.x; if (col >= q) return; atomicAdd(&A[row * q + col], -c * B[row * q + col]); } // Perform the update step for b/b_tilde using gradient descent // a[batch] -= lr * b[batch] __global__ void BatchVecSubtractInplaceKernel(const int p, const float lr, float *a, const float *b, const int *batch) { int batch_index = blockIdx.x * blockDim.x + threadIdx.x; if (batch_index >= p) return; int ind = batch[batch_index]; atomicAdd(&a[ind], -lr * b[ind]); } // For matrix A and vector b, multiply the i'th row of A by b[i] and store the result into the j'th row of C // Performs this operations only for the corresponding arrays of integers batch_i, batch_j // batch is assumed to be 1 x p, A has q columns __global__ void BatchMatVecRowMultKernel(const int p, const int q, const float *A, const float *b, float *C, const int *batch_i, const int *batch_j){ int batch_index = blockIdx.y * blockDim.y + threadIdx.y; if (batch_index >= p) return; int col = blockIdx.x * blockDim.x + threadIdx.x; if (col >= q) return; int row_A = batch_i[batch_index]; int row_C = batch_j[batch_index]; C[row_C * q + col] = A[row_A * q + col] * b[batch_index]; } // Copies the values from vector a to b, operating only on item indices specified by batch // batch is assumed to be 1 x p __global__ void BatchCopyVectorKernel(const int p, const float *a, float *b, const int *batch) { int batch_index = blockIdx.x * blockDim.x + threadIdx.x; if (batch_index >= p) return; int ind = batch[batch_index]; b[ind] = a[ind]; } // result = np.array([W[i].dot(W_tilde[j]) for i, j in batch]) // result must start off as a zero array __global__ void BatchMatColDotKernel(const int p, const int q, const float *W, const float *W_tilde, const int *batch_i, const int *batch_j, float *result) { int batch_index = blockIdx.y * blockDim.y + threadIdx.y; if (batch_index >= p) return; int col = blockIdx.x * blockDim.x + threadIdx.x; if (col >= q) return; int row_W = batch_i[batch_index]; int row_W_tilde = batch_j[batch_index]; atomicAdd(&result[batch_index], W[row_W * q + col] * W_tilde[row_W_tilde * q + col]); } """)
def ping_array(self): return self._ping_cuda.array(0,0) @property def pong_array(self): return self._pong_cuda.array(0,0) Density = Slab(GridWidth, GridHeight, 1, gl.GL_LINEAR) mod = SourceModule(""" surface<void, 2> surf; __global__ void kernel(int width, int height) { // Calculate surface coordinates unsigned int x = blockIdx.x * blockDim.x + threadIdx.x; unsigned int y = blockIdx.y * blockDim.y + threadIdx.y; if (x < 400 && y < 400) { float data = x / 400.f; // Write to output surface surf2Dwrite(data, surf, x*4, y); } } """) kernel_function = mod.get_function('kernel') surface_ref = mod.get_surfref('surf') # surface_ref.set_array(Density.ping_array,0) surface_ref.set_array(Density.ping_array) def Program(fragment): program = gloo.Program("vertex_passthrough.vert", fragment, count=4) program['Position'] = [(-1,-1), (-1,+1), (+1,-1), (+1,+1)]
def CudaColor(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void co( float *inIm, int check, int color){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx*3 < check*3) { if(color == 0) { inIm[idx*3+1] = inIm[idx*3+1]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 1) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+2] = inIm[idx*3+2]-255; } else if(color == 2) { inIm[idx*3] = inIm[idx*3]-255; inIm[idx*3+1] = inIm[idx*3+1]-255; } if(inIm[idx*3] < 0) inIm[idx*3] = 0; if(inIm[idx*3] > 255) inIm[idx*3] = 255; if(inIm[idx*3+1] < 0) inIm[idx*3+1] = 0; if(inIm[idx*3+1] > 255) inIm[idx*3+1] = 255; if(inIm[idx*3+2] < 0) inIm[idx*3+2] = 0; if(inIm[idx*3+2] > 255) inIm[idx*3+2] = 255; } } """ color = int( raw_input("Enter the color of the filter (0-Red;1-Green;2-Blue): ")) print color = numpy.int32(color) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("co") func(d_px, checkSize, color, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() coPx = numpy.empty_like(px) cuda.memcpy_dtoh(coPx, d_px) coPx = (numpy.uint8(coPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(coPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Color Filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
def _diameter_kernel(): """Returns the CUDA kernel to estimate 3D diameter of structures in a 3D local window. """ diameter_kernel_src = """ texture<float, cudaTextureType3D, cudaReadModeElementType> tex_data; __global__ void diameter3d (unsigned int width, unsigned int height, unsigned int depth, const int n_points, const float norm_factor, const int max_iters, const int n_scan_angles, const int *X, const int *Y, const int *Z, const float *scan_angl_arr, const float *azth_data, const float *lat_data, float *radius_arr) { unsigned long blockId, idx; blockId = blockIdx.x + blockIdx.y * gridDim.x; idx = blockId * (blockDim.x * blockDim.y) + (threadIdx.y * blockDim.x) + threadIdx.x; if (idx > n_points) { return; } float _x, _y, _z; _x = (float)X[idx] + 0.5; _y = (float)Y[idx] + 0.5; _z = (float)Z[idx] + 0.5; // ----------------------------------------- // Find the diameter float azth = azth_data[idx]; azth += M_PI_2; float lat = lat_data[idx]; float cy = cosf(lat), sy = sinf(lat); float cz = cosf(azth), sz = sinf(azth); //float cz = -sinf(azth), sz = cosf(azth); // taking into account azth + pi/2 //vector along fiber //float dx = -sy * sz; //float dy = cy* sz; //float dz = cy; float uvec[3] = {0.0, 0.0, 1.0}; float fiber_vector_x[3] = {0.0, uvec[2]*cy + uvec[3]*sy, -uvec[2]*sy + uvec[3]*cy}; float fiber_vector_z[3] = {fiber_vector_x[0]*cz - fiber_vector_x[1]*sz, fiber_vector_x[0]*sz + fiber_vector_x[1]*cz, fiber_vector_x[2]}; float dx = fiber_vector_z[0], dy = fiber_vector_z[1], dz = fiber_vector_z[2]; //scan vector perpendicular to a fiber vector (rotation X -> Z) float scan_vec[3] = {0, 1, 0}; // unit vector perpendicular to default (0,0,1) direction float rot_scan_vec_x[3] = {0.0, scan_vec[1]*cy + scan_vec[2]*sy, -scan_vec[1]*sy + scan_vec[2]*cy}; float rot_scan_vec_z[3] = {rot_scan_vec_x[0]*cz - rot_scan_vec_x[1]*sz, rot_scan_vec_x[0]*sz + rot_scan_vec_x[1]*cz, rot_scan_vec_x[2]}; float out_radius = 0; for (int scan_angl_idx = 0; scan_angl_idx < n_scan_angles; scan_angl_idx++) { float theta = scan_angl_arr[scan_angl_idx]; float ct = cosf(theta), st = sinf(theta); float x = rot_scan_vec_z[0], y = rot_scan_vec_z[1], z = rot_scan_vec_z[2]; float u = dx, v = dy, w = dz; //rotation of point (x,y,z) around axis (u,v,w) float scan_vec_coords[3] = {u*(u*x + v*y + w*z)*(1.0f - ct) + x*ct + (-w*y + v*z)*st, v*(u*x + v*y + w*z)*(1.0f - ct) + y*ct + (w*x - u*z)*st, w*(u*x + v*y + w*z)*(1.0f - ct) + z*ct + (-v*x + u*y)*st}; float nc[3] = {_x, _y, _z}; float p[3]; for (int i = 0; i < max_iters; i++) { nc[0] += scan_vec_coords[0]; nc[1] += scan_vec_coords[1]; nc[2] += scan_vec_coords[2]; if (tex3D(tex_data, nc[0], nc[1], nc[2]) == 0) { p[0] = nc[0]; p[1] = nc[1]; p[2] = nc[2]; break; } } out_radius += norm3df(p[0] - _x, p[1] - _y, p[2] - _z); } radius_arr[idx] = out_radius * norm_factor; } """ dm_program = SourceModule(diameter_kernel_src) diameter3d = dm_program.get_function("diameter3d") return dm_program, diameter3d
def CudaNegative(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void ng( float *inIm, int check ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { inIm[idx*3]= 255-inIm[idx*3]; inIm[idx*3+1]= 255-inIm[idx*3+1]; inIm[idx*3+2]= 255-inIm[idx*3+2]; } } """ #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("ng") func(d_px, checkSize, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() ngPx = numpy.empty_like(px) cuda.memcpy_dtoh(ngPx, d_px) ngPx = (numpy.uint8(ngPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(ngPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Negative image" print "Image size: ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
def get_kernel(): kernel = SourceModule(""" #include <cstdint> #include <cfloat> __global__ void distance( uint32_t count, // size of incoming vectors (1D) and outgoing rates (2D) float *lats, // latitude of node N float *lons, // longitude of node N float *masses, // mass (population) of node N float p0, // gravity parameter 0, p0 * (m1 ** p1) * (m2 ** p2) * (distance ** p3) float p1, // gravity parameter 1 float p2, // gravity parameter 2 float p3, // gravity parameter 3 float *distances, // [row,col] rate from node row to node col float *rates // [row,col] rate from node row to node col ) { uint32_t x = blockIdx.x * blockDim.x + threadIdx.x; uint32_t y = blockIdx.y * blockDim.y + threadIdx.y; const double M_PI = 3.14; if ( (x < count) && (y < count) ) { float latx = lats[x] * 3.14159265f / 180; float lonx = lons[x] * 3.14159265f / 180; float laty = lats[y] * 3.14159265f / 180; float lony = lons[y] * 3.14159265f / 180; if ( (laty != latx) || (lony != lonx) ) { float popx = masses[x]; float popy = masses[y]; float a = 6378137; float b = 6356752.3142; float f = 1 / 298.257223563; float L = lony - lonx; float U1 = atan((1 - f) * tan(latx)); float U2 = atan((1 - f) * tan(laty)); float sinU1 = sin(U1); float cosU1 = cos(U1); float sinU2 = sin(U2); float cosU2 = cos(U2); float lambda = L; float lambdaP = 2 * M_PI; // Used in while iterations float sinlambda = 0.0; float coslambda = 0.0; float sinsigma = 0.0; float cossigma = 0.0; float sinalpha = 0.0; float cossqalpha = 0.0; float cos2sigmam = 0.0; float C = 0.0; float sigma = 0.0; uint32_t iterlimit = 20; float result = FLT_MAX; while (abs(lambda - lambdaP) > 1e-12 && --iterlimit > 0) { sinlambda = sin(lambda); coslambda = cos(lambda); sinsigma = sqrt((cosU2 * sinlambda) * (cosU2 * sinlambda) + (cosU1 * sinU2 - sinU1 * cosU2 * coslambda) * (cosU1 * sinU2 - sinU1 * cosU2 * coslambda)); if (sinsigma == 0) { result = 0.0f; break; //co-incident points } cossigma = sinU1 * sinU2 + cosU1 * cosU2 * coslambda; sigma = atan2(sinsigma, cossigma); sinalpha = cosU1 * cosU2 * sinlambda / sinsigma; cossqalpha = 1 - sinalpha * sinalpha; cos2sigmam = cossigma - 2 * sinU1 * sinU2 / cossqalpha; if (isnan(cos2sigmam)) cos2sigmam = 0; //equatorial line: cossqalpha=0 C = f / 16 * cossqalpha * (4 + f * (4 - 3 * cossqalpha)); lambdaP = lambda; lambda = L + (1 - C) * f * sinalpha * (sigma + C * sinsigma * (cos2sigmam + C * cossigma * (-1 + 2 * cos2sigmam * cos2sigmam))); } if (iterlimit > 0) { if (result != 0.0f) { float uSq = cossqalpha * (a * a - b * b) / (b * b); float A = 1 + uSq / 16384 * (4096 + uSq * (-768 + uSq * (320 - 175 * uSq))); float B = uSq / 1024 * (256 + uSq * (-128 + uSq * (74 - 47 * uSq))); float deltasigma = B * sinsigma * (cos2sigmam + B / 4 * (cossigma * (-1 + 2 * cos2sigmam * cos2sigmam) - B / 6 * cos2sigmam * (-3 + 4 * sinsigma * sinsigma) * (-3 + 4 * cos2sigmam * cos2sigmam))); float s = b * A * (sigma - deltasigma); result = float(s / 1000); } } else { result = float(nan("")); } float dist = result; distances[y * count + x] = dist; float rate = (dist != 0.0f) ? p0 * powf(popx, p1) * powf(popy, p2) * powf(dist, p3) : 0.0f; rates[y * count + x] = rate; // printf(\"Calculating %d (%f,%f:%d) -> %d (%f, %f:%d) = %f (%f)\\n\", x, latx, lonx, int(popx), y, laty, lony, int(popy), dist, rate); } else { // printf(\"Nodes %d and %d have the same lat/long.\\n\", x, y); distances[y * count + x] = 0.0f; rates[y * count + x] = 0.0f; } } return; } """) kernel_fn = kernel.get_function("distance") return kernel_fn
def CudaBrightness(inPath, outPath): totalT0 = time.clock() im = Image.open(inPath) px = numpy.array(im) px = px.astype(numpy.float32) getAndConvertT1 = time.clock() allocT0 = time.clock() d_px = cuda.mem_alloc(px.nbytes) cuda.memcpy_htod(d_px, px) allocT1 = time.clock() #Kernel declaration kernelT0 = time.clock() #Kernel grid and block size BLOCK_SIZE = 1024 block = (1024, 1, 1) checkSize = numpy.int32(im.size[0] * im.size[1]) grid = (int(im.size[0] * im.size[1] / BLOCK_SIZE) + 1, 1, 1) #Kernel text kernel = """ __global__ void br( float *inIm, int check, int brightness ){ int idx = (threadIdx.x ) + blockDim.x * blockIdx.x ; if(idx *3 < check*3) { if(inIm[idx*3]+brightness > 255) inIm[idx*3] = 255; else inIm[idx*3]= inIm[idx*3]+brightness; if(inIm[idx*3+1]+brightness > 255) inIm[idx*3+1] = 255; else inIm[idx*3+1]= inIm[idx*3+1]+brightness; if(inIm[idx*3+2]+brightness > 255) inIm[idx*3+2] = 255; else inIm[idx*3+2]= inIm[idx*3+2]+brightness; } } """ brightness = int( raw_input("Enter the level of brightness (-255 to 255): ")) print if brightness > 255: brightness = 255 if brightness < -255: brightness = -255 brightness = numpy.int32(brightness) #Compile and get kernel function mod = SourceModule(kernel) func = mod.get_function("br") func(d_px, checkSize, brightness, block=block, grid=grid) kernelT1 = time.clock() #Get back data from gpu backDataT0 = time.clock() brPx = numpy.empty_like(px) cuda.memcpy_dtoh(brPx, d_px) brPx = (numpy.uint8(brPx)) backDataT1 = time.clock() #Save image storeImageT0 = time.clock() pil_im = Image.fromarray(brPx, mode="RGB") pil_im.save(outPath) totalT1 = time.clock() getAndConvertTime = getAndConvertT1 - totalT0 allocTime = allocT1 - allocT0 kernelTime = kernelT1 - kernelT0 backDataTime = backDataT1 - backDataT0 storeImageTime = totalT1 - storeImageT0 totalTime = totalT1 - totalT0 print "Brightness filter" print "Image size : ", im.size print "Time taken to get and convert image data: ", getAndConvertTime print "Time taken to allocate memory on the GPU: ", allocTime print "Kernel execution time: ", kernelTime print "Time taken to get image data from GPU and convert it: ", backDataTime print "Time taken to save the image: ", storeImageTime print "Total execution time: ", totalTime print
__device__ int idint(){ const int blkId=blockIdx.x+blockIdx.y*gridDim.x+blockIdx.z*gridDim.x*gridDim.y; return threadIdx.x+threadIdx.y*blockDim.x+threadIdx.z*blockDim.x*blockDim.y+blkId*blockDim.x*blockDim.y*blockDim.z; } __global__ void test(Gfloat* outs, const Gfloat* ins, const int* sizes) { int thdId=idint(); if (thdId >= sizes[0]) { return; } outs[thdId] *= outs[thdId] * ins[thdId]; __syncthreads(); __shared__ int sharedata[128]; } """ mod = SourceModule(source) test = mod.get_function("test") size = 100 a = np.arange(size, dtype=np.float64) b = np.arange(size, dtype=np.float64) outs = drv.InOut(a) ins = drv.In(b) sizes = drv.In(np.array([size], dtype=np.int32)) test(outs, ins, sizes, block=(size, 1, 1), grid=(1, 1)) print("resule:", a.shape) def img2arr(img): if type(img) in [str, unicode]: import PIL.Image img = PIL.Image.open(img)
class TSDFVolume(object): def __init__(self, vol_bnds, voxel_size): # Define voxel volume parameters. self._vol_bnds = vol_bnds # 3x2, rows: (x, y, z), columns: (min, max) in world coordinates in meters self._voxel_size = voxel_size # in meters (determines volume discretization and resolution) self._trunc_margin = self._voxel_size * 5 # truncation on SDF # Adjust volume bounds. self._vol_dim = np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size).copy(order='C').astype( int) # ensure C-order contigous self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size self._vol_origin = self._vol_bnds[:, 0].copy(order='C').astype( np.float32) # ensure C-order contigous print("Voxel volume size: {:d} x {:d} x {:d}".format( self._vol_dim[0], self._vol_dim[1], self._vol_dim[2])) # Initialize pointers to voxel volume in CPU memory. self._tsdf_vol_cpu = np.ones(self._vol_dim).astype(np.float32) self._weight_vol_cpu = np.zeros(self._vol_dim).astype( np.float32 ) # for computing the cumulative moving average of observations per voxel self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) # Copy voxel volumes to GPU. if TSDF_GPU_MODE: self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes) cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu) self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes) cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu) self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes) cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu) # Cuda kernel function (C++) self._cuda_src_mod = SourceModule(""" __global__ void integrate(float * tsdf_vol, float * weight_vol, float * color_vol, float * vol_dim, float * vol_origin, float * cam_intr, float * cam_pose, float * other_params, float * color_im, float * depth_im) { // Get voxel index. int gpu_loop_idx = (int) other_params[0]; int max_threads_per_block = blockDim.x; int block_idx = blockIdx.z * gridDim.y * gridDim.x + blockIdx.y * gridDim.x + blockIdx.x; int voxel_idx = gpu_loop_idx * gridDim.x * gridDim.y * gridDim.z * max_threads_per_block + block_idx * max_threads_per_block + threadIdx.x; int vol_dim_x = (int)vol_dim[0]; int vol_dim_y = (int)vol_dim[1]; int vol_dim_z = (int)vol_dim[2]; if (voxel_idx > vol_dim_x * vol_dim_y * vol_dim_z) return; // Get voxel grid coordinates. float voxel_x = floorf(((float)voxel_idx) / ((float)(vol_dim_y * vol_dim_z))); float voxel_y = floorf(((float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z)) / ((float)vol_dim_z)); float voxel_z = (float)(voxel_idx - ((int)voxel_x) * vol_dim_y * vol_dim_z - ((int)voxel_y) * vol_dim_z); // Voxel grid coordinates to world coordinates. float voxel_size = other_params[1]; float pt_x = vol_origin[0] + voxel_x * voxel_size; float pt_y = vol_origin[1] + voxel_y * voxel_size; float pt_z = vol_origin[2] + voxel_z * voxel_size; // World coordinates to camera coordinates. float tmp_pt_x = pt_x - cam_pose[0*4+3]; float tmp_pt_y = pt_y - cam_pose[1*4+3]; float tmp_pt_z = pt_z - cam_pose[2*4+3]; float cam_pt_x = cam_pose[0*4+0] * tmp_pt_x + cam_pose[1*4+0] * tmp_pt_y + cam_pose[2*4+0] * tmp_pt_z; float cam_pt_y = cam_pose[0*4+1] * tmp_pt_x + cam_pose[1*4+1] * tmp_pt_y + cam_pose[2*4+1] * tmp_pt_z; float cam_pt_z = cam_pose[0*4+2] * tmp_pt_x + cam_pose[1*4+2] * tmp_pt_y + cam_pose[2*4+2] * tmp_pt_z; // Camera coordinates to image pixels. int pixel_x = (int) roundf(cam_intr[0*3+0] * (cam_pt_x / cam_pt_z) + cam_intr[0*3+2]); int pixel_y = (int) roundf(cam_intr[1*3+1] * (cam_pt_y / cam_pt_z) + cam_intr[1*3+2]); // Skip if outside view frustum. int im_h = (int) other_params[2]; int im_w = (int) other_params[3]; if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z < 0) return; // Skip invalid depth. float depth_value = depth_im[pixel_y*im_w+pixel_x]; if (depth_value == 0) return; // Integrate TSDF. float trunc_margin = other_params[4]; float depth_diff = depth_value-cam_pt_z; if (depth_diff < -trunc_margin) return; float dist = fmin(1.0f, depth_diff / trunc_margin); float w_old = weight_vol[voxel_idx]; float obs_weight = other_params[5]; float w_new = w_old + obs_weight; weight_vol[voxel_idx] = w_new; tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx] * w_old + dist) / w_new; // Integrate color. float old_color = color_vol[voxel_idx]; float old_b = floorf(old_color / (256 * 256)); float old_g = floorf((old_color - old_b * 256 * 256) / 256); float old_r = old_color - old_b * 256 * 256 - old_g * 256; float new_color = color_im[pixel_y*im_w+pixel_x]; float new_b = floorf(new_color / (256 * 256)); float new_g = floorf((new_color - new_b * 256 * 256) / 256); float new_r = new_color - new_b * 256 * 256 - new_g * 256; new_b = fmin(roundf((old_b*w_old + new_b) / w_new), 255.0f); new_g = fmin(roundf((old_g*w_old + new_g) / w_new), 255.0f); new_r = fmin(roundf((old_r*w_old + new_r) / w_new), 255.0f); color_vol[voxel_idx] = new_b * 256 * 256 + new_g * 256 + new_r; }""") self._cuda_integrate = self._cuda_src_mod.get_function("integrate") # Determine block/grid size on GPU. gpu_dev = cuda.Device(0) self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK n_blocks = int( np.ceil( float(np.prod(self._vol_dim)) / float(self._max_gpu_threads_per_block))) grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks)))) grid_dim_y = min(gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))) grid_dim_z = min( gpu_dev.MAX_GRID_DIM_Z, int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y)))) self._max_gpu_grid_dim = np.array( [grid_dim_x, grid_dim_y, grid_dim_z]).astype(int) self._n_gpu_loops = int( np.ceil( float(np.prod(self._vol_dim)) / float( np.prod(self._max_gpu_grid_dim) * self._max_gpu_threads_per_block))) def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.): im_h = depth_im.shape[0] im_w = depth_im.shape[1] # Fold RGB color image into a single channel image. color_im = color_im.astype(np.float32) color_im = np.floor(color_im[:, :, 2] * 256 * 256 + color_im[:, :, 1] * 256 + color_im[:, :, 0]) # GPU mode: integrate voxel volume (calls CUDA kernel). if TSDF_GPU_MODE: for gpu_loop_idx in range(self._n_gpu_loops): self._cuda_integrate( self._tsdf_vol_gpu, self._weight_vol_gpu, self._color_vol_gpu, cuda.InOut(self._vol_dim.astype(np.float32)), cuda.InOut(self._vol_origin.astype(np.float32)), cuda.InOut(cam_intr.reshape(-1).astype(np.float32)), cuda.InOut(cam_pose.reshape(-1).astype(np.float32)), cuda.InOut( np.asarray([ gpu_loop_idx, self._voxel_size, im_h, im_w, self._trunc_margin, obs_weight ], np.float32)), cuda.InOut(color_im.reshape(-1).astype(np.float32)), cuda.InOut(depth_im.reshape(-1).astype(np.float32)), block=(self._max_gpu_threads_per_block, 1, 1), grid=(int(self._max_gpu_grid_dim[0]), int(self._max_gpu_grid_dim[1]), int(self._max_gpu_grid_dim[2]))) # CPU mode: integrate voxel volume (vectorized implementation). else: # Get voxel grid coordinates. xv, yv, zv = np.meshgrid(range(self._vol_dim[0]), range(self._vol_dim[1]), range(self._vol_dim[2]), indexing='ij') vox_coords = np.concatenate( (xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)), axis=0).astype(int) # Voxel coordinates to world coordinates. world_pts = self._vol_origin.reshape( -1, 1) + vox_coords.astype(float) * self._voxel_size # World coordinates to camera coordinates. world2cam = np.linalg.inv(cam_pose) cam_pts = np.dot(world2cam[:3, :3], world_pts) + np.tile( world2cam[:3, 3].reshape(3, 1), (1, world_pts.shape[1])) # Camera coordinates to image pixels. pix_x = np.round(cam_intr[0, 0] * (cam_pts[0, :] / cam_pts[2, :]) + cam_intr[0, 2]).astype(int) pix_y = np.round(cam_intr[1, 1] * (cam_pts[1, :] / cam_pts[2, :]) + cam_intr[1, 2]).astype(int) # Skip if outside view frustum. valid_pix = np.logical_and( pix_x >= 0, np.logical_and( pix_x < im_w, np.logical_and( pix_y >= 0, np.logical_and(pix_y < im_h, cam_pts[2, :] > 0)))) depth_val = np.zeros(pix_x.shape) depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]] # Integrate TSDF. depth_diff = depth_val - cam_pts[2, :] valid_pts = np.logical_and(depth_val > 0, depth_diff >= -self._trunc_margin) dist = np.minimum(1., np.divide(depth_diff, self._trunc_margin)) w_old = self._weight_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] w_new = w_old + obs_weight self._weight_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] = w_new tsdf_vals = self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] self._tsdf_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] = np.divide( np.multiply(tsdf_vals, w_old) + dist[valid_pts], w_new) # Integrate color. old_color = self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[1, valid_pts], vox_coords[2, valid_pts]] old_b = np.floor(old_color / (256. * 256.)) old_g = np.floor((old_color - old_b * 256. * 256.) / 256.) old_r = old_color - old_b * 256. * 256. - old_g * 256. new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]] new_b = np.floor(new_color / (256. * 256.)) new_g = np.floor((new_color - new_b * 256. * 256.) / 256.) new_r = new_color - new_b * 256. * 256. - new_g * 256. new_b = np.minimum( np.round(np.divide(np.multiply(old_b, w_old) + new_b, w_new)), 255.) new_g = np.minimum( np.round(np.divide(np.multiply(old_g, w_old) + new_g, w_new)), 255.) new_r = np.minimum( np.round(np.divide(np.multiply(old_r, w_old) + new_r, w_new)), 255.) self._color_vol_cpu[vox_coords[0, valid_pts], vox_coords[ 1, valid_pts], vox_coords[ 2, valid_pts]] = new_b * 256. * 256. + new_g * 256. + new_r # Copy voxel volume to CPU. def get_volume(self): if TSDF_GPU_MODE: cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu) cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu) return self._tsdf_vol_cpu, self._color_vol_cpu # Get mesh of voxel volume via marching cubes. def get_mesh(self): tsdf_vol, color_vol = self.get_volume() # Marching cubes. verts, faces, norms, _ = measure.marching_cubes_lewiner(tsdf_vol, level=0) verts_ind = np.round(verts).astype(int) verts = verts * self._voxel_size + self._vol_origin # voxel grid coordinates to world coordinates # Get vertex colors. rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] colors_b = np.floor(rgb_vals / (256 * 256)) colors_g = np.floor((rgb_vals - colors_b * 256 * 256) / 256) colors_r = rgb_vals - colors_b * 256 * 256 - colors_g * 256 colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T colors = colors.astype(np.uint8) return verts, faces, norms, colors