def testBatchMatColDotKernel(): batch_size = 128 num_rows = 1000 num_cols = 200 c = 0.05 a_cpu = np.random.rand(num_rows, num_cols).astype('f') b_cpu = np.random.rand(num_rows, num_cols).astype('f') batch_i = np.random.choice(np.arange(num_rows, dtype=np.int32), size=batch_size, replace=False) batch_j = np.random.choice(np.arange(num_rows, dtype=np.int32), size=batch_size, replace=False) batch = zip(batch_i, batch_j) a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) batch_i_gpu = gpuarray.to_gpu(batch_i) batch_j_gpu = gpuarray.to_gpu(batch_j) result = gpuarray.zeros(batch_size, dtype=np.float32) func = mod.get_function("BatchMatColDotKernel") func(np.int32(batch_size), np.int32(num_cols), a_gpu, b_gpu, batch_i_gpu, batch_j_gpu, result, \ block=(num_threads_x, num_threads_y, 1), grid=(num_blocks_x, num_blocks_y)) context.synchronize() gpu_result = result.get() actual_result = np.array([a_cpu[i].dot(b_cpu[j]) for i, j in batch]) assertResultsClose(gpu_result, actual_result)
def testBatchCopyVectorKernel(): batch_size = 128 num_elems = 1000 a_cpu = np.random.rand(num_elems).astype('f') b_cpu = np.random.rand(num_elems).astype('f') batch = np.random.choice(np.arange(num_elems, dtype=np.int32), size=batch_size, replace=False) a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) batch_gpu = gpuarray.to_gpu(batch) func = mod.get_function("BatchCopyVectorKernel") func(np.int32(batch_size), a_gpu, b_gpu, batch_gpu, block=(num_threads_x * num_threads_y, 1, 1), \ grid=(num_blocks_x * num_blocks_y, 1)) context.synchronize() b_cpu[batch] = a_cpu[batch] gpu_result = b_gpu.get() actual_result = b_cpu assertResultsClose(gpu_result, actual_result)
def testBatchMatVecRowMultKernel(): batch_size = 128 num_rows = 1000 num_cols = 200 a_cpu = np.random.rand(num_rows, num_cols).astype('f') b_cpu = np.random.rand(batch_size).astype('f') c_cpu = np.random.rand(num_rows, num_cols).astype('f') batch_i = np.random.choice(np.arange(num_rows, dtype=np.int32), size=batch_size, replace=False) batch_j = np.random.choice(np.arange(num_rows, dtype=np.int32), size=batch_size, replace=False) a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) batch_i_gpu = gpuarray.to_gpu(batch_i) batch_j_gpu = gpuarray.to_gpu(batch_j) c_gpu = gpuarray.to_gpu(c_cpu) func = mod.get_function("BatchMatVecRowMultKernel") func(np.int32(batch_size), np.int32(num_cols), a_gpu, b_gpu, c_gpu, batch_i_gpu, batch_j_gpu, \ block=(num_threads_x, num_threads_y, 1), grid=(num_blocks_x, num_blocks_y)) context.synchronize() c_cpu[batch_j] = (a_cpu[batch_i].T * b_cpu).T gpu_result = c_gpu.get() actual_result = c_cpu assertResultsClose(gpu_result, actual_result)
def testBatchVecSubtractInplaceKernel(): batch_size = 128 num_elems = 1000 c = 0.05 a_cpu = np.random.rand(num_elems).astype('f') b_cpu = np.random.rand(num_elems).astype('f') batch = np.random.choice(np.arange(num_elems, dtype=np.int32), size=batch_size, replace=False) a_gpu = gpuarray.to_gpu(a_cpu) b_gpu = gpuarray.to_gpu(b_cpu) batch_gpu = gpuarray.to_gpu(batch) func = mod.get_function("BatchVecSubtractInplaceKernel") func(np.int32(batch_size), np.float32(c), a_gpu, b_gpu, batch_gpu, \ block=(num_threads_x, 1, 1), grid=(num_blocks_x, 1)) context.synchronize() a_cpu[batch] -= c * b_cpu[batch] gpu_result = a_gpu.get() actual_result = a_cpu assertResultsClose(gpu_result, actual_result)
def ring_allreduce(self, send, recv, op=None): op = self.get_op(op) send_buff = self.buff(send) recv_buff = self.buff(recv) accum = self.temp(send.shape) accum[:] = send[:] context.synchronize() left = ((self.rank - 1) + self.size) % self.size right = (self.rank + 1) % self.size for i in range(self.size - 1): if i % 2 == 0: # Send send_buff send_req = self.comm.Isend(send_buff, dest=right) self.comm.Recv(recv_buff, source=left) # accum[:] += recv[:] op(recv, accum) else: # Send recv_buff send_req = self.comm.Isend(recv_buff, dest=right) self.comm.Recv(send_buff, source=left) # accum[:] += send[:] op(send, accum) send_req.Wait() context.synchronize() recv[:] = accum[:]
def generateSignals(self): starttime = time.time() parameterCombinations = [] ffVector = np.linspace(0, 1, self.NFF) for ffInd in range(len(ffVector)): for t2Ind in range(len(self.T2Points)): for b1Ind in range(len(self.B1Points)): parameterCombinations.append( (self.T2Points[t2Ind], self.B1Points[b1Ind], ffVector[ffInd])) parameterCombinations = np.array(parameterCombinations, dtype=np.float32) nParams = parameterCombinations.shape[0] signalsOut_gpu = ga.zeros((nParams * self.NEchoes), np.float32) print("Compiling/loading CUDA module...") cuda_cpmg = getCudaFunction(self.NEchoes, self.EchoSpacing, self.T1f, self.T1w, self.MagPreparePulse) print("Generating signals...") params_gpu = ga.to_gpu(parameterCombinations.ravel()) sp90_gpu = ga.to_gpu(self.sliceProf90.squeeze().astype(np.float32)) sp180_gpu = ga.to_gpu(self.sliceProf180.squeeze().astype(np.float32)) nBlocks = int(np.ceil(float(nParams) / self.CudaBlockSize)) cuda_cpmg(np.uint32(nParams), np.uint32(self.sliceProf90.shape[0]), np.float32(self.fatT2), sp90_gpu, sp180_gpu, params_gpu, signalsOut_gpu, block=(self.CudaBlockSize, 1, 1), grid=(nBlocks, 1)) signalsOut_gpu = signalsOut_gpu.reshape((nParams, self.NEchoes)) context.synchronize() signalsOut = signalsOut_gpu.get() print("Done") print("Time taken:", time.time() - starttime) params_gpu.gpudata.free() sp90_gpu.gpudata.free() sp180_gpu.gpudata.free() self.allSignals = signalsOut self.parameterCombinations = parameterCombinations self.signalsReady = True
def insert(self, x): context.synchronize() if TESTGPU: [t1, t2] = self.forwardGPU(x) [self.out, self.deriv] = self.forwardCPU(x) for i in range(len(self.out)): assert np.fabs(t1[i] - self.out[i]) < TOL assert np.fabs(t2[i] - self.deriv[i]) < TOL elif GPU: [self.out, self.deriv] = self.forwardGPU(x) else: [self.out, self.deriv] = self.forwardCPU(x) context.synchronize() return self.out
def compute(volume, offset): bsize = (32, 32, 1) gsize = (int(volume[0] / bsize[0]), int(volume[1] / bsize[1]), int(volume[2])) DEFINES = '\n#define SCALE ' + str(1) + \ '\n#define WIDTH ' + str(volume[0]) + \ '\n#define HEIGHT ' + str(volume[1]) + \ '\n#define bwidth ' + str(bsize[0]) + \ '\n#define bheight ' + str(bsize[1]) + \ '\n#define offx ' + str(-offset[0]) + \ '\n#define offy ' + str(-offset[1]) + \ '\n#define offz ' + str(-offset[2]) + \ '\n#define DEPTH ' + str(volume[2]) + \ '\n#define bdepth ' + str(1) + '\n' # inutile # Non optimal method path = os.path.split(__file__)[0] + '/cuda/' kernel_cu = open(path + 'kernel.cu', 'r') kernel_buf = kernel_cu.read() # Load complex2.cu complex2_cu = open(path + 'complex2.cu', 'r') complex2_buf = complex2_cu.read() # Load vectors.cu vectors_cu = open(path + 'vectors.cu', 'r') vectors_buf = vectors_cu.read() # Import cu files inside the kernel and copy the defines cu_buffer = vectors_buf + '\n' + complex2_buf kernel_buf = kernel_buf.replace('%DEFINES%', DEFINES).replace('%CUFILES%', cu_buffer) mod = SourceModule(kernel_buf, "nvcc", include_dirs=["/usr/local/cuda/include"], no_extern_c=True) compute = mod.get_function("compute") # Array di uscita dest = np.zeros(volume[0] * volume[1] * volume[2]).astype(np.float32) compute(drv.Out(dest), block=bsize, grid=gsize) context.synchronize() return dest
def findmax_gpu(corrMatrix_gpu, ffValues_gpu, ffParams_gpu): nVoxels = ffValues_gpu.shape[0] nParams = ffParams_gpu.shape[0] indexOut_gpu = ga.zeros((nVoxels), np.int32) nBlocks = int(np.ceil(float(nVoxels) / CUDA_BLOCK_SIZE)) findmax_gpu_fn(corrMatrix_gpu, ffValues_gpu, np.uint32(nVoxels), ffParams_gpu, np.uint32(nParams), indexOut_gpu, block=(CUDA_BLOCK_SIZE, 1, 1), grid=(nBlocks, 1)) context.synchronize() indexOut_host = indexOut_gpu.get() indexOut_gpu.gpudata.free() return indexOut_host
def bfs_sa(root, g): v = g.vertex_count e = g.edge_count beg_pos = numpy.asarray(g.beg_pos, dtype = numpy.long) csr = numpy.asarray(g.csr, dtype = numpy.long) weight = numpy.asarray(g.weight, dtype = numpy.float64) sa = numpy.zeros(v, dtype = numpy.float64) flag_traverse = numpy.ones(1, dtype = numpy.bool) print "v=" + str(v) + ", e=" + str(e) traverse_one = mod.get_function("traverse_one") traverse_one(drv.In(beg_pos), drv.In(csr) ,drv.In(weight), drv.Out(sa), drv.In(flag_traverse), block = (v,1,1)) context.synchronize() print sa printresult(sa)
if (i < numElements) { res[i] = 23.0 * a[i] + b[i]; } } """) func = mod.get_function("zaxpy") # Warmup func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1)) func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1)) func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1)) context.synchronize() print(time.time()) start = time.time() for i in range(10): func(a_gpu, b_gpu, res_gpu, np.int64(size), block=(16, 16, 1), grid=(16, 16, 1)) context.synchronize() end = time.time() print(time.time())
devB2[:] = devB1 devC2 = ng.empty(dimC, dtype=np.float32) # devC2 = devC2s.share(dimC, dtype=np.float32) devC2[:] = devC1 if op[0] == 't': devA1, devA2 = devA1.T, devA2.T if op[1] == 't': devB1, devB2 = devB1.T, devB2.T for tile in (32,64,128): if op == 'nt' and tile != 128: continue try: ng.dot(devA1, devB1, devC1, alpha=alpha, beta=beta, size=tile) context.synchronize() cublas_dot(devA2, devB2, devC2, alpha=alpha, beta=beta) partial1 = ng.empty((devC1.shape[0],1), dtype=np.float32) partial2 = partial1[0:1,0:1] if ng.min(ng.finite(devC1), partial=partial1, out=partial2).get()[0,0] == 0.0: print("Error: NaN KCN: (%d,%d,%d) ab: (%f,%f) dtype: %d" % (K,C,N, alpha,beta, itemsize)) exit() diff = ng.max(abs(devC2 - devC1), partial=partial1, out=partial2).get()[0,0] mean = ng.mean(abs(devC2), partial=partial1, out=partial2).get()[0,0] pctErr = 100 * diff / mean
def _project( self, camera_projection: geo.CameraProjection, ) -> np.ndarray: """Perform the projection over just one image. Args: camera_projection (geo.CameraProjection): a camera projection transform. Raises: RuntimeError: if the projector has not been initialized. Returns: np.ndarray: the output projection for each material. """ if not self.initialized: raise RuntimeError("Projector has not been initialized.") # initialize projection-specific arguments camera_center_in_volume = np.array(camera_projection.get_center_in_volume(self.volume)).astype(np.float32) logger.debug(f'camera_center_ijk (source point): {camera_center_in_volume}') ijk_from_index = camera_projection.get_ray_transform(self.volume) ijk_from_index = np.array(ijk_from_index).astype(np.float32) # spacing spacing = self.volume.spacing # copy the projection matrix to CUDA (output array initialized to zero by the kernel) cuda.memcpy_htod(self.rt_kinv_gpu, ijk_from_index) # Make the arguments to the CUDA "projectKernel". args = [ np.int32(self.camera_intrinsics.sensor_width), # out_width np.int32(self.camera_intrinsics.sensor_height), # out_height np.float32(self.step), # step np.float32(-0.5), # gVolumeEdgeMinPointX np.float32(-0.5), # gVolumeEdgeMinPointY np.float32(-0.5), # gVolumeEdgeMinPointZ np.float32(self.volume.shape[0] - 0.5), # gVolumeEdgeMaxPointX np.float32(self.volume.shape[1] - 0.5), # gVolumeEdgeMaxPointY np.float32(self.volume.shape[2] - 0.5), # gVolumeEdgeMaxPointZ np.float32(spacing[0]), # gVoxelElementSizeX np.float32(spacing[1]), # gVoxelElementSizeY np.float32(spacing[2]), # gVoxelElementSizeZ camera_center_in_volume[0], # sx camera_center_in_volume[1], # sy camera_center_in_volume[2], # sz self.rt_kinv_gpu, # RT_Kinv self.output_gpu, # output ] # Calculate required blocks blocks_w = np.int(np.ceil(self.sensor_size[0] / self.threads)) blocks_h = np.int(np.ceil(self.sensor_size[1] / self.threads)) block = (8, 8, 1) # lfkj("running:", blocks_w, "x", blocks_h, "blocks with ", self.threads, "x", self.threads, "threads") if blocks_w <= self.max_block_index and blocks_h <= self.max_block_index: offset_w = np.int32(0) offset_h = np.int32(0) self.project_kernel(*args, offset_w, offset_h, block=block, grid=(blocks_w, blocks_h)) else: # lfkj("running kernel patchwise") for w in range((blocks_w - 1) // (self.max_block_index + 1)): for h in range((blocks_h - 1) // (self.max_block_index + 1)): offset_w = np.int32(w * self.max_block_index) offset_h = np.int32(h * self.max_block_index) self.project_kernel(*args, offset_w, offset_h, block=block, grid=(self.max_block_index, self.max_block_index)) context.synchronize() # copy the output to CPU output = np.empty(self.output_shape, np.float32) cuda.memcpy_dtoh(output, self.output_gpu) # transpose the axes, which previously have width on the slow dimension output = np.swapaxes(output, 0, 1).copy() # normalize to centimeters output /= 10 return output
def minimize_batch(self, batch_size, eta, opt_method): #using a batch_gradient descent method cost = 0.0 eps = 1e-8 # for use in adagrad for lower_index in range(0, len(self.nonzeros), batch_size): upper_index = min(lower_index + batch_size, len(self.nonzeros)) # index of first element after the end of this batch batch = [self.nonzeros[k] for k in range(lower_index, upper_index)] batch_i = [index[0] for index in batch] batch_j = [index[1] for index in batch] cur_batch_len = np.int32(upper_index - lower_index) batch_i_gpu = gpuarray.to_gpu(np.array(batch_i, dtype=np.int32)) batch_j_gpu = gpuarray.to_gpu(np.array(batch_j, dtype=np.int32)) cost_inner = gpuarray.zeros(batch_size, dtype=np.float32) weighted_cost_inner = gpuarray.zeros_like(cost_inner) # calculate intermediate values # cost_inner = + self.b[batch_i] + \ # self.b_tilde[batch_j] - np.log(np.array([self.cooccurrence_mat[k] for k in range(lower_index, upper_index)])) batchMatColDot(cur_batch_len, self.v_dim, self.W, self.W_tilde, batch_i_gpu, batch_j_gpu, cost_inner, \ block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y)) context.synchronize() if lower_index == 0: print cost_inner.get() batchCostInner(np.int32(lower_index), np.int32(upper_index), cost_inner, self.b, self.b_tilde, \ self.cooccurrence_mat, batch_i_gpu, batch_j_gpu, block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1)) if lower_index == 0: print cost_inner.get() context.synchronize() # weighted_cost_inner = np.array([self.f_x[k] for k in range(lower_index_upper_index)]) * cost_inner batchWeightedInnerCost(np.int32(lower_index), np.int32(upper_index), self.f_x, cost_inner, weighted_cost_inner, \ block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1)) if lower_index == 0: print weighted_cost_inner.get() context.synchronize() # calculate the gradients of each parameter # self.gradW[batch_i] = (self.W_tilde[batch_j].T * weighted_cost_inner).T batchMatVecRowMult(cur_batch_len, self.v_dim, self.W_tilde, weighted_cost_inner, self.gradW, batch_j_gpu, batch_i_gpu, \ block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y)) # self.gradW_tilde[batch_j] = (self.W[batch_i].T * weighted_cost_inner).T batchMatVecRowMult(cur_batch_len, self.v_dim, self.W, weighted_cost_inner, self.gradW_tilde, batch_i_gpu, batch_j_gpu, \ block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y)) # self.gradb[batch_i] = self.gradb_tilde[batch_j] = weighted_cost_inner batchCopyVector(cur_batch_len, weighted_cost_inner, self.b, batch_i_gpu, \ block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1)) batchCopyVector(cur_batch_len, weighted_cost_inner, self.b_tilde, batch_j_gpu, \ block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1)) context.synchronize() # perform the main parameter updates # self.W[batch_i] -= eta * self.gradW[batch_i] batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W, self.gradW, batch_i_gpu, \ block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y)) # self.W_tilde[batch_j] -= eta * self.gradW_tilde[batch_j] batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W_tilde, self.gradW_tilde, batch_j_gpu, \ block=(self.blockDim_x, self.blockDim_y, 1), grid=(self.numBlocks_x, self.numBlocks_y)) # self.b[batch_i] -= eta * self.gradb[batch_i] batchVecSubtractInplace(cur_batch_len, eta, self.b, self.gradb, batch_i_gpu, \ block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1)) # self.b_tilde[batch_j] -= eta * self.gradb_tilde[batch_j] batchVecSubtractInplace(cur_batch_len, eta, self.b_tilde, self.gradb_tilde, batch_j_gpu, \ block=(self.blockDim, 1, 1), grid=(self.numBlocks, 1)) context.synchronize() cost += gpuarray.dot(weighted_cost_inner, cost_inner).get() return cost
def project(self, proj_mat, threads=8, max_blockind=1024): if not self.initialized: print("Projector is not initialized") return inv_ar_mat, source_point = proj_mat.get_conanical_proj_matrix( voxel_size=self.voxelsize, volume_size=self.volumesize, origin_shift=self.origin) can_proj_matrix = inv_ar_mat.astype(np.float32) pixel_array = np.zeros( (self.proj_width, self.proj_height)).astype(np.float32) sourcex = source_point[0] sourcey = source_point[1] sourcez = source_point[2] g_volume_edge_min_point_x = np.float32(-0.5) g_volume_edge_min_point_y = np.float32(-0.5) g_volume_edge_min_point_z = np.float32(-0.5) g_volume_edge_max_point_x = np.float32(self.volumesize[0] - 0.5) g_volume_edge_max_point_y = np.float32(self.volumesize[1] - 0.5) g_volume_edge_max_point_z = np.float32(self.volumesize[2] - 0.5) g_voxel_element_size_x = self.voxelsize[0] g_voxel_element_size_y = self.voxelsize[1] g_voxel_element_size_z = self.voxelsize[2] #copy to gpu proj_matrix_gpu = cuda.mem_alloc(can_proj_matrix.nbytes) cuda.memcpy_htod(proj_matrix_gpu, can_proj_matrix) pixel_array_gpu = cuda.mem_alloc(pixel_array.nbytes) cuda.memcpy_htod(pixel_array_gpu, pixel_array) #calculate required blocks #threads = 8 blocks_w = np.int(np.ceil(self.proj_width / threads)) blocks_h = np.int(np.ceil(self.proj_height / threads)) print("running:", blocks_w, "x", blocks_h, "blocks with ", threads, "x", threads, "threads") if blocks_w <= max_blockind and blocks_h <= max_blockind: #run kernel offset_w = np.int32(0) offset_h = np.int32(0) self.projKernel(self.proj_width, self.proj_height, self.stepsize, g_volume_edge_min_point_x, g_volume_edge_min_point_y, g_volume_edge_min_point_z, g_volume_edge_max_point_x, g_volume_edge_max_point_y, g_volume_edge_max_point_z, g_voxel_element_size_x, g_voxel_element_size_y, g_voxel_element_size_z, sourcex, sourcey, sourcez, proj_matrix_gpu, pixel_array_gpu, offset_w, offset_h, block=(8, 8, 1), grid=(blocks_w, blocks_h)) else: print("running kernel patchwise") for w in range(0, (blocks_w - 1) // max_blockind + 1): for h in range(0, (blocks_h - 1) // max_blockind + 1): offset_w = np.int32(w * max_blockind) offset_h = np.int32(h * max_blockind) # print(offset_w, offset_h) self.projKernel(self.proj_width, self.proj_height, self.stepsize, g_volume_edge_min_point_x, g_volume_edge_min_point_y, g_volume_edge_min_point_z, g_volume_edge_max_point_x, g_volume_edge_max_point_y, g_volume_edge_max_point_z, g_voxel_element_size_x, g_voxel_element_size_y, g_voxel_element_size_z, sourcex, sourcey, sourcez, proj_matrix_gpu, pixel_array_gpu, offset_w, offset_h, block=(8, 8, 1), grid=(max_blockind, max_blockind)) context.synchronize() #context.synchronize() cuda.memcpy_dtoh(pixel_array, pixel_array_gpu) pixel_array = np.swapaxes(pixel_array, 0, 1) #normalize to cm return pixel_array / 10