def CFPLF_Hebbian_Sparse_GPU(projection): """ Sparse CF Projection learning function applying Hebbian learning to the weights in a projection. """ single_conn_lr = projection.learning_rate / projection.n_units # Transfering source and destination activities: src_activity_gpu = gpuarray.to_gpu_async( np.ravel(projection.src.activity).astype(np.float32), ) dest_activity_gpu = gpuarray.to_gpu_async( np.ravel(projection.dest.activity).astype(np.float32), ) # Computing Hebbian learning weights: projection.hebbian_kernel(single_conn_lr, projection.nzrows_gpu, projection.nzcols_gpu, src_activity_gpu, dest_activity_gpu, projection.weights_gpu.Val, range=slice(0, projection.nzcount, 1)) # Normalisation values: projection.weights_gpu.mv(projection.norm_ones_gpu, y=projection.norm_total_gpu, autosync=False) projection.has_norm_total = True
def cuda_sobel(m_wi):
    m_sobel = np.zeros_like(m_wi, np.float32)

    # Transfer image asynchronously.
    cu_m_wi = gpu.to_gpu_async(m_wi)
    cu_m_sobel = gpu.to_gpu_async(m_sobel)

    # Get block/grid size for steps 1-3.
    height, width = m_wi.shape
    block_hw = 32
    block_size = (block_hw, block_hw, 1)
    filter_radius = 1
    tile_size = block_hw - filter_radius * 2
    # Floor division keeps the grid dimensions integers, as PyCUDA requires.
    grid_width = (width + tile_size - 1) // tile_size
    grid_height = (height + tile_size - 1) // tile_size
    grid_size = (grid_width, grid_height)

    width = np.int32(width)
    height = np.int32(height)

    kernel_sobel(cu_m_wi, cu_m_sobel, width, height,
                 block=block_size, grid=grid_size)

    m_sobel = cu_m_sobel.get()
    return m_sobel
def __init__(self, Y, verbose=False): """Initialize instance. """ self._verbose = verbose self._Y_gpu = gpuarray.to_gpu_async(Y) self._Y_gpu_scratch = gpuarray.to_gpu_async(Y) self._featuredim = Y.shape[0] self._framecount = Y.shape[1]
def fft_stitch(N, plan2d, plan1d, hostarr, largebox_d): w = hostarr.shape[0] META_GRID_SIZE = w/N fftbatch = 2 for meta_z in xrange(META_GRID_SIZE): #fft along x largebox_d = gpuarray.to_gpu_async(hostarr[:, :, meta_z*N:(meta_z+1)*N].transpose(1,2,0)) #print largebox_d.shape plan1d.execute(largebox_d, batch=fftbatch, inverse=True) hostarr[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.real.get_async().transpose(2,0,1) for meta_x in xrange(META_GRID_SIZE): #fft along y, z largebox_d = gpuarray.to_gpu_async(hostarr[meta_x*N:(meta_x+1)*N, :, :].copy()) plan2d.execute(largebox_d, batch=fftbatch, inverse=True) hostarr[meta_x*N:(meta_x+1)*N, :, :] = largebox_d.real.get_async() return hostarr
def __init__(self, num_inputs=None, num_outputs=None, weights=None, b=None,
             stream=None, relu=False, sigmoid=False, delta=None):
    self.stream = stream

    if delta is None:
        self.delta = np.float32(0.001)
    else:
        self.delta = np.float32(delta)

    if weights is None:
        weights = (np.random.rand(num_outputs, num_inputs) - 0.5)

    if type(weights) != pycuda.gpuarray.GPUArray:
        self.weights = gpuarray.to_gpu_async(np.array(weights, dtype=np.float32),
                                             stream=self.stream)
    else:
        self.weights = weights

    # Derive the layer shape from the weight matrix when the sizes are not given;
    # calling np.int32(None) on a missing argument would raise a TypeError.
    if num_inputs is None or num_outputs is None:
        self.num_inputs = np.int32(self.weights.shape[1])
        self.num_outputs = np.int32(self.weights.shape[0])
    else:
        self.num_inputs = np.int32(num_inputs)
        self.num_outputs = np.int32(num_outputs)

    if b is None:
        b = gpuarray.zeros((self.num_outputs, ), dtype=np.float32)

    if type(b) != pycuda.gpuarray.GPUArray:
        self.b = gpuarray.to_gpu_async(np.array(b, dtype=np.float32),
                                       stream=self.stream)
    else:
        self.b = b

    self.relu = np.int32(relu)
    self.sigmoid = np.int32(sigmoid)

    self.block = (32, 1, 1)
    self.grid = (int(np.ceil(self.num_outputs / 32)), 1, 1)
def gpu_r2c_fft(in1, is_gpuarray=False, store_on_gpu=False):
    """
    This function makes use of the scikits implementation of the FFT for GPUs to take the real to
    complex FFT.

    INPUTS:
    in1             (no default):       The array on which the FFT is to be performed.
    is_gpuarray     (default=False):    Boolean specifier for whether or not the input is already on the gpu.
    store_on_gpu    (default=False):    Boolean specifier for whether the result is to be left on the gpu or not.

    OUTPUTS:
    gpu_out1                            The gpu array containing the result.
    OR
    gpu_out1.get()                      The result from the gpu array.
    """

    if is_gpuarray:
        gpu_in1 = in1
    else:
        gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32))

    output_size = np.array(in1.shape)
    output_size[1] = 0.5*output_size[1] + 1

    gpu_out1 = gpuarray.empty([output_size[0], output_size[1]], np.complex64)
    gpu_plan = Plan(gpu_in1.shape, np.float32, np.complex64)

    fft(gpu_in1, gpu_out1, gpu_plan)

    if store_on_gpu:
        return gpu_out1
    else:
        return gpu_out1.get()
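# A minimal, hypothetical usage sketch for gpu_r2c_fft above (not part of the
# original module). It assumes the imports that function relies on (numpy as np,
# pycuda.autoinit, pycuda.gpuarray as gpuarray, and Plan/fft from scikit-cuda's
# fft module) are already in place.
example_input = np.random.rand(64, 128).astype(np.float32)   # real-valued input
example_spectrum = gpu_r2c_fft(example_input)                 # returns a host ndarray by default
# An R2C FFT of 128 samples along the last axis yields 128//2 + 1 = 65 complex bins,
# so example_spectrum.shape is expected to be (64, 65).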
def batch_get_sol_params(x_nd, K_nn, bend_coefs, rot_coef=np.r_[1e-4, 1e-4, 1e-1]): n, d = x_nd.shape x_gpu = gpuarray.to_gpu(x_nd) H_arr_gpu = [] for b in bend_coefs: cur_offset = np.zeros((1 + d + n, 1 + d + n), np.float32) cur_offset[d+1:, d+1:] = b * K_nn cur_offset[1:d+1, 1:d+1] = np.diag(rot_coef) H_arr_gpu.append(gpuarray.to_gpu(cur_offset)) H_ptr_gpu = get_gpu_ptrs(H_arr_gpu) A = np.r_[np.zeros((d+1,d+1)), np.c_[np.ones((n,1)), x_nd]].T n_cnts = A.shape[0] _u,_s,_vh = np.linalg.svd(A.T) N = _u[:,n_cnts:] F = np.zeros((n + d + 1, d), np.float32) F[1:d+1, :d] -= np.diag(rot_coef) Q = np.c_[np.ones((n,1)), x_nd, K_nn].astype(np.float32) F = F.astype(np.float32) N = N.astype(np.float32) Q_gpu = gpuarray.to_gpu(Q) Q_arr_gpu = [Q_gpu for _ in range(len(bend_coefs))] Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu) F_gpu = gpuarray.to_gpu(F) F_arr_gpu = [F_gpu for _ in range(len(bend_coefs))] F_ptr_gpu = get_gpu_ptrs(F_arr_gpu) N_gpu = gpuarray.to_gpu(N) N_arr_gpu = [N_gpu for _ in range(len(bend_coefs))] N_ptr_gpu = get_gpu_ptrs(N_arr_gpu) dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu, Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu, transa = 'T') # N'HN NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'), (H_arr_gpu, H_ptr_gpu, 'N'), (N_arr_gpu, N_ptr_gpu, 'N')) iH_arr = [] for NHN in NHN_arr_gpu: iH_arr.append(scipy.linalg.inv(NHN.get()).copy()) iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr] iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu) proj_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'), (iH_arr_gpu, iH_ptr_gpu, 'N'), (N_arr_gpu, N_ptr_gpu, 'T'), (Q_arr_gpu, Q_ptr_gpu, 'T')) offset_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'), (iH_arr_gpu, iH_ptr_gpu, 'N'), (N_arr_gpu, N_ptr_gpu, 'T'), (F_arr_gpu, F_ptr_gpu, 'N')) return proj_mats, offset_mats
def full_scan():
    # TODO: testing how slow a single full scan is with no parallelism
    sequential = SourceModule("""
    #include <stdio.h>
    __global__ void full_scan(unsigned char *img, int line[2])
    {
        int counter = 0;
        for(int y=0; y<853; y++) {
            for(int x=0; x<1918; x++) {
                if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180)
                   && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) {
                    counter++;
                    if(counter == 50) {
                        line[0] = x;
                        line[1] = y;
                        return;
                    }
                }
                else {
                    counter = 0;
                }
            }
        }
    }
    """)
    image = cv.imread("test images/crop2.png")
    seq = sequential.get_function("full_scan")
    image_gpu = gpuarray.to_gpu_async(image)
    # The kernel writes 32-bit ints, so the host buffer must be int32; the default
    # int64 dtype would be misread when the result is copied back.
    line = np.array([0, 0], dtype=np.int32)
    timer = time.clock()
    seq(image_gpu, cuda.InOut(line), block=(1, 1, 1))
    print(time.clock() - timer)
    print(line)
def eval_(self, x, y=None, batch_size=None, stream=None, delta=None, w_t=None, b_t=None): if stream is None: stream = self.stream if type(x) != pycuda.gpuarray.GPUArray: x = gpuarray.to_gpu_async(np.array(x, dtype=np.float32), stream=self.stream) if batch_size is None: if len(x.shape) == 2: batch_size = np.int32(x.shape[0]) else: batch_size = np.int32(1) if delta is None: delta = self.delta delta = np.float32(delta) if w_t is None: w_t = np.int32(-1) if b_t is None: b_t = np.int32(-1) if y is None: if batch_size == 1: y = gpuarray.empty((self.num_outputs, ), dtype=np.float32) else: y = gpuarray.empty((batch_size, self.num_outputs), dtype=np.float32) eval_ker(self.num_outputs, self.num_inputs, self.relu, self.sigmoid, self.weights, self.b, x, y, np.int32(batch_size), w_t, b_t, delta, block=self.block, grid=self.grid, stream=stream) return y
def magnitude(vec, vec2):  #, fn = mod.get_function('magnitude')):
    #gpu_vec = drv.mem_alloc(vec.nbytes)
    #drv.memcpy_htod(gpu_vec, vec)
    #fn(gpu_vec, block=(512, 1, 1))
    #dest = drv.from_device_like(gpu_vec, vec)
    #print 'Dot product: ', dest[0]
    gpu_arry = gpuarr.to_gpu_async(vec)
    gpu_arry2 = gpuarr.to_gpu_async(vec2)
    mag = cumath.sqrt(gpuarr.dot(gpu_arry, gpu_arry, dtype=np.float32))
    mag2 = cumath.sqrt(gpuarr.dot(gpu_arry2, gpu_arry2, dtype=np.float32))
    # Normalise the dot product by both magnitudes; the original "/ mag + mag2"
    # dropped the parentheses and added mag2 instead of dividing by it.
    product = gpuarr.dot(gpu_arry, gpu_arry2, dtype=np.float32) / (mag * mag2)
    print product
    return product.get()
def row_scan(): # scan full rows each thread, multiple rows in parallel # TODO: this is bad for memory coalescing? # int bgr_pass = (img[x * 3 + y * 1918 * 3] <= 4) * (153 <= img[1 + x * 3 + y * 1918 * 3]) * ( # img[1 + x * 3 + y * 1918 * 3] <= 180) * # (196 <= img[2 + x * 3 + y * 1918 * 3]) * (img[2 + x * 3 + y * 1918 * 3] <= 210); # counter = counter * bgr_pass + bgr_pass; startscan = SourceModule(""" #include <stdio.h> __global__ void line_scan(unsigned char *img, int *flag) { int y = threadIdx.y + blockIdx.y * blockDim.y; int counter = 0; if(y > 852) { return; } for(int x=0; x<1918; x++) { int pix_g = img[1 + x*3 + y*1918*3]; int pix_r = img[2 + x*3 + y*1918*3]; if((img[x*3 + y*1918*3] < 5) & (152 < pix_g) & (pix_g < 181) & (195 < pix_r) & (pix_r < 211)) { counter++; if(counter == 50) { flag[0] = x; flag[1] = y; } } else { counter = 0; } } } """) image = cv.imread("test images/crop2.png") scan = startscan.get_function("line_scan") timer = time.clock() flag = np.array([0, 0]) image_gpu = gpuarray.to_gpu_async(image) flag_gpu = gpuarray.to_gpu_async(flag) scan(image_gpu, flag_gpu, block=(1, 32, 1), grid=(1, 27)) print(flag_gpu.get()) print(time.clock() - timer)
def _gpu_init(self, debug): self.dev_n = cuda.to_gpu_async(np.array([self.n]).astype(np.int32), stream=self.stream2) self.neighbors_index = cuda.to_gpu_async(self.num_neighbors, stream=self.stream1) exclusiveScan(self.neighbors_index, stream=self.stream1) self.dev_num_neighbors = cuda.to_gpu_async(self.num_neighbors, stream=self.stream2) self.dev_states = cuda.to_gpu_async(self.states, stream=self.stream2) self.dev_waypoints = cuda.to_gpu_async(self.waypoints, stream=self.stream2) self.dev_neighbors = cuda.to_gpu_async(self.neighbors, stream=self.stream2) self.dev_Gindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype) self.dev_xindicator = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype) self.dev_xindicator_zeros = cuda.GPUArray(self.Vopen.shape, self.Vopen.dtype) self.zero_val = np.zeros((), np.int32) self.dev_xindicator_zeros.fill(self.zero_val, stream=self.stream1) # self.stream1.synchronize() self.stream2.synchronize()
def __init__(self, array, dtype=None, allocator=mem_alloc, stream=None): self.dtype = array.dtype if dtype is None else dtype self.nnz = array.nnz self.shape = array.shape if self.nnz == 0: # let's not waste time return if not sparse.isspmatrix_csr(array): array = sparse.csr_matrix(array, dtype=self.dtype) if not array.has_sorted_indices: array = array.sorted_indices() if stream is not None: self.data = gpuarray.to_gpu_async(array.data.astype(dtype=self.dtype), allocator=allocator, stream=stream) self.indptr = gpuarray.to_gpu_async(array.indptr, allocator=allocator, stream=stream) self.indices = gpuarray.to_gpu_async(array.indices, allocator=allocator, stream=stream) else: self.data = gpuarray.to_gpu(array.data.astype(dtype=self.dtype), allocator=allocator) self.indptr = gpuarray.to_gpu(array.indptr, allocator=allocator) self.indices = gpuarray.to_gpu(array.indices, allocator=allocator) self.descr = cusparse.cusparseCreateMatDescr()
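# Hypothetical usage sketch for the CSR wrapper whose __init__ appears above.
# The class name CudaCSRMatrix is assumed here purely for illustration (the
# snippet only shows the constructor), as are imports of pycuda.autoinit,
# pycuda.driver as drv, scipy.sparse, and numpy as np. Note that to_gpu_async
# only overlaps with other work when the host buffers are page-locked; a plain
# scipy matrix is still copied correctly, just not asynchronously.
host_csr = scipy.sparse.random(1000, 1000, density=0.01, format='csr', dtype=np.float32)
copy_stream = drv.Stream()
mat_gpu = CudaCSRMatrix(host_csr, dtype=np.float32, stream=copy_stream)
copy_stream.synchronize()   # data/indptr/indices uploads are complete after this point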
def add_new_frame(self, x): x = x.flatten() assert len(x) == self._featuredim x_gpu = gpuarray.to_gpu_async(x) BLOCK_SIZE = (256,1,1) nblocks = int(np.ceil(float(self._featuredim) / BLOCK_SIZE[0])) GRID_SIZE = (nblocks, self._framecount, 1) cudabuffer.cyclebuffer(self._Y_gpu_scratch, x_gpu, self._Y_gpu, np.int32(self._featuredim), np.int32(self._framecount), block=BLOCK_SIZE, grid=GRID_SIZE) # Copy self._Y_gpu into self._Y_gpu_scratch cuda.memcpy_dtod_async(self._Y_gpu_scratch.gpudata, self._Y_gpu.gpudata, self._Y_gpu.nbytes)
def parrallel_scan(): # Array stored in memory in linear fashion, row by row, col by col, pix/bgr ie r0c0pb, r0c0pg, r0c0pr, r0c1pb # We should parallelize blocks of ~284y 1x 1z, keep array[853] to keep counter for each row, then we will be # scanning scanning across the image column by column, keeping counter for each row image = cv.imread("test images/crop2.png") modtest = SourceModule(""" #include <stdio.h> __global__ void line_scan(const unsigned char *img, int *counter) { //int x = blockIdx.x; //int y = threadIdx.y + blockIdx.y * blockDim.y; // transposed image height and width against x and y to get threads to run down then across columns of image int x = blockIdx.y; int y = 852 - (threadIdx.x + (blockIdx.x * 288)); if(y < 0) { return; } // attempt at removing thread divergence, doesnt seem to make any difference in speed //int bgr_pass = (img[x*3 + y*1918*3] <= 4)*(153 <= img[1 + x*3 + y*1918*3])*(img[1 + x*3 + y*1918*3] <= 180)* //(196 <= img[2 + x*3 + y*1918*3])*(img[2 + x*3 + y*1918*3] <= 210); //counter[y] = counter[y]*bgr_pass + bgr_pass; if((img[x*3 + y*1918*3] <= 4) && (153 <= img[1 + x*3 + y*1918*3]) && (img[1 + x*3 + y*1918*3] <= 180) && (196 <= img[2 + x*3 + y*1918*3]) && (img[2 + x*3 + y*1918*3] <= 210)) { counter[y] += 1; if(counter[y] == 50) { counter[853] = x; counter[854] = y; } } else { counter[y] = 0; } __syncthreads(); } """) func = modtest.get_function("line_scan") # TODO: added syncing of threads before they finish to try keep columns in sequence...not guaranteed to work? # blocks can still run out of order due to SMMs eg 50 line: last+1 line resets counter before last line is scanned test = np.array([[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]], [[12, 13, 14], [15, 16, 17], [18, 19, 20], [21, 22, 23]]]) counter = np.array([0 for _ in range(855)]) image_gpu = gpuarray.to_gpu_async(image) # counter_gpu = gpuarray.to_gpu_async(counter) # image_gpu = cuda.mem_alloc(image.nbytes) counter_gpu = cuda.mem_alloc(counter.nbytes) # cuda.memcpy_htod(image_gpu, image) cuda.memcpy_htod(counter_gpu, counter) timer = time.clock() func(image_gpu, counter_gpu, block=(288, 1, 1), grid=(3, 1918)) print(time.clock() - timer) cuda.memcpy_dtoh(counter, counter_gpu) print(counter[-2:])
def CFPRF_DotProduct_Sparse_GPU(projection): """ Sparse CF Projection response function calculating the dot-product between incoming activities and CF weights. Uses GPU. """ projection.input_buffer_pagelocked[:] = np.ravel( projection.input_buffer).astype(np.float32) projection.input_buffer_gpu = gpuarray.to_gpu_async( projection.input_buffer_pagelocked, stream=projection.pycuda_stream) projection.weights_gpu.mv(projection.input_buffer_gpu, alpha=projection.strength, y=projection.activity_gpu_buffer, autosync=False, stream=projection.pycuda_stream) projection.activity_gpu_buffer.get_async(ary=projection.activity, stream=projection.pycuda_stream)
def step_init(self, iter_parameters, debug): self.cost[self.start] = np.inf self.Vunexplored[self.start] = 1 self.Vopen[self.start] = 0 if self.start != iter_parameters['start'] and len(self.route) > 2: del self.route[-1] self.obstacles = iter_parameters['obstacles'] self.num_obs = iter_parameters['num_obs'] self.parent = np.full(self.n, -1).astype(np.int32) self.start = iter_parameters['start'] self.goal = iter_parameters['goal'] self.radius = iter_parameters['radius'] self.threshold = np.array([iter_parameters['threshold'] ]).astype(np.float32) self.cost[self.start] = 0 self.Vunexplored[self.start] = 0 self.Vopen[self.start] = 1 if debug: print('parents:', self.parent) print('cost: ', self.cost) print('Vunexplored: ', self.Vunexplored) print('Vopen: ', self.Vopen) self.dev_open = cuda.to_gpu_async(self.Vopen, stream=self.stream2) self.dev_threshold = cuda.to_gpu_async(self.threshold, stream=self.stream1) self.dev_radius = cuda.to_gpu_async(np.array([self.radius ]).astype(np.float32), stream=self.stream2) self.dev_obstacles = cuda.to_gpu_async(self.obstacles, stream=self.stream2) self.dev_num_obs = cuda.to_gpu_async(self.num_obs, stream=self.stream2) self.dev_parent = cuda.to_gpu_async(self.parent, stream=self.stream2) self.dev_cost = cuda.to_gpu_async(self.cost, stream=self.stream1) self.dev_unexplored = cuda.to_gpu_async(self.Vunexplored, stream=self.stream1)
def to_gpu_async(array, stream=None): """Copies the given CPU array asynchronously to the current device. Args: array: Array to be sent to GPU. If it is :class:`~numpy.ndarray`, then its memory must be pagelocked. stream (~pycuda.driver.Stream): CUDA stream. Returns: ~pycuda.gpuarray.GPUArray: Array on GPU. If given ``array`` is already on GPU, then this function just returns ``array`` without performing any copy. """ if isinstance(array, GPUArray): return array return gpuarray.to_gpu_async(array, allocator=mem_alloc, stream=stream)
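# A minimal usage sketch for the wrapper above, assuming pycuda.autoinit has
# been run and that mem_alloc is the device allocator imported by the original
# module. As the docstring notes, a numpy array must be page-locked for the copy
# to be truly asynchronous, so a pinned buffer is used here.
import numpy
import pycuda.driver as drv

copy_stream = drv.Stream()
pinned_host = drv.pagelocked_empty((1024,), dtype=numpy.float32)
pinned_host[:] = numpy.arange(1024, dtype=numpy.float32)

device_arr = to_gpu_async(pinned_host, stream=copy_stream)  # copy is queued on copy_stream
copy_stream.synchronize()                                   # wait before reusing pinned_host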
def batch_sum(a_arr_gpu, a_ptr_gpu): """ computes a sum of all of the arrays pointed to by a_arr_gpu and a_ptr_gpu """ if len(a_arr_gpu[0].shape) != 1: n, m = a_arr_gpu[0].shape total_size = n * m flat_a_gpu = [a.ravel() for a in a_arr_gpu] else: total_size = a_arr_gpu[0].shape[0] flat_a_gpu = a_arr_gpu ones_vec = gpuarray.to_gpu_async(np.ones((total_size, 1), dtype=np.float32)) ones_arr_gpu = [ones_vec for i in range(len(a_arr_gpu))] ones_ptr_gpu = get_gpu_ptrs(ones_arr_gpu) res_arr, res_ptrs = dot_batch(flat_a_gpu, ones_arr_gpu, a_ptr_gpu, ones_ptr_gpu) return [r.get()[0] for r in res_arr]
def __init__(self, Xpoints, numMixtures): print "GPU Implementation Chosen" LikelihoodEvaluator.__init__(self, Xpoints, numMixtures) #Pycuda imports import pycuda.autoinit from pycuda import gpuarray from pycuda.compiler import SourceModule self.gpuarray = gpuarray with open("KernelV2.cu") as f: if self.numPoints >= 1024: mod = SourceModule(f.read().replace('512', '1024')) self.numThreads = 1024 else: mod = SourceModule(f.read()) self.numThreads = 512 if self.numPoints > self.numThreads: self.numBlocks = self.numPoints / self.numThreads if self.numPoints % self.numThreads != 0: self.numBlocks += 1 else: self.numBlocks = 1 print "numBlocks: {}, numPoints: {}".format(self.numBlocks, self.numPoints) #Set the right number of threads and blocks given the datasize #Using a max of 1024 threads, fix correct blocksize self.likelihoodKernel = mod.get_function("likelihoodKernel") self.likelihoodKernel.prepare('PPPPiiiP') self.Xpoints = self.Xpoints.astype(np.float32) self.Xpoints = gpuarray.to_gpu_async(self.Xpoints) self.means_gpu = gpuarray.zeros(shape = (self.numMixtures, self.dim), dtype = np.float32) self.diagCovs_gpu = gpuarray.zeros(shape = (self.numMixtures, self.dim), dtype = np.float32) self.weights_gpu = gpuarray.zeros(shape = self.numMixtures, dtype = np.float32) self.llVal = gpuarray.zeros(shape = self.numBlocks, dtype=np.float32)
def eval_(self, x, y=None, batch_size=None, stream=None): if stream is None: stream = self.stream if type(x) != pycuda.gpuarray.GPUArray: temp = np.array(x, dtype=np.float32) x = gpuarray.to_gpu_async(temp, stream=stream) if batch_size == None: if len(x.shape) == 2: batch_size = np.int32(x.shape[0]) else: batch_size = np.int32(1) else: batch_size = np.int32(batch_size) if y is None: if batch_size == 1: y = gpuarray.empty((self.num, ), dtype=np.float32) else: y = gpuarray.empty((batch_size, self.num), dtype=np.float32) exp_ker(self.num, x, y, batch_size, block=(32, 1, 1), grid=(int(np.ceil(self.num / 32)), 1, 1), stream=stream) mean_ker(self.num, y, y, batch_size, block=(32, 1, 1), grid=(int(np.ceil(batch_size / 32)), 1, 1), stream=stream) return y
def __init__(self, Xpoints, numMixtures): super().__init__(Xpoints, numMixtures) import pycuda.autoinit # must be run from pycuda import gpuarray from pycuda.compiler import SourceModule self.gpuarray = gpuarray self.numThreads, self.numBlocks = chooseGridThread(self.numPoints) with open(self.cuFile) as f: # have to use a replace here since we can't guarantee the format working # with the c syntax. A bit hacky sadly, but easier than mallocing mod = SourceModule(f.read().replace("MAX_THREADS", str(self.numThreads))) # print("numBlocks: {}, numPoints: {}".format(self.numBlocks, self.numPoints)) # Set the right number of threads and blocks given the datasize self.likelihoodKernel = mod.get_function("likelihoodKernel") # this is telling PyCUDA what args I'll be passing through later # same syntax as struct module (think pack unpack) self.likelihoodKernel.prepare('PPPPiiiP') self.Xpoints = self.Xpoints.astype(np.float32) # dump the X in the GPU memory so we don't need to keep transferring # note we assume size(Xpoints) < GPU memory self.Xpoints = gpuarray.to_gpu_async(self.Xpoints) # we allocate memory for the parameters so this is only done on startup self.means_gpu = gpuarray.zeros(shape=(self.numMixtures, self.dim), dtype=np.float32) self.diagCovs_gpu = gpuarray.zeros(shape=(self.numMixtures, self.dim), dtype=np.float32) self.weights_gpu = gpuarray.zeros(shape=self.numMixtures, dtype=np.float32) # Allocate Memory for all our computations self.llVal = gpuarray.zeros(shape=self.numBlocks, dtype=np.float32)
def gpu_mandelbrot(width, height, real_low, real_high, imag_low, imag_high, max_iters, upper_bound): # we set up our complex lattice as such real_vals = np.matrix(np.linspace(real_low, real_high, width), dtype=np.complex64) imag_vals = np.matrix(np.linspace(imag_high, imag_low, height), dtype=np.complex64) * 1j mandelbrot_lattice = np.array(real_vals + imag_vals.transpose(), dtype=np.complex64) # copy complex lattice to the GPU # changed this to be explicitly synchronized. (to: to_gpu_async) # We can copy to the GPU asynchronously with to_gpu_async, and then synchronize as follows: mandelbrot_lattice_gpu = gpuarray.to_gpu_async(mandelbrot_lattice) # We can access the current context object with pycuda.autoinit.context, # and we can synchronize in our current context by calling the pycuda.autoinit.context.synchronize() function. # synchronize in current context pycuda.autoinit.context.synchronize() # allocate an empty array on the GPU # allocates memory on the GPU with the gpuarray.empty function. Memory allocation in CUDA is, # by the nature of the GPU architecture, automatically synchronized; there is no asynchronous memory # allocation equivalent here. mandelbrot_graph_gpu = gpuarray.empty(shape=mandelbrot_lattice.shape, dtype=np.float32) mandel_ker(mandelbrot_lattice_gpu, mandelbrot_graph_gpu, np.int32(max_iters), np.float32(upper_bound)) pycuda.autoinit.context.synchronize() mandelbrot_graph = mandelbrot_graph_gpu.get_async() pycuda.autoinit.context.synchronize() return mandelbrot_graph
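# Hedged usage sketch for gpu_mandelbrot above (not from the original text);
# mandel_ker is assumed to be the compiled element-wise kernel defined alongside
# the function in the original example, with pycuda.autoinit, gpuarray and numpy
# already imported.
from time import time

t1 = time()
mandel = gpu_mandelbrot(512, 512, -2.0, 2.0, -2.0, 2.0, 256, 2.0)
t2 = time()
print('It took %s seconds to calculate the Mandelbrot graph.' % (t2 - t1))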
def _gpu_init(self, debug): self.dev_n = cuda.to_gpu_async(np.array([self.n]).astype(np.int32), stream=self.stream2) self.neighbors_index = cuda.to_gpu_async(self.num_neighbors, stream=self.stream1) exclusiveScan(self.neighbors_index, stream=self.stream1) self.dev_num_neighbors = cuda.to_gpu_async(self.num_neighbors, stream=self.stream2) self.dev_states = cuda.to_gpu_async(self.states, stream=self.stream2) self.dev_waypoints = cuda.to_gpu_async(self.waypoints, stream=self.stream2) self.dev_neighbors = cuda.to_gpu_async(self.neighbors, stream=self.stream2) self.stream1.synchronize() self.stream2.synchronize()
] y_pin = [ cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams]) for i in range(nStreams) ] h = cublas.cublasCreate() x_gpu = np.empty(nStreams, dtype=object) y_gpu = np.empty(nStreams, dtype=object) ans = np.empty(nStreams, dtype=object) for i in range(nStreams): cublas.cublasSetStream(h, streams[i].handle) x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i]) y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i]) cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1, y_gpu[i].gpudata, 1) ans[i] = y_gpu[i].get_async(stream=streams[i]) cublas.cublasDestroy(h) # Uncomment to check for errors in the calculation #y_gpu = np.array([yg.get() for yg in y_gpu]) #y_gpu = np.array(y_gpu).reshape(y.shape) #print np.allclose(y_gpu, a*x + y) e.record() e.synchronize()
def gpu_source_extraction(in1, tolerance, store_on_gpu, neg_comp): """ The following function determines connectivity within a given wavelet decomposition. These connected and labelled structures are thresholded to within some tolerance of the maximum coefficient at the scale. This determines whether on not an object is to be considered as significant. Significant objects are extracted and factored into a mask which is finally multiplied by the wavelet coefficients to return only wavelet coefficients belonging to significant objects across all scales. This GPU accelerated version speeds up the extraction process. INPUTS: in1 (no default): Array containing the wavelet decomposition. tolerance (no default): Percentage of maximum coefficient at which objects are deemed significant. store_on_gpu(no default): Boolean specifier for whether the decomposition is stored on the gpu or not. OUTPUTS: objects*in1 The wavelet coefficients of the significant structures. objects The mask of the significant structures - if store_on_gpu is True, returns a gpuarray. """ # The following are pycuda kernels which are executed on the gpu. Specifically, these both perform thresholding # operations. The gpu is much faster at this on large arrays due to their massive parallel processing power. ker1 = SourceModule(""" __global__ void gpu_mask_kernel1(int *in1, int *in2) { const int len = gridDim.x*blockDim.x; const int i = (blockDim.x * blockIdx.x + threadIdx.x); const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len; const int tid2 = i + j; if (in1[tid2] == in2[0]) { in1[tid2] = -1; } } """, keep=True) ker2 = SourceModule(""" __global__ void gpu_mask_kernel2(int *in1) { const int len = gridDim.x*blockDim.x; const int i = (blockDim.x * blockIdx.x + threadIdx.x); const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len; const int tid2 = i + j; if (in1[tid2] >= 0) { in1[tid2] = 0; } else { in1[tid2] = 1; } } """, keep=True) ker3 = SourceModule(""" __global__ void gpu_store_objects(int *in1, float *out1, int *scale) { const int len = gridDim.x*blockDim.x; const int i = (blockDim.x * blockIdx.x + threadIdx.x); const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len; const int k = (blockDim.z * blockIdx.z + threadIdx.z)*(len*len); const int tid2 = i + j; const int tid3 = i + j + k; if (blockIdx.z==scale[0]) { out1[tid3] = in1[tid2]; } } """, keep=True) # The following initialises some variables for storing the labelled image and the number of labels. The per scale # maxima are also initialised here. scale_maxima = np.empty([in1.shape[0],1], dtype=np.float32) objects = np.empty_like(in1, dtype=np.int32) object_count = np.empty([in1.shape[0],1], dtype=np.int32) # The following loop uses functionality from the ndimage module to assess connectivity. The maxima are also # calculated here. for i in range(in1.shape[0]): if neg_comp: scale_maxima[i] = np.max(abs(in1[i,:,:])) else: scale_maxima[i] = np.max(in1[i,:,:]) objects[i,:,:], object_count[i] = ndimage.label(in1[i,:,:], structure=[[1,1,1],[1,1,1],[1,1,1]]) # The following bind the pycuda kernels to the expressions on the left. gpu_mask_kernel1 = ker1.get_function("gpu_mask_kernel1") gpu_mask_kernel2 = ker2.get_function("gpu_mask_kernel2") # If store_on_gpu is the following handles some initialisation. 
if store_on_gpu: gpu_store_objects = ker3.get_function("gpu_store_objects") gpu_objects = gpuarray.empty(objects.shape, np.float32) gpu_idx = gpuarray.zeros([1], np.int32) gpu_idx += (objects.shape[0]-1) # The following removes the insignificant objects and then extracts the remaining ones. for i in range(-1,-in1.shape[0]-1,-1): condition = tolerance*scale_maxima[i] if neg_comp: if i==(-1): tmp = (abs(in1[i,:,:])>=condition)*objects[i,:,:] else: tmp = (abs(in1[i,:,:])>=condition)*objects[i,:,:]*objects[i+1,:,:] else: if i==(-1): tmp = (in1[i,:,:]>=condition)*objects[i,:,:] else: tmp = (in1[i,:,:]>=condition)*objects[i,:,:]*objects[i+1,:,:] labels = (np.unique(tmp[tmp>0])).astype(np.int32) gpu_objects_page = gpuarray.to_gpu_async(objects[i,:,:].astype(np.int32)) for j in labels: label = gpuarray.to_gpu_async(np.array(j)) gpu_mask_kernel1(gpu_objects_page, label, block=(32,32,1), grid=(in1.shape[1]//32, in1.shape[1]//32)) gpu_mask_kernel2(gpu_objects_page, block=(32,32,1), grid=(in1.shape[1]//32, in1.shape[1]//32)) objects[i,:,:] = gpu_objects_page.get() # In the event that all operations are to be done on the GPU, the following stores a version of the objects # on the GPU. A handle to the gpuarray is then returned. if store_on_gpu: gpu_store_objects(gpu_objects_page, gpu_objects, gpu_idx, block=(32,32,1), grid=(objects.shape[2]//32, objects.shape[1]//32, objects.shape[0])) gpu_idx -= 1 if store_on_gpu: return objects*in1, gpu_objects else: return objects*in1, objects
def init_stitch(N): """outputs the high resolution k-box, and the smoothed r box Input ----------- N: int32 size of box to load onto the GPU, should be related to DIM by powers of 2 """ if N is None: N = np.int32(HII_DIM) #prepare for stitching META_GRID_SIZE = DIM/N M = np.int32(HII_DIM/META_GRID_SIZE) #HII_DIM = np.int32(HII_DIM) f_pixel_factor = DIM/HII_DIM; scale = np.float32(BOX_LEN/DIM) print 'scale', scale HII_scale = np.float32(BOX_LEN/HII_DIM) shape = (DIM,DIM,N) stitch_grid_size = (DIM/(block_size[0]), DIM/(block_size[0]), N/(block_size[0])) HII_stitch_grid_size = (HII_DIM/(block_size[0]), HII_DIM/(block_size[0]), M/(block_size[0])) #ratio of large box to small size kernel_source = open(cmd_folder+"/initialize_stitch.cu").read() kernel_code = kernel_source % { 'DELTAK': DELTA_K, 'DIM': DIM, 'VOLUME': VOLUME, 'META_BLOCKDIM': N } main_module = nvcc.SourceModule(kernel_code) init_stitch = main_module.get_function("init_kernel") HII_filter = main_module.get_function("HII_filter") subsample_kernel = main_module.get_function("subsample") velocity_kernel = main_module.get_function("set_velocity") pspec_texture = main_module.get_texref("pspec") MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=0) plan2d = Plan((np.int64(DIM), np.int64(DIM)), dtype=np.complex64) plan1d = Plan((np.int64(DIM)), dtype=np.complex64) print "init pspec" interpPspec, interpSize = init_pspec() #interpPspec contains both k array and P array interp_cu = cuda.matrix_to_array(interpPspec, order='F') cuda.bind_array_to_texref(interp_cu, pspec_texture) #hbox_large = pyfftw.empty_aligned((DIM, DIM, DIM), dtype='complex64') hbox_large = np.zeros((DIM, DIM, DIM), dtype=np.complex64) #hbox_small = np.zeros(HII_shape, dtype=np.float32) #hbox_large = n smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) # Set up pinned memory for transfer #largebox_hs = cuda.aligned_empty(shape=shape, dtype=np.float32, alignment=resource.getpagesize()) largebox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.float32) largecbox_pin = cuda.pagelocked_empty(shape=shape, dtype=np.complex64) largebox_d = gpuarray.zeros(shape, dtype=np.float32) largebox_d_imag = gpuarray.zeros(shape, dtype=np.float32) print "init boxes" for meta_z in xrange(META_GRID_SIZE): # MRGgen = MRG32k3aRandomNumberGenerator(seed_getter=seed_getter_uniform, offset=meta_x*N**3) init_stitch(largebox_d, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size) init_stitch(largebox_d_imag, DIM, np.int32(meta_z),block=block_size, grid=stitch_grid_size) largebox_d *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d_imag *= MRGgen.gen_normal(shape, dtype=np.float32) largebox_d = largebox_d + np.complex64(1.j) * largebox_d_imag cuda.memcpy_dtoh_async(largecbox_pin, largebox_d) hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largecbox_pin.copy() #if want to get velocity need to use this if True: print "saving kbox" np.save(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large) print "Executing FFT on device" #hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real print hbox_large.dtype print "Finished FFT on device" np.save(parent_folder+"/Boxes/deltax_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN), hbox_large) if True: print "loading kbox" hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN)) for meta_z in xrange(META_GRID_SIZE): largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy() 
#cuda.memcpy_htod_async(largebox_d, largebox_pin) largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) HII_filter(largebox_d, DIM, np.int32(meta_z), ZERO, smoothR, block=block_size, grid=stitch_grid_size); hbox_large[:, :, meta_z*N:(meta_z+1)*N] = largebox_d.get_async() #import IPython; IPython.embed() print "Executing FFT on host" #hbox_large = hifft(hbox_large).astype(np.complex64).real #hbox_large = pyfftw.interfaces.numpy_fft.ifftn(hbox_large).real hbox_large = fft_stitch(N, plan2d, plan1d, hbox_large, largebox_d).real print "Finished FFT on host" #import IPython; IPython.embed() # for meta_x in xrange(META_GRID_SIZE): # for meta_y in xrange(META_GRID_SIZE): # for meta_z in xrange(META_GRID_SIZE): # largebox_d = gpuarray.to_gpu(hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N]) # HII_filter(largebox_d, N, np.int32(meta_x), np.int32(meta_y), np.int32(meta_z), ZERO, smoothR, block=block_size, grid=grid_size); # hbox_large[meta_x*N:(meta_x+1)*N, meta_y*N:(meta_y+1)*N, meta_z*N:(meta_z+1)*N] = largebox_d.get() #plan = Plan(shape, dtype=np.complex64) #plan.execute(largebox_d, inverse=True) #FFT to real space of smoothed box #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) # This saves a large resolution deltax print "downsampling" smallbox_d = gpuarray.zeros((HII_DIM,HII_DIM,M), dtype=np.float32) for meta_z in xrange(META_GRID_SIZE): largebox_pin = hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy() cuda.memcpy_dtoh_async(largecbox_pin, largebox_d) #largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) largebox_d /= scale**3 # subsample_kernel(largebox_d, smallbox_d, DIM, HII_DIM, PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) #subsample in real space hbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallbox_d.get_async() np.save(parent_folder+"/Boxes/smoothed_deltax_z0.00_{0:d}_{1:.0f}Mpc".format(HII_DIM, BOX_LEN), hbox_small) #import IPython; IPython.embed() # To get velocities: reload the k-space box hbox_large = np.load(parent_folder+"/Boxes/deltak_z0.00_{0:d}_{1:.0f}Mpc.npy".format(DIM, BOX_LEN)) hvbox_large = np.zeros((DIM, DIM, DIM), dtype=np.float32) hvbox_small = np.zeros(HII_shape, dtype=np.float32) smoothR = np.float32(L_FACTOR*BOX_LEN/HII_DIM) largevbox_d = gpuarray.zeros((DIM,DIM,N), dtype=np.complex64) smallvbox_d = gpuarray.zeros((HII_DIM, HII_DIM, M), dtype=np.float32) for num, mode in enumerate(['x', 'y', 'z']): for meta_z in xrange(META_GRID_SIZE): largebox_d = gpuarray.to_gpu_async(hbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) #largebox_d /= VOLUME #divide by VOLUME if using fft (vs ifft) velocity_kernel(largebox_d, largevbox_d, DIM, np.int32(meta_z), np.int32(num), block=block_size, grid=stitch_grid_size) HII_filter(largevbox_d, DIM, ZERO, smoothR, block=block_size, grid=stitch_grid_size) print hvbox_large.shape, largevbox_d.shape hvbox_large[:, :, meta_z*N:(meta_z+1)*N] = largevbox_d.get_async() hvbox_large = fft_stitch(N, plan2d, plan1d, hvbox_large, largevbox_d).real for meta_z in xrange(META_GRID_SIZE): largevbox_d = gpuarray.to_gpu_async(hvbox_large[:, :, meta_z*N:(meta_z+1)*N].copy()) subsample_kernel(largevbox_d.real, smallvbox_d, DIM, HII_DIM,PIXEL_FACTOR, block=block_size, grid=HII_stitch_grid_size) hvbox_small[:, :, meta_z*M:(meta_z+1)*M] = smallvbox_d.get_async() np.save(parent_folder+"/Boxes/v{0}overddot_{1:d}_{2:.0f}Mpc".format(mode, HII_DIM, BOX_LEN), smallvbox_d.get()) return
def allocate_globals(O): d_O = gpuarray.to_gpu_async(O) return d_O
def __init__(self, coordinate, mass, box_size, cutoff, eps, sig, dt=1): ''' Constructor ''' autoinit.context.set_cache_config(driver.func_cache.PREFER_L1) # Scalars self.dfloat = 'float32' self.dint = 'int32' self.iter = np.int32(0) self.kb = np.float32(8.6173324e-5) self.ions = np.int32(coordinate.shape[0]) self.mass = np.float32(mass) self.cutoff = np.float32(cutoff) self.eps = np.float32(eps) self.sig = np.float32(sig) self.dt = np.float32(dt) self.threads = 256 self.block = (self.threads, 1, 1) self.grid = (ceil(self.ions / self.threads), 1, 1) self.ion_type = ['Ar'] * self.ions # float3 self.box_size = np.array(box_size).astype(self.dfloat) # 1D arrays self.potential_energy = gpuarray.zeros(self.ions, np.float32) self.kinetic_energy = gpuarray.zeros(self.ions, np.float32) # 3D arrays self.coordinate = gpuarray.to_gpu_async(coordinate.astype(self.dfloat)) self.coordinate_sorted = gpuarray.to_gpu_async( coordinate.astype(self.dfloat)) self.velocity = gpuarray.zeros_like(self.coordinate) self.velocity_sorted = gpuarray.zeros_like(self.coordinate) self.force = gpuarray.zeros_like(self.coordinate) # Timers self.start = driver.Event() self.end = driver.Event() self.timer = 0 # System data self.system_pe = np.array([]).astype(self.dfloat) self.system_ke = np.array([]).astype(self.dfloat) # Create the bins self.create_bins() # Load kernels float3 = gpuarray.vec.float3 int3 = gpuarray.vec.int3 self.prefixsum = DoubleBuffer(self.bins, self.threads) self.fill_bins = self.load_function('lj_force.cu', 'fillBins', ('PPP', float3, int3, 'i')) self.counting_sort = self.load_function('lj_force.cu', 'countingSort', 'PPPPPPi') self.lj_force = self.load_function( 'lj_force.cu', 'ljForce', ('PPPPP', float3, float3, int3, 'fffi')) self.verlet_pre = self.load_function('lj_force.cu', 'verletPre', ('PPPPP', float3, 'ffi')) self.verlet_pos = self.load_function('lj_force.cu', 'verletPos', 'PPPffi') return
import skcuda.cublas as cublas import skcuda s = cuda.Event() e = cuda.Event() s.record() nStreams = 8 stream = [cuda.Stream() for i in range(nStreams)] N = 8192 print skcuda.misc.get_current_device() x = [np.asarray(np.random.rand(N/nStreams), np.float32) for i in range(nStreams)] #x_pin = cuda.register_host_memory(x) #xf = np.fft.fft(x) x_gpu = [gpuarray.to_gpu_async(x[i], stream=stream[i]) for i in range(nStreams)] xf_gpu = [gpuarray.empty((N/nStreams)/2 + 1, np.complex64) for i in range(nStreams)] plan = [Plan(x[0].shape, np.float32, np.complex64, stream=stream[i]) for i in range(nStreams)] print skcuda.misc.get_current_device() for i in range(nStreams): fft(x_gpu[i], xf_gpu[i], plan[i]) print skcuda.misc.get_current_device() x_pin = [xf_gpu[i].get_async(stream=stream[i]) for i in range(nStreams)] #print np.allclose(xf[0:N/2 + 1], xf_gpu.get(), atol=1e-6) e.record() e.synchronize() print s.time_till(e), "ms"
def gpu_iuwt_recomposition(in1, scale_adjust, store_on_gpu, smoothed_array): """ This function calls the a trous algorithm code to recompose the input into a single array. This is the implementation of the isotropic undecimated wavelet transform recomposition for a GPU. INPUTS: in1 (no default): Array containing wavelet coefficients. scale_adjust (no default): Indicates the number of omitted array pages. store_on_gpu (no default): Boolean specifier for whether the decomposition is stored on the gpu or not. OUTPUTS: recomposiiton Array containing the reconstructed array. """ wavelet_filter = (1./16)*np.array([1,4,6,4,1], dtype=np.float32) # Filter-bank for use in the a trous algorithm. wavelet_filter = gpuarray.to_gpu_async(wavelet_filter) # Determines scale with adjustment and creates a zero array on the GPU to store the output,unless smoothed_array # is given. max_scale = in1.shape[0] + scale_adjust if smoothed_array is None: recomposition = gpuarray.zeros([in1.shape[1], in1.shape[2]], np.float32) else: recomposition = gpuarray.to_gpu(smoothed_array.astype(np.float32)) # Determines whether the array is already on the GPU or not. If not, moves it to the GPU. try: gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32)) except: gpu_in1 = in1 # Creates a working array on the GPU. gpu_tmp = gpuarray.empty_like(recomposition) # Creates and fills an array with the appropriate scale value. gpu_scale = gpuarray.zeros([1], np.int32) gpu_scale += max_scale-1 # Fetches the a trous kernels. gpu_a_trous_row_kernel, gpu_a_trous_col_kernel = gpu_a_trous() grid_rows = int(in1.shape[1]//32) grid_cols = int(in1.shape[2]//32) # The following loops call the a trous algorithm code to recompose the input. The first loop assumes that there are # non-zero wavelet coefficients at scales above scale_adjust, while the second loop completes the recomposition # on the scales less than scale_adjust. for i in range(max_scale-1, scale_adjust-1, -1): gpu_a_trous_row_kernel(recomposition, gpu_tmp, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_a_trous_col_kernel(gpu_tmp, recomposition, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) recomposition = recomposition[:,:] + gpu_in1[i-scale_adjust,:,:] gpu_scale -= 1 if scale_adjust>0: for i in range(scale_adjust-1, -1, -1): gpu_a_trous_row_kernel(recomposition, gpu_tmp, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_a_trous_col_kernel(gpu_tmp, recomposition, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_scale -= 1 # Return values depend on mode. if store_on_gpu: return recomposition else: return recomposition.get()
def gpu_iuwt_decomposition(in1, scale_count, scale_adjust, store_smoothed, store_on_gpu): """ This function calls the a trous algorithm code to decompose the input into its wavelet coefficients. This is the isotropic undecimated wavelet transform implemented for a GPU. INPUTS: in1 (no default): Array on which the decomposition is to be performed. scale_count (no default): Maximum scale to be considered. scale_adjust (no default): Adjustment to scale value if first scales are of no interest. store_smoothed (no default): Boolean specifier for whether the smoothed image is stored or not. store_on_gpu (no default): Boolean specifier for whether the decomposition is stored on the gpu or not. OUTPUTS: detail_coeffs Array containing the detail coefficients. C0 (optional): Array containing the smoothest version of the input. """ # The following simple kernel just allows for the construction of a 3D decomposition on the GPU. ker = SourceModule(""" __global__ void gpu_store_detail_coeffs(float *in1, float *in2, float* out1, int *scale, int *adjust) { const int len = gridDim.x*blockDim.x; const int i = (blockDim.x * blockIdx.x + threadIdx.x); const int j = (blockDim.y * blockIdx.y + threadIdx.y)*len; const int k = (blockDim.z * blockIdx.z + threadIdx.z)*(len*len); const int tid2 = i + j; const int tid3 = i + j + k; if ((blockIdx.z + adjust[0])==scale[0]) { out1[tid3] = in1[tid2] - in2[tid2]; } } """) wavelet_filter = (1./16)*np.array([1,4,6,4,1], dtype=np.float32) # Filter-bank for use in the a trous algorithm. wavelet_filter = gpuarray.to_gpu_async(wavelet_filter) # Initialises an empty array to store the detail coefficients. detail_coeffs = gpuarray.empty([scale_count-scale_adjust, in1.shape[0], in1.shape[1]], np.float32) # Determines whether the array is already on the GPU or not. If not, moves it to the GPU. try: gpu_in1 = gpuarray.to_gpu_async(in1.astype(np.float32)) except: gpu_in1 = in1 # Sets up some working arrays on the GPU to prevent memory transfers. gpu_tmp = gpuarray.empty_like(gpu_in1) gpu_out1 = gpuarray.empty_like(gpu_in1) gpu_out2 = gpuarray.empty_like(gpu_in1) # Sets up some parameters required by the algorithm on the GPU. gpu_scale = gpuarray.zeros([1], np.int32) gpu_adjust = gpuarray.zeros([1], np.int32) gpu_adjust += scale_adjust # Fetches the a trous kernels and sets up the unique storing kernel. gpu_a_trous_row_kernel, gpu_a_trous_col_kernel = gpu_a_trous() gpu_store_detail_coeffs = ker.get_function("gpu_store_detail_coeffs") grid_rows = int(in1.shape[0]//32) grid_cols = int(in1.shape[1]//32) # The following loop, which iterates up to scale_adjust, applies the a trous algorithm to the scales which are # considered insignificant. This is important as each set of wavelet coefficients depends on the last smoothed # version of the input. if scale_adjust>0: for i in range(0, scale_adjust): gpu_a_trous_row_kernel(gpu_in1, gpu_tmp, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_a_trous_col_kernel(gpu_tmp, gpu_out1, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_in1, gpu_out1 = gpu_out1, gpu_in1 gpu_scale += 1 # The meat of the algorithm - two sequential applications fo the a trous followed by determination and storing of # the detail coefficients. C0 is reassigned the value of C on each loop - C0 is always the smoothest version of the # input image. 
for i in range(scale_adjust, scale_count): gpu_a_trous_row_kernel(gpu_in1, gpu_tmp, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_a_trous_col_kernel(gpu_tmp, gpu_out1, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) # Approximation coefficients. gpu_a_trous_row_kernel(gpu_out1, gpu_tmp, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) gpu_a_trous_col_kernel(gpu_tmp, gpu_out2, wavelet_filter, gpu_scale, block=(32,32,1), grid=(grid_cols, grid_rows)) # Approximation coefficients. gpu_store_detail_coeffs(gpu_in1, gpu_out2, detail_coeffs, gpu_scale, gpu_adjust, block=(32,32,1), grid=(grid_cols, grid_rows, int(scale_count))) # Detail coefficients. gpu_in1, gpu_out1 = gpu_out1, gpu_in1 gpu_scale += 1 # Return values depend on mode. NOTE: store_smoothed does not work if the result stays on the gpu. if store_on_gpu: return detail_coeffs elif store_smoothed: return detail_coeffs.get(), gpu_in1.get() else: return detail_coeffs.get()
self.stream.synchronize() def fromDevice(self, buf, shape=None): res = buf.get() if shape is not None: res = res.reshape(shape) return res def toDevice(self, buf, shape=None, async=False, dest=None): if shape is not None: buf = buf.reshape(shape) if dest is None: if async: # FIXME: there must be a warning in docs that buf has to be pagelocked return gpuarray.to_gpu_async(buf, stream=self.stream) else: return gpuarray.to_gpu(buf) else: cuda.memcpy_htod_async(dest.gpudata, buf, stream=None) def copyBuffer(self, buf, dest=None, src_offset=0, dest_offset=0, length=None): elem_size = buf.dtype.itemsize size = buf.nbytes if length is None else elem_size * length src_offset *= elem_size dest_offset *= elem_size if dest is None: ddest = self.allocate(buf.shape, buf.dtype)
# We can get a stream object from the pycuda.driver submodule with the Stream class for _ in range(num_arrays): streams.append(drv.Stream()) # generate random arrays. for _ in range(num_arrays): data.append(np.random.randn(array_len).astype('float32')) t_start = time() # copy arrays to GPU. # switch to the asynchronous and stream-friendly version of this function, # gpu_array.to_gpu_async, instead. (We must now also specify which stream # each memory operation should use with the stream parameter) for k in range(num_arrays): data_gpu.append(gpuarray.to_gpu_async(data[k], stream=streams[k])) # process arrays. # This is exactly as before, only we must specify what stream to use by using the stream parameter for k in range(num_arrays): mult_ker(data_gpu[k], np.int32(array_len), block=(64, 1, 1), grid=(1, 1, 1), stream=streams[k]) # copy arrays from GPU. # We can do this by switching the gpuarray get function to get_async, and again using the stream parameter for k in range(num_arrays): gpu_out.append(data_gpu[k].get_async(stream=streams[k]))
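# Hedged continuation sketch for the streams example above (not from the
# original text): get_async only queues the device-to-host copies, so the host
# arrays in gpu_out are not guaranteed valid until the streams are synchronized.
# The names streams, gpu_out, and t_start come from the surrounding code; time
# is assumed to be the time() function imported at the top of the script.
for stream in streams:
    stream.synchronize()

t_end = time()
print('Total time: %f seconds' % (t_end - t_start))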
def watershed(I): # Get contiguous image + shape. height, width = I.shape I = np.float32(I.copy()) # Get block/grid size for steps 1-3. block_size = (6,6,1) grid_size = (width/(block_size[0]-2), height/(block_size[0]-2)) # Get block/grid size for step 4. block_size2 = (16,16,1) grid_size2 = (width/(block_size2[0]-2), height/(block_size2[0]-2)) # Initialize variables. labeled = np.zeros([height,width]) labeled = np.float32(labeled) width = np.int32(width) height = np.int32(height) count = np.int32([0]) # Transfer labels asynchronously. labeled_d = gpu.to_gpu_async(labeled) counter_d = gpu.to_gpu_async(count) # Bind CUDA textures. I_cu = cu.matrix_to_array(I, order='C') cu.bind_array_to_texref(I_cu, image_texture) # Step 1. descent_kernel(labeled_d, width, height, block=block_size, grid=grid_size) start_time = cu.Event() end_time = cu.Event() start_time.record() # Step 2. increment_kernel(labeled_d,width,height, block=block_size2,grid=grid_size2) counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new minima_kernel(labeled_d, counters_d, width, height, block=block_size, grid=grid_size) new = counters_d.get()[0] # Step 3. counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new plateau_kernel(labeled_d, counters_d, width, height, block=block_size, grid=grid_size) new = counters_d.get()[0] # Step 4 counters_d = gpu.to_gpu(np.int32([0])) old, new = -1, -2 while old != new: old = new flood_kernel(labeled_d, counters_d, width, height, block=block_size2, grid=grid_size2) new = counters_d.get()[0] result = labeled_d.get() # End GPU timers. end_time.record() end_time.synchronize() gpu_time = start_time.\ time_till(end_time) * 1e-3 # print str(gpu_time) return result
func(cuda.InOut(a_cpu), block=(4, 4, 1)) print a_cpu # # ---- prepared call ------------------------------------------- # grid = (1, 1) block = (4, 4, 1) func.prepare("P") func.prepared_call(grid, block, a_gpu) # # ---- GPUArray ------------------------------------------- # b_gpu = gpuarray.to_gpu(np.random.randn(4, 4).astype(np.float32)) b_doubled = (2 * b_gpu).get() print b_gpu print b_doubled # # ---- GPUArray ------------------------------------------- # b_gpu = gpuarray.to_gpu_async(np.random.randn(100, 100).astype(np.float32)) b_doubled = (2 * b_gpu).get_async() print "This is Async..........." print b_gpu print b_doubled
def test_batch_get_sol_params(f, bend_coefs, rot_coef, atol=1e-7, index=0): seg_info = f.items()[index][1] inv_group = seg_info['inv'] ds_key = 'DS_SIZE_{}'.format(DS_SIZE) x_nd = inv_group[ds_key]['scaled_cloud_xyz'][:] K_nn = inv_group[ds_key]['scaled_K_nn'][:] n, d = x_nd.shape x_gpu = gpuarray.to_gpu(x_nd) H_arr_gpu = [] for b in bend_coefs: cur_offset = np.zeros((1 + d + n, 1 + d + n), np.float64) cur_offset[d+1:, d+1:] = b * K_nn cur_offset[1:d+1, 1:d+1] = np.diag(rot_coef) H_arr_gpu.append(gpuarray.to_gpu(cur_offset)) H_ptr_gpu = get_gpu_ptrs(H_arr_gpu) A = np.r_[np.zeros((d+1,d+1)), np.c_[np.ones((n,1)), x_nd]].T n_cnts = A.shape[0] _u,_s,_vh = np.linalg.svd(A.T) N = _u[:,n_cnts:] F = np.zeros((n + d + 1, d), np.float64) F[1:d+1, :d] += np.diag(rot_coef) Q = np.c_[np.ones((n,1)), x_nd, K_nn].astype(np.float64) F = F.astype(np.float64) N = N.astype(np.float64) Q_gpu = gpuarray.to_gpu(Q) Q_arr_gpu = [Q_gpu for _ in range(len(bend_coefs))] Q_ptr_gpu = get_gpu_ptrs(Q_arr_gpu) F_gpu = gpuarray.to_gpu(F) F_arr_gpu = [F_gpu for _ in range(len(bend_coefs))] F_ptr_gpu = get_gpu_ptrs(F_arr_gpu) N_gpu = gpuarray.to_gpu(N) N_arr_gpu = [N_gpu for _ in range(len(bend_coefs))] N_ptr_gpu = get_gpu_ptrs(N_arr_gpu) dot_batch_nocheck(Q_arr_gpu, Q_arr_gpu, H_arr_gpu, Q_ptr_gpu, Q_ptr_gpu, H_ptr_gpu, transa = 'T') QTQ = Q.T.dot(Q) H_list = [] for i, bend_coef in enumerate(bend_coefs): H = QTQ H[d+1:,d+1:] += bend_coef * K_nn rot_coefs = np.ones(d) * rot_coef if np.isscalar(rot_coef) else rot_coef H[1:d+1, 1:d+1] += np.diag(rot_coefs) # ipdb.set_trace() H_list.append(H) # N'HN NHN_arr_gpu, NHN_ptr_gpu = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'T'), (H_arr_gpu, H_ptr_gpu, 'N'), (N_arr_gpu, N_ptr_gpu, 'N')) NHN_list = [N.T.dot(H.dot(N)) for H in H_list] for i, NHN in enumerate(NHN_list): assert(np.allclose(NHN, NHN_arr_gpu[i].get(), atol=atol)) iH_arr = [] for NHN in NHN_arr_gpu: iH_arr.append(scipy.linalg.inv(NHN.get()).copy()) h_inv_list = [scipy.linalg.inv(NHN) for NHN in NHN_list] assert(np.allclose(iH_arr, h_inv_list, atol=atol)) iH_arr_gpu = [gpuarray.to_gpu_async(iH) for iH in iH_arr] iH_ptr_gpu = get_gpu_ptrs(iH_arr_gpu) proj_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'), (iH_arr_gpu, iH_ptr_gpu, 'N'), (N_arr_gpu, N_ptr_gpu, 'T'), (Q_arr_gpu, Q_ptr_gpu, 'T')) proj_mats_list = [N.dot(h_inv.dot(N.T.dot(Q.T))) for h_inv in h_inv_list] assert(np.allclose(proj_mats_list, proj_mats[0][index].get(), atol=atol)) offset_mats = m_dot_batch((N_arr_gpu, N_ptr_gpu, 'N'), (iH_arr_gpu, iH_ptr_gpu, 'N'), (N_arr_gpu, N_ptr_gpu, 'T'), (F_arr_gpu, F_ptr_gpu, 'N')) offset_mats_list = [N.dot(h_inv.dot(N.T.dot(F))) for h_inv in h_inv_list] assert(np.allclose(offset_mats_list, offset_mats[0][index].get(), atol=atol))