def gaussian_gpu_v2(cls, sigma, size=None): """ Calculate a 1D gaussian using pyopencl. This is the same as scipy.signal.gaussian. Only one kernel to :param sigma: width of the gaussian :param size: can be calculated as 1 + 2 * 4sigma """ if not size: size = int(1 + 8 * sigma) g_gpu = pyopencl.array.empty(cls.queue, size, dtype=numpy.float32, order="C") t0 = time.time() evt = cls.kernels["gaussian"].gaussian( cls.queue, (64, ), (64, ), g_gpu.data, # __global float *data, numpy.float32(sigma), # const float sigma, numpy.int32(size), # const int SIZE pyopencl.LocalMemory(64 * 4), pyopencl.LocalMemory(64 * 4), ) g = g_gpu.get() if cls.PROFILE: logger.info("execution time: %.3fms; Kernel took %.3fms", 1e3 * (time.time() - t0), 1e-6 * (evt.profile.end - evt.profile.start)) return g
def process(self): """ Process for InputLayer does nothing. Simple invokes process for next layers. """ self.opencl.kernel_process_layer.set_arg(0, self.context._inputs_buf) self.opencl.kernel_process_layer.set_arg(1, self.context._weights_buf) self.opencl.kernel_process_layer.set_arg(7, pyopencl.LocalMemory(64 * 4)) self.opencl.kernel_process_layer.set_arg(8, self.context._outputs_buf) if self.context.training_allowed: self.opencl.kernel_calc_layer_gradient.set_arg( 0, self.context._inputs_buf) self.opencl.kernel_calc_layer_gradient.set_arg( 1, self.context._errors_backpropagation_buf) self.opencl.kernel_calc_layer_gradient.set_arg( 6, self.context._gradient_buf) self.opencl.kernel_propagate_errors.set_arg( 0, self.context._errors_backpropagation_buf) self.opencl.kernel_propagate_errors.set_arg( 1, self.context._weights_buf) self.opencl.kernel_propagate_errors.set_arg( 8, pyopencl.LocalMemory(256)) self.opencl.kernel_propagate_errors.set_arg( 9, self.context._outputs_buf) super(InputLayer, self).process() self.reset_processed()
def count_violations(self, queue, restraints, rotmat, access_interspace, viol_counter, weight): WORKGROUPSIZE = 32 kernel = self.kernels.count_violations rotmat16 = np.zeros(16, dtype=np.float32) rotmat16[:9] = rotmat.flatten()[:] shape = np.asarray(list(access_interspace.shape) + [access_interspace.size], dtype=np.int32) loc_viol = cl.LocalMemory(4 * restraints.shape[0]**2 * WORKGROUPSIZE) # float4 restraints_center = cl.LocalMemory(4 * restraints.shape[0] * 4) mindist2 = cl.LocalMemory(4 * restraints.shape[0]) maxdist2 = cl.LocalMemory(4 * restraints.shape[0]) kernel.set_args(restraints.data, rotmat16, access_interspace.data, viol_counter.data, loc_viol, restraints_center, mindist2, maxdist2, np.int32(restraints.shape[0]), shape, np.float32(weight)) gws = (8 * WORKGROUPSIZE * 8 * 4, ) lws = (WORKGROUPSIZE, ) status = cl.enqueue_nd_range_kernel(queue, kernel, gws, lws) return status
def test_scatter(cl_env, radix_kernels, key_dtype, ngroups, group_size): ctx, cq = cl_env radix_bits = 4 histogram_len = 2 ** radix_bits keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype) keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, keys.nbytes) out_keys_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, keys.nbytes) histogram_buf = cl.Buffer( ctx, cl.mem_flags.READ_ONLY, histogram_len * ngroups * np.dtype('uint32').itemsize ) offset_buf = cl.Buffer( ctx, cl.mem_flags.READ_ONLY, histogram_len * ngroups * np.dtype('uint32').itemsize ) for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits): radix_keys = radix_key(keys, radix_bits, radix_pass).astype('uint16') order = np.argsort(radix_keys, kind='mergesort') grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)] block_keys = keys[grid[:-1] + [order]] # Partially sort (keys_map, _) = cl.enqueue_map_buffer( cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0, keys.shape, keys.dtype, wait_for=[], is_blocking=True ) keys_map[...] = block_keys del keys_map radix_keys = radix_key(block_keys, radix_bits, radix_pass).astype('uint16') (histogram_map, _) = cl.enqueue_map_buffer( cq, histogram_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0, (histogram_len, ngroups), np.dtype('uint32'), wait_for=[], is_blocking=True ) (offset_map, _) = cl.enqueue_map_buffer( cq, offset_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0, (histogram_len, ngroups), np.dtype('uint32'), wait_for=[], is_blocking=True ) histogram_map[...] = np.array([np.bincount(group_keys, minlength=16) for group_keys in radix_keys], dtype='uint32').T offset_map[...] = prefix_sum(histogram_map.flat).reshape(histogram_len, ngroups) del histogram_map, offset_map local_offset = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize) local_histogram = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize) e = radix_kernels['scatter']( cq, (ngroups,), (group_size,), keys_buf, out_keys_buf, None, None, offset_buf, local_offset, histogram_buf, local_histogram, radix_bits, radix_pass, g_times_l=True, ) (keys_map, _) = cl.enqueue_map_buffer( cq, out_keys_buf, cl.map_flags.READ, 0, (ngroups, group_size * 2), keys.dtype, wait_for=[e], is_blocking=True ) expected = block_keys.flat[np.argsort(radix_keys, axis=None, kind='mergesort')] np.testing.assert_equal(keys_map, expected.reshape(ngroups, 2 * group_size))
def test_ternary(context, q, float_data, float_data_gpu): kernelSource = """ __global__ void setValue(float *data, int idx, float value) { if(threadIdx.x == 0) { data[idx] = value; } } __global__ void testTernary(float *data) { data[0] = data[1] > 0 ? data[2] : data[3]; } """ setValueKernelName = test_common.mangle('setValue', ['float *', 'int', 'float']) setValueProg = compile_code(cl, context, kernelSource, setValueKernelName, num_clmems=1) testTernaryName = test_common.mangle('testTernary', ['float *']) testTernaryProg = compile_code(cl, context, kernelSource, testTernaryName, num_clmems=1) float_data_orig = np.copy(float_data) def set_float_value(gpu_buffer, idx, value): setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), np.int32(idx), np.float32(value), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data_gpu, float_data) print('float_data[:8]', float_data[:8]) set_float_value(float_data_gpu, 1, 10) testTernaryProg.__getattr__(testTernaryName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[:8]', float_data[:8]) assert float_data[0] == float_data_orig[2] set_float_value(float_data_gpu, 1, -2) testTernaryProg.__getattr__(testTernaryName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[:8]', float_data[:8]) assert float_data[0] == float_data_orig[3]
def hist_op_6_time(BS, D, left_buf, right_buf, H, W, out_buf): start = time.time() global_size = np.zeros(shape=(7*(H-6),9*(W-6-D))).astype(np.float32) # 7x9 group shape 63 work item group_size = np.zeros(shape=(7,9)).astype(np.float32) lclLeft = cl.LocalMemory(np.int32().nbytes*630) lclRight = cl.LocalMemory(np.int32().nbytes*182) func6(queue, global_size.shape, group_size.shape, lclRight, lclLeft, np.int32(BS/2), np.int32(D), left_buf, right_buf, np.int32(H), np.int32(W), out_buf) return time.time()-start
def sort(self, queue, N, a_buf, o_buf): loc_aux = cl.LocalMemory(16 * self.n_threads) loc_idx = cl.LocalMemory(16 * self.n_threads) #print("N==", N, "n_threads==", self.n_threads) minnt = min(N, self.n_threads) evt = self.prgsrt.ParallelBitonic_Local(queue, (minnt, ), (minnt, ), a_buf, o_buf, loc_aux, loc_idx) evt.wait()
def test_block_sort_random(cl_env, radix_kernels, key_dtype, ngroups, group_size): ctx, cq = cl_env radix_bits = 4 histogram_len = 2 ** radix_bits keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype) keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, keys.nbytes) histogram_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, ngroups * histogram_len * np.dtype('uint32').itemsize) local_keys = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize) local_values = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize) count = cl.LocalMemory(group_size * 2 * np.dtype('uint32').itemsize) local_histogram = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize) for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits): (keys_map, _) = cl.enqueue_map_buffer( cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION, 0, (ngroups, group_size * 2), keys.dtype, wait_for=[], is_blocking=True ) keys_map[...] = keys del keys_map e = radix_kernels['block_sort']( cq, (ngroups,), (group_size,), keys_buf, local_keys, local_keys, None, local_values, local_values, histogram_buf, local_histogram, count, radix_bits, radix_pass, g_times_l=True, ) keys = keys.reshape(ngroups, group_size * 2) order = np.argsort(radix_key(keys, radix_bits, radix_pass), kind='mergesort') grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)] (histogram_map, _) = cl.enqueue_map_buffer( cq, histogram_buf, cl.map_flags.READ, 0, (histogram_len, ngroups), np.dtype('uint32'), wait_for=[e], is_blocking=True ) i = 0 for group_keys, histogram in zip(keys, histogram_map.T): group_keys = radix_key(group_keys, radix_bits, radix_pass).astype('uint16') expected = np.bincount(group_keys, minlength=16) try: np.testing.assert_equal(histogram, expected) except AssertionError: print((radix_pass, i)) raise i += 1 expected = keys[grid[:-1] + [order]] (keys_map, _) = cl.enqueue_map_buffer( cq, keys_buf, cl.map_flags.READ, 0, (ngroups, group_size * 2), keys.dtype, wait_for=[e], is_blocking=True ) np.testing.assert_equal(keys_map, expected)
def opencl_dtw_run (SrcS, TrgS, ctx, queue, prg, dev_Param): #MAX_MEM_ALLOC_SIZE cot1 = int(dev_Param["LOCAL_MEM_SIZE"] / (TrgS.shape[1] *4 *3)) cot2 = int(dev_Param["MAX_WORK_ITEM_SIZES"][0] / TrgS.shape[1]) TRG_COT = min(cot1, cot2) Grp_Cot = int(dev_Param["MAX_MEM_ALLOC_SIZE"] / (TrgS.shape[1] *4 *TRG_COT)) T0 = TrgS.shape[0] T1 = TrgS.shape[1] TrgS_Alignment = TRG_COT -T0 % TRG_COT if TrgS_Alignment != TRG_COT: TrgS = numpy.concatenate ((TrgS, numpy.ones((TrgS_Alignment,T1),dtype=numpy.float32))) T0 = TrgS.shape[0] #print ("TrgS_Alignment,TRG_COT",TrgS_Alignment,TRG_COT) Splits = list(range(0, T0, Grp_Cot *TRG_COT)) Splits.append (T0) allret = numpy.empty ((SrcS.shape[0],TrgS.shape[0]), dtype=numpy.float32) for j in range(len(Splits)-1): TrgS_sub = TrgS[Splits[j]:Splits[j+1],:] Ts0 = TrgS_sub.shape[0] Ts1 = TrgS_sub.shape[1] local_size = TRG_COT *Ts1 global_size = Ts0 *Ts1 t = numpy.reshape(TrgS_sub,(Ts0 *Ts1)) t_dev = cl_array.to_device(queue, t) #print ("local_size, global_size ",local_size,global_size,t.nbytes/1024/1024) SRC_LEN = SrcS.shape[1] TRG_LEN = TrgS.shape[1] for i in range(SrcS.shape[0]): s = SrcS[i,:] s_dev = cl_array.to_device(queue, s) r_dev = cl_array.empty (queue, (Ts0,), dtype=numpy.float32) shared_mem_size = Ts1 *TRG_COT *4 prg.opencl_dtw(queue, (global_size,), (local_size,), \ numpy.uint32(SRC_LEN),numpy.uint32(TRG_LEN),numpy.uint32(TRG_COT), s_dev.data, t_dev.data, r_dev.data,\ cl.LocalMemory(shared_mem_size), cl.LocalMemory(shared_mem_size), cl.LocalMemory(shared_mem_size) ) r = r_dev.get() allret[i,Splits[j]:Splits[j+1]] = r #print(la.norm((dest_dev - (a_dev+b_dev)).get())) if TrgS_Alignment != TRG_COT: allret = allret[:,0:-TrgS_Alignment] return (allret)
def runAlgo(self): """ The program implementation """ #initialize client side (CPU) arrays N = 100 A_VAL = .5 B_VAL = 1 size = N * N h_A = np.empty(size).astype(np.float32) h_B = np.empty(size).astype(np.float32) h_A.fill(A_VAL) h_B.fill(B_VAL) #create OpenCL buffers d_A = self.vectToBuffer(h_A) d_B = self.vectToBuffer(h_B) d_C = self.outBuffer(h_A.nbytes) np.set_printoptions(threshold='nan') # execute program mmul = self.program.mmul mmul.set_scalar_arg_dtypes([np.int32, None, None, None]) mmul(self.queue, (N, N), None, N, d_A, d_B, d_C) print "First problem solved" h_C = np.empty_like(h_A) self.bufferToVect(d_C, h_C) print "{}".format(h_C) localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * N) n_blocks = 10 mmul2 = self.program.mmul2 mmul2.set_scalar_arg_dtypes([np.int32, None, None, None, None]) mmul2(self.queue, (N, ), (N / n_blocks, ), N, d_A, d_B, d_C, localmem) print "Second problem solved" h_C = np.empty_like(h_A) self.bufferToVect(d_C, h_C) print "{}".format(h_C) blocksize = 10 A_block = cl.LocalMemory(np.dtype(np.float32).itemsize * blocksize**2) B_block = cl.LocalMemory(np.dtype(np.float32).itemsize * blocksize**2) mmul3 = self.program.mmul3 mmul3.set_scalar_arg_dtypes([np.int32, None, None, None, None, None]) mmul3(self.queue, (N, N), (blocksize, blocksize), N, d_A, d_B, d_C, A_block, B_block) print "Third problem solved" h_C = np.empty_like(h_A) self.bufferToVect(d_C, h_C) print "{}".format(h_C)
def reorder(self, startbit, num): totalBlocks = num / 2 / self.cta_size global_size = (self.cta_size * totalBlocks, ) local_size = (self.cta_size, ) reorder_args = (self.keys, self.values, self.d_tempKeys, self.d_tempValues, self.mBlockOffsets, self.mCountersSum, self.mCounters, np.uint32(startbit), np.uint32(num), np.uint32(totalBlocks), cl.LocalMemory(2 * self.cta_size * self.uintsz), cl.LocalMemory(2 * self.cta_size * self.uintsz)) self.radix_prg.reorderDataKeysValues(self.queue, global_size, local_size, *(reorder_args))
def execute(self, n_it=1, **kwargs): # this defines how often the calculations are copied back from the compute unit (GPU) # e.g. 10 means that every 10th iteration is copied from the computing unit (GPU) to "python" n_out = kwargs.get('n_out', 10) queue = self.queue prg = self.program local_size = self.local_size #(n_local,) #self.local_size n_local = 512 ng = self.ng # initialize the next step i_out = 0 total_out = (n_it // n_out + 1) time_axis = np.arange(total_out, dtype=np.float32) * self.t_step n_excited = np.zeros(total_out, dtype=np.float32) n_excited[0] = 1.0 tmp_1 = cl_array.zeros(queue, (n_local * total_out, ), dtype=np.float32) tmp_2 = cl_array.zeros(queue, (n_local * total_out, ), dtype=np.float32) p = self.p_gp n = self.n_gp b = self.b_gp d = self.d_gp k = self.k_gp #prg.copy3d(queue, self.global_size, None, # n.data, p.data, b).wait() for time_i in range(n_it): if time_i % 2 > 0: p, n = n, p prg.iterate(queue, self.global_size_3d, local_size, n, p, d, k, b) if time_i % n_out == 0: prg.reduce_decay(queue, self.global_size, self.local_size, p, k, cl.LocalMemory(n_local * 32), cl.LocalMemory(n_local * 32), np.int32(self.global_size[0]), np.int32(n_local), np.int32(i_out), np.float32(time_i), tmp_1.data, tmp_2.data) i_out += 1 self.it += 1 dc = (tmp_1.map_to_host()).reshape((total_out, n_local)).sum(axis=1) ds = (tmp_2.map_to_host()).reshape((total_out, n_local)).sum(axis=1) n_ex = dc / ds cl.enqueue_copy(queue, self.p_np, self.p_gp) self.p = self.p_np.reshape((ng, ng, ng), order='C') return time_axis, n_ex, self.p
def allocate_constants(self): """ Allocates constants and local memory to be used by OpenCL. """ self.w = cl.Buffer(self.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=w) self.cx = cl.Buffer(self.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=cx) self.cy = cl.Buffer(self.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=cy) self.local_u = cl.LocalMemory(float_size * self.two_d_local_size[0]*self.two_d_local_size[1]) self.local_v = cl.LocalMemory(float_size * self.two_d_local_size[0]*self.two_d_local_size[1]) self.local_rho = cl.LocalMemory(float_size * self.two_d_local_size[0]*self.two_d_local_size[1])
def reorder(self, d_key, d_val, startbit, num): totalBlocks = num // 2 // self.cta_size global_size = (self.cta_size * totalBlocks, ) local_size = (self.cta_size, ) reorder_args = (d_key, d_val, self.d_temp_keys, self.d_temp_values, self.d_block_offsets, self.d_counters_sum, self.d_counters, np.uint32(startbit), np.uint32(num), np.uint32(totalBlocks), cl.LocalMemory(2 * self.cta_size * self.dtype_size), cl.LocalMemory(2 * self.cta_size * self.dtype_size)) self.radix_prg.reorderDataKeysValues(self.queue, global_size, local_size, *reorder_args)
def test_rgb(self): """ tests the int64 kernel """ max_wg = kernel_workgroup_size(self.reduction, "max_min_global_stage1") if max_wg < self.red_size: logger.warning( "test_uint16: Skipping test of WG=%s when maximum is %s (%s)", self.red_size, max_wg, self.max_wg) return lint = numpy.empty((self.input.shape[0], self.input.shape[1], 3), dtype=numpy.uint8) lint[:, :, 0] = self.input.astype(numpy.uint8) lint[:, :, 1] = self.input.astype(numpy.uint8) lint[:, :, 2] = self.input.astype(numpy.uint8) t0 = time.time() au8 = pyopencl.array.to_device(self.queue, lint) k1 = self.program.rgb_to_float(self.queue, self.shape, self.wg, au8.data, self.gpudata.data, self.IMAGE_W, self.IMAGE_H) k2 = self.reduction.max_min_global_stage1( self.queue, (self.red_size * self.red_size, ), (self.red_size, ), self.gpudata.data, self.buffers_max_min.data, (self.IMAGE_W * self.IMAGE_H), pyopencl.LocalMemory(8 * self.red_size)) k3 = self.reduction.max_min_global_stage2( self.queue, (self.red_size, ), (self.red_size, ), self.buffers_max_min.data, self.buffers_max.data, self.buffers_min.data, pyopencl.LocalMemory(8 * self.red_size)) k4 = self.program.normalizes(self.queue, self.shape, self.wg, self.gpudata.data, self.buffers_min.data, self.buffers_max.data, self.twofivefive.data, self.IMAGE_W, self.IMAGE_H) res = self.gpudata.get() t1 = time.time() ref = normalize(lint.max(axis=-1)) t2 = time.time() delta = abs(ref - res).max() if self.PROFILE: logger.info("Global execution time: CPU %.3fms, GPU: %.3fms." % (1000.0 * (t2 - t1), 1000.0 * (t1 - t0))) logger.info("Conversion RGB ->float took %.3fms" % (1e-6 * (k1.profile.end - k1.profile.start))) logger.info("Reduction stage1 took %.3fms" % (1e-6 * (k2.profile.end - k2.profile.start))) logger.info("Reduction stage2 took %.3fms" % (1e-6 * (k3.profile.end - k3.profile.start))) logger.info("Normalization %.3fms" % (1e-6 * (k4.profile.end - k4.profile.start))) logger.info("--------------------------------------") self.assert_(delta < 1e-4, "delta=%s" % delta)
def blocks(self, nbits, startbit, num): totalBlocks = num / 4 / self.cta_size global_size = (self.cta_size * totalBlocks, ) local_size = (self.cta_size, ) blocks_args = (self.keys, self.values, self.d_tempKeys, self.d_tempValues, np.uint32(nbits), np.uint32(startbit), np.uint32(num), np.uint32(totalBlocks), cl.LocalMemory(4 * self.cta_size * self.uintsz), cl.LocalMemory(4 * self.cta_size * self.uintsz)) self.radix_prg.radixSortBlocksKeysValues(self.queue, global_size, local_size, *(blocks_args)).wait()
def test_ieeefloats(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(double *data) { double d_neginfinity = -INFINITY; double d_posinfinity = INFINITY; float f_neginfinity = -INFINITY; float f_posinfinity = INFINITY; data[0] = INFINITY; data[1] = -INFINITY; data[2] = f_neginfinity; data[3] = f_posinfinity; } """ kernel_name = test_common.mangle('mykernel', ['double*']) cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1) kernel = test_common.build_kernel(context, cl_code, kernel_name) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print(float_data[:4]) assert float_data[0] == np.inf assert float_data[1] == - np.inf assert float_data[2] == - np.inf assert float_data[3] == np.inf
def test_sitofp(context, q, float_data, float_data_gpu, int_data, int_data_gpu): code = """ __global__ void myKernel(float *float_data, int *int_data) { float_data[0] = (float)int_data[0]; } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *', 'int *']), num_clmems=2)['kernel'] int_data[0] = 5 int_data[1] = 2 int_data[2] = 4 cl.enqueue_copy(q, int_data_gpu, int_data) kernel( q, (32,), (32,), float_data_gpu, int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() print('float_data[0]', float_data[0]) # expected = pow(float_data[1], float_data[2]) assert float_data[0] == 5
def test_sqrt(context, q, float_data, float_data_gpu): code = """ __global__ void myKernel(float *data) { data[threadIdx.x] = sqrt(data[threadIdx.x]); } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel'] float_data[0] = 1.5 float_data[1] = 4.6 float_data[2] = -1.5 float_data[3] = 0 float_data_orig = np.copy(float_data) cl.enqueue_copy(q, float_data_gpu, float_data) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[:4]', float_data[:4]) for i in range(4): if float_data_orig[i] >= 0: assert abs(float_data[i] - math.sqrt(float_data_orig[i])) <= 1e-4 else: assert math.isnan(float_data[i])
def test_pow(context, q, float_data, float_data_gpu): code = """ __global__ void myKernel(float *data) { data[0] = pow(data[1], data[2]); data[3] = pow(data[4], data[5]); data[5] = pow(data[7], data[8]); } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel'] float_data[1] = 1.5 float_data[2] = 4.6 float_data[4] = -1.5 float_data[5] = 4.6 float_data[7] = 1.5 float_data[8] = -4.6 cl.enqueue_copy(q, float_data_gpu, float_data) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[0]', float_data[0]) print('float_data[3]', float_data[3]) print('float_data[6]', float_data[6]) expected = pow(float_data[1], float_data[2]) assert abs(float_data[0] - expected) <= 1e-4
def setUp(self): if not test_options.opencl: self.skipTest("User request to skip OpenCL tests") if pyopencl is None or ocl is None: self.skipTest("OpenCL module (pyopencl) is not present or no device available") self.h_data = numpy.random.random(self.N).astype("float32") self.h2_data = numpy.random.random((self.N, self.N)).astype("float32").reshape((self.N, self.N)) self.ctx = ocl.create_context(devicetype="GPU") device = self.ctx.devices[0] try: devtype = pyopencl.device_type.to_string(device.type).upper() except ValueError: # pocl does not describe itself as a CPU ! devtype = "CPU" workgroup = device.max_work_group_size if (devtype == "CPU") and (device.platform.vendor == "Apple"): logger.info("For Apple's OpenCL on CPU: enforce max_work_goup_size=1") workgroup = 1 self.ws = min(workgroup, self.ws) self.queue = pyopencl.CommandQueue(self.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE) self.local_mem = pyopencl.LocalMemory(self.ws * 32) # 2float4 = 2*4*4 bytes per workgroup size src = read_cl_file("pyfai:openCL/bitonic.cl") self.prg = pyopencl.Program(self.ctx, src).build()
def test_inlining(context, q, float_data, float_data_gpu): cu_source = """ __global__ void myKernel(float *data) { data[0] = (data[3] * (data[1] + data[2])) / data[4]; data[7] = (data[3] / (data[1] - data[2])) * data[4]; } """ kernelName = test_common.mangle('myKernel', ['float *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName) for i in range(10): float_data[i] = i + 3 cl.enqueue_copy(q, float_data_gpu, float_data) q.finish() # prog = cl.Program(context, sourcecode).build() # prog.__getattr__(kernelName)( kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() float_data2 = np.zeros((1024, ), dtype=np.float32) cl.enqueue_copy(q, float_data2, float_data_gpu) q.finish() print('float_data2[0]', float_data2[0]) d = float_data d2 = float_data2 expect = (d[3] * (d[1] + d[2])) / d[4] assert abs(d2[0] - expect) < 1e-5
def test_use_template1(context, q, int_data, int_data_gpu, float_data, float_data_gpu): code = """ template< typename T > __device__ T addNumbers(T one, T two) { return one + two; } __global__ void use_template1(float *data, int *intdata) { if(threadIdx.x == 0 && blockIdx.x == 0) { data[0] = addNumbers(data[1], data[2]); intdata[0] = addNumbers(intdata[1], intdata[2]); } } """ kernelName = test_common.mangle('use_template1', ['float *', 'int *']) prog = compile_code(cl, context, code, kernelName, num_clmems=2) float_data_orig = np.copy(float_data) int_data_orig = np.copy(int_data) prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() assert float_data[0] == float_data_orig[1] + float_data_orig[2] assert int_data[0] == int_data_orig[1] + int_data_orig[2]
def test_sincos(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(float *data) { sincosf(0.1, &data[0], &data[1]); sincosf(data[2], &data[3], &data[4]); } """ kernel_name = test_common.mangle('mykernel', ['float*']) cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1) print('cl_code', cl_code) float_data[2] = -0.3 float_data_orig = np.copy(float_data) cl.enqueue_copy(q, float_data_gpu, float_data) kernel = test_common.build_kernel(context, cl_code, kernel_name) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print(float_data[:5]) assert abs(float_data[0] - math.sin(0.1)) < 1e-4 assert abs(float_data[1] - math.cos(0.1)) < 1e-4 assert abs(float_data[3] - math.sin(float_data_orig[2])) < 1e-4 assert abs(float_data[4] - math.cos(float_data_orig[2])) < 1e-4
def reduce_min(self, queue, a_buf, N, o_buf, o_lid): r = np.empty(self.n_threads).astype(np.float32) r_buf = cl.Buffer(self.ctx, mf.READ_WRITE, size=r.nbytes) q_buf = cl.Buffer(self.ctx, mf.READ_WRITE, size=r.nbytes) loc_buf = cl.LocalMemory(4 * self.n_threads) loc_lid = cl.LocalMemory(4 * self.n_threads) #print("N==", N, "n_threads==", self.n_threads) minnt = min(N, self.n_threads) evt = self.prgmna.reduce(queue, (N, ), (minnt, ), a_buf, r_buf, q_buf, o_lid, loc_buf, loc_lid) evt.wait() #print(evt.profile.end - evt.profile.start) n_threads = N // minnt evt = self.prgmnb.reduce(queue, (n_threads, ), (n_threads, ), r_buf, o_buf, q_buf, o_lid, loc_buf, loc_lid) evt.wait()
def test_umulhi(context, q, int_data, int_data_gpu): ll_code = """ declare i32 @_Z8__umulhiii(i32, i32) define void @test_umulhi(i32* %data) { %1 = load i32, i32* %data %2 = getelementptr i32, i32* %data, i32 1 %3 = load i32, i32* %2 %4 = getelementptr i32, i32* %data, i32 2 %5 = load i32, i32* %4 %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5) store i32 %6, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1) print('cl_code', cl_code) int_data[0] = 0 int_data[1] = -50 int_data[2] = 2523123 cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'test_umulhi') kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32 print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item()
def allocate_constants(self): super(Clumpy_Surfactant_Nutrient_Wave, self).allocate_constants() # Allocate local memory for the finite difference code self.halo = np.int32(1) # As we are doing D2Q9, we have a halo of one self.buf_nx = np.int32(self.two_d_local_size[0] + 2 * self.halo) self.buf_ny = np.int32(self.two_d_local_size[1] + 2 * self.halo) self.psi_local = cl.LocalMemory(float_size * self.buf_nx * self.buf_ny)
def setUp(self): self.h_data = numpy.random.random(self.N).astype("float32") self.h2_data = numpy.random.random( (self.N, self.N)).astype("float32").reshape((self.N, self.N)) self.ctx = ocl.create_context(devicetype="GPU") device = self.ctx.devices[0] try: devtype = pyopencl.device_type.to_string(device.type).upper() except ValueError: # pocl does not describe itself as a CPU ! devtype = "CPU" workgroup = device.max_work_group_size if (devtype == "CPU") and (device.platform.vendor == "Apple"): logger.info( "For Apple's OpenCL on CPU: enforce max_work_goup_size=1") workgroup = 1 self.ws = min(workgroup, self.ws) self.queue = pyopencl.CommandQueue( self.ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE) self.local_mem = pyopencl.LocalMemory( self.ws * 32) # 2float4 = 2*4*4 bytes per workgroup size src = pyFAI.utils.read_cl_file("bitonic.cl") self.prg = pyopencl.Program(self.ctx, src).build()
def test_sext(context, q, int_data, int_data_gpu): ll_code = """ define void @mykernel(i32* %data) { %1 = load i32, i32* %data %2 = sext i32 %1 to i64 %3 = lshr i64 %2, 32 %4 = trunc i64 %3 to i32 store i32 %4, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1) print('cl_code', cl_code) for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]: int_data[0] = experiment['in'] cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32 expected = experiment['out'] print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item() split_cl = cl_code.split('\n') found_long_cast = False for line in split_cl: if ' >> 32' in line and '(long)' in line: found_long_cast = True assert found_long_cast
def set_float_value(gpu_buffer, idx, value): setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), np.int32(idx), np.float32(value), cl.LocalMemory(4))