def test_scatter(cl_env, radix_kernels, key_dtype, ngroups, group_size):
    ctx, cq = cl_env
    radix_bits = 4
    histogram_len = 2 ** radix_bits

    keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype)
    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, keys.nbytes)
    out_keys_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, keys.nbytes)
    histogram_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY,
        histogram_len * ngroups * np.dtype('uint32').itemsize
    )
    offset_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY,
        histogram_len * ngroups * np.dtype('uint32').itemsize
    )

    for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits):
        radix_keys = radix_key(keys, radix_bits, radix_pass).astype('uint16')
        order = np.argsort(radix_keys, kind='mergesort')
        grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)]
        block_keys = keys[grid[:-1] + [order]]  # Partially sort

        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
            0, keys.shape, keys.dtype,
            wait_for=[], is_blocking=True
        )
        keys_map[...] = block_keys
        del keys_map

        radix_keys = radix_key(block_keys, radix_bits, radix_pass).astype('uint16')

        (histogram_map, _) = cl.enqueue_map_buffer(
            cq, histogram_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
            0, (histogram_len, ngroups), np.dtype('uint32'),
            wait_for=[], is_blocking=True
        )
        (offset_map, _) = cl.enqueue_map_buffer(
            cq, offset_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
            0, (histogram_len, ngroups), np.dtype('uint32'),
            wait_for=[], is_blocking=True
        )
        histogram_map[...] = np.array(
            [np.bincount(group_keys, minlength=16) for group_keys in radix_keys],
            dtype='uint32'
        ).T
        offset_map[...] = prefix_sum(histogram_map.flat).reshape(histogram_len, ngroups)
        del histogram_map, offset_map

        local_offset = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize)
        local_histogram = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize)
        e = radix_kernels['scatter'](
            cq, (ngroups,), (group_size,),
            keys_buf, out_keys_buf, None, None,
            offset_buf, local_offset, histogram_buf, local_histogram,
            radix_bits, radix_pass,
            g_times_l=True,
        )

        (keys_map, _) = cl.enqueue_map_buffer(
            cq, out_keys_buf, cl.map_flags.READ,
            0, (ngroups, group_size * 2), keys.dtype,
            wait_for=[e], is_blocking=True
        )
        expected = block_keys.flat[np.argsort(radix_keys, axis=None, kind='mergesort')]
        np.testing.assert_equal(keys_map, expected.reshape(ngroups, 2 * group_size))

def estimate_niter(N):
    """returns niter s.t. the time spent on kernel is same as for memory transfer"""
    a = np.ones(N, np.float32)

    dev = get_device()
    context, queue = dev.context, dev.queue
    mf = cl.mem_flags

    t = time()
    copy_g = cl.Buffer(context, mf.ALLOC_HOST_PTR, size=a.nbytes)

    # map the host-allocated buffer (non-blocking) to time the transfer path
    cl.enqueue_map_buffer(queue, copy_g, cl.map_flags.WRITE,
                          0, a.shape, a.dtype,
                          is_blocking=False)
    # cl.enqueue_copy(queue, copy_g, a,
    #                 device_offset=0,
    #                 is_blocking=False)

    queue.flush()
    # a_g = OCLArray.from_array(a, async = True)
    #a_g = array.to_device(queue, a, async = False)

    print(time() - t)

def execute(self):
    # start = timer()
    evtcompute = self.program.tthetaf4(self.queue, (self.npx // 4, self.npx), None,
                                       self.tth_buf, self.eta_buf, self.par_buf)
    #evtcompute.wait()
    #print(timer() - start)
    self.tthl, evtt = cl.enqueue_map_buffer(self.queue, self.tth_buf, cl.map_flags.READ,
                                            0, (self.npx, self.npx), numpy.float32, 'C',
                                            wait_for=[evtcompute], is_blocking=False)
    self.etal, evte = cl.enqueue_map_buffer(self.queue, self.eta_buf, cl.map_flags.READ,
                                            0, (self.npx, self.npx), numpy.float32, 'C',
                                            wait_for=[evtcompute], is_blocking=False)
    evtcompute.wait()
    evtt.wait()
    evte.wait()
    return self.tthl, self.etal

def enqueue_readouts(self, queue, buffers, range_start, range_end):
    if self._is_writable:
        nmr_problems = range_end - range_start
        cl.enqueue_map_buffer(
            queue, buffers[0], cl.map_flags.READ,
            range_start * self._data.strides[0],
            (nmr_problems,) + self._data.shape[1:], self._data.dtype,
            order="C", wait_for=None, is_blocking=False)

def test_random_collision_resized(cl_env, coord_dtype, collision_programs,
                                  old_shape, new_shape):
    ctx, cq = cl_env
    collider = Collider(ctx, *old_shape, coord_dtype, *collision_programs)
    collider.resize(*new_shape)

    np.random.seed(4)
    size = new_shape[0] or old_shape[0]
    coords = np.random.random((size, 3)).astype(coord_dtype)
    radius = 1 / (size**0.5)  # Keep number of collisions under control
    radii = np.random.uniform(0, radius, len(coords)).astype(coord_dtype)
    expected = find_collisions(coords, radii)

    coords_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,
                           len(coords) * 4 * coord_dtype.itemsize)
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype, is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map

    radii_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=radii)
    collisions_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                               len(expected) * 2 * collider.id_dtype.itemsize)
    n_collisions_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                                 collider.counter_dtype.itemsize)

    e = collider.get_collisions(cq, coords_buf, radii_buf, n_collisions_buf,
                                collisions_buf, len(expected))

    (n_collisions_map, _) = cl.enqueue_map_buffer(
        cq, n_collisions_buf, cl.map_flags.READ,
        0, 1, collider.counter_dtype,
        wait_for=[e], is_blocking=True
    )
    assert n_collisions_map[0] == len(expected)

    (collisions_map, _) = cl.enqueue_map_buffer(
        cq, collisions_buf, cl.map_flags.READ,
        0, (n_collisions_map[0], 2), collider.id_dtype,
        wait_for=[e], is_blocking=True
    )
    # Need to sort, order is undefined
    collisions = set(map(tuple, np.sort(collisions_map, axis=1)))
    assert collisions == expected

def test_block_sort_random(cl_env, radix_kernels, key_dtype, ngroups, group_size):
    ctx, cq = cl_env
    radix_bits = 4
    histogram_len = 2 ** radix_bits

    keys = np.random.randint(0, 64, size=(ngroups, group_size * 2), dtype=key_dtype)
    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, keys.nbytes)
    histogram_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                              ngroups * histogram_len * np.dtype('uint32').itemsize)

    local_keys = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize)
    local_values = cl.LocalMemory(group_size * 2 * keys.dtype.itemsize)
    count = cl.LocalMemory(group_size * 2 * np.dtype('uint32').itemsize)
    local_histogram = cl.LocalMemory(histogram_len * np.dtype('uint32').itemsize)

    for radix_pass in range(keys.dtype.itemsize * 8 // radix_bits):
        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
            0, (ngroups, group_size * 2), keys.dtype,
            wait_for=[], is_blocking=True
        )
        keys_map[...] = keys
        del keys_map

        e = radix_kernels['block_sort'](
            cq, (ngroups,), (group_size,),
            keys_buf, local_keys, local_keys, None, local_values, local_values,
            histogram_buf, local_histogram, count,
            radix_bits, radix_pass,
            g_times_l=True,
        )

        keys = keys.reshape(ngroups, group_size * 2)
        order = np.argsort(radix_key(keys, radix_bits, radix_pass), kind='mergesort')
        grid = np.ogrid[tuple(slice(0, s) for s in keys.shape)]

        (histogram_map, _) = cl.enqueue_map_buffer(
            cq, histogram_buf, cl.map_flags.READ,
            0, (histogram_len, ngroups), np.dtype('uint32'),
            wait_for=[e], is_blocking=True
        )
        i = 0
        for group_keys, histogram in zip(keys, histogram_map.T):
            group_keys = radix_key(group_keys, radix_bits, radix_pass).astype('uint16')
            expected = np.bincount(group_keys, minlength=16)
            try:
                np.testing.assert_equal(histogram, expected)
            except AssertionError:
                print((radix_pass, i))
                raise
            i += 1

        expected = keys[grid[:-1] + [order]]
        (keys_map, _) = cl.enqueue_map_buffer(
            cq, keys_buf, cl.map_flags.READ,
            0, (ngroups, group_size * 2), keys.dtype,
            wait_for=[e], is_blocking=True
        )
        np.testing.assert_equal(keys_map, expected)

def initBuffers(self, puzzle):
    #define lengths buffer and copy to the GPU
    #as we will not read from this buffer later, mapping is not required
    self.lengths = np.full(self.simulations, np.iinfo(np.int16).max, dtype=np.int16)
    self.lengthsBuffer = cl.Buffer(self.context,
                                   cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                                   hostbuf=self.lengths)

    #define buffer for aggregated lengths for each workgroup
    self.groupLengths = np.full(self.workGroups, np.iinfo(np.int16).max, dtype=np.int16)
    self.groupLengthsBuffer = cl.Buffer(self.context,
                                        cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
                                        hostbuf=self.groupLengths)
    #map group lengths buffer
    cl.enqueue_map_buffer(self.queue, self.groupLengthsBuffer, cl.map_flags.READ,
                          0, self.groupLengths.shape, self.groupLengths.dtype)

    #get the input puzzle ready for the kernel; convert to 8 bit int (char)
    p = np.array(puzzle['puzzle']).astype(np.int8)
    #subtract 1 so that -1 denotes a gap and 0 denotes a square to be filled
    p = p - np.ones_like(p, dtype=p.dtype)

    #copy the puzzle, one for each simulation
    self.puzzles = np.zeros((self.simulations, self.height, self.width), dtype=p.dtype)
    self.puzzles[:, 0:self.height, 0:self.width] = p

    #define puzzles buffer and copy data (we do not need to worry about getting data
    #out of this buffer, so mapping isn't required)
    #this buffer contains the input puzzles, one for each invocation (the puzzle is
    #too large to hold in local or shared memory)
    self.puzzlesFlattened = self.puzzles.ravel()
    self.puzzlesBuffer = cl.Buffer(self.context,
                                   cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                                   hostbuf=self.puzzlesFlattened)

    #define output buffer for best solutions aggregated across workgroups
    self.solutions = self.puzzles[0:self.workGroups]
    self.solutionsFlattened = self.solutions.ravel()
    self.solutionsBuffer = cl.Buffer(self.context,
                                     cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
                                     hostbuf=self.solutionsFlattened)
    #map solutions buffer
    cl.enqueue_map_buffer(self.queue, self.solutionsBuffer, cl.map_flags.READ,
                          0, self.solutionsFlattened.shape, self.solutions.dtype)

def test_collision(cl_env, coord_dtype, collision_programs):
    ctx, cq = cl_env
    coords = np.array(
        [[ 0.0,  1.0,  3.0],
         [ 0.0,  1.0,  3.0],
         [ 4.0,  1.0,  8.0],
         [-4.0, -6.0,  3.0],
         [-5.0,  0.0, -1.0],
         [-5.0,  0.5, -0.5]], dtype=coord_dtype)
    radii = np.ones(len(coords), dtype=coord_dtype)
    expected = {(0, 1), (4, 5)}

    collider = Collider(ctx, len(coords), 3, 8, coord_dtype, *collision_programs)

    coords_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,
                           len(coords) * 4 * coord_dtype.itemsize)
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype, is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map

    radii_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=radii)
    collisions_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                               len(expected) * 2 * collider.id_dtype.itemsize)
    n_collisions_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                                 collider.counter_dtype.itemsize)

    e = collider.get_collisions(cq, coords_buf, radii_buf, n_collisions_buf,
                                collisions_buf, len(expected))

    (n_collisions_map, _) = cl.enqueue_map_buffer(
        cq, n_collisions_buf, cl.map_flags.READ,
        0, 1, collider.counter_dtype,
        wait_for=[e], is_blocking=True
    )
    assert n_collisions_map[0] == len(expected)

    (collisions_map, _) = cl.enqueue_map_buffer(
        cq, collisions_buf, cl.map_flags.READ,
        0, (n_collisions_map[0], 2), collider.id_dtype,
        wait_for=[e], is_blocking=True
    )
    assert set(map(tuple, collisions_map)) == expected

def test_scan(cl_env, scan_kernels):
    ctx, cq = cl_env
    values = np.array(
        [17, 6, 24, 28, 18, 22, 2, 1, 25, 17, 7, 17, 3, 19, 8, 23],
        dtype='uint32')
    block_size = 4
    nblocks = len(values) // 2 // block_size

    values_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=values)
    block_sums_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                               nblocks * values.dtype.itemsize)

    calc_scan = scan_kernels['local_scan'](
        cq, (len(values) // 2, ), (block_size, ),
        values_buf, cl.LocalMemory(block_size * 2 * values.dtype.itemsize),
        block_sums_buf,
    )

    (values_map, _) = cl.enqueue_map_buffer(
        cq, values_buf, cl.map_flags.READ,
        0, values.shape, values.dtype,
        wait_for=[calc_scan], is_blocking=True,
    )
    (block_sums_map, _) = cl.enqueue_map_buffer(
        cq, block_sums_buf, cl.map_flags.READ,
        0, (nblocks, ), values.dtype,
        wait_for=[calc_scan], is_blocking=True,
    )

    expected = np.array(
        [0, 17, 23, 47, 75, 93, 115, 117, 0, 25, 42, 49, 66, 69, 88, 96],
        dtype=values.dtype)
    np.testing.assert_equal(values_map, expected)

    expected = np.array([118, 119], dtype=values.dtype)
    np.testing.assert_equal(block_sums_map, expected)

def test_block_scan(cl_env, scan_kernels):
    ctx, cq = cl_env
    values = np.array(
        [0, 17, 23, 47, 75, 93, 115, 117, 0, 25, 42, 49, 66, 69, 88, 96],
        dtype='uint32')
    block_sums = np.array([118, 119], dtype=values.dtype)

    values_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=values)
    block_sums_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                               hostbuf=block_sums)

    calc_block_scan = scan_kernels['local_scan'](
        cq, (1, ), (len(block_sums), ),
        block_sums_buf, cl.LocalMemory(len(block_sums) * 2 * values.dtype.itemsize),
        None, g_times_l=True)

    (block_sums_map, _) = cl.enqueue_map_buffer(cq, block_sums_buf, cl.map_flags.READ,
                                                0, block_sums.shape, block_sums.dtype,
                                                wait_for=[calc_block_scan],
                                                is_blocking=True)
    expected = np.array([0, 118], dtype=values.dtype)
    np.testing.assert_equal(block_sums_map, expected)

    calc_scan = scan_kernels['block_scan'](cq, (len(values) // 2, ), (4, ),
                                           values_buf, block_sums_buf,
                                           wait_for=[calc_block_scan])

    (values_map, _) = cl.enqueue_map_buffer(cq, values_buf, cl.map_flags.READ,
                                            0, values.shape, values.dtype,
                                            wait_for=[calc_scan], is_blocking=True)
    expected = np.array([
        0, 17, 23, 47, 75, 93, 115, 117, 118, 143, 160, 167, 184, 187, 206, 214
    ], dtype=values.dtype)
    np.testing.assert_equal(values_map, expected)

def solve(self, puzzle, simulations=16384, iterations=35, workGroupSize=128):
    self.simulations = simulations
    self.iterations = iterations
    self.workGroupSize = workGroupSize
    self.workGroups = int(self.simulations / self.workGroupSize)
    self.width = np.int8(puzzle['width'])
    self.height = np.int8(puzzle['height'])

    #initialise buffers
    self.initBuffers(puzzle)

    #create kernel
    self.kernel = cl.Kernel(self.program, "montecarlo")
    self.kernel.set_args(self.lengthsBuffer, self.groupLengthsBuffer,
                         self.puzzlesBuffer, self.solutionsBuffer,
                         self.height, self.width, np.int32(self.iterations))

    #execute program for a number of iterations
    cl.enqueue_nd_range_kernel(self.queue, self.kernel,
                               (self.simulations,), (self.workGroupSize,))

    #unmap group lengths buffer from device
    cl.enqueue_map_buffer(self.queue, self.groupLengthsBuffer, cl.map_flags.WRITE,
                          0, self.groupLengths.shape, self.groupLengths.dtype)
    self.groupLengths = self.groupLengthsBuffer.get_host_array(
        self.groupLengths.shape, dtype=self.groupLengths.dtype)

    #unmap solutions buffer from device
    cl.enqueue_map_buffer(self.queue, self.solutionsBuffer, cl.map_flags.WRITE,
                          0, self.solutionsFlattened.shape, self.solutions.dtype)
    self.solutions = self.solutionsBuffer.get_host_array(
        self.solutions.shape, dtype=self.solutions.dtype)

    #release buffers
    self.lengthsBuffer.release()
    self.groupLengthsBuffer.release()
    self.puzzlesBuffer.release()
    self.solutionsBuffer.release()

    #get the best solution
    i = self.groupLengths.argmin()
    bestSolution = np.array(self.solutions[i])

    #convert solution to list format used by challenge
    solution = []
    for row in range(0, puzzle['height']):
        for col in range(0, puzzle['width']):
            if bestSolution[row][col] != -1:
                s = bestSolution[row][col]
                #add to solution list
                solution.append({'X': int(col), 'Y': int(row), 'Size': int(s)})
                #clear cells in solution
                for i in range(0, s):
                    for j in range(0, s):
                        bestSolution[row + i][col + j] = -1
    return solution

def test_codes(cl_env, kernels, coord_dtype):
    ctx, cq = cl_env
    coords = np.array([[ 0.0,  1.0,  3.0],
                       [ 0.0,  1.0,  3.0],
                       [ 4.0,  1.0,  8.0],
                       [-4.0, -6.0,  3.0],
                       [-5.0,  0.0, -1.0],
                       [-5.0,  0.5, -0.5]], dtype=coord_dtype)
    coord_range = np.array([coords.min(axis=0), coords.max(axis=0)], dtype=coords.dtype)
    expected = np.array([862940378, 862940378, 1073741823,
                         20332620, 302580864, 306295426], dtype='int32')

    coords_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, len(coords) * 4 * coord_dtype.itemsize
    )
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype, is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map

    range_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY, 2 * 4 * coord_dtype.itemsize
    )
    (range_map, _) = cl.enqueue_map_buffer(
        cq, range_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coord_range), 4), coord_dtype, is_blocking=True
    )
    range_map[..., :3] = coord_range
    del range_map

    codes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, len(coords) * np.dtype('uint32').itemsize
    )

    calc_codes = kernels['calculateCodes'](
        cq, (roundUp(len(coords), 32),), None,
        codes_buf, coords_buf, range_buf, len(coords),
    )

    (codes_map, _) = cl.enqueue_map_buffer(
        cq, codes_buf, cl.map_flags.READ | cl.map_flags.WRITE,
        0, (len(coords),), np.dtype('uint32'),
        wait_for=[calc_codes], is_blocking=True
    )
    np.testing.assert_equal(codes_map, expected)
    del codes_map

def use_naive_kernel(ctx, queue, dev, A, B):
    newA, A_shape = pad(A.copy())
    newB, B_shape = pad(B.copy())
    C_shape = (A.shape[0], B.shape[1])
    newC_shape = (newA.shape[0], newB.shape[1])
    newC = np.zeros(newC_shape, dtype=np.float32)

    A_cache = np.array(newA.flatten(), dtype=np.float32)
    B_cache = np.array(newB.flatten(), dtype=np.float32)
    C_cache = np.array(newC.flatten(), dtype=np.float32)

    max_wg_size = dev.get_info(cl.device_info.MAX_WORK_GROUP_SIZE)
    kernel = naive_kernel()

    mf = cl.mem_flags
    flags = mf.READ_WRITE | mf.COPY_HOST_PTR | mf.ALLOC_HOST_PTR
    A_buffer = cl.Buffer(ctx, flags, hostbuf=A_cache)
    B_buffer = cl.Buffer(ctx, flags, hostbuf=B_cache)
    C_buffer = cl.Buffer(ctx, flags, hostbuf=C_cache)

    A_array, _ = cl.enqueue_map_buffer(queue, A_buffer, cl.map_flags.READ, 0,
                                       A_cache.shape, A_cache.dtype, "C")
    B_array, _ = cl.enqueue_map_buffer(queue, B_buffer, cl.map_flags.READ, 0,
                                       B_cache.shape, B_cache.dtype, "C")
    C_array, _ = cl.enqueue_map_buffer(queue, C_buffer, cl.map_flags.WRITE, 0,
                                       C_cache.shape, C_cache.dtype, "C")

    global_size = (round_up(C_cache.shape[0], max_wg_size), )
    local_size = None
    print("Local Size: ", local_size)
    print("Global Size: ", global_size)

    prg = cl.Program(ctx, kernel).build()
    event = prg.naiveMatMul(
        queue, global_size, local_size,
        A_array.data, B_array.data, C_array.data,
        np.int32(A_shape[1]),
        np.int32(newC.shape[1]),
        np.int32(C_shape[0]),  # row boundary
        np.int32(C_shape[1]))  # col boundary
    event.wait()

    cl.enqueue_copy(queue, C_cache, C_array)
    return C_cache.reshape(newC_shape)[:C_shape[0], :C_shape[1]]

def pairwise_pyopencl_cpu(data):
    data = np.asarray(data, order='C')
    N, D = data.shape
    try:
        lower, upper = _cache[(data.shape, data.dtype)]
    except KeyError:
        lower, upper = pairwise_pyopencl_cpu_prepare(data.shape, data.dtype)
        _cache[(data.shape, data.dtype)] = lower, upper

    data_buf = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, mf.WRITE_ONLY, N * N * data.dtype.itemsize)

    try:
        rval, _ = cl.enqueue_map_buffer(queue, dest_buf, cl.map_flags.READ,
                                        offset=0, shape=(N, N), dtype=data.dtype)
        need_copy = False
    except TypeError:  # OSX's OCL needs this?
        rval = np.empty((N, N), dtype=data.dtype)
        need_copy = True

    lower(queue, (N, 1), (1, 1), data_buf, dest_buf)
    ev = upper(queue, (4, 4), (1, 1), data_buf, dest_buf)

    if need_copy:
        cl.enqueue_copy(queue, rval, dest_buf)
    else:
        queue.finish()

    if PROFILING:
        comptimes.append(1e-9 * (ev.profile.end - ev.profile.start))
        print('computation time', min(comptimes))
    return rval

def test_count_err(cl_env, coord_dtype, collision_programs, size, ngroups, group_size):
    ctx, cq = cl_env
    collider = Collider(ctx, size, ngroups, group_size, coord_dtype, *collision_programs)

    np.random.seed(4)
    coords = np.random.random((size, 3)).astype(coord_dtype)
    radius = 1 / (size**0.5)  # Keep number of collisions under control
    radii = np.random.uniform(0, radius, len(coords)).astype(coord_dtype)
    expected = find_collisions(coords, radii)

    coords_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY,
                           len(coords) * 4 * coord_dtype.itemsize)
    (coords_map, _) = cl.enqueue_map_buffer(
        cq, coords_buf, cl.map_flags.WRITE_INVALIDATE_REGION,
        0, (len(coords), 4), coord_dtype, is_blocking=True
    )
    coords_map[..., :3] = coords
    del coords_map

    radii_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                          hostbuf=radii)
    n_collisions_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE,
                                 collider.counter_dtype.itemsize)

    with pytest.raises(ValueError):
        e = collider.get_collisions(cq, coords_buf, radii_buf, n_collisions_buf,
                                    None, len(expected))

def test_bounds_resized(cl_env, program, coord_dtype, size, old_shape, new_shape):
    ctx, cq = cl_env
    reducer = Bounds(ctx, *old_shape, coord_dtype, program=program)
    reducer.resize(*new_shape)

    if coord_dtype.shape == (3, ):
        value_dtype = dtype((coord_dtype.base, 4))
    else:
        value_dtype = coord_dtype
    values = np.random.normal(size=(size, ) + value_dtype.shape).astype(value_dtype.base)

    values_buf = cl.Buffer(ctx, cl.mem_flags.HOST_READ_ONLY | cl.mem_flags.READ_ONLY |
                           cl.mem_flags.COPY_HOST_PTR, hostbuf=values)
    out_buf = cl.Buffer(ctx, cl.mem_flags.HOST_READ_ONLY | cl.mem_flags.WRITE_ONLY,
                        2 * dtype_sizeof(coord_dtype))

    calc_reduce = reducer.reduce(cq, len(values), values_buf, out_buf)

    (out_buf, _) = cl.enqueue_map_buffer(cq, out_buf, cl.map_flags.READ,
                                         0, (2, ) + value_dtype.shape, value_dtype.base,
                                         wait_for=[calc_reduce], is_blocking=True)
    expected = np.stack([values.min(axis=0), values.max(axis=0)])
    if coord_dtype.shape == (3, ):
        out_buf = out_buf[..., :3]
        expected = expected[..., :3]
    np.testing.assert_equal(out_buf, expected)

def test_reducer(cl_env, reduce_program, size, ngroups, group_size, rounds, benchmark):
    ctx, cq = cl_env
    reducer = Bounds(ctx, ngroups, group_size, program=reduce_program)
    values = np.random.uniform(0.0, 1.0, size=(size, 4)).astype('float32')
    expected = np.array([np.min(values, axis=0), np.max(values, axis=0)])

    values_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=values)
    output_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, expected.nbytes)

    calc_scan = benchmark.pedantic(reduce, (cq, reducer, size, values_buf, output_buf),
                                   rounds=rounds, warmup_rounds=10)

    (output_map, _) = cl.enqueue_map_buffer(cq, output_buf, cl.map_flags.READ,
                                            0, expected.shape, expected.dtype,
                                            wait_for=[], is_blocking=True)
    np.testing.assert_equal(output_map[..., :3], expected[..., :3])

def test_scanner(cl_env, scan_program, size, group_size, rounds, benchmark):
    ctx, cq = cl_env
    scanner = PrefixScanner(ctx, size, group_size, program=scan_program)
    values = np.random.randint(0, 128, size=size, dtype='uint32')
    expected = np.cumsum(values)

    values_buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, values.nbytes)

    calc_scan = benchmark.pedantic(prefix_sum, (cq, scanner, values_buf),
                                   setup=partial(prefix_sum_setup, cq, values_buf, values),
                                   rounds=rounds, warmup_rounds=10)

    (values_map, _) = cl.enqueue_map_buffer(cq, values_buf, cl.map_flags.READ,
                                            0, values.shape, values.dtype,
                                            wait_for=[], is_blocking=True)
    assert values_map[0] == 0
    np.testing.assert_equal(values_map[1:], expected[:-1])

def test_scatter(cl_env, value_dtype, index_dtype):
    ctx, cq = cl_env
    size = 240
    nindices = 30
    indexer = Indexer(ctx, value_dtype, index_dtype)

    values = (np.random.uniform(0, 1000, (nindices,) + value_dtype.shape)
              .astype(value_dtype.base))
    indices = np.random.choice(size, size=nindices, replace=False).astype(index_dtype)

    values_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=values
    )
    values_out_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY, size * value_dtype.itemsize
    )
    index_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=indices
    )

    e = cl.enqueue_fill_buffer(
        cq, values_out_buf, np.full(1, 1.0, value_dtype), 0, size * value_dtype.itemsize
    )
    e = indexer.scatter(cq, nindices, values_buf, index_buf, values_out_buf,
                        wait_for=[e])

    (values_map, _) = cl.enqueue_map_buffer(
        cq, values_out_buf, cl.map_flags.READ,
        0, (size,) + value_dtype.shape, value_dtype.base,
        wait_for=[e], is_blocking=True
    )

    selection = np.zeros(size, dtype='bool')
    selection[indices] = True
    np.testing.assert_equal(values_map[indices], values)
    np.testing.assert_equal(values_map[~selection], 1.0)

def test_gather(cl_env, value_dtype, index_dtype):
    ctx, cq = cl_env
    size = 240
    nindices = 30
    indexer = Indexer(ctx, value_dtype, index_dtype)

    values = (np.random.uniform(0, 1000, (size,) + value_dtype.shape)
              .astype(value_dtype.base))
    indices = np.random.choice(size, size=nindices, replace=False).astype(index_dtype)

    values_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=values
    )
    values_out_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY, nindices * value_dtype.itemsize
    )
    index_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=indices
    )

    e = indexer.gather(cq, nindices, values_buf, index_buf, values_out_buf)

    (values_map, _) = cl.enqueue_map_buffer(
        cq, values_out_buf, cl.map_flags.READ,
        0, (nindices,) + value_dtype.shape, value_dtype.base,
        wait_for=[e], is_blocking=True
    )
    np.testing.assert_equal(values_map, values[indices])

def test_fill_internal(cl_env, kernels):
    ctx, cq = cl_env
    n = 8
    ids = np.random.permutation(n).astype('uint32')

    nodes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, (2 * n - 1) * Node.itemsize
    )
    ids_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=ids
    )

    fill_internal = kernels['fillInternal'](
        cq, (roundUp(n, 32),), None,
        nodes_buf, ids_buf, n,
    )

    (nodes_map, _) = cl.enqueue_map_buffer(
        cq, nodes_buf, cl.map_flags.READ,
        (n - 1) * Node.itemsize, n, Node,
        wait_for=[fill_internal], is_blocking=True
    )
    nodes_map.dtype = Node
    np.testing.assert_equal(nodes_map['data'][:, 0], ids)
    np.testing.assert_equal(nodes_map['right_edge'], np.arange(n))

def test_radix_sort(cl_env, radix_program, scan_program, key_dtype, size, gen,
                    group_size, rounds, benchmark):
    ctx, cq = cl_env
    sorter = RadixSorter(ctx, size, group_size, key_dtype=key_dtype,
                         program=radix_program, scan_program=scan_program)
    keys = gen(size, dtype=key_dtype)
    expected = np.sort(keys)

    keys_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, keys.nbytes)
    out_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, keys.nbytes)

    if key_dtype == np.dtype('uint64'):
        rounds //= 2
    benchmark.pedantic(radix_sort, (cq, sorter, keys_buf, out_buf),
                       setup=partial(radix_sort_setup, cq, [keys_buf], [keys]),
                       rounds=rounds, warmup_rounds=10)

    (out_map, _) = cl.enqueue_map_buffer(cq, out_buf, cl.map_flags.READ,
                                         0, keys.shape, keys.dtype,
                                         wait_for=[], is_blocking=True)
    np.testing.assert_equal(out_map, expected)

def _read_buffer(self):
    if (self._addspc == 'global'):
        buf = cl.enqueue_map_buffer(self._ctrl.clqueue, self._buffer, cl.map_flags.READ,
                                    offset=0, shape=self._value.shape,
                                    dtype=self._value.dtype, order="C", strides=None,
                                    wait_for=None, is_blocking=True)[0]
        #cl.enqueue_read_buffer(self._ctrl.clqueue, self._buffer, self._value)
        self._ctrl.clqueue.finish()
        self._value = np.array(buf)
        del buf

def test_offset_missing(cl_env, offset_dtype, value_dtype, offset_program):
    ctx, cq = cl_env
    finder = OffsetFinder(ctx, value_dtype, offset_dtype, offset_program)
    values = np.array([1, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3], dtype=value_dtype)
    expected = np.array([0, 0, 7, 7, 13, 13, 13], dtype=offset_dtype)

    values_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.HOST_NO_ACCESS |
                           cl.mem_flags.COPY_HOST_PTR, hostbuf=values)
    offset_buf = cl.Buffer(
        ctx, cl.mem_flags.WRITE_ONLY | cl.mem_flags.HOST_READ_ONLY,
        len(expected) * offset_dtype.itemsize)

    e = finder.find_offsets(cq, values_buf, len(values), offset_buf, 7)

    (offset_map, _) = cl.enqueue_map_buffer(cq, offset_buf, cl.map_flags.READ,
                                            0, len(expected), offset_dtype,
                                            wait_for=[e], is_blocking=True)
    np.testing.assert_equal(offset_map, expected)

def test_sorter(cl_env, sort_program, scan_program, key_dtype, size, group_size):
    ctx, cq = cl_env
    sorter = RadixSorter(ctx, size, group_size, key_dtype=key_dtype,
                         program=sort_program, scan_program=scan_program)
    data = np.random.randint(500, size=size, dtype=key_dtype)

    data_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                         hostbuf=data)
    out_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, data.nbytes)

    calc_sort = sorter.sort(cq, data_buf, out_buf)

    (out_map, _) = cl.enqueue_map_buffer(cq, out_buf, cl.map_flags.READ,
                                         0, data.shape, data.dtype,
                                         wait_for=[calc_sort], is_blocking=True)
    np.testing.assert_equal(out_map, np.sort(data))

def _enqueue_readout(self, buffer, host_array, range_start, range_end, wait_for=None):
    """Enqueue a readout for a buffer created with use_host_ptr.

    This encapsulates all the low level details needed to readout the given range of values.

    Args:
        buffer: the buffer on the device
        host_array (ndarray): the host side array of the given buffer
        range_start (int): the start of the range to read out (in the first dimension)
        range_end (int): the end of the range to read out (in the first dimension)
        wait_for (list of event): the list of events to wait for

    Returns:
        event; the event of the readout
    """
    nmr_problems = range_end - range_start
    return cl.enqueue_map_buffer(
        self._cl_run_context.queue, buffer, cl.map_flags.READ,
        range_start * host_array.strides[0],
        (nmr_problems, ) + host_array.shape[1:], host_array.dtype,
        order="C", wait_for=wait_for, is_blocking=False)[1]

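# A minimal, self-contained sketch of the mapped-readout pattern used by
# _enqueue_readout above. Everything here (array sizes, variable names, the
# READ_WRITE | USE_HOST_PTR buffer) is illustrative and not part of the
# original class; only standard pyopencl/numpy calls are used.
import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

host_array = np.arange(12, dtype=np.float32).reshape(4, 3)
buf = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.USE_HOST_PTR,
                hostbuf=host_array)

# Map rows 1..2 (a sub-range along the first dimension) without blocking;
# the byte offset is expressed via the host array's strides, as above.
range_start, range_end = 1, 3
mapped, event = cl.enqueue_map_buffer(
    queue, buf, cl.map_flags.READ,
    range_start * host_array.strides[0],
    (range_end - range_start,) + host_array.shape[1:], host_array.dtype,
    order="C", is_blocking=False)
event.wait()                # wait on the returned event before reading
print(mapped)               # mapped view of rows 1 and 2
mapped.base.release(queue)  # unmap when done
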
def _init_zero_copy_memory(self):
    self.logger.debug('Initializing NVIDIA zero-copy memory.')

    # Starting points host memory allocation and device copy
    memory = (self.size_of_startingpoint * self.maximum_number_starting_points *
              self.number_of_sequences * self.number_targets)
    self.pinned_starting_points_zero_copy = cl.Buffer(
        self.ctx, cl.mem_flags.ALLOC_HOST_PTR, size=memory)
    self.d_starting_points_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY,
                                                 size=memory)
    self.h_starting_points_zero_copy = cl.enqueue_map_buffer(
        self.queue, self.pinned_starting_points_zero_copy, cl.map_flags.READ,
        0, (memory, 1), dtype=numpy.byte)[0]
    mem_size = memory

    # Global directions host memory allocation and device copy
    memory = (self.length_of_x_sequences * self.number_of_sequences *
              self.length_of_y_sequences * self.number_targets)
    self.pinned_global_direction_zero_copy = cl.Buffer(
        self.ctx, cl.mem_flags.ALLOC_HOST_PTR, size=memory)
    self.d_global_direction_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.WRITE_ONLY,
                                                  size=memory)
    self.h_global_direction_zero_copy = cl.enqueue_map_buffer(
        self.queue, self.pinned_global_direction_zero_copy, cl.map_flags.READ,
        0, (memory, 1), dtype=numpy.byte)[0]
    mem_size += memory

    # Maximum zero copy memory allocation and device copy
    memory = (self.number_of_sequences * self.number_of_targets *
              SmithWaterman.float_size)
    # self.pinned_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.ALLOC_HOST_PTR, size=memory)
    # self.d_max_possible_score_zero_copy = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY, size=memory)
    # self.h_max_possible_score_zero_copy = cl.enqueue_map_buffer(self.queue, self.pinned_max_possible_score_zero_copy, cl.map_flags.WRITE, 0,
    #                                                             (self.number_of_sequences * self.number_of_targets, 1), dtype=numpy.float32)[0]
    mem_size += memory

    # Zero copy buffers are allocated twice in NVIDIA
    return 2 * mem_size

def kernel_test():
    for i in range(0, 100):
        program.array1d_add(queue, (N, ), None, buffer_a, buffer_b, buffer_c)

        array_a, event_a = cl.enqueue_map_buffer(queue, buffer_a, cl.map_flags.WRITE,
                                                 0, shape=(N,), dtype=np.float32)
        array_c, event_c = cl.enqueue_map_buffer(queue, buffer_c, cl.map_flags.READ,
                                                 0, shape=(N,), dtype=np.float32)

        with array_a.base, array_c.base:
            for i in range(0, N):
                array_a[i] = array_c[i]

def test_problem_codes(cl_env, kernels, coord_dtype):
    from .test_collision_py import find_collisions
    ctx, cq = cl_env
    codes = np.array([0b00000000000000000000000000000000,
                      0b00000000000000000000000000000000,
                      0b00000110110000110100000100000010,
                      0b00001001001001001001001001001001,
                      0b00001001001001001001001001001001,
                      0b00010010010010010010010010010010,
                      0b00010010010010010010010010010010,
                      0b00010010011010010010011011011010,
                      0b00011001001011001001011001001011,
                      0b00011011011011011011011011011011,
                      0b00100100010000100010110100010110,
                      0b00100100100100100100100100100100,
                      0b00100100100101101101100101100100,
                      0b00101001101001101101101101101001,
                      0b00101101101101101101101101101101,
                      0b00110110110110110110110110110110,  # This node had no parent
                      0b00110110110110110110110110110110,
                      0b00110110110110110110110110110110,
                      0b00111111111111111111111111111111,
                      0b00111111111111111111111111111111,
                      0b00111111111111111111111111111111], dtype='uint32')
    ids = np.arange(len(codes), dtype='uint32')
    n_nodes = 2 * len(codes) - 1

    codes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=codes
    )
    ids_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=ids
    )
    nodes_buf = cl.Buffer(
        ctx, cl.mem_flags.READ_WRITE, n_nodes * Node.itemsize
    )

    fill_internal = kernels['fillInternal'](
        cq, (roundUp(len(codes), 32),), None,
        nodes_buf, ids_buf, len(codes),
    )
    generate_bvh = kernels['generateBVH'](
        cq, (roundUp(len(codes) - 1, 32),), None,
        codes_buf, nodes_buf, len(codes),
        wait_for=[fill_internal]
    )

    (nodes_map, _) = cl.enqueue_map_buffer(
        cq, nodes_buf, cl.map_flags.READ,
        0, n_nodes, Node,
        wait_for=[generate_bvh], is_blocking=True
    )
    nodes_map.dtype = Node
    assert set(nodes_map['parent'][1:]) == set(range(len(codes) - 1))

def map_write(self) -> np.ndarray:
    if self.mapping is None:
        self.mapping = cl.enqueue_map_buffer(self.queue, self.buffer,
                                             cl.map_flags.WRITE_INVALIDATE_REGION,
                                             0, self.shape, self.dtype)
    self.map_count += 1
    return self.mapping[0]

def map_read(self) -> np.ndarray:
    if self.mapping is None:
        self.mapping = cl.enqueue_map_buffer(self.queue, self.buffer,
                                             cl.map_flags.READ,
                                             0, self.shape, self.dtype)
    self.map_count += 1
    return self.mapping[0]

def _get_direction_byte_array(self):
    self.h_global_direction_zero_copy = cl.enqueue_map_buffer(
        self.queue, self.d_global_direction_zero_copy, cl.map_flags.READ, 0,
        (self.number_of_sequences, self.number_targets,
         self.x_div_shared_x, self.y_div_shared_y,
         self.shared_x, self.shared_y),
        dtype=numpy.byte)[0]
    return self.h_global_direction_zero_copy

def get_mem_map(self):
    """read buffer"""
    if (self._addspc == '__global'):
        buf = cl.enqueue_map_buffer(self._solverobj.clqueue, self._buffer,
                                    cl.map_flags.READ, offset=0,
                                    shape=self._array.shape, dtype=self._array.dtype,
                                    order="C", strides=None, wait_for=None,
                                    is_blocking=True)[0]
        self._array = np.array(buf)
        del buf
        self._solverobj.clqueue.finish()

    #for local vars return None
    if (self._addspc == '__local'):
        return None

    #return the array
    return self._array

def map(self, map_flags, offset=None, shape=None, wait_for=None):
    """
    Context manager that maps the buffer data as a numpy array.

    `wait_for` can be either None or list of opencl.Event.
    """
    if offset is None:
        offset = 0
    if shape is None:
        shape = self.shape
    array, _event = pyopencl.enqueue_map_buffer(
        self.queue, self, map_flags, offset, shape, self.dtype,
        wait_for=wait_for, is_blocking=True,
    )
    with array.base:
        yield array

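# Hedged usage sketch for the map() context manager above. It assumes the
# method is wrapped with contextlib.contextmanager and lives on a buffer
# subclass carrying `queue`, `shape`, and `dtype` attributes; `scaled_buffer`
# and the scaling use case are illustrative, not from the original code.
import numpy as np
import pyopencl as cl

def scale_in_place(scaled_buffer, factor):
    # Map for read/write, mutate the host-visible view, then let the
    # context manager unmap (by exiting `with array.base:`) on exit.
    with scaled_buffer.map(cl.map_flags.READ | cl.map_flags.WRITE) as array:
        array *= np.asarray(factor, dtype=array.dtype)
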
# Enqueue command to copy from buffers to host memory
# Store data transfer event (return value)
prof_event = cl.enqueue_copy(queue, dest=c, src=c_buff, is_blocking=True)
read_time += prof_event.profile.end - prof_event.profile.start

# Execute the kernel repeatedly using enqueue_map_buffer
map_time = 0.0
for i in range(NUM_ITERATIONS):
    # __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
    # Store kernel execution event (return value)
    kernel_event = kernel(queue, global_size, local_size, c_buff, np.int32(NUM_VECTORS))

    # Enqueue command to map from buffer two to host memory
    (result_array, prof_event) = cl.enqueue_map_buffer(queue, buf=c_buff,
                                                       flags=cl.map_flags.READ,
                                                       offset=0,
                                                       shape=(NUM_VECTORS,),
                                                       dtype=cl.array.vec.char16)
    map_time += prof_event.profile.end - prof_event.profile.start

    # Release the mapping (is this necessary?)
    result_array.base.release(queue)

# Print averaged results
print('Average read time (ms): {}'.format(read_time / (NUM_ITERATIONS * 1000)))
print('Average map time (ms): {}'.format(map_time / (NUM_ITERATIONS * 1000)))

# Create buffers
flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
buffer_one = cl.Buffer(context, flags, hostbuf=data_one)
buffer_two = cl.Buffer(context, flags, hostbuf=data_two)

# Set buffers as arguments to the kernel
# The arguments can also be specified by calling kernel(....) directly instead
kernel = prog.blank  # Note: Every call like this produces a new object
kernel.set_arg(0, buffer_one)
kernel.set_arg(1, buffer_two)

# Enqueue kernel (with arguments)
n_globals = data_one.shape
n_locals = None
cl.enqueue_nd_range_kernel(queue, kernel, n_globals, n_locals)

# Enqueue command to copy from buffer one to buffer two
cl.enqueue_copy(queue, dest=buffer_two, src=buffer_one)

# Enqueue command to map from buffer two to host memory
# enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order="C", strides=None, wait_for=None, is_blocking=True)
(result_array, _) = cl.enqueue_map_buffer(
    queue, buf=buffer_two, flags=cl.map_flags.READ,
    offset=0, shape=(100,), dtype=np.float32
)

print("\nSource array:")
print(data_two)
print("\nAfter copy back:")
print(result_array)

#!/usr/bin/env python
import pyopencl as cl
import numpy as np
import numpy.linalg as la

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

#ary = np.arange(3*4*5).reshape(3, 4, 5, order="C").astype(np.float32)
ary = np.arange(3*4*5).reshape(3, 4, 5, order="F").astype(np.float32)
#ary = np.arange(3*4*5).astype(np.float32)

mf = cl.mem_flags
flags = mf.READ_WRITE | mf.COPY_HOST_PTR | mf.ALLOC_HOST_PTR
buf = cl.Buffer(ctx, flags, hostbuf=ary)
queue.finish()

ar2 = np.empty_like(ary)
cl.enqueue_copy(queue, ar2, buf)
print(la.norm(ary - ar2), ary.strides, ar2.strides)

#ar3, evt = cl.enqueue_map_buffer(queue, buf, cl.map_flags.READ, 0, ary.shape, ary.dtype, "C")
ar3, evt = cl.enqueue_map_buffer(queue, buf, cl.map_flags.READ, 0, ary.shape, ary.dtype, "F")
print(la.norm(ary - ar3), ary.strides, ar3.strides)