def main(): # setup OpenCL platforms = cl.get_platforms( ) # a platform corresponds to a driver (e.g. AMD, NVidia, Intel) platform = platforms[0] # take first platform devices = platform.get_devices( cl.device_type.GPU) # get GPU devices of selected platform device = devices[0] # take first GPU context = cl.Context([device]) # put selected GPU into context object queue = cl.CommandQueue( context, device) # create command queue for selected GPU and context # prepare data imgIn = cv2.imread('photographer.png', cv2.IMREAD_GRAYSCALE) rotation_angle = np.pi / 4 cos_theta = np.cos(rotation_angle) sin_theta = np.sin(rotation_angle) # setup sampler sampler = cl.Sampler(context, True, cl.addressing_mode.REPEAT, cl.filter_mode.NEAREST) # get shape of input image, allocate memory for output to which result can be copied to shape = imgIn.T.shape imgOut = np.empty_like(imgIn) # create image buffers which hold images for OpenCL imgInBuf = cl.image_from_array(context, ary=imgIn, mode="r", norm_int=True, num_channels=1) imgOutBuf = cl.image_from_array(context, ary=imgOut, mode="w", norm_int=True, num_channels=1) # load, compile and execute OpenCL program program = cl.Program(context, open('kernel.cl').read()).build() program.img_rotate(queue, shape, None, sampler, imgInBuf, imgOutBuf, np.double(sin_theta), np.double(cos_theta)) cl.enqueue_copy( queue, imgOut, imgOutBuf, origin=(0, 0), region=shape, is_blocking=True ) # wait until finished copying resulting image back from GPU to CPU # write output image cv2.imwrite('photographer_rotated.png', imgOut) # show images fig, ax = plt.subplots(1, 2) ax[0].imshow(imgIn, cmap='gray') ax[1].imshow(imgOut, cmap='gray') plt.show()
def __init__(self, queue, discr, dtype, allocator): context = queue.context self.discr = discr import pyopencl as cl self.allocator = allocator dtype4 = cl.array.vec.types[np.dtype(dtype), 4] l = discr.ldis drdsdt_unvec = np.zeros((l.Np, l.Np, 4), dtype) for i, mat in enumerate([l.Dr, l.Ds, l.Dt]): drdsdt_unvec[:, :, i] = mat self.drdsdt = cl.array.to_device( queue, drdsdt_unvec.view(dtype=dtype4)[:, :, 0].copy(order="F")) self.drdsdt_img = cl.image_from_array( context, drdsdt_unvec.view(dtype=dtype4)[:, :, 0]) drst_dx_unvec = np.zeros((discr.K, 4), dtype) drst_dy_unvec = np.zeros((discr.K, 4), dtype) drst_dz_unvec = np.zeros((discr.K, 4), dtype) for i in range(3): drst_dx_unvec[:, i] = discr.drst_dxyz[i, 0][:, 0] drst_dy_unvec[:, i] = discr.drst_dxyz[i, 1][:, 0] drst_dz_unvec[:, i] = discr.drst_dxyz[i, 2][:, 0] self.drst_dx = cl.array.to_device( queue, drst_dx_unvec.view(dtype=dtype4)[:, 0]) self.drst_dy = cl.array.to_device( queue, drst_dy_unvec.view(dtype=dtype4)[:, 0]) self.drst_dz = cl.array.to_device( queue, drst_dz_unvec.view(dtype=dtype4)[:, 0]) self.vmapP = cl.array.to_device( queue, discr.vmapP.astype(np.int32).copy().reshape(discr.K, -1)) self.vmapM = cl.array.to_device( queue, discr.vmapM.astype(np.int32).copy().reshape(discr.K, -1)) self.nx = cl.array.to_device(queue, discr.nx.astype(dtype)) self.ny = cl.array.to_device(queue, discr.ny.astype(dtype)) self.nz = cl.array.to_device(queue, discr.nz.astype(dtype)) self.Fscale = cl.array.to_device(queue, discr.Fscale.astype(dtype)) self.bc = cl.array.to_device(queue, discr.bc.astype(dtype)) self.LIFT = cl.array.to_device(queue, l.LIFT.copy(order="F").astype(dtype)) self.LIFT_img = cl.image_from_array(context, l.LIFT.astype(dtype)) self.volume_events = [] self.surface_events = []
def __call__(self, ctx, src, kernel1, kernel2): self.build(ctx) kernel1 = np.array(kernel1, copy=False, dtype=np.float32) kernel2 = np.array(kernel2, copy=False, dtype=np.float32) halflen = kernel1.shape[0] / 2 kernel1_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernel1) kernel2_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernel2) inshape = src.shape src = np.asarray(src) dims = len(src.shape) assert dims > 1 if dims == 2: assert src.shape[1] <= 4 src = src.reshape(src.shape[0], 1, src.shape[1]) else: assert src.shape[2] <= 4 src_padded = np.zeros((src.shape[0] + 2 * halflen, 1, 4), dtype=src.dtype) src_padded[ halflen:-halflen, :, :src.shape[2]] = src[:, :, :src.shape[2]] src_padded[:halflen, :, :] = src_padded[halflen:halflen * 2, :, :][::-1, ...] src_padded[-halflen:, :, :] = src_padded[-halflen * 2:-halflen, :, :][::-1, ...] norm = np.issubdtype(src.dtype, np.integer) src_buf = cl.image_from_array(self.ctx, src_padded, 4, norm_int=norm) dest = np.zeros((src.shape[0], src.shape[1], 4), dtype=src.dtype) dest_buf = cl.image_from_array(self.ctx, dest, 4, mode="w", norm_int=norm) queue = cl.CommandQueue(self.ctx) self.prg.convolve1d2_naive(queue, (dest.shape[1], dest.shape[0]), None, src_buf, dest_buf, kernel1_buf, kernel2_buf, np.int32(halflen)) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src.shape[1], src.shape[0])).wait() dest = dest[:, :, :src.shape[2]].reshape(inshape) src_buf.release() dest_buf.release() kernel1_buf.release() kernel2_buf.release() return dest
def __call__(self, ctx, src): self.build(ctx) src = np.asarray(src) src2 = np.zeros((src.shape[0], src.shape[1], 4), dtype=src.dtype) src2[:, :, 0:src.shape[2]] = src[:, :, 0:src.shape[2]] norm = np.issubdtype(src2.dtype, np.integer) src2_buf = cl.image_from_array(self.ctx, src2, 4, norm_int=norm) dest_buf = cl.image_from_array(self.ctx, src2, 4, mode="w", norm_int=norm) dest = np.empty_like(src2) queue = cl.CommandQueue(self.ctx) self.prg.YCrCb2RGB(queue, (src2.shape[1], src2.shape[0]), None, src2_buf, dest_buf) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src2.shape[1], src2.shape[0])).wait() dest = dest[:, :, 0:src.shape[2]].copy() src2_buf.release() dest_buf.release() return dest
def mask(self, mask): BaseCorrelator.mask.fset(self, mask) self._norm_factor = np.float32(self._norm_factor) self._rmax = np.int32(self._rmax) self._gtemplate = cl.image_from_array( self._queue.context, self._template.astype(np.float32)) self._gmask = cl.image_from_array(self._queue.context, self._mask.astype(np.float32))
def main(): # setup OpenCL platforms = cl.get_platforms() # a platform corresponds to a driver (e.g. AMD) platform = platforms[0] # take first platform devices = platform.get_devices(cl.device_type.GPU) # get GPU devices of selected platform device = devices[0] # take first GPU context = cl.Context([device]) # put selected GPU into context object queue = cl.CommandQueue(context, device) # create command queue for selected GPU and context # read image and setup convolution kernel image imgIn = cv2.imread('photographer.png', cv2.IMREAD_GRAYSCALE) tmp = [[1, 0, -1], [2, 0, -2], [1, 0, -1]] kernelImage = np.array(tmp, dtype=np.float32) # dtype=np.float32, because defaults to dtype=np.float64, which is unsupported by OpenCL images # tmp = [[1, 2, 1], [2, 4, -2], [1, 2, 1]] # kernelImage = 1/16 * np.array(tmp, dtype=np.float32) # dtype=np.float32, because defaults to dtype=np.float64, which is unsupported by OpenCL images # tmp = [[1, 4, 7, 4, 1], # [4,16,26,16, 4], # [7,26,41,26, 7], # [4,16,26,16, 4], # [1, 4, 7, 4, 1]] # kernelImage = 1/273 * np.array(tmp, dtype=np.float32) # dtype=np.float32, because defaults to dtype=np.float64, which is unsupported by OpenCL images # get shape of input image, allocate memory for output to which result can be copied to shape = imgIn.T.shape imgOut = np.empty_like(imgIn) # create image buffers which hold images for OpenCL imgInBuf = cl.image_from_array(context, ary=imgIn, mode="r", norm_int=True, num_channels=1) kernelImageBuf = cl.image_from_array(context, ary=kernelImage, mode="r", norm_int=False, num_channels=1) imgOutBuf = cl.image_from_array(context, ary=imgOut, mode="w", norm_int=True, num_channels=1) # load and compile OpenCL program program = cl.Program(context, open('convolution_kernel_code.cl').read()).build() # execute kernel with same global shape as input image program.custom_convolution_2d(queue, shape, None, imgInBuf, kernelImageBuf, imgOutBuf) # copy back output buffer cl.enqueue_copy(queue, imgOut, imgOutBuf, origin=(0, 0), region=shape, is_blocking=True) # wait until finished copying resulting image back from GPU to CPU # save imgOut cv2.imwrite('photographer_convolved.png', imgOut) # show images fig, ax = plt.subplots(1, 2) ax[0].imshow(imgIn, cmap='gray') ax[1].imshow(imgOut, cmap='gray') plt.show()
def __init__(self, queue, discr, dtype, allocator): context = queue.context self.discr = discr import pyopencl as cl self.allocator = allocator dtype4 = cl.array.vec.types[np.dtype(dtype), 4] l = discr.ldis drdsdt_unvec = np.zeros((l.Np, l.Np, 4), dtype) for i, mat in enumerate([l.Dr, l.Ds, l.Dt]): drdsdt_unvec[:, :, i] = mat self.drdsdt = cl.array.to_device( queue, drdsdt_unvec .view(dtype=dtype4)[:, :, 0].copy(order="F")) self.drdsdt_img = cl.image_from_array(context, drdsdt_unvec.view(dtype=dtype4)[:, :, 0]) drst_dx_unvec = np.zeros((discr.K, 4), dtype) drst_dy_unvec = np.zeros((discr.K, 4), dtype) drst_dz_unvec = np.zeros((discr.K, 4), dtype) for i in range(3): drst_dx_unvec[:, i] = discr.drst_dxyz[i, 0][:,0] drst_dy_unvec[:, i] = discr.drst_dxyz[i, 1][:,0] drst_dz_unvec[:, i] = discr.drst_dxyz[i, 2][:,0] self.drst_dx = cl.array.to_device(queue, drst_dx_unvec.view(dtype=dtype4)[:, 0]) self.drst_dy = cl.array.to_device(queue, drst_dy_unvec.view(dtype=dtype4)[:, 0]) self.drst_dz = cl.array.to_device(queue, drst_dz_unvec.view(dtype=dtype4)[:, 0]) self.vmapP = cl.array.to_device(queue, discr.vmapP.astype(np.int32).copy().reshape(discr.K, -1)) self.vmapM = cl.array.to_device(queue, discr.vmapM.astype(np.int32).copy().reshape(discr.K, -1)) self.nx = cl.array.to_device(queue, discr.nx.astype(dtype)) self.ny = cl.array.to_device(queue, discr.ny.astype(dtype)) self.nz = cl.array.to_device(queue, discr.nz.astype(dtype)) self.Fscale = cl.array.to_device(queue, discr.Fscale.astype(dtype)) self.bc = cl.array.to_device(queue, discr.bc.astype(dtype)) self.LIFT = cl.array.to_device(queue, l.LIFT.copy(order="F").astype(dtype)) self.LIFT_img = cl.image_from_array(context, l.LIFT.astype(dtype)) self.volume_events = [] self.surface_events = []
def prepare_param_tables(self): filename = os.path.join(os.path.split(__file__)[0], self.tabledatafile) d = np.load(filename) self.phases = d['coeffs_phase'] Tx = self.preprocess_params(d['coeffs_x']) Ty = self.preprocess_params(d['coeffs_y']) if self.simple: Tx[:] = np.array([0., 2.65, 2.65, 1 ])[:,None,None] #x0, xi_p, xi_m, n 2.6/1 bzw. 3./1.1 Ty[:] = np.array([0, 2., 2., 1.])[:,None,None] self.cl_params_x = cl.image_from_array(self.context, np.ascontiguousarray(Tx.T, np.float32), num_channels=4) self.cl_params_y = cl.image_from_array(self.context, np.ascontiguousarray(Ty.T, np.float32), num_channels=4) self.cl_table_E = cl.image_from_array(self.context, np.ascontiguousarray(self.calc_E().T, np.float32), num_channels=1)
def _upload_image(self, image): assert image.max() <= 1.0 # Check the number of channels in the image if image.ndim == 2: num_channels = 1 else: if sys.platform.startswith('win') and 'geforce' in self.ctx.devices[0].name.lower() and image.shape[2] == 3: # This is a hack for Windows/nVidia, as we believe and found so # far for various GeFoce cards that the nvidia OpenCL # implementation sucks. Reporting an out-of-resources error when # trying to upload an RGB three channel image to the GPU # Quite counterintuitively adding an unneeded fourth channel # makes the out-of-resources error go away. FIXME if you can. tmp = image image = np.ones((tmp.shape[0], tmp.shape[1], 4)) num_channels = 4 image[:, :, :3] = tmp[:] else: num_channels = image.shape[2] # Tell OpenCL to copy the image into device memory image_gpu = cl.image_from_array(self.ctx, image.astype(np.float32), num_channels=num_channels, mode="r") return image_gpu
def find_starburst_ray_boundaries(self, im, seed_point, cutoff_index, threshold, n_rays, n_samples, ray_step): if self.cached_shape != im.shape: self.setup_device(im.shape) #(im_, _, _) = self.sobel3x3_separable(im.astype(np.float32)) im_ = im.astype(np.float32) self.clIm2D = cl.image_from_array(self.ctx, im_, num_channels=1) # # load im to memory # cl.enqueue_copy(self.q, self.clIm2D, clIm.data, offset=0, # origin=(0, 0), region=clIm.shape) seed_point_ = (seed_point[1], seed_point[0]) # sample the rays, computing the "ray-wise" gradient and mean + std along the # way. We'll pull back the mean and running stds and compute the thresholds # on the CPU sampled = self.cl_find_ray_boundaries(self.clIm2D, n_rays, n_samples, ray_step, seed_point_, cutoff_index, threshold) # run through the resampled values to find cutoffs # pull back the cutoff and return them return sampled
def test_rotate_image3d_1(self): shape = (8, 6, 5) np_image = np.zeros(shape, dtype=np.float32) np_image[0, 0, 0] = 1 np_image[0, 0, 1] = 1 np_image[0, 0, 2] = 1 # 90 degree rotation around z-axis rotmat = [[0, 1, 0], [1, 0, 0], [0, 0, 1]] np_out = np.zeros_like(np_image) expected = np.zeros_like(np_image) expected[0, 0, 0] = 1 expected[0, 1, 0] = 1 expected[0, 2, 0] = 1 cl_image = cl.image_from_array(self.queue.context, np_image) cl_out = cl_array.to_device(self.queue, np_out) cl_sampler = cl.Sampler(self.queue.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) self.kernels.rotate_image3d(self.queue, cl_sampler, cl_image, rotmat, cl_out) self.assertTrue(np.allclose(expected, cl_out.get()))
def __call__(self, ctx, src): self.build(ctx) src = np.asarray(src) src2 = np.zeros((src.shape[0], src.shape[1], 4),dtype=src.dtype) src2[:,:,0:src.shape[2]] = src[:,:,0:src.shape[2]] norm = np.issubdtype(src2.dtype, np.integer) src2_buf = cl.image_from_array(self.ctx, src2, 4, norm_int=norm) dest_buf = cl.image_from_array(self.ctx, src2, 4, mode="w", norm_int=norm) dest = np.empty_like(src2) queue = cl.CommandQueue(self.ctx) self.prg.YCrCb2RGB(queue, (src2.shape[1], src2.shape[0]), None, src2_buf, dest_buf) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src2.shape[1], src2.shape[0])).wait() dest = dest[:,:,0:src.shape[2]].copy() src2_buf.release() dest_buf.release() return dest
def process_view(self, intrinsics, rot, tvec, mask): """Process a new view. Parameters ---------- intrinsics: list [f_x, f_y, c_x, c_y] rot: list of list rotation matrix of the camera pose tvec: list translation vector of the camera pose mask: np.ndarray mask array (or float array if type is averaging) """ if self.dtype == np.float32 and mask.dtype == np.uint8: mask = mask / 255 intrinsics_h = np.ascontiguousarray(intrinsics).astype(np.float32) rot_h = np.ascontiguousarray(rot).astype(np.float32) tvec_h = np.ascontiguousarray(tvec).astype(np.float32) mask_h = np.ascontiguousarray(mask).astype(self.dtype) mask_d = cl.image_from_array(ctx, mask_h, 1) cl.enqueue_copy(queue, self.intrinsics_d, intrinsics_h) cl.enqueue_copy(queue, self.rot_d, rot_h) cl.enqueue_copy(queue, self.tvec_d, tvec_h) self.kernel(queue, [np.prod(self.shape)], None, mask_d, self.values_d, self.intrinsics_d, self.rot_d, self.tvec_d, self.volinfo_d, self.shape_d) queue.finish()
def setUpClass(self): self.queue = get_queue() self.shape = (4, 5, 6) self.size = 4 * 5 * 6 self.values = { 'shape_x': self.shape[2], 'shape_y': self.shape[1], 'shape_z': self.shape[0], 'llength': 2, } self.k = CLKernels(self.queue.context, self.values) self.grid = np.zeros(self.shape, dtype=np.float64) self.grid[0, 0, 0] = 1 self.grid[0, 0, 1] = 1 self.grid[0, 1, 1] = 1 self.grid[0, 0, 2] = 1 self.grid[0, 0, -1] = 1 self.grid[-1, 0, 0] = 1 self.cl_image = cl.image_from_array(self.queue.context, self.grid.astype(np.float32)) self.sampler_linear = cl.Sampler(self.queue.context, True, cl.addressing_mode.REPEAT, cl.filter_mode.LINEAR) self.sampler_nearest = cl.Sampler(self.queue.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) self.out = np.zeros(self.shape, dtype=np.float64) self.cl_out = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
def from_array(cls, arr, *args, **kwargs): ctx = get_device().context if not arr.ndim in [2, 3, 4]: raise ValueError( "dimension of array wrong, should be 1...4 but is %s" % arr.ndim) elif arr.ndim == 4: num_channels = arr.shape[-1] else: num_channels = 1 if arr.dtype.type == np.complex64: num_channels = 2 res = OCLImage.empty(arr.shape, dtype=np.float32, num_channels=num_channels) res.write_array(arr) res.dtype = np.float32 else: res = cl.image_from_array(ctx, prepare(arr), num_channels=num_channels, *args, **kwargs) res.dtype = arr.dtype res.num_channels = num_channels return res
def mask(self, mask): BaseCorrelator.mask.fset(self, mask) self._norm_factor = np.float32(self._norm_factor) self._rmax = np.int32(self._rmax) self._gtemplate = cl.image_from_array( self._ctx, self._template.astype(np.float32) ) self._gmask = cl.image_from_array( self._ctx, self._mask.astype(np.float32) ) max_items = self._queue.device.max_compute_units * 32 * 16 gws = [0] * 3 gws[0] = min(2 * self._rmax, max_items) gws[1] = min(max_items // gws[0], 2 * self._rmax) gws[2] = min(max(max_items // (gws[0] * gws[0]), 1), 2 * self._rmax) self._gws = tuple(gws)
def test_clashvol(self): NROT = np.random.randint(self.rotations.shape[0] + 1) rotmat = self.rotations[NROT] cpu_lsurf = np.zeros_like(self.im_lsurf.array) disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf) cpu_clashvol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array), s=self.shape) gpu_rcore = cl_array.to_device(self.queue, np.asarray(self.rcore.array, dtype=np.float32)) gpu_im_lsurf = cl.image_from_array(self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32)) gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center) gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore) self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf) self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore, gpu_ft_clashvol) self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol) self.assertTrue(np.allclose(cpu_clashvol, gpu_clashvol.get(), atol=0.8))
def test_rotate_template_mask(self): shape = (5, 5, 5) template = np.zeros(shape, dtype=np.float32) template[2, 2, 1:4] = 1 template[2, 1:4, 2] = 1 rotmat = np.asarray([1, 0, 0, 0, 1, 0, 0, 0, 1] + [0] * 7, dtype=np.float32) self.queue.finish() cl_template = cl.image_from_array(self.queue.context, template) cl_out = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) center = np.asarray([2, 2, 2, 0], dtype=np.float32) shape = np.asarray([5, 5, 5, 125], dtype=np.int32) self.k.rotate_template(self.queue, (125, ), None, self.s_linear, cl_template, rotmat, cl_out.data, center, shape) answer = np.zeros((5, 5, 5), dtype=np.float32) answer[0, 0, :2] = 1 answer[0, 0, -1] = 1 answer[0, :2, 0] = 1 answer[0, -1, 0] = 1 self.assertTrue(np.allclose(cl_out.get(), answer))
def get_color(self, img): # OpenCL only supports RGBA images, not RGB, so add an alpha channel src = np.array(img.convert('RGBA')) src.shape = w, h, _ = img.width, img.height, 4 w = int(w * self.SCALE_FACTOR) h = int(h * self.SCALE_FACTOR) local_size = self.max_work_item_sizes global_size = (math.ceil(h / local_size[0]), math.ceil(w / local_size[1])) total_work_groups = global_size[0] * global_size[1] mf = cl.mem_flags src_buf = cl.image_from_array(self.ctx, src, 4, norm_int=True) out = np.zeros(4 * total_work_groups, dtype=np.int32) out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=out.itemsize * 4 * total_work_groups) kernel = self.prg.get_color kernel.set_scalar_arg_dtypes([None, None, np.uint32, np.uint32]) kernel(self.queue, global_size, local_size, src_buf, out_buf, w, h, g_times_l=True) cl.enqueue_copy(self.queue, dest=out, src=out_buf, is_blocking=True) # this sum takes .1 ms at 3440x1440, don't even bother OpenCL-ifying it resized_out = np.reshape(out, (out.shape[0] / 4, 4)) summed_out = np.sum(resized_out, axis=0) avg_out = (summed_out / summed_out[3])[:3].astype(int) return avg_out
def __call__(self, ctx, src, kernel): self.build(ctx) kernel = np.array(kernel, copy=False, dtype=np.float32) halflen = kernel.shape[0] / 2 kernelf = kernel.flatten() kernelf_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernelf) src_padded = np.zeros((src.shape[0]+2*halflen, src.shape[1]+2*halflen, 4), dtype=src.dtype) src_padded[halflen:-halflen,halflen:-halflen,:src.shape[2]] = src[:,:,:src.shape[2]] src_padded[halflen:-halflen,:halflen,:src.shape[2]] = src_padded[halflen:-halflen,halflen:halflen*2,:src.shape[2]][:,::-1] src_padded[halflen:-halflen,-halflen:,:src.shape[2]] = src_padded[halflen:-halflen,-halflen*2:-halflen,:src.shape[2]][:,::-1] src_padded[:halflen,:,:src.shape[2]] = src_padded[halflen:halflen*2,:,:src.shape[2]][::-1,...] src_padded[-halflen:,:,:src.shape[2]] = src_padded[-halflen*2:-halflen,:,:src.shape[2]][::-1,...] norm = np.issubdtype(src.dtype, np.integer) src_buf = cl.image_from_array(self.ctx, src_padded, 4, norm_int=norm) dest = np.zeros((src.shape[0], src.shape[1], 4), dtype=src.dtype) dest_buf = init_image(self.ctx, dest, 4, mode="w", norm_int=norm) queue = cl.CommandQueue(self.ctx) self.prg.convolve2d_naive(queue, (dest.shape[1], dest.shape[0]), None, src_buf, dest_buf, kernelf_buf, np.int32(kernel.shape[0])) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src.shape[1], src.shape[0])).wait() # src_buf.release() # dest_buf.release() # kernelf_buf.release() return dest[:,:,:src.shape[2]]
def _gpu_init(self): """Method to initialize all the data for GPU-accelerate search""" self.gpu_data = {} g = self.gpu_data d = self.data q = self.queue # move data to the GPU. All should be float32, as these is the native # lenght for GPUs g['rcore'] = cl_array.to_device(q, float32array(d['rcore'].array)) g['rsurf'] = cl_array.to_device(q, float32array(d['rsurf'].array)) # Make the scanning chain object an Image, as this is faster to rotate g['im_lsurf'] = cl.image_from_array(q.context, float32array(d['lsurf'].array)) g['sampler'] = cl.Sampler(q.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) if self.distance_restraints: g['restraints'] = cl_array.to_device(q, float32array(d['restraints'])) # Allocate arrays on the GPU g['lsurf'] = cl_array.zeros_like(g['rcore']) g['clashvol'] = cl_array.zeros_like(g['rcore']) g['intervol'] = cl_array.zeros_like(g['rcore']) g['interspace'] = cl_array.zeros(q, d['shape'], dtype=np.int32) g['restspace'] = cl_array.zeros_like(g['interspace']) g['access_interspace'] = cl_array.zeros_like(g['interspace']) g['best_access_interspace'] = cl_array.zeros_like(g['interspace']) # arrays for counting # Reductions are typically tedious on GPU, and we need to define the # workgroupsize to allocate the correct amount of data WORKGROUPSIZE = 32 nsubhists = int(np.ceil(g['rcore'].size/WORKGROUPSIZE)) g['subhists'] = cl_array.zeros(q, (nsubhists, d['nrestraints'] + 1), dtype=np.float32) g['viol_counter'] = cl_array.zeros(q, (nsubhists, d['nrestraints'], d['nrestraints']), dtype=np.float32) # complex arrays g['ft_shape'] = list(d['shape']) g['ft_shape'][0] = d['shape'][0]//2 + 1 g['ft_rcore'] = cl_array.zeros(q, g['ft_shape'], dtype=np.complex64) g['ft_rsurf'] = cl_array.zeros_like(g['ft_rcore']) g['ft_lsurf'] = cl_array.zeros_like(g['ft_rcore']) g['ft_clashvol'] = cl_array.zeros_like(g['ft_rcore']) g['ft_intervol'] = cl_array.zeros_like(g['ft_rcore']) # other miscellanious data g['nrot'] = d['nrot'] g['max_clash'] = d['max_clash'] g['min_interaction'] = d['min_interaction'] # kernels g['k'] = Kernels(q.context) g['k'].rfftn = pyclfft.RFFTn(q.context, d['shape']) g['k'].irfftn = pyclfft.iRFFTn(q.context, d['shape']) # initial calculations g['k'].rfftn(q, g['rcore'], g['ft_rcore']) g['k'].rfftn(q, g['rsurf'], g['ft_rsurf'])
def frame_preprocessing(self, lower_bound, upper_bound): # *Load and convert source image frame = np.array(self.frame) # *Set properties h = frame.shape[0] w = frame.shape[1] mask = np.zeros((1, 2), cl.cltypes.float4) mask[0, 0] = (lower_bound) # Lower bound mask[0, 1] = (upper_bound) # Upper bound # *Buffors frame_buf = cl.image_from_array(GPUSetup.context, frame, 4) fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8) dest_buf = cl.Image(GPUSetup.context, cl.mem_flags.WRITE_ONLY, fmt, shape=(w, h)) # *RGB to HSV GPUSetup.program.rgb2hsv(GPUSetup.queue, (w, h), None, frame_buf, dest_buf) self.hsv = np.empty_like(frame) cl.enqueue_copy(GPUSetup.queue, self.hsv, dest_buf, origin=(0, 0), region=(w, h)) # *Apply mask frame_buf = cl.image_from_array(GPUSetup.context, self.hsv, 4) mask_buf = cl.Buffer(GPUSetup.context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=mask) GPUSetup.program.hsv_mask(GPUSetup.queue, (w, h), None, frame_buf, mask_buf, dest_buf) self.after_mask = np.empty_like(frame) cl.enqueue_copy(GPUSetup.queue, self.after_mask, dest_buf, origin=(0, 0), region=(w, h)) return self.after_mask
def test_touch(self): MAX_CLASH = 100 + 0.9 MIN_INTER = 300 + 0.9 NROT = np.random.randint(self.rotations.shape[0] + 1) rotmat = self.rotations[0] cpu_lsurf = np.zeros_like(self.im_lsurf.array) disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf) cpu_clashvol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rcore.array)) gpu_rcore = cl_array.to_device(self.queue, np.asarray(self.rcore.array, dtype=np.float32)) gpu_im_lsurf = cl.image_from_array(self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32)) gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center) gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_rcore = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_clashvol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_clashvol = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rfftn(self.queue, gpu_rcore, gpu_ft_rcore) self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf) self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rcore, gpu_ft_clashvol) self.kernels.irfftn(self.queue, gpu_ft_clashvol, gpu_clashvol) cpu_intervol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rsurf.array)) gpu_rsurf = cl_array.to_device(self.queue, np.asarray(self.rsurf.array, dtype=np.float32)) gpu_ft_rsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_intervol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_intervol = cl_array.zeros(self.queue, self.shape, dtype=np.float32) cpu_interspace = np.zeros(self.shape, dtype=np.int32) gpu_interspace = cl_array.zeros(self.queue, self.shape, dtype=np.int32) self.kernels.rfftn(self.queue, gpu_rsurf, gpu_ft_rsurf) self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf) self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rsurf, gpu_ft_intervol) self.kernels.irfftn(self.queue, gpu_ft_intervol, gpu_intervol) self.kernels.touch(self.queue, gpu_clashvol, MAX_CLASH, gpu_intervol, MIN_INTER, gpu_interspace) np.logical_and(cpu_clashvol < MAX_CLASH, cpu_intervol > MIN_INTER, cpu_interspace) disvis.volume.Volume(cpu_interspace, self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('cpu_interspace.mrc') disvis.volume.Volume(gpu_interspace.get(), self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('gpu_interspace.mrc') disvis.volume.Volume(cpu_interspace - gpu_interspace.get(), self.im_lsurf.voxelspacing, self.im_lsurf.origin).tofile('diff.mrc') print() print(cpu_interspace.sum(), gpu_interspace.get().sum()) print(np.abs(cpu_interspace - gpu_interspace.get()).sum()) self.assertTrue(np.allclose(gpu_interspace.get(), cpu_interspace))
def _get_image_buffer(image): """ Create the buffer object for a image :param image: PIL image object :return: CL buffer object """ image = image.convert("RGBA") image = np.array(image) return cl.image_from_array(_context, image, num_channels=4, mode="r", norm_int=False)
def __call__(self, ctx, ix, iy, rx, ry, sw, sh, ez, ex, ey, levels, halfres_eccentricity, contrast_sensitivity, decay_constant): self.build(ctx) w = 2 * ix h = 2 * iy assert levels == 6 var = np.array( [0.849, 0.4245, 0.21225, 0.106125, 0.0530625, 0.02653125], dtype=np.float32) horizontal_degree = subtended_angle(ctx, [0], [ry], [2 * rx], [ry], rx, ry, sw, sh, [ez], [ex], [ey])[0] freq = 0.5 / (horizontal_degree / (2 * rx)) critical_eccentricity = [0.0] for l in xrange(levels): ecc = halfres_eccentricity * ((np.log(1 / contrast_sensitivity) * (1 << l) / (decay_constant * freq)) - 1) if ecc > 90.0: ecc = 90.0 critical_eccentricity.append(ecc) critical_eccentricity.append(90.0) critical_eccentricity = np.array(critical_eccentricity, dtype=np.float32) ce_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=critical_eccentricity) var_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=var) dest = np.zeros((h, w, 4), dtype=np.float32) dest_buf = cl.image_from_array(self.ctx, dest, 4, mode="w") queue = cl.CommandQueue(self.ctx) self.prg.blendmap(queue, (dest.shape[1], dest.shape[0]), None, ce_buf, var_buf, np.float32(ix), np.float32(iy), np.float32(w), np.float32(h), np.float32(2 * sw), np.float32(2 * sh), np.float32(ez), np.float32(ex), np.float32(ey), np.uint32(levels), dest_buf) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(dest.shape[1], dest.shape[0])).wait() dest = dest.copy() dest_buf.release() ce_buf.release() var_buf.release() return critical_eccentricity, dest
def cl_load_data(self, population, world): mf = cl.mem_flags out = cl.Buffer(self.ctx, mf.WRITE_ONLY, population.nbytes) population_cl = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=population) # world_cl = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=world.flatten()) world_cl = cl.image_from_array(self.ctx, world, mode="r") return population_cl, world_cl, out
def __init__(self,volumeNode,contextPreference='GPU',renderSize=(512,512)): self.volumeNode = volumeNode self.volumeArray = slicer.util.array(self.volumeNode.GetID()) self.renderSize = renderSize try: import pyopencl import numpy except ImportError: raise "No OpenCL for you!\nInstall pyopencl in slicer's python installation." import os os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1' self.ctx = None for platform in pyopencl.get_platforms(): for device in platform.get_devices(): if pyopencl.device_type.to_string(device.type) == contextPreference: self.ctx = pyopencl.Context([device]) break; if not self.ctx: self.ctx = pyopencl.create_some_context() self.queue = pyopencl.CommandQueue(self.ctx) inPath = os.path.dirname(slicer.modules.rendercl.path) + "/Render.cl.in" fp = open(inPath) sourceIn = fp.read() fp.close() source = sourceIn % { 'rayStepSize' : '0.01f', 'rayMaxSteps' : '500', } self.prg = pyopencl.Program(self.ctx, source).build() # create a 3d image from the volume num_channels = 1 self.volumeImage_dev = pyopencl.image_from_array(self.ctx, self.volumeArray, num_channels) # create a 2d array for the render buffer self.renderArray = numpy.zeros(self.renderSize,dtype=numpy.dtype('uint32')) self.renderArray_dev = pyopencl.array.to_device(self.queue, self.renderArray) self.volumeSampler = pyopencl.Sampler(self.ctx,False, pyopencl.addressing_mode.REPEAT, pyopencl.filter_mode.LINEAR) # TODO make 2D image of transfer function self.transferFunctionSampler = pyopencl.Sampler(self.ctx,False, pyopencl.addressing_mode.REPEAT, pyopencl.filter_mode.LINEAR)
def __call__(self, ctx, src, kernel1, kernel2): self.build(ctx) kernel1 = np.array(kernel1, copy=False, dtype=np.float32) kernel2 = np.array(kernel2, copy=False, dtype=np.float32) halflen = kernel1.shape[0] / 2 kernel1_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernel1) kernel2_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernel2) inshape = src.shape src = np.asarray(src) dims = len(src.shape) assert dims > 1 if dims == 2: assert src.shape[1] <= 4 src = src.reshape(src.shape[0],1,src.shape[1]) else: assert src.shape[2] <= 4 src_padded = np.zeros((src.shape[0]+2*halflen, 1, 4), dtype=src.dtype) src_padded[halflen:-halflen,:,:src.shape[2]] = src[:,:,:src.shape[2]] src_padded[:halflen,:,:] = src_padded[halflen:halflen*2,:,:][::-1,...] src_padded[-halflen:,:,:] = src_padded[-halflen*2:-halflen,:,:][::-1,...] norm = np.issubdtype(src.dtype, np.integer) src_buf = cl.image_from_array(self.ctx, src_padded, 4, norm_int=norm) dest = np.zeros((src.shape[0], src.shape[1], 4), dtype=src.dtype) dest_buf = cl.image_from_array(self.ctx, dest, 4, mode="w", norm_int=norm) queue = cl.CommandQueue(self.ctx) self.prg.convolve1d2_naive(queue, (dest.shape[1], dest.shape[0]), None, src_buf, dest_buf, kernel1_buf, kernel2_buf, np.int32(halflen)) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src.shape[1], src.shape[0])).wait() dest = dest[:,:,:src.shape[2]].reshape(inshape) src_buf.release() dest_buf.release() kernel1_buf.release() kernel2_buf.release() return dest
def from_array(cls,arr, *args, **kwargs): ctx = get_device().context if not arr.ndim in [1,2,3,4]: raise ValueError("dimension of array wrong, should be 1...4 but is %s"%arr.ndim) elif arr.ndim == 4: num_channels = arr.shape[-1] else: num_channels = None res = pyopencl.image_from_array(ctx, arr,num_channels = num_channels, *args, **kwargs) res.dtype = arr.dtype return res
def test_rotate_grids_and_multiply(self): shape = (5, 5, 5) template = np.zeros(shape, dtype=np.float32) template[2, 2, 1:4] = 1 template[2, 1:4, 2] = 1 mask = template * 2 np_out_template = np.zeros(shape, dtype=np.float32) np_out_template[0, 0, :2] = 1 np_out_template[0, 0, -1] = 1 np_out_template[0, :2, 0] = 1 np_out_template[0, -1, 0] = 1 np_out_mask = np_out_template * 2 np_out_mask2 = np_out_mask**2 cl_template = cl.image_from_array(self.ctx, template) cl_mask = cl.image_from_array(self.ctx, mask) cl_rotmat = np.asarray([1, 0, 0, 0, 1, 0, 0, 0, 1] + [0] * 7, dtype=np.float32) cl_center = np.asarray([2, 2, 2, 0], dtype=np.float32) cl_shape = np.asarray([5, 5, 5, 125], dtype=np.int32) cl_radius = np.int32(2) cl_out_template = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) cl_out_mask = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) cl_out_mask2 = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) gws = tuple([int(2 * cl_radius + 1)] * 3) args = (cl_template, cl_mask, cl_rotmat, self.s_linear, self.s_nearest, cl_center, cl_shape, cl_radius, cl_out_template.data, cl_out_mask.data, cl_out_mask2.data) self.k.rotate_grids_and_multiply(self.queue, gws, None, *args) self.queue.finish() self.assertTrue(np.allclose(np_out_template, cl_out_template.get())) self.assertTrue(np.allclose(np_out_mask, cl_out_mask.get())) self.assertTrue(np.allclose(np_out_mask2, cl_out_mask2.get()))
def popCorn(self): mf = cl.mem_flags #initialize client side (CPU) arrays #self.a = numpy.array(range(10), dtype=numpy.float32) #self.b = numpy.array(range(10), dtype=numpy.float32) self.data = pyfits.getdata(os.path.join(self.path,self.file1)) #create OpenCL buffers #self.imagebuf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.ra1) self.imagebuf = cl.image_from_array(self.ctx,self.data) self.dest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.dec2.nbytes)
def __call__(self, ctx, pyramid, blendmap, x, y): self.build(ctx) norm = np.issubdtype(pyramid.dtype, np.integer) pyramid_buf = cl.image_from_array(self.ctx, pyramid, 4, mode="r", norm_int=norm) dest = np.zeros_like(pyramid[:,:,0,:],dtype=pyramid.dtype) dest_buf = init_image(self.ctx, dest, 4, mode="w", norm_int=norm) xoff = dest.shape[1] - x yoff = dest.shape[0] - y blendmap_buf = cl.image_from_array(self.ctx, blendmap[yoff:yoff+dest.shape[0],xoff:xoff+dest.shape[1],:].copy(), 4, mode="r") queue = cl.CommandQueue(self.ctx) self.prg.blend(queue, (dest.shape[1], dest.shape[0]), None, pyramid_buf, blendmap_buf, dest_buf) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(dest.shape[1], dest.shape[0])).wait() dest_buf.release() pyramid_buf.release() blendmap_buf.release() return dest
def __call__(self, ctx, src2, kernel): if self.ctx != ctx: self.ctx = ctx self.prg = cl.Program(self.ctx, pkg_resources.resource_string(__name__, "convolve2d.cl")).build() src2 = np.asarray(src2) src = np.zeros((src2.shape[0], src2.shape[1], 4),dtype=src2.dtype) src[:,:,0:src2.shape[2]] = src2[:,:,0:src2.shape[2]] norm = np.issubdtype(src.dtype, np.integer) src_buf = cl.image_from_array(self.ctx, src, 4, norm_int=norm) dest_buf = cl.image_from_array(self.ctx, src, 4, mode="w", norm_int=norm) dest = np.empty_like(src) kernel = np.array(kernel, dtype=np.float32) kernelf = kernel.flatten() kernelf_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernelf) halflen = (kernelf.shape[0]>>1) queue = cl.CommandQueue(self.ctx) self.prg.convolve2d_local(queue, (src.shape[1]-halflen, src.shape[0]-halflen), None, src_buf, dest_buf, kernelf_buf, np.int_(kernelf.shape[0])) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src.shape[1], src.shape[0])).wait() dest = dest[:,:,0:src2.shape[2]].copy() src_buf.release() dest_buf.release() kernelf_buf.release() return dest
def test_rotate_image3d(self): # CPU NROT = np.random.randint(self.rotations.shape[0] + 1) rotmat = self.rotations[NROT] #print(rotmat) cpu_lsurf = np.zeros_like(self.im_lsurf.array) disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf) gpu_im_lsurf = cl.image_from_array(self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32)) gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center) self.assertTrue(np.allclose(cpu_lsurf, gpu_lsurf.get(), atol=0.01))
def __call__(self, ctx, pyramid, blendmap, x, y): self.build(ctx) norm = np.issubdtype(pyramid.dtype, np.integer) pyramid_buf = cl.image_from_array(self.ctx, pyramid, 4, mode="r", norm_int=norm) dest = np.zeros_like(pyramid[:, :, 0, :], dtype=pyramid.dtype) dest_buf = init_image(self.ctx, dest, 4, mode="w", norm_int=norm) xoff = dest.shape[1] - x yoff = dest.shape[0] - y blendmap_buf = cl.image_from_array( self.ctx, blendmap[yoff:yoff + dest.shape[0], xoff:xoff + dest.shape[1], :].copy(), 4, mode="r") queue = cl.CommandQueue(self.ctx) self.prg.blend(queue, (dest.shape[1], dest.shape[0]), None, pyramid_buf, blendmap_buf, dest_buf) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(dest.shape[1], dest.shape[0])).wait() dest_buf.release() pyramid_buf.release() blendmap_buf.release() return dest
def test_rotate_grids_and_multiply(self): shape = (5, 5, 5) template = np.zeros(shape, dtype=np.float32) template[2, 2, 1:4] = 1 template[2, 1:4, 2] = 1 mask = template * 2 np_out_template = np.zeros(shape, dtype=np.float32) np_out_template[0, 0, :2] = 1 np_out_template[0, 0, -1] = 1 np_out_template[0, :2, 0] = 1 np_out_template[0, -1, 0] = 1 np_out_mask = np_out_template * 2 np_out_mask2 = np_out_mask ** 2 cl_template = cl.image_from_array(self.ctx, template) cl_mask = cl.image_from_array(self.ctx, mask) cl_rotmat = np.asarray([1, 0, 0, 0, 1, 0, 0, 0, 1] + [0] * 7, dtype=np.float32) cl_center = np.asarray([2, 2, 2, 0], dtype=np.float32) cl_shape = np.asarray([5, 5, 5, 125], dtype=np.int32) cl_radius = np.int32(2) cl_out_template = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) cl_out_mask = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) cl_out_mask2 = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) gws = tuple([int(2 * cl_radius + 1)] * 3) args = (cl_template, cl_mask, cl_rotmat, self.s_linear, self.s_nearest, cl_center, cl_shape, cl_radius, cl_out_template.data, cl_out_mask.data, cl_out_mask2.data) self.k.rotate_grids_and_multiply(self.queue, gws, None, *args) self.queue.finish() self.assertTrue(np.allclose(np_out_template, cl_out_template.get())) self.assertTrue(np.allclose(np_out_mask, cl_out_mask.get())) self.assertTrue(np.allclose(np_out_mask2, cl_out_mask2.get()))
def cl_is_grey(p_rgb, p_cl_context: cl.Context, p_cl_queue: cl.CommandQueue, p_cl_program: cl.Program) -> bool: """Check if an image is grey using OpenCL""" pixels = numpy.array(p_rgb) dev_buf = cl.image_from_array(p_cl_context, pixels, 4) color = numpy.uint32(0) dev_color = cl.Buffer(p_cl_context, cl.mem_flags.COPY_HOST_PTR, hostbuf=color) p_cl_program.isGrey(p_cl_queue, (pixels.shape[0], pixels.shape[1]), None, dev_buf, dev_color) nb_colors = numpy.empty_like(color) cl.enqueue_copy(p_cl_queue, nb_colors, dev_color) return bool(nb_colors == 0)
def __call__(self, ctx, src, kernel): self.build(ctx) kernel = np.array(kernel, copy=False, dtype=np.float32) halflen = kernel.shape[0] / 2 kernelf = kernel.flatten() kernelf_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=kernelf) src_padded = np.zeros( (src.shape[0] + 2 * halflen, src.shape[1] + 2 * halflen, 4), dtype=src.dtype) src_padded[halflen:-halflen, halflen:-halflen, :src.shape[2]] = src[:, :, :src.shape[2]] src_padded[halflen:-halflen, :halflen, :src.shape[2]] = src_padded[ halflen:-halflen, halflen:halflen * 2, :src.shape[2]][:, ::-1] src_padded[halflen:-halflen, -halflen:, :src.shape[2]] = src_padded[ halflen:-halflen, -halflen * 2:-halflen, :src.shape[2]][:, ::-1] src_padded[:halflen, :, :src.shape[2]] = src_padded[ halflen:halflen * 2, :, :src.shape[2]][::-1, ...] src_padded[-halflen:, :, :src.shape[2]] = src_padded[ -halflen * 2:-halflen, :, :src.shape[2]][::-1, ...] norm = np.issubdtype(src.dtype, np.integer) src_buf = cl.image_from_array(self.ctx, src_padded, 4, norm_int=norm) dest = np.zeros((src.shape[0], src.shape[1], 4), dtype=src.dtype) dest_buf = init_image(self.ctx, dest, 4, mode="w", norm_int=norm) queue = cl.CommandQueue(self.ctx) self.prg.convolve2d_naive(queue, (dest.shape[1], dest.shape[0]), None, src_buf, dest_buf, kernelf_buf, np.int32(kernel.shape[0])) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src.shape[1], src.shape[0])).wait() # src_buf.release() # dest_buf.release() # kernelf_buf.release() return dest[:, :, :src.shape[2]]
def from_array(cls, arr, *args, **kwargs): ctx = get_device().context if not arr.ndim in [1, 2, 3, 4]: raise ValueError( "dimension of array wrong, should be 1...4 but is %s" % arr.ndim) elif arr.ndim == 4: num_channels = arr.shape[-1] else: num_channels = None res = pyopencl.image_from_array(ctx, arr, num_channels=num_channels, *args, **kwargs) res.dtype = arr.dtype return res
def LoadImage(context, fileName): im = Image.open(fileName) img = np.array(im) IMG1 = scale_img(img, 8) im = Image.fromarray(IMG1) # Make sure the image is RGBA formatted if im.mode != "RGBA": im = im.convert("RGBA") IMG1 = np.array(im) if len(IMG1.shape) > 2: nchannels = IMG1.shape[-1] else: nchannels = None t0 = time.clock() clImage = cl.image_from_array(context, IMG1, num_channels=nchannels, mode="r", norm_int=False) t1 = time.clock() print t1 - t0, " Load to GPU..." return clImage, im.size, IMG1
def parallel_prediction_errors(self, image): """ Get the MILC prediction errors for a 3D image by means of OpenCL accelerated computation Keyword arguments: image -- a 3D numpy array (bitmap image) Return: a 3D numpy array of the same shape of "image", containing the prediction errors """ mf = cl.mem_flags # Define the image format for the prediction errors err_format = cl.ImageFormat(channel_order=cl.channel_order.R, channel_type=DataType.CL_ERR.value) # Define the input image from the numpy 3D array source_image = cl.image_from_array(self.ctx, image) original_shape = numpy.shape(image) cl_shape = list( reversed(original_shape)) # inverted shape (pyOpenCL bug?) # output image output_image = cl.Image(self.ctx, mf.WRITE_ONLY, err_format, shape=cl_shape) # sampler. pixels out of range have a value of '0' sampler = cl.Sampler(self.ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) # enqueue kernel self.program.image_test(self.queue, original_shape, None, source_image, output_image, sampler) # read the resulting image into a numpy array output_data = numpy.empty(shape=cl_shape, dtype=DataType.ERR.value) cl.enqueue_read_image(self.queue, output_image, (0, 0, 0), cl_shape, output_data) return output_data.reshape(original_shape)
def test_intervol(self): NROT = np.random.randint(self.rotations.shape[0] + 1) rotmat = self.rotations[NROT] cpu_lsurf = np.zeros_like(self.im_lsurf.array) disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf) cpu_intervol = numpy.fft.irfftn(numpy.fft.rfftn(cpu_lsurf).conj() * numpy.fft.rfftn(self.rsurf.array), s=self.shape) gpu_rsurf = cl_array.to_device( self.queue, np.asarray(self.rsurf.array, dtype=np.float32)) gpu_im_lsurf = cl.image_from_array( self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32)) gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center) gpu_ft_lsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_rsurf = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_ft_intervol = cl_array.zeros(self.queue, self.ft_shape, dtype=np.complex64) gpu_intervol = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rfftn(self.queue, gpu_rsurf, gpu_ft_rsurf) self.kernels.rfftn(self.queue, gpu_lsurf, gpu_ft_lsurf) self.kernels.c_conj_multiply(self.queue, gpu_ft_lsurf, gpu_ft_rsurf, gpu_ft_intervol) self.kernels.irfftn(self.queue, gpu_ft_intervol, gpu_intervol) self.assertTrue(np.allclose(cpu_intervol, gpu_intervol.get(), atol=0.8))
def create_table(self, ctx, compile_options, nrow=200, ncol=1000): '''store the eos (ed, pr, T, s) in image2d_t table for fast linear interpolation, add some information to compile_options for EOS table''' import pyopencl as cl fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT) src = np.array(list(zip(self.cs2, self.pr, self.T, self.s)), dtype=np.float32).reshape(nrow, ncol, 4) eos_table = cl.image_from_array(ctx, src, 4) compile_options.append( '-D EOS_ED_START={value}f'.format(value=self.ed_start)) compile_options.append( '-D EOS_ED_STEP={value}f'.format(value=self.ed_step)) compile_options.append( '-D EOS_NUM_ED={value}'.format(value=self.num_of_ed)) compile_options.append('-D EOS_NUM_OF_ROWS=%s' % nrow) compile_options.append('-D EOS_NUM_OF_COLS=%s' % ncol) self.compile_options = compile_options return eos_table
def test_rotate_image3d(self): # CPU NROT = np.random.randint(self.rotations.shape[0] + 1) rotmat = self.rotations[NROT] #print(rotmat) cpu_lsurf = np.zeros_like(self.im_lsurf.array) disvis.libdisvis.rotate_image3d(self.im_lsurf.array, self.vlength, np.linalg.inv(rotmat), self.im_center, cpu_lsurf) gpu_im_lsurf = cl.image_from_array( self.queue.context, np.asarray(self.im_lsurf.array, dtype=np.float32)) gpu_lsurf = cl_array.zeros(self.queue, self.shape, dtype=np.float32) self.kernels.rotate_image3d(self.queue, self.sampler, gpu_im_lsurf, rotmat, gpu_lsurf, self.im_center) self.assertTrue(np.allclose(cpu_lsurf, gpu_lsurf.get(), atol=0.01))
def __init__(self): self.angle = 0. self.ch_angles = { "Key_UP": pi / 18., "Key_Down": -pi / 18., "Key_Right": -pi / 180., "Key_Left": pi / 180. } ctx = create_some_context() in_img = lena() h, w = map(int32, in_img.shape[:2]) # in pyopencl 2018.2.2 channel orders other than RGBA # cause segmentation fault i4 = zeros((h, w, 4), dtype=uint8) i4[:, :, 0] = in_img self.in_img_buf = image_from_array(ctx, i4, 4) fmt = ImageFormat(CHO.RGBA, CHANNEL.UNSIGNED_INT8) self.out_img_buf = Image(ctx, MEM.WRITE_ONLY, fmt, shape=(w, h)) prg = Program(ctx, load_cl_text("rotation.cl")).build() self.params = (ctx, self.in_img_buf, self.out_img_buf, h, w, prg)
def main(): CL_CODE = ''' constant float R_weight = 0.6; constant float G_weight = 0.4; constant float B_weight = 0.8; constant float ALL_weight = 1.8; constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; kernel void gray(__read_only image2d_t src_img, __write_only image2d_t dst_img) { int x = get_global_id(0); int y = get_global_id(1); int2 coord = (int2)(x, y); uint4 pixel = read_imageui(src_img, sampler, coord); uint g = (uint)((pixel[0] * R_weight + pixel[1] * G_weight + pixel[2] * B_weight) / ALL_weight); pixel = g; pixel[3] = 255; write_imageui(dst_img, coord, pixel); } ''' plf = [(cl.context_properties.PLATFORM, cl.get_platforms()[0])] ctx = cl.Context(dev_type=cl.device_type.GPU, properties=plf) prg = cl.Program(ctx, CL_CODE).build() queue = cl.CommandQueue(ctx) mf = cl.mem_flags src_raw = np.asarray(Image.open('res/tile-z16.png').convert("RGBA")) src_img = cl.image_from_array(ctx, src_raw, 4) (w, h, _) = src_raw.shape image_size = (w, h) fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8) dst_img = cl.Image(ctx, mf.WRITE_ONLY, fmt, shape=image_size) dst_raw = np.empty_like(src_raw) prg.gray(queue, image_size, (1, 1), src_img, dst_img) cl.enqueue_copy(queue, dst_raw, dst_img, origin=(0, 0), region=image_size) Image.fromarray(dst_raw).show()
def __call__(self, ctx, ix, iy, rx, ry, sw, sh, ez, ex, ey, levels, halfres_eccentricity, contrast_sensitivity, decay_constant): self.build(ctx) w = 2*ix h = 2*iy assert levels==6 var = np.array([0.849, 0.4245, 0.21225, 0.106125, 0.0530625, 0.02653125], dtype=np.float32) horizontal_degree = subtended_angle(ctx,[0],[ry],[2*rx],[ry],rx,ry,sw,sh,[ez],[ex],[ey])[0] freq = 0.5/(horizontal_degree/(2*rx)) critical_eccentricity = [0.0] for l in xrange(levels): ecc = halfres_eccentricity * ( (np.log(1/contrast_sensitivity)*(1<<l)/(decay_constant*freq))-1 ) if ecc > 90.0: ecc = 90.0 critical_eccentricity.append(ecc) critical_eccentricity.append(90.0) critical_eccentricity = np.array(critical_eccentricity, dtype=np.float32) ce_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=critical_eccentricity) var_buf = cl.Buffer(self.ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=var) dest = np.zeros((h, w, 4), dtype=np.float32) dest_buf = cl.image_from_array(self.ctx, dest, 4, mode="w") queue = cl.CommandQueue(self.ctx) self.prg.blendmap(queue, (dest.shape[1], dest.shape[0]), None, ce_buf, var_buf, np.float32(ix), np.float32(iy), np.float32(w), np.float32(h), np.float32(2*sw), np.float32(2*sh), np.float32(ez), np.float32(ex), np.float32(ey), np.uint32(levels), dest_buf) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(dest.shape[1], dest.shape[0])).wait() dest = dest.copy() dest_buf.release() ce_buf.release() var_buf.release() return critical_eccentricity, dest
def convert(self, img): src = numpy.fromstring(img.bits().asstring(img.byteCount()), dtype=numpy.uint8) src.shape = h, w, _ = img.height(), img.width(), 4 mf = cl.mem_flags src_buf = cl.image_from_array(self.ctx, src, 4) fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8) dest_buf = cl.Image(self.ctx, mf.WRITE_ONLY, fmt, shape=(w, h)) self.prg.convert(self.queue, (w, h), None, src_buf, dest_buf, numpy.int32(w), numpy.int32(h)) dest = numpy.empty_like(src) cl.enqueue_copy(self.queue, dest, dest_buf, origin=(0, 0), region=(w, h)) return QtGui.QImage(str(dest.data), w, h, QtGui.QImage.Format_RGB32)
def test_rotate_template_mask(self): shape = (5, 5, 5) template = np.zeros(shape, dtype=np.float32) template[2, 2, 1:4] = 1 template[2, 1:4, 2] = 1 rotmat = np.asarray([1, 0, 0, 0, 1, 0, 0, 0, 1] + [0] * 7, dtype=np.float32) self.queue.finish() cl_template = cl.image_from_array(self.queue.context, template) cl_out = cl_array.to_device(self.queue, np.zeros(shape, dtype=np.float32)) center = np.asarray([2, 2, 2, 0], dtype=np.float32) shape = np.asarray([5, 5, 5, 125], dtype=np.int32) self.k.rotate_template(self.queue, (125,), None, self.s_linear, cl_template, rotmat, cl_out.data, center, shape) answer = np.zeros((5, 5, 5), dtype=np.float32) answer[0, 0, :2] = 1 answer[0, 0, -1] = 1 answer[0, :2, 0] = 1 answer[0, -1, 0] = 1 self.assertTrue(np.allclose(cl_out.get(), answer))
def template_preprocesing(self): names = [] for filename in glob.iglob(os.getcwd() + '/templates/*.png', recursive=True): names.append(filename) for name in sorted(names, key=self.sort_filenames): template = cv2.cvtColor(cv2.imread(name), cv2.COLOR_RGB2RGBA) self.templates.append(template) h = template.shape[0] w = template.shape[1] # *Buffors template_buf = cl.image_from_array(GPUSetup.context, template, 4) fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8) dest_buf = cl.Image(GPUSetup.context, cl.mem_flags.WRITE_ONLY, fmt, shape=(w, h)) # *RGB to HSV GPUSetup.program.rgb2hsv(GPUSetup.queue, (w, h), None, template_buf, dest_buf) template_hsv = np.empty_like(template) cl.enqueue_copy(GPUSetup.queue, template_hsv, dest_buf, origin=(0, 0), region=(w, h)) self.templates_hsv.append(template_hsv) # *Apply masks template_mask = self.clear_sign(template_hsv, template) self.templates_mask.append(template_mask)
def to_cl_image_8192(ctx, values, fill=0): """ Convert the given Nx3 array <values> to an RGBA image with a data type that is compatible to the datatype of <values>, with min(len(values), 8192) values along its axis 1, and the necessary number of values along axis 0, such that the image is filled putting values[0] into result[0, 0], values[1] into result[0, 1], etc. Excess image values and all values of the A channel are filled with the given <fill> value (defaults to 0). Note that, for whatever reason, OpenCL image axis 0 maps to array axis 1 and vice versa (cf. help(cl._cl.Image)) -- which should not make a difference if the result of this function is used transparently. Return the resulting <pyopencl._cl.Image> instance. """ assert values.shape[1] == 3 num_values = len(values) assert num_values > 0 # Determine image dimensions img_dim_1 = np.minimum(8192, num_values) img_dim_0 = (int(num_values - 1) >> 13) + 1 # 2 ** 13 = 8192 # Create appropriately sized Numpy array, then fill it values_2d = np.asarray(np.ones((img_dim_1 * img_dim_0, 4), dtype=values.dtype) * fill, dtype=values.dtype) values_2d[:num_values, :3] = values values_2d = values_2d.reshape(img_dim_0, img_dim_1, 4) values_2d = np.swapaxes(values_2d, 0, 1) values_2d = np.require(values_2d, requirements=["A", "C"]) # Create <cl._cl.Image> instance values_cl = cl.image_from_array(ctx, values_2d, num_channels=4, mode="r") return values_cl
def from_array(cls,arr, *args, **kwargs): ctx = get_device().context if not arr.ndim in [2,3,4]: raise ValueError("dimension of array wrong, should be 1...4 but is %s"%arr.ndim) elif arr.ndim == 4: num_channels = arr.shape[-1] else: num_channels = None if arr.dtype.type == np.complex64: num_channels = 2 res = OCLImage.empty(arr.shape,dtype = np.float32, num_channels=num_channels) res.write_array(arr) res.dtype = np.float32 else: res = cl.image_from_array(ctx, arr,num_channels = num_channels, *args, **kwargs) res.dtype = arr.dtype res.num_channels = num_channels return res
def test_image_3d(ctx_factory): #test for image_from_array for 3d image of float2 context = ctx_factory() device, = context.devices if not device.image_support: from pytest import skip skip("images not supported on %s" % device) if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % device) _skip_if_pocl(device.platform, 'pocl does not support CL_ADDRESS_CLAMP') prg = cl.Program(context, """ __kernel void copy_image_plane( __global float2 *dest, __read_only image3d_t src, sampler_t samp, int stride0, int stride1) { int d0 = get_global_id(0); int d1 = get_global_id(1); int d2 = get_global_id(2); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[d0*stride0 + d1*stride1 + d2] = read_imagef( src, samp, (float4)(d2, d1, d0, 0)).xy; } """).build() num_channels = 2 shape = (3, 4, 2) a = np.random.random(shape + (num_channels,)).astype(np.float32) queue = cl.CommandQueue(context) try: a_img = cl.image_from_array(context, a, num_channels) except cl.RuntimeError: import sys exc = sys.exc_info()[1] if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: from pytest import skip skip("required image format not supported on %s" % device.name) else: raise a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes) samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) prg.copy_image_plane(queue, shape, None, a_dest, a_img, samp, np.int32(a.strides[0]/a.itemsize/num_channels), np.int32(a.strides[1]/a.itemsize/num_channels), ) a_result = np.empty_like(a) cl.enqueue_copy(queue, a_result, a_dest) good = la.norm(a_result - a) == 0 if not good: if queue.device.type & cl.device_type.CPU: assert good, ("The image implementation on your CPU CL platform '%s' " "returned bad values. This is bad, but common." % queue.device.platform) else: assert good
import cv2 def loadProgram(filename): with open(filename, 'r') as f: return "".join(f.readlines()) ctx = cl.create_some_context() queue = cl.CommandQueue(ctx) build_opts = "-I." RGB2YCrCb = cl.Program(ctx, loadProgram("RGB2YCrCb.cl")).build(build_opts).RGB2YCrCb YCrCb2RGB = cl.Program(ctx, loadProgram("YCrCb2RGB.cl")).build(build_opts).YCrCb2RGB fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNSIGNED_INT8) src = cv2.cvtColor(cv2.imread("PM5544_with_non-PAL_signals.png",cv2.IMREAD_UNCHANGED), cv2.COLOR_BGR2RGBA) src_buf = cl.image_from_array(ctx, src, 4) dest_buf = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, fmt, shape=(src.shape[1],src.shape[0])) RGB2YCrCb(queue, (src.shape[1],src.shape[0]), None, src_buf, dest_buf) dest = np.empty_like(src) cl.enqueue_copy(queue, dest, dest_buf, origin=(0, 0), region=(src.shape[1],src.shape[0])) Y = cv2.merge((dest[:,:,0],dest[:,:,0],dest[:,:,0])) Cr = cv2.merge((dest[:,:,1],dest[:,:,1],dest[:,:,1])) Cb = cv2.merge((dest[:,:,2],dest[:,:,2],dest[:,:,2])) src2_buf = cl.image_from_array(ctx, dest, 4) dest2_buf = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, fmt, shape=(src.shape[1],src.shape[0])) YCrCb2RGB(queue, (src.shape[1],src.shape[0]), None, src2_buf, dest2_buf)
def make_ref_args(kernel, impl_arg_info, queue, parameters): import pyopencl as cl import pyopencl.array as cl_array from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable from pymbolic import evaluate ref_args = {} ref_arg_data = [] for arg in impl_arg_info: kernel_arg = kernel.impl_arg_to_arg.get(arg.name) if arg.arg_class is ValueArg: if arg.offset_for_name: continue arg_value = parameters[arg.name] try: argv_dtype = arg_value.dtype except AttributeError: argv_dtype = None if argv_dtype != arg.dtype: arg_value = arg.dtype.numpy_dtype.type(arg_value) ref_args[arg.name] = arg_value ref_arg_data.append(None) elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg: if arg.shape is None or any(saxis is None for saxis in arg.shape): raise LoopyError("array '%s' needs known shape to use automatic " "testing" % arg.name) shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype is_output = arg.base_name in kernel.get_written_variables() if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( queue, shape, dtype, order="C") numpy_strides = None alloc_size = None strides = None else: strides = evaluate(arg.unvec_strides, parameters) from pytools import all assert all(s > 0 for s in strides) alloc_size = sum(astrd*(alen-1) for alen, astrd in zip(shape, strides)) + 1 if dtype is None: raise LoopyError("dtype for argument '%s' is not yet " "known. Perhaps you want to use " "loopy.add_dtypes " "or loopy.infer_argument_dtypes?" % arg.name) itemsize = dtype.itemsize numpy_strides = [itemsize*s for s in strides] storage_array = cl_array.empty(queue, alloc_size, dtype) if is_output and arg.arg_class is ImageArg: raise LoopyError("write-mode images not supported in " "automatic testing") fill_rand(storage_array) if arg.arg_class is ImageArg: # must be contiguous pre_run_ary = pre_run_storage_array = storage_array.copy() ref_args[arg.name] = cl.image_from_array( queue.context, ary.get()) else: pre_run_storage_array = storage_array.copy() ary = cl_array.as_strided(storage_array, shape, numpy_strides) pre_run_ary = cl_array.as_strided( pre_run_storage_array, shape, numpy_strides) ref_args[arg.name] = ary ref_arg_data.append( TestArgInfo( name=arg.name, ref_array=ary, ref_storage_array=storage_array, ref_pre_run_array=pre_run_ary, ref_pre_run_storage_array=pre_run_storage_array, ref_shape=shape, ref_strides=strides, ref_alloc_size=alloc_size, ref_numpy_strides=numpy_strides, needs_checking=is_output)) elif arg.arg_class is TemporaryVariable: # global temporary, handled by invocation logic pass else: raise LoopyError("arg type not understood") return ref_args, ref_arg_data
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters): import pyopencl as cl import pyopencl.array as cl_array from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable from pymbolic import evaluate args = {} for arg, arg_desc in zip(impl_arg_info, ref_arg_data): kernel_arg = kernel.impl_arg_to_arg.get(arg.name) if arg.arg_class is ValueArg: arg_value = parameters[arg.name] try: argv_dtype = arg_value.dtype except AttributeError: argv_dtype = None if argv_dtype != arg.dtype: arg_value = arg.dtype.numpy_dtype.type(arg_value) args[arg.name] = arg_value elif arg.arg_class is ImageArg: if arg.name in kernel.get_written_variables(): raise NotImplementedError("write-mode images not supported in " "automatic testing") shape = evaluate_shape(arg.unvec_shape, parameters) assert shape == arg_desc.ref_shape # must be contiguous args[arg.name] = cl.image_from_array( queue.context, arg_desc.ref_pre_run_array.get()) elif arg.arg_class is GlobalArg: shape = evaluate(arg.unvec_shape, parameters) strides = evaluate(arg.unvec_strides, parameters) dtype = kernel_arg.dtype itemsize = dtype.itemsize numpy_strides = [itemsize*s for s in strides] assert all(s > 0 for s in strides) alloc_size = sum(astrd*(alen-1) for alen, astrd in zip(shape, strides)) + 1 # use contiguous array to transfer to host host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get() # use device shape/strides from pyopencl.compyte.array import as_strided host_ref_array = as_strided(host_ref_contig_array, arg_desc.ref_shape, arg_desc.ref_numpy_strides) # flatten the thing host_ref_flat_array = host_ref_array.flatten() # create host array with test shape (but not strides) host_contig_array = np.empty(shape, dtype=dtype) common_len = min( len(host_ref_flat_array), len(host_contig_array.ravel())) host_contig_array.ravel()[:common_len] = \ host_ref_flat_array[:common_len] # create host array with test shape and storage layout host_storage_array = np.empty(alloc_size, dtype) host_array = as_strided( host_storage_array, shape, numpy_strides) host_array[...] = host_contig_array host_contig_array = arg_desc.ref_storage_array.get() storage_array = cl_array.to_device(queue, host_storage_array) ary = cl_array.as_strided(storage_array, shape, numpy_strides) args[arg.name] = ary arg_desc.test_storage_array = storage_array arg_desc.test_array = ary arg_desc.test_shape = shape arg_desc.test_strides = strides arg_desc.test_numpy_strides = numpy_strides arg_desc.test_alloc_size = alloc_size elif arg.arg_class is TemporaryVariable: # global temporary, handled by invocation logic pass else: raise LoopyError("arg type not understood") return args
# Build program in the specified context using the kernel source code prog = cl.Program(context, kernel_src) try: prog.build(options=['-Werror', '-DSCALE={}'.format(SCALE_FACTOR)], devices=[dev], cache_dir=None) except: print('Build log:') print(prog.get_build_info(dev, cl.program_build_info.LOG)) raise # Data and buffers im_src = imread('input_car.png').astype(dtype=np.uint16) shape_dst = (im_src.shape[0]*SCALE_FACTOR, im_src.shape[1]*SCALE_FACTOR) im_dst = np.empty(shape=shape_dst, dtype=np.uint16) src_buff = cl.image_from_array(context, im_src, mode='r') dst_buff = cl.image_from_array(context, im_dst, mode='w') # Enqueue kernel # Note: Global indices is reversed due to OpenCL using column-major order when reading images global_size = im_src.shape[::-1] local_size = None # __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) prog.interp(queue, global_size, local_size, src_buff, dst_buff) # Enqueue command to copy from buffers to host memory # Note: Region indices is reversed due to OpenCL using column-major order when reading images cl.enqueue_copy(queue, dest=im_dst, src=dst_buff, is_blocking=True, origin=(0, 0), region=im_dst.shape[::-1]) # Plot images with built-in scaling disabled
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ import pyopencl,pyopencl.array import numpy ctx = pyopencl.create_some_context() queue = pyopencl.CommandQueue(ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE) x, y, z = numpy.ogrid[-10:10:0.05, -10:10:0.05, -10:10:0.05] r=numpy.sqrt(x*x+y*y+z*z) data = ((x * x - y * y + z * z) * numpy.exp(-r)).astype("float32") gpu_vol = pyopencl.image_from_array(ctx, data, 1) shape = (500, 500) img = numpy.empty(shape,dtype=numpy.float32) gpu_img = pyopencl.array.empty(queue, shape, numpy.float32) prg = open("interpolation.cl").read() sampler = pyopencl.Sampler(ctx, True, # normalized coordinates pyopencl.addressing_mode.CLAMP_TO_EDGE, pyopencl.filter_mode.LINEAR) prg = pyopencl.Program(ctx, prg).build() n = pyopencl.array.to_device(queue, numpy.array([1, 1, 1], dtype=numpy.float32)) c = pyopencl.array.to_device(queue, numpy.array([0.5, 0.5, 0.5], dtype=numpy.float32)) prg.interpolate(queue, (512, 512), (16, 16), gpu_vol, sampler, gpu_img.data, numpy.int32(shape[1]), numpy.int32(shape[1]), c.data, n.data) img = gpu_img.get()