def setUpClass(self): self.queue = get_queue() self.shape = (4, 5, 6) self.size = 4 * 5 * 6 self.values = { 'shape_x': self.shape[2], 'shape_y': self.shape[1], 'shape_z': self.shape[0], 'llength': 2, } self.k = CLKernels(self.queue.context, self.values) self.grid = np.zeros(self.shape, dtype=np.float64) self.grid[0, 0, 0] = 1 self.grid[0, 0, 1] = 1 self.grid[0, 1, 1] = 1 self.grid[0, 0, 2] = 1 self.grid[0, 0, -1] = 1 self.grid[-1, 0, 0] = 1 self.cl_image = cl.image_from_array(self.queue.context, self.grid.astype(np.float32)) self.sampler_linear = cl.Sampler(self.queue.context, True, cl.addressing_mode.REPEAT, cl.filter_mode.LINEAR) self.sampler_nearest = cl.Sampler(self.queue.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) self.out = np.zeros(self.shape, dtype=np.float64) self.cl_out = cl_array.zeros(self.queue, self.shape, dtype=np.float32)
def __init__(self, ctx, values): self.sampler_nearest = cl.Sampler(ctx, True, cl.addressing_mode.REPEAT, cl.filter_mode.NEAREST) self.sampler_linear = cl.Sampler(ctx, True, cl.addressing_mode.REPEAT, cl.filter_mode.LINEAR) self.multiply = ElementwiseKernel(ctx, "float *x, float *y, float *z", "z[i] = x[i] * y[i];") self.conj_multiply = ElementwiseKernel( ctx, "cfloat_t *x, cfloat_t *y, cfloat_t *z", "z[i] = cfloat_mul(cfloat_conj(x[i]), y[i]);") self.calc_lcc_and_take_best = ElementwiseKernel( ctx, """float *gcc, float *ave, float *ave2, int *mask, float norm_factor, int nrot, float *lcc, int *grot""", """float _lcc; if (mask[i] > 0) { _lcc = gcc[i] / sqrt(ave2[i] * norm_factor - ave[i] * ave[i]); if (_lcc > lcc[i]) { lcc[i] = _lcc; grot[i] = nrot; }; }; """) kernel_file = os.path.join(os.path.dirname(__file__), 'kernels.cl') with open(kernel_file) as f: t = Template(f.read()).substitute(**values) self._program = cl.Program(ctx, t).build() self._gws_rotate_grid3d = (96, 64, 1)
def setUp(self): p = cl.get_platforms()[0] devs = p.get_devices() self.ctx = cl.Context(devices=devs) self.queue = cl.CommandQueue(self.ctx, device=devs[0]) self.k = CLKernels(self.ctx) self.k = CLKernels(self.ctx) self.s_linear = cl.Sampler(self.ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) self.s_nearest = cl.Sampler(self.ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST)
def main(): # setup OpenCL platforms = cl.get_platforms( ) # a platform corresponds to a driver (e.g. AMD, NVidia, Intel) platform = platforms[0] # take first platform devices = platform.get_devices( cl.device_type.GPU) # get GPU devices of selected platform device = devices[0] # take first GPU context = cl.Context([device]) # put selected GPU into context object queue = cl.CommandQueue( context, device) # create command queue for selected GPU and context # prepare data imgIn = cv2.imread('photographer.png', cv2.IMREAD_GRAYSCALE) rotation_angle = np.pi / 4 cos_theta = np.cos(rotation_angle) sin_theta = np.sin(rotation_angle) # setup sampler sampler = cl.Sampler(context, True, cl.addressing_mode.REPEAT, cl.filter_mode.NEAREST) # get shape of input image, allocate memory for output to which result can be copied to shape = imgIn.T.shape imgOut = np.empty_like(imgIn) # create image buffers which hold images for OpenCL imgInBuf = cl.image_from_array(context, ary=imgIn, mode="r", norm_int=True, num_channels=1) imgOutBuf = cl.image_from_array(context, ary=imgOut, mode="w", norm_int=True, num_channels=1) # load, compile and execute OpenCL program program = cl.Program(context, open('kernel.cl').read()).build() program.img_rotate(queue, shape, None, sampler, imgInBuf, imgOutBuf, np.double(sin_theta), np.double(cos_theta)) cl.enqueue_copy( queue, imgOut, imgOutBuf, origin=(0, 0), region=shape, is_blocking=True ) # wait until finished copying resulting image back from GPU to CPU # write output image cv2.imwrite('photographer_rotated.png', imgOut) # show images fig, ax = plt.subplots(1, 2) ax[0].imshow(imgIn, cmap='gray') ax[1].imshow(imgOut, cmap='gray') plt.show()
def init_with_device(self): global context, program self.context = context self.program = program self.queue = pyopencl.CommandQueue(self.context) fm = pyopencl.filter_mode.NEAREST self.sampler = pyopencl.Sampler(self.context, False, pyopencl.addressing_mode.CLAMP_TO_EDGE, fm) k_def = KERNELS_DEFS.get((self.src_format, self.dst_format)) assert k_def, "no kernel found for %s to %s" % (self.src_format, self.dst_format) self.kernel_function_name, _, self.channel_order, src = k_def if self.src_format.endswith("P"): #yuv 2 rgb: self.do_convert_image = self.convert_image_yuv else: #rgb 2 yuv: self.do_convert_image = self.convert_image_rgb log("init_context(..) kernel source=%s", src) self.kernel_function = getattr(self.program, self.kernel_function_name) log("init_context(..) channel order=%s, filter mode=%s", CHANNEL_ORDER_TO_STR.get(self.channel_order, self.channel_order), FILTER_MODE_TO_STR.get(fm, fm)) log("init_context(..) kernel_function %s: %s", self.kernel_function_name, self.kernel_function) assert self.kernel_function
def test_rotate_image3d_1(self): shape = (8, 6, 5) np_image = np.zeros(shape, dtype=np.float32) np_image[0, 0, 0] = 1 np_image[0, 0, 1] = 1 np_image[0, 0, 2] = 1 # 90 degree rotation around z-axis rotmat = [[0, 1, 0], [1, 0, 0], [0, 0, 1]] np_out = np.zeros_like(np_image) expected = np.zeros_like(np_image) expected[0, 0, 0] = 1 expected[0, 1, 0] = 1 expected[0, 2, 0] = 1 cl_image = cl.image_from_array(self.queue.context, np_image) cl_out = cl_array.to_device(self.queue, np_out) cl_sampler = cl.Sampler(self.queue.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) self.kernels.rotate_image3d(self.queue, cl_sampler, cl_image, rotmat, cl_out) self.assertTrue(np.allclose(expected, cl_out.get()))
def rescale(image, shape, sampler=None, queue=None, out=None, block=False): """Rescale *image* to *shape* and use *sampler* which is a :class:`pyopencl.Sampler` instance. Use OpenCL *queue* and *out* pyopencl Array. If *block* is True, wait for the copy to finish. """ if cfg.PRECISION.cl_float == 8: raise TypeError("Double precision mode not supported") shape = make_tuple(shape) # OpenCL order factor = float(shape[1]) / image.shape[1], float(shape[0]) / image.shape[0] LOG.debug("rescale, shape: %s, final_shape: %s, factor: %s", image.shape, shape, factor) if queue is None: queue = cfg.OPENCL.queue if out is None: out = cl.array.Array(queue, shape, dtype=cfg.PRECISION.np_float) if not sampler: sampler = cl.Sampler( cfg.OPENCL.ctx, False, cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.LINEAR ) image = g_util.get_image(image) ev = cfg.OPENCL.programs["improc"].rescale( queue, shape[::-1], None, image, out.data, sampler, g_util.make_vfloat2(*factor) ) if block: ev.wait() return out
def _varconvolve_2d_parametrized( image, parameters, kernel_name, sampler=None, queue=None, out=None, block=False ): """Variable convolution of *image* with *parameters*, use OpoenCL kernel *kernel_name*, *sampler*, *queue*, *out* and wait if *block* is True. Return *out*. """ if queue is None: queue = cfg.OPENCL.queue if out is None: out = cl.array.Array(queue, image.shape, dtype=cfg.PRECISION.np_float) if sampler is None: sampler = cl.Sampler( queue.context, False, cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.NEAREST ) if not isinstance(parameters, cl_array.Array): params_host = np.empty(parameters[0].shape, dtype=cfg.PRECISION.vfloat2) params_host["y"] = g_util.get_host(parameters[0]) params_host["x"] = g_util.get_host(parameters[1]) parameters = cl_array.to_device(queue, params_host) if parameters.shape != image.shape: raise ValueError( "Parameters shape '{}' differs from image shape '{}'".format( parameters.shape, image.shape ) ) image = g_util.get_image(image, queue=queue) args = (image, out.data, sampler, cl_array.vec.make_int2(0, 0), parameters.data) varconvolve(kernel_name, image.shape[::-1], args, queue=queue, block=block) return out
def compute(self): self.width, self.height = self.input(0).size self.devOutBuffer = cl.Image(self.engine.ctx, self.engine.mf.READ_WRITE, self.image_format, shape=(self.width, self.height)) sampler = cl.Sampler( self.engine.ctx, True, # Normalized coordinates cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.LINEAR) exec_evt = self.program.run_add( self.engine.queue, self.size, None, self.input(0).getOutDevBuffer(), self.input(1).getOutDevBuffer(), self.devOutBuffer, sampler, numpy.int32(self.width), numpy.int32(self.height), ) exec_evt.wait()
def setupClQueue(self, ctx): self.ctx = ctx self.queue = cl.CommandQueue(self.ctx, properties=cl.command_queue_properties. OUT_OF_ORDER_EXEC_MODE_ENABLE) self.mf = cl.mem_flags self.sampler = cl.Sampler(self.ctx, True, cl.addressing_mode.REPEAT, cl.filter_mode.LINEAR) pass
def main(): imageObjects = [0, 0] # Main if len(sys.argv) != 3: print "USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>" return 1 # Create an OpenCL context on first available platform context, device = CreateContext() if context == None: print "Failed to create OpenCL context." return 1 # Create a command-queue on the first device available commandQueue = cl.CommandQueue(context, device) # Make sure the device supports images, otherwise exit if not device.get_info(cl.device_info.IMAGE_SUPPORT): print "OpenCL device does not support images." return 1 # Load input image from file and load it into # an OpenCL image object imageObjects[0], imgSize = LoadImage(context, sys.argv[1]) # Create ouput image object clImageFormat = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8) imageObjects[1] = cl.Image(context, cl.mem_flags.WRITE_ONLY, clImageFormat, imgSize) # Create sampler for sampling image object sampler = cl.Sampler( context, False, # Non-normalized coordinates cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.NEAREST) # Create OpenCL program program = CreateProgram(context, device, "ImageFilter2D.cl") # Call the kernel directly localWorkSize = (16, 16) globalWorkSize = (RoundUp(localWorkSize[0], imgSize[0]), RoundUp(localWorkSize[1], imgSize[1])) program.gaussian_filter(commandQueue, globalWorkSize, localWorkSize, imageObjects[0], imageObjects[1], sampler, numpy.int32(imgSize[0]), numpy.int32(imgSize[1])) # Read the output buffer back to the Host buffer = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8) origin = (0, 0, 0) region = (imgSize[0], imgSize[1], 1) cl.enqueue_read_image(commandQueue, imageObjects[1], origin, region, buffer).wait() print "Executed program successfully." # Save the image to disk SaveImage(sys.argv[2], buffer, imgSize)
def set_envmap(self, envmap): fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT) em = np.zeros(envmap.shape[:2] + (4, ), dtype=np.float32) em[:, :, :3] = envmap em[:, :, 3] = 1 self.envmap = cl.Image(context, mf.READ_ONLY | mf.COPY_HOST_PTR, fmt, shape=em.shape[:2], hostbuf=em) self.sampler = cl.Sampler(context, True, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR)
def test_image_2d(self, device, ctx_getter): context = ctx_getter() if not device.image_support: from py.test import skip skip("images not supported on %s" % device) prg = cl.Program( context, """ __kernel void copy_image( __global float4 *dest, __read_only image2d_t src, sampler_t samp, int width) { int x = get_global_id(0); int y = get_global_id(1); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[x + width*y] = read_imagef(src, samp, (float2)(x, y)); // dest[x + width*y] = get_image_height(src); } """).build() a = numpy.random.rand(1024, 1024, 4).astype(numpy.float32) queue = cl.CommandQueue(context) mf = cl.mem_flags a_img = cl.Image(context, mf.READ_ONLY | mf.COPY_HOST_PTR, cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT), shape=a.shape[:2], hostbuf=a) a_dest = cl.Buffer(context, mf.READ_WRITE, a.nbytes) samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, numpy.int32(a.shape[0])) a_result = numpy.empty_like(a) cl.enqueue_read_buffer(queue, a_dest, a_result, is_blocking=True) print a_result.dtype assert la.norm(a_result - a) == 0
def _transfer_real( self, shape, center, pixel_size, energy, exponent, compute_phase, is_parabola, out, queue, block, flux=1, ): flux = (self.get_flux(energy, None, pixel_size).rescale( 1 / q.s).magnitude.astype(cfg.PRECISION.np_float)) cl_image = gutil.get_image(flux, queue=queue) sampler = cl.Sampler(cfg.OPENCL.ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) cl_center = gutil.make_vfloat3(*center) cl_ps = gutil.make_vfloat2(*pixel_size.simplified.magnitude[::-1]) cl_input_ps = gutil.make_vfloat2( *self._pixel_size.simplified.magnitude[::-1]) z_sample = self.sample_distance.simplified.magnitude lam = energy_to_wavelength(energy).simplified.magnitude kernel = cfg.OPENCL.programs["physics"].make_flat_from_2D_profile ev = kernel( queue, shape[::-1], None, out.data, cl_image, sampler, cl_center, cl_ps, cl_input_ps, cfg.PRECISION.np_float(z_sample), cfg.PRECISION.np_float(lam), np.int32(exponent), np.int32(compute_phase), np.int32(is_parabola), ) if block: ev.wait()
def test_int_ptr(ctx_factory): def do_test(obj): new_obj = type(obj).from_int_ptr(obj.int_ptr) assert obj == new_obj assert type(obj) is type(new_obj) ctx = ctx_factory() device, = ctx.devices platform = device.platform do_test(device) do_test(platform) do_test(ctx) queue = cl.CommandQueue(ctx) do_test(queue) evt = cl.enqueue_marker(queue) do_test(evt) prg = cl.Program( ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg) do_test(prg.sum) n = 2000 a_buf = cl.Buffer(ctx, 0, n * 4) do_test(a_buf) # crashes on intel... # and pocl does not support CL_ADDRESS_CLAMP if device.image_support and platform.vendor not in [ "Intel(R) Corporation", "The pocl project", ]: smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) do_test(img)
def parallel_prediction_errors(self, image): """ Get the MILC prediction errors for a 3D image by means of OpenCL accelerated computation Keyword arguments: image -- a 3D numpy array (bitmap image) Return: a 3D numpy array of the same shape of "image", containing the prediction errors """ mf = cl.mem_flags # Define the image format for the prediction errors err_format = cl.ImageFormat(channel_order=cl.channel_order.R, channel_type=DataType.CL_ERR.value) # Define the input image from the numpy 3D array source_image = cl.image_from_array(self.ctx, image) original_shape = numpy.shape(image) cl_shape = list( reversed(original_shape)) # inverted shape (pyOpenCL bug?) # output image output_image = cl.Image(self.ctx, mf.WRITE_ONLY, err_format, shape=cl_shape) # sampler. pixels out of range have a value of '0' sampler = cl.Sampler(self.ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) # enqueue kernel self.program.image_test(self.queue, original_shape, None, source_image, output_image, sampler) # read the resulting image into a numpy array output_data = numpy.empty(shape=cl_shape, dtype=DataType.ERR.value) cl.enqueue_read_image(self.queue, output_image, (0, 0, 0), cl_shape, output_data) return output_data.reshape(original_shape)
def setUp(self): self.pdb1 = disvis.PDB.fromfile( os.path.join(os.path.dirname(__file__), 'data', 'O14250.pdb')) self.pdb2 = disvis.PDB.fromfile( os.path.join(os.path.dirname(__file__), 'data', 'Q9UT97.pdb')) q, w, a = disvis.rotations.proportional_orientations(10) self.rotations = disvis.rotations.quat_to_rotmat(q) self.vlength = int(np.linalg.norm(self.pdb2.coor - self.pdb2.center, axis=1).max() +\ 3 + 1.5)/1.0 self.shape = disvis.disvis.grid_shape(self.pdb1.coor, self.pdb2.coor, 1) radii = np.zeros(self.pdb1.coor.shape[0], dtype=np.float64) radii.fill(1.5 + 3) self.rsurf = disvis.disvis.rsurface(self.pdb1.coor, radii, self.shape, 1) self.rcore = disvis.volume.erode(self.rsurf, 3) self.origin = self.rsurf.origin self.voxelspacing = 1 radii = np.zeros(self.pdb2.coor.shape[0], dtype=np.float64) radii.fill(1.5) self.im_lsurf = disvis.points.dilate_points(self.pdb2.coor - self.pdb2.center \ + self.pdb1.center, radii, disvis.volume.zeros_like(self.rcore)) self.im_center = np.asarray( (self.pdb1.center - self.rcore.origin) / 1.0, dtype=np.float64) self.queue = disvis.helpers.get_queue() self.sampler = cl.Sampler(self.queue.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) self.kernels = disvis.kernels.Kernels(self.queue.context) self.ft_shape = list(self.shape) self.ft_shape[0] = self.ft_shape[0] // 2 + 1 self.ft_shape = tuple(self.ft_shape) self.kernels.rfftn = disvis.pyclfft.RFFTn(self.queue.context, self.shape) self.kernels.irfftn = disvis.pyclfft.iRFFTn(self.queue.context, self.shape)
if not info_name.startswith("_") and info_name != "to_string": info = getattr(info_cls, info_name) try: info_value = obj.get_info(info) except: info_value = "<error>" print "%s: %s" % (info_name, info_value) platform = cl.get_platforms()[0] device = platform.get_devices()[0] context = cl.Context(devices=[device]) queue = cl.CommandQueue(context) mf = cl.mem_flags sampler = cl.Sampler(context, True, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) def normal_maker(name, mat, matw, matr): def tup(i): return '(float4)' + repr(tuple(mat[i].tolist())) def tupw(i): return '(float4)' + repr(tuple(matw[i].tolist())) def tupr(i): return '(float4)' + repr(tuple(matr[i].tolist())) return """ inline float4 matmul3_%s(const float4 r1) { return (float4)(dot(%s,r1),dot(%s,r1),dot(%s,r1),0);
def dilate(): #headURI = 'http://www.slicer.org/slicerWiki/images/4/43/MR-head.nrrd' #labelURI = 'http://boggs.bwh.harvard.edu/tmp/MRHead-label.nrrd' base = '/tmp/hoot/' headURI = base + 'MR-head.nrrd' labelURI = base + 'MR-head-label.nrrd' print("Starting...") if not slicer.util.getNode('MR-head*'): print("Downloading...") vl = slicer.modules.volumes.logic() name = 'MR-head' volumeNode = vl.AddArchetypeVolume(headURI, name, 0) name = 'MR-head-label' labelNode = vl.AddArchetypeVolume(labelURI, name, 1) if volumeNode: storageNode = volumeNode.GetStorageNode() if storageNode: # Automatically select the volume to display appLogic = slicer.app.applicationLogic() selNode = appLogic.GetSelectionNode() selNode.SetReferenceActiveVolumeID(volumeNode.GetID()) selNode.SetReferenceActiveLabelVolumeID(labelNode.GetID()) appLogic.PropagateVolumeSelection(1) node = slicer.util.getNode('MR-head') volume = slicer.util.array('MR-head') oneOverVolumeMax = 1. / volume.max() labelNode = slicer.util.getNode('MR-head-label') labelVolume = slicer.util.array('MR-head-label') print("Creating Context...") ctx = None for platform in cl.get_platforms(): for device in platform.get_devices(): print(cl.device_type.to_string(device.type)) if cl.device_type.to_string(device.type) == "GPU": ctx = cl.Context([device]) break; if not ctx: print ("no GPU context available") ctx = cl.create_some_context() print("Creating Queue...") queue = cl.CommandQueue(ctx) print("Copying volumes...") mf = cl.mem_flags volume_dev = cl_array.to_device(queue, volume) volume_image_dev = cl.image_from_array(ctx, volume,1) label_dev = cl.array.to_device(queue, labelVolume) theta = numpy.zeros_like(volume) theta_dev = cl.array.to_device(queue,theta) thetaNext = numpy.zeros_like(volume) thetaNext_dev = cl.array.to_device(queue,thetaNext) dest_dev = cl_array.empty_like(volume_dev) sampler = cl.Sampler(ctx,False,cl.addressing_mode.REPEAT,cl.filter_mode.LINEAR) print("Building program...") slices,rows,columns = volume.shape prg = cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_fp64: enable __kernel void copy( __global short source[{slices}][{rows}][{columns}], __global short destination[{slices}][{rows}][{columns}]) {{ size_t slice = get_global_id(0); size_t column = get_global_id(1); size_t row = get_global_id(2); if (slice < {slices} && row < {rows} && column < {columns}) {{ destination[slice][row][column] = source [slice][row][column]; }} }} __kernel void dilate( __read_only image3d_t volume, __global short label[{slices}][{rows}][{columns}], sampler_t volumeSampler, __global short dest[{slices}][{rows}][{columns}]) {{ size_t slice = get_global_id(0); size_t column = get_global_id(1); size_t row = get_global_id(2); if (slice >= {slices} || row >= {rows} || column >= {columns}) {{ return; }} int size = 1; int sliceOff, rowOff, columnOff; unsigned int sampleSlice, sampleRow, sampleColumn; short samples = 0; float4 samplePosition; for (sliceOff = -size; sliceOff <= size; sliceOff++) {{ sampleSlice = slice + sliceOff; if (sampleSlice < 0 || sampleSlice >= {slices}) continue; for (rowOff = -size; rowOff <= size; rowOff++) {{ sampleRow = row + rowOff; if (sampleRow < 0 || sampleRow >= {rows}) continue; for (columnOff = -size; columnOff <= size; columnOff++) {{ sampleColumn = column + columnOff; if (sampleColumn < 0 || sampleColumn >= {columns}) continue; if (label[sampleSlice][sampleRow][sampleColumn] != 0) {{ samples++; }} }} }} }} dest[slice][row][column] = samples; }} """.format(slices=slices,rows=rows,columns=columns)).build() def iterate(iterations=10): print("Running!") for iteration in xrange(iterations): prg.dilate(queue, volume.shape, None, volume_image_dev, label_dev.data, sampler, dest_dev.data) prg.copy(queue, volume.shape, None, dest_dev.data, label_dev.data) print("Getting data...") labelVolume[:] = dest_dev.get() print("Rendering...") labelNode.GetImageData().Modified() node.GetImageData().Modified() print("Done!") def grow(iterations=10): for iteration in xrange(iterations): iterate(1) slicer.app.processEvents()
def imageBlur(): print("Starting...") if not slicer.util.getNode('MRHead*'): print("Downloading...") vl = slicer.modules.volumes.logic() uri = 'http://www.slicer.org/slicerWiki/images/4/43/MR-head.nrrd' name = 'MRHead' volumeNode = vl.AddArchetypeVolume(uri, name, 0) if volumeNode: storageNode = volumeNode.GetStorageNode() if storageNode: # Automatically select the volume to display appLogic = slicer.app.applicationLogic() selNode = appLogic.GetSelectionNode() selNode.SetReferenceActiveVolumeID(volumeNode.GetID()) appLogic.PropagateVolumeSelection(1) node = slicer.util.getNode('MRHead*') volume = slicer.util.array('MRHead*') print("Creating Context...") ctx = None for platform in cl.get_platforms(): for device in platform.get_devices(): print(cl.device_type.to_string(device.type)) if cl.device_type.to_string(device.type) == "GPU": ctx = cl.Context([device]) break; if not ctx: print ("no GPU context available") ctx = cl.create_some_context() print("Creating Queue...") queue = cl.CommandQueue(ctx) print("Copying volume...") mf = cl.mem_flags volume_dev = cl_array.to_device(queue, volume) volume_image_dev = cl.image_from_array(ctx, volume,1) dest_dev = cl_array.empty_like(volume_dev) sampler = cl.Sampler(ctx,False,cl.addressing_mode.REPEAT,cl.filter_mode.LINEAR) print("Building program...") slices,rows,columns = volume.shape prg = cl.Program(ctx, """ #pragma OPENCL EXTENSION cl_khr_fp64: enable __kernel void blur( __read_only image3d_t volume, sampler_t volumeSampler, __global short dest[{slices}][{rows}][{columns}]) {{ size_t slice = get_global_id(0); size_t column = get_global_id(1); size_t row = get_global_id(2); int size = 10; int sliceOff, rowOff, columnOff; unsigned int sampleSlice, sampleRow, sampleColumn; float sum = 0; unsigned int samples = 0; float4 samplePosition; int4 sample; for (sliceOff = -size; sliceOff <= size; sliceOff++) {{ sampleSlice = slice + sliceOff; if (sampleSlice < 0 || sampleSlice >= {slices}) continue; for (rowOff = -size; rowOff <= size; rowOff++) {{ sampleRow = row + rowOff; if (sampleRow < 0 || sampleRow >= {rows}) continue; for (columnOff = -size; columnOff <= size; columnOff++) {{ sampleColumn = column + columnOff; if (sampleColumn < 0 || sampleColumn >= {columns}) continue; samplePosition.x = sampleColumn; samplePosition.y = sampleRow; samplePosition.z = sampleSlice; sample = read_imagei(volume, volumeSampler, samplePosition); //sum += sampleSlice+sampleRow+sampleColumn; sum += sample.x; samples++; }} }} }} dest[slice][row][column] = (short) (sum / samples); }} """.format(slices=slices,rows=rows,columns=columns)).build() print("Running!") prg.blur(queue, volume.shape, None, volume_image_dev, label_image_dev, sampler, dest_dev.data) print("Getting data...") volume[:] = dest_dev.get() print("Rendering...") node.GetImageData().Modified() print("Done!")
import pyopencl, pyopencl.array import numpy ctx = pyopencl.create_some_context() queue = pyopencl.CommandQueue( ctx, properties=pyopencl.command_queue_properties.PROFILING_ENABLE) x, y, z = numpy.ogrid[-10:10:0.05, -10:10:0.05, -10:10:0.05] r = numpy.sqrt(x * x + y * y + z * z) data = ((x * x - y * y + z * z) * numpy.exp(-r)).astype("float32") gpu_vol = pyopencl.image_from_array(ctx, data, 1) shape = (500, 500) img = numpy.empty(shape, dtype=numpy.float32) gpu_img = pyopencl.array.empty(queue, shape, numpy.float32) prg = open("interpolation.cl").read() sampler = pyopencl.Sampler( ctx, True, # normalized coordinates pyopencl.addressing_mode.CLAMP_TO_EDGE, pyopencl.filter_mode.LINEAR) prg = pyopencl.Program(ctx, prg).build() n = pyopencl.array.to_device(queue, numpy.array([1, 1, 1], dtype=numpy.float32)) c = pyopencl.array.to_device(queue, numpy.array([0.5, 0.5, 0.5], dtype=numpy.float32)) prg.interpolate(queue, (512, 512), (16, 16), gpu_vol, sampler, gpu_img.data, numpy.int32(shape[1]), numpy.int32(shape[1]), c.data, n.data) img = gpu_img.get() #timing: evt = [] evt.append(
def test_image_2d(ctx_factory): context = ctx_factory() device, = context.devices if not device.image_support: from pytest import skip skip("images not supported on %s" % device) if "Intel" in device.vendor and "31360.31426" in device.version: from pytest import skip skip("images crashy on %s" % device) _xfail_if_pocl(device.platform, None, "pocl does not support CL_ADDRESS_CLAMP") prg = cl.Program(context, """ __kernel void copy_image( __global float *dest, __read_only image2d_t src, sampler_t samp, int stride0) { int d0 = get_global_id(0); int d1 = get_global_id(1); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[d0*stride0 + d1] = read_imagef(src, samp, (float2)(d1, d0)).x; } """).build() num_channels = 1 a = np.random.rand(1024, 512, num_channels).astype(np.float32) if num_channels == 1: a = a[:, :, 0] queue = cl.CommandQueue(context) try: a_img = cl.image_from_array(context, a, num_channels) except cl.RuntimeError: import sys exc = sys.exc_info()[1] if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: from pytest import skip skip("required image format not supported on %s" % device.name) else: raise a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes) samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) prg.copy_image(queue, a.shape, None, a_dest, a_img, samp, np.int32(a.strides[0]/a.dtype.itemsize)) a_result = np.empty_like(a) cl.enqueue_copy(queue, a_result, a_dest) good = la.norm(a_result - a) == 0 if not good: if queue.device.type & cl.device_type.CPU: assert good, ("The image implementation on your CPU CL platform '%s' " "returned bad values. This is bad, but common." % queue.device.platform) else: assert good
def main(): os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1' imageObjects = [ 0, 0, 0 ] # Main if len(sys.argv) != 4: print "USAGE: " + sys.argv[0] + " <source> <palette> <output>" return 1 # Create an OpenCL context on first available platform context, device = CreateContext(); if context == None: print "Failed to create OpenCL context." return 1 # Create a command-queue on the first device available # on the created context commandQueue = cl.CommandQueue(context, device) # Make sure the device supports images, otherwise exit if not device.get_info(cl.device_info.IMAGE_SUPPORT): print "OpenCL device does not support images." return 1 # Load input image from file and load it into # an OpenCL image object imageObjects[0], srcSize = LoadImage(context, sys.argv[1]) imageObjects[1], palSize = LoadImage(context, sys.argv[2]) pixels = srcSize[0] * srcSize[1] if (palSize[0] * palSize[1] != pixels): print "Images do not contain the same number of pixels." return 1 ######################################### ### ### ### TODO Rearrange the palette to the correct dimensions here ### ### ######################################### # Create ouput image object clImageFormat = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8) bufferObject = cl.Buffer(context, cl.mem_flags.READ_WRITE, pixels * 4 * 4) imageObjects[2] = cl.Image(context, cl.mem_flags.WRITE_ONLY, clImageFormat, srcSize) # Create sampler for sampling image object sampler = cl.Sampler(context, False, # Non-normalized coordinates cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.NEAREST) # Create OpenCL program program = CreateProgram(context, device, "quad_swap.cl") # Call the kernel directly localWorkSize = (16,) globalWorkSize = ( RoundUp(localWorkSize[0], srcSize[0]*srcSize[1]/2), ) program.quad_swap(commandQueue, globalWorkSize, localWorkSize, imageObjects[0], imageObjects[1], bufferObject, imageObjects[2], sampler, numpy.int32(srcSize[0]), numpy.int32(srcSize[1])) # Read the output buffer back to the Host buffer = numpy.zeros(srcSize[0] * srcSize[1] * 4, numpy.uint8) origin = ( 0, 0, 0 ) region = ( srcSize[0], srcSize[1], 1 ) cl.enqueue_read_image(commandQueue, imageObjects[2], origin, region, buffer).wait() # Save the image to disk SaveImage(sys.argv[3], buffer, srcSize)
def test_get_info(self, platform, device): failure_count = [0] CRASH_QUIRKS = [ (("NVIDIA Corporation", "NVIDIA CUDA", "OpenCL 1.0 CUDA 3.0.1"), [ (cl.Event, cl.event_info.COMMAND_QUEUE), ]), ] QUIRKS = [] plat_quirk_key = (platform.vendor, platform.name, platform.version) def find_quirk(quirk_list, cl_obj, info): for entry_plat_key, quirks in quirk_list: if entry_plat_key == plat_quirk_key: for quirk_cls, quirk_info in quirks: if (isinstance(cl_obj, quirk_cls) and quirk_info == info): return True return False def do_test(cl_obj, info_cls, func=None, try_attr_form=True): if func is None: def func(info): cl_obj.get_info(info) for info_name in dir(info_cls): if not info_name.startswith("_") and info_name != "to_string": info = getattr(info_cls, info_name) if find_quirk(CRASH_QUIRKS, cl_obj, info): print "not executing get_info", type(cl_obj), info_name print "(known crash quirk for %s)" % platform.name continue try: func(info) except: msg = "failed get_info", type(cl_obj), info_name if find_quirk(QUIRKS, cl_obj, info): msg += ("(known quirk for %s)" % platform.name) else: failure_count[0] += 1 if try_attr_form: try: getattr(cl_obj, info_name.lower()) except: print "failed attr-based get_info", type( cl_obj), info_name if find_quirk(QUIRKS, cl_obj, info): print "(known quirk for %s)" % platform.name else: failure_count[0] += 1 do_test(platform, cl.platform_info) do_test(device, cl.device_info) ctx = cl.Context([device]) do_test(ctx, cl.context_info) props = 0 if (device.queue_properties & cl.command_queue_properties.PROFILING_ENABLE): profiling = True props = cl.command_queue_properties.PROFILING_ENABLE queue = cl.CommandQueue(ctx, properties=props) do_test(queue, cl.command_queue_info) prg = cl.Program( ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg, cl.program_info) do_test(prg, cl.program_build_info, lambda info: prg.get_build_info(device, info), try_attr_form=False) cl.unload_compiler() # just for the heck of it mf = cl.mem_flags n = 2000 a_buf = cl.Buffer(ctx, 0, n * 4) do_test(a_buf, cl.mem_info) kernel = prg.sum do_test(kernel, cl.kernel_info) evt = kernel(queue, (n, ), None, a_buf) do_test(evt, cl.event_info) if profiling: evt.wait() do_test(evt, cl.profiling_info, lambda info: evt.get_profiling_info(info), try_attr_form=False) if device.image_support: smp = cl.Sampler(ctx, True, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp, cl.sampler_info) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) assert img.shape == (128, 256) img.depth img.image.depth do_test(img, cl.image_info, lambda info: img.get_image_info(info)) if failure_count[0]: raise RuntimeError( "get_info testing had %d errors " "(If you compiled against OpenCL 1.1 but are testing a 1.0 " "implementation, you can safely ignore this.)" % failure_count[0])
def test_findMembranePositionUsingMaxIncline(self): inputPath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/findMembranePositionUsingMaxIncline_000/input' referencePath = 'C:/Private/PhD_Publications/Publication_of_Algorithm/Code/TrackingAlgorithm/TrackingAlgorithm/TestData/ReferenceDataForTests/UnitTests/OpenClKernels/findMembranePositionUsingMaxIncline_000/output' referenceVariableName1 = 'dev_membraneCoordinates' referenceVariableName2 = 'dev_membraneNormalVectors' referenceVariableName3 = 'dev_fitInclines' self.loadHostVariable('linFitSearchRangeXvalues', inputPath) self.setupTest("maximumIntensityIncline") self.nrOfLocalAngleSteps = 64 self.detectionKernelStrideSize = 2048 self.nrOfStrides = 1 self.nrOfDetectionAngleSteps = np.float64( self.nrOfStrides * self.detectionKernelStrideSize) self.sampler = cl.Sampler(self.ctx, True, cl.addressing_mode.REPEAT, cl.filter_mode.LINEAR) self.loadHostVariable('trackingGlobalSize', inputPath) self.loadHostVariable('trackingWorkGroupSize', inputPath) self.loadHostVariable('host_Img', inputPath) self.dev_Img = cl.image_from_array(self.ctx, ary=self.host_Img, mode="r", norm_int=False, num_channels=1) self.loadHostVariable('imgSizeX', inputPath) self.loadHostVariable('imgSizeY', inputPath) #self.saveDeviceVariable('buf_localRotationMatrices',inputPath) self.loadHostVariable('localRotationMatrices', inputPath) self.buf_localRotationMatrices = cl.Buffer( self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.localRotationMatrices) #self.saveDeviceVariable('buf_linFitSearchRangeXvalues',inputPath) self.buf_linFitSearchRangeXvalues = cl.Buffer( self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.linFitSearchRangeXvalues) self.loadHostVariable('linFitParameter', inputPath) self.loadHostVariable('fitIntercept_memSize', inputPath) self.fitIncline_memSize = self.fitIntercept_memSize self.loadHostVariable('rotatedUnitVector_memSize', inputPath) self.loadHostVariable('meanParameter', inputPath) #self.saveDeviceVariable('buf_meanRangeXvalues',inputPath) self.loadHostVariable('meanRangeXvalues', inputPath) self.buf_meanRangeXvalues = cl.Buffer(self.ctx, self.mf.READ_ONLY | self.mf.COPY_HOST_PTR, hostbuf=self.meanRangeXvalues) self.loadHostVariable('meanRangePositionOffset', inputPath) self.loadHostVariable('localMembranePositions_memSize', inputPath) self.loadDeviceVariable('dev_membraneCoordinates', inputPath) self.loadDeviceVariable('dev_membraneNormalVectors', inputPath) self.loadDeviceVariable('dev_fitInclines', inputPath) self.loadHostVariable('inclineTolerance', inputPath) self.inclineRefinementRange = np.int32(2) self.setWorkGroupSizes() for strideNr in range(self.nrOfStrides): # set the starting index of the coordinate array for each kernel instance kernelCoordinateStartingIndex = np.int32( strideNr * self.detectionKernelStrideSize) self.prg.findMembranePosition(self.queue, self.trackingGlobalSize, self.trackingWorkGroupSize, self.sampler, \ self.dev_Img, self.imgSizeX, self.imgSizeY, \ self.buf_localRotationMatrices, \ self.buf_linFitSearchRangeXvalues, \ self.linFitParameter, \ cl.LocalMemory(self.fitIntercept_memSize), cl.LocalMemory(self.fitIncline_memSize), \ cl.LocalMemory(self.rotatedUnitVector_memSize), \ self.meanParameter, \ self.buf_meanRangeXvalues, self.meanRangePositionOffset, \ cl.LocalMemory(self.localMembranePositions_memSize), \ self.dev_membraneCoordinates.data, \ self.dev_membraneNormalVectors.data, \ self.dev_fitInclines.data, \ kernelCoordinateStartingIndex, \ self.inclineTolerance, \ self.inclineRefinementRange) barrierEvent = cl.enqueue_barrier(self.queue) self.assertVector2EqualsExpectedResult( self.dev_membraneCoordinates, referencePath + '/' + referenceVariableName1 + '.npy') self.assertVector2EqualsExpectedResult( self.dev_membraneNormalVectors, referencePath + '/' + referenceVariableName2 + '.npy') self.assertVectorEqualsExpectedResult( self.dev_fitInclines, referencePath + '/' + referenceVariableName3 + '.npy') pass
def _gpu_init(self): """Method to initialize all the data for GPU-accelerate search""" self.gpu_data = {} g = self.gpu_data d = self.data q = self.queue # move data to the GPU. All should be float32, as these is the native # lenght for GPUs g['rcore'] = cl_array.to_device(q, float32array(d['rcore'].array)) g['rsurf'] = cl_array.to_device(q, float32array(d['rsurf'].array)) # Make the scanning chain object an Image, as this is faster to rotate g['im_lsurf'] = cl.image_from_array(q.context, float32array(d['lsurf'].array)) g['sampler'] = cl.Sampler(q.context, False, cl.addressing_mode.CLAMP, cl.filter_mode.LINEAR) if self.distance_restraints: g['restraints'] = cl_array.to_device(q, float32array(d['restraints'])) # Allocate arrays on the GPU g['lsurf'] = cl_array.zeros_like(g['rcore']) g['clashvol'] = cl_array.zeros_like(g['rcore']) g['intervol'] = cl_array.zeros_like(g['rcore']) g['interspace'] = cl_array.zeros(q, d['shape'], dtype=np.int32) g['restspace'] = cl_array.zeros_like(g['interspace']) g['access_interspace'] = cl_array.zeros_like(g['interspace']) g['best_access_interspace'] = cl_array.zeros_like(g['interspace']) # arrays for counting # Reductions are typically tedious on GPU, and we need to define the # workgroupsize to allocate the correct amount of data WORKGROUPSIZE = 32 nsubhists = int(np.ceil(g['rcore'].size / WORKGROUPSIZE)) g['subhists'] = cl_array.zeros(q, (nsubhists, d['nrestraints'] + 1), dtype=np.float32) g['viol_counter'] = cl_array.zeros( q, (nsubhists, d['nrestraints'], d['nrestraints']), dtype=np.float32) # complex arrays g['ft_shape'] = list(d['shape']) g['ft_shape'][0] = d['shape'][0] // 2 + 1 g['ft_rcore'] = cl_array.zeros(q, g['ft_shape'], dtype=np.complex64) g['ft_rsurf'] = cl_array.zeros_like(g['ft_rcore']) g['ft_lsurf'] = cl_array.zeros_like(g['ft_rcore']) g['ft_clashvol'] = cl_array.zeros_like(g['ft_rcore']) g['ft_intervol'] = cl_array.zeros_like(g['ft_rcore']) # other miscellanious data g['nrot'] = d['nrot'] g['max_clash'] = d['max_clash'] g['min_interaction'] = d['min_interaction'] # kernels g['k'] = Kernels(q.context) g['k'].rfftn = pyclfft.RFFTn(q.context, d['shape']) g['k'].irfftn = pyclfft.iRFFTn(q.context, d['shape']) # initial calculations g['k'].rfftn(q, g['rcore'], g['ft_rcore']) g['k'].rfftn(q, g['rsurf'], g['ft_rsurf'])
def test_image_3d(ctx_factory): #test for image_from_array for 3d image of float2 context = ctx_factory() device, = context.devices if not device.image_support: from pytest import skip skip("images not supported on %s" % device) if device.platform.vendor == "Intel(R) Corporation": from pytest import skip skip("images crashy on %s" % device) _skip_if_pocl(device.platform, 'pocl does not support CL_ADDRESS_CLAMP') prg = cl.Program( context, """ __kernel void copy_image_plane( __global float2 *dest, __read_only image3d_t src, sampler_t samp, int stride0, int stride1) { int d0 = get_global_id(0); int d1 = get_global_id(1); int d2 = get_global_id(2); /* const sampler_t samp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; */ dest[d0*stride0 + d1*stride1 + d2] = read_imagef( src, samp, (float4)(d2, d1, d0, 0)).xy; } """).build() num_channels = 2 shape = (3, 4, 2) a = np.random.random(shape + (num_channels, )).astype(np.float32) queue = cl.CommandQueue(context) try: a_img = cl.image_from_array(context, a, num_channels) except cl.RuntimeError: import sys exc = sys.exc_info()[1] if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED: from pytest import skip skip("required image format not supported on %s" % device.name) else: raise a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes) samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) prg.copy_image_plane( queue, shape, None, a_dest, a_img, samp, np.int32(a.strides[0] / a.itemsize / num_channels), np.int32(a.strides[1] / a.itemsize / num_channels), ) a_result = np.empty_like(a) cl.enqueue_copy(queue, a_result, a_dest) good = la.norm(a_result - a) == 0 if not good: if queue.device.type & cl.device_type.CPU: assert good, ( "The image implementation on your CPU CL platform '%s' " "returned bad values. This is bad, but common." % queue.device.platform) else: assert good
def setup_device(self, imshape): print('Setting up with imshape = %s' % (str(imshape))) self.cached_shape = imshape self.clIm = cla.Array(self.q, imshape, np.float32) self.clm = cla.empty_like(self.clIm) self.clx = cla.empty_like(self.clIm) self.cly = cla.empty_like(self.clIm) self.clO = cla.zeros_like(self.clIm) self.clM = cla.zeros_like(self.clIm) self.clF = cla.empty_like(self.clIm) self.clS = cla.empty_like(self.clIm) self.clThisS = cla.empty_like(self.clIm) self.clScratch = cla.empty_like(self.clIm) self.radial_prg = pyopencl.Program(self.ctx, RADIAL_PROGRAM).build() self.sobel = Sobel(self.ctx, self.q) #self.sepcorr2d = NaiveSeparableCorrelation(self.ctx, self.q) self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q) self.accum = ElementwiseKernel(self.ctx, 'float *a, float *b', 'a[i] += b[i]') self.norm_s = ElementwiseKernel(self.ctx, 'float *s, const float nRadii', 's[i] = -1 * s[i] / nRadii', 'norm_s') self.accum_s = ElementwiseKernel(self.ctx, 'float *a, float *b, const float nr', 'a[i] -= b[i] / nr') self.gaussians = {} self.gaussian_prgs = {} self.minmax = MinMaxKernel(self.ctx, self.q) # starburst storage clImageFormat = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT) self.clIm2D = cl.Image(self.ctx, mf.READ_ONLY, clImageFormat, imshape) # Create sampler for sampling image object self.imSampler = cl.Sampler(self.ctx, False, # Non-normalized coordinates cl.addressing_mode.CLAMP_TO_EDGE, cl.filter_mode.LINEAR) self.cl_find_ray_boundaries = FindRayBoundaries(self.ctx, self.q) self.calcF = self.radial_prg.calcF self.calcOM = self.radial_prg.calcOM
def test_get_info(ctx_factory): ctx = ctx_factory() device, = ctx.devices platform = device.platform failure_count = [0] pocl_quirks = [ (cl.Buffer, cl.mem_info.OFFSET), (cl.Program, cl.program_info.BINARIES), (cl.Program, cl.program_info.BINARY_SIZES), ] if ctx._get_cl_version() >= (1, 2) and cl.get_cl_header_version() >= (1, 2): pocl_quirks.extend([ (cl.Program, cl.program_info.KERNEL_NAMES), (cl.Program, cl.program_info.NUM_KERNELS), ]) CRASH_QUIRKS = [ # noqa (("NVIDIA Corporation", "NVIDIA CUDA", "OpenCL 1.0 CUDA 3.0.1"), [ (cl.Event, cl.event_info.COMMAND_QUEUE), ]), (("NVIDIA Corporation", "NVIDIA CUDA", "OpenCL 1.2 CUDA 7.5"), [ (cl.Buffer, getattr(cl.mem_info, "USES_SVM_POINTER", None)), ]), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.8-pre"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.8"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.9-pre"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.9"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.10-pre"), pocl_quirks), (("The pocl project", "Portable Computing Language", "OpenCL 1.2 pocl 0.10"), pocl_quirks), (("Apple", "Apple", "OpenCL 1.2"), [ (cl.Program, cl.program_info.SOURCE), ]), ] QUIRKS = [] # noqa def find_quirk(quirk_list, cl_obj, info): for (vendor, name, version), quirks in quirk_list: if (vendor == platform.vendor and name == platform.name and platform.version.startswith(version)): for quirk_cls, quirk_info in quirks: if (isinstance(cl_obj, quirk_cls) and quirk_info == info): return True return False def do_test(cl_obj, info_cls, func=None, try_attr_form=True): if func is None: def func(info): cl_obj.get_info(info) for info_name in dir(info_cls): if not info_name.startswith("_") and info_name != "to_string": print(info_cls, info_name) info = getattr(info_cls, info_name) if find_quirk(CRASH_QUIRKS, cl_obj, info): print("not executing get_info", type(cl_obj), info_name) print("(known crash quirk for %s)" % platform.name) continue try: func(info) except: msg = "failed get_info", type(cl_obj), info_name if find_quirk(QUIRKS, cl_obj, info): msg += ("(known quirk for %s)" % platform.name) else: failure_count[0] += 1 if try_attr_form: try: getattr(cl_obj, info_name.lower()) except: print("failed attr-based get_info", type(cl_obj), info_name) if find_quirk(QUIRKS, cl_obj, info): print("(known quirk for %s)" % platform.name) else: failure_count[0] += 1 do_test(platform, cl.platform_info) do_test(device, cl.device_info) do_test(ctx, cl.context_info) props = 0 if (device.queue_properties & cl.command_queue_properties.PROFILING_ENABLE): profiling = True props = cl.command_queue_properties.PROFILING_ENABLE queue = cl.CommandQueue(ctx, properties=props) do_test(queue, cl.command_queue_info) prg = cl.Program( ctx, """ __kernel void sum(__global float *a) { a[get_global_id(0)] *= 2; } """).build() do_test(prg, cl.program_info) do_test(prg, cl.program_build_info, lambda info: prg.get_build_info(device, info), try_attr_form=False) n = 2000 a_buf = cl.Buffer(ctx, 0, n * 4) do_test(a_buf, cl.mem_info) kernel = prg.sum do_test(kernel, cl.kernel_info) evt = kernel(queue, (n, ), None, a_buf) do_test(evt, cl.event_info) if profiling: evt.wait() do_test(evt, cl.profiling_info, lambda info: evt.get_profiling_info(info), try_attr_form=False) # crashes on intel... # and pocl does not support CL_ADDRESS_CLAMP if device.image_support and platform.vendor not in [ "Intel(R) Corporation", "The pocl project", ]: smp = cl.Sampler(ctx, False, cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) do_test(smp, cl.sampler_info) img_format = cl.get_supported_image_formats( ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D)[0] img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) assert img.shape == (128, 256) img.depth img.image.depth do_test(img, cl.image_info, lambda info: img.get_image_info(info))
def gpu_filter(in_put='in.jpg'): gpu_filter_ = GpuFilter() image_objects = [0, 0] # if len(sys.argv) != 3: # print " : " + sys.argv[0] + " <inputImageFile> <outputImageFile>" # exit() # Create an OpenCL context on first available platform context, device = gpu_filter_.create_context() if context is None: print "Failed to create OpenCL context." exit() # Create a command-queue on the first device available on the context that has been created command_queue = cl.CommandQueue(context, device) # Make sure the device supports images, otherwise exit if not device.get_info(cl.device_info.IMAGE_SUPPORT): print "OpenCL device does not support images." exit() # Load input image from file and load it into an OpenCL image object image_objects[0], img_size = gpu_filter_.load_image(context, in_put) # print image_objects[0], img_size # Create output image object cl_image_format = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.UNORM_INT8) image_objects[1] = cl.Image(context, cl.mem_flags.WRITE_ONLY, cl_image_format, img_size) # Create sampler for sampling image object sampler = cl.Sampler( context, False, # Non-normalized coordinates cl.addressing_mode.CLAMP, cl.filter_mode.NEAREST) # Create OpenCL program program = gpu_filter_.create_program(context, device, "ImageFilter2D.cl") # Call the kernel directly local_work_size = (16, 16) global_work_size = (gpu_filter_.round_up(local_work_size[0], img_size[0]), gpu_filter_.round_up(local_work_size[1], img_size[1])) program.gaussian_filter(command_queue, global_work_size, local_work_size, image_objects[0], image_objects[1], sampler, numpy.int32(img_size[0]), numpy.int32(img_size[1])) # Read the output buffer back to the Host cl_buffer = numpy.zeros(img_size[0] * img_size[1] * 4, numpy.uint8) origin = (0, 0, 0) region = (img_size[0], img_size[1], 1) cl.enqueue_read_image(command_queue, image_objects[1], origin, region, cl_buffer).wait() print "Executed program successfully." # return the image matrix return gpu_filter_.image_convert(cl_buffer, img_size)