def setup_device(self, imshape):
    """Create the device-side state needed to process images of ``imshape``.

    Allocates a float32 device array of shape ``imshape`` plus a set of
    same-shaped work arrays, builds the radial program and the helper
    kernels, and creates the 2D image object and sampler used for the
    starburst step.  The shape is remembered in ``self.cached_shape``.
    """
    print('Setting up with imshape = %s' % (str(imshape)))

    self.cached_shape = imshape

    # Reference array; every other work array copies its shape and dtype.
    self.clIm = cla.Array(self.q, imshape, np.float32)
    for attr in ('clm', 'clx', 'cly'):
        setattr(self, attr, cla.empty_like(self.clIm))
    # These two are created zero-filled rather than uninitialised.
    for attr in ('clO', 'clM'):
        setattr(self, attr, cla.zeros_like(self.clIm))
    for attr in ('clF', 'clS', 'clThisS', 'clScratch'):
        setattr(self, attr, cla.empty_like(self.clIm))

    radial_program = pyopencl.Program(self.ctx, RADIAL_PROGRAM).build()
    self.radial_prg = radial_program

    self.sobel = Sobel(self.ctx, self.q)

    # A NaiveSeparableCorrelation alternative was previously commented out here.
    self.sepcorr2d = LocalMemorySeparableCorrelation(self.ctx, self.q)

    self.accum = ElementwiseKernel(
        self.ctx,
        'float *a, float *b',
        'a[i] += b[i]')
    self.norm_s = ElementwiseKernel(
        self.ctx,
        'float *s, const float nRadii',
        's[i] = -1 * s[i] / nRadii',
        'norm_s')
    self.accum_s = ElementwiseKernel(
        self.ctx,
        'float *a, float *b, const float nr',
        'a[i] -= b[i] / nr')

    self.gaussians = {}
    self.gaussian_prgs = {}

    self.minmax = MinMaxKernel(self.ctx, self.q)

    # starburst storage: single-channel float image plus its sampler
    single_float_format = cl.ImageFormat(cl.channel_order.R,
                                         cl.channel_type.FLOAT)
    self.clIm2D = cl.Image(self.ctx, mf.READ_ONLY, single_float_format,
                           imshape)

    use_normalized_coords = False  # Non-normalized coordinates
    self.imSampler = cl.Sampler(self.ctx,
                                use_normalized_coords,
                                cl.addressing_mode.CLAMP_TO_EDGE,
                                cl.filter_mode.LINEAR)

    self.cl_find_ray_boundaries = FindRayBoundaries(self.ctx, self.q)

    # Shortcuts to the two kernels of the radial program.
    self.calcF = self.radial_prg.calcF
    self.calcOM = self.radial_prg.calcOM
//color.w = 1.0f; color.xyz=avg; color.w = 1.0f; write_imagef(output,coord,color); } """).build() # 打开图片文件 src1 = Image.open('temp/images/f2.png') print(src1.size) dist = Image.new('RGBA',(640,480),(255,255,255)) # OpenCL处理的图片文件格式RGBA,unit8 imageFormat = cl.ImageFormat(cl.channel_order.RGBA,cl.channel_type.UNSIGNED_INT8) # 将图片从Host复制到Device img1 = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,imageFormat,src1.size,None,src1.tobytes()) output = cl.Image(context=ctx,flags=mf.WRITE_ONLY,format=imageFormat,shape=src1.size) # 根据图片大小定义WorkSize localWorkSize = ( 8, 8 ) globalWorkSize = ( RoundUp(localWorkSize[0], src1.size[0]), RoundUp(localWorkSize[1], src1.size[1])) # 执行Kernel prg.backto1980_filter(queue,globalWorkSize,localWorkSize,img1,output) buffer = np.zeros(src1.size[0] * src1.size[1] * 4, np.uint8) origin = ( 0, 0, 0 )
import pyopencl as cl import numpy as np import sys platforms = cl.get_platforms() platform = platforms[0] devs = platform.get_devices(cl.device_type.GPU) dev = devs[0] mf = cl.mem_flags ctx = cl.Context([dev]) queue = cl.CommandQueue(ctx, dev) a = np.arange(24).astype(np.int32).reshape(3, 4, 2) b1 = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a) fmt = cl.ImageFormat(cl.channel_order.R, cl.channel_type.SIGNED_INT32) i1 = cl.Image(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, fmt, hostbuf=a) prog = cl.Program( ctx, """ #define GL_ID (int4)(get_global_id(1), get_global_id(0), get_global_id(2), 0) __constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __kernel void Image( __read_only image3d_t img) { int4 id = GL_ID; int4 cl = read_imagei(img, sampler, id); printf("%d, %d, %d: %d\\n", id.x, id.y, id.z, cl.x); }
def main():
    """Run the ``gaussian_filter`` kernel on an image file and save the result.

    Expects exactly two command-line arguments: the input image path and the
    output image path.

    Returns:
        1 when the arguments are wrong, no OpenCL context could be created or
        the device lacks image support; ``None`` after a successful run.
    """
    imageObjects = [0, 0]

    # Main
    if len(sys.argv) != 3:
        print("USAGE: " + sys.argv[0] + " <inputImageFile> <outputImageFile>")
        return 1

    # Create an OpenCL context on first available platform
    context, device = CreateContext()
    if context is None:
        print("Failed to create OpenCL context.")
        return 1

    # Create a command-queue on the first device available
    # on the created context
    commandQueue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        return 1

    # Load input image from file and load it into
    # an OpenCL image object
    imageObjects[0], imgSize = LoadImage(context, sys.argv[1])

    # Create output image object
    clImageFormat = cl.ImageFormat(cl.channel_order.RGBA,
                                   cl.channel_type.UNORM_INT8)
    imageObjects[1] = cl.Image(context,
                               cl.mem_flags.WRITE_ONLY,
                               clImageFormat,
                               imgSize)

    # Create sampler for sampling image object
    sampler = cl.Sampler(context,
                         False,  # Non-normalized coordinates
                         cl.addressing_mode.CLAMP_TO_EDGE,
                         cl.filter_mode.NEAREST)

    # Create OpenCL program
    program = CreateProgram(context, device, "ImageFilter2D.cl")

    # Call the kernel directly; the global size is derived from the local
    # size and the image dimensions by RoundUp.
    localWorkSize = (16, 16)
    globalWorkSize = (RoundUp(localWorkSize[0], imgSize[0]),
                      RoundUp(localWorkSize[1], imgSize[1]))
    program.gaussian_filter(commandQueue,
                            globalWorkSize,
                            localWorkSize,
                            imageObjects[0],
                            imageObjects[1],
                            sampler,
                            numpy.int32(imgSize[0]),
                            numpy.int32(imgSize[1]))

    # Read the output buffer back to the Host (4 bytes per RGBA pixel)
    buffer = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)

    cl.enqueue_read_image(commandQueue, imageObjects[1],
                          origin, region, buffer).wait()

    print("Executed program succesfully.")

    # Save the image to disk
    SaveImage(sys.argv[2], buffer, imgSize)
def convert_image_yuv(self, image):
    """Run the configured kernel on a 3-plane input image and return packed pixels.

    Each input plane is uploaded as a single-channel 8-bit OpenCL image, the
    kernel writes into a 4-channel output image, and the result is read back
    into a host array wrapped in a packed ``ImageWrapper``.  Also updates the
    ``self.time`` and ``self.frames`` counters.
    """
    start = time.time()
    iplanes = image.get_planes()
    width, height = image.get_width(), image.get_height()
    strides = image.get_rowstride()
    pixels = image.get_pixels()
    assert pixels, "failed to get pixels from %s" % image
    assert iplanes==ImageWrapper._3_PLANES, "we only handle planar data as input!"
    assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    assert len(strides)==len(pixels)==3, "invalid number of planes or strides (should be 3)"
    assert width>=self.src_width and height>=self.src_height, "expected source image with dimensions of at least %sx%s but got %sx%s" % (self.src_width, self.src_height, width, height)

    # The work dimensions are the destination size divided by the largest
    # subsampling divisor (N pixels are processed at a time per dimension).
    divs = get_subsampling_divs(self.src_format)
    max_x_div = max(x_div for x_div, _ in divs)
    wwidth = dimdiv(self.dst_width, max_x_div)
    max_y_div = max(y_div for _, y_div in divs)
    wheight = dimdiv(self.dst_height, max_y_div)
    globalWorkSize, localWorkSize = self.get_work_sizes(wwidth, wheight)

    kernelargs = [self.queue, globalWorkSize, localWorkSize]

    # Upload the three input planes.
    iformat = pyopencl.ImageFormat(pyopencl.channel_order.R, pyopencl.channel_type.UNSIGNED_INT8)
    input_images = []
    for plane_index in range(3):
        _, y_div = divs[plane_index]
        # image width is the row stride, height is the subsampled source height
        shape = strides[plane_index], self.src_height//y_div
        plane = pixels[plane_index]
        if type(plane)==_memoryview:
            plane = plane.tobytes()
        # a str is copied to the device, anything else is used in place
        if type(plane)==str:
            host_flag = mem_flags.COPY_HOST_PTR
        else:
            host_flag = mem_flags.USE_HOST_PTR
        flags = mem_flags.READ_ONLY | host_flag
        input_images.append(pyopencl.Image(self.context, flags, iformat, shape=shape, hostbuf=plane))

    # Output image:
    oformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNORM_INT8)
    oimage = pyopencl.Image(self.context, mem_flags.WRITE_ONLY | mem_flags.ALLOC_HOST_PTR, oformat, shape=(self.dst_width, self.dst_height))

    kernelargs.extend(input_images)
    kernelargs.extend([
        numpy.int32(self.src_width), numpy.int32(self.src_height),
        numpy.int32(self.dst_width), numpy.int32(self.dst_height),
        self.sampler, oimage,
    ])

    kstart = time.time()
    log("convert_image(%s) calling %s%s after upload took %.1fms",
        image, self.kernel_function_name, tuple(kernelargs), 1000.0*(kstart-start))
    self.kernel_function(*kernelargs)
    kend = time.time()
    log("%s took %.1fms", self.kernel_function, 1000.0*(kend-kstart))

    # Read back: 4 bytes per destination pixel.
    out_array = numpy.empty(self.dst_width*self.dst_height*4, dtype=numpy.byte)
    log("out array=%s", out_array)
    pyopencl.enqueue_copy(self.queue, out_array, oimage, origin=(0,0), region=(self.dst_width,self.dst_height))
    self.queue.finish()
    log("readback using %s took %.1fms", CHANNEL_ORDER_TO_STR.get(self.channel_order), 1000.0*(time.time()-kend))

    # Free the device images:
    for uploaded in input_images:
        uploaded.release()
    oimage.release()

    self.time += time.time()-start
    self.frames += 1
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, out_array.data, self.dst_format, 24, self.dst_width*4, planes=ImageWrapper.PACKED)
def convert_image_rgb(self, image):
    """Run the configured kernel on a packed input image and return three planes.

    The packed pixels are uploaded as one OpenCL image, the kernel writes
    each output plane into its own device buffer, and the three buffers are
    read back into host arrays wrapped in a 3-plane ``ImageWrapper``.
    """
    start = time.time()
    iplanes = image.get_planes()
    width, height = image.get_width(), image.get_height()
    stride = image.get_rowstride()
    pixels = image.get_pixels()
    assert pixels, "failed to get pixels from %s" % image
    assert iplanes==ImageWrapper.PACKED, "we only handle packed data as input!"
    assert image.get_pixel_format()==self.src_format, "invalid source format: %s (expected %s)" % (image.get_pixel_format(), self.src_format)
    assert width>=self.src_width and height>=self.src_height, "expected source image with dimensions of at least %sx%s but got %sx%s" % (self.src_width, self.src_height, width, height)

    # The work dimensions are the destination size divided by the largest
    # subsampling divisor (N pixels are processed at a time per dimension).
    divs = get_subsampling_divs(self.dst_format)
    max_x_div = max(x_div for x_div, _ in divs)
    wwidth = dimdiv(self.dst_width, max_x_div)
    max_y_div = max(y_div for _, y_div in divs)
    wheight = dimdiv(self.dst_height, max_y_div)
    globalWorkSize, localWorkSize = self.get_work_sizes(wwidth, wheight)

    # Input image: width is the row stride divided by 4.
    iformat = pyopencl.ImageFormat(self.channel_order, pyopencl.channel_type.UNSIGNED_INT8)
    shape = (stride//4, self.src_height)
    log("convert_image() type=%s, input image format=%s, shape=%s, work size: local=%s, global=%s", type(pixels), iformat, shape, localWorkSize, globalWorkSize)
    idata = pixels
    if type(idata)==_memoryview:
        idata = idata.tobytes()
    # str is not a buffer, so the data has to be copied to the device;
    # any other object is used in place.
    if type(idata)==str:
        host_flag = mem_flags.COPY_HOST_PTR
    else:
        host_flag = mem_flags.USE_HOST_PTR
    flags = mem_flags.READ_ONLY | host_flag
    iimage = pyopencl.Image(self.context, flags, iformat, shape=shape, hostbuf=idata)

    kernelargs = [
        self.queue, globalWorkSize, localWorkSize,
        iimage,
        numpy.int32(self.src_width), numpy.int32(self.src_height),
        numpy.int32(self.dst_width), numpy.int32(self.dst_height),
        self.sampler,
    ]

    # Calculate plane strides and allocate one output buffer per plane:
    strides, out_buffers, out_sizes = [], [], []
    for plane_index in range(3):
        x_div, y_div = divs[plane_index]
        p_stride = roundup(self.dst_width // x_div, max(2, localWorkSize[0]))
        p_height = roundup(self.dst_height // y_div, 2)
        p_size = p_stride * p_height
        out_buf = pyopencl.Buffer(self.context, mem_flags.WRITE_ONLY, p_size)
        out_buffers.append(out_buf)
        kernelargs.extend([out_buf, numpy.int32(p_stride)])
        strides.append(p_stride)
        out_sizes.append(p_size)

    kstart = time.time()
    log("convert_image(%s) calling %s%s after %.1fms",
        image, self.kernel_function_name, tuple(kernelargs), 1000.0*(kstart-start))
    self.kernel_function(*kernelargs)
    kend = time.time()
    log("%s took %.1fms", self.kernel_function_name, 1000.0*(kend-kstart))

    # Read back: queue the three copies without blocking, then wait once.
    plane_data = []
    for plane_size, out_buf in zip(out_sizes, out_buffers):
        out_array = numpy.empty(plane_size, dtype=numpy.byte)
        plane_data.append(out_array.data)
        pyopencl.enqueue_copy(self.queue, out_array, out_buf, is_blocking=False)
    readstart = time.time()
    log("queue read events took %.1fms (3 planes of size %s, with strides=%s)", 1000.0*(readstart-kend), out_sizes, strides)
    self.queue.finish()
    readend = time.time()
    log("wait for read events took %.1fms", 1000.0*(readend-readstart)) 

    # Free the input image and the output buffers:
    iimage.release()
    for out_buf in out_buffers:
        out_buf.release()
    return ImageWrapper(0, 0, self.dst_width, self.dst_height, plane_data, self.dst_format, 24, strides, planes=ImageWrapper._3_PLANES)
def loadEXR(self, filename):
    """Load the R, G, B (and optional A) channels of an EXR file as OpenCL images.

    Each channel is read as half floats and uploaded into its own
    single-channel (INTENSITY / HALF_FLOAT) read-only image, stored as
    ``self.devInBufferR``, ``devInBufferG``, ``devInBufferB`` and
    ``devInBufferA``.  When the file has no ``A`` channel the alpha image is
    filled with ones.

    Also sets ``self.source_width`` / ``self.source_height`` from the data
    window, and ``self.width`` / ``self.height`` from the ``width`` /
    ``height`` parms, falling back to the source size when a parm is 0.

    Args:
        filename: path of the EXR file to read.
    """
    import OpenEXR
    import Imath

    pt = Imath.PixelType(Imath.PixelType.HALF)
    image = OpenEXR.InputFile(filename)
    header = image.header()
    dw = header['dataWindow']
    channels = header['channels']
    # The data window bounds are inclusive, hence the +1.
    size = (dw.max.x - dw.min.x + 1, dw.max.y - dw.min.y + 1)
    self.source_width = size[0]
    self.source_height = size[1]

    # Evaluate each parm once; 0 means "use the source size".
    requested_width = self.parm("width").eval()
    self.width = requested_width if requested_width != 0 else self.source_width
    requested_height = self.parm("height").eval()
    self.height = requested_height if requested_height != 0 else self.source_height

    flags = self.engine.mf.READ_ONLY | self.engine.mf.COPY_HOST_PTR
    half_format = cl.ImageFormat(cl.channel_order.INTENSITY,
                                 cl.channel_type.HALF_FLOAT)

    def read_channel(name):
        # frombuffer replaces the deprecated binary mode of numpy.fromstring;
        # the data is only read (COPY_HOST_PTR), so a read-only view suffices.
        raw = image.channel(name, pt)
        # Numpy arrays are (row, col)
        return numpy.frombuffer(raw, dtype=numpy.float16).reshape(size[1], size[0])

    def upload(host_buffer):
        # Row pitch is 2 bytes (one half float) per pixel.
        return cl.Image(self.engine.ctx, flags, half_format,
                        shape=(self.source_width, self.source_height,),
                        pitches=(self.source_width * 2,),
                        hostbuf=host_buffer)

    self.devInBufferR = upload(read_channel('R'))
    self.devInBufferG = upload(read_channel('G'))
    self.devInBufferB = upload(read_channel('B'))

    if channels.get('A') is not None:
        self.devInBufferA = upload(read_channel('A'))
    else:
        # No alpha channel in the file: use an alpha of 1.0 everywhere.
        self.devInBufferA = upload(
            numpy.ones(self.source_width * self.source_height,
                       dtype=numpy.float16))
def applyMorphOp(imgIn, op):
    """Apply a morphological operation to an image using the GPU.

    Args:
        imgIn: 2D array holding the input image; it is uploaded as a
            LUMINANCE / UNORM_INT8 image, so 8-bit gray values are presumably
            expected (confirm against callers).
        op: operation type, passed to the kernel as an unsigned integer
            (dilate=0, erode=1).

    Returns:
        A new array shaped like ``imgIn`` holding the kernel's output.
    """
    # (1) setup OpenCL
    platforms = cl.get_platforms()  # a platform corresponds to a driver (e.g. AMD)
    # NOTE(review): index 1 selects the SECOND platform, although the original
    # comment said "first"; this raises IndexError on single-platform machines.
    platform = platforms[1]
    devices = platform.get_devices(cl.device_type.GPU)  # GPU devices of selected platform
    device = devices[0]  # take first GPU
    context = cl.Context([device])  # put selected GPU into context object
    queue = cl.CommandQueue(context, device)  # command queue for selected GPU and context

    # (2) get shape of input image, allocate memory for output to which result can be copied to
    shape = imgIn.T.shape  # transposed so the shape reads (width, height)
    imgOut = np.empty_like(imgIn)

    # (2) create image buffers which hold images for OpenCL
    gray_format = cl.ImageFormat(cl.channel_order.LUMINANCE,
                                 cl.channel_type.UNORM_INT8)
    imgInBuf = cl.Image(context, cl.mem_flags.READ_ONLY, gray_format,
                        shape=shape)  # holds a gray-valued image of given shape
    imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, gray_format,
                         shape=shape)  # placeholder for gray-valued image of given shape

    # (3) load and compile OpenCL program; the file is closed as soon as it is read
    with open('Erosion_Dilation.cl') as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build()

    # (3) from OpenCL program, get kernel object and set arguments (input image, operation type, output image)
    kernel = cl.Kernel(program, 'morphOpKernel')  # kernel function name in the .cl file
    kernel.set_arg(0, imgInBuf)  # input image buffer
    kernel.set_arg(1, np.uint32(op))  # operation type passed as an integer value (dilate=0, erode=1)
    kernel.set_arg(2, imgOutBuf)  # output image buffer

    # (4) copy image to device, execute kernel, copy data back
    cl.enqueue_copy(queue, imgInBuf, imgIn, origin=(0, 0), region=shape,
                    is_blocking=False)  # copy image from CPU to GPU
    # one work-item per pixel of the image (shape[0]*shape[1] work-items)
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
    # blocking copy: wait until the resulting image is back on the CPU
    cl.enqueue_copy(queue, imgOut, imgOutBuf, origin=(0, 0), region=shape,
                    is_blocking=True)
    return imgOut
def loadImage(self, imgpath):
    """Read an image file and copy its pixels into an OpenCL input buffer.

    Stores the image array and its metadata on ``self`` (``img``,
    ``img_dtype``, ``img_ndim``, ``img_shape`` as (width, height),
    ``img_origin``, ``img_region``), builds ``self.imgFormat`` and allocates
    ``self.img_in_cl`` / ``self.img_out_cl`` device buffers of the image's
    byte size.  3-channel images are expanded to 4 channels with alpha 255.

    Args:
        imgpath: path of the image file, passed to ``io.imread``.
    """
    print("Opening image to be processed")

    self.mf = cl.mem_flags

    # GETTING NDARRAY IMAGE AND DATA ABOUT THE IMAGE
    self.img = io.imread(imgpath)
    self.img_dtype = self.img.dtype
    self.img_ndim = self.img.ndim
    self.img_shape = (self.img.shape[1], self.img.shape[0])
    self.img_origin = (0, 0, 0)
    self.img_region = (self.img_shape[0], self.img_shape[1], 1)

    # NOTE(review): only uint8 sets the OpenCL channel type; any other dtype
    # leaves self.img_dtype_cl untouched.
    if self.img.dtype == np.uint8:
        self.img_dtype_cl = cl.channel_type.UNORM_INT8

    # GETTING THE DIMENSIONS OF THE IMAGE
    if self.img_ndim == 1:
        # WHAT TO DO IF THE IMAGE IS 1-DIMENSIONAL
        self.img_channel_order_cl = cl.channel_order.LUMINANCE
    elif self.img_ndim == 2:
        # IF THE IMAGE IS 2-DIMENSIONAL, THEN IT IS A SHADES OF GRAY IMAGE
        # AND THE IMAGE TYPE IS LUMINANCE
        self.img_channel_order_cl = cl.channel_order.LUMINANCE
        self.img_nchannels = 1
    elif self.img_ndim == 3:
        # IF THE IMAGE ARRAY IS 3-DIMENSIONAL, THEN IT HAS MORE THAN 1 COLOR CHANNEL
        channel_count = self.img.shape[2]
        if channel_count == 2:
            # THEN IT CAN BE ANY 2-CHANNEL IMAGE.
            # DON'T HAVE ACCESS TO ANY IMAGE LIKE THAT YET.
            self.img_nchannels = 2
        elif channel_count == 3:
            # THEN IT CAN BE ANY 3-CHANNEL IMAGE; A 4TH CHANNEL HAS TO BE ADDED.
            # There is no way here to tell RGB from BGR, HSV, etc.; this was
            # only tested with RGB and should work with its 3-channel
            # variants.  An alpha channel is appended so the data is in an
            # RGBA-like 4-channel layout.
            self.img_nchannels = 3

            # TURNING INTO RGBA IMAGE
            self.img_aux = np.empty(
                (self.img.shape[0], self.img.shape[1], 4), self.img_dtype)
            self.img_aux[:, :, 0] = self.img[:, :, 0]
            self.img_aux[:, :, 1] = self.img[:, :, 1]
            self.img_aux[:, :, 2] = self.img[:, :, 2]
            self.img_aux[:, :, 3] = 255
            self.img = self.img_aux

            self.img_channel_order_cl = cl.channel_order.RGBA
        elif channel_count == 4:
            # Fix: a 4-channel image is a 3-dimensional array with 4 entries
            # on the last axis (not a 4-dimensional array), so it is handled
            # here.  Nothing has to be done with the pixel data.
            self.img_nchannels = 4
            self.img_channel_order_cl = cl.channel_order.RGBA
    elif self.img_ndim == 4:
        # Kept for backward compatibility with the previous behaviour.
        self.img_channel_order_cl = cl.channel_order.RGBA

    # SETTING THE OPENCL IMAGE OBJECTS, WITHOUT THE COPY
    self.imgFormat = cl.ImageFormat(self.img_channel_order_cl, self.img_dtype_cl)

    self.img_in_cl = cl.Buffer(self.ctx, self.mf.READ_ONLY, self.img.nbytes)
    self.img_out_cl = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, self.img.nbytes)

    # COPYING NDARRAY IMAGE TO OPENCL IMAGE OBJECT
    cl.enqueue_copy(self.queue, self.img_in_cl, self.img.tobytes(),
                    is_blocking=True)
def loadJPG(self, filename):
    """Load the first three channels of an image file as OpenCL images.

    Each of the channels 0, 1 and 2 is uploaded into its own single-channel
    (INTENSITY / UNORM_INT8) read-only image, stored as
    ``self.devInBufferR``, ``devInBufferG`` and ``devInBufferB``.

    Also sets ``self.source_width`` / ``self.source_height`` from the image
    shape, and ``self.width`` / ``self.height`` from the ``width`` /
    ``height`` parms, falling back to the source size when a parm is 0.

    Args:
        filename: path of the image file, read with
            ``matplotlib.image.imread``.
    """
    img = matplotlib.image.imread(filename)

    self.source_width = img.shape[1]
    self.source_height = img.shape[0]

    # Evaluate each parm once; 0 means "use the source size".
    requested_width = self.parm("width").eval()
    self.width = requested_width if requested_width != 0 else self.source_width
    requested_height = self.parm("height").eval()
    self.height = requested_height if requested_height != 0 else self.source_height

    flags = self.engine.mf.READ_ONLY | self.engine.mf.COPY_HOST_PTR
    byte_format = cl.ImageFormat(cl.channel_order.INTENSITY,
                                 cl.channel_type.UNORM_INT8)

    def upload(channel_index):
        # UNORM_INT8 is an unsigned 8-bit format, so the host copy is made as
        # uint8 (the previous int8 cast produced the same bytes for 8-bit
        # input but mislabelled values above 127 as negative).
        plane = numpy.array(img[:, :, channel_index],
                            dtype=numpy.uint8, order='C')
        # Row pitch is one byte per pixel.
        return cl.Image(self.engine.ctx, flags, byte_format,
                        shape=(self.source_width, self.source_height,),
                        pitches=(self.source_width,),
                        hostbuf=plane)

    self.devInBufferR = upload(0)
    self.devInBufferG = upload(1)
    self.devInBufferB = upload(2)
def main():
    """Convert '../datas/f2.jpg' to grayscale on an OpenCL device.

    The image is uploaded as an RGBA / UNSIGNED_INT8 image, processed by the
    inline ``rgbaToGrayscale`` kernel, read back and saved to
    '../temp/cl-out.png'.  The elapsed time from kernel launch to the end of
    the read-back is printed.
    """
    # create context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # load image, forcing 4 channels
    im = Image.open('../datas/f2.jpg')
    if im.mode != "RGBA":
        im = im.convert("RGBA")
    imgSize = im.size
    host_pixels = im.tobytes()

    # Create input and output image objects
    rgba_uint8 = cl.ImageFormat(cl.channel_order.RGBA,
                                cl.channel_type.UNSIGNED_INT8)
    input_flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    src_image = cl.Image(ctx, input_flags, rgba_uint8, imgSize, None,
                         host_pixels)
    dst_image = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, rgba_uint8, imgSize)

    # the kernel source code
    kernelSrc = """
    const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;

    __kernel void rgbaToGrayscale(__read_only image2d_t srcImg, __write_only image2d_t dstImg)
    {
        // Converts RGBA image to gray scale intensity using the following formula:
        // I = 0.2126 * R + 0.7152 * G + 0.0722 * B

        int2 coord = (int2) (get_global_id(0), get_global_id(1));
        int width = get_image_width(srcImg);
        int height = get_image_height(srcImg);

        if (coord.x < width && coord.y < height)
        {
            uint4 color = read_imageui(srcImg, sampler, coord);
            float luminance = 0.2126f * color.x + 0.7152f * color.y + 0.0722f * color.z;
            color.x = color.y = color.z = (uint)luminance;

            // Write the output value to image
            write_imageui(dstImg, coord, color);
        }
    }"""

    # Create OpenCL program
    program = cl.Program(ctx, kernelSrc).build()

    # Call the kernel directly; global size comes from RoundUp per dimension
    localWorkSize = (16, 16)
    global_x = RoundUp(localWorkSize[0], imgSize[0])
    global_y = RoundUp(localWorkSize[1], imgSize[1])
    globalWorkSize = (global_x, global_y)

    gr = time.time()
    program.rgbaToGrayscale(queue, globalWorkSize, localWorkSize,
                            src_image, dst_image)

    # Read the output buffer back to the Host (4 bytes per pixel)
    result = numpy.zeros(imgSize[0] * imgSize[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (imgSize[0], imgSize[1], 1)
    read_event = cl.enqueue_read_image(queue, dst_image, origin, region,
                                       result)
    read_event.wait()
    print(time.time() - gr)

    print("Executed program succesfully.")

    # Save the image to disk
    gsim = Image.frombytes("RGBA", imgSize, result.tobytes())
    gsim.save('../temp/cl-out.png')
def __init__(self, images, labels):
    """Flatten the data set, derive the 3D image layout and build the CL program.

    ``images`` is flattened to float32 and ``labels`` to int32; one ``Entry``
    is created per label from consecutive ``IMG_SIZE``-long slices of the
    flattened images.  The device's maximum 3D image size then determines the
    layout constants that are baked into the kernel source as macros.
    """
    self.images = np.asarray(images, dtype=np.float32).flatten()
    self.labels = np.asarray(labels, dtype=np.int32).flatten()
    # Entry i pairs the i-th IMG_SIZE-long slice of the pixels with label i.
    self.entries = [Entry(self.images[IMG_SIZE * i:IMG_SIZE * (i+1)], \
            self.labels[i]) for i in range(len(self.labels))]

    """ Populate CL information """
    self.cl_dev = clu.Q.device
    # Device maximum 3D image height/width, rounded down to a whole number
    # of image rows/columns.
    self.cl_height = int(self.cl_dev.get_info(cl.device_info \
            .IMAGE3D_MAX_HEIGHT) / IMG_ROWS) * IMG_ROWS
    self.cl_width = int(self.cl_dev.get_info(cl.device_info \
            .IMAGE3D_MAX_WIDTH) / IMG_COLS) * IMG_COLS
    # Total number of float values over all images.
    self.cl_size = len(self.images)
    # Number of width*height slices needed to hold cl_size values.
    self.cl_depth = math.ceil(self.cl_size / (self.cl_height * \
            self.cl_width))
    # NOTE(review): the region is ordered (height, width, depth); the kernels
    # swap global ids 0 and 1 (GL_ID_102) - confirm the two match.
    self.cl_region = (self.cl_height, self.cl_width, self.cl_depth)
    # How many images fit side by side in one row / in one depth slice.
    self.cl_per_row = int(self.cl_width / IMG_COLS)
    self.cl_per_depth = int((self.cl_width * self.cl_height) \
            / IMG_SIZE)
    self.cl_format = cl.ImageFormat(cl.channel_order.R, \
            cl.channel_type.FLOAT)
    self.cl_length = np.int32(len(self.entries))

    """ Build CL program """
    # Macros (the only part of the source that goes through str.format)
    kernel_cl = """
    #define PER_ROW {per_row:d}
    #define PER_DEPTH {per_depth:d}
    #define SIZE {size:d}
    #define ROWS {rows:d}
    #define COLS {cols:d}
    #define HR {hr:f}
    #define HC {hc:f}
    """.format(per_row=self.cl_per_row, per_depth=self.cl_per_depth, \
            size=IMG_SIZE, rows=IMG_ROWS, cols=IMG_COLS,\
            hr=(IMG_ROWS - 1) / 2.0, hc=(IMG_COLS - 1) / 2.0)

    # Rest header (not formatted: it contains literal braces).  This is a
    # regular, non-raw string, so every backslash-newline below is removed by
    # Python and each multi-line macro reaches the CL compiler as one line.
    kernel_cl += """
    #define CY(ry) -(ry) + HR
    #define CX(rx) (rx) - HC
    #define RY(cy) HR - (cy)
    #define RX(cx) (cx) + HC

    // ABS is for absolute coords in the image
    // 1 is for destination 0 is for source
    #define GL_ID_102 (int4)( \
        get_global_id(1), \
        get_global_id(0), \
        get_global_id(2), \
        0)
    #define IDX(abs) abs.z * PER_DEPTH \
        + (int)(abs.y / ROWS) * PER_ROW \
        + (int)(abs.x / COLS)
    #define ARY_IDX(idx, abs) idx * SIZE \
        + (abs.y % ROWS) * COLS \
        + abs.x % COLS
    #define CARTESIAN(abs) (float4)( \
        (abs.x % COLS) - HC, \
        -(abs.y % ROWS) + HR, \
        abs.z, \
        0.0f)
    #define RASTER(cart) (float4)( \
        cart.x + HC, \
        HR - cart.y, \
        cart.z, \
        0.0f)
    #define NO_Z_INTERPOLATION(abs) (float4)( \
        abs.x, \
        abs.y, \
        abs.z + 0.5f, /* +0.5f makes z not interpolated */ \
        0.0f)
    #define SIGN_INT(i) ((i > 0) - (i < 0))

    __constant sampler_t sampler =
        CLK_NORMALIZED_COORDS_FALSE
        | CLK_ADDRESS_NONE
        | CLK_FILTER_NEAREST;

    __constant sampler_t linear_sampler =
        CLK_NORMALIZED_COORDS_FALSE
        | CLK_ADDRESS_NONE
        | CLK_FILTER_LINEAR;

    __kernel void ArrayToImage(
        __read_only image3d_t ary,
        __write_only image3d_t img,
        const int length
    ) {
        int4 ary_pos = GL_ID_102;
        int ary_idx = ary_pos.z * PER_DEPTH * SIZE + ary_pos.y * COLS \
            * PER_ROW + ary_pos.x;
        int idx = ary_idx / SIZE;

        // return if padded
        if(idx >= length) {
            return;
        }

        // Array index in current depth
        int ary_idx_in_depth = ary_idx % (PER_DEPTH * SIZE);
        // Array index in each image
        int ary_idx_in_each = ary_idx_in_depth % SIZE;
        int idx_in_depth = idx % PER_DEPTH;

        int base_y = (idx_in_depth / PER_ROW) * ROWS;
        int base_x = (idx_in_depth % PER_ROW) * COLS;

        int img_y = base_y + ary_idx_in_each / COLS;
        int img_x = base_x + ary_idx_in_each % COLS;

        int4 img_pos = (int4)(img_x, img_y, ary_pos.z, 0);

        float4 cl = read_imagef(ary, sampler, ary_pos);
        write_imagef(img, img_pos, cl);
    }

    __kernel void ImageToArray(
        __read_only image3d_t img,
        __write_only image3d_t ary,
        const int length
    ) {
        int4 img_pos = GL_ID_102;
        int idx = img_pos.z * PER_DEPTH + (int)(img_pos.y / ROWS) * PER_ROW \
            + (int)(img_pos.x / COLS);
        if(idx >= length)
            return;

        int idx_in_depth = idx % PER_DEPTH;
        int ary_idx = img_pos.z * PER_DEPTH * SIZE + idx_in_depth * SIZE + \
            (img_pos.y % ROWS) * COLS + (img_pos.x % COLS);
        int ary_idx_in_depth = ary_idx % (PER_DEPTH * SIZE);

        int4 ary_pos = (int4)(
            ary_idx_in_depth % (PER_ROW * COLS),
            ary_idx_in_depth / (PER_ROW * COLS),
            img_pos.z,
            0
        );

        float4 cl = read_imagef(img, sampler, img_pos);
        write_imagef(ary, ary_pos, cl);
    }
    """

    # Kernels: append the per-operation sources held on the instance/class,
    # then compile everything as one program.
    kernel_cl += "\n".join([ \
            self.invert_cl, self.rotate_cl, self.noise_cl, self.scale_cl, \
            self.corner_cl])

    self.program = cl.Program(clu.CTX, kernel_cl).build()
def gpu_filter(in_put='in.jpg'):
    """Run the ``gaussian_filter`` kernel on an image file and return the result.

    Args:
        in_put: path of the input image, handed to ``GpuFilter.load_image``.

    Returns:
        Whatever ``GpuFilter.image_convert`` builds from the read-back RGBA
        bytes and the image size.

    Exits the interpreter (via ``exit()``) when no OpenCL context can be
    created or the device does not support images.
    """
    gpu_filter_ = GpuFilter()
    image_objects = [0, 0]

    # Create an OpenCL context on first available platform
    context, device = gpu_filter_.create_context()
    if context is None:
        print("Failed to create OpenCL context.")
        exit()

    # Create a command-queue on the first device available on the context that has been created
    command_queue = cl.CommandQueue(context, device)

    # Make sure the device supports images, otherwise exit
    if not device.get_info(cl.device_info.IMAGE_SUPPORT):
        print("OpenCL device does not support images.")
        exit()

    # Load input image from file and load it into an OpenCL image object
    image_objects[0], img_size = gpu_filter_.load_image(context, in_put)

    # Create output image object
    cl_image_format = cl.ImageFormat(cl.channel_order.RGBA,
                                     cl.channel_type.UNORM_INT8)
    image_objects[1] = cl.Image(context, cl.mem_flags.WRITE_ONLY,
                                cl_image_format, img_size)

    # Create sampler for sampling image object
    sampler = cl.Sampler(
        context,
        False,  # Non-normalized coordinates
        cl.addressing_mode.CLAMP,
        cl.filter_mode.NEAREST)

    # Create OpenCL program
    program = gpu_filter_.create_program(context, device, "ImageFilter2D.cl")

    # Call the kernel directly; the global size is derived from the local
    # size and the image dimensions by round_up.
    local_work_size = (16, 16)
    global_work_size = (gpu_filter_.round_up(local_work_size[0], img_size[0]),
                        gpu_filter_.round_up(local_work_size[1], img_size[1]))
    program.gaussian_filter(command_queue,
                            global_work_size,
                            local_work_size,
                            image_objects[0],
                            image_objects[1],
                            sampler,
                            numpy.int32(img_size[0]),
                            numpy.int32(img_size[1]))

    # Read the output buffer back to the Host (4 bytes per RGBA pixel)
    cl_buffer = numpy.zeros(img_size[0] * img_size[1] * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (img_size[0], img_size[1], 1)

    cl.enqueue_read_image(command_queue, image_objects[1],
                          origin, region, cl_buffer).wait()

    print("Executed program successfully.")

    # return the image matrix
    return gpu_filter_.image_convert(cl_buffer, img_size)
def parallelSumRed(imgRGBA, width, height):
    """Run the ``getLaserCoord`` kernel on an RGBA frame and locate the laser.

    Uses the module-level ``ctx``, ``queue``, ``mf`` and ``kernel`` source.

    Args:
        imgRGBA: frame whose ``tobytes()`` yields ``width * height * 4``
            bytes of RGBA data (presumably a uint8 array - confirm against
            callers).
        width: frame width in pixels.
        height: frame height in pixels.

    Returns:
        ``(buff, offsetX, offsetY)``: the flat uint8 RGBA output image read
        back from the device, and the integer offset of the averaged
        coordinate from the frame centre.  Both offsets are 0 unless the
        kernel counted more than 5 pixels.
    """
    image_shape = (width, height)
    pixel_bytes = imgRGBA.tobytes()  # tostring() is the deprecated spelling
    rgba_format = cl.ImageFormat(cl.channel_order.RGBA,
                                 cl.channel_type.UNORM_INT8)

    # Create buffers
    # host -> device
    width_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=numpy.int32(width))
    height_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=numpy.int32(height))
    # int32 accumulators, initialised to zero
    dest_sum_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=numpy.int32(0))
    dest_sumY_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=numpy.int32(0))
    dest_N_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR,
                           hostbuf=numpy.int32(0))

    # Images are sized from the arguments (previously hard-coded 640x480,
    # which only matched the read-back region for that one frame size).
    clImage = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, rgba_format,
                       image_shape, None, pixel_bytes)
    clOutImage = cl.Image(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, rgba_format,
                          image_shape, None, pixel_bytes)

    sampler = cl.Sampler(
        ctx,
        False,  # Non-normalized coordinates
        cl.addressing_mode.CLAMP_TO_EDGE,
        cl.filter_mode.NEAREST)

    # compile openCL code
    prg = cl.Program(ctx, kernel).build()

    # run kernel, one work-item per pixel.  The third positional argument of
    # a PyOpenCL kernel call is the local size; None lets the implementation
    # choose it (it was missing, so the input image took its place).
    prg.getLaserCoord(queue, image_shape, None,
                      clImage, clOutImage, sampler,
                      width_buf, height_buf,
                      dest_sum_buf, dest_N_buf, dest_sumY_buf)

    # set up output buffers; they must be int32 to match the 4-byte device
    # buffers (the platform default int can be 8 bytes wide)
    sumX = numpy.zeros((), dtype=numpy.int32)
    sumY = numpy.zeros((), dtype=numpy.int32)
    N = numpy.zeros((), dtype=numpy.int32)
    # flat RGBA bytes; every fourth value is alpha
    buff = numpy.zeros(width * height * 4, numpy.uint8)
    origin = (0, 0, 0)
    region = (width, height, 1)

    # device -> host
    cl.enqueue_copy(queue, sumX, dest_sum_buf)
    cl.enqueue_copy(queue, N, dest_N_buf)
    cl.enqueue_copy(queue, sumY, dest_sumY_buf)
    cl.enqueue_read_image(queue, clOutImage, origin, region, buff).wait()

    offsetX = 0
    offsetY = 0
    if N != 0:
        print("LASER (x,y) = (" + str(sumX / N) + ", " + str(sumY / N) + ")")
        # Only trust the average when enough pixels contributed.
        if N > 5:
            offsetX = sumX / N - width / 2.
            offsetY = sumY / N - height / 2.

    return (buff, int(offsetX), int(offsetY))
def get_costvolume(self, left_img, right_img, dispRange=64, thread_num=(16, 16, 4), host_mem=False):
    """Compute the matching cost volume of a stereo pair on the device.

    Args:
        left_img: left image, either 2D (gray) or 3D with at least 3
            channels; 3-channel input is converted with ``COLOR_BGR2RGBA``.
        right_img: right image with the same shape as ``left_img``.
        dispRange: number of disparities, used as the third work dimension.
        thread_num: local work size ``(x, y, d)``, or ``None`` to let the
            implementation choose.  When given, the image width, height and
            ``dispRange`` must be multiples of the respective entries.
        host_mem: when true, also copy the result into a host array.

    Returns:
        ``cost_buffer`` (a device buffer of ``w * h * dispRange`` float32
        values), or ``(cost_buffer, costvolume)`` with ``costvolume`` a
        ``(h, w, dispRange)`` float32 array when ``host_mem`` is true.
    """
    assert left_img.shape == right_img.shape, "Shape of both imgs are different"

    # image object configuration
    mf = cl.mem_flags
    if len(left_img.shape) == 3:
        assert left_img.shape[2] >= 3, "Unavailable type of imgs"
        # RGB/BGR
        if left_img.shape[2] == 3:
            left_img = cv2.cvtColor(left_img, cv2.COLOR_BGR2RGBA)
            right_img = cv2.cvtColor(right_img, cv2.COLOR_BGR2RGBA)
        fmt = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
    elif len(left_img.shape) == 2:
        # GRAY
        fmt = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
    else:
        print("Unavailable type of imgs")
        exit()

    # The image format is FLOAT, so the host data must be contiguous float32;
    # this is a no-op for inputs that already are.
    left_img = np.ascontiguousarray(left_img, dtype=np.float32)
    right_img = np.ascontiguousarray(right_img, dtype=np.float32)

    # arguments
    h = left_img.shape[0]
    w = left_img.shape[1]
    var_width = np.int32(w)
    var_dispRange = np.int32(dispRange)
    if thread_num is not None:
        assert w % thread_num[0] == 0 and h % thread_num[1] == 0 and dispRange % thread_num[2] == 0

    # numpy to cl::Image2D memory parsing
    ctx = self.ctx
    left_image2D = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, fmt,
                            shape=(w, h), hostbuf=left_img)
    right_image2D = cl.Image(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, fmt,
                             shape=(w, h), hostbuf=right_img)
    # (w x h x dispRange) x float
    cost_buffer = cl.Buffer(ctx, mf.READ_WRITE, size=w * h * dispRange * 4)

    queue = self.queue

    # get cost
    self.get_cost(
        queue,
        (w, h, dispRange),  # global size
        thread_num,  # local size
        left_image2D,
        right_image2D,  # input
        cost_buffer,  # output
        var_width,
        var_dispRange).wait()

    if not host_mem:
        return cost_buffer

    # The host copy is only allocated when it was asked for.
    costvolume = np.zeros((h, w, dispRange), dtype=np.float32)
    cl.enqueue_copy(queue, costvolume, cost_buffer, is_blocking=True)
    return cost_buffer, costvolume