def loadKernel(self):
    """Build the 3D morphological-dilation OpenCL program and create its kernels.

    Returns:
        bool: True when the program built and both kernels were created,
        False when the build failed.
    """
    try:
        filename = "../OpenCL/MMdil3D.cl"
        # resource_string returns bytes; decode so cl.Program receives source
        # text (consistent with the Mask3D loader in this file).
        self.program = cl.Program(
            self.clattr.context,
            pkg.resource_string(__name__, filename).decode()).build()
    except Exception:
        return False
    # Two-pass filter: an init kernel followed by the main dilation kernel.
    self.kernel = cl.Kernel(self.program, "MMdil3DFilterInit")
    self.kernel2 = cl.Kernel(self.program, "MMdil3DFilter")
    return True
def _setup_kernel(self, program, kernel_name, *argv):
    """Create a kernel object from *program* and bind *argv* as its positional arguments."""
    built_kernel = cl.Kernel(program, kernel_name)
    position = 0
    for argument in argv:
        built_kernel.set_arg(position, argument)
        position += 1
    return built_kernel
def applyMorphOp(imgIn, op):
    """Apply a morphological operation (dilate/erode) to a gray-value image on the GPU.

    Args:
        imgIn: 2D uint8 numpy array holding the gray-value input image.
        op: operation selector passed to the kernel (dilate=0, erode=1).

    Returns:
        2D uint8 numpy array of the same shape as imgIn holding the result.
    """
    # (1) setup OpenCL
    platforms = cl.get_platforms()  # a platform corresponds to a driver (e.g. AMD)
    platform = platforms[0]  # take first platform
    devices = platform.get_devices(cl.device_type.GPU)  # get GPU devices of selected platform
    device = devices[0]  # take first GPU
    context = cl.Context([device])  # put selected GPU into context object
    queue = cl.CommandQueue(context, device)  # create command queue for selected GPU and context
    # (2) get shape of input image, allocate memory for output to which result can be copied to
    shape = imgIn.T.shape  # OpenCL images use (width, height), i.e. the transposed numpy shape
    imgOut = np.empty_like(imgIn)
    # (2) create image buffers which hold images for OpenCL
    imgInBuf = cl.Image(context, cl.mem_flags.READ_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape)  # holds a gray-valued image of given shape
    imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape)  # placeholder for gray-valued image of given shape
    # (3) load and compile OpenCL program
    program = cl.Program(context, open('Erosion_Dilation.cl').read()).build()
    # (3) from OpenCL program, get kernel object and set arguments (input image, operation type, output image)
    kernel = cl.Kernel(program, 'morphOpKernel')  # name of the kernel function in the .cl source
    kernel.set_arg(0, imgInBuf)  # input image buffer
    kernel.set_arg(1, np.uint32(op))  # operation type passed as an integer value (dilate=0, erode=1)
    kernel.set_arg(2, imgOutBuf)  # output image buffer
    # (4) copy image to device, execute kernel, copy data back
    cl.enqueue_copy(queue, imgInBuf, imgIn, origin=(0, 0), region=shape, is_blocking=False)  # copy image from CPU to GPU
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)  # execute kernel: one work-item per pixel (shape[0]*shape[1] work-items)
    cl.enqueue_copy(queue, imgOut, imgOutBuf, origin=(0, 0),
                    region=shape, is_blocking=True)  # blocking copy: waits until the result is back from the GPU
    return imgOut
def calculate_estimated_kernel_usage(prog, ctx, kernel_name):
    """Query and print estimated resource usage for *kernel_name* from *prog*.

    Args:
        prog: built pyopencl Program containing the kernel.
        ctx: pyopencl Context; must hold exactly one device.
        kernel_name: name of the kernel function to inspect.

    Returns:
        Tuple (compile_work_group_size, preferred_multiple, local_mem, private_mem),
        or (None, None, None, None) when any query fails.
    """
    try:
        import pyopencl as cl
        from pyopencl import context_info as ci
        from pyopencl import kernel_work_group_info as kwgi
        devices = ctx.get_info(ci.DEVICES)
        assert len(devices) == 1, 'Should only one device is used !'
        device = devices[0]
        kernel = cl.Kernel(prog, kernel_name)
        lm = kernel.get_work_group_info(kwgi.LOCAL_MEM_SIZE, device)
        pm = kernel.get_work_group_info(kwgi.PRIVATE_MEM_SIZE, device)
        cwgs = kernel.get_work_group_info(kwgi.COMPILE_WORK_GROUP_SIZE, device)
        pwgsm = kernel.get_work_group_info(kwgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device)
        print('For kernel "{}" running on device {}:'.format(kernel.function_name, device.name))
        print('\t Max work-group size: {}'.format(cwgs))
        print('\t Recommended work-group multiple: {}'.format(pwgsm))
        print('\t Local mem used: {} of {}'.format(lm, device.local_mem_size))
        print('\t Private mem used: {}'.format(pm))
        return cwgs, pwgsm, lm, pm
    except Exception:  # was a bare except: do not swallow SystemExit/KeyboardInterrupt
        import traceback
        traceback.print_exc()
        return None, None, None, None
def test_that_python_args_fail(ctx_factory):
    """Verify kernels reject bare Python scalars and require sized numpy types."""
    context = ctx_factory()
    prg = cl.Program(
        context, """
        __kernel void mult(__global float *a, float b, int c)
        { a[get_global_id(0)] *= (b+c); }
        """).build()
    a = np.random.rand(50000)
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags
    a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)
    knl = cl.Kernel(prg, "mult")
    # Bare Python ints have no defined device-side size -> must raise.
    try:
        knl(queue, a.shape, None, a_buf, 2, 3)
        assert False, "PyOpenCL should not accept bare Python types as arguments"
    except cl.LogicError:
        pass
    # Same for a bare Python float via the program's kernel attribute.
    try:
        prg.mult(queue, a.shape, None, a_buf, float(2), 3)
        assert False, "PyOpenCL should not accept bare Python types as arguments"
    except cl.LogicError:
        pass
    # Properly sized numpy scalars are accepted.
    prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))
    a_result = np.empty_like(a)
    # NOTE(review): enqueue_read_buffer is deprecated in newer pyopencl in
    # favour of enqueue_copy — confirm the pinned pyopencl version.
    cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
def Difference(img1, img2, threshold):
    """Compute a thresholded per-pixel difference of two gray images on the GPU.

    Args:
        img1, img2: array-like images; converted to uint8 arrays of equal shape.
        threshold: difference threshold forwarded to the kernel as float32.

    Returns:
        uint8 numpy array of the same shape as img1 with the kernel's output.
    """
    img1 = np.array(img1).astype('uint8')
    img2 = np.array(img2).astype('uint8')
    # Pick the first GPU of the first platform.
    platforms = cl.get_platforms()
    platform = platforms[0]
    devices = platform.get_devices(cl.device_type.GPU)
    device = devices[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(context, device)
    shape = img1.T.shape  # OpenCL images use (width, height)
    result = np.empty_like(img1)
    imgInBuf1 = cl.Image(context, cl.mem_flags.READ_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape)
    imgInBuf2 = cl.Image(context, cl.mem_flags.READ_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape)
    imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape)
    # fix: close the kernel source file deterministically
    with open('Difference.cl') as source_file:
        program = cl.Program(context, source_file.read()).build()
    kernel = cl.Kernel(program, 'Difference')
    kernel.set_arg(0, imgInBuf1)
    kernel.set_arg(1, imgInBuf2)
    kernel.set_arg(2, imgOutBuf)
    kernel.set_arg(3, np.float32(threshold))
    # Upload both inputs, run one work-item per pixel, read back blocking.
    cl.enqueue_copy(queue, imgInBuf1, img1, origin=(0, 0), region=shape, is_blocking=False)
    cl.enqueue_copy(queue, imgInBuf2, img2, origin=(0, 0), region=shape, is_blocking=False)
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
    cl.enqueue_copy(queue, result, imgOutBuf, origin=(0, 0), region=shape, is_blocking=True)
    return result
def loadKernel(self):
    """Build the 3D masking OpenCL program and create the kernel chosen by self.maskChoice.

    Returns True on success, False when the program failed to build.
    """
    source_path = "../OpenCL/Mask3D.cl"
    try:
        kernel_source = pkg.resource_string(__name__, source_path).decode()
        self.program = cl.Program(self.clattr.context, kernel_source).build()
    except Exception:
        return False
    self.kernel = cl.Kernel(self.program, self.maskChoice)
    return True
def loadKernel(self):
    """Build the 3D morphological-erosion OpenCL program and create its kernels.

    Also lazily allocates the shared temporary output buffer on first use.

    Returns:
        bool: True on success, False when the build failed.
    """
    try:
        filename = "../OpenCL/MMero3D.cl"
        # resource_string returns bytes; decode so cl.Program receives source
        # text (consistent with the Mask3D loader in this file).
        self.program = cl.Program(
            self.clattr.context,
            pkg.resource_string(__name__, filename).decode()).build()
    except Exception:
        return False
    # Erosion needs a scratch buffer the same size as the input; allocate once.
    if self.clattr.outputTmpBuffer is None:
        self.clattr.outputTmpBuffer = cl.Buffer(
            self.clattr.context, cl.mem_flags.READ_WRITE,
            self.clattr.inputBuffer.size)
    # Two-pass filter: an init kernel followed by the main erosion kernel.
    self.kernel = cl.Kernel(self.program, "MMero3DFilterInit")
    self.kernel2 = cl.Kernel(self.program, "MMero3DFilter")
    return True
def loadKernel(self):
    """Build the FFT-filter OpenCL program and create its kernel.

    Unlike the other loaders in this file, a build failure propagates to the
    caller instead of returning False.

    Returns:
        bool: True when the kernel was created.
    """
    try:
        filename = "../OpenCL/FFTFilter.cl"
        # resource_string returns bytes; decode so cl.Program receives source
        # text (consistent with the Mask3D loader in this file).
        program = cl.Program(
            self.clattr.context,
            pkg.resource_string(__name__, filename).decode()).build()
    except Exception:
        raise  # bare re-raise keeps the original traceback (was: raise e)
    self.kernel = cl.Kernel(program, "FFTFilter")
    return True
def set_program(self, file_name, function_name):
    """Load, build and select the OpenCL kernel *function_name* from *file_name*.

    Clears any previously configured program, then resets the per-argument
    mission buffers/args to match the number of varying arguments.
    """
    self._clear_program()
    self._fname = function_name
    self._get_cl_typenames_in_kernel(file_name, function_name)
    # fix: close the kernel source file deterministically
    with open(file_name) as source_file:
        self._program = cl.Program(self._context, source_file.read()).build()
    self._kernel = cl.Kernel(self._program, function_name)
    self._mission_global_buffers = [None] * len(self._varying_args)
    # One independent argument list per varying argument.
    self._mission_global_args = [[] for _ in range(len(self._varying_args))]
    self._program_settle = True
def callFuncFromProgram(self, strMethodName, *args, **argd):
    """Invoke kernel *strMethodName* from self.program with the given launch args.

    When a local work size tuple is supplied as args[1], it is validated
    against the device's work-group capability before launching.

    Returns:
        The enqueue event, or None when the program exposes no such kernel.
    """
    from functools import reduce  # py3: reduce is no longer a builtin
    methodCall = getattr(self.program, strMethodName)
    if methodCall:
        # args[1] is the local work size when given as a non-empty tuple.
        if len(args) >= 2 and isinstance(args[1], tuple) and args[1]:
            wgs = cl.Kernel(self.program, strMethodName).get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE, self.curDevice)
            local_worksize = reduce(lambda x, y: x * y, args[1])
            print('local size : ', local_worksize)  # was a py2 print statement
            assert wgs >= local_worksize, \
                'Out of capability, please reduce the local work size for %s()' % (strMethodName)
        evt = methodCall(self.queue, *args)
        return evt
    return None
def loadKernel(self):
    """Build the bilateral-filtering OpenCL program, precompute the spatial and
    range weighting kernels on the device, and create the main filter kernel.

    Returns:
        bool: True on success, False when the build failed.
    """
    try:
        filename = "../OpenCL/BilateralFiltering.cl"
        # resource_string returns bytes; decode so cl.Program receives source
        # text (consistent with the Mask3D loader in this file).
        self.program = cl.Program(
            self.clattr.context,
            pkg.resource_string(__name__, filename).decode()).build()
    except Exception:
        return False
    # Precompute the two Gaussian weight tables on the device.
    self.spatialKernel = self.makeKernel(self.spatialRadius)
    self.rangeKernel = self.makeKernel(self.rangeRadius)
    self.kernel = cl.Kernel(self.program, 'BilateralFilter')
    return True
def makeKernel(self, r):
    """Build and normalize a weight table of radius *r* on the device.

    Runs the 'makeKernel' OpenCL kernel to fill a device buffer, sums the
    result on the host, then runs 'normalizeKernel' to divide by the sum.

    Returns:
        cl.Buffer holding the normalized float32 weight table.
    """
    radius = r + 1
    # CPU devices prefer smaller work-groups than GPUs.
    minWorkingGroup = 256
    if 'CPU' in self.clattr.device.name:
        minWorkingGroup = 64
    bufferSize = radius**2 - 1
    localSize = min(self.clattr.device.max_work_group_size, minWorkingGroup)
    # NOTE(review): globalSize is a byte count here (rounded-up element count
    # * sizeof(float32)) but is later also used as the ND-range size — that
    # launches 4x more work-items than elements; confirm this is intentional.
    globalSize = self.clattr.roundUp(localSize, bufferSize) * np.float32(0).nbytes
    buffer = cl.Buffer(self.clattr.context, cl.mem_flags.READ_WRITE, size=globalSize)
    kernel = cl.Kernel(self.program, 'makeKernel')
    kernel.set_args(np.float32(radius), buffer, np.int32(bufferSize))
    globalSize = [int(globalSize)]
    localSize = [int(localSize)]
    cl.enqueue_nd_range_kernel(self.clattr.queue, kernel, globalSize, localSize)
    # Read the raw weights back and accumulate their total on the host.
    output = np.empty(bufferSize).astype(np.float32)
    cl.enqueue_copy(self.clattr.queue, output, buffer)
    self.clattr.queue.finish()
    total = np.float32(0)
    for i in range(bufferSize):
        total += output[i]
    # Second pass: divide every weight by the total, in place on the device.
    normalizeKernel = cl.Kernel(self.program, 'normalizeKernel')
    normalizeKernel.set_args(np.float32(total), buffer, np.int32(bufferSize))
    cl.enqueue_nd_range_kernel(self.clattr.queue, normalizeKernel, globalSize, localSize)
    return buffer
def init_hal(self, buffIn, buffOut):
    """Program the FPGA binary (self.xclbin), then create the command queue,
    the three RTL stage kernels, and host-pointer-backed I/O buffers."""
    # Get platform/device information
    clPlatform = cl.get_platforms()[0]
    clDevices = clPlatform.get_devices()
    clDevice = clDevices[0]
    self.ocl_ctx = cl.Context(devices=clDevices)
    with open(self.xclbin, "rb") as binary_file:
        binary = binary_file.read()
    # Program created from a precompiled binary, not from source text.
    self.ocl_prg = cl.Program(self.ocl_ctx, clDevices, [binary])
    # Init Command Queue (out-of-order execution + profiling enabled)
    qprops = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE|\
        cl.command_queue_properties.PROFILING_ENABLE
    self.ocl_q = cl.CommandQueue(context=self.ocl_ctx, device=clDevice,
                                 properties=qprops)
    # Create Kernels
    self.ocl_krnl_scramb_stage = cl.Kernel(self.ocl_prg, "krnl_scrambler_stage_rtl")
    self.ocl_krnl_input_stage = cl.Kernel(self.ocl_prg, "krnl_input_stage_rtl")
    self.ocl_krnl_output_stage = cl.Kernel(self.ocl_prg, "krnl_output_stage_rtl")
    # Create Buffers backed by the caller's host memory (USE_HOST_PTR, size=0
    # lets the runtime take the size from the host buffer).
    self.buffer_input = cl.Buffer(self.ocl_ctx,
                                  cl.mem_flags.USE_HOST_PTR | cl.mem_flags.READ_ONLY,
                                  size=0, hostbuf=buffIn)
    self.buffer_output = cl.Buffer(self.ocl_ctx,
                                   cl.mem_flags.USE_HOST_PTR | cl.mem_flags.WRITE_ONLY,
                                   size=0, hostbuf=buffOut)
def __init__(self, logger, difficulty):
    """Set up an OpenCL context/queue, build the sha256 mining kernel, and
    size the work configuration from the devices' work-group capabilities.

    Any failure is logged (with traceback) rather than propagated.
    """
    self.logger = logger
    try:
        #platform = cl.get_platforms()[0]
        #self.devices = platform.get_devices(cl.device_type.GPU)
        #self.context = cl.Context(self.devices, None, None)
        self.context = cl.create_some_context()
        self.devices = self.context.get_info(cl.context_info.DEVICES)
        self.queue = cl.CommandQueue(
            self.context,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
        kernelFile = open('/tmp/chady256.cl', 'r')
        self.miner = cl.Program(self.context, kernelFile.read()).build()
        kernelFile.close()
        # Sum work-group sizes across devices; keep the last device's
        # preferred multiple (NOTE(review): overwritten each iteration —
        # confirm that is intended for multi-device contexts).
        self.WORK_GROUP_SIZE = 0
        self.preferred_multiple = 0
        for device in self.devices:
            self.WORK_GROUP_SIZE += self.miner.sha256_crypt_kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
            self.preferred_multiple = cl.Kernel(
                self.miner, 'sha256_crypt_kernel').get_work_group_info(
                    cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                    device)
        self.logger.info('Best workgroup size :' + str(self.WORK_GROUP_SIZE))
        self.logger.info('Preferred multiple: ' + str(self.preferred_multiple))
        self.nounce_begin = 0
        # data_info[0] is the payload length in bytes fed to the kernel.
        self.data_info = np.zeros(1, np.uint32)
        self.data_info[0] = 76
        self.globalThreads = self.WORK_GROUP_SIZE * 1000
        self.localThreads = 1
        self.blocks = np.zeros(self.data_info[0] * self.globalThreads, np.uint8)
        self.difficulty = difficulty
        self.logger.info("HERE")
    except Exception as inst:
        # Best-effort init: log the failure and leave the miner unconfigured.
        self.logger.exception("Init")
        self.logger.error(type(inst))
        self.logger.error((inst.args))
def setup_device():
    """Create an OpenCL context/queue, program the device from the xclbin
    binary, and return [context, queue, mem_flags, kernel].

    Relies on the module-level ``xclbin_kernel`` path.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    dev = cl.get_platforms()[1].get_devices()
    # fix: close the binary file deterministically
    with open(xclbin_kernel, "rb") as binary_file:
        binary = binary_file.read()
    prg = cl.Program(ctx, dev, [binary])
    prg.build()
    print(dev)
    print("Device is programmed, testing...")
    krnl_vadd = cl.Kernel(prg, "top")
    return [ctx, queue, mf, krnl_vadd]
def calculate(self, range_start, range_end):
    """Run MH sampling for problems in [range_start, range_end) on the device.

    Optionally runs burn-in first, then draws samples in batches, finally
    reads all result buffers back into their host arrays.
    """
    nmr_problems = range_end - range_start
    # Use the kernel's preferred work-group size multiple as the group size.
    workgroup_size = cl.Kernel(
        self._sampling_kernel, 'sample').get_work_group_info(
            cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
            self._cl_environment.device)
    data_buffers, readout_items = self._get_buffers(workgroup_size)
    iteration_offset = self._mh_state.nmr_samples_drawn
    if self._burn_length > 0:
        iteration_offset = self._enqueue_burnin(range_start, range_end,
                                                workgroup_size, data_buffers,
                                                iteration_offset)
    # Samples live in host memory; the device writes through USE_HOST_PTR.
    samples_buf = cl.Buffer(
        self._cl_run_context.context,
        cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR,
        hostbuf=self._samples)
    data_buffers.append(samples_buf)
    readout_items.append([samples_buf, self._samples])
    # Split the total iteration count into device-friendly batches.
    iteration_batch_sizes = self._get_sampling_batch_sizes(
        self._nmr_samples * (self._sample_intervals + 1),
        self._max_iterations_per_batch)
    for nmr_iterations in iteration_batch_sizes:
        # One work-group per problem; offset selects this range's problems.
        self._sampling_kernel.sample(
            self._cl_run_context.queue,
            (int(nmr_problems * workgroup_size), ),
            (int(workgroup_size), ),
            np.uint64(nmr_iterations),
            np.uint64(iteration_offset),
            *data_buffers,
            global_offset=(range_start * workgroup_size, ))
        iteration_offset += nmr_iterations
    for buffer, host_array in readout_items:
        self._enqueue_readout(buffer, host_array, range_start, range_end)
def get_context(self):
    """Pick the best available OpenCL device, create context and queue, and
    build the raytracer program and kernel.

    Device choice: first device seen, then upgraded whenever another device
    beats it on BOTH clock frequency and compute units.

    Raises:
        EnvironmentError: when no OpenCL platform is available.
    """
    platforms = pyopencl.get_platforms()
    if not platforms:
        raise EnvironmentError(
            "No tienes plataformas OpenCL, intenta instalar los drivers.")
    self.device = None
    for plt in platforms:
        dev = plt.get_devices(pyopencl.device_type.ALL)
        if dev:
            for d in dev:
                if self.device is None:
                    self.device = d
                elif d.max_clock_frequency > self.device.max_clock_frequency and d.max_compute_units > self.device.max_compute_units:
                    self.device = d
    self.ctx = pyopencl.Context([self.device])
    self.queue = pyopencl.CommandQueue(self.ctx)
    # fix: close the kernel source file deterministically
    with open("raytracer.cl", "r") as kernel_file:
        self.program = pyopencl.Program(self.ctx, kernel_file.read()).build()
    self.kernel = pyopencl.Kernel(self.program, "raytracer")
def calculate_estimated_kernel_usage(prog, ctx, kernel_names):
    """Print estimated resource usage for every kernel name in *kernel_names*.

    Args:
        prog: built pyopencl Program.
        ctx: pyopencl Context; must hold exactly one device.
        kernel_names: iterable of kernel function names to inspect.

    Failures are printed (traceback) and swallowed; nothing is returned.
    """
    try:
        import pyopencl as cl
        from pyopencl import context_info as ci
        from pyopencl import kernel_work_group_info as kwgi
        devices = ctx.get_info(ci.DEVICES)
        assert len(devices) == 1, "Should only one device is used !"
        device = devices[0]
        for name in kernel_names:
            kerKer = cl.Kernel(prog, name)
            lm = kerKer.get_work_group_info(kwgi.LOCAL_MEM_SIZE, device)
            pm = kerKer.get_work_group_info(kwgi.PRIVATE_MEM_SIZE, device)
            cwgs = kerKer.get_work_group_info(kwgi.COMPILE_WORK_GROUP_SIZE, device)
            pwgsm = kerKer.get_work_group_info(
                kwgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device)
            print("[%s]\tEstimated usage : Local mem (%d)/ Private mem (%d)"\
                  "/ Compile WG size (%s)/ Preffered WG size multiple (%d)"\
                  %(name, lm, pm, str(cwgs), pwgsm))
    except Exception:  # was a bare except: do not swallow SystemExit/KeyboardInterrupt
        import traceback
        traceback.print_exc()
def rtl_adder_run(xclbinpath):
    """Run the RTL adder pipeline on the FPGA and verify against a CPU model.

    Programs the device from *xclbinpath*, activates the DRM session, runs the
    input/adder/output stage kernels, and compares hardware results with the
    expected software results.

    Raises:
        RuntimeError: when the board probe fails or results mismatch.
    """
    global boardHandler
    source_input = np.arange(DATA_SIZE, dtype=np.uint32)
    source_sw_results = np.arange(INCR_VALUE, DATA_SIZE + INCR_VALUE, dtype=np.uint32)
    source_hw_results = np.zeros(DATA_SIZE, np.uint32)
    ##OPENCL HOST CODE AREA START
    # Get platform/device information
    clPlatform = cl.get_platforms()[0]
    clDevices = clPlatform.get_devices()
    clDevice = clDevices[0]
    ctx = cl.Context(devices=clDevices)
    with open(xclbinpath, "rb") as binary_file:
        binary = binary_file.read()
    prg = cl.Program(ctx, clDevices, [binary])
    # Init xclhal2 library
    if xclProbe() < 1:
        print("[ERROR] xclProbe failed ...")
        # fix: bare `raise` with no active exception raised a meaningless
        # RuntimeError; raise an explicit one instead.
        raise RuntimeError("xclProbe failed")
    boardHandler = xclOpen(0, ctypes.c_char_p(b"xrt_logfile.log"),
                           xclVerbosityLevel.XCL_INFO)
    ##ACCELIZE DRMLIB CODE AREA START
    drm_manager = DrmManager(
        # Configuration files paths
        "./conf.json", "./cred.json",
        # Read/write register functions callbacks
        drm_read_callback, drm_write_callback,
    )
    drm_manager.activate()
    print(f"[DRMLIB] Session ID: {drm_manager.get('session_id')}")
    time.sleep(2)
    ##ACCELIZE DRMLIB CODE AREA STOP
    qprops = cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE|\
        cl.command_queue_properties.PROFILING_ENABLE
    with cl.CommandQueue(context=ctx, device=clDevice, properties=qprops) as q:
        # Create Kernels
        krnl_adder_stage = cl.Kernel(prg, "krnl_adder_stage_rtl")
        krnl_input_stage = cl.Kernel(prg, "krnl_input_stage_rtl")
        krnl_output_stage = cl.Kernel(prg, "krnl_output_stage_rtl")
        # Create Buffer
        buffer_input = cl.Buffer(ctx,
                                 cl.mem_flags.USE_HOST_PTR | cl.mem_flags.READ_ONLY,
                                 size=0, hostbuf=source_input)
        # NOTE(review): output buffer is flagged READ_ONLY here while the
        # similar init_hal() uses WRITE_ONLY — confirm which is intended.
        buffer_output = cl.Buffer(ctx,
                                  cl.mem_flags.USE_HOST_PTR | cl.mem_flags.READ_ONLY,
                                  size=0, hostbuf=source_hw_results)
        # Set the Kernel Arguments
        npSize = np.int32(DATA_SIZE)
        npIncr = np.int32(INCR_VALUE)
        krnl_input_stage.set_args(buffer_input, npSize)
        krnl_adder_stage.set_args(npIncr, npSize)
        krnl_output_stage.set_args(buffer_output, npSize)
        # Copy input data to device global memory
        cl.enqueue_migrate_mem_objects(q, [buffer_input], flags=0)
        # Launch the Kernel
        cl.enqueue_nd_range_kernel(q, krnl_input_stage, [1], [1])
        cl.enqueue_nd_range_kernel(q, krnl_adder_stage, [1], [1])
        cl.enqueue_nd_range_kernel(q, krnl_output_stage, [1], [1])
        # Copy Result from Device Global Memory to Host Local Memory
        cl.enqueue_migrate_mem_objects(q, [buffer_output],
                                       flags=cl.mem_migration_flags.HOST)
        q.finish()
    ##OPENCL HOST CODE AREA STOP
    ##ACCELIZE DRMLIB CODE AREA START
    drm_manager.deactivate()
    ##ACCELIZE DRMLIB CODE AREA STOP
    # Release xclhal2 board handler
    #xclClose(boardHandler)  # /!\ XRT Python binding is in development
    # state, xclClose() generate crash at the
    # time this script is written
    diff = source_hw_results != source_sw_results
    if diff.any():
        # fix: `i` was never defined here (NameError); report the first
        # mismatching index explicitly.
        i = int(np.flatnonzero(diff)[0])
        print(f"Error: Result mismatch i={i} "
              f"CPU={source_sw_results[i]} != "
              f"DEVICE={source_hw_results[i]}")
        raise RuntimeError("result mismatch")
    print("TEST PASSED")
def render(numParticles, fireColors, dumpFrames=False, clDebug=False):
    """Render the particle system with the given particle count and color scheme.

    Args:
        numParticles: number of particles to simulate and draw.
        fireColors: if True, compile the kernels with the FIRE_COLORS define.
        dumpFrames: if True, write each rendered frame as PNG into 'dump/'.
        clDebug: if True, show the output of the OpenCL compiler.
    """
    # show output of OpenCL compiler
    if clDebug:
        os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
    # if frames should be dumped, create directory if necessary
    dumpDir = 'dump'
    if dumpFrames and not os.path.exists(dumpDir):
        os.mkdir(dumpDir)
    # setup OpenCL
    platforms = cl.get_platforms()  # a platform corresponds to a driver (e.g. AMD)
    platform = platforms[0]  # take first platform
    devices = platform.get_devices(cl.device_type.GPU)  # get GPU devices of selected platform
    device = devices[0]  # take first GPU
    context = cl.Context([device])  # put selected GPU into context object
    queue = cl.CommandQueue(context, device)  # create command queue for selected GPU and context
    # setup buffer for particles
    sizeParticleStruct = 32  # sizeof(struct Particle) on the device
    bufParticles = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=sizeParticleStruct*numParticles, hostbuf=None)
    # setup random values (for random speed and color)
    random.seed()
    randVals = np.array([random.random() - 0.5 for _ in range(2 * numParticles)], dtype=np.float32)
    bufRandVals = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=randVals)
    # setup output image
    windowSize = 480
    colorChannels = 4  # RGBA
    sizeofColorChannel = 4  # we need int32 to perform atomic operations in the kernel (multiple particles at same position)
    img = np.zeros([windowSize, windowSize, colorChannels], dtype=np.int32)  # must be square image to ignore distortion
    imgShape = (windowSize, windowSize)  # 2d shape of image
    imgBuf = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=windowSize*windowSize*colorChannels*sizeofColorChannel)
    # setup kernels
    compilerSettings = ('-DWINDOW_SIZE=%d'%(windowSize)) + ' ' + ('-DFIRE_COLORS' if fireColors else '')
    program = cl.Program(context, open('kernel.cl').read()).build(compilerSettings)
    initParticles = cl.Kernel(program, 'initParticles')
    updateParticles = cl.Kernel(program, 'updateParticles')
    clearCanvas = cl.Kernel(program, 'clearCanvas')
    drawParticles = cl.Kernel(program, 'drawParticles')
    saturate = cl.Kernel(program, 'saturate')
    drawEmitter = cl.Kernel(program, 'drawEmitter')
    # init particles
    initParticles.set_arg(0, bufParticles)
    initParticles.set_arg(1, bufRandVals)
    cl.enqueue_nd_range_kernel(queue, initParticles, (numParticles,), None)
    # do some (invisible) iterations for smooth particle distribution
    for t in range(1000):
        updateParticles.set_arg(0, bufParticles)
        cl.enqueue_nd_range_kernel(queue, updateParticles, (numParticles,), None)
    # rendering loop
    ctr = 0
    while True:
        # clear canvas
        clearCanvas.set_arg(0, imgBuf)
        cl.enqueue_nd_range_kernel(queue, clearCanvas, imgShape, None)
        # draw all particles
        drawParticles.set_arg(0, bufParticles)
        drawParticles.set_arg(1, imgBuf)
        cl.enqueue_nd_range_kernel(queue, drawParticles, (numParticles,), None)
        # saturate
        saturate.set_arg(0, imgBuf)
        cl.enqueue_nd_range_kernel(queue, saturate, imgShape, None)
        # draw emitter
        drawEmitter.set_arg(0, imgBuf)
        cl.enqueue_nd_range_kernel(queue, drawEmitter, (1,), None)
        # update particles
        updateParticles.set_arg(0, bufParticles)
        cl.enqueue_nd_range_kernel(queue, updateParticles, (numParticles,), None)
        # copy result from GPU (blocking, so the image is complete here)
        cl.enqueue_copy(queue, img, imgBuf, is_blocking=True)
        # show image (and dump if specified)
        imgU8 = img[:,:,0:3].astype(np.uint8)
        cv2.imshow("Particle system [press ESC to exit]", imgU8)
        if dumpFrames:
            ctr += 1
            cv2.imwrite('%s/%d.png'%(dumpDir, ctr), imgU8)
        # exit with ESC
        keyPressed = cv2.waitKey(10)
        if keyPressed == 27:
            break
import sys
import os
import numpy as np
import pyopencl as cl

# Run a precompiled "hello_world" kernel binary with sizes from the CLI:
#   script.py platform_id device_id global_size local_size
if len(sys.argv) < 5:
    raise ValueError(sys.argv[0] + " platform_id device_id global_size local_size")
platform_index = int(sys.argv[1])
device_index = int(sys.argv[2])
global_work_size = int(sys.argv[3])
local_work_size = int(sys.argv[4])
plat = cl.get_platforms()[platform_index]
print("Platform: " + plat.name)
dev = plat.get_devices()[device_index]
print(" -- Device: " + dev.name)
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)
# fix: close the binary file deterministically
with open("hwv.bin", "rb") as binary_file:
    binary = binary_file.read()
program = cl.Program(context, [dev], [binary])
program.build()
kernel = cl.Kernel(program, "hello_world")
cl.enqueue_nd_range_kernel(queue, kernel, [global_work_size], [local_work_size])
queue.finish()
# NOTE(review): fragment — this chunk begins inside a triple-quoted OpenCL
# source string whose opening (and the enclosing program setup) is not in
# view; kept verbatim because the cut-off prefix cannot be reconstructed.
c_temp = c_temp * (a_temp/2.0); // times 1/2 my a c[gid] = c_temp; // store result in global memory } """).build() global_size = (data_points, ) local_size = (workers, ) print("yo") #print(get_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE) #preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \ # cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \ # device) #preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device) #preferred_multiple = cl.Kernel(prg, 'sum') x = cl.Kernel(prg, 'sum') preferred_multiple = 256 print("yo2") print("Data points:", data_points) print("Workers:", workers) print("Preferred work group size multiple:", preferred_multiple) print(preferred_multiple) if (workers % preferred_multiple): print("Number of workers not a preferred multiple (%d*N)." \ % (preferred_multiple)) print("Performance may be reduced.")
def __init__(self, program: Program, name: str) -> None:
    """Wrap the pyopencl kernel called *name* from *program*'s underlying
    pyopencl program object."""
    self._pyopencl_kernel = pyopencl.Kernel(program._pyopencl_program, name)
# Set up OpenCL on the second platform; `device` is the full device list.
platform = cl.get_platforms()[1]
device = platform.get_devices()
context = cl.Context(device)
queue = cl.CommandQueue(context, device[0],
                        cl.command_queue_properties.PROFILING_ENABLE)

# Set up GPU program
with open('sum.cl', 'r') as program_file:  # fix: close the file deterministically
    program_text = program_file.read()
program = cl.Program(context, program_text)
try:
    program.build()
except Exception:  # was a bare except
    print("Build log:")
    # fix: get_build_info expects a single device, not the device list
    print(program.get_build_info(device[0], cl.program_build_info.LOG))
    raise
adder = cl.Kernel(program, 'add')
print("Kernel Name:")  # fix: dropped stray py2 trailing comma
print(adder.get_info(cl.kernel_info.FUNCTION_NAME))

# Set up kernel data exchange pipeline
size = 1000
a = np.ones(size, dtype=np.float32)
b = np.ones(size, dtype=np.float32) * 2
c = np.zeros_like(a)
a_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)
b_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, b.nbytes)
c_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, c.nbytes)
cl.enqueue_copy(queue, a_buffer, a, is_blocking=True)
# NOTE(review): fragment — this chunk begins inside a triple-quoted OpenCL
# source string (KeeLoq decryption kernel) whose opening is not in view;
# kept verbatim because the cut-off prefix cannot be reconstructed.
key_temp = key[gid]; // my a element (by global ref) ciph_temp = ciph[gid]; // my b element (by global ref) for (r = 0; r < 1; r++) { ciph_temp = (ciph_temp>>1)^bit(ciph_temp,31)^bit(ciph_temp,15)^bit(key_temp,(15-r)&63)^bit(KeeLoq_NLF,g5(ciph_temp,0,8,19,25,30)); } plain_temp = ciph_temp; plain[gid] = plain_temp; // store result in global memory } """).build() global_size = (data_points, ) local_size = (workers, ) preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \ cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \ device) ''' print("Data points:", data_points) print("Workers:", workers) print("Preferred work group size multiple:", preferred_multiple) if (workers % preferred_multiple): print("Number of workers not a preferred multiple (%d*N)." \ % (preferred_multiple)) print("Performance may be reduced.") ''' exec_evt = prg.sum(queue, global_size, local_size, key_buf, ciph_buf, dest_buf) exec_evt.wait()
def process_sub_matrix(self, *args, **kwargs):
    """Worker loop: drain sub-matrices from the queue and run the vertical and
    diagonal line-detection kernels for each on the assigned OpenCL device.

    Expects kwargs 'device' (key into the opencl context/queue/program maps)
    and 'sub_matrix_queue' (queue of sub-matrix work items). Loops until the
    queue is empty, accumulating per-device frequency distributions and
    event runtimes.
    """
    device = kwargs['device']
    sub_matrix_queue = kwargs['sub_matrix_queue']
    context = self.opencl.contexts[device]
    command_queue = self.opencl.command_queues[device]
    program = self.opencl.programs[device]
    vertical_kernel = cl.Kernel(program, 'vertical')
    diagonal_kernel = cl.Kernel(program, self.settings.diagonal_kernel_name)
    while True:
        try:
            # Non-blocking get: an empty queue ends the worker loop below.
            sub_matrix = sub_matrix_queue.get(False)
            transfer_from_device_events = []
            transfer_to_device_events = []
            create_matrix_events = []
            vertical_events = []
            diagonal_events = []
            # Vectors X
            vectors_x = self.get_vectors_x(sub_matrix)
            vectors_x_buffer = cl.Buffer(
                context, cl.mem_flags.READ_ONLY,
                vectors_x.size * vectors_x.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue,
                                        vectors_x_buffer,
                                        vectors_x,
                                        device_offset=0,
                                        wait_for=None,
                                        is_blocking=False))
            # Vectors Y
            vectors_y = self.get_vectors_y(sub_matrix)
            vectors_y_buffer = cl.Buffer(
                context, cl.mem_flags.READ_ONLY,
                vectors_y.size * vectors_y.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue,
                                        vectors_y_buffer,
                                        vectors_y,
                                        device_offset=0,
                                        wait_for=None,
                                        is_blocking=False))
            # Recurrence points
            recurrence_points, \
                recurrence_points_start, \
                recurrence_points_end = self.get_recurrence_points(sub_matrix)
            recurrence_points_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                recurrence_points.size * recurrence_points.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue,
                                        recurrence_points_buffer,
                                        recurrence_points,
                                        device_offset=0,
                                        wait_for=None,
                                        is_blocking=False))
            # Vertical frequency distribution
            vertical_frequency_distribution = self.get_empty_local_frequency_distribution()
            vertical_frequency_distribution_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                vertical_frequency_distribution.size *
                vertical_frequency_distribution.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(
                    command_queue,
                    vertical_frequency_distribution_buffer,
                    vertical_frequency_distribution,
                    device_offset=0,
                    wait_for=None,
                    is_blocking=False))
            # White vertical frequency distribution
            white_vertical_frequency_distribution = self.get_empty_local_frequency_distribution()
            white_vertical_frequency_distribution_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                white_vertical_frequency_distribution.size *
                white_vertical_frequency_distribution.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(
                    command_queue,
                    white_vertical_frequency_distribution_buffer,
                    white_vertical_frequency_distribution,
                    device_offset=0,
                    wait_for=None,
                    is_blocking=False))
            # Diagonal frequency distribution
            diagonal_frequency_distribution = self.get_empty_local_frequency_distribution()
            diagonal_frequency_distribution_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                diagonal_frequency_distribution.size *
                diagonal_frequency_distribution.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(
                    command_queue,
                    diagonal_frequency_distribution_buffer,
                    diagonal_frequency_distribution,
                    device_offset=0,
                    wait_for=None,
                    is_blocking=False))
            # Vertical carryover (line lengths continuing across sub-matrices)
            vertical_carryover, \
                vertical_carryover_start, \
                vertical_carryover_end = self.get_vertical_length_carryover(sub_matrix)
            vertical_carryover_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                vertical_carryover.size * vertical_carryover.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue,
                                        vertical_carryover_buffer,
                                        vertical_carryover,
                                        device_offset=0,
                                        wait_for=None,
                                        is_blocking=False))
            # White vertical carryover
            white_vertical_carryover, \
                white_vertical_carryover_start, \
                white_vertical_carryover_end = self.get_white_vertical_length_carryover(sub_matrix)
            white_vertical_carryover_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                white_vertical_carryover.size * white_vertical_carryover.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue,
                                        white_vertical_carryover_buffer,
                                        white_vertical_carryover,
                                        device_offset=0,
                                        wait_for=None,
                                        is_blocking=False))
            # Diagonal carryover
            diagonal_carryover, \
                diagonal_carryover_start, \
                diagonal_carryover_end = self.get_diagonal_length_carryover(sub_matrix)
            diagonal_carryover_buffer = cl.Buffer(
                context, cl.mem_flags.READ_WRITE,
                diagonal_carryover.size * diagonal_carryover.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_write_buffer(command_queue,
                                        diagonal_carryover_buffer,
                                        diagonal_carryover,
                                        device_offset=0,
                                        wait_for=None,
                                        is_blocking=False))
            # Wait for all host->device transfers before launching kernels.
            command_queue.finish()
            # Vertical kernel
            vertical_args = [
                vectors_x_buffer, vectors_y_buffer,
                np.uint32(sub_matrix.dim_x),
                np.uint32(sub_matrix.dim_y),
                np.uint32(self.settings.embedding_dimension),
                np.float32(self.settings.neighbourhood.radius),
                recurrence_points_buffer,
                vertical_frequency_distribution_buffer,
                vertical_carryover_buffer,
                white_vertical_frequency_distribution_buffer,
                white_vertical_carryover_buffer
            ]
            OpenCL.set_kernel_args(vertical_kernel, vertical_args)
            # Round the global size up to a multiple of the max work-group size.
            global_work_size = [
                int(sub_matrix.dim_x +
                    (device.max_work_group_size -
                     (sub_matrix.dim_x % device.max_work_group_size)))
            ]
            local_work_size = None
            vertical_events.append(
                cl.enqueue_nd_range_kernel(command_queue, vertical_kernel,
                                           global_work_size, local_work_size))
            command_queue.finish()
            # Diagonal kernel: symmetric matrices only process one triangle
            # (extra diagonal-offset argument); otherwise all diagonals.
            if self.settings.is_matrix_symmetric:
                diagonal_args = [
                    vectors_x_buffer, vectors_y_buffer,
                    np.uint32(sub_matrix.dim_x),
                    np.uint32(sub_matrix.dim_y),
                    np.uint32(sub_matrix.start_x),
                    np.uint32(sub_matrix.start_y),
                    np.uint32(self.settings.embedding_dimension),
                    np.float32(self.settings.neighbourhood.radius),
                    np.uint32(self.settings.theiler_corrector),
                    np.uint32(self.get_diagonal_offset(sub_matrix)),
                    diagonal_frequency_distribution_buffer,
                    diagonal_carryover_buffer
                ]
                global_work_size = [
                    int(sub_matrix.dim_x +
                        (device.max_work_group_size -
                         (sub_matrix.dim_x % device.max_work_group_size)))
                ]
            else:
                diagonal_args = [
                    vectors_x_buffer, vectors_y_buffer,
                    np.uint32(sub_matrix.dim_x + sub_matrix.dim_y - 1),
                    np.uint32(sub_matrix.dim_y),
                    np.uint32(sub_matrix.start_x),
                    np.uint32(sub_matrix.start_y),
                    np.uint32(self.settings.embedding_dimension),
                    np.float32(self.settings.neighbourhood.radius),
                    np.uint32(self.settings.theiler_corrector),
                    diagonal_frequency_distribution_buffer,
                    diagonal_carryover_buffer
                ]
                # One work-item per diagonal of the sub-matrix.
                global_work_size_x = sub_matrix.dim_x + sub_matrix.dim_y - 1
                global_work_size = [
                    int(global_work_size_x +
                        (device.max_work_group_size -
                         (global_work_size_x % device.max_work_group_size)))
                ]
            OpenCL.set_kernel_args(diagonal_kernel, diagonal_args)
            local_work_size = None
            diagonal_events.append(
                cl.enqueue_nd_range_kernel(command_queue, diagonal_kernel,
                                           global_work_size, local_work_size))
            command_queue.finish()
            # Read buffer (all device->host transfers, then one finish)
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, recurrence_points_buffer,
                    self.recurrence_points[
                        recurrence_points_start:recurrence_points_end],
                    device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, vertical_frequency_distribution_buffer,
                    vertical_frequency_distribution,
                    device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, vertical_carryover_buffer,
                    self.vertical_length_carryover[
                        vertical_carryover_start:vertical_carryover_end],
                    device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, white_vertical_frequency_distribution_buffer,
                    white_vertical_frequency_distribution,
                    device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, white_vertical_carryover_buffer,
                    self.white_vertical_length_carryover[
                        white_vertical_carryover_start:
                        white_vertical_carryover_end],
                    device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, diagonal_frequency_distribution_buffer,
                    diagonal_frequency_distribution,
                    device_offset=0, wait_for=None, is_blocking=False))
            transfer_from_device_events.append(
                cl.enqueue_read_buffer(
                    command_queue, diagonal_carryover_buffer,
                    self.diagonal_length_carryover[
                        diagonal_carryover_start:diagonal_carryover_end],
                    device_offset=0, wait_for=None, is_blocking=False))
            command_queue.finish()
            # Update frequency distributions (per-device accumulators)
            self.threads_vertical_frequency_distribution[
                device] += vertical_frequency_distribution
            self.threads_white_vertical_frequency_distribution[
                device] += white_vertical_frequency_distribution
            self.threads_diagonal_frequency_distribution[
                device] += diagonal_frequency_distribution
            # Get events runtimes
            runtimes = Runtimes()
            runtimes.transfer_to_device = self.opencl.convert_events_runtime(
                transfer_to_device_events)
            runtimes.transfer_from_device = self.opencl.convert_events_runtime(
                transfer_from_device_events)
            runtimes.create_matrix = self.opencl.convert_events_runtime(
                create_matrix_events)
            runtimes.detect_vertical_lines = self.opencl.convert_events_runtime(
                vertical_events)
            runtimes.detect_diagonal_lines = self.opencl.convert_events_runtime(
                diagonal_events)
            self.threads_runtimes[device] += runtimes
        except Queue.Empty:
            # No work left: terminate this worker.
            break
def __init__(self, batchSize, maxT, maxC, kernelVariant=1, enableGPUDebug=False):
    """Set up the OpenCL best-path decoder on the first available GPU.

    Args:
        batchSize: number of batch elements.
        maxT: number of time-steps.
        maxC: number of characters.
        kernelVariant: 1 (single-pass kernel) or 2 (two-pass kernels).
        enableGPUDebug: if True, force compiler output and disable the
            program cache so a GPU debugger (e.g. CodeXL) can attach.
    """
    # force rebuild of program such that GPU debugger can attach to kernel
    self.enableGPUDebug = enableGPUDebug
    if enableGPUDebug:
        os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
        os.environ['PYOPENCL_NO_CACHE'] = '1'

    # consts
    self.batchSize = batchSize
    self.maxT = maxT
    self.maxC = maxC
    assert kernelVariant in [1, 2]
    self.kernelVariant = kernelVariant

    # platform, context, queue
    platforms = cl.get_platforms()
    assert platforms
    self.platform = platforms[0]  # take first platform
    devices = self.platform.get_devices(cl.device_type.GPU)  # get GPU devices
    assert devices
    self.device = devices[0]  # take first GPU
    # context contains the first GPU
    self.context = cl.Context([self.device])
    # command queue to first GPU
    self.queue = cl.CommandQueue(self.context, self.device)

    # buffers: input batch (float32) and int32 result/temporary buffers
    sizeOfFloat32 = 4
    batchBufSize = batchSize * maxC * maxT * sizeOfFloat32
    self.batchBuf = cl.Buffer(self.context, cl.mem_flags.READ_ONLY,
                              size=batchBufSize, hostbuf=None)
    self.res = np.zeros([batchSize, maxT]).astype(np.int32)
    self.resBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY, self.res.nbytes)
    self.tmpBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY, self.res.nbytes)

    # compile program and use defines for program-constants to avoid
    # passing private variables
    buildOptions = '-D STEP_BEGIN={} -D MAX_T={} -D MAX_C={}'.format(
        2 ** math.ceil(math.log2(maxT)), maxT, maxC)
    # FIX: read the kernel source with a context manager instead of
    # leaking the file handle from a bare open(...).read()
    with open('BestPathCL.cl') as kernelFile:
        kernelSource = kernelFile.read()
    self.program = cl.Program(self.context, kernelSource).build(buildOptions)

    # variant 1: single pass
    if kernelVariant == 1:
        self.kernel1 = cl.Kernel(self.program, 'bestPathAndCollapse')
        self.kernel1.set_arg(0, self.batchBuf)
        self.kernel1.set_arg(1, self.resBuf)

        # all time-steps must fit into a work-group
        assert maxT <= self.kernel1.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

    # variant 2: two passes
    else:
        # kernel1: calculate best path
        self.kernel1 = cl.Kernel(self.program, 'bestPath')
        self.kernel1.set_arg(0, self.batchBuf)
        self.kernel1.set_arg(1, self.tmpBuf)

        # kernel2: collapse best path
        self.kernel2 = cl.Kernel(self.program, 'collapsePath')
        self.kernel2.set_arg(0, self.tmpBuf)
        self.kernel2.set_arg(1, self.resBuf)

        # all chars must fit into a work-group
        assert maxC <= self.kernel1.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)
import numpy as np

if __name__ == "__main__":
    # Create context from devices in first accessible platform
    platform = cl.get_platforms()[0]
    devices = platform.get_devices()
    context = cl.Context(devices)

    # Create program from arith.cl file.
    # FIX: use a context manager so the source file is always closed
    # (the original open(...) handle was never released).
    with open('kernels/arith.cl', 'r') as program_file:
        program_text = program_file.read()
    program = cl.Program(context, program_text)

    # Build program and print log in the event of an error.
    # FIX: narrowed the bare `except:` so KeyboardInterrupt/SystemExit
    # are not intercepted; build failures are still logged and re-raised.
    try:
        program.build()
    except Exception:
        print("Build log:")
        print(program.get_build_info(devices[0], cl.program_build_info.LOG))
        raise

    # Create kernel from 'add' function
    add_kernel = cl.Kernel(program, 'add')

    # Create kernel from 'multiply' function
    mult_kernel = program.multiply

    # FIX: the original `print("Kernel Name:"),` carried a Python-2 style
    # trailing comma that built a throwaway tuple and split the output
    # across two lines; print label and value in one call instead.
    print("Kernel Name:", mult_kernel.get_info(cl.kernel_info.FUNCTION_NAME))
def process_sub_matrix(self, *args, **kwargs):
    """Compute recurrence matrices for queued sub matrices on one OpenCL device.

    Drains kwargs['sub_matrix_queue']: for each sub matrix, transfers both
    input time series to the device, runs the 'create_matrix' kernel over a
    global work size padded to the device work-group size, reads the matrix
    back, inserts it into the full recurrence matrix, and accumulates the
    per-device event runtimes. Returns when the queue is empty.

    Expected keyword arguments:
        device: key identifying the OpenCL device in the self.opencl
            containers (also provides max_work_group_size).
        sub_matrix_queue: Queue.Queue of sub matrices to process.
    """
    device = kwargs['device']
    sub_matrix_queue = kwargs['sub_matrix_queue']

    context = self.opencl.contexts[device]
    command_queue = self.opencl.command_queues[device]
    program = self.opencl.programs[device]

    create_matrix_kernel = cl.Kernel(program, 'create_matrix')

    while True:
        try:
            sub_matrix = sub_matrix_queue.get(False)

            transfer_from_device_events = []
            transfer_to_device_events = []
            create_matrix_events = []

            # FIX: cl.enqueue_write_buffer / cl.enqueue_read_buffer were
            # deprecated and later removed from pyopencl; cl.enqueue_copy
            # is the supported replacement (dest first, then src) and is
            # what the rest of this code base already uses.

            # Time series X
            time_series_x = self.get_time_series_x(sub_matrix)
            time_series_x_buffer = cl.Buffer(
                context,
                cl.mem_flags.READ_ONLY,
                time_series_x.size * time_series_x.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_copy(command_queue,
                                time_series_x_buffer,
                                time_series_x,
                                device_offset=0,
                                wait_for=None,
                                is_blocking=False))

            # Time series Y
            time_series_y = self.get_time_series_y(sub_matrix)
            time_series_y_buffer = cl.Buffer(
                context,
                cl.mem_flags.READ_ONLY,
                time_series_y.size * time_series_y.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_copy(command_queue,
                                time_series_y_buffer,
                                time_series_y,
                                device_offset=0,
                                wait_for=None,
                                is_blocking=False))

            # Recurrence matrix (host-side result container)
            matrix = self.get_recurrence_matrix(sub_matrix,
                                                data_type=self.data_type)
            matrix_buffer = cl.Buffer(context,
                                      cl.mem_flags.READ_WRITE,
                                      matrix.size * matrix.itemsize)
            transfer_to_device_events.append(
                cl.enqueue_copy(command_queue,
                                matrix_buffer,
                                matrix,
                                device_offset=0,
                                wait_for=None,
                                is_blocking=False))

            # Create matrix kernel arguments (order must match the
            # kernel signature in the OpenCL source)
            create_matrix_args = [
                time_series_x_buffer,
                time_series_y_buffer,
                np.uint32(sub_matrix.dim_x),
                np.uint32(self.settings.embedding_dimension),
                np.uint32(self.settings.time_delay),
                np.float32(self.settings.neighbourhood.radius),
                matrix_buffer
            ]
            OpenCL.set_kernel_args(create_matrix_kernel, create_matrix_args)

            # Pad the X dimension up to a multiple of the device
            # work-group size; the kernel is expected to bounds-check.
            global_work_size = [
                int(sub_matrix.dim_x +
                    (device.max_work_group_size -
                     (sub_matrix.dim_x % device.max_work_group_size))),
                int(sub_matrix.dim_y)
            ]
            local_work_size = None

            create_matrix_events.append(
                cl.enqueue_nd_range_kernel(command_queue,
                                           create_matrix_kernel,
                                           global_work_size,
                                           local_work_size))
            command_queue.finish()

            # Read buffer
            transfer_from_device_events.append(
                cl.enqueue_copy(command_queue,
                                matrix,
                                matrix_buffer,
                                device_offset=0,
                                wait_for=None,
                                is_blocking=False))
            command_queue.finish()

            # Insert in recurrence matrix
            self.insert_sub_matrix(sub_matrix, matrix)

            # Get events runtimes
            runtimes = Runtimes()
            runtimes.transfer_to_device = self.opencl.convert_events_runtime(
                transfer_to_device_events)
            runtimes.transfer_from_device = self.opencl.convert_events_runtime(
                transfer_from_device_events)
            runtimes.create_matrix = self.opencl.convert_events_runtime(
                create_matrix_events)

            self.threads_runtimes[device] += runtimes
        except Queue.Empty:
            break