예제 #1
0
    def loadKernel(self):
        """Compile the 3-D morphological-dilation OpenCL program and create
        its two kernels.

        Returns:
            bool: True when the program built and both kernels were created,
            False when loading or building the program failed.
        """
        try:
            filename = "../OpenCL/MMdil3D.cl"
            # resource_string() returns bytes; decode to str for cl.Program,
            # consistent with the Mask3D loader in this file.
            self.program = cl.Program(
                self.clattr.context,
                pkg.resource_string(__name__, filename).decode()).build()
        except Exception:
            return False

        # Two-pass dilation: an init kernel followed by the filter kernel.
        self.kernel = cl.Kernel(self.program, "MMdil3DFilterInit")
        self.kernel2 = cl.Kernel(self.program, "MMdil3DFilter")

        return True
예제 #2
0
    def _setup_kernel(self, program, kernel_name, *argv):
        """Create a kernel from *program* and bind each positional argument."""
        knl = cl.Kernel(program, kernel_name)
        for position, arg in enumerate(argv):
            knl.set_arg(position, arg)
        return knl
예제 #3
0
def applyMorphOp(imgIn, op):
	"""Apply a morphological operation (dilate/erode) to a gray image on the GPU."""

	# --- OpenCL setup: first GPU of the first platform ---
	platform = cl.get_platforms()[0]
	gpu = platform.get_devices(cl.device_type.GPU)[0]
	ctx = cl.Context([gpu])
	queue = cl.CommandQueue(ctx, gpu)

	# Output array mirrors the input; OpenCL images take the transposed shape.
	shape = imgIn.T.shape
	imgOut = np.empty_like(imgIn)

	# Device-side image buffers: 8-bit luminance, normalized on access.
	fmt = cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8)
	srcImg = cl.Image(ctx, cl.mem_flags.READ_ONLY, fmt, shape=shape)
	dstImg = cl.Image(ctx, cl.mem_flags.WRITE_ONLY, fmt, shape=shape)

	# Build the program and bind the kernel arguments.
	program = cl.Program(ctx, open('Erosion_Dilation.cl').read()).build()
	kernel = cl.Kernel(program, 'morphOpKernel')
	kernel.set_arg(0, srcImg)
	kernel.set_arg(1, np.uint32(op))  # operation type: dilate=0, erode=1
	kernel.set_arg(2, dstImg)

	# Upload, run one work-item per pixel, then block on the readback.
	cl.enqueue_copy(queue, srcImg, imgIn, origin=(0, 0), region=shape, is_blocking=False)
	cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
	cl.enqueue_copy(queue, imgOut, dstImg, origin=(0, 0), region=shape, is_blocking=True)

	return imgOut
예제 #4
0
def calculate_estimated_kernel_usage(prog, ctx, kernel_name):
    """Query and print per-device resource-usage estimates for one kernel.

    Args:
        prog: a built pyopencl.Program.
        ctx: the pyopencl.Context the program was built for (one device).
        kernel_name: name of the kernel function to inspect.

    Returns:
        (compile_work_group_size, preferred_multiple, local_mem, private_mem),
        or (None, None, None, None) when any query fails.
    """
    try:
        import pyopencl as cl
        from pyopencl import context_info as ci
        from pyopencl import kernel_work_group_info as kwgi
        devices = ctx.get_info(ci.DEVICES)
        assert len(devices) == 1, 'Should only one device is used !'
        device = devices[0]
        kernel = cl.Kernel(prog, kernel_name)
        lm = kernel.get_work_group_info(kwgi.LOCAL_MEM_SIZE, device)
        pm = kernel.get_work_group_info(kwgi.PRIVATE_MEM_SIZE, device)
        cwgs = kernel.get_work_group_info(kwgi.COMPILE_WORK_GROUP_SIZE, device)
        pwgsm = kernel.get_work_group_info(kwgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device)

        print('For kernel "{}" running on device {}:'.format(kernel.function_name, device.name))
        print('\t Max work-group size: {}'.format(cwgs))
        print('\t Recommended work-group multiple: {}'.format(pwgsm))
        print('\t Local mem used: {} of {}'.format(lm, device.local_mem_size))
        print('\t Private mem used: {}'.format(pm))
        return cwgs, pwgsm, lm, pm
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; any OpenCL failure is reported with a traceback.
        import traceback
        traceback.print_exc()
        return None, None, None, None
예제 #5
0
def test_that_python_args_fail(ctx_factory):
    """Check that kernels reject bare Python scalars as arguments.

    PyOpenCL cannot infer the device-side size of a bare Python int/float,
    so passing one must raise cl.LogicError; explicitly sized numpy scalars
    (np.float32 / np.int32) must be accepted.
    """
    context = ctx_factory()

    prg = cl.Program(
        context, """
        __kernel void mult(__global float *a, float b, int c)
        { a[get_global_id(0)] *= (b+c); }
        """).build()

    a = np.random.rand(50000)
    queue = cl.CommandQueue(context)
    mf = cl.mem_flags
    a_buf = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

    knl = cl.Kernel(prg, "mult")
    try:
        knl(queue, a.shape, None, a_buf, 2, 3)
        assert False, "PyOpenCL should not accept bare Python types as arguments"
    except cl.LogicError:
        pass

    try:
        prg.mult(queue, a.shape, None, a_buf, float(2), 3)
        assert False, "PyOpenCL should not accept bare Python types as arguments"
    except cl.LogicError:
        pass

    prg.mult(queue, a.shape, None, a_buf, np.float32(2), np.int32(3))

    a_result = np.empty_like(a)
    # enqueue_read_buffer() was removed from pyopencl; enqueue_copy() is the
    # modern equivalent (destination array first, then source buffer).
    cl.enqueue_copy(queue, a_result, a_buf).wait()
예제 #6
0
def Difference(img1, img2, threshold):
    """Threshold the per-pixel difference of two gray images on the GPU."""
    img1 = np.array(img1).astype('uint8')
    img2 = np.array(img2).astype('uint8')

    # First GPU of the first platform.
    device = cl.get_platforms()[0].get_devices(cl.device_type.GPU)[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(context, device)

    shape = img1.T.shape
    result = np.empty_like(img1)

    # 8-bit luminance images on the device: two inputs, one output.
    fmt = cl.ImageFormat(cl.channel_order.LUMINANCE,
                         cl.channel_type.UNORM_INT8)
    imgInBuf1 = cl.Image(context, cl.mem_flags.READ_ONLY, fmt, shape=shape)
    imgInBuf2 = cl.Image(context, cl.mem_flags.READ_ONLY, fmt, shape=shape)
    imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, fmt, shape=shape)

    program = cl.Program(context, open('Difference.cl').read()).build()

    # Bind the four kernel arguments in positional order.
    kernel = cl.Kernel(program, 'Difference')
    for pos, arg in enumerate((imgInBuf1, imgInBuf2, imgOutBuf,
                               np.float32(threshold))):
        kernel.set_arg(pos, arg)

    # Upload both inputs, run one work-item per pixel, block on the readback.
    cl.enqueue_copy(queue, imgInBuf1, img1, origin=(0, 0), region=shape,
                    is_blocking=False)
    cl.enqueue_copy(queue, imgInBuf2, img2, origin=(0, 0), region=shape,
                    is_blocking=False)
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
    cl.enqueue_copy(queue, result, imgOutBuf, origin=(0, 0), region=shape,
                    is_blocking=True)

    return result
예제 #7
0
    def loadKernel(self):
        """Build the Mask3D OpenCL program and fetch the selected mask kernel.

        Returns True on success, False when the program fails to build.
        """
        source_path = "../OpenCL/Mask3D.cl"
        try:
            source = pkg.resource_string(__name__, source_path).decode()
            self.program = cl.Program(self.clattr.context, source).build()
        except Exception:
            return False

        # The kernel name is chosen at runtime by the current mask selection.
        self.kernel = cl.Kernel(self.program, self.maskChoice)
        return True
예제 #8
0
    def loadKernel(self):
        """Compile the 3-D morphological-erosion program, allocate the shared
        temporary buffer if needed, and create both erosion kernels.

        Returns:
            bool: True on success, False when the program failed to build.
        """
        try:
            filename = "../OpenCL/MMero3D.cl"
            # resource_string() returns bytes; decode to str for cl.Program,
            # consistent with the Mask3D loader in this file.
            self.program = cl.Program(
                self.clattr.context,
                pkg.resource_string(__name__, filename).decode()).build()
        except Exception:
            return False

        # Erosion needs a scratch buffer the same size as the input; it is
        # created once and shared via clattr.
        if self.clattr.outputTmpBuffer is None:
            self.clattr.outputTmpBuffer = cl.Buffer(
                self.clattr.context, cl.mem_flags.READ_WRITE,
                self.clattr.inputBuffer.size)

        # Two-pass erosion: an init kernel followed by the filter kernel.
        self.kernel = cl.Kernel(self.program, "MMero3DFilterInit")
        self.kernel2 = cl.Kernel(self.program, "MMero3DFilter")
        return True
예제 #9
0
    def loadKernel(self):
        """Build the FFTFilter OpenCL program and create its kernel.

        Unlike the other loaders in this file, build failures propagate to
        the caller instead of returning False.

        Returns:
            bool: True when the build succeeds.
        """
        filename = "../OpenCL/FFTFilter.cl"
        # The previous `except Exception as e: raise e` was a no-op wrapper
        # that only truncated the traceback; let build errors propagate.
        program = cl.Program(self.clattr.context,
                             pkg.resource_string(__name__,
                                                 filename)).build()

        self.kernel = cl.Kernel(program, "FFTFilter")
        return True
예제 #10
0
 def set_program(self, file_name, function_name):
     """Load, build and select an OpenCL program/kernel pair.

     Args:
         file_name: path to the .cl source file.
         function_name: kernel function to instantiate from the program.
     """
     self._clear_program()
     self._fname = function_name
     self._get_cl_typenames_in_kernel(file_name, function_name)
     # Context manager guarantees the source file handle is closed.
     with open(file_name) as src:
         self._program = cl.Program(self._context, src.read()).build()
     self._kernel = cl.Kernel(self._program, function_name)
     n_varying = len(self._varying_args)
     self._mission_global_buffers = [None] * n_varying
     # One independent empty list per varying argument (not n aliases of
     # the same list), built with a comprehension instead of an index loop.
     self._mission_global_args = [[] for _ in range(n_varying)]
     self._program_settle = True
예제 #11
0
 def callFuncFromProgram(self, strMethodName, *args, **argd):
     # Look up a kernel entry point on the built program by name and invoke
     # it with the command queue prepended; returns the resulting event, or
     # None when the looked-up attribute is falsy.
     # NOTE: Python 2 code (`print` statement, builtin `reduce`).
     methodCall = getattr(self.program, strMethodName)
     if methodCall:
         # When a non-empty local-work-size tuple is supplied as args[1],
         # verify the device supports that many work-items per group
         # before launching.
         if len(args) >= 2 and type(args[1])==tuple and (not args[1]) != True:
             wgs = cl.Kernel(self.program, strMethodName).get_work_group_info(
                 cl.kernel_work_group_info.WORK_GROUP_SIZE, self.curDevice)
             # Total local work size = product of the tuple's dimensions.
             local_worksize = reduce(lambda x,y: x*y, args[1])
             print 'local size : ', local_worksize
             assert wgs >= local_worksize, 'Out of capability, please reduce the local work size for %s()'%(strMethodName)
         evt = methodCall(self.queue, *args)
         return evt
     return None
예제 #12
0
    def loadKernel(self):
        """Build the bilateral-filtering program and precompute its two
        normalized Gaussian weight buffers (spatial and range).

        Returns:
            bool: True on success, False when the program failed to build.
        """
        try:
            filename = "../OpenCL/BilateralFiltering.cl"
            # resource_string() returns bytes; decode to str for cl.Program,
            # consistent with the Mask3D loader in this file.
            self.program = cl.Program(
                self.clattr.context,
                pkg.resource_string(__name__, filename).decode()).build()
        except Exception:
            return False

        # Precompute the two Gaussian weight buffers on the device.
        self.spatialKernel = self.makeKernel(self.spatialRadius)
        self.rangeKernel = self.makeKernel(self.rangeRadius)
        self.kernel = cl.Kernel(self.program, 'BilateralFilter')
        return True
예제 #13
0
    def makeKernel(self, r):
        """Build a normalized Gaussian weight buffer of radius ``r`` on the GPU.

        Runs the 'makeKernel' OpenCL kernel to fill a device buffer with
        weights, sums them on the host, then runs 'normalizeKernel' so the
        weights sum to one.  Returns the device buffer holding the
        normalized weights.
        """

        radius = r + 1
        # Smaller work-groups on CPU devices; GPUs get 256-wide groups.
        minWorkingGroup = 256
        if 'CPU' in self.clattr.device.name: minWorkingGroup = 64

        bufferSize = radius**2 - 1
        localSize = min(self.clattr.device.max_work_group_size,
                        minWorkingGroup)
        # NOTE(review): globalSize is computed in BYTES (rounded element
        # count * sizeof(float32)) but is later reused as the ND-range
        # global work size — confirm this over-launch is intentional.
        globalSize = self.clattr.roundUp(localSize,
                                         bufferSize) * np.float32(0).nbytes

        buffer = cl.Buffer(self.clattr.context,
                           cl.mem_flags.READ_WRITE,
                           size=globalSize)
        kernel = cl.Kernel(self.program, 'makeKernel')
        kernel.set_args(np.float32(radius), buffer, np.int32(bufferSize))

        globalSize = [int(globalSize)]
        localSize = [int(localSize)]

        cl.enqueue_nd_range_kernel(self.clattr.queue, kernel, globalSize,
                                   localSize)
        # Read the raw weights back to the host to compute their sum.
        output = np.empty(bufferSize).astype(np.float32)
        cl.enqueue_copy(self.clattr.queue, output, buffer)
        self.clattr.queue.finish()

        total = np.float32(0)
        for i in range(bufferSize):
            total += output[i]

        # Second device pass: divide every weight by the total in place.
        normalizeKernel = cl.Kernel(self.program, 'normalizeKernel')
        normalizeKernel.set_args(np.float32(total), buffer,
                                 np.int32(bufferSize))
        cl.enqueue_nd_range_kernel(self.clattr.queue, normalizeKernel,
                                   globalSize, localSize)

        return buffer
예제 #14
0
    def init_hal(self, buffIn, buffOut):
        """Create the OpenCL context, queue, stage kernels and host-backed
        buffers for the FPGA scrambler pipeline."""
        # Device discovery: first platform, first device.
        platform = cl.get_platforms()[0]
        devices = platform.get_devices()
        device = devices[0]

        self.ocl_ctx = cl.Context(devices=devices)

        # Program the device from the precompiled xclbin binary.
        with open(self.xclbin, "rb") as binary_file:
            binary = binary_file.read()
        self.ocl_prg = cl.Program(self.ocl_ctx, devices, [binary])

        # Out-of-order command queue with profiling enabled.
        qprops = (cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
                  | cl.command_queue_properties.PROFILING_ENABLE)
        self.ocl_q = cl.CommandQueue(context=self.ocl_ctx,
                                     device=device,
                                     properties=qprops)

        # One kernel per RTL pipeline stage.
        self.ocl_krnl_scramb_stage = cl.Kernel(self.ocl_prg,
                                               "krnl_scrambler_stage_rtl")
        self.ocl_krnl_input_stage = cl.Kernel(self.ocl_prg,
                                              "krnl_input_stage_rtl")
        self.ocl_krnl_output_stage = cl.Kernel(self.ocl_prg,
                                               "krnl_output_stage_rtl")

        # Buffers wrap the caller's host arrays via USE_HOST_PTR.
        self.buffer_input = cl.Buffer(self.ocl_ctx,
                                      cl.mem_flags.USE_HOST_PTR
                                      | cl.mem_flags.READ_ONLY,
                                      size=0,
                                      hostbuf=buffIn)
        self.buffer_output = cl.Buffer(self.ocl_ctx,
                                       cl.mem_flags.USE_HOST_PTR
                                       | cl.mem_flags.WRITE_ONLY,
                                       size=0,
                                       hostbuf=buffOut)
예제 #15
0
    def __init__(self, logger, difficulty):
        """Set up the OpenCL mining context, queue and SHA-256 kernel.

        Args:
            logger: logger used for progress and error reporting.
            difficulty: mining difficulty value stored for later use.

        Any failure during setup is logged, not re-raised.
        """
        self.logger = logger
        try:
            # Let pyopencl pick (or prompt for) a context instead of
            # hard-coding the first GPU platform.
            self.context = cl.create_some_context()
            self.devices = self.context.get_info(cl.context_info.DEVICES)
            self.queue = cl.CommandQueue(
                self.context,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

            kernelFile = open('/tmp/chady256.cl', 'r')
            self.miner = cl.Program(self.context, kernelFile.read()).build()
            kernelFile.close()

            # Sum WORK_GROUP_SIZE over all devices; preferred_multiple ends
            # up holding the LAST device's value.  NOTE(review): summing
            # across devices looks unusual — confirm it is intentional.
            self.WORK_GROUP_SIZE = 0
            self.preferred_multiple = 0
            for device in self.devices:
                self.WORK_GROUP_SIZE += self.miner.sha256_crypt_kernel.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
                self.preferred_multiple = cl.Kernel(
                    self.miner, 'sha256_crypt_kernel').get_work_group_info(
                        cl.kernel_work_group_info.
                        PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device)

            self.logger.info('Best workgroup size :' +
                             str(self.WORK_GROUP_SIZE))
            self.logger.info('Preferred multiple: ' +
                             str(self.preferred_multiple))

            # data_info[0] holds the input length (76); nonce search
            # starts at zero.
            self.nounce_begin = 0
            self.data_info = np.zeros(1, np.uint32)
            self.data_info[0] = 76

            self.globalThreads = self.WORK_GROUP_SIZE * 1000
            self.localThreads = 1

            # One 76-byte block of input data per global thread.
            self.blocks = np.zeros(self.data_info[0] * self.globalThreads,
                                   np.uint8)

            self.difficulty = difficulty

            self.logger.info("HERE")

        except Exception as inst:
            # Setup failures are logged with full traceback, not re-raised.
            self.logger.exception("Init")
            self.logger.error(type(inst))
            self.logger.error((inst.args))
예제 #16
0
def setup_device():
    """Program the FPGA device with the xclbin kernel binary.

    Returns:
        list: [context, command queue, cl.mem_flags alias, "top" kernel].
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags

    # The target devices come from the second platform reported by OpenCL.
    dev = cl.get_platforms()[1].get_devices()
    # Close the binary file deterministically instead of leaking the handle.
    with open(xclbin_kernel, "rb") as f:
        binary = f.read()

    prg = cl.Program(ctx, dev, [binary])
    prg.build()
    print(dev)
    print("Device is programmed, testing...")

    krnl_vadd = cl.Kernel(prg, "top")

    return [ctx, queue, mf, krnl_vadd]
예제 #17
0
    def calculate(self, range_start, range_end):
        """Run the sampler for the problems in [range_start, range_end).

        Performs optional burn-in, then draws samples in batches into the
        host-backed samples buffer, and finally reads out all collected
        buffers for this problem range.
        """
        nmr_problems = range_end - range_start

        # Workgroup size follows the device's preferred work-group-size
        # multiple for the 'sample' kernel.
        workgroup_size = cl.Kernel(
            self._sampling_kernel, 'sample').get_work_group_info(
                cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                self._cl_environment.device)

        data_buffers, readout_items = self._get_buffers(workgroup_size)

        iteration_offset = self._mh_state.nmr_samples_drawn

        # Burn-in advances the chain without recording samples and returns
        # the updated iteration offset.
        if self._burn_length > 0:
            iteration_offset = self._enqueue_burnin(range_start, range_end,
                                                    workgroup_size,
                                                    data_buffers,
                                                    iteration_offset)

        # Samples land directly in host memory via USE_HOST_PTR.
        samples_buf = cl.Buffer(self._cl_run_context.context,
                                cl.mem_flags.WRITE_ONLY
                                | cl.mem_flags.USE_HOST_PTR,
                                hostbuf=self._samples)
        data_buffers.append(samples_buf)
        readout_items.append([samples_buf, self._samples])

        # Split the total iterations into batches capped per kernel launch.
        iteration_batch_sizes = self._get_sampling_batch_sizes(
            self._nmr_samples * (self._sample_intervals + 1),
            self._max_iterations_per_batch)

        for nmr_iterations in iteration_batch_sizes:
            # One workgroup per problem; global_offset selects this range.
            self._sampling_kernel.sample(
                self._cl_run_context.queue,
                (int(nmr_problems * workgroup_size), ),
                (int(workgroup_size), ),
                np.uint64(nmr_iterations),
                np.uint64(iteration_offset),
                *data_buffers,
                global_offset=(range_start * workgroup_size, ))

            iteration_offset += nmr_iterations

        for buffer, host_array in readout_items:
            self._enqueue_readout(buffer, host_array, range_start, range_end)
예제 #18
0
    def get_context(self):
        """Pick the best available OpenCL device and build the raytracer.

        Keeps the first device seen, upgrading only when a candidate beats
        the current pick on BOTH clock frequency and compute units, then
        creates the context, queue, program and kernel on it.

        Raises:
            EnvironmentError: when no OpenCL platform is available.
        """
        platforms = pyopencl.get_platforms()
        if not platforms:
            raise EnvironmentError(
                "No tienes plataformas OpenCL, intenta instalar los drivers.")

        self.device = None
        for plt in platforms:
            for d in plt.get_devices(pyopencl.device_type.ALL):
                if self.device is None:
                    self.device = d
                elif (d.max_clock_frequency > self.device.max_clock_frequency
                      and d.max_compute_units > self.device.max_compute_units):
                    self.device = d

        self.ctx = pyopencl.Context([self.device])
        self.queue = pyopencl.CommandQueue(self.ctx)
        # Close the kernel source file deterministically instead of leaking
        # the handle.
        with open("raytracer.cl", "r") as src:
            self.program = pyopencl.Program(self.ctx, src.read()).build()
        self.kernel = pyopencl.Kernel(self.program, "raytracer")
예제 #19
0
파일: utils.py 프로젝트: lxy528/oclGA
def calculate_estimated_kernel_usage(prog, ctx, kernel_names):
    """Print estimated resource usage for each named kernel in *prog*.

    Args:
        prog: a built pyopencl.Program.
        ctx: the pyopencl.Context (must hold exactly one device).
        kernel_names: iterable of kernel function names to inspect.
    """
    try:
        import pyopencl as cl
        from pyopencl import context_info as ci
        from pyopencl import kernel_work_group_info as kwgi
        devices = ctx.get_info(ci.DEVICES)
        assert len(devices) == 1, "Should only one device is used !"
        device = devices[0]
        for name in kernel_names:
            kerKer = cl.Kernel(prog, name)
            lm = kerKer.get_work_group_info(kwgi.LOCAL_MEM_SIZE, device)
            pm = kerKer.get_work_group_info(kwgi.PRIVATE_MEM_SIZE, device)
            cwgs = kerKer.get_work_group_info(kwgi.COMPILE_WORK_GROUP_SIZE,
                                              device)
            pwgsm = kerKer.get_work_group_info(
                kwgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device)
            print("[%s]\tEstimated usage : Local mem (%d)/ Private mem (%d)"\
                  "/ Compile WG size (%s)/ Preffered WG size multiple (%d)"\
                  %(name, lm, pm, str(cwgs), pwgsm))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate; real failures are reported with a traceback.
        import traceback
        traceback.print_exc()
예제 #20
0
def rtl_adder_run(xclbinpath):
    """Run the RTL adder pipeline on an FPGA and verify the result.

    Programs the device from *xclbinpath*, activates a DRM session, streams
    DATA_SIZE integers through the input/adder/output RTL kernels and
    compares the device output against the host-computed reference.

    Raises:
        RuntimeError: when the board probe fails or results mismatch.
    """
    global boardHandler

    source_input = np.arange(DATA_SIZE, dtype=np.uint32)
    source_sw_results = np.arange(INCR_VALUE,
                                  DATA_SIZE + INCR_VALUE,
                                  dtype=np.uint32)
    source_hw_results = np.zeros(DATA_SIZE, np.uint32)

    ##OPENCL HOST CODE AREA START

    # Get platform/device information
    clPlatform = cl.get_platforms()[0]
    clDevices = clPlatform.get_devices()
    clDevice = clDevices[0]

    ctx = cl.Context(devices=clDevices)

    with open(xclbinpath, "rb") as binary_file:
        binary = binary_file.read()

    prg = cl.Program(ctx, clDevices, [binary])

    # Init xclhal2 library
    if xclProbe() < 1:
        print("[ERROR] xclProbe failed ...")
        # The previous bare `raise` had no active exception and itself
        # raised RuntimeError; raise one explicitly with a message.
        raise RuntimeError("xclProbe failed")
    boardHandler = xclOpen(0, ctypes.c_char_p(b"xrt_logfile.log"),
                           xclVerbosityLevel.XCL_INFO)

    ##ACCELIZE DRMLIB CODE AREA START
    drm_manager = DrmManager(
        # Configuration files paths
        "./conf.json",
        "./cred.json",
        # Read/write register functions callbacks
        drm_read_callback,
        drm_write_callback,
    )
    drm_manager.activate()
    print(f"[DRMLIB] Session ID: {drm_manager.get('session_id')}")
    time.sleep(2)
    ##ACCELIZE DRMLIB CODE AREA STOP
    qprops = (cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE
              | cl.command_queue_properties.PROFILING_ENABLE)
    with cl.CommandQueue(context=ctx, device=clDevice, properties=qprops) as q:

        # Create Kernels
        krnl_adder_stage = cl.Kernel(prg, "krnl_adder_stage_rtl")
        krnl_input_stage = cl.Kernel(prg, "krnl_input_stage_rtl")
        krnl_output_stage = cl.Kernel(prg, "krnl_output_stage_rtl")

        # Create Buffers.  The output buffer must be device-writable
        # (WRITE_ONLY); it was previously created READ_ONLY.
        buffer_input = cl.Buffer(ctx,
                                 cl.mem_flags.USE_HOST_PTR
                                 | cl.mem_flags.READ_ONLY,
                                 size=0,
                                 hostbuf=source_input)
        buffer_output = cl.Buffer(ctx,
                                  cl.mem_flags.USE_HOST_PTR
                                  | cl.mem_flags.WRITE_ONLY,
                                  size=0,
                                  hostbuf=source_hw_results)

        # Set the Kernel Arguments
        npSize = np.int32(DATA_SIZE)
        npIncr = np.int32(INCR_VALUE)
        krnl_input_stage.set_args(buffer_input, npSize)
        krnl_adder_stage.set_args(npIncr, npSize)
        krnl_output_stage.set_args(buffer_output, npSize)

        # Copy input data to device global memory
        cl.enqueue_migrate_mem_objects(q, [buffer_input], flags=0)

        # Launch the Kernels
        cl.enqueue_nd_range_kernel(q, krnl_input_stage, [1], [1])
        cl.enqueue_nd_range_kernel(q, krnl_adder_stage, [1], [1])
        cl.enqueue_nd_range_kernel(q, krnl_output_stage, [1], [1])

        # Copy Result from Device Global Memory to Host Local Memory.
        # This must happen while the queue is alive; it was previously
        # enqueued AFTER the `with` block had released the queue.
        cl.enqueue_migrate_mem_objects(q, [buffer_output],
                                       flags=cl.mem_migration_flags.HOST)
        q.finish()
    ##OPENCL HOST CODE AREA STOP

    ##ACCELIZE DRMLIB CODE AREA START
    drm_manager.deactivate()
    ##ACCELIZE DRMLIB CODE AREA STOP

    # xclClose(boardHandler) intentionally skipped: the XRT Python binding
    # crashes in xclClose() at the time this script was written.

    diff = source_hw_results != source_sw_results
    if diff.any():
        # Report the first mismatching index (the old message referenced an
        # undefined variable `i`).
        i = int(np.flatnonzero(diff)[0])
        print(f"Error: Result mismatch i={i} "
              f"CPU={source_sw_results[i]} != "
              f"DEVICE={source_hw_results[i]}")
        raise RuntimeError("Result mismatch")

    print("TEST PASSED")
예제 #21
0
def render(numParticles, fireColors, dumpFrames=False, clDebug=False):
	"""Render a GPU particle system in an OpenCV window.

	Args:
		numParticles: number of particles to simulate.
		fireColors: if True, compile the kernel with the -DFIRE_COLORS palette.
		dumpFrames: if True, also write each frame as PNG into ./dump.
		clDebug: if True, show OpenCL compiler output.

	Runs until ESC is pressed in the preview window.
	"""

	# show output of OpenCL compiler
	if clDebug:
		os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
		
	# if frames should be dumped, create directory if necessary
	dumpDir = 'dump'
	if dumpFrames and not os.path.exists(dumpDir):
		os.mkdir(dumpDir)
		
	# setup OpenCL
	platforms = cl.get_platforms() # a platform corresponds to a driver (e.g. AMD)
	platform = platforms[0] # take first platform
	devices = platform.get_devices(cl.device_type.GPU) # get GPU devices of selected platform
	device = devices[0] # take first GPU
	context = cl.Context([device]) # put selected GPU into context object
	queue = cl.CommandQueue(context, device) # create command queue for selected GPU and context
	
	# setup buffer for particles
	sizeParticleStruct = 32 # sizeof(struct Particle)
	bufParticles = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=sizeParticleStruct*numParticles, hostbuf=None)
	
	# setup random values (for random speed and color)
	random.seed()
	randVals = np.array([random.random() - 0.5 for _ in range(2 * numParticles)], dtype=np.float32)
	bufRandVals = cl.Buffer(context, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=randVals)
	
	# setup output image
	windowSize = 480
	colorChannels = 4 # RGBA
	sizeofColorChannel = 4 # we need int32 to perform atomic operations in the kernel (multiple particles at same position)
	img = np.zeros([windowSize, windowSize, colorChannels], dtype=np.int32) # must be square image to ignore distortion
	imgShape = (windowSize, windowSize) # 2d shape of image
	imgBuf = cl.Buffer(context, cl.mem_flags.WRITE_ONLY, size=windowSize*windowSize*colorChannels*sizeofColorChannel)
	
	# setup kernels; the color palette is chosen at compile time via -DFIRE_COLORS
	compilerSettings = ('-DWINDOW_SIZE=%d'%(windowSize)) + ' ' + ('-DFIRE_COLORS' if fireColors else '')
	program = cl.Program(context, open('kernel.cl').read()).build(compilerSettings)
	initParticles = cl.Kernel(program, 'initParticles')
	updateParticles = cl.Kernel(program, 'updateParticles')
	clearCanvas = cl.Kernel(program, 'clearCanvas')
	drawParticles = cl.Kernel(program, 'drawParticles')
	saturate = cl.Kernel(program, 'saturate')
	drawEmitter = cl.Kernel(program, 'drawEmitter')
	
	# init particles (one work-item per particle)
	initParticles.set_arg(0, bufParticles)
	initParticles.set_arg(1, bufRandVals)
	cl.enqueue_nd_range_kernel(queue, initParticles, (numParticles,), None)
	
	# do some (invisible) iterations for smooth particle distribution
	for t in range(1000):
		updateParticles.set_arg(0, bufParticles)
		cl.enqueue_nd_range_kernel(queue, updateParticles, (numParticles,), None)
	
	# rendering loop
	ctr = 0
	while True:
		# clear canvas
		clearCanvas.set_arg(0, imgBuf)	
		cl.enqueue_nd_range_kernel(queue, clearCanvas, imgShape, None)
		
		# draw all particles
		drawParticles.set_arg(0, bufParticles)
		drawParticles.set_arg(1, imgBuf)
		cl.enqueue_nd_range_kernel(queue, drawParticles, (numParticles,), None)
		
		# saturate pixel values after additive particle drawing
		saturate.set_arg(0, imgBuf)
		cl.enqueue_nd_range_kernel(queue, saturate, imgShape, None)
		
		# draw emitter
		drawEmitter.set_arg(0, imgBuf)
		cl.enqueue_nd_range_kernel(queue, drawEmitter, (1,), None)
		
		# update particles
		updateParticles.set_arg(0, bufParticles)
		cl.enqueue_nd_range_kernel(queue, updateParticles, (numParticles,), None)
		
		# copy result from GPU (blocking, so the frame is complete)
		cl.enqueue_copy(queue, img, imgBuf, is_blocking=True)
		
		# show image (and dump if specified); drop the alpha channel
		imgU8 = img[:,:,0:3].astype(np.uint8)
		cv2.imshow("Particle system [press ESC to exit]", imgU8)
		if dumpFrames:
			ctr += 1
			cv2.imwrite('%s/%d.png'%(dumpDir, ctr), imgU8)
		
		# exit with ESC
		keyPressed = cv2.waitKey(10)
		if keyPressed == 27:
			break
예제 #22
0
import sys
import os
import numpy as np
import pyopencl as cl

# Usage: <script> platform_id device_id global_size local_size
if len(sys.argv) < 5:
    raise ValueError(sys.argv[0] +
                     " platform_id device_id global_size local_size")

platform_index = int(sys.argv[1])
device_index = int(sys.argv[2])
global_work_size = int(sys.argv[3])
local_work_size = int(sys.argv[4])

# Select the requested platform/device pair.
plat = cl.get_platforms()[platform_index]
print("Platform: " + plat.name)
dev = plat.get_devices()[device_index]
print("  -- Device: " + dev.name)

context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)

# Read the pre-compiled kernel binary; the context manager closes the
# file promptly (the original leaked the handle).
with open("hwv.bin", "rb") as binary_file:
    binary = binary_file.read()

# Build a program from a device binary: one binary per device in the list.
program = cl.Program(context, [dev], [binary])
program.build()
kernel = cl.Kernel(program, "hello_world")
cl.enqueue_nd_range_kernel(queue, kernel, [global_work_size],
                           [local_work_size])
queue.finish()
예제 #23
0
                        c_temp = c_temp * (a_temp/2.0); // times 1/2 my a
                        c[gid] = c_temp; // store result in global memory
                }
                """).build()

        global_size = (data_points, )
        local_size = (workers, )
        print("yo")
        #print(get_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE)

        #preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \
        #    cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \
        #    device)
        #preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, device)
        #preferred_multiple = cl.Kernel(prg, 'sum')
        x = cl.Kernel(prg, 'sum')
        preferred_multiple = 256

        print("yo2")

        print("Data points:", data_points)
        print("Workers:", workers)
        print("Preferred work group size multiple:", preferred_multiple)

        print(preferred_multiple)

        if (workers % preferred_multiple):
            print("Number of workers not a preferred multiple (%d*N)." \
                    % (preferred_multiple))
            print("Performance may be reduced.")
예제 #24
0
 def __init__(self, program: Program, name: str) -> None:
     """Wrap the raw pyopencl kernel called *name* taken from *program*."""
     raw_program = program._pyopencl_program
     self._pyopencl_kernel = pyopencl.Kernel(raw_program, name)
예제 #25
0
# Take the second platform and all of its devices; create a profiling
# command queue on the first device.
platform = cl.get_platforms()[1]
device = platform.get_devices()
context = cl.Context(device)
queue = cl.CommandQueue(context, device[0],
                        cl.command_queue_properties.PROFILING_ENABLE)

# Set up GPU program; the context manager closes the source file promptly
# (the original left it open until after kernel creation).
with open('sum.cl', 'r') as program_file:
    program_text = program_file.read()
program = cl.Program(context, program_text)
try:
    program.build()
except Exception:  # narrowed from bare except; still re-raised below
    print("Build log:")
    # BUG FIX: get_build_info expects a single device, but `device` is
    # the full device list returned by get_devices() -- index it.
    print(program.get_build_info(device[0], cl.program_build_info.LOG))
    raise
adder = cl.Kernel(program, 'add')
print("Kernel Name:")  # dropped stray Python-2 trailing comma
print(adder.get_info(cl.kernel_info.FUNCTION_NAME))

# Set up kernel data exchange pipeline.
size = 1000
a = np.ones(size, dtype=np.float32)
b = np.ones(size, dtype=np.float32) * 2
c = np.zeros_like(a)

# Device buffers sized to match the host arrays.
a_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)
b_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, b.nbytes)
c_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, c.nbytes)

cl.enqueue_copy(queue, a_buffer, a, is_blocking=True)
예제 #26
0
						key_temp = key[gid]; // my a element (by global ref)
						ciph_temp = ciph[gid]; // my b element (by global ref)
			
						for (r = 0; r < 1; r++)
						{
						ciph_temp = (ciph_temp>>1)^bit(ciph_temp,31)^bit(ciph_temp,15)^bit(key_temp,(15-r)&63)^bit(KeeLoq_NLF,g5(ciph_temp,0,8,19,25,30));
						}
						plain_temp = ciph_temp;
						plain[gid] = plain_temp; // store result in global memory
				}
				""").build()

        global_size = (data_points, )
        local_size = (workers, )
        preferred_multiple = cl.Kernel(prg, 'sum').get_work_group_info( \
         cl.kernel_work_group_info.PREFERRED_WORK_GROUP_SIZE_MULTIPLE, \
         device)
        '''
		print("Data points:", data_points)
		print("Workers:", workers)
		print("Preferred work group size multiple:", preferred_multiple)

		if (workers % preferred_multiple):
			print("Number of workers not a preferred multiple (%d*N)." \
					% (preferred_multiple))
			print("Performance may be reduced.")
		'''

        exec_evt = prg.sum(queue, global_size, local_size, key_buf, ciph_buf,
                           dest_buf)
        exec_evt.wait()
예제 #27
0
File: row_no_mat.py — Project: szhan/pyrqa
    def process_sub_matrix(self, *args, **kwargs):
        """Drain *sub_matrix_queue* and, for each sub matrix, run the
        'vertical' and diagonal OpenCL kernels on the given device,
        accumulating line frequency distributions and carryovers.

        Expected kwargs:
            device: key identifying the OpenCL device; also indexes the
                per-device context/queue/program containers on self.opencl.
            sub_matrix_queue: queue of sub-matrix work items; the loop
                exits when the queue raises Queue.Empty.
        """
        device = kwargs['device']
        sub_matrix_queue = kwargs['sub_matrix_queue']

        # Per-device OpenCL objects prepared elsewhere on self.opencl.
        context = self.opencl.contexts[device]
        command_queue = self.opencl.command_queues[device]
        program = self.opencl.programs[device]

        vertical_kernel = cl.Kernel(program, 'vertical')
        diagonal_kernel = cl.Kernel(program,
                                    self.settings.diagonal_kernel_name)

        while True:
            try:
                # Non-blocking get; Queue.Empty ends the worker loop.
                sub_matrix = sub_matrix_queue.get(False)

                # Event lists, used at the end to compute per-phase runtimes.
                transfer_from_device_events = []
                transfer_to_device_events = []
                create_matrix_events = []
                vertical_events = []
                diagonal_events = []

                # Vectors X
                # NOTE(review): enqueue_write_buffer/enqueue_read_buffer are
                # deprecated in newer PyOpenCL (use enqueue_copy) -- verify
                # against the pinned PyOpenCL version.
                vectors_x = self.get_vectors_x(sub_matrix)

                vectors_x_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_ONLY,
                    vectors_x.size * vectors_x.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            vectors_x_buffer,
                                            vectors_x,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Vectors Y
                vectors_y = self.get_vectors_y(sub_matrix)

                vectors_y_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_ONLY,
                    vectors_y.size * vectors_y.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            vectors_y_buffer,
                                            vectors_y,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Recurrence points (slice bounds are reused when reading back).
                recurrence_points, \
                    recurrence_points_start, \
                    recurrence_points_end = self.get_recurrence_points(sub_matrix)

                recurrence_points_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    recurrence_points.size * recurrence_points.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            recurrence_points_buffer,
                                            recurrence_points,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Vertical frequency distribution
                vertical_frequency_distribution = self.get_empty_local_frequency_distribution(
                )

                vertical_frequency_distribution_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    vertical_frequency_distribution.size *
                    vertical_frequency_distribution.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(
                        command_queue,
                        vertical_frequency_distribution_buffer,
                        vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # White vertical frequency distribution
                white_vertical_frequency_distribution = self.get_empty_local_frequency_distribution(
                )

                white_vertical_frequency_distribution_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    white_vertical_frequency_distribution.size *
                    white_vertical_frequency_distribution.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(
                        command_queue,
                        white_vertical_frequency_distribution_buffer,
                        white_vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # Diagonal frequency distribution
                diagonal_frequency_distribution = self.get_empty_local_frequency_distribution(
                )

                diagonal_frequency_distribution_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    diagonal_frequency_distribution.size *
                    diagonal_frequency_distribution.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(
                        command_queue,
                        diagonal_frequency_distribution_buffer,
                        diagonal_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                # Vertical carryover
                vertical_carryover, \
                    vertical_carryover_start,\
                    vertical_carryover_end = self.get_vertical_length_carryover(sub_matrix)

                vertical_carryover_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    vertical_carryover.size * vertical_carryover.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            vertical_carryover_buffer,
                                            vertical_carryover,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # White vertical carryover
                white_vertical_carryover, \
                    white_vertical_carryover_start,\
                    white_vertical_carryover_end = self.get_white_vertical_length_carryover(sub_matrix)

                white_vertical_carryover_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    white_vertical_carryover.size *
                    white_vertical_carryover.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            white_vertical_carryover_buffer,
                                            white_vertical_carryover,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Diagonal carryover
                diagonal_carryover, \
                    diagonal_carryover_start, \
                    diagonal_carryover_end = self.get_diagonal_length_carryover(sub_matrix)

                diagonal_carryover_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_WRITE,
                    diagonal_carryover.size * diagonal_carryover.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            diagonal_carryover_buffer,
                                            diagonal_carryover,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Block until all non-blocking host-to-device copies complete
                # before launching the kernels that consume the buffers.
                command_queue.finish()

                # Vertical kernel
                vertical_args = [
                    vectors_x_buffer, vectors_y_buffer,
                    np.uint32(sub_matrix.dim_x),
                    np.uint32(sub_matrix.dim_y),
                    np.uint32(self.settings.embedding_dimension),
                    np.float32(self.settings.neighbourhood.radius),
                    recurrence_points_buffer,
                    vertical_frequency_distribution_buffer,
                    vertical_carryover_buffer,
                    white_vertical_frequency_distribution_buffer,
                    white_vertical_carryover_buffer
                ]

                OpenCL.set_kernel_args(vertical_kernel, vertical_args)

                # Round dim_x up to the next multiple of the device's
                # maximum work-group size.
                global_work_size = [
                    int(sub_matrix.dim_x +
                        (device.max_work_group_size -
                         (sub_matrix.dim_x % device.max_work_group_size)))
                ]
                local_work_size = None

                vertical_events.append(
                    cl.enqueue_nd_range_kernel(command_queue, vertical_kernel,
                                               global_work_size,
                                               local_work_size))

                command_queue.finish()

                # Diagonal kernel: a symmetric matrix needs only dim_x
                # work-items; the asymmetric case covers every diagonal
                # (dim_x + dim_y - 1) and passes extra offset arguments.
                if self.settings.is_matrix_symmetric:
                    diagonal_args = [
                        vectors_x_buffer, vectors_y_buffer,
                        np.uint32(sub_matrix.dim_x),
                        np.uint32(sub_matrix.dim_y),
                        np.uint32(sub_matrix.start_x),
                        np.uint32(sub_matrix.start_y),
                        np.uint32(self.settings.embedding_dimension),
                        np.float32(self.settings.neighbourhood.radius),
                        np.uint32(self.settings.theiler_corrector),
                        np.uint32(self.get_diagonal_offset(sub_matrix)),
                        diagonal_frequency_distribution_buffer,
                        diagonal_carryover_buffer
                    ]

                    global_work_size = [
                        int(sub_matrix.dim_x +
                            (device.max_work_group_size -
                             (sub_matrix.dim_x % device.max_work_group_size)))
                    ]

                else:
                    diagonal_args = [
                        vectors_x_buffer, vectors_y_buffer,
                        np.uint32(sub_matrix.dim_x + sub_matrix.dim_y - 1),
                        np.uint32(sub_matrix.dim_y),
                        np.uint32(sub_matrix.start_x),
                        np.uint32(sub_matrix.start_y),
                        np.uint32(self.settings.embedding_dimension),
                        np.float32(self.settings.neighbourhood.radius),
                        np.uint32(self.settings.theiler_corrector),
                        diagonal_frequency_distribution_buffer,
                        diagonal_carryover_buffer
                    ]

                    global_work_size_x = sub_matrix.dim_x + sub_matrix.dim_y - 1
                    global_work_size = [
                        int(global_work_size_x + (
                            device.max_work_group_size -
                            (global_work_size_x % device.max_work_group_size)))
                    ]

                OpenCL.set_kernel_args(diagonal_kernel, diagonal_args)

                local_work_size = None

                diagonal_events.append(
                    cl.enqueue_nd_range_kernel(command_queue, diagonal_kernel,
                                               global_work_size,
                                               local_work_size))

                command_queue.finish()

                # Read buffer: copy results back into the shared host arrays
                # (slices) and the local distribution arrays.
                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        recurrence_points_buffer,
                        self.recurrence_points[
                            recurrence_points_start:recurrence_points_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        vertical_frequency_distribution_buffer,
                        vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        vertical_carryover_buffer,
                        self.vertical_length_carryover[
                            vertical_carryover_start:vertical_carryover_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        white_vertical_frequency_distribution_buffer,
                        white_vertical_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        white_vertical_carryover_buffer,
                        self.white_vertical_length_carryover[
                            white_vertical_carryover_start:
                            white_vertical_carryover_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        diagonal_frequency_distribution_buffer,
                        diagonal_frequency_distribution,
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(
                        command_queue,
                        diagonal_carryover_buffer,
                        self.diagonal_length_carryover[
                            diagonal_carryover_start:diagonal_carryover_end],
                        device_offset=0,
                        wait_for=None,
                        is_blocking=False))

                command_queue.finish()

                # Update frequency distributions (accumulate per device)
                self.threads_vertical_frequency_distribution[
                    device] += vertical_frequency_distribution
                self.threads_white_vertical_frequency_distribution[
                    device] += white_vertical_frequency_distribution
                self.threads_diagonal_frequency_distribution[
                    device] += diagonal_frequency_distribution

                # Get events runtimes
                runtimes = Runtimes()
                runtimes.transfer_to_device = self.opencl.convert_events_runtime(
                    transfer_to_device_events)
                runtimes.transfer_from_device = self.opencl.convert_events_runtime(
                    transfer_from_device_events)
                runtimes.create_matrix = self.opencl.convert_events_runtime(
                    create_matrix_events)
                runtimes.detect_vertical_lines = self.opencl.convert_events_runtime(
                    vertical_events)
                runtimes.detect_diagonal_lines = self.opencl.convert_events_runtime(
                    diagonal_events)

                self.threads_runtimes[device] += runtimes

            except Queue.Empty:
                break
예제 #28
0
    def __init__(self,
                 batchSize: int,
                 maxT: int,
                 maxC: int,
                 kernelVariant: int = 1,
                 enableGPUDebug: bool = False) -> None:
        """Set up the OpenCL context, buffers and kernels for decoding.

        Args:
            batchSize: number of batch elements.
            maxT: number of time-steps.
            maxC: number of characters.
            kernelVariant: 1 for the single-pass kernel, 2 for the
                two-pass (best path, then collapse) variant.
            enableGPUDebug: True to debug the kernel via CodeXL; forces
                compiler output and disables the PyOpenCL program cache.
        """

        # force rebuild of program such that GPU debugger can attach to kernel
        self.enableGPUDebug = enableGPUDebug
        if enableGPUDebug:
            os.environ['PYOPENCL_COMPILER_OUTPUT'] = '1'
            os.environ['PYOPENCL_NO_CACHE'] = '1'

        # consts
        self.batchSize = batchSize
        self.maxT = maxT
        self.maxC = maxC
        assert kernelVariant in [1, 2]
        self.kernelVariant = kernelVariant

        # platform, context, queue
        platforms = cl.get_platforms()
        assert platforms
        self.platform = platforms[0]  # take first platform
        devices = self.platform.get_devices(
            cl.device_type.GPU)  # get GPU devices
        assert devices
        self.device = devices[0]  # take first GPU
        # context contains the first GPU
        self.context = cl.Context([self.device])
        self.queue = cl.CommandQueue(self.context,
                                     self.device)  # command queue to first GPU

        # buffer: input holds batchSize * maxC * maxT float32 values;
        # result and temporary buffers hold one int32 per time-step.
        sizeOfFloat32 = 4
        batchBufSize = batchSize * maxC * maxT * sizeOfFloat32
        self.batchBuf = cl.Buffer(self.context,
                                  cl.mem_flags.READ_ONLY,
                                  size=batchBufSize,
                                  hostbuf=None)
        self.res = np.zeros([batchSize, maxT]).astype(np.int32)
        self.resBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY,
                                self.res.nbytes)
        self.tmpBuf = cl.Buffer(self.context, cl.mem_flags.WRITE_ONLY,
                                self.res.nbytes)

        # compile program and use defines for program-constants to avoid
        # passing private variables; STEP_BEGIN is maxT rounded up to the
        # next power of two.
        buildOptions = '-D STEP_BEGIN={} -D MAX_T={} -D MAX_C={}'.format(
            2**math.ceil(math.log2(maxT)), maxT, maxC)
        self.program = cl.Program(
            self.context,
            open('BestPathCL.cl').read()).build(buildOptions)

        # variant 1: single pass
        if kernelVariant == 1:
            self.kernel1 = cl.Kernel(self.program, 'bestPathAndCollapse')
            self.kernel1.set_arg(0, self.batchBuf)
            self.kernel1.set_arg(1, self.resBuf)

            # all time-steps must fit into a work-group
            assert maxT <= self.kernel1.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)

        # variant 2: two passes
        else:
            # kernel1: calculate best path
            self.kernel1 = cl.Kernel(self.program, 'bestPath')
            self.kernel1.set_arg(0, self.batchBuf)
            self.kernel1.set_arg(1, self.tmpBuf)

            # kernel2: collapse best path
            self.kernel2 = cl.Kernel(self.program, 'collapsePath')
            self.kernel2.set_arg(0, self.tmpBuf)
            self.kernel2.set_arg(1, self.resBuf)

            # all chars must fit into a work-group
            assert maxC <= self.kernel1.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE, self.device)
import numpy as np

if __name__ == "__main__":

    # Create context from devices in first accessible platform
    platform = cl.get_platforms()[0]
    devices = platform.get_devices()
    context = cl.Context(devices)

    # Create program from arith.cl file; the context manager closes the
    # source file promptly (the original left it open).
    with open('kernels/arith.cl', 'r') as program_file:
        program_text = program_file.read()
    program = cl.Program(context, program_text)

    # Build program and print log in the event of an error
    try:
        program.build()
    except Exception:  # narrowed from bare except; still re-raised below
        print("Build log:")
        print(program.get_build_info(devices[0], cl.program_build_info.LOG))
        raise

    # Create kernel from 'add' function
    add_kernel = cl.Kernel(program, 'add')

    # Create kernel from 'multiply' function (attribute-access shorthand)
    mult_kernel = program.multiply

    print("Kernel Name:")  # dropped stray Python-2 trailing comma
    print(mult_kernel.get_info(cl.kernel_info.FUNCTION_NAME))
예제 #30
0
File: column_byte.py — Project: szhan/pyrqa
    def process_sub_matrix(self, *args, **kwargs):
        """Drain *sub_matrix_queue* and, for each sub matrix, run the
        'create_matrix' OpenCL kernel on the given device and insert the
        computed sub matrix into the full recurrence matrix.

        Expected kwargs:
            device: key identifying the OpenCL device; also indexes the
                per-device context/queue/program containers on self.opencl.
            sub_matrix_queue: queue of sub-matrix work items; the loop
                exits when the queue raises Queue.Empty.
        """
        device = kwargs['device']
        sub_matrix_queue = kwargs['sub_matrix_queue']

        # Per-device OpenCL objects prepared elsewhere on self.opencl.
        context = self.opencl.contexts[device]
        command_queue = self.opencl.command_queues[device]
        program = self.opencl.programs[device]

        create_matrix_kernel = cl.Kernel(program, 'create_matrix')

        while True:
            try:
                # Non-blocking get; Queue.Empty ends the worker loop.
                sub_matrix = sub_matrix_queue.get(False)

                # Event lists, used at the end to compute per-phase runtimes.
                transfer_from_device_events = []
                transfer_to_device_events = []
                create_matrix_events = []

                # Time series X
                # NOTE(review): enqueue_write_buffer/enqueue_read_buffer are
                # deprecated in newer PyOpenCL (use enqueue_copy) -- verify
                # against the pinned PyOpenCL version.
                time_series_x = self.get_time_series_x(sub_matrix)
                time_series_x_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_ONLY,
                    time_series_x.size * time_series_x.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            time_series_x_buffer,
                                            time_series_x,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Time series Y
                time_series_y = self.get_time_series_y(sub_matrix)
                time_series_y_buffer = cl.Buffer(
                    context, cl.mem_flags.READ_ONLY,
                    time_series_y.size * time_series_y.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            time_series_y_buffer,
                                            time_series_y,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # Recurrence matrix

                matrix = self.get_recurrence_matrix(sub_matrix,
                                                    data_type=self.data_type)
                matrix_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                          matrix.size * matrix.itemsize)

                transfer_to_device_events.append(
                    cl.enqueue_write_buffer(command_queue,
                                            matrix_buffer,
                                            matrix,
                                            device_offset=0,
                                            wait_for=None,
                                            is_blocking=False))

                # matrix = np.zeros(1, dtype=self.data_type)
                # matrix_buffer = cl.Buffer(context, cl.mem_flags.READ_WRITE, int(self.get_matrix_size(sub_matrix, self.data_type)))
                # transfer_to_device_events.append( cl.enqueue_write_buffer(command_queue, matrix_buffer, matrix, device_offset=0, wait_for=None, is_blocking=False) )

                # Create matrix kernel
                create_matrix_args = [
                    time_series_x_buffer, time_series_y_buffer,
                    np.uint32(sub_matrix.dim_x),
                    np.uint32(self.settings.embedding_dimension),
                    np.uint32(self.settings.time_delay),
                    np.float32(self.settings.neighbourhood.radius),
                    matrix_buffer
                ]

                OpenCL.set_kernel_args(create_matrix_kernel,
                                       create_matrix_args)

                # 2D launch: x dimension is rounded up to the next multiple
                # of the device's maximum work-group size.
                global_work_size = [
                    int(sub_matrix.dim_x +
                        (device.max_work_group_size -
                         (sub_matrix.dim_x % device.max_work_group_size))),
                    int(sub_matrix.dim_y)
                ]
                local_work_size = None

                create_matrix_events.append(
                    cl.enqueue_nd_range_kernel(command_queue,
                                               create_matrix_kernel,
                                               global_work_size,
                                               local_work_size))

                # Block until the kernel has finished before reading back.
                command_queue.finish()

                # Read buffer
                transfer_from_device_events.append(
                    cl.enqueue_read_buffer(command_queue,
                                           matrix_buffer,
                                           matrix,
                                           device_offset=0,
                                           wait_for=None,
                                           is_blocking=False))

                command_queue.finish()

                # Insert in recurrence matrix
                self.insert_sub_matrix(sub_matrix, matrix)

                # Get events runtimes
                runtimes = Runtimes()
                runtimes.transfer_to_device = self.opencl.convert_events_runtime(
                    transfer_to_device_events)
                runtimes.transfer_from_device = self.opencl.convert_events_runtime(
                    transfer_from_device_events)
                runtimes.create_matrix = self.opencl.convert_events_runtime(
                    create_matrix_events)

                self.threads_runtimes[device] += runtimes

            except Queue.Empty:
                break