Example #1
    def vglClNdCopy(self, img_input, img_output):
        print("# Running vglClNdCopy")
        if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_input isn't."
            )
            exit()
        if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_output isn't."
            )
            exit()

        vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

        _program = self.cl_ctx.get_compiled_kernel("../CL_ND/vglClNdCopy.cl",
                                                   "vglClNdCopy")
        kernel_run = _program.vglClNdCopy

        kernel_run.set_arg(0, img_input.get_oclPtr())
        kernel_run.set_arg(1, img_output.get_oclPtr())

        cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                                   img_output.get_ipl().shape, None)

        vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
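The kernel launches on this page all follow the same low-level PyOpenCL pattern: compile a program, fetch a kernel object, bind its arguments with set_arg/set_args, enqueue it with cl.enqueue_nd_range_kernel(queue, kernel, global_size, local_size), and copy the result back to the host. The snippet below is a minimal, self-contained sketch of that pattern; it is not taken from any of the listed projects, and the kernel source and variable names are placeholders.

import numpy as np
import pyopencl as cl

# Minimal sketch of the pattern shared by the examples on this page.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

src = """
__kernel void copy_buf(__global const float *src, __global float *dst) {
    int gid = get_global_id(0);
    dst[gid] = src[gid];
}
"""
program = cl.Program(ctx, src).build()
kernel = program.copy_buf  # compiled kernel object

host_in = np.arange(16, dtype=np.float32)
host_out = np.empty_like(host_in)
mf = cl.mem_flags
buf_in = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=host_in)
buf_out = cl.Buffer(ctx, mf.WRITE_ONLY, host_out.nbytes)

kernel.set_arg(0, buf_in)   # bind arguments by index
kernel.set_arg(1, buf_out)
cl.enqueue_nd_range_kernel(queue, kernel, host_in.shape, None)  # global size (16,), local size left to the driver
cl.enqueue_copy(queue, host_out, buf_out)  # read the result back
queue.finish()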
Example #2
File: tools.py Project: hagisgit/SLIC
def max_length_real4(ipt):
     out = CLReal(len(ipt)) 
     kern = _lengthkern_real4.kern
     kern.set_arg(0, ipt._buffer)
     kern.set_arg(1, out._buffer)
     cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kern, (len(ipt),), None)
     return max_reduce(out)
Example #3
    def search(self, midstate):
        msg = flipendian32(midstate)

        for i in xrange(8):
            self.sha512_fill.set_arg(i, msg[i * 4:i * 4 + 4])
        self.sha512_fill.set_arg(8, self.hashes_buf)
        self.sha512_fill.set_arg(9, self.keyhash_buf)
        # t1 = time.time()
        cl.enqueue_nd_range_kernel(self.queue, self.sha512_fill,
                                   (HASHES_NUM, ), (self.sha512_fill_ws, ))
        self.queue.finish()
        # print "fill %f" % (time.time() - t1)

        output = bytearray(OUTPUT_SIZE)
        cl.enqueue_write_buffer(self.queue, self.output_buf, output)
        self.queue.finish()

        self.ksearch.set_arg(0, self.hashes_buf)
        self.ksearch.set_arg(1, self.keyhash_buf)
        self.ksearch.set_arg(2, self.output_buf)
        cl.enqueue_nd_range_kernel(self.queue, self.ksearch, (KEYS_NUM, ),
                                   (self.ksearch_ws, ))
        self.queue.finish()
        cl.enqueue_read_buffer(self.queue, self.output_buf, output)
        self.queue.finish()
        return str(output)
Example #4
File: cl2py_BIN.py Project: arturxz/TCC
    def vglClBinConway(self, img_input, img_output):

        vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

        _program = self.cl_ctx.get_compiled_kernel(
            "../CL_BIN/vglClBinConway.cl", "vglClBinConway")
        kernel_run = _program.vglClBinConway

        mobj_img_shape = img_input.getVglShape().get_asVglClShape_buffer()

        kernel_run.set_arg(0, img_input.get_oclPtr())
        kernel_run.set_arg(1, img_output.get_oclPtr())
        kernel_run.set_arg(2, mobj_img_shape)

        _worksize_0 = img_input.getWidthIn()
        if (img_input.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_input.getWidthStep()
        if (img_output.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_output.getWidthStep()

        worksize = (int(_worksize_0), img_input.getHeigthIn(),
                    img_input.getNFrames())

        cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run, worksize,
                                   None)
        #cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run, img_output.get_oclPtr().shape, None)

        vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #5
def do_opencl_pow(hash, target):
    global ctx, queue, program, gpus, hash_dt

    output = numpy.zeros(1, dtype=[("v", numpy.uint64, 1)])
    if ctx == False:
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order="C")
    data[0]["v"] = ("0000000000000000" + hash).decode("hex")
    data[0]["target"] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    start = time.time()
    progress = 0
    globamt = worksize * 2000

    while output[0][0] == 0:
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += globamt
        sofar = time.time() - start
    # 		logger.debug("Working for %.3fs, %.2f Mh/s", sofar, (progress / sofar) / 1000000)
    taken = time.time() - start
    # 	logger.debug("Took %d tries.", progress)
    return output[0][0]
Example #6
def do_opencl_pow(hash, target):
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if (len(enabledGpus) == 0):
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order='C')
    data[0]['v'] = ("0000000000000000" + hash).decode("hex")
    data[0]['target'] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    start = time.time()
    progress = 0
    globamt = worksize*2000

    while output[0][0] == 0 and shutdown == 0:
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        cl.enqueue_read_buffer(queue, dest_buf, output)
        queue.finish()
        progress += globamt
        sofar = time.time() - start
#       logger.debug("Working for %.3fs, %.2f Mh/s", sofar, (progress / sofar) / 1000000)
    if shutdown != 0:
        raise Exception ("Interrupted")
    taken = time.time() - start
#   logger.debug("Took %d tries.", progress)
    return output[0][0]
Example #7
def vglClNdCopy(img_input, img_output):

    if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
        print(
            "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_input isn't."
        )
        exit(1)

    if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
        print(
            "vglClNdCopy: Error: this function supports only OpenCL data as buffer and img_output isn't."
        )
        exit(1)

    vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

    _program = vl.get_ocl_context().get_compiled_kernel(
        "CL_ND/vglClNdCopy.cl", "vglClNdCopy")
    _kernel = _program.vglClNdCopy

    _kernel.set_arg(0, img_input.get_oclPtr())
    _kernel.set_arg(1, img_output.get_oclPtr())

    # ENQUEUES THE KERNEL FOR EXECUTION. THE CALL RETURNS WITHOUT WAITING FOR COMPLETION.
    cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, _kernel,
                               img_input.get_ipl().shape, None)

    vl.vglSetContext(img_input, vl.VGL_CL_CONTEXT())

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #8
def do_opencl_pow(hash_, target):
    """Perform PoW using OpenCL"""
    output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
    if not enabledGpus:
        return output[0][0]

    data = numpy.zeros(1, dtype=hash_dt, order='C')
    data[0]['v'] = ("0000000000000000" + hash_).decode("hex")
    data[0]['target'] = target

    hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
    dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)

    kernel = program.kernel_sha512
    worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, enabledGpus[0])

    kernel.set_arg(0, hash_buf)
    kernel.set_arg(1, dest_buf)

    progress = 0
    globamt = worksize * 2000

    while output[0][0] == 0 and shutdown == 0:
        kernel.set_arg(2, pack("<Q", progress))
        cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
        try:
            cl.enqueue_read_buffer(queue, dest_buf, output)
        except AttributeError:
            cl.enqueue_copy(queue, output, dest_buf)
        queue.finish()
        progress += globamt
    if shutdown != 0:
        raise Exception("Interrupted")
#   logger.debug("Took %d tries.", progress)
    return output[0][0]
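Example #8 differs from Examples #5 and #6 mainly in the read-back step: cl.enqueue_read_buffer is not available in newer PyOpenCL releases, so the code falls back to cl.enqueue_copy when the old name is missing. A short sketch of that fallback in isolation, reusing the names from Example #8:

# queue, dest_buf and output are assumed to exist as in Example #8.
try:
    cl.enqueue_read_buffer(queue, dest_buf, output)   # older PyOpenCL API
except AttributeError:
    cl.enqueue_copy(queue, output, dest_buf)          # current API: (queue, dest_host_array, src_buffer)
queue.finish()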
Example #9
    def runFilter(self):

        if self.atts.height == 1 and self.atts.slices == 1:
            mid = 1
        elif self.atts.slices == 1:
            mid = 4
        else:
            mid = 13

        globalSize = [0, 0]
        localSize = [0, 0]
        self.clattr.computeWorkingGroupSize(
            localSize, globalSize, [self.atts.width, self.atts.height, 1])

        try:
            # set up parameters
            self.kernel.set_args(self.clattr.inputBuffer,
                                 self.clattr.outputBuffer,
                                 np.int32(self.atts.width),
                                 np.int32(self.atts.height),
                                 np.int32(self.clattr.maxSliceCount),
                                 np.int32(mid))

            # execute kernel
            cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel,
                                       globalSize, localSize)

        except Exception as e:
            raise e

        # write results
        cl.enqueue_copy(self.clattr.queue, self.clattr.inputBuffer,
                        self.clattr.outputBuffer)
        self.clattr.queue.finish()
        return True
Example #10
    def runFilter(self):

        globalSize = [0, 0]
        localSize = [0, 0]
        self.clattr.computeWorkingGroupSize(
            localSize, globalSize, [self.atts.width, self.atts.height, 1])

        try:
            self.kernel.set_args(
                self.clattr.inputBuffer, self.clattr.outputBuffer,
                np.int32(self.atts.width), np.int32(self.atts.height),
                np.int32(self.clattr.maxSliceCount + self.getInfo().overlapZ),
                self.spatialKernel, np.int32((self.spatialRadius + 1) * 2 - 1),
                self.rangeKernel, np.int32((self.rangeRadius + 1) * 2 - 1))

            cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel,
                                       globalSize, localSize)

        except Exception as e:
            raise e

        # write results
        cl.enqueue_copy(self.clattr.queue, self.clattr.inputBuffer,
                        self.clattr.outputBuffer)
        self.clattr.queue.finish()
        return True
Example #11
File: filter.py Project: Kobtul/documents
 def prefixSumUp(self, e, data, ndata, data2, ndata2, events):
     import numpy as np
     import pyopencl as cl
     mf = cl.mem_flags
     
     if not isinstance(data, cl.Buffer):
         data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)
     else:
         data_buf = data
     
     if not isinstance(data2, cl.Buffer):
         data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data2)
     else:
         data2_buf = data2
             
     kernel = self.prg.prefixSumUp
     kernel.set_args(data_buf, np.uint64(ndata), data2_buf, np.uint64(ndata2))
     
     global_dims = self.get_global(self.get_grid_dims(ndata))
     
     print "prefixSumUp"
     if e is None:
         e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims), )
     else:
         e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e), )
     events += e
     
     return (e, data_buf, data2_buf)
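Example #11 (like #16, #32, #39 and #41 from the same project) chains kernels through the events returned by enqueue_nd_range_kernel instead of calling queue.finish() between launches. A minimal sketch of that idea, with kernel_a, kernel_b and the work sizes as placeholder names:

# Each enqueue call returns a cl.Event; passing earlier events via wait_for=
# makes the second launch wait for the first without blocking the host.
evt_a = cl.enqueue_nd_range_kernel(queue, kernel_a, global_dims, local_dims)
evt_b = cl.enqueue_nd_range_kernel(queue, kernel_b, global_dims, local_dims,
                                   wait_for=[evt_a])
evt_b.wait()  # block the host only when the result is actually needed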
Example #12
    def compute(self, image, num_bins):
        width, height = np.shape(image)
        numpixels = width * height

        image = np.reshape(image, (numpixels, )).astype(np.float32)
        result = np.zeros((numpixels * num_bins, ), dtype=np.float32)

        mf = cl.mem_flags
        self.buf_image = cl.Buffer(self.context,
                                   mf.READ_ONLY | mf.COPY_HOST_PTR,
                                   hostbuf=image)
        self.output_buf = cl.Buffer(self.context, mf.READ_WRITE, result.nbytes)

        kernel = self.program.iif_binid
        kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.ubyte] +
                                     [None] * 2)
        kernel.set_arg(0, np.uintc(width))
        kernel.set_arg(1, np.uintc(height))
        kernel.set_arg(2, np.ubyte(num_bins))
        kernel.set_arg(3, self.buf_image)
        kernel.set_arg(4, self.output_buf)

        cl.enqueue_nd_range_kernel(self.queue, kernel, image.shape,
                                   None).wait()

        cl.enqueue_read_buffer(self.queue, self.output_buf, result).wait()
        return np.reshape(result, (width, height, num_bins)).astype(np.float32)
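Examples #12 and #13 call kernel.set_scalar_arg_dtypes before binding arguments; the list tells PyOpenCL the C type of each scalar argument, with None marking buffer arguments. Once declared, the kernel can also be launched through its call interface with plain Python numbers. A hedged sketch reusing Example #12's names:

# Sketch: the __call__ form is kernel(queue, global_size, local_size, *args);
# with the scalar dtypes declared, plain Python scalars are converted for us.
kernel = self.program.iif_binid
kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.ubyte, None, None])
kernel(self.queue, image.shape, None,
       width, height, num_bins, self.buf_image, self.output_buf)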
Example #13
    def compute(self, floatimage, histogram, k):
        width, height, nbins = np.shape(histogram)
        numpixels = width * height

        image_linear = np.reshape(floatimage, (numpixels, )).astype(np.float32)
        histogram_linear = np.reshape(
            histogram, (np.size(histogram), )).astype(np.float32)
        transform = np.zeros_like(image_linear).astype(np.float32)

        mf = cl.mem_flags
        self.buf_image = cl.Buffer(self.context,
                                   mf.READ_ONLY | mf.COPY_HOST_PTR,
                                   hostbuf=image_linear)
        self.buf_histogram = cl.Buffer(self.context,
                                       mf.READ_ONLY | mf.COPY_HOST_PTR,
                                       hostbuf=histogram_linear)
        self.output_buf = cl.Buffer(self.context, mf.READ_WRITE,
                                    transform.nbytes)

        kernel = self.program.IIF
        kernel.set_scalar_arg_dtypes([np.uintc, np.uintc, np.float32] +
                                     [None] * 3)
        kernel.set_arg(0, np.uintc(width))
        kernel.set_arg(1, np.uintc(height))
        kernel.set_arg(2, np.float32(k))
        kernel.set_arg(3, self.buf_image)
        kernel.set_arg(4, self.buf_histogram)
        kernel.set_arg(5, self.output_buf)

        cl.enqueue_nd_range_kernel(self.queue, kernel, image_linear.shape,
                                   None).wait()

        cl.enqueue_read_buffer(self.queue, self.output_buf, transform).wait()
        return np.reshape(transform, (width, height)).astype(np.float64)
Example #14
File: opencl.py Project: GitPaean/PyOP2
 def __call__(self, thread_count, work_group_size, *args):
     fun = self.compile()
     for i, arg in enumerate(args):
         fun.set_arg(i, arg)
     with timed_region("ParLoop kernel"):
         cl.enqueue_nd_range_kernel(_queue, fun, (thread_count,),
                                    (work_group_size,), g_times_l=False).wait()
Example #15
def applyMorphOp(imgIn, op):
	"apply morphological operation to image using GPU"
	
	# (1) setup OpenCL
	platforms = cl.get_platforms() # a platform corresponds to a driver (e.g. AMD)
	platform = platforms[0] # take first platform
	devices = platform.get_devices(cl.device_type.GPU) # get GPU devices of selected platform
	device = devices[0] # take first GPU
	context = cl.Context([device]) # put selected GPU into context object
	queue = cl.CommandQueue(context, device) # create command queue for selected GPU and context

	# (2) get shape of input image, allocate memory for output to which result can be copied to
	shape = imgIn.T.shape
	imgOut = np.empty_like(imgIn)	
	
	# (2) create image buffers which hold images for OpenCL
	imgInBuf = cl.Image(context, cl.mem_flags.READ_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape) # holds a gray-valued image of given shape
	imgOutBuf = cl.Image(context, cl.mem_flags.WRITE_ONLY, cl.ImageFormat(cl.channel_order.LUMINANCE, cl.channel_type.UNORM_INT8), shape=shape) # placeholder for gray-valued image of given shape
	
	# (3) load and compile OpenCL program
	program = cl.Program(context, open('Erosion_Dilation.cl').read()).build()

	# (3) from OpenCL program, get kernel object and set arguments (input image, operation type, output image)
	kernel = cl.Kernel(program, 'morphOpKernel') # name of function according to kernel.py
	kernel.set_arg(0, imgInBuf) # input image buffer
	kernel.set_arg(1, np.uint32(op)) # operation type passed as an integer value (dilate=0, erode=1)
	kernel.set_arg(2, imgOutBuf) # output image buffer
	
	# (4) copy image to device, execute kernel, copy data back
	cl.enqueue_copy(queue, imgInBuf, imgIn, origin=(0, 0), region=shape, is_blocking=False) # copy image from CPU to GPU
	cl.enqueue_nd_range_kernel(queue, kernel, shape, None) # execute kernel, work is distributed across shape[0]*shape[1] work-items (one work-item per pixel of the image)
	cl.enqueue_copy(queue, imgOut, imgOutBuf, origin=(0, 0), region=shape, is_blocking=True) # wait until finished copying resulting image back from GPU to CPU
	
	return imgOut
Example #16
File: filter.py Project: Kobtul/documents
 def filterPrepare(self, e, data, keys, ndata, events):
     import numpy as np
     import pyopencl as cl
     mf = cl.mem_flags
     
     ndata = data.size
     if keys.size != ndata: raise Exception()
     
     filtbytes = np.bool8(False).nbytes * ndata
     
     if not isinstance(data, cl.Buffer):
         data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf= data)
     else:
         data_buf = data
     
     if not isinstance(keys, cl.Buffer):
         keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf= keys)
     else:
         keys_buf = keys
     
     filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filtbytes)
     
     kernel = self.prg.filterPrepare
     kernel.set_args(data_buf, keys_buf, np.uint64(ndata), np.uint8(33), np.uint8(66), filt_buf)
     global_dims = self.get_global(self.get_grid_dims(ndata))
     
     print "filterPrepare"
     if e is None:
         e  = [ cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims), ]
     else:
         e  = [ cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e), ]
     events += e
     
     return (e, data_buf, keys_buf, filt_buf)
Example #17
    def run_kernel(self, kernel, grid_size, stream=None):
        global_size = []
        for i, dim in enumerate(grid_size):
            global_size.append(dim * kernel.block[i])

        cl.enqueue_nd_range_kernel(self.default_queue, kernel, global_size,
                                   kernel.block[0:len(global_size)])
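Example #17 computes the global size by multiplying each grid dimension by the kernel's block size. PyOpenCL can do that multiplication itself through the g_times_l flag (passed as False in Examples #14 and #19), so, assuming kernel.block matches the grid's dimensionality, the launch above could equivalently be sketched as:

# Sketch: with g_times_l=True the global size is multiplied by the local size,
# so grid_size can be passed directly instead of the precomputed global_size.
cl.enqueue_nd_range_kernel(self.default_queue, kernel,
                           grid_size, kernel.block[0:len(grid_size)],
                           g_times_l=True)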
Example #18
 def run(self):
     cl.enqueue_nd_range_kernel(
         self.queue,
         self.kernel,
         self.global_size,
         self.local_size,
     ).wait()
Example #19
 def __call__(self, thread_count, work_group_size, *args):
     fun = self.compile()
     for i, arg in enumerate(args):
         fun.set_arg(i, arg)
     with timed_region("ParLoopCKernel"):
         cl.enqueue_nd_range_kernel(_queue, fun, (thread_count,),
                                    (work_group_size,), g_times_l=False).wait()
Example #20
def do_opencl_pow(hash, target):
	output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
	if (ctx == False):
		return output[0][0]
	
	data = numpy.zeros(1, dtype=hash_dt, order='C')
	data[0]['v'] = ("0000000000000000" + hash).decode("hex")
	data[0]['target'] = target
	
	hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
	dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)
	
	kernel = program.kernel_sha512
	worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, cl.get_platforms()[0].get_devices()[1])

	kernel.set_arg(0, hash_buf)
	kernel.set_arg(1, dest_buf)

	start = time.time()
	progress = 0
	globamt = worksize*2000

	while output[0][0] == 0:
		kernel.set_arg(2, pack("<Q", progress))
		cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
		cl.enqueue_read_buffer(queue, dest_buf, output)
		queue.finish()
		progress += globamt
		sofar = time.time() - start
		print sofar, progress / sofar, "hashes/sec"
	taken = time.time() - start
	print progress, taken
	return output[0][0]
Example #21
def do_opencl_pow(hash, target):
	output = numpy.zeros(1, dtype=[('v', numpy.uint64, 1)])
	if (ctx == False):
		return output[0][0]
	
	data = numpy.zeros(1, dtype=hash_dt, order='C')
	data[0]['v'] = ("0000000000000000" + hash).decode("hex")
	data[0]['target'] = target
	
	hash_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR, hostbuf=data)
	dest_buf = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, output.nbytes)
	
	kernel = program.kernel_sha512
	worksize = kernel.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, gpus[0])

	kernel.set_arg(0, hash_buf)
	kernel.set_arg(1, dest_buf)

	start = time.time()
	progress = 0
	globamt = worksize*2000

	while output[0][0] == 0:
		kernel.set_arg(2, pack("<Q", progress))
		cl.enqueue_nd_range_kernel(queue, kernel, (globamt,), (worksize,))
		cl.enqueue_read_buffer(queue, dest_buf, output)
		queue.finish()
		progress += globamt
		sofar = time.time() - start
		print sofar, progress / sofar, "hashes/sec"
	taken = time.time() - start
	print progress, taken
	return output[0][0]
Example #22
    def runFilter(self):
        mask = self.atts.getMaskImages(self.mask, self.L)[0]

        if self.atts.width*self.atts.height*self.atts.slices != np.product(mask.shape):
            print("Mask dimensions not equal to original image's")
            return False

        globalSize = [0]
        localSize = [0]

        self.clattr.computeWorkingGroupSize(localSize, globalSize, [self.atts.width, self.atts.height,
                                                self.clattr.maxSliceCount + self.atts.overlap[self.index]])
        self.maskBuffer = self.atts.getStructElement(self.clattr.context, self.clattr.queue, mask, globalSize[0])

        try:
            self.kernel.set_args(self.clattr.inputBuffer, self.maskBuffer, self.clattr.outputBuffer,
                                 np.int32(self.atts.width), np.int32(self.atts.height),
                                 np.int32(self.clattr.maxSliceCount + self.atts.overlap[self.index]))

            cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel, globalSize, localSize)

        except Exception as e:
            raise e

        # write results
        cl.enqueue_copy(self.clattr.queue, self.clattr.inputBuffer, self.clattr.outputBuffer)
        self.clattr.queue.finish()

        return True
Example #23
def vglCl3dThreshold(img_input, img_output, thresh, top=1.0):
	print("# Running vglCl3dThreshold")
	vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
	vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

	if( not isinstance(thresh, np.float32) ):
		print("vglCl3dThreshold: Warning: thresh not np.float32! Trying to convert...")
		try:
			thresh = np.float32(thresh)
		except Exception as e:
			print("vglCl3dThreshold: Error!! Impossible to convert thresh as a np.float32 object.")
			print(str(e))
			exit()
		
	if( not isinstance(top, np.float32) ):
		print("vglCl3dThreshold: Warning: top not np.float32! Trying to convert...")
		try:
			top = np.float32(top)
		except Exception as e:
			print("vglCl3dThreshold: Error!! Impossible to convert top as a np.float32 object.")
			print(str(e))
			exit()
		
	_program = vl.get_ocl_context().get_compiled_kernel("../CL/vglCl3dThreshold.cl", "vglCl3dThreshold")
	kernel_run = _program.vglCl3dThreshold

	kernel_run.set_arg(0, img_input.get_oclPtr())
	kernel_run.set_arg(1, img_output.get_oclPtr())
	kernel_run.set_arg(2, thresh)
	kernel_run.set_arg(3, top)
			
	cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, kernel_run, img_output.get_oclPtr().shape, None)

	vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #24
def vglClNdThreshold(img_input, img_output, thresh, top=255):

    if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
        print(
            "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_input isn't."
        )
        exit(1)

    if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
        print(
            "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_output isn't."
        )
        exit(1)

    vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())
    # EVALUATING IF thresh IS IN CORRECT TYPE
    if (not isinstance(thresh, np.uint8)):
        print(
            "vglClNdThreshold: Warning: thresh not np.uint8! Trying to convert..."
        )
        try:
            thresh = np.uint8(thresh)
        except Exception as e:
            print(
                "vglClNdThreshold: Error!! Impossible to convert thresh as a np.uint8 object."
            )
            print(str(e))
            exit()
    # EVALUATING IF top IS IN CORRECT TYPE
    if (not isinstance(top, np.uint8)):
        print(
            "vglClNdThreshold: Warning: top not np.uint8! Trying to convert..."
        )
        try:
            top = np.uint8(top)
        except Exception as e:
            print(
                "vglClNdThreshold: Error!! Impossible to convert top as a np.uint8 object."
            )
            print(str(e))
            exit()

    _program = vl.get_ocl_context().get_compiled_kernel(
        "CL_ND/vglClNdThreshold.cl", "vglClNdThreshold")
    _kernel = _program.vglClNdThreshold

    _kernel.set_arg(0, img_input.get_oclPtr())
    _kernel.set_arg(1, img_output.get_oclPtr())
    _kernel.set_arg(2, thresh)
    _kernel.set_arg(3, top)

    # ENQUEUES THE KERNEL FOR EXECUTION. THE CALL RETURNS WITHOUT WAITING FOR COMPLETION.
    cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, _kernel,
                               img_input.get_ipl().shape, None)

    vl.vglSetContext(img_input, vl.VGL_CL_CONTEXT())

    vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #25
    def vglClNdBinThreshold(self, img_input, img_output, thresh):

        if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdBinThreshold: Error: this function supports only OpenCL data as buffer and img_input isn't."
            )
            exit()
        if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdBinThreshold: Error: this function supports only OpenCL data as buffer and img_output isn't."
            )
            exit()

        if (not isinstance(thresh, np.uint8)):
            print(
                "vglClNdBinThreshold: Warning: thresh not np.uint8! Trying to convert..."
            )
            try:
                thresh = np.uint8(thresh)
            except Exception as e:
                print(
                    "vglClNdBinThreshold: Error!! Impossible to convert thresh as a np.uint8 object."
                )
                print(str(e))
                exit()

        vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

        _program = self.cl_ctx.get_compiled_kernel(
            "../CL_BIN/vglClNdBinThreshold.cl", "vglClNdBinThreshold")
        kernel_run = _program.vglClNdBinThreshold

        mobj_img_shape_input = img_input.getVglShape().get_asVglClShape_buffer(
        )
        mobj_img_shape_output = img_output.getVglShape(
        ).get_asVglClShape_buffer()

        kernel_run.set_arg(0, img_input.get_oclPtr())
        kernel_run.set_arg(1, img_output.get_oclPtr())
        kernel_run.set_arg(2, thresh)
        kernel_run.set_arg(3, mobj_img_shape_input)
        kernel_run.set_arg(4, mobj_img_shape_output)

        _worksize_0 = img_input.getWidthIn()
        if (img_input.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_input.getWidthStep()
        if (img_output.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_output.getWidthStep()

        worksize = (int(_worksize_0), img_input.getHeigthIn(),
                    img_input.getNFrames())

        # ENQUEUEING KERNEL EXECUTION
        #cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run, worksize, None)
        cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                                   img_output.ipl.shape, None)

        vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #26
File: QclKernel.py Project: hagisgit/qcl
 def exec_lsz_safe(self, localsize):
     """execute the kernel with a specific localsize.
     Safe also for kernels with local variables"""
     oldloc = int(self._localsize)
     self.localsize = localsize
     cl.enqueue_nd_range_kernel(self._solverobj.clqueue, self._clkernel, (self.globalsize,), (self.localsize,))
     self._solverobj.clqueue.finish()
     self.localsize = oldloc 
Example #27
def Difference(img1, img2, threshold):
    img1 = np.array(img1).astype('uint8')
    img2 = np.array(img2).astype('uint8')
    platforms = cl.get_platforms()
    platform = platforms[0]
    devices = platform.get_devices(cl.device_type.GPU)
    device = devices[0]
    context = cl.Context([device])
    queue = cl.CommandQueue(context, device)

    shape = img1.T.shape
    result = np.empty_like(img1)

    imgInBuf1 = cl.Image(context,
                         cl.mem_flags.READ_ONLY,
                         cl.ImageFormat(cl.channel_order.LUMINANCE,
                                        cl.channel_type.UNORM_INT8),
                         shape=shape)
    imgInBuf2 = cl.Image(context,
                         cl.mem_flags.READ_ONLY,
                         cl.ImageFormat(cl.channel_order.LUMINANCE,
                                        cl.channel_type.UNORM_INT8),
                         shape=shape)
    imgOutBuf = cl.Image(context,
                         cl.mem_flags.WRITE_ONLY,
                         cl.ImageFormat(cl.channel_order.LUMINANCE,
                                        cl.channel_type.UNORM_INT8),
                         shape=shape)

    program = cl.Program(context, open('Difference.cl').read()).build()

    kernel = cl.Kernel(program, 'Difference')
    kernel.set_arg(0, imgInBuf1)
    kernel.set_arg(1, imgInBuf2)
    kernel.set_arg(2, imgOutBuf)
    kernel.set_arg(3, np.float32(threshold))

    cl.enqueue_copy(queue,
                    imgInBuf1,
                    img1,
                    origin=(0, 0),
                    region=shape,
                    is_blocking=False)
    cl.enqueue_copy(queue,
                    imgInBuf2,
                    img2,
                    origin=(0, 0),
                    region=shape,
                    is_blocking=False)
    cl.enqueue_nd_range_kernel(queue, kernel, shape, None)
    cl.enqueue_copy(queue,
                    result,
                    imgOutBuf,
                    origin=(0, 0),
                    region=shape,
                    is_blocking=True)

    return result
Example #28
 def futhark_main(self, screenX_700, screenY_701, depth_702, xmin_703,
                  ymin_704, xmax_705, ymax_706):
     res_707 = (xmax_705 - xmin_703)
     res_708 = (ymax_706 - ymin_704)
     y_711 = sitofp_i32_f32(screenX_700)
     y_712 = sitofp_i32_f32(screenY_701)
     x_713 = slt32(np.int32(0), depth_702)
     bytes_902 = (np.int32(4) * screenY_701)
     mem_903 = cl.Buffer(
         self.ctx, cl.mem_flags.READ_WRITE,
         long(
             long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
     mem_905 = cl.Buffer(
         self.ctx, cl.mem_flags.READ_WRITE,
         long(
             long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
     group_size_911 = np.int32(512)
     num_groups_912 = squot32(
         ((screenY_701 + group_size_911) - np.int32(1)), group_size_911)
     if ((np.int32(1) * (num_groups_912 * group_size_911)) != np.int32(0)):
         self.map_kernel_894_var.set_args(np.float32(ymin_704),
                                          np.float32(y_712),
                                          np.float32(res_708),
                                          np.int32(screenY_701), mem_903,
                                          mem_905)
         cl.enqueue_nd_range_kernel(
             self.queue, self.map_kernel_894_var, (long(
                 (num_groups_912 * group_size_911)), ),
             (long(group_size_911), ))
         if synchronous:
             self.queue.finish()
     nesting_size_844 = (screenX_700 * screenY_701)
     bytes_906 = (bytes_902 * screenX_700)
     mem_908 = cl.Buffer(
         self.ctx, cl.mem_flags.READ_WRITE,
         long(
             long(bytes_906) if (bytes_906 > np.int32(0)) else np.int32(1)))
     group_size_917 = np.int32(512)
     num_groups_918 = squot32(
         (((screenY_701 * screenX_700) + group_size_917) - np.int32(1)),
         group_size_917)
     if ((np.int32(1) * (num_groups_918 * group_size_917)) != np.int32(0)):
         self.map_kernel_846_var.set_args(np.int32(screenX_700),
                                          np.int32(screenY_701), mem_905,
                                          np.byte(x_713),
                                          np.int32(depth_702),
                                          np.float32(xmin_703), mem_903,
                                          np.float32(y_711),
                                          np.float32(res_707), mem_908)
         cl.enqueue_nd_range_kernel(
             self.queue, self.map_kernel_846_var, (long(
                 (num_groups_918 * group_size_917)), ),
             (long(group_size_917), ))
         if synchronous:
             self.queue.finish()
     out_mem_909 = mem_908
     out_memsize_910 = bytes_906
     return (out_memsize_910, out_mem_909)
Example #29
    def dfunKernel(self, state_variables, coupling, local_coupling=0.0):
        n_states = state_variables.shape[0]
        n_nodes = state_variables.shape[1]
        n_mode = state_variables.shape[2]
        # allocate data if not yet done so
        if not hasattr(self, '_arrays'):
            self._alloc_opencl(n_nodes, n_states=n_states, n_mode=n_mode)

        # copy if passed host arrays
        if isinstance(state_variables, numpy.ndarray):
            # state_variables, coupling will be (1, n, 1)
            if (DEBUG):
                print("state_variables are ndarray", "states:",
                      state_variables.shape, "coupling:", coupling.shape)

            #self._arrays['state'][:] = state_variables.reshape((1, n_states*n_nodes*n_mode)).astype('f')
            #self._arrays['coupling'][:] = coupling.reshape((1, n_nodes)).astype('f')

            # self._arrays['state'] = state_variables.flatten()
            #self._arrays['coupling'] = coupling.reshape((1, n_nodes)).astype('f')
            if (DEBUG):
                print(
                    "state_variable shape:",
                    state_variables.reshape(
                        (n_states, n_nodes * n_mode, 1)).astype('f').shape)
                print("array state shape", self._arrays['state'][:].shape)
            self._arrays['state'][:] = state_variables.reshape(
                (n_states, n_nodes, n_mode)).astype('f')
            self._arrays['coupling'][:] = coupling.reshape(
                (1, n_nodes)).astype('f')

        # set kernel arg if passed device arrays
        elif isinstance(state_variables, pyopencl.array.Array):
            self._kernel.set_args(state_variables.data, coupling.data,
                                  self._arrays['param'].data,
                                  self._arrays['deriv'].data)

        # otherwise, complain
        else:
            raise TypeError('unsupported data type %r', type(state_variables))

        # run the kernel and wait
        print("Run kernel...")

        pyopencl.enqueue_nd_range_kernel(self._queue, self._kernel,
                                         (n_nodes, ), None).wait()

        # return derivatives following input type
        deriv = self._arrays['deriv']
        if (DEBUG):
            print("derive shape:", deriv.shape)
        if isinstance(state_variables, numpy.ndarray):
            deriv = deriv.get().reshape(
                (n_states, n_nodes, n_mode)).astype('d')

        return deriv
Example #30
    def runKernel(self, maskImages, overlapAmount):

        globalSize = [0, 0]
        localSize = [0, 0]
        self.clattr.computeWorkingGroupSize(localSize, globalSize, [self.atts.width, self.atts.height, 1])

        for i in range(len(maskImages)):
            mask = maskImages[i]
            size = [0, 0, 0]
            size[2] = mask.shape[0]
            size[1] = mask.shape[1]
            size[0] = mask.shape[2]

            structElem = self.atts.getStructElement(self.clattr.context, self.clattr.queue, mask)
            startOffset = 0
            endOffset = 0

            if self.atts.overlap[self.index] > 0:
                startOffset = int(self.atts.overlap[self.index] / 2)
                endOffset = int(self.atts.overlap[self.index] / 2)

            if self.atts.sliceStart <= 0:
                startOffset = 0
            if self.atts.sliceEnd >= 0:
                endOffset = 0

            if i == 0:
                self.kernel.set_args(self.clattr.inputBuffer, self.clattr.outputTmpBuffer, np.int32(self.atts.width),
                                     np.int32(self.atts.height),
                                     np.int32(self.clattr.maxSliceCount+self.atts.overlap[self.index]),
                                     structElem, np.int32(size[0]), np.int32(size[1]), np.int32(size[2]),
                                     np.int32(startOffset), np.int32(endOffset))
            else:
                tmpBuffer1 = self.clattr.outputTmpBuffer if i%2 != 0 else self.clattr.outputBuffer
                tmpBuffer2 = self.clattr.outputTmpBuffer if i%2 == 0 else self.clattr.outputBuffer

                self.kernel2.set_args(self.clattr.inputBuffer, tmpBuffer1, tmpBuffer2, np.int32(self.atts.width),
                                      np.int32(self.atts.height),
                                      np.int32(self.clattr.maxSliceCount + self.atts.overlap[self.index]),
                                      structElem, np.int32(size[0]), np.int32(size[1]), np.int32(size[2]),
                                      np.int32(startOffset), np.int32(endOffset))

            try:
                cl.enqueue_nd_range_kernel(self.clattr.queue, self.kernel if i ==0 else self.kernel2,
                                           globalSize, localSize)
            except Exception:
                return False

            structElem.release()

        if len(maskImages)%2 != 0:
            tmpBuffer = self.clattr.outputTmpBuffer
            self.clattr.outputTmpBuffer = self.clattr.outputBuffer
            self.clattr.outputBuffer = tmpBuffer

        return True
Example #31
    def test_algorithm(self):
        print "\n**************************"
        print "test_pbrs:"
        passed = 0
        buffersize_in = 188*8
        buffersize_out = 188*8
        # opencl buffer uint
        self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=buffersize_in*4)
        # opencl buffer uint
        self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=buffersize_out*4)

        for k in self.kernelname:
            kernel = self.load_kernel(self.filename, k)
            passed = 0
            self.fd_input = open('test_bench_pbrs_input.csv', 'r')
            self.fd_output = open('test_bench_pbrs_output.csv', 'r')
            for j in range(0,6):
                encoded_data = numpy.array(numpy.zeros(buffersize_out/4), dtype=numpy.uint32)
                data_to_encode = string.replace(self.fd_input.readline(),'\n','')
                reference_data = string.replace(self.fd_output.readline(),'\n','')
                for i in range(0,7):
                    data_to_encode = "%s,%s" % (data_to_encode, string.replace(self.fd_input.readline(),'\n',''))
                    reference_data = "%s,%s" % (reference_data, string.replace(self.fd_output.readline(),'\n',''))

                data_to_encode = numpy.fromstring(numpy.fromstring(data_to_encode, dtype=numpy.uint8, sep=",").tostring(), dtype=numpy.uint32)
                reference_data = numpy.fromstring(reference_data, dtype=numpy.uint8, sep=",")

                cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
                kernel.set_args(self.inputbuffer, self.outputbuffer)
                cl.enqueue_nd_range_kernel(self.queue,kernel,(8,),(8,),None ).wait()
                cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()
                encoded_data = (numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))

                
                if encoded_data.tostring() == reference_data.tostring():
                    passed += 1
                    print "Test %d PASSED" % (j+1)
                else:
                    print "Test %d FAILED" % (j+1)
                    print "input data:"
                    print numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8)
                    print "encoded data:"
                    print numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8)
                    print "reference data:"
                    print reference_data
                    print "error data:"
                    print (reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
            print "%d pass out of 6" % passed
            self.fd_input.close()
            self.fd_output.close()
            if passed == 6:
                print "All pbrs tests PASS\n"
                return True
            else:
                print "at least one pbrs test FAILED\n"
                return False
Example #32
    def prefixSumDownInplace(self, e, data, ndata, events):
        import numpy as np
        import pyopencl as cl
        mf = cl.mem_flags

        if not isinstance(data, cl.Buffer):
            data_buf = cl.Buffer(self.ctx,
                                 mf.READ_WRITE | mf.COPY_HOST_PTR,
                                 hostbuf=data)
        else:
            data_buf = data

        grid_dims = self.get_grid_dims(ndata)
        psumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
        npsumbytes = np.uint64(0).nbytes

        psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
        npsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, npsumbytes)

        kernel = self.prg.prefixSumDownInplace
        kernel.set_args(data_buf, np.uint64(ndata), psum_buf, npsum_buf)

        global_dims = self.get_global(grid_dims)

        print "prefixSumDownInplace %s %s %d %d" % (
            str(global_dims), str(self.localDims), ndata, psumbytes)
        if e is None:
            e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                            self.localDims), )
        else:
            e = (cl.enqueue_nd_range_kernel(self.queue,
                                            kernel,
                                            global_dims,
                                            self.localDims,
                                            wait_for=e), )
        events += e

        npsum = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, npsum, npsum_buf, wait_for=e), )

        if npsum > 1:
            (e, psum_buf, psum1_buf, npsum1_buf,
             ndata2) = self.prefixSumDownInplace(e, psum_buf, npsum.item(),
                                                 events)
        else:
            ndata2 = np.zeros(1, dtype=np.uint64)
            events += (cl.enqueue_copy(self.queue,
                                       ndata2,
                                       psum_buf,
                                       wait_for=e), )
            ndata2 = ndata2.item()
            print ndata2

        self.prefixSumUp(e, data_buf, ndata, psum_buf, npsum, events)

        return (e, data_buf, psum_buf, npsum_buf, ndata2)
Example #33
def vglCl3dDilate(img_input, img_output, convolution_window, window_size_x, window_size_y, window_size_z):
	print("# Running vglCl3dDilate")		
	vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
	vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())
		
	# TRANSFORMAR EM BUFFER
	try:
		cl_convolution_window = cl.Buffer(vl.get_ocl().context, cl.mem_flags.READ_ONLY, convolution_window.nbytes)
		cl.enqueue_copy(vl.get_ocl().commandQueue, cl_convolution_window, convolution_window.tobytes(), is_blocking=True)
		convolution_window = cl_convolution_window
	except Exception as e:
		print("vglCl3dDilate: Error!! Impossible to convert convolution_window to cl.Buffer object.")
		print(str(e))
		exit()

	if( not isinstance(window_size_x, np.uint32) ):
		print("vglCl3dDilate: Warning: window_size_x not np.uint32! Trying to convert...")
		try:
			window_size_x = np.uint32(window_size_x)
		except Exception as e:
			print("vglCl3dDilate: Error!! Impossible to convert window_size_x as a np.uint32 object.")
			print(str(e))
			exit()
		
	if( not isinstance(window_size_y, np.uint32) ):
		print("vglCl3dDilate: Warning: window_size_y not np.uint32! Trying to convert...")
		try:
			window_size_y = np.uint32(window_size_y)
		except Exception as e:
			print("vglCl3dDilate: Error!! Impossible to convert window_size_y as a np.uint32 object.")
			print(str(e))
			exit()
		
	if( not isinstance(window_size_z, np.uint32) ):
		print("vglCl3dDilate: Warning: window_size_z not np.uint32! Trying to convert...")
		try:
			window_size_z = np.uint32(window_size_z)
		except Exception as e:
			print("vglCl3dDilate: Error!! Impossible to convert window_size_z as a np.uint32 object.")
			print(str(e))
			exit()
		
	_program = vl.get_ocl_context().get_compiled_kernel("../CL/vglCl3dDilate.cl", "vglCl3dDilate")
	kernel_run = _program.vglCl3dDilate

	kernel_run.set_arg(0, img_input.get_oclPtr())
	kernel_run.set_arg(1, img_output.get_oclPtr())
	kernel_run.set_arg(2, convolution_window)
	kernel_run.set_arg(3, window_size_x)
	kernel_run.set_arg(4, window_size_y)
	kernel_run.set_arg(5, window_size_z)

	cl.enqueue_nd_range_kernel(vl.get_ocl().commandQueue, kernel_run, img_output.get_oclPtr().shape, None)

	vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #34
    def calc_weights_gradient(self):
        """
        Calculate gradient of weights.
        
        This method should be called only for processed layers, as it uses the
        inputs array, which is valid only at processing time.
        """

        for l in self._next_layers:
            if not l[0].processed:
                l[0].calc_weights_gradient()

        queue = self.opencl.queue
        kernel = self.opencl.kernel_calc_layer_gradient

        kernel.set_arg(2, self._inputs_offset)
        kernel.set_arg(3, self._neurons_offset)
        kernel.set_arg(4, self._inputs_per_neuron)
        kernel.set_arg(5, self._weights_offset)
        kernel.set_arg(7, self._weights_count)
        kernel.set_arg(
            8,
            pyopencl.LocalMemory(
                int(4 *
                    (self._inputs_per_neuron + 1 +
                     self.opencl.max_local_size[0] // self._inputs_per_neuron))
            ))

        self._calc_gradient_event = pyopencl.enqueue_nd_range_kernel(
            queue,
            kernel, (int(self._weights_buf_size), ),
            (self.opencl.max_local_size[0], ),
            wait_for=self._calc_gradient_wait_for)
        del self._calc_gradient_wait_for[:]

        kernel = self.opencl.kernel_propagate_errors
        kernel.set_arg(2, self._neurons_offset)
        kernel.set_arg(5, self._neuron_count)
        kernel.set_arg(7, self._inputs_per_neuron)

        i_s = numpy.int32(1)
        for l in self._prev_layers:
            kernel.set_arg(3, l[0]._neurons_offset + l[1])
            kernel.set_arg(4, l[2])
            kernel.set_arg(6, self._weights_offset + i_s)

            l[0]._calc_gradient_wait_for.append(
                pyopencl.enqueue_nd_range_kernel(
                    queue,
                    kernel, (int(l[2] * 64), ), (64, ),
                    wait_for=(self._calc_gradient_event, )))

            i_s += l[2]

        self._processed = True
Example #35
    def execute(self):
        kernel = self.program.mul
        kernel.set_args(self.a_buf, self.b_buf, self.c_buf, numpy.int32(2),
                        numpy.int32(5), numpy.int32(10))
        cl.enqueue_nd_range_kernel(self.queue, kernel, (2, 5), None)

        c = numpy.empty_like(self.a.dot(self.b))
        cl.enqueue_copy(self.queue, c, self.c_buf).wait()
        print("a", self.a)
        print("b", self.b)
        print("c", c)
Example #36
    def vglClNdThreshold(self, img_input, img_output, thresh, top=255):
        print("# Running vglClNdThreshold")
        if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_input isn't."
            )
            exit()
        if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdThreshold: Error: this function supports only OpenCL data as buffer and img_output isn't."
            )
            exit()

        if (not isinstance(thresh, np.uint8)):
            print(
                "vglClNdThreshold: Warning: thresh not np.uint8! Trying to convert..."
            )
            try:
                thresh = np.uint8(thresh)
            except Exception as e:
                print(
                    "vglClNdThreshold: Error!! Impossible to convert thresh as a np.uint8 object."
                )
                print(str(e))
                exit()
        if (not isinstance(top, np.uint8)):
            print(
                "vglClNdThreshold: Warning: top not np.uint8! Trying to convert..."
            )
            try:
                top = np.uint8(top)
            except Exception as e:
                print(
                    "vglClNdThreshold: Error!! Impossible to convert top as a np.uint8 object."
                )
                print(str(e))
                exit()

        vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

        _program = self.cl_ctx.get_compiled_kernel(
            "../CL_ND/vglClNdThreshold.cl", "vglClNdThreshold")
        kernel_run = _program.vglClNdThreshold

        kernel_run.set_arg(0, img_input.get_oclPtr())
        kernel_run.set_arg(1, img_output.get_oclPtr())
        kernel_run.set_arg(2, thresh)
        kernel_run.set_arg(3, top)

        cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                                   img_output.get_ipl().shape, None)

        vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #37
File: tools.py Project: hagisgit/SLIC
def max_reduce_real4(ipt):
     x = CLReal(len(ipt)) 
     y = CLReal(len(ipt))
     z = CLReal(len(ipt))
     kern = _splitkern_real4.kern
     kern.set_arg(0, ipt._buffer)
     kern.set_arg(1, x._buffer)
     kern.set_arg(2, y._buffer)
     kern.set_arg(3, z._buffer)
     cl.enqueue_nd_range_kernel(ipt._ctrl.clqueue, kern, (len(ipt),), None)
     return max_reduce(x), max_reduce(y), max_reduce(z)
Example #38
    def send(self):
        # Set the Kernel Arguments
        npSize = np.int32(self.data_size / 4)
        self.ocl_krnl_input_stage.set_args(self.buffer_input, npSize)

        # Copy input data to device global memory
        cl.enqueue_migrate_mem_objects(self.ocl_q, [self.buffer_input],
                                       flags=0)

        # Launch the Kernel
        cl.enqueue_nd_range_kernel(self.ocl_q, self.ocl_krnl_input_stage, [1],
                                   [1])
Example #39
File: filter.py Project: Kobtul/documents
 def prefixSum(self, e, data, keys, ndata, low, hi, events):
     import numpy as np
     import pyopencl as cl
     mf = cl.mem_flags
     
     if not isinstance(data, cl.Buffer):
         data_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf= data)
     else:
         data_buf = data
     
     if not isinstance(keys, cl.Buffer):
         keys_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf= keys)
     else:
         keys_buf = keys
     
     grid_dims = self.get_grid_dims(ndata)
     psumbytes = ndata * np.uint64(0).nbytes
     bsumbytes =  int(np.prod(grid_dims) * np.uint64(0).nbytes)
     nbsumbytes =  np.uint64(0).nbytes
     
     psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
     bsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, bsumbytes)
     nbsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, nbsumbytes)
     
     low = PrefixSum.HOST_TYPE_KEYS(low)
     hi = PrefixSum.HOST_TYPE_KEYS(hi)
     
     kernel = self.prg.prefixSumDown
     kernel.set_args(data_buf, keys_buf, np.uint64(ndata), low, hi, psum_buf, bsum_buf, nbsum_buf)
     
     global_dims = self.get_global(grid_dims)
     
     print "prefixSumDown %s %s" % (str(global_dims), str(self.localDims))
     if e is None:
         e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims), )
     else:
         e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e), )
     events += e
     
     nbsum = np.zeros(1, dtype = np.uint64)
     events += (cl.enqueue_copy(self.queue, nbsum, nbsum_buf, wait_for=e),)
     
     if nbsum>1:
         (e, bsum_buf, bsum1_buf, nbsum1_buf, ndata2) = self.prefixSumDownInplace(e, bsum_buf, nbsum.item(), events)
     else:
         ndata2 = np.zeros(1, dtype = np.uint64)
         events += (cl.enqueue_copy(self.queue, ndata2, bsum_buf, wait_for=e),)
         ndata2 = ndata2.item()
         print ndata2
     
     self.prefixSumUp(e, psum_buf, ndata, bsum_buf, nbsum, events)
     
     return (e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)
Example #40
File: bin_nd.py Project: arturxz/TCC
    def vglClNdBinMin(self, img_input, img_input2, img_output):

        if (not img_input.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdBinMin: Error: this function supports only OpenCL data as buffer and img_input isn't."
            )
            exit()
        if (not img_input2.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdBinMin: Error: this function supports only OpenCL data as buffer and img_input2 isn't."
            )
            exit()
        if (not img_output.clForceAsBuf == vl.IMAGE_ND_ARRAY()):
            print(
                "vglClNdBinMin: Error: this function supports only OpenCL data as buffer and img_output isn't."
            )
            exit()

        vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_input2, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

        _program = self.cl_ctx.get_compiled_kernel(
            "../CL_BIN/vglClNdBinMin.cl", "vglClNdBinMin")
        kernel_run = _program.vglClNdBinMin

        kernel_run.set_arg(0, img_input.get_oclPtr())
        kernel_run.set_arg(1, img_input2.get_oclPtr())
        kernel_run.set_arg(2, img_output.get_oclPtr())

        _worksize_0 = img_input.getWidthIn()
        if (img_input.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_input.getWidthStep()
        if (img_input2.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_input2.getWidthStep()
        if (img_output.depth == vl.IPL_DEPTH_1U()):
            _worksize_0 = img_output.getWidthStep()

        worksize = (int(_worksize_0), img_input.getHeigthIn(),
                    img_input.getNFrames())

        # ENQUEUEING KERNEL EXECUTION
        #cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run, worksize, None)
        cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                                   img_output.get_ipl().shape, None)

        vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
Example #41
File: filter.py Project: Kobtul/documents
    def filter(self, data, keys, low, hi, events):
        import numpy as np
        import pyopencl as cl
        mf = cl.mem_flags
        
        ndata = data.size
        
        (e, data_buf, keys_buf, indices_buf, bsum_buf, nbsum_buf, ndata2) = self.prefixSum(None, data, keys, ndata, low, hi, events)
        
        filt = np.zeros(ndata, dtype = np.bool8)
        indices = np.zeros(ndata, dtype = np.uint64)
        data2 = np.zeros(ndata2, dtype = PrefixSum.HOST_TYPE_DATA)
        keys2 = np.zeros(ndata2, dtype = PrefixSum.HOST_TYPE_KEYS)
        
        ndata2bytes = np.uint64(0).nbytes
        
        if PrefixSum.RETURN_FILTER == 1:
            filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filt.nbytes)
        print data2.nbytes
        data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, data2.nbytes)
        keys2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, keys2.nbytes)
        ndata2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, ndata2bytes)
        
        low = PrefixSum.HOST_TYPE_KEYS(low)
        hi = PrefixSum.HOST_TYPE_KEYS(hi)

        kernel = self.prg.filter
        if PrefixSum.RETURN_FILTER == 1:
            kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi, filt_buf, data2_buf, keys2_buf, ndata2_buf)
        else:
            kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata), low, hi, data2_buf, keys2_buf, ndata2_buf)
        
        global_dims = self.get_global(self.get_grid_dims(ndata))
        
        print "filter"
        if e is None:
            e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims), )
        else:
            # chain the kernel onto the previously enqueued events
            e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e), )
        events += e
        
        if PrefixSum.RETURN_FILTER == 1:
            events += ( cl.enqueue_copy(self.queue, filt, filt_buf, wait_for=e), 
                        cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
                        cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
                        cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e) )
        else:
            events += ( cl.enqueue_copy(self.queue, indices, indices_buf, wait_for=e),
                        cl.enqueue_copy(self.queue, data2, data2_buf, wait_for=e),
                        cl.enqueue_copy(self.queue, keys2, keys2_buf, wait_for=e) )
        
        return (filt, indices, data2, keys2)
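For reference, a hedged NumPy equivalent of the compaction the filter kernel performs; the inclusive [low, hi] bound handling is an assumption, not taken from the project:

import numpy as np

def filter_reference(data, keys, low, hi):
    # keep elements whose key falls in [low, hi]; bound inclusivity is assumed
    mask = (keys >= low) & (keys <= hi)
    # prefix sum over the mask gives each kept element its position in the compacted output
    indices = np.cumsum(mask) - 1
    return mask, indices, data[mask], keys[mask]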
예제 #42
0
 def futhark_render(self, width_743, height_744, time_745, degree_746):
     y_749 = sitofp_i32_f32(height_744)
     y_750 = sitofp_i32_f32(width_743)
     x_751 = fpow32(time_745, np.float32((1.5)))
     y_752 = (x_751 * np.float32((5.0e-3)))
     res_753 = (np.float32((1.0)) + y_752)
     res_754 = (np.float32((3.1415927)) / res_753)
     group_size_893 = self.group_size
     y_894 = (group_size_893 - np.int32(1))
     x_895 = (width_743 + y_894)
     num_groups_896 = squot32(x_895, group_size_893)
     num_threads_897 = (num_groups_896 * group_size_893)
     bytes_911 = (np.int32(4) * width_743)
     mem_912 = cl.Buffer(
         self.ctx, cl.mem_flags.READ_WRITE,
         np.long(
             np.long(bytes_911) if (
                 bytes_911 > np.int32(0)) else np.int32(1)))
     if ((np.int32(1) * (num_groups_896 * group_size_893)) != np.int32(0)):
         self.map_kernel_898_var.set_args(mem_912, np.float32(y_749),
                                          np.int32(width_743))
         cl.enqueue_nd_range_kernel(
             self.queue, self.map_kernel_898_var, (np.long(
                 (num_groups_896 * group_size_893)), ),
             (np.long(group_size_893), ))
         if synchronous:
             self.queue.finish()
     nesting_size_833 = (height_744 * width_743)
     x_836 = (nesting_size_833 + y_894)
     num_groups_837 = squot32(x_836, group_size_893)
     num_threads_838 = (num_groups_837 * group_size_893)
     bytes_913 = (bytes_911 * height_744)
     mem_915 = cl.Buffer(
         self.ctx, cl.mem_flags.READ_WRITE,
         np.long(
             np.long(bytes_913) if (
                 bytes_913 > np.int32(0)) else np.int32(1)))
     if ((np.int32(1) * (num_groups_837 * group_size_893)) != np.int32(0)):
         self.map_kernel_839_var.set_args(mem_912, np.int32(height_744),
                                          np.float32(res_754),
                                          np.float32(y_750),
                                          np.int32(degree_746), mem_915,
                                          np.int32(width_743))
         cl.enqueue_nd_range_kernel(
             self.queue, self.map_kernel_839_var, (np.long(
                 (num_groups_837 * group_size_893)), ),
             (np.long(group_size_893), ))
         if synchronous:
             self.queue.finish()
     out_mem_917 = mem_915
     out_memsize_918 = bytes_913
     return (out_memsize_918, out_mem_917)
예제 #43
0
파일: CLSolve.py 프로젝트: ohlord/cimpress
    def solve(self,puzzle,simulations = 16384, iterations = 35, workGroupSize = 128):
        self.simulations = simulations
        self.iterations = iterations
        self.workGroupSize = workGroupSize
        self.workGroups = int(self.simulations / self.workGroupSize)
        self.width = np.int8(puzzle['width'])
        self.height = np.int8(puzzle['height'])
        
        #initialise buffers
        self.initBuffers(puzzle)
        
        #create kernel
        self.kernel = cl.Kernel(self.program,"montecarlo")
        self.kernel.set_args(self.lengthsBuffer,self.groupLengthsBuffer,self.puzzlesBuffer,self.solutionsBuffer,self.height,self.width,np.int32(self.iterations))
        
        #execute program for a number of iterations
        cl.enqueue_nd_range_kernel(self.queue,self.kernel,(self.simulations,),(self.workGroupSize,))
        
        #map group lengths buffer back to host memory
        cl.enqueue_map_buffer(self.queue,self.groupLengthsBuffer,cl.map_flags.WRITE,0,self.groupLengths.shape,self.groupLengths.dtype)
        self.groupLengths = self.groupLengthsBuffer.get_host_array(self.groupLengths.shape,dtype=self.groupLengths.dtype)

        #map solutions buffer back to host memory
        cl.enqueue_map_buffer(self.queue,self.solutionsBuffer,cl.map_flags.WRITE,0,self.solutionsFlattened.shape,self.solutions.dtype)
        self.solutions = self.solutionsBuffer.get_host_array(self.solutions.shape,dtype=self.solutions.dtype)
        
        #release buffers
        self.lengthsBuffer.release()
        self.groupLengthsBuffer.release()
        self.puzzlesBuffer.release()
        self.solutionsBuffer.release()

        #get the best solution
        i = self.groupLengths.argmin()
        bestSolution = np.array(self.solutions[i])
        
        #convert solution to list format used by challenge
        solution = []
        for row in range(0,puzzle['height']):
            for col in range(0,puzzle['width']):
                if bestSolution[row][col]!=-1:
                    s = bestSolution[row][col]
                    
                    #add to solution list
                    solution.append({'X': int(col),'Y': int(row),'Size':int(s)})
                    
                    #clear cells in solution
                    for i in range(0,s):
                        for j in range(0,s):
                            bestSolution[row+i][col+j]=-1
        
        return solution
예제 #44
0
def change_display(image) :

    image_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=image)
    mem = cl.GLBuffer(ctx, mf.WRITE_ONLY, int(buf))  # GLBuffer expects the GL buffer object name (an int)

    cl.enqueue_acquire_gl_objects(queue, [mem])
    add_knl = prog.add
    add_knl.set_args(image_buf, mem)
    cl.enqueue_nd_range_kernel(queue, add_knl, image.shape, None)
    cl.enqueue_release_gl_objects(queue, [mem])

    queue.finish()
    glFlush()
예제 #45
0
파일: QclKernel.py 프로젝트: hagisgit/qcl
 def _exec_chunked_unsafe(self, chunksize=0):
     """Unsafe for kernels with local variables."""
     if chunksize > 0:
         self._prep_chunked_exec(chunksize)
     lenarr = self.leadingvar.length
     ncnk = int(ceil(float(lenarr)/float(self._cnksz)))
     cnksz = self._cnksz
     for i in range(ncnk):
         if (i == (ncnk - 1)) and not(lenarr % cnksz == 0):
             cnksz = lenarr % cnksz
         self._solverobj.__setattr__(self._cnk_name, i)
         cl.enqueue_nd_range_kernel(self._solverobj.clqueue, self._clkernel, (cnksz,), None)
     self._solverobj.clqueue.finish() 
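A minimal standalone sketch of the chunk-size arithmetic used above, with made-up sizes (1000 items split into chunks of 256):

from math import ceil

lenarr = 1000                                   # hypothetical total work size
cnksz = 256                                     # hypothetical chunk size
ncnk = int(ceil(float(lenarr) / float(cnksz)))  # 4 chunks
for i in range(ncnk):
    this_cnksz = cnksz
    if i == ncnk - 1 and lenarr % cnksz != 0:
        this_cnksz = lenarr % cnksz             # last chunk handles the remainder: 232
    # each launch would use (this_cnksz,) as its global size
    print(i, this_cnksz)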
예제 #46
0
    def updateEt_vanilla(self, algo="SHG"):
        root.debug("Updating Et using vanilla algorithm")
        t0 = time.clock()
        #         transform = FFT(self.ctx, self.q, (self.Esig_w_tau_cla,) , (self.Esig_t_tau_p_cla,) , axes = [1])
        #         events = transform.enqueue(forward = False)

        #         self.Esig_t_tau_p_cla.set(np.fft.ifft(self.Esig_w_tau_cla.get(), axis=1).astype(self.dtype_c).copy())

        if self.useCL == True:
            events = self.Esig_t_tau_p_fft.enqueue(forward=False)
            for e in events:
                e.wait()
            if algo == "SD":
                krn = self.progs.progs["updateEtVanillaSumSD"].updateEtVanillaSumSD
                krn.set_scalar_arg_dtypes((None, None, np.int32))
                krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
                ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
                ev.wait()

                Et = self.Et_cla.get()
                self.Et_cla.set(-np.conj(Et).astype(self.dtype_c).copy())

            #                 Esig_w_tau = self.Esig_w_tau_cla.get()
            #                 Gm  = np.conj(Esig_w_tau.sum(axis=1))[::-1]
            #                 self.Et_cla.set(Gm.copy())

            else:
                krn = self.progs.progs["updateEtVanillaSumSHG"].updateEtVanillaSumSHG
                krn.set_scalar_arg_dtypes((None, None, np.int32))
                krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.N)
                ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
                ev.wait()

            krn = self.progs.progs["updateEtVanillaNorm"].updateEtVanillaNorm
            krn.set_scalar_arg_dtypes((None, np.int32))
            krn.set_args(self.Et_cla.data, self.N)
            ev = cl.enqueue_nd_range_kernel(self.q, krn, [1], None)
            ev.wait()
        else:
            self.Esig_t_tau_p_cla.set(np.fft.ifft(self.Esig_w_tau_cla.get(), axis=1).astype(self.dtype_c).copy())
            Esig_t_tau_p = self.Esig_t_tau_p_cla.get()
            if algo == "SD":
                Et = np.sqrt(Esig_t_tau_p.sum(axis=0))
            #                 Et = (Esig_t_tau_p.sum(axis=0))
            else:
                Et = Esig_t_tau_p.sum(axis=0)
            Et = Et / np.abs(Et).max()
            self.Et_cla.set(Et)

        root.debug("".join(("Time spent: ", str(time.clock() - t0))))
예제 #47
0
    def test_algorithm(self):
        print "\n**************************"
        print "test_reedsolomon:"
        passed = 0
        linecnt = 1

        # opencl buffer uint
        self.inputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=48*4)
        # opencl buffer uint
        self.outputbuffer = cl.Buffer(self.ctx , cl.mem_flags.READ_WRITE, size=51*4)

        for k in self.kernelname:
            kernel = self.load_kernel(self.filename, k)
            self.fd_input = open('test_bench_rs_input.csv', 'r')
            self.fd_output = open('test_bench_rs_output.csv', 'r')
            for line in self.fd_input:
                data_to_encode = numpy.fromstring(line, dtype=numpy.uint8, sep=",").tostring()
                data_to_encode = numpy.fromstring(data_to_encode, dtype=numpy.uint32)

                encoded_data = numpy.array(numpy.zeros(51), dtype=numpy.uint32)
                reference_data = numpy.fromstring(self.fd_output.readline(), dtype=numpy.uint8, sep=",")

                cl.enqueue_copy(self.queue, self.inputbuffer, data_to_encode).wait()
                kernel.set_args(self.inputbuffer, self.outputbuffer)
                cl.enqueue_nd_range_kernel(self.queue,kernel,(1,),None ).wait()
                cl.enqueue_copy(self.queue, encoded_data, self.outputbuffer).wait()

                if encoded_data.tostring() == reference_data.tostring():
                    passed += 1
                    print "Test %d PASSED" % linecnt
                else:
                    print "Test %d FAILED" % linecnt
                    print "input data:"
                    print numpy.fromstring(data_to_encode.tostring(), dtype=numpy.uint8)
                    print "encoded data:"
                    print numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8)
                    print "reference data:"
                    print reference_data
                    print "error data:"
                    print (reference_data - numpy.fromstring(encoded_data.tostring(), dtype=numpy.uint8))
                linecnt += 1
        print "%d pass out of %d" % (passed,(linecnt-1))
        self.fd_input.close()
        self.fd_output.close()
        if passed == (linecnt-1):
            print "All reedsolomon tests PASS\n"
            return True
        else:
            print "at least one reedsolomon test FAILED\n"
            return False
예제 #48
0
    def calc_weights_gradient( self ):
        """
        Calculate gradient of weights.
        
        This method should be called only for processed layers, as it uses the
        inputs array, which is valid only at processing time.
        """

        for l in self._next_layers:
            if not l[0].processed:
                l[0].calc_weights_gradient()

        queue = self.opencl.queue
        kernel = self.opencl.kernel_calc_layer_gradient

        kernel.set_arg( 2, self._inputs_offset )
        kernel.set_arg( 3, self._neurons_offset )
        kernel.set_arg( 4, self._inputs_per_neuron )
        kernel.set_arg( 5, self._weights_offset )
        kernel.set_arg( 7, self._weights_count )
        kernel.set_arg( 8, pyopencl.LocalMemory( int( 
                    4 * ( self._inputs_per_neuron + 1 + self.opencl.max_local_size[ 0 ] // self._inputs_per_neuron ) ) ) )

        self._calc_gradient_event = pyopencl.enqueue_nd_range_kernel( queue, kernel,
            ( int( self._weights_buf_size ), ), ( self.opencl.max_local_size[ 0 ], ),
            wait_for = self._calc_gradient_wait_for
            )
        del self._calc_gradient_wait_for[:]

        kernel = self.opencl.kernel_propagate_errors
        kernel.set_arg( 2, self._neurons_offset )
        kernel.set_arg( 5, self._neuron_count )
        kernel.set_arg( 7, self._inputs_per_neuron )

        i_s = numpy.int32( 1 )
        for l in self._prev_layers:
            kernel.set_arg( 3, l[0]._neurons_offset + l[1] )
            kernel.set_arg( 4, l[2] )
            kernel.set_arg( 6, self._weights_offset + i_s )

            l[0]._calc_gradient_wait_for.append( pyopencl.enqueue_nd_range_kernel( queue, kernel,
                ( int( l[2] * 64 ), ), ( 64, ),
                wait_for = ( self._calc_gradient_event, )
                ) )

            i_s += l[2]

        self._processed = True
예제 #49
0
def gpu_amend_values(queue, kernels, gpu_params, buffers, amendments):
    """
        Transfers requested amendments (after collision detection check) to the GPU,
        where a kernel applies them to the data
    """
    intermediary_events = []
    packet = amendments.get_packet()
    if packet[amendments.amount_i] > 0:
        events = [
            cl.enqueue_copy(queue, buffers["global_amendments_n"],
                            packet[amendments.amount_i]),
            cl.enqueue_copy(queue, buffers["global_amendment_indices"],
                            packet[amendments.indices_i]),
            cl.enqueue_copy(queue, buffers["global_amendment_values"],
                            packet[amendments.values_i])]

        # X groups of 64 items (amendments.amount work items)
        intermediary_events.append(
            cl.enqueue_nd_range_kernel(
                queue, kernels["k_update_values"],
                (int(np.ceil(amendments.amount / gpu_params["preferred_multiple"]) *
                     gpu_params["preferred_multiple"]),),
                (gpu_params["preferred_multiple"],), global_work_offset=None,
                wait_for=events))
    return intermediary_events
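The global size above rounds the number of amendments up to the device's preferred work-group multiple; a minimal sketch of that idiom (the names and numbers are illustrative, not from the original project):

import numpy as np

def round_up_to_multiple(count, multiple):
    # smallest multiple of `multiple` that is >= count
    return int(np.ceil(count / float(multiple)) * multiple)

# e.g. 130 amendments with a preferred multiple of 64 -> 192 work items in 3 groups of 64
global_size = (round_up_to_multiple(130, 64),)
local_size = (64,)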
예제 #50
0
 def enqueue(self, wait_for=None, profiling=False):
     ev = cl.enqueue_nd_range_kernel(
         self.queue, self.kern, self.gsize, self.lsize,
         wait_for=wait_for)
     if profiling:
         self._events_to_profile.append(ev)
     return ev
예제 #51
0
def update_map(queue, kernels, intermediary_events):
    """
        Updates map. Updating includes:
        - "Dissoluting" pheromones: pheromone level is reduced regularly to simulate ageing.
    """
    intermediary_events.append(cl.enqueue_nd_range_kernel(
        queue, kernels["k_update_map"], [1], [1]))
예제 #52
0
    def applyIntensityData(self, I_w_tau=None):
        root.debug("Applying intensity data from experiment")
        t0 = time.clock()

        krn = self.progs.progs["applyIntensityData"].applyIntensityData
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Esig_w_tau_cla.data, self.I_w_tau_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_w_tau.shape, None)
        ev.wait()

        #         if self.useCL == True:
        #             krn = self.progs.progs['applyIntensityData'].applyIntensityData
        #             krn.set_scalar_arg_dtypes((None, None, np.int32))
        #             krn.set_args(self.Esig_w_tau_cla.data, self.I_w_tau_cla.data, self.N)
        #             ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_w_tau.shape, None)
        #             ev.wait()
        #         else:
        #             eps = 0.00
        #             Esig_w_tau = self.Esig_w_tau_cla.get()
        #             Esig_mag = np.abs(Esig_w_tau)
        #
        #             Esig_w_tau_p = np.zeros_like(Esig_w_tau)
        #             good_ind = np.where(Esig_mag > eps)
        #             Esig_w_tau_p[good_ind[0], good_ind[1]] = np.sqrt(self.I_w_tau_cla.get()[good_ind[0], good_ind[1]])*Esig_w_tau[good_ind[0], good_ind[1]]/Esig_mag[good_ind[0], good_ind[1]]

        root.debug("".join(("Time spent: ", str(time.clock() - t0))))
예제 #53
0
    def calc_chi2(self, queue, interspace, q, Iq, 
            rind, rxyz, lind, lxyz, origin, voxelspacing, fifj, targetIq, sq, chi2):

        kernel = self.kernels.calc_chi2
        workgroupsize = 16

        gws = (queue.device.max_compute_units * workgroupsize * 512,)
        lws = (workgroupsize,)

        floatsize = 4
        tmpIq = cl.LocalMemory(floatsize * q.shape[0] * workgroupsize)

        shape = np.zeros(4, dtype=np.int32)
        shape[:-1] = interspace.shape
        shape[-1] = interspace.size

        nq = np.int32(q.shape[0])
        nind1 = np.int32(rind.shape[0])
        nind2 = np.int32(lind.shape[0])

        fifj_shape = np.zeros(4, dtype=np.int32)
        fifj_shape[:-1] = fifj.shape
        fifj_shape[-1] = fifj.size

        kernel.set_args(interspace.data, q.data, Iq.data, tmpIq, rind.data, rxyz.data,
                lind.data, lxyz.data, origin, voxelspacing, fifj.data, targetIq.data, sq.data, chi2.data,
                shape, nq, nind1, nind2, fifj_shape)
        status = cl.enqueue_nd_range_kernel(queue, kernel, gws, lws)

        return status
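The global size above oversubscribes the device by launching many work-groups per compute unit; a minimal sketch of how that size is derived (the factor 512 and work-group size 16 are taken from the example, the rest is standard PyOpenCL device introspection):

import pyopencl as cl

ctx = cl.create_some_context()
dev = ctx.devices[0]

workgroupsize = 16
gws = (dev.max_compute_units * workgroupsize * 512,)  # many groups per compute unit
lws = (workgroupsize,)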
예제 #54
0
 def gradZSD_gpu(self):
     root.debug("Calculating dZ for SD using gpu")
     krn = self.progs.progs["gradZSD"].gradZSD
     krn.set_scalar_arg_dtypes((None, None, None, np.int32))
     krn.set_args(self.Esig_t_tau_p_cla.data, self.Et_cla.data, self.dZ_cla.data, self.N)
     ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
     ev.wait()
예제 #55
0
    def __call__(self, *args, **kwargs):
        vectors = []

        invocation_args = []
        for arg, arg_descr in zip(args, self.arguments):
            if isinstance(arg_descr, VectorArg):
                if not arg.flags.forc:
                    raise RuntimeError("ElementwiseKernel cannot "
                            "deal with non-contiguous arrays")

                vectors.append(arg)
                invocation_args.append(arg.data)
            else:
                invocation_args.append(arg)

        queue = kwargs.pop("queue", None)
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("too many/unknown keyword arguments")

        repr_vec = vectors[0]
        if queue is None:
            queue = repr_vec.queue
        invocation_args.append(repr_vec.mem_size)

        gs, ls = repr_vec.get_sizes(queue,
                self.kernel.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE,
                    queue.device))
        self.kernel.set_args(*invocation_args)
        return cl.enqueue_nd_range_kernel(queue, self.kernel,
                gs, ls, wait_for=wait_for)
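For context, a minimal usage sketch of PyOpenCL's ElementwiseKernel, whose __call__ the example above resembles (standard PyOpenCL API; the kernel body and data here are made up):

import numpy as np
import pyopencl as cl
import pyopencl.array as cla
from pyopencl.elementwise import ElementwiseKernel

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

axpb = ElementwiseKernel(ctx,
        "float a, float *x, float b, float *out",
        "out[i] = a * x[i] + b",
        "axpb")

x = cla.to_device(queue, np.arange(16, dtype=np.float32))
out = cla.empty_like(x)
evt = axpb(np.float32(2.0), x, np.float32(1.0), out)  # returns the launch event
evt.wait()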
예제 #56
0
 def futhark_main(self, screenX_700, screenY_701, depth_702, xmin_703,
                  ymin_704, xmax_705, ymax_706):
   res_707 = (xmax_705 - xmin_703)
   res_708 = (ymax_706 - ymin_704)
   y_711 = sitofp_i32_f32(screenX_700)
   y_712 = sitofp_i32_f32(screenY_701)
   x_713 = slt32(np.int32(0), depth_702)
   bytes_902 = (np.int32(4) * screenY_701)
   mem_903 = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                       long(long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
   mem_905 = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                       long(long(bytes_902) if (bytes_902 > np.int32(0)) else np.int32(1)))
   group_size_911 = np.int32(512)
   num_groups_912 = squot32(((screenY_701 + group_size_911) - np.int32(1)),
                            group_size_911)
   if ((np.int32(1) * (num_groups_912 * group_size_911)) != np.int32(0)):
     self.map_kernel_894_var.set_args(np.float32(ymin_704), np.float32(y_712),
                                      np.float32(res_708),
                                      np.int32(screenY_701), mem_903, mem_905)
     cl.enqueue_nd_range_kernel(self.queue, self.map_kernel_894_var,
                                (long((num_groups_912 * group_size_911)),),
                                (long(group_size_911),))
     if synchronous:
       self.queue.finish()
   nesting_size_844 = (screenX_700 * screenY_701)
   bytes_906 = (bytes_902 * screenX_700)
   mem_908 = cl.Buffer(self.ctx, cl.mem_flags.READ_WRITE,
                       long(long(bytes_906) if (bytes_906 > np.int32(0)) else np.int32(1)))
   group_size_917 = np.int32(512)
   num_groups_918 = squot32((((screenY_701 * screenX_700) + group_size_917) - np.int32(1)),
                            group_size_917)
   if ((np.int32(1) * (num_groups_918 * group_size_917)) != np.int32(0)):
     self.map_kernel_846_var.set_args(np.int32(screenX_700),
                                      np.int32(screenY_701), mem_905,
                                      np.byte(x_713), np.int32(depth_702),
                                      np.float32(xmin_703), mem_903,
                                      np.float32(y_711), np.float32(res_707),
                                      mem_908)
     cl.enqueue_nd_range_kernel(self.queue, self.map_kernel_846_var,
                                (long((num_groups_918 * group_size_917)),),
                                (long(group_size_917),))
     if synchronous:
       self.queue.finish()
   out_mem_909 = mem_908
   out_memsize_910 = bytes_906
   return (out_memsize_910, out_mem_909)
예제 #57
0
파일: Convolution.py 프로젝트: fean9r/FeaCL
	def execute(self):
		global_work_size = [self.outputSignalWidth*self.outputSignalHeight]
		local_work_size = [1]
		if (debug==1):
			print global_work_size
			print local_work_size 
		kernel = self.program.convolve
		

		if (debug==1):
			print kernel.context
			print kernel.function_name
			print kernel.num_args
			print kernel.program
			print kernel.reference_count

		# Old way of creating an event (explicit set_arg + enqueue_nd_range_kernel)
		kernel.set_arg(0,self.inputSignalBuffer)
		kernel.set_arg(1,self.maskBuffer)
		kernel.set_arg(2,self.outputSignalBuffer)
		kernel.set_arg(3,self.inputSignalWidth)
		kernel.set_arg(4,self.maskWidth)
		self.event =cl.enqueue_nd_range_kernel(self.queue, kernel, global_work_size, local_work_size,
			 global_work_offset=None, wait_for=None, g_times_l=True)

		if (debug==1):
			wgi = cl.kernel_work_group_info
			for dev in self.ctx.devices:
				print "-------",dev,"-------" 
				print kernel.get_work_group_info(wgi.WORK_GROUP_SIZE,dev )
				print kernel.get_work_group_info(wgi.COMPILE_WORK_GROUP_SIZE,dev )
				print kernel.get_work_group_info(wgi.LOCAL_MEM_SIZE,dev )
				print kernel.get_work_group_info(wgi.PRIVATE_MEM_SIZE,dev )
				print kernel.get_work_group_info(wgi.PREFERRED_WORK_GROUP_SIZE_MULTIPLE,dev )


		# New way of creating an event (calling the kernel object directly)
		self.event = kernel(self.queue,global_work_size,None,
			self.inputSignalBuffer,self.maskBuffer,self.outputSignalBuffer ,self.inputSignalWidth ,self.maskWidth)
		
		if (debug==1):
			print "context",self.event.context
			print "command_execution_status",self.event.command_execution_status
			print "command_queue",self.event.command_queue
			print "command_type",self.event.command_type
			print "reference_count",self.event.reference_count
			#print self.event.profile.end
			#print self.event.profile.queued
			#print self.event.profile.start
			#print self.event.profile.submit

		
		cl.enqueue_copy(self.queue, self.outputSignal, self.outputSignalBuffer)
		print self.inputSignal
		print self.mask

		print self.outputSignal
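The two launch styles commented above reduce to the following; a self-contained sketch using a trivial copy kernel (not the convolution kernel from the example):

import numpy as np
import pyopencl as cl

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
prg = cl.Program(ctx, "__kernel void copy(__global const float *a, __global float *b)"
                      "{ int i = get_global_id(0); b[i] = a[i]; }").build()
kernel = prg.copy

a = np.arange(1024, dtype=np.float32)
b = np.empty_like(a)
mf = cl.mem_flags
a_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
b_buf = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

# "old" style: bind arguments explicitly, then enqueue
kernel.set_arg(0, a_buf)
kernel.set_arg(1, b_buf)
evt = cl.enqueue_nd_range_kernel(queue, kernel, (1024,), None)
evt.wait()

# "new" style: call the kernel object directly; arguments and launch in one step
evt = kernel(queue, (1024,), None, a_buf, b_buf)
evt.wait()
cl.enqueue_copy(queue, b, b_buf)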
예제 #58
0
    def minZerrKernSHG_gpu(self):
        krn = self.progs.progs["minZerrSHG"].minZerrSHG
        krn.set_scalar_arg_dtypes((None, None, None, None, None, None, None, None, np.int32))
        krn.set_args(
            self.Esig_t_tau_p_cla.data,
            self.Et_cla.data,
            self.dZ_cla.data,
            self.X0_cla.data,
            self.X1_cla.data,
            self.X2_cla.data,
            self.X3_cla.data,
            self.X4_cla.data,
            self.N,
        )
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Et.shape, None)
        ev.wait()

        krn = self.progs.progs["normEsig"].normEsig
        krn.set_scalar_arg_dtypes((None, None, np.int32))
        krn.set_args(self.Esig_t_tau_p_cla.data, self.Esig_t_tau_norm_cla.data, self.N)
        ev = cl.enqueue_nd_range_kernel(self.q, krn, self.Esig_t_tau_p.shape, None)
        ev.wait()
        mx = cla.max(self.Esig_t_tau_norm_cla).get() * self.N * self.N

        #         Esig_t_tau = self.Esig_t_tau_p_cla.get()
        #         mx = ((Esig_t_tau*Esig_t_tau.conj()).real).max() * self.N*self.N

        X0 = cla.sum(self.X0_cla, queue=self.q).get() / mx
        X1 = cla.sum(self.X1_cla, queue=self.q).get() / mx
        X2 = cla.sum(self.X2_cla, queue=self.q).get() / mx
        X3 = cla.sum(self.X3_cla, queue=self.q).get() / mx
        X4 = cla.sum(self.X4_cla, queue=self.q).get() / mx

        root.debug("".join(("X0=", str(X0), ", type ", str(type(X0)))))

        root.debug(
            "".join(("Poly: ", str(X4), " x^4 + ", str(X3), " x^3 + ", str(X2), " x^2 + ", str(X1), " x + ", str(X0)))
        )
        # Polynomial in dZ (expansion of differential)
        X = np.array([X0, X1, X2, X3, X4]).astype(np.double)

        root.debug("".join(("Esig_t_tau_p norm max: ", str(mx / (self.N * self.N)))))

        return X
예제 #59
0
def prepare_device_memory(queue, kernels, buffers, flock):
    """
        Initializes device memory and transfers
        the first flock from host to the device.
    """
    print("Initializing the memory and transferring the first flock.")
    intermediary_events = [cl.enqueue_nd_range_kernel(
        queue, kernels["k_init_memory"], [1], [1]),
        cl.enqueue_copy(queue, buffers["global_generated_flocks"], flock.np_arrays)]
    return intermediary_events
예제 #60
0
파일: filter.py 프로젝트: Kobtul/documents
 def prefixSumDownInplace(self, e, data, ndata, events):
     import numpy as np
     import pyopencl as cl
     mf = cl.mem_flags
     
     if not isinstance(data, cl.Buffer):
         data_buf = cl.Buffer(self.ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)
     else:
         data_buf = data
     
     grid_dims = self.get_grid_dims(ndata)
     psumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
     npsumbytes =  np.uint64(0).nbytes
     
     psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
     npsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, npsumbytes)
     
     kernel = self.prg.prefixSumDownInplace
     kernel.set_args(data_buf, np.uint64(ndata), psum_buf, npsum_buf)
     
     global_dims = self.get_global(grid_dims)
     
     print "prefixSumDownInplace %s %s %d %d" % (str(global_dims), str(self.localDims), ndata, psumbytes)
     if e is None:
         e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims), )
     else:
         # chain the kernel onto the previously enqueued events
         e  = ( cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims, self.localDims, wait_for=e), )
     events += e
     
     npsum = np.zeros(1, dtype = np.uint64)
     events += (cl.enqueue_copy(self.queue, npsum, npsum_buf, wait_for=e),)
     
     if npsum>1:
         (e, psum_buf, psum1_buf, npsum1_buf, ndata2) = self.prefixSumDownInplace(e, psum_buf, npsum.item(), events)
     else:
         ndata2 = np.zeros(1, dtype = np.uint64)
         events += (cl.enqueue_copy(self.queue, ndata2, psum_buf, wait_for=e),)
         ndata2 = ndata2.item()
         print ndata2
     
     self.prefixSumUp(e, data_buf, ndata, psum_buf, npsum, events)
     
     return (e, data_buf, psum_buf, npsum_buf, ndata2)