示例#1
0
    def toGpu(self, data):
        import pyopencl as cl

        gpu_buf = self.allocate(data.shape, data.dtype)
        queue = self._createQueue()
        cl.enqueue_copy(queue, gpu_buf, data, is_blocking=True)
        return gpu_buf
示例#2
0
  def __call__( self, forward, backward, costs, features, origin, h ):
    mf = cl.mem_flags

    w_features = features.astype( np.float64 )
    cummulated = costs * 1

    forward_buffer  = cl.Buffer( self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = forward )
    backward_buffer = cl.Buffer( self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = backward )
    cost_buffer     = cl.Buffer( self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = costs )
    cumm_buffer     = cl.Buffer( self.context, mf.READ_WRITE, costs.nbytes )
    feature_buffer  = cl.Buffer( self.context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf = w_features )
    origin_buffer   = cl.Buffer( self.context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = origin )

    self.henryCummulated.cummulated1( 
      self.queue, costs.shape, None, 
      np.int32( w_features.shape[2] ), np.int32( w_features.shape[1] ),
      origin_buffer, np.int32( h ),
      self.idirection_buffer,
      forward_buffer, backward_buffer, cost_buffer, cumm_buffer
    )

    self.henryCummulated.cummulated2( 
      self.queue, costs.shape, None, 
      np.int32( w_features.shape[2] ), np.int32( w_features.shape[1] ),
      np.int32( features.shape[3] ),
      cumm_buffer, feature_buffer
    )

    cl.enqueue_copy( self.queue, cummulated, cumm_buffer )
    cl.enqueue_copy( self.queue, w_features, feature_buffer )
    return cummulated, w_features
示例#3
0
def test_enqueue_task(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    prg = cl.Program(ctx, """
    __kernel void
    reverse(__global const float *in, __global float *out, int n)
    {
        for (int i = 0;i < n;i++) {
            out[i] = in[n - 1 - i];
        }
    }
    """).build()
    knl = prg.reverse

    n = 100
    a = np.random.rand(n).astype(np.float32)
    b = np.empty_like(a)

    buf1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    buf2 = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    knl.set_args(buf1, buf2, np.int32(n))
    cl.enqueue_task(queue, knl)

    cl.enqueue_copy(queue, b, buf2).wait()
    assert la.norm(a[::-1] - b) == 0
示例#4
0
    def fromGpu(self, gpu_buf, target_shape, target_dtype):
        import pyopencl as cl

        data = numpy.empty(target_shape, target_dtype)
        queue = self._createQueue()
        cl.enqueue_copy(queue, data, gpu_buf, is_blocking=True)
        return data
示例#5
0
文件: pycl.py 项目: ufo-kit/pina
    def run(self, kernel, shape, *args):
        kargs = []

        for arg in args:
            if isinstance(arg, np.ndarray):
                if id(arg) in self.buffers:
                    buf = self.buffers[id(arg)]
                    cl.enqueue_copy(self.runtime.queues[0], buf, arg)
                else:
                    flags = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
                    buf = cl.Buffer(self.runtime.context, flags, arg.nbytes, hostbuf=arg)
                    self.buffers[id(arg)] = buf

                kargs.append(buf)
            else:
                kargs.append(np.float32(arg))

        # TODO: use user-supplied information if necessary
        first_np_array = [a for a in args if isinstance(a, np.ndarray)][0]
        workspace = shape if shape else first_np_array.shape

        if self.output is None:
            self.output = np.empty(workspace).astype(np.float32)
            out_buffer = cl.Buffer(self.runtime.context, cl.mem_flags.WRITE_ONLY, self.output.nbytes)
            self.buffers[id(self.output)] = out_buffer
        else:
            out_buffer = self.buffers[id(self.output)]

        kargs.append(out_buffer)

        start = time.time()
        kernel(self.runtime.queues[0], workspace, None, *kargs)
        cl.enqueue_copy(self.runtime.queues[0], self.output, out_buffer)
        self.time = time.time() - start
        return self.output
示例#6
0
    def runTest(self):
        nx, ny, nz, str_f, pt0, pt1 = self.args

        slidx = common.slice_index_two_points(pt0, pt1)
        str_fs = common.convert_to_tuple(str_f)

        # instance
        gpu_devices = common_gpu.gpu_device_list(print_info=False)
        context = cl.Context(gpu_devices)
        device = gpu_devices[0]
        fields = Fields(context, device, nx, ny, nz, '')
        getf = GetFields(fields, str_f, pt0, pt1) 
        
        # host allocations
        eh_dict = {}
        for sf in str_fs:
            eh_dict[sf] = np.random.rand(*fields.ns).astype(fields.dtype)
            cl.enqueue_copy(fields.queue, fields.get_buf(sf), eh_dict[sf])

        # verify
        getf.get_event().wait()

        for str_f in str_fs:
            original = eh_dict[str_f][slidx]
            copy = getf.get_fields(str_f)
            self.assertEqual(np.abs(eh_dict[str_f][slidx] - getf.get_fields(str_f)).max(), 0, self.args)
示例#7
0
文件: types.py 项目: Aerojspark/PyFR
    def _set(self, ary):
        # Allocate a new buffer with suitable padding and assign
        buf = np.zeros(self.datashape, dtype=self.dtype)
        buf[...,:self.ioshape[-1]] = ary

        # Copy
        cl.enqueue_copy(self.backend.qdflt, self.data, buf)
示例#8
0
文件: types.py 项目: pv101/PyFR
    def _set(self, ary):
        # Allocate a new buffer with suitable padding and pack it
        buf = np.zeros((self.nrow, self.leaddim), dtype=self.dtype)
        buf[:, :self.ncol] = self._pack(ary)

        # Copy
        cl.enqueue_copy(self.backend.qdflt, self.data, buf)
 def generate(self, chunk_array, ctx, queue, heightmap_kernel):
     assert isinstance(chunk_array, ChunkArray)
     hmap = self._generate_hmap()
     x_bounds = (0, 8)
     y_bounds = (0, 8)
     for z in range(1):
         chunk_array.allocate_layer(z, x_bounds, y_bounds)
         for x in range(8):
             for y in range(8):
                 chunk_array.allocate_chunk(x, y, z, level=0)
     print("allocated!")
     ihmap = numpy.empty((256,256), dtype=numpy.int32)
     for x in range(256):
         for y in range(256):
             height = hmap[x, y]
             ihmap[x, y] = int(max(min(height*7.4 + 8, 32), 0))
     """for x in range(256):
         print(x)
         for y in range(256):
             height = hmap[x, y]
             z_max = int(max(min(height + 8, 32), 0))
             for z in range(32):
                 voxel = chunk_array.get_voxel(x, y, z)
                 voxel['flags'] = 0 if z_max < z else 1"""
     chunk_array.upload_buffers()
     buffer = pyopencl.Buffer(ctx, pyopencl.mem_flags.READ_ONLY|pyopencl.mem_flags.COPY_HOST_PTR, hostbuf = ihmap)
     #pyopencl.enqueue_copy(queue, buffer, hmap)
     heightmap_kernel(queue, (255, 255, 32), None, chunk_array.array_buffer._d_buffer, buffer)
     pyopencl.enqueue_copy(queue, chunk_array.voxel_data.level_buffers[0]._h_buffer, chunk_array.voxel_data.level_buffers[0]._d_buffer)
     chunk_array.upload_buffers()
示例#10
0
def to_host(queue, data, dtype, start, shape, elemstrides, is_blocking=True):
    """Copy memory off the device, into a Numpy array.

    If the requested array is discontiguous, the whole block is copied off
    the device, and a view is created to show the appropriate part.
    """
    if min(elemstrides) < 0:
        raise NotImplementedError()

    m, n = shape
    sm, sn = elemstrides
    if m * n == 0:
        return np.zeros(shape, dtype=dtype)

    itemsize = dtype.itemsize
    bytestart = itemsize * start
    bytelen = itemsize * ((m-1)*sm + (n-1)*sn + 1)

    temp_buf = np.zeros(bytelen, dtype=np.int8)
    cl.enqueue_copy(queue, temp_buf, data,
                    device_offset=bytestart, is_blocking=is_blocking)

    bytestrides = (itemsize * sm, itemsize * sn)
    return np.ndarray(shape=(m, n), dtype=dtype, buffer=temp_buf.data,
                      offset=0, strides=bytestrides)
示例#11
0
def calc_range(start, num, perexec):
	"""Calculate the otp-md5 of the 64-bit numbers range(start, num),
	   with otp sequence of rounds."""

	assert(num % perexec == 0)

	# Boilerplate OpenCL stuff
	ctx = cl.create_some_context()
	queue = cl.CommandQueue(ctx)
	mf = cl.mem_flags

	# Read the program source and compile
	sourcecode = open("otpmd5.cl").read()
	prg = cl.Program(ctx, sourcecode).build()

	for i in xrange(num / perexec):
		offset = start + (perexec * i)

		host_input = numpy.arange(offset, offset+perexec, dtype=numpy.uint64)
		result = numpy.empty_like(host_input)
		dev_input = cl.Buffer(ctx, mf.READ_ONLY | mf.USE_HOST_PTR, hostbuf=host_input)
		dev_output = cl.Buffer(ctx, mf.READ_WRITE, size=result.size * result.itemsize)
		prg.get_otpmd5_64k_rounds(queue, host_input.shape, None, dev_input, dev_output).wait()
		cl.enqueue_copy(queue, result, dev_output).wait()
		send_output(host_input, result)
示例#12
0
    def __call__(self, call_details, values, cutoff):
        # type: (CallDetails, np.ndarray, np.ndarray, float) -> np.ndarray
        context = self.queue.context
        # Arrange data transfer to card
        details_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=call_details.buffer)
        values_b = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR,
                             hostbuf=values)

        # Call kernel and retrieve results
        step = 100
        #print("calling OpenCL")
        for start in range(0, call_details.pd_prod, step):
            stop = min(start+step, call_details.pd_prod)
            args = [
                np.uint32(self.q_input.nq), np.int32(start), np.int32(stop),
                details_b, values_b, self.q_input.q_b, self.result_b,
                self.real(cutoff),
            ]
            self.kernel(self.queue, self.q_input.global_size, None, *args)
        cl.enqueue_copy(self.queue, self.result, self.result_b)

        # Free buffers
        for v in (details_b, values_b):
            if v is not None: v.release()

        return self.result[:self.q_input.nq]
示例#13
0
    def __setitem__(self, item, new_value):
        if isinstance(item, slice) or is_iterable(item):
            raise NotImplementedError('TODO')
        else:
            m, n = self.shape0s[item], self.shape1s[item]
            sm, sn = self.stride0s[item], self.stride1s[item]

            if (sm, sn) in [(1, m), (n, 1)]:
                # contiguous
                clarray = self.getitem_device(item)
                if isinstance(new_value, np.ndarray):
                    array = np.asarray(new_value, order='C', dtype=self.dtype)
                else:
                    array = np.zeros(clarray.shape, dtype=clarray.dtype)
                    array[...] = new_value

                array.shape = clarray.shape  # reshape to avoid warning
                assert equal_strides(
                    array.strides, clarray.strides, clarray.shape)
                clarray.set(array)
            else:
                # discontiguous
                #   Copy a contiguous region off the device that surrounds the
                #   discontiguous, set the appropriate values, and copy back
                s = self.starts[item]
                array = to_host(self.queue, self.cl_buf.data, self.dtype,
                                s, (m, n), (sm, sn), is_blocking=True)
                array[...] = new_value

                buf = array.base if array.base is not None else array
                bytestart = self.dtype.itemsize * s
                cl.enqueue_copy(self.queue, self.cl_buf.data, buf,
                                device_offset=bytestart, is_blocking=True)
示例#14
0
def test_sub_buffers(ctx_factory):
    ctx = ctx_factory()
    if (ctx._get_cl_version() < (1, 1) or
            cl.get_cl_header_version() < (1, 1)):
        from pytest import skip
        skip("sub-buffers are only available in OpenCL 1.1")

    alignment = ctx.devices[0].mem_base_addr_align

    queue = cl.CommandQueue(ctx)

    n = 30000
    a = (np.random.rand(n) * 100).astype(np.uint8)

    mf = cl.mem_flags
    a_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a)

    start = (5000 // alignment) * alignment
    stop = start + 20 * alignment

    a_sub_ref = a[start:stop]

    a_sub = np.empty_like(a_sub_ref)
    cl.enqueue_copy(queue, a_sub, a_buf[start:stop])

    assert np.array_equal(a_sub, a_sub_ref)
示例#15
0
def final(config, ctx, queue, program, buffers, debug=False):
    matrixSize = config['matrixSize']
    bandwidth = config['bandwidth']
    partitionNumber = config['partitionNumber']
    partitionSize = config['partitionSize']
    offdiagonalSize = config['offdiagonalSize']
    rhsSize = config['rhsSize']

    xo  = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)
    tmp = np.ones((partitionNumber * (partitionSize - 2 * offdiagonalSize), rhsSize), dtype=np.float32)

    mf = cl.mem_flags
    xo_buf  = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=xo)
    tmp_buf = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=tmp)

    kernel = program.reconstruct
    kernel.set_scalar_arg_dtypes([None, None, None, None, np.int32, np.int32, np.int32])

    cl.enqueue_barrier(queue)

    kernel(
        queue,
        (partitionNumber,),
        None,
        buffers[1], # Avwg buffer from factor, see if it is also readable and still valide
        buffers[3], # x buffer from solve, see if it is still valide
        xo_buf,
        tmp_buf,
        np.int32(partitionSize),
        np.int32(offdiagonalSize),
        np.int32(rhsSize)
    )

    xtb = np.ones((partitionNumber * 2 * offdiagonalSize, rhsSize), dtype=np.float32)
    cl.enqueue_copy(queue, xtb, buffers[3])

    if (debug) :
        print "X(t,b):"
        print xtb

    cl.enqueue_copy(queue, xo, xo_buf)

    if (debug) :
        print "X':"
        print xo

    xtb = sparse.csr_matrix(xtb)
    xo = sparse.csr_matrix(xo)

    x = []
    for i in range(0, partitionNumber) :
        t = i * (2 * offdiagonalSize)
        b = (i + 1) * (2 * offdiagonalSize)
        u = i * (partitionSize - 2 * offdiagonalSize)
        v = (i + 1) * (partitionSize - 2 * offdiagonalSize)
        x.append(xtb[t : t + offdiagonalSize, 0 : rhsSize])
        x.append(xo[u : v, 0 : rhsSize])
        x.append(xtb[b - offdiagonalSize : b, 0 : rhsSize])

    return sp.sparse.vstack(x)
示例#16
0
 def compute(self, simulations, window):
     simulationsOpenCL=simulations.reshape(-1, order = 'C').astype(np.float32)
     
     globalSize=(int(self.sizes['reshaped0']), int(self.sizes['reshaped1']))
     localSize=(int(self.openCL.workGroup[0]), int(self.openCL.workGroup[1]))
     
     mf = cl.mem_flags
     base=np.float32(2)
     chromosomeLength=np.int32(self.sizes['simulationsLength'])
     lim0=np.int32(self.sizes['originalSimulations'])
     lim1=np.int32(self.sizes['originalSimulations'])
     window=np.int32(window)
     simulationsBuffer = cl.Buffer(self.openCL.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=simulationsOpenCL)
     outBuffer = cl.Buffer(self.openCL.ctx, mf.WRITE_ONLY, int((self.sizes['originalSimulations']**2)*np.int32(1).nbytes))
     
     kernel=self.openCL.prg.phenCrowding
     
     kernel(self.openCL.queue, globalSize, localSize,
            base, chromosomeLength, window, lim0, lim1,
            simulationsBuffer,
            outBuffer)
     
     crowding = np.zeros((self.sizes['originalSimulations']**2,)).astype(np.float32)
     cl.enqueue_copy(self.openCL.queue, crowding, outBuffer)
     crowding=np.reshape(crowding, (self.sizes['originalSimulations'], -1), order='F')
     
     return crowding
示例#17
0
    def likelihood(self, outcomes, modelparams, expparams):
        # By calling the superclass implementation, we can consolidate
        # call counting there.
        super(AcceleratedPrecessionModel, self).likelihood(outcomes, modelparams, expparams)
        
        # Possibly add a second axis to modelparams.
        if len(modelparams.shape) == 1:
            modelparams = modelparams[..., np.newaxis]
        
        # Convert to float32 if needed.
        mps = modelparams.astype(np.float32)
        eps = expparams.astype(np.float32)

        # Allocating a buffer for the pr0 returns.
        pr0 = np.empty((mps.shape[0], eps.shape[0]), dtype=mps.dtype)

        # Move buffers to the GPU.
        mf = cl.mem_flags
        
        mps_buf = cl.Buffer(self._ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mps)
        eps_buf = cl.Buffer(self._ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=eps)
        dest_buf = cl.Buffer(self._ctx, mf.WRITE_ONLY, pr0.nbytes)

        # Run the kernel with global worksize (n_models, n_experiments).
        self._prg.cos_model(self._queue, pr0.shape, None, np.int32(eps.shape[0]), mps_buf, eps_buf, dest_buf)

        # Copy the buffer back from the GPU and free memory there.
        cl.enqueue_copy(self._queue, pr0, dest_buf)
        mps_buf.release()
        eps_buf.release()
        dest_buf.release()
        
        # Now we concatenate over outcomes.
        return FiniteOutcomeModel.pr0_to_likelihood_array(outcomes, pr0)
示例#18
0
文件: main_new.py 项目: dptph/lat-qcd
   def S_reduction100(self):
        self.program.CalcSREDUCTLEN(self.queue, (self.ThreadsNumb,), (self.Slocalsize,), self.Const_int_buf, self.Const_float_buf, self.Link_buf, self.S_out_of_groups_buf,  cl.LocalMemory(self.Slocalsize*self.Type.itemsize), self.Seed_buf)
	cl.enqueue_copy(self.queue, self.S_out_of_groups_cpu, self.S_out_of_groups_buf).wait()
	r=numpy.zeros(7)
	for i in xrange(len(self.S_out_of_groups_cpu)):
		r[i%7]+=self.S_out_of_groups_cpu[i]
        return r/(self.prop['Ns']**3*self.prop['Nt']*6)
示例#19
0
    def process_data(self, pos, newbuf):
        assert newbuf.shape[0] ==self.chunksize

        if not newbuf.flags['C_CONTIGUOUS']:
            newbuf = newbuf.copy()

        pyopencl.enqueue_copy(self.queue,  self.sigs_cl, newbuf)
        event = self.kern_detect_peaks(self.queue,  (self.chunksize,), (self.max_wg_size,),
                                self.sigs_cl, self.ring_sum_cl, self.peaks_cl)
        event.wait()
        
        if pos-(newbuf.shape[0]+2*self.n_span)<0:
            # the very first buffer is sacrified because of peak span
            return None, None
        
        pyopencl.enqueue_copy(self.queue,  self.peaks, self.peaks_cl)
        ind_peaks,  = np.nonzero(self.peaks)
        
        if ind_peaks.size>0:
            ind_peaks += pos - newbuf.shape[0] - self.n_span
            self.n_peak += ind_peaks.size
            #~ peaks = np.zeros(ind_peaks.size, dtype = [('index', 'int64'), ('code', 'int64')])
            #~ peaks['index'] = ind_peaks
            #~ return self.n_peak, peaks
            return self.n_peak, ind_peaks
        
        return None, None
示例#20
0
    def get_fields(self):
        """
        :return: Returns a dictionary of all fields. Transfers data from the GPU to the CPU.
        """
        f = np.zeros((self.nx, self.ny, NUM_JUMPERS), dtype=np.float32, order='F')
        cl.enqueue_copy(self.queue, f, self.f, is_blocking=True)

        feq = np.zeros((self.nx, self.ny, NUM_JUMPERS), dtype=np.float32, order='F')
        cl.enqueue_copy(self.queue, feq, self.feq, is_blocking=True)

        u = np.zeros((self.nx, self.ny), dtype=np.float32, order='F')
        cl.enqueue_copy(self.queue, u, self.u, is_blocking=True)

        v = np.zeros((self.nx, self.ny), dtype=np.float32, order='F')
        cl.enqueue_copy(self.queue, v, self.v, is_blocking=True)

        rho = np.zeros((self.nx, self.ny), dtype=np.float32, order='F')
        cl.enqueue_copy(self.queue, rho, self.rho, is_blocking=True)

        results={}
        results['f'] = f
        results['u'] = u
        results['v'] = v
        results['rho'] = rho
        results['feq'] = feq
        return results
示例#21
0
    def get_fields(self):
        """
        :return: Returns a dictionary of all fields. Transfers data from the GPU to the CPU.
        """
        f = np.zeros((self.nx, self.ny, self.num_populations + 1, NUM_JUMPERS), dtype=np.float32, order="F")
        cl.enqueue_copy(self.queue, f, self.f, is_blocking=True)

        feq = np.zeros((self.nx, self.ny, self.num_populations + 1, NUM_JUMPERS), dtype=np.float32, order="F")
        cl.enqueue_copy(self.queue, feq, self.feq, is_blocking=True)

        u = np.zeros((self.nx, self.ny), dtype=np.float32, order="F")
        cl.enqueue_copy(self.queue, u, self.u, is_blocking=True)

        v = np.zeros((self.nx, self.ny), dtype=np.float32, order="F")
        cl.enqueue_copy(self.queue, v, self.v, is_blocking=True)

        rho = np.zeros((self.nx, self.ny, self.num_populations + 1), dtype=np.float32, order="F")
        cl.enqueue_copy(self.queue, rho, self.rho, is_blocking=True)

        results = {}
        results["f"] = f
        results["u"] = u
        results["v"] = v
        results["rho"] = rho
        results["feq"] = feq
        return results
示例#22
0
    def eval(self, pars):

        _ctx,queue = card()
        radius, length = \
            [GaussianDispersion(int(pars[base+'_pd_n']), pars[base+'_pd'], pars[base+'_pd_nsigma'])
             for base in OneDGpuCylinder.PD_PARS]

        #Get the weights for each
        radius.value, radius.weight = radius.get_weights(pars['radius'], 0, 10000, True)
        length.value, length.weight = length.get_weights(pars['length'], 0, 10000, True)

        #Perform the computation, with all weight points
        sum, norm, vol = 0.0, 0.0, 0.0,
        sub = pars['sldCyl'] - pars['sldSolv']

        real = np.float32 if self.q.dtype == np.dtype('float32') else np.float64
        #Loop over radius, length, theta, phi weight points
        for r in xrange(len(radius.weight)):
            for l in xrange(len(length.weight)):
                        self.prg.OneDCylKernel(queue, self.q.shape, None, self.q_b, self.res_b, real(sub),
                                           real(length.value[l]), real(radius.value[r]), real(pars['scale']),
                                           np.uint32(self.q.size), real(pars['uplim']), real(pars['bolim']))
                        cl.enqueue_copy(queue, self.res, self.res_b)
                        sum += radius.weight[r]*length.weight[l]*self.res*pow(radius.value[r],2)*length.value[l]
                        vol += radius.weight[r]*length.weight[l] *pow(radius.value[r],2)*length.value[l]
                        norm += radius.weight[r]*length.weight[l]

        if vol != 0.0 and norm != 0.0:
            sum *= norm/vol

        return sum/norm + pars['background']
示例#23
0
 def lnlikelihood_ocl(self, pv):
     self._lnl2d(pv)
     self.prg_lnl.lnl1d_chunked(self.cl_queue, [self.lnl2d.shape[0], self.cl_lnl_chunks], None,
                                uint32(self.lnl2d.shape[1]), self._b_lnl2d, self._b_lnl1d)
     cl.enqueue_copy(self.cl_queue, self.lnl1d, self._b_lnl1d)
     lnl = self.lnl1d.astype('d').sum(1)
     return lnl
示例#24
0
	def execute(self):
		kernel = self.program.fact
		self.event = kernel(self.queue,[self.a_dim],None,self.d_a_buf,self.d_c_buf)
		self.event.wait()
		cl.enqueue_copy(self.queue, self.h_c, self.d_c_buf)
		print "a", self.h_a
		print "ris", self.h_c
示例#25
0
文件: features.py 项目: adamgreig/iib
def get_edges(clctx, features, reductions, blurs, buf_in, summarise=True):
    """
    Using the *features* and *reductions* programs, and *blurs* program with
    sigma=2.0, find all edge pixels in *buf_in* and return the count.
    """
    gs, wgs = clctx.gs, clctx.wgs
    bufa = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    bufb = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))
    bufc = cl.Image(clctx.ctx, cl.mem_flags.READ_WRITE, clctx.ifmt, (gs, gs))

    blurs.convolve_x(clctx.queue, (gs, gs), (wgs, wgs), buf_in, bufb)
    blurs.convolve_y(clctx.queue, (gs, gs), (wgs, wgs), bufb, bufa)
    blurs.convolve_x(clctx.queue, (gs, gs), (wgs, wgs), bufa, bufc)
    blurs.convolve_y(clctx.queue, (gs, gs), (wgs, wgs), bufc, bufb)

    features.subtract(clctx.queue, (gs, gs), (wgs, wgs), bufb, bufa, bufc)
    features.edges(clctx.queue, (gs, gs), (wgs, wgs), bufc, bufa)
    counts = reduction.run_reduction(clctx, reductions.reduction_sum, bufa)

    if not summarise:
        edges = np.empty((gs, gs, 4), np.float32)
        cl.enqueue_copy(clctx.queue, edges, bufa,
                        origin=(0, 0), region=(gs, gs))

    bufa.release()
    bufb.release()
    bufc.release()

    if summarise:
        return counts
    else:
        return edges
示例#26
0
def to_host(queue, data, dtype, start, shape, elemstrides):
    """Copy memory off the device, into a Numpy array"""

    m, n = shape
    Sm, Sn = elemstrides
    if m * n == 0:
        return np.zeros(shape, dtype=dtype)

    if min(elemstrides) < 0:
        raise NotImplementedError()

    itemsize = dtype.itemsize
    bytestart = itemsize * start
    # -- TODO: is there an extra element transferred here?
    byteend = bytestart + itemsize * ((m-1) * Sm + (n-1) * Sn + 1)

    temp_buf = np.zeros((byteend - bytestart), dtype=np.int8)
    cl.enqueue_copy(queue, temp_buf, data,
                    device_offset=bytestart, is_blocking=True)

    bytestrides = (itemsize * Sm, itemsize * Sn)
    try:
        view = np.ndarray(
            shape=(m, n),
            dtype=dtype,
            buffer=temp_buf.data,
            offset=0,
            strides=bytestrides)
    except:
        raise
    return view
示例#27
0
    def __init__(self):
        t_np = np.arange(0, 100000000, dtype=np.float32)

        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)

        self.mf = cl.mem_flags
        self.t_g = cl.Buffer(
            self.ctx,
            self.mf.READ_ONLY | self.mf.COPY_HOST_PTR,
            hostbuf=t_np)

        f = open("ex.cl", "r")
        fstr = "".join(f.readlines())
        f.close()
        self.prg = cl.Program(self.ctx, fstr).build()

        self.res_g = cl.Buffer(self.ctx, self.mf.WRITE_ONLY, t_np.nbytes)
        self.prg.proc(self.queue, t_np.shape, None, self.t_g, self.res_g)

        res_np = np.empty_like(t_np)
        cl.enqueue_copy(self.queue, res_np, self.res_g)

        # Check on CPU with Numpy:
        print(res_np)
        print(np.amax(res_np))
示例#28
0
文件: render.py 项目: temik42/render
 def run(self):
     for ii in range(0,10):
         for jj in range(0,10):
             r = np.random.random([self.nsample,3])
             r[:,0]=(r[:,0]+ii)*0.1
             r[:,1]=(r[:,1]+jj)*0.1
             
             self.X = np.zeros((self.nsample,4), dtype = np.float32)
             self.X[:,0:3] = r
             self.X[:,3] = 1.
             
             self.I = np.zeros((self.nsample,4), dtype = np.float32)
             self.I[:,0:3] = 1.
             #self.I[:,3] = 0.
                             
             cl.enqueue_acquire_gl_objects(self.queue, [self.X_cl,self.I_cl])
             cl.enqueue_copy(self.queue, self.X_cl, self.X)
             cl.enqueue_copy(self.queue, self.I_cl, self.I)
             self.program.Solve(self.queue, (self.nsample, self.na), None, self.A_cl, self.X_cl, self.I_cl, self.alpha)
             cl.enqueue_release_gl_objects(self.queue, [self.X_cl,self.I_cl])
             self.queue.finish()             
             
             self.draw()
     
     self.scrnData = np.zeros((self.width,self.height), dtype = np.float32)
     glReadPixels(0, 0, self.width, self.height, GL_ALPHA, GL_FLOAT, self.scrnData)
     print np.max(self.scrnData)
     scipy.misc.imsave('render.png', np.flipud(self.scrnData))
	def get_color(self, img):
		# OpenCL only supports RGBA images, not RGB, so add an alpha channel
		src = np.array(img.convert('RGBA'))
		src.shape = w, h, _ = img.width, img.height, 4

		w = int(w * self.SCALE_FACTOR)
		h = int(h * self.SCALE_FACTOR)

		local_size = self.max_work_item_sizes
		global_size = (math.ceil(h / local_size[0]), math.ceil(w / local_size[1]))

		total_work_groups = global_size[0] * global_size[1]

		mf = cl.mem_flags
		src_buf = cl.image_from_array(self.ctx, src, 4, norm_int=True)

		out = np.zeros(4 * total_work_groups, dtype=np.int32)
		out_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, size=out.itemsize * 4 * total_work_groups)

		kernel = self.prg.get_color
		kernel.set_scalar_arg_dtypes([None, None, np.uint32, np.uint32])
		kernel(self.queue, global_size, local_size, src_buf, out_buf, w, h, g_times_l=True)

		cl.enqueue_copy(self.queue, dest=out, src=out_buf, is_blocking=True)

		# this sum takes .1 ms at 3440x1440, don't even bother OpenCL-ifying it
		resized_out = np.reshape(out, (out.shape[0] / 4, 4))
		summed_out = np.sum(resized_out, axis=0)

		avg_out = (summed_out / summed_out[3])[:3].astype(int)

		return avg_out
示例#30
0
文件: gpu.py 项目: mschachter/nads
    def update_state(self, cl_context, cl_queue, time):
        del self.next_state
        self.next_state = np.empty(self.num_unit_states, dtype="float32")

        mf = cl.mem_flags
        cl.enqueue_copy(cl_queue, self.next_state, self.next_state_buf)

        # print 'self.next_state:'
        # print list(self.next_state)

        self.state_buf.release()
        del self.state

        self.state = np.zeros([self.total_state_size], dtype="float32")
        self.state[: self.num_unit_states] = self.next_state
        for s in self.streams:
            sval = s.pull(time)
            for sindex in range(s.ndim):
                skey = (s.id, sindex)
                suid = self.stream_uids[skey]
                gpu_index = self.unit2gpu[suid]
                state_index = self.unit_state_index[gpu_index]
                self.state[state_index] = sval[sindex]
        # print 'self.state:'
        # print list(self.state)
        self.state_buf = cl.Buffer(cl_context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.state)
示例#31
0
    gpu_queue = cl.CommandQueue(gpu_context)

size = 1000
a = np.ones(size, dtype=np.float32)
b = np.ones(size, dtype=np.float32) * 2
c = np.zeros_like(a)

#lets first run this on the CPU, so create memory objects for the cpu_context
if has_cpu:
    a_dev = cl.Buffer(cpu_context, cl.mem_flags.READ_WRITE, a.nbytes)
    b_dev = cl.Buffer(cpu_context, cl.mem_flags.READ_WRITE, b.nbytes)
    c_dev = cl.Buffer(cpu_context, cl.mem_flags.READ_WRITE, c.nbytes)

    cpu_program = cl.Program(cpu_context, kernel_source).build()
    #copy memory objects to the device
    cl.enqueue_copy(cpu_queue, a_dev, a, is_blocking=True)
    cl.enqueue_copy(cpu_queue, b_dev, b, is_blocking=True)

    #__call__ (queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_|=False
    cpu_program.sum(cpu_queue, a.shape, None, a_dev, b_dev, c_dev)
    cl.enqueue_copy(cpu_queue, c, c_dev)
    #check results
    python_kernel(a, b, c)

#now run on the GPU
if has_gpu:
    a_dev = cl.Buffer(gpu_context, cl.mem_flags.READ_WRITE, a.nbytes)
    b_dev = cl.Buffer(gpu_context, cl.mem_flags.READ_WRITE, b.nbytes)
    c_dev = cl.Buffer(gpu_context, cl.mem_flags.READ_WRITE, c.nbytes)

    gpu_program = cl.Program(gpu_context, kernel_source).build()
示例#32
0
    def process(self, pipe, image, data):
        import pyopencl as cl

        if not hasattr(self, "is_init"):
            # loop unrolling on mask
            mask = np.array(
                self.get("kernel", [[0, 0, 0], [0, 1, 0], [0, 0, 0]
                                    ])) * self.get("factor", 1.0)

            if mask.shape[0] != mask.shape[1]:
                print("WARNING: mask should be square for convolution!")

            if mask.shape[0] % 2 != 1:
                print("WARNING: mask should be an odd length")

            single_val_codeblock = """
            tx = x + {i};
            ty = y + {j};
            if (tx >= 0 && tx < w && ty >= 0 && ty < h) {{
                tidx = tx + ty * w;
                c = (float)({MASK_VAL});
                nR += c * (float)img_in[3*tidx+0];
                nG += c * (float)img_in[3*tidx+1];
                nB += c * (float)img_in[3*tidx+2];
            }}
            """

            kernel_unfolded = ""

            for i in range(0, mask.shape[0]):
                for j in range(0, mask.shape[1]):
                    if mask[j, i] != 0:
                        cur_cb = single_val_codeblock.format(
                            i=i - mask.shape[0] // 2,
                            j=mask.shape[0] // 2 - j,
                            MASK_VAL=mask[j, i])
                        kernel_unfolded += cur_cb

            #print (kernel_unfolded)

            OPENCL_SRC = """
__kernel void convert(__global __read_only uchar * img_in, __global __write_only uchar * img_out, int w, int h) {{
        int x = get_global_id(0), y = get_global_id(1);
        if (x >= w || y >= h) return;
        int idx = x + y * w;
        // img_in[3*idx+0] = R component at pixel x, y
        // img_in[3*idx+1] = G component at pixel x, y
        // img_in[3*idx+2] = B component at pixel x, y

        // new components
        float nR = 0, nG = 0, nB = 0;

        int tx, ty, tidx;
        float c;

        // AUTOGEN START

        {gen_code}

        // AUTOGEN END

        

        img_out[3 * idx + 0] = (uchar)clamp(nR, 0.0f, 255.0f);
        img_out[3 * idx + 1] = (uchar)clamp(nG, 0.0f, 255.0f);
        img_out[3 * idx + 2] = (uchar)clamp(nB, 0.0f, 255.0f);
    }}
""".format(gen_code=kernel_unfolded)

            #print(OPENCL_SRC)

            mf = cl.mem_flags

            platform = cl.get_platforms()[self.get("opencl_platform", 0)]
            devs = platform.get_devices()[self.get("opencl_device", -1)]
            self.ctx = cl.Context(devs if isinstance(devs, list) else [devs])
            self.queue = cl.CommandQueue(self.ctx)

            # now build the programs
            self.prg = cl.Program(self.ctx, OPENCL_SRC).build()

            self.src_buf = cl.Buffer(self.ctx, mf.READ_ONLY, image.nbytes)
            self.dest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, image.nbytes)
            self.dest = np.empty_like(image)

            # we have initialized
            self.is_init = True

        h, w, _ = image.shape

        # write current image
        cl.enqueue_copy(self.queue, self.src_buf, image)

        self.prg.convert(self.queue, (w, h), None, self.src_buf, self.dest_buf,
                         np.int32(w), np.int32(h))

        # read back image
        cl.enqueue_copy(self.queue, self.dest, self.dest_buf)

        return self.dest, data
示例#33
0
  } 
  else 
  {
  	for (int i = thread_id*k; i < N; i++)
  	  y[thread_id] = y[thread_id] + x[i];
  }
}
"""

program = cl.Program(context, src).build(options='')

#Transfers the data
#hx = np.random.uniform(0, 1, N).astype(np.float32)
hx = np.ones(N).astype(np.float32)
print hx.shape
print W * Ng
hy = np.zeros(N).astype(np.float32)

cl.enqueue_copy(queue, x, hx, is_blocking=False)
event_execute = program.add_vectors(queue, (Ng * W, ), (W, ), x, y)
# print "RESULT B4", hy[W*Ng]
event_copy = cl.enqueue_copy(queue, hy, y, is_blocking=True)
print "RESULT = ", hy[:W * Ng], hx[9999999]
print 'here'
print "SUM RESULT = ", sum(hy[:W * Ng])

print "NUMPY RESULT = ", sum(hx)
#At this point, the queue is not flush. Nothing has been sent for execution.
queue.flush()
#At this point, the queue is not finished. The completion of the operations is not guaranteed!
queue.finish()
示例#34
0
def main():
    if len(sys.argv) != 2:
        _help()
        sys.exit(1)

    device = int(sys.argv[1])

    devices = get_devices()
    context = cl.Context(devices=[devices[device]])
    queue = cl.CommandQueue(context)

    # TODO use these to decide what "shape" to make the work groups
    # and add something which allows those shapes to be replaced in the
    # openCL source code
    max_group = devices[device].max_work_group_size
    max_item = devices[device].max_work_item_sizes

    cl_text = open("memcpy_3d.cl", "r").read().replace("LOCAL_SIZE", "256")
    program = cl.Program(context, cl_text).build()
    memcpy_3d = program.memcpy_3d

    memcpy_3d.set_scalar_arg_dtypes(
        [
            None,
            np.int32,
            np.int32,
            np.int32,
            None,
        ]
    )

    shape = (32, 512, 1028)
    size = shape[0] * shape[1] * shape[2]

    mem_in = np.random.randint(0, 256, size=size, dtype=np.uint16).reshape(shape)

    _mem_in = cl.Buffer(
        context, cl.mem_flags.READ_ONLY, mem_in.size * np.dtype(mem_in.dtype).itemsize
    )
    _mem_out = cl.Buffer(
        context, cl.mem_flags.WRITE_ONLY, mem_in.size * np.dtype(mem_in.dtype).itemsize
    )

    mem_out = np.zeros(shape=mem_in.shape, dtype=mem_in.dtype)

    cl.enqueue_copy(queue, _mem_in, mem_in)

    # work must be a multiple of group size
    group = (1, 12, 16)
    work = tuple(int(group[d] * np.ceil(shape[d] / group[d])) for d in (0, 1, 2))
    print(f"{shape} -> {work}")
    evt = memcpy_3d(
        queue,
        work,
        group,
        _mem_in,
        shape[0],
        shape[1],
        shape[2],
        _mem_out,
    )
    evt.wait()

    cl.enqueue_copy(queue, mem_out, _mem_out)

    assert np.array_equal(mem_in, mem_out)
示例#35
0
        //sort the columns
        sort( &(pixel00), &(pixel10), &(pixel20) );
        sort( &(pixel01), &(pixel11), &(pixel21) );
        sort( &(pixel02), &(pixel12), &(pixel22) );
        //sort the diagonal
        sort( &(pixel00), &(pixel11), &(pixel22) );
        // median is the the middle value of the diagonal
        result[i] = pixel11;
    }
}
'''

#Kernel function instantiation
prg = cl.Program(ctx, src).build()
#Allocate memory for variables on the device
img_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=img)
result_g = cl.Buffer(ctx, mf.WRITE_ONLY, img.nbytes)
width_g = cl.Buffer(ctx,
                    mf.READ_ONLY | mf.COPY_HOST_PTR,
                    hostbuf=np.int32(img.shape[1]))
height_g = cl.Buffer(ctx,
                     mf.READ_ONLY | mf.COPY_HOST_PTR,
                     hostbuf=np.int32(img.shape[0]))
# Call Kernel. Automatically takes care of block/grid distribution
prg.medianFilter(queue, img.shape, None, img_g, result_g, width_g, height_g)
result = np.empty_like(img)
cl.enqueue_copy(queue, result, result_g)

# Show the blurred image
imsave('medianFilter-OpenCL.jpg', result)
示例#36
0
def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
    """
    Test 2D sub-array (slice) copy.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    ary_in_shp = 256, 128  # Entire array shape from which sub-array copied to device
    sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19  # Origin of sub-array copy from device to host-array
    ary_out_shp = 512, 256  # Entire host-array shape copy sub-array device->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200  # shape of device buffer

    buf_out_origin = 31, 17  # Origin of 2nd device buffer
    buf_out_shp = 300, 400  # shape of 2nd device buffer

    # Create host array of random values.
    h_ary_in = \
        np.array(
            np.random.randint(
                0,
                256,
                np.product(ary_in_shp)
            ),
            dtype=np.uint8
        ).reshape(ary_in_shp)

    # Create device buffers
    d_in_buf = cl.Buffer(ctx,
                         cl.mem_flags.READ_ONLY,
                         size=np.product(buf_in_shp))
    d_out_buf = cl.Buffer(ctx,
                          cl.mem_flags.READ_ONLY,
                          size=np.product(buf_out_shp))

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(queue,
                    d_in_buf,
                    h_ary_in,
                    buffer_origin=buf_in_origin[::-1],
                    host_origin=ary_in_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_in_shp[-1], ),
                    host_pitches=(ary_in_shp[-1], ))
    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(queue,
                    d_out_buf,
                    d_in_buf,
                    src_origin=buf_in_origin[::-1],
                    dst_origin=buf_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    src_pitches=(buf_in_shp[-1], ),
                    dst_pitches=(buf_out_shp[-1], ))

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(queue,
                    h_ary_out,
                    d_out_buf,
                    buffer_origin=buf_out_origin[::-1],
                    host_origin=ary_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_out_shp[-1], ),
                    host_pitches=(ary_out_shp[-1], ))
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.all(h_ary_in[ary_in_slice] == h_ary_out[ary_out_slice])
示例#37
0
def copyResult(queue, result, outResult):
    cl.enqueue_copy(queue, result, outResult)
示例#38
0
print("Executing computation")
#prg.sum(queue, a_np.shape, None, a_g, b_g, res_g)
knl = prg.sum
knl.set_args(a_g, b_g, res_g)
local_work_size = None
#local_work_size = (10,)

t0 = time.perf_counter_ns()
ev = cl.enqueue_nd_range_kernel(queue=queue, kernel=knl, global_work_size=(vector_size//float_vector_size,), local_work_size=local_work_size)
t0_enqueue = time.perf_counter_ns()
ev.wait()
t1 = time.perf_counter_ns()

print("Transferring result to host")
t2 = time.perf_counter_ns()
cl.enqueue_copy(queue, res_np, res_g)
t3 = time.perf_counter_ns()

# Check on CPU with Numpy:
print("Computing on the host using numpy")
t4 = time.perf_counter_ns()
res_local = a_np + b_np
t5 = time.perf_counter_ns()
print("Local type:", res_local.dtype)

print("---------------------------------------------------------------------------")
print("Comparing results")
print("Difference   : {}".format(res_np - res_local))
print("A            : {}".format(a_np))
print("B            : {}".format(b_np))
print("Result OpenCL: {}".format(res_np))
示例#39
0
    def evaluate(self, k: Union[float, ndarray], ldc: ndarray, t0: Union[float, ndarray], p: Union[float, ndarray],
                 a: Union[float, ndarray], i: Union[float, ndarray], e: Union[float, ndarray] = None,
                 w: Union[float, ndarray] = None, copy: bool = True) -> ndarray:
        """Evaluate the transit model for a set of scalar or vector parameters.

        Parameters
        ----------
        k
            Radius ratio(s) either as a single float, 1D vector, or 2D array.
        ldc
            Limb darkening coefficients as a 1D or 2D array.
        t0
            Transit center(s) as a float or a 1D vector.
        p
            Orbital period(s) as a float or a 1D vector.
        a
            Orbital semi-major axis (axes) divided by the stellar radius as a float or a 1D vector.
        i
            Orbital inclination(s) as a float or a 1D vector.
        e : optional
            Orbital eccentricity as a float or a 1D vector.
        w : optional
            Argument of periastron as a float or a 1D vector.

        Notes
        -----
        The model can be evaluated either for one set of parameters or for many sets of parameters simultaneously. In
        the first case, the orbital parameters should all be given as floats. In the second case, the orbital parameters
        should be given as a 1D array-like.

        Returns
        -------
        ndarray
            Modelled flux either as a 1D or 2D ndarray.
        """
        npv = 1 if isinstance(t0, float) else len(t0)
        k = asarray(k)

        if k.size == 1:
            nk = 1
        elif npv == 1:
            nk = k.size
        else:
            nk = k.shape[1]

        if e is None:
            e, w = 0.0, 0.0

        pvp = empty((npv, nk + 6), dtype=float32)
        pvp[:, :nk] = k
        pvp[:, nk] = t0
        pvp[:, nk + 1] = p
        pvp[:, nk + 2] = a
        pvp[:, nk + 3] = i
        pvp[:, nk + 4] = e
        pvp[:, nk + 5] = w

        ldc = atleast_2d(ldc).astype(float32)
        self.npv = uint32(pvp.shape[0])
        self.spv = uint32(pvp.shape[1])

        # Release and reinitialise the GPU buffers if the sizes of the time or
        # limb darkening coefficient arrays change.
        if (ldc.size != self.u.size) or (pvp.size != self.pv.size):
            assert self.npb == ldc.shape[1] // 2

            if self._b_f is not None:
                self._b_f.release()
                self._b_u.release()
                self._b_p.release()

            self.pv = zeros(pvp.shape, float32)
            self.u = zeros((self.npv, 2 * self.npb), float32)
            self.f = zeros((self.npv, self.nptb), float32)

            mf = cl.mem_flags
            self._b_f = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.time.nbytes * self.npv)
            self._b_u = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.u)
            self._b_p = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.pv)

        # Copy the limb darkening coefficient array to the GPU
        cl.enqueue_copy(self.queue, self._b_u, ldc)

        # Copy the parameter vector to the GPU
        self.pv[:] = pvp
        cl.enqueue_copy(self.queue, self._b_p, self.pv)

        self.prg.ma_eccentric_pop(self.queue, (self.npv, self.nptb), None, self._b_time, self._b_lcids, self._b_pbids,
                                  self._b_p, self._b_u,
                                  self._b_ed, self._b_le, self._b_ld, self._b_nsamples, self._b_etimes,
                                  self.k0, self.k1, self.nk, self.nz, self.dk, self.dz,
                                  self.spv, self.nlc, self.npb, self._b_f)

        if copy:
            cl.enqueue_copy(self.queue, self.f, self._b_f)
            return squeeze(self.f)
        else:
            return None
示例#40
0
FILE_NAME="krak.cl"
f=open(FILE_NAME,"r")
SRC = ''.join(f.readlines())
f.close()

prg = cl.Program(ctx, SRC).build()

print("\n\nCracking the following data for a challenge:")
print(a)

# launch the kernel
event = prg.krak(queue, a.shape, None, a_dev, s)
event.wait()
 
# copy the output from the context to the Python process
cl.enqueue_copy(queue, a, a_dev)
 
# if everything went fine, b should contain squares of integers
print("CL returned:")
print(a)

if(a[0] == 0xdfd05a8b899b6000):
  print("[OK] Distinguished point search over colors 2-7 passed, result=0xdfd05a8b899b6000")
if(a[12] == 0x3e248e031efda051):
  print("[OK] Key in table in color 2 found, result=0x3e248e031efda051")

print("Dame brmlab? Dame deku?")


#prg = cl.Program(ctx, SRC).build()
#krak = prg.krak
示例#41
0
# actual benchmark ------------------------------------------------------------
t1 = time()

count = 20
for i in range(count):
    event = kernel(queue, h_c.shape[::-1], (block_size, block_size), d_c_buf,
                   d_a_buf, d_b_buf)

event.wait()

gpu_time = (time() - t1) / count

# transfer device -> host -----------------------------------------------------
t1 = time()
cl.enqueue_copy(queue, h_c, d_c_buf)
pull_time = time() - t1

# timing output ---------------------------------------------------------------
gpu_total_time = gpu_time + push_time + pull_time

print "GPU push+compute+pull total [s]:", gpu_total_time
print "GPU push [s]:", push_time
print "GPU pull [s]:", pull_time
print "GPU compute (host-timed) [s]:", gpu_time
print "GPU compute (event-timed) [s]: ", (event.profile.end -
                                          event.profile.start) * 1e-9

gflop = h_c.size * (a_width * 2.) / (1000**3.)
gflops = gflop / gpu_time
示例#42
0
    def evaluate_pv(self, pvp: ndarray, ldc: ndarray, copy: bool = True):
        """Evaluate the transit model for 2D parameter array.

         Parameters
         ----------
         pvp
             Parameter array with a shape `(npv, npar)` where `npv` is the number of parameter vectors, and each row
             contains a set of parameters `[k, t0, p, a, i, e, w]`. The radius ratios can also be given per passband,
             in which case the row should be structured as `[k_0, k_1, k_2, ..., k_npb, t0, p, a, i, e, w]`.
         ldc
             Limb darkening coefficient array with shape `(npv, 2*npb)`, where `npv` is the number of parameter vectors
             and `npb` is the number of passbands.

         Notes
         -----
         This version of the `evaluate` method is optimized for calculating several models in parallel, such as when
         using *emcee* for MCMC sampling.

         Returns
         -------
         ndarray
             Modelled flux either as a 1D or 2D ndarray.
         """
        pvp = atleast_2d(pvp)
        ldc = atleast_2d(ldc).astype(float32)
        self.npv = uint32(pvp.shape[0])
        self.spv = uint32(pvp.shape[1])

        if pvp.shape[0] != ldc.shape[0]:
            raise ValueError("The parameter array and the ldc array have incompatible dimensions.")

        # Release and reinitialise the GPU buffers if the sizes of the time or
        # limb darkening coefficient arrays change.
        if (ldc.size != self.u.size) or (pvp.size != self.pv.size):
            assert self.npb == ldc.shape[1] // 2

            if self._b_f is not None:
                self._b_f.release()
                self._b_u.release()
                self._b_p.release()

            self.pv = zeros(pvp.shape, float32)
            self.u = zeros((self.npv, 2 * self.npb), float32)
            self.f = zeros((self.npv, self.nptb), float32)

            mf = cl.mem_flags
            self._b_f = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.time.nbytes * self.npv)
            self._b_u = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.u)
            self._b_p = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.pv)

        # Copy the limb darkening coefficient array to the GPU
        cl.enqueue_copy(self.queue, self._b_u, ldc)

        # Copy the parameter vector to the GPU
        self.pv[:] = pvp
        cl.enqueue_copy(self.queue, self._b_p, self.pv)

        self.prg.ma_eccentric_pop(self.queue, (self.npv, self.nptb), None, self._b_time, self._b_lcids, self._b_pbids,
                                  self._b_p, self._b_u,
                                  self._b_ed, self._b_le, self._b_ld, self._b_nsamples, self._b_etimes,
                                  self.k0, self.k1, self.nk, self.nz, self.dk, self.dz,
                                  self.spv, self.nlc, self.npb, self._b_f)

        if copy:
            cl.enqueue_copy(self.queue, self.f, self._b_f)
            return squeeze(self.f)
        else:
            return None
示例#43
0
def update_opencl():
    global u
    u0 = u[0,:]
    u1 = u[1,:]
    e0     = np.zeros(imax-1).astype(np.float32)
    e1     = np.zeros(imax-1).astype(np.float32)
    res0   = np.zeros(imax  ).astype(np.float32)
    res1   = np.zeros(imax  ).astype(np.float32)

    a = .5 * (cp + cm)
    b = .5 * (cp - cm)
    u0_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=u0)
    u1_g = cl.Buffer(ctx, mf.COPY_HOST_PTR, hostbuf=u1)
    e0_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=e0)
    e1_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=e1)
    res0_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=res0)
    res1_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=res1)

    prg = cl.Program(ctx, """
    #define A %(a)f
    #define B %(b)f
    #define DX %(dx)f
    #define DT %(dt)f
    #define SIZE %(size)d

    __kernel void update(
        __global float *u0_g,
        __global float *u1_g,
        __global float *e0_g,
        __global float *e1_g,
        __global float *res0_g,
        __global float *res1_g
        ) {
        int i = get_global_id(0);
        
        // compute flux vector
        float U0 = .5 * (u0_g[i] + u0_g[i+1]);
        float U1 = .5 * (u1_g[i] + u1_g[i+1]);
        barrier(CLK_GLOBAL_MEM_FENCE);

        if (i != SIZE-1) {
            e0_g[i] = A * U0 + B * U1 - .5*DX/DT * (u0_g[i+1] - u0_g[i]);
            e1_g[i] = B * U0 + A * U1 - .5*DX/DT * (u1_g[i+1] - u1_g[i]);
        }
        barrier(CLK_GLOBAL_MEM_FENCE);

        // compute residual vector
        if (i != 0 && i != SIZE-1) {
            res0_g[i] = -(e0_g[i] - e0_g[i-1]) / DX;
            res1_g[i] = -(e1_g[i] - e1_g[i-1]) / DX;
        }

        // update
        u0_g[i] = u0_g[i] + DT * res0_g[i];
        u1_g[i] = u1_g[i] + DT * res1_g[i];
    }
    """ % {"a":a, "b":b , "dx":dx, "dt":dt, "size":imax}).build()

    prg.update(queue, u0.shape, None, u0_g, u1_g, e0_g, e1_g, res0_g, res1_g)
    
    cl.enqueue_copy(queue, u0, u0_g)
    cl.enqueue_copy(queue, u1, u1_g)
    u = np.vstack((u0, u1))

    return
示例#44
0
文件: bin.py 项目: arturxz/TCC
    morph_window_3d[1, 1, 1] = 255
    morph_window_3d[1, 1, 2] = 255
    morph_window_3d[1, 2, 1] = 255
    morph_window_3d[2, 1, 1] = 255

    vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())
    vl.vglCheckContext(img_input, vl.VGL_RAM_CONTEXT())
    vl.vglCheckContext(img_output, vl.VGL_RAM_CONTEXT())

    wrp.vglClBinThreshold(img_input, img_output, np.float32(.5))
    salvando2d(img_output, "bin-vglClBinThreshold.pgm")

    cl.enqueue_copy(wrp.cl_ctx.queue,
                    img_input.get_oclPtr(),
                    img_output.get_oclPtr(),
                    dest_origin=(0, 0, 0),
                    src_origin=(0, 0, 0),
                    region=img_input.get_oclPtr().shape)

    wrp.vglClBinConway(img_input, img_output)
    salvando2d(img_output, "bin-vglClBinConway.pgm")

    wrp = None

    img_input = None
    img_input_3d = None

    img_input2 = None
    img_input2_3d = None

    img_output = None
示例#45
0
    def prefixSum(self, e, data, keys, ndata, low, hi, events):
        import numpy as np
        import pyopencl as cl
        mf = cl.mem_flags

        if not isinstance(data, cl.Buffer):
            data_buf = cl.Buffer(self.ctx,
                                 mf.READ_ONLY | mf.COPY_HOST_PTR,
                                 hostbuf=data)
        else:
            data_buf = data

        if not isinstance(keys, cl.Buffer):
            keys_buf = cl.Buffer(self.ctx,
                                 mf.READ_ONLY | mf.COPY_HOST_PTR,
                                 hostbuf=keys)
        else:
            keys_buf = keys

        grid_dims = self.get_grid_dims(ndata)
        psumbytes = ndata * np.uint64(0).nbytes
        bsumbytes = int(np.prod(grid_dims) * np.uint64(0).nbytes)
        nbsumbytes = np.uint64(0).nbytes

        psum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, psumbytes)
        bsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, bsumbytes)
        nbsum_buf = cl.Buffer(self.ctx, mf.READ_WRITE, nbsumbytes)

        low = PrefixSum.HOST_TYPE_KEYS(low)
        hi = PrefixSum.HOST_TYPE_KEYS(hi)

        kernel = self.prg.prefixSumDown
        kernel.set_args(data_buf, keys_buf, np.uint64(ndata), low, hi,
                        psum_buf, bsum_buf, nbsum_buf)

        global_dims = self.get_global(grid_dims)

        print "prefixSumDown %s %s" % (str(global_dims), str(self.localDims))
        if e is None:
            e = (cl.enqueue_nd_range_kernel(self.queue,
                                            kernel,
                                            global_dims,
                                            self.localDims,
                                            wait_for=e), )
        else:
            e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                            self.localDims), )
        events += e

        nbsum = np.zeros(1, dtype=np.uint64)
        events += (cl.enqueue_copy(self.queue, nbsum, nbsum_buf, wait_for=e), )

        if nbsum > 1:
            (e, bsum_buf, bsum1_buf, nbsum1_buf,
             ndata2) = self.prefixSumDownInplace(e, bsum_buf, nbsum.item(),
                                                 events)
        else:
            ndata2 = np.zeros(1, dtype=np.uint64)
            events += (cl.enqueue_copy(self.queue,
                                       ndata2,
                                       bsum_buf,
                                       wait_for=e), )
            ndata2 = ndata2.item()
            print ndata2

        self.prefixSumUp(e, psum_buf, ndata, bsum_buf, nbsum, events)

        return (e, data_buf, keys_buf, psum_buf, bsum_buf, nbsum_buf, ndata2)
示例#46
0
def main2():

    cosm = cosmology.Cosmology(0.7, 0.3, 0, 0.7)
    zd1, zd2, zd3 = 0.3, 0.7, 1.1

    plane1 = {
        "scaledlenses":
        [(lenses.PlummerLens(cosm.getAngularDiameterDistance(zd1), {
            "mass": 1e14 * MASS_SUN,
            "width": 2.0 * ANGLE_ARCSEC
        }), V(1, 0) * ANGLE_ARCSEC)],
        "unscaledlens":
        (lenses.MassSheetLens(cosm.getAngularDiameterDistance(zd1),
                              {"density": 1.5}), V(0, 0)),
    }
    plane2 = {
        "scaledlenses":
        [(lenses.PlummerLens(cosm.getAngularDiameterDistance(zd2), {
            "mass": 2e14 * MASS_SUN,
            "width": 1.5 * ANGLE_ARCSEC
        }), V(0, 1) * ANGLE_ARCSEC)],
        "unscaledlens":
        (lenses.MassSheetLens(cosm.getAngularDiameterDistance(zd2),
                              {"density": 1.5}), V(0, 0)),
    }
    plane3 = {
        "scaledlenses": [
            (lenses.CompositeLens(cosm.getAngularDiameterDistance(zd3), [
                {
                    "lens":
                    lenses.PlummerLens(cosm.getAngularDiameterDistance(zd3), {
                        "mass": 1.5e14 * MASS_SUN,
                        "width": 3.5 * ANGLE_ARCSEC
                    }),
                    "factor":
                    0.5,
                    "angle":
                    0,
                    "x":
                    ANGLE_ARCSEC,
                    "y":
                    0
                },
            ]), V(-1, -1) * ANGLE_ARCSEC),
            (lenses.CompositeLens(cosm.getAngularDiameterDistance(zd3), [
                {
                    "lens":
                    lenses.PlummerLens(cosm.getAngularDiameterDistance(zd3), {
                        "mass": 1.5e14 * MASS_SUN,
                        "width": 3.5 * ANGLE_ARCSEC
                    }),
                    "factor":
                    0.5,
                    "angle":
                    0,
                    "x":
                    -ANGLE_ARCSEC,
                    "y":
                    0
                },
            ]), V(-1, -1) * ANGLE_ARCSEC),
        ],
        "unscaledlens":
        (lenses.MassSheetLens(cosm.getAngularDiameterDistance(zd3),
                              {"density": 1.5}), V(0, 0)),
    }
    zds, lensplanes = [zd1, zd2, zd3], [plane1, plane2, plane3]
    code = getMultiPlaneOCLProgram(lensplanes)

    angularScale = ANGLE_ARCSEC
    thetas = (np.array([
        V(5, 5) * ANGLE_ARCSEC,
        V(-4, 4) * ANGLE_ARCSEC,
        V(10, 0) * ANGLE_ARCSEC
    ]) / angularScale).astype(np.float32)
    zss = [0.5, 0.95, 2.0]

    #thetas = (np.array([ V(-4,4)*ANGLE_ARCSEC ])/angularScale).astype(np.float32)
    #zss = [ 0.95 ]

    betas = np.zeros(thetas.shape, dtype=np.float32)
    assert (len(thetas) == len(zss))

    allNumPlanes, DsrcAll, Dmatrix, intParams, floatParams, weights, centers, intParamOffsets, floatParamOffsets, weightOffsets, planeWeights = getOpenCLData(
        cosm, zss, zds, lensplanes, angularScale)

    pprint.pprint(thetas)
    pprint.pprint(betas)
    print("allNumPlanes")
    pprint.pprint(allNumPlanes)
    pprint.pprint(DsrcAll)
    pprint.pprint(Dmatrix)
    print("Int params:")
    pprint.pprint(intParams)
    print("Float params:")
    pprint.pprint(floatParams)

    print("Weights:")
    weights = np.random.random(weights.shape).astype(np.float32)
    pprint.pprint(weights)

    pprint.pprint(centers)
    pprint.pprint(intParamOffsets)
    pprint.pprint(floatParamOffsets)
    pprint.pprint(weightOffsets)

    planeWeights = np.random.random(planeWeights.shape).astype(np.float32)
    pprint.pprint(planeWeights)

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    mf = cl.mem_flags
    d_thetas = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=thetas)
    d_betas = cl.Buffer(ctx, mf.WRITE_ONLY, betas.nbytes)
    d_numPlanes = cl.Buffer(ctx,
                            mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=allNumPlanes)
    d_DsrcAll = cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=DsrcAll)
    d_Dmatrix = cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=Dmatrix)
    d_intParams = cl.Buffer(ctx,
                            mf.READ_ONLY | mf.COPY_HOST_PTR,
                            hostbuf=intParams)
    d_floatParams = cl.Buffer(ctx,
                              mf.READ_ONLY | mf.COPY_HOST_PTR,
                              hostbuf=floatParams)
    d_weights = cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=weights)
    d_centers = cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=centers)
    d_intParamOffsets = cl.Buffer(ctx,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=intParamOffsets)
    d_floatParamOffsets = cl.Buffer(ctx,
                                    mf.READ_ONLY | mf.COPY_HOST_PTR,
                                    hostbuf=floatParamOffsets)
    d_weightOffsets = cl.Buffer(ctx,
                                mf.READ_ONLY | mf.COPY_HOST_PTR,
                                hostbuf=weightOffsets)
    d_planeWeights = cl.Buffer(ctx,
                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=planeWeights)

    #print(code)

    #print("Using code from tst.cl")
    #code = open("tst.cl", "rt").read()

    prg = cl.Program(ctx, code).build()
    calcBetas = prg.calculateBetas

    calcBetas(queue, (len(zss), ), None, np.int32(len(zss)), d_thetas, d_betas,
              d_numPlanes, d_DsrcAll, d_Dmatrix, d_intParams, d_floatParams,
              d_weights, d_centers, d_intParamOffsets, d_floatParamOffsets,
              d_weightOffsets, d_planeWeights)

    cl.enqueue_copy(queue, betas, d_betas)

    print("GPU betas:")
    pprint.pprint(betas)

    tracer = CPUMultiPlaneTracer(zss, zds, lensplanes, cosm)
    tracer.setAllWeights(weights.tolist())
    tracer.setPlaneWeights(planeWeights.tolist())
    betas = tracer.trace(thetas)
    print("CPU:")
    print(betas)
示例#47
0
    # Shared memory
    partialDotProduct = cl.LocalMemory(
        np.dtype(np.float64).itemsize * localWorkSize * 3)

    start = timer()

    for i in range(LOOP_COUNT):

        for iSmp in range(nSmp):

            # Fill the source with values.
            srcM[:, :, :] = M[:, iSmp, :, :]
            srcV[:] = v[:, iSmp]

            eventV = cl.enqueue_copy(queues[0], vector_buf, srcV)

            eventM0 = cl.enqueue_copy(queues[0],
                                      matrix_buf,
                                      srcM[:indptr[halfSize]],
                                      is_blocking=False)
            # spMVOverlapping
            matrix_dot_vector_kernel_event0 = \
            program.matrix_dot_vector(queues[0], (globalWorkSize,), (localWorkSize,), np.int64(halfSize), np.int64(0),
                                      indptr_buf, indices_buf, matrix_buf, vector_buf, destination_buf, partialDotProduct,
                                      wait_for=[eventM0])

            eventM1 = cl.enqueue_copy(queues[1],
                                      matrix_buf,
                                      srcM[indptr[halfSize]:],
                                      is_blocking=False,
示例#48
0
    def filter(self, data, keys, low, hi, events):
        import numpy as np
        import pyopencl as cl
        mf = cl.mem_flags

        ndata = data.size

        (e, data_buf, keys_buf, indices_buf, bsum_buf, nbsum_buf,
         ndata2) = self.prefixSum(None, data, keys, ndata, low, hi, events)

        filt = np.zeros(ndata, dtype=np.bool8)
        indices = np.zeros(ndata, dtype=np.uint64)
        data2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_DATA)
        keys2 = np.zeros(ndata2, dtype=PrefixSum.HOST_TYPE_KEYS)

        ndata2bytes = np.uint64(0).nbytes

        if PrefixSum.RETURN_FILTER == 1:
            filt_buf = cl.Buffer(self.ctx, mf.READ_WRITE, filt.nbytes)
        print data2.nbytes
        data2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, data2.nbytes)
        keys2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, keys2.nbytes)
        ndata2_buf = cl.Buffer(self.ctx, mf.READ_WRITE, ndata2bytes)

        low = PrefixSum.HOST_TYPE_KEYS(low)
        hi = PrefixSum.HOST_TYPE_KEYS(hi)

        kernel = self.prg.filter
        if PrefixSum.RETURN_FILTER == 1:
            kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata),
                            low, hi, filt_buf, data2_buf, keys2_buf,
                            ndata2_buf)
        else:
            kernel.set_args(data_buf, keys_buf, indices_buf, np.uint64(ndata),
                            low, hi, data2_buf, keys2_buf, ndata2_buf)

        global_dims = self.get_global(self.get_grid_dims(ndata))

        print "filter"
        if e is None:
            e = (cl.enqueue_nd_range_kernel(self.queue,
                                            kernel,
                                            global_dims,
                                            self.localDims,
                                            wait_for=e), )
        else:
            e = (cl.enqueue_nd_range_kernel(self.queue, kernel, global_dims,
                                            self.localDims), )
        events += e

        if PrefixSum.RETURN_FILTER == 1:
            events += (cl.enqueue_copy(self.queue, filt, filt_buf, wait_for=e),
                       cl.enqueue_copy(self.queue,
                                       indices,
                                       indices_buf,
                                       wait_for=e),
                       cl.enqueue_copy(self.queue,
                                       data2,
                                       data2_buf,
                                       wait_for=e),
                       cl.enqueue_copy(self.queue,
                                       keys2,
                                       keys2_buf,
                                       wait_for=e))
        else:
            events += (cl.enqueue_copy(self.queue,
                                       indices,
                                       indices_buf,
                                       wait_for=e),
                       cl.enqueue_copy(self.queue,
                                       data2,
                                       data2_buf,
                                       wait_for=e),
                       cl.enqueue_copy(self.queue,
                                       keys2,
                                       keys2_buf,
                                       wait_for=e))

        return (filt, indices, data2, keys2)
示例#49
0
    kernel_src = kernelFile.read()

compile_options = "-D BINS=%i  -D NIMAGE=%i -D WORKGROUP_SIZE=%i -D EPS=%f" % \
                (bins, size, workgroup_size, numpy.finfo(numpy.float32).eps)

program = cl.Program(ctx, kernel_src).build(options=compile_options)

program.reduce1(queue, (workgroup_size * workgroup_size, ), (workgroup_size, ),
                d_pos.data, numpy.uint32(pos_size), d_preresult)

program.reduce2(queue, (workgroup_size, ), (workgroup_size, ), d_preresult,
                d_minmax)

result = numpy.ndarray(4, dtype=numpy.float32)

cl.enqueue_copy(queue, result, d_minmax)

min0 = pos[:, :, 0].min()
max0 = pos[:, :, 0].max()
min1 = pos[:, :, 1].min()
max1 = pos[:, :, 1].max()
minmax = (min0, max0, min1, max1)

print(minmax)
print(result)

d_outData = cl.Buffer(ctx, mf.READ_WRITE, 4 * bins)
d_outCount = cl.Buffer(ctx, mf.READ_WRITE, 4 * bins)
d_outMerge = cl.Buffer(ctx, mf.READ_WRITE, 4 * bins)

program.memset_out(queue, (1024, ), (workgroup_size, ), d_outData, d_outCount,
示例#50
0
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=['-Werror'], devices=[dev])
except:
    print('Build log:')
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Input
mutex = np.zeros(shape=(1, ), dtype=np.int32)
mutex_buff = cl.Buffer(context,
                       cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                       hostbuf=mutex)

sum_out = np.zeros(shape=(1, ), dtype=np.int32)
sum_buff = cl.Buffer(context,
                     cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                     hostbuf=sum_out)

# Enqueue kernel
global_size = (4, )
local_size = None

# __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False)
prog.mutex(queue, global_size, local_size, mutex_buff, sum_buff)

# Print averaged results
cl.enqueue_copy(queue, dest=sum_out, src=sum_buff, is_blocking=True)

print('Sum: ' + str(sum_out))
dados2_h = np.array(dados2_h).astype(np.complex64)
RES_h = np.empty_like(dados_h)

print "dados1"
print dados_h
print "\n Expected"
print dados_h * 2

dados_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados_h)
dados2_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=dados2_h)
RES_d = cl.Buffer(ctx, MF.READ_WRITE | MF.COPY_HOST_PTR, hostbuf=RES_h)

Source = """
__kernel void soma(__global float2 *dados, __global float2 *dados2, __global float2 *res, int W){
	const int gid_x = get_global_id(0);
	for(int i = 0; i<W; i++)
	{
             res[gid_x*W+i] = dados[gid_x*W+i] *2;
	}
}
"""
prg = cl.Program(ctx, Source).build()

completeEvent = prg.soma(queue, (M, ), None, dados_d, dados2_d, RES_d,
                         np.int32(2))
completeEvent.wait()

cl.enqueue_copy(queue, RES_h, RES_d)
print "\n RES"
print RES_h - (dados_h + dados2_h)
示例#52
0
def test_image_3d(ctx_factory):
    #test for image_from_array for 3d image of float2
    context = ctx_factory()

    device, = context.devices

    if not device.image_support:
        from pytest import skip
        skip("images not supported on %s" % device)

    if device.platform.vendor == "Intel(R) Corporation":
        from pytest import skip
        skip("images crashy on %s" % device)
    _skip_if_pocl(device.platform, 'pocl does not support CL_ADDRESS_CLAMP')

    prg = cl.Program(
        context, """
        __kernel void copy_image_plane(
          __global float2 *dest,
          __read_only image3d_t src,
          sampler_t samp,
          int stride0,
          int stride1)
        {
          int d0 = get_global_id(0);
          int d1 = get_global_id(1);
          int d2 = get_global_id(2);
          /*
          const sampler_t samp =
            CLK_NORMALIZED_COORDS_FALSE
            | CLK_ADDRESS_CLAMP
            | CLK_FILTER_NEAREST;
            */
          dest[d0*stride0 + d1*stride1 + d2] = read_imagef(
                src, samp, (float4)(d2, d1, d0, 0)).xy;
        }
        """).build()

    num_channels = 2
    shape = (3, 4, 2)
    a = np.random.random(shape + (num_channels, )).astype(np.float32)

    queue = cl.CommandQueue(context)
    try:
        a_img = cl.image_from_array(context, a, num_channels)
    except cl.RuntimeError:
        import sys
        exc = sys.exc_info()[1]
        if exc.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
            from pytest import skip
            skip("required image format not supported on %s" % device.name)
        else:
            raise

    a_dest = cl.Buffer(context, cl.mem_flags.READ_WRITE, a.nbytes)

    samp = cl.Sampler(context, False, cl.addressing_mode.CLAMP,
                      cl.filter_mode.NEAREST)
    prg.copy_image_plane(
        queue,
        shape,
        None,
        a_dest,
        a_img,
        samp,
        np.int32(a.strides[0] / a.itemsize / num_channels),
        np.int32(a.strides[1] / a.itemsize / num_channels),
    )

    a_result = np.empty_like(a)
    cl.enqueue_copy(queue, a_result, a_dest)

    good = la.norm(a_result - a) == 0
    if not good:
        if queue.device.type & cl.device_type.CPU:
            assert good, (
                "The image implementation on your CPU CL platform '%s' "
                "returned bad values. This is bad, but common." %
                queue.device.platform)
        else:
            assert good
示例#53
0
        if ary is None:
            ary = np.empty(self.shape, self.dtype)

            ary = _as_strided(ary, strides=self.strides)
        else:
            if ary.size != self.size:
                raise TypeError("'ary' has non-matching type")
            if ary.dtype != self.dtype:
                raise TypeError("'ary' has non-matching size")

        assert self.flags.forc, "Array in get() must be contiguous"

        if self.size:
            cl.enqueue_copy(queue or self.queue,
                            ary,
                            self.data,
                            is_blocking=not async)

        return ary

    def get_item(self, index, queue=None, wait_for=None):
        if not isinstance(index, tuple):
            index = (index, )
        if len(index) != len(self.shape):
            raise ValueError("incorrect number of indices")

        tgt = np.empty((), self.dtype)
        cl.enqueue_copy(queue or self.queue,
                        tgt,
                        self.data,
                        is_blocking=True,
示例#54
0
def Metropolis(sigma, J, B, T, iterations, Device, Divider):

    kernel_params = {'block_size': sigma.shape[0] / Divider}

    # Je detecte un peripherique GPU dans la liste des peripheriques
    Id = 1
    HasXPU = False
    for platform in cl.get_platforms():
        for device in platform.get_devices():
            if Id == Device:
                XPU = device
                print "CPU/GPU selected: ", device.name.lstrip()
                HasXPU = True
            Id += 1

    if HasXPU == False:
        print "No XPU #%i found in all of %i devices, sorry..." % (Device,
                                                                   Id - 1)
        sys.exit()

    ctx = cl.Context([XPU])
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    # Je recupere les flag possibles pour les buffers
    mf = cl.mem_flags

    sigmaCL = cl.Buffer(ctx, mf.WRITE_ONLY | mf.COPY_HOST_PTR, hostbuf=sigma)
    #sigmaCL = cl.Buffer(ctx, mf.READ_WRITE, sigma.nbytes)
    # Program based on Kernel2
    MetropolisCL = cl.Program(ctx,
                              KERNEL_CODE.substitute(kernel_params)).build()

    divide = Divider * Divider
    step = STEP / divide
    i = 0
    duration = 0.
    while (step * i < iterations / divide):

        # Call OpenCL kernel
        # sigmaCL is lattice translated in CL format
        # step is number of iterations

        start_time = time.time()

        CLLaunch = MetropolisCL.MainLoop(
            queue, (numpy.int32(sigma.shape[0] * sigma.shape[1] / 2), 1), None,
            sigmaCL, numpy.float32(J), numpy.float32(B), numpy.float32(T),
            numpy.uint32(sigma.shape[0]), numpy.uint32(step),
            numpy.uint32(2008), numpy.uint32(1010))

        CLLaunch.wait()
        # elapsed = 1e-9*(CLLaunch.profile.end - CLLaunch.profile.start)
        elapsed = time.time() - start_time
        print "Iteration %i with T=%f and %i iterations in %f: " % (i, T, step,
                                                                    elapsed)
        if LAPIMAGE:
            cl.enqueue_copy(queue, sigma, sigmaCL).wait()
            checkLattice(sigma)
            ImageOutput(sigma,
                        "Ising2D_GPU_OddEven_%i_%1.1f_%.3i_Lap" % (SIZE, T, i))
        i = i + 1
        duration = duration + elapsed

    cl.enqueue_copy(queue, sigma, sigmaCL).wait()
    CheckLattice(sigma)
    sigmaCL.release()

    return (duration)
示例#55
0
    print 'The queue is using the device:', queue.device.name

    program = cl.Program(context,
                         open('label_regions.cl').read()).build(options='')

    host_image = np.load('maze1.npy')
    host_labels = np.empty_like(host_image)
    host_done_flag = np.zeros(1).astype(np.int32)

    gpu_image = cl.Buffer(context, cl.mem_flags.READ_ONLY, host_image.size * 4)
    gpu_labels = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                           host_image.size * 4)
    gpu_done_flag = cl.Buffer(context, cl.mem_flags.READ_WRITE, 4)

    # Send to the device, non-blocking
    cl.enqueue_copy(queue, gpu_image, host_image, is_blocking=False)

    local_size = (8, 8)  # 64 pixels per work group
    global_size = tuple(
        [round_up(g, l) for g, l in zip(host_image.shape[::-1], local_size)])
    print global_size
    width = np.int32(host_image.shape[1])
    height = np.int32(host_image.shape[0])
    halo = np.int32(1)

    # Create a local memory per working group that is
    # the size of an int (4 bytes) * (N+2) * (N+2), where N is the local_size
    buf_size = (np.int32(local_size[0] + 2 * halo),
                np.int32(local_size[1] + 2 * halo))
    gpu_local_memory = cl.LocalMemory(4 * buf_size[0] * buf_size[1])
示例#56
0
class Array(object):
    """A :class:`numpy.ndarray` work-alike that stores its data and performs its
    computations on the compute device.  *shape* and *dtype* work exactly as in
    :mod:`numpy`.  Arithmetic methods in :class:`Array` support the
    broadcasting of scalars. (e.g. `array+5`)

    *cqa* must be a :class:`pyopencl.CommandQueue`. *cqa*
    specifies the queue in which the array carries out its
    computations by default. *cqa* will at some point be renamed *queue*,
    so it should be considered 'positional-only'.

    *allocator* may be `None` or a callable that, upon being called with an
    argument of the number of bytes to be allocated, returns an
    :class:`pyopencl.Buffer` object. (A :class:`pyopencl.tools.MemoryPool`
    instance is one useful example of an object to pass here.)

    .. versionchanged:: 2011.1
        Renamed *context* to *cqa*, made it general-purpose.

        All arguments beyond *order* should be considered keyword-only.

    .. attribute :: data

        The :class:`pyopencl.MemoryObject` instance created for the memory that backs
        this :class:`Array`.

    .. attribute :: shape

        The tuple of lengths of each dimension in the array.

    .. attribute :: dtype

        The :class:`numpy.dtype` of the items in the GPU array.

    .. attribute :: size

        The number of meaningful entries in the array. Can also be computed by
        multiplying up the numbers in :attr:`shape`.

    .. attribute :: nbytes

        The size of the entire array in bytes. Computed as :attr:`size` times
        ``dtype.itemsize``.

    .. attribute :: strides

        Tuple of bytes to step in each dimension when traversing an array.

    .. attribute :: flags

        Return an object with attributes `c_contiguous`, `f_contiguous` and `forc`,
        which may be used to query contiguity properties in analogy to
        :attr:`numpy.ndarray.flags`.
    """
    def __init__(self,
                 cqa,
                 shape,
                 dtype,
                 order="C",
                 allocator=None,
                 data=None,
                 queue=None,
                 strides=None):
        # {{{ backward compatibility

        from warnings import warn
        if queue is not None:
            warn(
                "Passing the queue to the array through anything but the "
                "first argument of the Array constructor is deprecated. "
                "This will be continue to be accepted throughout the 2013.[0-6] "
                "versions of PyOpenCL.", DeprecationWarning, 2)

        if isinstance(cqa, cl.CommandQueue):
            if queue is not None:
                raise TypeError("can't specify queue in 'cqa' and "
                                "'queue' arguments")
            queue = cqa

        elif isinstance(cqa, cl.Context):
            warn(
                "Passing a context for the 'cqa' parameter is deprecated. "
                "This usage will be continue to be accepted throughout the 2013.[0-6] "
                "versions of PyOpenCL.", DeprecationWarning, 2)

            if queue is not None:
                raise TypeError("may not pass a context and a queue "
                                "(just pass the queue)")
            if allocator is not None:
                raise TypeError("may not pass a context and an allocator "
                                "(just pass the queue)")

        else:
            # cqa is assumed to be an allocator
            warn(
                "Passing an allocator for the 'cqa' parameter is deprecated. "
                "This usage will be continue to be accepted throughout the 2013.[0-6] "
                "versions of PyOpenCL.", DeprecationWarning, 2)
            if allocator is not None:
                raise TypeError("can't specify allocator in 'cqa' and "
                                "'allocator' arguments")

            allocator = cqa

        if queue is None:
            warn(
                "Queue-less arrays are deprecated. "
                "They will continue to work throughout the 2013.[0-6] "
                "versions of PyOpenCL.", DeprecationWarning, 2)

        # }}}

        # invariant here: allocator, queue set

        # {{{ determine shape and strides
        dtype = np.dtype(dtype)

        try:
            s = 1
            for dim in shape:
                s *= dim
        except TypeError:
            import sys
            if sys.version_info >= (3, ):
                admissible_types = (int, np.integer)
            else:
                admissible_types = (int, long, np.integer)

            if not isinstance(shape, admissible_types):
                raise TypeError("shape must either be iterable or "
                                "castable to an integer")
            s = shape
            shape = (shape, )

        if isinstance(s, np.integer):
            # bombs if s is a Python integer
            s = np.asscalar(s)

        if strides is None:
            strides = _make_strides(dtype.itemsize, shape, order)

        else:
            # FIXME: We should possibly perform some plausibility
            # checking on 'strides' here.

            strides = tuple(strides)

        # }}}

        self.queue = queue
        self.shape = shape
        self.dtype = dtype
        self.strides = strides
        self.events = []

        self.size = s
        alloc_nbytes = self.nbytes = self.dtype.itemsize * self.size

        self.allocator = allocator

        if data is None:
            if not alloc_nbytes:
                # Work around CL not allowing zero-sized buffers.
                alloc_nbytes = 1

            if allocator is None:
                # FIXME remove me when queues become required
                if queue is not None:
                    context = queue.context

                self.data = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                                      alloc_nbytes)
            else:
                self.data = self.allocator(alloc_nbytes)
        else:
            self.data = data

    @property
    def context(self):
        return self.data.context

    @property
    @memoize_method
    def flags(self):
        return _ArrayFlags(self)

    @property
    def mem_size(self):
        from warnings import warn
        warn("Array.mem_size is deprecated. Use Array.size",
             DeprecationWarning,
             stacklevel=2)

    def _new_with_changes(self,
                          data,
                          shape=None,
                          dtype=None,
                          strides=None,
                          queue=None):
        if shape is None:
            shape = self.shape
        if dtype is None:
            dtype = self.dtype
        if strides is None:
            strides = self.strides
        if queue is None:
            queue = self.queue

        if queue is not None:
            return Array(queue,
                         shape,
                         dtype,
                         allocator=self.allocator,
                         strides=strides,
                         data=data)
        elif self.allocator is not None:
            return Array(self.allocator,
                         shape,
                         dtype,
                         queue=queue,
                         strides=strides,
                         data=data)
        else:
            return Array(self.context,
                         shape,
                         dtype,
                         strides=strides,
                         data=data)

    #@memoize_method FIXME: reenable
    def get_sizes(self, queue, kernel_specific_max_wg_size=None):
        if not self.flags.forc:
            raise NotImplementedError("cannot operate on non-contiguous array")
        return splay(queue,
                     self.size,
                     kernel_specific_max_wg_size=kernel_specific_max_wg_size)

    def set(self, ary, queue=None, async=False):
        """Transfer the contents the :class:`numpy.ndarray` object *ary*
        onto the device.

        *ary* must have the same dtype and size (not necessarily shape) as *self*.
        """

        assert ary.size == self.size
        assert ary.dtype == self.dtype

        if not ary.flags.forc:
            raise RuntimeError("cannot set from non-contiguous array")

            ary = ary.copy()

        if ary.strides != self.strides:
            from warnings import warn
            warn(
                "Setting array from one with different strides/storage order. "
                "This will cease to work in 2013.x.",
                stacklevel=2)

        if self.size:
            cl.enqueue_copy(queue or self.queue,
                            self.data,
                            ary,
                            is_blocking=not async)
示例#57
0
    def run_parallel(self,
                     kernelName='',
                     scalarArgs=None,
                     slicedROArgs=None,
                     nonSlicedROArgs=None,
                     slicedRWArgs=None,
                     nonSlicedRWArgs=None,
                     dimension=0):

        # t0 = time.time()
        ka_offset = len(scalarArgs) if scalarArgs is not None else 0
        ro_offset = len(slicedROArgs) if slicedROArgs is not None else 0
        ns_offset = len(nonSlicedROArgs) if nonSlicedROArgs is not None else 0
        rw_offset = len(slicedRWArgs) if slicedRWArgs is not None else 0
        rw_pos = ka_offset + ro_offset + ns_offset
        nsrw_pos = ka_offset + ro_offset + ns_offset + rw_offset

        kernel_bufs = []
        global_size = []
        ev_h2d = []
        ev_run = []
        nCU = []
        ndstart = []
        ndslice = []
        ndsize = []
        minWGS = 1e20

        for ictx, ctx in enumerate(self.cl_ctx):
            nCUw = 1
            nCU.extend([nCUw])
            tmpWGS = ctx.devices[0].max_work_group_size
            if tmpWGS < minWGS:
                minWGS = tmpWGS
            # nCU.extend([ctx.devices[0].max_compute_units*nCUw])

        totalCUs = np.sum(nCU)
        minWGS = 256
        divider = minWGS * totalCUs
        n2f = np.remainder(dimension, divider)
        needResize = False
        # odd dimension performance fix
        if n2f != 0 and dimension > divider:
            oldSize = dimension
            dimension = (np.trunc(dimension / divider) + 1) * divider
            nDiff = int(dimension - oldSize)
            needResize = True

        work_cl_ctx = self.cl_ctx if dimension > totalCUs else [self.cl_ctx[0]]
        nctx = len(work_cl_ctx)

        for ictx, ctx in enumerate(work_cl_ctx):
            ev_h2d.extend([[]])
            kernel_bufs.extend([[]])
            if scalarArgs is not None:
                kernel_bufs[ictx].extend(scalarArgs)
            ndstart.extend([sum(ndsize)])

            if dimension > 1:
                if ictx < nctx - 1:
                    ndsize.extend([np.floor(dimension * nCU[ictx] / totalCUs)])
                else:
                    ndsize.extend([dimension - ndstart[ictx]])
                ndslice.extend(
                    [slice(ndstart[ictx], ndstart[ictx] + ndsize[ictx])])
            else:
                ndslice.extend([0])
# In case each photon has an array of input/output data we define a second
# dimension
            if slicedROArgs is not None and dimension > 1:
                for iarg, arg in enumerate(slicedROArgs):
                    newArg = np.concatenate([arg, arg[:nDiff]]) if needResize\
                        else arg
                    secondDim = np.int(len(newArg) / dimension)
                    iSlice = slice(
                        int(ndstart[ictx] * secondDim),
                        int((ndstart[ictx] + ndsize[ictx]) * secondDim))
                    kernel_bufs[ictx].extend([
                        cl.Buffer(self.cl_ctx[ictx],
                                  self.cl_mf.READ_ONLY
                                  | self.cl_mf.COPY_HOST_PTR,
                                  hostbuf=newArg[iSlice])
                    ])

            if nonSlicedROArgs is not None:
                for iarg, arg in enumerate(nonSlicedROArgs):
                    kernel_bufs[ictx].extend([
                        cl.Buffer(self.cl_ctx[ictx],
                                  self.cl_mf.READ_ONLY
                                  | self.cl_mf.COPY_HOST_PTR,
                                  hostbuf=arg)
                    ])

            if slicedRWArgs is not None:
                for iarg, arg in enumerate(slicedRWArgs):
                    newArg = np.concatenate([arg, arg[:nDiff]]) if needResize\
                        else arg
                    secondDim = np.int(len(newArg) / dimension)
                    iSlice = slice(
                        int(ndstart[ictx] * secondDim),
                        int((ndstart[ictx] + ndsize[ictx]) * secondDim))
                    kernel_bufs[ictx].extend([
                        cl.Buffer(self.cl_ctx[ictx],
                                  self.cl_mf.READ_WRITE
                                  | self.cl_mf.COPY_HOST_PTR,
                                  hostbuf=newArg[iSlice])
                    ])
                global_size.extend([(np.int(ndsize[ictx]), )])
            if nonSlicedRWArgs is not None:
                for iarg, arg in enumerate(nonSlicedRWArgs):
                    kernel_bufs[ictx].extend([
                        cl.Buffer(self.cl_ctx[ictx],
                                  self.cl_mf.READ_WRITE
                                  | self.cl_mf.COPY_HOST_PTR,
                                  hostbuf=arg)
                    ])
                global_size.extend([np.array([1]).shape])

        local_size = None
        for ictx, ctx in enumerate(work_cl_ctx):
            kernel = getattr(self.cl_program[ictx], kernelName)
            ev_run.extend([
                kernel(self.cl_queue[ictx], global_size[ictx], local_size,
                       *kernel_bufs[ictx])
            ])

        for iev, ev in enumerate(ev_run):
            status = cl.command_execution_status.to_string(
                ev.command_execution_status)
            if _DEBUG > 20:
                print("ctx status {0} {1}".format(iev, status))

        ret = ()

        if slicedRWArgs is not None:
            for ictx, ctx in enumerate(work_cl_ctx):
                for iarg, arg in enumerate(slicedRWArgs):
                    newArg = np.concatenate([arg, arg[:nDiff]]) if needResize\
                        else arg
                    secondDim = np.int(len(newArg) / dimension)
                    iSlice = slice(
                        int(ndstart[ictx] * secondDim),
                        int((ndstart[ictx] + ndsize[ictx]) * secondDim))
                    cl.enqueue_copy(self.cl_queue[ictx],
                                    slicedRWArgs[iarg][iSlice],
                                    kernel_bufs[ictx][iarg + rw_pos],
                                    is_blocking=self.cl_is_blocking)
            if needResize:
                for arg in slicedRWArgs:
                    arg = arg[:oldSize]
            ret += tuple(slicedRWArgs)

        if nonSlicedRWArgs is not None:
            for ictx, ctx in enumerate(work_cl_ctx):
                for iarg, arg in enumerate(nonSlicedRWArgs):
                    cl.enqueue_copy(self.cl_queue[ictx],
                                    nonSlicedRWArgs[iarg],
                                    kernel_bufs[ictx][iarg + nsrw_pos],
                                    is_blocking=self.cl_is_blocking)
            if needResize:
                for arg in nonSlicedRWArgs:
                    arg = arg[:oldSize]
            ret += tuple(nonSlicedRWArgs)
#        print("Total CL execution time:", time.time() - t0, "s")
        return ret
def buildGraph(ip, dev=0):
        """Builds the knn grap with intial params.
        params:
        ------
        ip: initial params

        return: 
        ------
        graph: graph object of Graph 
        """

        start = time()
        nbrs = NearestNeighbors(n_neighbors = ip.k + 1, algorithm="buffer_kd_tree", tree_depth=9, plat_dev_ids={0:[0]})    
        nbrs.fit(ip.position)
        dists, inds = nbrs.kneighbors(ip.position)
        print("success") if bool_1 else print()

        # now build the graph using those nns using gpu
        platform = cl.get_platforms()[0]
        print(platform)
        device = platform.get_devices()[dev]
        print(device)
        context = cl.Context([device])
        print(context)
        program = cl.Program(context, open(mywf).read()).build()
        print(program)
        queue = cl.CommandQueue(context)
        print(queue)
        
         # define the input here which is the ndbrs gpu
        ngbrs_gpu = inds
        ngbrs_gpu = ngbrs_gpu[0:,1:]
        ngbrs_gpu = unroll(ngbrs_gpu)
        ngbrs_gpu = ngbrs_gpu.astype('int32')
        
         # define the second input here which is the signal levels
        signal =  ip.signal
        n, chnl = signal.shape
        signal = np.reshape(signal,(n*chnl,),order='F')
        signal = signal.astype('float32')
        print("signal",signal.shape) if bool_1 else print()
        k = ip.k
        print("n is :", n) if bool_1 else print()
        scale = ip.sigma
        
         # create the buffers on the device, intensity, nbgrs, weights
        mem_flags = cl.mem_flags
        ngbrs_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR,hostbuf=ngbrs_gpu)
        signal_buf = cl.Buffer(context, mem_flags.READ_ONLY | mem_flags.COPY_HOST_PTR, hostbuf=signal)
        weight_vec = np.ndarray(shape=(n*k,), dtype=np.float32)
        weight_buf = cl.Buffer(context, mem_flags.WRITE_ONLY, weight_vec.nbytes)
        
         # run the kernel to compute the weights
        program.compute_weights(queue, (n,), None, signal_buf,  ngbrs_buf, weight_buf, np.int32(k), np.float32(scale), np.int32(chnl))
        
        queue.finish() #OT
        
        # copy the weihts to the host memory
        cl.enqueue_copy(queue, weight_vec, weight_buf)
        end = time() - start
        
        print('total time taken by the gpu python:', end) if bool_1 else print()
        # save the graph
        graph = Graph(weight_vec,ngbrs_gpu,k)
        return graph
示例#59
0
    def vglCl3dErode(self, img_input, img_output, convolution_window,
                     window_size_x, window_size_y, window_size_z):
        print("# Running vglCl3dErode")
        vl.vglCheckContext(img_input, vl.VGL_CL_CONTEXT())
        vl.vglCheckContext(img_output, vl.VGL_CL_CONTEXT())

        # TRANSFORMAR EM BUFFER
        try:
            cl_convolution_window = cl.Buffer(self.ocl.context,
                                              cl.mem_flags.READ_ONLY,
                                              convolution_window.nbytes)
            cl.enqueue_copy(self.ocl.commandQueue,
                            cl_convolution_window,
                            convolution_window.tobytes(),
                            is_blocking=True)
            convolution_window = cl_convolution_window
        except Exception as e:
            print(
                "vglCl3dErode: Error!! Impossible to convert convolution_window to cl.Buffer object."
            )
            print(str(e))
            exit()

        if (not isinstance(window_size_x, np.uint32)):
            print(
                "vglCl3dErode: Warning: window_size_x not np.uint32! Trying to convert..."
            )
            try:
                window_size_x = np.uint32(window_size_x)
            except Exception as e:
                print(
                    "vglCl3dErode: Error!! Impossible to convert window_size_x as a np.uint32 object."
                )
                print(str(e))
                exit()

        if (not isinstance(window_size_y, np.uint32)):
            print(
                "vglCl3dErode: Warning: window_size_y not np.uint32! Trying to convert..."
            )
            try:
                window_size_y = np.uint32(window_size_y)
            except Exception as e:
                print(
                    "vglCl3dErode: Error!! Impossible to convert window_size_y as a np.uint32 object."
                )
                print(str(e))
                exit()
        if (not isinstance(window_size_z, np.uint32)):
            print(
                "vglCl3dErode: Warning: window_size_z not np.uint32! Trying to convert..."
            )
            try:
                window_size_z = np.uint32(window_size_z)
            except Exception as e:
                print(
                    "vglCl3dErode: Error!! Impossible to convert window_size_z as a np.uint32 object."
                )
                print(str(e))
                exit()

        _program = self.cl_ctx.get_compiled_kernel("../CL/vglCl3dErode.cl",
                                                   "vglCl3dErode")
        kernel_run = _program.vglCl3dErode

        kernel_run.set_arg(0, img_input.get_oclPtr())
        kernel_run.set_arg(1, img_output.get_oclPtr())
        kernel_run.set_arg(2, convolution_window)
        kernel_run.set_arg(3, window_size_x)
        kernel_run.set_arg(4, window_size_y)
        kernel_run.set_arg(5, window_size_z)

        cl.enqueue_nd_range_kernel(self.ocl.commandQueue, kernel_run,
                                   img_output.get_oclPtr().shape, None)

        vl.vglSetContext(img_output, vl.VGL_CL_CONTEXT())
示例#60
0
                            vl.IMAGE_ND_ARRAY())
    vl.vglLoadImage(img_input)
    vl.vglClUpload(img_input)

    img_input2 = vl.VglImage("bin2.pgm", vl.VGL_IMAGE_2D_IMAGE(), None,
                             vl.IMAGE_ND_ARRAY())
    vl.vglLoadImage(img_input2)
    vl.vglClUpload(img_input2)

    # OUTPUT IMAGE
    img_output = vl.create_blank_image_as(img_input)
    img_output.set_oclPtr(vl.get_similar_oclPtr_object(img_input))
    vl.vglAddContext(img_output, vl.VGL_CL_CONTEXT())

    img_out_aux = vl.get_similar_oclPtr_object(img_input)
    cl.enqueue_copy(wrp.cl_ctx.queue, img_out_aux, img_output.get_oclPtr())

    # TRANSFORMANDO IMAGENS EM IMAGENS BINARIAS
    wrp.vglClNdBinThreshold(img_input, img_output, 100)
    cl.enqueue_copy(wrp.cl_ctx.queue, img_input.get_oclPtr(),
                    img_output.get_oclPtr())

    # STRUCTURANT ELEMENT
    window = vl.VglStrEl()
    window.constructorFromTypeNdim(vl.VGL_STREL_CROSS(), 2)

    # INPUT IMAGE
    #img_input = vl.VglImage("bin.pgm", vl.VGL_IMAGE_2D_IMAGE(), None, vl.IMAGE_ND_ARRAY())
    #vl.vglLoadImage(img_input)
    #vl.vglClUpload(img_input)
    #wrp.vglClNdBinThreshold(img_input, img_output, 100)