예제 #1
0
def _fold_exp_and_coh(t_array, w, tz, tau_arr):
    """Run the ``fold_exp`` and ``coh_gauss`` OpenCL kernels on *t_array*.

    Parameters
    ----------
    t_array : numpy.ndarray
        Input array; its first two dimensions set the output shape. The
        caller's array is left unmodified.
    w : float
        Passed to ``fold_exp`` as ``float32``; ``coh_gauss`` receives
        ``w / 1.4142``.
    tz : float
        Offset subtracted from *t_array* before the kernels run.
    tau_arr : numpy.ndarray
        Its element-wise reciprocal is uploaded as ``float32`` for
        ``fold_exp``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, b)`` where ``a`` has shape ``(shape[0], shape[1], tau_arr.size)``
        and ``b`` has shape ``(shape[0], shape[1], 3)``; ``b`` is divided by
        the maximum of its absolute value along axis 0.
    """
    if tz != 0.:
        # Subtract out of place so the caller's array is not silently shifted
        # (an in-place ``-=`` would also fail for integer input).
        t_array = t_array - tz

    shape = t_array.shape
    t_array = t_array.astype(np.float32)

    t_arr_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=t_array)
    tau_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                        hostbuf=(1/tau_arr).astype(np.float32))
    shape = (shape[0], shape[1], tau_arr.size)
    shape_coh = (shape[0], shape[1], 3)
    out = cl_array.empty(queue, shape=shape, dtype=np.float32)
    out_coh = cl_array.empty(queue, shape=shape_coh, dtype=np.float32)

    # Round the first global dimension up to a multiple of the work-group
    # size; the element count is handed to the kernels as the last argument.
    global_work_size = t_array.size + (work_size[0] - t_array.size % work_size[0])

    prg.fold_exp(queue, (global_work_size, tau_arr.size), work_size, t_arr_gpu, np.float32(w),
                 tau_buf, out.data, np.uint32(t_array.size))

    coh_no_div.coh_gauss(queue, (global_work_size, 3), work_size, t_arr_gpu,
                   np.float32(w/1.4142), out_coh.data, np.uint32(t_array.size))

    queue.finish()
    a = out.get(async_=True)
    b = out_coh.get(async_=True)
    # The two reads above are non-blocking: wait for them to complete before
    # touching the host arrays, otherwise the normalisation races the copy.
    queue.finish()
    b /= np.abs(b).max(0)
    return a, b
예제 #2
0
def get_flux(params, G, P):
    """Compute the face fluxes for directions 1-3 and the next timestep.

    For each direction the primitives *P* are reconstructed into left- and
    right-going states (written into the module-level scratch arrays ``Pl``
    and ``Pr``) and then combined into a net flux by ``lr_to_flux``.

    Returns
    -------
    F : list
        Four-element list; entries 1..3 hold the fluxes, entry 0 stays ``0``.
    ndt :
        ``params['dt_start']`` when ``params['dt_static']`` is truthy,
        otherwise the result of ``ndt_min(params, G, ctop)``.
    """
    global Pl, Pr, ctop

    shapes = G.shapes
    if Pl is None:
        # First call: allocate the scratch buffers once and keep them in
        # module globals for all later calls.
        queue = params['queue']
        Pl = cl_array.empty(queue, shapes.grid_primitives, dtype=np.float64)
        Pr = cl_array.empty(queue, shapes.grid_primitives, dtype=np.float64)
        ctop = cl_array.empty(queue, shapes.grid_vector, dtype=np.float64)

    # Just need 4 elements -- index 0 is never filled
    F = [0, 0, 0, 0]
    faces = ((1, Loci.FACE1), (2, Loci.FACE2), (3, Loci.FACE3))
    for direction, face in faces:
        # reconstruct left- and right-going components ...
        reconstruct(params, G, P, direction, lout=Pl, rout=Pr)
        # ... and turn these into a net flux
        F[direction], ctop[direction] = lr_to_flux(params, G, Pl, Pr,
                                                   direction, face)

    ndt = params['dt_start'] if params['dt_static'] else ndt_min(params, G, ctop)
    return F, ndt
예제 #3
0
    def __init__(self, height, width):
        """Create the host image, the OpenCL state and the game objects.

        height, width : size of the screen
        """
        # Don't confuse 'Viewer' and 'Engine'
        # Size of Engine should always be the same while running
        self._height = height
        self._width = width
        image_shape = (self.size[0], self.size[1], 3)
        self._image = np.zeros(image_shape, dtype=np.uint8)
        self._TM = ThingsManager()

        # OpenCL things: first device of the first platform
        first_platform = cl.get_platforms()[0]
        self.device = first_platform.get_devices()[0]
        self.ctx = cl.Context([self.device])
        self.queue = cl.CommandQueue(self.ctx)

        # Host-side colours and their device copies
        self.bg_color = np.array(colors.COLOR_BACKGROUND, dtype=np.uint8)
        self.wall_color = np.array(colors.COLOR_WALL, dtype=np.uint8)
        self.image_dev = cl_array.empty(self.queue, self.image.shape, np.uint8)
        self.bg_col_dev = cl_array.to_device(self.queue, self.bg_color)
        self.wall_col_dev = cl_array.to_device(self.queue, self.wall_color)

        # Left unset here; assigned elsewhere
        self.fp_ray_dev = None
        self.delta_vec_dev = None
        observation_shape = (2, ec.RayNum, 3)
        self.observation_dev = cl_array.empty(self.queue, observation_shape,
                                              np.uint8)

        # Build the kernels from the source file next to this module
        kernel_file = path.join(path.dirname(__file__), 'cl_scripts/ray.cl')
        with open(kernel_file, 'r') as source:
            kernel_source = source.read()
        self.program = cl.Program(self.ctx, kernel_source).build()

        # Initiate things first and then call CollisionManager
        self.initiate_things()
        self._CM = CollisionManager(self.size, self._TM)
예제 #4
0
    def __init__(self, par):
        """Set up the operator state, device buffers and OpenCL program.

        Parameters
        ----------
        par : dict
            Must provide the keys ``"C"``, ``"traj"``, ``"NSlice"``,
            ``"NScan"``, ``"dimX"``, ``"dimY"``, ``"NC"``, ``"ctx"``,
            ``"queue"``, ``"N"``, ``"Nproj"``, ``"InScale"`` and ``"ogf"``.
            Only the first entry of ``par["ctx"]`` and ``par["queue"]`` is
            used.
        """
        self.C = par["C"]
        self.traj = par["traj"]
        self.NSlice = par["NSlice"]
        self.NScan = par["NScan"]
        self.dimX = par["dimX"]
        self.dimY = par["dimY"]
        self.NC = par["NC"]
        self.fval_min = 0
        self.fval = 0
        self.ctx = par["ctx"][0]
        self.queue = par["queue"][0]
        self.res = []
        self.N = par["N"]
        self.Nproj = par["Nproj"]
        self.incor = par["InScale"].astype(DTYPE)
        # Read-only device copy of self.C
        self.coil_buf = cl.Buffer(self.ctx,
                                  cl.mem_flags.READ_ONLY |
                                  cl.mem_flags.COPY_HOST_PTR,
                                  hostbuf=self.C.data)
        # Scratch arrays for intermediate results
        self.tmp_sino = clarray.empty(self.queue,
                                      (self.NScan, self.NC, self.NSlice,
                                       self.Nproj, self.N), DTYPE, "C")
        self.tmp_result = clarray.empty(self.queue,
                                        (self.NScan, self.NC, self.NSlice,
                                         self.dimY, self.dimX), DTYPE, "C")
        self.NUFFT = NUFFT(self.ctx, self.queue, par,
                           overgridfactor=par["ogf"])

        # Read the kernel source inside a context manager so the file handle
        # is closed deterministically instead of being leaked.
        kernel_path = resource_filename(
            'rrsg_cgreco', 'kernels/opencl_operator_kernels.c')
        with open(kernel_path) as kernel_file:
            kernel_source = kernel_file.read()
        self.prg = Program(self.ctx, kernel_source)
예제 #5
0
def maxpool2d(q, A, f, stride, out=None, indices=None):
    """Run the ``max_pool`` OpenCL kernel over a four-dimensional array.

    Parameters
    ----------
    q :
        Command queue used for allocation and for the kernel launch.
    A :
        Device array whose shape unpacks as ``(n, c, h, w)``.
    f : int
        Pooling window size; passed to the kernel for both window arguments.
    stride : int
        Pooling stride; passed to the kernel for both stride arguments.
    out, indices : optional
        Preallocated device arrays of shape ``(n, c, out_h, out_w)``. They are
        allocated here (``A.dtype`` and ``int32`` respectively) when ``None``.

    Returns
    -------
    tuple
        ``(out, indices)`` after the kernel has finished.
    """
    dtype = dtype_to_ctype(A.dtype)
    n, c, h, w = A.shape
    # Floor division keeps the output sizes integral; true division would
    # yield floats, which are invalid as array shapes and global sizes.
    out_h = (h - f) // stride + 1
    out_w = (w - f) // stride + 1

    if out is None:
        out = clarray.empty(q, (n, c, out_h, out_w), dtype=A.dtype)
    if indices is None:
        indices = clarray.empty(q, (n, c, out_h, out_w), dtype=np.int32)

    # The kernel source is specialised on the element type, so the cache must
    # be keyed on it too; otherwise a kernel built for one dtype would be
    # reused for arrays of another.
    cache_key = ('max_pool', dtype)
    if cache_key not in _kernel_cache:
        prg = cl.Program(clplatf.ctx, _maxpool_template % {
            'dtype': dtype
        }).build()
        _kernel_cache[cache_key] = prg.max_pool
    krnl = _kernel_cache[cache_key]
    # TODO better global and local dimensions (make divisible by 64 etc.)
    ev = krnl(q, (n * c * out_h * out_w, ), None, A.data, out.data,
              indices.data, np.int32(h), np.int32(w), np.int32(out_h),
              np.int32(out_w), np.int32(f), np.int32(f), np.int32(stride),
              np.int32(stride))

    ev.wait()
    return out, indices
    def init_OpenCL_quanti(self, N_var, msg_at_time, return_buffer_only=False):
        """Inits the OpenCL context and transfers all static data to the device

        Parameters
        ----------
        N_var : int
            Rendered into the kernel template as ``Nvar``; also the first
            dimension of the cluster and LLR buffers.
        msg_at_time : int
            Second dimension of the cluster and LLR buffers.
        return_buffer_only : bool
            Stored on the instance unchanged.
        """

        self.context = cl.create_some_context()

        print(self.context.get_info(cl.context_info.DEVICES))
        kernel_dir = os.path.split(os.path.abspath(__file__))[0]
        # Use a context manager so the template file is closed after reading.
        with open(os.path.join(kernel_dir,
                               'kernels_quanti_template.cl')) as kernel_file:
            kernelsource = kernel_file.read()

        tpl = Template(kernelsource)
        rendered_tp = tpl.render(Nvar=N_var)

        self.program = cl.Program(self.context, str(rendered_tp)).build()

        self.return_buffer_only = return_buffer_only

        # Set up OpenCL
        self.queue = cl.CommandQueue(self.context)
        self.quantize = self.program.quantize
        self.quantize.set_scalar_arg_dtypes([np.int32, None, None, None])
        self.quantize_LLR = self.program.quantize_LLR
        self.quantize_LLR.set_scalar_arg_dtypes(
            [np.int32, None, None, None, None])
        # Static tables uploaded once
        self.limit_buff = cl_array.to_device(
            self.queue, self.cdf_t_given_x_equals_zero.astype(np.float64))
        self.cluster_buff = cl_array.empty(self.queue, (N_var, msg_at_time),
                                           dtype=np.int32)
        self.LLR_buff = cl_array.empty(self.queue, (N_var, msg_at_time),
                                       dtype=np.float64)
        self.LLR_values_buff = cl_array.to_device(
            self.queue, self.output_LLRs.astype(np.float64))
    def setup_arrays(self, nrays, nsamples, cutoff):
        """Return the device work arrays for ``(nrays, nsamples, cutoff)``.

        The arrays are created on first request and stored in
        ``self.array_cache``; later calls with the same triple return the
        cached ``ArraySet``.
        """
        cache_key = (nrays, nsamples, cutoff)
        if cache_key in self.array_cache:
            return self.array_cache[cache_key]

        fresh = ArraySet()
        queue = self.queue
        pool = self.memory_pool

        fresh.scratch = cla.empty(queue, (nsamples, nrays),
                                  dtype=np.float32, allocator=pool)
        fresh.result = cla.empty(queue, (nrays,),
                                 dtype=np.int32, allocator=pool)
        fresh.pre_cutoff = cla.empty(queue, (nrays, cutoff),
                                     dtype=np.float32, allocator=pool)
        fresh.pre_cutoff_squared = cla.empty_like(fresh.pre_cutoff)
        # Running index 0 .. cutoff * nrays - 1
        fresh.idx = cla.arange(queue, 0, cutoff * nrays, 1,
                               dtype=np.int32, allocator=pool)

        self.array_cache[cache_key] = fresh
        return fresh
예제 #8
0
def _fold_exp_and_coh(t_array, w, tz, tau_arr):
    """Run the ``fold_exp`` and ``coh_gauss`` OpenCL kernels on *t_array*.

    Parameters
    ----------
    t_array : numpy.ndarray
        Input array; its first two dimensions set the output shape. The
        caller's array is left unmodified.
    w : float
        Passed to ``fold_exp`` as ``float32``; ``coh_gauss`` receives
        ``w / 1.4142``.
    tz : float
        Offset subtracted from *t_array* before the kernels run.
    tau_arr : numpy.ndarray
        Its element-wise reciprocal is uploaded as ``float32`` for
        ``fold_exp``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(a, b)`` where ``a`` has shape ``(shape[0], shape[1], tau_arr.size)``
        and ``b`` has shape ``(shape[0], shape[1], 3)``; ``b`` is divided by
        the maximum of its absolute value along axis 0.
    """
    if tz != 0.:
        # Subtract out of place so the caller's array is not silently shifted
        # (an in-place ``-=`` would also fail for integer input).
        t_array = t_array - tz

    shape = t_array.shape
    t_array = t_array.astype(np.float32)

    t_arr_gpu = cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=t_array)
    tau_buf = cl.Buffer(ctx,
                        mf.READ_ONLY | mf.COPY_HOST_PTR,
                        hostbuf=(1 / tau_arr).astype(np.float32))
    shape = (shape[0], shape[1], tau_arr.size)
    shape_coh = (shape[0], shape[1], 3)
    out = cl_array.empty(queue, shape=shape, dtype=np.float32)
    out_coh = cl_array.empty(queue, shape=shape_coh, dtype=np.float32)

    # Round the first global dimension up to a multiple of the work-group
    # size; the element count is handed to the kernels as the last argument.
    global_work_size = t_array.size + (work_size[0] -
                                       t_array.size % work_size[0])

    prg.fold_exp(queue, (global_work_size, tau_arr.size), work_size, t_arr_gpu,
                 np.float32(w), tau_buf, out.data, np.uint32(t_array.size))

    coh_no_div.coh_gauss(queue, (global_work_size, 3), work_size, t_arr_gpu,
                         np.float32(w / 1.4142), out_coh.data,
                         np.uint32(t_array.size))

    queue.finish()
    a = out.get(async_=True)
    b = out_coh.get(async_=True)
    # The two reads above are non-blocking: wait for them to complete before
    # touching the host arrays, otherwise the normalisation races the copy.
    queue.finish()
    b /= np.abs(b).max(0)
    return a, b
예제 #9
0
    def _alloctmparrays(self,
                        inp_shape,
                        outp_shape):
        """Allocate the temporary input and output device arrays.

        For each of the ``self.num_fun`` functions, ``2 * self.num_dev`` slots
        are appended to ``self.inp`` and ``self.outp``. Every array has
        ``self.slices + self.overlap`` as its leading dimension, followed by
        the trailing dimensions of the matching entry in *inp_shape* /
        *outp_shape*. An input entry of length zero gets an empty list
        instead of an array.
        """
        leading = (self.slices + self.overlap, )
        slots_per_fun = 2 * self.num_dev

        # Input arrays: one list per function, one list per slot.
        for j in range(self.num_fun):
            self.inp.append([])
            for i in range(slots_per_fun):
                self.inp[j].append([])
                slot = self.inp[j][i]
                for shp in inp_shape[j]:
                    if len(shp) == 0:
                        slot.append([])
                        continue
                    # Slots 2d and 2d+1 both use queue number 4*d.
                    slot.append(
                        clarray.empty(self.queue[4 * (i // 2)],
                                      leading + shp[1:],
                                      dtype=self.dtype))

        # Output arrays: one array per function and slot.
        for j in range(self.num_fun):
            self.outp.append([])
            for i in range(slots_per_fun):
                self.outp[j].append(
                    clarray.empty(self.queue[4 * (i // 2)],
                                  leading + outp_shape[j][1:],
                                  dtype=self.dtype))
예제 #10
0
    def setup_arrays(self, nrays, nsamples, cutoff):
        """Fetch, or build and cache, the work arrays for one parameter triple.

        ``self.array_cache`` is keyed on ``(nrays, nsamples, cutoff)``. On a
        miss a new ``ArraySet`` is filled with device arrays allocated from
        ``self.memory_pool`` and stored under that key.
        """
        triple = (nrays, nsamples, cutoff)
        if triple in self.array_cache:
            return self.array_cache[triple]

        # Options shared by every allocation below
        common = dict(allocator=self.memory_pool)

        work = ArraySet()
        work.scratch = cla.empty(self.queue, (nsamples, nrays),
                                 dtype=np.float32, **common)
        work.result = cla.empty(self.queue, (nrays, ),
                                dtype=np.int32, **common)
        work.pre_cutoff = cla.empty(self.queue, (nrays, cutoff),
                                    dtype=np.float32, **common)
        work.pre_cutoff_squared = cla.empty_like(work.pre_cutoff)
        work.idx = cla.arange(self.queue, 0, cutoff * nrays, 1,
                              dtype=np.int32, **common)

        self.array_cache[triple] = work
        return work
예제 #11
0
    def init_openCL(self, set_mem_pool_None=False):
        """Create the OpenCL context, build the kernels and allocate buffers.

        The kernel source ``IB_kernels.cl`` is read from the directory of
        this module and rendered as a template with
        ``cardinality_T=self.cardinality_T`` before it is built.

        Parameters
        ----------
        set_mem_pool_None : bool
            When true, ``self.mem_pool`` is ``None`` and the arrays are
            created without a pool allocator; otherwise a
            ``MemoryPool(ImmediateAllocator(queue))`` is used.
        """
        self.context = cl.create_some_context()
        print('###  OPENCL Device #####')
        print(self.context.get_info(cl.context_info.DEVICES))

        kernel_dir = os.path.split(os.path.abspath(__file__))[0]
        # Use a context manager so the kernel file is closed after reading.
        with open(os.path.join(kernel_dir, "IB_kernels.cl")) as kernel_file:
            kernelsource = kernel_file.read()
        tpl = Template(kernelsource)
        rendered_tp = tpl.render(cardinality_T=self.cardinality_T)

        self.program = cl.Program(self.context, str(rendered_tp)).build()
        self.queue = cl.CommandQueue(self.context)
        if set_mem_pool_None:
            self.mem_pool = None
        else:
            self.mem_pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))

        # Static input distributions, uploaded once as float64
        self.p_x_given_y_buffer = cl_array.to_device(self.queue, self.p_x_given_y.astype(dtype=np.float64), allocator=self.mem_pool)
        self.p_x_and_y_buffer = cl_array.to_device(self.queue, self.p_x_y.astype(dtype=np.float64), allocator=self.mem_pool)
        self.p_y_buffer = cl_array.to_device(self.queue, self.p_y.astype(dtype=np.float64), allocator=self.mem_pool)

        # Uninitialised work buffers
        self.p_x_and_t_buffer = cl_array.empty(self.queue, (self.cardinality_T, self.cardinality_X), dtype=np.float64,
                                               allocator=self.mem_pool)
        self.p_t_buffer = cl_array.empty(self.queue, self.cardinality_T, dtype=np.float64,
                                         allocator=self.mem_pool)
        self.argmin_buffer = cl_array.empty(self.queue, self.cardinality_Y, dtype=np.int32, allocator=self.mem_pool)
        self.dkl_mat_buffer = cl_array.empty(self.queue, (self.cardinality_Y, self.cardinality_T), dtype=np.float64, allocator=self.mem_pool)
        self.start_vec_buffer = cl_array.empty(self.queue, self.cardinality_T, dtype=np.int32, allocator=self.mem_pool)

        # Kernel handles; np.int32 marks scalar arguments, None the others.
        self.dkl_compute_prog = self.program.compute_dkl_mat
        self.dkl_compute_prog.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None])

        self.find_argmin_prog = self.program.find_argmin
        self.find_argmin_prog.set_scalar_arg_dtypes([np.int32, np.int32, None, None])

        self.allow_move_prog = self.program.allow_move
        self.allow_move_prog.set_scalar_arg_dtypes([np.int32, None, None, None])

        self.compute_p_x_and_t_parallel_prog = self.program.compute_p_x_and_t_parallel
        self.compute_p_x_and_t_parallel_prog.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None, None, None])

        self.compute_p_x_given_t_parallel_prog = self.program.compute_p_x_given_t_parallel
        self.compute_p_x_given_t_parallel_prog.set_scalar_arg_dtypes(
            [np.int32, None, None])

        self.compute_p_t_parallel_prog = self.program.compute_p_t_parallel
        self.compute_p_t_parallel_prog.set_scalar_arg_dtypes([np.int32, None, None])

        self.update_dist_prog = self.program.update_distributions
        self.update_dist_prog.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None, None, None, None])
예제 #12
0
def empty(n, dtype, backend='cython'):
    """Create an uninitialised array of size/shape *n* and wrap it.

    ``'opencl'`` allocates with ``pyopencl.array`` on the queue returned by
    ``get_queue()``, ``'cuda'`` allocates with ``pycuda.gpuarray``, and any
    other *backend* falls back to ``numpy.empty``. The result is passed
    through ``wrap_array`` together with *backend*.
    """
    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        raw = gpuarray.empty(n, dtype)
    elif backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        raw = gpuarray.empty(get_queue(), n, dtype)
    else:
        raw = np.empty(n, dtype=dtype)
    return wrap_array(raw, backend)
예제 #13
0
def empty(n, dtype, backend='cython'):
    """Create an uninitialised ``Array`` of size/shape *n* on *backend*.

    For ``'opencl'`` and ``'cuda'`` a device array is allocated and attached
    to a fresh ``Array`` through ``set_dev_array``. Every other backend
    returns ``Array`` wrapping a ``numpy.empty`` result.
    """
    if backend not in ('opencl', 'cuda'):
        # Host-only path: no device array involved.
        return Array(np.empty(n, dtype=dtype))

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        device_data = gpuarray.empty(get_queue(), n, dtype)
    else:
        import pycuda.gpuarray as gpuarray
        device_data = gpuarray.empty(n, dtype)

    result = Array()
    result.set_dev_array(device_data)
    return result
    def init_OpenCL_decoding(self, msg_at_time_, context_=False):
        """Build the decoding kernels and upload the static graph data.

        The kernel source ``kernels_min_and_BP.cl`` is read from the
        directory of this module and rendered as a template with
        ``cn_degree=self.d_c_max``, ``vn_degree=self.d_v_max`` and
        ``msg_at_time=msg_at_time_``.

        Parameters
        ----------
        msg_at_time_ : int
            Value rendered into the kernel template as ``msg_at_time``.
        context_ :
            Existing OpenCL context to reuse. When falsy (the default) a new
            context is created with ``cl.create_some_context()``.
        """
        if not context_:
            self.context = cl.create_some_context()
        else:
            self.context = context_

        print(self.context.get_info(cl.context_info.DEVICES))
        kernel_dir = os.path.split(os.path.abspath(__file__))[0]

        # Use a context manager so the kernel file is closed after reading.
        with open(os.path.join(kernel_dir, "kernels_min_and_BP.cl")) as kernel_file:
            kernelsource = kernel_file.read()
        tpl = Template(kernelsource)
        rendered_tp = tpl.render(cn_degree=self.d_c_max, vn_degree=self.d_v_max, msg_at_time=msg_at_time_)

        self.program = cl.Program(self.context, str(rendered_tp)).build()

        self.queue = cl.CommandQueue(self.context)

        # Static int32 index tables, uploaded once
        self.inbox_memory_start_varnodes_buffer = cl_array.to_device(self.queue,
                                                                self.inbox_memory_start_varnodes.astype(np.int32))

        self.inbox_memory_start_checknodes_buffer = cl_array.to_device(self.queue,
                                                                  self.inbox_memory_start_checknodes.astype(np.int32))

        self.degree_varnode_nr_buffer = cl_array.to_device(self.queue, self.degree_varnode_nr.astype(np.int32))

        self.degree_checknode_nr_buffer = cl_array.to_device(self.queue, self.degree_checknode_nr.astype(np.int32))

        self.target_memorycells_varnodes_buffer = cl_array.to_device(self.queue,
                                                                self.target_memory_cells_varnodes.astype(np.int32))
        self.target_memorycells_checknodes_buffer = cl_array.to_device(self.queue,
                                                                  self.target_memory_cells_checknodes.astype(np.int32))

        # Uninitialised work buffers shaped like their host counterparts
        self.checknode_inbox_buffer = cl_array.empty(self.queue, self.inbox_memory_checknodes.shape, dtype=np.float64)

        self.varnode_inbox_buffer = cl_array.empty(self.queue, self.inbox_memory_varnodes.shape, dtype=np.float64)

        self.syndrom_buffer = cl_array.empty(self.queue,
            (self.degree_checknode_nr.shape[0], self.inbox_memory_varnodes.shape[-1]), dtype=np.int32)

        self.krnl = get_sum_kernel(self.context, None,
                                   dtype_in=self.varnode_inbox_buffer.dtype)

        # define programs
        self.send_prog = self.program.send_channel_values_to_checknode_inbox

        self.varnode_update_prog = self.program.varnode_update

        self.checknode_update_prog = self.program.checknode_update

        self.calc_syndrom_prog = self.program.calc_syndrome

        self.varoutput_prog = self.program.calc_varnode_output
    def __init__(self,
                 sino_shape,
                 slice_shape=None,
                 axis_position=None,
                 angles=None,
                 ctx=None,
                 devicetype="all",
                 platformid=None,
                 deviceid=None,
                 profile=False):
        """Create the backprojector/projector pair and the device arrays.

        The OpenCL selection arguments (*ctx*, *devicetype*, *platformid*,
        *deviceid*, *profile*) are forwarded to ``OpenclProcessing``. The
        geometry arguments (*sino_shape*, *slice_shape*, *axis_position*,
        *angles*) configure the ``Backprojection``, whose slice shape, angles
        and number of bins are then reused for the ``Projection``.
        """
        OpenclProcessing.__init__(self, ctx=ctx, devicetype=devicetype,
                                  platformid=platformid, deviceid=deviceid,
                                  profile=profile)

        # Create a backprojector
        backproj = Backprojection(sino_shape,
                                  slice_shape=slice_shape,
                                  axis_position=axis_position,
                                  angles=angles,
                                  ctx=self.ctx,
                                  profile=profile)
        self.backprojector = backproj
        # Create a projector matching the backprojector's geometry
        self.projector = Projection(backproj.slice_shape,
                                    backproj.angles,
                                    axis_position=axis_position,
                                    detector_width=backproj.num_bins,
                                    normalize=False,
                                    ctx=self.ctx,
                                    profile=profile)
        self.sino_shape = sino_shape
        self.is_cpu = backproj.is_cpu

        # Arrays: two sinogram-shaped and two slice-shaped float32 buffers
        self.d_data = parray.empty(self.queue, sino_shape, dtype=np.float32)
        self.d_sino = parray.empty_like(self.d_data)
        self.d_x = parray.empty(self.queue, backproj.slice_shape,
                                dtype=np.float32)
        self.d_x_old = parray.empty_like(self.d_x)

        buffer_names = ("d_data", "d_sino", "d_x", "d_x_old")
        for name in buffer_names:
            getattr(self, name).fill(0.0)

        self.add_to_cl_mem({name: getattr(self, name)
                            for name in buffer_names})
예제 #16
0
    def __init__(self, size, **kwargs):
        """
        Parameters
        ----------
        size : tuple of two int
            (height, width) of the map

        kwargs
        ------
        apple_num : int
            number of total apples in a map
        eat_apple : float
            reward given when apple is eaten.
        hit_wall : float
            punishment (or reward) given when a wall is hit.
        """
        # Don't confuse 'Viewer' and 'Engine'

        # kwargs (all three keys are required)
        self._apple_num = kwargs['apple_num']
        self._rewards = {
            'eat_apple': kwargs['eat_apple'],
            'hit_wall': kwargs['hit_wall'],
        }

        # Size of Engine should always be the same while running
        self._height = size[0]
        self._width = size[1]
        image_shape = (self.size[0], self.size[1], 3)
        self._image = np.zeros(image_shape, dtype=np.uint8)
        self._TM = ThingsManager()

        # OpenCL things: first device of the first platform
        first_platform = cl.get_platforms()[0]
        self.device = first_platform.get_devices()[0]
        self.ctx = cl.Context([self.device])
        self.queue = cl.CommandQueue(self.ctx)

        # Host-side colours and their device copies
        self.bg_color = np.array(colors.COLOR_BACKGROUND, dtype=np.uint8)
        self.wall_color = np.array(colors.COLOR_WALL, dtype=np.uint8)
        self.image_dev = cl_array.empty(self.queue, self.image.shape, np.uint8)
        self.bg_col_dev = cl_array.to_device(self.queue, self.bg_color)
        self.wall_col_dev = cl_array.to_device(self.queue, self.wall_color)

        # Left unset here; assigned elsewhere
        self.fp_ray_dev = None
        self.delta_vec_dev = None
        observation_shape = (2, ec.RayNum, 3)
        self.observation_dev = cl_array.empty(self.queue, observation_shape,
                                              np.uint8)

        # Build the kernels from the source file next to this module
        kernel_file = path.join(path.dirname(__file__), 'cl_scripts/ray.cl')
        with open(kernel_file, 'r') as source:
            kernel_source = source.read()
        self.program = cl.Program(self.ctx, kernel_source).build()

        # Initiate things first and then call CollisionManager
        self.initiate_things()
        self._CM = CollisionManager(self.size, self._TM)
예제 #17
0
def test_numpy_integer_shape(ctx_factory):
    """Check that ``cl_array.empty`` accepts numpy integers as a shape.

    The test is skipped when ``list(np.int32(17))`` succeeds, i.e. when the
    numpy scalar is iterable.
    """
    try:
        list(np.int32(17))
    except Exception:
        # Expected: the scalar is not iterable. ``Exception`` rather than a
        # bare ``except`` so KeyboardInterrupt/SystemExit still propagate.
        pass
    else:
        from pytest import skip
        skip("numpy implementation does not handle scalar correctly.")
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # Both a scalar shape and a tuple of numpy integers must be accepted.
    cl_array.empty(queue, np.int32(17), np.float32)
    cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32)
예제 #18
0
def test_numpy_integer_shape(ctx_factory):
    """Allocate device arrays whose shape is given as numpy integers.

    Skipped when ``list(np.int32(17))`` does not raise.
    """
    try:
        list(np.int32(17))
    except Exception:
        scalar_is_iterable = False
    else:
        scalar_is_iterable = True

    if scalar_is_iterable:
        from pytest import skip
        skip("numpy implementation does not handle scalar correctly.")

    queue = cl.CommandQueue(ctx_factory())

    side = np.int32(17)
    # Scalar shape first, then a two-element tuple shape.
    for shape in (side, (side, side)):
        cl_array.empty(queue, shape, np.float32)
예제 #19
0
    def _init_cl_arrays(self):
        """Upload ``self.G`` and its conjugate and allocate the work arrays.

        All arrays use ``self.complexdtype``. The two work arrays are
        zero-filled with shape ``self.N12_pad``; the two field arrays are
        left uninitialised with shapes ``self.N1`` and ``self.N2``.
        """
        queue = self.queue
        ctype = self.complexdtype

        G_host = self.G.astype(ctype)
        self.cl_G = cla.to_device(queue, G_host)
        self.cl_G_conj = cla.to_device(queue, G_host.conj())

        padded_shape = tuple(self.N12_pad)
        self.cl_work = cla.zeros(queue, padded_shape, ctype)
        self.cl_workF = cla.zeros_like(self.cl_work)

        self.cl_field1 = cla.empty(queue, tuple(self.N1), ctype)
        self.cl_field2 = cla.empty(queue, tuple(self.N2), ctype)
예제 #20
0
    def _prep_gpu():
        """ Set up GPU calculation dependencies

        Imports the GPU libraries, unpacks ``gpu_info`` into
        ``(context, device, queue, platform)``, builds two kernels, creates an
        FFT plan and uploads the work arrays.

        Returns a tuple ``(phase_multiply, copy_to_buffer, fftplan, rarray,
        fourier, phase, back, store, build)``.

        This function takes no arguments: ``gpu_info``, ``data``,
        ``distances``, ``N``, ``r``, ``nf``, ``rows``, ``cols`` and the
        other free names are read from an enclosing scope that is not
        visible here (presumably ``propagate_distances`` -- TODO confirm).
        """

        # try to import the necessary libraries
        fallback = False
        try:
            import gpu
            import string
            import pyopencl as cl
            import pyopencl.array as cla
            from pyfft.cl import Plan
        except ImportError:
            fallback = True

        # check gpu_info
        # NOTE(review): if ``import gpu`` itself failed above, ``gpu`` is
        # unbound here and this line raises a NameError subclass, which the
        # ``except AssertionError`` below does not catch.
        # NOTE(review): ``assert`` is stripped under ``python -O``, so this
        # validation would then be skipped entirely.
        try:
            assert gpu.valid(gpu_info),\
            "gpu_info in propagate_distances improperly specified"

            context, device, queue, platform = gpu_info
        except AssertionError:
            fallback = True

        # NOTE(review): the fallback result is discarded and execution
        # continues into the GPU path below, which then uses names that may
        # be unbound. A ``return`` looks intended -- confirm how the caller
        # expects the fallback to be delivered.
        if fallback:
            propagate_distances(data, distances, energy_or_wavelength,
                                pixel_pitch, subregion=subregion,
                                silent=silent, band_limit=band_limit,
                                gpu_info=None, im_convert=im_convert)

        # if everything is OK, allocate memory and build kernels
        # NOTE(review): ``string.join`` exists only in Python 2; the path is
        # also assembled with hard-coded '/' separators.
        kp = string.join(gpu.__file__.split('/')[:-1], '/')+'/kernels/'
        build = _build_helper(context, device, kp)
        phase_multiply = build('propagate_phase_multiply.cl')
        copy_to_buffer = build('propagate_copy_to_save_buffer.cl')
        fftplan = Plan((N, N), queue=queue)

        # put the signals onto the gpu along with buffers for the
        # various operations
        rarray = cla.to_device(queue, r.astype(np.float32))
        fourier = cla.to_device(queue, data.astype(np.complex64))
        phase = cla.empty(queue, (N, N), np.complex64)
        back = cla.empty(queue, (N, N), np.complex64)
        store = cla.empty(queue, (nf, rows, cols), np.complex64)

        # precompute the fourier transform of data.
        # (called with only ``fourier.data``, i.e. no separate output buffer)
        fftplan.execute(fourier.data, wait_for_finish=True)

        return phase_multiply, copy_to_buffer, fftplan, rarray, fourier,\
               phase, back, store, build
예제 #21
0
    def build_scratch(self, imshape):
        """Allocate the index array and the scratch buffers for *imshape*.

        ``self.array_indices`` receives the indices ``0 .. prod(imshape) - 1``
        as ``int32``. ``self.scratch`` (``float32``) and
        ``self.index_scratch`` (``int32``) receive one buffer per level, each
        level's length being the previous length floor-divided by
        ``self.runlen``, until the length is at most 1.

        Parameters
        ----------
        imshape : tuple of int
            Shape whose total element count drives the buffer sizes; stored
            as ``self.imshape``.
        """
        self.scratch = []
        self.index_scratch = []

        length = int(np.prod(imshape))
        self.array_indices = cla.arange(self.queue, 0, length, 1,
                                        dtype=np.int32)
        if length % self.runlen != 0:
            # NOTE(review): this adds the remainder, which only yields a
            # multiple of runlen in special cases; rounding up would be
            # ``runlen - length % runlen``. Left unchanged because the
            # consumers of these buffers are not visible here -- confirm.
            length += length % self.runlen
        while length > 1:
            # Floor division keeps the length an integer; true division
            # would produce a float, which is not a valid array shape.
            length //= self.runlen
            self.scratch.append(cla.empty(self.queue, (length, ), np.float32))
            self.index_scratch.append(
                cla.empty(self.queue, (length, ), np.int32))

        self.imshape = imshape
예제 #22
0
    def build_scratch(self, imshape):
        """Allocate the index array and the scratch buffers for *imshape*.

        ``self.array_indices`` receives the indices ``0 .. prod(imshape) - 1``
        as ``int32``. ``self.scratch`` (``float32``) and
        ``self.index_scratch`` (``int32``) receive one buffer per level, each
        level's length being the previous length floor-divided by
        ``self.runlen``, until the length is at most 1.

        Parameters
        ----------
        imshape : tuple of int
            Shape whose total element count drives the buffer sizes; stored
            as ``self.imshape``.
        """
        self.scratch = []
        self.index_scratch = []

        length = int(np.prod(imshape))
        self.array_indices = cla.arange(self.queue, 0, length, 1,
                                        dtype=np.int32)
        if length % self.runlen != 0:
            # NOTE(review): this adds the remainder, which only yields a
            # multiple of runlen in special cases; rounding up would be
            # ``runlen - length % runlen``. Left unchanged because the
            # consumers of these buffers are not visible here -- confirm.
            length += length % self.runlen
        while length > 1:
            # Floor division keeps the length an integer; true division
            # would produce a float, which is not a valid array shape.
            length //= self.runlen
            self.scratch.append(cla.empty(self.queue, (length,), np.float32))
            self.index_scratch.append(
                cla.empty(self.queue, (length,), np.int32))

        self.imshape = imshape
예제 #23
0
 def _dev_array(self):
     """Return this object's device array, allocating it on first access.

     The array has ``self.sparsity.nz`` elements of ``self.dtype`` and is
     stored under the literal attribute name ``'__dev_array'``. The name is
     used through ``hasattr``/``setattr``/``getattr`` as a plain string, so
     it is not subject to private-name mangling.
     """
     attr_name = '__dev_array'
     if hasattr(self, attr_name):
         return getattr(self, attr_name)

     device_array = array.empty(_queue, self.sparsity.nz, self.dtype)
     setattr(self, attr_name, device_array)
     return getattr(self, attr_name)
예제 #24
0
    def __init__(self,
                 ctx,
                 queue,
                 par,
                 kwidth=3,
                 overgridfactor=2,
                 fft_dim=(1, 2),
                 klength=200,
                 DTYPE=np.complex64,
                 DTYPE_real=np.float32):
        """Set up the gridding tables, device buffers, FFT plan and kernels.

        Parameters
        ----------
        ctx, queue :
            OpenCL context and command queue used for all device objects.
        par : dict
            Must provide ``"NScan"``, ``"NC"``, ``"NSlice"``, ``"N"``,
            ``"traj"``, ``"dcf"`` and ``"Nproj"``.
        kwidth : int
            Kernel width handed to ``calckbkernel``; ``self.kwidth`` stores
            half of it.
        overgridfactor : int
            Handed to ``calckbkernel`` and stored on the instance.
        fft_dim : tuple of int
            Axes passed to the FFT.
        klength : int
            Handed to ``calckbkernel``.
        DTYPE, DTYPE_real :
            Complex and real element types used for the device arrays.
        """
        print("Setting up PyOpenCL NUFFT.")
        self.DTYPE = DTYPE
        self.DTYPE_real = DTYPE_real
        self.fft_shape = (par["NScan"] * par["NC"] * par["NSlice"], par["N"],
                          par["N"])
        self.traj = par["traj"]
        self.dcf = par["dcf"]
        self.Nproj = par["Nproj"]
        self.ctx = ctx
        self.queue = queue

        self.overgridfactor = overgridfactor
        self.kerneltable, self.kerneltable_FT, self.u = calckbkernel(
            kwidth, overgridfactor, par["N"], klength)
        self.kernelpoints = self.kerneltable.size
        self.fft_scale = DTYPE_real(
            np.sqrt(np.prod(self.fft_shape[fft_dim[0]:])))
        self.deapo = 1 / self.kerneltable_FT.astype(DTYPE_real)
        self.kwidth = kwidth / 2
        self.cl_kerneltable = cl.Buffer(
            self.ctx,
            cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
            hostbuf=self.kerneltable.astype(DTYPE_real).data)
        self.deapo_cl = cl.Buffer(self.ctx,
                                  cl.mem_flags.READ_ONLY
                                  | cl.mem_flags.COPY_HOST_PTR,
                                  hostbuf=self.deapo.data)
        # From here on self.dcf and self.traj refer to the device copies.
        self.dcf = clarray.to_device(self.queue, self.dcf)
        self.traj = clarray.to_device(self.queue, self.traj)
        self.tmp_fft_array = (clarray.empty(self.queue, (self.fft_shape),
                                            dtype=DTYPE))
        # Alternating +1/-1 pattern of length N
        self.check = np.ones(par["N"], dtype=DTYPE_real)
        self.check[1::2] = -1
        self.check = clarray.to_device(self.queue, self.check)
        self.par_fft = int(self.fft_shape[0] / par["NScan"])
        # The plan covers the first par_fft entries of the scratch array and
        # writes its output to the same region.
        self.fft = FFT(ctx,
                       queue,
                       self.tmp_fft_array[0:self.par_fft, ...],
                       out_array=self.tmp_fft_array[0:self.par_fft, ...],
                       axes=fft_dim)
        self.gridsize = par["N"]
        self.fwd_NUFFT = self.NUFFT
        self.adj_NUFFT = self.NUFFTH
        # Read the kernel source inside a context manager so the file handle
        # is closed deterministically instead of being leaked.
        kernel_path = resource_filename('rrsg_cgreco',
                                        'kernels/opencl_nufft_kernels.c')
        with open(kernel_path) as kernel_file:
            kernel_source = kernel_file.read()
        self.prg = Program(self.ctx, kernel_source)
예제 #25
0
파일: util.py 프로젝트: ufo-kit/syris
def get_array(data, queue=None):
    """Get pyopencl.array.Array from *data* which can be a numpy array, a pyopencl.array.Array or a
    pyopencl.Image. *queue* is an OpenCL command queue; when it is not given,
    ``cfg.OPENCL.queue`` is used.

    A ``pyopencl.array.Array`` is returned as is. A numpy array is converted to the configured
    complex precision when its dtype kind is complex and to the configured float precision
    otherwise, then uploaded. A ``pyopencl.Image`` is copied into a float32 array which is
    converted when its item size differs from the configured float size.

    Raises ``TypeError`` for any other type of *data*.
    """
    if not queue:
        queue = cfg.OPENCL.queue

    if isinstance(data, cl_array.Array):
        result = data
    elif isinstance(data, np.ndarray):
        # A single conversion is enough: ``astype`` always returns a fresh
        # array of the target dtype, so converting conditionally first and
        # then again unconditionally only copied the data twice.
        if data.dtype.kind == 'c':
            target_dtype = cfg.PRECISION.np_cplx
        else:
            target_dtype = cfg.PRECISION.np_float
        result = cl_array.to_device(queue, data.astype(target_dtype))
    elif isinstance(data, cl.Image):
        # The array shape is the image shape reversed; the copy region uses
        # the image's own ordering again.
        result = cl_array.empty(queue, data.shape[::-1], np.float32)
        cl.enqueue_copy(queue, result.data, data, offset=0, origin=(0, 0),
                        region=result.shape[::-1])
        if result.dtype.itemsize != cfg.PRECISION.cl_float:
            result = result.astype(cfg.PRECISION.np_float)
    else:
        raise TypeError('Unsupported data type {}'.format(type(data)))

    return result
예제 #26
0
def axis_convolve(X, h, axis=0, queue=None, output=None):
    """Filter along one axis of *X* using the filter vector *h*.

    For an odd-length *h* each output sample is aligned with an input sample
    and the result has the same shape as *X*.  For an even-length *h* each
    output sample is aligned with the mid point of a pair of input samples and
    the result is one element longer along the convolution axis.

    *axis* selects the axis of convolution; the default is column-wise.

    *queue*, when not ``None``, should be a :py:class:`pyopencl.CommandQueue`
    used for the computation; otherwise a default global queue is used.

    *output*, when not ``None``, should be a :py:class:`pyopencl.array.Array`
    that receives the result; otherwise a float32 output array is created.

    Returns the :py:class:`pyopencl.array.Array` holding the device-side
    result, which can be fetched to the host via :py:func:`to_array`.
    """

    _check_cl()
    queue = to_queue(queue)
    kernel = _convolve_kernel_for_queue(queue.context)

    if output is None:
        dims = list(X.shape)
        even_length_filter = not h.shape[0] % 2
        if even_length_filter:
            # Mid-point alignment yields one more sample along the filtered axis.
            dims[axis] += 1
        output = cl_array.empty(queue, dims, np.float32)

    return _apply_kernel(X, h, kernel, output, axis=axis)
예제 #27
0
 def _evaluate(self, valuation, cache):
     """Evaluate this node once per *cache* and return the stored result.

     The three operands (input, weights, bias) are evaluated recursively.
     The input is unfolded with ``conv.im2col``, multiplied per sample with
     the flattened weights and, when a bias is present, ``conv.bcast_add``
     is applied. The result is memoised in *cache* under ``id(self)``.
     """
     q = pl.qs[0]
     key = id(self)
     if key in cache:
         return cache[key]

     X = self.ops[0]._evaluate(valuation, cache)
     W = self.ops[1]._evaluate(valuation, cache)
     b = self.ops[2]._evaluate(valuation, cache)

     n, c, h, w = X.shape
     out_c, _, kh, kw = W.shape
     out_h = conv.get_conv_outsize(
         h, kh, self.sy, self.ph, cover_all=self.cover_all)
     out_w = conv.get_conv_outsize(
         w, kw, self.sx, self.pw, cover_all=self.cover_all)

     y = clarray.empty(q, (n, out_c, out_h, out_w), dtype=X.dtype)
     self.col, unfold_event = conv.im2col(
         q, X, kh, kw, self.sy, self.sx, self.ph, self.pw, self.cover_all)
     W_mat = W.reshape(out_c, -1)
     unfold_event.wait()  # TODO asynchronize

     col_mats = self.col.reshape(n, -1, out_h * out_w)
     y_mats = y.reshape(n, out_c, -1)
     # One matrix product per sample of the batch.
     for sample in xrange(n):
         y_mats[sample] = linalg.dot(q, W_mat, col_mats[sample])

     if b is not None:
         # y += b[:, None, None]
         _, bias_event = conv.bcast_add(q, y, b, y)
         bias_event.wait()  # TODO asynchronize

     cache[key] = y
     return cache[key]
예제 #28
0
def get_fluid_source(params, G, P, D, out=None):
    """Calculate a small fluid source term, added to conserved variables for stability.

    The stress tensor is assembled row by row with ``Tmhd_vec`` and then
    contracted with three cached device arrays (connection coefficients times
    the cell-centred ``gdet``), one per index 1..3 of the third connection
    axis. Each contraction is written to its own slice of *out*.

    :param params: dict providing the OpenCL ``'queue'`` and optionally ``'profile'``.
    :param G: grid object supplying ``slices``, ``shapes``, ``conn``, ``gdet`` and ``dot2D2geom``.
    :param P: primitives array; also the template for *out* when *out* is None.
    :param D: passed through to ``Tmhd_vec``.
    :param out: optional device array receiving the result.
    :return: *out* (newly allocated with ``empty_like(P)`` if not supplied).
    """
    s = G.slices
    sh = G.shapes

    # T the old fashioned way: TODO Tmhd_full...
    T = cl_array.empty(params['queue'], sh.grid_tensor, dtype=np.float64)
    for mu in range(4):
        Tmhd_vec(params, G, P, D, mu, out=T[mu])

    if out is None:
        # NOTE(review): only the U1..U3 slices are written below; the rest of a
        # freshly allocated *out* stays uninitialised - confirm callers expect that.
        out = cl_array.empty_like(P)

    # The connection*gdet products do not depend on the fluid state, so they
    # are uploaded once and kept in module-level globals.
    global gcon1_d, gcon2_d, gcon3_d
    if gcon1_d is None:
        gcon1_d = cl_array.to_device(params['queue'],
                                     (G.conn[:, :, 1, :, :] *
                                      G.gdet[Loci.CENT.value]).copy())
        gcon2_d = cl_array.to_device(params['queue'],
                                     (G.conn[:, :, 2, :, :] *
                                      G.gdet[Loci.CENT.value]).copy())
        gcon3_d = cl_array.to_device(params['queue'],
                                     (G.conn[:, :, 3, :, :] *
                                      G.gdet[Loci.CENT.value]).copy())

    # Contract mhd stress tensor with connection.
    # The third contraction must land in U3: writing it to U2 would overwrite
    # the second component and leave the third one unset.
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon1_d, out=out[s.U1])
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon2_d, out=out[s.U2])
    evt, _ = G.dot2D2geom(params['queue'], u=T, g=gcon3_d, out=out[s.U3])

    if 'profile' in params and params['profile']:
        # Only the last event is waited on when profiling.
        evt.wait()

    return out
예제 #29
0
def get_array(data, queue=None):
    """Get a pyopencl.array.Array from *data*.

    *data* can be a numpy array, a pyopencl.array.Array or a pyopencl.Image.
    *queue* is an OpenCL command queue; ``cfg.OPENCL.queue`` is used if it is
    not specified.

    A pyopencl array is returned as is. A numpy array is cast to
    ``cfg.PRECISION.np_cplx`` (complex input) or ``cfg.PRECISION.np_float``
    (anything else) and uploaded. An image is copied into a new device array.

    Raises TypeError if *data* is of any other type.
    """
    if not queue:
        queue = cfg.OPENCL.queue

    if isinstance(data, cl_array.Array):
        result = data
    elif isinstance(data, np.ndarray):
        is_complex = data.dtype.kind == 'c'
        host_dtype = cfg.PRECISION.np_cplx if is_complex else cfg.PRECISION.np_float
        # Cast exactly once. A preliminary conditional cast to the same type
        # gives the same values and only costs an extra host copy.
        result = cl_array.to_device(queue, data.astype(host_dtype))
    elif isinstance(data, cl.Image):
        # The destination array uses the image shape in reversed order.
        result = cl_array.empty(queue, data.shape[::-1], np.float32)
        cl.enqueue_copy(queue,
                        result.data,
                        data,
                        offset=0,
                        origin=(0, 0),
                        region=result.shape[::-1])
        if result.dtype.itemsize != cfg.PRECISION.cl_float:
            result = result.astype(cfg.PRECISION.np_float)
    else:
        raise TypeError('Unsupported data type {}'.format(type(data)))

    return result
예제 #30
0
    def __init_particle(self):
        """Allocate the particle position, velocity and time arrays on the device.

        Positions are allocated uninitialised, velocities are drawn from a
        normal distribution (mu=0, sigma=1) and times start at zero. The
        ``rt_init_particles`` kernel is then run once per particle on the
        position and velocity buffers, and the call blocks until it finished.
        """
        print("Info- init particles")
        queue = self.ocl_queue
        n_components = self.dim * self.np
        rng = PhiloxGenerator(self.ocl_ctx)

        # Positions: uninitialised here - presumably filled in by the
        # rt_init_particles kernel below (TODO confirm against the kernel).
        # An earlier variant drew them from a normal distribution
        # (mu=0.5, sigma=0.05) instead.
        self.x_gpu = cl_array.empty(queue, n_components, dtype=self.dtype)

        # Velocities
        self.v_gpu = rng.normal(queue, (n_components), self.dtype,
                                mu=0, sigma=1)

        # Times
        self.t_gpu = cl_array.zeros(queue, self.np, dtype=self.dtype)

        init_event = self.ocl_prg.rt_init_particles(
            queue,
            (self.np, ),
            None,
            self.x_gpu.data,
            self.v_gpu.data,
        )
        init_event.wait()
예제 #31
0
파일: irgn.py 프로젝트: grlee77/PyQMRI
    def _calcFwdGNPartLinear(self, x):
        """Evaluate the forward operators used for the linear Gauss-Newton part.

        Returns a tuple ``(b, grad, sym_grad)`` of host arrays:

        * ``b`` - ``self._step_val`` itself when working in image space,
          otherwise the result of ``self._FT.FFT`` applied to
          ``self._step_val[:, None, ...] * self.par["C"]``;
        * ``grad`` - output of ``self._grad_op.fwd`` applied to *x*;
        * ``sym_grad`` - output of ``self._symgrad_op.fwd`` applied to
          ``self._v`` when the regularisation type is ``'TGV'``, else ``None``.
        """
        queue = self._queue[0]

        if self._imagespace is not False:
            b = self._step_val
        else:
            b_dev = clarray.empty(queue, self._data_shape, dtype=self._DTYPE)
            weighted = clarray.to_device(
                queue, (self._step_val[:, None, ...] * self.par["C"]))
            self._FT.FFT(b_dev, weighted).wait()
            b = b_dev.get()

        x = clarray.to_device(queue, np.require(x, requirements="C"))
        grad_dev = clarray.to_device(
            queue, np.zeros(x.shape + (4, ), dtype=self._DTYPE))
        grad_event = self._grad_op.fwd(
            grad_dev, x, wait_for=grad_dev.events + x.events)
        grad_dev.add_event(grad_event)
        x = x.get()
        grad = grad_dev.get()

        sym_grad = None
        if self._reg_type == 'TGV':
            v_dev = clarray.to_device(queue, self._v)
            sym_dev = clarray.to_device(
                queue, np.zeros(x.shape + (8, ), dtype=self._DTYPE))
            sym_event = self._symgrad_op.fwd(
                sym_dev, v_dev, wait_for=sym_dev.events + v_dev.events)
            sym_dev.add_event(sym_event)
            sym_grad = sym_dev.get()

        return b, grad, sym_grad
예제 #32
0
def nd_arange(shape, axis=0, start=0, step=1, clq=None):
    """Fill an ND-array along one axis with a stepped range.

       nd_arange((Z, Y, X), axis=2, start=A, step=B) is functionally
       equivalent to:

         np.arange(A, A+X*B, B)[None,None,:] * np.ones((Z, Y, X), np.float32)

       but the work is done on the OpenCL device, without relying on
       array-broadcasting which is not supported in PyOpenCL.

       When *clq* is given, the device array is returned. When it is None,
       a temporary command queue is created and the result is returned as
       a host mapping of the device array instead.

    """
    assert 0 <= axis
    assert axis < len(shape)

    caller_supplied_queue = clq is not None
    if not caller_supplied_queue:
        clq = cl.CommandQueue(ctx)

    out_dev = cl_array.empty(clq, shape, float32)
    nd_arange_dev(clq, out_dev, axis, start, step)

    if caller_supplied_queue:
        return out_dev

    host_view = out_dev.map_to_host()
    clq.finish()
    return host_view
예제 #33
0
 def __init__(self,queue, array,shape=None,dtype=None,
                  orginal_cpu_readonly=False,force_release_gpu=False):
     """Wrap an existing array, or allocate a new device array.

     Either *array* (a host array or a ``cl_array.Array``) or both *shape*
     and *dtype* must be given. A host array is uploaded to the device via
     *queue*. If *array* is given together with *shape* / *dtype*, a
     mismatch raises ``ValueError``.
     """
     # We need to have an array, or the ability to create one
     assert (array is not None) or (shape is not None and dtype is not None)
     self._queue = queue

     if array is None:
         # Nothing to wrap: allocate uninitialised device memory.
         self._gpu_array = cl_array.empty(queue,shape,dtype=dtype)
         self._cpu_array = None
         self._created_orignal = True
         self._orginaly_on_gpu = True
     else:
         self._array = array
         self._created_orignal = False
         self._orginaly_on_gpu = isinstance(array,cl_array.Array)

         if not self._orginaly_on_gpu:
             self._cpu_array = array
             self._gpu_array = cl_array.to_device(queue,self._cpu_array)
         else:
             self._gpu_array = array
             self._cpu_array = None

         # Optional consistency checks against the requested layout.
         if shape is not None and array.shape != shape:
             raise ValueError("Array is not in correct shape")
         if dtype is not None and array.dtype != dtype:
             raise ValueError("Array has wrong data type")

     self._cpu_readonly = orginal_cpu_readonly
     self._force_release_gpu = force_release_gpu
예제 #34
0
def test_index_preservation(ctx_factory):
    """Check that the index ``i`` seen by a scan kernel's output statement
    matches the element position, for every size in ``scan_test_counts``.

    The debug scan kernel is tested in addition when the first device of the
    context is a CPU. The test is skipped when mako is not installed.
    """
    from gc import collect
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    first_device = context.devices[0]
    kernel_classes = [GenericScanKernel]
    if first_device.type & cl.device_type.CPU:
        kernel_classes.append(GenericDebugScanKernel)

    for kernel_cls in kernel_classes:
        for n in scan_test_counts:
            scan = kernel_cls(
                    context, np.int32,
                    arguments="__global int *out",
                    input_expr="i",
                    scan_expr="b", neutral="0",
                    output_statement="""
                        out[i] = item;
                        """)

            result = cl_array.empty(queue, n, dtype=np.int32)
            scan(result)

            expected = np.arange(n)
            assert (result.get() == expected).all()
            collect()
예제 #35
0
 def allocate_arrays(self):
     """
     Allocate the host arrays, device arrays and reference results
     used by the tests
     """
     image = self.image
     queue = self.la.queue

     # Host-side work arrays and reference results
     self.grad = np.zeros(image.shape, dtype=np.complex64)
     self.grad2 = np.zeros((2, ) + image.shape, dtype=np.float32)
     self.grad_ref = gradient(image)
     self.div_ref = divergence(self.grad_ref)
     self.image2 = np.zeros_like(image)

     # Device gradient, zeroed through the parray "fill" utility.
     # A plain cl.Buffer(self.la.ctx, cl.mem_flags.READ_WRITE, size=self.image.nbytes*2)
     # would be preferable, but platforms not supporting OpenCL 1.2
     # have a problem with enqueue_fill_buffer.
     gradient_dev = parray.empty(queue, image.shape, np.complex64)
     gradient_dev.fill(0)
     self.gradient_parray = gradient_dev
     self.gradient_buffer = gradient_dev.data

     # Device copy of the image
     self.image_parray = parray.to_device(queue, image)
     self.image_buffer = self.image_parray.data

     # Reference gradient packed as complex: component 0 in the real part,
     # component 1 in the imaginary part
     packed_ref = np.zeros(image.shape, dtype=np.complex64)
     packed_ref.real = np.copy(self.grad_ref[0])
     packed_ref.imag = np.copy(self.grad_ref[1])
     self.grad_ref_parray = parray.to_device(queue, packed_ref)
     self.grad_ref_buffer = self.grad_ref_parray.data
예제 #36
0
 def nFTH(x, fft, par):
     """Apply ``fft.FFTH`` to every ``x[j, k]`` block and collect the results on the host.

     Each block over the first two axes of *x* is uploaded, transformed with
     ``scan_offset=j`` and stored in an array of shape
     ``(NScan, NC, NSlice, dimY, dimX)`` taken from *par*. The elapsed wall
     time is printed before the result is returned.
     """
     dims = np.shape(x)
     volume_shape = (par["NSlice"], par["dimY"], par["dimX"])
     result = np.zeros((par["NScan"], par["NC"]) + volume_shape,
                       dtype=par["DTYPE"])
     # One device buffer is reused for every transformed block.
     tmp_result = clarray.empty(fft.queue, (1, 1) + volume_shape,
                                dtype=par["DTYPE"])

     start = time.time()
     for scan in range(dims[0]):
         for coil in range(dims[1]):
             block = np.require(x[scan, coil, ...][None, None, ...],
                                requirements="C")
             inp = clarray.to_device(fft.queue, block)
             fft.FFTH(tmp_result, inp, scan_offset=scan).wait()
             result[scan, coil, ...] = np.squeeze(tmp_result.get())
     end = time.time() - start
     print("FT took %f s" % end)
     return result
예제 #37
0
def test_index_preservation(ctx_factory):
    """Verify that scan kernels pass the element index ``i`` through unchanged.

    For every size in ``scan_test_counts`` a scan kernel writing ``item`` to
    ``out[i]`` with ``input_expr="i"`` must reproduce ``np.arange(n)``. The
    debug kernel is included when the first device is a CPU. Skipped
    without mako.
    """
    from gc import collect
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel
    on_cpu = context.devices[0].type & cl.device_type.CPU
    classes = [GenericScanKernel]
    if on_cpu:
        classes.append(GenericDebugScanKernel)

    for cls in classes:
        for n in scan_test_counts:
            knl = cls(
                    context, np.int32,
                    arguments="__global int *out",
                    input_expr="i",
                    scan_expr="b", neutral="0",
                    output_statement="""
                        out[i] = item;
                        """)

            indices_dev = cl_array.empty(queue, n, dtype=np.int32)
            knl(indices_dev)

            indices = indices_dev.get()
            assert (indices == np.arange(n)).all()
            collect()
 def __init__(self, idata):
     # idata: an array of lowercase characters.
     # Get platform and device (complete)
     NAME = 'NVIDIA CUDA'
     platforms = cl.get_platforms()
     devs = None
     for platform in platforms:
         if platform.name == NAME:
             devs = platform.get_devices()
     
     # TODO:
     # Set up a command queue (complete)
     self.ctx = cl.Context(devs)
     self.queue = cl.CommandQueue(self.ctx)
     
     # host variables (incomplete)
     # N = 16 #get rid of N #deprecate
     self.a = idata #a is a bunch of letters
     #self.b = np.random.rand(N).astype(np.float32) #deprecate
     
     # device memory allocation (incomplete)
     self.a_gpu = cl_array.to_device(self.queue, self.a) 
     # self.b_gpu = cl_array.to_device(self.queue, self.b) #deprecate
     self.c_gpu = cl_array.empty(self.queue, self.a.shape, self.a.dtype)
     
     # kernel code (incomplete)
     self.kernel = """
    def __init__(self,
                 queue,
                 array,
                 shape=None,
                 dtype=None,
                 orginal_cpu_readonly=False,
                 force_release_gpu=False):
        """Wrap *array*, or allocate a device array of *shape* and *dtype*.

        A ``cl_array.Array`` is used directly; any other array is kept as the
        CPU copy and uploaded through *queue*. When *array* is ``None`` an
        uninitialised device array is allocated instead.

        Raises ``ValueError`` if *array* is given and disagrees with an
        explicitly requested *shape* or *dtype*.
        """
        # An array, or the ability to create one, is required
        have_array = array is not None
        assert have_array or (shape is not None and dtype is not None)
        self._queue = queue

        if have_array:
            self._array = array
            self._created_orignal = False
            on_gpu = isinstance(array, cl_array.Array)
            self._orginaly_on_gpu = on_gpu

            if on_gpu:
                self._gpu_array, self._cpu_array = array, None
            else:
                self._cpu_array = array
                self._gpu_array = cl_array.to_device(queue, self._cpu_array)

            shape_mismatch = shape is not None and array.shape != shape
            if shape_mismatch:
                raise ValueError("Array is not in correct shape")
            dtype_mismatch = dtype is not None and array.dtype != dtype
            if dtype_mismatch:
                raise ValueError("Array has wrong data type")
        else:
            self._gpu_array = cl_array.empty(queue, shape, dtype=dtype)
            self._cpu_array = None
            self._created_orignal = True
            self._orginaly_on_gpu = True

        self._cpu_readonly = orginal_cpu_readonly
        self._force_release_gpu = force_release_gpu
예제 #40
0
 def _init_cl_arrays(self):
     """Allocate the float32 device array for the far-field intensity.

     The array has the shape of ``self.far_field`` and is left uninitialised.
     """
     intensity_shape = self.far_field.shape
     self.cl_farfield_intensity = cla.empty(self.cl_queue,
                                            shape=intensity_shape,
                                            dtype=np.float32,
                                            allocator=self.cl_allocator)
예제 #41
0
    def uniform(self, *args, **kwargs):
        """Create a new array and fill it via :meth:`fill_uniform`.

        The keyword arguments *a* (default 0) and *b* (default 1) are passed
        to :meth:`fill_uniform`; all remaining arguments go to
        ``cl_array.empty``.
        """
        lower = kwargs.pop("a", 0)
        upper = kwargs.pop("b", 1)

        filled = cl_array.empty(*args, **kwargs)
        self.fill_uniform(filled, queue=filled.queue, a=lower, b=upper)
        return filled
예제 #42
0
파일: opencl.py 프로젝트: joshcc3/PyOP2
 def _allocate_device(self):
     """Allocate the device array if none has been allocated yet.

     The array takes the shape of the host data (transposed when ``self.soa``
     is set) and the state is switched to ``DeviceDataMixin.HOST``. Nothing
     happens in any other state.
     """
     if self.state is not DeviceDataMixin.DEVICE_UNALLOCATED:
         return

     host_layout = self._data.T if self.soa else self._data
     self._device_data = array.empty(_queue, shape=host_layout.shape,
                                     dtype=self.dtype)
     self.state = DeviceDataMixin.HOST
예제 #43
0
    def normal(self, *args, **kwargs):
        """Create a new array and fill it via :meth:`fill_normal`.

        The keyword arguments *mu* (default 0) and *sigma* (default 1) are
        passed to :meth:`fill_normal`; all remaining arguments go to
        ``cl_array.empty``.
        """
        mean = kwargs.pop("mu", 0)
        spread = kwargs.pop("sigma", 1)

        filled = cl_array.empty(*args, **kwargs)
        self.fill_normal(filled, queue=filled.queue, mu=mean, sigma=spread)
        return filled
예제 #44
0
파일: clrandom.py 프로젝트: EyNuel/pyopencl
    def __init__(self, queue, num_work_items,
            luxury=None, seed=None, no_warmup=False,
            use_legacy_init=False, max_work_items=None):
        """Allocate the generator state on the device of *queue* and seed it.

        :arg queue: command queue; its context and device are used.
        :arg num_work_items: number of state entries to allocate and
            initialize with the ``init_ranlux`` kernel.
        :arg luxury: stored as ``self.luxury``; ``None`` selects 4.
        :arg seed: passed to the kernel as a 32-bit unsigned integer. When
            ``None``, a seed is derived from the current time.
        :arg no_warmup: stored on the instance unchanged.
        :arg use_legacy_init: stored on the instance unchanged.
        :arg max_work_items: stored on the instance unchanged.
        """
        if luxury is None:
            luxury = 4

        if seed is None:
            from time import time
            # The parentheses matter: '%' binds tighter than '<<', so without
            # them the expression is (x % 2) << 30 and can only ever produce
            # the two seeds 0 and 2**30.
            seed = int(time()*1e6) % (2 << 30)

        self.context = queue.context
        self.luxury = luxury
        self.num_work_items = num_work_items

        from pyopencl.characterize import has_double_support
        self.support_double = has_double_support(queue.device)

        self.no_warmup = no_warmup
        self.use_legacy_init = use_legacy_init
        self.max_work_items = max_work_items

        # The settings defines are generated after the attributes above have
        # been set.
        src = """
            %(defines)s

            #include <pyopencl-ranluxcl.cl>

            kernel void init_ranlux(unsigned seeds, global ranluxcl_state_t *ranluxcltab)
            {
              if (get_global_id(0) < %(num_work_items)d)
                ranluxcl_initialization(seeds, ranluxcltab);
            }
            """ % {
                    "defines": self.generate_settings_defines(),
                    "num_work_items": num_work_items
                }
        prg = cl.Program(queue.context, src).build()

        # {{{ compute work group size

        # None leaves the work group size to the implementation.
        wg_size = None

        import sys
        import platform
        # A work group size of 1 is forced for Apple's CPU device on
        # Mac OS X 10.7.
        if ("darwin" in sys.platform
                and "Apple" in queue.device.platform.vendor
                and platform.mac_ver()[0].startswith("10.7")
                and queue.device.type == cl.device_type.CPU):
            wg_size = (1,)

        self.wg_size = wg_size

        # }}}

        # 112 bytes per work item - presumably sizeof(ranluxcl_state_t);
        # TODO confirm against pyopencl-ranluxcl.cl.
        self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
        self.state.fill(17)

        prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed),
                self.state.data)
예제 #45
0
def multi_dot(a_gpu, c):
    """Run the ``multi_dot`` kernel on *a_gpu* and *c* and return the host result.

    *a_gpu* must already live on the device; *c* is cast to float32 and
    uploaded. The output has the first two dimensions of *a_gpu*, and the
    length of its last axis is passed to the kernel.
    """
    c_gpu = cl_array.to_device(queue, c.astype(np.float32))
    out_shape = (a_gpu.shape[0], a_gpu.shape[1])
    out = cl_array.empty(queue, shape=out_shape, dtype=np.float32)

    inner_length = np.uint32(a_gpu.shape[-1])
    # NOTE(review): the local size (128, 1) and the trailing constant 30 are
    # hard-coded - their meaning is defined by the kernel, confirm there.
    done = prg3.multi_dot(queue, out.shape, (128,1), a_gpu.data, c_gpu.data,
                          out.data, inner_length, np.uint32(30))
    done.wait()
    return out.get()
예제 #46
0
파일: gpu.py 프로젝트: dhparks/als_speckle
 def _allocate(self, size, dtype, name=None):
     """Create a new array on whichever path (GPU or CPU) is active.

     On the GPU path an uninitialised pyopencl array wrapped in
     ``arrayWrapper`` is returned; on the CPU path a zero-filled numpy
     array. *name* is only used by the wrapper.
     """
     if not self.use_gpu:
         return np.zeros(size, dtype)

     import pyopencl.array as cla
     device_array = cla.empty(self.queue, size, dtype)
     return arrayWrapper(device_array, name)
예제 #47
0
파일: lowlevel.py 프로젝트: ghisvail/dtcwt
def q2c(X1, X2, X3, queue=None, output=None):
    """Run the q2c kernel on three equally shaped inputs.

    *queue* is resolved with ``to_queue``. If *output* is ``None`` a
    complex64 array is created whose first two dimensions are half those of
    the inputs and whose third dimension is 6. The output array is returned.

    Raises ``ValueError`` if the three inputs differ in shape.
    """
    _check_cl()
    queue = to_queue(queue)
    kern = _q2c_kernel_for_queue(queue.context)

    if not (X1.shape == X2.shape and X2.shape == X3.shape):
        raise ValueError('All three X matrices must have the same shape.')

    # Create output if not specified
    if output is None:
        dims = [1, 1, 1]
        leading = X1.shape[:2]
        dims[:len(leading)] = leading
        dims[0] >>= 1
        dims[1] >>= 1
        dims[2] = 6
        output = cl_array.empty(queue, tuple(dims), np.complex64)

    # If necessary, move the inputs to the device
    X1_device = to_device(X1, queue)
    X2_device = to_device(X2, queue)
    X3_device = to_device(X3, queue)

    def pack4(values, filler):
        # Pack the first four entries as ints, padding short tuples with *filler*.
        return struct.pack('iiii', *(tuple(values) + (filler,) * 4)[:4])

    def element_strides(arr):
        # Strides of *arr* expressed in elements rather than bytes.
        return pack4((s // arr.dtype.itemsize for s in arr.strides), 0)

    # One work item per output element along the first three dimensions
    work_shape = np.array(output.shape[:3])

    # Choose the work group shape: square when both leading dimensions
    # exceed 1, otherwise everything along the first dimension
    max_group = queue.device.max_work_group_size
    if work_shape.shape[0] >= 2 and np.all(work_shape[:2] > 1):
        side = int(np.floor(np.sqrt(max_group)))
        local_shape = (side, side, 1, 1)
    else:
        local_shape = (max_group, 1, 1)
    local_shape = local_shape[:len(work_shape)]

    # Round every global dimension up to a multiple of the local one
    global_shape = [int(np.ceil(extent / float(group)) * group)
                    for extent, group in zip(work_shape, local_shape)]

    X_shape = pack4(X1_device.shape, 1)

    X1_strides = element_strides(X1_device)
    X1_offset = np.int32(X1_device.offset)
    X2_strides = element_strides(X2_device)
    X2_offset = np.int32(X2_device.offset)
    X3_strides = element_strides(X3_device)
    X3_offset = np.int32(X3_device.offset)

    Y_strides = element_strides(output)
    Y_shape = pack4(output.shape, 1)
    Y_offset = np.int32(output.offset)

    # Launch the kernel
    kern(queue, global_shape, local_shape,
            X_shape,
            X1_device.base_data, X1_strides, X1_offset,
            X2_device.base_data, X2_strides, X2_offset,
            X3_device.base_data, X3_strides, X3_offset,
            output.base_data, Y_strides, Y_shape, Y_offset)

    return output
예제 #48
0
    def normal(self, *args, **kwargs):
        """Make a new empty array and apply :meth:`fill_normal` to it.

        *mu* (default 0) and *sigma* (default 1) are taken from the keyword
        arguments; everything else is forwarded to ``cl_array.empty``.
        """
        fill_params = {
            "mu": kwargs.pop("mu", 0),
            "sigma": kwargs.pop("sigma", 1),
        }

        result = cl_array.empty(*args, **kwargs)
        self.fill_normal(result, queue=result.queue, **fill_params)
        return result
예제 #49
0
    def uniform(self, *args, **kwargs):
        """Make a new empty array and apply :meth:`fill_uniform` to it.

        *a* (default 0) and *b* (default 1) are taken from the keyword
        arguments; everything else is forwarded to ``cl_array.empty``.
        """
        fill_params = {
            "a": kwargs.pop("a", 0),
            "b": kwargs.pop("b", 1),
        }

        result = cl_array.empty(*args, **kwargs)
        self.fill_uniform(result, queue=result.queue, **fill_params)
        return result
예제 #50
0
def axis_convolve_ifilter(X, h, axis=0, queue=None, output=None):
    """Apply the ifilter kernel along *axis* of *X* using filter *h*.

    If *output* is ``None``, a float32 array with twice the extent of *X*
    along *axis* is created. The kernel is applied with an element step of
    0.5 and the output array is returned.
    """
    _check_cl()
    queue = to_queue(queue)
    kernel = _ifilter_kernel_for_queue(queue.context)

    if output is None:
        # The output is twice as long as the input along the filtered axis.
        dims = list(X.shape)
        dims[axis] = dims[axis] << 1
        output = cl_array.empty(queue, dims, np.float32)

    return _apply_kernel(X, h, kernel, output, axis=axis, elementstep=0.5)
예제 #51
0
    def set_ggr(self,ggr):
        """Configure the growth-rate plan from ``ggr = (growth_rate, ncrossings)``.

        The domains must have been set first. On every call the plan, target,
        ``optimized_spa`` and direction are recomputed. On the first call the
        tracking state and device buffers are also allocated and the
        crossing values are set up.

        *ncrossings* may be a number (crossings are then spaced evenly in
        (0, 1)), a list/tuple/array of explicit crossing values, or ``None``
        to leave the crossings untouched.
        """

        assert self.can_has_domains, "must set domains before ggr"
        assert isinstance(ggr,tuple) and len(ggr) == 2, "ggr must be a 2-tuple"

        growth_rate,ncrossings = ggr

        window_length      = 10  # can be changed but not exposed for simplicity
        rate               = (1+growth_rate)**(1./window_length)-1
        self.plan          = self._ggr_make_plan(self.m0,rate,0.02,50)
        self.target        = 0
        self.optimized_spa = 0.05

        if not self.can_has_ggr:
            self.next_crossing = 0.0
            self.crossed       = False
            self.ggr_tracker   = np.zeros((len(self.plan),3),float)
            self.spa_buffer    = cla.empty(self.queue,(self.N,self.N),np.float32)
            self.whenflipped   = cla.empty(self.queue,(self.N,self.N),np.int32)

            # build the lookup table for the recency enforcement
            # these parameters can be changed but are not exposed to the user to keep things simple
            rmin, rmax, rrate = 0.05, 2., 0.5
            x = np.arange(len(self.plan)).astype('float')
            recency_need = rmin*rmax*np.exp(rrate*x)/(rmax+rmin*np.exp(rrate*x))
            self.recency_need = cla.to_device(self.queue,recency_need.astype(np.float32))

            self.set_zero(self.whenflipped)

            # self.crossings are the values of m_out which, when crossed over, generate a signal
            # to save the output to make a movie out of or whatever
            if isinstance(ncrossings,(int,float)): self.crossings = np.arange(0,1,1./ncrossings)[1:]
            if isinstance(ncrossings,(list,tuple,np.ndarray)): self.crossings = ncrossings
            # Identity test on purpose: "!= None" compares an ndarray element by
            # element, and the truth value of the resulting array is an error.
            if ncrossings is not None: self.next_crossing = self.crossings[-1]

        self.direction = np.sign(self.m0-self.plan[-1])

        self.can_has_ggr = True
예제 #52
0
 def allocate_space(self,x_peak,y_peak,k_max,order,type):
     """Build the k, k' and step matrices on the host and upload them.

     The contour points and weights come from ``calc.triangle_contour``.
     For ``size = len(points)`` three flat arrays of ``size**2`` entries are
     built and cast to *type*: the points repeated cyclically (k), and the
     points and weights each repeated ``size`` times in a row (k' and step).
     A ``(size**2, 1)`` device array is allocated for the results.
     """
     # An earlier variant used equidistant points with a constant weight of
     # k_max/order instead of the triangle contour.
     points, weights = calc.triangle_contour(x_peak, y_peak, k_max, order)
     self.k_max = k_max
     size = self.size = len(points)
     flat_indices = range(size**2)

     # k varies fastest (column index), k' and step follow the row index.
     k_values = [points[i % size] for i in flat_indices]
     k_prim_values = [points[int(i / size)] for i in flat_indices]
     step_values = [weights[int(i / size)] for i in flat_indices]

     host_k = numpy.array(k_values).astype(type)
     host_k_prim = numpy.array(k_prim_values).astype(type)
     host_step = numpy.array(step_values).astype(type)

     # Upload the three matrices and reserve space for the results.
     self.gpu_k = cl_array.to_device(self.ctx, self.queue, host_k)
     self.gpu_k_prim = cl_array.to_device(self.ctx, self.queue, host_k_prim)
     self.gpu_step = cl_array.to_device(self.ctx, self.queue, host_step)
     self.gpu_result = cl_array.empty(self.queue, (size**2, 1,), type)
예제 #53
0
def _coh_gaussian2(t_array, w, tz):
    """Run the ``coh_gauss`` kernel on *t_array* and return the normalised result.

    A non-zero *tz* is subtracted from *t_array* in place, so the caller's
    array is modified. The kernel receives the float32 times and
    ``w/1.4142``; the output is divided by its maximum absolute value along
    the first axis.
    """
    if tz != 0.:
        t_array -= tz

    n_rows, n_cols = t_array.shape[0], t_array.shape[1]
    times = t_array.astype(np.float32)
    times_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=times)

    # NOTE(review): the output has 4 entries along the last axis while the
    # kernel is launched with 3 along its second dimension - confirm intended.
    out = cl_array.empty(queue, shape=(n_rows, n_cols, 4), dtype=np.float32)

    width = np.float32(w/1.4142)
    finished = prg2.coh_gauss(queue, (times.size, 3), None, times_buf,
                              width, out.data)
    finished.wait()

    result = out.get()
    result /= np.abs(result).max(0)
    return result
예제 #54
0
    def __init__(self, ctx, a_dev, b_dev, mat_vec_knl):
        """Store the operands and kernel and allocate the work buffers.

        A command queue is created on *ctx*, a float32 device vector with one
        entry per column of *a_dev* is allocated as ``x_dev``, and a buffer of
        the size of *b_dev* is allocated with ``ALLOC_HOST_PTR`` and exposed
        as the float32 host array ``y_host``.
        """
        self.context = ctx
        self.a_dev = a_dev
        self.b_dev = b_dev
        self.mat_vec_knl = mat_vec_knl

        self.queue = cl.CommandQueue(ctx, properties=cq_props)
        queue = self.queue

        n_rows, n_cols = a_dev.shape[0], a_dev.shape[1]

        self.x_dev = cl_array.empty(queue, (n_cols,), dtype=np.float32)

        mf = cl.mem_flags
        self.y_host_buf = cl.Buffer(ctx, mf.ALLOC_HOST_PTR, self.b_dev.nbytes)
        self.y_host = self.y_host_buf.get_host_array(n_rows, dtype=np.float32)
예제 #55
0
 def color_deconvolution(self, rgb, stain):
     """Return stains in normal (non-logarithmic) color space.

     *rgb* is flattened to rows of three values, processed in place on the
     device by ``opticalDense``, multiplied with *stain* by ``gemm_slow`` and
     converted with ``toColorDense``. The result is returned with the shape
     of *rgb*.
     """
     rgb = self.check_contiguous(rgb)
     stain = self.check_contiguous(stain)
     assert(rgb.flags.c_contiguous == stain.flags.c_contiguous)
     queue = cl.CommandQueue(self.ctx)

     # One row per pixel, columns R, G, B
     pixels = rgb.reshape(-1, 3)
     pixels_g = cla.to_device(queue, pixels, allocator=self.mem_pool)
     stain_g = cla.to_device(queue, stain, allocator=self.mem_pool)
     n_stains = stain.shape[1]
     out_g = cla.empty(queue, (pixels.shape[0], n_stains),
                       dtype=pixels_g.dtype, order="C",
                       allocator=self.mem_pool)

     # Step 1: element-wise on the flat pixel data
     self.prg.opticalDense(queue, (pixels.size, 1), None, pixels_g.data)
     # Step 2: matrix product, one work item per output element
     self.prg.gemm_slow(queue, out_g.shape, None,
                        out_g.data, pixels_g.data, stain_g.data,
                        np.int32(pixels.shape[1]), np.int32(n_stains))
     # Step 3: element-wise on the flat output
     self.prg.toColorDense(queue, (out_g.size, 1), None, out_g.data)

     # Back to the original 3D layout
     return out_g.get().reshape(rgb.shape)
예제 #56
0
    def dot(self, A, B):
        """Multiply *A* by *B* on the device with the ``gemm_slow`` kernel.

        Both operands are passed through ``check_contiguous`` and converted
        to float32 if necessary. The result is returned as a host array of
        shape ``(A.shape[0], B.shape[1])``.

        Incoming RGB matrix "A" should be aligned
        """
        A = self.check_contiguous(A)
        B = self.check_contiguous(B)
        assert(A.flags.c_contiguous == B.flags.c_contiguous)
        queue = cl.CommandQueue(self.ctx)
        # Compare dtypes by value. A dtype instance is never the np.float32
        # type object itself, so an identity test would always be true and
        # copy operands that already are float32.
        if A.dtype != np.float32:
            A = A.astype(np.float32)
        if B.dtype != np.float32:
            B = B.astype(np.float32)
        A_g = cla.to_device(queue, A, self.mem_pool)
        B_g = cla.to_device(queue, B, self.mem_pool)
        C_g = cla.empty(queue, (A.shape[0], B.shape[1]), dtype=A_g.dtype, order="C", allocator=self.mem_pool)
        # One work item per element of the result
        self.prg.gemm_slow(queue, C_g.shape, None, C_g.data, A_g.data, B_g.data, np.int32(A.shape[1]), np.int32(B.shape[1]))
        return C_g.get()
예제 #57
0
def _fold_exp(t_array, w, tz, tau_arr):
    """Run the ``fold_exp`` OpenCL kernel over all time points and decay constants.

    :param t_array: 2-D array of time points.  It is not modified.
    :param w: scalar handed to the kernel as ``float32``.
    :param tz: offset subtracted from ``t_array`` before evaluation.
    :param tau_arr: array of decay constants; their reciprocals are handed
        to the kernel as ``float32``.
    :returns: ``float32`` host array of shape
        ``(t_array.shape[0], t_array.shape[1], tau_arr.size)``.
    """
    if tz != 0.:
        # Subtract out of place: an in-place ``-=`` would shift the caller's
        # array on every call and fails for integer arrays with a float tz.
        t_array = t_array - tz

    shape = t_array.shape
    t_array = t_array.astype(np.float32)

    t_arr_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=t_array)
    tau_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                        hostbuf=(1/tau_arr).astype(np.float32))
    shape = (shape[0], shape[1], tau_arr.size)
    out = cl_array.empty(queue, shape=shape, dtype=np.float32)
    # Pad the first global dimension to a multiple of the work-group size.
    # The true element count is passed as the last kernel argument,
    # presumably so the kernel can ignore the padding work-items.
    global_work_size = t_array.size + (work_size[0] - t_array.size % work_size[0])
    prg.fold_exp(queue, (global_work_size, tau_arr.size), work_size, t_arr_gpu, np.float32(w),
                 tau_buf, out.data, np.uint32(t_array.size)).wait()

    a = out.get()

    return a
예제 #58
0
    def unmix_stains(self, rgb, stain):
        """Take RGB IHC image and split it to stains like skimage version.

        The image is flattened to an ``(n_pixels, 3)`` matrix, run through
        the ``opticalDense`` kernel in place, and multiplied with ``stain``
        by the ``gemm_slow`` kernel.

        :param rgb: image array whose last axis holds the R, G, B values.
        :param stain: 2-D stain matrix; ``stain.shape[1]`` sets the number of
            output columns.
        :returns: host array reshaped to ``rgb.shape``.  The reshape only
            succeeds when ``stain.shape[1] == 3``.
        """
        rgb = self.check_contiguous(rgb)
        stain = self.check_contiguous(stain)
        # Both operands must share the same memory layout.
        assert rgb.flags.c_contiguous == stain.flags.c_contiguous

        cmd_queue = cl.CommandQueue(self.ctx)

        # One row per pixel, one column per colour channel.
        flat_rgb = rgb.reshape(-1, 3)
        flat_rgb_dev = cla.to_device(cmd_queue, flat_rgb, allocator=self.mem_pool)
        stain_dev = cla.to_device(cmd_queue, stain, allocator=self.mem_pool)

        product_shape = (flat_rgb.shape[0], stain.shape[1])
        product_dev = cla.empty(
            cmd_queue,
            product_shape,
            dtype=flat_rgb_dev.dtype,
            order="C",
            allocator=self.mem_pool,
        )

        # Element-wise pass over the flattened pixel matrix.
        self.prg.opticalDense(cmd_queue, (flat_rgb.size, 1), None, flat_rgb_dev.data)

        # One work-item per output element; the trailing scalars are the
        # column counts of the pixel matrix and of the stain matrix.
        self.prg.gemm_slow(
            cmd_queue,
            product_dev.shape,
            None,
            product_dev.data,
            flat_rgb_dev.data,
            stain_dev.data,
            np.int32(flat_rgb.shape[1]),
            np.int32(stain.shape[1]),
        )

        # Back on the host, restore the original image shape.
        unmixed = product_dev.get()
        return unmixed.reshape(rgb.shape)
예제 #59
0
 def empty(self, shape, dtype, order="C"):
     """Create an array with ``pyopencl.array.empty`` on ``self.queue``.

     :param shape: shape of the new array.
     :param dtype: element type of the new array.
     :param order: memory layout flag forwarded unchanged (default ``"C"``).
     :returns: whatever ``pyopencl.array.empty`` returns.
     """
     # Imported lazily, so pyopencl is only required once this is called.
     import pyopencl.array as _cl_array

     return _cl_array.empty(self.queue, shape, dtype, order=order)
예제 #60
0
    def __init__(self, queue, num_work_items=None,
            luxury=None, seed=None, no_warmup=False,
            use_legacy_init=False, max_work_items=None):
        """
        :param queue: :class:`pyopencl.CommandQueue`, only used for initialization
        :param luxury: the "luxury value" of the generator, and should be 0-4,
            where 0 is fastest and 4 produces the best numbers. It can also be
            >=24, in which case it directly sets the p-value of RANLUXCL.
            Defaults to 4 when `None`.
        :param num_work_items: is the number of generators to initialize,
            usually corresponding to the number of work-items in the NDRange
            RANLUXCL will be used with.  May be `None`, in which case a default
            value is used (8 per compute unit on CPU devices, 64 per compute
            unit otherwise).
        :param seed: integer seed passed to the initialization kernel as an
            unsigned 32-bit value. May be `None`, in which case a seed in
            ``[0, 2**31)`` is derived from the current time in microseconds.
        :param no_warmup: stored as ``self.no_warmup``; presumably read by
            ``generate_settings_defines`` (not visible here).
        :param use_legacy_init: stored as ``self.use_legacy_init``; presumably
            read by ``generate_settings_defines`` (not visible here).
        :param max_work_items: should reflect the maximum number of work-items
            that will be used on any parallel instance of RANLUXCL. So for
            instance if we are launching 5120 work-items on GPU1 and 10240
            work-items on GPU2, GPU1's RANLUXCLTab would be generated by
            calling ranluxcl_initialization with numWorkitems = 5120 while
            GPU2's RANLUXCLTab would use numWorkitems = 10240. However
            maxWorkitems must be at least 10240 for both GPU1 and GPU2, and it
            must be set to the same value for both. (may be `None`)

        .. versionchanged:: 2013.1
            Added default value for `num_work_items`.
        """

        if luxury is None:
            luxury = 4

        if num_work_items is None:
            if queue.device.type & cl.device_type.CPU:
                num_work_items = 8 * queue.device.max_compute_units
            else:
                num_work_items = 64 * queue.device.max_compute_units

        if seed is None:
            from time import time
            # Parenthesize the modulus: ``%`` binds tighter than ``<<``, so
            # the unparenthesized form evaluates ``(x % 2) << 30`` and can
            # only ever produce 0 or 2**30.
            seed = int(time()*1e6) % (2 << 30)

        self.context = queue.context
        self.luxury = luxury
        self.num_work_items = num_work_items

        from pyopencl.characterize import has_double_support
        self.support_double = has_double_support(queue.device)

        # These attributes are assigned before generate_settings_defines()
        # is called below.
        self.no_warmup = no_warmup
        self.use_legacy_init = use_legacy_init
        self.max_work_items = max_work_items

        src = """
            %(defines)s

            #include <pyopencl-ranluxcl.cl>

            kernel void init_ranlux(unsigned seeds,
                global ranluxcl_state_t *ranluxcltab)
            {
              if (get_global_id(0) < %(num_work_items)d)
                ranluxcl_initialization(seeds, ranluxcltab);
            }
            """ % {
                    "defines": self.generate_settings_defines(),
                    "num_work_items": num_work_items
                }
        prg = cl.Program(queue.context, src).build()

        # {{{ compute work group size

        # None lets the OpenCL implementation choose the work-group size.
        wg_size = None

        import sys
        import platform
        # Use a work-group size of 1 on Apple's CPU device under Mac OS X 10.7.
        if ("darwin" in sys.platform
                and "Apple" in queue.device.platform.vendor
                and platform.mac_ver()[0].startswith("10.7")
                and queue.device.type & cl.device_type.CPU):
            wg_size = (1,)

        self.wg_size = wg_size

        # }}}

        # One 112-byte state record per generator, pre-filled with the byte
        # value 17 before the init kernel runs.
        self.state = cl_array.empty(queue, (num_work_items, 112), dtype=np.uint8)
        self.state.fill(17)

        prg.init_ranlux(queue, (num_work_items,), self.wg_size, np.uint32(seed),
                self.state.data)