Пример #1
0
def test_struct_reduce(ctx_factory):
    """Check a struct-valued min/max reduction against NumPy.

    A ``ReductionKernel`` whose accumulator is the ``minmax_collector`` struct
    (as produced by ``make_mmc_dtype``) is run over a random int32 array, and
    the reduced ``cur_min`` / ``cur_max`` fields are compared with ``np.min``
    and ``np.max`` of the same data.
    """
    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    # The unpacking requires the context to hold exactly one device.
    dev, = context.devices
    on_apple_nvidia = (
        dev.vendor == "NVIDIA" and dev.platform.vendor == "Apple")
    if on_apple_nvidia and dev.driver_version == "8.12.47 310.40.00.05f01":
        pytest.skip("causes a compiler hang on Apple/Nv GPU")

    mmc_dtype, mmc_c_decl = make_mmc_dtype(dev)

    # Device-side helpers: neutral element, scalar -> struct, struct merge.
    collector_helpers = r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """
    preamble = mmc_c_decl + collector_helpers

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    reducer = ReductionKernel(
        context,
        mmc_dtype,
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x",
        preamble=preamble)

    minmax = reducer(a_gpu).get()

    # Compare both struct fields against the host-side reference values.
    min_error = abs(minmax["cur_min"] - np.min(a))
    assert min_error < 1e-5
    max_error = abs(minmax["cur_max"] - np.max(a))
    assert max_error < 1e-5
Пример #2
0
	def __init__(self, context, queue):
		"""Set up the solver state and build its OpenCL helper kernels.

		@param context OpenCL context the program and kernels are built for.
		@param queue OpenCL command queue, stored on the instance.
		"""
		self.context = context
		self.queue = queue
		self.program = clUtils.loadProgram(context, clUtils.path() + "/lsqr.cl")

		# Device-side objects start out as None; they are meant to be
		# generated at the first iteration.
		self.A = None
		self.B = None
		self.X0 = None
		self.X = None
		self.R = None

		# Dot product: sum over i of x[i]*y[i].
		self.dot = ReductionKernel(
			context,
			np.float32,
			neutral="0",
			reduce_expr="a+b",
			map_expr="x[i]*y[i]",
			arguments="__global float *x, __global float *y")

		# In-place scaling of a vector by a scalar.
		self.dot_c_vec = ElementwiseKernel(
			context,
			arguments="float c, float *v",
			operation="v[i] *= c")

		# Element-by-element copy of one vector into another.
		self.copy_vec = ElementwiseKernel(
			context,
			arguments="float* out, float *in",
			operation="out[i] = in[i]")

		# Linear combination z = a*x + b*y.
		self.linear_comb = ElementwiseKernel(
			context,
			arguments="float* z,float a, float *x, float b, float *y",
			operation="z[i] = a*x[i] + b*y[i]")

		# Element-wise product z = x*y.
		self.prod = ElementwiseKernel(
			context,
			arguments="float* z,float *x, float *y",
			operation="z[i] = x[i]*y[i]")
Пример #3
0
    def qubit_probability(self, target):
        """Get the probability of a single qubit being measured as '0'.

        Runs an OpenCL reduction over ``self.buffer`` that sums ``|amp|^2``
        for every index ``i`` whose bit ``target`` is clear; indices with
        that bit set contribute 0.

        :param target: bit position of the qubit within the amplitude index.
        :returns: the reduction result copied to the host (``.get()``).
        """

        preamble = """
        #include <pyopencl-complex.h>

        float probability(int target, int i, cfloat_t amp) {
            if ((i & (1 << target )) != 0) {
                return 0;
            }
            // return 6.0;
            float abs = cfloat_abs(amp);
            return abs * abs;
        }
        """

        # NOTE(review): `context` is not taken from `self`; it must be
        # provided by the enclosing module -- confirm.
        kernel = ReductionKernel(
            context,
            # `np.float` was only an alias of the builtin float (float64) and
            # has been removed from NumPy; name the dtype explicitly.
            np.float64,
            neutral="0",
            reduce_expr="a + b",
            map_expr="probability(target, i, amps[i])",
            # `target` is passed by value, so it carries no address-space
            # qualifier; only the amplitude buffer is a __global pointer.
            arguments="__global cfloat_t *amps, int target",
            preamble=preamble,
        )

        return kernel(self.buffer, target).get()
Пример #4
0
    def errest(self, x, y, z):
        """Create a kernel object that reduces an error estimate over x, y, z.

        The reduction sums ``pow(x[i]/(atol + rtol*max(fabs(y[i]),
        fabs(z[i]))), 2)`` over ``x.leaddim * x.nrow`` elements.

        :param x: matrix supplying the numerator values.
        :param y: first matrix used in the scaling term.
        :param z: second matrix used in the scaling term.
        :returns: an ``ErrestKernel``; its ``run(queue, atol, rtol)`` launches
            the reduction and ``retval`` fetches the result to the host.
        :raises ValueError: if the three matrices do not all share the same
            ``traits``.
        """
        # A chained `a != b != c` only tests adjacent pairs joined by `and`,
        # so it would accept e.g. x != y with y == z; check each pair.
        if x.traits != y.traits or y.traits != z.traits:
            raise ValueError('Incompatible matrix types')

        cnt = x.leaddim * x.nrow
        dtype = x.dtype

        # Build the reduction kernel
        rkern = ReductionKernel(
            self.backend.ctx,
            dtype,
            neutral='0',
            reduce_expr='a + b',
            map_expr='pow(x[i]/(atol + rtol*max(fabs(y[i]), fabs(z[i]))), 2)',
            arguments='__global {0}* x, __global {0}* y, __global {0}* z, '
            '{0} atol, {0} rtol'.format(npdtype_to_ctype(dtype)))

        class ErrestKernel(ComputeKernel):
            @property
            def retval(self):
                # Copies the reduction result back to the host.
                return self._retarr.get()

            def run(self, queue, atol, rtol):
                qcomp = queue.cl_queue_comp

                # Wrap the existing device buffers as flat arrays of cnt items.
                xarr = Array(qcomp, cnt, dtype, data=x.data)
                yarr = Array(qcomp, cnt, dtype, data=y.data)
                zarr = Array(qcomp, cnt, dtype, data=z.data)

                self._retarr = rkern(xarr, yarr, zarr, atol, rtol, queue=qcomp)

        return ErrestKernel()
Пример #5
0
    def __init__(self, ctx, queue, data, symmetry_modes):
        """Store the OpenCL handles, build ``sandpile.cl`` and the diff kernel.

        :param ctx: OpenCL context used for the program and the reduction.
        :param queue: OpenCL command queue, stored on the instance.
        :param data: grid data; its ``dtype`` selects the C element type.
        :param symmetry_modes: passed to ``_gen_macros`` together with *data*.
        """
        self._ctx, self._queue = ctx, queue
        self.symmetry_modes = symmetry_modes
        self.data = data

        elem_ctype = dtype_to_ctype(data.dtype)

        # The kernel source is looked up relative to the working directory.
        with open('sandpile.cl') as source_file:
            program = cl.Program(self._ctx, source_file.read())

        # Preprocessor macros derived from the data are passed as build
        # options.
        build_options = _macros_to_options(_gen_macros(data, symmetry_modes))
        self._program = program.build(options=build_options)

        from pyopencl.reduction import ReductionKernel

        # Counts the positions at which the two grids differ.
        diff_arguments = (
            'const __global %s *grid, const __global %s *new_grid'
            % (elem_ctype, elem_ctype))
        self._diff_krnl = ReductionKernel(
            self._ctx,
            numpy.uint32,
            neutral='0',
            reduce_expr='a+b',
            map_expr='grid[i]!=new_grid[i]',
            arguments=diff_arguments)
def argmin_kernal(context):
    """Build a reduction kernel that finds a minimum value and its index.

    The accumulator is an ``argmin_collector`` struct holding ``cur_min``
    (float32) and ``cur_index`` (int32). The kernel takes one ``int`` array
    argument ``x``.

    :param context: OpenCL context the kernel is built for; the struct
        layout is matched against the context's first device.
    :returns: the configured ``pyopencl.reduction.ReductionKernel``.
    """
    import numpy as np
    from pyopencl.reduction import ReductionKernel
    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct

    name = "argmin_collector"
    mmc_dtype = np.dtype([
        ("cur_min", np.float32),
        ("cur_index", np.int32),
        ("pad", np.int32),
    ])

    # The struct layout has to be matched against a concrete device; use the
    # first device of the context (no separate device is passed in).
    mmc_dtype, mmc_c_decl = match_dtype_to_c_struct(
        context.devices[0], name, mmc_dtype)
    mmc_dtype = get_or_register_dtype(name, mmc_dtype)

    preamble = mmc_c_decl + r"""//CL//

    argmin_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        argmin_collector result;
        result.cur_min = INFINITY;
        result.cur_index = -1;
        return result;
    }

    argmin_collector mmc_from_scalar(float x,int index)
    {
        argmin_collector result;
        result.cur_min = x;
        result.cur_index = index;
        return result;
    }

    argmin_collector agg_mmc(argmin_collector a, argmin_collector b)
    {
        argmin_collector result = a;
        if (b.cur_min < result.cur_min)
        {
            result.cur_min = b.cur_min;
            result.cur_index = b.cur_index;
        }
        return result;
    }

    """

    red = ReductionKernel(context,
                          mmc_dtype,
                          neutral="mmc_neutral()",
                          reduce_expr="agg_mmc(a, b)",
                          map_expr="mmc_from_scalar(x[i],i)",
                          arguments="__global int *x",
                          preamble=preamble)

    return red
Пример #7
0
 def make_reduction_krnl(self):
     """Create the squared-error sum reduction and store it as ``self.krnl``."""
     # Each element contributes (y_pred - y_true)^2; the reduction adds them.
     squared_error = "pow(y_pred[i] - y_true[i], 2)"
     kernel_arguments = (
         "__global const float* y_true, __global const float* y_pred")
     self.krnl = ReductionKernel(self.ctx,
                                 cltypes.float,
                                 neutral="0",
                                 reduce_expr="a+b",
                                 map_expr=squared_error,
                                 arguments=kernel_arguments,
                                 name="mse_reduction_kernel")
Пример #8
0
 def make_reduction_krnl(self):
     """Create the cross-entropy sum reduction and store it as ``self.krnl``."""
     # y_true is the true distribution, y_pred is the predicted one; each
     # element contributes y_true * -log(y_pred).
     cross_entropy_term = "y_true[i] * (-log(y_pred[i]))"
     kernel_arguments = (
         "__global const float* y_true, __global const float* y_pred")
     self.krnl = ReductionKernel(self.ctx,
                                 cltypes.float,
                                 neutral="0",
                                 reduce_expr="a+b",
                                 map_expr=cross_entropy_term,
                                 arguments=kernel_arguments,
                                 name="categorical_crossentropy_reduction_kernel")
Пример #9
0
def test_struct_reduce(ctx_factory):
    """Reduce a random int32 array into a min/max struct and verify it.

    The accumulator type and its C declaration come from ``make_mmc_dtype``
    for the context's first device; the reduced ``cur_min`` and ``cur_max``
    must agree with NumPy's minimum and maximum of the same data.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    first_device = context.devices[0]
    mmc_dtype, mmc_c_decl = make_mmc_dtype(first_device)

    # Struct declaration followed by the neutral/convert/merge helpers.
    preamble = mmc_c_decl + r"""//CL//

    minmax_collector mmc_neutral()
    {
        // FIXME: needs infinity literal in real use, ok here
        minmax_collector result;
        result.cur_min = 1<<30;
        result.cur_max = -(1<<30);
        return result;
    }

    minmax_collector mmc_from_scalar(float x)
    {
        minmax_collector result;
        result.cur_min = x;
        result.cur_max = x;
        return result;
    }

    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

    from pyopencl.clrandom import rand as clrand
    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
    a = a_gpu.get()

    from pyopencl.reduction import ReductionKernel
    kernel_options = dict(
        neutral="mmc_neutral()",
        reduce_expr="agg_mmc(a, b)",
        map_expr="mmc_from_scalar(x[i])",
        arguments="__global int *x",
        preamble=preamble)
    red = ReductionKernel(context, mmc_dtype, **kernel_options)

    minmax = red(a_gpu).get()

    # Both struct fields must match the host-side reference.
    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
Пример #10
0
    def compile_kernels(self):
        """Compile the base kernels and build the statistics reductions.

        Creates ``reduction_comp``, ``reduction_simple`` and
        ``reduction_double``. The three kernels differ only in their reduce
        expression. ``reduction_double`` falls back to ``reduction_comp`` when
        the device does not advertise ``cl_khr_fp64``.
        """
        OpenclProcessing.compile_kernels(self,
                                         self.kernel_files,
                                         "-D NIMAGE=%i" % self.size)
        compiler_options = self.get_compiler_options(x87_volatile=True)
        src = concatenate_cl_kernel(("doubleword.cl", "statistics.cl"))

        # Everything except reduce_expr is shared by all the reductions.
        shared = dict(dtype_out=float8,
                      neutral=zero8,
                      map_expr="map_statistics(data, i)",
                      arguments="__global float *data",
                      preamble=src,
                      options=compiler_options)

        self.reduction_comp = ReductionKernel(
            self.ctx, reduce_expr="reduce_statistics(a,b)", **shared)
        self.reduction_simple = ReductionKernel(
            self.ctx, reduce_expr="reduce_statistics_simple(a,b)", **shared)

        if "cl_khr_fp64" not in self.device.extensions:
            logger.info("Device %s does not support double-precision arithmetics, fall-back on compensated one", self.device)
            self.reduction_double = self.reduction_comp
        else:
            self.reduction_double = ReductionKernel(
                self.ctx, reduce_expr="reduce_statistics_double(a,b)", **shared)
 def compile_kernels(self):
     """Compile the base kernels and build the two statistics reductions.

     Creates ``reduction_comp`` and ``reduction_simple``; they share every
     setting except the reduce expression.
     """
     OpenclProcessing.compile_kernels(self, self.kernel_files,
                                      "-D NIMAGE=%i" % self.size)
     compiler_options = self.get_compiler_options(x87_volatile=True)
     src = concatenate_cl_kernel(("kahan.cl", "statistics.cl"))

     # Settings common to both reductions.
     shared = dict(dtype_out=float8,
                   neutral=zero8,
                   map_expr="map_statistics(data, i)",
                   arguments="__global float *data",
                   preamble=src,
                   options=compiler_options)

     self.reduction_comp = ReductionKernel(
         self.ctx, reduce_expr="reduce_statistics(a,b)", **shared)
     self.reduction_simple = ReductionKernel(
         self.ctx, reduce_expr="reduce_statistics_simple(a,b)", **shared)
Пример #12
0
    def init_kernels(self):
        """Build the OpenCL program and the helper kernels used by the class.

        Loads ``CLBacterium.cl`` as a package resource, builds it without a
        cache directory, then creates element-wise vector helpers, two
        cell-geometry kernels and a float8 dot-product reduction.
        """
        from pkg_resources import resource_string
        kernel_src = resource_string(__name__, 'CLBacterium.cl')

        program = cl.Program(self.context, kernel_src)
        self.program = program.build(cache_dir=False)

        context = self.context

        # Argument lists shared by several of the kernels below.
        binary_args = "float8 *res, const float8 *in1, const float8 *in2"
        scaled_args = (
            "float8 *res, const float k, const float8 *in1, const float8 *in2")
        geometry_args = "float* res, float* r, float* l"

        # Some kernels that seem like they should be built into pyopencl...
        self.vclearf = ElementwiseKernel(
            context, "float8 *v", "v[i]=0.0", "vecclearf")
        self.vcleari = ElementwiseKernel(
            context, "int *v", "v[i]=0", "veccleari")
        self.vadd = ElementwiseKernel(
            context, binary_args, "res[i] = in1[i] + in2[i]", "vecadd")
        self.vsub = ElementwiseKernel(
            context, binary_args, "res[i] = in1[i] - in2[i]", "vecsub")
        self.vaddkx = ElementwiseKernel(
            context, scaled_args, "res[i] = in1[i] + k*in2[i]", "vecaddkx")
        self.vsubkx = ElementwiseKernel(
            context, scaled_args, "res[i] = in1[i] - k*in2[i]", "vecsubkx")

        # cell geometry kernels
        self.calc_cell_area = ElementwiseKernel(
            context,
            geometry_args,
            "res[i] = 2.f*3.1415927f*r[i]*(2.f*r[i]+l[i])",
            "cell_area_kern")
        self.calc_cell_vol = ElementwiseKernel(
            context,
            geometry_args,
            "res[i] = 3.1415927f*r[i]*r[i]*(2.f*r[i]+l[i])",
            "cell_vol_kern")

        # Dot product of float8 vectors, computed as the sum of two float4
        # dot products per element (equivalent to flattening the float8s into
        # one long float vector). Some OpenCL implementations seem not to
        # provide dot(float8, float8), hence the split into halves.
        halves_dot = "dot(x[i].s0123,y[i].s0123)+dot(x[i].s4567,y[i].s4567)"
        self.vdot = ReductionKernel(
            context,
            numpy.float32,
            neutral="0",
            reduce_expr="a+b",
            map_expr=halves_dot,
            arguments="__global float8 *x, __global float8 *y")
Пример #13
0
def test_sum_without_data(ctx_factory):
    """Reduce over an index range alone (no array arguments).

    The map expression is just the index ``i``, so summing over
    ``slice(n)`` must give ``n*(n-1)//2``.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 2000
    expected_sum = n*(n-1)//2

    from pyopencl.reduction import ReductionKernel
    index_sum = ReductionKernel(
        context,
        np.int32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="i",
        arguments=[])

    # With no arrays to infer them from, range and queue are given explicitly.
    device_sum = index_sum(range=slice(n), queue=queue).get()

    assert device_sum == expected_sum
Пример #14
0
    def _get_minmax_kernel(self, ctx, dtype, mmc_dtype, prop_names, only_min,
                           only_max, name, mmc_c_decl):
        """Build the min/max reduction kernel for the given properties.

        The preamble is ``mmc_c_decl`` followed by ``minmax_tpl``, rendered
        as a Mako template. The kernel takes one ``__global`` pointer of type
        ``dtype`` per entry of ``prop_names`` and maps element ``i`` of each
        through ``mmc_from_scalar``.

        Returns the constructed ``ReductionKernel``.
        """
        from pyopencl.reduction import ReductionKernel

        # "<dtype> <prop>" declarations handed to the template.
        scalar_decls = ["%s %s" % (dtype, prop) for prop in prop_names]
        # "__global <dtype> *<prop>" declarations for the kernel signature.
        pointer_decls = [
            "__global %s *%s" % (dtype, prop) for prop in prop_names
        ]
        # "<prop>[i]" expressions fed to mmc_from_scalar.
        element_exprs = ["%s[i]" % (prop,) for prop in prop_names]

        template = mkt.Template(text=mmc_c_decl + minmax_tpl)
        preamble = template.render(
            args=", ".join(scalar_decls),
            prop_names=prop_names,
            dtype=name,
            only_min=only_min,
            only_max=only_max)

        return ReductionKernel(
            ctx,
            mmc_dtype,
            neutral="mmc_neutral()",
            reduce_expr="agg_mmc(a, b)",
            map_expr="mmc_from_scalar(%s)" % ", ".join(element_exprs),
            arguments=", ".join(pointer_decls),
            preamble=preamble)
Пример #15
0
def test_reduction_not_first_argument(ctx_factory):
    """Run a reduction whose first argument is a scalar, not an array.

    Regression test for https://github.com/inducer/pyopencl/issues/535:
    computes ``0.1 * sum(x[i]*y[i])`` and compares it with NumPy.
    """
    from pytest import importorskip
    importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 400
    a, b = (cl_array.arange(queue, n, dtype=np.float32) for _ in range(2))

    from pyopencl.reduction import ReductionKernel
    scaled_dot = ReductionKernel(
        context,
        np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="z*x[i]*y[i]",
        arguments="float z, __global float *x, __global float *y")

    my_dot_prod = scaled_dot(0.1, a, b).get()
    reference = 0.1*np.sum(np.arange(n)**2)

    assert abs(my_dot_prod - reference) < 1e-4
Пример #16
0
    def __init__(self, context, queue):
        """Set up the solver state and build its dot-product kernel.

        @param context OpenCL context the program and kernel are built for.
        @param queue OpenCL command queue, stored on the instance.
        """
        self.context = context
        self.queue = queue

        program_path = clUtils.path() + "/jacobi.cl"
        self.program = clUtils.loadProgram(context, program_path)

        # Device-side objects start out as None; they are meant to be
        # generated at the first iteration.
        self.A = self.B = self.X0 = self.X = self.x = None

        # Dot product: sum over i of x[i]*y[i].
        dot_options = dict(
            neutral="0",
            reduce_expr="a+b",
            map_expr="x[i]*y[i]",
            arguments="__global float *x, __global float *y")
        self.dot = ReductionKernel(context, np.float32, **dot_options)
Пример #17
0
# Sum an int32 column on the device with a ReductionKernel and print it.
import numpy
import pyopencl  # binds the `pyopencl` name used for context/queue creation
from pyopencl.array import arange, Array
from pyopencl.reduction import ReductionKernel

ctx = pyopencl.create_some_context()
queue = pyopencl.CommandQueue(ctx)

# Host data: a (100, 1) column of zeros with a 1 in every fifth row
# (rows 0, 5, 10, ...), so the expected sum is 20.
acpu = numpy.zeros((100, 1), dtype=numpy.int32)
acpu[::5] = 1

# Upload to a device array of the same shape and dtype.
a = Array(queue, (100, 1), numpy.int32)
a.set(acpu)
queue.finish()

# Plain sum: the map step passes each element through unchanged.
krnl = ReductionKernel(
    ctx,
    numpy.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x")

my_sum = krnl(a).get()
queue.finish()
print(my_sum)
Пример #18
0
def get_minmax_kernel(ctx, dtype, inf, mmc_dtype, prop_names, only_min,
                      only_max, name, mmc_c_decl, backend):
    """Build a min/max reduction kernel for the OpenCL or CUDA backend.

    The preamble is the backend's ``CLUDA_PREAMBLE`` (rendered with double
    support enabled) followed by ``mmc_c_decl + minmax_tpl`` rendered as a
    Mako template. For CUDA, the rendered ``minmax_operator_tpl`` is first
    spliced into ``mmc_c_decl`` just before its last two lines.

    :param ctx: OpenCL context; only passed on for ``backend == 'opencl'``.
    :param dtype: C type name of the reduced properties.
    :param inf: value handed to the template as ``inf``.
    :param mmc_dtype: dtype of the reduction result.
    :param prop_names: names of the properties; one kernel argument each.
    :param only_min: template flag, passed through unchanged.
    :param only_max: template flag, passed through unchanged.
    :param name: name handed to the templates as ``dtype``.
    :param mmc_c_decl: C declaration text prepended to the template.
    :param backend: ``'opencl'`` or ``'cuda'``.
    :returns: the backend's ``ReductionKernel`` instance.
    :raises ValueError: if *backend* is neither ``'opencl'`` nor ``'cuda'``.
    """
    # Reject unknown backends before doing any work; otherwise the function
    # would run to the end and fail on an unbound local.
    if backend not in ('opencl', 'cuda'):
        raise ValueError(
            "Unsupported backend %r; expected 'opencl' or 'cuda'" % (backend,))

    tpl_args = ", ".join("%s %s" % (dtype, prop) for prop in prop_names)

    if backend == 'cuda':
        # overload assignment operator in struct
        mmc_overload = mkt.Template(text=minmax_operator_tpl).render(
            prop_names=prop_names,
            dtype=name,
            only_min=only_min,
            only_max=only_max)
        mmc_c_decl_lines = mmc_c_decl.splitlines()
        # Insert the operator just before the last two declaration lines.
        mmc_c_decl_lines = mmc_c_decl_lines[:-2] + \
                mmc_overload.splitlines() + mmc_c_decl_lines[-2:]
        mmc_c_decl = '\n'.join(mmc_c_decl_lines)

    mmc_preamble = mmc_c_decl + minmax_tpl
    preamble = mkt.Template(text=mmc_preamble).render(args=tpl_args,
                                                      prop_names=prop_names,
                                                      dtype=name,
                                                      only_min=only_min,
                                                      only_max=only_max,
                                                      inf=inf)

    map_args = ", ".join("%s[i]" % (prop,) for prop in prop_names)

    # The two backends differ only in where ReductionKernel comes from, how
    # pointer arguments are declared, and whether a context is passed.
    if backend == 'opencl':
        from pyopencl._cluda import CLUDA_PREAMBLE
        from pyopencl.reduction import ReductionKernel

        pointer_decl = "__global %s* %s"
        positional = (ctx, mmc_dtype)
    else:
        from pycuda._cluda import CLUDA_PREAMBLE
        from pycuda.reduction import ReductionKernel

        pointer_decl = "%s* %s"
        positional = (mmc_dtype,)

    knl_args = ", ".join(pointer_decl % (dtype, prop) for prop in prop_names)

    cluda_preamble = mkt.Template(text=CLUDA_PREAMBLE).render(
        double_support=True)

    knl = ReductionKernel(*positional,
                          neutral="mmc_neutral()",
                          reduce_expr="agg_mmc(a, b)",
                          map_expr="mmc_from_scalar(%s)" % map_args,
                          arguments=knl_args,
                          preamble='\n'.join([cluda_preamble, preamble]))

    return knl
Пример #19
0
def valueMonteCarloGPU(ctx,
                       queue,
                       S_init,
                       nPaths,
                       Exp_Time,
                       dtMonte,
                       Strike,
                       Int_Rate,
                       Vol,
                       PTYPE,
                       nMonteLoops=1):
    """Estimate a discounted ``max(S - Strike, 0)`` payoff by Monte Carlo.

    Paths start at ``S_init`` and are advanced in ``ceil(Exp_Time / dtMonte)``
    steps, each multiplying by
    ``exp((Int_Rate - 0.5*Vol^2)*dt + Vol*sqrt(dt)*ran)``. Paths are
    processed in batches of at most 512 work items; ``nPaths`` is rounded up
    to a whole number of batches.

    :param ctx: OpenCL context used to build the kernels.
    :param queue: OpenCL command queue used for arrays and kernel launches.
    :param S_init: initial value of every path.
    :param nPaths: requested number of paths (rounded up, see above).
    :param Exp_Time: time to expiry.
    :param dtMonte: time step.
    :param Strike: strike subtracted from the final path value.
    :param Int_Rate: rate used for the drift and for discounting.
    :param Vol: volatility.
    :param PTYPE: not used by this function.
    :param nMonteLoops: number of independent repetitions.
    :returns: ``(mean, dtMonte, std)``, the mean and standard deviation of
        the per-repetition averages plus the time step that was passed in.
    """

    nextStepPathKernel = ElementwiseKernel(
        ctx,
        "float *latestStep, float *ran, float Strike, float Int_Rate, float Exp_Time, float dt, float Vol",
        "float rval = exp((Int_Rate - 0.5f * Vol*Vol)*dt + Vol * sqrt(dt) * ran[i]); latestStep[i] *= rval;",
        "nextStepPathKernel")

    excersisePriceKernel = ElementwiseKernel(
        ctx, "float *latestStep, float Strike, float Int_Rate, float Exp_Time",
        "float rval = (latestStep[i]-Strike); latestStep[i] = exp(-Int_Rate*Exp_Time)  * max(rval,0.0f);",
        "excersisePriceKernel")

    sumKernel = ReductionKernel(ctx,
                                numpy.float32,
                                neutral="0",
                                reduce_expr="a+b",
                                map_expr="x[i]",
                                arguments="__global float *x")

    maxWorkItems = 1 * 2**9
    multiplier = 1

    if nPaths > maxWorkItems:
        # Round up to whole batches; int() keeps the value usable in range()
        # and float() forces true division on every Python version.
        multiplier = int(math.ceil(nPaths / float(maxWorkItems)))
        nPaths = multiplier * maxWorkItems
    else:
        maxWorkItems = nPaths

    nTimeStepsMonte = int(math.ceil(Exp_Time / dtMonte))

    # set up random number generator
    gen = RanluxGenerator(queue, maxWorkItems, luxury=4, seed=time.time())

    # the arrays
    ran = cl.array.zeros(queue, maxWorkItems, numpy.float32)
    latestStep = cl.array.zeros_like(ran)

    means = numpy.zeros(nMonteLoops)

    # the loop
    for loop in range(nMonteLoops):

        theSum = 0

        for mult in range(multiplier):

            latestStep.fill(S_init)

            for t in range(nTimeStepsMonte):
                gen.fill_normal(ran)
                gen.synchronize(queue)
                nextStepPathKernel(latestStep, ran, Strike, Int_Rate, Exp_Time,
                                   dtMonte, Vol)

            excersisePriceKernel(latestStep, Strike, Int_Rate, Exp_Time)

            # The queue is a keyword argument of the reduction call;
            # positional arguments are matched against the kernel's own
            # argument list, which here is only `x`.
            theSum += sumKernel(latestStep, queue=queue).get()
        means[loop] = theSum / nPaths

    monteAverage = numpy.mean(means)
    monteStdDeviation = numpy.std(means)

    return monteAverage, dtMonte, monteStdDeviation
Пример #20
0
    def _generate(self):
        """Generate the reduction callable for the configured backend.

        For ``'cython'`` the ``reduction_cy_template`` is rendered, added to
        ``self.tp`` and compiled; ``self.c_func`` becomes the module's
        ``py_<name>`` function. For ``'opencl'`` a PyOpenCL
        ``ReductionKernel`` is built and stored as ``self.c_func``, and
        ``self.queue`` is set. Any other backend leaves the object untouched.
        """
        if self.backend == 'cython':
            if self.func is None:
                # No map function: reduce the input array directly.
                py_data = (['int i', '{type}[:] inp'.format(type=self.type)],
                           ['i', '&inp[0]'])
                c_data = (['int i', '{type}* inp'.format(type=self.type)],
                          ['i', 'inp'])
                map_expr = 'inp[i]'
            else:
                self.tp.add(self.func)
                py_data, c_data = self.cython_gen.get_func_signature(self.func)
                self._correct_return_type(c_data)
                map_expr = '{name}({cargs})'.format(
                    name=self.func.__name__, cargs=', '.join(c_data[1]))

            # The leading index argument is replaced by the total size.
            size_decl = ['long SIZE']
            py_defn = size_decl + py_data[0][1:]
            c_defn = size_decl + c_data[0][1:]
            py_args = ['SIZE'] + py_data[1][1:]

            render_args = dict(
                name=self.name,
                type=self.type,
                map_expr=map_expr,
                reduce_expr=self.reduce_expr,
                neutral=self.neutral,
                c_arg_sig=', '.join(c_defn),
                py_arg_sig=', '.join(py_defn),
                py_args=', '.join(py_args),
                openmp=self._config.use_openmp,
                get_parallel_range=get_parallel_range,
            )
            src = Template(text=reduction_cy_template).render(**render_args)
            self.tp.add_code(src)
            self.tp.compile()
            self.c_func = getattr(self.tp.mod, 'py_' + self.name)
        elif self.backend == 'opencl':
            if self.func is None:
                # No map function: no map expression and an empty preamble.
                arguments = '{type} *in'.format(type=self.type)
                expr = None
                preamble = ''
            else:
                self.tp.add(self.func)
                py_data, c_data = self.cython_gen.get_func_signature(self.func)
                self._correct_opencl_address_space(c_data)
                expr = '{func}({args})'.format(
                    func=self.func.__name__, args=', '.join(c_data[1]))
                arguments = convert_to_float_if_needed(
                    ', '.join(c_data[0][1:]))
                preamble = convert_to_float_if_needed(self.tp.get_code())

            from .opencl import get_context, get_queue
            from pyopencl.reduction import ReductionKernel
            ctx = get_context()
            self.queue = get_queue()
            self.c_func = ReductionKernel(
                ctx,
                dtype_out=self.dtype_out,
                neutral=self.neutral,
                reduce_expr=self.reduce_expr,
                map_expr=expr,
                arguments=arguments,
                preamble=preamble,
            )
Пример #21
0
def cl_reduct_krnl_build(cl_ctx, *args, **kwargs):
    """Build a ``ReductionKernel`` on ``cl_ctx``.

    All remaining positional and keyword arguments are forwarded unchanged
    to the ``ReductionKernel`` constructor.
    """
    kernel = ReductionKernel(cl_ctx, *args, **kwargs)
    return kernel
Пример #22
0
    operation="res_g[i] = dot(a_g[i],b_g[i])",
    name="elem_wise_krnl"
)
# Launch the element-wise kernel; its return value is kept so the reduction
# below can be made to wait on it.
elem_wise_event = elem_wise_krnl(a_g, b_g, res_g)

# NOTE(review): this timestamp is taken as soon as the call above returns.
# If the launch is asynchronous it measures enqueue time, not execution
# time -- confirm before relying on the printed figures.
elem_wise_time = time.time()

# np.set_printoptions(precision=2)
# print(res_g.get())
# print(res_g.get().shape)
# print(res_g.get()[:10])

# Plain sum reduction over a float array. The kernel is constructed between
# the two timestamps, so its construction is part of "reduction_time".
reduction_krnl = ReductionKernel(ctx,
    dtype_out=np.float32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global float *x",
    name="reduction_krnl",
)

# Sum the element-wise results, waiting on the element-wise kernel's event.
res_reduction = reduction_krnl(res_g, queue=queue, wait_for=[elem_wise_event])

# Taken before the result is fetched with .get() at the end of the script.
reduction_time = time.time()

# `start` is set earlier in the script (outside this fragment).
print("elem_wise_time: {}".format(elem_wise_time-start))
print("reduction_time: {}".format(reduction_time-elem_wise_time))
print("Total: {}".format(reduction_time-start))
#print(res_reduction)

# Copy the reduced value to the host and print it.
print(res_reduction.get())
Пример #23
0
import pyopencl as cl
import pyopencl.clrandom as clrand
from pyopencl.reduction import ReductionKernel
import numpy as np

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = 10**7
# Random input of n float32 values created on the device.
x = clrand.rand(queue, n, np.float32)

# Sum of squares of x. The kernel argument type must match the dtype of the
# array that is passed in: x holds float32 values, so the pointer is declared
# as float (declaring it double would misinterpret the data and read past the
# end of the buffer).
rknl = ReductionKernel(ctx,
                       np.float32,
                       neutral="0",
                       reduce_expr="a+b",
                       map_expr="x[i]*x[i]",
                       arguments="__global float *x")

result = rknl(x)
result_np = result.get()
Пример #24
0
    def _generate(self, declarations=None):
        """Generate, build and return the reduction for ``self.backend``.

        Parameters
        ----------
        declarations
            Forwarded unchanged to ``self.tp.add`` when ``self.func`` is set.

        Returns
        -------
        For ``'cython'``, the ``py_<self.name>`` attribute of the compiled
        module. For ``'opencl'`` and ``'cuda'``, the ``ReductionKernel``
        instance of the respective package. Any other backend value falls
        through and yields ``None``.

        Side effects
        ------------
        ``self.source`` and ``self.all_source`` are set for every handled
        backend; ``self.queue`` is additionally set for OpenCL.
        """
        if self.backend == 'cython':
            if self.func is not None:
                self.tp.add(self.func, declarations=declarations)
                py_data, c_data = self.cython_gen.get_func_signature(self.func)
                self._correct_return_type(c_data)
                name = self.func.__name__
                cargs = ', '.join(c_data[1])
                map_expr = '{name}({cargs})'.format(name=name, cargs=cargs)
            else:
                # No map function given: reduce the input array itself.
                py_data = (['int i', '{type}[:] inp'.format(type=self.type)],
                           ['i', '&inp[0]'])
                c_data = (['int i',
                           '{type}* inp'.format(type=self.type)], ['i', 'inp'])
                map_expr = 'inp[i]'
            # The first entry of each signature list is replaced by SIZE.
            py_defn = ['long SIZE'] + py_data[0][1:]
            c_defn = ['long SIZE'] + c_data[0][1:]
            py_args = ['SIZE'] + py_data[1][1:]
            template = Template(text=reduction_cy_template)
            src = template.render(name=self.name,
                                  type=self.type,
                                  map_expr=map_expr,
                                  reduce_expr=self.reduce_expr,
                                  neutral=self.neutral,
                                  c_arg_sig=', '.join(c_defn),
                                  py_arg_sig=', '.join(py_defn),
                                  py_args=', '.join(py_args),
                                  openmp=self._config.use_openmp,
                                  get_parallel_range=get_parallel_range)
            # This is the user code source (captured before the generated
            # reduction code is appended below).
            self.source = self.tp.get_code()
            self.tp.add_code(src)
            self.tp.compile()
            self.all_source = self.tp.source
            return getattr(self.tp.mod, 'py_' + self.name)
        elif self.backend == 'opencl':
            expr, arguments, preamble = self._gpu_reduction_inputs(
                declarations)

            from .opencl import get_context, get_queue
            from pyopencl.reduction import ReductionKernel
            from pyopencl._cluda import CLUDA_PREAMBLE
            cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
                double_support=True)
            full_preamble = "\n".join([cluda_preamble, preamble])

            ctx = get_context()
            self.queue = get_queue()
            knl = ReductionKernel(ctx,
                                  dtype_out=self.dtype_out,
                                  neutral=self.neutral,
                                  reduce_expr=self.reduce_expr,
                                  map_expr=expr,
                                  arguments=arguments,
                                  preamble=full_preamble)
            # only code we generate is saved here.
            self.source = full_preamble
            if knl.stage_1_inf.source:
                self.all_source = "\n".join([
                    "// ------ stage 1 -----",
                    knl.stage_1_inf.source,
                    "// ------ stage 2 -----",
                    knl.stage_2_inf.source,
                ])
            else:
                self.all_source = self.source
            return knl
        elif self.backend == 'cuda':
            expr, arguments, preamble = self._gpu_reduction_inputs(
                declarations)

            from .cuda import set_context
            set_context()
            from pycuda.reduction import ReductionKernel
            from pycuda._cluda import CLUDA_PREAMBLE
            cluda_preamble = Template(text=CLUDA_PREAMBLE).render(
                double_support=True)
            full_preamble = "\n".join([cluda_preamble, preamble])

            knl = ReductionKernel(dtype_out=self.dtype_out,
                                  neutral=self.neutral,
                                  reduce_expr=self.reduce_expr,
                                  map_expr=expr,
                                  arguments=arguments,
                                  preamble=full_preamble)
            # only code we generate is saved here. Store exactly the text
            # handed to the kernel (newline-joined), as the OpenCL branch
            # does, so the recorded source matches what was compiled.
            self.source = full_preamble
            # FIXME: it is difficult to get the sources from pycuda.
            self.all_source = self.source
            return knl

    def _gpu_reduction_inputs(self, declarations=None):
        """Return ``(map_expr, arguments, preamble)`` for a GPU kernel.

        Holds the setup shared by the OpenCL and CUDA branches of
        ``_generate``. Without ``self.func`` the map expression is ``None``,
        the argument list is a single ``<type> *in`` pointer and the preamble
        is empty. With ``self.func`` the function is added to ``self.tp``,
        the map expression is a call to it built from its C signature, and
        both the argument list and the transpiled code are passed through
        ``convert_to_float_if_needed``.
        """
        if self.func is None:
            return None, '{type} *in'.format(type=self.type), ''

        self.tp.add(self.func, declarations=declarations)
        _py_data, c_data = self.cython_gen.get_func_signature(self.func)
        self._correct_opencl_address_space(c_data)
        expr = '{func}({args})'.format(func=self.func.__name__,
                                       args=', '.join(c_data[1]))
        # The first declared argument is dropped from the kernel signature.
        arguments = convert_to_float_if_needed(', '.join(c_data[0][1:]))
        preamble = convert_to_float_if_needed(self.tp.get_code())
        return expr, arguments, preamble
Пример #25
0
    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
    {
        minmax_collector result = a;
        if (b.cur_min < result.cur_min)
            result.cur_min = b.cur_min;
        if (b.cur_max > result.cur_max)
            result.cur_max = b.cur_max;
        return result;
    }

    """

from pyopencl.clrandom import rand as clrand
from pyopencl.reduction import ReductionKernel

# int32 test data generated through the queue, plus the array obtained from
# .get() that serves as the reference for the checks below.
a_gpu = clrand(queue, (20000, ), dtype=np.int32, a=0, b=10**6)
a = a_gpu.get()

# Reduction whose result type is the min/max collector struct: every element
# is mapped to a collector and collectors are merged pairwise with agg_mmc.
red = ReductionKernel(
    ctx,
    dtype_out=mmc_dtype,
    neutral="mmc_neutral()",
    reduce_expr="agg_mmc(a, b)",
    map_expr="mmc_from_scalar(x[i])",
    arguments="__global int *x",
    preamble=preamble,
)

minmax = red(a_gpu).get()

# Both collector fields must agree with the NumPy reference values.
for field, expected in (("cur_min", np.min(a)), ("cur_max", np.max(a))):
    assert abs(minmax[field] - expected) < 1e-5