Exemplos de ReductionKernel em Python, exemplos de pycuda.reduction.ReductionKernel em Python

Exemplo n.º 1

0

Exibir arquivo

        def gpuFunc(iterator):
            """Count space-to-word transitions in a partition of text on the GPU.

            The items of ``iterator`` are joined with single spaces, stored as
            one byte per character and reduced on CUDA device 0.  The kernel
            adds 1 for every position where a space (ASCII 32) is immediately
            followed by a non-space character, so a word at the very start of
            the text (with no space in front of it) is not counted.

            :param iterator: iterable of strings.
            :returns: generator yielding exactly one value, the reduction
                result copied back to the host.
            """
            # 1. Data preparation
            cpu_dataset = " ".join(iterator)
            ascii_data = np.asarray([ord(x) for x in cpu_dataset],
                                    dtype=np.uint8)

            # With fewer than two characters there is no adjacent pair to
            # compare, so the count is zero and the GPU is not needed.
            if ascii_data.size < 2:
                yield np.zeros((), dtype=np.int64)
                return

            # 2. Driver initialization and data transfer
            cuda.init()
            dev = cuda.Device(0)
            contx = dev.make_context()
            gpu_dataset = None
            countkrnl = None
            try:
                gpu_dataset = gpuarray.to_gpu(ascii_data)

                # 3. GPU kernel.
                # ``a`` is the text without its last character and ``b`` the
                # text without its first, so (a[i], b[i]) is an adjacent pair.
                # np.int64 replaces the Python-2-only ``long`` builtin.
                countkrnl = reduction.ReductionKernel(
                    np.int64,
                    neutral="0",
                    map_expr="(a[i] == 32)*(b[i] != 32)",
                    reduce_expr="a + b",
                    arguments="char *a, char *b")

                results = countkrnl(gpu_dataset[:-1], gpu_dataset[1:]).get()
                yield results
            finally:
                # Runs on normal exhaustion, on error, and when the consumer
                # closes the generator early: drop device references while the
                # context is still current, then release the context.
                gpu_dataset = None
                countkrnl = None
                contx.pop()
                del contx
                gc.collect()

Exemplo n.º 2

0

Exibir arquivo

Arquivo: pi_cuda.py Projeto: huixiangufl/cloud-computing-starsxg

        def gpuPi(iterator):
            """Monte-Carlo point test for estimating pi on the GPU.

            One random point (a, b) in the unit square is drawn per item of
            ``iterator``; the items themselves are only counted.  The kernel
            sums 1.0 for every point with a*a + b*b >= 1.0.

            NOTE(review): that condition selects points on or outside the
            unit circle, although the result was originally named
            ``pointInsideCircle`` -- confirm how the caller turns the yielded
            count into pi before changing the comparison.

            :param iterator: iterable whose length sets the number of samples.
            :returns: generator yielding exactly one value, the reduction
                result copied back to the host.
            """
            length = sum(1 for _ in iterator)

            # Nothing to sample: report a zero count without touching the GPU.
            if length == 0:
                yield np.zeros((), dtype=np.float64)
                return

            # random_sample returns float64 arrays.
            a = np.random.random_sample(length)
            b = np.random.random_sample(length)

            cuda.init()
            dev = cuda.Device(0)
            contx = dev.make_context()
            gpu_a = gpu_b = countkrnl = None
            try:
                gpu_a = gpuarray.to_gpu(a)
                gpu_b = gpuarray.to_gpu(b)
                # The pointer types must match the float64 device buffers;
                # declaring them as ``float *`` made the kernel reinterpret
                # the bytes of each double as single-precision values.
                countkrnl = reduction.ReductionKernel(
                    np.float64,
                    neutral="0",
                    map_expr="(a[i]*a[i] + b[i]*b[i] >= 1.0) ? 1.0 : 0.0",
                    reduce_expr="a+b",
                    arguments="double * a, double * b")
                pointCount = countkrnl(gpu_a, gpu_b).get()
                yield pointCount
            finally:
                # Release device buffers while the context is still current,
                # then pop it; this also runs on errors and early close.
                gpu_a = gpu_b = countkrnl = None
                contx.pop()
                del contx
                gc.collect()

Exemplo n.º 3

0

Exibir arquivo

    def get_reduction_kernel(self, reduce_expr, map_expr, neutral, *args):
        """Build a PyCUDA reduction over buffers registered in ``self.arrays``.

        Every entry of ``args`` is used as a key into ``self.arrays``.  The
        buffers found there are passed to the kernel, in the given order, as
        ``const <ctype> *x0``, ``const <ctype> *x1`` and so on, and the dtype
        of the first buffer is used as the kernel's output dtype.  See the
        PyCUDA documentation of pycuda.reduction.ReductionKernel for the
        meaning of the three expressions.

        :param reduce_expr: expression combining two values ``a`` and ``b``
            into one, e.g. 'a+b'
        :param map_expr: expression producing the value for index ``i`` from
            the input arrays x0, x1, ..., e.g. 'x0[i]*x1[i]'
        :param neutral: neutral element of reduce_expr, e.g. '0'
        :param args: keys of the buffers in ``self.arrays`` to reduce over
        :returns: zero-argument callable that launches the kernel on those
            buffers and returns the result of ``.get()``
        """
        buffers = [self.arrays[name] for name in args]
        signature = ', '.join(
            'const {0} *x{1}'.format(
                pycuda.tools.dtype_to_ctype(buf.dtype), index)
            for index, buf in enumerate(buffers))

        kernel = reduction.ReductionKernel(
            buffers[0].dtype,
            arguments=signature,
            map_expr=map_expr,
            reduce_expr=reduce_expr,
            neutral=neutral)

        def run():
            return kernel(*buffers).get()

        return run

Exemplo n.º 4

0

Exibir arquivo

Arquivo: word_count_example.py Projeto: Freshield/LEARN_PYCUDA

def createCudaWordCountKernel():
    """Return a ReductionKernel that counts space-to-non-space transitions.

    The kernel takes two ``char*`` arrays ``a`` and ``b`` and sums, as
    float32, one for each index where ``a[i]`` is a space and ``b[i]`` is
    not.
    """
    return reduction.ReductionKernel(
        numpy.float32,
        arguments="char* a, char* b",
        # 32 is the ASCII code of the space character.
        map_expr="(a[i] == 32)*(b[i] != 32)",
        reduce_expr="a+b",
        neutral="0")

Exemplo n.º 5

0

Exibir arquivo

def maxabs(x_gpu):
    """
    Maximum absolute value of a GPU array.

    Runs a reduction with ``abs(x[i])`` as the map step and ``max(a,b)`` as
    the reduce step.  One kernel is built per input dtype and kept in the
    ``maxabs.cache`` dictionary, which this function reads but does not
    create.

    Parameters
    ----------
    x_gpu : pycuda.gpuarray.GPUArray
        Input array.

    Returns
    -------
    m_gpu : pycuda.gpuarray.GPUArray
        Result of the reduction: float64 when `x_gpu` is float64 or
        complex128, float32 for every other dtype.

    Examples
    --------
    >>> import pycuda.autoinit
    >>> import pycuda.gpuarray as gpuarray
    >>> import misc
    >>> x_gpu = gpuarray.to_gpu(np.array([-1, 2, -3], np.float32))
    >>> m_gpu = misc.maxabs(x_gpu)
    >>> np.allclose(m_gpu.get(), 3.0)
    True

    """

    kernels = maxabs.cache
    dtype = x_gpu.dtype
    if dtype not in kernels:
        # First use of this dtype: build the kernel and remember it.
        if dtype in [np.float64, np.complex128]:
            ret_type = np.float64
        else:
            ret_type = np.float32
        kernels[dtype] = reduction.ReductionKernel(
            ret_type,
            arguments="{ctype} *x".format(
                ctype=tools.dtype_to_ctype(dtype)),
            map_expr="abs(x[i])",
            reduce_expr="max(a,b)",
            neutral="0")
    return kernels[dtype](x_gpu)

Exemplo n.º 6

0

Exibir arquivo

Arquivo: pagerank_mapp.py Projeto: huixiangufl/cloud-computing-starsxg

	def gpuFunc(iterator):
		"""Compute damped row-times-PVector dot products on the GPU.

		Only lines starting with '#' are processed; every other line
		(including blank ones) is skipped.  A processed line has the form
		``#<tag> <nodeID> <p0> <p1> ...``: the node id sits between the
		first and second space and the remaining space-separated fields
		are floats.  Each value p is replaced by
		``p*dpFactor + (1-dpFactor)/len(row)`` and the dot product of that
		row with ``PVector`` is stored at index ``nodeID`` of the result.

		``PVector`` is not a parameter; it must be defined outside this
		function.

		:param iterator: iterable of text lines.
		:returns: generator yielding exactly one list.  The list is sized
			from the first processed row and stays empty when no line
			starts with '#'.
		"""
		cpu_data = list(iterator)
		# Same output as the Python 2 statement ``print "PVector", PVector``.
		print("PVector %s" % (PVector,))
		dpFactor = 0.85

		pList = []
		cuda.init()
		dev = cuda.Device(0)
		contx = dev.make_context()
		gpu_matVect = gpu_pVect = countkrnl = None
		try:
			# The kernel and the PVector upload do not depend on the row,
			# so they are created once instead of once per line.
			countkrnl = reduction.ReductionKernel(
				np.float32,
				neutral="0",
				map_expr="a[i]*b[i]",
				reduce_expr="a+b",
				arguments="float *a, float *b")
			gpu_pVect = gpuarray.to_gpu(np.asarray(PVector, np.float32))

			for line in cpu_data:
				if not line.startswith('#'):
					continue
				# Fields: marker, node id, then the row values.
				_, nodeField, rowField = line.split(' ', 2)
				nodeID = int(nodeField)
				probListStr = rowField.split(' ')
				matSize = len(probListStr)
				probListFlt = [float(x) * dpFactor + (1 - dpFactor) / matSize
						for x in probListStr]
				gpu_matVect = gpuarray.to_gpu(
					np.asarray(probListFlt, dtype=np.float32))
				result = countkrnl(gpu_matVect, gpu_pVect).get()
				# Size the output on the first processed row, whichever
				# input line that happens to be.
				if not pList:
					pList = [0.0] * matSize
				pList[nodeID] = result
			yield pList
		finally:
			# Release GPU resources on success, on error and on early
			# close: drop device references first, then pop the context.
			gpu_matVect = gpu_pVect = countkrnl = None
			contx.pop()
			del contx
			gc.collect()

Exemplo n.º 7

0

Exibir arquivo

import numpy as np
import pycuda.autoinit
from pycuda import gpuarray, reduction

# Host data: the integers 0..1000 as unsigned 32-bit values.
x = np.arange(1001, dtype=np.uint32)

# Sum of squares: each element is converted to float before squaring and
# the partial sums are accumulated in single precision.
kernel = reduction.ReductionKernel(
    np.float32,
    neutral="0.0",
    reduce_expr="a + b",
    map_expr="(float)x[i] * x[i]",
    arguments="unsigned int* x",
)

# Copy to the device, reduce, and bring the result back to the host.
x_gpu = gpuarray.to_gpu(x)
result = kernel(x_gpu).get()

print(result)

Exemplo n.º 8

0

Exibir arquivo

Arquivo: test_reduction.py Projeto: Freshield/LEARN_PYCUDA

import pycuda.reduction as rd
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy as np

# Two identical device vectors 0, 1, ..., 399.
a = gpuarray.arange(400, dtype=np.float32)
b = gpuarray.arange(400, dtype=np.float32)

# Dot product as a reduction: multiply element-wise, then add everything.
krnl = rd.ReductionKernel(np.float32,
                          neutral='0',
                          reduce_expr='a+b',
                          map_expr='x[i]*y[i]',
                          arguments='float *x, float *y')

my_dot_prod = krnl(a, b).get()

# Single-argument print calls produce the same output on Python 2 and 3;
# the bare print statement is a syntax error on Python 3.
print(my_dot_prod)

# Host-side reference value for comparison.
print(np.sum(np.arange(400)**2))

Exemplo n.º 9

0

Exibir arquivo

    def __init__(self):

        size = self.gridDIM_y * self.gridDIM_p_y * self.gridDIM_x * self.gridDIM_p_x

        self.FAFT_axes0 = 0
        self.FAFT_axes1 = 1
        self.FAFT_axes2 = 2
        self.FAFT_axes3 = 3

        #m = size

        self.FAFT_segment_axes0 = 0
        self.FAFT_segment_axes1 = 0
        self.FAFT_segment_axes2 = 0
        self.FAFT_segment_axes3 = 0

        self.NF = 1

        # Phase space step size
        self.dp_y = 2 * self.p_y_amplitude / float(self.gridDIM_p_y)  #axis 0
        self.dy = 2 * self.y_amplitude / float(self.gridDIM_y)  #axis 1
        self.dp_x = 2 * self.p_x_amplitude / float(self.gridDIM_p_x)  #axis 2
        self.dx = 2 * self.x_amplitude / float(self.gridDIM_x)  #axis 3

        # Ambiguity space step size
        self.dtheta_y = 2 * self.theta_y_amplitude / float(
            self.gridDIM_y)  #axis 0
        self.dlambda_y = 2 * self.lambda_y_amplitude / float(
            self.gridDIM_p_y)  #axis 1
        self.dtheta_x = 2 * self.theta_x_amplitude / float(
            self.gridDIM_x)  #axis 2
        self.dlambda_x = 2 * self.lambda_x_amplitude / float(
            self.gridDIM_p_x)  #axis 3

        # delta parameters
        self.delta_p_y = self.dp_y * self.dtheta_y / (2 * np.pi)  #axis 0
        self.delta_y = self.dy * self.dlambda_y / (2 * np.pi)  #axis 1
        self.delta_p_x = self.dp_x * self.dtheta_x / (2 * np.pi)  #axis 2
        self.delta_x = self.dx * self.dlambda_x / (2 * np.pi)  #axis 3

        # Phase space
        self.p_y_range = np.linspace(-self.p_y_amplitude,
                                     self.p_y_amplitude - self.dp_y,
                                     self.gridDIM_p_y)  #axis 0
        self.y_range = np.linspace(-self.y_amplitude,
                                   self.y_amplitude - self.dy,
                                   self.gridDIM_y)  #axis 1
        self.p_x_range = np.linspace(-self.p_x_amplitude,
                                     self.p_x_amplitude - self.dp_x,
                                     self.gridDIM_p_x)  #axis 2
        self.x_range = np.linspace(-self.x_amplitude,
                                   self.x_amplitude - self.dx,
                                   self.gridDIM_x)  #axis 3

        # Ambiguity space range
        self.theta_y_range = np.linspace(
            -self.theta_y_amplitude, self.theta_y_amplitude - self.dtheta_y,
            self.gridDIM_y)  #0
        self.lambda_y_range = np.linspace(
            -self.lambda_y_amplitude, self.lambda_y_amplitude - self.dlambda_y,
            self.gridDIM_p_y)  #1
        self.theta_x_range = np.linspace(
            -self.theta_x_amplitude, self.theta_x_amplitude - self.dtheta_x,
            self.gridDIM_x)  #2
        self.lambda_x_range = np.linspace(
            -self.lambda_x_amplitude, self.lambda_x_amplitude - self.dlambda_x,
            self.gridDIM_p_x)  #3

        # Grid
        self.y = self.y_range[np.newaxis, :, np.newaxis, np.newaxis]  #axis 1
        self.p_y = self.p_y_range[:, np.newaxis, np.newaxis,
                                  np.newaxis]  #axis 0
        self.p_x = self.p_x_range[np.newaxis, np.newaxis, :,
                                  np.newaxis]  #axis 2
        self.x = self.x_range[np.newaxis, np.newaxis, np.newaxis, :]  #axis 3

        self.CUDA_constants = '\n'

        self.CUDA_constants += '__device__ double dt   = %f;   ' % self.dt
        self.CUDA_constants += '__device__ double mass = %f; \n' % self.mass

        self.CUDA_constants += '__device__ double dp_y   = %f; ' % self.dp_y
        self.CUDA_constants += '__device__ double dy     = %f; ' % self.dy
        self.CUDA_constants += '__device__ double dp_x   = %f; ' % self.dp_x
        self.CUDA_constants += '__device__ double dx     = %f; \n' % self.dx

        self.CUDA_constants += '__device__ double dtheta_y   = %f; ' % self.dtheta_y
        self.CUDA_constants += '__device__ double dlambda_y  = %f; ' % self.dlambda_y
        self.CUDA_constants += '__device__ double dtheta_x   = %f; ' % self.dtheta_x
        self.CUDA_constants += '__device__ double dlambda_x  = %f; \n' % self.dlambda_x

        self.CUDA_constants += '__device__ int gridDIM_x = %d; ' % self.gridDIM_x
        self.CUDA_constants += '__device__ int gridDIM_y = %d; ' % self.gridDIM_y

        try:
            self.CUDA_constants += '__device__ double D_lambda_x = %f; ' % self.D_lambda_x
            self.CUDA_constants += '__device__ double D_lambda_y = %f; ' % self.D_lambda_y
            self.CUDA_constants += '__device__ double D_theta_x  = %f; ' % self.D_theta_x
            self.CUDA_constants += '__device__ double D_theta_y  = %f; \n' % self.D_theta_y
        except AttributeError:
            pass

        self.CUDA_constants += '\n'

        print self.CUDA_constants

        #...........................................................................

        print '         GPU memory Total               ', pycuda.driver.mem_get_info(
        )[1] / float(2**30), 'GB'
        print '         GPU memory Free  (Before)      ', pycuda.driver.mem_get_info(
        )[0] / float(2**30), 'GB'

        self.W_init_gpu = gpuarray.zeros((self.gridDIM_p_y, self.gridDIM_y,
                                          self.gridDIM_p_x, self.gridDIM_x),
                                         dtype=np.complex128)
        print '         GPU memory Free  (After)       ', pycuda.driver.mem_get_info(
        )[0] / float(2**30), 'GB'

        #............................................................................

        self.indexUnpack_x_p_string = """
			int i_x   = i%gridDIM_x;
			int i_p_x = (i/gridDIM_x) % gridDIM_x;
			int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
			int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

			double x   = dx  *( i_x   - gridDIM_x/2 );
			double p_x = dp_x*( i_p_x - gridDIM_x/2 );
			double y   = dy  *( i_y   - gridDIM_y/2 );
			double p_y = dp_y*( i_p_y - gridDIM_y/2 );			
			"""

        self.indexUnpack_lambda_theta_string = """
			int i_x   = i%gridDIM_x;
			int i_p_x = (i/gridDIM_x) % gridDIM_x;
			int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
			int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

			double lambda_x  = dlambda_x * ( i_x   - gridDIM_x/2 );
			double theta_x   = dtheta_x  * ( i_p_x - gridDIM_x/2 );
			double lambda_y  = dlambda_y * ( i_y   - gridDIM_y/2 );
			double theta_y   = dtheta_y  * ( i_p_y - gridDIM_y/2 );			
			"""

        self.indexUnpack_lambda_p_string = """
			int i_x    = i%gridDIM_x;
			int i_p_x  = (i/gridDIM_x) % gridDIM_x;
			int i_y    = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
			int i_p_y  = i/(gridDIM_x*gridDIM_x*gridDIM_y);

			double lambda_x   = dlambda_x*( i_x   - gridDIM_x/2 );
			double p_x        = dp_x     *( i_p_x - gridDIM_x/2 );
			double lambda_y   = dlambda_y*( i_y   - gridDIM_y/2 );
			double p_y        = dp_y     *( i_p_y - gridDIM_y/2 );			
			"""
        self.indexUnpack_x_theta_string = """
			int i_x   = i%gridDIM_x;
			int i_p_x = (i/gridDIM_x) % gridDIM_x;
			int i_y   = (i/(gridDIM_x*gridDIM_x)) % gridDIM_y;
			int i_p_y = i/(gridDIM_x*gridDIM_x*gridDIM_y);

			double x       = dx      *( i_x   - gridDIM_x/2 );
			double theta_x = dtheta_x*( i_p_x - gridDIM_x/2 );
			double y       = dy      *( i_y   - gridDIM_y/2 );
			double theta_y = dtheta_y*( i_p_y - gridDIM_y/2 );	
			"""

        #...............................................................................................

        self.Gaussian_GPU = ElementwiseKernel(
            """pycuda::complex<double> *W , 
				double    mu_p_y, double    mu_y, double    mu_p_x, double    mu_x , 
				double sigma_p_y, double sigma_y, double sigma_p_x, double sigma_x """,
            self.indexUnpack_x_p_string + """
			double temp =   exp(-0.5*( x   - mu_x   )*( x   - mu_x   )/( sigma_x   * sigma_x   )  );
			       temp *=  exp(-0.5*( y   - mu_y   )*( y   - mu_y   )/( sigma_y   * sigma_y   )  );
			       temp *=	exp(-0.5*( p_x - mu_p_x )*( p_x - mu_p_x )/( sigma_p_x * sigma_p_x )  );
			       temp *=	exp(-0.5*( p_y - mu_p_y )*( p_y - mu_p_y )/( sigma_p_y * sigma_p_y )  );

			W[i] = pycuda::complex<double>(  temp , 0. ); 

					""",
            "Gaussian",
            preamble="#define _USE_MATH_DEFINES" + self.CUDA_constants)

        #
        self.HOscillatorGound_GPU = ElementwiseKernel(
            """pycuda::complex<double> *W, 
			double   x_mu, double    y_mu,
                        double p_x_mu, double  p_y_mu,
			double omega_x, double omega_y, double mass""",
            self.indexUnpack_x_p_string + """
			double temp  = (mass*pow( omega_x*(x-x_mu) ,2) + pow(p_x-p_x_mu,2)/mass)/omega_x;   
			       temp += (mass*pow( omega_y*(y-y_mu) ,2) + pow(p_y-p_y_mu,2)/mass)/omega_y;

			W[i] = pycuda::complex<double>(  exp(-temp) , 0. ); 
					          """,
            "Gaussian",
            preamble="#define _USE_MATH_DEFINES" + self.CUDA_constants)

        # Kinetic propagator ................................................................................

        kineticStringC = '__device__ double K(double p_x, double p_y){ \n return ' + self.kineticString + ';\n}'

        self.exp_p_lambda_GPU = ElementwiseKernel(
            """ pycuda::complex<double> *B """,
            self.indexUnpack_lambda_p_string + """ 				
			double  r  = exp( - dt*D_lambda_y * lambda_x*lambda_x );
	 			r *= exp( - dt*D_lambda_y * lambda_y*lambda_y );
 
			double phase  = dt*K(p_x + 0.5*lambda_x, p_y + 0.5*lambda_y) - dt*K(p_x - 0.5*lambda_x, p_y - 0.5*lambda_y);
			B[i] *= pycuda::complex<double>( r*cos(phase), -r*sin(phase) );

			""",
            "exp_p_lambda_GPU",
            preamble="#define _USE_MATH_DEFINES" + self.CUDA_constants +
            kineticStringC)

        #  Potential propagator ..............................................................................

        potentialStringC = '__device__ double V(double x, double y){ \n return ' + self.potentialString + ';\n}'

        self.exp_x_theta_GPU = ElementwiseKernel(
            """ pycuda::complex<double> *B """,
            self.indexUnpack_x_theta_string + """ 
			double phase  = dt*V(x-0.5*theta_x , y-0.5*theta_y) - dt*V( x+0.5*theta_x , y+0.5*theta_y );
			
			double  r  = exp( - dt*D_theta_y * theta_x*theta_x - dt*D_theta_y * theta_y*theta_y );

			B[i] *= pycuda::complex<double>( r*cos(phase), -r*sin(phase) );

			""",
            "exp_x_theta_GPU",
            preamble="#define _USE_MATH_DEFINES" + self.CUDA_constants +
            potentialStringC)

        # Ehrenfest theorems .................................................................................

        x_Define = "\n#define x(i)    dx*( (i%gridDIM_x) - 0.5*gridDIM_x )\n"
        p_x_Define = "\n#define p_x(i)  dp_x*( ((i/gridDIM_x) % gridDIM_x)-0.5*gridDIM_x)\n"

        y_Define = "\n#define y(i)   dy  *( (i/(gridDIM_x*gridDIM_x)) % gridDIM_y  - 0.5*gridDIM_y)\n"
        p_y_Define = "\n#define p_y(i) dp_y*(  i/(gridDIM_x*gridDIM_x*gridDIM_y) - 0.5*gridDIM_y )\n"

        p_x_p_y_Define = p_x_Define + p_y_Define
        phaseSpaceDefine = p_x_Define + p_y_Define + x_Define + y_Define

        self.Average_x_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>( x(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + x_Define +
            self.CUDA_constants)

        self.Average_x_square_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>( x(i)*x(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + x_Define +
            self.CUDA_constants)

        self.Average_p_x_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>( p_x(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + p_x_Define +
            self.CUDA_constants)

        self.Average_p_x_square_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr=
            "pycuda::real<double>( p_x(i)*p_x(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + p_x_Define +
            self.CUDA_constants)

        self.Average_y_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>( y(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + y_Define +
            self.CUDA_constants)

        self.Average_p_y_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>( p_y(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + p_y_Define +
            self.CUDA_constants)

        #
        self.Average_y_square_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>( y(i)*y(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + y_Define +
            self.CUDA_constants)

        self.Average_p_y_square_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr=
            "pycuda::real<double>( p_y(i)*p_y(i)*dx*dy*dp_x*dp_y*W[i] )",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + p_y_Define +
            self.CUDA_constants)

        #
        kineticString = self.kineticString.replace('p_x', 'p_x(i)')
        kineticString = kineticString.replace('p_y', 'p_y(i)')
        potentialString = (self.potentialString.replace('x', 'x(i)')).replace(
            'y', 'y(i)')
        energyString = kineticString + "+" + potentialString

        print "\n"
        print energyString

        self.Energy_GPU = reduction.ReductionKernel(
            np.float64,
            neutral="0",
            reduce_expr="a+b",
            map_expr="pycuda::real<double>((" + energyString +
            ")*dx*dy*dp_x*dp_y*W[i])",
            arguments="pycuda::complex<double> *W",
            preamble="#define _USE_MATH_DEFINES" + phaseSpaceDefine +
            self.CUDA_constants)