Example #1
	def prepare_kernels( s ):
		mod = cuda.SourceModule( file('./gpu_core/dielectric.cu','r').read() )
		s.update_e = mod.get_function("update_e")
		s.update_h = mod.get_function("update_h")

		Db = ( s.tpb_main, 1, 1 )
		s.update_e.prepare( "iiPPPPPPPPP", block=Db, shared=s.ns_main )
		s.update_h.prepare( "iiPPPPPP", block=Db, shared=s.ns_main )
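		# Hedged note (not in the original): kernels prepared this way are later launched
		# with prepared_call, passing the grid followed by argument values matching the
		# format string, e.g. (illustrative names only):
		#   s.update_e.prepared_call( (s.bpg_main, 1), nx, ny, ce_gpu, ..., hz_gpu )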
Example #2
if __name__ == '__main__':

    import pycuda.driver as drv
    import numpy
    import pycuda.autoinit

    mod = drv.SourceModule("""
    __global__ void multiply_them(float *dest, float *a, float *b)
    {
      const int i = threadIdx.x;
      dest[i] = a[i] * b[i];
    }
    """)

    multiply_them = mod.get_function("multiply_them")

    a = numpy.random.randn(400).astype(numpy.float32)
    b = numpy.random.randn(400).astype(numpy.float32)

    dest = numpy.zeros_like(a)
    multiply_them(drv.Out(dest), drv.In(a), drv.In(b), block=(400, 1, 1))

    print dest - a * b
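    # For comparison (a sketch, not part of the original example): the same element-wise
    # product via pycuda.gpuarray, which handles the host/device copies implicitly.
    import pycuda.gpuarray as gpuarray
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)
    print (a_gpu * b_gpu).get() - a * b  # should again be all zeros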
Example #3
	psizEyf = cuda.mem_alloc( size_psiz )
	psizEyb = cuda.mem_alloc( size_psiz )
	psizHxf = cuda.mem_alloc( size_psiz )
	psizHxb = cuda.mem_alloc( size_psiz )
	psizHyf = cuda.mem_alloc( size_psiz )
	psizHyb = cuda.mem_alloc( size_psiz )
	'''

	# Copy the arrays from host to device
	cuda.memcpy_htod( devCEx, CEx )
	cuda.memcpy_htod( devCEy, CEy )
	cuda.memcpy_htod( devCEz, CEz )


	# Get the module from the cuda files
	mod_common = cuda.SourceModule( file('common.cu','r').read() )
	mod_dielectric = cuda.SourceModule( file('dielectric.cu','r').read() )
	mod_source = cuda.SourceModule( file('source.cu','r').read() )
	#mod_cpml = cuda.SourceModule( file('cpml.cu','r').read().replace('NPMLp2',str(2*(Npml+1))).replace('NPMLp',str(Npml+1)).replace('NPML',str(Npml)  ) )

	'''
	# Get the global pointer from the module
	rcmbE = mod_cpml.get_global("rcmbE")
	rcmbH = mod_cpml.get_global("rcmbH")
	rcmaE = mod_cpml.get_global("rcmaE")
	rcmaH = mod_cpml.get_global("rcmaH")

	#print rcmaE
	#print bE

	# Copy the arrays from host to constant memory
Example #4
            for (zi=0; zi < nz; zi++) {
                const real QdotR = Qx[qxi]*x[xi] +Qy[qyi]*y[yi] + Qz[qzi]*z[zi];
                real cx,sx;
                sincos(QdotR,&sx,&cx);
                Re += density[densityidx]*cx;
                Im += density[densityidx]*sx;
                densityidx++;
            }
        }
    }
    result[idx].x = tex2D(tex, Re);
    result[idx].y = tex2D(tex, Im);
}
*/
"""
cuda_texture = cuda.SourceModule(kernel_source)
import sys
sys.exit()
cuda_texture_func = cuda_texture.get_function("cudaBorn")
texref = cuda_texture.get_texref("tex")


def cudoSMBA_form(cell, Q, lattice, beam):
    '''
    This function ties the CUDA calculation of the SMBA to my class structures. Other tie-ins
    may easily be included here.
    '''
    form = (born(cell.unit, cell.value_list[0], cell.value_list[1],
                 cell.value_list[2], Q.q_list[0], Q.q_list[1], Q.q_list[2]))
    return form
Example #5
import pycuda.driver as cuda
import numpy as np

mod = cuda.SourceModule("""
      __global__ void doublify(float *a)
      {
        int idx = threadIdx.x + threadIdx.y*4;
        a[idx] *= 2;
      }
      """)


def test_cuda():
    cuda.init()
    assert cuda.Device.count() >= 1
    dev = cuda.Device(0)
    ctx = dev.make_context()
    a = np.random.randn(4, 4).astype(np.float32)
    a_gpu = cuda.mem_alloc(a.size * a.dtype.itemsize)
    cuda.memcpy_htod(a_gpu, a)
    func = mod.get_function("doublify")
    func(a_gpu, block=(4, 4, 1))
    a_doubled = np.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)
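    # Hedged completion (not in the original): check the result and release the
    # context created by make_context() above.
    assert np.allclose(a_doubled, 2 * a)
    ctx.pop()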
Example #6
mod = drv.SourceModule("""
__global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SCALAR V = V_arr[i];
    SCALAR ge = ge_arr[i];
    SCALAR gi = gi_arr[i]; 
    SCALAR V__tmp = (ge+gi-(V+0.049))/0.02;
    SCALAR ge__tmp = -ge/0.005;
    SCALAR gi__tmp = -gi/0.01;
    V_arr[i] = V+0.0001*V__tmp;
    ge_arr[i] = ge+0.0001*ge__tmp;
    gi_arr[i] = gi+0.0001*gi__tmp;
}

__global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    bool this_spiked = V[i]>-0.05; 
    spiked[i] = this_spiked;
    if(this_spiked) // && i<N) // can leave out i<N check if N%blocksize=0
    {
        unsigned int j = atomicInc(global_j, N);
        spikes[j] = i;
    }
}

__global__ void propagate(int *spikes, int numspikes, SCALAR *v, SCALAR *W, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SCALAR v_tmp = v[i];
    for(int j=0; j<numspikes; j++)
        v_tmp += W[i+N*spikes[j]];
    v[i] = v_tmp;
}

__global__ void reset(SCALAR *V, bool *spiked)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    bool has_spiked = spiked[i];
    V[i] = (V[i]*!has_spiked)+(-0.06)*has_spiked; // i.e. V[i]=-0.06 if spiked[i]
}
""".replace('SCALAR', precision))
Example #7
def cuda_d_matern(matrixC_gpu, nx, ny, cmin, cmax, symm, diff_degree):

    blocksize = 16
    nbkmaxstandard = 10

    mod = cuda.SourceModule(s)

    prefac = 1.0
    if diff_degree < 0.0:
        return None
    if diff_degree >= 10.0:
        #TODO gaussian
        return None
    if diff_degree != 1.0:
        ga = scipy.special.gamma(diff_degree)
        prefac = pow(0.5, (diff_degree - 1.0)) / ga

    snu = sqrt(diff_degree) * 2.0
    fl = floor(diff_degree)  # fl = N
    rem = diff_degree - fl

    if ((matrixC_gpu != None) and (symm == True) and (nx == ny)):
        nb = nx / blocksize
        if ((cmin == 0) and (cmax == nx)):

            #Load cuda function
            cuda_fct = mod.get_function("d_fillMatrix_rkbesl_symmetric_full")

            if (fl + 1 > nbkmaxstandard):
                #TODO: Recompile function with larger NBKMAX and load module
                return None

            #Convert input parameters
            diff_degree = numpy.float64(diff_degree)
            snu = numpy.float64(snu)
            rem = numpy.float64(rem)
            fl = numpy.uint32(fl)
            prefac = numpy.float64(prefac)
            nx = numpy.uint32(nx)

            #Execute cuda function
            cuda_fct(matrixC_gpu,
                     diff_degree,
                     snu,
                     rem,
                     fl,
                     fl,
                     prefac,
                     nx,
                     block=(blocksize, blocksize, 1),
                     grid=(nb, nb))

            #Output
            #matrixC_cpu = numpy.ones(nx*nx,numpy.float64)
            #cuda.memcpy_dtoh(matrixC_cpu, matrixC_gpu)
            #print matrixC_cpu

            #return matrixC_gpu
            return matrixC_gpu

    return None
Example #8
from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel

from skcuda.fft import fft, ifft, Plan
from skcuda.linalg import conj
# from skcuda.linalg import sqrt as cusqrt

cross_correlation_mod = driver.SourceModule("""
    __global__ void update_scores_angles(float *scores, int *angles, float *ccc_map, int angleID)
    {
        int idx = threadIdx.x + threadIdx.y*10 + threadIdx.z*100;

        if (scores[idx] < ccc_map[idx]) {
            scores[idx] = ccc_map[idx];
            angles[idx] = angleID;
        }
    }

    __global__ void insert_volume(float *padded_volume, float *templ, int sizePad, int sizeTem)
    {
        /* 'template' is a C++ keyword, so the parameter is renamed here */
        int idx = threadIdx.x + threadIdx.y * blockDim.x + blockDim.x * blockDim.y * blockIdx.x;
        int pad_idx = (1 + 2 * idx) * sizePad;
        padded_volume[pad_idx] = templ[idx];
    }
""")


update_scores_angles = cross_correlation_mod.get_function('update_scores_angles')


class TemplateMatchingPlan():
    def __init__(self, volume, template, gpu):
Example #9
	def get_kernel_initmem( s ):
		mod = cuda.SourceModule( file('./gpu_core/initmem.cu','r').read() )
		return mod.get_function("initmem")
Example #10
	def prepare_kernels( s ):
		mod = cuda.SourceModule( file('./gpu_core/source.cu','r').read() )
		s.update_src = mod.get_function("update_src")

		Db = ( s.tpb, 1, 1 )
		s.update_src.prepare( "iiiiP", block=Db )
Example #11
	def get_module( s ):
		s.mod = cuda.SourceModule( file('./gpu_core/cpml_non_kapa.cu','r').read().replace('NPMLp2',str(2*(Npml+1))).replace('NPMLp',str(Npml+1)).replace('NPML',str(Npml)  ) )
Example #12
        u = math.exp(vDt)
        d = math.exp(-vDt)
        pu = (If - d) / (u - d)
        pd = 1.0 - pu
        puByDf = pu * Df
        pdByDf = pd * Df
        processed = [data.S, data.X, vDt, puByDf, pdByDf]
        #print processed
        h_OptionData.append(processed)
        comp = binomialOptionFromProcessed(processed)
        comps.append(comp)
    array = numpy.array(h_OptionData,dtype=numpy.float32)

    a_gpu = cuda.to_device(array)
    b_gpu = cuda.mem_alloc(4 * len(h_OptionData))  # one float32 result per option
    func = mod.get_function("binomialOptionsKernel")
    print "Loaded function..."
    func(a_gpu, b_gpu, block=(5,1,1), grid=(len(h_OptionData),1))
    result = numpy.zeros((len(h_OptionData),1),dtype=numpy.float32)
    cuda.memcpy_dtoh(result,b_gpu)
    return zip(result.tolist(),comps)

optionData = []
for i in range(100):
    optionData.append(OptionData(random.uniform(5.0,30.0),random.uniform(1.0,100.0),random.uniform(0.25,10.0),0.06,0.10))
mod = cuda.SourceModule(binomial_kernel)
result = binomialOptionsGPU(optionData,mod)
for res,comp in result:
    print str(comp)+" : "+str(res[0])

Example #13
import pycuda.driver as cuda
import pycuda.autoinit
import numpy

realrow = numpy.array([1.0, 2.0, 3.0, 4.0, 5.0],
                      dtype=numpy.float32).reshape(1, 5)

mod_copy_texture = cuda.SourceModule("""
texture<float, 1> tex;
texture<float, 1> tex2;

__global__ void copy_texture_kernel(float * data) {
    int ty=threadIdx.y;
    //data[ty] = tex1D(tex, (float)(ty));
    data[ty] = tex1D(tex, (float)(ty)/2.0f);
}
""")

copy_texture_func = mod_copy_texture.get_function("copy_texture_kernel")
texref = mod_copy_texture.get_texref("tex")
tex2ref = mod_copy_texture.get_texref("tex2")

cuda.matrix_to_texref(realrow, texref, order="C")

texref.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
texref.set_filter_mode(cuda.filter_mode.LINEAR)

gpu_output = numpy.zeros_like(realrow)
copy_texture_func(cuda.Out(gpu_output), block=(1, 1, 1), texrefs=[texref])

print "Output:"
Example #14
mod = drv.SourceModule("""
__global__ void stateupdate(SCALAR *V, SCALAR *ge, SCALAR *gi)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    SCALAR V__tmp = (ge[i]+gi[i]-(V[i]+0.049))/0.02;
    SCALAR ge__tmp = -ge[i]/0.005;
    SCALAR gi__tmp = -gi[i]/0.01;
    V[i] += 0.0001*V__tmp;
    ge[i] += 0.0001*ge__tmp;
    gi[i] += 0.0001*gi__tmp;
}

__device__ unsigned int irng(unsigned int n)
{
    return ((n*1103515245+12345)/65536)%32768;
}
__device__ SCALAR srng(int i, int j)
{
    // formula guarantees srng(i,i)=0 which is useful for weight matrices
    return (SCALAR)(irng(irng((unsigned int)i)^irng((unsigned int)j)))/32768.0;
}
__device__ bool weightfunc(int i, int j)
{
    return srng(i,j)<SPARSENESS;
}

__global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    bool this_spiked = V[i]>-0.05; 
    spiked[i] = this_spiked;
    if(this_spiked) // && i<N) // can leave out i<N check if N%blocksize=0
    {
        unsigned int j = atomicInc(global_j, N);
        spikes[j] = i;
    }
}

__global__ void single_thread_threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N)
{
    unsigned int j = 0;
    for(int i=0;i<N;i++)
    {
        if(V[i]>-0.05)
        {
            spiked[i] = true;
            spikes[j++] = i;
        } else
        {
            spiked[i] = false;
        }
    }
    *global_j = j;
}

__global__ void propagate(int *spikes, int numspikes, SCALAR *v, SCALAR W, int N, int offset)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for(int j=0; j<numspikes; j++)
        v[i] += weightfunc(i,spikes[j]+offset)*W;
}

__global__ void reset(SCALAR *V, bool *spiked)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    V[i] = (V[i]*!spiked[i])+(-0.06)*spiked[i]; // i.e. V[i]=-0.06 if spiked[i]
}
""".replace('SCALAR', precision).replace('SPARSENESS', str(sparseness)))
Example #15
mod = drv.SourceModule("""
__global__ void stateupdate(SCALAR *V, SCALAR *ge, SCALAR *gi)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x; 
    SCALAR V__tmp = (ge[i]+gi[i]-(V[i]+0.049))/0.02;
    SCALAR ge__tmp = -ge[i]/0.005;
    SCALAR gi__tmp = -gi[i]/0.01;
    V[i] += 0.0001*V__tmp;
    ge[i] += 0.0001*ge__tmp;
    gi[i] += 0.0001*gi__tmp;
}

__global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    bool this_spiked = V[i]>-0.05; 
    spiked[i] = this_spiked;
    if(this_spiked) // && i<N) // can leave out i<N check if N%blocksize=0
    {
        unsigned int j = atomicInc(global_j, N);
        spikes[j] = i;
    }
}

__global__ void single_thread_threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N)
{
    unsigned int j = 0;
    for(int i=0;i<N;i++)
    {
        if(V[i]>-0.05)
        {
            spiked[i] = true;
            spikes[j++] = i;
        } else
        {
            spiked[i] = false;
        }
    }
    *global_j = j;
}

__global__ void propagate(int *spikes, int numspikes, SCALAR *v, SCALAR *W, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for(int j=0; j<numspikes; j++)
        v[i] += W[i+N*spikes[j]];
}

__global__ void propagate_spike(int spike, SCALAR *alldata, int *allj, int *rowind, SCALAR *V)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int startindex = rowind[spike];
    if(i>=rowind[spike+1]-startindex) return;
    i += startindex;
    V[allj[i]] += alldata[i];
}

__global__ void reset(SCALAR *V, bool *spiked)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    V[i] = (V[i]*!spiked[i])+(-0.06)*spiked[i]; // i.e. V[i]=-0.06 if spiked[i]
}
""".replace('SCALAR', precision))
Example #16
mod = drv.SourceModule("""
/*__global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SCALAR V = V_arr[i];
    SCALAR ge = ge_arr[i];
    SCALAR gi = gi_arr[i]; 
//    SCALAR V__tmp = (ge+gi-(V+0.049))/0.02;
//    SCALAR ge__tmp = -ge/0.005;
//    SCALAR gi__tmp = -gi/0.01;
    SCALAR V__tmp = (ge+gi-(V+0.049))*50;
    SCALAR ge__tmp = -ge*200;
    SCALAR gi__tmp = -gi*100;
    V_arr[i] = V+0.0001*V__tmp;
    ge_arr[i] = ge+0.0001*ge__tmp;
    gi_arr[i] = gi+0.0001*gi__tmp;
}*/

__global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SCALAR V = V_arr[i];
    SCALAR ge = ge_arr[i];
    SCALAR gi = gi_arr[i];
    V_arr[i] = V+1;
    ge_arr[i] = ge+1;
    gi_arr[i] = gi+1;
}

__global__ void stateupdate_noglobal(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SCALAR V = 0.;//V_arr[i];
    SCALAR ge = 0.;//ge_arr[i];
    SCALAR gi = 0.;//gi_arr[i]; 
    SCALAR V__tmp = (ge+gi-(V+0.049))/0.02;
    SCALAR ge__tmp = -ge/0.005;
    SCALAR gi__tmp = -gi/0.01;
    V = V+0.0001*V__tmp;
    ge = ge+0.0001*ge__tmp;
    gi = gi+0.0001*gi__tmp;
}
""".replace('SCALAR', precision))
Example #17
def cholesky_gpu(matrixA_gpu, matrixA_size, dtype, blocksize):
    if dtype_names[dtype] != 'float':
        raise NotImplementedError, 'Double precision not working yet.'

    # Compile a kernel for this dtype and blocksize, if it does not already exist.
    if cholesky_modules.has_key((dtype, blocksize)):
        mod = cholesky_modules[dtype, blocksize]
    else:
        s = templ_subs(cholesky_template,
                       blocksize=blocksize,
                       dtype=dtype_names[dtype])
        cholesky_sources[dtype, blocksize] = s
        mod = cholesky_modules[dtype, blocksize] = cuda.SourceModule(s)

    matrixA_size = numpy.uint32(matrixA_size)
    matrixBlocks = numpy.uint32(matrixA_size / blocksize)
    matrixRest = matrixA_size % blocksize

    if ((matrixA_gpu == None) or (matrixA_size == 0)):
        return None

    cuda_fct_topleft = mod.get_function("d_choldc_topleft")
    cuda_fct_strip = mod.get_function("d_choldc_strip")
    cuda_fct_diagupdate = mod.get_function("d_choldc_diagupdate")
    cuda_fct_loupdate = mod.get_function("d_choldc_loupdate")

    i = (int)(matrixBlocks)
    j = i

    while i > 2:
        logridx = 1
        logridy = i - 2
        stripgridx = i - 1
        cuda_fct_topleft(matrixA_gpu,
                         matrixA_size,
                         numpy.uint32(j - i),
                         block=(blocksize, blocksize, 1),
                         grid=(1, 1))
        cuda_fct_strip(matrixA_gpu,
                       matrixA_size,
                       numpy.uint32(j - i),
                       block=(blocksize, blocksize, 1),
                       grid=(stripgridx, 1))
        cuda_fct_diagupdate(matrixA_gpu,
                            matrixA_size,
                            numpy.uint32(j - i),
                            block=(blocksize, blocksize, 1),
                            grid=(stripgridx, 1))
        cuda_fct_loupdate(matrixA_gpu,
                          matrixA_size,
                          matrixBlocks,
                          numpy.uint32(j - i),
                          block=(blocksize, blocksize, 1),
                          grid=(logridx, logridy))
        i = i - 1
    if (j > 1):
        cuda_fct_topleft(matrixA_gpu,
                         matrixA_size,
                         numpy.uint32(j - 2),
                         block=(blocksize, blocksize, 1),
                         grid=(1, 1))
        cuda_fct_strip(matrixA_gpu,
                       matrixA_size,
                       numpy.uint32(j - 2),
                       block=(blocksize, blocksize, 1),
                       grid=(1, 1))
        cuda_fct_diagupdate(matrixA_gpu,
                            matrixA_size,
                            numpy.uint32(j - 2),
                            block=(blocksize, blocksize, 1),
                            grid=(1, 1))

    cuda_fct_topleft(matrixA_gpu,
                     matrixA_size,
                     numpy.uint32(j - 1),
                     block=(blocksize, blocksize, 1),
                     grid=(1, 1))
    cuda.Context.synchronize()
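# Hedged usage sketch (assumptions only; cholesky_modules, cholesky_sources, dtype_names
# and templ_subs are defined elsewhere in this module):
#   A = a symmetric positive-definite float32 matrix, n x n with n a multiple of blocksize
#   A_gpu = cuda.to_device(A)
#   cholesky_gpu(A_gpu, n, A.dtype, blocksize=16)
#   cuda.memcpy_dtoh(A, A_gpu)   # the Cholesky factor overwrites A on the device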
Example #18
	def get_kernel_initmem( s ):
		fpath = '%s/core/initmem.cu' % base_dir
		mod = cuda.SourceModule( file( fpath,'r' ).read() )
		return mod.get_function("initmem")
Example #19
if __name__ == '__main__':
    import pycuda.autoinit
    from pycuda import driver as drv

    def gpuarrstatus(z):
        return '(cpu_changed=' + str(
            z._cpu_data_changed) + ', gpu_changed=' + str(
                z._gpu_data_changed) + ')'

    x = array([1, 2, 3, 4], dtype=numpy.float32)
    print '+ About to initialise GPUBufferedArray'
    y = GPUBufferedArray(x)
    print '- Initialised GPUBufferedArray', gpuarrstatus(y)
    print '+ About to set item on CPU array', gpuarrstatus(y)
    y[3] = 5
    print '- Set item on CPU array', gpuarrstatus(y)
    mod = drv.SourceModule('''
    __global__ void doubleit(float *y)
    {
        int i = threadIdx.x;
        y[i] *= 2.0;
    }
    ''')
    doubleit = mod.get_function("doubleit")
    print '+ About to call GPU function', gpuarrstatus(y)
    doubleit(y.gpu_array, block=(4, 1, 1))
    print '- Called GPU function', gpuarrstatus(y)
    print '+ About to print CPU array', gpuarrstatus(y)
    print y
    print '- Printed CPU array', gpuarrstatus(y)
Example #20
        d_Result[gmemPos] = sum;        
        //d_Result[gmemPos] = 128;
        smemPos += smemStride;
        gmemPos += gmemStride;
    }
}
'''
template = string.Template(template)
code = template.substitute(KERNEL_RADIUS = KERNEL_RADIUS,  
                           KERNEL_W = KERNEL_W,  
                           COLUMN_TILE_H=COLUMN_TILE_H,  
                           COLUMN_TILE_W=COLUMN_TILE_W,  
                           ROW_TILE_W=ROW_TILE_W,  
                           KERNEL_RADIUS_ALIGNED=KERNEL_RADIUS_ALIGNED)

module = cuda.SourceModule(code)
convolutionRowGPU = module.get_function('convolutionRowGPU')
convolutionColumnGPU = module.get_function('convolutionColumnGPU')
d_Kernel_rows = module.get_global('d_Kernel_rows')[0]
d_Kernel_columns = module.get_global('d_Kernel_columns')[0]

# Helper functions for computing alignment...
def iDivUp(a, b):
    # Round a / b to nearest higher integer value
    a = numpy.int32(a)
    b = numpy.int32(b)
    return (a / b + 1) if (a % b != 0) else (a / b)

def iDivDown(a, b):
    # Round a / b to nearest lower integer value
    a = numpy.int32(a)
Example #21
def _cudaIsBound(data, truncate, ratio):
    bsize = __CUDA_BLOCK_SIZE
    import pycuda.driver as cuda
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    my_stream = cuda.Stream()
    cuda.init()
    assert cuda.Device.count() >= 1

    mass_scale_factor = 1.0 / (data['CellMass'].max())
    m = (data['CellMass'] * mass_scale_factor).astype('float32')
    assert (m.size > bsize)

    gsize = int(math.ceil(float(m.size) / bsize))
    assert (gsize > 16)

    # Now the tedious process of rescaling our values...
    length_scale_factor = data['dx'].max() / data['dx'].min()
    x = ((data['x'] - data['x'].min()) * length_scale_factor).astype('float32')
    y = ((data['y'] - data['y'].min()) * length_scale_factor).astype('float32')
    z = ((data['z'] - data['z'].min()) * length_scale_factor).astype('float32')
    p = na.zeros(z.shape, dtype='float32')

    x_gpu = cuda.mem_alloc(x.size * x.dtype.itemsize)
    y_gpu = cuda.mem_alloc(y.size * y.dtype.itemsize)
    z_gpu = cuda.mem_alloc(z.size * z.dtype.itemsize)
    m_gpu = cuda.mem_alloc(m.size * m.dtype.itemsize)
    p_gpu = cuda.mem_alloc(p.size * p.dtype.itemsize)
    for ag, a in [(x_gpu, x), (y_gpu, y), (z_gpu, z), (m_gpu, m), (p_gpu, p)]:
        cuda.memcpy_htod(ag, a)
    source = """

      extern __shared__ float array[];

      __global__ void isbound(float *x, float *y, float *z, float *m,
                              float *p, int *nelem)
      {

        /* My index in the array */
        int idx1 = blockIdx.x * blockDim.x + threadIdx.x;
        /* Note we are setting a start index */
        int idx2 = blockIdx.y * blockDim.x;
        int offset = threadIdx.x;

        /* Here we're just setting up convenience pointers to our
           shared array */

        float* x_data1 = (float*) array;
        float* y_data1 = (float*) &x_data1[blockDim.x];
        float* z_data1 = (float*) &y_data1[blockDim.x];
        float* m_data1 = (float*) &z_data1[blockDim.x];

        float* x_data2 = (float*) &m_data1[blockDim.x];
        float* y_data2 = (float*) &x_data2[blockDim.x];
        float* z_data2 = (float*) &y_data2[blockDim.x];
        float* m_data2 = (float*) &z_data2[blockDim.x];

        x_data1[offset] = x[idx1];
        y_data1[offset] = y[idx1];
        z_data1[offset] = z[idx1];
        m_data1[offset] = m[idx1];

        x_data2[offset] = x[idx2 + offset];
        y_data2[offset] = y[idx2 + offset];
        z_data2[offset] = z[idx2 + offset];
        m_data2[offset] = m[idx2 + offset];

        __syncthreads();

        float tx, ty, tz;

        float my_p = 0.0;

        if(idx1 < %(p)s) {
            for (int i = 0; i < blockDim.x; i++){
                if(i + idx2 < idx1 + 1) continue;
                tx = (x_data1[offset]-x_data2[i]);
                ty = (y_data1[offset]-y_data2[i]);
                tz = (z_data1[offset]-z_data2[i]);
                my_p += m_data1[offset]*m_data2[i] /
                    sqrt(tx*tx+ty*ty+tz*tz);
            }
        }
        p[idx1] += my_p;
        __syncthreads();
      }
    """
    mod = cuda.SourceModule(source % dict(p=m.size))
    func = mod.get_function('isbound')
    mylog.info("Running CUDA functions.  May take a while.  (%0.5e, %s)",
               x.size, gsize)
    import pycuda.tools as ct
    t1 = time.time()
    ret = func(x_gpu,
               y_gpu,
               z_gpu,
               m_gpu,
               p_gpu,
               shared=8 * bsize * m.dtype.itemsize,
               block=(bsize, 1, 1),
               grid=(gsize, gsize),
               time_kernel=True)
    cuda.memcpy_dtoh(p, p_gpu)
    p1 = p.sum()
    if na.any(na.isnan(p)): raise ValueError
    return p1 * (length_scale_factor / (mass_scale_factor**2.0))
Example #22
	hermite_eval = cuda.SourceModule("""
	#define A_TERMS %(lenA)d
	#define TERMS %(termsAlpha)d
	#define CLUSTERS %(clusters)d
	#define POLY_TERMS %(polyTerms)d
	#define BLOCKSIZE %(blocksize)d
	#define SQRT_2 1.4142135623730951f

	#define SIGMA %(sigma)f
	#define LEN_ALPHA %(len_alpha)d
	#define NUM_TERMS %(num_terms)d
	#define OPTS3 %(opts3)d
	#define NUM_CLUSTERS %(num_clusters)d

	#define DEST_PER_THREAD 2

	
		// slightly optimised evaluation -- do all calculations for source clusters
		// at once -- save on memory bandwidth
		__global__ void eval_hermite2(float *r, float *A, float *tx, float *ty, float *sb,
		 							 float *alpha, float *H)
		{		
			float result, x, y;
			int alpha1;
			float h1, h2;
			int i, k;
		
			float pre_mult, t_x, t_y;
		
			// shared memory
			__shared__ float shared_alpha[TERMS];
			__shared__ float shared_A[A_TERMS];
			__shared__ float shared_sb[CLUSTERS];
			__shared__ float shared_H[POLY_TERMS];
		
			////////////////////////////////
			// Read vars into shared memory
			// WARNING: Each block needs more threads than (TERMS + A_TERMS + POLY_TERMS + CLUSTERS)
			// otherwise it won't work. 
			////////////////////////////////
			// select what each thread reads
			if (threadIdx.x < TERMS){
			// shared_alpha case
				i = 0;
				k = 0;
			} else if (threadIdx.x < TERMS + A_TERMS) {
			// shared_A case
				i = 1;
				k = - TERMS;
			} else if (threadIdx.x < TERMS + A_TERMS + POLY_TERMS) {
			// shared_H case
				i = 2;
				k = - TERMS - A_TERMS;
			} else if (threadIdx.x < TERMS + A_TERMS + POLY_TERMS + CLUSTERS) {
			// shared_sb case
				i = 3;
				k = - TERMS - A_TERMS - POLY_TERMS;
			} else {
			// No read case
				i = 4;
				k = 0;
			}
			// diverge the threads to have independent reads
			switch (i){
				case 0:
					shared_alpha[threadIdx.x + k] = alpha[threadIdx.x + k];
					break;
				case 1:
					shared_A[threadIdx.x + k] = A[threadIdx.x + k];
					break;
				case 2:
					shared_H[threadIdx.x + k] = H[threadIdx.x + k];
					break;
				case 3:
					shared_sb[threadIdx.x + k] = sb[threadIdx.x + k];
					break;
				default:
					break;
			}
		
			//__threadfence_block();
			__syncthreads();

			if (OPTS3 < threadIdx.x + BLOCKSIZE*blockIdx.x)
			{
				return;
			}

			t_x = tx[threadIdx.x + BLOCKSIZE*blockIdx.x];
			t_y = ty[threadIdx.x + BLOCKSIZE*blockIdx.x];
			result = 0.0;
		
			///////////////////////////////
			// Main loop, flops: (NumClusters * (19 + LenAlpha/2 * (14 + 4 * NumTerms)) + 2)
			///////////////////////////////
		
			// run through this code for each cluster center
			for (k=0; k < NUM_CLUSTERS; k++) {

				// distance operator
				x = (t_x - shared_sb[k*2+0]) / SQRT_2 / SIGMA;
				//x = (t_x - sb[k*2+0]) / SQRT_2 / SIGMA;
				y = (t_y - shared_sb[k*2+1]) / SQRT_2 / SIGMA;
				//y = (t_y - sb[k*2+1]) / SQRT_2 / SIGMA;

				pre_mult = exp(-(x*x))*exp(-(y*y));

				// look at shared memory - all variables called in
				// poly_eval should be in shared memory

				for (i=0; i < LEN_ALPHA/2; i++)
				{
					alpha1 = shared_alpha[i*2];
				
					// I avoid the inner loop and get a superb speedup, but it needs to be hardcoded
					// is it possible to do the same using MACROS? or generating this from python?
					// ONLY USE p=5 here
					h1 = h2 = 0.0f;
					h1 = shared_H[NUM_TERMS*alpha1 + 0] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 1] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 2] + x*h1;
					/*
					h1 = shared_H[NUM_TERMS*alpha1 + 3] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 4] + x*h1;
					
					h1 = shared_H[NUM_TERMS*alpha1 + 5] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 6] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 7] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 8] + x*h1;
					
					h1 = shared_H[NUM_TERMS*alpha1 + 9] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 10] + x*h1;
					h1 = shared_H[NUM_TERMS*alpha1 + 11] + x*h1;
					*/
				
					//result += alpha1;
				
					alpha1 = shared_alpha[i*2 + 1];
					h2 = shared_H[NUM_TERMS*alpha1 + 0] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 1] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 2] + y*h2;
					
					/*
					h2 = shared_H[NUM_TERMS*alpha1 + 3] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 4] + y*h2;
					
					h2 = shared_H[NUM_TERMS*alpha1 + 5] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 6] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 7] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 8] + y*h2;
					
					h2 = shared_H[NUM_TERMS*alpha1 + 9] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 10] + y*h2;
					h2 = shared_H[NUM_TERMS*alpha1 + 11] + y*h2;
					*/
					result += shared_A[k*LEN_ALPHA/2+i]*pre_mult*h1*h2;
				}
			}
			r[threadIdx.x + BLOCKSIZE*blockIdx.x] += result;
		}
	""" % {'lenA':len(A_curr),'termsAlpha':len(alpha),'clusters':len(sb_curr),'polyTerms':len(H), 'blocksize':blocksize,
	'sigma': delta, 'len_alpha': len(alpha), 'num_terms': p, 'opts3': len(tx), 'num_clusters':clusters_this_call},
	nvcc="nvcc",options=['-use_fast_math'], keep=False, no_extern_c=False)
Example #23
N = 512 * 8
spikes = [i for i in range(N) if numpy.random.rand() < 0.1]
blocksize = 512
block = (blocksize, 1, 1)
grid = (N / blocksize, 1)
repeats = 1000

print 'N:', N
print 'numspikes:', len(spikes), 'proportion', float(len(spikes)) / N
print 'repeats:', repeats
print 'block:', block, 'grid:', grid

mod = drv.SourceModule("""
__global__ void propagate(int *spikes, int numspikes, float *v, float *W, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    for(int j=0; j<numspikes; j++)
        v[i] += W[i+N*spikes[j]];
}
""")

propagate = mod.get_function("propagate")

W = numpy.random.randn(N, N)
W_gpu = gpuarray.to_gpu(numpy.array(W, dtype=numpy.float32))

v = gpuarray.to_gpu(numpy.zeros(N, dtype=numpy.float32))
v_pre = v.get()

gpu_spikes = drv.mem_alloc(4 * len(spikes))
spikes = numpy.array(spikes, dtype=numpy.int32)  # int32 matches the 4*len(spikes) allocation and the int* kernel argument
drv.memcpy_htod(gpu_spikes, spikes)
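# Hedged sketch (not in the original): time repeated propagation calls.
import time
start = time.time()
for _ in range(repeats):
    propagate(gpu_spikes, numpy.int32(len(spikes)), v.gpudata, W_gpu.gpudata,
              numpy.int32(N), block=block, grid=grid)
drv.Context.synchronize()
print 'time per call:', (time.time() - start) / repeats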
Example #24
import pycuda.driver as drv
from pycuda.gpuarray import GPUArray
from pycuda import gpuarray
import bisect
import numpy, pylab, time, random
from scipy import weave

N = 1024

mod = drv.SourceModule(
    '''
#include <chag/pp/compact.cuh>

__global__ void test()
{
 int i = blockIdx.x * blockDim.x + threadIdx.x;
 i = i+1;
}
''',
    options=[
        r'-I"C:\Documents and Settings\goodman.CELERI\Bureau\source-20090929"'
    ],
    no_extern_c=True)

#x = numpy.random.randn(N)
#xgpu = gpuarray.to_gpu(x)
#ygpu = gpuarray.to_gpu(numpy.zeros(x.shape))
#vec = gpuarray.to_gpu(numpy.zeros(1, dtype=int))
#z = x[x>=0.0]
#
#xgpu_start = int(xgpu.gpudata)
#xgpu_end = xgpu_start+N*8
Example #25
mod = drv.SourceModule("""
__global__ void stateupdate(SCALAR *V_arr, SCALAR *ge_arr, SCALAR *gi_arr)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    SCALAR V = V_arr[i];
    SCALAR ge = ge_arr[i];
    SCALAR gi = gi_arr[i]; 
    SCALAR V__tmp = (ge+gi-(V+0.049))/0.02;
    SCALAR ge__tmp = -ge/0.005;
    SCALAR gi__tmp = -gi/0.01;
    V_arr[i] = V+0.0001*V__tmp;
    ge_arr[i] = ge+0.0001*ge__tmp;
    gi_arr[i] = gi+0.0001*gi__tmp;
}

__global__ void threshold(SCALAR *V, int *spikes, bool *spiked, unsigned int *global_j, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    bool this_spiked = V[i]>-0.05; 
    spiked[i] = this_spiked;
    if(this_spiked)
    {
        spikes[atomicInc(global_j, N)] = i;
    }
}

/////// TODO: next three functions are untested, but should be a non-blocking threshold function if N<=512*512

__global__ void threshold_blocksumcount(SCALAR *V, unsigned int *blocksumcount)
{ 
    __shared__ unsigned short int partialsum[BLOCKSIZE];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int t = threadIdx.x;
    int b = blockIdx.x;
    bool this_spiked = V[i]>-0.05;
    partialsum[t] = (unsigned short int)this_spiked;
    for(unsigned int stride=blockDim.x/2; stride>=1; stride/=2)
    {
        __syncthreads();
        if(t<stride)
            partialsum[t] += partialsum[t+stride];
    }
    __syncthreads();
    if(t==0)
        blocksumcount[b] = partialsum[t];
}

__global__ void threshold_cumsum(unsigned int *blocksumcount, unsigned int *cumblocksumcount)
{
    __shared__ unsigned int partialsum[BLOCKSIZE];
    int t = threadIdx.x;
    partialsum[t] = blocksumcount[t];
    for(unsigned int stride=1; stride<blockDim.x; stride*=2)
    {
        __syncthreads();
        if(t>=stride)
            partialsum[t] += partialsum[t-stride];
    }
    cumblocksumcount[t+1] = partialsum[t];
}

__global__ void threshold_compact(SCALAR *V, int *spikes, unsigned int *blocksumcount, unsigned int *cumblocksumcount)
{
    __shared__ unsigned short int partialsum[BLOCKSIZE];
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int t = threadIdx.x;
    int b = blockIdx.x;
    // only compact those blocks with some spikes in (many will not in general, and this introduces no divergence)
    if(blocksumcount[b]>0)
    {
        bool this_spiked = V[i]>-0.05;
        partialsum[t] = (unsigned short int)this_spiked;
        for(unsigned int stride=1; stride<blockDim.x; stride*=2)
        {
            __syncthreads();
            if(t>=stride)
                partialsum[t] += partialsum[t-stride];
        }
        __syncthreads();
        if(this_spiked)
          spikes[(unsigned int)partialsum[t]+cumblocksumcount[b]-1] = i;
        //spikes[i] = partialsum[t]+cumblocksumcount[b];
    }
}
""".replace('SCALAR', precision).replace('BLOCKSIZE', str(blocksize)))
Example #26
    psizExf = cuda.mem_alloc(size_psiz)
    psizExb = cuda.mem_alloc(size_psiz)
    psizEyf = cuda.mem_alloc(size_psiz)
    psizEyb = cuda.mem_alloc(size_psiz)
    psizHxf = cuda.mem_alloc(size_psiz)
    psizHxb = cuda.mem_alloc(size_psiz)
    psizHyf = cuda.mem_alloc(size_psiz)
    psizHyb = cuda.mem_alloc(size_psiz)

    # Copy the arrays from host to device
    cuda.memcpy_htod(devCEx, CEx)
    cuda.memcpy_htod(devCEy, CEy)
    cuda.memcpy_htod(devCEz, CEz)

    # Get the module from the cuda files
    mod_common = cuda.SourceModule(file('common.cu', 'r').read())
    mod_dielectric = cuda.SourceModule(file('dielectric.cu', 'r').read())
    mod_source = cuda.SourceModule(file('source.cu', 'r').read())
    mod_cpml = cuda.SourceModule(
        file('cpml.cu',
             'r').read().replace('NPMLp2', str(2 * (Npml + 1))).replace(
                 'NPMLp', str(Npml + 1)).replace('NPML', str(Npml)))

    # Get the global pointer from the module
    rcmbE = mod_cpml.get_global("rcmbE")
    rcmbH = mod_cpml.get_global("rcmbH")
    rcmaE = mod_cpml.get_global("rcmaE")
    rcmaH = mod_cpml.get_global("rcmaH")

    #print rcmaE
    #print bE
Example #27
def get_cuda_func(z_max, max_arr_size, total_image_count):
    raw = open('/home/john/python_scripts/cuda/cuda_code.c', 'rb').read()
    processed = raw % {'z_max':z_max, 'max':max_arr_size, 'total_image_count':total_image_count}
    mod = cuda.SourceModule(processed)
    return mod.get_function("modify")
Example #28
mod = cuda.SourceModule("""
    #define Z_MAX %(z_max)d
    __device__ float calc(float val_a, float val_b, int index)
    {
        /*if (index %% Z_MAX == 0)
            return 0.0;*/
        return val_a * val_b / 256.0;
        //return min(255.0, val_a+val_b);
        //return max(0.0, val_a-val_b);
    }
    __global__ void modify(float *a, float *b, float *c)
    {
        int row_len = gridDim.x*blockDim.x*Z_MAX;
        int block_offset_x = blockDim.x*blockIdx.x*Z_MAX;
        int block_offset_y = blockDim.y*blockIdx.y*row_len;
        int i = threadIdx.x*Z_MAX + threadIdx.y*row_len + threadIdx.z + block_offset_x + block_offset_y;
        if (i > %(max)d)
            return;
        c[i] = calc(a[i], b[i], i);
    }
    __global__ void modify_linear(float *a, float *b, float *c)
    {
        int block_offset_x = blockDim.x*blockIdx.x;
        int i = threadIdx.x + block_offset_x;
        /*if (i > %(max)d)
            return;*/
        c[i] = calc(a[i], b[i], i);
    }
    """ % {'z_max':3, 'max':2896782})#{'z_max':d, 'max':img1_arr.size})
if LINEAR:
Example #29
import pycuda.autoinit as autoinit
import pycuda.driver as drv
from pycuda.gpuarray import GPUArray
from pycuda import gpuarray
import numpy, pylab, time

N = 1000000
x0 = 3.2
block = (512, 1, 1)
grid = (int(N / 512) + 1, 1)

mod = drv.SourceModule("""
__global__ void threshold(float *x, float x0, int *J, unsigned int *global_j, int N)
{
 int i = blockIdx.x * blockDim.x + threadIdx.x;
 if(x[i]>x0 && i<N){
  unsigned int j = atomicInc(global_j, N);
  J[j] = i;
 }
}
""")

threshold = mod.get_function("threshold")

v = gpuarray.to_gpu(numpy.array(numpy.random.randn(N), dtype=numpy.float32))

J = drv.mem_alloc(4 * N)
global_j = drv.mem_alloc(4)

Jret = numpy.zeros(N, dtype=numpy.int32)  # int32 to match the kernel's int *J argument and the 4*N allocation
jret = numpy.zeros(1, dtype=numpy.uint32)
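# Hedged completion (not in the original): zero the counter, launch the threshold
# kernel and copy the matching indices back to the host.
drv.memcpy_htod(global_j, jret)
threshold(v.gpudata, numpy.float32(x0), J, global_j, numpy.int32(N),
          block=block, grid=grid)
drv.memcpy_dtoh(jret, global_j)
drv.memcpy_dtoh(Jret, J)
print 'values above threshold:', jret[0]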