import numpy as np

import pycuda.autoinit  # noqa: F401  (creates the CUDA context)
import pycuda.driver as drv
from pycuda.compiler import SourceModule

mod = SourceModule("""
__global__ void Fun_snr(float *S_td, float *f_picture,int *N,int *Modesize)
{
	const int Imgw = N[0];
	const int modesize = Modesize[0];
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	float s_td=0.0;
	float sum=0.0;   
	if (blockIdx.x < 4060 && !((i+1)%Imgw/(Imgw-modesize+1)))  // only compute where the modesize window fits inside the image
		{
		for(int p=0 ; p<modesize ; p++)
			for(int q=0 ; q<modesize ; q++)
			{
			sum  =sum + f_picture[ (blockIdx.x+Imgw/blockDim.x*p) * blockDim.x + threadIdx.x + q];	
			}
		sum = sum /pow(float(modesize),2);
		for(int p=0 ; p<modesize ; p++)
			for(int q=0 ; q<modesize  ; q++)
			{
			s_td =s_td + pow((f_picture[ (blockIdx.x+Imgw/blockDim.x*p)  * blockDim.x + threadIdx.x + q]-sum),2);
	 		}
		S_td[i]= sqrt(s_td / pow(float(modesize),2));
		}
	else
		{
			S_td[i]=0;
		}
}
__global__ void Fun_ROAD(float *ROAD, float *f_picture,int *N)
{	
	const int Imgw = N[0];
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (blockIdx.x < 4088 && !((i+1)%Imgw/(Imgw-2)))
		{
		float a[9];
		int a_i=0;  // start at 0 so the nine differences fill a[0..8] (starting at 1 overran the array)
		for(int p=0 ; p<3 ; p++)
			for(int q=0 ; q<3; q++)
			{
			a[a_i]=f_picture[(blockIdx.x+Imgw/blockDim.x*p) * blockDim.x + threadIdx.x + q]-f_picture[(blockIdx.x+Imgw/blockDim.x*1) * blockDim.x + threadIdx.x + 1];
			a[a_i]=fabsf(a[a_i]);
			a_i+=1;
	 		}
	 		{
				int ii,j;
				float t;
				for(ii=0;ii<8;ii++)
					for(j=ii+1;j<9;j++)
					{
						if(a[ii]>a[j])
						{
						t=a[ii];
						a[ii]=a[j];
						a[j]=t;
						}
					}
			}
		float a_sum=0;
		for(int kk=0;kk<5;kk++)
			a_sum=a_sum+a[kk];
		ROAD[i]= a_sum;
		}
	else
		{
			ROAD[i]=0;
		}
		__syncthreads();
}
""")
Example #2
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("""
        #include <stdio.h>
        __global__ void hello_world(void)
        {
            printf("Hello world from block (%d, %d) thread (%d, %d, %d)\\n",
                   blockIdx.x, blockIdx.y, threadIdx.x, threadIdx.y, threadIdx.z);
        }
""")

hello_world = mod.get_function('hello_world')

hello_world(block=(1, 2, 3), grid=(2, 1))
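
With block=(1, 2, 3) and grid=(2, 1) this launches two blocks of six threads each, so twelve greeting lines are printed; device-side printf output appears once the context synchronises.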
Example #3
import pycuda.autoinit  # noqa: F401  (creates the CUDA context)
import pycuda.driver as drv
import numpy

from pycuda.compiler import SourceModule
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

multiply_them = mod.get_function("multiply_them")

a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)

dest = numpy.zeros_like(a)
multiply_them(drv.Out(dest),
              drv.In(a),
              drv.In(b),
              block=(400, 1, 1),
              grid=(1, 1))

print(dest - a * b)
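
drv.In and drv.Out are convenience wrappers that allocate device memory and copy the array in before (respectively out after) the launch, so the printed residual dest - a * b should be all zeros.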
Example #4
import pycuda.autoinit  # NOQA  (creates the CUDA context)
from pycuda.compiler import SourceModule

import numpy as np
from sklearn.neighbors import NearestNeighbors

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D  # NOQA

import time

from constants import Constant

with open("src/source.cu", "r") as file:
    source = file.read()

module_calculate_consts = SourceModule(source)  # NOQA

const = Constant()
nn = NearestNeighbors(n_neighbors=const.num_neighbhours)
timer = np.zeros([100, 7])
calc_density = module_calculate_consts.get_function("calc_density")
calc_forces = module_calculate_consts.get_function("calc_forces")
update_pos = module_calculate_consts.get_function("update_pos")
r1 = (np.random.rand(const.num_particles, 3).astype(np.float32) -
      np.array([0.5, 0.5, 0.5]).astype(np.float32)) * 10
v = np.zeros([const.num_particles, 3]).astype(np.float32)
plt.ion()
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for iteration in range(100):
    timer[iteration][0] = time.time()
Example #5
import numpy as np

from pycuda.compiler import SourceModule
from pycuda_test_utils import (
    PyCudaImplementation,
    get_free_bytes,
    get_time,
    time_function,
    free_memory_pool,
    get_median_filter_string,
)

from write_and_read_results import ARRAY_SIZES, write_results_to_file

LIB_NAME = "pycuda"
mode = "sourcemodule"
REFLECT_MODE = "reflect"

median_filter_module = SourceModule(get_median_filter_string())
median_filter = median_filter_module.get_function("median_filter")


def pycuda_median_filter(data, padded_data, filter_height, filter_width):
    N = 32
    median_filter(
        data,
        padded_data,
        np.int32(data.shape[0]),
        np.int32(data.shape[1]),
        np.int32(data.shape[2]),
        np.int32(filter_height),
        np.int32(filter_width),
        block=(data.shape[0], data.shape[1], data.shape[2]),
    )
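
A hypothetical call of the helper above (sketch): the gpuarray arguments, frame shape, and reflect padding are assumptions; only the (data, padded_data, filter_height, filter_width) contract comes from the listing, and the 4x16x16 stack keeps the block within the 1024-thread limit.

import pycuda.autoinit  # noqa: F401  (assuming no context is managed elsewhere)
import pycuda.gpuarray as gpuarray

frames = gpuarray.to_gpu(np.random.rand(4, 16, 16).astype(np.float32))
padded = gpuarray.to_gpu(np.pad(frames.get(), ((0, 0), (1, 1), (1, 1)),
                                mode="reflect"))
pycuda_median_filter(frames, padded, 3, 3)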
Example #6
import numpy as np

import pycuda.autoinit  # noqa: F401
from pycuda import gpuarray
from pycuda.compiler import SourceModule

VecCode = '''
__global__ void vec_ker(int *ints, double *doubles) {     

 int4 f1, f2;

 f1 = *reinterpret_cast<int4*>(ints);
 f2 = *reinterpret_cast<int4*>(&ints[4]);

 printf("First int4: %d, %d, %d, %d\\n", f1.x, f1.y, f1.z, f1.w);
 printf("Second int4: %d, %d, %d, %d\\n", f2.x, f2.y, f2.z, f2.w);
 
 double2 d1, d2;
 
 d1 = *reinterpret_cast<double2*>(doubles);
 d2 = *reinterpret_cast<double2*>(&doubles[2]);
 
 printf("First double2: %f, %f\\n", d1.x, d1.y);
 printf("Second double2: %f, %f\\n", d2.x, d2.y);
 
}'''

vec_mod = SourceModule(VecCode)
vec_ker = vec_mod.get_function('vec_ker')

ints = gpuarray.to_gpu(np.int32([1, 2, 3, 4, 5, 6, 7, 8]))
doubles = gpuarray.to_gpu(np.double([1.11, 2.22, 3.33, 4.44]))

print('Vectorized Memory Test:')

vec_ker(ints, doubles, grid=(1, 1, 1), block=(1, 1, 1))
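
A note on the cast trick: reinterpret_cast to int4 or double2 turns each access into a single 128-bit vector load, which is only legal on 16-byte-aligned pointers; fresh gpuarray allocations are aligned far more strictly than that, so reading eight ints as two int4s (and four doubles as two double2s) is safe here.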
Example #7
import pycuda.autoinit  # noqa: F401
from pycuda.compiler import SourceModule

cu_matrix_kernel = SourceModule("""

#include <math.h>
#include <stdio.h>

#include "texture_fetch_functions.h"
#include "texture_types.h"

#define THREADS_PER_BLOCK 256
#define FIT_RADIUS 6

texture<float, cudaTextureType2DLayered, cudaReadModeElementType> tex;


__device__ void deconvolve3_columns(int width,int height,int rowstride,
                          float *data,float *buffer,float a,float b) {
    float *row;
    float q;
    int i, j;

/*
//    if (( height < 2) || (rowstride > width)) {
//      printf("Failure in deconvolve3_columns: height, rowstride, width, a, b = %d %d %d %f %f\n", height, rowstride, width, a, b);
//		return;
//    }
*/


    if (!height || !width)
        return;
	
    if (height == 1) {
        q = a + 2.0*b;
        for (j = 0; j < width; j++)
            data[j] /= q;
        return;
    }
    if (height == 2) {
        q = a*(a + 2.0*b);
        for (j = 0; j < width; j++) {
            buffer[0] = (a + b)/q*data[j] - b/q*data[rowstride + j];
            data[rowstride + j] = (a + b)/q*data[rowstride + j] - b/q*data[j];
            data[j] = buffer[0];
        }
        return;
    }
	
    /* Special-case first row */
    buffer[0] = a + b;
    /* Inner rows */
    for (i = 1; i < height-1; i++) {
        q = b/buffer[i-1];
        buffer[i] = a - q*b;
        row = data + (i - 1)*rowstride;
        for (j = 0; j < width; j++)
            row[rowstride + j] -= q*row[j];
    }
    /* Special-case last row */
    q = b/buffer[i-1];
    buffer[i] = a + b*(1.0 - q);
    row = data + (i - 1)*rowstride;
    for (j = 0; j < width; j++)
        row[rowstride + j] -= q*row[j];
    /* Go back */
    row += rowstride;
    for (j = 0; j < width; j++)
        row[j] /= buffer[i];
    do {
        i--;
        row = data + i*rowstride;
        for (j = 0; j < width; j++)
            row[j] = (row[j] - b*row[rowstride + j])/buffer[i];
    } while (i > 0);
}


__device__ void deconvolve3_rows(int width,int height,int rowstride,float *data,
                                 float *buffer,float a,float b) {
    float *row;
    float q;
    int i, j;

/*
//    if (( height < 2) || (rowstride > width)) {
//		printf("Failure in deconvolve3_rows\n");
//		return;
//    }
*/


    if (!height || !width)
        return;
	
    if (width == 1) {
        q = a + 2.0*b;
        for (i = 0; i < height; i++)
            data[i*rowstride] /= q;
        return;
    }
    if (width == 2) {
        q = a*(a + 2.0*b);
        for (i = 0; i < height; i++) {
            row = data + i*rowstride;
            buffer[0] = (a + b)/q*row[0] - b/q*row[1];
            row[1] = (a + b)/q*row[1] - b/q*row[0];
            row[0] = buffer[0];
        }
        return;
    }
	
    /* Special-case first item */
    buffer[0] = a + b;
    /* Inner items */
    for (j = 1; j < width-1; j++) {
        q = b/buffer[j-1];
        buffer[j] = a - q*b;
        data[j] -= q*data[j-1];
    }
    /* Special-case last item */
    q = b/buffer[j-1];
    buffer[j] = a + b*(1.0 - q);
    data[j] -= q*data[j-1];
    /* Go back */
    data[j] /= buffer[j];
    do {
        j--;
        data[j] = (data[j] - b*data[j+1])/buffer[j];
    } while (j > 0);
	
    /* Remaining rows */
    for (i = 1; i < height; i++) {
        row = data + i*rowstride;
        /* Forward */
        for (j = 1; j < width-1; j++)
            row[j] -= b*row[j-1]/buffer[j-1];
        row[j] -= b*row[j-1]/buffer[j-1];
        /* Back */
        row[j] /= buffer[j];
        do {
            j--;
            row[j] = (row[j] - b*row[j+1])/buffer[j];
        } while (j > 0);
    }
}




__device__ void resolve_coeffs_2d(int width, int height, int rowstride, float *data) {
    float *buffer;
    int     max;
	
    max = width > height ? width : height;
    buffer = (float *)malloc(max*sizeof(float));
    deconvolve3_rows(width, height, rowstride, data, buffer, 13.0/21.0, 4.0/21.0);
    deconvolve3_columns(width, height, rowstride, data, buffer, 13.0/21.0, 4.0/21.0);
    free(buffer);
}


__device__ float interpolate_2d(float x,float y,int rowstride,float *coeff) {
    float wx[4], wy[4];
    int i, j;
    float v, vx;

/*
//    if (x < 0.0 || x > 1.0 || y < 0.0 || y > 1.0) {
//		printf("interpolate_2d: x or y out of bounds %f %f\n",x,y);
//		return(-1.0);
//    }
*/    
    wx[0] = 4.0/21.0 + (-11.0/21.0 + (0.5 - x/6.0)*x)*x;
    wx[1] = 13.0/21.0 + (1.0/14.0 + (-1.0 + x/2.0)*x)*x;
    wx[2] = 4.0/21.0 + (3.0/7.0 + (0.5 - x/2.0)*x)*x;
    wx[3] = (1.0/42.0 + x*x/6.0)*x;
    wy[0] = 4.0/21.0 + (-11.0/21.0 + (0.5 - y/6.0)*y)*y;
    wy[1] = 13.0/21.0 + (1.0/14.0 + (-1.0 + y/2.0)*y)*y;
    wy[2] = 4.0/21.0 + (3.0/7.0 + (0.5 - y/2.0)*y)*y;
    wy[3] = (1.0/42.0 + y*y/6.0)*y;
	
    v = 0.0;
    for (i = 0; i < 4; i++) {
        vx = 0.0;
        for (j = 0; j < 4; j++)
            vx += coeff[i*rowstride + j]*wx[j];
        v += wy[i]*vx;
    }
	
    return v;
}



__device__ float integrated_profile(int profile_type, int idx, int idy, float xpos,
                                   float ypos, float *psf_parameters, float *lut_0,
                                   float *lut_xd, float *lut_yd) {

    int psf_size;
    float   psf_height, psf_sigma_x, psf_sigma_y, psf_xpos, psf_ypos;
    float   p0;
    int     ip, jp;
    float  pi=3.14159265,fwtosig=0.8493218;
    
    psf_size = (int) psf_parameters[0];
    psf_height = psf_parameters[1];
    psf_sigma_x = psf_parameters[2];
    psf_sigma_y = psf_parameters[3];
    psf_ypos = psf_parameters[4];
    psf_xpos = psf_parameters[5];

    if (profile_type == 0) {
    
       // gaussian

       // PSF at location (Idx,Idy). PSF is centred at (7.5,7.5)
       // Analytic part

       p0 = 0.5*psf_height*pi*fwtosig*fwtosig*
            (erff((idx-7.5+0.5)/(1.41421356*psf_sigma_x)) - 
             erff((idx-7.5-0.5)/(1.41421356*psf_sigma_x))) *
            (erff((idy-7.5+0.5)/(1.41421356*psf_sigma_y)) - 
             erff((idy-7.5-0.5)/(1.41421356*psf_sigma_y)));

       // Index into the lookup table

       ip = psf_size/2 + 2*idx - 15;
       jp = psf_size/2 + 2*idy - 15;
       if ((ip>=0) && (ip<=psf_size-1) && (jp>=0) && (jp<=psf_size-1)) {
          p0 += lut_0[ip+psf_size*jp] + lut_xd[ip+psf_size*jp]*(xpos-psf_xpos) +
                lut_yd[ip+psf_size*jp]*(ypos-psf_ypos);
       }

       return p0;

    } else if (profile_type == 1) {

       //  moffat25
       //  From iraf/noao/digiphot/daophot/daolib/profile.x

       float d[4][4] = {{ 0.0,         0.0,        0.0,        0.0},
                        {-0.28867513,  0.28867513, 0.0,        0.0},
                        {-0.38729833,  0.0,        0.38729833, 0.0},
                        {-0.43056816, -0.16999052, 0.16999052, 0.43056816}};
       float w[4][4] = {{1.0,         0.0,        0.0,        0.0},
                        {0.5,         0.5,        0.0,        0.0},
                        {0.27777778,  0.44444444, 0.27777778, 0.0},
                        {0.17392742,  0.32607258, 0.32607258, 0.17392742}};

       float alpha = 0.3195079;
       float  p1sq, p2sq, p1p2, dx, dy, xy, denom, func, x[4], xsq[4], p1xsq[4];
       float  y, ysq, p2ysq, wt, p4fod, wp4fod, wf;
       int    npt, ix, iy;

       p1sq = psf_parameters[2]*psf_parameters[2];
       p2sq = psf_parameters[3]*psf_parameters[3];
       p1p2 = psf_parameters[2]*psf_parameters[3];
       dx = idx-7.5+0.5;
       dy = idy-7.5+0.5;
       xy = dx * dy;
       
       denom = 1.0 + alpha * (dx*dx/p1sq + dy*dy/p2sq + xy*psf_parameters[4]);
       if (denom > 1.0e4) {
          return 0.0;
       }

       p0 = 0.0;
       func = 1.0 / (p1p2*powf(float(denom),float(2.5)));
       if (func >= 0.046) {
          npt = 4;
       } else if (func >= 0.0022) {
          npt = 3;
       } else if (func >= 0.0001) {
          npt = 2;
       } else if (func >= 1.0e-10) {
          p0 = (2.5 - 1.0) * func;
       }

       if (func >= 0.0001) {
       
          for (ix=0; ix<npt; ix++) {
             x[ix] = dx + d[npt-1][ix];   // the npt-point rule is stored in row npt-1
             xsq[ix] = x[ix]*x[ix];
             p1xsq[ix] = xsq[ix]/p1sq;
          }

          for (iy=0; iy<npt; iy++) {
             y = dy + d[npt-1][iy];
             ysq = y*y;
             p2ysq = ysq/p2sq;
             for (ix=0; ix<npt; ix++) {
                wt = w[npt-1][iy] * w[npt-1][ix];
                xy = x[ix] * y;
                denom = 1.0 + alpha * (p1xsq[ix] + p2ysq + xy*psf_parameters[4]);
                func = (2.5 - 1.0) / (p1p2 * powf(denom,2.5) );
                p4fod = 2.5 * alpha * func / denom;
                wp4fod = wt * p4fod;
                wf = wt * func;
                p0 += wf;
             }
          }
          
       }

       p0 *= psf_parameters[1];

       // Index into the lookup table

       ip = psf_size/2 + 2*idx - 15;
       jp = psf_size/2 + 2*idy - 15;
       if ((ip>=0) && (ip<=psf_size-1) && (jp>=0) && (jp<=psf_size-1)) {
           p0 += lut_0[ip+psf_size*jp] + lut_xd[ip+psf_size*jp]*(xpos-psf_xpos) +
                 lut_yd[ip+psf_size*jp]*(ypos-psf_ypos);
       }
       
       return p0;

   } else {

      return 0.0;

   }
   
}


__global__ void convolve_image_psf(int profile_type, int nx, int ny, int dx, int dy,
                          int dp, int ds, int n_coeff, int nkernel,
                          int kernel_radius,int *kxindex,
                          int *kyindex, int* ext_basis, float *psf_parameters,
                          float *psf_0, float *psf_xd, float *psf_yd,
                          float *coeff,float *cim1, float* cim2) {

   int     id, txa, tyb, txag, tybg;
   int     np, ns, i, j, ii, ip, jp, ic, ki, a, b;
   int     d1, sidx, l, m, l1, m1, ig, jg;
   int     psf_size, ix, jx;
   float   x, y, p0, p1, p1g, cpsf_pixel, xpos, ypos;
   float   psf_height, psf_sigma_x, psf_sigma_y, psf_sigma_xy, psf_xpos, psf_ypos;
   float   gain,psf_rad,psf_rad2, px, py;
   float   sx2, sy2, sxy2, sx2msy2, sx2psy2; 
   float  psf_norm,dd;
   float  pi=3.14159265,fwtosig=0.8493218;

   __shared__ float psf_sum[256];
   __shared__ float cpsf[256];
   __shared__ float cpix1[256];
   __shared__ float cpix2[256];


   // initialise memory
   id = threadIdx.x+threadIdx.y*16;
   cpsf[id] = 0.0;

   // star position in normalised units
   xpos = blockIdx.x*dx + dx/2;
   ypos = blockIdx.y*dy + dy/2;
   x = (xpos - 0.5*(nx-1))/(nx-1);
   y = (ypos - 0.5*(ny-1))/(ny-1);


   // number of polynomial coefficients per basis function
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;

   // PSF parameters
   psf_size = (int) psf_parameters[0];
   psf_height = psf_parameters[1];
   psf_sigma_x = psf_parameters[2];
   psf_sigma_y = psf_parameters[3];
   psf_ypos = psf_parameters[4];
   psf_xpos = psf_parameters[5];
   psf_rad = psf_parameters[6];
   gain = psf_parameters[7];
   if (psf_rad > 5.0) {
      psf_rad = 5.0;
   }
   psf_rad2 = psf_rad*psf_rad;


   // PSF integral

   __syncthreads();

   psf_sum[id] = 0.0;
   for (i=threadIdx.x+1; i<psf_size-1; i+=blockDim.x) {
     for (j=threadIdx.y+1; j<psf_size-1; j+=blockDim.y) {
       psf_sum[id] += psf_0[i+j*psf_size];
     }
   }

   __syncthreads();

   i = 128;
   while (i != 0) {
     if (id < i) {
       psf_sum[id] += psf_sum[id + i];
     }
     __syncthreads();
     i /= 2;
   }
   __syncthreads();

   if (profile_type == 0) {
      // gaussian
      psf_norm = 0.25*psf_sum[0] + psf_height*2*pi*fwtosig*fwtosig;
   } else if (profile_type == 1) {
      // moffat25
      psf_sigma_xy = psf_parameters[8];
      sx2 = psf_sigma_x*psf_sigma_x;
      sy2 = psf_sigma_y*psf_sigma_y;
      sxy2 = psf_sigma_xy*psf_sigma_xy;
      sx2msy2 = 1.0/sx2 - 1.0/sy2;
      sx2psy2 = 1.0/sx2 + 1.0/sy2;
      px = 1.0/sqrt( sx2psy2 + sqrt(sx2msy2*sx2msy2 + sxy2) );
      py = 1.0/sqrt( sx2psy2 - sqrt(sx2msy2*sx2msy2 + sxy2) );
      psf_norm = 0.25*psf_sum[0] + psf_height*pi*(px*py)/(psf_sigma_x*psf_sigma_y);
   }
   
   // Construct the convolved PSF

   // PSF at location (Idx,Idy). PSF is centred at (7.5,7.5)
   // Analytic part

   p0 = integrated_profile(profile_type, threadIdx.x, threadIdx.y, xpos, ypos,
                           psf_parameters, psf_0, psf_xd, psf_yd);

   __syncthreads();

   cpsf_pixel = 0.0;
    
   // Iterate over coefficients

   for (ic=0; ic<n_coeff; ic++) {
 
      // basis function position
      ki = ic < np ? 0 : (ic-np)/ns + 1;

      if (ki<nkernel) {
       
        a = kxindex[ki];
        b = kyindex[ki];
 
        // Set the polynomial degree for the subvector and the
        // index within the subvector
        if (ki == 0) {
          d1 = dp;
          sidx = ic;
        } else {
          d1 = ds;
          sidx = ic - np - (ki-1)*ns;
        }
       

        // Compute the polynomial index (l,m) values corresponding
        // to the index within the subvector
        l1 = m1 = 0;
        if (d1 > 0) {
          i = 0;
          for (l=0; l<=d1; l++) {
            for (m=0; m<=d1-l; m++) {
              if (i == sidx) {
                l1 = l;
                m1 = m;
              }
              i++;
            }
          }
        }

        // Indices into the PSF

        if (ki > 0) {

          txa = threadIdx.x + a;
          tyb = threadIdx.y + b;
          
          p1 = integrated_profile(profile_type, txa, tyb, xpos, ypos,
                                  psf_parameters, psf_0, psf_xd, psf_yd);

          __syncthreads();


        // If we have an extended basis function, we need to
        // average the PSF over a 3x3 grid
          if (ext_basis[ki]) {

            p1 = 0.0;
            for (ig=-1; ig<2; ig++) {
              for (jg=-1; jg<2; jg++) {
                txag = txa + ig;
                tybg = tyb + jg;
                               
                p1g = integrated_profile(profile_type, txag, tybg, xpos, ypos,
                                         psf_parameters, psf_0, psf_xd, psf_yd);

                __syncthreads();

                p1 += p1g;
              }
            }
            p1 /= 9.0;
           
          }

          cpsf_pixel += coeff[ic]*(p1-p0)*powf(x,l1)*powf(y,m1);

        } else {

          cpsf_pixel += coeff[ic]*p0*powf(x,l1)*powf(y,m1);

        }
      
     }
       
   } //end ic loop

   __syncthreads();

   cpsf[id] = cpsf_pixel/psf_norm;

   __syncthreads();

   // Now convolve the image section with the convolved PSF

   for (i=xpos-dx/2; i<xpos+dx/2; i++) {
     for (j=ypos-dy/2; j<ypos+dy/2; j++) {
       ix = (int)floorf(i+0.5)+threadIdx.x-8.0;
       jx = (int)floorf(j+0.5)+threadIdx.y-8.0;
       cpix1[id] = cpsf[id]*tex2DLayered(tex,ix,jx,0);
       cpix2[id] = cpsf[id]*tex2DLayered(tex,ix,jx,1);

       __syncthreads();
       
       // Parallel sum
       ii = 128;
       while (ii != 0) {
         if (id < ii) {
           cpix1[id] += cpix1[id + ii];
           cpix2[id] += cpix2[id + ii];
         }
         __syncthreads();
         ii /= 2;
       }

       if (id == 0) {
          cim1[i+j*nx] = cpix1[0];
          cim2[i+j*nx] = cpix2[0];
       }
       
       __syncthreads();

     }
   }

   return;

}




__global__ void cu_photom(int profile_type,
                          int nx, int ny, int dp, int ds, int n_coeff, int nkernel,
                          int kernel_radius,int *kxindex,
                          int *kyindex, int* ext_basis, float *psf_parameters,
                          float *psf_0, float *psf_xd, float *psf_yd,
                          float *posx,
                          float *posy, float *coeff, 
                          float *flux, float *dflux) {

   int     id, txa, tyb, txag, tybg;
   int     np, ns, i, j, ip, jp, ic, ki, a, b;
   int     d1, sidx, l, m, l1, m1, ig, jg;
   int     psf_size, ix, jx;
   float   x, y, p0, p1, p1g, cpsf_pixel, xpos, ypos, dd;
   float   psf_height, psf_sigma_x, psf_sigma_y, psf_sigma_xy, psf_xpos, psf_ypos;
   float   psf_rad, psf_rad2, gain, fl, inv_var, px, py;
   float   sx2, sy2, sxy2, sx2msy2, sx2psy2; 
   float  subx, suby, psf_norm, bgnd, dr2;
   float  pi=3.14159265,fwtosig=0.8493218, RON=5.0;

   __shared__ float psf_sum[256];
   __shared__ float cpsf[256];
   __shared__ float  mpsf[256];
   __shared__ float  fsum1[256];
   __shared__ float  fsum2[256];
   __shared__ float  fsum3[256];
   __shared__ float  fsum4[256];
   __shared__ float  fsum5[256];


   // initialise memory
   id = threadIdx.x+threadIdx.y*16;
   cpsf[id] = 0.0;
   mpsf[id] = 0.0;

   // star position in normalised units
   xpos = posx[blockIdx.x];
   ypos = posy[blockIdx.x];
   x = (xpos - 0.5*(nx-1))/(nx-1);
   y = (ypos - 0.5*(ny-1))/(ny-1);


   // number of polynomial coefficients per basis function
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;

   // PSF parameters
   psf_size = (int) psf_parameters[0];
   psf_height = psf_parameters[1];
   psf_sigma_x = psf_parameters[2];
   psf_sigma_y = psf_parameters[3];
   psf_ypos = psf_parameters[4];
   psf_xpos = psf_parameters[5];
   psf_rad = psf_parameters[6];
   gain = psf_parameters[7];
   if (psf_rad > 7.0) {
     psf_rad = 7.0;
   }
   psf_rad2 = psf_rad*psf_rad;


   // PSF integral

   __syncthreads();

   psf_sum[id] = 0.0;
   for (i=threadIdx.x; i<psf_size; i+=blockDim.x) {
     for (j=threadIdx.y; j<psf_size; j+=blockDim.y) {
       psf_sum[id] += psf_0[i+j*psf_size];
       //if (blockIdx.x == 120) {
       //   printf("i, j, id, psf_0: %d %d %d %f\\n",i,j,id,psf_0[i+j*psf_size]);
       //}
     }
   }

   __syncthreads();

   i = 128;
   while (i != 0) {
     if (id < i) {
       psf_sum[id] += psf_sum[id + i];
     }
     __syncthreads();
     i /= 2;
   }
   __syncthreads();

   if (profile_type == 0) {
      // gaussian
      psf_norm = 0.25*psf_sum[0] + psf_height*2*pi*fwtosig*fwtosig;
      //if ((id == 0) && (blockIdx.x==120)){
      //   printf("psf_sum0, psf_height, psf_norm: %f %f %f\\n",psf_sum[0],psf_height,psf_norm);
      //}
   } else if (profile_type == 1) {
      // moffat25
      psf_sigma_xy = psf_parameters[8];
      sx2 = psf_sigma_x*psf_sigma_x;
      sy2 = psf_sigma_y*psf_sigma_y;
      sxy2 = psf_sigma_xy*psf_sigma_xy;
      sx2msy2 = 1.0/sx2 - 1.0/sy2;
      sx2psy2 = 1.0/sx2 + 1.0/sy2;
      px = 1.0/sqrt( sx2psy2 + sqrt(sx2msy2*sx2msy2 + sxy2) );
      py = 1.0/sqrt( sx2psy2 - sqrt(sx2msy2*sx2msy2 + sxy2) );
      psf_norm = 0.25*psf_sum[0] + psf_height*pi*(px*py)/(psf_sigma_x*psf_sigma_y);
      //if ((id == 0) && (blockIdx.x==120)){
      //   printf("psf_sum0, psf_height, psf_norm: %f %f %f\\n",psf_sum[0],psf_height, psf_norm);
      //}
   }
   
   
   // Construct the convolved PSF

   // PSF at location (Idx,Idy). PSF is centred at (7.5,7.5)
   // Analytic part

   p0 = integrated_profile(profile_type, threadIdx.x, threadIdx.y, xpos, ypos,
                           psf_parameters, psf_0, psf_xd, psf_yd);

   __syncthreads();

// Spatially variable part 
//
//       +
//               psf_xd[ipsf+psf_size*jpsf]*(xpos-psf_xpos) +
//               psf_yd[ipsf+psf_size*jpsf]*(ypos-psf_ypos);
//       }
//


   cpsf_pixel = 0.0;
    
   // Iterate over coefficients

   for (ic=0; ic<n_coeff; ic++) {
 
      // basis function position
      ki = ic < np ? 0 : (ic-np)/ns + 1;

      if (ki<nkernel) {
       
        a = kxindex[ki];
        b = kyindex[ki];
 
        // Set the polynomial degree for the subvector and the
        // index within the subvector
        if (ki == 0) {
          d1 = dp;
          sidx = ic;
        } else {
          d1 = ds;
          sidx = ic - np - (ki-1)*ns;
        }
       

        // Compute the polynomial index (l,m) values corresponding
        // to the index within the subvector
        l1 = m1 = 0;
        if (d1 > 0) {
          i = 0;
          for (l=0; l<=d1; l++) {
            for (m=0; m<=d1-l; m++) {
              if (i == sidx) {
                l1 = l;
                m1 = m;
              }
              i++;
            }
          }
        }

        // Indices into the PSF

        if (ki > 0) {

          txa = threadIdx.x + a;
          tyb = threadIdx.y + b;

          p1 = integrated_profile(profile_type, txa, tyb, xpos, ypos,
                                  psf_parameters, psf_0, psf_xd, psf_yd);

          __syncthreads();


//
//             +
//                     psf_xd[ipsf+psf_size*jpsf]*(xpos-psf_xpos) +
//                     psf_yd[ipsf+psf_size*jpsf]*(ypos-psf_ypos);
//             }
//

        // If we have an extended basis function, we need to
        // average the PSF over a 3x3 grid
          if (ext_basis[ki]) {

            p1 = 0.0;
            for (ig=-1; ig<2; ig++) {
              for (jg=-1; jg<2; jg++) {
                txag = txa + ig;
                tybg = tyb + jg;
                               
                p1g = integrated_profile(profile_type, txag, tybg, xpos, ypos,
                                  psf_parameters, psf_0, psf_xd, psf_yd);

                __syncthreads();


//
//                   +
//                            psf_xd[ipsf+psf_size*jpsf]*(xpos-psf_xpos) +
//                            psf_yd[ipsf+psf_size*jpsf]*(ypos-psf_ypos);
//                   }
//


                p1 += p1g;
              }
            }
            p1 /= 9.0;
           
          }

          cpsf_pixel += coeff[ic]*(p1-p0)*powf(x,l1)*powf(y,m1);

        } else {

          cpsf_pixel += coeff[ic]*p0*powf(x,l1)*powf(y,m1);

        }
      
     }
       
   } //end ic loop

    __syncthreads();

    cpsf[id] = cpsf_pixel/psf_norm;

    __syncthreads();

/* Uncomment to print convolved PSF   
   if ((id == 0) && (blockIdx.x==14)){
     txa = 7;
     tyb = 7;
     ip = psf_size/2 + 2*txa - 15;
     jp = psf_size/2 + 2*tyb - 15;
     if (profile_type == 0) {
       printf("psf_test: %lf %lf %lf %lf\\n",
               0.5*psf_height*pi*fwtosig*fwtosig*
                 (erff((txa-7.5+0.5)/(1.41421356*psf_sigma_x)) - 
                  erff((txa-7.5-0.5)/(1.41421356*psf_sigma_x))) *
                 (erff((tyb-7.5+0.5)/(1.41421356*psf_sigma_y)) - 
                  erff((tyb-7.5-0.5)/(1.41421356*psf_sigma_y))),
              psf_0[ip+psf_size*jp],
              psf_xd[ip+psf_size*jp]*(xpos-psf_xpos),
              psf_yd[ip+psf_size*jp]*(ypos-psf_ypos));
      }    
             
              

     dd = 0.0;
     printf("cpsf\\n");
     for (j=15; j>=0; j--) {
       printf("%2d ",j);
       for (i=0; i<16; i++) {
         printf("%6.4f  ",cpsf[i+j*16]);
         dd += cpsf[i+j*16];
       }
       printf("\\n");
     }
     printf("sum = %f\\n",dd);
     printf("psf lookup table fraction: %f\\n",psf_sum[0]/psf_norm);
   }
   */


   
   __syncthreads();



   // Map the convolved PSF to the subpixel star coordinates
   
   if (id == 0) {
     resolve_coeffs_2d(16,16,16,cpsf);
   }

   __syncthreads();

   mpsf[id] = 0.0;

   subx = ceilf(xpos+0.5+0.00001) - (xpos+0.5);
   suby = ceilf(ypos+0.5+0.00001) - (ypos+0.5);
   if ((threadIdx.x > 1) && (threadIdx.x < 14) &&
       (threadIdx.y > 1) && (threadIdx.y < 14)) {
      mpsf[id] = interpolate_2d(subx,suby,16,&cpsf[threadIdx.x-2+(threadIdx.y-2)*16]);
   }

   __syncthreads();

   // force negative pixels to zero
   mpsf[id] = mpsf[id] > 0.0 ? mpsf[id] : 0.0;
  
   __syncthreads();

   //
   // Normalise mapped PSF
   //  (No - the convolved PSF contain the phot scale)
/*
   cpsf[id] = mpsf[id];
   __syncthreads();
   i = 128;
   while (i != 0) {
     if (id < i) {
       cpsf[id] += cpsf[id + i];
     }
     __syncthreads();
     i /= 2;
   }
   
   mpsf[id] /= cpsf[0];
*/

/* Uncomment to print mapped PSF */
  if ((id == 0) && (blockIdx.x==14)){
     printf("xpos, ypos: %f %f\\n",xpos,ypos);
     printf("subx, suby: %f %f\\n",subx,suby);
     printf("mpsf\\n");
     dd = 0.0;
     for (j=15; j>=0; j--) {
       printf("%2d ",j);
       for (i=0; i<16; i++) {
         printf("%6.4f  ",mpsf[i+j*16]);
         dd += mpsf[i+j*16];
       }
       printf("\\n");
     }
     printf("sum = %f\\n",dd);
   }  
   __syncthreads();

   

   // Fit the mapped PSF to the difference image to compute an
   // optimal flux estimate.
   // Assume the difference image is in tex(:,:,0)
   // and the inverse variance in tex(:,:,1).
   // We need to iterate to get the variance correct
   //

   fl = 0.0;

   for (j=0; j<3; j++) {

     fsum1[id] = 0.0;
     fsum2[id] = 0.0;
     fsum3[id] = 0.0;

     __syncthreads();

     /*
     if ((id == 0) && (blockIdx.x==14)){
         printf("photom, j=%d\\n",j);
     }
     */
     
     if (powf(threadIdx.x-8.0,2)+powf(threadIdx.y-8.0,2) < psf_rad2) {

        ix = (int)floorf(xpos+0.5)+threadIdx.x-8.0;
        jx = (int)floorf(ypos+0.5)+threadIdx.y-8.0;

        inv_var = 1.0/(1.0/tex2DLayered(tex,ix,jx,1) + fl*mpsf[id]/gain);

        fsum1[id] = mpsf[id]*tex2DLayered(tex,ix,jx,0)*inv_var;
        fsum2[id] = mpsf[id]*mpsf[id]*inv_var;
        fsum3[id] = mpsf[id]; 

        /*
        if ((blockIdx.x==14)){
           printf("ix jx xpos ypos dr2 mpsf inv_var im: %03d %03d %8.3f %8.3f %6.3f %6.5f %g %g\\n",ix,jx,xpos,ypos,dr2, mpsf[id],inv_var, tex2DLayered(tex,ix,jx,0));
        }
        */


     }

     __syncthreads();
   
   
     // Parallel sum
     i = 128;
     while (i != 0) {
       if (id < i) {
         fsum1[id] += fsum1[id + i];
         fsum2[id] += fsum2[id + i];
         fsum3[id] += fsum3[id + i];
       }
       __syncthreads();
       i /= 2;
     }

     fl = fsum1[0]/fsum2[0];

   }
     
   if (id == 0) {
     flux[blockIdx.x] = fl;
     dflux[blockIdx.x] = sqrt(fsum3[0]*fsum3[0]/fsum2[0]);
   }

/* Uncomment for debug info */
/*
   __syncthreads();
   i = 128;
   while (i != 0) {
     if (id < i) {
       mpsf[id] += mpsf[id + i];
     }
     __syncthreads();
     i /= 2;
   }
   __syncthreads();

   if (id == 0) {
     if (blockIdx.x == 120) {
       printf("result: %f %f %f %f %f %f %f %f %f %f %f %f\\n",fsum1[0],fsum2[0],fsum3[0],mpsf[0],psf_norm,psf_sum[0],bgnd,flux[blockIdx.x],flux[blockIdx.x]*fsum3[0],flux[blockIdx.x]*mpsf[0],fsum4[0],dflux[blockIdx.x]);
     }
   }
*/

   __syncthreads();

   return;

}


__global__ void cu_compute_model(int dp, int ds, int db, int *kxindex,
                int *kyindex, int* ext_basis, int nkernel, float *coefficient,
                float *M) {

   int  np, ns, nb, hs, idx, ki, a, b, d1, sidx, l, m, l1, m1, i;
   float x, y, Bi;

   __shared__ float count[THREADS_PER_BLOCK];

   // Calculate number of terms in subvectors
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;
   nb = (db+1)*(db+2)/2;
   hs = (nkernel-1)*ns+np+nb;

   x = (blockIdx.x - 0.5*(gridDim.x-1))/(gridDim.x-1);
   y = (blockIdx.y - 0.5*(gridDim.y-1))/(gridDim.y-1);

   count[threadIdx.x] = 0.0;

   for (idx = threadIdx.x; idx < hs; idx += blockDim.x) {

     // This is the index of the subvector and its kernel offsets
     ki = idx < np ? 0 : (idx-np)/ns + 1;
     a = b = 0;
     if (ki<nkernel) {
       a = kxindex[ki];
       b = kyindex[ki];
     }
   
     // Set the polynomial degree for the subvector and the
     // index within the subvector
     if (ki == 0) {
       d1 = dp;
       sidx = idx;
     } else if (ki < nkernel) {
       d1 = ds;
       sidx = idx - np - (ki-1)*ns;
     } else {
       d1 = db;
       sidx = idx - np - (ki-1)*ns;
     }

     // Compute the (l,m) values corresponding to the index within
     // the subvector
     l1 = m1 = 0;
     if (d1 > 0) {
       i = 0;
       for (l=0; l<=d1; l++) {
         for (m=0; m<=d1-l; m++) {
           if (i == sidx) {
             l1 = l;
             m1 = m;
           }
           i++;
         }
       }
     }

     if (ki == 0) {
       Bi = tex2DLayered(tex,blockIdx.x,blockIdx.y,0);
     } else if (ki < nkernel) {
       if (ext_basis[ki]) {
         Bi = tex2DLayered(tex,blockIdx.x+a,blockIdx.y+b,1)-
                tex2DLayered(tex,blockIdx.x,blockIdx.y,0);
       } else {
         Bi =  tex2DLayered(tex,blockIdx.x+a,blockIdx.y+b,0)-
                tex2DLayered(tex,blockIdx.x,blockIdx.y,0);
       }
     } else {
       Bi = 1.0;
     }
     
     count[threadIdx.x] += coefficient[idx]*powf(x,l1)*powf(y,m1)*Bi;


   }

   __syncthreads();

   // Then parallel-sum the results
   i = blockDim.x/2;
   while (i != 0) {
     if (threadIdx.x < i) {
       count[threadIdx.x] += count[threadIdx.x + i];
     }
     __syncthreads();
     i /= 2;
   }
   if (threadIdx.x == 0) {
     M[blockIdx.x+gridDim.x*blockIdx.y] = count[0];
   }

}


__global__ void cu_compute_vector(int dp, int ds, int db, int nx,
                int ny, int *kxindex, int *kyindex, int *ext_basis, int nkernel,
                int kernelRadius,float *V) {

   int idx; 
   int np, ns, ki, a, b, d1, i, j;
   int l, m, l1, m1;
   float py, x, y, Bi;
   float temp;
   
    __shared__ float count[THREADS_PER_BLOCK];

   // Calculate number of terms in subvectors
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;

   // This is the index of the subvector and its kernel offsets
   ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1;
   a = b = 0;
   if (ki<nkernel) {
     a = kxindex[ki];
     b = kyindex[ki];
   }
   
   // Set the polynomial degrees for the submatrix and the
   // indices within the submatrix
   if (ki == 0) {
     d1 = dp;
     idx = blockIdx.x;
   } else if (ki < nkernel) {
     d1 = ds;
     idx = blockIdx.x - np - (ki-1)*ns;
   } else {
     d1 = db;
     idx = blockIdx.x - np - (ki-1)*ns;
   }

   // Compute the (l,m) values corresponding to the index within
   // the subvector
   i = 0;
   for (l=0; l<=d1; l++) {
     for (m=0; m<=d1-l; m++) {
       if (i == idx) {
         l1 = l;
         m1 = m;
       }
       i++;
     }
   }   

   // Compute the contribution to V from each image location.
   // Use individual threads to sum over columns.
   // tex[:,:,0] is the reference image,
   // tex[:,:,1] is the blurred reference image,
   // tex[:,:,2] is the target image,
   // tex[:,:,3] is the inverse variance,
   // tex[:,:,4] is the mask.
   // Bi is the basis image value.
   temp = 0.0;
   Bi = 1.0;
   __syncthreads();
   for (j=kernelRadius; j<ny-kernelRadius; j++) {
     y = (j - 0.5*(ny-1))/(ny-1);
     py = powf(y,m1);
     for (i=threadIdx.x+kernelRadius; i<nx-kernelRadius; i+=blockDim.x) {
         x = (i - 0.5*(nx-1))/(nx-1);
         if (ki == 0) {
           Bi = tex2DLayered(tex,i,j,0);
         } else if (ki < nkernel) {
           if (ext_basis[ki]) {
             Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0);
           } else {
             Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0);
           }
         } else {
           Bi = 1.0;
         }
         temp += powf(x,l1)*py*Bi*tex2DLayered(tex,i,j,2)*tex2DLayered(tex,i,j,3)*
                 tex2DLayered(tex,i,j,4);
     }
   }

   count[threadIdx.x] = temp;

   __syncthreads();

   // Then parallel-sum the rows
   i = blockDim.x/2;
   while (i != 0) {
     if (threadIdx.x < i) {
       count[threadIdx.x] += count[threadIdx.x + i];
     }
     __syncthreads();
     i /= 2;
   }
   if (threadIdx.x == 0) {
     V[blockIdx.x] = count[0];
   }

}



__global__ void cu_compute_vector_stamps(int dp, int ds, int db, int nx,
                int ny, int nstamps, int stamp_half_width, float *stamp_xpos, float* stamp_ypos,
                int *kxindex, int *kyindex, int *ext_basis, int nkernel,
                int kernelRadius,float *V) {

   int idx; 
   int np, ns, ki, a, b, d1, i, j, i1, i2, j1, j2;
   int l, m, l1, m1;
   float py, x, y, Bi;
   float temp;
   
    __shared__ float count[THREADS_PER_BLOCK];

   // Calculate number of terms in subvectors
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;

   // This is the index of the subvector and its kernel offsets
   ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1;
   a = b = 0;
   if (ki<nkernel) {
     a = kxindex[ki];
     b = kyindex[ki];
   }
   
   // Set the polynomial degrees for the submatrix and the
   // indices within the submatrix
   if (ki == 0) {
     d1 = dp;
     idx = blockIdx.x;
   } else if (ki < nkernel) {
     d1 = ds;
     idx = blockIdx.x - np - (ki-1)*ns;
   } else {
     d1 = db;
     idx = blockIdx.x - np - (ki-1)*ns;
   }

   // Compute the (l,m) values corresponding to the index within
   // the subvector
   i = 0;
   for (l=0; l<=d1; l++) {
     for (m=0; m<=d1-l; m++) {
       if (i == idx) {
         l1 = l;
         m1 = m;
       }
       i++;
     }
   }   

   // Compute the contribution to V from each image location.
   // Use individual threads to sum over columns.
   // tex[:,:,0] is the reference image,
   // tex[:,:,1] is the blurred reference image,
   // tex[:,:,2] is the target image,
   // tex[:,:,3] is the inverse variance,
   // tex[:,:,4] is the mask.
   // Bi is the basis image value.
   temp = 0.0;
   Bi = 1.0;
   __syncthreads();
   for (idx = threadIdx.x; idx<nstamps; idx += blockDim.x) {
     j1 = max(0,(int)stamp_ypos[idx]-stamp_half_width);
     j2 = min(ny,(int)stamp_ypos[idx]+stamp_half_width);
     for (j=j1; j<j2; j++) {
       y = (j - 0.5*(ny-1))/(ny-1);
       py = powf(y,m1);
       i1 = max(0,(int)stamp_xpos[idx]-stamp_half_width);
       i2 = min(nx,(int)stamp_xpos[idx]+stamp_half_width);
       for (i=i1; i<i2; i++) {
         x = (i - 0.5*(nx-1))/(nx-1);
         if (ki == 0) {
           Bi = tex2DLayered(tex,i,j,0);
         } else if (ki < nkernel) {
           if (ext_basis[ki]) {
             Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0);
           } else {
             Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0);
           }
         } else {
           Bi = 1.0;
         }
         temp += powf(x,l1)*py*Bi*tex2DLayered(tex,i,j,2)*tex2DLayered(tex,i,j,3)*
                 tex2DLayered(tex,i,j,4);
       }
     }
   }

   count[threadIdx.x] = temp;

   __syncthreads();

   // Then parallel-sum the rows
   i = blockDim.x/2;
   while (i != 0) {
     if (threadIdx.x < i) {
       count[threadIdx.x] += count[threadIdx.x + i];
     }
     __syncthreads();
     i /= 2;
   }
   if (threadIdx.x == 0) {
     V[blockIdx.x] = count[0];
   }

}




__global__ void cu_compute_matrix(int dp, int ds, int db, int nx,
                int ny, int *kxindex, int *kyindex, int *ext_basis, int nkernel,
                int kernelRadius,float *H) {

   int idx, idy, idx0, idy0, idx1, idy1; 
   int np, ns, ki, kj, a, b, c, d, d1, d2, i, j;
   int l, m, l1, m1, l2, m2;
   float py, x, y, Bi, Bj;
   float temp;
   
   __shared__ float count[THREADS_PER_BLOCK];


   // Terminate if we are not in the lower triangle
   if (blockIdx.x > blockIdx.y) {
     return;
   }

   // Calculate number of terms in submatrices
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;

   // These are indices of the submatrix and their kernel offsets
   ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1;
   kj = blockIdx.y < np ? 0 : (blockIdx.y-np)/ns + 1;


   a = b = 0;
   if (ki<nkernel) {
     a = kxindex[ki];
     b = kyindex[ki];
   }
   if (kj<nkernel) {
     c = kxindex[kj];
     d = kyindex[kj];
   }
   
   // Set the polynomial degrees for the submatrix and the
   // indices within the submatrix
   if (ki == 0) {
     d1 = dp;
     idx = blockIdx.x;
   } else if (ki < nkernel) {
     d1 = ds;
     idx = blockIdx.x - np - (ki-1)*ns;
   } else {
     d1 = db;
     idx = blockIdx.x - np - (ki-1)*ns;
   }
   if (kj == 0) {
     d2 = dp;
     idy = blockIdx.y;
   } else if (kj < nkernel) {
     d2 = ds;
     idy = blockIdx.y - np - (kj-1)*ns;
   } else {
     d2 = db;
     idy = blockIdx.y - np - (kj-1)*ns;
   }


   if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel) && (idx > idy)) {
     return;
   }

   idx0 = idx;
   idy0 = idy;

   // Compute the (l,m) values corresponding to the indices within
   // the submatrix
   i = 0;
   for (l=0; l<=d1; l++) {
     for (m=0; m<=d1-l; m++) {
       if (i == idx) {
         l1 = l;
         m1 = m;
       }
       i++;
     }
   }   
   i = 0;
   for (l=0; l<=d2; l++) {
     for (m=0; m<=d2-l; m++) {
       if (i == idy) {
         l2 = l;
         m2 = m;
       }
       i++;
     }
   }

   // Compute the contribution to H from each image location.
   // Use individual threads to sum over columns.
   // tex[:,:,0] is the reference image,
   // tex[:,:,1] is the blurred reference image,
   // tex[:,:,2] is the target image,
   // tex[:,:,3] is the inverse variance,
   // tex[:,:,4] is the mask.
   // Bi and Bj are the basis image values.
   temp = 0.0;
   Bi = Bj = 1.0;
   __syncthreads();
   for (j=kernelRadius; j<ny-kernelRadius; j++) {
     y = (j - 0.5*(ny-1))/(ny-1);
     py = powf(y,m1+m2);
     for (i=threadIdx.x+kernelRadius; i<nx-kernelRadius; i+=blockDim.x) {
         x = (i - 0.5*(nx-1))/(nx-1);
         if (ki == 0) {
           Bi = tex2DLayered(tex,i,j,0);
         } else if (ki < nkernel) {
           if (ext_basis[ki]) {
             Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0);
           } else {
             Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0);
           }
         } else {
           Bi = 1.0;
         }
         if (kj == 0) {
           Bj = tex2DLayered(tex,i,j,0);
         } else if (kj < nkernel) {
           if (ext_basis[kj]) {
             Bj = tex2DLayered(tex,i+c,j+d,1)-tex2DLayered(tex,i,j,0);
           } else {
             Bj = tex2DLayered(tex,i+c,j+d,0)-tex2DLayered(tex,i,j,0);
           }
         } else {
           Bj = 1.0;
         }
         temp += powf(x,l1+l2)*py*Bi*Bj*tex2DLayered(tex,i,j,3)*tex2DLayered(tex,i,j,4);
     }
   }

   count[threadIdx.x] = temp;

   __syncthreads();

   // Then parallel-sum the rows
   i = blockDim.x/2;
   while (i != 0) {
     if (threadIdx.x < i) {
       count[threadIdx.x] += count[threadIdx.x + i];
     }
     __syncthreads();
     i /= 2;
   }

   if (threadIdx.x == 0) {

     H[blockIdx.x+gridDim.x*blockIdx.y] = count[0];
     H[blockIdx.y+gridDim.x*blockIdx.x] = count[0];
     if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel)) {
       idx1 = np + (ki-1)*ns;
       idy1 = np + (kj-1)*ns;
       H[(idx1+idy0)+gridDim.x*(idy1+idx0)] = count[0];
       H[(idy1+idx0)+gridDim.x*(idx1+idy0)] = count[0];
     }
   }

}



__global__ void cu_compute_matrix_stamps(int dp, int ds, int db, int nx,
                int ny, int nstamps, int stamp_half_width, float *stamp_xpos, float* stamp_ypos,
                int *kxindex, int *kyindex, int *ext_basis, int nkernel,
                int kernelRadius,float *H) {

   int idx, idy, idx0, idy0, idx1, idy1; 
   int np, ns, ki, kj, a, b, c, d, d1, d2, i, j, i1, i2, j1, j2;
   int l, m, l1, m1, l2, m2;
   float px, py, x, y, Bi, Bj;
   float temp;
   
    __shared__ float count[THREADS_PER_BLOCK];

   // Terminate if we are not in the lower triangle
   if (blockIdx.x > blockIdx.y) {
     return;
   }

   // Calculate number of terms in submatrices
   np = (dp+1)*(dp+2)/2;
   ns = (ds+1)*(ds+2)/2;

   // These are indices of the submatrix and their kernel offsets
   ki = blockIdx.x < np ? 0 : (blockIdx.x-np)/ns + 1;
   kj = blockIdx.y < np ? 0 : (blockIdx.y-np)/ns + 1;

   a = b = 0;
   if (ki<nkernel) {
     a = kxindex[ki];
     b = kyindex[ki];
   }
   if (kj<nkernel) {
     c = kxindex[kj];
     d = kyindex[kj];
   }
   
   // Set the polynomial degrees for the submatrix and the
   // indices within the submatrix
   if (ki == 0) {
     d1 = dp;
     idx = blockIdx.x;
   } else if (ki < nkernel) {
     d1 = ds;
     idx = blockIdx.x - np - (ki-1)*ns;
   } else {
     d1 = db;
     idx = blockIdx.x - np - (ki-1)*ns;
   }
   if (kj == 0) {
     d2 = dp;
     idy = blockIdx.y;
   } else if (kj < nkernel) {
     d2 = ds;
     idy = blockIdx.y - np - (kj-1)*ns;
   } else {
     d2 = db;
     idy = blockIdx.y - np - (kj-1)*ns;
   }

   if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel) && (idx > idy)) {
     return;
   }

   idx0 = idx;
   idy0 = idy;

   // Compute the (l,m) values corresponding to the indices within
   // the submatrix
   i = 0;
   for (l=0; l<=d1; l++) {
     for (m=0; m<=d1-l; m++) {
       if (i == idx) {
         l1 = l;
         m1 = m;
       }
       i++;
     }
   }   
   i = 0;
   for (l=0; l<=d2; l++) {
     for (m=0; m<=d2-l; m++) {
       if (i == idy) {
         l2 = l;
         m2 = m;
       }
       i++;
     }
   }

   // Compute the contribution to H from each image location.
   // Use individual threads to sum over stamps.
   // tex[:,:,0] is the reference image,
   // tex[:,:,1] is the blurred reference image,
   // tex[:,:,2] is the target image,
   // tex[:,:,3] is the inverse variance,
   // tex[:,:,4] is the mask.
   // Bi and Bj are the basis image values.
   temp = 0.0;
   Bi = Bj = 1.0;
   __syncthreads();


   for (idx = threadIdx.x; idx<nstamps; idx += blockDim.x) {
     i1 = max(0,(int)stamp_xpos[idx]-stamp_half_width);
     i2 = min(nx,(int)stamp_xpos[idx]+stamp_half_width);
     for (i=i1; i<i2; i++) {
       x = (i - 0.5*(nx-1))/(nx-1);
       px = powf(x,l1+l2);
       j1 = max(0,(int)stamp_ypos[idx]-stamp_half_width);
       j2 = min(ny,(int)stamp_ypos[idx]+stamp_half_width);
       for (j=j1; j<j2; j++) {
         y = (j - 0.5*(ny-1))/(ny-1);
         py = powf(y,m1+m2);
         if (ki == 0) {
           Bi = tex2DLayered(tex,i,j,0);
         } else if (ki < nkernel) {
           if (ext_basis[ki]) {
             Bi = tex2DLayered(tex,i+a,j+b,1)-tex2DLayered(tex,i,j,0);
           } else {
             Bi = tex2DLayered(tex,i+a,j+b,0)-tex2DLayered(tex,i,j,0);
           }
         } else {
           Bi = 1.0;
         }
         if (kj == 0) {
           Bj = tex2DLayered(tex,i,j,0);
         } else if (kj < nkernel) {
           if (ext_basis[kj]) {
             Bj = tex2DLayered(tex,i+c,j+d,1)-tex2DLayered(tex,i,j,0);
           } else {
             Bj = tex2DLayered(tex,i+c,j+d,0)-tex2DLayered(tex,i,j,0);
           }
         } else {
           Bj = 1.0;
         }
         temp += px*py*Bi*Bj*tex2DLayered(tex,i,j,3)*tex2DLayered(tex,i,j,4);
       }
     }
   }

   count[threadIdx.x] = temp;

   __syncthreads();

   // Then parallel-sum the rows
   i = blockDim.x/2;
   while (i != 0) {
     if (threadIdx.x < i) {
       count[threadIdx.x] += count[threadIdx.x + i];
     }
     __syncthreads();
     i /= 2;
   }

   if (threadIdx.x == 0) {
     H[blockIdx.x+gridDim.x*blockIdx.y] = count[0];
     H[blockIdx.y+gridDim.x*blockIdx.x] = count[0];
     if ((ki>0) && (ki<nkernel) && (kj>0) && (kj<nkernel)) {
       idx1 = np + (ki-1)*ns;
       idy1 = np + (kj-1)*ns;
       H[(idx1+idy0)+gridDim.x*(idy1+idx0)] = count[0];
       H[(idy1+idx0)+gridDim.x*(idx1+idy0)] = count[0];
     }

   }

}


""")
Example #8
        def gpuFunc(iterator):
            iterator = iter(iterator)
            cpu_data = numpy.asarray(list(iterator), dtype=numpy.float32)
            datasize = numpy.asarray(len(cpu_data))
            # * 3 for the three data dimensions; // 256 for the block size;
            # + 1 to over-provision in case of dangling threads (grid
            # dimensions must be plain Python ints).
            gridNum = int(datasize * 3) // 256 + 1
            centroids = numpy.empty((datasize), gpuarray.vec.float4)
            cuda.init()
            dev = cuda.Device(0)
            contx = dev.make_context()

            # The GPU kernel below takes centroids IDs and 3-D data points in form of float4 (x,y,z,w).
            # X is for the centroid ID whereas (y,z,w) are the actual point coordinate.
            mod = SourceModule("""
        	__global__ void assignToCentroid(float *data, int* datasize, int *k, float *kPoints, float4 * clusters){ 
                	//1d grid index
                	unsigned idxInLeftBlocks = blockIdx.x * (blockDim.x * blockDim.y);
                	unsigned idxInCurrBlock  = threadIdx.y * blockDim.x + threadIdx.x;
                	unsigned idx = idxInLeftBlocks + idxInCurrBlock;
                	int size = datasize[0];
                	int K = k[0];   
                	if (idx < size*3){//consider xyz for each data point
                        	if (idx % 3 == 0){
                                	unsigned dataIdx = idx/3;
                                	clusters[dataIdx].x = 999999; 
                                	clusters[dataIdx].y = data[idx];
                                	clusters[dataIdx].z = data[idx+1];
                                	clusters[dataIdx].w = data[idx+2];      
                        	}
                        	float inf =  99999.9999;
                        	float closest = inf;

                        	float tempDist=0, tempDistX=0, tempDistY=0, tempDistZ=0;
				// updating tempDist
                        	for (int i = 0; i < K; i++){
                                	if( idx % 3 == 0 /*0,3,6*/)
                                        	tempDistX = data[idx] - kPoints[i*3];
                                	else if(idx % 3 == 1 /*1,4,7*/)
                                        	tempDistY = data[idx] - kPoints[i*3 + 1];
                                	else {/*2,5,8*/
                                        	tempDistZ = data[idx] - kPoints[i*3 + 2];
                                        	tempDist = pow(tempDistX,2) + pow(tempDistY,2) + pow(tempDistZ,2);
                                	}

                                	if(tempDist < closest){
                                        	closest = tempDist;
                                        	int dataIdx = idx/3; 
                                        	clusters[dataIdx].x = i;//coordinate belongs to cluster i
                                	}
                        	}
                	}
        	}

    		""")
            func = mod.get_function("assignToCentroid")
            func(cuda.In(cpu_data),
                 cuda.In(datasize),
                 cuda.In(k),
                 cuda.In(kPoints),
                 cuda.Out(centroids),
                 block=(16, 16, 1),
                 grid=(gridNum, 1),
                 shared=0)
            closest = [(x[0], (numpy.asarray([x[1], x[2], x[3]]), 1))
                       for x in centroids]
            contx.pop()
            del cpu_data
            del datasize
            del centroids
            del contx
            return iter(closest)
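
Note the context discipline that makes this safe inside Spark executor processes: each worker calls cuda.init() (harmless to repeat), creates its own context with dev.make_context(), and pops it with contx.pop() before returning, so no CUDA state leaks between tasks.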
Example #9
import numpy as np

import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

TPB = 16
N = 1024
N2 = N**2

mod = SourceModule("""
    #include <stdio.h>
    #define N 1024
    #define N2 1048576

    __global__ void prime(int *array, int *prime_mask) {
        int i = threadIdx.x + blockIdx.x * blockDim.x;
        
        if(array[i] > 1){
            if(prime_mask[i] == 0){
                int inc = array[i];
                for(int j = i+inc ; j < N2 ; j += inc ){
                    prime_mask[j] = 1;
                }            
            }
        }
    }

""")


def p_prime(array, prime_mask):
    for i in range(2, N2):  # start at 2: neither 0 nor 1 is prime
        if prime_mask[i] == 0:
            print("{} is prime".format(array[i]))
Example #10
    ex_gpu = cuda.to_device(f)
    ey_gpu = cuda.to_device(f)
    ez_gpu = cuda.to_device(f)
    hx_gpu = cuda.to_device(f)
    hy_gpu = cuda.to_device(f)
    hz_gpu = cuda.to_device(f)

    cex_gpu = cuda.to_device(set_c(f, 'yz'))
    cey_gpu = cuda.to_device(set_c(f, 'zx'))
    cez_gpu = cuda.to_device(set_c(f, 'xy'))

    # prepare kernels
    from pycuda.compiler import SourceModule
    mod = SourceModule(
        kernels.replace('TPB', str(tpb)).replace('nxy', str(nx * ny)).replace(
            'nx', str(nx)).replace('ny', str(ny)).replace('nz', str(nz)))
    update_h = mod.get_function("update_h")
    update_e = mod.get_function("update_e")
    update_src = mod.get_function("update_src")

    update_h.prepare("PPPPPP", block=(tpb, 1, 1))
    update_e.prepare("PPPPPPPPP", block=(16, 16, 1))
    update_src.prepare("fP", block=(nx, 1, 1))

    # prepare for plot
    from matplotlib.pyplot import *
    ion()
    imsh = imshow(np.ones((ny, nz), 'f').T,
                  cmap=cm.hot,
                  origin='lower',
Example #11
	ey_gpu = cuda.to_device(f)
	ez_gpu = cuda.to_device(f)
	hx_gpu = cuda.to_device(f)
	hy_gpu = cuda.to_device(f)
	hz_gpu = cuda.to_device(f)

	cex_gpu = cuda.to_device( set_c(f,(None,-1,-1)) )
	cey_gpu = cuda.to_device( set_c(f,(-1,None,-1)) )
	cez_gpu = cuda.to_device( set_c(f,(-1,-1,None)) )
	chx_gpu = cuda.to_device( set_c(f,(None,0,0)) )
	chy_gpu = cuda.to_device( set_c(f,(0,None,0)) )
	chz_gpu = cuda.to_device( set_c(f,(0,0,None)) )

	# prepare kernels
	from pycuda.compiler import SourceModule
	mod = SourceModule(kernels)
	update_e = mod.get_function("update_e")
	update_h = mod.get_function("update_h")
	update_src = mod.get_function("update_src")
	tex = mod.get_texref("tex")
	tey = mod.get_texref("tey")
	tez = mod.get_texref("tez")
	thx = mod.get_texref("thx")
	thy = mod.get_texref("thy")
	thz = mod.get_texref("thz")
	tcex = mod.get_texref("tcex")
	tcey = mod.get_texref("tcey")
	tcez = mod.get_texref("tcez")
	tchx = mod.get_texref("tchx")
	tchy = mod.get_texref("tchy")
	tchz = mod.get_texref("tchz")
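
The get_texref calls return the module's texture reference objects; before any kernel that reads them can run, each reference still has to be bound to device memory (for instance via texref.set_address or texref.set_array), a step that falls outside this fragment.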
Example #12
    block_size = 64 
    block = (block_size,1,1)
    grid = (A_cols // block_size + 1, 1)
    concatenate_cols_kernel(A,A_conc, np.intc(A_rows), np.intc(A_cols), block= block, grid=grid)
    
    
def sign(x):
    return np.float32(2.*np.greater_equal(x,0)-1.)
    
if __name__ == "__main__":   
    
    context = pycuda.autoinit.context
    
    print "Building the kernels..."
    
    mod = SourceModule(open("binary_kernels.cu").read())
    gemm_kernel = mod.get_function("gemm")
    concatenate_rows_kernel = mod.get_function("concatenate_rows_kernel")
    concatenate_cols_kernel = mod.get_function("concatenate_cols_kernel")
    xnor_gemm_kernel = mod.get_function("xnor_gemm")
    
    print "Loading matrices to device..."
    
    # Matrices dimensions
    N = 8192
    A_rows = N
    A_cols = N
    B_cols = N
    # A_rows = 784
    # A_cols = 1024
    # B_cols = 4096
Example #13
mod = SourceModule("""

    __device__ __constant__ float t_c[9] ;
    __device__ __constant__ int c_c[18] ;
    
    __global__ void funRT(float* fin_g0,float* fin_g1,float* fin_g2,float* fin_g3,float* fin_g4,float* fin_g5,float* fin_g6,float* fin_g7,float* fin_g8, 
                          float* feq_g0,float* feq_g1,float* feq_g2,float* feq_g3,float* feq_g4,float* feq_g5,float* feq_g6,float* feq_g7,float* feq_g8, 
                          float* rho_g, float* u_g, float* taus_g){
        int x     = threadIdx.x + blockIdx.x * blockDim.x;
        int y     = threadIdx.y + blockIdx.y * blockDim.y;
        int xsize    = blockDim.x * gridDim.x; 
        int ysize    = blockDim.y * gridDim.y; 
        int d = xsize * ysize;
        int i = x + y * xsize;
        //int k = 0; //int BC = 0; // defining whether the cell is a boundary condition or not
        
        float omega = %s; // relaxation rate, substituted by Python string formatting (e.g. 1.940391)

        float cu[9];
        float rho_l= 0.0; //rho_g[i];

        float fin_l[9]; float u_l[2] ; // local fin to avoid multiple global memory access
        
        
        if ( (x>=0) and (x<xsize) and (y>=0) and (y<ysize) ){
            fin_l[0] = fin_g0[i];
        } else { fin_l[0] = fin_g0[i]; }
        if ( (x-1>=0) and (x-1<xsize) and (y>=0) and (y<ysize) ){
            fin_l[1] = fin_g1[i-1];
        } else { fin_l[1] = fin_g1[i]; }        
        if ( (x>=0) and (x<xsize) and (y+1>=0) and (y+1<ysize) ){
            fin_l[2] = fin_g2[i+xsize];
        } else { fin_l[2] = fin_g2[i]; }       
        if ( (x+1>=0) and (x+1<xsize) and (y>=0) and (y<ysize) ){
            fin_l[3] = fin_g3[i+1];
        } else { fin_l[3] = fin_g3[i]; }    
        if ( (x>=0) and (x<xsize) and (y-1>=0) and (y-1<ysize) ){
            fin_l[4] = fin_g4[i-xsize];
        } else { fin_l[4] = fin_g4[i]; }        
        if ( (x-1>=0) and (x-1<xsize) and (y+1>=0) and (y+1<ysize) ){
            fin_l[5] = fin_g5[i-1+xsize];
        } else { fin_l[5] = fin_g5[i]; }        
        if ( (x+1>=0) and (x+1<xsize) and (y+1>=0) and (y+1<ysize) ){
            fin_l[6] = fin_g6[i+1+xsize];
        } else { fin_l[6] = fin_g6[i]; }        
        if ( (x+1>=0) and (x+1<xsize) and (y-1>=0) and (y-1<ysize) ){
            fin_l[7] = fin_g7[i+1-xsize];
        } else { fin_l[7] = fin_g7[i]; }        
        if ( (x-1>=0) and (x-1<xsize) and (y-1>=0) and (y-1<ysize) ){
            fin_l[8] = fin_g8[i-1-xsize];
        } else { fin_l[8] = fin_g8[i]; }
        // Applying boundary conditions
        
        //__syncthreads();
        
        if (x ==0 ){
            fin_l[1] = feq_g1[i] - feq_g3[i] + fin_l[3];
            fin_l[5] = feq_g5[i] - feq_g7[i] + fin_l[7];
            fin_l[8] = feq_g8[i] - feq_g6[i] + fin_l[6];
        }else if(x==xsize-1){
            fin_l[3] = -feq_g1[i] + feq_g3[i] + fin_l[1];
            fin_l[6] = -feq_g8[i] + feq_g6[i] + fin_l[8];
            fin_l[7] = -feq_g5[i] + feq_g7[i] + fin_l[5];
        }
                 
        if (y==ysize-1){   
            fin_l[2] = -feq_g4[i] + feq_g2[i] + fin_l[4];
            fin_l[5] = -feq_g7[i] + feq_g5[i] + fin_l[7];
            fin_l[6] = -feq_g8[i] + feq_g6[i] + fin_l[8];
        }else if (y==0){     
            fin_l[4] = -feq_g2[i] + feq_g4[i] + fin_l[2];
            fin_l[7] = -feq_g5[i] + feq_g7[i] + fin_l[5];
            fin_l[8] = -feq_g6[i] + feq_g8[i] + fin_l[6];
        } 
        
        rho_l = fin_l[0]+fin_l[1]+fin_l[2]+fin_l[3]+fin_l[4]+fin_l[5]+fin_l[6]+fin_l[7]+fin_l[8];
        
        u_l[0] = (c_c[0]*fin_l[0]+c_c[2]*fin_l[1]+c_c[4]*fin_l[2]+c_c[6]*fin_l[3]+c_c[8]*fin_l[4]+c_c[10]*fin_l[5]+c_c[12]*fin_l[6]+c_c[14]*fin_l[7]+c_c[16]*fin_l[8])/rho_l;
        u_l[1] = (c_c[1]*fin_l[0]+c_c[3]*fin_l[1]+c_c[5]*fin_l[2]+c_c[7]*fin_l[3]+c_c[9]*fin_l[4]+c_c[11]*fin_l[5]+c_c[13]*fin_l[6]+c_c[15]*fin_l[7]+c_c[17]*fin_l[8])/rho_l;
            
        // BCs left wall, right wall, bottom wall and top wall
        
        if ( x == 0 or x == xsize-1 or y == ysize-1){
            u_l[0] =0;
            u_l[1] =0;    
        }           
        else if (y==0){ 
            rho_l = fin_l[0]+fin_l[1]+fin_l[3]+2*(fin_l[2]+fin_l[5]+fin_l[6]);
            u_l[0]= 0.08 ; // uLB; 
            u_l[1] =0;
        }
        
        u_g[0*d+i] = u_l[0]; u_g[1*d+i] = u_l[1];
        rho_g[i] = rho_l;

        cu[0] = 0 ;      
        feq_g0[i] = rho_l*t_c[0]*(1. + 3.0*cu[0] + 9*0.5*cu[0]*cu[0] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g0[i] = fin_l[0] - omega*(fin_l[0]-feq_g0[i]) ; 

        cu[1] = (u_l[0]) ;      
        feq_g1[i] = rho_l*t_c[1]*(1. + 3.0*cu[1] + 9*0.5*cu[1]*cu[1] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g1[i] = fin_l[1] - omega*(fin_l[1]-feq_g1[i]) ; 
        
        cu[2] = (u_l[1]) ;      
        feq_g2[i] = rho_l*t_c[2]*(1. + 3.0*cu[2] + 9*0.5*cu[2]*cu[2] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g2[i] = fin_l[2] - omega*(fin_l[2]-feq_g2[i]) ; 
        
        cu[3] = (-u_l[0]) ;      
        feq_g3[i] = rho_l*t_c[3]*(1. + 3.0*cu[3] + 9*0.5*cu[3]*cu[3] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g3[i] = fin_l[3] - omega*(fin_l[3]-feq_g3[i]) ; 
        
        cu[4] = (-u_l[1]) ;      
        feq_g4[i] = rho_l*t_c[4]*(1. + 3.0*cu[4] + 9*0.5*cu[4]*cu[4] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g4[i] = fin_l[4] - omega*(fin_l[4]-feq_g4[i]) ; 
        
        cu[5] = (u_l[0] + u_l[1]) ;      
        feq_g5[i] = rho_l*t_c[5]*(1. + 3.0*cu[5] + 9*0.5*cu[5]*cu[5] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g5[i] = fin_l[5] - omega*(fin_l[5]-feq_g5[i]) ; 
        
        cu[6] = (-u_l[0] + u_l[1]) ;      
        feq_g6[i] = rho_l*t_c[6]*(1. + 3.0*cu[6] + 9*0.5*cu[6]*cu[6] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g6[i] = fin_l[6] - omega*(fin_l[6]-feq_g6[i]) ; 
        
        cu[7] = (-u_l[0] -u_l[1]) ;      
        feq_g7[i] = rho_l*t_c[7]*(1. + 3.0*cu[7] + 9*0.5*cu[7]*cu[7] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g7[i] = fin_l[7] - omega*(fin_l[7]-feq_g7[i]) ; 
        
        cu[8] = (u_l[0] -u_l[1]) ;      
        feq_g8[i] = rho_l*t_c[8]*(1. + 3.0*cu[8] + 9*0.5*cu[8]*cu[8] - 3.0*0.5*(u_l[0]*u_l[0] + u_l[1]*u_l[1]));
        fin_g8[i] = fin_l[8] - omega*(fin_l[8]-feq_g8[i]) ; 
        
        
                   
    }

""" % (omega))
Example #14
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

import numpy

a = numpy.random.randn(5, 5)
a = a.astype(numpy.float32)

a_gpu = cuda.mem_alloc(a.nbytes)
cuda.memcpy_htod(a_gpu, a)

mod = SourceModule("""
  __global__ void doubles_matrix(float *a)
  {
    int idx = threadIdx.x + threadIdx.y*5;  // row stride must match the 5x5 matrix and block
    a[idx] *= 2;
  }
  """)

func = mod.get_function("doubles_matrix")
func(a_gpu, block=(5, 5, 1))

a_doubled = numpy.empty_like(a)
cuda.memcpy_dtoh(a_doubled, a_gpu)
print("ORIGINAL MATRIX")
print(a)
print("DOUBLED MATRIX AFTER PyCUDA EXECUTION")
print(a_doubled)
mod = SourceModule("""
#include <math.h>

__device__ int phase(float x0, float y0, float x1, float y1)
{    
    bool realGreater = (x0 >= x1);
    bool imagGreater = (y0 >= y1);

    if (realGreater && imagGreater)
    {
        return 1;
    }
    else if (!realGreater && imagGreater)
    {
        return 2;
    }
    else if (!realGreater && !imagGreater)
    {
        return 3;
    };

    return 4;
}

__device__ float calculateNewCentreX(float x1, int previousResult, int n)
{
    if (previousResult == 1 || previousResult == 4)
    {
        return x1 + powf(2, n - 2);
    };

    return x1 - powf(2, n - 2);
}

__device__ float calculateNewCentreY(float y1, int previousResult, int n)
{
    if (previousResult == 1 || previousResult == 2)
    {
        return y1 + powf(2, n - 2);
    };

    return y1 - powf(2, n - 2);
}

__global__ void getGridValue(int *regexStringGrid, int n)
{
    int real = blockDim.x * blockIdx.x + threadIdx.x;
    int imag = blockDim.y * blockIdx.y + threadIdx.y;
    int dim = gridDim.x * (blockIdx.y * blockDim.y + threadIdx.y) + blockDim.x * blockIdx.x + threadIdx.x;

    float x1 = powf(2, n - 1);
    float y1 = powf(2, n - 1);

    for (int i = 0; i < n; ++i)
    {
        int value = phase(real, imag, x1, y1);
        x1 = calculateNewCentreX(x1, value, n - i);
        y1 = calculateNewCentreY(y1, value, n - i);
        regexStringGrid[dim] += value * powf(10, i);
    }
}""")
Example #16
descr.flags = 0
tcex_gpu = cuda.Array(descr)
tcey_gpu = cuda.Array(descr)
tcez_gpu = cuda.Array(descr)

mcopy = cuda.Memcpy3D()
mcopy.width_in_bytes = mcopy.src_pitch = f.strides[1]
mcopy.src_height = mcopy.height = ny
mcopy.depth = nx
arrcopy(mcopy, cex, tcex_gpu)
arrcopy(mcopy, cey, tcey_gpu)
arrcopy(mcopy, cez, tcez_gpu)

# prepare kernels
from pycuda.compiler import SourceModule
mod = SourceModule( kernels.replace('Dx',str(Dx)).replace('Dy',str(Dy)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)) )
update_h = mod.get_function("update_h")
update_e = mod.get_function("update_e")
update_src = mod.get_function("update_src")

tcex = mod.get_texref("tcex")
tcey = mod.get_texref("tcey")
tcez = mod.get_texref("tcez")
tcex.set_array(tcex_gpu)
tcey.set_array(tcey_gpu)
tcez.set_array(tcez_gpu)

update_h.prepare("iPPPPPP", block=(Dx,Dy,1))
update_e.prepare("iPPPPPP", block=(Dx,Dy,1), texrefs=[tcex,tcey,tcez])
update_src.prepare("fP", block=(nz,1,1))
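# With the kernels prepared (old-style prepare() with the block baked in),
# the main FDTD loop would alternate H and E updates and inject the source
# each step. A hedged sketch only: tmax, the grid tuple, the leading int
# argument of the "iPPPPPP" signature, and the field buffers
# (ex_gpu ... hz_gpu, assumed allocated with cuda.to_device as in the
# surrounding fragments) are placeholders, not values from the original.
import numpy as np

grid = (nx * ny // (Dx * Dy), 1)
tmax = 1000
for tn in range(1, tmax + 1):
    update_h.prepared_call(grid, np.int32(ny * nz),
                           ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu)
    update_e.prepared_call(grid, np.int32(ny * nz),
                           ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu)
    update_src.prepared_call((1, 1), np.float32(tn), ez_gpu)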
Example #17
mod = SourceModule("""
__global__ void matmul2(int n, float *a, float *b, float *c)
{
  int row_num_a,col_num_a,row_num_b,col_num_b,row_num_c,col_num_c;
  row_num_a=col_num_a=row_num_b=col_num_b=row_num_c=col_num_c = n;
  __shared__ float A[32][32];//2D array for storing shared matrix values of A & B
  __shared__ float B[32][32];//Process subMatrix in block

  
  int Col=blockIdx.x*blockDim.x+threadIdx.x;//Col and Row Ids of threads
  int Row=blockIdx.y*blockDim.y+threadIdx.y;
  double temp = 0;
  for (int i = 0; i < (col_num_a-1)/blockDim.x+1; ++i) 
  {
     if (Row < row_num_a && i*blockDim.x+threadIdx.x < col_num_a)
        A[threadIdx.y][threadIdx.x] = a[Row*col_num_a + i*blockDim.x+threadIdx.x];//Memory Fetch from a
     else
        A[threadIdx.y][threadIdx.x] = 0;//In case the block dim is not a multiple of matrix

     if (Col < col_num_b && i*blockDim.x+threadIdx.y < row_num_b)
        B[threadIdx.y][threadIdx.x] = b[(i*blockDim.x+threadIdx.y)*col_num_b+Col];//Memory Fetch from b
     else
        B[threadIdx.y][threadIdx.x] = 0;

     __syncthreads();//Wait for all matrix loads to shared memory - then proceed with for loop
      if (Row < row_num_c && Col < col_num_c)
         
      for (int j = 0; j < blockDim.x; ++j)//Matrix multiplication
              temp += A[threadIdx.y][j] * B[j][threadIdx.x];
     __syncthreads();
  }
    if(Row<row_num_c && Col<col_num_c)//If the matrix is needed, then do this
       c[Row*col_num_c+Col] = (float)temp;//Save to c
  
  
}""")
Example #18
import time

import numpy as np
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule

from shared_gpu_kernels import gen_kernel, transform_to_cuda

from config import SIGNIFICANT_LENGTH, SIZE, MT_N, M, STATE_SIZE, TEST_ITERATIONS


MT_state_result = np.zeros((SIGNIFICANT_LENGTH, SIZE)).astype(np.uint32)

Stream = drv.Stream()
Stream2 = drv.Stream()

MT_state_buf = drv.mem_alloc(SIZE * MT_N * 4)
MT_state_res_buf = drv.mem_alloc(MT_state_result.nbytes)

prg = SourceModule(
    transform_to_cuda(
        gen_kernel(MT_N, STATE_SIZE, M, SIZE, SIGNIFICANT_LENGTH)
    )
)
prog = prg.get_function('mt_brute')

zzz = time.time()

ev = prog(np.uint32(0), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE//STATE_SIZE, 1), stream=Stream)
drv.memcpy_dtoh_async(MT_state_result, MT_state_res_buf, stream=Stream2)

for i in range(TEST_ITERATIONS):
    prog(np.uint32(i*SIZE), MT_state_buf, MT_state_res_buf, block=(STATE_SIZE, 1, 1), grid=(SIZE//STATE_SIZE, 1), stream=Stream)
    drv.memcpy_dtoh(MT_state_result, MT_state_res_buf)#, stream=Stream2)

zzz = time.time() - zzz
print('>>>', zzz)
Example #19
mod = SourceModule("""
#include <math.h>

__global__ void fitness(double *x, double *y, double eps, double a, double b, double c, double d)
{
  const int i = threadIdx.x + blockDim.x * blockIdx.x;
  
  double tempx = x[i];
  double tempy = y[i];
  
  //double avgfit = (x[i]*(x[i]*a + y[i]*c) + y[i]*(x[i]*b+y[i]*d))/pow(x[i]+y[i],2);
  
  double pop = tempx + tempy; 
  double lambda1 = 2;
  double lambda2 = .02;
  double avgfit = lambda1 - lambda2*pop;
  
  y[i] = tempy + tempy*eps*(   (tempx*c + tempy*d)/(tempx+tempy)  +  avgfit );
  x[i] = tempx + tempx*eps*(   (tempx*a + tempy*b)/(tempx+tempy)  +  avgfit );

}


__global__ void diffuse_display(double *picture, double *destx, double *desty, double *x, double *y, double mu1, double mu2, double eps, int side)
{
  const int i = threadIdx.x + blockDim.x * blockIdx.x;
  const int ROW    = i/side;
  const int COLUMN = i%side;
  
  const int N   =   ROW*side               + (COLUMN+side-1)%side;
  const int S   =   ROW*side               + (COLUMN +1)%side;
  const int E   = ((ROW+1)%side)*side      +  COLUMN;
  const int W   = ((ROW+side-1)%side)*side +  COLUMN;
  const int NE  = ((ROW+1)%side)*side      + (COLUMN+side-1)%side;
  const int NW  = ((ROW-1+side)%side)*side + (COLUMN+side-1)%side;
  const int SE  = ((ROW+1)%side)*side      + (COLUMN+1)%side;
  const int SW  = ((ROW+side-1)%side)*side + (COLUMN+1)%side;
  
  double mu1eps = mu1*eps;
  double mu2eps = mu2*eps;
  
  double d = 4+4/sqrt(2.0);
      
  
  destx[i] = (1-mu1eps)*x[i] + (( mu1eps ) / d ) * (x[N]+x[S]+x[W]+x[E]) + ( (1/sqrt(2.0) ) * mu1eps / d )*(x[NE]+x[NW]+x[SE]+x[SW]);
  desty[i] = (1-mu2eps)*y[i] + (( mu2eps ) / d ) * (y[N]+y[S]+y[W]+y[E]) + ( (1/sqrt(2.0) ) * mu2eps / d )*(y[NE]+y[NW]+y[SE]+y[SW]);

  
  picture[i] = destx[i]/(destx[i]+desty[i]);
}
""")
Example #20
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

import numpy as np

from cutools import mt_rand

mod = SourceModule("""
#include "mt_rand.cu.h"
__global__ void cu_rand_test(float *x, int N)
{
  unsigned int idx = __mul24(blockIdx.x, blockDim.x) + threadIdx.x;
  if (idx < N)
  {
    // initialize the MT
    MersenneTwisterState mtState;
    MersenneTwisterInitialise(mtState, idx);

    //
    x[idx] = mt_rand(mtState, idx);
  }
}

    """,
                   include_dirs=[mt_rand.get_include_dir()])

# seed the random number generator
mt_rand.seed(cuda, mod)

cu_rand = mod.get_function("cu_rand_test")

asize = 10000
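# The fragment stops right after setting asize; a hedged completion (the
# block size and the use of cuda.Out are assumptions; the kernel signature
# is (float *x, int N) as declared above):
import numpy as np

x = np.zeros(asize, dtype=np.float32)
threads = 256
blocks = (asize + threads - 1) // threads
cu_rand(cuda.Out(x), np.int32(asize),
        block=(threads, 1, 1), grid=(blocks, 1))
print(x[:10])   # uniform samples produced on the GPU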
Example #21
#!python
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
import numpy as np

from pycuda.compiler import SourceModule
func_mod = SourceModule("""
template <class T>
__device__ T incr(T x) {
    return (x + 1.0);
}

// Needed to avoid name mangling so that PyCUDA can
// find the kernel function:
extern "C" {
    __global__ void func(float *a, int N)
    {
        int idx = threadIdx.x;
        if (idx < N)
            a[idx] = incr(a[idx]);
    }
}
""",
                        no_extern_c=1)

func = func_mod.get_function('func')

N = 5
x = np.asarray(np.random.rand(N), np.float32)
x_orig = x.copy()
x_gpu = gpuarray.to_gpu(x)
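# The snippet ends before the kernel is actually launched; a hedged
# completion of the example:
func(x_gpu, np.int32(N), block=(N, 1, 1), grid=(1, 1))
x_new = x_gpu.get()
print('x:      ', x_orig)
print('incr(x):', x_new)   # every element incremented by 1.0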
Example #22
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import numpy
mod = SourceModule(r"""
void __global__ add(const float *x, const float *y, float *z)
{
    const int n = blockDim.x * blockIdx.x + threadIdx.x;
    z[n] = x[n] + y[n];
}
""")
add = mod.get_function("add")
num = 6
A = numpy.random.rand(num)
B = numpy.random.rand(num)
C = numpy.zeros(num)
A_GPU = gpuarray.to_gpu(A.astype(numpy.float32))
B_GPU = gpuarray.to_gpu(B.astype(numpy.float32))
C_GPU = gpuarray.to_gpu(C.astype(numpy.float32))
add(A_GPU, B_GPU, C_GPU, numpy.int32(num), grid=(2,), block=(4,1,1))
C = C_GPU.get()
print('A=', A)
print('B=', B)
print('C=', C)

# memory allocate
f = np.zeros((nx,ny,nz), dtype=np.float64)
cf = np.ones_like(f)*0.5

eh_gpus = ex_gpu, ey_gpu, ez_gpu, hx_gpu, hy_gpu, hz_gpu = [cuda.to_device(f) for i in range(6)]
ce_gpus = cex_gpu, cey_gpu, cez_gpu = [cuda.to_device(cf) for i in range(3)]


# prepare kernels
tpb = 256
for bpg in range(65535, 0, -1):
	if (nx * ny * nz // tpb) % bpg == 0: break
print('tpb = %d, bpg = %g' % (tpb, bpg))

from pycuda.compiler import SourceModule
mod = SourceModule( kernels.replace('Dx',str(tpb)).replace('nxyz',str(nx*ny*nz)).replace('nyz',str(ny*nz)).replace('nx',str(nx)).replace('ny',str(ny)).replace('nz',str(nz)), options=['-m64'] )
update_h = mod.get_function("update_h")
update_e = mod.get_function("update_e")
update_src = mod.get_function("update_src")

update_h.prepare("PPPPPP", block=(tpb,1,1))
update_e.prepare("PPPPPPPPP", block=(tpb,1,1))
update_src.prepare("fP", block=(nz,1,1))


# prepare for plot
#import matplotlib.pyplot as plt
#plt.ion()
#imsh = plt.imshow(np.ones((nx,ny),'f').T, cmap=plt.cm.hot, origin='lower', vmin=0, vmax=0.005)
#plt.colorbar()
Example #24
    def __init__(self, img_size, **kwargs):
        self.num = CorrelStage.num
        CorrelStage.num += 1
        self.verbose = kwargs.get("verbose", 0)
        self.debug(2, "Initializing with resolution", img_size)
        self.h, self.w = img_size
        self._ready = False
        self.nbIter = kwargs.get("iterations", 5)
        self.showDiff = kwargs.get("show_diff", False)
        if self.showDiff:
            import cv2
            cv2.namedWindow("Residual",
                            cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
        self.mul = kwargs.get("mul", 3)
        # These two store the values of the last resampled array
        # It is meant to allocate output array only once (see resampleD)
        self.rX, self.rY = -1, -1
        # self.loop will be incremented every time getDisp is called
        # It will be used to measure performance and output some info
        self.loop = 0

        # Allocating stuff #

        # Grid and block for kernels called with the size of the image #
        # All the images and arrays in the kernels will be in order (x,y)
        self.grid = (int(ceil(self.w / 32)), int(ceil(self.h / 32)))
        self.block = (int(ceil(self.w / self.grid[0])),
                      int(ceil(self.h / self.grid[1])), 1)
        self.debug(3, "Default grid:", self.grid, "block", self.block)

        # We need the number of fields to allocate the G tables #
        self.Nfields = kwargs.get("Nfields")
        if self.Nfields is None:
            self.Nfields = len(kwargs.get("fields")[0])

        # Allocating everything we need #
        self.devG = []
        self.devFieldsX = []
        self.devFieldsY = []
        for i in range(self.Nfields):
            # devG stores the G arrays (to compute the research direction)
            self.devG.append(gpuarray.empty(img_size, np.float32))
            # devFieldsX/Y store the fields value along X and Y
            self.devFieldsX.append(gpuarray.empty((self.h, self.w),
                                                  np.float32))
            self.devFieldsY.append(gpuarray.empty((self.h, self.w),
                                                  np.float32))
        # devH Stores the Hessian matrix
        self.H = np.zeros((self.Nfields, self.Nfields), np.float32)
        # And devHi stores its invert
        self.devHi = gpuarray.empty((self.Nfields, self.Nfields), np.float32)
        # devOut is written with the difference of the images
        self.devOut = gpuarray.empty((self.h, self.w), np.float32)
        # devX stores the value of the parameters (what is actually computed)
        self.devX = gpuarray.empty((self.Nfields), np.float32)
        # to store the research direction
        self.devVec = gpuarray.empty((self.Nfields), np.float32)
        # To store the original image on the device
        self.devOrig = gpuarray.empty(img_size, np.float32)
        # To store the gradient along X of the original image on the device
        self.devGradX = gpuarray.empty(img_size, np.float32)
        # And along Y
        self.devGradY = gpuarray.empty(img_size, np.float32)

        # Locating the kernel file #
        kernelFile = kwargs.get("kernel_file")
        if kernelFile is None:
            self.debug(2, "Kernel file not specified")
            from crappy import __path__ as crappyPath
            kernelFile = crappyPath[0] + "/data/kernels.cu"
        # Reading kernels and compiling module #
        with open(kernelFile, "r") as f:
            self.debug(3, "Sourcing module")
            self.mod = SourceModule(f.read() % (self.w, self.h, self.Nfields))
        # Assigning functions to the kernels #
        # These kernels are defined in data/kernels.cu
        self._resampleOrigKrnl = self.mod.get_function('resampleO')
        self._resampleKrnl = self.mod.get_function('resample')
        self._gradientKrnl = self.mod.get_function('gradient')
        self._makeGKrnl = self.mod.get_function('makeG')
        self._makeDiff = self.mod.get_function('makeDiff')
        self._dotKrnl = self.mod.get_function('myDot')
        self._addKrnl = self.mod.get_function('kadd')
        # These ones use pyCuda reduction module to generate efficient kernels
        self._mulRedKrnl = ReductionKernel(np.float32,
                                           neutral="0",
                                           reduce_expr="a+b",
                                           map_expr="x[i]*y[i]",
                                           arguments="float *x, float *y")
        self._leastSquare = ReductionKernel(np.float32,
                                            neutral="0",
                                            reduce_expr="a+b",
                                            map_expr="x[i]*x[i]",
                                            arguments="float *x")
        # We could have used mulRedKrnl(x, x), but this is probably faster.

        # Getting texture references #
        self.tex = self.mod.get_texref('tex')
        self.tex_d = self.mod.get_texref('tex_d')
        self.texMask = self.mod.get_texref('texMask')
        # Setting proper flags #
        # All textures use normalized coordinates except for the mask
        for t in [self.tex, self.tex_d]:
            t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
        for t in [self.tex, self.tex_d, self.texMask]:
            t.set_filter_mode(cuda.filter_mode.LINEAR)
            t.set_address_mode(0, cuda.address_mode.BORDER)
            t.set_address_mode(1, cuda.address_mode.BORDER)

        # Preparing kernels for less overhead when called #
        self._resampleOrigKrnl.prepare("Pii", texrefs=[self.tex])
        self._resampleKrnl.prepare("Pii", texrefs=[self.tex_d])
        self._gradientKrnl.prepare("PP", texrefs=[self.tex])
        self._makeDiff.prepare("PPPP",
                               texrefs=[self.tex, self.tex_d, self.texMask])
        self._addKrnl.prepare("PfP")
        # Reading original image if provided #
        if kwargs.get("img") is not None:
            self.setOrig(kwargs.get("img"))
        # Reading fields if provided #
        if kwargs.get("fields") is not None:
            self.setFields(kwargs.get("fields"))
        # Reading mask if provided #
        if kwargs.get("mask") is not None:
            self.setMask(kwargs.get("mask"))
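# For context: the two ReductionKernel instances above generate fused
# map+reduce kernels. A minimal standalone sketch of the same pattern
# (names and sizes here are illustrative):
import numpy as np
import pycuda.autoinit
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

dot = ReductionKernel(np.float32, neutral="0", reduce_expr="a+b",
                      map_expr="x[i]*y[i]", arguments="float *x, float *y")
x = gpuarray.to_gpu(np.random.rand(1000).astype(np.float32))
y = gpuarray.to_gpu(np.random.rand(1000).astype(np.float32))
print(dot(x, y).get(), np.dot(x.get(), y.get()))   # should agree closely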
Example #25
lines = numpy.array(data, dtype=str)

# Allocate cuda memory for input data
lines_gpu = cuda.mem_alloc(lines.size * lines.dtype.itemsize)
# Copy data from host to device
cuda.memcpy_htod(lines_gpu, lines)

# Dimensions
blocks = len(data)
thread_per_block = lines.dtype.itemsize
nbr_values = lines.size * lines.dtype.itemsize

# Create destination array
dest = numpy.zeros((nbr_values, ), dtype=numpy.int32)

# Allocate cuda memory for output data
dest_gpu = cuda.mem_alloc(dest.size * dest.dtype.itemsize)

# Define cuda function
mod = SourceModule("""
__global__ void process(int **dest, char **line){
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    dest[index] = (int *)line[index];
}
""")

# Run GPU model
gpu_func = mod.get_function("process")
gpu_func(dest_gpu, lines_gpu, grid=(blocks, 1), block=(thread_per_block, 1, 1))
cuda.memcpy_dtoh(dest, dest_gpu)
print(dest)
Example #26
    def __init__(self, img_size, **kwargs):
        cuda.init()
        from pycuda.tools import make_default_context
        global context
        context = make_default_context()
        unknown = []
        for k in kwargs.keys():
            if k not in [
                    'verbose', 'levels', 'resampling_factor', 'kernel_file',
                    'iterations', 'show_diff', 'Nfields', 'img', 'fields',
                    'mask', 'mul'
            ]:
                unknown.append(k)
        if len(unknown) != 0:
            warnings.warn(
                "Unrecognized parameter" +
                ('s: ' + str(unknown) if len(unknown) > 1 else ': ' +
                 unknown[0]), SyntaxWarning)
        self.verbose = kwargs.get("verbose", 0)
        self.debug(
            3, "You set the verbose level to the maximum.\n\
It may help finding bugs or tracking errors but it may also \
impact the program performance as it will print A LOT of \
output and add GPU->CPU copies only to print information.\n\
If it is not desired, consider lowering the verbosity: \
1 or 2 is a reasonable choice, \
0 won't show anything except for errors.")
        self.levels = kwargs.get("levels", 5)
        self.loop = 0
        self.resamplingFactor = kwargs.get("resampling_factor", 2)
        h, w = img_size
        self.nbIter = kwargs.get("iterations", 4)
        self.debug(1, "Initializing... Master resolution:", img_size,
                   "levels:", self.levels, "verbosity:", self.verbose)

        # Computing dimensions of the different levels #
        self.h, self.w = [], []
        for i in range(self.levels):
            self.h.append(int(round(h / (self.resamplingFactor**i))))
            self.w.append(int(round(w / (self.resamplingFactor**i))))

        if kwargs.get("Nfields") is not None:
            self.Nfields = kwargs.get("Nfields")
        else:
            try:
                self.Nfields = len(kwargs["fields"])
            except KeyError:
                self.debug(
                    0, "Error! You must provide the number of fields at init. \
Add Nfields=x or directly set fields with fields=list/tuple")
                raise ValueError

        kernelFile = kwargs.get("kernel_file")
        if kernelFile is None:
            self.debug(
                3, "Kernel file not specified, using the one in crappy dir")
            from crappy import __path__ as crappyPath
            kernelFile = crappyPath[0] + "/data/kernels.cu"
        self.debug(3, "Kernel file:", kernelFile)

        # Creating a new instance of CorrelStage for each stage #
        self.correl = []
        for i in range(self.levels):
            self.correl.append(
                CorrelStage((self.h[i], self.w[i]),
                            verbose=self.verbose,
                            Nfields=self.Nfields,
                            iterations=self.nbIter,
                            show_diff=(i == 0
                                       and kwargs.get("show_diff", False)),
                            mul=kwargs.get("mul", 3),
                            kernel_file=kernelFile))

        # Set original image if provided #
        if kwargs.get("img") is not None:
            self.setOrig(kwargs.get("img"))

        s = """
    texture<float, cudaTextureType2D, cudaReadModeElementType> texFx{0};
    texture<float, cudaTextureType2D, cudaReadModeElementType> texFy{0};
    __global__ void resample{0}(float* outX, float* outY, int x, int y)
    {{
      int idx = blockIdx.x*blockDim.x+threadIdx.x;
      int idy = blockIdx.y*blockDim.y+threadIdx.y;
      if(idx < x && idy < y)
      {{
        outX[idy*x+idx] = tex2D(texFx{0},(float)idx/x, (float)idy/y);
        outY[idy*x+idx] = tex2D(texFy{0},(float)idx/x, (float)idy/y);
      }}
    }}
    """
        self.src = ""
        for i in range(self.Nfields):
            self.src += s.format(
                i)  # Adding textures for the quick fields resampling

        self.mod = SourceModule(self.src)

        self.texFx = []
        self.texFy = []
        self.resampleF = []
        for i in range(self.Nfields):
            self.texFx.append(self.mod.get_texref("texFx%d" % i))
            self.texFy.append(self.mod.get_texref("texFy%d" % i))
            self.resampleF.append(self.mod.get_function("resample%d" % i))
            self.resampleF[i].prepare("PPii",
                                      texrefs=[self.texFx[i], self.texFy[i]])

        for t in self.texFx + self.texFy:
            t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
            t.set_filter_mode(cuda.filter_mode.LINEAR)
            t.set_address_mode(0, cuda.address_mode.BORDER)
            t.set_address_mode(1, cuda.address_mode.BORDER)

        # Set fields if provided #
        if kwargs.get("fields") is not None:
            self.setFields(kwargs.get("fields"))

        if kwargs.get("mask") is not None:
            self.setMask(kwargs.get("mask"))
Example #27
ker = SourceModule('''
// row-column dot-product for matrix multiplication
__device__ float rowcol_dot(float *matrix_a, float *matrix_b, int row, int col, int N)
{
    printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d, N is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col, N);
	float val = 0;

	for (int k=0; k < N; k++)
	{
	    if(threadIdx.x == 0 && threadIdx.y == 0 && blockIdx.x == 0 && blockIdx.y == 0)
            printf("Dot-product loop: k value is %d, matrix_a value is %f, matrix_b is %f.\\n", k, matrix_a[ row + k*N ], matrix_b[ col*N + k]);
        val += matrix_a[ row*N + k ] * matrix_b[ col + k*N];
	}

	return(val);

}

// matrix multiplication kernel that is parallelized over row/column tuples.
__global__ void matrix_mult_ker(float * matrix_a, float * matrix_b, float * output_matrix, int N)
{ 

    int row = blockIdx.x*blockDim.x + threadIdx.x;
    int col = blockIdx.y*blockDim.y + threadIdx.y;
    printf("threadIdx.x,y: %d,%d blockIdx.x,y: %d,%d -- row is %d, col is %d.\\n", threadIdx.x, threadIdx.y, blockIdx.x, blockIdx.y, row, col);

	output_matrix[col + row*N] = rowcol_dot(matrix_a, matrix_b, row, col, N);

}
''')
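# A hedged driver for the (intentionally chatty) kernel above; keep N tiny,
# since every thread prints:
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

N = 4
a = np.random.randn(N, N).astype(np.float32)
b = np.random.randn(N, N).astype(np.float32)
out = np.zeros((N, N), dtype=np.float32)

matrix_mult_ker = ker.get_function("matrix_mult_ker")
matrix_mult_ker(cuda.In(a), cuda.In(b), cuda.Out(out), np.int32(N),
                block=(N, N, 1), grid=(1, 1))
print(np.allclose(out, a @ b, atol=1e-4))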
Example #28
def solve_gpu(currentmodelrun, modelend, G):
    """Solving using FDTD method on GPU. Implemented using Nvidia CUDA.

    Args:
        currentmodelrun (int): Current model run number.
        modelend (int): Number of last model to run.
        G (class): Grid class instance - holds essential parameters describing the model.

    Returns:
        tsolve (float): Time taken to execute solving.
        memsolve (int): GPU memory (bytes) in use on the final iteration.
    """

    import pycuda.driver as drv
    from pycuda.compiler import SourceModule
    drv.init()

    # Create device handle and context on specific GPU device (and make it the current context)
    dev = drv.Device(G.gpu.deviceID)
    ctx = dev.make_context()

    # Electric and magnetic field updates - prepare kernels, and get kernel functions
    if Material.maxpoles > 0:
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=G.updatecoeffsdispersive.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=G.Tx.shape[1], NY_T=G.Tx.shape[2], NZ_T=G.Tx.shape[3]))
    else:   # Set to one any substitutions for dispersive materials
        kernels_fields = SourceModule(kernels_template_fields.substitute(REAL=cudafloattype, COMPLEX=cudacomplextype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_MATDISPCOEFFS=1, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3], NX_T=1, NY_T=1, NZ_T=1))
    update_e_gpu = kernels_fields.get_function("update_e")
    update_h_gpu = kernels_fields.get_function("update_h")

    # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for fields kernels
    updatecoeffsE = kernels_fields.get_global('updatecoeffsE')[0]
    updatecoeffsH = kernels_fields.get_global('updatecoeffsH')[0]
    if G.updatecoeffsE.nbytes + G.updatecoeffsH.nbytes > G.gpu.constmem:
        raise GeneralError('Too many materials in the model to fit onto constant memory of size {} on {} - {} GPU'.format(human_size(G.gpu.constmem), G.gpu.deviceID, G.gpu.name))
    else:
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)

    # Electric and magnetic field updates - dispersive materials - get kernel functions and initialise array on GPU
    if Material.maxpoles > 0:  # If there are any dispersive materials (updates are split into two parts as they require present and updated electric field values).
        update_e_dispersive_A_gpu = kernels_fields.get_function("update_e_dispersive_A")
        update_e_dispersive_B_gpu = kernels_fields.get_function("update_e_dispersive_B")
        G.gpu_initialise_dispersive_arrays()

    # Electric and magnetic field updates - set blocks per grid and initialise field arrays on GPU
    G.gpu_set_blocks_per_grid()
    G.gpu_initialise_arrays()

    # PML updates
    if G.pmls:
        # Prepare kernels
        kernels_pml = SourceModule(kernels_template_pml.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_R=G.pmls[0].ERA.shape[1], NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]))
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for PML kernels
        updatecoeffsE = kernels_pml.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_pml.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        # Set block per grid, initialise arrays on GPU, and get kernel functions
        for pml in G.pmls:
            pml.gpu_set_blocks_per_grid(G)
            pml.gpu_initialise_arrays()
            pml.gpu_get_update_funcs(kernels_pml)

    # Receivers
    if G.rxs:
        # Initialise arrays on GPU
        rxcoords_gpu, rxs_gpu = gpu_initialise_rx_arrays(G)
        # Prepare kernel and get kernel function
        kernel_store_outputs = SourceModule(kernel_template_store_outputs.substitute(REAL=cudafloattype, NY_RXCOORDS=3, NX_RXS=6, NY_RXS=G.iterations, NZ_RXS=len(G.rxs), NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]))
        store_outputs_gpu = kernel_store_outputs.get_function("store_outputs")

    # Sources - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.voltagesources + G.hertziandipoles + G.magneticdipoles:
        kernels_sources = SourceModule(kernels_template_sources.substitute(REAL=cudafloattype, N_updatecoeffsE=G.updatecoeffsE.size, N_updatecoeffsH=G.updatecoeffsH.size, NY_MATCOEFFS=G.updatecoeffsE.shape[1], NY_SRCINFO=4, NY_SRCWAVES=G.iterations, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2], NX_ID=G.ID.shape[1], NY_ID=G.ID.shape[2], NZ_ID=G.ID.shape[3]))
        # Copy material coefficient arrays to constant memory of GPU (must be <64KB) for source kernels
        updatecoeffsE = kernels_sources.get_global('updatecoeffsE')[0]
        updatecoeffsH = kernels_sources.get_global('updatecoeffsH')[0]
        drv.memcpy_htod(updatecoeffsE, G.updatecoeffsE)
        drv.memcpy_htod(updatecoeffsH, G.updatecoeffsH)
        if G.hertziandipoles:
            srcinfo1_hertzian_gpu, srcinfo2_hertzian_gpu, srcwaves_hertzian_gpu = gpu_initialise_src_arrays(G.hertziandipoles, G)
            update_hertzian_dipole_gpu = kernels_sources.get_function("update_hertzian_dipole")
        if G.magneticdipoles:
            srcinfo1_magnetic_gpu, srcinfo2_magnetic_gpu, srcwaves_magnetic_gpu = gpu_initialise_src_arrays(G.magneticdipoles, G)
            update_magnetic_dipole_gpu = kernels_sources.get_function("update_magnetic_dipole")
        if G.voltagesources:
            srcinfo1_voltage_gpu, srcinfo2_voltage_gpu, srcwaves_voltage_gpu = gpu_initialise_src_arrays(G.voltagesources, G)
            update_voltage_source_gpu = kernels_sources.get_function("update_voltage_source")

    # Snapshots - initialise arrays on GPU, prepare kernel and get kernel functions
    if G.snapshots:
        # Initialise arrays on GPU
        snapEx_gpu, snapEy_gpu, snapEz_gpu, snapHx_gpu, snapHy_gpu, snapHz_gpu = gpu_initialise_snapshot_array(G)
        # Prepare kernel and get kernel function
        kernel_store_snapshot = SourceModule(kernel_template_store_snapshot.substitute(REAL=cudafloattype, NX_SNAPS=Snapshot.nx_max, NY_SNAPS=Snapshot.ny_max, NZ_SNAPS=Snapshot.nz_max, NX_FIELDS=G.Ex.shape[0], NY_FIELDS=G.Ex.shape[1], NZ_FIELDS=G.Ex.shape[2]))
        store_snapshot_gpu = kernel_store_snapshot.get_function("store_snapshot")

    # Iteration loop timer
    iterstart = drv.Event()
    iterend = drv.Event()
    iterstart.record()

    for iteration in tqdm(range(G.iterations), desc='Running simulation, model ' + str(currentmodelrun) + '/' + str(modelend), ncols=get_terminal_width() - 1, file=sys.stdout, disable=G.tqdmdisable):

        # Get GPU memory usage on final iteration
        if iteration == G.iterations - 1:
            memsolve = drv.mem_get_info()[1] - drv.mem_get_info()[0]

        # Store field component values for every receiver
        if G.rxs:
            store_outputs_gpu(np.int32(len(G.rxs)), np.int32(iteration),
                              rxcoords_gpu.gpudata, rxs_gpu.gpudata,
                              G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                              G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                              block=(1, 1, 1), grid=(round32(len(G.rxs)), 1, 1))

        # Store any snapshots
        for i, snap in enumerate(G.snapshots):
            if snap.time == iteration + 1:
                store_snapshot_gpu(np.int32(i), np.int32(snap.xs),
                                   np.int32(snap.xf), np.int32(snap.ys),
                                   np.int32(snap.yf), np.int32(snap.zs),
                                   np.int32(snap.zf), np.int32(snap.dx),
                                   np.int32(snap.dy), np.int32(snap.dz),
                                   G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                   G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                   snapEx_gpu.gpudata, snapEy_gpu.gpudata, snapEz_gpu.gpudata,
                                   snapHx_gpu.gpudata, snapHy_gpu.gpudata, snapHz_gpu.gpudata,
                                   block=Snapshot.tpb, grid=Snapshot.bpg)
                if G.snapsgpu2cpu:
                    gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                           snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

        # Update magnetic field components
        update_h_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                     G.ID_gpu.gpudata, G.Hx_gpu.gpudata, G.Hy_gpu.gpudata,
                     G.Hz_gpu.gpudata, G.Ex_gpu.gpudata, G.Ey_gpu.gpudata,
                     G.Ez_gpu.gpudata, block=G.tpb, grid=G.bpg)

        # Update magnetic field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_magnetic(G)

        # Update magnetic field components for magnetic dipole sources
        if G.magneticdipoles:
            update_magnetic_dipole_gpu(np.int32(len(G.magneticdipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_magnetic_gpu.gpudata, srcinfo2_magnetic_gpu.gpudata,
                                       srcwaves_magnetic_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.magneticdipoles)), 1, 1))

        # Update electric field components
        # If all materials are non-dispersive do standard update
        if Material.maxpoles == 0:
            update_e_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz), G.ID_gpu.gpudata,
                         G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                         G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                         block=G.tpb, grid=G.bpg)
        # If there are any dispersive materials do 1st part of dispersive update
        # (it is split into two parts as it requires present and updated electric field values).
        else:
            update_e_dispersive_A_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      G.Hx_gpu.gpudata, G.Hy_gpu.gpudata, G.Hz_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

        # Update electric field components with the PML correction
        for pml in G.pmls:
            pml.gpu_update_electric(G)

        # Update electric field components for voltage sources
        if G.voltagesources:
            update_voltage_source_gpu(np.int32(len(G.voltagesources)), np.int32(iteration),
                                      floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                      srcinfo1_voltage_gpu.gpudata, srcinfo2_voltage_gpu.gpudata,
                                      srcwaves_voltage_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=(1, 1, 1), grid=(round32(len(G.voltagesources)), 1, 1))

        # Update electric field components for Hertzian dipole sources (update any Hertzian dipole sources last)
        if G.hertziandipoles:
            update_hertzian_dipole_gpu(np.int32(len(G.hertziandipoles)), np.int32(iteration),
                                       floattype(G.dx), floattype(G.dy), floattype(G.dz),
                                       srcinfo1_hertzian_gpu.gpudata, srcinfo2_hertzian_gpu.gpudata,
                                       srcwaves_hertzian_gpu.gpudata, G.ID_gpu.gpudata,
                                       G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                       block=(1, 1, 1), grid=(round32(len(G.hertziandipoles)), 1, 1))

        # If there are any dispersive materials do 2nd part of dispersive update
        # (it is split into two parts as it requires present and updated electric
        # field values). It can therefore only be completed after the electric
        # field has been updated by the PML and source updates.
        if Material.maxpoles > 0:
            update_e_dispersive_B_gpu(np.int32(G.nx), np.int32(G.ny), np.int32(G.nz),
                                      np.int32(Material.maxpoles), G.updatecoeffsdispersive_gpu.gpudata,
                                      G.Tx_gpu.gpudata, G.Ty_gpu.gpudata, G.Tz_gpu.gpudata, G.ID_gpu.gpudata,
                                      G.Ex_gpu.gpudata, G.Ey_gpu.gpudata, G.Ez_gpu.gpudata,
                                      block=G.tpb, grid=G.bpg)

    # Copy output from receivers array back to correct receiver objects
    if G.rxs:
        gpu_get_rx_array(rxs_gpu.get(), rxcoords_gpu.get(), G)

    # Copy data from any snapshots back to correct snapshot objects
    if G.snapshots and not G.snapsgpu2cpu:
        for i, snap in enumerate(G.snapshots):
            gpu_get_snapshot_array(snapEx_gpu.get(), snapEy_gpu.get(), snapEz_gpu.get(),
                                   snapHx_gpu.get(), snapHy_gpu.get(), snapHz_gpu.get(), i, snap)

    iterend.record()
    iterend.synchronize()
    tsolve = iterstart.time_till(iterend) * 1e-3

    # Remove context from top of stack and delete
    ctx.pop()
    del ctx

    return tsolve, memsolve
Example #29
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
mod = SourceModule('''
__global__ void Text_GPU(float *A , float *B){
 
    int bid = blockIdx.x;  
    int tid = threadIdx.x;
 
    __shared__ float s_data[128];
 
    s_data[tid] = A[bid*128 + tid];
    __syncthreads();
 
    for(int i = 64;i>0;i/=2){
        if(tid < i)
            s_data[tid] = s_data[tid] + s_data[tid +i];
        __syncthreads();
    }
    if(tid == 0)
        B[bid] = s_data[0];
 
}
''')
Text_GPU = mod.get_function("Text_GPU")
A = np.ones((32, 128), dtype=np.float32)
B = np.ones((32,), dtype=np.float32)
Text_GPU(cuda.In(A), cuda.InOut(B), grid=(32, 1), block=(128, 1, 1))
print(B)
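# Each block reduces one 128-element row, so for the all-ones input every
# entry of B should equal 128.0; a quick host-side check:
print(np.allclose(B, A.sum(axis=1)))   # True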
mod = SourceModule("""
#include <stdio.h>

/*
  This function compute the force between two blobs
  with vector between blob centers r.

  In this example the force is derived from a Yukawa potential
  
  U = eps * exp(-r_norm / b) / r_norm
  
  with
  eps = potential strength
  r_norm = distance between blobs
  b = Debye length
*/
__device__ void blob_blob_force(const double rx, 
                                const double ry, 
                                const double rz, 
                                double &fx, 
                                double &fy, 
                                double &fz, 
                                const double eps, 
                                const double b,
                                const double a){

  double r_norm = sqrt(rx*rx + ry*ry + rz*rz);
  double f = -((eps / b) + (eps / r_norm)) * exp(-r_norm / b) / (r_norm*r_norm);
  
  fx += f * rx;
  fy += f * ry;
  fz += f * rz;
}

/*
 This function computes the blob-blob force for all blobs.
*/
__global__ void calc_blob_blob_force(const double *x, 
                                     double *f, 
                                     const double repulsion_strength, 
                                     const double debye_length,
                                     const double blob_radius,
                                     const double Lx,
                                     const double Ly,
                                     const double Lz,
                                     const int number_of_blobs){
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  if(i >= number_of_blobs) return;   

  int offset_i = i * 3;
  int offset_j;
  double rx, ry, rz;
  double fx = 0;
  double fy = 0;
  double fz = 0;

  // Loop over blobs to add interactions
  for(int j=0; j<number_of_blobs; j++){
    offset_j = j * 3;

    // Compute blob to blob vector
    rx = x[offset_j]     - x[offset_i];
    ry = x[offset_j + 1] - x[offset_i + 1];
    rz = x[offset_j + 2] - x[offset_i + 2];

    // Project a vector r to the minimal image representation
    // centered around (0,0,0) and of size L=(Lx, Ly, Lz). If 
    // any dimension of L is equal or smaller than zero the 
    // box is assumed to be infinite in that direction.
    if(Lx > 0){
      rx = rx - int(rx / Lx + 0.5 * (int(rx>0) - int(rx<0))) * Lx;
    }
    if(Ly > 0){
      ry = ry - int(ry / Ly + 0.5 * (int(ry>0) - int(ry<0))) * Ly;
    }
    if(Lz > 0){
      rz = rz - int(rz / Lz + 0.5 * (int(rz>0) - int(rz<0))) * Lz;
    }

    // Compute force between blobs i and j
    if(i != j){
      blob_blob_force(rx, ry, rz, fx, fy, fz, repulsion_strength, debye_length, blob_radius);
    }
  }
  
  // Return forces
  f[offset_i]     = fx;
  f[offset_i + 1] = fy;
  f[offset_i + 2] = fz;
}
""")