def get_G_kernel(dtype, dtypew): template = """ __device__ double G_dirichlet_time(double tk1, double tk2, double tl1,double tl2, int M, double WM) { double sum = 0; for(int m = 1; m <= M; ++m) { sum += (cos(m * WM * (tk2-tl2)) - cos(m * WM * (tk2-tl1)) - cos( m * WM * (tk1-tl2)) + cos(m * WM * (tk1-tl1))) / (m*m); } sum = sum * 2 / (WM * WM) + (tk2-tk1)*(tl2-tl1); return sum; } __global__ void compute_G_Kernel(%(type)s* g_G, int G_ld, double* g_tk1, double* g_tk2, double Wt, int Mt, %(typew)s* SWeight, int Sweight_ld, int* neuron_ind) { unsigned int tid = threadIdx.x; unsigned int bdim = blockDim.x; unsigned int bid = blockIdx.x; int size = gridDim.x; double tl[2]; __shared__ double tk[2]; __shared__ int ind1; int ind2; if(tid == 0) { tk[0] = g_tk1[bid]; //roundf(g_tk1[bid] * rintf(1 / dt)); }else if(tid == 1) { tk[1] = g_tk2[bid]; //roundf(g_tk2[bid] * rintf(1 / dt)); }else if(tid ==2) { ind1 = neuron_ind[bid]; } __syncthreads(); for(int i = tid; i < size; i += bdim) { ind2 = neuron_ind[i]; tl[0] = g_tk1[i]; tl[1] = g_tk2[i]; g_G[bid * G_ld + i] = G_dirichlet_time(tk[0],tk[1],tl[0],tl[1],Mt,Wt/Mt) * SWeight[ind1 * Sweight_ld + ind2]; } } """ func = func_compile("compute_G_Kernel", template % {"type": dtype_to_ctype(dtype), "typew": dtype_to_ctype(dtypew)}) return func
def get_fft2Ds_kernel(dtype=np.dtype(np.complex128)): fft2Ds_template = """ #include <pycuda/pycuda-complex.hpp> __global__ void fft2Ds_kernel(%(type)s* d_Ds, %(type)s* d_gabor_fft, int Mx, int My, int LPx, int LPy, int N_filters, int pitch, double dxdy) { const int tid = threadIdx.x; const int mx = blockIdx.x - Mx; const int my = blockIdx.y - My; int fftind; double sgndxdy; if(mx < 0 && my <0) { fftind = LPx * (LPy + my) + (LPx + mx); }else if(my < 0 && mx >= 0) { fftind = mx + (LPx * (LPy + my)); }else if(mx >= 0 && my >=0) { fftind = my*LPx + mx; }else//(my >= 0 && mx < 0) { fftind = my * LPx + (LPx + mx); } if( (mx+my)%%2 ==0) { sgndxdy = dxdy; }else { sgndxdy = -dxdy; } %(type)s tmp; if(mx*mx * My * My + my*my * Mx * Mx <= Mx*Mx*My*My ) { for(int i = tid; i < N_filters; i+=blockDim.x) { tmp = d_gabor_fft[i * LPx * LPy + fftind]; d_Ds[i * pitch + (My-my)*(2*Mx+1) + (mx+Mx)] = %(type)s(sgndxdy * pycuda::real(tmp), sgndxdy * pycuda::imag(tmp)); } }else { for(int i = tid; i < N_filters; i+=blockDim.x) { d_Ds[i * pitch + (My - my)*(2*Mx+1) + (mx+Mx)] = 0; } } } """ func = func_compile("fft2Ds_kernel", fft2Ds_template % {"type": dtype_to_ctype(dtype)}) return func
def get_Ds2fft_kernel(dtype): template = """ #include <pycuda/pycuda-complex.hpp> __global__ void Ds2fft_kernel(%(type)s* d_Ds, int Ds_ld, %(type)s* filter, int filter_ld, int Mx, int My, int Px, int Py, int N_filters) { const int tid = threadIdx.x; const int mx = blockIdx.x - Mx; const int my = blockIdx.y - My; int fftind; double sgndxdy; if(mx < 0 && my <0) { fftind = Px * (Py + my) + (Px + mx); }else if(my < 0 && mx >= 0) { fftind = mx + (Px * (Py + my)); }else if(mx >= 0 && my >=0) { fftind = my*Px + mx; }else//(my > 0 && mx < 0) { fftind = my * Px + (Px + mx); } if( (mx+my)%%2 ==0) { sgndxdy = 1; }else { sgndxdy = -1; } %(type)s tmp; for(int i = tid; i < N_filters; i+=blockDim.x) { tmp = d_Ds[i * Ds_ld + (My-my)*(2*Mx+1) + (mx+Mx)]; filter[i * filter_ld + fftind] = %(type)s(sgndxdy * pycuda::real(tmp),sgndxdy * pycuda::imag(tmp)); } } """ func = func_compile("Ds2fft_kernel", template % {"type": dtype_to_ctype(dtype)}) return func
def get_diag_add_kernel(dtype): template = """ __global__ void diag_add_Kernel(%(type)s* d_G, int ld, int size, %(type)s addin) { int tid = threadIdx.x + blockIdx.x * blockDim.x; int total = gridDim.x * blockDim.x; for(int i = tid; i < size; i+=total) { d_G[i * ld + i] += addin; } } """ func = func_compile("diag_add_Kernel", template % {"type": dtype_to_ctype(dtype)}) return func
def get_filter2rec_kernel(dtype): if dtype == np.complex128: dtypef = np.float64 else: dtypef = np.float32 template = """ #include <pycuda/pycuda-complex.hpp> #define BLOCK_SIZE 16 __global__ void filter2rec_kernel(%(typef)s* d_dirich_space, int dirich_ld, %(type)s* d_filter_complex, int filter_ld, int N_filters, int Px, int Py, int Px1, int Py1) { unsigned int tid_x; tid_x = threadIdx.x; unsigned int tid_y; tid_y = threadIdx.y; int xld = (Px-1)/BLOCK_SIZE + 1; unsigned int bid_x = blockIdx.x %% xld; unsigned int bid_y = blockIdx.x / xld; unsigned int filter_id = blockIdx.y; unsigned int dim_x = blockDim.x; unsigned int dim_y = blockDim.y; int pix_x = dim_x * bid_x + tid_x; int pix_y = dim_y * bid_y + tid_y; int xdrift = (Px1 - Px)/2; int ydrift = (Py1 - Py)/2; int input_ind = (pix_y + ydrift) * Px1 + pix_x + xdrift + filter_id * filter_ld; int output_ind = (pix_y) * Px + (pix_x) + filter_id * dirich_ld; if(pix_x < Px && pix_y < Py) { d_dirich_space[output_ind] = pycuda::real(d_filter_complex[input_ind]); } } """ func = func_compile("filter2rec_kernel", template % {"type": dtype_to_ctype(dtype), "typef": dtype_to_ctype(dtypef)}) return func
def get_compute_q_kernel(dtype): template = """ __global__ void compute_q_Kernel(%(type)s* g_q, double* g_tk1, double* g_tk2, int* neuron_ind, double* kappa, double* delta, double* bias, double* d_norm, int size) { int ind = blockIdx.x * blockDim.x + threadIdx.x; int total_threads = blockDim.x * gridDim.x; for(int i = ind; i < size; i += total_threads) { int neuron = neuron_ind[i]; g_q[i] = (kappa[neuron] * delta[neuron] - bias[neuron] * (g_tk2[i] - g_tk1[i])) * d_norm[neuron]; } } """ func = func_compile("compute_q_Kernel", template % {"type": dtype_to_ctype(dtype)}) return func
def get_put_norm_kernel(dtype): template = """ __global__ void put_norm(%(type)s* d_SWeight, double* d_norm, int ld) { int tid = threadIdx.x; int bid = blockIdx.x; int NUM_NEURONS = gridDim.x; double norm1 = d_norm[bid]; double norm2; for(int i = tid; i < NUM_NEURONS; i += blockDim.x) { norm2 = d_norm[i]; d_SWeight[i + bid * ld] *= norm1 * norm2; } } """ func = func_compile("put_norm", template % {"type": dtype_to_ctype(dtype)}) return func
def get_put_norm_unaligned_kernel(dtype): template = """ __global__ void put_norm_unaligned(%(type)s* d_SWeight, int ld, double* d_norm, int* neuron_ind1, int* neuron_ind2, int num_neurons1) { int tid = threadIdx.x; int bid = blockIdx.x; int ind1 = neuron_ind1[bid]; double norm1 = d_norm[ind1]; int ind2; double norm2; for(int i = tid; i < num_neurons1; i += blockDim.x) { ind2 = neuron_ind2[i]; norm2 = d_norm[ind2]; d_SWeight[i + bid * ld] *= norm1 * norm2; } } """ func = func_compile("put_norm_unaligned", template % {"type": dtype_to_ctype(dtype)}) return func
def get_dirich_space_kernel(dtype = np.dtype(np.complex128), typef = np.dtype(np.float64)): dirich_template = """ #include <pycuda/pycuda-complex.hpp> #define BLOCK_SIZE 16 __device__ pycuda::complex<float> Iexpf(const float x) { float s,c; sincosf(x, &s, &c); return pycuda::complex<float>( c, s); } __device__ pycuda::complex<double> Iexp(const double x) { double s,c; sincos(x, &s, &c); return pycuda::complex<double>( c, s); } __global__ void psi_dirich_space_Kernel(%(typef)s* dirich_space, int dirich_ld, %(type)s* Ds, int Ds_ld, int Px, int Py, int Mx, int My, double Sx, double Sy, double x_start, double y_start, double WMx, double WMy) { unsigned int filter_id = blockIdx.y; //actually y index; extern __shared__ %(type)s s_Ds[]; unsigned int tid_x = threadIdx.x; unsigned int tid_y = threadIdx.y; int xld = (Px-1)/BLOCK_SIZE+1; unsigned int bid_x = blockIdx.x %% xld; unsigned int bid_y = blockIdx.x / xld; unsigned int dim_x = blockDim.x; unsigned int dim_y = blockDim.y; double x, y; int pix_x = dim_x * bid_x + tid_x; int pix_y = dim_y * bid_y + tid_y; // degree per pixel double dxdy; dxdy = (double)(Sx / (Px )); x = (double)(pix_x) * dxdy + x_start; dxdy = (double)(Sy / (Py )); y = -((double)(pix_y) * dxdy + y_start); %(type)s sum = %(type)s(0,0); int a = 0; for(int my = -My; my <= My; ++my) { for(int i = threadIdx.x + threadIdx.y * BLOCK_SIZE; i < (2*Mx+1); i += BLOCK_SIZE*BLOCK_SIZE) { s_Ds[i] = Ds[filter_id * Ds_ld + i + (my+My) * (2*Mx+1)]; } __syncthreads(); a = 0; for(int mx = -Mx; mx <= Mx; ++mx) { if( mx*mx * My * My + my*my * Mx * Mx <= Mx*Mx*My*My)// && mx*mx + my*my > 0) { sum += s_Ds[a] * %(exp)s(mx * WMx * x + my * WMy * y); } ++a; } __syncthreads(); } if(pix_x < Px && pix_y < Py) { int output_ind = (pix_y) * Px + (pix_x) + filter_id * dirich_ld; dirich_space[output_ind] = pycuda::real(sum); } } """ if typef == np.float64: Iexp = "Iexp" else: Iexp = "Iexpf" func = func_compile("psi_dirich_space_Kernel", dirich_template % {"type": dtype_to_ctype(dtype), "typef": dtype_to_ctype(typef), "exp": Iexp}) return func
def get_reconstruct_kernel(dtype, dtypeq): template = """ __global__ void reconstruct_Kernel(double* u_rec, int u_rec_ld, %(type)s* dirich_space, int dirich_ld, double* g_tk1, double* g_tk2, %(typeq)s* g_ck, double* d_t, int* neuron_ind, double* d_norm, int M, double WM, int size) { unsigned int tid = threadIdx.x; unsigned int bid = blockIdx.x; unsigned int bdim = blockDim.x; unsigned int pix = bid*bdim + tid; __shared__ double t; double u = 0; __shared__ double ck[128]; __shared__ double tk1[128]; __shared__ double tk2[128]; __shared__ int ind[128]; double space; double norm; if(tid == 0) { t = d_t[blockIdx.y]; } for(unsigned int i = 0; i < size; i+=bdim) { if(i + tid < size) { ck[tid] = g_ck[i + tid]; tk1[tid] = g_tk1[i + tid]; tk2[tid] = g_tk2[i + tid]; ind[tid] = neuron_ind[i + tid]; } __syncthreads(); for(unsigned int j = 0; j < bdim; ++j) { if(j + i < size) { space = dirich_space[ind[j] * dirich_ld + pix]; norm = d_norm[ind[j]]; double phi = 0; for(int m = 1; m <= M; ++m) { phi += (sin(m*WM*(t-tk1[j])) - sin(m*WM*(t - tk2[j]))) / m; } u += ck[j] * (phi * 2 / WM + tk2[j] - tk1[j]) * space * norm; } } __syncthreads(); } if(pix < u_rec_ld) { u_rec[pix + u_rec_ld * blockIdx.y] = u; } } """ func = func_compile( "reconstruct_Kernel", template % { "type": dtype_to_ctype(dtype), "typeq": dtype_to_ctype(dtypeq) }) return func
def get_IAF_kernel_linear_rt(dtype = np.float64): IAF_linear = """ __global__ void ensemble_encode(%(type)s* g_input, int input_ld, int num_neurons, int size, %(type)s* g_spike, int spike_ld, %(type)s* g_v0, %(type)s* g_kappa, %(type)s* g_bias, %(type)s* g_delta, %(type)s* g_time_count, int* g_spike_count, int max_spike, %(type)s dt, %(type)s* g_delta_value, %(type)s* g_sigma) { int tid = threadIdx.x; int bdim = blockDim.x; int bid = blockIdx.x; int btIdx_to_neuIdx = bdim * bid + tid; %(type)s kappa, bias, ddt, v0, time_count; %(type)s delta,sigma, delta_mean; int spike_count; if(btIdx_to_neuIdx<num_neurons) { kappa=g_kappa[btIdx_to_neuIdx]; bias=g_bias[btIdx_to_neuIdx]; ddt=dt; v0=g_v0[btIdx_to_neuIdx]; spike_count=0; time_count=g_time_count[btIdx_to_neuIdx]; delta_mean=g_delta[btIdx_to_neuIdx]; delta = g_delta_value[btIdx_to_neuIdx]; sigma = g_sigma[btIdx_to_neuIdx]; } for(unsigned int i = 0; i < size - 1; ++i) { if(btIdx_to_neuIdx < num_neurons) { %(type)s y1 = (g_input[i * input_ld + btIdx_to_neuIdx] + bias) / kappa; %(type)s y2 = (g_input[(i+1) * input_ld + btIdx_to_neuIdx] + bias) / kappa; if(y2 >= 0 || (y2<0 && y1<=0) ) { %(type)s area = (y1 + y2) / 2 * ddt; while(v0 + area >= delta) { %(type)s a = ( (y2-y1)/ ddt); %(type)s remain = (fabs(a) <= 1e-12)? ((delta - v0) / y1) : (sqrt(y1 * y1 + 2 * (delta - v0) * a) - y1)/ (a); time_count += remain; delta = g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] * sigma + delta_mean; g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] = time_count; spike_count++; area -= (delta - v0); v0 = 0; ddt -= remain; y1 = y1 + remain * a; // y1 += remain * (y2-y1)/ddt; } v0 += area; time_count += ddt; ddt = dt; }else { %(type)s a = ( (y2-y1)/ ddt); %(type)s remain; while((remain = y1*y1 + 2*(delta - v0)*a) >= 0) { remain = (sqrt(remain) - y1) / a; time_count += remain; delta = g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] * sigma + delta_mean; g_spike[min(max_spike-1,spike_count) * spike_ld + btIdx_to_neuIdx] = time_count; spike_count++; v0 = 0; ddt -= remain; y1 = y1 + remain * a; a = ( (y2-y1) /ddt); } v0 += (y1+ y2) / 2 * ddt; time_count += ddt; ddt = dt; } } } __syncthreads(); g_v0[btIdx_to_neuIdx]=v0; g_time_count[btIdx_to_neuIdx]=time_count; g_spike_count[btIdx_to_neuIdx]=spike_count; g_delta_value[btIdx_to_neuIdx] = delta; } """ func = func_compile("ensemble_encode", IAF_linear % {"type": dtype_to_ctype(dtype)}) return func
def get_G_kernel(dtype, dtypew): template = """ __device__ double G_dirichlet_time(double tk1, double tk2, double tl1,double tl2, int M, double WM) { double sum = 0; for(int m = 1; m <= M; ++m) { sum += (cos(m * WM * (tk2-tl2)) - cos(m * WM * (tk2-tl1)) - cos( m * WM * (tk1-tl2)) + cos(m * WM * (tk1-tl1))) / (m*m); } sum = sum * 2 / (WM * WM) + (tk2-tk1)*(tl2-tl1); return sum; } __global__ void compute_G_Kernel(%(type)s* g_G, int G_ld, double* g_tk1, double* g_tk2, double Wt, int Mt, %(typew)s* SWeight, int Sweight_ld, int* neuron_ind) { unsigned int tid = threadIdx.x; unsigned int bdim = blockDim.x; unsigned int bid = blockIdx.x; int size = gridDim.x; double tl[2]; __shared__ double tk[2]; __shared__ int ind1; int ind2; if(tid == 0) { tk[0] = g_tk1[bid]; //roundf(g_tk1[bid] * rintf(1 / dt)); }else if(tid == 1) { tk[1] = g_tk2[bid]; //roundf(g_tk2[bid] * rintf(1 / dt)); }else if(tid ==2) { ind1 = neuron_ind[bid]; } __syncthreads(); for(int i = tid; i < size; i += bdim) { ind2 = neuron_ind[i]; tl[0] = g_tk1[i]; tl[1] = g_tk2[i]; g_G[bid * G_ld + i] = G_dirichlet_time(tk[0],tk[1],tl[0],tl[1],Mt,Wt/Mt) * SWeight[ind1 * Sweight_ld + ind2]; } } """ func = func_compile( "compute_G_Kernel", template % { "type": dtype_to_ctype(dtype), "typew": dtype_to_ctype(dtypew) }) return func
def get_gabor_kernel(dtype=np.dtype(np.float64)): gabor_template = """ #include <pycuda/pycuda-complex.hpp> #define PI 3.141592653589793238462643383279 #define BLOCK_SIZE 16 __global__ void gabor_Kernel(%(type)s* g_filter, int filter_ld, double* g_m, double* g_l, double* g_x0, double* g_y0, int* g_ab, int Px, int Py, double Sx, double Sy, double x_start, double y_start, double KAPPA) { unsigned int tid_x = threadIdx.x; unsigned int tid_y = threadIdx.y; int xld = (Px-1)/BLOCK_SIZE + 1; unsigned int bid_x = blockIdx.x %% xld; unsigned int bid_y = blockIdx.x / xld; unsigned int filter_id = blockIdx.y; //actually y index; unsigned int dim_x = blockDim.x; unsigned int dim_y = blockDim.y; __shared__ double alpha; __shared__ double theta; __shared__ double x0; __shared__ double y0; __shared__ int sc; if(tid_y == 0) { if(tid_x == 0) { alpha = g_m[filter_id]; }else if(tid_x == 1) { theta = g_l[filter_id]; }else if(tid_x == 2) { x0 = g_x0[filter_id]; }else if(tid_x == 3) { y0 = g_y0[filter_id]; }else if(tid_x == 4) { sc = g_ab[filter_id]; } } __syncthreads(); double x, y; int pix_x = dim_x * bid_x + tid_x; int pix_y = dim_y * bid_y + tid_y; // degree per pixel double dxdy; dxdy = (double)(Sx / (Px)); x = (double)(pix_x) * dxdy + x_start; dxdy = (double)(Sy / (Py)); y = - ((double)(pix_y) * dxdy + y_start); x = alpha * (x - x0); y = alpha * (y - y0); double sint, cost; sincos(theta, &sint, &cost); double X = x * cost + y * sint; double Y = -x * sint + y * cost; double first_part = alpha * (1/sqrt(2 * PI)) * exp (- ( (4 * X * X) + (Y * Y)) / 8); sincos(KAPPA * X, &sint, &cost); double gb; if(sc == 0) { gb = first_part * cost; }else { gb = first_part * sint; } if(pix_x < Px && pix_y < Py) { int output_ind = (pix_y) * Px + (pix_x) + filter_id * filter_ld; g_filter[output_ind] = gb; } } """ func = func_compile("gabor_Kernel", gabor_template % {"type": dtype_to_ctype(dtype)}, options=["--ptxas-options=-v --maxrregcount=32"]) return func
def get_reconstruct_kernel(dtype, dtypeq): template = """ __global__ void reconstruct_Kernel(double* u_rec, int u_rec_ld, %(type)s* dirich_space, int dirich_ld, double* g_tk1, double* g_tk2, %(typeq)s* g_ck, double* d_t, int* neuron_ind, double* d_norm, int M, double WM, int size) { unsigned int tid = threadIdx.x; unsigned int bid = blockIdx.x; unsigned int bdim = blockDim.x; unsigned int pix = bid*bdim + tid; __shared__ double t; double u = 0; __shared__ double ck[128]; __shared__ double tk1[128]; __shared__ double tk2[128]; __shared__ int ind[128]; double space; double norm; if(tid == 0) { t = d_t[blockIdx.y]; } for(unsigned int i = 0; i < size; i+=bdim) { if(i + tid < size) { ck[tid] = g_ck[i + tid]; tk1[tid] = g_tk1[i + tid]; tk2[tid] = g_tk2[i + tid]; ind[tid] = neuron_ind[i + tid]; } __syncthreads(); for(unsigned int j = 0; j < bdim; ++j) { if(j + i < size) { space = dirich_space[ind[j] * dirich_ld + pix]; norm = d_norm[ind[j]]; double phi = 0; for(int m = 1; m <= M; ++m) { phi += (sin(m*WM*(t-tk1[j])) - sin(m*WM*(t - tk2[j]))) / m; } u += ck[j] * (phi * 2 / WM + tk2[j] - tk1[j]) * space * norm; } } __syncthreads(); } if(pix < u_rec_ld) { u_rec[pix + u_rec_ld * blockIdx.y] = u; } } """ func = func_compile("reconstruct_Kernel", template % {"type": dtype_to_ctype(dtype), "typeq": dtype_to_ctype(dtypeq)}) return func
def get_cs_kernel(dtype=np.dtype(np.float64)): gabor_template = """ #include <pycuda/pycuda-complex.hpp> #define PI 3.141592653589793238462643383279 #define BLOCK_SIZE 16 __global__ void cs_Kernel(%(type)s* g_filter, int filter_ld, double* g_m, double* g_x0, double* g_y0, int Px, int Py, double Sx, double Sy, double x_start, double y_start, double sigma_c_square, double sigma_s_square) { unsigned int tid_x = threadIdx.x; unsigned int tid_y = threadIdx.y; int xld = (Px-1)/BLOCK_SIZE + 1; unsigned int bid_x = blockIdx.x %% xld; unsigned int bid_y = blockIdx.x / xld; unsigned int filter_id = blockIdx.y; //actually y index; unsigned int dim_x = blockDim.x; unsigned int dim_y = blockDim.y; __shared__ double alpha; __shared__ double x0; __shared__ double y0; if(tid_y == 0) { if(tid_x == 0) { alpha = g_m[filter_id]; }else if(tid_x == 1) { x0 = g_x0[filter_id]; }else if(tid_x == 2) { y0 = g_y0[filter_id]; } } __syncthreads(); double x, y; int pix_x = dim_x * bid_x + tid_x; int pix_y = dim_y * bid_y + tid_y; // degree per pixel double dxdy; dxdy = (double)(Sx / (Px)); x = (double)(pix_x) * dxdy + x_start; dxdy = (double)(Sy / (Py)); y = - ((double)(pix_y) * dxdy + y_start); x = alpha * (x - x0); y = alpha * (y - y0); double XY = -(x*x + y*y); double gb = (alpha) * ( exp ( XY / (2 * sigma_c_square)) / (sigma_c_square) - 0.9 * exp ( XY / (2 * sigma_s_square)) / (sigma_s_square)); if(pix_x < Px && pix_y < Py) { int output_ind = (pix_y) * Px + (pix_x) + filter_id * filter_ld; g_filter[output_ind] = gb / (2*PI); } } """ func = func_compile("cs_Kernel", gabor_template % {"type": dtype_to_ctype(dtype)}, options=["--ptxas-options=-v --maxrregcount=32"]) return func