def gpu_kernels(self, node, name):
    dim = get_scalar_constant_value(node.inputs[1])
    flags = Kernel.get_flags(node.inputs[0].dtype)
    def_macros, undef_macros = self._macros(node, name)
    hsup = (self._hash_support_code() + "\n" + self._lookup_code())
    knames = ["build_hash", "dedup", "find_valid"]
    kcodes = ["".join(open("%s%s%s.cu" % (os.path.dirname(__file__),
                                          os.path.sep, kn)).readlines())
              for kn in knames]
    kcodes = ["\n".join([def_macros, hsup, code, undef_macros])
              for code in kcodes]
    kcodes = ['#include "cluda.h"\n' + code for code in kcodes]
    kparams = ([GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE, SIZE, SIZE],
               [GpuArray, SIZE, GpuArray, SIZE, SIZE],
               [GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE, SIZE])
    return [Kernel(code=kcode, name="%s_%d" % (kname, dim),
                   params=kparams, flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)]
def gpu_kernels(self, node, name):
    dtype_x = node.inputs[0].dtype
    type_x = gpuarray.dtype_to_ctype(dtype_x)
    dtype_y = node.outputs[0].dtype
    type_y = gpuarray.dtype_to_ctype(dtype_y)
    work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
    load_x = load_w(dtype_x)
    write_y = write_w(dtype_y)
    code = """
#include "cluda.h"

KERNEL void extract(const ga_ssize stridesX0, const ga_ssize stridesX1,
                    GLOBAL_MEM %(type_x)s *x, ga_size x_off,
                    const ga_ssize stridesY0,
                    GLOBAL_MEM %(type_y)s *y, ga_size y_off,
                    ga_ssize k, ga_size l)
{
    x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
    y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
    ga_ssize coff = max(k, (ga_ssize) 0);
    ga_ssize roff = -min(k, (ga_ssize) 0);
    ga_size index = GID_0 * LDIM_0 + LID_0;
    if (index < l) {
        %(work_x)s t = %(load_x)s(x[(index + roff) * stridesX0 +
                                    (index + coff) * stridesX1]);
        y[index * stridesY0] = %(write_y)s(t);
    }
}""" % dict(type_x=type_x, type_y=type_y, work_x=work_x,
            load_x=load_x, write_y=write_y, name=name)
    return [Kernel(
        code=code, name="extract",
        params=[gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray,
                gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray,
                gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SIZE],
        flags=Kernel.get_flags(dtype_x, dtype_y),
        objvar='k_extract_' + name)]
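# A hedged CPU-side reference for the "extract" kernel above (my reading of the
# index math, not an official spec): with coff = max(k, 0) and roff = -min(k, 0),
# y[i] = x[i + roff, i + coff], which matches numpy.diagonal(x, offset=k) for a
# 2-D x. The names and shapes below are illustrative only.
import numpy as np

def extract_reference(x, k):
    coff, roff = max(k, 0), -min(k, 0)
    l = min(x.shape[0] - roff, x.shape[1] - coff)  # length of the k-th diagonal
    return np.array([x[i + roff, i + coff] for i in range(l)])

x = np.arange(12, dtype="float32").reshape(3, 4)
assert np.array_equal(extract_reference(x, 1), np.diagonal(x, offset=1))
assert np.array_equal(extract_reference(x, -1), np.diagonal(x, offset=-1))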
def gpu_kernels(self, node, name):
    dtype_d = node.inputs[0].dtype
    type_d = gpuarray.dtype_to_ctype(dtype_d)
    dtype_x = node.inputs[1].dtype
    type_x = gpuarray.dtype_to_ctype(dtype_x)
    dtype_y = node.outputs[0].dtype
    type_y = gpuarray.dtype_to_ctype(dtype_y)
    work_d = gpuarray.dtype_to_ctype(work_dtype(dtype_d))
    load_d = load_w(dtype_d)
    work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x))
    load_x = load_w(dtype_x)
    code = """
#include "cluda.h"

KERNEL void binsearchsorted(const ga_ssize stridesD0,
                            GLOBAL_MEM %(type_d)s *d, ga_size d_off,
                            const ga_ssize stridesX0,
                            GLOBAL_MEM %(type_x)s *x, ga_size x_off,
                            const ga_ssize stridesY0,
                            GLOBAL_MEM %(type_y)s *y, ga_size y_off,
                            ga_size lx, ga_ssize ld)
{
    d = (GLOBAL_MEM %(type_d)s *)(((GLOBAL_MEM char *)d) + d_off);
    x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off);
    y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off);
    ga_size index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index < lx) {
        ga_long a = 0;
        ga_long b = (ga_long)(ld - 1);
        %(work_d)s minval = %(load_d)s(d[a]);
        %(work_d)s maxval = %(load_d)s(d[b * stridesD0]);
        %(work_x)s val = %(load_x)s(x[index * stridesX0]);
        if (val > maxval) {
            a = (ga_long)ld;
            b = (ga_long)ld;
        } else if (val <= minval) {
            a = 0;
            b = 0;
        }
        while (b - a > 0) {
            ga_long h = (b + a) / 2;
            %(work_d)s t = %(load_d)s(d[h * stridesD0]);
            if (val < t) {
                b = h;
            } else {
                a = h + 1;
            }
        }
        y[index * stridesY0] = b;
    }
}""" % dict(type_d=type_d, type_x=type_x, type_y=type_y,
            work_d=work_d, load_d=load_d, work_x=work_x,
            load_x=load_x, name=name)
    return [Kernel(
        code=code, name="binsearchsorted",
        params=[gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE,
                gpuarray.SIZE, gpuarray.SSIZE],
        flags=Kernel.get_flags(dtype_d, dtype_x, dtype_y),
        objvar='k_binsearchsorted_' + name)]
def gpu_kernels(self, node, name):
    dt = node.inputs[0].type
    # Note: the first specifier must be %(ctype)s; a bare %(ctype) is not a
    # valid old-style format specifier.
    code = """
KERNEL void doublek(GLOBAL_MEM %(ctype)s *out,
                    GLOBAL_MEM const %(ctype)s *a,
                    ga_size n)
{
    for (ga_size i = LID_0; i < n; i += LDIM_0) {
        out[i] = 2 * a[i];
    }
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
    return [Kernel(code=code, name="doublek",
                   params=[gpuarray.GpuArray, gpuarray.GpuArray,
                           gpuarray.SIZE],
                   flags=Kernel.get_flags(dt))]
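# A minimal, self-contained sketch of the %-dict substitution these kernels use
# to inject C type names into the source string (plain Python, no GPU needed).
# Old-style formatting requires a conversion character, e.g. %(ctype)s; a bare
# %(ctype) misparses. "ga_float" is just a stand-in value here, not necessarily
# what gpuarray.dtype_to_ctype returns for any particular dtype.
template = "GLOBAL_MEM %(ctype)s *out, GLOBAL_MEM const %(ctype)s *a"
print(template % dict(ctype="ga_float"))
# -> GLOBAL_MEM ga_float *out, GLOBAL_MEM const ga_float *a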
def gpu_kernels(self, node, name):
    code = """
KERNEL void axpb(GLOBAL_MEM %(ctype)s *x, GLOBAL_MEM %(ctype)s *z,
                 ga_size n, ga_size m)
{
    for (ga_size i = LID_0; i < n; i += LDIM_0) {
        for (ga_size j = LID_0; j < m; j += LDIM_0) {
            z[i*m + j] = %(write_a)s( 2 * x[i*m + j] );
        }
    }
}""" % dict(ctype=pygpu.gpuarray.dtype_to_ctype(self.dtype),
            name=name, write_a=write_w(self.dtype))
    return [Kernel(code=code, name="axpb",
                   params=[gpuarray.GpuArray, gpuarray.GpuArray,
                           gpuarray.SIZE, gpuarray.SIZE],
                   flags=Kernel.get_flags(self.dtype),
                   objvar='k_axpb_' + name)]
def gpu_kernels(self, node, name):
    rdim = get_scalar_constant_value(node.inputs[2])
    vdim = get_scalar_constant_value(node.inputs[3])
    flags = Kernel.get_flags(node.inputs[0].dtype, node.inputs[1].dtype)
    def_macros, undef_macros = self._macros(node, name)
    hsup = (GpuHashTable._hash_support_code() + "\n" +
            GpuHashTable._lookup_code())
    knames = ["splat", "blur", "slice"]
    kcodes = ["".join(open("%s%s%s.cu" % (os.path.dirname(__file__),
                                          os.path.sep, kn)).readlines())
              for kn in knames]
    kcodes = ["\n".join([def_macros, hsup, code, undef_macros])
              for code in kcodes]
    kparams = ([GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE],
               [GpuArray, GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, SIZE, SIZE],
               [GpuArray, SIZE, GpuArray, SIZE, GpuArray, SIZE,
                GpuArray, SIZE, GpuArray, SIZE])
    return [Kernel(code=kcode, name="%s_%d_%d" % (kname, rdim, vdim),
                   params=kparams, flags=flags)
            for kcode, kname, kparams in zip(kcodes, knames, kparams)]
def gpu_kernels(self, node, nodename):
    CHARMAP = dict(int32='i', uint32='I',
                   int64='l', uint64='L',
                   float16='e', float32='f', float64='d')
    dtype_in = node.inputs[0].dtype
    dtype_out = node.outputs[0].dtype
    dtype_idx = node.inputs[1].dtype
    type_in = gpuarray.dtype_to_ctype(dtype_in)
    type_out = gpuarray.dtype_to_ctype(dtype_out)
    type_idx = gpuarray.dtype_to_ctype(dtype_idx)
    flags = Kernel.get_flags(dtype_in, dtype_out, dtype_idx)
    kname = "k_vector_select_fast"
    k_var = "k_vector_select_fast_" + nodename
    code = """#include "cluda.h"

KERNEL void k_vector_select_fast(const ga_size numRowsOut,
                                 const ga_size numColsOut,
                                 const ga_ssize stridesOut0,
                                 const ga_ssize stridesOut1,
                                 GLOBAL_MEM %(type_out)s *Out,
                                 const ga_size offset_Out,
                                 const ga_size numRowsIn,
                                 const ga_size numColsIn,
                                 const ga_ssize stridesIn0,
                                 const ga_ssize stridesIn1,
                                 GLOBAL_MEM %(type_in)s *In,
                                 const ga_size offset_In,
                                 const ga_size numIndices,
                                 const ga_ssize stridesIndices,
                                 GLOBAL_MEM %(type_idx)s *indices_arr,
                                 const ga_size offset_indices_arr,
                                 GLOBAL_MEM ga_int *err)
{
    Out = (GLOBAL_MEM %(type_out)s *)(((GLOBAL_MEM char *)Out)+offset_Out);
    In = (GLOBAL_MEM %(type_in)s *)(((GLOBAL_MEM char *)In)+offset_In);
    indices_arr = (GLOBAL_MEM %(type_idx)s *)
        (((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

    for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
    {
        for (ga_int j = LID_0; j < numColsIn; j += LDIM_0)
        {
            ga_ssize in_row = indices_arr[i * stridesIndices];
            if (in_row < 0)
                in_row += numRowsIn;
            ga_ssize out_row = i;
            if (in_row < numRowsIn && in_row >= 0) {
                Out[(out_row * stridesOut0) + (j * stridesOut1)] =
                    In[(in_row * stridesIn0) + (j * stridesIn1)];
            } else {
                *err = 1;
            }
        }
    }
    return;
}
""" % dict(type_in=type_in, type_out=type_out, type_idx=type_idx,
           tc=CHARMAP[dtype_in])
    from pygpu.gpuarray import SIZE, SSIZE
    params = [
        SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
        SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
        SIZE, SSIZE, gpuarray.GpuArray, SIZE,
        gpuarray.GpuArray]
    return [Kernel(code=code, name=kname, params=params, flags=flags,
                   objvar=k_var)]
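# A hedged CPU reference for k_vector_select_fast (my reading of the kernel, not
# taken from any docstring): each output row i is a copy of input row
# indices_arr[i], with negative indices wrapping as in NumPy fancy indexing.
# Names and shapes below are illustrative only.
import numpy as np

In = np.arange(12, dtype="float32").reshape(4, 3)
idx = np.array([2, -1, 0])
Out = In[idx]  # rows 2, 3 (i.e. -1 wrapped), and 0
assert np.array_equal(Out[1], In[3])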
def gpu_kernels(self, node, nodename):
    # We can't rely on numpy for this, it changes with the OS
    CHARMAP = dict(
        int32="i", uint32="I",
        int64="l", uint64="L",
        float16="e", float32="f", float64="d",
    )
    dtype_x = node.inputs[0].dtype
    dtype_y = node.inputs[1].dtype
    dtype_ind = node.inputs[2].dtype
    type_x = gpuarray.dtype_to_ctype(dtype_x)
    type_y = gpuarray.dtype_to_ctype(dtype_y)
    type_ind = gpuarray.dtype_to_ctype(dtype_ind)
    flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind)
    kname = "k_vector_add_fast"
    k_var = "k_vector_add_fast_" + nodename
    code = """#include "cluda.h"

KERNEL void k_vector_add_fast(const ga_size numRowsX,
                              const ga_size numColsX,
                              const ga_ssize stridesX0,
                              const ga_ssize stridesX1,
                              GLOBAL_MEM %(type_x)s *X,
                              const ga_size offset_X,
                              const ga_size numRowsY,
                              const ga_size numColsY,
                              const ga_ssize stridesY0,
                              const ga_ssize stridesY1,
                              GLOBAL_MEM %(type_y)s *Y,
                              const ga_size offset_Y,
                              const ga_size numIndices,
                              const ga_ssize stridesIndices,
                              GLOBAL_MEM %(type_ind)s *indices_arr,
                              const ga_size offset_indices_arr,
                              const ga_int set_instead_of_inc,
                              GLOBAL_MEM ga_int *err)
{
    X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X);
    Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y);
    indices_arr = (GLOBAL_MEM %(type_ind)s *)
        (((GLOBAL_MEM char *)indices_arr)+offset_indices_arr);

    for (ga_int i = GID_0; i < numIndices; i += GDIM_0)
    {
        for (ga_int j = LID_0; j < numColsX; j += LDIM_0)
        {
            ga_ssize x_row = indices_arr[i * stridesIndices];
            if (x_row < 0)
                x_row += numRowsX;
            ga_ssize y_row = i;
            if (x_row < numRowsX && x_row >= 0) {
                if (set_instead_of_inc) {
                    atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                      Y[(y_row * stridesY0) + (j * stridesY1)]);
                } else {
                    atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)],
                                     Y[(y_row * stridesY0) + (j * stridesY1)]);
                }
            } else {
                *err = 1;
            }
        }
    }
    return;
}
""" % dict(type_x=type_x, type_y=type_y, type_ind=type_ind,
           tc=CHARMAP[dtype_x])
    from pygpu.gpuarray import SIZE, SSIZE
    params = [
        SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
        SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE,
        SIZE, SSIZE, gpuarray.GpuArray, SIZE,
        "int32", gpuarray.GpuArray,
    ]
    return [Kernel(code=code, name=kname, params=params, flags=flags,
                   objvar=k_var)]
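# A hedged CPU reference for k_vector_add_fast (again my reading of the kernel):
# rows of Y are accumulated into (or, with set_instead_of_inc, written over) the
# rows of X selected by indices_arr; duplicate indices accumulate in the "inc"
# case, which is what the atomic adds above provide on the GPU. Names below are
# illustrative only.
import numpy as np

X = np.zeros((5, 3), dtype="float32")
Y = np.ones((2, 3), dtype="float32")
idx = np.array([1, 1])
np.add.at(X, idx, Y)   # "inc" behaviour: repeated index 1 accumulates to 2.0
# X[idx] = Y           # "set" behaviour: last write wins, as with atom_xchg
assert np.allclose(X[1], 2.0)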
def gpu_kernels(self, node, nodename):
    dtype_ten4 = node.inputs[0].dtype
    dtype_z = node.outputs[0].dtype
    flags = Kernel.get_flags(dtype_ten4, dtype_z)
    type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4)
    type_z = gpuarray.dtype_to_ctype(dtype_z)
    # `BORDER_MODE`'s c_support_code() contains C constant definitions that are useful here.
    mode_constants = self.BORDER_MODE.c_support_code()
    kernels = []

    kname = "k_multi_warp_less"
    k_var = "k_multi_warp_less_" + nodename
    code = """#include "cluda.h"

// a version that uses less registers but doesn't work in all cases.
%(mode_constants)s

KERNEL void %(kname)s(
    const ga_int mode,
    const ga_int nb_batch,
    const ga_int nb_stack,
    const ga_int height,
    const ga_int width,
    const ga_int c,
    const ga_int d,
    const ga_int step_x,
    const ga_int step_y,
    const ga_int grid_c,
    const ga_int grid_d,
    const ga_size stride0, const ga_size stride1,
    const ga_size stride2, const ga_size stride3,
    GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
    const ga_size out_s0, const ga_size out_s1,
    GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
)
{
    const ga_int wrap_centered_half_idx_shift_x = c/2;
    const ga_int wrap_centered_half_idx_shift_y = d/2;
    global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
    global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

    for(ga_int tblock = GID_0*LDIM_2+LID_2;
        tblock<nb_batch*nb_stack*grid_c*grid_d;
        tblock+=GDIM_0*LDIM_2){
        const ga_int b = tblock%%grid_d;
        ga_int left = tblock/grid_d;
        const ga_int a = left%%grid_c;
        left = left/grid_c;
        const ga_int s = left%%nb_stack;
        left = left/nb_stack;
        const ga_int n = left;

        if(n>nb_batch)continue;
        if(s>nb_stack)continue;
        if(a>grid_c)continue;
        if(b>grid_d)continue;

        ga_int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
        ga_int i = LID_1;     // loop over c
        {
            ga_int ten4_2 = i + a * step_x;
            if(mode == MODE_WRAP_CENTERED) {
                ten4_2 -= wrap_centered_half_idx_shift_x;
                if ( ten4_2 < 0 ) ten4_2 += height;
                else if (ten4_2 >= height) ten4_2 -= height;
            } else if (mode == MODE_HALF) {
                ten4_2 -= wrap_centered_half_idx_shift_x;
            } else if (mode == MODE_FULL) {
                ten4_2 -= c - 1;
            }
            ga_int j = LID_0;  // loop over d
            {
                ga_int ten4_3 = j + b * step_y;
                if(mode == MODE_WRAP_CENTERED){
                    ten4_3 -= wrap_centered_half_idx_shift_y;
                    if ( ten4_3 < 0 ) ten4_3 += width;
                    else if (ten4_3 >= width) ten4_3 -= width;
                } else if (mode == MODE_HALF) {
                    ten4_3 -= wrap_centered_half_idx_shift_y;
                } else if (mode == MODE_FULL) {
                    ten4_3 -= d - 1;
                }

                ga_int z_col = j + d * i;
                ga_int z_idx = z_col * out_s1 + z_row * out_s0;
                if(ten4_2 < 0 || ten4_2 >= height ||
                   ten4_3 < 0 || ten4_3 >= width){
                    global_out[z_idx] = 0;
                } else {
                    ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 +
                                      stride1*s + stride0*n;
                    global_out[z_idx] = global_ten4[ten4_idx];
                }
            }
        }
    }
}""" % dict(
        kname=kname,
        type_ten4=type_ten4,
        type_z=type_z,
        mode_constants=mode_constants,
    )
    params = [
        "intc", "intc", "intc", "intc", "intc", "intc", "intc",
        "intc", "intc", "intc", "intc",
        "uintp", "uintp", "uintp", "uintp",
        gpuarray.GpuArray, "uintp",
        "uintp", "uintp",
        gpuarray.GpuArray, "uintp",
    ]
    kernels.append(
        Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))

    kname = "k_multi_warp"
    k_var = "k_multi_warp_" + nodename
    code = """#include "cluda.h"

%(mode_constants)s

KERNEL void %(kname)s(
    const ga_int mode,
    const ga_int nb_batch,
    const ga_int nb_stack,
    const ga_int height,
    const ga_int width,
    const ga_int c,
    const ga_int d,
    const ga_int step_x,
    const ga_int step_y,
    const ga_int grid_c,
    const ga_int grid_d,
    const ga_size stride0, const ga_size stride1,
    const ga_size stride2, const ga_size stride3,
    GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4,
    const ga_size out_s0, const ga_size out_s1,
    GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out
)
{
    const ga_int wrap_centered_half_idx_shift_x = c/2;
    const ga_int wrap_centered_half_idx_shift_y = d/2;
    global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4);
    global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out);

    for(ga_int tblock = GID_0*LDIM_2+LID_2;
        tblock<nb_batch*nb_stack*grid_c*grid_d;
        tblock+=GDIM_0*LDIM_2){
        const ga_int b = tblock%%grid_d;
        ga_int left = tblock/grid_d;
        const ga_int a = left%%grid_c;
        left = left/grid_c;
        const ga_int s = left%%nb_stack;
        left = left/nb_stack;
        const ga_int n = left;

        if(n>nb_batch)continue;
        if(s>nb_stack)continue;
        if(a>grid_c)continue;
        if(b>grid_d)continue;

        ga_int z_row = b + grid_d*(a + grid_c*(s + nb_stack*n));
        // loop over c
        for (ga_int i = LID_1; i < c; i+=LDIM_1)
        {
            ga_int ten4_2 = i + a * step_x;
            if(mode == MODE_WRAP_CENTERED) {
                ten4_2 -= wrap_centered_half_idx_shift_x;
                if ( ten4_2 < 0 ) ten4_2 += height;
                else if (ten4_2 >= height) ten4_2 -= height;
            } else if (mode == MODE_HALF) {
                ten4_2 -= wrap_centered_half_idx_shift_x;
            } else if (mode == MODE_FULL) {
                ten4_2 -= c - 1;
            }
            // loop over d
            for (ga_int j = LID_0; j < d; j+=LDIM_0)
            {
                ga_int ten4_3 = j + b * step_y;
                if(mode == MODE_WRAP_CENTERED) {
                    ten4_3 -= wrap_centered_half_idx_shift_y;
                    if ( ten4_3 < 0 ) ten4_3 += width;
                    else if (ten4_3 >= width) ten4_3 -= width;
                } else if (mode == MODE_HALF) {
                    ten4_3 -= wrap_centered_half_idx_shift_y;
                } else if (mode == MODE_FULL) {
                    ten4_3 -= d - 1;
                }

                ga_int z_col = j + d * i;
                ga_int z_idx = z_col * out_s1 + z_row * out_s0;
                if(ten4_2 < 0 || ten4_2 >= height ||
                   ten4_3 < 0 || ten4_3 >= width){
                    global_out[z_idx] = 0;
                } else {
                    ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 +
                                      stride1*s + stride0*n;
                    global_out[z_idx] = global_ten4[ten4_idx];
                }
            }
        }
    }
}
""" % dict(
        kname=kname,
        type_ten4=type_ten4,
        type_z=type_z,
        mode_constants=mode_constants,
    )
    params = [
        "intc", "intc", "intc", "intc", "intc", "intc", "intc",
        "intc", "intc", "intc", "intc",
        "uintp", "uintp", "uintp", "uintp",
        gpuarray.GpuArray, "uintp",
        "uintp", "uintp",
        gpuarray.GpuArray, "uintp",
    ]
    kernels.append(
        Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var))
    return kernels
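# A hedged CPU reference for the patch layout produced by the two kernels above
# (my reading of the index math for the simple case with no wrap_centered/half/
# full border shifting; shapes and names below are illustrative). Output row
# b + grid_d*(a + grid_c*(s + nb_stack*n)) holds the flattened c x d patch whose
# top-left corner is (a*step_x, b*step_y) in image (n, s), matching
# z_col = j + d*i inside the kernels.
import numpy as np

ten4 = np.random.rand(2, 3, 8, 8).astype("float32")  # (batch, stack, height, width)
c, d, step_x, step_y = 4, 4, 4, 4
nb_batch, nb_stack, height, width = ten4.shape
grid_c = (height - c) // step_x + 1
grid_d = (width - d) // step_y + 1
out = np.empty((nb_batch * nb_stack * grid_c * grid_d, c * d), dtype=ten4.dtype)
for n in range(nb_batch):
    for s in range(nb_stack):
        for a in range(grid_c):
            for b in range(grid_d):
                patch = ten4[n, s, a*step_x:a*step_x + c, b*step_y:b*step_y + d]
                out[b + grid_d*(a + grid_c*(s + nb_stack*n))] = patch.ravel()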