def inline_softmax(N, buf, buf2, threadPos, threadCount, dtype="float32"):
    """
    Generate code for a softmax.

    On entry, `buf` and `buf2` must contain two identical copies of
    the input to softmax.  After the code returns, `buf` contains the
    softmax and `buf2` contains the un-normalized softmax.

    Parameters
    ----------
    N
        Length of the buffer.
    threadPos
        Index of the executing thread.
    threadCount
        Number of executing threads.
    dtype
        Dtype of the softmax's output.

    Notes
    -----
    `buf` and `buf2` should be in GPU shared memory, since we access
    them many times.

    We use `__i` as an int loop variable.

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    # get max of buf (trashing all but buf[0])
    return [inline_reduce_max(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_max = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            'for(int __i=' + threadPos + '; __i<' + N +
            '; __i+=' + threadCount + '){',
            buf + '[__i] = exp(' + buf2 + '[__i] - row_max)',
            buf2 + '[__i] = ' + buf + '[__i]',
            '}',
            '__syncthreads()',
            inline_reduce_sum(N, buf, threadPos, threadCount),
            '__syncthreads()',
            ('%s row_sum = ' + buf + '[0]') % ctype,
            '__syncthreads()',
            # divide each exp() result by the sum to complete the job.
            'for(int __i=' + threadPos + '; __i<' + N +
            '; __i+=' + threadCount + '){',
            buf + '[__i] = ' + buf2 + '[__i] / row_sum',
            '}',
            '__syncthreads()',
            ]
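# Illustrative sketch (not part of the original module): the code
# fragments emitted above implement the usual numerically stable
# softmax in two passes over shared memory (subtract the row max,
# exponentiate, then normalize by the row sum).  A NumPy equivalent of
# what the generated CUDA code does to a single row:
def _softmax_reference(row):
    import numpy as np
    row = np.asarray(row, dtype='float64')
    row_max = row.max()                    # inline_reduce_max
    unnormalized = np.exp(row - row_max)   # buf[__i] = exp(buf2[__i] - row_max)
    row_sum = unnormalized.sum()           # inline_reduce_sum
    return unnormalized / row_sum          # buf[__i] = buf2[__i] / row_sum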
def gpu_kernels(self, node, name):
    dt = node.inputs[0].dtype
    code = """
KERNEL void doublek(GLOBAL_MEM %(ctype)s *out,
                    GLOBAL_MEM const %(ctype)s *a,
                    ga_size n) {
  for (ga_size i = LID_0; i < n; i += LDIM_0) {
    out[i] = 2 * a[i];
  }
}
""" % dict(ctype=gpuarray.dtype_to_ctype(dt))
    return [Kernel(code=code, name="doublek",
                   params=[gpuarray.GpuArray, gpuarray.GpuArray,
                           gpuarray.SIZE],
                   flags=Kernel.get_flags(dt))]
def gpu_kernels(self, node, nodename): # load kernel source device_type = node.inputs[0].type.context.kind kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type] common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type] # prepare "$" macros if device_type == b'cuda': ndim = node.inputs[0].ndim dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim)) dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim)) src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim)) set_slice_code = ''' gidx = gid %% dims_%(i)d; gid /= dims_%(i)d; {dstv}; {dsti}; src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format( dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '', dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '') set_slice_code = ''.join( set_slice_code % dict(i=j) for j in range(1, ndim)) flags = Kernel.get_flags(node.inputs[0].dtype) subs = dict( inp_t=ga.dtype_to_ctype(node.inputs[0].dtype), out_t=ga.dtype_to_ctype(self.idx_dtype), dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)), dstv='INPUT_TYPE *dstv,' if self.return_values else '', dsti='INDEX_TYPE *dsti,' if self.return_indices else '', dstv_strides=dstv_strides_code if self.return_values else '', dsti_strides=dsti_strides_code if self.return_indices else '', src_strides=src_strides_code, set_slice=set_slice_code, write_value=int(self.return_values), write_index=int(self.return_indices), ndim=str(ndim), use_half=int(node.inputs[0].dtype == 'float16') ) elif device_type == b'opencl': raise NotImplementedError() # setup parameters param_types = [ga.SIZE] * (ndim - 1) # dims for _ in range(self.return_values + self.return_indices): param_types.append(ga.GpuArray) # dst* param_types.extend([ga.SSIZE] * ndim) # dst*_strides param_types.append(ga.SIZE) # k param_types.append(ga.GpuArray) # src param_types.extend([ga.SSIZE] * ndim) # src_strides param_types.append(ga.SIZE) # size # load and compile kernels with open(os.path.join( os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext )) as f: common_src = f.read() kernels = [] def build_kernel(fname, kname, subs): with open(os.path.join( os.path.dirname(__file__), 'c_code', fname) ) as f: kernel_src = f.read() ker = Kernel( code=Template(common_src + kernel_src).substitute(**subs), name=kname, params=param_types, flags=flags, objvar=kname + nodename) return ker subs['count_t'] = 'int' kernels.append( build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs)) subs['kname'] = 'k_topk_dense_large' kernels.append( build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs)) subs['count_t'] = 'long long' subs['kname'] = 'k_topk_dense_xlarge' kernels.append( build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs)) return kernels
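# Illustrative sketch (assumption: a minimal stand-alone demo of the
# "$" macro mechanism used above, not part of the op).  The kernel
# sources under c_code/ contain string.Template placeholders (e.g.
# $kname, $dims, $inp_t) and Template(...).substitute(**subs) expands
# them before the Kernel object is built.  The placeholder values
# below are made up for illustration:
def _template_substitution_demo():
    from string import Template
    kernel_src = "KERNEL void $kname($dims GLOBAL_MEM $inp_t *src) { /* ... */ }"
    subs = dict(kname='k_topk_dense_demo',
                dims='size_t dims_1, ',
                inp_t='float')
    return Template(kernel_src).substitute(**subs)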
def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, sm, sm_stride,
                                write_sm, threadPos, threadCount, b='',
                                stride_b='', load_b='', dtype="float32"):
    """
    Generate code to perform softmax with a fixed amount of shared
    memory.

    On entry, `buf` is assumed to be empty.  On exit, the softmax is
    written to `sm`; `buf` is only used as scratch space for the
    reductions.

    Parameters
    ----------
    N
        Length of the buffer, at least warpSize (32).
    buf
        A shared memory buffer of size warpSize * sizeof(dtype).
    x
        A pointer to the gpu memory where the row is stored.
    stride_x
        The stride between each element in x.
    load_x
        Wrapper to read from x.
    sm
        A pointer to the gpu memory in which to store the result.
    sm_stride
        The stride between each sm element.
    write_sm
        Wrapper before writing to sm.
    threadPos
        Index of the executing thread.
    threadCount
        Number of executing threads.
    b
        Optional, pointer to the bias.
    stride_b
        Optional, the stride of b if b is provided.
    load_b
        Optional, wrapper to read from b if b is provided.
    dtype
        Optional, the dtype of the softmax's output if not float32.

    Notes
    -----
    `buf` should be in gpu shared memory, since we access it many
    times.

    We use `tx` as an int loop variable.

    """
    ctype = gpuarray.dtype_to_ctype(dtype)
    ret = [
        # get max of buf (trashing all but buf[0])
        inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x,
                                       threadPos, threadCount,
                                       b, stride_b, load_b, dtype),
        '__syncthreads()',
        ('%s row_max = ' + buf + '[0]') % ctype,
        '__syncthreads()',
        inline_reduce_fixed_shared(N, buf, x, stride_x, load_x,
                                   threadPos, threadCount,
                                   lambda a, b: "%s + %s" % (a, b),
                                   lambda a: "exp(%s - row_max)" % a,
                                   b, stride_b, load_b, dtype),
        '__syncthreads()',
        ('%s row_sum = ' + buf + '[0]') % ctype,
        '__syncthreads()',
        "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){",
    ]
    # write the final, normalized softmax values to sm
    if b:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            " %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +"
            " %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)"
            " / row_sum)" % locals()]
    else:
        ret += [
            "%(sm)s[tx * %(sm_stride)s] = "
            "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)"
            " / row_sum)" % locals()]
    ret += [
        "}",
        '__syncthreads()',
    ]
    return ret
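# Illustrative sketch (assumption: a simplified host-side model of the
# fixed-shared reduction pattern, not part of the module).  Each of
# `threadCount` threads accumulates the elements threadPos,
# threadPos + threadCount, ... into its own slot of the warp-sized
# buffer, and the buffer is then reduced to buf[0]:
def _fixed_shared_reduce_max(row, thread_count=32):
    import numpy as np
    row = np.asarray(row, dtype='float64')
    buf = np.full(thread_count, -np.inf)
    for tx in range(thread_count):        # one iteration per "thread"
        chunk = row[tx::thread_count]     # strided slice owned by thread tx
        if chunk.size:
            buf[tx] = chunk.max()         # per-thread partial maximum
    return buf.max()                      # final reduction of the warp-sized buffer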
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype work_x = work_dtype(dtype_x) work_b = work_dtype(dtype_b) load_x = load_w(dtype_x) load_b = load_w(dtype_b) write_x = write_w(dtype_x) write_b = write_w(dtype_b) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) work_x = gpuarray.dtype_to_ctype(work_x) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) kname = "k_xent_sm_1hot_bias" k_var = "k_xent_sm_1hot_bias_" + nodename if node.inputs[0].type.context.kind != b'cuda': f = '' else: f = '' if dtype_x == 'float64' else 'f' params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE ] sio = StringIO() print("""#include "cluda.h" KERNEL void %(kname)s(const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0, GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0, GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1, GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values)) { x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx); nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll); sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm); am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am); for (ga_int row = GID_0; row < M; row += GDIM_0){ GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row; GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row; GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values); LOCAL_MEM %(work_x)s row_max, sum, sum_inv; LOCAL_MEM ga_int row_max_threadIdx; %(work_x)s per_thread_row_max, per_thread_sum; ga_int per_thread_row_max_j; // COMPUTE ROW MAX AND ARGMAX // compute separate per-thread maximums and argmaxes per_thread_row_max = NAN; per_thread_row_max_j = 0; for (ga_int j = LID_0; j < N; j += LDIM_0) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j; per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max); } per_thread_values[LID_0] = per_thread_row_max; local_barrier(); if (LID_0 == 0) { row_max = NAN; row_max_threadIdx = 0; for (ga_int j = 0; j < LDIM_0; j++) { %(work_x)s per_thread_max = per_thread_values[j]; row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx; row_max = fmax%(f)s(per_thread_max, row_max); } } local_barrier(); // The thread with the highest max writes out which of its // values was the winner. 
if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j; // COMPUTE SOFTMAX per_thread_sum = 0.0; for (ga_int j = LID_0; j < N; j += LDIM_0) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); %(work_x)s sm_ij = exp%(f)s(row_ij - row_max); per_thread_sum += sm_ij; sm[j * sms1] = %(write_x)s(sm_ij); } per_thread_values[LID_0] = per_thread_sum; local_barrier(); if (LID_0 == 0) { sum = 0.0; for (ga_int j = 0; j < LDIM_0; j++) { sum += per_thread_values[j]; } sum_inv = 1.0 / sum; } local_barrier(); for (ga_int j = LID_0; j < N; j += LDIM_0) { sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv); } if (LID_0 == 0) { const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0]; if ((y_idx >= N || y_idx < 0)) { // raise some suspicion. nll_data[row * nlls0] = %(write_x)s(0.0); } else { nll_data[row * nlls0] = %(write_x)s( - %(load_x)s(x[y_idx * xs1]) - %(load_b)s(b[y_idx * bs0]) + row_max + log%(f)s(sum)); } } } } """ % locals(), file=sio) return [Kernel(code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var)]
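# Illustrative sketch (not part of the op): per row, the kernel above
# produces the softmax `sm`, the argmax `am` and the negative
# log-likelihood `nll` of the target index, using the usual
# log-sum-exp stabilisation.  A NumPy reference for one row, assuming
# `x` and `b` are 1-d arrays and `y_idx` an in-range integer target:
def _xent_sm_1hot_bias_reference(x, b, y_idx):
    import numpy as np
    row = np.asarray(x, dtype='float64') + np.asarray(b, dtype='float64')
    row_max = row.max()
    am = int(row.argmax())
    e = np.exp(row - row_max)
    sm = e / e.sum()
    # nll = -(x[y] + b[y]) + row_max + log(sum_j exp(x[j] + b[j] - row_max))
    nll = -row[y_idx] + row_max + np.log(e.sum())
    return nll, sm, am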
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_y = node.inputs[1].dtype dtype_ind = node.inputs[2].dtype type_x = gpuarray.dtype_to_ctype(dtype_x) type_y = gpuarray.dtype_to_ctype(dtype_y) type_ind = gpuarray.dtype_to_ctype(dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) kname = "k_vector_add_fast" k_var = "k_vector_add_fast_" + nodename code = """#include "cluda.h" KERNEL void k_vector_add_fast(const ga_size numRowsX, const ga_size numColsX, const ga_ssize stridesX0, const ga_ssize stridesX1, GLOBAL_MEM %(type_x)s *X, const ga_size offset_X, const ga_size numRowsY, const ga_size numColsY, const ga_ssize stridesY0, const ga_ssize stridesY1, GLOBAL_MEM %(type_y)s *Y, const ga_size offset_Y, const ga_size numIndices, const ga_ssize stridesIndices, GLOBAL_MEM %(type_ind)s *indices_arr, const ga_size offset_indices_arr, const ga_int set_instead_of_inc, GLOBAL_MEM ga_int *err) { X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X); Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y); indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr); for (ga_int i = GID_0; i < numIndices; i += GDIM_0) { for (ga_int j = LID_0; j < numColsX; j += LDIM_0) { ga_ssize x_row = indices_arr[i * stridesIndices]; if (x_row < 0) x_row += numRowsX; ga_ssize y_row = i; if (x_row < numRowsX && x_row >= 0) { if (set_instead_of_inc) { atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } else { atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } } else { *err = 1; } } } return; } """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind, tc=np.dtype(dtype_x).char) from pygpu.gpuarray import SIZE, SSIZE params = [ SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SSIZE, gpuarray.GpuArray, SIZE, 'int32', gpuarray.GpuArray] return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]
def c_code(self, node, name, inputs, outputs, sub): nd = node.outputs[0].ndim fail = sub["fail"] initial_dims = ','.join('1' for i in xrange(nd)) opname = str(self.scalar_op) #check that all inputs have valid dimensions emitted_inames = {} code = """ int n_blocks = 0; int threads_per_block = 0; size_t numEls = 0; """ if nd > 0: code += """ size_t dims[%(nd)s] = {%(initial_dims)s}; """ % locals() else: code += """ size_t *dims = NULL; """ for idx, iname in enumerate(inputs): if iname in emitted_inames: assert emitted_inames[iname] is node.inputs[idx] continue broadcasts = map(int, node.inputs[idx].broadcastable) broadcasts = ', '.join(map(str, broadcasts)) nd = node.inputs[idx].ndim if nd > 0: code += """ int broadcasts_%(iname)s[%(nd)s] = {%(broadcasts)s}; """ % locals() else: code += """ int *broadcasts_%(iname)s = NULL; """ % locals() emitted_inames[iname] = node.inputs[idx] #check that all inputs have valid dimensions emitted_inames = {} for idx, iname in enumerate(inputs): if iname in emitted_inames: continue code += """ //std::cerr << "C_CODE %(opname)s checking input %(iname)s\\n"; if (%(nd)s != PyGpuArray_NDIM(%(iname)s)) { PyErr_Format(PyExc_TypeError, "need %(nd)s dims, not %%i", PyGpuArray_NDIM(%(iname)s)); %(fail)s; } for (int i = 0; i< %(nd)s; ++i) { dims[i] = (dims[i] == 1) ? PyGpuArray_DIMS(%(iname)s)[i] : dims[i]; if ((!(broadcasts_%(iname)s[i] && PyGpuArray_DIMS(%(iname)s)[i] == 1)) && (dims[i] != PyGpuArray_DIMS(%(iname)s)[i])) { //std::cerr << "C_CODE %(opname)s checking input %(iname)s failed\\n"; PyErr_Format(PyExc_ValueError, "GpuElemwise. Input dimension mis-match. Input" " %(idx)d (indices start at 0) has shape[%%i] == %%i" ", but the output's size on that axis is %%i.", i, PyGpuArray_DIMS(%(iname)s)[i], dims[i] ); %(fail)s; } } """ % locals() emitted_inames[iname] = True #check that all outputs have valid dimensions for idx, oname in enumerate(outputs): typecode = dtype_to_typecode(node.outputs[idx].dtype) if idx not in self.inplace_pattern.keys(): code += """ for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i]) { Py_DECREF(%(oname)s); %(oname)s = NULL; } } if (%(oname)s && !GpuArray_CHKFLAGS(&(%(oname)s->ga), GA_C_CONTIGUOUS)) { Py_XDECREF(%(oname)s); %(oname)s = NULL; } if (NULL == %(oname)s) { %(oname)s = pygpu_empty(%(nd)d, dims, %(typecode)s, GA_C_ORDER, pygpu_default_context(), Py_None); if (!%(oname)s) { //TODO, this check don't seam good. //TODO, set exception? %(fail)s } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() else: input_idx = self.inplace_pattern[idx] iname = inputs[input_idx] code += """ Py_XDECREF(%(oname)s); %(oname)s = %(iname)s; Py_INCREF(%(oname)s); for (int i = 0; (i< %(nd)s) && (%(oname)s); ++i) { if (dims[i] != PyGpuArray_DIMS(%(oname)s)[i]) { PyErr_Format(PyExc_ValueError, "GpuElemwise. Output dimension mis-match. 
Output" " %(idx)d (indices start at 0), working inplace" " on input %(input_idx)s, has shape[%%i] == %%i" ", but the output's size on that axis is %%i.", i, PyGpuArray_DIMS(%(oname)s)[i], dims[i] ); Py_DECREF(%(oname)s); %(oname)s = NULL; %(fail)s; } } //std::cerr << "ELEMWISE NEW %(oname)s nd" << PyGpuArray_NDIM(%(oname)s) << "\\n"; //std::cerr << "ELEMWISE NEW %(oname)s data" << %(oname)s->devdata << "\\n"; """ % locals() z = outputs[0] code += """numEls = PyGpuArray_SIZE(%(z)s); //first use at least a full warp threads_per_block = std::min(numEls, (size_t)32); //WARP SIZE //next start adding multiprocessors // UP TO NUMBER OF MULTIPROCESSORS, use 30 for now. n_blocks = std::min(numEls/threads_per_block + (numEls %% threads_per_block?1:0), (size_t)30); // next start adding more warps per multiprocessor if (threads_per_block * n_blocks < numEls) threads_per_block = std::min(numEls/n_blocks, (size_t) 256); //std::cerr << "calling callkernel returned\\n"; """ % locals() code += "elem_%(nd)s<<<n_blocks, threads_per_block>>>(numEls,\n" % locals() param = [] for i in range(nd): param.append("%(z)s->ga.dimensions[%(i)d]" % dict(z=outputs[0], i=i)) for n, (name, var) in enumerate(zip(inputs + outputs, node.inputs + node.outputs)): if (n - len(inputs)) in self.inplace_pattern: continue dtype = dtype_to_ctype(var.dtype) param.append("(%(dtype)s*)(cuda_get_ptr(%(name)s->ga.data))" % locals()) param.append("%(name)s->ga.offset" % locals()) for i in range(nd): param.append("PyGpuArray_DIMS(%(name)s)[%(i)d] == 1 ? 0 : PyGpuArray_STRIDES(%(name)s)[%(i)d]" % locals()) code += ',\n'.join(param) + ");\n" if config.gpuarray.sync: code += "GpuArray_sync(&%(zz)s->ga);\n" % dict(zz=zz) return str(code)
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_y = node.inputs[1].dtype dtype_ind = node.inputs[2].dtype type_x = gpuarray.dtype_to_ctype(dtype_x) type_y = gpuarray.dtype_to_ctype(dtype_y) type_ind = gpuarray.dtype_to_ctype(dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) kname = "k_vector_add_fast" k_var = "k_vector_add_fast_" + nodename code = """ /* * This is an atomicAdd that works for doubles since that is not provided * natively by cuda before arch 6.0. */ #if __CUDA_ARCH__ < 600 __device__ ga_double atomicAdd(ga_double* address, ga_double val) { ga_ulong *address_as_ull = (ga_ulong *)address; ga_ulong old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } #endif __device__ ga_double atomicExch(ga_double *address, ga_double val) { return atomicExch((ga_ulong *)address, __double_as_longlong(val)); } /* GA_LONG */ __device__ ga_long atomicAdd(ga_long* address, ga_long val) { ga_ulong *address_as_ull = (ga_ulong *)address; ga_ulong old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, (ga_ulong)(val + (ga_long)assumed)); } while (assumed != old); return (ga_long)old; } __device__ ga_long atomicExch(ga_long *address, ga_long val) { return (ga_long)atomicExch((ga_ulong *)address, (ga_ulong)val); } /* GA_HALF */ /* * This may read and write 2 bytes more than the size of the array * if the array has an uneven number of elements. The actual value * at that spot will not be modified. */ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, sum, new_; old = *base; do { assumed = old; sum = __float2half_rn( __half2float(val) + __half2float((ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410))); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); return (ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); } __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, new_; old = *base; do { assumed = old; new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); return (ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 
0x4432 : 0x4410); } KERNEL void k_vector_add_fast(const ga_size numRowsX, const ga_size numColsX, const ga_ssize stridesX0, const ga_ssize stridesX1, %(type_x)s *X, const ga_size offset_X, const ga_size numRowsY, const ga_size numColsY, const ga_ssize stridesY0, const ga_ssize stridesY1, %(type_y)s *Y, const ga_size offset_Y, const ga_size numIndices, const ga_ssize stridesIndices, %(type_ind)s *indices_arr, const ga_size offset_indices_arr, const int set_instead_of_inc, ga_int *err) { X = (%(type_x)s *)(((char *)X)+offset_X); Y = (%(type_y)s *)(((char *)Y)+offset_Y); indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr); for (int i = (blockIdx.x); i < numIndices; i += gridDim.x) { for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) { ga_ssize x_row = indices_arr[i * stridesIndices]; if (x_row < 0) x_row += numRowsX; ga_ssize y_row = i; if (x_row < numRowsX && x_row >= 0) { if (set_instead_of_inc) { atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } else { atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } } else { *err = 1; } } } return; } """ % dict(type_x=type_x, type_y=type_y, type_ind=type_ind) params = [ 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int', gpuarray.GpuArray] return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]
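# Illustrative sketch (not part of the op): ignoring the atomics, the
# kernel above implements a row-wise scatter into X.  The NumPy
# equivalent of its two modes (np.add.at accumulates repeated indices,
# matching the atomicAdd path; for the atomicExch path the surviving
# value under duplicate indices depends on thread ordering):
def _vector_add_fast_reference(X, Y, indices, set_instead_of_inc):
    import numpy as np
    X = np.array(X, dtype='float64', copy=True)
    idx = np.asarray(indices)
    if set_instead_of_inc:
        X[idx] = Y                # atomicExch path
    else:
        np.add.at(X, idx, Y)      # atomicAdd path, duplicates accumulate
    return X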
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_sm = node.outputs[0].dtype load_x = load_w(node.inputs[0].dtype) load_b = load_w(node.inputs[1].dtype) write_sm = write_w(node.outputs[0].dtype) work_sm = work_dtype(node.outputs[0].dtype) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_acc = gpuarray.dtype_to_ctype(work_sm) params = [ 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp' ] kernels = [] kname = "kSoftmaxWithBias" k_var = "kSoftmaxWithBias_" + nodename code = nvcc_kernel( kname, params=['const ga_size M', 'const ga_size N', 'const %s * x' % type_x, 'const ga_size offset_x', 'const ga_ssize sx0', 'const ga_ssize sx1', 'const %s * b' % type_b, 'const ga_size offset_b', 'const ga_ssize sb0', '%s * sm' % type_sm, 'const ga_size offset_sm', 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], body=["extern __shared__ %s buf[]" % type_acc, "%s * buf2 = buf + N" % type_acc, "x = (const %s *)(((char *)x)+offset_x)" % type_x, "b = (const %s *)(((char *)b)+offset_b)" % type_b, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "buf[tx] = %s(x[blockIDX * sx0 + tx * sx1])" % load_x, "buf[tx] += %s(b[tx * sb0])" % load_b, "buf2[tx] = buf[tx]", "}", "__syncthreads()", inline_softmax('N', 'buf', 'buf2', 'threadIdx.x', 'blockDim.x', work_sm), "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", "sm[blockIDX * sm_s0 + tx * sm_s1] = %s(buf[tx])" % write_sm, "}", "__syncthreads()", "}", ]) kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "kSoftmaxWithBias_fixed_shared" k_var = "kSoftmaxWithBias_fixed_shared" + nodename code = nvcc_kernel( kname, params=['const ga_size M', 'const ga_size N', 'const %s * x' % type_x, 'const ga_size offset_x', 'const ga_ssize sx0', 'const ga_ssize sx1', 'const %s * b' % type_b, 'const ga_size offset_b', 'const ga_ssize sb0', '%s * sm' % type_sm, 'const ga_size offset_sm', 'const ga_ssize sm_s0', 'const ga_ssize sm_s1'], body=["extern __shared__ %s buf[]" % type_acc, "x = (const %s *)(((char *)x)+offset_x)" % type_x, "b = (const %s *)(((char *)b)+offset_b)" % type_b, "sm = (%s *)(((char *)sm)+offset_sm)" % type_sm, "for (int blockIDX = blockIdx.x; blockIDX < M;" " blockIDX += gridDim.x){", "const %s *x_ptr = &x[blockIDX * sx0]" % type_x, "%s *sm_ptr = &sm[blockIDX * sm_s0]" % type_sm, inline_softmax_fixed_shared('N', 'buf', 'x_ptr', 'sx1', load_x, 'sm_ptr', 'sm_s1', write_sm, 'threadIdx.x', 'blockDim.x', 'b', 'sb0', load_b, work_sm), "__syncthreads()", "}", ]) kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def gpu_kernels(self, node, nodename): dtype_dnll = node.inputs[0].dtype dtype_sm = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_dx = node.outputs[0].dtype work_dnll = work_dtype(dtype_dnll) load_dnll = load_w(dtype_dnll) load_sm = load_w(dtype_sm) write_dx = write_w(dtype_dx) flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx) type_dnll = gpuarray.dtype_to_ctype(work_dnll) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) type_dx = gpuarray.dtype_to_ctype(dtype_dx) kname = "kCrossEntropySoftmax1HotWithBiasDx" k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename sio = StringIO() print(""" KERNEL void %(kname)s( const ga_size N, const ga_size K, const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0, const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1, const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0, %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1) { dnll = (const %(type_dnll)s *)(((char *)dnll)+offset_dnll); sm = (const %(type_sm)s *)(((char *)sm)+offset_sm); y_idx = (const %(type_y_idx)s *)(((char *)y_idx)+offset_y_idx); dx = (%(type_dx)s *)(((char *)dx)+offset_dx); for (int i = blockIdx.x; i < N; i += gridDim.x) { %(type_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]); %(type_y_idx)s y_i = y_idx[i * y_idx_s0]; for (int j = threadIdx.x; j < K; j += blockDim.x) { if (y_i == j) { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0)); } else { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * %(load_sm)s(sm[i * sm_s0 + j * sm_s1])); } } } } """ % locals(), file=sio) params = [ 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp' ] return [Kernel(code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var)]
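# Illustrative sketch (not part of the op): the kernel above computes
# the classic softmax + one-hot cross-entropy gradient,
# dx[i, j] = dnll[i] * (sm[i, j] - (j == y[i])).  NumPy reference:
def _xent_dx_reference(dnll, sm, y_idx):
    import numpy as np
    sm = np.asarray(sm, dtype='float64')
    dnll = np.asarray(dnll, dtype='float64')
    dx = dnll[:, None] * sm
    dx[np.arange(sm.shape[0]), y_idx] -= dnll   # subtract dnll[i] at the target column
    return dx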
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_sm = node.outputs[0].dtype load_x = load_w(node.inputs[0].dtype) load_b = load_w(node.inputs[1].dtype) write_sm = write_w(node.outputs[0].dtype) work_sm = work_dtype(node.outputs[0].dtype) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_acc = gpuarray.dtype_to_ctype(work_sm) ctype = gpuarray.dtype_to_ctype(work_sm) params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, ] kernels = [] kname = "kSoftmaxWithBias" k_var = "kSoftmaxWithBias_" + nodename code = ("""#include "cluda.h" KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) { GA_DECL_SHARED_BODY(%(type_acc)s, buf); LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N; x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){ for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]); buf[tx] += %(load_b)s(b[tx * sb0]); buf2[tx] = buf[tx]; } local_barrier(); { // This function trashes buf[1..GA_WARP_SIZE], // leaving the reduction result in buf[0]. if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE) { buf[LID_0] = max(buf[LID_0], buf[i]); } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]); local_barrier(); } } %(ctype)s row_max = buf[0]; local_barrier(); for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){; buf[__i] = exp(buf2[__i] - row_max); buf2[__i] = buf[__i]; } local_barrier(); { // This function trashes buf[1..GA_WARP_SIZE], // leaving the reduction result in buf[0]. 
if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE) { buf[LID_0] = buf[LID_0] + buf[i]; } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = buf[LID_0] + buf[LID_0+_n]; local_barrier(); } } %(ctype)s row_sum = buf[0]; local_barrier(); for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){ buf[__i] = buf2[__i] / row_sum; } local_barrier(); for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]); } local_barrier(); } } """ % locals()) kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "kSoftmaxWithBias_fixed_shared" k_var = "kSoftmaxWithBias_fixed_shared" + nodename code = ("""#include "cluda.h" KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) { GA_DECL_SHARED_BODY(%(type_acc)s, buf); x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){ GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0]; GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0]; { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]); #pragma unroll 16 for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) { red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0])); } buf[LID_0] = red; local_barrier(); if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) { buf[LID_0] = max(buf[LID_0], buf[i]); } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]); local_barrier(); } } %(ctype)s row_max = buf[0]; local_barrier(); { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max); #pragma unroll 16 for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) { red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max); } buf[LID_0] = red; local_barrier(); if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) { buf[LID_0] = buf[LID_0] + buf[i]; } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = buf[LID_0] + buf[LID_0+_n]; local_barrier(); } } %(ctype)s row_sum = buf[0]; local_barrier(); for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum); } local_barrier(); } } """ % locals()) kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype work_x = work_dtype(dtype_x) work_b = work_dtype(dtype_b) load_x = load_w(dtype_x) load_b = load_w(dtype_b) write_x = write_w(dtype_x) write_b = write_w(dtype_b) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) work_x = gpuarray.dtype_to_ctype(work_x) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) kname = "k_xent_sm_1hot_bias" k_var = "k_xent_sm_1hot_bias_" + nodename if node.inputs[0].type.context.kind != b"cuda": f = "" else: f = "" if dtype_x == "float64" else "f" params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, ] sio = StringIO() print( """#include "cluda.h" KERNEL void %(kname)s(const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0, GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0, GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1, GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values)) { x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx); nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll); sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm); am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am); for (ga_int row = GID_0; row < M; row += GDIM_0){ GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row; GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row; GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values); LOCAL_MEM %(work_x)s row_max, sum, sum_inv; LOCAL_MEM ga_int row_max_threadIdx; %(work_x)s per_thread_row_max, per_thread_sum; ga_int per_thread_row_max_j; // COMPUTE ROW MAX AND ARGMAX // compute separate per-thread maximums and argmaxes per_thread_row_max = NAN; per_thread_row_max_j = 0; for (ga_int j = LID_0; j < N; j += LDIM_0) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j; per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max); } per_thread_values[LID_0] = per_thread_row_max; local_barrier(); if (LID_0 == 0) { row_max = NAN; row_max_threadIdx = 0; for (ga_int j = 0; j < LDIM_0; j++) { %(work_x)s per_thread_max = per_thread_values[j]; row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx; row_max = fmax%(f)s(per_thread_max, row_max); } } local_barrier(); // The thread with the highest max writes out which of its // values was the winner. 
if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j; // COMPUTE SOFTMAX per_thread_sum = 0.0; for (ga_int j = LID_0; j < N; j += LDIM_0) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); %(work_x)s sm_ij = exp%(f)s(row_ij - row_max); per_thread_sum += sm_ij; sm[j * sms1] = %(write_x)s(sm_ij); } per_thread_values[LID_0] = per_thread_sum; local_barrier(); if (LID_0 == 0) { sum = 0.0; for (ga_int j = 0; j < LDIM_0; j++) { sum += per_thread_values[j]; } sum_inv = 1.0 / sum; } local_barrier(); for (ga_int j = LID_0; j < N; j += LDIM_0) { sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv); } if (LID_0 == 0) { const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0]; if ((y_idx >= N || y_idx < 0)) { // raise some suspicion. nll_data[row * nlls0] = %(write_x)s(0.0); } else { nll_data[row * nlls0] = %(write_x)s( - %(load_x)s(x[y_idx * xs1]) - %(load_b)s(b[y_idx * bs0]) + row_max + log%(f)s(sum)); } } } } """ % locals(), file=sio, ) return [ Kernel( code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var, ) ]
def gpu_kernels(self, node, nodename): dtype_dnll = node.inputs[0].dtype dtype_sm = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_dx = node.outputs[0].dtype work_dnll = work_dtype(dtype_dnll) load_dnll = load_w(dtype_dnll) load_sm = load_w(dtype_sm) write_dx = write_w(dtype_dx) flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx) wtype_dnll = gpuarray.dtype_to_ctype(work_dnll) type_dnll = gpuarray.dtype_to_ctype(dtype_dnll) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) type_dx = gpuarray.dtype_to_ctype(dtype_dx) kname = "kCrossEntropySoftmax1HotWithBiasDx" k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, ] sio = StringIO() print( """#include "cluda.h" KERNEL void %(kname)s( const ga_size N, const ga_size K, GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0, GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1, GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0, GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1) { dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll); sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx); dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx); for (ga_int i = GID_0; i < N; i += GDIM_0) { %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]); %(type_y_idx)s y_i = y_idx[i * y_idx_s0]; for (ga_int j = LID_0; j < K; j += LDIM_0) { if (y_i == j) { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0)); } else { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * %(load_sm)s(sm[i * sm_s0 + j * sm_s1])); } } } } """ % locals(), file=sio, ) return [ Kernel( code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var, ) ]
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype work_x = work_dtype(dtype_x) work_b = work_dtype(dtype_b) load_x = load_w(dtype_x) load_b = load_w(dtype_b) write_x = write_w(dtype_x) write_b = write_w(dtype_b) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) type_x = gpuarray.dtype_to_ctype(work_x) type_b = gpuarray.dtype_to_ctype(work_b) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) kname = "k_xent_sm_1hot_bias" k_var = "k_xent_sm_1hot_bias_" + nodename sio = StringIO() print(""" KERNEL void %(kname)s(const ga_size M, const ga_size N, const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0, %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0, %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1, %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0) { x_data = (const %(type_x)s *)(((char *)x_data)+offset_x); b = (const %(type_b)s *)(((char *)b)+offset_b); y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx); nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll); sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm); am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am); for (int row = blockIdx.x; row < M; row += gridDim.x){ const %(type_x)s* x = x_data + xs0 * row; const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0]; %(type_x)s* sm = sm_data + sms0 * row; %(type_x)s sum = 0.0; int row_max_j = 0; %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]); for (int j = 1; j < N; ++j) { %(type_x)s row_ij = %(load_x)s(x[j*xs1]) + %(load_b)s(b[j*bs0]); //todo: store to shared memory row_max_j = (row_ij > row_max) ? j : row_max_j; row_max = (row_ij > row_max) ? row_ij : row_max; } //compute the exp for (int j = 0; j < N; ++j) { %(type_x)s row_ij = %(load_x)s(x[j*xs1]) + %(load_b)s(b[j*bs0]); %(type_x)s sm_ij = exp(row_ij - row_max); sum += sm_ij; sm[j * sms1] = %(write_x)s(sm_ij); } %(type_x)s sum_inv = 1.0 / sum; for (int j = 0; j < N; ++j) { %(type_x)s __tmp = %(load_x)s(sm[j * sms1]); __tmp *= sum_inv; sm[j * sms1] = %(write_x)s(__tmp); } if ((y_idx >= N) || (y_idx < 0)) { //TODO: set raise an error bit in a global var? nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least... } else { nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1]) - %(load_b)s(b[y_idx*bs0]) + row_max + log(sum)); } am_data[row*ams0] = row_max_j; } } """ % locals(), file=sio) params = [ 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp' ] return [ Kernel(code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var) ]
def c_code(self, node, nodename, inp, out, sub): if node.inputs[0].type.context.kind != 'cuda': raise NotImplementedError("cuda only") typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize dtype_dnll = node.inputs[0].dtype dtype_sm = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_dx = node.outputs[0].dtype type_intp = gpuarray.dtype_to_ctype(numpy.intp) dnll, sm, y_idx = inp dx, = out fail = sub['fail'] ctx = sub['params'] k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename err_check = """ if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "gpuarray error: %(k_var)s: %%s.", GpuKernel_error(&%(k_var)s, err)); %(fail)s; } """ % locals() sync = "" if config.gpuarray.sync: sync = """ err = GpuArray_sync(&%(z)s->ga); %(err_check)s """ % locals() return """ // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar. const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ? PyGpuArray_DIMS(%(dnll)s)[0] : (ssize_t) 0); // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar // or a vector with just one element. const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ? PyGpuArray_STRIDES(%(dnll)s)[0] : (ssize_t) 0); if ((PyGpuArray_NDIM(%(dnll)s) > 1) || (PyGpuArray_NDIM(%(sm)s) != 2) || (PyGpuArray_NDIM(%(y_idx)s) != 1)) { PyErr_SetString(PyExc_ValueError, "rank error"); %(fail)s; } if (%(dnll)s_dims0 != PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1) { PyErr_Format(PyExc_ValueError, "dnll.shape[0] == %%i, but sm.shape[0] == %%i", %(dnll)s_dims0, PyGpuArray_DIMS(%(sm)s)[0]); %(fail)s; } if (%(dnll)s_dims0 != PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1) { PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]"); %(fail)s; } if (PyGpuArray_DIMS(%(sm)s)[0] != PyGpuArray_DIMS(%(y_idx)s)[0]) { PyErr_SetString(PyExc_ValueError, "sm.shape[0] != y_idx.shape[0]"); %(fail)s; } if ((NULL == %(dx)s) || (PyGpuArray_DIMS(%(dx)s)[0] != PyGpuArray_DIMS(%(sm)s)[0]) || (PyGpuArray_DIMS(%(dx)s)[1] != PyGpuArray_DIMS(%(sm)s)[1])) { Py_XDECREF(%(dx)s); %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s), %(typecode_dx)s, GA_C_ORDER, %(ctx)s, Py_None); if (!%(dx)s) { %(fail)s } } { size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1}; ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s; ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s; ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s; ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s; ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s; ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s; void *kernel_params[] = { (void *)&PyGpuArray_DIMS(%(dx)s)[0], (void *)&PyGpuArray_DIMS(%(dx)s)[1], (void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset, (void *)&stride_DNLL0, (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset, (void *)&stride_SM0, (void *)&stride_SM1, (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset, (void *)&stride_YIDX0, (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset, (void *)&stride_DX0, (void *)&stride_DX1}; int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); %(err_check)s 
%(sync)s } assert(%(dx)s); """ % locals()
def gpu_kernels(self, node, nodename): # We can't rely on numpy for this, it changes with the OS CHARMAP = dict( int32="i", uint32="I", int64="l", uint64="L", float16="e", float32="f", float64="d", ) dtype_x = node.inputs[0].dtype dtype_y = node.inputs[1].dtype dtype_ind = node.inputs[2].dtype type_x = gpuarray.dtype_to_ctype(dtype_x) type_y = gpuarray.dtype_to_ctype(dtype_y) type_ind = gpuarray.dtype_to_ctype(dtype_ind) flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) kname = "k_vector_add_fast" k_var = "k_vector_add_fast_" + nodename code = """#include "cluda.h" KERNEL void k_vector_add_fast(const ga_size numRowsX, const ga_size numColsX, const ga_ssize stridesX0, const ga_ssize stridesX1, GLOBAL_MEM %(type_x)s *X, const ga_size offset_X, const ga_size numRowsY, const ga_size numColsY, const ga_ssize stridesY0, const ga_ssize stridesY1, GLOBAL_MEM %(type_y)s *Y, const ga_size offset_Y, const ga_size numIndices, const ga_ssize stridesIndices, GLOBAL_MEM %(type_ind)s *indices_arr, const ga_size offset_indices_arr, const ga_int set_instead_of_inc, GLOBAL_MEM ga_int *err) { X = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)X)+offset_X); Y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)Y)+offset_Y); indices_arr = (GLOBAL_MEM %(type_ind)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr); for (ga_int i = GID_0; i < numIndices; i += GDIM_0) { for (ga_int j = LID_0; j < numColsX; j += LDIM_0) { ga_ssize x_row = indices_arr[i * stridesIndices]; if (x_row < 0) x_row += numRowsX; ga_ssize y_row = i; if (x_row < numRowsX && x_row >= 0) { if (set_instead_of_inc) { atom_xchg_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } else { atom_add_%(tc)sg(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } } else { *err = 1; } } } return; } """ % dict( type_x=type_x, type_y=type_y, type_ind=type_ind, tc=CHARMAP[dtype_x] ) from pygpu.gpuarray import SIZE, SSIZE params = [ SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SSIZE, gpuarray.GpuArray, SIZE, "int32", gpuarray.GpuArray, ] return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]
def gpu_kernels(self, node, nodename): # load kernel source device_type = node.inputs[0].type.context.kind kernel_ext = {b'cuda': '.cu', b'opencl': '.cl'}[device_type] common_ext = {b'cuda': '.cuh', b'opencl': '.h'}[device_type] # prepare "$" macros if device_type == b'cuda': ndim = node.inputs[0].ndim dstv_strides_code = ''.join('ssize_t dstv_strides_%d, ' % i for i in range(ndim)) dsti_strides_code = ''.join('ssize_t dsti_strides_%d, ' % i for i in range(ndim)) src_strides_code = ''.join('ssize_t src_strides_%d, ' % i for i in range(ndim)) set_slice_code = ''' gidx = gid %% dims_%(i)d; gid /= dims_%(i)d; {dstv}; {dsti}; src = ptr_add(src, gidx*src_strides_%(i)d);\n'''.format( dstv='dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)' if self.return_values else '', dsti='dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)' if self.return_indices else '') set_slice_code = ''.join(set_slice_code % dict(i=j) for j in range(1, ndim)) if self.return_values: set_slice_code += """ dstv = ptr_add(dstv, dstv_offset); """ if self.return_indices: set_slice_code += """ dsti = ptr_add(dsti, dsti_offset); """ set_slice_code += """ src = ptr_add(src, src_offset); """ flags = Kernel.get_flags(node.inputs[0].dtype) subs = dict( inp_t=ga.dtype_to_ctype(node.inputs[0].dtype), out_t=ga.dtype_to_ctype(self.idx_dtype), dims=''.join('size_t dims_%d, ' % i for i in range(1, ndim)), dstv='INPUT_TYPE *dstv,' if self.return_values else '', dstv_offset='size_t dstv_offset,' if self.return_values else '', dsti='INDEX_TYPE *dsti,' if self.return_indices else '', dsti_offset='size_t dsti_offset,' if self.return_indices else '', dstv_strides=dstv_strides_code if self.return_values else '', dsti_strides=dsti_strides_code if self.return_indices else '', src_strides=src_strides_code, set_slice=set_slice_code, write_value=int(self.return_values), write_index=int(self.return_indices), ndim=str(ndim)) elif device_type == b'opencl': raise NotImplementedError() # setup parameters param_types = [ga.SIZE] * (ndim - 1) # dims for _ in range(self.return_values + self.return_indices): param_types.append(ga.GpuArray) # dst* param_types.append(ga.SIZE) # offset param_types.extend([ga.SSIZE] * ndim) # dst*_strides param_types.append(ga.SIZE) # k param_types.append(ga.GpuArray) # src param_types.append(ga.SIZE) # offset param_types.extend([ga.SSIZE] * ndim) # src_strides param_types.append(ga.SIZE) # size # load and compile kernels with open( os.path.join(os.path.dirname(__file__), 'c_code', 'topk_common' + common_ext)) as f: common_src = f.read() kernels = [] def build_kernel(fname, kname, subs): with open(os.path.join(os.path.dirname(__file__), 'c_code', fname)) as f: kernel_src = f.read() ker = Kernel( code=("#include <cluda.h>\n" + Template(common_src + kernel_src).substitute(**subs)), name=kname, params=param_types, flags=flags, objvar=kname + nodename) return ker subs['count_t'] = 'int' kernels.append( build_kernel('topk_dense' + kernel_ext, 'k_topk_dense', subs)) subs['kname'] = 'k_topk_dense_large' kernels.append( build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_large', subs)) subs['count_t'] = 'long long' subs['kname'] = 'k_topk_dense_xlarge' kernels.append( build_kernel('topk_dense_large' + kernel_ext, 'k_topk_dense_xlarge', subs)) return kernels
def gpu_kernels(self, node, name): dtype_d = node.inputs[0].dtype type_d = gpuarray.dtype_to_ctype(dtype_d) dtype_x = node.inputs[1].dtype type_x = gpuarray.dtype_to_ctype(dtype_x) dtype_y = node.outputs[0].dtype type_y = gpuarray.dtype_to_ctype(dtype_y) work_d = gpuarray.dtype_to_ctype(work_dtype(dtype_d)) load_d = load_w(dtype_d) work_x = gpuarray.dtype_to_ctype(work_dtype(dtype_x)) load_x = load_w(dtype_x) code = """ #include "cluda.h" KERNEL void binsearchsorted(const ga_ssize stridesD0, GLOBAL_MEM %(type_d)s *d, ga_size d_off, const ga_ssize stridesX0, GLOBAL_MEM %(type_x)s *x, ga_size x_off, const ga_ssize stridesY0, GLOBAL_MEM %(type_y)s *y, ga_size y_off, ga_size lx, ga_ssize ld) { d = (GLOBAL_MEM %(type_d)s *)(((GLOBAL_MEM char *)d) + d_off); x = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)x) + x_off); y = (GLOBAL_MEM %(type_y)s *)(((GLOBAL_MEM char *)y) + y_off); ga_size index = threadIdx.x + blockIdx.x * blockDim.x; if (index < lx) { ga_long a = 0; ga_long b = (ga_long)(ld - 1); %(work_d)s minval = %(load_d)s(d[a]); %(work_d)s maxval = %(load_d)s(d[b * stridesD0]); %(work_x)s val = %(load_x)s(x[index * stridesX0]); if (val > maxval) { a = (ga_long)ld; b = (ga_long)ld; } else if (val <= minval) { a = 0; b = 0; } while (b - a > 0) { ga_long h = (b + a) / 2; %(work_d)s t = %(load_d)s(d[h * stridesD0]); if (val < t) { b = h; } else { a = h + 1; } } y[index * stridesY0] = b; } }""" % dict(type_d=type_d, type_x=type_x, type_y=type_y, work_d=work_d, load_d=load_d, work_x=work_x, load_x=load_x, name=name) return [ Kernel(code=code, name="binsearchsorted", params=[ gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SIZE, gpuarray.SSIZE ], flags=Kernel.get_flags(dtype_d, dtype_x, dtype_y), objvar='k_binsearchsorted_' + name) ]
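# Illustrative sketch (not part of the op): each thread performs a
# binary search of x[index] into the sorted vector d, which is
# essentially what numpy.searchsorted does on the host (tie-breaking
# at the boundaries may differ slightly from the kernel's edge-case
# handling, so treat this only as a conceptual reference):
def _searchsorted_reference(d, x):
    import numpy as np
    return np.searchsorted(np.asarray(d), np.asarray(x))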
def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) mode = self.mode kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = ( """ // a version that uses less registers but doesn't work in all cases. KERNEL void %(kname)s( const int nb_batch, const int nb_stack, const int height, const int width, const int c, const int d, const int step_x, const int step_y, const int grid_c, const int grid_d, const size_t stride0, const size_t stride1, const size_t stride2, const size_t stride3, const %(type_ten4)s * global_ten4, const size_t offset_ten4, const size_t out_s0, const size_t out_s1, %(type_z)s * global_out, const size_t offset_out ) { const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_y = d/2; global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4); global_out = (%(type_z)s *)(((char *)global_out)+offset_out); for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=gridDim.x*blockDim.z){ const int b = tblock%%grid_d; int left = tblock/grid_d; const int a = left%%grid_c; left = left/grid_c; const int s = left%%nb_stack; left = left/nb_stack; const int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); int i = threadIdx.y; // loop over c { int ten4_2 = i + a * step_x; if("%(mode)s"=="wrap_centered"){ ten4_2 -= wrap_centered_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } int j = threadIdx.x; // loop over d { int ten4_3 = j + b * step_y; if("%(mode)s"=="wrap_centered"){ ten4_3 -= wrap_centered_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int z_col = j + d * i; int z_idx = z_col * out_s1 + z_row * out_s0; global_out[z_idx] = global_ten4[ten4_idx]; } } } }""" % locals() ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = ( """ KERNEL void %(kname)s( const int nb_batch, const int nb_stack, const int height, const int width, const int c, const int d, const int step_x, const int step_y, const int grid_c, const int grid_d, const size_t stride0, const size_t stride1, const size_t stride2, const size_t stride3, const %(type_ten4)s * global_ten4, const size_t offset_ten4, const size_t out_s0, const size_t out_s1, %(type_z)s * global_out, const size_t offset_out ) { const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_y = d/2; global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4); global_out = (%(type_z)s *)(((char *)global_out)+offset_out); for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=gridDim.x*blockDim.z){ const int b = tblock%%grid_d; int left = tblock/grid_d; const int a = left%%grid_c; left = left/grid_c; const int s = left%%nb_stack; left = left/nb_stack; const int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; 
if(a>grid_c)continue; if(b>grid_d)continue; int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (int i = threadIdx.y; i < c; i+=blockDim.y) { int ten4_2 = i + a * step_x; if("%(mode)s"=="wrap_centered"){ ten4_2 -= wrap_centered_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } // loop over d for (int j = threadIdx.x; j < d; j+=blockDim.x) { int ten4_3 = j + b * step_y; if("%(mode)s"=="wrap_centered"){ ten4_3 -= wrap_centered_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int z_col = j + d * i; int z_idx = z_col * out_s1 + z_row * out_s0; global_out[z_idx] = global_ten4[ten4_idx]; } } } } """ % locals() ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def gpu_kernels(self, node, nodename): CHARMAP = dict(int32='i', uint32='I', int64='l', uint64='L', float16='e', float32='f', float64='d') dtype_in = node.inputs[0].dtype dtype_out = node.outputs[0].dtype dtype_idx = node.inputs[1].dtype type_in = gpuarray.dtype_to_ctype(dtype_in) type_out = gpuarray.dtype_to_ctype(dtype_out) type_idx = gpuarray.dtype_to_ctype(dtype_idx) flags = Kernel.get_flags(dtype_in, dtype_out, dtype_idx) kname = "k_vector_select_fast" k_var = "k_vector_select_fast_" + nodename code = """#include "cluda.h" KERNEL void k_vector_select_fast(const ga_size numRowsOut, const ga_size numColsOut, const ga_ssize stridesOut0, const ga_ssize stridesOut1, GLOBAL_MEM %(type_out)s *Out, const ga_size offset_Out, const ga_size numRowsIn, const ga_size numColsIn, const ga_ssize stridesIn0, const ga_ssize stridesIn1, GLOBAL_MEM %(type_in)s *In, const ga_size offset_In, const ga_size numIndices, const ga_ssize stridesIndices, GLOBAL_MEM %(type_idx)s *indices_arr, const ga_size offset_indices_arr, GLOBAL_MEM ga_int *err) { Out = (GLOBAL_MEM %(type_out)s *)(((GLOBAL_MEM char *)Out)+offset_Out); In = (GLOBAL_MEM %(type_in)s *)(((GLOBAL_MEM char *)In)+offset_In); indices_arr = (GLOBAL_MEM %(type_idx)s *)(((GLOBAL_MEM char *)indices_arr)+offset_indices_arr); for (ga_int i = GID_0; i < numIndices; i += GDIM_0) { for (ga_int j = LID_0; j < numColsIn; j += LDIM_0) { ga_ssize in_row = indices_arr[i * stridesIndices]; if (in_row < 0) in_row += numRowsIn; ga_ssize out_row = i; if (in_row < numRowsIn && in_row >= 0) { Out[(out_row * stridesOut0) + (j * stridesOut1)] = In[(in_row * stridesIn0) + (j * stridesIn1)]; } else { *err = 1; } } } return; } """ % dict(type_in=type_in, type_out=type_out, type_idx=type_idx, tc=CHARMAP[dtype_in]) from pygpu.gpuarray import SIZE, SSIZE params = [ SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SIZE, SSIZE, SSIZE, gpuarray.GpuArray, SIZE, SIZE, SSIZE, gpuarray.GpuArray, SIZE, gpuarray.GpuArray ] return [ Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var) ]
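# Hedged sketch (not part of the op): what k_vector_select_fast computes, written
# with NumPy.  Negative indices wrap once, as in the kernel; an index still out of
# range after wrapping makes the kernel set *err = 1 and skip the row, which this
# sketch models by raising.  Names below are illustrative only.
import numpy as np

def vector_select_reference(In, indices):
    Out = np.empty((len(indices), In.shape[1]), dtype=In.dtype)
    for i, row in enumerate(indices):
        if row < 0:
            row += In.shape[0]
        if not (0 <= row < In.shape[0]):
            raise IndexError("row index out of bounds (kernel sets *err = 1)")
        Out[i] = In[row]
    return Out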
def gpu_kernels(self, node, nodename): dtype_dnll = node.inputs[0].dtype dtype_sm = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_dx = node.outputs[0].dtype work_dnll = work_dtype(dtype_dnll) load_dnll = load_w(dtype_dnll) load_sm = load_w(dtype_sm) write_dx = write_w(dtype_dx) flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx) wtype_dnll = gpuarray.dtype_to_ctype(work_dnll) type_dnll = gpuarray.dtype_to_ctype(dtype_dnll) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) type_dx = gpuarray.dtype_to_ctype(dtype_dx) kname = "kCrossEntropySoftmax1HotWithBiasDx" k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, ] sio = StringIO() print("""#include "cluda.h" KERNEL void %(kname)s( const ga_size N, const ga_size K, GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0, GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1, GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0, GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1) { dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll); sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx); dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx); for (ga_int i = GID_0; i < N; i += GDIM_0) { %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]); %(type_y_idx)s y_i = y_idx[i * y_idx_s0]; for (ga_int j = LID_0; j < K; j += LDIM_0) { if (y_i == j) { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0)); } else { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * %(load_sm)s(sm[i * sm_s0 + j * sm_s1])); } } } } """ % locals(), file=sio) return [Kernel(code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var)]
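# Hedged sketch (not part of the op): the gradient the kernel above writes,
# expressed with NumPy.  Row i gets dnll[i] * (sm[i, j] - 1) at the target column
# y_idx[i] and dnll[i] * sm[i, j] elsewhere, i.e. dx = dnll[:, None] * (sm - one_hot(y_idx)).
import numpy as np

def crossentropy_softmax_dx_reference(dnll, sm, y_idx):
    one_hot = np.zeros_like(sm)
    one_hot[np.arange(sm.shape[0]), y_idx] = 1
    return dnll[:, None] * (sm - one_hot)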
def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) mode = self.mode kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = """ //a version that use less register but don't work in all case. KERNEL void %(kname)s( const int nb_batch, const int nb_stack, const int height, const int width, const int c, const int d, const int step_x, const int step_y, const int grid_c, const int grid_d, const size_t stride0, const size_t stride1, const size_t stride2, const size_t stride3, const %(type_ten4)s * global_ten4, const size_t offset_ten4, const size_t out_s0, const size_t out_s1, %(type_z)s * global_out, const size_t offset_out ) { const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_y = d/2; global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4); global_out = (%(type_z)s *)(((char *)global_out)+offset_out); for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=gridDim.x*blockDim.z){ const int b = tblock%%grid_d; int left = tblock/grid_d; const int a = left%%grid_c; left = left/grid_c; const int s = left%%nb_stack; left = left/nb_stack; const int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); int i = threadIdx.y; // loop over c { int ten4_2 = i + a * step_x; if("%(mode)s"=="wrap_centered"){ ten4_2 -= wrap_centered_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } int j = threadIdx.x; // loop over d { int ten4_3 = j + b * step_y; if("%(mode)s"=="wrap_centered"){ ten4_3 -= wrap_centered_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int z_col = j + d * i; int z_idx = z_col * out_s1 + z_row * out_s0; global_out[z_idx] = global_ten4[ten4_idx]; } } } }""" % locals() params = [ 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'uintp', 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = """ KERNEL void %(kname)s( const int nb_batch, const int nb_stack, const int height, const int width, const int c, const int d, const int step_x, const int step_y, const int grid_c, const int grid_d, const size_t stride0, const size_t stride1, const size_t stride2, const size_t stride3, const %(type_ten4)s * global_ten4, const size_t offset_ten4, const size_t out_s0, const size_t out_s1, %(type_z)s * global_out, const size_t offset_out ) { const int wrap_centered_idx_shift_x = c/2; const int wrap_centered_idx_shift_y = d/2; global_ten4 = (const %(type_ten4)s *)(((char *)global_ten4)+offset_ten4); global_out = (%(type_z)s *)(((char *)global_out)+offset_out); for(int tblock = blockIdx.x*blockDim.z+threadIdx.z; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=gridDim.x*blockDim.z){ const int b = tblock%%grid_d; int left = tblock/grid_d; const int a = left%%grid_c; left = left/grid_c; const int s = left%%nb_stack; left = left/nb_stack; const int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; 
if(b>grid_d)continue; int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (int i = threadIdx.y; i < c; i+=blockDim.y) { int ten4_2 = i + a * step_x; if("%(mode)s"=="wrap_centered"){ ten4_2 -= wrap_centered_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } // loop over d for (int j = threadIdx.x; j < d; j+=blockDim.x) { int ten4_3 = j + b * step_y; if("%(mode)s"=="wrap_centered"){ ten4_3 -= wrap_centered_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; int z_col = j + d * i; int z_idx = z_col * out_s1 + z_row * out_s0; global_out[z_idx] = global_ten4[ten4_idx]; } } } } """ % locals() params = [ 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'uintp', 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) mode = self.mode kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = """ // a version that uses less registers but doesn't work in all cases. KERNEL void %(kname)s( const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_idx_shift_x = c/2; const ga_int wrap_centered_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); ga_int i = LID_1; // loop over c { ga_int ten4_2 = i + a * step_x; if("%(mode)s"=="wrap_centered"){ ten4_2 -= wrap_centered_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } ga_int j = LID_0; // loop over d { ga_int ten4_3 = j + b * step_y; if("%(mode)s"=="wrap_centered"){ ten4_3 -= wrap_centered_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; global_out[z_idx] = global_ten4[ten4_idx]; } } } }""" % locals() params = [ 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'uintp', 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', ] kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = """ KERNEL void %(kname)s( const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_idx_shift_x = c/2; const ga_int wrap_centered_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = 
tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (ga_int i = LID_1; i < c; i+=LDIM_1) { ga_int ten4_2 = i + a * step_x; if("%(mode)s"=="wrap_centered"){ ten4_2 -= wrap_centered_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } // loop over d for (ga_int j = LID_0; j < d; j+=LDIM_0) { ga_int ten4_3 = j + b * step_y; if("%(mode)s"=="wrap_centered"){ ten4_3 -= wrap_centered_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; global_out[z_idx] = global_ten4[ten4_idx]; } } } } """ % locals() params = [ 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'intc', 'uintp', 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', ] kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def gpu_kernels(self, node, nodename): dtype_ten4 = node.inputs[0].dtype dtype_z = node.outputs[0].dtype flags = Kernel.get_flags(dtype_ten4, dtype_z) type_ten4 = gpuarray.dtype_to_ctype(dtype_ten4) type_z = gpuarray.dtype_to_ctype(dtype_z) # `BORDER_MODE`'s c_support_code() contains C constants definitions that are useful here. mode_constants = self.BORDER_MODE.c_support_code() kernels = [] kname = "k_multi_warp_less" k_var = "k_multi_warp_less_" + nodename code = """#include "cluda.h" // a version that uses less registers but doesn't work in all cases. %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); ga_int i = LID_1; // loop over c { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } ga_int j = LID_0; // loop over d { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED){ ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } }""" % dict( kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants, ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "k_multi_warp" k_var = "k_multi_warp_" + nodename code = """#include "cluda.h" %(mode_constants)s KERNEL void %(kname)s( const ga_int mode, const ga_int nb_batch, const ga_int nb_stack, const ga_int height, const ga_int width, const ga_int c, const ga_int d, const ga_int step_x, const ga_int step_y, const ga_int grid_c, const 
ga_int grid_d, const ga_size stride0, const ga_size stride1, const ga_size stride2, const ga_size stride3, GLOBAL_MEM const %(type_ten4)s * global_ten4, const ga_size offset_ten4, const ga_size out_s0, const ga_size out_s1, GLOBAL_MEM %(type_z)s * global_out, const ga_size offset_out ) { const ga_int wrap_centered_half_idx_shift_x = c/2; const ga_int wrap_centered_half_idx_shift_y = d/2; global_ten4 = (GLOBAL_MEM const %(type_ten4)s *)(((GLOBAL_MEM char *)global_ten4)+offset_ten4); global_out = (GLOBAL_MEM %(type_z)s *)(((GLOBAL_MEM char *)global_out)+offset_out); for(ga_int tblock = GID_0*LDIM_2+LID_2; tblock<nb_batch*nb_stack*grid_c*grid_d; tblock+=GDIM_0*LDIM_2){ const ga_int b = tblock%%grid_d; ga_int left = tblock/grid_d; const ga_int a = left%%grid_c; left = left/grid_c; const ga_int s = left%%nb_stack; left = left/nb_stack; const ga_int n = left; if(n>nb_batch)continue; if(s>nb_stack)continue; if(a>grid_c)continue; if(b>grid_d)continue; ga_int z_row = b + grid_d*(a + grid_c* (s + nb_stack*n)); // loop over c for (ga_int i = LID_1; i < c; i+=LDIM_1) { ga_int ten4_2 = i + a * step_x; if(mode == MODE_WRAP_CENTERED) { ten4_2 -= wrap_centered_half_idx_shift_x; if ( ten4_2 < 0 ) ten4_2 += height; else if (ten4_2 >= height) ten4_2 -= height; } else if (mode == MODE_HALF) { ten4_2 -= wrap_centered_half_idx_shift_x; } else if (mode == MODE_FULL) { ten4_2 -= c - 1; } // loop over d for (ga_int j = LID_0; j < d; j+=LDIM_0) { ga_int ten4_3 = j + b * step_y; if(mode == MODE_WRAP_CENTERED) { ten4_3 -= wrap_centered_half_idx_shift_y; if ( ten4_3 < 0 ) ten4_3 += width; else if (ten4_3 >= width) ten4_3 -= width; } else if (mode == MODE_HALF) { ten4_3 -= wrap_centered_half_idx_shift_y; } else if (mode == MODE_FULL) { ten4_3 -= d - 1; } ga_int z_col = j + d * i; ga_int z_idx = z_col * out_s1 + z_row * out_s0; if(ten4_2 < 0 || ten4_2 >= height || ten4_3 < 0 || ten4_3 >= width){ global_out[z_idx] = 0; } else { ga_int ten4_idx = stride3*ten4_3 + stride2*ten4_2 + stride1*s + stride0*n; global_out[z_idx] = global_ten4[ten4_idx]; } } } } } """ % dict( kname=kname, type_ten4=type_ten4, type_z=type_z, mode_constants=mode_constants, ) params = [ "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "intc", "uintp", "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", "uintp", "uintp", gpuarray.GpuArray, "uintp", ] kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
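# Hedged sketch (not part of the op): a pure-Python transcription of the indexing
# done by the k_multi_warp kernels above, convenient for checking them on small
# inputs.  The kernels receive the border mode as an integer constant taken from
# BORDER_MODE.c_support_code(); this sketch uses plain strings instead, which is
# a simplification, and the function name is illustrative only.
import numpy as np

def images2neibs_reference(ten4, c, d, step_x, step_y, grid_c, grid_d, mode):
    nb_batch, nb_stack, height, width = ten4.shape
    out = np.zeros((nb_batch * nb_stack * grid_c * grid_d, c * d), dtype=ten4.dtype)
    for n in range(nb_batch):
        for s in range(nb_stack):
            for a in range(grid_c):
                for b in range(grid_d):
                    z_row = b + grid_d * (a + grid_c * (s + nb_stack * n))
                    for i in range(c):
                        ten4_2 = i + a * step_x
                        if mode == "wrap_centered":
                            ten4_2 -= c // 2
                            if ten4_2 < 0:
                                ten4_2 += height
                            elif ten4_2 >= height:
                                ten4_2 -= height
                        elif mode == "half":
                            ten4_2 -= c // 2
                        elif mode == "full":
                            ten4_2 -= c - 1
                        for j in range(d):
                            ten4_3 = j + b * step_y
                            if mode == "wrap_centered":
                                ten4_3 -= d // 2
                                if ten4_3 < 0:
                                    ten4_3 += width
                                elif ten4_3 >= width:
                                    ten4_3 -= width
                            elif mode == "half":
                                ten4_3 -= d // 2
                            elif mode == "full":
                                ten4_3 -= d - 1
                            # out-of-range positions stay 0, matching the kernel
                            if 0 <= ten4_2 < height and 0 <= ten4_3 < width:
                                out[z_row, j + d * i] = ten4[n, s, ten4_2, ten4_3]
    return out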
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, manner_fn, manner_init, b='', stride_b='', load_b='', dtype='float32'): """ Return C++ code for a function that reduces a contiguous buffer. This function leaves the answer in position 0 of the buffer. The rest of the buffer is trashed by this function. Parameters ---------- N Length of the buffer. buf Buffer pointer of size warpSize * sizeof(dtype). x Input data. stride_x Input data stride. load_x Wrapper to read from x. pos Index of executing thread. count Number of executing threads. manner_fn A function that accepts two strings, the C expressions for arguments a and b, and returns C code for their reduction. For example, return "%(a)s + %(b)s" for a sum reduction. manner_init A function that accepts one string, the C expression for argument a, and returns C code for its initialization. b Optional, pointer to the bias. stride_b Optional, the stride of b if b is provided. load_b Optional, wrapper to read from b if b is provided. dtype Optional, the dtype of the output. Notes ----- `buf` should be in gpu shared memory, as it is accessed many times. """ if b: init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +" " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals()) loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + " "%(load_b)s(%(b)s[i * %(stride_b)s])" % locals())) else: init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals()) loop_line = manner_fn("red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" % locals())) loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf) r_16 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+16]" % (buf, pos)) r_8 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+8]" % (buf, pos)) r_4 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+4]" % (buf, pos)) r_2 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+2]" % (buf, pos)) r_1 = manner_fn("%s[%s]" % (buf, pos), "%s[%s+1]" % (buf, pos)) ctype = gpuarray.dtype_to_ctype(dtype) return """ { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = %(init)s; #pragma unroll 16 for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s){ red = %(loop_line)s; } buf[%(pos)s] = red; __syncthreads(); if (%(pos)s < warpSize) { for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) { %(buf)s[%(pos)s] = %(loop_line2)s; } if (%(pos)s < 16) { //reduce so that %(pos)s 0 has the reduction of everything if(%(pos)s + 16 < %(N)s) %(buf)s[%(pos)s] = %(r_16)s; if(%(pos)s + 8 < %(N)s) %(buf)s[%(pos)s] = %(r_8)s; if(%(pos)s + 4 < %(N)s) %(buf)s[%(pos)s] = %(r_4)s; if(%(pos)s + 2 < %(N)s) %(buf)s[%(pos)s] = %(r_2)s; if(%(pos)s + 1 < %(N)s) %(buf)s[%(pos)s] = %(r_1)s; } } } """ % locals()
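# Hedged usage sketch: manner_fn and manner_init receive C expressions as strings
# and return C expressions as strings.  The call below mirrors the sum-of-exponentials
# reduction that inline_softmax_fixed_shared (further below) builds on top of this
# helper; the argument strings ('N', 'buf', 'x_ptr', ...) are illustrative only and
# assume a surrounding kernel that already defines `row_max`.
reduce_sum_exp_code = inline_reduce_fixed_shared(
    'N', 'buf', 'x_ptr', 'sx1', '', 'threadIdx.x', 'blockDim.x',
    manner_fn=lambda a, b: "%s + %s" % (a, b),
    manner_init=lambda a: "exp(%s - row_max)" % a,
    dtype='float32')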
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_y = node.inputs[1].dtype dtype_ind = node.inputs[2].dtype dtype_out = node.outputs[0].dtype itemsize_x = numpy.dtype(dtype_x).itemsize itemsize_y = numpy.dtype(dtype_y).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) type_x = gpuarray.dtype_to_ctype(dtype_x) type_y = gpuarray.dtype_to_ctype(dtype_y) type_ind = gpuarray.dtype_to_ctype(dtype_ind) type_out = gpuarray.dtype_to_ctype(dtype_out) kname = "k_vector_add_fast" k_var = "k_vector_add_fast_" + nodename code = """ /* * This is an atomicAdd that works for doubles since that is not provided * natively by cuda. */ __device__ ga_double atomicAdd(ga_double* address, ga_double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } __device__ ga_double atomicExch(ga_double *address, ga_double val) { return atomicExch((unsigned long long int *)address, __double_as_longlong(val)); } /* * This is a version of atomicAdd that works for half-floats. It may * read and write 2 bytes more than the size of the array if the array * has an uneven number of elements. The actual value at that spot * will not be modified. */ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, sum, new_; old = *base; do { assumed = old; sum = __float2half_rn( __half2float(val) + __half2float((ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410))); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); return (ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); } __device__ ga_half atomicExch(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, new_; old = *base; do { assumed = old; new_ = __byte_perm(old, val, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); return (ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 
0x4432 : 0x4410); } KERNEL void k_vector_add_fast(const ga_size numRowsX, const ga_size numColsX, const ga_ssize stridesX0, const ga_ssize stridesX1, %(type_x)s *X, const ga_size offset_X, const ga_size numRowsY, const ga_size numColsY, const ga_ssize stridesY0, const ga_ssize stridesY1, %(type_y)s *Y, const ga_size offset_Y, const ga_size numIndices, const ga_ssize stridesIndices, %(type_ind)s *indices_arr, const ga_size offset_indices_arr, const int set_instead_of_inc, ga_int *err) { X = (%(type_x)s *)(((char *)X)+offset_X); Y = (%(type_y)s *)(((char *)Y)+offset_Y); indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr); for (int i = (blockIdx.x); i < numIndices; i += gridDim.x) { for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) { ga_ssize x_row = indices_arr[i * stridesIndices]; if (x_row < 0) x_row += numRowsX; ga_ssize y_row = i; if (x_row < numRowsX && x_row >= 0) { if (set_instead_of_inc) { atomicExch(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } else { atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } } else { *err = 1; } } } return; } """ % locals() params = [ 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'int', gpuarray.GpuArray ] return [ Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var) ]
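# Hedged sketch (not part of the op): the update k_vector_add_fast performs, written
# with NumPy.  np.add.at accumulates correctly for repeated indices, like the atomic
# adds; with set_instead_of_inc, the kernel's unordered atomicExch leaves an arbitrary
# winner for repeated indices, whereas NumPy keeps the last occurrence.
import numpy as np

def vector_add_reference(X, Y, indices, set_instead_of_inc=False):
    idx = np.where(indices < 0, indices + X.shape[0], indices)
    if np.any((idx < 0) | (idx >= X.shape[0])):
        raise IndexError("row index out of bounds (kernel sets *err = 1)")
    if set_instead_of_inc:
        X[idx] = Y
    else:
        np.add.at(X, idx, Y)
    return X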
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype work_x = work_dtype(dtype_x) work_b = work_dtype(dtype_b) load_x = load_w(dtype_x) load_b = load_w(dtype_b) write_x = write_w(dtype_x) write_b = write_w(dtype_b) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) type_x = gpuarray.dtype_to_ctype(work_x) type_b = gpuarray.dtype_to_ctype(work_b) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) kname = "k_xent_sm_1hot_bias" k_var = "k_xent_sm_1hot_bias_" + nodename sio = StringIO() print(""" KERNEL void %(kname)s(const ga_size M, const ga_size N, const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0, %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0, %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1, %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0) { x_data = (const %(type_x)s *)(((char *)x_data)+offset_x); b = (const %(type_b)s *)(((char *)b)+offset_b); y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx); nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll); sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm); am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am); for (int row = blockIdx.x; row < M; row += gridDim.x){ const %(type_x)s* x = x_data + xs0 * row; const %(type_y_idx)s y_idx = y_idx_data[row * y_idxs0]; %(type_x)s* sm = sm_data + sms0 * row; %(type_x)s sum = 0.0; int row_max_j = 0; %(type_x)s row_max = %(load_x)s(x[0]) + %(load_b)s(b[0]); for (int j = 1; j < N; ++j) { %(type_x)s row_ij = %(load_x)s(x[j*xs1]) + %(load_b)s(b[j*bs0]); //todo: store to shared memory row_max_j = (row_ij > row_max) ? j : row_max_j; row_max = (row_ij > row_max) ? row_ij : row_max; } //compute the exp for (int j = 0; j < N; ++j) { %(type_x)s row_ij = %(load_x)s(x[j*xs1]) + %(load_b)s(b[j*bs0]); %(type_x)s sm_ij = exp(row_ij - row_max); sum += sm_ij; sm[j * sms1] = %(write_x)s(sm_ij); } %(type_x)s sum_inv = 1.0 / sum; for (int j = 0; j < N; ++j) { %(type_x)s __tmp = %(load_x)s(sm[j * sms1]); __tmp *= sum_inv; sm[j * sms1] = %(write_x)s(__tmp); } if ((y_idx >= N) || (y_idx < 0)) { //TODO: set raise an error bit in a global var? nll_data[row*nlls0] = %(write_x)s(0.0); // raise some suspicion at least... } else { nll_data[row*nlls0] = %(write_x)s(- %(load_x)s(x[y_idx*xs1]) - %(load_b)s(b[y_idx*bs0]) + row_max + log(sum)); } am_data[row*ams0] = row_max_j; } } """ % locals(), file=sio) params = [ 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp' ] return [Kernel(code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var)]
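# Hedged sketch (not part of the op): the per-row quantities the kernel above
# computes, expressed with NumPy for reference.  The kernel additionally writes
# nll = 0 when y_idx is out of range instead of raising, which this sketch does
# not reproduce.
import numpy as np

def xent_sm_1hot_bias_reference(x, b, y_idx):
    z = x + b                                   # bias added to every row
    row_max = z.max(axis=1, keepdims=True)
    am = z.argmax(axis=1)                       # argmax output
    e = np.exp(z - row_max)
    s = e.sum(axis=1, keepdims=True)
    sm = e / s                                  # softmax output
    rows = np.arange(x.shape[0])
    nll = -z[rows, y_idx] + row_max[:, 0] + np.log(s[:, 0])
    return nll, sm, am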
def gpu_kernels(self, node, nodename): # load kernel source device_type = node.inputs[0].type.context.kind kernel_ext = {b"cuda": ".cu", b"opencl": ".cl"}[device_type] common_ext = {b"cuda": ".cuh", b"opencl": ".h"}[device_type] # prepare "$" macros if device_type == b"cuda": ndim = node.inputs[0].ndim dstv_strides_code = "".join( f"ssize_t dstv_strides_{i}, " for i in range(ndim) ) dsti_strides_code = "".join( f"ssize_t dsti_strides_{i}, " for i in range(ndim) ) src_strides_code = "".join( f"ssize_t src_strides_{i}, " for i in range(ndim) ) set_slice_code = """ gidx = gid %% dims_%(i)d; gid /= dims_%(i)d; {dstv}; {dsti}; src = ptr_add(src, gidx*src_strides_%(i)d);\n""".format( dstv="dstv = ptr_add(dstv, gidx*dstv_strides_%(i)d)" if self.return_values else "", dsti="dsti = ptr_add(dsti, gidx*dsti_strides_%(i)d)" if self.return_indices else "", ) set_slice_code = "".join(set_slice_code % dict(i=j) for j in range(1, ndim)) if self.return_values: set_slice_code += """ dstv = ptr_add(dstv, dstv_offset); """ if self.return_indices: set_slice_code += """ dsti = ptr_add(dsti, dsti_offset); """ set_slice_code += """ src = ptr_add(src, src_offset); """ flags = Kernel.get_flags(node.inputs[0].dtype) subs = dict( inp_t=ga.dtype_to_ctype(node.inputs[0].dtype), out_t=ga.dtype_to_ctype(self.idx_dtype), dims="".join(f"size_t dims_{i}, " for i in range(1, ndim)), dstv="INPUT_TYPE *dstv," if self.return_values else "", dstv_offset="size_t dstv_offset," if self.return_values else "", dsti="INDEX_TYPE *dsti," if self.return_indices else "", dsti_offset="size_t dsti_offset," if self.return_indices else "", dstv_strides=dstv_strides_code if self.return_values else "", dsti_strides=dsti_strides_code if self.return_indices else "", src_strides=src_strides_code, set_slice=set_slice_code, write_value=int(self.return_values), write_index=int(self.return_indices), ndim=str(ndim), ) elif device_type == b"opencl": raise NotImplementedError() # setup parameters param_types = [ga.SIZE] * (ndim - 1) # dims for _ in range(self.return_values + self.return_indices): param_types.append(ga.GpuArray) # dst* param_types.append(ga.SIZE) # offset param_types.extend([ga.SSIZE] * ndim) # dst*_strides param_types.append(ga.SIZE) # k param_types.append(ga.GpuArray) # src param_types.append(ga.SIZE) # offset param_types.extend([ga.SSIZE] * ndim) # src_strides param_types.append(ga.SIZE) # size # load and compile kernels with open( os.path.join( os.path.dirname(__file__), "c_code", "topk_common" + common_ext ) ) as f: common_src = f.read() kernels = [] def build_kernel(fname, kname, subs): with open(os.path.join(os.path.dirname(__file__), "c_code", fname)) as f: kernel_src = f.read() ker = Kernel( code=( "#include <cluda.h>\n" + Template(common_src + kernel_src).substitute(**subs) ), name=kname, params=param_types, flags=flags, objvar=kname + nodename, ) return ker subs["count_t"] = "int" kernels.append(build_kernel("topk_dense" + kernel_ext, "k_topk_dense", subs)) subs["kname"] = "k_topk_dense_large" kernels.append( build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_large", subs) ) subs["count_t"] = "long long" subs["kname"] = "k_topk_dense_xlarge" kernels.append( build_kernel("topk_dense_large" + kernel_ext, "k_topk_dense_xlarge", subs) ) return kernels
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_sm = node.outputs[0].dtype load_x = load_w(node.inputs[0].dtype) load_b = load_w(node.inputs[1].dtype) write_sm = write_w(node.outputs[0].dtype) work_sm = work_dtype(node.outputs[0].dtype) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_acc = gpuarray.dtype_to_ctype(work_sm) ctype = gpuarray.dtype_to_ctype(work_sm) params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, ] kernels = [] kname = "kSoftmaxWithBias" k_var = "kSoftmaxWithBias_" + nodename code = """ KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) { GA_DECL_SHARED_BODY(%(type_acc)s, buf); LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N; x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){ for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]); buf[tx] += %(load_b)s(b[tx * sb0]); buf2[tx] = buf[tx]; } local_barrier(); { // This function trashes buf[1..GA_WARP_SIZE], // leaving the reduction result in buf[0]. if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE) { buf[LID_0] = max(buf[LID_0], buf[i]); } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]); local_barrier(); } } %(ctype)s row_max = buf[0]; local_barrier(); for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){; buf[__i] = exp(buf2[__i] - row_max); buf2[__i] = buf[__i]; } local_barrier(); { // This function trashes buf[1..GA_WARP_SIZE], // leaving the reduction result in buf[0]. 
if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE) { buf[LID_0] = buf[LID_0] + buf[i]; } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = buf[LID_0] + buf[LID_0+_n]; local_barrier(); } } %(ctype)s row_sum = buf[0]; local_barrier(); for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){ buf[__i] = buf2[__i] / row_sum; } local_barrier(); for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]); } local_barrier(); } } """ % locals() kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "kSoftmaxWithBias_fixed_shared" k_var = "kSoftmaxWithBias_fixed_shared" + nodename code = """ KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) { GA_DECL_SHARED_BODY(%(type_acc)s, buf); x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){ GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0]; GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0]; { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]); #pragma unroll 16 for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) { red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0])); } buf[LID_0] = red; local_barrier(); if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) { buf[LID_0] = max(buf[LID_0], buf[i]); } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]); local_barrier(); } } %(ctype)s row_max = buf[0]; local_barrier(); { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max); #pragma unroll 16 for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) { red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max); } buf[LID_0] = red; local_barrier(); if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) { buf[LID_0] = buf[LID_0] + buf[i]; } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = buf[LID_0] + buf[LID_0+_n]; local_barrier(); } } %(ctype)s row_sum = buf[0]; local_barrier(); for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum); } local_barrier(); } } """ % locals() kernels.append(Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, pos, count, manner_fn, manner_init, b='', stride_b='', load_b='', dtype='float32'): """ Return C++ code for a function that reduces a contiguous buffer. This function leaves the answer in position 0 of the buffer. The rest of the buffer is trashed by this function. Parameters ---------- N Length of the buffer. buf Buffer pointer of size warpSize * sizeof(dtype). x Input data. stride_x Input data stride. load_x Wrapper to read from x. pos Index of executing thread. count Number of executing threads. manner_fn A function that accepts two strings, the C expressions for arguments a and b, and returns C code for their reduction. For example, return "%(a)s + %(b)s" for a sum reduction. manner_init A function that accepts one string, the C expression for argument a, and returns C code for its initialization. b Optional, pointer to the bias. stride_b Optional, the stride of b if b is provided. load_b Optional, wrapper to read from b if b is provided. dtype Optional, the dtype of the output. Notes ----- `buf` should be in gpu shared memory, as it is accessed many times. """ if b: init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s]) +" " %(load_b)s(%(b)s[%(pos)s * %(stride_b)s])" % locals()) loop_line = manner_fn( "red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s]) + " "%(load_b)s(%(b)s[i * %(stride_b)s])" % locals())) else: init = manner_init("%(load_x)s(%(x)s[%(pos)s * %(stride_x)s])" % locals()) loop_line = manner_fn( "red", manner_init("%(load_x)s(%(x)s[i * %(stride_x)s])" % locals())) loop_line2 = manner_fn("%s[%s]" % (buf, pos), "%s[i]" % buf) r_n = manner_fn("%s[%s]" % (buf, pos), "%s[%s+_n]" % (buf, pos)) ctype = gpuarray.dtype_to_ctype(dtype) return """ { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = %(init)s; #pragma unroll 16 for (int i = %(pos)s + %(count)s; i<%(N)s; i += %(count)s) { red = %(loop_line)s; } buf[%(pos)s] = red; __syncthreads(); if (%(pos)s < warpSize) { for (int i = %(pos)s + warpSize; i < %(count)s; i += warpSize) { %(buf)s[%(pos)s] = %(loop_line2)s; } } __syncthreads(); //reduce so that %(pos)s 0 has the reduction of everything for (unsigned int _n = warpSize / 2; _n > 0; _n /= 2) { if (%(pos)s < _n && %(pos)s + _n < %(N)s) %(buf)s[%(pos)s] = %(r_n)s; __syncthreads(); } } """ % locals()
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype work_x = work_dtype(dtype_x) work_b = work_dtype(dtype_b) load_x = load_w(dtype_x) load_b = load_w(dtype_b) write_x = write_w(dtype_x) write_b = write_w(dtype_b) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) work_x = gpuarray.dtype_to_ctype(work_x) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) kname = "k_xent_sm_1hot_bias" k_var = "k_xent_sm_1hot_bias_" + nodename f = '' if dtype_x == 'float64' else 'f' sio = StringIO() print(""" KERNEL void %(kname)s(const ga_size M, const ga_size N, const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0, %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0, %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1, %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0) { x_data = (const %(type_x)s *)(((char *)x_data)+offset_x); b = (const %(type_b)s *)(((char *)b)+offset_b); y_idx_data = (const %(type_y_idx)s *)(((char *)y_idx_data)+offset_y_idx); nll_data = (%(type_x)s *)(((char *)nll_data)+offset_nll); sm_data = (%(type_x)s *)(((char *)sm_data)+offset_sm); am_data = (%(type_y_idx)s *)(((char *)am_data)+offset_am); for (int row = blockIdx.x; row < M; row += gridDim.x){ const %(type_x)s* x = x_data + xs0 * row; %(type_x)s* sm = sm_data + sms0 * row; extern LOCAL_MEM %(work_x)s per_thread_values[]; LOCAL_MEM %(work_x)s row_max, sum, sum_inv; LOCAL_MEM int row_max_threadIdx; %(work_x)s per_thread_row_max, per_thread_sum; int per_thread_row_max_j; // COMPUTE ROW MAX AND ARGMAX // compute separate per-thread maximums and argmaxes per_thread_row_max = NAN; per_thread_row_max_j = 0; for (int j = threadIdx.x; j < N; j += blockDim.x) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j; per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max); } per_thread_values[threadIdx.x] = per_thread_row_max; local_barrier(); if (threadIdx.x == 0) { row_max = NAN; row_max_threadIdx = 0; for (int j = 0; j < blockDim.x; j++) { %(work_x)s per_thread_max = per_thread_values[j]; row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx; row_max = fmax%(f)s(per_thread_max, row_max); } } local_barrier(); // The thread with the higest max writes out which of its // values was the winner. if (threadIdx.x == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j; // COMPUTE SOFTMAX per_thread_sum = 0.0; for (int j = threadIdx.x; j < N; j += blockDim.x) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); %(work_x)s sm_ij = exp%(f)s(row_ij - row_max); per_thread_sum += sm_ij; sm[j * sms1] = %(write_x)s(sm_ij); } per_thread_values[threadIdx.x] = per_thread_sum; local_barrier(); if (threadIdx.x == 0) { sum = 0.0; for (int j = 0; j < blockDim.x; j++) { sum += per_thread_values[j]; } sum_inv = 1.0 / sum; } local_barrier(); for (int j = threadIdx.x; j < N; j += blockDim.x) { sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv); } if (threadIdx.x == 0) { const %(type_y_idx)s y_idx = (int)y_idx_data[row * y_idxs0]; if ((y_idx >= N || y_idx < 0)) { // raise some suspicion. 
nll_data[row * nlls0] = %(write_x)s(0.0); } else { nll_data[row * nlls0] = %(write_x)s( - %(load_x)s(x[y_idx * xs1]) - %(load_b)s(b[y_idx * bs0]) + row_max + log%(f)s(sum)); } } } } """ % locals(), file=sio) params = [ 'uintp', 'uintp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', gpuarray.GpuArray, 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'intp' ] return [Kernel(code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var)]
def inline_softmax_fixed_shared(N, buf, x, stride_x, load_x, sm, sm_stride, write_sm, threadPos, threadCount, b='', stride_b='', load_b='', dtype="float32"): """ Generate code to perform softmax with a fixed amount of shared memory. On entry, `buf` does not need to be initialized; it is used as scratch space for the row reductions. On exit, the normalized softmax has been written to `sm`. Parameters ---------- N Length of the row to process, at least warpSize (32). buf A shared memory buffer of size warpSize * sizeof(dtype). x A pointer to the gpu memory where the row is stored. stride_x The stride between each element in x. load_x Wrapper to read from x. sm A pointer to the gpu memory where the result is stored. sm_stride The stride between each sm element. write_sm Wrapper before writing to sm. threadPos Index of executing thread. threadCount Number of executing threads. b Optional, pointer to the bias. stride_b Optional, the stride of b if b is provided. load_b Optional, wrapper to read from b if b is provided. dtype Optional, the dtype of the softmax's output if not float32. Notes ----- `buf` should be in gpu shared memory, as it is accessed many times. We use tx as an int variable in a loop. """ ctype = gpuarray.dtype_to_ctype(dtype) ret = [ # get max of buf (trashing all but buf[0]) inline_reduce_fixed_shared_max(N, buf, x, stride_x, load_x, threadPos, threadCount, b, stride_b, load_b, dtype), '__syncthreads()', ('%s row_max = ' + buf + '[0]') % ctype, '__syncthreads()', inline_reduce_fixed_shared(N, buf, x, stride_x, load_x, threadPos, threadCount, lambda a, b: "%s + %s" % (a, b), lambda a: "exp(%s - row_max)" % a, b, stride_b, load_b, dtype), '__syncthreads()', ('%s row_sum = ' + buf + '[0]') % ctype, '__syncthreads()', "for (int tx = threadIdx.x; tx< N; tx += blockDim.x){", ] # This writes every normalized output value to sm if b: ret += [ "%(sm)s[tx * %(sm_stride)s] = " " %(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) +" " %(load_b)s(%(b)s[tx * %(stride_b)s]) - row_max)" " / row_sum)" % locals() ] else: ret += [ "%(sm)s[tx * %(sm_stride)s] = " "%(write_sm)s(exp(%(load_x)s(%(x)s[tx * %(stride_x)s]) - row_max)" " / row_sum)" % locals() ] ret += [ "}", '__syncthreads()', ] return ret
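# Hedged usage sketch: the helper returns a list of C statements without trailing
# semicolons, which the calling c_code/gpu_kernels method splices into a kernel body.
# The joining convention and every argument string below are illustrative assumptions;
# it also assumes the companion inline_reduce_fixed_shared_max helper is in scope.
softmax_lines = inline_softmax_fixed_shared(
    'N', 'buf', 'x_ptr', 'sx1', '', 'sm_ptr', 'sm_s1', '',
    'threadIdx.x', 'blockDim.x', b='b_ptr', stride_b='sb0', load_b='',
    dtype='float32')
kernel_body = ";\n".join(softmax_lines) + ";\n"  # assumed assembly convention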
def c_code(self, node, nodename, inp, out, sub): if node.inputs[0].type.context.kind != b'cuda': raise NotImplementedError("cuda only") typecode_dx = pygpu.gpuarray.dtype_to_typecode(node.outputs[0].dtype) itemsize_dnll = numpy.dtype(node.inputs[0].dtype).itemsize itemsize_sm = numpy.dtype(node.inputs[1].dtype).itemsize itemsize_y_idx = numpy.dtype(node.inputs[2].dtype).itemsize itemsize_dx = numpy.dtype(node.outputs[0].dtype).itemsize dtype_dnll = node.inputs[0].dtype dtype_sm = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_dx = node.outputs[0].dtype type_intp = gpuarray.dtype_to_ctype(numpy.intp) dnll, sm, y_idx = inp dx, = out fail = sub['fail'] ctx = sub['params'] k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename err_check = """ if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "gpuarray error: %(k_var)s: %%s.", GpuKernel_error(&%(k_var)s, err)); %(fail)s; } """ % locals() sync = "" if config.gpuarray.sync: sync = """ err = GpuArray_sync(&%(z)s->ga); %(err_check)s """ % locals() return """ // Get `dnll.shape[0]` or set it to zero if `dnll` is a scalar. const ssize_t %(dnll)s_dims0 = (PyGpuArray_NDIM(%(dnll)s) > 0 ? PyGpuArray_DIMS(%(dnll)s)[0] : (ssize_t) 0); // Get `dnll.strides[0]` and set it to zero if `dnll` is a scalar // or a vector with just one element. const ssize_t %(dnll)s_strides0 = (%(dnll)s_dims0 > 1 ? PyGpuArray_STRIDES(%(dnll)s)[0] : (ssize_t) 0); if ((PyGpuArray_NDIM(%(dnll)s) > 1) || (PyGpuArray_NDIM(%(sm)s) != 2) || (PyGpuArray_NDIM(%(y_idx)s) != 1)) { PyErr_SetString(PyExc_ValueError, "rank error"); %(fail)s; } if (%(dnll)s_dims0 != PyGpuArray_DIMS(%(sm)s)[0] && %(dnll)s_dims0 > 1) { PyErr_Format(PyExc_ValueError, "dnll.shape[0] == %%i, but sm.shape[0] == %%i", %(dnll)s_dims0, PyGpuArray_DIMS(%(sm)s)[0]); %(fail)s; } if (%(dnll)s_dims0 != PyGpuArray_DIMS(%(y_idx)s)[0] && %(dnll)s_dims0 > 1) { PyErr_SetString(PyExc_ValueError, "dnll.shape[0] != y_idx.shape[0]"); %(fail)s; } if (PyGpuArray_DIMS(%(sm)s)[0] != PyGpuArray_DIMS(%(y_idx)s)[0]) { PyErr_SetString(PyExc_ValueError, "sm.shape[0] != y_idx.shape[0]"); %(fail)s; } if ((NULL == %(dx)s) || (PyGpuArray_DIMS(%(dx)s)[0] != PyGpuArray_DIMS(%(sm)s)[0]) || (PyGpuArray_DIMS(%(dx)s)[1] != PyGpuArray_DIMS(%(sm)s)[1])) { Py_XDECREF(%(dx)s); %(dx)s = pygpu_empty(2, PyGpuArray_DIMS(%(sm)s), %(typecode_dx)s, GA_C_ORDER, %(ctx)s, Py_None); if (!%(dx)s) { %(fail)s } } { size_t n_blocks[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[0], (size_t)256), 1, 1}; size_t threads_per_block[3] = {std::min(PyGpuArray_DIMS(%(dx)s)[1], (size_t)256), 1, 1}; ssize_t stride_DNLL0 = %(dnll)s_strides0 / %(itemsize_dnll)s; ssize_t stride_SM0 = PyGpuArray_STRIDES(%(sm)s)[0] / %(itemsize_sm)s; ssize_t stride_SM1 = PyGpuArray_STRIDES(%(sm)s)[1] / %(itemsize_sm)s; ssize_t stride_YIDX0 = PyGpuArray_STRIDES(%(y_idx)s)[0] / %(itemsize_y_idx)s; ssize_t stride_DX0 = PyGpuArray_STRIDES(%(dx)s)[0] / %(itemsize_dx)s; ssize_t stride_DX1 = PyGpuArray_STRIDES(%(dx)s)[1] / %(itemsize_dx)s; void *kernel_params[] = { (void *)&PyGpuArray_DIMS(%(dx)s)[0], (void *)&PyGpuArray_DIMS(%(dx)s)[1], (void *)%(dnll)s->ga.data, (void *)&%(dnll)s->ga.offset, (void *)&stride_DNLL0, (void *)%(sm)s->ga.data, (void *)&%(sm)s->ga.offset, (void *)&stride_SM0, (void *)&stride_SM1, (void *)%(y_idx)s->ga.data, (void *)&%(y_idx)s->ga.offset, (void *)&stride_YIDX0, (void *)%(dx)s->ga.data, (void *)&%(dx)s->ga.offset, (void *)&stride_DX0, (void *)&stride_DX1}; int err = GpuKernel_call(&%(k_var)s, 3, threads_per_block, n_blocks, 0, kernel_params); %(err_check)s 
%(sync)s } assert(%(dx)s); """ % locals()
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_y = node.inputs[1].dtype dtype_ind = node.inputs[2].dtype dtype_out = node.outputs[0].dtype itemsize_x = numpy.dtype(dtype_x).itemsize itemsize_y = numpy.dtype(dtype_y).itemsize itemsize_ind = numpy.dtype(dtype_ind).itemsize itemsize_out = numpy.dtype(dtype_out).itemsize flags = Kernel.get_flags(dtype_x, dtype_y, dtype_ind) type_x = gpuarray.dtype_to_ctype(dtype_x) type_y = gpuarray.dtype_to_ctype(dtype_y) type_ind = gpuarray.dtype_to_ctype(dtype_ind) type_out = gpuarray.dtype_to_ctype(dtype_out) kname = "k_vector_add_fast" k_var = "k_vector_add_fast_" + nodename code = """ /* * This is an atomicAdd that works for doubles since that is not provided * natively by cuda. */ __device__ double atomicAdd(ga_double* address, ga_double val) { unsigned long long int* address_as_ull = (unsigned long long int*)address; unsigned long long int old = *address_as_ull, assumed; do { assumed = old; old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed))); } while (assumed != old); return __longlong_as_double(old); } /* * This is a version of atomicAdd that works for half-floats. It may * read and write 2 bytes more than the size of the array if the array * has an uneven number of elements. The actual value at that spot * will not be modified. */ __device__ ga_half atomicAdd(ga_half *addr, ga_half val) { ga_uint *base = (ga_uint *)((ga_size)addr & ~2); ga_uint old, assumed, sum, new_; old = *base; do { assumed = old; sum = __float2half_rn( __half2float(val) + __half2float((ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410))); new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254); old = atomicCAS(base, assumed, new_); } while (assumed != old); return (ga_half)__byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410); } KERNEL void k_vector_add_fast(const ga_size numRowsX, const ga_size numColsX, const ga_ssize stridesX0, const ga_ssize stridesX1, %(type_x)s *X, const ga_size offset_X, const ga_size numRowsY, const ga_size numColsY, const ga_ssize stridesY0, const ga_ssize stridesY1, %(type_y)s *Y, const ga_size offset_Y, const ga_size numIndices, const ga_ssize stridesIndices, %(type_ind)s *indices_arr, const ga_size offset_indices_arr, ga_int *err) { X = (%(type_x)s *)(((char *)X)+offset_X); Y = (%(type_y)s *)(((char *)Y)+offset_Y); indices_arr = (%(type_ind)s *)(((char *)indices_arr)+offset_indices_arr); for (int i = (blockIdx.x); i < numIndices; i += gridDim.x) { for(int j = (threadIdx.x); j < numColsX;j += blockDim.x) { ga_ssize x_row = indices_arr[i * stridesIndices]; if (x_row < 0) x_row += numRowsX; ga_ssize y_row = i; if (x_row < numRowsX && x_row >= 0) { atomicAdd(&X[(x_row * stridesX0) + (j * stridesX1)], Y[(y_row * stridesY0) + (j * stridesY1)]); } else { *err = 1; } } } return; } """ % locals() params = [ 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'uintp', 'intp', 'intp', gpuarray.GpuArray, 'uintp', 'uintp', 'intp', gpuarray.GpuArray, 'uintp', gpuarray.GpuArray] return [Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)]