def gpu_kernels(self, node, name): write = write_w(self.output_type.dtype) if self.output_type.dtype == "float16": otype = "ga_half" # limit the values of the state that we use. mask = "& 0x7fff" offset = "+ 1" NORM = "3.0458e-05f" # numpy.float16(1.0/(2**15+33)) # this was determined by finding the biggest number such that # numpy.float16(number * ((M1 & 0x7fff) + 1)) < 1.0 elif self.output_type.dtype == "float32": otype = "float" mask = "" offset = "" NORM = "4.6566126e-10f" # numpy.float32(1.0/(2**31+65)) # this was determined by finding the biggest number such that # numpy.float32(number * M1) < 1.0 elif self.output_type.dtype == "float64": otype = "double" mask = "" offset = "" NORM = "4.656612873077392578125e-10" else: raise ValueError("Unsupported data type for output", self.output_type.dtype) code = ("""#include "cluda.h" KERNEL void mrg_uniform( GLOBAL_MEM %(otype)s *sample_data, ga_size sample_offset, GLOBAL_MEM ga_int *state_data, ga_size state_offset, const ga_uint Nsamples, const ga_uint Nstreams_used) { sample_data = (GLOBAL_MEM %(otype)s *)(((GLOBAL_MEM char *)sample_data) + sample_offset); state_data = (GLOBAL_MEM ga_int *)(((GLOBAL_MEM char *)state_data) + state_offset); /* * The cluda backend makes sure that ga_int corresponds to * a 32 bit signed type on the target device. It is not a * variable width type. */ const ga_int i7 = 7; const ga_int i9 = 9; const ga_int i15 = 15; const ga_int i16 = 16; const ga_int i22 = 22; const ga_int i24 = 24; const ga_int M1 = 2147483647; //2^31 - 1 const ga_int M2 = 2147462579; //2^31 - 21069 const ga_int MASK12 = 511; //2^9 - 1 const ga_int MASK13 = 16777215; //2^24 - 1 const ga_int MASK2 = 65535; //2^16 - 1 const ga_int MULT2 = 21069; const ga_uint idx = GID_0 * LDIM_0 + LID_0; ga_int y1, y2, x11, x12, x13, x21, x22, x23; if (idx < Nstreams_used) { x11 = state_data[idx*6+0]; x12 = state_data[idx*6+1]; x13 = state_data[idx*6+2]; x21 = state_data[idx*6+3]; x22 = state_data[idx*6+4]; x23 = state_data[idx*6+5]; for (ga_uint i = idx; i < Nsamples; i += Nstreams_used) { y1 = ((x12 & MASK12) << i22) + (x12 >> i9) + ((x13 & MASK13) << i7) + (x13 >> i24); y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0; y1 += x13; y1 -= (y1 < 0 || y1 >= M1) ? M1 : 0; x13 = x12; x12 = x11; x11 = y1; y1 = ((x21 & MASK2) << i15) + (MULT2 * (x21 >> i16)); y1 -= (y1 < 0 || y1 >= M2) ? M2 : 0; y2 = ((x23 & MASK2) << i15) + (MULT2 * (x23 >> i16)); y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0; y2 += x23; y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0; y2 += y1; y2 -= (y2 < 0 || y2 >= M2) ? M2 : 0; x23 = x22; x22 = x21; x21 = y2; if (x11 <= x21) { sample_data[i] = %(write)s((((x11 - x21 + M1) %(mask)s) %(offset)s) * %(NORM)s); } else { sample_data[i] = %(write)s((((x11 - x21) %(mask)s) %(offset)s) * %(NORM)s); } } state_data[idx*6+0]= x11; state_data[idx*6+1]= x12; state_data[idx*6+2]= x13; state_data[idx*6+3]= x21; state_data[idx*6+4]= x22; state_data[idx*6+5]= x23; } } """ % locals()) # we shouldn't get to this line if it's about to fail from pygpu import gpuarray return [ Kernel( code=code, name="mrg_uniform", params=[ gpuarray.GpuArray, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, "uint32", "uint32", ], flags=Kernel.get_flags(self.output_type.dtype, "int32"), ) ]
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_sm = node.outputs[0].dtype load_x = load_w(node.inputs[0].dtype) load_b = load_w(node.inputs[1].dtype) write_sm = write_w(node.outputs[0].dtype) work_sm = work_dtype(node.outputs[0].dtype) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_sm) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_acc = gpuarray.dtype_to_ctype(work_sm) ctype = gpuarray.dtype_to_ctype(work_sm) params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, ] kernels = [] kname = "kSoftmaxWithBias" k_var = "kSoftmaxWithBias_" + nodename code = ("""#include "cluda.h" KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) { GA_DECL_SHARED_BODY(%(type_acc)s, buf); LOCAL_MEM_ARG %(type_acc)s * buf2 = buf + N; x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){ for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ buf[tx] = %(load_x)s(x[blockIDX * sx0 + tx * sx1]); buf[tx] += %(load_b)s(b[tx * sb0]); buf2[tx] = buf[tx]; } local_barrier(); { // This function trashes buf[1..GA_WARP_SIZE], // leaving the reduction result in buf[0]. if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE) { buf[LID_0] = max(buf[LID_0], buf[i]); } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]); local_barrier(); } } %(ctype)s row_max = buf[0]; local_barrier(); for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){; buf[__i] = exp(buf2[__i] - row_max); buf2[__i] = buf[__i]; } local_barrier(); { // This function trashes buf[1..GA_WARP_SIZE], // leaving the reduction result in buf[0]. if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < N; i += GA_WARP_SIZE) { buf[LID_0] = buf[LID_0] + buf[i]; } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = buf[LID_0] + buf[LID_0+_n]; local_barrier(); } } %(ctype)s row_sum = buf[0]; local_barrier(); for(ga_int __i=LID_0; __i<N; __i+=LDIM_0){ buf[__i] = buf2[__i] / row_sum; } local_barrier(); for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ sm[blockIDX * sm_s0 + tx * sm_s1] = %(write_sm)s(buf[tx]); } local_barrier(); } } """ % locals()) kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) kname = "kSoftmaxWithBias_fixed_shared" k_var = "kSoftmaxWithBias_fixed_shared" + nodename code = ("""#include "cluda.h" KERNEL void %(kname)s (const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s * x, const ga_size offset_x, const ga_ssize sx0, const ga_ssize sx1, GLOBAL_MEM const %(type_b)s * b, const ga_size offset_b, const ga_ssize sb0, GLOBAL_MEM %(type_sm)s * sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1 GA_DECL_SHARED_PARAM(%(type_acc)s, buf)) { GA_DECL_SHARED_BODY(%(type_acc)s, buf); x = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); sm = (GLOBAL_MEM %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); for (ga_int blockIDX = GID_0; blockIDX < M; blockIDX += GDIM_0){ GLOBAL_MEM const %(type_x)s *x_ptr = &x[blockIDX * sx0]; GLOBAL_MEM %(type_sm)s *sm_ptr = &sm[blockIDX * sm_s0]; { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = %(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]); #pragma unroll 16 for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) { red = max(red, %(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0])); } buf[LID_0] = red; local_barrier(); if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) { buf[LID_0] = max(buf[LID_0], buf[i]); } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = max(buf[LID_0], buf[LID_0+_n]); local_barrier(); } } %(ctype)s row_max = buf[0]; local_barrier(); { // This function trashes buf[1..n_threads], // leaving the reduction result in buf[0]. %(ctype)s red = exp(%(load_x)s(x_ptr[LID_0 * sx1]) + %(load_b)s(b[LID_0 * sb0]) - row_max); #pragma unroll 16 for (ga_int i = LID_0 + LDIM_0; i<N; i += LDIM_0) { red = red + exp(%(load_x)s(x_ptr[i * sx1]) + %(load_b)s(b[i * sb0]) - row_max); } buf[LID_0] = red; local_barrier(); if (LID_0 < GA_WARP_SIZE) { for (ga_int i = LID_0 + GA_WARP_SIZE; i < LDIM_0; i += GA_WARP_SIZE) { buf[LID_0] = buf[LID_0] + buf[i]; } } local_barrier(); //reduce so that LID_0 0 has the reduction of everything for (ga_uint _n = GA_WARP_SIZE / 2; _n > 0; _n /= 2) { if (LID_0 < _n && LID_0 + _n < N) buf[LID_0] = buf[LID_0] + buf[LID_0+_n]; local_barrier(); } } %(ctype)s row_sum = buf[0]; local_barrier(); for (ga_int tx = LID_0; tx< N; tx += LDIM_0){ sm_ptr[tx * sm_s1] = %(write_sm)s(exp(%(load_x)s(x_ptr[tx * sx1]) + %(load_b)s(b[tx * sb0]) - row_max) / row_sum); } local_barrier(); } } """ % locals()) kernels.append( Kernel(code=code, name=kname, params=params, flags=flags, objvar=k_var)) return kernels
def gpu_kernels(self, node, nodename): dtype_dnll = node.inputs[0].dtype dtype_sm = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype dtype_dx = node.outputs[0].dtype work_dnll = work_dtype(dtype_dnll) load_dnll = load_w(dtype_dnll) load_sm = load_w(dtype_sm) write_dx = write_w(dtype_dx) flags = Kernel.get_flags(dtype_dnll, dtype_sm, dtype_y_idx, dtype_dx) wtype_dnll = gpuarray.dtype_to_ctype(work_dnll) type_dnll = gpuarray.dtype_to_ctype(dtype_dnll) type_sm = gpuarray.dtype_to_ctype(dtype_sm) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) type_dx = gpuarray.dtype_to_ctype(dtype_dx) kname = "kCrossEntropySoftmax1HotWithBiasDx" k_var = "kCrossEntropySoftmax1HotWithBiasDx_" + nodename params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, ] sio = StringIO() print( """#include "cluda.h" KERNEL void %(kname)s( const ga_size N, const ga_size K, GLOBAL_MEM const %(type_dnll)s* dnll, const ga_size offset_dnll, const ga_ssize dnll_s0, GLOBAL_MEM const %(type_sm)s* sm, const ga_size offset_sm, const ga_ssize sm_s0, const ga_ssize sm_s1, GLOBAL_MEM const %(type_y_idx)s* y_idx, const ga_size offset_y_idx, const ga_ssize y_idx_s0, GLOBAL_MEM %(type_dx)s* dx, const ga_size offset_dx, const ga_ssize dx_s0, const ga_ssize dx_s1) { dnll = (GLOBAL_MEM const %(type_dnll)s *)(((GLOBAL_MEM char *)dnll)+offset_dnll); sm = (GLOBAL_MEM const %(type_sm)s *)(((GLOBAL_MEM char *)sm)+offset_sm); y_idx = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx)+offset_y_idx); dx = (GLOBAL_MEM %(type_dx)s *)(((GLOBAL_MEM char *)dx)+offset_dx); for (ga_int i = GID_0; i < N; i += GDIM_0) { %(wtype_dnll)s dnll_i = %(load_dnll)s(dnll[i * dnll_s0]); %(type_y_idx)s y_i = y_idx[i * y_idx_s0]; for (ga_int j = LID_0; j < K; j += LDIM_0) { if (y_i == j) { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * (%(load_sm)s(sm[i * sm_s0 + j * sm_s1]) - 1.0)); } else { dx[i * dx_s0 + j * dx_s1] = %(write_dx)s(dnll_i * %(load_sm)s(sm[i * sm_s0 + j * sm_s1])); } } } } """ % locals(), file=sio, ) return [ Kernel( code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var, ) ]
def gpu_kernels(self, node, nodename): dtype_x = node.inputs[0].dtype dtype_b = node.inputs[1].dtype dtype_y_idx = node.inputs[2].dtype work_x = work_dtype(dtype_x) work_b = work_dtype(dtype_b) load_x = load_w(dtype_x) load_b = load_w(dtype_b) write_x = write_w(dtype_x) write_b = write_w(dtype_b) flags = Kernel.get_flags(dtype_x, dtype_b, dtype_y_idx) type_x = gpuarray.dtype_to_ctype(dtype_x) type_b = gpuarray.dtype_to_ctype(dtype_b) work_x = gpuarray.dtype_to_ctype(work_x) type_y_idx = gpuarray.dtype_to_ctype(dtype_y_idx) kname = "k_xent_sm_1hot_bias" k_var = "k_xent_sm_1hot_bias_" + nodename if node.inputs[0].type.context.kind != b"cuda": f = "" else: f = "" if dtype_x == "float64" else "f" params = [ gpuarray.SIZE, gpuarray.SIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, gpuarray.SSIZE, gpuarray.GpuArray, gpuarray.SIZE, gpuarray.SSIZE, ] sio = StringIO() print( """#include "cluda.h" KERNEL void %(kname)s(const ga_size M, const ga_size N, GLOBAL_MEM const %(type_x)s* x_data, const ga_size offset_x, const ga_ssize xs0, const ga_ssize xs1, GLOBAL_MEM const %(type_b)s* b, const ga_size offset_b, const ga_ssize bs0, GLOBAL_MEM const %(type_y_idx)s* y_idx_data, const ga_size offset_y_idx, const ga_ssize y_idxs0, GLOBAL_MEM %(type_x)s* nll_data, const ga_size offset_nll, const ga_ssize nlls0, GLOBAL_MEM %(type_x)s* sm_data, const ga_size offset_sm, const ga_ssize sms0, const ga_ssize sms1, GLOBAL_MEM %(type_y_idx)s* am_data, const ga_size offset_am, const ga_ssize ams0 GA_DECL_SHARED_PARAM(%(work_x)s, per_thread_values)) { x_data = (GLOBAL_MEM const %(type_x)s *)(((GLOBAL_MEM char *)x_data)+offset_x); b = (GLOBAL_MEM const %(type_b)s *)(((GLOBAL_MEM char *)b)+offset_b); y_idx_data = (GLOBAL_MEM const %(type_y_idx)s *)(((GLOBAL_MEM char *)y_idx_data)+offset_y_idx); nll_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)nll_data)+offset_nll); sm_data = (GLOBAL_MEM %(type_x)s *)(((GLOBAL_MEM char *)sm_data)+offset_sm); am_data = (GLOBAL_MEM %(type_y_idx)s *)(((GLOBAL_MEM char *)am_data)+offset_am); for (ga_int row = GID_0; row < M; row += GDIM_0){ GLOBAL_MEM const %(type_x)s* x = x_data + xs0 * row; GLOBAL_MEM %(type_x)s* sm = sm_data + sms0 * row; GA_DECL_SHARED_BODY(%(work_x)s, per_thread_values); LOCAL_MEM %(work_x)s row_max, sum, sum_inv; LOCAL_MEM ga_int row_max_threadIdx; %(work_x)s per_thread_row_max, per_thread_sum; ga_int per_thread_row_max_j; // COMPUTE ROW MAX AND ARGMAX // compute separate per-thread maximums and argmaxes per_thread_row_max = NAN; per_thread_row_max_j = 0; for (ga_int j = LID_0; j < N; j += LDIM_0) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); per_thread_row_max_j = (row_ij > per_thread_row_max) ? j : per_thread_row_max_j; per_thread_row_max = fmax%(f)s(row_ij, per_thread_row_max); } per_thread_values[LID_0] = per_thread_row_max; local_barrier(); if (LID_0 == 0) { row_max = NAN; row_max_threadIdx = 0; for (ga_int j = 0; j < LDIM_0; j++) { %(work_x)s per_thread_max = per_thread_values[j]; row_max_threadIdx = (per_thread_max > row_max) ? j : row_max_threadIdx; row_max = fmax%(f)s(per_thread_max, row_max); } } local_barrier(); // The thread with the highest max writes out which of its // values was the winner. if (LID_0 == row_max_threadIdx) am_data[row * ams0] = per_thread_row_max_j; // COMPUTE SOFTMAX per_thread_sum = 0.0; for (ga_int j = LID_0; j < N; j += LDIM_0) { %(work_x)s row_ij = %(load_x)s(x[j * xs1]) + %(load_b)s(b[j * bs0]); %(work_x)s sm_ij = exp%(f)s(row_ij - row_max); per_thread_sum += sm_ij; sm[j * sms1] = %(write_x)s(sm_ij); } per_thread_values[LID_0] = per_thread_sum; local_barrier(); if (LID_0 == 0) { sum = 0.0; for (ga_int j = 0; j < LDIM_0; j++) { sum += per_thread_values[j]; } sum_inv = 1.0 / sum; } local_barrier(); for (ga_int j = LID_0; j < N; j += LDIM_0) { sm[j * sms1] = %(write_x)s(%(load_x)s(sm[j * sms1]) * sum_inv); } if (LID_0 == 0) { const %(type_y_idx)s y_idx = (ga_int)y_idx_data[row * y_idxs0]; if ((y_idx >= N || y_idx < 0)) { // raise some suspicion. nll_data[row * nlls0] = %(write_x)s(0.0); } else { nll_data[row * nlls0] = %(write_x)s( - %(load_x)s(x[y_idx * xs1]) - %(load_b)s(b[y_idx * bs0]) + row_max + log%(f)s(sum)); } } } } """ % locals(), file=sio, ) return [ Kernel( code=sio.getvalue(), name=kname, params=params, flags=flags, objvar=k_var, ) ]
def gpu_kernels(self, node, name): out_ctype = pygpu.gpuarray.dtype_to_ctype(node.outputs[0].dtype) pvals_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[0].dtype) unis_ctype = pygpu.gpuarray.dtype_to_ctype(node.inputs[1].dtype) work_ctype = pygpu.gpuarray.dtype_to_ctype(work_dtype(node.inputs[0].dtype)) write_out_ctype = write_w(node.outputs[0].dtype) load_in_ctype = load_w(node.inputs[0].dtype) code = """#include "cluda.h" KERNEL void k_multi_warp_multinomial( const ga_size nb_multi, const ga_size nb_outcomes, GLOBAL_MEM %(pvals_ctype)s *global_pvals, const ga_size global_pvals_offset, const ga_ssize pvals_row_stride, const ga_ssize pvals_col_stride, GLOBAL_MEM %(unis_ctype)s *global_unis, const ga_size global_unis_offset, const ga_ssize unis_stride, GLOBAL_MEM %(out_ctype)s *global_outs, const ga_size global_outs_offset, const ga_ssize outs_row_stride, const ga_ssize outs_col_stride ) { global_pvals = (GLOBAL_MEM %(pvals_ctype)s *)(((GLOBAL_MEM char *)global_pvals) + global_pvals_offset); global_unis = (GLOBAL_MEM %(unis_ctype)s *)(((GLOBAL_MEM char *)global_unis) + global_unis_offset); global_outs = (GLOBAL_MEM %(out_ctype)s *)(((GLOBAL_MEM char *)global_outs) + global_outs_offset); // each thread takes care of one multinomial draw int n = LDIM_0*GID_0 + LID_0; if (n < nb_multi) { %(work_ctype)s cummul = 0.; bool done = false; const %(work_ctype)s unis_n = %(load_in_ctype)s(global_unis[n*unis_stride]); for (ga_size m = 0; m < nb_outcomes; ++m) { %(work_ctype)s current_out = 0; if (!done) { cummul += %(load_in_ctype)s(global_pvals[m * pvals_col_stride + n * pvals_row_stride]); if (unis_n < cummul) { current_out = 1; done = true; } } //write out transposed for speed. global_outs[n * outs_col_stride + m * outs_row_stride] = %(write_out_ctype)s(current_out); } } } """ % dict( out_ctype=out_ctype, write_out_ctype=write_out_ctype, work_ctype=work_ctype, pvals_ctype=pvals_ctype, unis_ctype=unis_ctype, load_in_ctype=load_in_ctype, ) return [ Kernel( code=code, name="k_multi_warp_multinomial", params=[ pygpu.gpuarray.SIZE, pygpu.gpuarray.SIZE, pygpu.gpuarray.GpuArray, pygpu.gpuarray.SIZE, pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE, pygpu.gpuarray.GpuArray, pygpu.gpuarray.SIZE, pygpu.gpuarray.SSIZE, pygpu.gpuarray.GpuArray, pygpu.gpuarray.SIZE, pygpu.gpuarray.SSIZE, pygpu.gpuarray.SSIZE, ], flags=Kernel.get_flags(node.outputs[0].dtype), objvar="k_multi_warp_multinomial_" + name, ) ]