def _make_chunk_wise_kernel(func, argnames, extras): """ Make a kernel that does a stride loop over the input chunks. Each block is responsible for a chunk in each iteration. Several iteration may be needed to handling a large number of chunks. The user function *func* will have all threads in the block for its computation. The resulting kernel can be used with any 1D grid size and 1D block size. """ # Build kernel source argnames = list(map(_mangle_user, argnames)) extras = list(map(_mangle_user, extras)) source = """ def chunk_wise_kernel(nrows, chunks, {args}): {body} """ args = ', '.join(argnames) body = [] body.append('blkid = cuda.blockIdx.x') body.append('nblkid = cuda.gridDim.x') body.append('tid = cuda.threadIdx.x') body.append('ntid = cuda.blockDim.x') # Stride loop over the block body.append('for curblk in range(blkid, chunks.size, nblkid):') indent = ' ' * 4 body.append(indent + 'start = chunks[curblk]') body.append( indent + 'stop = chunks[curblk + 1] if curblk + 1 < chunks.size else nrows') slicedargs = {} for a in argnames: if a not in extras: slicedargs[a] = "{}[start:stop]".format(a) else: slicedargs[a] = str(a) body.append("{}inner({})".format( indent, ', '.join(slicedargs[k] for k in argnames))) indented = ['{}{}'.format(' ' * 4, ln) for ln in body] # Finalize source concrete = source.format(args=args, body='\n'.join(indented)) # Get bytecode glbs = {'inner': cuda.jit(device=True)(func), 'cuda': cuda} exec_(concrete, glbs) # Compile as CUDA kernel kernel = cuda.jit(glbs['chunk_wise_kernel']) return kernel
def _make_row_wise_kernel(func, argnames, extras): """ Make a kernel that does a stride loop over the input rows. Each thread is responsible for a row in each iteration. Several iteration may be needed to handling a large number of rows. The resulting kernel can be used with any 1D grid size and 1D block size. """ # Build kernel source argnames = list(map(_mangle_user, argnames)) extras = list(map(_mangle_user, extras)) source = """ def row_wise_kernel({args}): {body} """ args = ", ".join(argnames) body = [] body.append("tid = cuda.grid(1)") body.append("ntid = cuda.gridsize(1)") for a in argnames: if a not in extras: start = "tid" stop = "" stride = "ntid" srcidx = "{a} = {a}[{start}:{stop}:{stride}]" body.append( srcidx.format(a=a, start=start, stop=stop, stride=stride) ) body.append("inner({})".format(args)) indented = ["{}{}".format(" " * 4, ln) for ln in body] # Finalize source concrete = source.format(args=args, body="\n".join(indented)) # Get bytecode glbs = {"inner": cuda.jit(device=True)(func), "cuda": cuda} exec_(concrete, glbs) # Compile as CUDA kernel kernel = cuda.jit(glbs["row_wise_kernel"]) return kernel