def f(x, y):
    # Stage one element per thread in a shared scratch buffer, then write
    # back the element held in the mirrored slot of the same block, so each
    # block's slice of x lands in y in reversed order.
    # NOTE(review): the scratch buffer has 32 int32 slots, so this assumes
    # blockDim.x <= 32 and that every global index is in range for x and y
    # -- confirm at the launch site.
    local_id = jit.threadIdx.x
    block_size = jit.blockDim.x
    block_id = jit.blockIdx.x
    global_id = local_id + block_id * block_size

    scratch = jit.shared_memory(numpy.int32, 32)
    scratch[local_id] = x[global_id]

    # Every slot must be filled before any thread reads another thread's slot.
    jit.syncthreads()

    mirrored = block_size - local_id - 1
    y[global_id] = scratch[mirrored]
def reduction(x, y, size):
    # Sum x[0:size] into y[0] using the threads of one block: each thread
    # accumulates a strided partial sum, then thread 0 adds up the partials.
    # NOTE(review): blockIdx is never read and the scratch buffer has 1024
    # float32 slots, so this presumably expects a single-block launch with
    # blockDim.x <= 1024 -- confirm at the call site.
    local_id = jit.threadIdx.x
    block_size = jit.blockDim.x

    # Per-thread partial sum over indices local_id, local_id + block_size, ...
    acc = cupy.float32(0)
    for idx in range(local_id, size, block_size):
        acc += x[idx]

    partials = jit.shared_memory(cupy.float32, 1024)
    partials[local_id] = acc

    # All partial sums must be visible before thread 0 reads them.
    jit.syncthreads()

    if local_id == cupy.uint32(0):
        # Serial final pass over the block's partial sums.
        acc = cupy.float32(0)
        for idx in range(block_size):
            acc += partials[idx]
        y[0] = acc
def reduction(x, y, size):
    # Multi-block sum of x[0:size]: every thread accumulates a partial sum
    # striding by the total number of launched threads, each block combines
    # its partials through shared memory, and thread 0 of each block
    # atomically adds the block total to y[0].
    # NOTE(review): y[0] is only ever added to, so the caller presumably
    # initialises it; the scratch buffer has 1024 float32 slots, so
    # blockDim.x <= 1024 is assumed -- confirm at the launch site.
    local_id = jit.threadIdx.x
    block_size = jit.blockDim.x

    global_id = jit.blockIdx.x * block_size + local_id
    total_threads = block_size * jit.gridDim.x

    acc = cupy.float32(0)
    for idx in range(global_id, size, total_threads):
        acc += x[idx]

    partials = jit.shared_memory(cupy.float32, 1024)
    partials[local_id] = acc

    # All of this block's partial sums must be written before they are read.
    jit.syncthreads()

    if local_id == cupy.uint32(0):
        # Serial pass over this block's partial sums.
        acc = cupy.float32(0)
        for idx in range(block_size):
            acc += partials[idx]
        # Several blocks may reach this point; combine their totals atomically.
        jit.atomic_add(y, 0, acc)
def f(x, y, buf):
    # Round-trip x through the caller-supplied buffer `buf`, mirroring the
    # index on both the write and the read. The two mirrorings cancel, so
    # y[t] receives x[t] for each flattened thread index t.
    # Thread indices from the x and y block dimensions are flattened into a
    # single linear index.
    flat_id = jit.threadIdx.x + jit.threadIdx.y * jit.blockDim.x
    thread_count = jit.blockDim.x * jit.blockDim.y
    mirrored = thread_count - flat_id - 1

    buf[flat_id] = x[mirrored]

    # The slot read below is written by a different thread; wait for it.
    jit.syncthreads()

    y[flat_id] = buf[mirrored]
def f(x, y):
    # Copy one element per thread from x to y, synchronize the threads, then
    # call the object returned by g(1) with y as its argument.
    tid = jit.threadIdx.x
    y[tid] = x[tid]
    jit.syncthreads()
    # NOTE(review): `g` is not defined in this view, so what g(1) returns and
    # whether calling it is expected to succeed cannot be determined here --
    # confirm against the enclosing scope before changing this line.
    g(1)(y)