예제 #1
0
def reduction(x, y, size):
    tid = jit.blockIdx.x * jit.blockDim.x + jit.threadIdx.x
    ntid = jit.blockDim.x * jit.gridDim.x

    value = cupy.float32(0)
    for i in range(tid, size, ntid):
        value += x[i]

    smem = jit.shared_memory(cupy.float32, 1024)
    smem[jit.threadIdx.x] = value

    jit.syncthreads()

    if jit.threadIdx.x == cupy.uint32(0):
        value = cupy.float32(0)
        for i in range(jit.blockDim.x):
            value += smem[i]
        jit.atomic_add(y, 0, value)
예제 #2
0
 def f(x, index, out):
     tid = jit.blockDim.x * jit.blockIdx.x + jit.threadIdx.x
     jit.atomic_add(out, index[tid], x[tid])