Exemplo n.º 1
0
 def test_copy(x, y):
     # Kernel body: stage two 4*32-byte chunks of x in shared memory with
     # asynchronous group copies, then write the staged values to y.
     #
     # x: source array, read at offsets 0 and 32.
     # y: destination array; ranks 0..63 of the block each write y[rank].
     # NOTE(review): presumably x and y are int32 arrays with at least 64
     # elements, matching the int32 shared buffer below - confirm at the
     # call site.
     # `test_aligned` is not defined here; it is read from the enclosing
     # scope (not visible in this snippet) and selects the code path that
     # passes explicit alignment hints.
     #
     # do two batches of copies to test relevant APIs
     if test_aligned:
         # 64-element int32 shared buffer with an explicit 16-byte alignment
         smem = jit.shared_memory(cupy.int32, 32*2, alignment=16)
     else:
         # same 64-element buffer, default alignment
         smem = jit.shared_memory(cupy.int32, 32*2)
     g = jit.cg.this_thread_block()
     # rank of this thread within the block group; used as the element index
     tid = g.thread_rank()
     # int32 is 4 bytes, so each 4*32-byte copy moves 32 elements; the
     # offsets 0 and 32 split the 64-element buffer into two halves
     if test_aligned:
         # CuPy ensures x is 256B-aligned
         jit.cg.memcpy_async(
             g, smem, 0, x, 0, 4*32, aligned_size=16)
         jit.cg.memcpy_async(
             g, smem, 32, x, 32, 4*32, aligned_size=16)
     else:
         jit.cg.memcpy_async(
             g, smem, 0, x, 0, 4*32)
         jit.cg.memcpy_async(
             g, smem, 32, x, 32, 4*32)
     # Per CUDA cooperative-groups semantics, wait until at most one of the
     # copies issued above is still outstanding: the first half is ready to
     # read while the second may still be in flight.
     jit.cg.wait_prior(g, 1)
     if tid < 32:
         y[tid] = smem[tid]
     # Wait for every remaining async copy on g before reading the second half.
     jit.cg.wait(g)
     if 32 <= tid and tid < 64:  # can't do "32 <= tid < 64" yet...
         y[tid] = smem[tid]
Exemplo n.º 2
0
        def f(x, y):
            # Per-block reversal through shared memory: each thread stores
            # its element of x in a block-local buffer, and after a barrier
            # writes to y the element stored by its mirror thread in the
            # same block.
            #
            # The buffer has 32 int32 slots and is indexed by threadIdx.x,
            # so the launch must not use more than 32 threads per block.
            block_size = jit.blockDim.x
            block_id = jit.blockIdx.x
            lane = jit.threadIdx.x
            # Flat index of this thread's element in x / y.
            offset = lane + block_id * block_size

            staged = jit.shared_memory(numpy.int32, 32)
            staged[lane] = x[offset]
            # Every lane must have stored its value before any lane reads
            # the slot written by another lane.
            jit.syncthreads()
            y[offset] = staged[block_size - lane - 1]
Exemplo n.º 3
0
def reduction(x, y, size):
    # Sum the first `size` elements of x into y[0] in float32.
    #
    # Only threadIdx.x and blockDim.x are consulted, so every block would
    # compute the same total; NOTE(review): presumably launched with a
    # single block - confirm at the launch site.
    # The shared buffer has 1024 slots indexed by threadIdx.x, so the block
    # must not have more than 1024 threads.
    lane = jit.threadIdx.x
    stride = jit.blockDim.x

    # Phase 1: private partial sum over lane, lane + stride, ... (< size).
    acc = cupy.float32(0)
    for idx in range(lane, size, stride):
        acc += x[idx]

    # Phase 2: publish each thread's partial sum to shared memory.
    partials = jit.shared_memory(cupy.float32, 1024)
    partials[lane] = acc

    # All partial sums must be visible before thread 0 reads them.
    jit.syncthreads()

    # Phase 3: thread 0 adds up the per-thread partials serially.
    if lane == cupy.uint32(0):
        acc = cupy.float32(0)
        for idx in range(stride):
            acc += partials[idx]
        y[0] = acc
Exemplo n.º 4
0
def reduction(x, y, size):
    # Multi-block sum of the first `size` elements of x, accumulated into
    # y[0] in float32.
    #
    # Each block reduces its share and adds it to y[0] atomically;
    # NOTE(review): y[0] is only ever added to, so the caller presumably
    # zero-initialises it before launch - confirm at the launch site.
    # The shared buffer has 1024 slots indexed by threadIdx.x, so a block
    # must not have more than 1024 threads.
    lane = jit.threadIdx.x
    gid = jit.blockIdx.x * jit.blockDim.x + lane
    total_threads = jit.blockDim.x * jit.gridDim.x

    # Phase 1: grid-stride loop - each thread sums gid, gid + total_threads,
    # ... (< size) into a private accumulator.
    acc = cupy.float32(0)
    for idx in range(gid, size, total_threads):
        acc += x[idx]

    # Phase 2: publish the per-thread partial sums within the block.
    partials = jit.shared_memory(cupy.float32, 1024)
    partials[lane] = acc

    # All partial sums must be visible before thread 0 reads them.
    jit.syncthreads()

    # Phase 3: thread 0 of each block folds that block's partials and
    # contributes the block total to y[0].
    if lane == cupy.uint32(0):
        acc = cupy.float32(0)
        for idx in range(jit.blockDim.x):
            acc += partials[idx]
        jit.atomic_add(y, 0, acc)