Example #1
from numba import cuda, int32


def coop_syncwarp(res):
    # shared scratch space: one slot per lane of the warp
    sm = cuda.shared.array(32, int32)
    i = cuda.grid(1)

    # each of the 32 threads stores its own index
    sm[i] = i
    cuda.syncwarp()

    # tree reduction: at every step the surviving lanes fold in the upper
    # half, and the mask names exactly the lanes that must reach the barrier
    if i < 16:
        sm[i] = sm[i] + sm[i + 16]
        cuda.syncwarp(0xFFFF)

    if i < 8:
        sm[i] = sm[i] + sm[i + 8]
        cuda.syncwarp(0xFF)

    if i < 4:
        sm[i] = sm[i] + sm[i + 4]
        cuda.syncwarp(0xF)

    if i < 2:
        sm[i] = sm[i] + sm[i + 2]
        cuda.syncwarp(0x3)

    # lane 0 writes the final sum (0 + 1 + ... + 31 == 496)
    if i == 0:
        res[0] = sm[0] + sm[1]
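
A minimal host-side launch sketch (not part of the original example; the signature string and sizes are assumptions). The kernel indexes shared memory with the global thread id, so it only makes sense for a single block of exactly 32 threads, i.e. one warp:

import numpy as np

res = np.zeros(1, dtype=np.int32)
kernel = cuda.jit("void(int32[::1])")(coop_syncwarp)
kernel[1, 32](res)   # one block of 32 threads == one warp
print(res[0])        # expected 496 == sum(range(32))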
Example #2
    def device_reduce_full_block(arr, partials, sm_partials):
        """
        Partially reduce `arr` into `partials` using `sm_partials` as working
        space.  The algorithm works as follows:

            array chunks of 128:  |   0 | 128 | 256 | 384 | 512 |
                        block-0:  |   x |     |     |   x |     |
                        block-1:  |     |   x |     |     |   x |
                        block-2:  |     |     |   x |     |     |

        The array is divided into chunks of 128 (the size of a threadblock).
        The threadblocks consume the chunks in round-robin order.  First, a
        threadblock loads a chunk into temp memory, then all subsequent
        chunks assigned to it are combined into that temp memory.

        Once all chunks are processed, an inner-block reduction is performed
        on the temp memory so that there is just one scalar result per
        block.  The result from each block is stored to `partials` at its
        dedicated slot.
        """
        tid = cuda.threadIdx.x
        blkid = cuda.blockIdx.x
        blksz = cuda.blockDim.x
        gridsz = cuda.gridDim.x

        # block strided loop to compute the reduction
        start = tid + blksz * blkid
        stop = arr.size
        step = blksz * gridsz

        # load first value
        tmp = arr[start]
        # loop over all values in block-stride
        for i in range(start + step, stop, step):
            tmp = reduce_op(tmp, arr[i])

        cuda.syncthreads()
        # inner-warp reduction
        inner_warp_reduction(sm_partials, tmp)

        cuda.syncthreads()
        # at this point, only the first slot for each warp in sm_partials
        # is valid.

        # finish up block reduction
        # warning: this is assuming 4 warps.
        # assert numwarps == 4
        if tid < 2:
            sm_partials[tid, 0] = reduce_op(sm_partials[tid, 0],
                                            sm_partials[tid + 2, 0])
            cuda.syncwarp()
        if tid == 0:
            partials[blkid] = reduce_op(sm_partials[0, 0], sm_partials[1, 0])
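
The round-robin chunk assignment described in the docstring can be checked with a small host-side sketch (illustrative only; the helper name and the sizes are assumptions, chosen to match the 5-chunk diagram with 3 blocks of 128 threads):

def chunk_starts_for_block(blkid, blksz=128, gridsz=3, nelem=640):
    # same stride arithmetic as the kernel loop: start, start + step, ...
    step = blksz * gridsz
    return list(range(blkid * blksz, nelem, step))

# reproduces the table above:
#   block-0 -> [0, 384], block-1 -> [128, 512], block-2 -> [256]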
Example #3
    def inner_warp_reduction(sm_partials, init):
        """
        Compute reduction within a single warp
        """
        tid = cuda.threadIdx.x
        warpid = tid // _WARPSIZE
        laneid = tid % _WARPSIZE

        sm_this = sm_partials[warpid, :]
        sm_this[laneid] = init
        cuda.syncwarp()

        # pairwise tree reduction: lanes [0, width) fold in lanes
        # [width, 2 * width), and the active width halves each pass
        width = _WARPSIZE // 2
        while width:
            if laneid < width:
                old = sm_this[laneid]
                sm_this[laneid] = reduce_op(old, sm_this[laneid + width])
            cuda.syncwarp()
            width //= 2
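
A sequential sketch of the same in-warp tree (illustrative only; it assumes `reduce_op` is addition and a warp size of 32):

vals = list(range(32))       # one value per lane, as if laneid were the init value
width = 32 // 2
while width:
    for lane in range(width):
        vals[lane] = vals[lane] + vals[lane + width]   # stands in for reduce_op
    width //= 2
assert vals[0] == sum(range(32))   # the reduction lands in lane 0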
Example #4
def useful_syncwarp(ary):
    i = cuda.grid(1)
    if i == 0:
        ary[0] = 42
    # full-warp barrier: makes the store above visible to every thread
    # in the warp before they all read ary[0]
    cuda.syncwarp(0xffffffff)
    ary[i] = ary[0]
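
The barrier in this kernel is the useful one: thread 0 stores into ary[0], the full-warp mask 0xffffffff orders that store for the whole warp, and only then does every thread read it back. A minimal launch sketch (assumed, not taken from the original source):

import numpy as np
from numba import cuda

ary = np.zeros(32, dtype=np.int32)
cuda.jit("void(int32[::1])")(useful_syncwarp)[1, 32](ary)
# every element now holds 42, the value broadcast from thread 0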
Example #5
def useless_syncwarp_with_mask(ary):
    i = cuda.grid(1)
    # barrier over the lower 16 lanes only; no data is exchanged between
    # threads here, so the synchronization has no observable effect
    cuda.syncwarp(0xFFFF)
    ary[i] = i
Example #6
def useless_syncwarp(ary):
    i = cuda.grid(1)
    # full-warp barrier, but each thread only writes its own element,
    # so the synchronization is redundant
    cuda.syncwarp()
    ary[i] = i