def atomicAggInc(counter):
    """Aggregate the increments of all coalesced threads into one atomic add.

    One thread (the leader) adds the number of set ballot bits to
    ``counter[0]``; the value returned by that add is then shuffled to every
    thread in the group, and each thread offsets it by the number of set
    ballot bits below its own rank.

    Args:
        counter: Indexable target of the atomic add; only index 0 is updated.

    Returns:
        The leader's ``cuda.atomic.add`` result plus this thread's count of
        lower-ranked set ballot bits. Presumably a unique slot index per
        thread, assuming ``cuda.atomic.add`` returns the pre-add value --
        TODO confirm against the ``cuda`` API in use.
    """
    group = cuda.cg.coalesced_threads()
    lanes = group.ballot(True)
    rank = group.thread_rank

    # ffs is treated as 1-based here (hence the -1), so the leader is the
    # position of the lowest set bit in the ballot.
    leader_rank = cuda.ffs(lanes) - 1

    # Only the leader touches the counter, adding one per set ballot bit.
    # NOTE(review): `base` is bound only on this path; every other thread
    # hands an unassigned value to shfl below and relies on the broadcast to
    # overwrite it -- confirm the target compiler accepts this.
    if rank == leader_rank:
        base = cuda.atomic.add(counter, 0, cuda.popc(lanes))

    # Every thread picks up the leader's value.
    base = group.shfl(base, leader_rank)

    # Count the set ballot bits strictly below this thread's rank.
    lower_lanes = lanes & ((1 << rank) - 1)
    return base + cuda.popc(lower_lanes)
예제 #2
0
def simple_popc(ary, c):
    """Write ``cuda.popc(c)`` into the first element of ``ary``.

    Args:
        ary: Indexable output; only ``ary[0]`` is written.
        c: Value passed to ``cuda.popc`` (presumably an integer whose set
            bits are counted -- confirm against the ``cuda`` API in use).
    """
    bit_count = cuda.popc(c)
    ary[0] = bit_count
예제 #3
0
def simple_popc(ary, c):
    """Store the result of ``cuda.popc(c)`` in ``ary[0]``.

    Args:
        ary: Indexable output; only index 0 is assigned.
        c: Argument forwarded unchanged to ``cuda.popc`` (presumably an
            integer bit pattern -- verify against the ``cuda`` API in use).
    """
    popcount = cuda.popc(c)
    ary[0] = popcount