예제 #1
0
def sum_reduction_kernel(A, partial_sums):
    """
    Reduce each work-group's slice of ``A`` to one partial sum.

    Every work-item copies ``A[global_id]`` into a 64-slot local scratch
    array. The group then repeatedly adds the upper half of the active
    range onto the lower half, and the work-item with local id 0 writes
    the surviving value to ``partial_sums[group_id]``.

    Constraints visible in the code:

    * the scratch array is indexed by local id, so the local size must not
      exceed 64;
    * the halving scheme only folds in every slot when the local size is a
      power of two;
    * ``A`` is indexed by global id without a bounds check.
    """
    wg_id = dppy.get_group_id(0)
    wg_size = dppy.get_local_size(0)
    gid = dppy.get_global_id(0)
    lid = dppy.get_local_id(0)

    scratch = dppy.local.array(64, int32)

    # Stage this work-item's element in the local scratch array.
    scratch[lid] = A[gid]

    # Pairwise reduction: the active range is halved on every pass.
    half = wg_size // 2
    while half > 0:
        # Synchronise first so that the staging write (first pass) or the
        # previous pass's additions are complete before they are read.
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

        # Only the lower half of the active range accumulates; each item
        # folds in its partner located `half` slots above it.
        if lid < half:
            scratch[lid] += scratch[lid + half]

        half >>= 1

    # Slot 0 holds the group's total; one work-item publishes it.
    if lid == 0:
        partial_sums[wg_id] = scratch[0]
예제 #2
0
 def f(a):
     """
     Round-trip ``a[0]`` through a one-element local array, applying ``op``
     to that array in between.

     ``dtype`` and ``op`` are not defined here; they are taken from the
     enclosing scope. ``op`` is invoked as ``op(array, 0, 1)`` -- presumably
     an atomic update of slot 0 by 1; confirm against the caller.
     """
     scratch = dppy.local.array(1, dtype)
     scratch[0] = a[0]
     # NOTE(review): a global fence is used although the data being guarded
     # lives in the local array -- confirm this is intentional.
     dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE)
     op(scratch, 0, 1)
     dppy.barrier(dppy.CLK_GLOBAL_MEM_FENCE)
     # Copy the updated value back to the caller-visible array.
     a[0] = scratch[0]
예제 #3
0
def sum_reduction_kernel(A, input_size, partial_sums):
    """
    Reduce each work-group's slice of ``A`` to one partial sum, tolerating
    a global range larger than the input.

    Work-items whose global id is below ``input_size`` stage
    ``A[global_id]`` in a 64-slot local scratch array; all others stage 0.
    The group then repeatedly adds the upper half of the active range onto
    the lower half, and the work-item with local id 0 writes the result to
    ``partial_sums[group_id]``.

    The scratch array is indexed by local id, so the local size must not
    exceed 64, and the halving scheme only folds in every slot when the
    local size is a power of two.
    """
    wg_id = dppy.get_group_id(0)
    wg_size = dppy.get_local_size(0)
    gid = dppy.get_global_id(0)
    lid = dppy.get_local_id(0)

    scratch = dppy.local.array(64, int32)

    # Out-of-range work-items contribute zero, so they do not affect the
    # sum and never read past the end of A.
    if gid < input_size:
        scratch[lid] = A[gid]
    else:
        scratch[lid] = 0

    # Pairwise reduction: the active range is halved on every pass.
    half = wg_size // 2
    while half > 0:
        # Synchronise first so that the staging write (first pass) or the
        # previous pass's additions are complete before they are read.
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)

        # Only the lower half of the active range accumulates; each item
        # folds in its partner located `half` slots above it.
        if lid < half:
            scratch[lid] += scratch[lid + half]

        half >>= 1

    # Slot 0 holds the group's total; one work-item publishes it.
    if lid == 0:
        partial_sums[wg_id] = scratch[0]
예제 #4
0
    def reverse_array(A):
        """
        Add to each ``A[i]`` the element at the mirrored position
        ``blocksize - 1 - i``, using a local array as a snapshot of ``A``.

        ``blocksize`` is read from the enclosing scope. The snapshot has 10
        slots and is indexed by global id, so this only works for a single
        work-group of at most 10 work-items.
        """
        gid = dppy.get_global_id(0)
        mirror = blocksize - 1 - gid
        snapshot = dppy.local.array(shape=10, dtype=np.float32)

        # Stage the original value before any element of A is modified.
        snapshot[gid] = A[gid]
        # Per the original author's note, either a local or a global fence
        # would do here because only one work-group is launched.
        dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
        # Accumulate the mirrored original value into this element.
        A[gid] += snapshot[mirror]
    def private_memory_kernel(A):
        """
        Write ``2 * global_id`` into ``A[global_id]`` by way of a
        one-element private array.

        The value is parked in the private array, a barrier is issued, and
        the value is then read back and doubled.
        """
        gid = numba_dppy.get_global_id(0)
        scratch = numba_dppy.private.array(shape=1, dtype=np.float32)

        # Park this work-item's id in its private slot.
        scratch[0] = gid
        numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)

        # Every work-item writes slot 0, so the result is only
        # deterministic if the array really is private to each work-item.
        A[gid] = scratch[0] * 2
예제 #6
0
 def private_memory_kernel(A):
     """
     Write ``2 * global_id`` into ``A[global_id]`` by way of a one-element
     private array, with a barrier between the store and the load.
     """
     scratch = numba_dppy.private.array(shape=1, dtype=np.float32)
     gid = numba_dppy.get_global_id(0)
     # Every work-item writes slot 0 of its own private array.
     scratch[0] = gid
     numba_dppy.barrier(numba_dppy.CLK_LOCAL_MEM_FENCE)
     A[gid] = scratch[0] * 2
예제 #7
0
 def twice(A):
     """
     Double ``A[global_id]`` in place, with a barrier between the read and
     the write.
     """
     gid = dppy.get_global_id(0)
     value = A[gid]
     # Per the original note, calling barrier() without an argument
     # defaults to a global memory fence.
     dppy.barrier()
     A[gid] = value * 2
예제 #8
0
 def twice(A):
     """
     Double ``A[global_id]`` in place, with a local-memory-fence barrier
     between the read and the write.
     """
     gid = dppy.get_global_id(0)
     value = A[gid]
     # Explicitly request the local memory fence for this barrier.
     dppy.barrier(dppy.CLK_LOCAL_MEM_FENCE)
     A[gid] = value * 2