Example #1
def bincount_pycuda(x, minlength=None):
    """PyCUDA implementation of `bincount()`"""

    if not interop_pycuda.available():
        raise NotImplementedError("CUDA not available")

    import pycuda
    from pycuda.compiler import SourceModule

    interop_pycuda.init()

    x_max = int(x.max())
    if x_max < 0:
        raise RuntimeError(
            "bincount(): first argument must be a 1-dimensional, non-negative int array"
        )
    if x_max > np.iinfo(np.uint32).max:
        raise NotImplementedError(
            "CUDA: the elements in the first argument must fit in a 32-bit integer"
        )
    if minlength is not None:
        # `minlength` is the minimum output length, so the largest bin index
        # it implies is `minlength - 1` (matching NumPy's bincount semantics)
        x_max = max(x_max, minlength - 1)

    # TODO: handle large max element by running multiple bincount() on a range
    # The per-block histogram is uint32, so it has to fit in shared memory
    if (x_max + 1) * np.dtype(np.uint32).itemsize > interop_pycuda.max_local_memory():
        raise NotImplementedError("CUDA: max element is too large for the GPU")

    # Let's create the output array and retrieve the in-/output GPU arrays
    # NB: we always return a uint32 array
    ret = array_create.ones((x_max + 1, ), dtype=np.uint32)
    x_buf = interop_pycuda.get_gpuarray(x)
    ret_buf = interop_pycuda.get_gpuarray(ret)

    # The CUDA kernels are based on the histogram example in the book
    # "OpenCL Programming Guide" by Aaftab Munshi et al.
    source = """
    __global__ void histogram_partial(
        DTYPE *input,
        uint *partial_histo,
        uint input_size
    ){
        int local_size = blockDim.x;
        int group_indx = blockIdx.x * HISTO_SIZE;
        int gid = (blockIdx.x * blockDim.x + threadIdx.x);
        int tid = threadIdx.x;

        __shared__ uint tmp_histogram[HISTO_SIZE];

        int j = HISTO_SIZE;
        int indx = 0;

        // clear the local buffer that will generate the partial histogram
        do {
            if (tid < j)
                tmp_histogram[indx+tid] = 0;
            j -= local_size;
            indx += local_size;
        } while (j > 0);

        __syncthreads();

        // each thread atomically counts its own input element in the
        // block-local histogram; threads past the end of the input do nothing
        if (gid < input_size) {
            atomicAdd(&tmp_histogram[input[gid]], 1);
        }

        __syncthreads();

        // copy the partial histogram to appropriate location in
        // histogram given by group_indx
        if (local_size >= HISTO_SIZE){
            if (tid < HISTO_SIZE)
                partial_histo[group_indx + tid] = tmp_histogram[tid];
        }else{
            j = HISTO_SIZE;
            indx = 0;
            do {
                if (tid < j)
                    partial_histo[group_indx + indx + tid] = tmp_histogram[indx + tid];

                j -= local_size;
                indx += local_size;
            } while (j > 0);
        }
    }

    __global__ void histogram_sum_partial_results(
        uint *partial_histogram,
        int num_groups,
        uint *histogram
    ){
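        // launched with one thread per histogram bin (block=(1,1,1),
        // grid=(HISTO_SIZE,1)), so gid is the bin index; each thread sums
        // its bin across all of the per-block partial histograms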
        int gid = (blockIdx.x * blockDim.x + threadIdx.x);
        int group_indx;
        int n = num_groups;
        __shared__ uint tmp_histogram[HISTO_SIZE];

        tmp_histogram[gid] = partial_histogram[gid];
        group_indx = HISTO_SIZE;
        while (--n > 0) {
            tmp_histogram[gid] += partial_histogram[group_indx + gid];
            group_indx += HISTO_SIZE;
        }
        histogram[gid] = tmp_histogram[gid];
    }
    """
    source = source.replace("HISTO_SIZE", "%d" % ret.shape[0])
    source = source.replace("DTYPE", interop_pycuda.type_np2cuda_str(x.dtype))
    prg = SourceModule(source)

    # Calculate sizes for the kernel execution
    kernel = prg.get_function("histogram_partial")
    local_size = kernel.get_attribute(
        pycuda.driver.function_attribute.MAX_THREADS_PER_BLOCK
    )  # Max work-group size
    num_groups = int(math.ceil(x.shape[0] / float(local_size)))

    # First we compute the partial histograms
    partial_res_g = pycuda.driver.mem_alloc(num_groups * ret.nbytes)
    kernel(x_buf,
           partial_res_g,
           np.uint32(x.shape[0]),
           block=(local_size, 1, 1),
           grid=(num_groups, 1))

    # Then we sum the partial histograms into the final histogram
    kernel = prg.get_function("histogram_sum_partial_results")
    kernel(partial_res_g,
           np.uint32(num_groups),
           ret_buf,
           block=(1, 1, 1),
           grid=(ret.shape[0], 1))
    return ret
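
The two kernels above implement a two-phase reduction: each block first builds a private histogram of its slice of the input in shared memory, and a second launch then sums those partial histograms bin by bin. Note that `np`, `math`, `array_create`, and the `interop_pycuda` helpers are assumed to be imported by the surrounding module. Below is a minimal NumPy sketch of the same two-phase strategy, included purely as an illustration; the `bincount_two_phase` name and the `group_size` parameter are made up for this sketch and are not part of the example.

import numpy as np

def bincount_two_phase(x, group_size=256):
    """CPU illustration of the partial-histogram + sum strategy above."""
    x = np.asarray(x)
    histo_size = int(x.max()) + 1
    num_groups = -(-x.shape[0] // group_size)  # ceil division
    partial = np.zeros((num_groups, histo_size), dtype=np.uint32)
    # Phase 1: one partial histogram per "work-group" (cf. histogram_partial)
    for g in range(num_groups):
        chunk = x[g * group_size:(g + 1) * group_size]
        partial[g] = np.bincount(chunk, minlength=histo_size)
    # Phase 2: sum the partial histograms bin by bin
    # (cf. histogram_sum_partial_results)
    return partial.sum(axis=0, dtype=np.uint32)

assert np.array_equal(bincount_two_phase([0, 1, 1, 3]), np.bincount([0, 1, 1, 3]))
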
Example #2
def bincount_pyopencl(x, minlength=None):
    """PyOpenCL implementation of `bincount()`"""

    if not interop_pyopencl.available():
        raise NotImplementedError("OpenCL not available")

    import pyopencl as cl
    ctx = interop_pyopencl.get_context()
    queue = cl.CommandQueue(ctx)

    x_max = int(x.max())
    if x_max < 0:
        raise RuntimeError(
            "bincount(): first argument must be a 1-dimensional, non-negative int array"
        )
    if x_max > np.iinfo(np.uint32).max:
        raise NotImplementedError(
            "OpenCL: the elements in the first argument must fit in a 32-bit integer"
        )
    if minlength is not None:
        # `minlength` is the minimum output length, so the largest bin index
        # it implies is `minlength - 1` (matching NumPy's bincount semantics)
        x_max = max(x_max, minlength - 1)

    # TODO: handle large max element by running multiple bincount() on a range
    # The per-group histogram is uint32, so it has to fit in local memory
    if (x_max + 1) * np.dtype(np.uint32).itemsize > interop_pyopencl.max_local_memory(queue.device):
        raise NotImplementedError(
            "OpenCL: max element is too large for the GPU")

    # Let's create the output array and retrieve the in-/output OpenCL buffers
    # NB: we always return a uint32 array
    ret = array_create.empty((x_max + 1, ), dtype=np.uint32)
    x_buf = interop_pyopencl.get_buffer(x)
    ret_buf = interop_pyopencl.get_buffer(ret)

    # The OpenCL kernels are based on the histogram example in the book
    # "OpenCL Programming Guide" by Aaftab Munshi et al.
    source = """
    kernel void histogram_partial(
        global DTYPE *input,
        global uint *partial_histo,
        uint input_size
    ){
        int local_size = (int)get_local_size(0);
        int group_indx = get_group_id(0) * HISTO_SIZE;
        int gid = get_global_id(0);
        int tid = get_local_id(0);

        local uint tmp_histogram[HISTO_SIZE];

        int j = HISTO_SIZE;
        int indx = 0;

        // clear the local buffer that will generate the partial histogram
        do {
            if (tid < j)
                tmp_histogram[indx+tid] = 0;
            j -= local_size;
            indx += local_size;
        } while (j > 0);

        barrier(CLK_LOCAL_MEM_FENCE);

        // each work-item atomically counts its own input element in the
        // group-local histogram; work-items past the end of the input do nothing
        if (gid < input_size) {
            atomic_inc(&tmp_histogram[input[gid]]);
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        // copy the partial histogram to appropriate location in
        // histogram given by group_indx
        if (local_size >= HISTO_SIZE){
            if (tid < HISTO_SIZE)
                partial_histo[group_indx + tid] = tmp_histogram[tid];
        }else{
            j = HISTO_SIZE;
            indx = 0;
            do {
                if (tid < j)
                    partial_histo[group_indx + indx + tid] = tmp_histogram[indx + tid];

                j -= local_size;
                indx += local_size;
            } while (j > 0);
        }
    }

    kernel void histogram_sum_partial_results(
        global uint *partial_histogram,
        int num_groups,
        global uint *histogram
    ){
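        // launched with one work-item per histogram bin (global size ==
        // HISTO_SIZE, no explicit work-group size), so gid is the bin index;
        // each work-item sums its bin across all per-group partial histograms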
        int gid = (int)get_global_id(0);
        int group_indx;
        int n = num_groups;
        local uint tmp_histogram[HISTO_SIZE];

        tmp_histogram[gid] = partial_histogram[gid];
        group_indx = HISTO_SIZE;
        while (--n > 0) {
            tmp_histogram[gid] += partial_histogram[group_indx + gid];
            group_indx += HISTO_SIZE;
        }
        histogram[gid] = tmp_histogram[gid];
    }
    """
    source = source.replace("HISTO_SIZE", "%d" % ret.shape[0])
    source = source.replace("DTYPE",
                            interop_pyopencl.type_np2opencl_str(x.dtype))
    prg = cl.Program(ctx, source).build()

    # Calculate sizes for the kernel execution
    local_size = interop_pyopencl.kernel_info(prg.histogram_partial,
                                              queue)[0]  # Max work-group size
    num_groups = int(math.ceil(x.shape[0] / float(local_size)))
    global_size = local_size * num_groups

    # First we compute the partial histograms
    partial_res_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY,
                              num_groups * ret.nbytes)
    prg.histogram_partial(queue, (global_size, ), (local_size, ), x_buf,
                          partial_res_g, np.uint32(x.shape[0]))

    # Then we sum the partial histograms into the final histogram
    prg.histogram_sum_partial_results(queue, ret.shape, None, partial_res_g,
                                      np.uint32(num_groups), ret_buf)
    return ret
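
Both examples above go through project-specific helpers (`array_create`, `interop_pyopencl`, `interop_pycuda`), so they are not directly runnable on a plain NumPy array. For comparison, the following is a self-contained PyOpenCL sketch of the simpler single-kernel variant that increments bins directly in global memory with `atomic_inc`; it skips the partial-histogram pass at the cost of heavier contention on popular bins. It is only an illustration (the `bincount_global_atomics` name is made up) and assumes a device with 32-bit global atomics.

import numpy as np
import pyopencl as cl

def bincount_global_atomics(x):
    """Single-kernel bincount using atomics on a global histogram."""
    x = np.ascontiguousarray(x, dtype=np.uint32)
    nbins = int(x.max()) + 1
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    x_buf = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=x)
    out = np.zeros(nbins, dtype=np.uint32)
    out_buf = cl.Buffer(ctx, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=out)
    prg = cl.Program(ctx, """
    kernel void bincount(global const uint *x, global uint *out, uint n) {
        uint gid = get_global_id(0);
        if (gid < n)
            atomic_inc(&out[x[gid]]);  // one global atomic per input element
    }
    """).build()
    prg.bincount(queue, (x.shape[0],), None,
                 x_buf, out_buf, np.uint32(x.shape[0]))
    cl.enqueue_copy(queue, out, out_buf)
    return out

data = np.random.randint(0, 100, size=10000)
assert np.array_equal(bincount_global_atomics(data), np.bincount(data))
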