示例#1
0
def test_vector_fill(ctx_factory):
    """Fill a float4 device array and check the dtype of the fetched data.

    Also constructs a float4 array through ``cl_array.zeros``; that result
    is not inspected, so the call only has to complete without raising.
    """
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    dev_ary = cl_array.Array(cq, 100, dtype=cltypes.float4)
    fill_value = cltypes.make_float4(0.0, 0.0, 1.0, 0.0)
    dev_ary.fill(fill_value)
    fetched = dev_ary.get()
    assert fetched.dtype == cltypes.float4

    # Smoke check only: zeros() must accept a vector dtype.
    dev_ary = cl_array.zeros(cq, 100, dtype=cltypes.float4)
示例#2
0
def test_vector_fill(ctx_factory):
    """Fill a float4 device array and check the dtype of the fetched data.

    Also constructs a float4 array through ``cl_array.zeros``; that result
    is not inspected, so the call only has to complete without raising.
    """
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    dev_ary = cl_array.Array(cq, 100, dtype=cltypes.float4)
    fill_value = cltypes.make_float4(0.0, 0.0, 1.0, 0.0)
    dev_ary.fill(fill_value)
    fetched = dev_ary.get()
    assert fetched.dtype == cltypes.float4

    # Smoke check only: zeros() must accept a vector dtype.
    dev_ary = cl_array.zeros(cq, 100, dtype=cltypes.float4)
示例#3
0
def test_vector_args(ctx_factory):
    """Pass a float4 value as a kernel argument and check it is written back.

    The kernel stores its float4 argument into every element of ``dest``;
    after copying the buffer back, every element must equal that value.
    """
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    program = cl.Program(context=ctx, src="""
        __kernel void set_vec(float4 x, __global float4 *dest)
        { dest[get_global_id(0)] = x; }
        """).build() if False else cl.Program(ctx, """
        __kernel void set_vec(float4 x, __global float4 *dest)
        { dest[get_global_id(0)] = x; }
        """).build()

    vec = cltypes.make_float4(1, 2, 3, 4)
    host_dest = np.empty(50000, cltypes.float4)
    flags = cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR
    dev_dest = cl.Buffer(ctx, flags, hostbuf=host_dest)

    # One work item per destination element; local size left to the runtime.
    program.set_vec(cq, host_dest.shape, None, vec, dev_dest)

    copy_event = cl.enqueue_copy(cq, host_dest, dev_dest)
    copy_event.wait()

    assert (host_dest == vec).all()
示例#4
0
def test_axpy(ctx_factory):
    """Run the axpy kernel ``z[i] = a*x[i] + b*y[i]`` for several dtypes.

    For each dtype the kernel is built once, transformed by a CPU-style and
    a GPU-style tiling variant, and each variant is handed to
    ``lp.auto_test_vs_ref`` together with the untransformed kernel as the
    reference. The test is skipped on platforms whose vendor string starts
    with "Advanced Micro".
    """
    logging.basicConfig(level="INFO")
    ctx = ctx_factory()

    n = 3145182

    vendor = ctx.devices[0].platform.vendor
    if vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    # The two tiling strategies do not depend on the dtype under test, so
    # they are defined once, ahead of the dtype loop.
    def variant_cpu(kernel):
        unroll = 16
        block_size = unroll * 4096
        tiled = lp.split_iname(
            kernel, "i", block_size, outer_tag="g.0", slabs=(0, 1))
        return lp.split_iname(tiled, "i_inner", unroll, inner_tag="unr")

    def variant_gpu(kernel):
        unroll = 4
        block_size = 256
        tiled = lp.split_iname(
            kernel, "i", unroll * block_size, outer_tag="g.0", slabs=(0, 1))
        return lp.split_iname(
            tiled, "i_inner", block_size, outer_tag="unr", inner_tag="l.0")

    # (dtype, result checker, value of a, value of b)
    cases = [
        (np.complex64, None, 5, 7),
        (
            cltypes.float4,
            check_float4,  # pylint:disable=no-member
            cltypes.make_float4(1, 2, 3, 4),  # pylint:disable=no-member
            cltypes.make_float4(6, 7, 8, 9)),  # pylint:disable=no-member
        (np.float32, None, 5, 7),
    ]

    for dtype, check, a, b in cases:
        kernel_args = [
            lp.ValueArg("a", dtype),
            lp.GlobalArg("x", dtype, shape="n,"),
            lp.ValueArg("b", dtype),
            lp.GlobalArg("y", dtype, shape="n,"),
            lp.GlobalArg("z", dtype, shape="n,"),
            lp.ValueArg("n", np.int32, approximately=n),
        ]
        knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            ["z[i] = a*x[i]+b*y[i]"],
            kernel_args,
            name="axpy",
            assumptions="n>=1")

        seq_knl = knl

        for variant in (variant_cpu, variant_gpu):
            # Three arrays of n elements are touched; reported in GBytes.
            gbytes = np.dtype(dtype).itemsize * n * 3 / 1e9
            lp.auto_test_vs_ref(
                seq_knl,
                ctx,
                variant(knl),
                op_count=[gbytes],
                op_label=["GBytes"],
                parameters={"a": a, "b": b, "n": n},
                check_result=check,
                blacklist_ref_vendors=["Advanced Micro"])
示例#5
0
def csr_to_ocl_sell4(n_row, rowptr, colidx, val):
    """
    Convert CSR format to Sliced ELLPACK format with slice height = 4.

    Rows are grouped into slices of four consecutive rows. Each slice is
    padded to the length of its longest row and emitted column by column:
    every stored column is one ``int4`` of column indices plus one ``float4``
    of values, where lane ``k`` belongs to row ``k`` of the slice. If
    ``n_row`` is not a multiple of four, the last slice is completed with
    all-zero lanes.

    Parameters
    ----------
    n_row : int
        Number of rows
    rowptr : ndarrays
        Row pointer of CSR format (entries ``0 .. n_row`` are read)
    colidx : ndarrays
        Column index of CSR format
    val : ndarrays
        Non-zero element values of CSR format

    Returns
    -------
    slice_count : int
        Number of slices, a trailing partial slice included
    ell_colidx : ndarrays
        Column index of Sliced ELLPACK format, one int4 per stored column
    ell_sliceptr : ndarrays
        Slice pointer of Sliced ELLPACK format (int32): start offset of each
        slice counted in scalar elements, padding included, followed by the
        total element count
    ell_slicecol : ndarrays
        Cumulative column count (int32): starts with 0, then one entry per
        slice holding the number of columns stored up to and including it
    ell_val : ndarrays
        Element values of Sliced ELLPACK format, one float4 per stored
        column; padded positions hold 0

    Notes
    -----
    When no column is stored at all (``n_row == 0`` or every row is empty),
    ``ell_colidx`` and ``ell_val`` are empty arrays with numpy's default
    dtype, because there is no element to infer the vector dtype from.
    """

    slice_height = 4
    # Exact integer ceiling division; no float round trip.
    slice_count = int(-(-n_row // slice_height))
    stored_elems = 0  # scalar slots emitted so far (real entries + padding)
    stored_cols = 0  # int4/float4 columns emitted so far

    ell_colidx = []
    ell_sliceptr = []
    ell_slicecol = [0]
    ell_val = []

    for slice_idx in range(slice_count):
        first_row = slice_idx * slice_height
        # Only the last slice can contain fewer than slice_height real rows.
        live_rows = min(slice_height, n_row - first_row)

        # Slice width = length of the longest real row in the slice.
        width = 0
        for k in range(live_rows):
            row = first_row + k
            width = max(width, rowptr[row + 1] - rowptr[row])

        ell_sliceptr.append(stored_elems)
        stored_cols += width
        ell_slicecol.append(stored_cols)

        # Padding inside a real row repeats the column index most recently
        # stored in this slice (0 until one has been stored); lanes past the
        # end of the matrix always get column index 0.
        pre_idx = 0
        for j in range(width):  # column scan
            lane_colidx = []
            lane_val = []
            for k in range(slice_height):  # row scan
                if k >= live_rows:
                    lane_colidx.append(0)
                    lane_val.append(0)  # padded zero
                    continue

                row = first_row + k
                pos = rowptr[row] + j
                if pos < rowptr[row + 1]:
                    pre_idx = colidx[pos]
                    lane_colidx.append(pre_idx)
                    lane_val.append(val[pos])
                else:
                    lane_colidx.append(pre_idx)
                    lane_val.append(0)  # padded zero

            stored_elems += slice_height
            # Pack the four lanes into OpenCL vector values.
            ell_colidx.append(cltypes.make_int4(*lane_colidx))
            ell_val.append(cltypes.make_float4(*lane_val))

    ell_sliceptr.append(stored_elems)
    return (
        slice_count,
        np.array(ell_colidx),
        np.array(ell_sliceptr, dtype=np.int32),
        np.array(ell_slicecol, dtype=np.int32),
        np.array(ell_val),
    )