Python get_pocl_version 예제들, pyopencl.characterize.get_pocl_version Python 예제들

예제 #1

0

파일 보기

파일: test_algorithm.py 프로젝트: numa65536/pyopencl

def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check that BitonicSort sorts values and keeps the index array consistent.

    A random 1-D device array with ``size`` entries of ``dtype`` is sorted
    together with an ``arange`` index array.  The sorted values are compared
    against ``np.sort``, and the index array is validated by gathering the
    original values through it.

    Platform/device combinations that are known to fail are reported through
    ``pytest.xfail``; devices lacking double support are skipped for
    ``np.float64``.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # NOTE(review): the ctx_factory fixture is accepted but not used here; the
    # context comes from cl.create_some_context() -- confirm this is intended.
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        # Adjacent literals are concatenated, so the first one must end with a
        # space to keep the message readable.
        pytest.xfail("bitonic argsort fails on POCL + Nvidia, "
                "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    # ``m`` itself is never sorted (only its copy), so one host transfer is
    # enough for all of the comparisons below.
    m_host = m.get()

    assert np.array_equal(np.sort(m_host), ms.get())

    # Comparing np.argsort(m_host) with index.get() directly may be False
    # because of identical values in the array, so compare the gathered
    # values instead.
    assert np.array_equal(m_host[np.argsort(m_host)], m_host[index.get()])

예제 #2

0

파일 보기

파일: test_algorithm.py 프로젝트: zeta1999/pyopencl

def test_bitonic_sort(ctx_factory, size, dtype):
    """Sort a random ``(2, size, 3)`` device array along axis 1 with BitonicSort.

    The result is compared against ``np.sort`` applied to the same data on the
    host.  Known-broken platforms are xfailed and devices without double
    support are skipped for ``np.float64``.
    """
    context = cl.create_some_context()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]
    on_pocl = device.platform.name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if device.platform.name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if on_pocl and wants_double and get_pocl_version(device.platform) < (1, 0):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if wants_double and not has_double_support(device):
        pytest.skip("double precision not supported on %s" % device)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    shape = (2, size, 3)
    unsorted = clrandom.rand(
        cmd_queue, shape, dtype, luxury=None, a=0, b=239482333)
    work = unsorted.copy()

    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        work.finish()

    bitonic = BitonicSort(context)
    sorted_dev, _evt = bitonic(work, axis=1)

    expected = np.sort(unsorted.get(), axis=1)
    assert np.array_equal(expected, sorted_dev.get())

예제 #3

0

파일 보기

def test_bitonic_sort(ctx_factory, size, dtype):
    """Verify BitonicSort along axis 1 of a random ``(2, size, 3)`` array.

    The device result must equal ``np.sort(..., axis=1)`` of the same data.
    Unsupported platform/device combinations are xfailed or skipped first.
    """
    context = cl.create_some_context()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]
    platform = device.platform
    is_float64 = dtype == np.float64

    if platform.name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")

    if (platform.name == "Portable Computing Language"
            and is_float64
            and get_pocl_version(device.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if is_float64 and not has_double_support(device):
        pytest.skip("double precision not supported on %s" % device)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    data = clrandom.rand(
        cmd_queue, (2, size, 3), dtype, luxury=None, a=0, b=239482333)

    # Sort a copy so the untouched original can serve as the reference.
    sort_along_axis = BitonicSort(context)
    sorted_data, _evt = sort_along_axis(data.copy(), axis=1)

    reference = np.sort(data.get(), axis=1)
    assert np.array_equal(reference, sorted_data.get())

예제 #4

0

파일 보기

def test_coarse_grain_svm(ctx_factory):
    """Exercise coarse-grain SVM: memfill, map, kernel launch and copy-back.

    The test is skipped unless SVM is considered available (OpenCL >= 2.0 for
    context, device and headers; on pocl: pocl >= 1.0 and 2.0 headers), and
    xfailed on AMD CPU devices.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    context = ctx_factory()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]

    svm_available = (
        context._get_cl_version() >= (2, 0)
        and context.devices[0]._get_cl_version() >= (2, 0)
        and cl.get_cl_header_version() >= (2, 0))

    # pocl gets its own criterion, replacing the generic one above.
    if device.platform.name == "Portable Computing Language":
        svm_available = (
            get_pocl_version(device.platform) >= (1, 0)
            and cl.get_cl_header_version() >= (2, 0))

    if not svm_available:
        pytest.skip("SVM only available in OpenCL 2.0 and higher")

    if "AMD" in device.platform.name and device.type & cl.device_type.CPU:
        pytest.xfail("AMD CPU doesn't do coarse-grain SVM")

    count = 3000
    svm_buf = cl.SVM(
        cl.csvm_empty(context, (count, ), np.float32, alignment=64))
    if not running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(svm_buf.mem.base, cl.SVMAllocation)

    zero = np.zeros((), svm_buf.mem.dtype)
    cl.enqueue_svm_memfill(cmd_queue, svm_buf, zero)

    with svm_buf.map_rw(cmd_queue) as mapped:
        mapped.fill(17)
        snapshot = mapped.copy()

    kernel_source = """
        __kernel void twice(__global float *a_g)
        {
          a_g[get_global_id(0)] *= 2;
        }
        """
    program = cl.Program(context, kernel_source).build()

    program.twice(cmd_queue, svm_buf.mem.shape, None, svm_buf)

    with svm_buf.map_ro(cmd_queue) as mapped:
        print(mapped)
        assert np.array_equal(snapshot * 2, mapped)

    # Pre-fill the destination with -1 before the copy-back below.
    copied_back = np.empty_like(snapshot)
    copied_back.fill(-1)

    if context.devices[0].platform.name == "Portable Computing Language":
        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
        # in pocl up to and including 1.0rc1.
        return

    cl.enqueue_copy(cmd_queue, copied_back, svm_buf)
    assert np.array_equal(snapshot * 2, copied_back)
예제 #5

0

파일 보기

def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check BitonicSort with an accompanying index array (argsort).

    Sorts a random 1-D array of ``size`` entries of ``dtype`` along axis 0
    while permuting an ``arange`` index array, then validates both the sorted
    values and the values gathered through the resulting indices.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    if running_on_pypy and not size:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    context = cl.create_some_context()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]
    on_pocl = device.platform.name == "Portable Computing Language"
    wants_double = dtype == np.float64

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if device.platform.name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if on_pocl and wants_double and get_pocl_version(device.platform) < (1, 0):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if wants_double and not has_double_support(device):
        pytest.skip("double precision not supported on %s" % device)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    indices = cl_array.arange(cmd_queue, 0, size, 1, dtype=np.int32)
    values = clrandom.rand(
        cmd_queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    bitonic = BitonicSort(context)
    sorted_values, _evt = bitonic(values.copy(), idx=indices, axis=0)

    # ``values`` is left untouched (its copy was sorted), so fetch it once.
    host_values = values.get()
    assert np.array_equal(np.sort(host_values), sorted_values.get())

    # A direct comparison of np.argsort(host_values) with indices.get() may be
    # False because of identical values in the array; compare the values that
    # the indices select instead.
    via_numpy = host_values[np.argsort(host_values)]
    via_device = host_values[indices.get()]
    assert np.array_equal(via_numpy, via_device)

예제 #6

0

파일 보기

파일: test_algorithm.py 프로젝트: kif/pyopencl

def test_bitonic_argsort(ctx_factory, size, dtype):
    """Validate BitonicSort's index output for a random 1-D array.

    The sorted values must match ``np.sort`` and the permuted index array
    must select the same values as ``np.argsort`` does.  Unsupported
    environments are xfailed or skipped before any device work happens.
    """
    import sys

    if not size and '__pypy__' in sys.builtin_module_names:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    context = cl.create_some_context()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]
    platform_name = device.platform.name

    if (platform_name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")

    if platform_name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")

    if (platform_name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(device.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(device):
        pytest.skip("double precision not supported on %s" % device)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    order = cl_array.arange(cmd_queue, 0, size, 1, dtype=np.int32)
    source = clrandom.rand(
        cmd_queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sort_fn = BitonicSort(context)
    result, _evt = sort_fn(source.copy(), idx=order, axis=0)

    # Only the copy was sorted; the original is fetched once as the reference.
    source_host = source.get()
    assert np.array_equal(np.sort(source_host), result.get())

    # np.argsort(source_host) need not equal order.get() when the array
    # contains identical values, so check the values selected by the indices.
    assert np.array_equal(
        source_host[np.argsort(source_host)], source_host[order.get()])

예제 #7

0

파일 보기

def _get_max_parameter_size(dev):
    """Return *dev*'s maximum kernel parameter size, corrected for pocl.

    pocl versions below 3.0 (with ``(1, 8)`` assumed when the version cannot
    be determined) that report a limit of 1024 get a substitute value:
    ``1024 * 1024`` for CPU devices and ``4352`` for GPU devices.  In every
    other case the device-reported value is returned unchanged.
    """
    from pyopencl.characterize import get_pocl_version

    reported = dev.max_parameter_size
    version = get_pocl_version(dev.platform, fallback_value=(1, 8))

    # Current pocl versions (as of 04/2022) have an incorrect parameter
    # size limit of 1024; see e.g. https://github.com/pocl/pocl/pull/1046
    affected = version is not None and version < (3, 0) and reported == 1024
    if not affected:
        return reported

    if dev.type & cl.device_type.CPU:
        return 1024 * 1024

    if dev.type & cl.device_type.GPU:
        # All modern Nvidia GPUs (starting from Compute Capability 2)
        # have this limit
        return 4352

    return reported

예제 #8

0

파일 보기

파일: test_algorithm.py 프로젝트: kif/pyopencl

def test_bitonic_sort(ctx_factory, size, dtype):
    """Compare BitonicSort along axis 1 with ``np.sort`` on random data.

    The input is a random ``(2, size, 3)`` device array of ``dtype``.
    Known-broken platforms are xfailed; devices without double support are
    skipped for ``np.float64``.
    """
    context = cl.create_some_context()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]
    double_requested = dtype == np.float64

    if device.platform.name == "Apple" and device.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")

    if (device.platform.name == "Portable Computing Language"
            and double_requested
            and get_pocl_version(device.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if double_requested and not has_double_support(device):
        pytest.skip("double precision not supported on %s" % device)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    original = clrandom.rand(
        cmd_queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)

    # The sorter receives a copy so ``original`` stays available as reference.
    device_sorted, _evt = BitonicSort(context)(original.copy(), axis=1)

    host_sorted = np.sort(original.get(), axis=1)
    assert np.array_equal(host_sorted, device_sorted.get())

예제 #9

0

파일 보기

def _skip_if_pocl(plat, up_to_version, msg='unsupported by pocl'):
    """Skip the running test on pocl platforms up to a given version.

    :arg plat: platform object; it counts as pocl when its ``vendor`` is
        ``"The pocl project"``.
    :arg up_to_version: newest affected version, compared with ``<=`` against
        ``get_pocl_version(plat)``; *None* skips on every pocl version.
    :arg msg: reason handed to ``pytest.skip``.
    """
    if plat.vendor != "The pocl project":
        return

    # The version is only queried when an upper bound was actually given.
    affected = up_to_version is None or get_pocl_version(plat) <= up_to_version
    if affected:
        pytest.skip(msg)

예제 #10

0

파일 보기

def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"):
    """Mark the running test as an expected failure on affected pocl versions.

    :arg plat: platform object; it counts as pocl when its ``vendor`` is
        ``"The pocl project"``.
    :arg up_to_version: newest affected version, compared with ``<=`` against
        ``get_pocl_version(plat)``; *None* xfails on every pocl version.
    :arg msg: reason handed to ``pytest.xfail``.
    """
    # Short-circuiting keeps get_pocl_version() from being called for
    # non-pocl platforms or when no upper bound was given.
    if plat.vendor == "The pocl project" and (
            up_to_version is None
            or get_pocl_version(plat) <= up_to_version):
        pytest.xfail(msg)

예제 #11

0

파일 보기

파일: test_enqueue_copy.py 프로젝트: inducer/pyopencl

def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
    """
    Test 2D sub-array (slice) copy.

    A random ``uint8`` host array is created, a rectangular sub-array of it
    is copied host -> device buffer -> second device buffer -> host, and the
    region received on the host is compared with the region that was sent.

    :arg honor_skip: when true, skip on pocl <= 0.13, whose rectangular
        copies crash.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    ary_in_shp = 256, 128  # Entire array shape from which sub-array copied to device
    sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19  # Origin of sub-array copy from device to host-array
    ary_out_shp = 512, 256  # Entire host-array shape copy sub-array device->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200  # shape of device buffer

    buf_out_origin = 31, 17  # Origin of 2nd device buffer
    buf_out_shp = 300, 400  # shape of 2nd device buffer

    # Create host array of random values.
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    h_ary_in = np.array(
        np.random.randint(0, 256, np.prod(ary_in_shp)),
        dtype=np.uint8,
    ).reshape(ary_in_shp)

    # Create device buffers
    d_in_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_in_shp))
    d_out_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_out_shp))

    # Origins and regions are given in array index order above and reversed
    # ([::-1]) when passed to enqueue_copy.

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(
        queue,
        d_in_buf,
        h_ary_in,
        buffer_origin=buf_in_origin[::-1],
        host_origin=ary_in_origin[::-1],
        region=sub_ary_shp[::-1],
        buffer_pitches=(buf_in_shp[-1],),
        host_pitches=(ary_in_shp[-1],)
    )
    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(
        queue,
        d_out_buf,
        d_in_buf,
        src_origin=buf_in_origin[::-1],
        dst_origin=buf_out_origin[::-1],
        region=sub_ary_shp[::-1],
        src_pitches=(buf_in_shp[-1],),
        dst_pitches=(buf_out_shp[-1],)
    )

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(
        queue,
        h_ary_out,
        d_out_buf,
        buffer_origin=buf_out_origin[::-1],
        host_origin=ary_out_origin[::-1],
        region=sub_ary_shp[::-1],
        buffer_pitches=(buf_out_shp[-1],),
        host_pitches=(ary_out_shp[-1],)
    )
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.all(h_ary_in[ary_in_slice] == h_ary_out[ary_out_slice])

예제 #12

0

파일 보기

파일: test_enqueue_copy.py 프로젝트: t20100/pyopencl

def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
    """
    Test 2D sub-array (slice) copy.

    A random ``uint8`` host array is created, a rectangular sub-array of it
    is copied host -> device buffer -> second device buffer -> host, and the
    region received on the host is compared with the region that was sent.

    :arg honor_skip: when true, skip on pocl <= 0.13 and xfail on Apple's
        CL implementation.  The pocl-on-GPU xfail applies regardless.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        # Adjacent literals are concatenated, so the first one must end with a
        # space to keep the message readable.
        pytest.xfail("rect copies fail on POCL + Nvidia, "
                     "at least the K40, as of pocl 1.6, 2021-01-20")

    if honor_skip and queue.device.platform.name == "Apple":
        pytest.xfail("Apple's CL implementation crashes on this.")

    ary_in_shp = 256, 128  # Entire array shape from which sub-array copied to device
    sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19  # Origin of sub-array copy from device to host-array
    ary_out_shp = 512, 256  # Entire host-array shape copy sub-array device->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200  # shape of device buffer

    buf_out_origin = 31, 17  # Origin of 2nd device buffer
    buf_out_shp = 300, 400  # shape of 2nd device buffer

    # Create host array of random values.
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    h_ary_in = np.array(
        np.random.randint(0, 256, np.prod(ary_in_shp)),
        dtype=np.uint8,
    ).reshape(ary_in_shp)

    # Create device buffers
    d_in_buf = cl.Buffer(ctx,
                         cl.mem_flags.READ_ONLY,
                         size=np.prod(buf_in_shp))
    d_out_buf = cl.Buffer(ctx,
                          cl.mem_flags.READ_ONLY,
                          size=np.prod(buf_out_shp))

    # Origins and regions are given in array index order above and reversed
    # ([::-1]) when passed to enqueue_copy.

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(queue,
                    d_in_buf,
                    h_ary_in,
                    buffer_origin=buf_in_origin[::-1],
                    host_origin=ary_in_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_in_shp[-1], ),
                    host_pitches=(ary_in_shp[-1], ))
    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(queue,
                    d_out_buf,
                    d_in_buf,
                    src_origin=buf_in_origin[::-1],
                    dst_origin=buf_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    src_pitches=(buf_in_shp[-1], ),
                    dst_pitches=(buf_out_shp[-1], ))

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(queue,
                    h_ary_out,
                    d_out_buf,
                    buffer_origin=buf_out_origin[::-1],
                    host_origin=ary_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_out_shp[-1], ),
                    host_pitches=(ary_out_shp[-1], ))
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.all(h_ary_in[ary_in_slice] == h_ary_out[ary_out_slice])

예제 #13

0

파일 보기

def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True):
    """
    Test 3D sub-array (slice) copy.

    A random ``uint8`` host array is created, a box-shaped sub-array of it is
    copied host -> device buffer -> second device buffer -> host, and the
    region received on the host is compared with the region that was sent.

    :arg honor_skip: when true, skip on pocl <= 0.13, whose rectangular
        copies crash.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    ary_in_shp = 256, 128, 31  # array shape from which sub-array copied to device
    sub_ary_shp = 128, 96, 20  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13, 7  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19, 14  # Origin of sub-array copy from device to host-array
    ary_out_shp = 192, 256, 128  # Entire host-array shape copy sub-array dev->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3, 6  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200, 30  # shape of device buffer

    buf_out_origin = 31, 17, 3  # Origin of 2nd device buffer
    buf_out_shp = 300, 400, 40  # shape of 2nd device buffer

    # Create host array of random values.
    # np.prod replaces np.product, which was removed in NumPy 2.0.
    h_ary_in = np.array(
        np.random.randint(0, 256, np.prod(ary_in_shp)),
        dtype=np.uint8,
    ).reshape(ary_in_shp)

    # Create device buffers
    d_in_buf = cl.Buffer(ctx,
                         cl.mem_flags.READ_ONLY,
                         size=np.prod(buf_in_shp))
    d_out_buf = cl.Buffer(ctx,
                          cl.mem_flags.READ_ONLY,
                          size=np.prod(buf_out_shp))

    # Origins and regions are given in array index order above and reversed
    # ([::-1]) when passed to enqueue_copy.  Each pitch pair is (length of the
    # last axis, product of the last two axes).

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(queue,
                    d_in_buf,
                    h_ary_in,
                    buffer_origin=buf_in_origin[::-1],
                    host_origin=ary_in_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_in_shp[-1],
                                    buf_in_shp[-1] * buf_in_shp[-2]),
                    host_pitches=(ary_in_shp[-1],
                                  ary_in_shp[-1] * ary_in_shp[-2]))
    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(queue,
                    d_out_buf,
                    d_in_buf,
                    src_origin=buf_in_origin[::-1],
                    dst_origin=buf_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    src_pitches=(buf_in_shp[-1],
                                 buf_in_shp[-1] * buf_in_shp[-2]),
                    dst_pitches=(buf_out_shp[-1],
                                 buf_out_shp[-1] * buf_out_shp[-2]))

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(queue,
                    h_ary_out,
                    d_out_buf,
                    buffer_origin=buf_out_origin[::-1],
                    host_origin=ary_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_out_shp[-1],
                                    buf_out_shp[-1] * buf_out_shp[-2]),
                    host_pitches=(ary_out_shp[-1],
                                  ary_out_shp[-1] * ary_out_shp[-2]))
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.array_equal(h_ary_in[ary_in_slice], h_ary_out[ary_out_slice])