def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` with an index array against NumPy.

    A random 1D device array of length *size* and type *dtype* is sorted
    together with an ``arange`` index array.  The sorted values must match
    ``np.sort`` and gathering the input through the permuted indices must
    match gathering it through ``np.argsort``.

    The test is xfailed/skipped on the platform combinations listed below.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Use the context supplied by the test harness instead of
    # cl.create_some_context(), so the test runs on the selected device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = queue.device
    on_pocl = dev.platform.name == "Portable Computing Language"

    if dev.platform.vendor == "The pocl project" \
            and dev.type & cl.device_type.GPU:
        pytest.xfail("bitonic argsort fails on POCL + Nvidia,"
                "at least the K40, as of pocl 1.6, 2021-01-20")

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (on_pocl
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if dev.platform.name == "Intel(R) OpenCL" and size == 0:
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        ms.finish()
        index.finish()
    ms, _evt = sorterm(ms, idx=index, axis=0)

    # The sorter works on the copy, so a single download of the unsorted
    # input is enough for all comparisons.
    m_host = m.get()

    assert np.array_equal(np.sort(m_host), ms.get())

    # A direct comparison of np.argsort(m_host) with index.get() may be
    # False because of identical values in the array, so compare the
    # values selected by the indices instead.
    assert np.array_equal(m_host[np.argsort(m_host)], m_host[index.get()])
def test_bitonic_sort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` along axis 1 of a 3D array against NumPy.

    A random device array of shape ``(2, size, 3)`` and type *dtype* is
    copied, the copy is sorted along its middle axis, and the result must
    equal ``np.sort(..., axis=1)`` of the untouched original.
    """
    # Use the context supplied by the test harness instead of
    # cl.create_some_context(), so the test runs on the selected device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    on_pocl = dev.platform.name == "Portable Computing Language"

    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (on_pocl
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)
    sgs = s.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if on_pocl and cl.get_cl_header_version() < (1, 2):
        sgs.finish()

    sorter = BitonicSort(ctx)
    sgs, _evt = sorter(sgs, axis=1)

    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
def test_bitonic_sort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` along axis 1 of a 3D array against NumPy.

    Sorts a copy of a random ``(2, size, 3)`` device array of type *dtype*
    along its middle axis and compares with ``np.sort`` of the original.
    """
    # Use the context supplied by the test harness instead of
    # cl.create_some_context(), so the test runs on the selected device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    wants_double = dtype == np.float64

    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and wants_double
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if wants_double and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3), dtype,
                      luxury=None, a=0, b=239482333)

    sorter = BitonicSort(ctx)
    # Sort a copy so that ``s`` stays available as the reference input.
    sgs, _evt = sorter(s.copy(), axis=1)

    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
def test_coarse_grain_svm(ctx_factory):
    """Exercise a coarse-grain SVM allocation end to end.

    The allocation is zero-filled, mapped read/write and set to 17 on the
    host, doubled by a kernel, and then checked through a read-only
    mapping and (except on pocl) through ``enqueue_copy`` into a host array.

    Skipped when SVM is unavailable; xfailed on AMD CPU devices.
    """
    import sys

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    dev = ctx.devices[0]
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    svm_available = (
        ctx._get_cl_version() >= (2, 0)
        and ctx.devices[0]._get_cl_version() >= (2, 0)
        and cl.get_cl_header_version() >= (2, 0))

    # On pocl the decision is based on the pocl release, not on the
    # reported CL versions.
    if dev.platform.name == "Portable Computing Language":
        svm_available = (
            get_pocl_version(dev.platform) >= (1, 0)
            and cl.get_cl_header_version() >= (2, 0))

    if not svm_available:
        pytest.skip("SVM only available in OpenCL 2.0 and higher")

    if "AMD" in dev.platform.name and dev.type & cl.device_type.CPU:
        pytest.xfail("AMD CPU doesn't do coarse-grain SVM")

    n = 3000
    svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), np.float32, alignment=64))
    if not running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(svm_ary.mem.base, cl.SVMAllocation)

    zero = np.zeros((), svm_ary.mem.dtype)
    cl.enqueue_svm_memfill(queue, svm_ary, zero)

    with svm_ary.map_rw(queue) as ary:
        ary.fill(17)
        orig_ary = ary.copy()

    kernel_src = """
        __kernel void twice(__global float *a_g)
        {
          a_g[get_global_id(0)] *= 2;
        }
        """
    prg = cl.Program(ctx, kernel_src).build()

    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)

    with svm_ary.map_ro(queue) as ary:
        print(ary)
        assert np.array_equal(orig_ary * 2, ary)

    # Pre-fill with a sentinel so a copy that does nothing is detected.
    new_ary = np.full_like(orig_ary, -1)

    if ctx.devices[0].platform.name != "Portable Computing Language":
        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
        # in pocl up to and including 1.0rc1.
        cl.enqueue_copy(queue, new_ary, svm_ary)
        assert np.array_equal(orig_ary * 2, new_ary)
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` with an index array against NumPy.

    Sorts a copy of a random 1D device array of length *size* and type
    *dtype* while permuting an ``arange`` index array alongside it.  The
    sorted values must equal ``np.sort`` of the input, and the input
    gathered through the permuted indices must equal the input gathered
    through ``np.argsort``.
    """
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    # Use the context supplied by the test harness instead of
    # cl.create_some_context(), so the test runs on the selected device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    on_pocl = dev.platform.name == "Portable Computing Language"

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    if (on_pocl
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    # Sort a copy so that ``m`` stays available as the reference input.
    ms, _evt = sorterm(m.copy(), idx=index, axis=0)

    # One download of the unsorted input serves every comparison below.
    m_host = m.get()

    assert np.array_equal(np.sort(m_host), ms.get())

    # Comparing np.argsort(m_host) with index.get() directly may be False
    # because of identical values in the array; compare the values selected
    # by the indices instead.
    assert np.array_equal(m_host[np.argsort(m_host)], m_host[index.get()])
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` with an index array against NumPy.

    A random 1D device array (*size* entries of type *dtype*) is sorted
    with an accompanying ``arange`` index array.  Both the sorted values
    and the values addressed by the permuted indices are compared with
    their NumPy counterparts.
    """
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # Use the context supplied by the test harness instead of
    # cl.create_some_context(), so the test runs on the selected device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name

    if (platform_name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (platform_name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    # Sort a copy so that ``m`` stays available as the reference input.
    ms, _evt = sorterm(m.copy(), idx=index, axis=0)

    # Download the unsorted input once and reuse it for all checks.
    m_host = m.get()

    assert np.array_equal(np.sort(m_host), ms.get())

    # np.argsort(m_host) need not equal index.get() when the array holds
    # identical values, so check the values selected by the indices.
    assert np.array_equal(m_host[np.argsort(m_host)], m_host[index.get()])
def _get_max_parameter_size(dev):
    """Return *dev*'s maximum kernel parameter size, corrected for pocl.

    The value of ``dev.max_parameter_size`` is returned unchanged unless
    the platform is detected as pocl older than 3.0 and the reported limit
    is exactly 1024, in which case a device-type specific replacement is
    returned for CPU and GPU devices.
    """
    from pyopencl.characterize import get_pocl_version

    reported = dev.max_parameter_size
    version = get_pocl_version(dev.platform, fallback_value=(1, 8))

    # Current pocl versions (as of 04/2022) have an incorrect parameter
    # size limit of 1024; see e.g. https://github.com/pocl/pocl/pull/1046
    needs_workaround = (
        version is not None
        and version < (3, 0)
        and reported == 1024)
    if not needs_workaround:
        return reported

    if dev.type & cl.device_type.CPU:
        return 1024 * 1024

    if dev.type & cl.device_type.GPU:
        # All modern Nvidia GPUs (starting from Compute Capability 2)
        # have this limit
        return 4352

    return reported
def test_bitonic_sort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` along axis 1 of a 3D array against NumPy.

    A copy of a random ``(2, size, 3)`` device array of type *dtype* is
    sorted along its middle axis; the result must equal
    ``np.sort(..., axis=1)`` applied to the original data.
    """
    # Use the context supplied by the test harness instead of
    # cl.create_some_context(), so the test runs on the selected device.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name

    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (platform_name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    s = clrandom.rand(queue, (2, size, 3,), dtype, luxury=None, a=0, b=239482333)

    sorter = BitonicSort(ctx)
    # Sort a copy so that ``s`` stays available as the reference input.
    sgs, _evt = sorter(s.copy(), axis=1)

    assert np.array_equal(np.sort(s.get(), axis=1), sgs.get())
def _skip_if_pocl(plat, up_to_version, msg='unsupported by pocl'):
    """Skip the running test when *plat* is an affected pocl platform.

    :arg plat: platform object; its ``vendor`` attribute is compared with
        ``"The pocl project"``.
    :arg up_to_version: newest pocl version (inclusive) that is affected,
        or *None* to skip on every pocl version.
    :arg msg: reason passed to ``pytest.skip``.
    """
    if plat.vendor != "The pocl project":
        return

    # The version is only queried when an upper bound was given.
    if up_to_version is None or get_pocl_version(plat) <= up_to_version:
        pytest.skip(msg)
def _xfail_if_pocl(plat, up_to_version, msg="unsupported by pocl"):
    """Mark the running test as xfail when *plat* is an affected pocl platform.

    :arg plat: platform object; its ``vendor`` attribute is compared with
        ``"The pocl project"``.
    :arg up_to_version: newest pocl version (inclusive) that is affected,
        or *None* to xfail on every pocl version.
    :arg msg: reason passed to ``pytest.xfail``.
    """
    if plat.vendor != "The pocl project":
        return

    # The version is only queried when an upper bound was given.
    if up_to_version is None or get_pocl_version(plat) <= up_to_version:
        pytest.xfail(msg)
def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
    """
    Test 2D sub-array (slice) copy.

    A rectangular region of a random ``uint8`` host array is copied
    host -> device buffer -> second device buffer -> host array, each time
    at a different origin, and the region read back must equal the region
    originally written.

    :arg honor_skip: if true, skip on pocl versions up to 0.13.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    ary_in_shp = 256, 128  # Entire array shape from which sub-array copied to device
    sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19  # Origin of sub-array copy from device to host-array
    ary_out_shp = 512, 256  # Entire host-array shape copy sub-array device->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200  # shape of device buffer

    buf_out_origin = 31, 17  # Origin of 2nd device buffer
    buf_out_shp = 300, 400  # shape of 2nd device buffer

    # Create host array of random values.
    # (np.prod replaces np.product, which was removed in NumPy 2.0.)
    h_ary_in = np.array(
        np.random.randint(0, 256, np.prod(ary_in_shp)),
        dtype=np.uint8
    ).reshape(ary_in_shp)

    # Create device buffers (one byte per uint8 element)
    d_in_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_in_shp))
    d_out_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_out_shp))

    # Origins and regions are given in NumPy axis order above and passed
    # reversed ([::-1]) to enqueue_copy; pitches are the last-axis lengths.

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(
        queue,
        d_in_buf,
        h_ary_in,
        buffer_origin=buf_in_origin[::-1],
        host_origin=ary_in_origin[::-1],
        region=sub_ary_shp[::-1],
        buffer_pitches=(buf_in_shp[-1],),
        host_pitches=(ary_in_shp[-1],)
    )

    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(
        queue,
        d_out_buf,
        d_in_buf,
        src_origin=buf_in_origin[::-1],
        dst_origin=buf_out_origin[::-1],
        region=sub_ary_shp[::-1],
        src_pitches=(buf_in_shp[-1],),
        dst_pitches=(buf_out_shp[-1],)
    )

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(
        queue,
        h_ary_out,
        d_out_buf,
        buffer_origin=buf_out_origin[::-1],
        host_origin=ary_out_origin[::-1],
        region=sub_ary_shp[::-1],
        buffer_pitches=(buf_out_shp[-1],),
        host_pitches=(ary_out_shp[-1],)
    )
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.all(h_ary_in[ary_in_slice] == h_ary_out[ary_out_slice])
def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True):
    """
    Test 2D sub-array (slice) copy.

    A rectangular region of a random ``uint8`` host array is copied
    host -> device buffer -> second device buffer -> host array, each time
    at a different origin, and the region read back must equal the region
    originally written.

    :arg honor_skip: if true, skip on pocl versions up to 0.13 and xfail
        on the Apple platform.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("rect copies fail on POCL + Nvidia,"
                "at least the K40, as of pocl 1.6, 2021-01-20")

    if honor_skip and queue.device.platform.name == "Apple":
        pytest.xfail("Apple's CL implementation crashes on this.")

    ary_in_shp = 256, 128  # Entire array shape from which sub-array copied to device
    sub_ary_shp = 128, 96  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19  # Origin of sub-array copy from device to host-array
    ary_out_shp = 512, 256  # Entire host-array shape copy sub-array device->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200  # shape of device buffer

    buf_out_origin = 31, 17  # Origin of 2nd device buffer
    buf_out_shp = 300, 400  # shape of 2nd device buffer

    # Create host array of random values.
    # (np.prod replaces np.product, which was removed in NumPy 2.0.)
    h_ary_in = np.array(
        np.random.randint(0, 256, np.prod(ary_in_shp)),
        dtype=np.uint8
    ).reshape(ary_in_shp)

    # Create device buffers (one byte per uint8 element)
    d_in_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_in_shp))
    d_out_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_out_shp))

    # Origins and regions are given in NumPy axis order above and passed
    # reversed ([::-1]) to enqueue_copy; pitches are the last-axis lengths.

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(queue, d_in_buf, h_ary_in,
                    buffer_origin=buf_in_origin[::-1],
                    host_origin=ary_in_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_in_shp[-1],),
                    host_pitches=(ary_in_shp[-1],))

    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(queue, d_out_buf, d_in_buf,
                    src_origin=buf_in_origin[::-1],
                    dst_origin=buf_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    src_pitches=(buf_in_shp[-1],),
                    dst_pitches=(buf_out_shp[-1],))

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(queue, h_ary_out, d_out_buf,
                    buffer_origin=buf_out_origin[::-1],
                    host_origin=ary_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_out_shp[-1],),
                    host_pitches=(ary_out_shp[-1],))
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.all(h_ary_in[ary_in_slice] == h_ary_out[ary_out_slice])
def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True):
    """
    Test 3D sub-array (slice) copy.

    A box-shaped region of a random ``uint8`` host array is copied
    host -> device buffer -> second device buffer -> host array, each time
    at a different origin, and the region read back must equal the region
    originally written.

    :arg honor_skip: if true, skip on pocl versions up to 0.13.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (honor_skip
            and ctx.devices[0].platform.name == "Portable Computing Language"
            and get_pocl_version(ctx.devices[0].platform) <= (0, 13)):
        # https://github.com/pocl/pocl/issues/353
        pytest.skip("POCL's rectangular copies crash")

    ary_in_shp = 256, 128, 31  # array shape from which sub-array copied to device
    sub_ary_shp = 128, 96, 20  # Sub-array shape to be copied to device
    ary_in_origin = 20, 13, 7  # Sub-array origin
    ary_in_slice = generate_slice(ary_in_origin, sub_ary_shp)

    ary_out_origin = 11, 19, 14  # Origin of sub-array copy from device to host-array
    ary_out_shp = 192, 256, 128  # Entire host-array shape copy sub-array dev->host
    ary_out_slice = generate_slice(ary_out_origin, sub_ary_shp)

    buf_in_origin = 7, 3, 6  # Origin of sub-array in device buffer
    buf_in_shp = 300, 200, 30  # shape of device buffer

    buf_out_origin = 31, 17, 3  # Origin of 2nd device buffer
    buf_out_shp = 300, 400, 40  # shape of 2nd device buffer

    # Create host array of random values.
    # (np.prod replaces np.product, which was removed in NumPy 2.0.)
    h_ary_in = np.array(
        np.random.randint(0, 256, np.prod(ary_in_shp)),
        dtype=np.uint8
    ).reshape(ary_in_shp)

    # Create device buffers (one byte per uint8 element)
    d_in_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_in_shp))
    d_out_buf = cl.Buffer(ctx, cl.mem_flags.READ_ONLY, size=np.prod(buf_out_shp))

    # Origins and regions are given in NumPy axis order above and passed
    # reversed ([::-1]) to enqueue_copy.  Each pitch pair is (last-axis
    # length, product of the last two axis lengths).

    # Copy sub-array (rectangular buffer) from host to device
    cl.enqueue_copy(queue, d_in_buf, h_ary_in,
                    buffer_origin=buf_in_origin[::-1],
                    host_origin=ary_in_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_in_shp[-1],
                                    buf_in_shp[-1] * buf_in_shp[-2]),
                    host_pitches=(ary_in_shp[-1],
                                  ary_in_shp[-1] * ary_in_shp[-2]))

    # Copy sub-array (rectangular buffer) from device-buffer to device-buffer
    cl.enqueue_copy(queue, d_out_buf, d_in_buf,
                    src_origin=buf_in_origin[::-1],
                    dst_origin=buf_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    src_pitches=(buf_in_shp[-1],
                                 buf_in_shp[-1] * buf_in_shp[-2]),
                    dst_pitches=(buf_out_shp[-1],
                                 buf_out_shp[-1] * buf_out_shp[-2]))

    # Create zero-initialised array to receive sub-array from device
    h_ary_out = np.zeros(ary_out_shp, dtype=h_ary_in.dtype)

    # Copy sub-array (rectangular buffer) from device to host-array.
    cl.enqueue_copy(queue, h_ary_out, d_out_buf,
                    buffer_origin=buf_out_origin[::-1],
                    host_origin=ary_out_origin[::-1],
                    region=sub_ary_shp[::-1],
                    buffer_pitches=(buf_out_shp[-1],
                                    buf_out_shp[-1] * buf_out_shp[-2]),
                    host_pitches=(ary_out_shp[-1],
                                  ary_out_shp[-1] * ary_out_shp[-2]))
    queue.finish()

    # Check that the sub-array copied to device is
    # the same as the sub-array received from device.
    assert np.array_equal(h_ary_in[ary_in_slice], h_ary_out[ary_out_slice])