def test_virtual_sizes_error_propagated(mock_backend_pycuda):
    # Testing for PyCUDA backend only since mocked PyOpenCL backend does not have a way
    # to set maximum global sizes (PyOpenCL devices don't have a corresponding parameter),
    # and PyCUDA is enough to test the required code path.
    device_info = PyCUDADeviceInfo(
        max_threads_per_block=2**4,
        max_block_dim_x=2**4,
        max_block_dim_y=2**4,
        max_block_dim_z=2**4,
        max_grid_dim_x=2**10,
        max_grid_dim_y=2**10,
        max_grid_dim_z=2**8)

    mock_backend_pycuda.add_devices([device_info])
    api = API.from_api_id(mock_backend_pycuda.api_id)
    device = api.platforms[0].devices[0]
    context = Context.from_devices([device])

    kernel = MockKernel('test', [None], max_total_local_sizes={0: 16})
    src = MockDefTemplate(kernels=[kernel])

    # Just enough to fit in the grid limits
    multiply = StaticKernel(context.device, src, 'test', (2**14, 2**10, 2**8), (2**4, 1, 1))

    # Global size is too large to fit on the device,
    # so virtual size finding fails and the error is propagated to the user.
    with pytest.raises(
            VirtualSizeError, match="Bounding global size \\(16384, 2048, 256\\) is too large"):
        multiply = StaticKernel(context.device, src, 'test', (2**14, 2**11, 2**8), (2**4, 1, 1))
def test_compile_static_multi_device(mock_or_real_multi_device_context):
    context, mocked = mock_or_real_multi_device_context

    if mocked:
        kernel = MockKernel(
            'multiply', [None, None, None], max_total_local_sizes={0: 1024, 1: 512})
        src = MockDefTemplate(kernels=[kernel])
    else:
        src = SRC

    a = numpy.arange(22).astype(numpy.int32)
    b = numpy.arange(15).astype(numpy.int32)
    ref = numpy.outer(a, b)

    mqueue = MultiQueue.on_devices(context.devices[[0, 1]])

    a_dev = MultiArray.from_host(mqueue, a)
    b_dev = MultiArray.from_host(mqueue, b, splay=MultiArray.CloneSplay())
    res_dev = MultiArray.empty(mqueue.devices, (22, 15), ref.dtype)

    multiply = StaticKernel(mqueue.devices, src, 'multiply', res_dev.shapes)
    multiply(mqueue, res_dev, a_dev, b_dev)

    res = res_dev.get(mqueue)

    if not mocked:
        assert (res == ref).all()
def test_compile_static(mock_or_real_context):
    context, mocked = mock_or_real_context

    if mocked:
        kernel = MockKernel('multiply', [None, None, None], max_total_local_sizes={0: 1024})
        src = MockDefTemplate(kernels=[kernel])
    else:
        src = SRC

    a = numpy.arange(11).astype(numpy.int32)
    b = numpy.arange(15).astype(numpy.int32)
    ref = numpy.outer(a, b)

    queue = Queue(context.device)

    a_dev = Array.from_host(queue, a)
    b_dev = Array.from_host(queue, b)
    res_dev = Array.empty(context.device, (11, 15), numpy.int32)

    multiply = StaticKernel(context.device, src, 'multiply', (11, 15))
    multiply(queue, res_dev, a_dev, b_dev)

    res = res_dev.get(queue)

    if not mocked:
        assert (res == ref).all()
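# For reference, a sketch of the kind of Mako-templated kernel source the non-mocked branch above
# compiles. This is illustrative only: the actual SRC constant is defined elsewhere in this module,
# and the specific ${static.*} render globals used here are assumptions about the static kernel API
# (the tests below only confirm that a 'static' render global namespace exists).
_EXAMPLE_STATIC_SRC = """
KERNEL void multiply(GLOBAL_MEM int *dest, GLOBAL_MEM int *a, GLOBAL_MEM int *b)
{
    ${static.begin};
    const int i = ${static.global_id}(0);
    const int j = ${static.global_id}(1);
    const int idx = ${static.global_flat_id}();
    dest[idx] = a[i] * b[j];
}
"""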
def test_builtin_globals(mock_backend_pycuda):
    mock_backend_pycuda.add_devices([
        PyCUDADeviceInfo(max_threads_per_block=1024),
        PyCUDADeviceInfo(max_threads_per_block=512)])

    source_template = DefTemplate.from_string(
        'mock_source', [],
        """
        KERNEL void test()
        {
            int max_total_local_size = ${device_params.max_total_local_size};
        }
        """)

    api = API.from_api_id(mock_backend_pycuda.api_id)
    context = Context.from_devices([api.platforms[0].devices[0], api.platforms[0].devices[1]])

    src = MockDefTemplate(
        kernels=[MockKernel('test', [None], max_total_local_sizes={0: 1024, 1: 512})],
        source_template=source_template)

    kernel = StaticKernel(context.devices, src, 'test', (1024,))

    assert 'max_total_local_size = 1024' in kernel.sources[context.devices[0]].source
    assert 'max_total_local_size = 512' in kernel.sources[context.devices[1]].source
def test_zero_max_total_local_size(mock_context):
    kernel = MockKernel('test', [None], max_total_local_sizes={0: 0})
    src = MockDefTemplate(kernels=[kernel])

    with pytest.raises(
            VirtualSizeError,
            match="The kernel requires too much resourses to be executed with any local size"):
        multiply = StaticKernel(mock_context.device, src, 'test', (1024,))
def test_set_constant_array_errors(mock_4_device_context):
    context = mock_4_device_context

    api = API.from_api_id(mock_4_device_context.api.id)
    other_context = Context.from_criteria(api)
    other_queue = Queue(other_context.devices[0])

    # Contexts don't know about each other and can't interact with the stack in a consistent
    # manner, so we deactivate the other context if we're on the CUDA API.
    if api.id == cuda_api_id():
        other_context.deactivate()

    cm1 = numpy.arange(16).astype(numpy.int32)
    src = MockDefTemplate(
        kernels=[
            MockKernel(
                'kernel', [],
                max_total_local_sizes={0: 1024, 1: 1024, 2: 1024, 3: 1024})],
        constant_mem={'cm1': cm1.size * cm1.dtype.itemsize})

    queue = Queue(context.devices[0])

    if context.api.id == cuda_api_id():
        program = Program(context.devices, src, constant_arrays=dict(cm1=cm1))

        with pytest.raises(
                ValueError,
                match="The provided queue must belong to the same context as this program uses"):
            program.set_constant_array(other_queue, 'cm1', cm1)

        with pytest.raises(TypeError, match="Unsupported array type"):
            program.set_constant_array(queue, 'cm1', [1])

        with pytest.raises(ValueError, match="Incorrect size of the constant buffer;"):
            program.set_constant_array(queue, 'cm1', cm1[:8])

        with pytest.raises(TypeError, match="Unknown constant array metadata type"):
            program = Program(context.devices[[0, 1, 2]], src, constant_arrays=dict(cm1=1))

        program = Program(context.devices[[0, 1, 2]], src, constant_arrays=dict(cm1=cm1))
        queue3 = Queue(context.devices[3])

        with pytest.raises(
                ValueError,
                match="The program was not compiled for the device this queue uses"):
            program.set_constant_array(queue3, 'cm1', cm1)

    else:
        with pytest.raises(
                ValueError,
                match="Compile-time constant arrays are only supported by CUDA API"):
            program = Program(context.devices, src, constant_arrays=dict(cm1=cm1))

        program = Program(context.devices, src)
        with pytest.raises(
                ValueError, match="Constant arrays are only supported for CUDA API"):
            program.set_constant_array(queue, 'cm1', cm1)

        with pytest.raises(
                ValueError,
                match="Compile-time constant arrays are only supported by CUDA API"):
            sk = StaticKernel(
                context.devices, src, 'kernel', 1024, constant_arrays=dict(cm1=cm1))

        sk = StaticKernel(context.devices, src, 'kernel', 1024)
        with pytest.raises(
                ValueError, match="Constant arrays are only supported for CUDA API"):
            sk.set_constant_array(queue, 'cm1', cm1)
def _test_constant_memory(context, mocked, is_static):

    cm1 = numpy.arange(16).astype(numpy.int32)
    cm2 = numpy.arange(16).astype(numpy.int32) * 2 + 1
    cm3 = numpy.arange(16).astype(numpy.int32) * 3 + 2

    if mocked:
        kernel = MockKernel(
            'copy_from_cm',
            [None] if context.api.id == cuda_api_id() else [None, None, None, None],
            max_total_local_sizes={0: 1024})
        src = MockDefTemplate(
            constant_mem={
                'cm1': cm1.size * cm1.dtype.itemsize,
                'cm2': cm2.size * cm2.dtype.itemsize,
                'cm3': cm3.size * cm3.dtype.itemsize},
            kernels=[kernel])
    else:
        src = SRC_CONSTANT_MEM_STATIC if is_static else SRC_CONSTANT_MEM

    queue = Queue(context.device)

    cm1_dev = Array.from_host(queue, cm1)
    cm2_dev = Array.from_host(queue, cm2)
    cm3_dev = Array.from_host(queue, cm3)
    res_dev = Array.empty(context.device, 16, numpy.int32)

    if context.api.id == cuda_api_id():
        # Use different forms of constant array representation
        constant_arrays = dict(
            cm1=cm1,  # as an array(-like) object
            cm2=(cm2.shape, cm2.dtype),  # as a tuple of shape and dtype
            cm3=cm3_dev)  # as a device array

        if is_static:
            copy_from_cm = StaticKernel(
                context.device, src, 'copy_from_cm',
                global_size=16, constant_arrays=constant_arrays)
            copy_from_cm.set_constant_array(queue, 'cm1', cm1_dev)  # setting from a device array
            copy_from_cm.set_constant_array(queue, 'cm2', cm2)  # setting from a host array
            copy_from_cm.set_constant_array(queue, 'cm3', cm3_dev.data)  # setting from a device buffer
        else:
            program = Program(context.device, src, constant_arrays=constant_arrays)
            program.set_constant_array(queue, 'cm1', cm1_dev)  # setting from a device array
            program.set_constant_array(queue, 'cm2', cm2)  # setting from a host array
            program.set_constant_array(queue, 'cm3', cm3_dev.data)  # setting from a device buffer
            copy_from_cm = lambda queue, *args: program.kernel.copy_from_cm(queue, 16, None, *args)

        copy_from_cm(queue, res_dev)
    else:
        if is_static:
            copy_from_cm = StaticKernel(context.device, src, 'copy_from_cm', global_size=16)
        else:
            program = Program(context.device, src)
            copy_from_cm = lambda queue, *args: program.kernel.copy_from_cm(queue, 16, None, *args)

        copy_from_cm(queue, res_dev, cm1_dev, cm2_dev, cm3_dev)

    res = res_dev.get(queue)
    if not mocked:
        assert (res == cm1 + cm2 + cm3).all()
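# A minimal sketch of how the helper above could be driven by a test. The mock_or_real_context
# fixture name is taken from the other tests in this module; the parametrization over is_static
# is an assumption, and the suite's actual wrapper (if any) may be organized differently.
@pytest.mark.parametrize('is_static', [False, True], ids=['Program', 'StaticKernel'])
def test_constant_memory(mock_or_real_context, is_static):
    context, mocked = mock_or_real_context
    _test_constant_memory(context, mocked, is_static)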
def test_find_local_size(mock_context):
    kernel = MockKernel('multiply', [None], max_total_local_sizes={0: 64})
    src = MockDefTemplate(kernels=[kernel])
    multiply = StaticKernel(mock_context.device, src, 'multiply', (11, 15))

    assert multiply._vs_metadata[mock_context.devices[0]].real_global_size == (16, 12)
    assert multiply._vs_metadata[mock_context.devices[0]].real_local_size == (16, 4)
def test_reserved_names(mock_context):
    kernel = MockKernel('test', [None])
    src = MockDefTemplate(kernels=[kernel])
    with pytest.raises(ValueError, match="The global name 'static' is reserved in static kernels"):
        multiply = StaticKernel(
            mock_context.device, src, 'test', (1024,), render_globals=dict(static=1))