def test_get_async(ctx_factory):
    """Check that asynchronous device-to-host reads wait for pending events.

    Three read-back paths are exercised on the same device array, and each
    result is compared against the host-side reference ``b``:

    1. the deprecated ``get(async_=True)`` followed by ``finish()``,
    2. ``get_async()`` followed by waiting on the returned event,
    3. ``get_async()`` while an unfinished user event is attached to the
       array; the user event is completed afterwards and the transfer must
       neither hang nor return wrong data.

    :arg ctx_factory: callable returning the OpenCL context to test with.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        pytest.xfail("the async get test fails on POCL + Nvidia, "
                "at least the K40, as of pocl 1.6, 2021-01-20")

    a = np.random.rand(10**6).astype(np.dtype("float32"))
    a_gpu = cl_array.to_device(queue, a)

    # Host reference and the same expression evaluated on the device.
    b = a + a**5 + 1
    b_gpu = a_gpu + a_gpu**5 + 1

    # deprecated, but still test
    b1 = b_gpu.get(async_=True)  # testing that this waits for events
    b_gpu.finish()
    assert np.abs(b1 - b).mean() < 1e-5

    b1, evt = b_gpu.get_async()  # testing that this waits for events
    evt.wait()
    assert np.abs(b1 - b).mean() < 1e-5

    wait_event = cl.UserEvent(context)
    b_gpu.add_event(wait_event)
    # Bind the result to a fresh name: reusing ``b`` here would overwrite the
    # host reference and reduce the final check to comparing two device
    # read-backs with each other.
    b2, evt = b_gpu.get_async()  # testing that this doesn't hang
    wait_event.set_status(cl.command_execution_status.COMPLETE)
    evt.wait()
    assert np.abs(b2 - b).mean() < 1e-5
def test_event_set_callback(ctx_factory):
    """Verify that a callback registered via ``Event.set_callback`` is invoked.

    A summation kernel is enqueued so that it waits on a user event. A
    callback for the COMPLETE status is attached to the kernel's event, the
    user event is then completed and the queue finished. Because the callback
    may not have run yet at that point, the test polls for it (at most 500
    polls, 10 ms apart) before asserting.

    :arg ctx_factory: callable returning the OpenCL context to test with.
    """
    import sys
    from time import sleep

    if sys.platform.startswith("win"):
        pytest.xfail("Event.set_callback not present on Windows")

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    _xfail_if_pocl_gpu(queue.device, "event callbacks")

    if ctx._get_cl_version() < (1, 1):
        pytest.skip("OpenCL 1.1 or newer required for set_callback")

    lhs_host = np.random.rand(50000).astype(np.float32)
    rhs_host = np.random.rand(50000).astype(np.float32)

    # Statuses the callback has been invoked with; non-empty means "called".
    seen_statuses = []

    def record_status(status):
        seen_statuses.append(status)

    read_and_copy = cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR
    lhs_dev = cl.Buffer(ctx, read_and_copy, hostbuf=lhs_host)
    rhs_dev = cl.Buffer(ctx, read_and_copy, hostbuf=rhs_host)

    kernel_source = (
        " __kernel void sum(__global const float *a_g, "
        "__global const float *b_g, __global float *res_g) "
        "{ int gid = get_global_id(0); "
        "res_g[gid] = a_g[gid] + b_g[gid]; } "
    )
    program = cl.Program(ctx, kernel_source).build()

    out_dev = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, lhs_host.nbytes)

    # The kernel is held back by a user event until the callback is attached.
    gate = cl.UserEvent(ctx)
    kernel_evt = program.sum(
        queue, lhs_host.shape, None, lhs_dev, rhs_dev, out_dev,
        wait_for=[gate])
    kernel_evt.set_callback(cl.command_execution_status.COMPLETE, record_status)
    gate.set_status(cl.command_execution_status.COMPLETE)

    queue.finish()

    # yuck: poll for the callback, waiting up to five seconds in total
    for _ in range(500):
        if seen_statuses:
            break
        sleep(0.01)

    assert seen_statuses
def test_user_event(ctx_factory):
    """Check that waiting on a ``cl.UserEvent`` blocks until it is completed.

    Two ways of waiting are tested, each in a background thread:
    ``Event.wait()`` and ``cl.wait_for_events([...])``. For each one the test
    verifies that the waiter has not returned before ``set_status`` is called,
    that it does return afterwards, and that the event then reports the
    COMPLETE status.

    :arg ctx_factory: callable returning the OpenCL context to test with.
    :raises RuntimeError: if a waiter returns before the event is completed,
        or does not return after it has been completed.
    """
    from threading import Thread
    from time import sleep

    ctx = ctx_factory()
    if (ctx._get_cl_version() < (1, 1)
            and cl.get_cl_header_version() < (1, 1)):
        pytest.skip("UserEvent is only available in OpenCL 1.1")

    if ctx.devices[0].platform.name == "Portable Computing Language":
        # https://github.com/pocl/pocl/issues/201
        pytest.xfail("POCL's user events don't work right")

    # Maps a waiter's key to True once that waiter has returned.
    status = {}

    def event_waiter1(e, key):
        e.wait()
        status[key] = True

    def event_waiter2(e, key):
        cl.wait_for_events([e])
        status[key] = True

    cases = (
        (event_waiter1, 1, "UserEvent.wait timeout"),
        (event_waiter2, 2, "cl.wait_for_events timeout on UserEvent"),
    )

    for waiter, key, timeout_message in cases:
        evt = cl.UserEvent(ctx)
        # Daemon thread: a waiter that never returns must not keep the
        # interpreter alive after the test has already failed.
        worker = Thread(target=waiter, args=(evt, key), daemon=True)
        worker.start()

        # Give the waiter a moment to (incorrectly) fall through.
        sleep(.05)
        if status.get(key, False):
            raise RuntimeError("UserEvent triggered before set_status")

        evt.set_status(cl.command_execution_status.COMPLETE)

        # Join with a generous timeout instead of sleeping a fixed 50 ms:
        # this returns as soon as the waiter finishes and does not fail
        # spuriously on a heavily loaded machine.
        worker.join(timeout=5)
        if not status.get(key, False):
            raise RuntimeError(timeout_message)

        assert (evt.command_execution_status
                == cl.command_execution_status.COMPLETE)
def test_user_event(ctx_factory):
    """Check that waiting on a ``cl.UserEvent`` blocks until it is completed.

    ``Event.wait()`` and ``cl.wait_for_events([...])`` are each run in a
    background thread. The waiter must still be blocked shortly before
    ``set_status`` is called, must return once the event has been completed,
    and the event must then report the COMPLETE status.

    :arg ctx_factory: callable returning the OpenCL context to test with.
    :raises RuntimeError: if a waiter returns before the event is completed,
        or does not return after it has been completed.
    """
    from threading import Thread
    from time import sleep

    ctx = ctx_factory()
    if (ctx._get_cl_version() < (1, 1)
            and cl.get_cl_header_version() < (1, 1)):
        from pytest import skip
        skip("UserEvent is only available in OpenCL 1.1")

    # Maps a waiter's key to True once that waiter has returned.
    status = {}

    def event_waiter1(e, key):
        e.wait()
        status[key] = True

    def event_waiter2(e, key):
        cl.wait_for_events([e])
        status[key] = True

    def check_waiter(waiter, key, timeout_message):
        # Run one waiter against a fresh user event and verify its timing.
        evt = cl.UserEvent(ctx)
        # Daemon thread: a waiter that never returns must not keep the
        # interpreter alive after the test has already failed.
        worker = Thread(target=waiter, args=(evt, key), daemon=True)
        worker.start()

        # Give the waiter a moment to (incorrectly) fall through.
        sleep(.05)
        if status.get(key, False):
            raise RuntimeError('UserEvent triggered before set_status')

        evt.set_status(cl.command_execution_status.COMPLETE)

        # Joining with a generous timeout replaces a fixed 50 ms sleep: it
        # returns as soon as the waiter is done and tolerates slow machines.
        worker.join(timeout=5)
        if not status.get(key, False):
            raise RuntimeError(timeout_message)

        assert (evt.command_execution_status
                == cl.command_execution_status.COMPLETE)

    check_waiter(event_waiter1, 1, 'UserEvent.wait timeout')
    check_waiter(event_waiter2, 2, 'cl.wait_for_events timeout on UserEvent')
def __call__(self,
             global_size: KernelGridType = None,
             local_size: KernelGridType = None,
             **kwargs: Union[TypesClArray, object]) -> cl.Event:
    """Invoke ``self.function`` with prepared arguments and return an event.

    :param global_size: global grid size, forwarded to
        ``self._prepare_arguments``.
    :param local_size: local grid size, forwarded to
        ``self._prepare_arguments``.
    :param kwargs: kernel arguments by name. An optional ``queue`` entry
        selects the command queue; without it the result of
        ``get_current_queue()`` is used.
    :return: a ``cl.UserEvent`` whose status has already been set to
        COMPLETE, created in the context of the first array argument.
    :raises IndexError: if none of the prepared arguments is an array type
        listed in ``TypesClArray``.
    """
    # e.g. if two kernels of a program shall run concurrently, this can be
    # enabled by passing another queue here.
    # The current queue is only looked up when no queue was passed, instead
    # of being evaluated eagerly as the default of ``kwargs.pop``.
    if 'queue' in kwargs:
        queue = kwargs.pop('queue')
    else:
        queue = get_current_queue()

    global_size, local_size, args = self._prepare_arguments(
        queue=queue,
        knl=self.kernel_model,
        global_size=global_size,
        local_size=local_size,
        **kwargs)

    # NOTE(review): presumably this runs the kernel body on the host and
    # returns once it is done, which is why the event below can be marked
    # COMPLETE right away; confirm against the definition of self.function.
    self.function(global_size, local_size, *args)

    # create user event with context retrieved from first arg of type Array
    first_array = next(
        (arg for arg in args if isinstance(arg, TypesClArray.__args__)),
        None)
    if first_array is None:
        # Same exception type as the former ``[...][0]`` lookup, but with a
        # message that says what is missing.
        raise IndexError(
            'cannot create completion event: no array argument to take '
            'the OpenCL context from')

    event = cl.UserEvent(first_array.context)
    event.set_status(cl.command_execution_status.COMPLETE)
    return event
def test_get_async(ctx_factory):
    """Check that asynchronous device-to-host reads wait for pending events.

    The same device array is read back three ways, and each result is
    compared against the host-side reference ``b``: the deprecated
    ``get(async_=True)`` plus ``finish()``, ``get_async()`` plus waiting on
    its event, and ``get_async()`` issued while an unfinished user event is
    attached to the array (the user event is completed afterwards, and the
    transfer must not hang).

    :arg ctx_factory: callable returning the OpenCL context to test with.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    a = np.random.rand(10**6).astype(np.dtype('float32'))
    a_gpu = cl_array.to_device(queue, a)

    # Host reference and the same expression evaluated on the device.
    b = a + a**5 + 1
    b_gpu = a_gpu + a_gpu**5 + 1

    # deprecated, but still test
    b1 = b_gpu.get(async_=True)  # testing that this waits for events
    b_gpu.finish()
    assert np.abs(b1 - b).mean() < 1e-5

    b1, evt = b_gpu.get_async()  # testing that this waits for events
    evt.wait()
    assert np.abs(b1 - b).mean() < 1e-5

    wait_event = cl.UserEvent(context)
    b_gpu.add_event(wait_event)
    # Keep the host reference ``b`` intact: binding the read-back to ``b``
    # would make the last assertion compare two device results with each
    # other instead of checking against the expected values.
    b2, evt = b_gpu.get_async()  # testing that this doesn't hang
    wait_event.set_status(cl.command_execution_status.COMPLETE)
    evt.wait()
    assert np.abs(b2 - b).mean() < 1e-5
except: print('Build log:') print(prog.get_build_info(dev, cl.program_build_info.LOG)) raise # Data v = np.arange(4, dtype=np.float32) print('Input: ' + str(v)) # Create output buffer v_buff = cl.Buffer(context, flags=cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=v) # Create user event user_event = cl.UserEvent(context) def read_complete(status, data): print('Output: ' + str(data)) # Enqueue kernel that waits for user event before executing global_size = (1, ) local_size = None # __call__(queue, global_size, local_size, *args, global_offset=None, wait_for=None, g_times_l=False) kernel_event = prog.user_event(queue, global_size, local_size, v_buff,