def axnpby(self, *arr):
    """Return a ComputeKernel applying the ``axnpby`` operation to *arr*.

    All matrices in *arr* must share identical traits; the scalar
    coefficients are supplied when the returned kernel is run.
    """
    ref_traits = arr[0].traits
    if any(m.traits != ref_traits for m in arr[1:]):
        raise ValueError('Incompatible matrix types')

    nv = len(arr)
    nrow, leaddim, leadsubdim, dtype = ref_traits

    # Render the kernel template for this operand count
    template = self.backend.lookup.get_template('axnpby')
    ksrc = template.render(nv=nv, alignb=self.backend.alignb, fpdtype=dtype)

    # Argument layout: element count, nv data pointers, nv scalar constants
    argtypes = [np.int32] + [np.intp]*nv + [dtype]*nv
    fn = self._build_kernel('axnpby', ksrc, argtypes)

    # Total number of elements in each matrix
    nelems = leaddim*nrow

    # Suitable global and local workgroup sizes for this element count
    gs, ls = splay(self.backend.qdflt, nelems)

    class AxnpbyKernel(ComputeKernel):
        def run(self, queue, *consts):
            params = [m.data for m in arr] + list(consts)
            fn(queue.cl_queue_comp, gs, ls, nelems, *params)

    return AxnpbyKernel()
def _fill(self, distribution, ary, scale, shift, queue=None):
    """Fill *ary* with random numbers.

    The generation kernel is chosen by :meth:`get_gen_kernel` from
    *ary*'s dtype and *distribution*; *scale* and *shift* are forwarded
    to that kernel as arguments.

    NOTE(review): an earlier docstring claimed uniformly distributed
    numbers in *(a, b)*, endpoints excluded -- that describes the
    uniform case only; confirm per-distribution behavior in the kernel.

    :return: a :class:`pyopencl.Event`
    """

    if queue is None:
        queue = ary.queue

    # Kernel plus per-sample counter/size multipliers for this
    # dtype/distribution pair.
    knl, counter_multiplier, size_multiplier = \
            self.get_gen_kernel(ary.dtype, distribution)

    args = self.key + self.counter + [
        ary.data, ary.size*size_multiplier,
        scale, shift]

    n = ary.size
    from pyopencl.array import splay
    gsize, lsize = splay(queue, ary.size)

    evt = knl(queue, gsize, lsize, *args)

    # Advance the counter by the number of counter values consumed,
    # propagating carries into the higher counter words.
    self.counter[0] += n * counter_multiplier
    c1_incr, self.counter[0] = divmod(self.counter[0], self.counter_max)
    if c1_incr:
        self.counter[1] += c1_incr
        c2_incr, self.counter[1] = divmod(self.counter[1], self.counter_max)
        self.counter[2] += c2_incr

    return evt
def axnpby(self, *arr):
    """Build a ComputeKernel that performs ``axnpby`` over *arr*.

    Every matrix must have identical traits; the scalar constants are
    provided at run time via :meth:`run`.
    """
    first = arr[0]
    if any(first.traits != other.traits for other in arr[1:]):
        raise ValueError('Incompatible matrix types')

    nv = len(arr)
    nrow, leaddim, leadsubdim, dtype = first.traits

    # Render the kernel template
    src = self.backend.lookup.get_template('axnpby').render(nv=nv)

    # Build the kernel; argument types: count, nv pointers, nv scalars
    kern = self._build_kernel(
        'axnpby', src, [np.int32] + [np.intp] * nv + [dtype] * nv)

    # Total element count in the matrices
    cnt = leaddim * nrow

    # Choose global and local workgroup sizes
    gs, ls = splay(self.backend.qdflt, cnt)

    class AxnpbyKernel(ComputeKernel):
        def run(self, queue, *consts):
            kargs = [m.data for m in arr] + list(consts)
            kern(queue.cl_queue_comp, gs, ls, cnt, *kargs)

    return AxnpbyKernel()
def __call__(self, *args, **kwargs):
    """Enqueue the elementwise kernel on *args*.

    Recognized keyword arguments: ``queue``, ``wait_for``, ``range``,
    ``slice`` and ``capture_as``; any other keyword raises
    :exc:`TypeError`. ``range`` and ``slice`` are mutually exclusive.

    :return: the :class:`pyopencl.Event` of the enqueued kernel.
    """
    # First array argument encountered; used to default queue/size.
    repr_vec = None

    range_ = kwargs.pop("range", None)
    slice_ = kwargs.pop("slice", None)
    capture_as = kwargs.pop("capture_as", None)

    use_range = range_ is not None or slice_ is not None
    kernel, arg_descrs = self.get_kernel(use_range)

    # {{{ assemble arg array

    invocation_args = []
    for arg, arg_descr in zip(args, arg_descrs):
        if isinstance(arg_descr, VectorArg):
            if not arg.flags.forc:
                raise RuntimeError("ElementwiseKernel cannot "
                        "deal with non-contiguous arrays")

            if repr_vec is None:
                repr_vec = arg

            invocation_args.append(arg.base_data)
            if arg_descr.with_offset:
                invocation_args.append(arg.offset)
        else:
            invocation_args.append(arg)

    # }}}

    queue = kwargs.pop("queue", None)
    wait_for = kwargs.pop("wait_for", None)

    if kwargs:
        raise TypeError("unknown keyword arguments: '%s'"
                % ", ".join(kwargs))

    # NOTE(review): if no array argument was passed and no queue given,
    # repr_vec is None and this raises AttributeError -- confirm callers
    # always supply at least one array or an explicit queue.
    if queue is None:
        queue = repr_vec.queue

    if slice_ is not None:
        if range_ is not None:
            raise TypeError("may not specify both range and slice "
                    "keyword arguments")

        # Normalize the slice against the representative array's size.
        range_ = slice(*slice_.indices(repr_vec.size))

    max_wg_size = kernel.get_work_group_info(
            cl.kernel_work_group_info.WORK_GROUP_SIZE,
            queue.device)

    if range_ is not None:
        # Ranged invocation: pass start/stop/step as extra kernel args.
        start = range_.start
        if start is None:
            start = 0
        invocation_args.append(start)
        invocation_args.append(range_.stop)
        if range_.step is None:
            step = 1
        else:
            step = range_.step

        invocation_args.append(step)

        from pyopencl.array import splay
        gs, ls = splay(queue,
                abs(range_.stop - start)//step,
                max_wg_size)
    else:
        # Whole-array invocation: pass the element count.
        invocation_args.append(repr_vec.size)
        gs, ls = repr_vec.get_sizes(queue, max_wg_size)

    if capture_as is not None:
        kernel.set_args(*invocation_args)
        kernel.capture_call(
                capture_as, queue, gs, ls, *invocation_args,
                wait_for=wait_for)

    kernel.set_args(*invocation_args)
    return cl.enqueue_nd_range_kernel(queue, kernel,
            gs, ls, wait_for=wait_for)
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        :class:`pyopencl.array.Array` are not allowed directly and should
        be passed as their :attr:`pyopencl.array.Array.data` attribute
        instead.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where
        *lists* a mapping from (built) list names to objects which
        have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in `lists`.
          `starts` is built so that it has n+1 entries, so that the *i*'th
          entry is the start of the *i*'th list, and the *i*'th entry is the
          index one past the *i*'th list's end, even for the last list. This
          implies that all lists are contiguous.

        *event* is a :class:`pyopencl.Event` for dependency management.
    """
    # Choose an index dtype wide enough for n_objects.
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        result[name] = BuiltList(starts=counts)
        count_list_args.append(counts.data)

    # }}}

    # Work sizes: single work-item when debugging; a few single-item
    # groups for complex kernels on CPU devices; otherwise let splay pick.
    if self.debug:
        gsize = (1, )
        lsize = (1, )
    elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
        gsize = (4 * queue.device.max_compute_units, )
        lsize = (1, )
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects, )),
            **dict(wait_for=wait_for))

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        info_record = result[name]
        starts_ary = info_record.starts
        evt = scan_kernel(starts_ary, wait_for=[count_event],
                size=n_objects)

        # Set the first entry to zero: start index of the first list.
        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count (last entry of the scanned array)
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + args + (n_objects, )),
            **dict(wait_for=scan_events))

    return result, evt
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        :class:`pyopencl.array.Array` are not allowed directly and should
        be passed as their :attr:`pyopencl.array.Array.data` attribute
        instead.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :returns: a mapping from names to objects which have attributes

        * `count` for the total number of entries in all lists combined
        * `lists` for the array containing all lists.
        * `starts` for the array of starting indices in `lists`.
          `starts` is built so that it has n+1 entries, so that the *i*'th
          entry is the start of the *i*'th list, and the *i*'th entry is the
          index one past the *i*'th list's end, even for the last list. This
          implies that all lists are contiguous.
    """
    # Choose an index dtype wide enough for n_objects.
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    result = {}
    count_list_args = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        result[name] = BuiltList(starts=counts)
        count_list_args.append(counts.data)

    # }}}

    # Work sizes: single work-item when debugging; a few single-item
    # groups for complex kernels on CPU devices; otherwise let splay pick.
    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
        gsize = (4 * queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects,)))

    # {{{ run scans

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        info_record = result[name]
        starts_ary = info_record.starts
        scan_kernel(starts_ary)

        # set first entry to zero
        cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0))

        # retrieve count: entry n_objects of the scanned array
        count = np.array(1, index_dtype)
        cl.enqueue_copy(queue, count, starts_ary.data,
                device_offset=index_dtype.itemsize * n_objects)
        info_record.count = int(count)

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

    # }}}

    write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + args + (n_objects,)))

    return result
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        Array-like arguments must be either
        1D :class:`pyopencl.array.Array` objects or
        :class:`pyopencl.MemoryObject` objects, of which the latter
        can be obtained from a :class:`pyopencl.array.Array` using the
        :attr:`pyopencl.array.Array.data` attribute.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg omit_lists: An iterable of list names that should *not* be built
        with this invocation. The kernel code may *not* call ``APPEND_name``
        for these omitted lists. If it does, undefined behavior will result.
        The returned *lists* dictionary will not contain an entry for names
        in *omit_lists*.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where *lists* a mapping from
        (built) list names to objects which have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in `lists`.
          `starts` is built so that it has n+1 entries, so that the *i*'th
          entry is the start of the *i*'th list, and the *i*'th entry is the
          index one past the *i*'th list's end, even for the last list. This
          implies that all lists are contiguous.

        If the list name is specified in *eliminate_empty_output_lists*
        constructor argument, *lists* has two additional attributes
        ``num_nonempty_lists`` and ``nonempty_indices``

        * ``num_nonempty_lists`` for the number of nonempty lists.
        * ``nonempty_indices`` for the index of nonempty list in input
          objects.

        In this case, `starts` has `num_nonempty_lists` + 1 entries. The
        *i*'s entry is the start of the *i*'th nonempty list, which is
        generated by the object with index *nonempty_indices[i]*.

        *event* is a :class:`pyopencl.Event` for dependency management.

    .. versionchanged:: 2016.2

        Added omit_lists.
    """
    # Choose an index dtype wide enough for n_objects.
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    omit_lists = kwargs.pop("omit_lists", [])
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    for oml in omit_lists:
        if not any(oml == name for name, _ in self.list_names_and_dtypes):
            # FIX: previously the offending name was never interpolated,
            # so the error carried a literal '%s'.
            raise ValueError("invalid list name '%s' in omit_lists" % oml)

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []
    else:
        # We'll be modifying it below.
        wait_for = list(wait_for)

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)
    if self.eliminate_empty_output_lists:
        compress_kernel = self.get_compress_kernel(index_dtype)

    # Flatten Array arguments into (base_data[, offset]) kernel arguments.
    data_args = []
    for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
        from pyopencl.tools import VectorArg
        if isinstance(arg_descr, VectorArg):
            from pyopencl import MemoryObject
            if isinstance(arg_val, MemoryObject):
                data_args.append(arg_val)
                if arg_descr.with_offset:
                    raise ValueError(
                            "with_offset=True specified for argument %d "
                            "but the argument is not an array" % i)
                continue

            if arg_val.ndim != 1:
                raise ValueError(
                        "argument %d is a multidimensional array" % i)

            data_args.append(arg_val.base_data)
            if arg_descr.with_offset:
                data_args.append(arg_val.offset)
            # Wait on any pending writes to the input arrays.
            wait_for.extend(arg_val.events)
        else:
            data_args.append(arg_val)

    del args
    data_args = tuple(data_args)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            # Placeholder keeps the kernel argument list aligned.
            count_list_args.append(None)
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        if name in self.eliminate_empty_output_lists:
            result[name] = BuiltList(count=None, starts=counts, lists=None,
                                     num_nonempty_lists=None,
                                     nonempty_indices=None)
        else:
            result[name] = BuiltList(count=None, starts=counts, lists=None)
        count_list_args.append(counts.data)

    # }}}

    # Work sizes: single work-item when debugging; a few single-item
    # groups when vectorization is disabled; otherwise let splay pick.
    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.do_not_vectorize():
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + data_args + (n_objects,)),
            wait_for=wait_for)

    # Compress away empty lists where requested.
    compress_events = {}
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            continue
        if name in self.count_sharing:
            continue
        if name not in self.eliminate_empty_output_lists:
            continue

        compressed_counts = cl.array.empty(
            queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record = result[name]
        info_record.nonempty_indices = cl.array.empty(
            queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.num_nonempty_lists = cl.array.empty(
            queue, (1,), index_dtype, allocator=allocator)
        info_record.compressed_indices = cl.array.empty(
            queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.compressed_indices[0] = 0
        compress_events[name] = compress_kernel(
            info_record.starts,
            compressed_counts,
            info_record.nonempty_indices,
            info_record.compressed_indices,
            info_record.num_nonempty_lists,
            wait_for=[count_event] + info_record.compressed_indices.events)

        info_record.starts = compressed_counts

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            continue

        info_record = result[name]
        if name in self.eliminate_empty_output_lists:
            compress_events[name].wait()
            num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
            info_record.num_nonempty_lists = num_nonempty_lists
            info_record.starts = \
                    info_record.starts[:num_nonempty_lists + 1]
            info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
            info_record.starts[-1] = 0

        starts_ary = info_record.starts
        if name in self.eliminate_empty_output_lists:
            evt = scan_kernel(
                    starts_ary,
                    size=info_record.num_nonempty_lists,
                    wait_for=starts_ary.events)
        else:
            evt = scan_kernel(starts_ary,
                    wait_for=[count_event],
                    size=n_objects)

        # Set the first entry to zero: start index of the first list.
        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count (last entry of the scanned array)
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            # Placeholders keep the kernel argument list aligned.
            write_list_args.append(None)
            if name not in self.count_sharing:
                write_list_args.append(None)
            if name in self.eliminate_empty_output_lists:
                write_list_args.append(None)
            continue

        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

        if name in self.eliminate_empty_output_lists:
            write_list_args.append(info_record.compressed_indices.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + data_args + (n_objects,)),
            wait_for=scan_events)

    return result, evt
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        Array-like arguments must be either
        1D :class:`pyopencl.array.Array` objects or
        :class:`pyopencl.MemoryObject` objects, of which the latter
        can be obtained from a :class:`pyopencl.array.Array` using the
        :attr:`pyopencl.array.Array.data` attribute.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg omit_lists: An iterable of list names that should *not* be built
        with this invocation. The kernel code may *not* call ``APPEND_name``
        for these omitted lists. If it does, undefined behavior will result.
        The returned *lists* dictionary will not contain an entry for names
        in *omit_lists*.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where *lists* a mapping from
        (built) list names to objects which have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in `lists`.
          `starts` is built so that it has n+1 entries, so that the *i*'th
          entry is the start of the *i*'th list, and the *i*'th entry is the
          index one past the *i*'th list's end, even for the last list. This
          implies that all lists are contiguous.

        If the list name is specified in *eliminate_empty_output_lists*
        constructor argument, *lists* has two additional attributes
        ``num_nonempty_lists`` and ``nonempty_indices``

        * ``num_nonempty_lists`` for the number of nonempty lists.
        * ``nonempty_indices`` for the index of nonempty list in input
          objects.

        In this case, `starts` has `num_nonempty_lists` + 1 entries. The
        *i*'s entry is the start of the *i*'th nonempty list, which is
        generated by the object with index *nonempty_indices[i]*.

        *event* is a :class:`pyopencl.Event` for dependency management.

    .. versionchanged:: 2016.2

        Added omit_lists.
    """
    # Choose an index dtype wide enough for n_objects.
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    omit_lists = kwargs.pop("omit_lists", [])
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    for oml in omit_lists:
        if not any(oml == name for name, _ in self.list_names_and_dtypes):
            # FIX: previously the offending name was never interpolated,
            # so the error carried a literal '%s'.
            raise ValueError("invalid list name '%s' in omit_lists" % oml)

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []
    else:
        # Copy so that extending it below does not mutate the caller's
        # list (consistent with the sibling implementation).
        wait_for = list(wait_for)

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)
    if self.eliminate_empty_output_lists:
        compress_kernel = self.get_compress_kernel(index_dtype)

    # Flatten Array arguments into (base_data[, offset]) kernel arguments.
    data_args = []
    for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
        from pyopencl.tools import VectorArg
        if isinstance(arg_descr, VectorArg):
            from pyopencl import MemoryObject
            if isinstance(arg_val, MemoryObject):
                data_args.append(arg_val)
                if arg_descr.with_offset:
                    raise ValueError(
                            "with_offset=True specified for argument %d "
                            "but the argument is not an array" % i)
                continue

            if arg_val.ndim != 1:
                raise ValueError(
                        "argument %d is a multidimensional array" % i)

            data_args.append(arg_val.base_data)
            if arg_descr.with_offset:
                data_args.append(arg_val.offset)
            # FIX (consistency with sibling implementation): wait on any
            # pending writes to the input arrays before the count kernel.
            wait_for.extend(arg_val.events)
        else:
            data_args.append(arg_val)

    del args
    data_args = tuple(data_args)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            # Placeholder keeps the kernel argument list aligned.
            count_list_args.append(None)
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        if name in self.eliminate_empty_output_lists:
            result[name] = BuiltList(count=None, starts=counts, lists=None,
                                     num_nonempty_lists=None,
                                     nonempty_indices=None)
        else:
            result[name] = BuiltList(count=None, starts=counts, lists=None)
        count_list_args.append(counts.data)

    # }}}

    # Work sizes: single work-item when debugging; a few single-item
    # groups when vectorization is disabled; otherwise let splay pick.
    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.do_not_vectorize():
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    # Idiom fix: pass wait_for directly rather than via **dict(...).
    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + data_args + (n_objects,)),
            wait_for=wait_for)

    # Compress away empty lists where requested.
    compress_events = {}
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            continue
        if name in self.count_sharing:
            continue
        if name not in self.eliminate_empty_output_lists:
            continue

        compressed_counts = cl.array.empty(
            queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record = result[name]
        info_record.nonempty_indices = cl.array.empty(
            queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.num_nonempty_lists = cl.array.empty(
            queue, (1,), index_dtype, allocator=allocator)
        info_record.compressed_indices = cl.array.empty(
            queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.compressed_indices[0] = 0
        compress_events[name] = compress_kernel(
            info_record.starts,
            compressed_counts,
            info_record.nonempty_indices,
            info_record.compressed_indices,
            info_record.num_nonempty_lists,
            wait_for=[count_event] + info_record.compressed_indices.events)

        info_record.starts = compressed_counts

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            continue

        info_record = result[name]
        if name in self.eliminate_empty_output_lists:
            compress_events[name].wait()
            num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
            info_record.num_nonempty_lists = num_nonempty_lists
            info_record.starts = \
                    info_record.starts[:num_nonempty_lists + 1]
            info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
            info_record.starts[-1] = 0

        starts_ary = info_record.starts
        if name in self.eliminate_empty_output_lists:
            evt = scan_kernel(
                    starts_ary,
                    size=info_record.num_nonempty_lists,
                    wait_for=starts_ary.events)
        else:
            evt = scan_kernel(starts_ary,
                    wait_for=[count_event],
                    size=n_objects)

        # Set the first entry to zero: start index of the first list.
        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count (last entry of the scanned array)
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            # Placeholders keep the kernel argument list aligned.
            write_list_args.append(None)
            if name not in self.count_sharing:
                write_list_args.append(None)
            if name in self.eliminate_empty_output_lists:
                write_list_args.append(None)
            continue

        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

        if name in self.eliminate_empty_output_lists:
            write_list_args.append(info_record.compressed_indices.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + data_args + (n_objects,)),
            wait_for=scan_events)

    return result, evt
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        :class:`pyopencl.array.Array` are not allowed directly and should
        be passed as their :attr:`pyopencl.array.Array.data` attribute
        instead.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where
        *lists* a mapping from (built) list names to objects which
        have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in `lists`.
          `starts` is built so that it has n+1 entries, so that the *i*'th
          entry is the start of the *i*'th list, and the *i*'th entry is the
          index one past the *i*'th list's end, even for the last list. This
          implies that all lists are contiguous.

        *event* is a :class:`pyopencl.Event` for dependency management.
    """
    # Use 64-bit indices only when n_objects might overflow int32.
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        result[name] = BuiltList(starts=counts)
        count_list_args.append(counts.data)

    # }}}

    # debug -> single work-item; complex kernel on CPU -> a few
    # single-item groups; otherwise delegate sizing to splay.
    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects,)),
            **dict(wait_for=wait_for))

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        info_record = result[name]
        starts_ary = info_record.starts
        evt = scan_kernel(starts_ary, wait_for=[count_event],
                size=n_objects)

        # First entry becomes zero: start index of the first list.
        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count (last entry of the scanned array)
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + args + (n_objects,)),
            **dict(wait_for=scan_events))

    return result, evt
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        :class:`pyopencl.array.Array` are not allowed directly and should
        be passed as their :attr:`pyopencl.array.Array.data` attribute
        instead.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :returns: a mapping from names to objects which have attributes

        * `count` for the total number of entries in all lists combined
        * `lists` for the array containing all lists.
        * `starts` for the array of starting indices in `lists`.
          `starts` is built so that it has n+1 entries, so that the *i*'th
          entry is the start of the *i*'th list, and the *i*'th entry is the
          index one past the *i*'th list's end, even for the last list. This
          implies that all lists are contiguous.
    """
    # Use 64-bit indices only when n_objects might overflow int32.
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    result = {}
    count_list_args = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        result[name] = BuiltList(starts=counts)
        count_list_args.append(counts.data)

    # }}}

    # debug -> single work-item; complex kernel on CPU -> a few
    # single-item groups; otherwise delegate sizing to splay.
    if self.debug:
        gsize = (1, )
        lsize = (1, )
    elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
        gsize = (4 * queue.device.max_compute_units, )
        lsize = (1, )
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects, )))

    # {{{ run scans

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue

        info_record = result[name]
        starts_ary = info_record.starts
        scan_kernel(starts_ary)

        # set first entry to zero
        cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0))

        # retrieve count: entry n_objects of the scanned array
        count = np.array(1, index_dtype)
        cl.enqueue_copy(queue, count, starts_ary.data,
                device_offset=index_dtype.itemsize * n_objects)
        info_record.count = int(count)

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

    # }}}

    write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + args + (n_objects, )))

    return result