예제 #1
0
파일: blasext.py 프로젝트: abudulemusa/PyFR
    def axnpby(self, *arr):
        """Build a compute kernel that operates on the matrices in *arr*.

        Every matrix must have the same ``traits`` as the first one.  The
        'axnpby' template is rendered for ``len(arr)`` operands, compiled
        through ``_build_kernel`` and wrapped in a ``ComputeKernel``
        subclass.

        :arg arr: one or more matrix objects exposing ``traits`` and
            ``data`` attributes.
        :returns: an ``AxnpbyKernel`` instance whose ``run(queue, *consts)``
            launches the kernel with the element count, every matrix's
            ``data`` and finally the supplied constants.
        :raises ValueError: if a matrix has traits differing from those of
            the first matrix.
        """
        ref_traits = arr[0].traits
        for mat in arr[1:]:
            if ref_traits != mat.traits:
                raise ValueError('Incompatible matrix types')

        nrow, leaddim, _leadsubdim, dtype = ref_traits
        nv = len(arr)

        # Generate the kernel source from the template
        template = self.backend.lookup.get_template('axnpby')
        src = template.render(nv=nv, alignb=self.backend.alignb,
                              fpdtype=dtype)

        # Argument signature: one int32, then nv intp entries, then nv
        # entries of the matrix dtype
        argtypes = [np.int32]
        argtypes += [np.intp]*nv
        argtypes += [dtype]*nv
        kern = self._build_kernel('axnpby', src, argtypes)

        # Number of elements handed to the kernel
        cnt = leaddim*nrow

        # Global and local work sizes for that element count
        gs, ls = splay(self.backend.qdflt, cnt)

        class AxnpbyKernel(ComputeKernel):
            def run(self, queue, *consts):
                # The data handles are looked up at run time, not at
                # kernel construction time
                operands = [mat.data for mat in arr]
                operands.extend(consts)
                kern(queue.cl_queue_comp, gs, ls, cnt, *operands)

        return AxnpbyKernel()
예제 #2
0
    def _fill(self, distribution, ary, scale, shift, queue=None):
        """Launch the generator kernel for *distribution* over *ary*.

        The kernel and its two multipliers are obtained from
        ``get_gen_kernel`` for ``ary.dtype``.  The kernel receives the key
        words, the current counter words, the array buffer,
        ``ary.size*size_multiplier``, *scale* and *shift*.  Afterwards the
        counter is advanced by ``ary.size*counter_multiplier``; overflow
        past ``counter_max`` is carried from word 0 into word 1 and from
        word 1 into word 2.

        :arg queue: command queue to use; defaults to ``ary.queue``.
        :return: the value returned by the kernel call (documented upstream
            as a :class:`pyopencl.Event`).
        """
        if queue is None:
            queue = ary.queue

        knl, counter_multiplier, size_multiplier = self.get_gen_kernel(
                ary.dtype, distribution)

        # The argument list captures the counter *before* it is advanced.
        tail = [ary.data, ary.size*size_multiplier, scale, shift]
        kernel_args = self.key + self.counter + tail

        n = ary.size
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n)

        evt = knl(queue, gsize, lsize, *kernel_args)

        # Advance the multi-word counter, propagating carries upwards.
        counter = self.counter
        counter[0] += n * counter_multiplier
        carry, counter[0] = divmod(counter[0], self.counter_max)
        if not carry:
            return evt

        counter[1] += carry
        carry, counter[1] = divmod(counter[1], self.counter_max)
        counter[2] += carry

        return evt
예제 #3
0
    def axnpby(self, *arr):
        """Create a compute kernel acting on the matrices given in *arr*.

        The matrices must all report the same ``traits``.  The 'axnpby'
        template is rendered with the operand count only, built with
        ``_build_kernel`` and returned wrapped in a ``ComputeKernel``
        subclass.

        :arg arr: one or more matrix objects exposing ``traits`` and
            ``data`` attributes.
        :returns: an ``AxnpbyKernel`` instance; ``run(queue, *consts)``
            invokes the kernel with the element count, the ``data`` of
            each matrix and then the constants.
        :raises ValueError: if the traits of any matrix differ from those
            of the first matrix.
        """
        head_traits = arr[0].traits
        for other in arr[1:]:
            if head_traits != other.traits:
                raise ValueError('Incompatible matrix types')

        nrow, leaddim, _leadsubdim, dtype = head_traits
        nv = len(arr)

        # Render the template for nv operands
        template = self.backend.lookup.get_template('axnpby')
        src = template.render(nv=nv)

        # Kernel signature: int32, nv intp entries, nv dtype entries
        signature = [np.int32]
        signature.extend([np.intp] * nv)
        signature.extend([dtype] * nv)
        kern = self._build_kernel('axnpby', src, signature)

        # Element count passed to the kernel
        cnt = leaddim * nrow

        # Work sizes matching that element count
        gs, ls = splay(self.backend.qdflt, cnt)

        class AxnpbyKernel(ComputeKernel):
            def run(self, queue, *consts):
                # Data handles are resolved each time the kernel is run
                call_args = [mat.data for mat in arr]
                call_args.extend(consts)
                kern(queue.cl_queue_comp, gs, ls, cnt, *call_args)

        return AxnpbyKernel()
예제 #4
0
    def __call__(self, *args, **kwargs):
        """Assemble the kernel arguments from *args* and enqueue the kernel.

        Recognised keyword arguments:

        * ``range``: a slice-like object with ``start``, ``stop``, ``step``.
        * ``slice``: a slice resolved against the size of the first vector
          argument; mutually exclusive with ``range``.
        * ``capture_as``: if given, passed to ``kernel.capture_call``
          before the kernel is enqueued.
        * ``queue``: command queue; defaults to the queue of the first
          vector argument.
        * ``wait_for``: forwarded to the enqueue call.

        :returns: the result of :func:`cl.enqueue_nd_range_kernel`.
        :raises RuntimeError: if a vector argument is not contiguous.
        :raises TypeError: on unknown keyword arguments, or when both
            ``range`` and ``slice`` are supplied.
        """
        range_ = kwargs.pop("range", None)
        slice_ = kwargs.pop("slice", None)
        capture_as = kwargs.pop("capture_as", None)

        # A ranged kernel variant is needed as soon as either is supplied.
        kernel, arg_descrs = self.get_kernel(
                not (range_ is None and slice_ is None))

        # {{{ assemble arg array

        # First vector argument seen; supplies default queue and size.
        repr_vec = None
        invocation_args = []

        for arg, arg_descr in zip(args, arg_descrs):
            if not isinstance(arg_descr, VectorArg):
                invocation_args.append(arg)
                continue

            if not arg.flags.forc:
                raise RuntimeError("ElementwiseKernel cannot "
                        "deal with non-contiguous arrays")

            if repr_vec is None:
                repr_vec = arg

            invocation_args.append(arg.base_data)
            if arg_descr.with_offset:
                invocation_args.append(arg.offset)

        # }}}

        queue = kwargs.pop("queue", None)
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("unknown keyword arguments: '%s'"
                    % ", ".join(kwargs))

        if queue is None:
            queue = repr_vec.queue

        if slice_ is not None:
            if range_ is not None:
                raise TypeError("may not specify both range and slice "
                        "keyword arguments")

            # Normalise the slice against the representative vector's size.
            range_ = slice(*slice_.indices(repr_vec.size))

        max_wg_size = kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE,
                queue.device)

        if range_ is None:
            invocation_args.append(repr_vec.size)
            gs, ls = repr_vec.get_sizes(queue, max_wg_size)
        else:
            start = range_.start
            if start is None:
                start = 0
            stop = range_.stop
            step = range_.step if range_.step is not None else 1

            invocation_args += [start, stop, step]

            from pyopencl.array import splay
            gs, ls = splay(queue, abs(stop - start)//step, max_wg_size)

        if capture_as is not None:
            kernel.set_args(*invocation_args)
            kernel.capture_call(
                    capture_as, queue,
                    gs, ls, *invocation_args, wait_for=wait_for)

        # The kernel is enqueued whether or not the call was captured.
        kernel.set_args(*invocation_args)
        return cl.enqueue_nd_range_kernel(
                queue, kernel, gs, ls, wait_for=wait_for)
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        Build the lists for *n_objects* objects on *queue*: run the count
        kernel, scan the counts into start indices, allocate list storage
        and run the write kernel.

        :arg args: arguments corresponding to arg_decls in the constructor.
            :class:`pyopencl.array.Array` are not allowed directly and should
            be passed as their :attr:`pyopencl.array.Array.data` attribute instead.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg wait_for: |explain-waitfor| Any iterable of events is
            accepted; it is copied and the caller's object is not modified.
        :returns: a tuple ``(lists, event)``, where
            *lists* a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i+1*'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

              *event* is a :class:`pyopencl.Event` for dependency management.
        :raises TypeError: if unknown keyword arguments are supplied.
        """
        # Use 64-bit indices only when the object count requires them.
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" %
                            ", ".join(kwargs))

        result = {}
        count_list_args = []

        # Work on a private list: events are appended below, and a tuple
        # supplied by the caller could not be concatenated with a list.
        wait_for = [] if wait_for is None else list(wait_for)

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            # Count-sharing lists reuse another list's counts.
            if name in self.count_sharing:
                continue

            counts = cl.array.empty(queue, (n_objects + 1),
                                    index_dtype,
                                    allocator=allocator)
            counts[-1] = 0
            # The count kernel must wait for the write to the last entry.
            wait_for.extend(counts.events)

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            result[name] = BuiltList(starts=counts)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1, )
            lsize = (1, )
        elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
            gsize = (4 * queue.device.max_compute_units, )
            lsize = (1, )
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(
            queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects, )),
            wait_for=wait_for)

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            info_record = result[name]
            starts_ary = info_record.starts
            evt = scan_kernel(starts_ary,
                              wait_for=[count_event],
                              size=n_objects)

            # Reset the first entry once the scan has finished.
            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count (transfers the last entry to the host)
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                                               info_record.count,
                                               dtype,
                                               allocator=allocator)
            write_list_args.append(info_record.lists.data)

            # Only lists owning their counts pass a starts buffer.
            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                           *(tuple(write_list_args) + args + (n_objects, )),
                           wait_for=scan_events)

        return result, evt
예제 #6
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        Build the lists for *n_objects* objects on *queue* by running the
        count, scan and write kernels in turn.

        :arg args: arguments corresponding to arg_decls in the constructor.
            :class:`pyopencl.array.Array` are not allowed directly and should
            be passed as their :attr:`pyopencl.array.Array.data` attribute instead.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :returns: a mapping from list names to objects with attributes

            * `count`: the total number of entries in all lists combined
            * `lists`: the array containing all lists
            * `starts`: the array of starting indices in `lists`, with
              n+1 entries: entry *i* is the start of list *i* and entry
              *i+1* is one past its end, even for the last list.

              This implies that all lists are contiguous.
        :raises TypeError: if unknown keyword arguments are supplied.
        """
        # 64-bit indices are used only when the object count requires them.
        int32_limit = int(np.iinfo(np.int32).max)
        index_dtype = np.dtype(
            np.int64 if n_objects >= int32_limit else np.int32)

        allocator = kwargs.pop("allocator", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)

        # Arguments shared by the count and write kernel launches.
        trailing_args = args + (n_objects,)

        # {{{ allocate memory for counts

        result = {}
        count_list_args = []

        for name, _ in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            counts = cl.array.empty(
                queue, (n_objects + 1), index_dtype, allocator=allocator)

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            result[name] = BuiltList(starts=counts)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = lsize = (1,)
        elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
            gsize, lsize = (4 * queue.device.max_compute_units,), (1,)
        else:
            from pyopencl.array import splay

            gsize, lsize = splay(queue, n_objects)

        count_kernel(queue, gsize, lsize,
                     *(tuple(count_list_args) + trailing_args))

        # {{{ run scans

        # Byte offset of the final entry of each starts array.
        last_entry_offset = index_dtype.itemsize * n_objects

        for name, _ in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            info_record = result[name]
            starts_ary = info_record.starts
            scan_kernel(starts_ary)

            # set first entry to zero
            cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0))

            # retrieve count
            count = np.array(1, index_dtype)
            cl.enqueue_copy(queue, count, starts_ary.data,
                            device_offset=last_entry_offset)

            info_record.count = int(count)

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            shares_counts = name in self.count_sharing

            if shares_counts:
                sharing_from = self.count_sharing[name]
                info_record = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts)
                result[name] = info_record
            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(
                queue, info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            # Only lists owning their counts pass a starts buffer.
            if not shares_counts:
                write_list_args.append(info_record.starts.data)

        # }}}

        write_kernel(queue, gsize, lsize,
                     *(tuple(write_list_args) + trailing_args))

        return result
예제 #7
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        Build the requested lists for *n_objects* objects on *queue*: run
        the count kernel, optionally compress away empty lists, scan the
        counts into start indices, allocate list storage and run the write
        kernel.

        :arg args: arguments corresponding to arg_decls in the constructor.
            Array-like arguments must be either
            1D :class:`pyopencl.array.Array` objects or
            :class:`pyopencl.MemoryObject` objects, of which the latter
            can be obtained from a :class:`pyopencl.array.Array` using the
            :attr:`pyopencl.array.Array.data` attribute.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: An iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i+1*'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

            If the list name is specified in *eliminate_empty_output_lists*
            constructor argument, *lists* has two additional attributes
            ``num_nonempty_lists`` and ``nonempty_indices``

            * ``num_nonempty_lists`` for the number of nonempty lists.
            * ``nonempty_indices`` for the index of nonempty list in input objects.

            In this case, `starts` has `num_nonempty_lists` + 1 entries. The
            *i*'th entry is the start of the *i*'th nonempty list, which is
            generated by the object with index *nonempty_indices[i]*.

            *event* is a :class:`pyopencl.Event` for dependency management.
        :raises TypeError: if unknown keyword arguments are supplied.
        :raises ValueError: if *omit_lists* names an unknown list, if a
            vector argument is multidimensional, or if ``with_offset`` is
            requested for an argument passed as a memory object.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        # Use 64-bit indices only when the object count requires them.
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        omit_lists = kwargs.pop("omit_lists", [])
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for oml in omit_lists:
            if not any(oml == name for name, _ in self.list_names_and_dtypes):
                # Report the offending name rather than a literal '%s'.
                raise ValueError(
                        "invalid list name '%s' in omit_lists" % oml)

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []
        else:
            # We'll be modifying it below.
            wait_for = list(wait_for)

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)
        if self.eliminate_empty_output_lists:
            compress_kernel = self.get_compress_kernel(index_dtype)

        # Translate user arguments into raw kernel arguments.
        data_args = []
        for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
            from pyopencl.tools import VectorArg
            if isinstance(arg_descr, VectorArg):
                from pyopencl import MemoryObject
                if isinstance(arg_val, MemoryObject):
                    data_args.append(arg_val)
                    if arg_descr.with_offset:
                        raise ValueError(
                                "with_offset=True specified for argument %d "
                                "but the argument is not an array" % i)
                    continue

                if arg_val.ndim != 1:
                    raise ValueError("argument %d is a multidimensional array" % i)

                data_args.append(arg_val.base_data)
                if arg_descr.with_offset:
                    data_args.append(arg_val.offset)
                # The count kernel must wait for pending work on the array.
                wait_for.extend(arg_val.events)
            else:
                data_args.append(arg_val)

        del args
        data_args = tuple(data_args)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                # Keep the argument position, but pass no buffer.
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            if name in self.eliminate_empty_output_lists:
                result[name] = BuiltList(count=None, starts=counts, lists=None,
                                         num_nonempty_lists=None,
                                         nonempty_indices=None)
            else:
                result[name] = BuiltList(count=None, starts=counts, lists=None)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.do_not_vectorize():
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + data_args + (n_objects,)),
                wait_for=wait_for)

        # Launch the compress kernel for every list whose empty entries
        # are to be eliminated.
        compress_events = {}
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                continue
            if name in self.count_sharing:
                continue
            if name not in self.eliminate_empty_output_lists:
                continue

            compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record = result[name]
            info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
            info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.compressed_indices[0] = 0
            compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event] + info_record.compressed_indices.events)

            # From here on the compressed counts act as the starts array.
            info_record.starts = compressed_counts

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            if name in self.eliminate_empty_output_lists:
                # The number of nonempty lists is needed on the host to
                # trim the arrays, so wait for the compress kernel here.
                compress_events[name].wait()
                num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
                info_record.num_nonempty_lists = num_nonempty_lists
                info_record.starts = info_record.starts[:num_nonempty_lists + 1]
                info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
                info_record.starts[-1] = 0

            starts_ary = info_record.starts
            if name in self.eliminate_empty_output_lists:
                evt = scan_kernel(
                        starts_ary,
                        size=info_record.num_nonempty_lists,
                        wait_for=starts_ary.events)
            else:
                evt = scan_kernel(starts_ary, wait_for=[count_event],
                        size=n_objects)

            # Reset the first entry once the scan has finished.
            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count (transfers the last entry to the host)
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                # One placeholder per argument the list would have used.
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                if name in self.eliminate_empty_output_lists:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

            if name in self.eliminate_empty_output_lists:
                write_list_args.append(info_record.compressed_indices.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + data_args + (n_objects,)),
                wait_for=scan_events)

        return result, evt
예제 #8
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        :arg args: arguments corresponding to arg_decls in the constructor.
            Array-like arguments must be either
            1D :class:`pyopencl.array.Array` objects or
            :class:`pyopencl.MemoryObject` objects, of which the latter
            can be obtained from a :class:`pyopencl.array.Array` using the
            :attr:`pyopencl.array.Array.data` attribute.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: An iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i*'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

            If the list name is specified in *eliminate_empty_output_lists*
            constructor argument, *lists* has two additional attributes
            ``num_nonempty_lists`` and ``nonempty_indices``

            * ``num_nonempty_lists`` for the number of nonempty lists.
            * ``nonempty_indices`` for the index of nonempty list in input objects.

            In this case, `starts` has `num_nonempty_lists` + 1 entries. The *i*'s
            entry is the start of the *i*'th nonempty list, which is generated by
            the object with index *nonempty_indices[i]*.

            *event* is a :class:`pyopencl.Event` for dependency management.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        omit_lists = kwargs.pop("omit_lists", [])
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for oml in omit_lists:
            if not any(oml == name for name, _ in self.list_names_and_dtypes):
                raise ValueError("invalid list name '%s' in omit_lists")

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)
        if self.eliminate_empty_output_lists:
            compress_kernel = self.get_compress_kernel(index_dtype)

        data_args = []
        for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
            from pyopencl.tools import VectorArg
            if isinstance(arg_descr, VectorArg):
                from pyopencl import MemoryObject
                if isinstance(arg_val, MemoryObject):
                    data_args.append(arg_val)
                    if arg_descr.with_offset:
                        raise ValueError(
                                "with_offset=True specified for argument %d "
                                "but the argument is not an array" % i)
                    continue

                if arg_val.ndim != 1:
                    raise ValueError("argument %d is a multidimensional array" % i)

                data_args.append(arg_val.base_data)
                if arg_descr.with_offset:
                    data_args.append(arg_val.offset)
            else:
                data_args.append(arg_val)

        del args
        data_args = tuple(data_args)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            if name in self.eliminate_empty_output_lists:
                result[name] = BuiltList(count=None, starts=counts, lists=None,
                                         num_nonempty_lists=None,
                                         nonempty_indices=None)
            else:
                result[name] = BuiltList(count=None, starts=counts, lists=None)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.do_not_vectorize():
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + data_args + (n_objects,)),
                **dict(wait_for=wait_for))

        compress_events = {}
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                continue
            if name in self.count_sharing:
                continue
            if name not in self.eliminate_empty_output_lists:
                continue

            compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record = result[name]
            info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
            info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.compressed_indices[0] = 0
            compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event] + info_record.compressed_indices.events)

            info_record.starts = compressed_counts

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            if name in self.eliminate_empty_output_lists:
                compress_events[name].wait()
                num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
                info_record.num_nonempty_lists = num_nonempty_lists
                info_record.starts = info_record.starts[:num_nonempty_lists + 1]
                info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
                info_record.starts[-1] = 0

            starts_ary = info_record.starts
            if name in self.eliminate_empty_output_lists:
                evt = scan_kernel(
                        starts_ary,
                        size=info_record.num_nonempty_lists,
                        wait_for=starts_ary.events)
            else:
                evt = scan_kernel(starts_ary, wait_for=[count_event],
                        size=n_objects)

            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                if name in self.eliminate_empty_output_lists:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

            if name in self.eliminate_empty_output_lists:
                write_list_args.append(info_record.compressed_indices.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + data_args + (n_objects,)),
                **dict(wait_for=scan_events))

        return result, evt
예제 #9
0
파일: algorithm.py 프로젝트: AI42/pyopencl
    def __call__(self, queue, n_objects, *args, **kwargs):
        """Run the count, scan and write kernels and return the built lists.

        :arg args: arguments corresponding to arg_decls in the constructor.
            :class:`pyopencl.array.Array` instances are not allowed directly
            and should be passed as their :attr:`pyopencl.array.Array.data`
            attribute instead.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* is a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in ``lists``.
              ``starts`` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i+1*'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

            *event* is the event returned by the write kernel; it is a
            :class:`pyopencl.Event` for dependency management.
        :raises TypeError: if keyword arguments other than *allocator* and
            *wait_for* are passed.
        """
        # Switch to 64-bit indices once n_objects reaches the int32 maximum.
        int32_max = int(np.iinfo(np.int32).max)
        index_dtype = np.dtype(np.int64 if n_objects >= int32_max else np.int32)

        allocator = kwargs.pop("allocator", None)
        dependencies = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        if dependencies is None:
            dependencies = []

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)

        result = {}

        # {{{ allocate memory for counts

        count_buffers = []
        for name, _dtype in self.list_names_and_dtypes:
            # Count-sharing lists reuse another list's counts; they are
            # filled in further down.
            if name in self.count_sharing:
                continue

            counts = cl.array.empty(
                    queue, n_objects + 1, index_dtype, allocator=allocator)
            counts[-1] = 0

            # Concatenate into a new list (rather than extending in place) so
            # a caller-supplied wait_for list is never modified.
            dependencies = dependencies + counts.events

            # The scan below turns this "counts" array into the "starts"
            # array in-place.
            result[name] = BuiltList(starts=counts)
            count_buffers.append(counts.data)

        # }}}

        # {{{ pick work sizes

        if self.debug:
            gsize = lsize = (1,)
        elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        # }}}

        count_args = tuple(count_buffers) + args + (n_objects,)
        count_event = count_kernel(queue, gsize, lsize, *count_args,
                wait_for=dependencies)

        # {{{ run scans

        scan_events = []
        for name, _dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            record = result[name]
            starts = record.starts

            scan_done = scan_kernel(starts, wait_for=[count_event],
                    size=n_objects)

            # Write 0 into the first entry once the scan has finished.
            starts.setitem(0, 0, queue=queue, wait_for=[scan_done])
            scan_events.extend(starts.events)

            # retrieve count: the last entry of the scanned array
            record.count = int(starts[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_buffers = []
        for name, dtype in self.list_names_and_dtypes:
            shares_count = name in self.count_sharing

            if shares_count:
                source = result[self.count_sharing[name]]
                record = result[name] = BuiltList(
                        count=source.count,
                        starts=source.starts)
            else:
                record = result[name]

            record.lists = cl.array.empty(
                    queue, record.count, dtype, allocator=allocator)
            write_buffers.append(record.lists.data)

            # Only lists that own their starts array pass it to the write
            # kernel.
            if not shares_count:
                write_buffers.append(record.starts.data)

        # }}}

        write_args = tuple(write_buffers) + args + (n_objects,)
        evt = write_kernel(queue, gsize, lsize, *write_args,
                wait_for=scan_events)

        return result, evt
예제 #10
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """Run the count, scan and write kernels and return the built lists.

        :arg args: arguments corresponding to arg_decls in the constructor.
            :class:`pyopencl.array.Array` instances are not allowed directly
            and should be passed as their :attr:`pyopencl.array.Array.data`
            attribute instead.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :returns: a mapping from names to objects which have attributes

            * `count` for the total number of entries in all lists combined
            * `lists` for the array containing all lists.
            * `starts` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i+1*'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.
        :raises TypeError: if any keyword argument other than *allocator*
            is passed.
        """
        # Switch to 64-bit indices once n_objects reaches the int32 maximum.
        int32_max = int(np.iinfo(np.int32).max)
        index_dtype = np.dtype(np.int64 if n_objects >= int32_max else np.int32)

        allocator = kwargs.pop("allocator", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" %
                            ", ".join(kwargs))

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)

        result = {}

        # {{{ allocate memory for counts

        count_buffers = []
        for name, _dtype in self.list_names_and_dtypes:
            # Count-sharing lists reuse another list's counts; they are
            # filled in further down.
            if name in self.count_sharing:
                continue

            counts = cl.array.empty(
                queue, n_objects + 1, index_dtype, allocator=allocator)

            # The scan below turns this "counts" array into the "starts"
            # array in-place.
            result[name] = BuiltList(starts=counts)
            count_buffers.append(counts.data)

        # }}}

        # {{{ pick work sizes

        if self.debug:
            gsize = lsize = (1, )
        elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
            gsize = (4 * queue.device.max_compute_units, )
            lsize = (1, )
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        # }}}

        count_args = tuple(count_buffers) + args + (n_objects, )
        count_kernel(queue, gsize, lsize, *count_args)

        # {{{ run scans

        for name, _dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue

            record = result[name]
            starts = record.starts
            scan_kernel(starts)

            # set first entry to zero
            cl.enqueue_copy(queue, starts.data, index_dtype.type(0))

            # retrieve count: copy the entry at index n_objects (addressed
            # by its byte offset) into a zero-dimensional host array
            total = np.array(1, index_dtype)
            last_entry_offset = index_dtype.itemsize * n_objects
            cl.enqueue_copy(queue, total, starts.data,
                            device_offset=last_entry_offset)

            record.count = int(total)

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_buffers = []
        for name, dtype in self.list_names_and_dtypes:
            shares_count = name in self.count_sharing

            if shares_count:
                source = result[self.count_sharing[name]]
                record = result[name] = BuiltList(
                    count=source.count,
                    starts=source.starts)
            else:
                record = result[name]

            record.lists = cl.array.empty(
                queue, record.count, dtype, allocator=allocator)
            write_buffers.append(record.lists.data)

            # Only lists that own their starts array pass it to the write
            # kernel.
            if not shares_count:
                write_buffers.append(record.starts.data)

        # }}}

        write_args = tuple(write_buffers) + args + (n_objects, )
        write_kernel(queue, gsize, lsize, *write_args)

        return result