Example #1
def identify_affine_map(from_points, to_points):
    """Return an affine map that maps *from_points[i]* to *to_points[i]*.
    For an n-dimensional affine map, n+1 points are needed.
    """

    from pytools import single_valued
    dim = single_valued([
        single_valued(len(fp) for fp in from_points),
        single_valued(len(tp) for tp in to_points)
    ])

    if dim == 0:
        return AffineMap(numpy.zeros((0, 0), dtype=numpy.float64),
                         numpy.zeros((0, ), dtype=numpy.float64))

    if len(from_points) != dim + 1 or len(to_points) != dim + 1:
        raise ValueError("need dim+1 points to identify an affine map")

    # columns contain points
    x_mat = numpy.array(from_points).T
    y_mat = numpy.array(to_points).T

    # We are trying to solve
    # a*x_i + b = y_i
    # for a and b.  To eliminate b, subtract equation (i+1) from equation i,
    # then chop the last column.
    xdiff_mat = (x_mat - numpy.roll(x_mat, -1, axis=1))[:, :dim]
    ydiff_mat = (y_mat - numpy.roll(y_mat, -1, axis=1))[:, :dim]

    from hedge.tools.linalg import leftsolve
    a = numpy.asarray(leftsolve(xdiff_mat, ydiff_mat), order="C")
    b = to_points[0] - numpy.dot(a, from_points[0])

    return AffineMap(a, b)
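
Every example on this page leans on pytools.single_valued, which returns the one value shared by all entries of an iterable and complains if the iterable is empty or the entries disagree. A minimal illustration (the exact exception types are a pytools implementation detail):

from pytools import single_valued

assert single_valued([7, 7, 7]) == 7
assert single_valued(len(row) for row in [(1, 2), (3, 4)]) == 2

# an empty iterable or disagreeing entries are an error, e.g.
# single_valued([]) and single_valued([1, 2]) are both rejected
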
Example #2
def _bmat(blocks, dtypes):
    from pytools import single_valued
    from pytential.symbolic.matrix import is_zero

    nrows = blocks.shape[0]
    ncolumns = blocks.shape[1]

    # "block row starts"/"block column starts"
    brs = np.cumsum([0]
            + [single_valued(blocks[ibrow, ibcol].shape[0]
                             for ibcol in range(ncolumns)
                             if not is_zero(blocks[ibrow, ibcol]))
             for ibrow in range(nrows)])

    bcs = np.cumsum([0]
            + [single_valued(blocks[ibrow, ibcol].shape[1]
                             for ibrow in range(nrows)
                             if not is_zero(blocks[ibrow, ibcol]))
             for ibcol in range(ncolumns)])

    result = np.zeros((brs[-1], bcs[-1]),
                      dtype=np.find_common_type(dtypes, []))
    for ibcol in range(ncolumns):
        for ibrow in range(nrows):
            result[brs[ibrow]:brs[ibrow + 1], bcs[ibcol]:bcs[ibcol + 1]] = \
                    blocks[ibrow, ibcol]

    return result
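
The brs/bcs arrays above are cumulative sums of block-row heights and block-column widths, which turn block indices into element offsets in the assembled matrix. A self-contained sketch of the same bookkeeping for purely dense blocks (without the zero-block handling provided by pytential's is_zero):

import numpy as np

blocks = [[np.ones((2, 3)), np.zeros((2, 1))],
          [np.full((4, 3), 2.0), np.ones((4, 1))]]

# block row/column start offsets, mirroring brs/bcs above
brs = np.cumsum([0] + [row[0].shape[0] for row in blocks])   # [0, 2, 6]
bcs = np.cumsum([0] + [blk.shape[1] for blk in blocks[0]])   # [0, 3, 4]

result = np.zeros((brs[-1], bcs[-1]))
for i, row in enumerate(blocks):
    for j, blk in enumerate(row):
        result[brs[i]:brs[i + 1], bcs[j]:bcs[j + 1]] = blk

assert result.shape == (6, 4)
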
Example #3
    def map_ref_diff_op_binding(self, expr, codegen_state):
        try:
            return self.expr_to_var[expr]
        except KeyError:
            all_diffs = [diff
                    for diff in self.diff_ops
                    if diff.op.equal_except_for_axis(expr.op)
                    and diff.field == expr.field]

            names = [self.name_gen("expr") for d in all_diffs]

            from pytools import single_valued
            op_class = single_valued(type(d.op) for d in all_diffs)

            codegen_state.get_code_list(self).append(
                    DiffBatchAssign(
                        names=names,
                        op_class=op_class,
                        operators=[d.op for d in all_diffs],
                        field=self.rec(
                            single_valued(d.field for d in all_diffs),
                            codegen_state)))

            from pymbolic import var
            for n, d in zip(names, all_diffs):
                self.expr_to_var[d] = var(n)

            return self.expr_to_var[expr]
Example #4
    def map_ref_diff_op_binding(self, expr):
        try:
            return self.expr_to_var[expr]
        except KeyError:
            all_diffs = [
                diff for diff in self.diff_ops
                if diff.op.equal_except_for_axis(expr.op)
                and diff.field == expr.field
            ]

            names = [self.get_var_name() for d in all_diffs]

            from pytools import single_valued
            op_class = single_valued(type(d.op) for d in all_diffs)

            from hedge.optemplate.operators import \
                    ReferenceQuadratureStiffnessTOperator
            # op_class is a type object, so test it with issubclass
            if issubclass(op_class, ReferenceQuadratureStiffnessTOperator):
                assign_class = QuadratureDiffBatchAssign
            else:
                assign_class = DiffBatchAssign

            self.code.append(
                assign_class(names=names,
                             op_class=op_class,
                             operators=[d.op for d in all_diffs],
                             field=self.rec(
                                 single_valued(d.field for d in all_diffs)),
                             dep_mapper_factory=self.dep_mapper_factory))

            from pymbolic import var
            for n, d in zip(names, all_diffs):
                self.expr_to_var[d] = var(n)

            return self.expr_to_var[expr]
Example #5
    def map_ref_diff_op_binding(self, expr):
        try:
            return self.expr_to_var[expr]
        except KeyError:
            all_diffs = [diff
                    for diff in self.diff_ops
                    if diff.op.equal_except_for_axis(expr.op)
                    and diff.field == expr.field]

            names = [self.get_var_name() for d in all_diffs]

            from pytools import single_valued
            op_class = single_valued(type(d.op) for d in all_diffs)

            from hedge.optemplate.operators import \
                    ReferenceQuadratureStiffnessTOperator
            # op_class is a type object, so test it with issubclass
            if issubclass(op_class, ReferenceQuadratureStiffnessTOperator):
                assign_class = QuadratureDiffBatchAssign
            else:
                assign_class = DiffBatchAssign

            self.code.append(
                    assign_class(
                        names=names,
                        op_class=op_class,
                        operators=[d.op for d in all_diffs],
                        field=self.rec(
                            single_valued(d.field for d in all_diffs)),
                        dep_mapper_factory=self.dep_mapper_factory))

            from pymbolic import var
            for n, d in zip(names, all_diffs):
                self.expr_to_var[d] = var(n)

            return self.expr_to_var[expr]
Example #6
def _bmat(blocks, dtypes):
    from pytools import single_valued
    from pytential.symbolic.matrix import is_zero

    nrows = blocks.shape[0]
    ncolumns = blocks.shape[1]

    # "block row starts"/"block column starts"
    brs = np.cumsum([0] + [
        single_valued(blocks[ibrow, ibcol].shape[0]
                      for ibcol in range(ncolumns)
                      if not is_zero(blocks[ibrow, ibcol]))
        for ibrow in range(nrows)
    ])

    bcs = np.cumsum([0] + [
        single_valued(blocks[ibrow, ibcol].shape[1] for ibrow in range(nrows)
                      if not is_zero(blocks[ibrow, ibcol]))
        for ibcol in range(ncolumns)
    ])

    result = np.zeros((brs[-1], bcs[-1]),
                      dtype=np.find_common_type(dtypes, []))
    for ibcol in range(ncolumns):
        for ibrow in range(nrows):
            result[brs[ibrow]:brs[ibrow + 1], bcs[ibcol]:bcs[ibcol + 1]] = \
                    blocks[ibrow, ibcol]

    return result
Example #7
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields={}):
    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
Example #8
def make_superblocks(devdata, struct_name, single_item, multi_item, extra_fields={}):
    from hedge.backends.cuda.tools import pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )
Example #9
def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = dest_indices.context
    queue = queue or dest_indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [
            Array(context,
                  dest_shape,
                  a_dtype,
                  allocator=a_allocator,
                  queue=queue) for i in range(vec_count)
        ]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if len(dest_indices.shape) != 1:
        raise ValueError("dest_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_put_kernel(context,
                                         a_dtype,
                                         dest_indices.dtype,
                                         vec_count=chunk_size)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        gs, ls = dest_indices.get_sizes(
            queue,
            knl.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE,
                                    queue.device))

        knl(
            queue, gs, ls,
            *([o.data for o in out[chunk_slice]] + [dest_indices.data] +
              [i.data for i in arrays[chunk_slice]] + [dest_indices.size]))

    return out
Example #10
def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
    if not len(arrays):
        return []

    from pytools import single_valued

    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = dest_indices.context
    queue = queue or dest_indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, dest_shape, a_dtype, allocator=a_allocator, queue=queue) for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if len(dest_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_put_kernel(a_dtype, dest_indices.dtype, vec_count=chunk_size)
        knl.set_block_shape(*dest_indices._block)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        gs, ls = dest_indices.get_sizes(
            queue, knl.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)
        )

        knl(
            queue,
            gs,
            ls,
            *(
                [o.data for o in out[chunk_slice]]
                + [dest_indices.data]
                + [i.data for i in arrays[chunk_slice]]
                + [dest_indices.size]
            )
        )

    return out
Example #11
    def index_list_backend(self, ilists):
        from pytools import single_valued

        ilist_length = single_valued(len(il) for il in ilists)
        assert ilist_length == self.plan.dofs_per_face

        from cgen import Typedef, POD

        from pytools import flatten

        flat_ilists_uncast = numpy.array(list(flatten(ilists)))

        if numpy.max(flat_ilists_uncast) >= 256:
            tp = numpy.uint16
        else:
            tp = numpy.uint8

        flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
        assert (flat_ilists == flat_ilists_uncast).all()

        return GPUIndexLists(
            type=tp,
            code=[Typedef(POD(tp, "index_list_entry_t"))],
            device_memory=cuda.to_device(flat_ilists),
            bytes=flat_ilists.size * flat_ilists.itemsize,
        )
Example #12
    def exec_diff_batch_assign(self, insn):
        field = self.rec(insn.field)

        discr = self.executor.discr
        if discr.instrumented:
            discr.diff_counter.add(discr.dimensions)
            discr.diff_flop_counter.add(discr.dimensions*(
                self.executor.diff_rst_flops + self.executor.diff_rescale_one_flops))

        repr_op = insn.operators[0]

        from hedge.optemplate.operators import \
                ReferenceQuadratureStiffnessTOperator
        if isinstance(repr_op, ReferenceQuadratureStiffnessTOperator):
            eg, = discr.element_groups
            from pytools import single_valued
            q_info = discr.get_cuda_elgroup_quadrature_info(
                    eg, single_valued(op.quadrature_tag for op in insn.operators))

            kernel = discr.diff_kernel(
                aligned_preimage_dofs_per_microblock
                =q_info.aligned_dofs_per_microblock,
                preimage_dofs_per_el=q_info.ldis_quad_info.node_count())

            rst_diff = kernel(repr_op, field)
        else:
            rst_diff = self.executor.diff_kernel(repr_op, field)

        return [(name, rst_diff[op.rst_axis])
                for name, op in zip(insn.names, insn.operators)], []
Example #13
    def __call__(self, evaluate_subexpr, stats_callback=None):
        vectors = [evaluate_subexpr(vec_expr) 
                for vec_expr in self.vector_deps]
        scalars = [evaluate_subexpr(scal_expr) 
                for scal_expr in self.scalar_deps]

        from pytools import single_valued
        shape = single_valued(vec.shape for vec in vectors)

        kernel_rec = self.get_kernel(
                tuple(v.dtype for v in vectors),
                tuple(s.dtype for s in scalars))

        results = [numpy.empty(shape, kernel_rec.result_dtype)
                for vei in self.result_vec_expr_info_list]

        size = results[0].size
        args = (results+vectors+scalars)

        if stats_callback is not None:
            timer = stats_callback(size, self)
            sub_timer = timer.start_sub_timer()
            kernel_rec.kernel(*args)
            sub_timer.stop().submit()
        else:
            kernel_rec.kernel(*args)

        return results
Example #14
def nd_quad_submesh(node_tuples):
    """Return a list of tuples of indices into the node list that
    generate a tessellation of the reference element.

    :arg node_tuples: A list of tuples *(i, j, ...)* of integers
        indicating node positions inside the unit element. The
        returned list references indices in this list.

        :func:`pytools.generate_nonnegative_integer_tuples_below`
        may be used to generate *node_tuples*.

    See also :func:`modepy.tools.simplex_submesh`.
    """

    from pytools import single_valued, add_tuples
    dims = single_valued(len(nt) for nt in node_tuples)

    node_dict = dict(
            (ituple, idx)
            for idx, ituple in enumerate(node_tuples))

    from pytools import generate_nonnegative_integer_tuples_below as gnitb

    result = []
    for current in node_tuples:
        try:
            result.append(tuple(
                    node_dict[add_tuples(current, offset)]
                    for offset in gnitb(2, dims)))

        except KeyError:
            pass

    return result
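
A quick sanity check of the helper above, assuming pytools is importable: a 3x3 grid of corner nodes tessellates into a 2x2 grid of quadrilateral cells, each referencing four node indices.

from pytools import generate_nonnegative_integer_tuples_below as gnitb

node_tuples = list(gnitb(3, 2))        # the 9 corners of a 2x2 quad grid
cells = nd_quad_submesh(node_tuples)

assert len(cells) == 4                 # one tuple per sub-quadrilateral
assert all(len(cell) == 4 for cell in cells)
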
Example #15
    def _vis_connectivity(self):
        """
        :return: an array of shape
            ``(vis_discr.nelements,nsubelements,primitive_element_size)``
        """
        # Assume that we're using modepy's default node ordering.

        from pytools import generate_nonnegative_integer_tuples_summing_to_at_most \
                as gnitstam, single_valued
        vis_order = single_valued(
                group.order for group in self.vis_discr.groups)
        node_tuples = list(gnitstam(vis_order, self.vis_discr.dim))

        from modepy.tools import submesh
        el_connectivity = np.array(
                submesh(node_tuples),
                dtype=np.intp)

        nelements = sum(group.nelements for group in self.vis_discr.groups)
        vis_connectivity = np.empty(
                (nelements,) + el_connectivity.shape, dtype=np.intp)

        el_nr_base = 0
        for group in self.vis_discr.groups:
            assert len(node_tuples) == group.nunit_nodes
            vis_connectivity[el_nr_base:el_nr_base+group.nelements] = (
                    np.arange(
                        el_nr_base*group.nunit_nodes,
                        (el_nr_base+group.nelements)*group.nunit_nodes,
                        group.nunit_nodes
                        )[:, np.newaxis, np.newaxis]
                    + el_connectivity)
            el_nr_base += group.nelements

        return vis_connectivity
Example #16
def nd_quad_submesh(node_tuples):
    """Return a list of tuples of indices into the node list that
    generate a tessellation of the reference element.

    :arg node_tuples: A list of tuples *(i, j, ...)* of integers
        indicating node positions inside the unit element. The
        returned list references indices in this list.

        :func:`pytools.generate_nonnegative_integer_tuples_below`
        may be used to generate *node_tuples*.

    See also :func:`modepy.tools.simplex_submesh`.
    """

    from pytools import single_valued, add_tuples
    dims = single_valued(len(nt) for nt in node_tuples)

    node_dict = dict((ituple, idx) for idx, ituple in enumerate(node_tuples))

    from pytools import generate_nonnegative_integer_tuples_below as gnitb

    result = []
    for current in node_tuples:
        try:
            result.append(
                tuple(node_dict[add_tuples(current, offset)]
                      for offset in gnitb(2, dims)))

        except KeyError:
            pass

    return result
Example #17
    def __call__(self, evaluate_subexpr, stats_callback=None):
        vectors = [evaluate_subexpr(vec_expr) for vec_expr in self.vector_deps]
        scalars = [
            evaluate_subexpr(scal_expr) for scal_expr in self.scalar_deps
        ]

        from pytools import single_valued
        shape = single_valued(vec.shape for vec in vectors)

        kernel_rec = self.get_kernel(tuple(v.dtype for v in vectors),
                                     tuple(s.dtype for s in scalars))

        results = [
            gpuarray.empty(shape, kernel_rec.result_dtype, self.allocator)
            for expr in self.result_vec_expr_info_list
        ]

        size = results[0].size
        args = ([r.gpudata
                 for r in results] + [v.gpudata
                                      for v in vectors] + scalars + [size])

        if stats_callback is not None:
            stats_callback(
                size, self,
                kernel_rec.kernel.prepared_timed_call(vectors[0]._grid,
                                                      results[0]._block,
                                                      *args))
        else:
            kernel_rec.kernel.prepared_async_call(vectors[0]._grid,
                                                  results[0]._block,
                                                  self.stream, *args)

        return results
Example #18
    def __call__(self, evaluate_subexpr, stats_callback=None):
        vectors = [evaluate_subexpr(vec_expr)
                for vec_expr in self.vector_deps]
        scalars = [evaluate_subexpr(scal_expr)
                for scal_expr in self.scalar_deps]

        from pytools import single_valued
        shape = single_valued(vec.shape for vec in vectors)

        kernel_rec = self.get_kernel(
                tuple(v.dtype for v in vectors),
                tuple(s.dtype for s in scalars))

        results = [gpuarray.empty(
            shape, kernel_rec.result_dtype, self.allocator)
            for expr in self.result_vec_expr_info_list]

        size = results[0].size
        args = ([r.gpudata for r in results]
                + [v.gpudata for v in vectors]
                + scalars
                + [size])

        if stats_callback is not None:
            stats_callback(size, self,
                    kernel_rec.kernel.prepared_timed_call(vectors[0]._grid, results[0]._block, *args))
        else:
            kernel_rec.kernel.prepared_async_call(vectors[0]._grid, results[0]._block, self.stream, *args)

        return results
Example #19
    def exec_diff_batch_assign(self, insn):
        field = self.rec(insn.field)

        discr = self.executor.discr
        if discr.instrumented:
            discr.diff_counter.add(discr.dimensions)
            discr.diff_flop_counter.add(discr.dimensions *
                                        (self.executor.diff_rst_flops +
                                         self.executor.diff_rescale_one_flops))

        repr_op = insn.operators[0]

        from hedge.optemplate.operators import \
                ReferenceQuadratureStiffnessTOperator
        if isinstance(repr_op, ReferenceQuadratureStiffnessTOperator):
            eg, = discr.element_groups
            from pytools import single_valued
            q_info = discr.get_cuda_elgroup_quadrature_info(
                eg, single_valued(op.quadrature_tag for op in insn.operators))

            kernel = discr.diff_kernel(
                aligned_preimage_dofs_per_microblock=q_info.
                aligned_dofs_per_microblock,
                preimage_dofs_per_el=q_info.ldis_quad_info.node_count())

            rst_diff = kernel(repr_op, field)
        else:
            rst_diff = self.executor.diff_kernel(repr_op, field)

        return [(name, rst_diff[op.rst_axis])
                for name, op in zip(insn.names, insn.operators)], []
Example #20
    def _vis_connectivity(self):
        """
        :return: an array of shape
            ``(vis_discr.nelements,nsubelements,primitive_element_size)``
        """
        # Assume that we're using modepy's default node ordering.

        from pytools import generate_nonnegative_integer_tuples_summing_to_at_most \
                as gnitstam, single_valued
        vis_order = single_valued(group.order
                                  for group in self.vis_discr.groups)
        node_tuples = list(gnitstam(vis_order, self.vis_discr.dim))

        from modepy.tools import submesh
        el_connectivity = np.array(submesh(node_tuples), dtype=np.intp)

        nelements = sum(group.nelements for group in self.vis_discr.groups)
        vis_connectivity = np.empty((nelements, ) + el_connectivity.shape,
                                    dtype=np.intp)

        el_nr_base = 0
        for group in self.vis_discr.groups:
            assert len(node_tuples) == group.nunit_nodes
            vis_connectivity[el_nr_base:el_nr_base + group.nelements] = (
                np.arange(el_nr_base * group.nunit_nodes,
                          (el_nr_base + group.nelements) * group.nunit_nodes,
                          group.nunit_nodes)[:, np.newaxis, np.newaxis] +
                el_connectivity)
            el_nr_base += group.nelements

        return vis_connectivity
Example #21
    def __call__(self, evaluate_subexpr, stats_callback=None):
        vectors = [evaluate_subexpr(vec_expr) for vec_expr in self.vector_deps]
        scalars = [
            evaluate_subexpr(scal_expr) for scal_expr in self.scalar_deps
        ]

        from pytools import single_valued
        shape = single_valued(vec.shape for vec in vectors)

        kernel_rec = self.get_kernel(tuple(v.dtype for v in vectors),
                                     tuple(s.dtype for s in scalars))

        results = [
            numpy.empty(shape, kernel_rec.result_dtype)
            for vei in self.result_vec_expr_info_list
        ]

        size = results[0].size
        args = (results + vectors + scalars)

        if stats_callback is not None:
            timer = stats_callback(size, self)
            sub_timer = timer.start_sub_timer()
            kernel_rec.kernel(*args)
            sub_timer.stop().submit()
        else:
            kernel_rec.kernel(*args)

        return results
Example #22
def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [
            GPUArray(dest_shape, a_dtype, a_allocator)
            for i in range(vec_count)
        ]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if len(dest_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        func = elementwise.get_put_kernel(a_dtype,
                                          dest_indices.dtype,
                                          vec_count=chunk_size)
        func.set_block_shape(*dest_indices._block)
        return func

    func = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            func = make_func_for_chunk_size(vec_count - start_i)

        func.prepared_async_call(
            dest_indices._grid, stream, dest_indices.gpudata,
            *([o.gpudata for o in out[chunk_slice]] +
              [i.gpudata for i in arrays[chunk_slice]] + [dest_indices.size]))

    return out
Example #23
def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [GPUArray(dest_shape, a_dtype, a_allocator)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if len(dest_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        func = elementwise.get_put_kernel(
                a_dtype, dest_indices.dtype, vec_count=chunk_size)
        func.set_block_shape(*dest_indices._block)
        return func

    func = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i+chunk_size)

        if start_i + chunk_size > vec_count:
            func = make_func_for_chunk_size(vec_count-start_i)

        func.prepared_async_call(dest_indices._grid, stream,
                dest_indices.gpudata, 
                *([o.gpudata for o in out[chunk_slice]]
                    + [i.gpudata for i in arrays[chunk_slice]]
                    + [dest_indices.size]))

    return out
Example #24
    def __init__(self, ctx, expansions, strength_usage=None,
            value_dtypes=None,
            options=[], name="layerpot", device=None):
        KernelComputation.__init__(self, ctx, expansions, strength_usage,
                value_dtypes,
                name, options, device)

        from pytools import single_valued
        self.dim = single_valued(knl.dim for knl in self.expansions)
Example #25
    def order(self):
        from warnings import warn
        warn("DGDiscretizationWithBoundaries.order is deprecated, "
                "consider the orders of element groups instead. "
                "'order' will go away in 2021.",
                DeprecationWarning, stacklevel=2)

        from pytools import single_valued
        return single_valued(egrp.order for egrp in self._volume_discr.groups)
Example #26
    def make_flux_batch_assign(self, names, expressions, repr_op):
        from pytools import single_valued
        quadrature_tag = single_valued(
                wdflux.quadrature_tag
                for wdflux in expressions)

        return CUDAFluxBatchAssign(names=names, expressions=expressions, repr_op=repr_op,
                dep_mapper_factory=self.dep_mapper_factory,
                quadrature_tag=quadrature_tag)
Example #27
    def map_int_g(self, expr, name_hint=None):
        try:
            return self.expr_to_var[expr]
        except KeyError:
            # make sure operator assignments stand alone and don't get muddled
            # up in vector arithmetic
            density_var = self.assign_to_new_var(self.rec(expr.density))

            group = self.group_to_operators[self.op_group_features(expr)]
            names = [self.get_var_name() for op in group]

            kernel_to_index = {}
            kernels = []
            for op in group:
                if op.kernel not in kernel_to_index:
                    kernel_to_index[op.kernel] = len(kernels)
                    kernels.append(op.kernel)

            from pytools import single_valued
            from sumpy.kernel import AxisTargetDerivativeRemover
            atdr = AxisTargetDerivativeRemover()
            base_kernel = single_valued(
                    atdr(kernel) for kernel in kernels)

            for op in group:
                assert op.qbx_forced_limit in [-1, 0, 1]

            kernel_arguments = dict(
                    (arg_name, self.rec(arg_val))
                    for arg_name, arg_val in six.iteritems(expr.kernel_arguments))

            outputs = [
                    LayerPotentialOutput(
                        name=name,
                        kernel_index=kernel_to_index[op.kernel],
                        target_name=op.target,
                        qbx_forced_limit=op.qbx_forced_limit,
                        )
                    for name, op in zip(names, group)
                    ]

            self.code.append(
                    LayerPotentialInstruction(
                        outputs=outputs,
                        kernels=tuple(kernels),
                        kernel_arguments=kernel_arguments,
                        base_kernel=base_kernel,
                        density=density_var,
                        source=expr.source,
                        priority=max(getattr(op, "priority", 0) for op in group),
                        dep_mapper_factory=self.dep_mapper_factory))

            from pymbolic.primitives import Variable
            for name, group_expr in zip(names, group):
                self.expr_to_var[group_expr] = Variable(name)

            return self.expr_to_var[expr]
Example #28
    def make_flux_batch_assign(self, names, expressions, repr_op):
        from pytools import single_valued
        quadrature_tag = single_valued(wdflux.quadrature_tag
                                       for wdflux in expressions)

        return CUDAFluxBatchAssign(names=names,
                                   expressions=expressions,
                                   repr_op=repr_op,
                                   dep_mapper_factory=self.dep_mapper_factory,
                                   quadrature_tag=quadrature_tag)
Example #29
def multi_take(arrays, indices, out=None, queue=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = indices.context
    queue = queue or indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [
            Array(context,
                  queue,
                  indices.shape,
                  a_dtype,
                  allocator=a_allocator) for i in range(vec_count)
        ]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_take_kernel(indices.context,
                                          a_dtype,
                                          indices.dtype,
                                          vec_count=chunk_size)
        knl.set_block_shape(*indices._block)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        gs, ls = indices.get_sizes(
            queue,
            knl.get_work_group_info(cl.kernel_work_group_info.WORK_GROUP_SIZE,
                                    queue.device))

        knl(
            queue, gs, ls, indices.data,
            *([o.data for o in out[chunk_slice]] +
              [i.data for i in arrays[chunk_slice]] + [indices.size]))

    return out
Example #30
    def __call__(self, *args):
        from pytools import indices_in_shape, single_valued

        oa_shape = single_valued(ary.shape for fac, ary in args)
        result = numpy.zeros(oa_shape, dtype=object)

        for i in indices_in_shape(oa_shape):
            args_i = [(fac, ary[i]) for fac, ary in args]
            result[i] = self.scalar_kernel(*args_i)

        return result
Example #31
    def __call__(self, *args):
        from pytools import indices_in_shape, single_valued

        oa_shape = single_valued(ary.shape for fac, ary in args)
        result = numpy.zeros(oa_shape, dtype=object)

        for i in indices_in_shape(oa_shape):
            args_i = [(fac, ary[i]) for fac, ary in args]
            result[i] = self.scalar_kernel(*args_i)

        return result
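
Here pytools.indices_in_shape enumerates every multi-index of the object-array shape (much like numpy.ndindex), so the scalar kernel gets applied entry by entry:

from pytools import indices_in_shape

# all index tuples of a 2x2 object array
assert sorted(indices_in_shape((2, 2))) == [(0, 0), (0, 1), (1, 0), (1, 1)]
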
Example #32
    def map_int_g(self, expr, name_hint=None):
        try:
            return self.expr_to_var[expr]
        except KeyError:
            # make sure operator assignments stand alone and don't get muddled
            # up in vector arithmetic
            density_var = self.assign_to_new_var(self.rec(expr.density))

            group = self.group_to_operators[self.op_group_features(expr)]
            names = [self.get_var_name() for op in group]

            kernels = sorted({op.kernel for op in group}, key=repr)

            kernel_to_index = {kernel: i for i, kernel in enumerate(kernels)}

            from pytools import single_valued
            from sumpy.kernel import AxisTargetDerivativeRemover
            atdr = AxisTargetDerivativeRemover()
            base_kernel = single_valued(atdr(kernel) for kernel in kernels)

            for op in group:
                assert op.qbx_forced_limit in [-2, -1, None, 1, 2]

            kernel_arguments = {
                arg_name: self.rec(arg_val)
                for arg_name, arg_val in expr.kernel_arguments.items()
            }

            outputs = [
                PotentialOutput(
                    name=name,
                    kernel_index=kernel_to_index[op.kernel],
                    target_name=op.target,
                    qbx_forced_limit=op.qbx_forced_limit,
                ) for name, op in zip(names, group)
            ]

            self.code.append(
                ComputePotentialInstruction(
                    outputs=outputs,
                    kernels=tuple(kernels),
                    kernel_arguments=kernel_arguments,
                    base_kernel=base_kernel,
                    density=density_var,
                    source=expr.source,
                    priority=max(getattr(op, "priority", 0) for op in group),
                    dep_mapper_factory=self.dep_mapper_factory))

            from pymbolic.primitives import Variable
            for name, group_expr in zip(names, group):
                self.expr_to_var[group_expr] = Variable(name)

            return self.expr_to_var[expr]
Example #33
    def __init__(self,
                 ctx,
                 expansions,
                 strength_usage=None,
                 value_dtypes=None,
                 name=None,
                 device=None):
        KernelComputation.__init__(self, ctx, expansions, strength_usage,
                                   value_dtypes, name, device)

        from pytools import single_valued
        self.dim = single_valued(knl.dim for knl in self.expansions)
Example #34
    def find_index_rank(self, name):
        irf = IndexRankFinder(name)

        for insn in self.instructions:
            insn.with_transformed_expressions(
                    lambda expr: irf(self.submap(expr)))

        if not irf.index_ranks:
            return 0
        else:
            from pytools import single_valued
            return single_valued(irf.index_ranks)
Example #35
    def get_or_register_dtype(self, c_names, dtype=None):
        """Get or register a :class:`numpy.dtype` associated with the C type names
        in the string list *c_names*. If *dtype* is `None`, no registration is
        performed, and the :class:`numpy.dtype` must already have been registered.
        If so, it is returned.  If not, :exc:`TypeNameNotKnown` is raised.

        If *dtype* is not `None`, registration is attempted. If the *c_names* are
        already known and registered to identical :class:`numpy.dtype` objects,
        then the dtype object of the previously registered type is
        returned. If the *c_names* are not yet known, the type is registered. If
        one of the *c_names* is known but registered to a different type, an error
        is raised. In this latter case, the type may end up partially registered
        and any further behavior is undefined.

        .. versionadded:: 2012.2
        """

        if isinstance(c_names, str):
            c_names = [c_names]

        if dtype is None:
            from pytools import single_valued
            return single_valued(self.name_to_dtype[name] for name in c_names)

        dtype = np.dtype(dtype)

        # check if we've seen an identical dtype, if so retrieve exact dtype object.
        try:
            existing_name = self.dtype_to_name[dtype]
        except KeyError:
            existed = False
        else:
            existed = True
            existing_dtype = self.name_to_dtype[existing_name]
            assert existing_dtype == dtype
            dtype = existing_dtype

        for nm in c_names:
            try:
                name_dtype = self.name_to_dtype[nm]
            except KeyError:
                self.name_to_dtype[nm] = dtype
            else:
                if name_dtype != dtype:
                    raise RuntimeError("name '%s' already registered to "
                            "different dtype" % nm)

        if not existed:
            self.dtype_to_name[dtype] = c_names[0]
        if not str(dtype) in self.dtype_to_name:
            self.dtype_to_name[str(dtype)] = c_names[0]

        return dtype
Example #36
def join_conserved(dim, mass, energy, momentum):
    """Create an agglomerated solution array from the conserved quantities."""
    from pytools import single_valued
    aux_shape = single_valued([
        _aux_shape(mass, ()),
        _aux_shape(energy, ()),
        _aux_shape(momentum, (dim,))])

    result = np.zeros((2+dim,) + aux_shape, dtype=object)
    result[0] = mass
    result[1] = energy
    result[2:] = momentum
    return result
Example #37
    def __call__(self, array_context, *args):
        func_name = self.identifier
        from pytools import single_valued
        if single_valued(should_use_numpy(arg) for arg in args):
            func = getattr(np, func_name)
            return func(*args)

        if func_name == "fabs":  # FIXME
            # Loopy has a type-adaptive "abs", but no "fabs".
            func_name = "abs"

        sfunc = getattr(array_context.np, func_name)
        return sfunc(*args)
Example #38
def multi_take(arrays, indices, out=None, queue=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = indices.context
    queue = queue or indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, queue, indices.shape, a_dtype,
            allocator=a_allocator)
                for i in range(vec_count)]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_take_kernel(
                indices.context, a_dtype, indices.dtype,
                vec_count=chunk_size)
        knl.set_block_shape(*indices._block)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i+chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count-start_i)

        gs, ls = indices.get_sizes(queue,
                knl.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE,
                    queue.device))

        knl(queue, gs, ls,
                indices.data,
                *([o.data for o in out[chunk_slice]]
                    + [i.data for i in arrays[chunk_slice]]
                    + [indices.size]))

    return out
Example #39
    def gpu_diffmats(self, diff_op_cls, elgroup):
        discr = self.discr
        given = self.plan.given

        columns = given.dofs_per_el() * discr.dimensions
        additional_columns = 0
        # avoid smem fetch bank conflicts by ensuring odd col count
        if columns % 2 == 0:
            columns += 1
            additional_columns += 1

        block_floats = given.devdata.align_dtype(
            columns * self.plan.segment_size, given.float_size())

        vstacked_matrices = [
            numpy.vstack(given.microblock.elements * (m, ))
            for m in diff_op_cls.matrices(elgroup)
        ]

        segments = []

        from pytools import single_valued
        for segment_start in range(
                0, given.microblock.elements * given.dofs_per_el(),
                self.plan.segment_size):
            matrices = [
                m[segment_start:segment_start + self.plan.segment_size]
                for m in vstacked_matrices
            ]

            matrices.append(
                numpy.zeros(
                    (single_valued(m.shape[0]
                                   for m in matrices), additional_columns)))

            diffmats = numpy.asarray(numpy.hstack(matrices),
                                     dtype=given.float_type,
                                     order="C")
            segments.append(buffer(diffmats))

        from hedge.backends.cuda.tools import pad_and_join

        from pytools import Record

        class GPUDifferentiationMatrices(Record):
            pass

        return GPUDifferentiationMatrices(device_memory=cuda.to_device(
            pad_and_join(segments, block_floats * given.float_size())),
                                          block_floats=block_floats,
                                          matrix_columns=columns)
Example #40
    def gpu_diffmats(self, diff_op_cls, elgroup):
        discr = self.discr
        given = self.plan.given

        columns = given.dofs_per_el()*discr.dimensions
        additional_columns = 0
        # avoid smem fetch bank conflicts by ensuring odd col count
        if columns % 2 == 0:
            columns += 1
            additional_columns += 1

        block_floats = given.devdata.align_dtype(
                columns*self.plan.segment_size, given.float_size())

        vstacked_matrices = [
                numpy.vstack(given.microblock.elements*(m,))
                for m in diff_op_cls.matrices(elgroup)
                ]

        segments = []

        from pytools import single_valued
        for segment_start in range(0, given.microblock.elements*given.dofs_per_el(), self.plan.segment_size):
            matrices = [
                m[segment_start:segment_start+self.plan.segment_size]
                for m in vstacked_matrices]

            matrices.append(
                numpy.zeros((single_valued(m.shape[0] for m in matrices),
                    additional_columns))
                )

            diffmats = numpy.asarray(
                    numpy.hstack(matrices),
                    dtype=given.float_type,
                    order="C")
            segments.append(buffer(diffmats))

        from hedge.backends.cuda.tools import pad_and_join

        from pytools import Record
        class GPUDifferentiationMatrices(Record):
            pass

        return GPUDifferentiationMatrices(
                device_memory=cuda.to_device(
                    pad_and_join(segments, block_floats*given.float_size())),
                block_floats=block_floats,
                matrix_columns=columns)
Example #41
def get_or_register_dtype(c_names, dtype=None):
    """Get or register a :class:`numpy.dtype` associated with the C type names in the
    string list *c_names*. If *dtype* is `None`, no registration is performed, and the
    :class:`numpy.dtype` must already have been registered. If so, it is returned.
    If not, :exc:`TypeNameNotKnown` is raised.

    If *dtype* is not `None`, registration is attempted. If the *c_names* are already
    known and registered to identical :class:`numpy.dtype` objects, then the previously
    registered type is returned. Otherwise, the type is registered.

    .. versionadded:: 2012.2
    """

    if isinstance(c_names, str):
        c_names = [c_names]

    if dtype is None:
        from pytools import single_valued
        return single_valued(NAME_TO_DTYPE[name] for name in c_names)

    dtype = np.dtype(dtype)

    # check if we've seen an identical dtype, if so retrieve exact dtype object.
    try:
        existing_name = DTYPE_TO_NAME[dtype]
    except KeyError:
        existed = False
    else:
        existed = True
        existing_dtype = NAME_TO_DTYPE[existing_name]
        assert existing_dtype == dtype
        dtype = existing_dtype

    for nm in c_names:
        try:
            name_dtype = NAME_TO_DTYPE[nm]
        except KeyError:
            NAME_TO_DTYPE[nm] = dtype
        else:
            if name_dtype != dtype:
                raise RuntimeError(
                    "name '%s' already registered to different dtype" % nm)

    if not existed:
        DTYPE_TO_NAME[dtype] = c_names[0]
    if not str(dtype) in DTYPE_TO_NAME:
        DTYPE_TO_NAME[str(dtype)] = c_names[0]

    return dtype
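
A usage sketch, assuming the module-level NAME_TO_DTYPE/DTYPE_TO_NAME registries that the function manipulates exist: register one dtype under two C spellings, then later look it up by name alone.

import numpy as np

get_or_register_dtype(["unsigned", "unsigned int"], np.uint32)

# with dtype=None the names are only looked up, never registered
assert get_or_register_dtype("unsigned int") == np.uint32
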
Example #42
def get_or_register_dtype(c_names, dtype=None):
    """Get or register a :class:`numpy.dtype` associated with the C type names in the
    string list *c_names*. If *dtype* is `None`, no registration is performed, and the
    :class:`numpy.dtype` must already have been registered. If so, it is returned.
    If not, :exc:`TypeNameNotKnown` is raised.

    If *dtype* is not `None`, registration is attempted. If the *c_names* are already
    known and registered to identical :class:`numpy.dtype` objects, then the previously
    registered type is returned. Otherwise, the type is registered.

    .. versionadded:: 2012.2
    """

    if isinstance(c_names, str):
        c_names = [c_names]

    if dtype is None:
        from pytools import single_valued

        return single_valued(NAME_TO_DTYPE[name] for name in c_names)

    dtype = np.dtype(dtype)

    # check if we've seen an identical dtype, if so retrieve exact dtype object.
    try:
        existing_name = DTYPE_TO_NAME[dtype]
    except KeyError:
        existed = False
    else:
        existed = True
        existing_dtype = NAME_TO_DTYPE[existing_name]
        assert existing_dtype == dtype
        dtype = existing_dtype

    for nm in c_names:
        try:
            name_dtype = NAME_TO_DTYPE[nm]
        except KeyError:
            NAME_TO_DTYPE[nm] = dtype
        else:
            if name_dtype != dtype:
                raise RuntimeError("name '%s' already registered to different dtype" % nm)

    if not existed:
        DTYPE_TO_NAME[dtype] = c_names[0]
    if not str(dtype) in DTYPE_TO_NAME:
        DTYPE_TO_NAME[str(dtype)] = c_names[0]

    return dtype
Example #43
    def combine(dtypes):
        # dtypes may just be a generator expr
        dtypes = list(dtypes)

        from loopy.types import LoopyType, NumpyType
        assert all(isinstance(dtype, LoopyType) for dtype in dtypes)

        if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
            from pytools import is_single_valued, single_valued
            if not is_single_valued(dtypes):
                raise TypeInferenceFailure(
                        "Nothing known about operations between '%s'"
                        % ", ".join(str(dt) for dt in dtypes))

            return single_valued(dtypes)

        dtypes = [dtype.dtype for dtype in dtypes]

        result = dtypes.pop()
        while dtypes:
            other = dtypes.pop()

            if result.fields is None and other.fields is None:
                if (result, other) in [
                        (np.int32, np.float32), (np.float32, np.int32)]:
                    # numpy makes this a double. I disagree.
                    result = np.dtype(np.float32)
                else:
                    result = (
                            np.empty(0, dtype=result)
                            + np.empty(0, dtype=other)
                            ).dtype

            elif result.fields is None and other.fields is not None:
                # assume the non-native type takes over
                # (This is used for vector types.)
                result = other
            elif result.fields is not None and other.fields is None:
                # assume the non-native type takes over
                # (This is used for vector types.)
                pass
            else:
                if result is not other:
                    raise TypeInferenceFailure(
                            "nothing known about result of operation on "
                            "'%s' and '%s'" % (result, other))

        return NumpyType(result)
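
The (np.int32, np.float32) special case exists because NumPy's own promotion widens that pair to float64, which the code above deliberately overrides to stay in float32; the NumPy behaviour it reacts to:

import numpy as np

# NumPy promotes int32 arrays combined with float32 arrays to float64
promoted = (np.empty(0, dtype=np.int32) + np.empty(0, dtype=np.float32)).dtype
assert promoted == np.float64
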
Example #44
    def __call__(self, particles, radii, wait_for=None):
        dimensions = len(particles)

        from pytools import single_valued
        coord_dtype = single_valued(coord.dtype for coord in particles)

        if radii is None:
            radii_tuple = ()
        else:
            radii_tuple = (radii,)

        knl = self.get_kernel(dimensions, coord_dtype,
                # have_radii:
                radii is not None)
        return knl(*(tuple(particles) + radii_tuple),
                wait_for=wait_for, return_event=True)
Example #45
def _entry_dtype(ary):
    from meshmode.dof_array import DOFArray
    if isinstance(ary, DOFArray):
        # the "normal case"
        return ary.entry_dtype
    elif isinstance(ary, np.ndarray):
        if ary.dtype.char == "O":
            from pytools import single_valued
            return single_valued(_entry_dtype(entry) for entry in ary.flat)
        else:
            return ary.dtype
    elif isinstance(ary, cl.array.Array):
        # for "unregularized" layer potential sources
        return ary.dtype
    else:
        raise TypeError(f"unexpected type '{type(ary)}' in _entry_dtype")
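
For the object-array branch, single_valued guarantees that every leaf shares a single dtype. A small check of that path, assuming meshmode (imported inside the function) is installed:

import numpy as np

leaves = np.empty(2, dtype=object)
leaves[0] = np.zeros(3, dtype=np.float64)
leaves[1] = np.zeros(5, dtype=np.float64)

assert _entry_dtype(leaves) == np.float64   # differing leaf dtypes would raise
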
Example #46
    def get_lpot_applier(self, target_kernels, source_kernels):
        # needs to be separate method for caching

        if any(knl.is_complex_valued for knl in target_kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from pytools import single_valued
        base_kernel = single_valued(knl.get_base_kernel() for knl in source_kernels)

        from sumpy.qbx import LayerPotential
        return LayerPotential(self.cl_context,
                    expansion=self.get_expansion_for_qbx_direct_eval(
                        base_kernel, target_kernels),
                    target_kernels=target_kernels, source_kernels=source_kernels,
                    value_dtypes=value_dtype)
Example #47
def multi_take(arrays, indices, out=None, stream=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [
            GPUArray(indices.shape, a_dtype, a_allocator)
            for i in range(vec_count)
        ]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 20)

    def make_func_for_chunk_size(chunk_size):
        func, tex_src = elementwise.get_take_kernel(a_dtype,
                                                    indices.dtype,
                                                    vec_count=chunk_size)
        func.set_block_shape(*indices._block)
        return func, tex_src

    func, tex_src = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            func, tex_src = make_func_for_chunk_size(vec_count - start_i)

        for i, a in enumerate(arrays[chunk_slice]):
            a.bind_to_texref_ext(tex_src[i], allow_double_hack=True)

        func.prepared_async_call(
            indices._grid, stream, indices.gpudata,
            *([o.gpudata for o in out[chunk_slice]] + [indices.size]))

    return out
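
The loop above compiles one take-kernel for full chunks of up to 20 vectors and rebuilds it once for the final, shorter remainder. A small arithmetic illustration of the chunking (values invented for the sketch):

# vec_count = 45, chunk_size = 20:
#   start_i = 0   -> arrays[0:20]  use the chunk-size-20 kernel
#   start_i = 20  -> arrays[20:40] reuse the same kernel
#   start_i = 40  -> 40 + 20 > 45, so a kernel for the remaining 5 vectors is built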
Example #48
0
    def get_lpot_applier_on_tgt_subset(self, target_kernels, source_kernels):
        # needs to be separate method for caching

        if any(knl.is_complex_valued for knl in target_kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from pytools import single_valued
        base_kernel = single_valued(knl.get_base_kernel() for knl in source_kernels)

        from pytential.qbx.direct import LayerPotentialOnTargetAndCenterSubset
        from sumpy.expansion.local import VolumeTaylorLocalExpansion
        return LayerPotentialOnTargetAndCenterSubset(
                self.cl_context,
                expansion=VolumeTaylorLocalExpansion(base_kernel, self.qbx_order),
                target_kernels=target_kernels, source_kernels=source_kernels,
                value_dtypes=value_dtype)
Example #49
0
    def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
            value_dtypes=None,
            options=[], name=None, device=None):
        """
        :arg kernels: list of :class:`sumpy.kernel.Kernel` instances
        :arg strength_usage: A list of integers indicating which expression
          uses which source strength indicator. This implicitly specifies the
          number of strength arrays that need to be passed.
          Default: all kernels use the same strength.
        """
        KernelComputation.__init__(self, ctx, kernels, strength_usage,
                value_dtypes,
                name, options, device)

        self.exclude_self = exclude_self

        from pytools import single_valued
        self.dim = single_valued(knl.dim for knl in self.kernels)
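
The strength_usage argument documented above maps each kernel to the index of the strength array it consumes. A hypothetical illustration (kernel names and counts invented for the sketch):

# kernels        = [K0, K1, K2]
# strength_usage = [0, 0, 1]
# K0 and K1 read strength array 0, K2 reads strength array 1, so the caller is
# expected to supply max(strength_usage) + 1 == 2 strength arrays.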
Example #50
0
def multi_take(arrays, indices, out=None, stream=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [GPUArray(indices.shape, a_dtype, a_allocator)
                for i in range(vec_count)]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 20)

    def make_func_for_chunk_size(chunk_size):
        func, tex_src = elementwise.get_take_kernel(a_dtype, indices.dtype, 
                vec_count=chunk_size)
        func.set_block_shape(*indices._block)
        return func, tex_src

    func, tex_src = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i+chunk_size)

        if start_i + chunk_size > vec_count:
            func, tex_src = make_func_for_chunk_size(vec_count-start_i)

        for i, a in enumerate(arrays[chunk_slice]):
            a.bind_to_texref_ext(tex_src[i], allow_double_hack=True)

        func.prepared_async_call(indices._grid, stream, 
                indices.gpudata, 
                *([o.gpudata for o in out[chunk_slice]]
                    + [indices.size]))

    return out
Example #51
0
    def __call__(self, particles, radii, wait_for=None):
        dimensions = len(particles)

        from pytools import single_valued

        coord_dtype = single_valued(coord.dtype for coord in particles)

        if radii is None:
            radii_tuple = ()
        else:
            radii_tuple = (radii,)

        knl = self.get_kernel(
            dimensions,
            coord_dtype,
            # have_radii:
            radii is not None,
        )
        return knl(*(tuple(particles) + radii_tuple), wait_for=wait_for, return_event=True)
Example #52
0
    def __call__(self, *args):
        args = list(args)

        from pytools import single_valued
        size = single_valued(
            args[i].size for i in self.vec_arg_indices
            if not (isinstance(args[i], (int, float)) and args[i] == 0))
        for i in self.vec_arg_indices:
            if isinstance(args[i], (int, float)) and args[i] == 0:
                args[i] = numpy.zeros(size, dtype=self.arguments[i].dtype)

        # no need to do type checking--pyublas does that for us
        arg_struct = self.module.ArgStruct()
        for arg_descr, arg in zip(self.arguments, args):
            setattr(arg_struct, arg_descr.arg_name(), arg)

        assert not arg_struct.__dict__

        self.func(size, arg_struct)
Example #53
0
    def __call__(self, *args):
        vectors = []
        args = list(args)

        from pytools import single_valued
        size = single_valued(args[i].size for i in self.vec_arg_indices
                if not (isinstance(args[i], (int, float)) and args[i] == 0))
        for i in self.vec_arg_indices:
            if isinstance(args[i], (int, float)) and args[i] == 0:
                args[i] = numpy.zeros(size, dtype=self.arguments[i].dtype)

        # no need to do type checking--pyublas does that for us
        arg_struct = self.module.ArgStruct()
        for arg_descr, arg in zip(self.arguments, args):
            setattr(arg_struct, arg_descr.arg_name(), arg)

        assert not arg_struct.__dict__

        self.func(size, arg_struct)
Example #54
0
    def expansion_wrangler_code_container(self, source_kernels,
                                          target_kernels):
        from functools import partial
        from pytools import single_valued
        base_kernel = single_valued(kernel.get_base_kernel()
                                    for kernel in source_kernels)
        mpole_expn_class = \
                self.expansion_factory.get_multipole_expansion_class(base_kernel)
        local_expn_class = \
                self.expansion_factory.get_local_expansion_class(base_kernel)

        fmm_mpole_factory = partial(mpole_expn_class, base_kernel)
        fmm_local_factory = partial(local_expn_class, base_kernel)
        qbx_local_factory = partial(local_expn_class, base_kernel)

        if self.fmm_backend == "sumpy":
            from pytential.qbx.fmm import \
                    QBXSumpyExpansionWranglerCodeContainer
            return QBXSumpyExpansionWranglerCodeContainer(
                self.cl_context,
                fmm_mpole_factory,
                fmm_local_factory,
                qbx_local_factory,
                target_kernels=target_kernels,
                source_kernels=source_kernels)

        elif self.fmm_backend == "fmmlib":
            source_kernel, = source_kernels
            target_kernels_new = [
                target_kernel.replace_base_kernel(source_kernel)
                for target_kernel in target_kernels
            ]
            from pytential.qbx.fmmlib import \
                    QBXFMMLibExpansionWranglerCodeContainer
            return QBXFMMLibExpansionWranglerCodeContainer(
                self.cl_context,
                fmm_mpole_factory,
                fmm_local_factory,
                qbx_local_factory,
                target_kernels=target_kernels_new)

        else:
            raise ValueError(f"invalid FMM backend: {self.fmm_backend}")
Example #55
0
    def reassemble(self, parts_vol_vectors):
        from pytools import single_valued, indices_in_shape
        from hedge.tools import log_shape
        ls = single_valued(log_shape(pvv) for pvv in parts_vol_vectors)

        def remap_scalar_field(idx):
            result = self.whole_discr.volume_zeros()
            for part_emb, part_vol_vector in zip(
                    self._embeddings(), parts_vol_vectors):
                result[part_emb] = part_vol_vector[idx]

            return result

        if ls != ():
            result = numpy.zeros(ls, dtype=object)
            for i in indices_in_shape(ls):
                result[i] = remap_scalar_field(i)
            return result
        else:
            return remap_scalar_field(())
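
reassemble treats a vector-valued field as a numpy object array and runs remap_scalar_field once per component; pytools.indices_in_shape supplies the component indices. A small illustrative sketch (not from the original source):

from pytools import indices_in_shape

# For a two-component field, i.e. log shape (2,), the remap runs for (0,) and (1,):
print(list(indices_in_shape((2,))))  # [(0,), (1,)]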
Example #56
0
def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
        out=None, queue=None, src_offsets=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = src_indices.context
    queue = queue or src_indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, dest_shape, a_dtype, a_allocator, queue=queue)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if src_indices.dtype != dest_indices.dtype:
        raise TypeError("src_indices and dest_indices must have the same dtype")

    if len(src_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    if src_indices.shape != dest_indices.shape:
        raise ValueError("src_indices and dest_indices must have the same shape")

    if src_offsets is None:
        src_offsets_list = []
    else:
        src_offsets_list = src_offsets
        if len(src_offsets) != vec_count:
            raise ValueError("src_indices and src_offsets must have the same length")

    max_chunk_size = 10

    chunk_size = _builtin_min(vec_count, max_chunk_size)

    def make_func_for_chunk_size(chunk_size):
        return elementwise.get_take_put_kernel(context,
                a_dtype, src_indices.dtype,
                with_offsets=src_offsets is not None,
                vec_count=chunk_size)

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i+chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count-start_i)

        knl(queue, src_indices._global_size, src_indices._local_size,
                dest_indices.data, src_indices.data,
                *([i.data for i in arrays[chunk_slice]]
                    + [o.data for o in out[chunk_slice]]
                    + src_offsets_list[chunk_slice]
                    + [src_indices.size]))

    return out
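
Per array, the helper above effectively performs out[i][dest_indices] = arrays[i][src_indices] (plus an optional per-array source offset), batched into chunked device kernels. A plain-numpy sketch of those semantics (illustrative only; it ignores the device-side batching):

import numpy as np

a = np.arange(10.0)
src_indices = np.array([1, 3, 5])
dest_indices = np.array([0, 1, 2])
out = np.zeros(3)
out[dest_indices] = a[src_indices]
print(out)  # [1. 3. 5.]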
Example #57
0
    def dim(self):
        from pytools import single_valued
        return single_valued(grp.dim for grp in self.groups)
Example #58
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        from loopy.kernel.data import AddressSpace

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
        # Getting the temporary variables that are needed for the current
        # sub-kernel.
        from loopy.schedule.tools import (
                temporaries_read_in_subkernel,
                temporaries_written_in_subkernel)
        subkernel = kernel.schedule[schedule_index].kernel_name
        sub_knl_temps = (
                temporaries_read_in_subkernel(kernel, subkernel) |
                temporaries_written_in_subkernel(kernel, subkernel))

        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.address_space != AddressSpace.GLOBAL and (
                            tv.name in sub_knl_temps):
                        decl = self.wrap_temporary_decl(
                                self.get_temporary_decl(
                                    codegen_state, schedule_index, tv, idi),
                                tv.address_space)

                        if tv.initializer is not None:
                            assert tv.read_only
                            decl = Initializer(decl, generate_array_literal(
                                codegen_state, tv, tv.initializer))

                        temp_decls.append(decl)

            else:
                assert tv.initializer is None

                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage, []).append(
                        tv.address_space)

                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.address_space)

                    if tv._base_storage_access_may_be_aliasing:
                        ptrtype = _ConstPointer
                    else:
                        # The 'restrict' part of this is a complete lie--of course
                        # all these temporaries are aliased. But we're promising to
                        # not use them to shovel data from one representation to the
                        # other. That counts, right?
                        ptrtype = _ConstRestrictPointer

                    cast_decl = ptrtype(cast_decl)
                    temp_var_decl = ptrtype(temp_var_decl)

                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        ecm = self.get_expression_to_code_mapper(codegen_state)

        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

            # FIXME: Could try to use isl knowledge to simplify max.
            if all(isinstance(bs, int) for bs in bs_sizes):
                bs_size_max = max(bs_sizes)
            else:
                bs_size_max = p.Max(tuple(bs_sizes))

            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
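
In the base_storage branch above, each storage pool becomes one aligned char array sized to the largest temporary that shares it, and every sharing temporary is declared as a pointer initialized into that array, deliberately aliasing the same bytes. A rough illustration of the bookkeeping (sizes invented for the sketch):

# Two temporaries sharing base_storage "base" might record
#   base_storage_sizes["base"]          == [2048, 1024]  -> char array of max(...) == 2048 bytes
#   base_storage_to_align_bytes["base"] == [8, 4]        -> aligned to max(...) == 8 bytes
# Each temporary's declaration is then initialized to a (possibly restrict-qualified)
# pointer into "base", which is why the comment above calls the 'restrict' a lie.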
Example #59
0
def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
        out=None, stream=None, src_offsets=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [GPUArray(dest_shape, a_dtype, a_allocator)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if src_indices.dtype != dest_indices.dtype:
        raise TypeError("src_indices and dest_indices must have the same dtype")

    if len(src_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    if src_indices.shape != dest_indices.shape:
        raise ValueError("src_indices and dest_indices must have the same shape")

    if src_offsets is None:
        src_offsets_list = []
        max_chunk_size = 20
    else:
        src_offsets_list = src_offsets
        if len(src_offsets) != vec_count:
            raise ValueError("src_indices and src_offsets must have the same length")
        max_chunk_size = 10

    chunk_size = _builtin_min(vec_count, max_chunk_size)

    def make_func_for_chunk_size(chunk_size):
        return elementwise.get_take_put_kernel(
                a_dtype, src_indices.dtype,
                with_offsets=src_offsets is not None,
                vec_count=chunk_size)

    func, tex_src = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i+chunk_size)

        if start_i + chunk_size > vec_count:
            func, tex_src = make_func_for_chunk_size(vec_count-start_i)

        for src_tr, a in zip(tex_src, arrays[chunk_slice]):
            a.bind_to_texref_ext(src_tr, allow_double_hack=True)

        func.prepared_async_call(src_indices._grid, src_indices._block, stream,
                dest_indices.gpudata, src_indices.gpudata,
                *([o.gpudata for o in out[chunk_slice]]
                    + src_offsets_list[chunk_slice]
                    + [src_indices.size]))

    return out