def identify_affine_map(from_points, to_points):
    """Return an affine map that maps *from_points[i]* to *to_points[i]*.
    For an n-dimensional affine map, n+1 points are needed.
    """
    from pytools import single_valued
    dim = single_valued([
        single_valued(len(fp) for fp in from_points),
        single_valued(len(tp) for tp in to_points)])

    if dim == 0:
        return AffineMap(
                numpy.zeros((0, 0), dtype=numpy.float64),
                numpy.zeros((0,), dtype=numpy.float64))

    if len(from_points) != dim + 1 or len(to_points) != dim + 1:
        raise ValueError("need dim+1 points to identify an affine map")

    # columns contain points
    x_mat = numpy.array(from_points).T
    y_mat = numpy.array(to_points).T

    # We are trying to solve
    # a*x_i + b = y_i
    # for a and b. To eliminate b, subtract equation (i+1) from equation i,
    # then chop the last column.
    xdiff_mat = (x_mat - numpy.roll(x_mat, -1, axis=1))[:, :dim]
    ydiff_mat = (y_mat - numpy.roll(y_mat, -1, axis=1))[:, :dim]

    from hedge.tools.linalg import leftsolve
    a = numpy.asarray(leftsolve(xdiff_mat, ydiff_mat), order="C")
    b = to_points[0] - numpy.dot(a, from_points[0])

    return AffineMap(a, b)

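# A hedged usage sketch (not part of the original source): recover the 2D
# affine map x -> A x + b from three point pairs. AffineMap is assumed to be
# the hedge record exposing .matrix and .vector attributes.
import numpy

from_points = [numpy.array([0., 0.]), numpy.array([1., 0.]), numpy.array([0., 1.])]
to_points = [numpy.array([1., 1.]), numpy.array([3., 1.]), numpy.array([1., 4.])]

amap = identify_affine_map(from_points, to_points)
for fp, tp in zip(from_points, to_points):
    assert numpy.allclose(numpy.dot(amap.matrix, fp) + amap.vector, tp)

# pytools.single_valued, used throughout these snippets, raises if its
# iterable is empty or if the entries disagree--which is what makes the
# one-line consistency checks above so terse.
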
def _bmat(blocks, dtypes):
    from pytools import single_valued
    from pytential.symbolic.matrix import is_zero

    nrows = blocks.shape[0]
    ncolumns = blocks.shape[1]

    # "block row starts"/"block column starts"
    brs = np.cumsum([0]
            + [single_valued(blocks[ibrow, ibcol].shape[0]
                             for ibcol in range(ncolumns)
                             if not is_zero(blocks[ibrow, ibcol]))
               for ibrow in range(nrows)])

    bcs = np.cumsum([0]
            + [single_valued(blocks[ibrow, ibcol].shape[1]
                             for ibrow in range(nrows)
                             if not is_zero(blocks[ibrow, ibcol]))
               for ibcol in range(ncolumns)])

    result = np.zeros((brs[-1], bcs[-1]), dtype=np.find_common_type(dtypes, []))
    for ibcol in range(ncolumns):
        for ibrow in range(nrows):
            result[brs[ibrow]:brs[ibrow + 1], bcs[ibcol]:bcs[ibcol + 1]] = \
                    blocks[ibrow, ibcol]

    return result

def map_ref_diff_op_binding(self, expr, codegen_state):
    try:
        return self.expr_to_var[expr]
    except KeyError:
        all_diffs = [diff for diff in self.diff_ops
                if diff.op.equal_except_for_axis(expr.op)
                and diff.field == expr.field]

        names = [self.name_gen("expr") for d in all_diffs]

        from pytools import single_valued
        op_class = single_valued(type(d.op) for d in all_diffs)

        codegen_state.get_code_list(self).append(
                DiffBatchAssign(
                    names=names,
                    op_class=op_class,
                    operators=[d.op for d in all_diffs],
                    field=self.rec(
                        single_valued(d.field for d in all_diffs),
                        codegen_state)))

        from pymbolic import var
        for n, d in zip(names, all_diffs):
            self.expr_to_var[d] = var(n)

        return self.expr_to_var[expr]

def map_ref_diff_op_binding(self, expr):
    try:
        return self.expr_to_var[expr]
    except KeyError:
        all_diffs = [diff for diff in self.diff_ops
                if diff.op.equal_except_for_axis(expr.op)
                and diff.field == expr.field]

        names = [self.get_var_name() for d in all_diffs]

        from pytools import single_valued
        op_class = single_valued(type(d.op) for d in all_diffs)

        from hedge.optemplate.operators import \
                ReferenceQuadratureStiffnessTOperator
        # op_class is a type, so issubclass (not isinstance) is the
        # correct test here.
        if issubclass(op_class, ReferenceQuadratureStiffnessTOperator):
            assign_class = QuadratureDiffBatchAssign
        else:
            assign_class = DiffBatchAssign

        self.code.append(
                assign_class(
                    names=names,
                    op_class=op_class,
                    operators=[d.op for d in all_diffs],
                    field=self.rec(
                        single_valued(d.field for d in all_diffs)),
                    dep_mapper_factory=self.dep_mapper_factory))

        from pymbolic import var
        for n, d in zip(names, all_diffs):
            self.expr_to_var[d] = var(n)

        return self.expr_to_var[expr]

def make_superblocks(devdata, struct_name, single_item, multi_item,
        extra_fields={}):
    # pad is also needed from the cuda tools module below
    from hedge.backends.cuda.tools import pad, pad_and_join

    # single_item = [([ block1, block2, ... ], decl), ...]
    # multi_item = [([ [ item1, item2, ...], ... ], decl), ...]

    multi_blocks = [
            ["".join(s) for s in part_data]
            for part_data, part_decls in multi_item]
    block_sizes = [
            max(len(b) for b in part_blocks)
            for part_blocks in multi_blocks]

    from pytools import single_valued
    block_count = single_valued(
            len(si_part_blocks) for si_part_blocks, si_part_decl in single_item)

    from cgen import Struct, ArrayOf

    struct_members = []
    for part_data, part_decl in single_item:
        assert block_count == len(part_data)
        single_valued(len(block) for block in part_data)
        struct_members.append(part_decl)

    for part_data, part_decl in multi_item:
        struct_members.append(
                ArrayOf(part_decl, max(len(s) for s in part_data)))

    superblocks = []
    for superblock_num in range(block_count):
        data = ""
        for part_data, part_decl in single_item:
            data += part_data[superblock_num]

        for part_blocks, part_size in zip(multi_blocks, block_sizes):
            assert block_count == len(part_blocks)
            data += pad(part_blocks[superblock_num], part_size)

        superblocks.append(data)

    superblock_size = devdata.align(
            single_valued(len(sb) for sb in superblocks))

    data = pad_and_join(superblocks, superblock_size)
    assert len(data) == superblock_size*block_count

    class SuperblockedDataStructure(Record):
        pass

    return SuperblockedDataStructure(
            struct=Struct(struct_name, struct_members),
            device_memory=cuda.to_device(data),
            block_bytes=superblock_size,
            data=data,
            **extra_fields
            )

def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = dest_indices.context
    queue = queue or dest_indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, dest_shape, a_dtype,
                allocator=a_allocator, queue=queue)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if len(dest_indices.shape) != 1:
        raise ValueError("dest_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_put_kernel(
                context, a_dtype, dest_indices.dtype, vec_count=chunk_size)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        gs, ls = dest_indices.get_sizes(queue,
                knl.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE,
                    queue.device))

        knl(queue, gs, ls,
                *([o.data for o in out[chunk_slice]]
                    + [dest_indices.data]
                    + [i.data for i in arrays[chunk_slice]]
                    + [dest_indices.size]))

    return out

def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = dest_indices.context
    queue = queue or dest_indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, dest_shape, a_dtype,
                allocator=a_allocator, queue=queue)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    # the check is on dest_indices, so the message should name it, too
    if len(dest_indices.shape) != 1:
        raise ValueError("dest_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_put_kernel(
                a_dtype, dest_indices.dtype, vec_count=chunk_size)
        knl.set_block_shape(*dest_indices._block)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        gs, ls = dest_indices.get_sizes(queue,
                knl.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE,
                    queue.device))

        knl(queue, gs, ls,
                *([o.data for o in out[chunk_slice]]
                    + [dest_indices.data]
                    + [i.data for i in arrays[chunk_slice]]
                    + [dest_indices.size]))

    return out

def index_list_backend(self, ilists):
    from pytools import single_valued
    ilist_length = single_valued(len(il) for il in ilists)
    assert ilist_length == self.plan.dofs_per_face

    from cgen import Typedef, POD
    from pytools import flatten

    flat_ilists_uncast = numpy.array(list(flatten(ilists)))

    if numpy.max(flat_ilists_uncast) >= 256:
        tp = numpy.uint16
    else:
        tp = numpy.uint8

    flat_ilists = numpy.asarray(flat_ilists_uncast, dtype=tp)
    assert (flat_ilists == flat_ilists_uncast).all()

    return GPUIndexLists(
            type=tp,
            code=[Typedef(POD(tp, "index_list_entry_t"))],
            device_memory=cuda.to_device(flat_ilists),
            bytes=flat_ilists.size*flat_ilists.itemsize,
            )

def exec_diff_batch_assign(self, insn):
    field = self.rec(insn.field)

    discr = self.executor.discr
    if discr.instrumented:
        discr.diff_counter.add(discr.dimensions)
        discr.diff_flop_counter.add(discr.dimensions*(
            self.executor.diff_rst_flops
            + self.executor.diff_rescale_one_flops))

    repr_op = insn.operators[0]

    from hedge.optemplate.operators import \
            ReferenceQuadratureStiffnessTOperator
    if isinstance(repr_op, ReferenceQuadratureStiffnessTOperator):
        eg, = discr.element_groups
        from pytools import single_valued
        q_info = discr.get_cuda_elgroup_quadrature_info(
                eg, single_valued(op.quadrature_tag for op in insn.operators))

        kernel = discr.diff_kernel(
                aligned_preimage_dofs_per_microblock
                    =q_info.aligned_dofs_per_microblock,
                preimage_dofs_per_el=q_info.ldis_quad_info.node_count())

        rst_diff = kernel(repr_op, field)
    else:
        rst_diff = self.executor.diff_kernel(repr_op, field)

    return [(name, rst_diff[op.rst_axis])
            for name, op in zip(insn.names, insn.operators)], []

def __call__(self, evaluate_subexpr, stats_callback=None):
    vectors = [evaluate_subexpr(vec_expr) for vec_expr in self.vector_deps]
    scalars = [evaluate_subexpr(scal_expr) for scal_expr in self.scalar_deps]

    from pytools import single_valued
    shape = single_valued(vec.shape for vec in vectors)

    kernel_rec = self.get_kernel(
            tuple(v.dtype for v in vectors),
            tuple(s.dtype for s in scalars))

    results = [numpy.empty(shape, kernel_rec.result_dtype)
            for vei in self.result_vec_expr_info_list]

    size = results[0].size
    args = (results + vectors + scalars)

    if stats_callback is not None:
        timer = stats_callback(size, self)
        sub_timer = timer.start_sub_timer()
        kernel_rec.kernel(*args)
        sub_timer.stop().submit()
    else:
        kernel_rec.kernel(*args)

    return results

def nd_quad_submesh(node_tuples):
    """Return a list of tuples of indices into the node list that
    generate a tessellation of the reference element.

    :arg node_tuples: A list of tuples *(i, j, ...)* of integers
        indicating node positions inside the unit element. The
        returned list references indices in this list.
        :func:`pytools.generate_nonnegative_integer_tuples_below`
        may be used to generate *node_tuples*.

    See also :func:`modepy.tools.simplex_submesh`.
    """
    from pytools import single_valued, add_tuples
    dims = single_valued(len(nt) for nt in node_tuples)

    node_dict = dict(
            (ituple, idx)
            for idx, ituple in enumerate(node_tuples))

    from pytools import generate_nonnegative_integer_tuples_below as gnitb

    result = []
    for current in node_tuples:
        try:
            result.append(tuple(
                node_dict[add_tuples(current, offset)]
                for offset in gnitb(2, dims)))
        except KeyError:
            pass

    return result

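# A hedged usage sketch (not in the original source): tessellate a 3x3 grid
# of nodes on the 2D reference quad into 2x2 sub-quads. Each returned tuple
# lists the four node indices of one sub-quad, in gnitb(2, 2) offset order;
# cells whose upper neighbors fall off the grid are skipped via the KeyError.
from pytools import generate_nonnegative_integer_tuples_below as gnitb

node_tuples = list(gnitb(3, 2))      # (0, 0), (0, 1), ..., (2, 2)
subquads = nd_quad_submesh(node_tuples)
assert len(subquads) == 4            # only the 2x2 interior cells survive
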
def _vis_connectivity(self):
    """
    :return: an array of shape
        ``(vis_discr.nelements, nsubelements, primitive_element_size)``
    """
    # Assume that we're using modepy's default node ordering.

    from pytools import generate_nonnegative_integer_tuples_summing_to_at_most \
            as gnitstam, single_valued

    vis_order = single_valued(
            group.order for group in self.vis_discr.groups)
    node_tuples = list(gnitstam(vis_order, self.vis_discr.dim))

    from modepy.tools import submesh
    el_connectivity = np.array(
            submesh(node_tuples),
            dtype=np.intp)

    nelements = sum(group.nelements for group in self.vis_discr.groups)
    vis_connectivity = np.empty(
            (nelements,) + el_connectivity.shape, dtype=np.intp)

    el_nr_base = 0
    for group in self.vis_discr.groups:
        assert len(node_tuples) == group.nunit_nodes
        vis_connectivity[el_nr_base:el_nr_base+group.nelements] = (
                np.arange(
                    el_nr_base*group.nunit_nodes,
                    (el_nr_base+group.nelements)*group.nunit_nodes,
                    group.nunit_nodes
                    )[:, np.newaxis, np.newaxis]
                + el_connectivity)

        el_nr_base += group.nelements

    return vis_connectivity

def __call__(self, evaluate_subexpr, stats_callback=None):
    vectors = [evaluate_subexpr(vec_expr) for vec_expr in self.vector_deps]
    scalars = [evaluate_subexpr(scal_expr) for scal_expr in self.scalar_deps]

    from pytools import single_valued
    shape = single_valued(vec.shape for vec in vectors)

    kernel_rec = self.get_kernel(
            tuple(v.dtype for v in vectors),
            tuple(s.dtype for s in scalars))

    results = [gpuarray.empty(shape, kernel_rec.result_dtype, self.allocator)
            for expr in self.result_vec_expr_info_list]

    size = results[0].size
    args = ([r.gpudata for r in results]
            + [v.gpudata for v in vectors]
            + scalars
            + [size])

    if stats_callback is not None:
        stats_callback(size, self,
                kernel_rec.kernel.prepared_timed_call(
                    vectors[0]._grid, results[0]._block, *args))
    else:
        kernel_rec.kernel.prepared_async_call(
                vectors[0]._grid, results[0]._block, self.stream, *args)

    return results

def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [GPUArray(dest_shape, a_dtype, a_allocator)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    # the check is on dest_indices, so the message should name it, too
    if len(dest_indices.shape) != 1:
        raise ValueError("dest_indices must be 1D")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        func = elementwise.get_put_kernel(
                a_dtype, dest_indices.dtype, vec_count=chunk_size)
        func.set_block_shape(*dest_indices._block)
        return func

    func = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            func = make_func_for_chunk_size(vec_count - start_i)

        func.prepared_async_call(dest_indices._grid, stream,
                dest_indices.gpudata,
                *([o.gpudata for o in out[chunk_slice]]
                    + [i.gpudata for i in arrays[chunk_slice]]
                    + [dest_indices.size]))

    return out

def __init__(self, ctx, expansions, strength_usage=None,
        value_dtypes=None, options=[], name="layerpot", device=None):
    KernelComputation.__init__(self, ctx, expansions, strength_usage,
            value_dtypes, name, options, device)

    from pytools import single_valued
    self.dim = single_valued(knl.dim for knl in self.expansions)

def order(self):
    from warnings import warn
    warn("DGDiscretizationWithBoundaries.order is deprecated, "
            "consider the orders of element groups instead. "
            "'order' will go away in 2021.",
            DeprecationWarning, stacklevel=2)

    from pytools import single_valued
    return single_valued(egrp.order for egrp in self._volume_discr.groups)

def make_flux_batch_assign(self, names, expressions, repr_op):
    from pytools import single_valued
    quadrature_tag = single_valued(
            wdflux.quadrature_tag for wdflux in expressions)

    return CUDAFluxBatchAssign(
            names=names,
            expressions=expressions,
            repr_op=repr_op,
            dep_mapper_factory=self.dep_mapper_factory,
            quadrature_tag=quadrature_tag)

def map_int_g(self, expr, name_hint=None):
    try:
        return self.expr_to_var[expr]
    except KeyError:
        # make sure operator assignments stand alone and don't get muddled
        # up in vector arithmetic
        density_var = self.assign_to_new_var(self.rec(expr.density))

        group = self.group_to_operators[self.op_group_features(expr)]
        names = [self.get_var_name() for op in group]

        kernel_to_index = {}
        kernels = []
        for op in group:
            if op.kernel not in kernel_to_index:
                kernel_to_index[op.kernel] = len(kernels)
                kernels.append(op.kernel)

        from pytools import single_valued
        from sumpy.kernel import AxisTargetDerivativeRemover
        atdr = AxisTargetDerivativeRemover()
        base_kernel = single_valued(atdr(kernel) for kernel in kernels)

        for op in group:
            assert op.qbx_forced_limit in [-1, 0, 1]

        kernel_arguments = dict(
                (arg_name, self.rec(arg_val))
                for arg_name, arg_val in six.iteritems(expr.kernel_arguments))

        outputs = [
                LayerPotentialOutput(
                    name=name,
                    kernel_index=kernel_to_index[op.kernel],
                    target_name=op.target,
                    qbx_forced_limit=op.qbx_forced_limit,
                    )
                for name, op in zip(names, group)
                ]

        self.code.append(
                LayerPotentialInstruction(
                    outputs=outputs,
                    kernels=tuple(kernels),
                    kernel_arguments=kernel_arguments,
                    base_kernel=base_kernel,
                    density=density_var,
                    source=expr.source,
                    priority=max(getattr(op, "priority", 0) for op in group),
                    dep_mapper_factory=self.dep_mapper_factory))

        from pymbolic.primitives import Variable
        for name, group_expr in zip(names, group):
            self.expr_to_var[group_expr] = Variable(name)

        return self.expr_to_var[expr]

def multi_take(arrays, indices, out=None, queue=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    # grab the allocator (not the dtype) from the first array
    a_allocator = arrays[0].allocator
    context = indices.context
    queue = queue or indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, queue, indices.shape, a_dtype,
                allocator=a_allocator)
                for i in range(vec_count)]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 10)

    def make_func_for_chunk_size(chunk_size):
        knl = elementwise.get_take_kernel(
                indices.context, a_dtype, indices.dtype,
                vec_count=chunk_size)
        knl.set_block_shape(*indices._block)
        return knl

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        gs, ls = indices.get_sizes(queue,
                knl.get_work_group_info(
                    cl.kernel_work_group_info.WORK_GROUP_SIZE,
                    queue.device))

        knl(queue, gs, ls,
                indices.data,
                *([o.data for o in out[chunk_slice]]
                    + [i.data for i in arrays[chunk_slice]]
                    + [indices.size]))

    return out

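# A hedged usage sketch (not part of the original source): gather the same
# index set from several same-dtype device arrays in one call, using the
# multi_take entry point that pyopencl.array exposes.
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

arrays = [cl_array.to_device(queue, np.random.rand(100).astype(np.float32))
        for _ in range(3)]
indices = cl_array.to_device(queue, np.array([5, 17, 42], dtype=np.int32))

gathered = cl_array.multi_take(arrays, indices, queue=queue)
assert np.allclose(gathered[0].get(), arrays[0].get()[[5, 17, 42]])
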
def __call__(self, *args):
    from pytools import indices_in_shape, single_valued

    oa_shape = single_valued(ary.shape for fac, ary in args)
    result = numpy.zeros(oa_shape, dtype=object)

    for i in indices_in_shape(oa_shape):
        args_i = [(fac, ary[i]) for fac, ary in args]
        result[i] = self.scalar_kernel(*args_i)

    return result

def map_int_g(self, expr, name_hint=None):
    try:
        return self.expr_to_var[expr]
    except KeyError:
        # make sure operator assignments stand alone and don't get muddled
        # up in vector arithmetic
        density_var = self.assign_to_new_var(self.rec(expr.density))

        group = self.group_to_operators[self.op_group_features(expr)]
        names = [self.get_var_name() for op in group]

        kernels = sorted({op.kernel for op in group}, key=repr)
        kernel_to_index = {kernel: i for i, kernel in enumerate(kernels)}

        from pytools import single_valued
        from sumpy.kernel import AxisTargetDerivativeRemover
        atdr = AxisTargetDerivativeRemover()
        base_kernel = single_valued(atdr(kernel) for kernel in kernels)

        for op in group:
            assert op.qbx_forced_limit in [-2, -1, None, 1, 2]

        kernel_arguments = {
                arg_name: self.rec(arg_val)
                for arg_name, arg_val in expr.kernel_arguments.items()}

        outputs = [
                PotentialOutput(
                    name=name,
                    kernel_index=kernel_to_index[op.kernel],
                    target_name=op.target,
                    qbx_forced_limit=op.qbx_forced_limit,
                    )
                for name, op in zip(names, group)
                ]

        self.code.append(
                ComputePotentialInstruction(
                    outputs=outputs,
                    kernels=tuple(kernels),
                    kernel_arguments=kernel_arguments,
                    base_kernel=base_kernel,
                    density=density_var,
                    source=expr.source,
                    priority=max(getattr(op, "priority", 0) for op in group),
                    dep_mapper_factory=self.dep_mapper_factory))

        from pymbolic.primitives import Variable
        for name, group_expr in zip(names, group):
            self.expr_to_var[group_expr] = Variable(name)

        return self.expr_to_var[expr]

def __init__(self, ctx, expansions, strength_usage=None,
        value_dtypes=None, name=None, device=None):
    KernelComputation.__init__(self, ctx, expansions, strength_usage,
            value_dtypes, name, device)

    from pytools import single_valued
    self.dim = single_valued(knl.dim for knl in self.expansions)

def find_index_rank(self, name):
    irf = IndexRankFinder(name)

    for insn in self.instructions:
        insn.with_transformed_expressions(
                lambda expr: irf(self.submap(expr)))

    if not irf.index_ranks:
        return 0
    else:
        from pytools import single_valued
        return single_valued(irf.index_ranks)

def get_or_register_dtype(self, c_names, dtype=None):
    """Get or register a :class:`numpy.dtype` associated with the C type names
    in the string list *c_names*. If *dtype* is `None`, no registration is
    performed, and the :class:`numpy.dtype` must already have been registered.
    If so, it is returned. If not, :exc:`TypeNameNotKnown` is raised.

    If *dtype* is not `None`, registration is attempted. If the *c_names* are
    already known and registered to identical :class:`numpy.dtype` objects,
    then the dtype object of the previously registered type is returned. If
    the *c_names* are not yet known, the type is registered. If one of the
    *c_names* is known but registered to a different type, an error is raised.
    In this latter case, the type may end up partially registered and any
    further behavior is undefined.

    .. versionadded:: 2012.2
    """
    if isinstance(c_names, str):
        c_names = [c_names]

    if dtype is None:
        from pytools import single_valued
        return single_valued(self.name_to_dtype[name] for name in c_names)

    dtype = np.dtype(dtype)

    # check if we've seen an identical dtype, if so retrieve exact dtype object
    try:
        existing_name = self.dtype_to_name[dtype]
    except KeyError:
        existed = False
    else:
        existed = True
        existing_dtype = self.name_to_dtype[existing_name]
        assert existing_dtype == dtype
        dtype = existing_dtype

    for nm in c_names:
        try:
            name_dtype = self.name_to_dtype[nm]
        except KeyError:
            self.name_to_dtype[nm] = dtype
        else:
            if name_dtype != dtype:
                raise RuntimeError("name '%s' already registered to "
                        "different dtype" % nm)

    if not existed:
        self.dtype_to_name[dtype] = c_names[0]
    if not str(dtype) in self.dtype_to_name:
        self.dtype_to_name[str(dtype)] = c_names[0]

    return dtype

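# A hedged usage sketch (not in the original; "registry" is a hypothetical
# instance of the class above, carrying the name_to_dtype/dtype_to_name dicts
# the method expects):
import numpy as np

registry.get_or_register_dtype(["unsigned int", "uint"], np.uint32)

# Lookups by C name then need no dtype argument; single_valued() checks that
# every requested name resolves to one and the same registered dtype.
assert registry.get_or_register_dtype(["uint"]) == np.dtype(np.uint32)
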
def join_conserved(dim, mass, energy, momentum):
    """Create an agglomerated solution array from the conserved quantities."""
    from pytools import single_valued
    aux_shape = single_valued([
        _aux_shape(mass, ()),
        _aux_shape(energy, ()),
        _aux_shape(momentum, (dim,))])

    result = np.zeros((2+dim,) + aux_shape, dtype=object)
    result[0] = mass
    result[1] = energy
    result[2:] = momentum

    return result

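# A hedged usage sketch (not in the original): _aux_shape is assumed to peel
# the given template shape off each field and return the leftover shape, so
# for plain scalar fields in 2D this packs (mass, energy, mom_x, mom_y) into
# one object array.
import numpy as np

q = join_conserved(dim=2, mass=1.0, energy=2.5, momentum=np.array([0.1, 0.2]))
assert q.shape == (4,)
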
def __call__(self, array_context, *args):
    func_name = self.identifier

    from pytools import single_valued
    if single_valued(should_use_numpy(arg) for arg in args):
        func = getattr(np, func_name)
        return func(*args)

    if func_name == "fabs":
        # FIXME
        # Loopy has a type-adaptive "abs", but no "fabs".
        func_name = "abs"

    sfunc = getattr(array_context.np, func_name)
    return sfunc(*args)

def gpu_diffmats(self, diff_op_cls, elgroup):
    discr = self.discr
    given = self.plan.given

    columns = given.dofs_per_el()*discr.dimensions
    additional_columns = 0
    # avoid smem fetch bank conflicts by ensuring odd col count
    if columns % 2 == 0:
        columns += 1
        additional_columns += 1

    block_floats = given.devdata.align_dtype(
            columns*self.plan.segment_size, given.float_size())

    vstacked_matrices = [
            numpy.vstack(given.microblock.elements*(m,))
            for m in diff_op_cls.matrices(elgroup)
            ]

    segments = []

    from pytools import single_valued
    for segment_start in range(
            0, given.microblock.elements*given.dofs_per_el(),
            self.plan.segment_size):
        matrices = [
                m[segment_start:segment_start+self.plan.segment_size]
                for m in vstacked_matrices]

        matrices.append(
                numpy.zeros((single_valued(m.shape[0] for m in matrices),
                    additional_columns)))

        diffmats = numpy.asarray(
                numpy.hstack(matrices),
                dtype=given.float_type,
                order="C")
        segments.append(buffer(diffmats))

    from hedge.backends.cuda.tools import pad_and_join

    from pytools import Record

    class GPUDifferentiationMatrices(Record):
        pass

    return GPUDifferentiationMatrices(
            device_memory=cuda.to_device(
                pad_and_join(segments, block_floats*given.float_size())),
            block_floats=block_floats,
            matrix_columns=columns)

def get_or_register_dtype(c_names, dtype=None):
    """Get or register a :class:`numpy.dtype` associated with the C type names
    in the string list *c_names*. If *dtype* is `None`, no registration is
    performed, and the :class:`numpy.dtype` must already have been registered.
    If so, it is returned. If not, :exc:`TypeNameNotKnown` is raised.

    If *dtype* is not `None`, registration is attempted. If the *c_names* are
    already known and registered to identical :class:`numpy.dtype` objects,
    then the previously registered type is returned. Otherwise, the type is
    registered.

    .. versionadded:: 2012.2
    """
    if isinstance(c_names, str):
        c_names = [c_names]

    if dtype is None:
        from pytools import single_valued
        return single_valued(NAME_TO_DTYPE[name] for name in c_names)

    dtype = np.dtype(dtype)

    # check if we've seen an identical dtype, if so retrieve exact dtype object
    try:
        existing_name = DTYPE_TO_NAME[dtype]
    except KeyError:
        existed = False
    else:
        existed = True
        existing_dtype = NAME_TO_DTYPE[existing_name]
        assert existing_dtype == dtype
        dtype = existing_dtype

    for nm in c_names:
        try:
            name_dtype = NAME_TO_DTYPE[nm]
        except KeyError:
            NAME_TO_DTYPE[nm] = dtype
        else:
            if name_dtype != dtype:
                raise RuntimeError(
                        "name '%s' already registered to different dtype" % nm)

    if not existed:
        DTYPE_TO_NAME[dtype] = c_names[0]
    if not str(dtype) in DTYPE_TO_NAME:
        DTYPE_TO_NAME[str(dtype)] = c_names[0]

    return dtype

def combine(dtypes):
    # dtypes may just be a generator expr
    dtypes = list(dtypes)

    from loopy.types import LoopyType, NumpyType
    assert all(isinstance(dtype, LoopyType) for dtype in dtypes)

    if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
        from pytools import is_single_valued, single_valued
        if not is_single_valued(dtypes):
            raise TypeInferenceFailure(
                    "Nothing known about operations between '%s'"
                    % ", ".join(str(dt) for dt in dtypes))

        return single_valued(dtypes)

    dtypes = [dtype.dtype for dtype in dtypes]

    result = dtypes.pop()
    while dtypes:
        other = dtypes.pop()

        if result.fields is None and other.fields is None:
            if (result, other) in [
                    (np.int32, np.float32), (np.float32, np.int32)]:
                # numpy makes this a double. I disagree.
                result = np.dtype(np.float32)
            else:
                result = (
                        np.empty(0, dtype=result)
                        + np.empty(0, dtype=other)
                        ).dtype
        elif result.fields is None and other.fields is not None:
            # assume the non-native type takes over
            # (This is used for vector types.)
            result = other
        elif result.fields is not None and other.fields is None:
            # assume the non-native type takes over
            # (This is used for vector types.)
            pass
        else:
            if result is not other:
                raise TypeInferenceFailure(
                        "nothing known about result of operation on "
                        "'%s' and '%s'" % (result, other))

    return NumpyType(result)

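# A short illustration (not from the source) of the promotion rule the
# comment in combine() objects to: plain numpy widens int32 + float32 all the
# way to float64, whereas combine() above deliberately stays at float32.
import numpy as np

promoted = (np.empty(0, dtype=np.int32) + np.empty(0, dtype=np.float32)).dtype
assert promoted == np.float64
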
def __call__(self, particles, radii, wait_for=None):
    dimensions = len(particles)

    from pytools import single_valued
    coord_dtype = single_valued(coord.dtype for coord in particles)

    if radii is None:
        radii_tuple = ()
    else:
        radii_tuple = (radii,)

    knl = self.get_kernel(dimensions, coord_dtype,
            # have_radii:
            radii is not None)

    return knl(*(tuple(particles) + radii_tuple),
            wait_for=wait_for, return_event=True)

def _entry_dtype(ary):
    from meshmode.dof_array import DOFArray

    if isinstance(ary, DOFArray):
        # the "normal case"
        return ary.entry_dtype
    elif isinstance(ary, np.ndarray):
        if ary.dtype.char == "O":
            from pytools import single_valued
            return single_valued(_entry_dtype(entry) for entry in ary.flat)
        else:
            return ary.dtype
    elif isinstance(ary, cl.array.Array):
        # for "unregularized" layer potential sources
        return ary.dtype
    else:
        raise TypeError(f"unexpected type '{type(ary)}' in _entry_dtype")

def get_lpot_applier(self, target_kernels, source_kernels):
    # needs to be separate method for caching

    if any(knl.is_complex_valued for knl in target_kernels):
        value_dtype = self.density_discr.complex_dtype
    else:
        value_dtype = self.density_discr.real_dtype

    from pytools import single_valued
    base_kernel = single_valued(
            knl.get_base_kernel() for knl in source_kernels)

    from sumpy.qbx import LayerPotential
    return LayerPotential(self.cl_context,
            expansion=self.get_expansion_for_qbx_direct_eval(
                base_kernel, target_kernels),
            target_kernels=target_kernels,
            source_kernels=source_kernels,
            value_dtypes=value_dtype)

def multi_take(arrays, indices, out=None, stream=None):
    if not len(arrays):
        return []

    assert len(indices.shape) == 1

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    # grab the allocator (not the dtype) from the first array
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [GPUArray(indices.shape, a_dtype, a_allocator)
                for i in range(vec_count)]
    else:
        if len(out) != len(arrays):
            raise ValueError("out and arrays must have the same length")

    chunk_size = _builtin_min(vec_count, 20)

    def make_func_for_chunk_size(chunk_size):
        func, tex_src = elementwise.get_take_kernel(
                a_dtype, indices.dtype, vec_count=chunk_size)
        func.set_block_shape(*indices._block)
        return func, tex_src

    func, tex_src = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            func, tex_src = make_func_for_chunk_size(vec_count - start_i)

        for i, a in enumerate(arrays[chunk_slice]):
            a.bind_to_texref_ext(tex_src[i], allow_double_hack=True)

        func.prepared_async_call(indices._grid, stream,
                indices.gpudata,
                *([o.gpudata for o in out[chunk_slice]]
                    + [indices.size]))

    return out

def get_lpot_applier_on_tgt_subset(self, target_kernels, source_kernels):
    # needs to be separate method for caching

    if any(knl.is_complex_valued for knl in target_kernels):
        value_dtype = self.density_discr.complex_dtype
    else:
        value_dtype = self.density_discr.real_dtype

    from pytools import single_valued
    base_kernel = single_valued(
            knl.get_base_kernel() for knl in source_kernels)

    from pytential.qbx.direct import LayerPotentialOnTargetAndCenterSubset
    from sumpy.expansion.local import VolumeTaylorLocalExpansion
    return LayerPotentialOnTargetAndCenterSubset(
            self.cl_context,
            expansion=VolumeTaylorLocalExpansion(base_kernel, self.qbx_order),
            target_kernels=target_kernels,
            source_kernels=source_kernels,
            value_dtypes=value_dtype)

def __init__(self, ctx, kernels, exclude_self, strength_usage=None,
        value_dtypes=None, options=[], name=None, device=None):
    """
    :arg kernels: list of :class:`sumpy.kernel.Kernel` instances
    :arg strength_usage: A list of integers indicating which expression
        uses which source strength indicator. This implicitly specifies the
        number of strength arrays that need to be passed.
        Default: all kernels use the same strength.
    """
    KernelComputation.__init__(self, ctx, kernels, strength_usage,
            value_dtypes, name, options, device)

    self.exclude_self = exclude_self

    from pytools import single_valued
    self.dim = single_valued(knl.dim for knl in self.kernels)

def __call__(self, *args):
    args = list(args)

    from pytools import single_valued
    size = single_valued(
            args[i].size for i in self.vec_arg_indices
            if not (isinstance(args[i], (int, float)) and args[i] == 0))

    for i in self.vec_arg_indices:
        if isinstance(args[i], (int, float)) and args[i] == 0:
            args[i] = numpy.zeros(size, dtype=self.arguments[i].dtype)

    # no need to do type checking--pyublas does that for us
    arg_struct = self.module.ArgStruct()
    for arg_descr, arg in zip(self.arguments, args):
        setattr(arg_struct, arg_descr.arg_name(), arg)
    assert not arg_struct.__dict__

    self.func(size, arg_struct)

def expansion_wrangler_code_container(self, source_kernels, target_kernels):
    from functools import partial
    from pytools import single_valued
    base_kernel = single_valued(
            kernel.get_base_kernel() for kernel in source_kernels)

    mpole_expn_class = \
            self.expansion_factory.get_multipole_expansion_class(base_kernel)
    local_expn_class = \
            self.expansion_factory.get_local_expansion_class(base_kernel)

    fmm_mpole_factory = partial(mpole_expn_class, base_kernel)
    fmm_local_factory = partial(local_expn_class, base_kernel)
    qbx_local_factory = partial(local_expn_class, base_kernel)

    if self.fmm_backend == "sumpy":
        from pytential.qbx.fmm import \
                QBXSumpyExpansionWranglerCodeContainer
        return QBXSumpyExpansionWranglerCodeContainer(
                self.cl_context,
                fmm_mpole_factory,
                fmm_local_factory,
                qbx_local_factory,
                target_kernels=target_kernels,
                source_kernels=source_kernels)
    elif self.fmm_backend == "fmmlib":
        source_kernel, = source_kernels
        target_kernels_new = [
                target_kernel.replace_base_kernel(source_kernel)
                for target_kernel in target_kernels]

        from pytential.qbx.fmmlib import \
                QBXFMMLibExpansionWranglerCodeContainer
        return QBXFMMLibExpansionWranglerCodeContainer(
                self.cl_context,
                fmm_mpole_factory,
                fmm_local_factory,
                qbx_local_factory,
                target_kernels=target_kernels_new)
    else:
        raise ValueError(f"invalid FMM backend: {self.fmm_backend}")

def reassemble(self, parts_vol_vectors):
    from pytools import single_valued, indices_in_shape
    from hedge.tools import log_shape
    ls = single_valued(log_shape(pvv) for pvv in parts_vol_vectors)

    def remap_scalar_field(idx):
        result = self.whole_discr.volume_zeros()
        for part_emb, part_vol_vector in zip(
                self._embeddings(), parts_vol_vectors):
            result[part_emb] = part_vol_vector[idx]

        return result

    if ls != ():
        result = numpy.zeros(ls, dtype=object)
        for i in indices_in_shape(ls):
            result[i] = remap_scalar_field(i)
        return result
    else:
        return remap_scalar_field(())

def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
        out=None, queue=None, src_offsets=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator
    context = src_indices.context
    queue = queue or src_indices.queue

    vec_count = len(arrays)

    if out is None:
        out = [Array(context, dest_shape, a_dtype, a_allocator, queue=queue)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if src_indices.dtype != dest_indices.dtype:
        raise TypeError(
                "src_indices and dest_indices must have the same dtype")

    if len(src_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    if src_indices.shape != dest_indices.shape:
        raise ValueError(
                "src_indices and dest_indices must have the same shape")

    if src_offsets is None:
        src_offsets_list = []
    else:
        src_offsets_list = src_offsets
        if len(src_offsets) != vec_count:
            raise ValueError(
                    "src_indices and src_offsets must have the same length")

    max_chunk_size = 10
    chunk_size = _builtin_min(vec_count, max_chunk_size)

    def make_func_for_chunk_size(chunk_size):
        return elementwise.get_take_put_kernel(context,
                a_dtype, src_indices.dtype,
                with_offsets=src_offsets is not None,
                vec_count=chunk_size)

    knl = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            knl = make_func_for_chunk_size(vec_count - start_i)

        knl(queue, src_indices._global_size, src_indices._local_size,
                dest_indices.data, src_indices.data,
                *([i.data for i in arrays[chunk_slice]]
                    + [o.data for o in out[chunk_slice]]
                    + src_offsets_list[chunk_slice]
                    + [src_indices.size]))

    return out

def dim(self):
    from pytools import single_valued
    return single_valued(grp.dim for grp in self.groups)

def get_temporary_decls(self, codegen_state, schedule_index):
    from loopy.kernel.data import AddressSpace

    kernel = codegen_state.kernel

    base_storage_decls = []
    temp_decls = []

    # {{{ declare temporaries

    base_storage_sizes = {}
    base_storage_to_scope = {}
    base_storage_to_align_bytes = {}

    from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line

    # Getting the temporary variables that are needed for the current
    # sub-kernel.
    from loopy.schedule.tools import (
            temporaries_read_in_subkernel,
            temporaries_written_in_subkernel)
    subkernel = kernel.schedule[schedule_index].kernel_name
    sub_knl_temps = (
            temporaries_read_in_subkernel(kernel, subkernel)
            | temporaries_written_in_subkernel(kernel, subkernel))

    for tv in sorted(
            six.itervalues(kernel.temporary_variables),
            key=lambda tv: tv.name):
        decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

        if not tv.base_storage:
            for idi in decl_info:
                # global temp vars are mapped to arguments or global
                # declarations
                if tv.address_space != AddressSpace.GLOBAL and (
                        tv.name in sub_knl_temps):
                    decl = self.wrap_temporary_decl(
                            self.get_temporary_decl(
                                codegen_state, schedule_index, tv, idi),
                            tv.address_space)

                    if tv.initializer is not None:
                        assert tv.read_only
                        decl = Initializer(decl, generate_array_literal(
                            codegen_state, tv, tv.initializer))

                    temp_decls.append(decl)

        else:
            assert tv.initializer is None

            offset = 0
            base_storage_sizes.setdefault(tv.base_storage, []).append(
                    tv.nbytes)
            base_storage_to_scope.setdefault(tv.base_storage, []).append(
                    tv.address_space)

            align_size = tv.dtype.itemsize

            from loopy.kernel.array import VectorArrayDimTag
            for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                if isinstance(dim_tag, VectorArrayDimTag):
                    align_size *= axis_len

            base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                    align_size)

            for idi in decl_info:
                cast_decl = POD(self, idi.dtype, "")
                temp_var_decl = POD(self, idi.dtype, idi.name)

                cast_decl = self.wrap_temporary_decl(
                        cast_decl, tv.address_space)
                temp_var_decl = self.wrap_temporary_decl(
                        temp_var_decl, tv.address_space)

                if tv._base_storage_access_may_be_aliasing:
                    ptrtype = _ConstPointer
                else:
                    # The 'restrict' part of this is a complete lie--of course
                    # all these temporaries are aliased. But we're promising to
                    # not use them to shovel data from one representation to
                    # the other. That counts, right?
                    ptrtype = _ConstRestrictPointer

                cast_decl = ptrtype(cast_decl)
                temp_var_decl = ptrtype(temp_var_decl)

                cast_tp, cast_d = cast_decl.get_decl_pair()
                temp_var_decl = Initializer(
                        temp_var_decl,
                        "(%s %s) (%s + %s)" % (
                            " ".join(cast_tp), cast_d,
                            tv.base_storage, offset))

                temp_decls.append(temp_var_decl)

                from pytools import product
                offset += (
                        idi.dtype.itemsize
                        * product(si for si in idi.shape))

    ecm = self.get_expression_to_code_mapper(codegen_state)

    for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
        bs_var_decl = Value("char", bs_name)
        from pytools import single_valued
        bs_var_decl = self.wrap_temporary_decl(
                bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

        # FIXME: Could try to use isl knowledge to simplify max.
        if all(isinstance(bs, int) for bs in bs_sizes):
            bs_size_max = max(bs_sizes)
        else:
            bs_size_max = p.Max(tuple(bs_sizes))

        bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

        alignment = max(base_storage_to_align_bytes[bs_name])
        bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

        base_storage_decls.append(bs_var_decl)

    # }}}

    result = base_storage_decls + temp_decls

    if result:
        result.append(Line())

    return result

def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
        out=None, stream=None, src_offsets=None):
    if not len(arrays):
        return []

    from pytools import single_valued
    a_dtype = single_valued(a.dtype for a in arrays)
    a_allocator = arrays[0].allocator

    vec_count = len(arrays)

    if out is None:
        out = [GPUArray(dest_shape, a_dtype, a_allocator)
                for i in range(vec_count)]
    else:
        if a_dtype != single_valued(o.dtype for o in out):
            raise TypeError("arrays and out must have the same dtype")
        if len(out) != vec_count:
            raise ValueError("out and arrays must have the same length")

    if src_indices.dtype != dest_indices.dtype:
        raise TypeError(
                "src_indices and dest_indices must have the same dtype")

    if len(src_indices.shape) != 1:
        raise ValueError("src_indices must be 1D")

    if src_indices.shape != dest_indices.shape:
        raise ValueError(
                "src_indices and dest_indices must have the same shape")

    if src_offsets is None:
        src_offsets_list = []
        max_chunk_size = 20
    else:
        src_offsets_list = src_offsets
        if len(src_offsets) != vec_count:
            raise ValueError(
                    "src_indices and src_offsets must have the same length")
        max_chunk_size = 10

    chunk_size = _builtin_min(vec_count, max_chunk_size)

    def make_func_for_chunk_size(chunk_size):
        return elementwise.get_take_put_kernel(
                a_dtype, src_indices.dtype,
                with_offsets=src_offsets is not None,
                vec_count=chunk_size)

    func, tex_src = make_func_for_chunk_size(chunk_size)

    for start_i in range(0, len(arrays), chunk_size):
        chunk_slice = slice(start_i, start_i + chunk_size)

        if start_i + chunk_size > vec_count:
            func, tex_src = make_func_for_chunk_size(vec_count - start_i)

        for src_tr, a in zip(tex_src, arrays[chunk_slice]):
            a.bind_to_texref_ext(src_tr, allow_double_hack=True)

        func.prepared_async_call(src_indices._grid, src_indices._block, stream,
                dest_indices.gpudata, src_indices.gpudata,
                *([o.gpudata for o in out[chunk_slice]]
                    + src_offsets_list[chunk_slice]
                    + [src_indices.size]))

    return out