def __call__(self, queue, tree, wait_for=None):
    """Compute peer lists for all boxes of *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(pl, event)*, where *pl* is an instance of
        :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    from pytools import div_ceil

    # The level count is baked into the generated kernel as a stack bound.
    # Round it up to a multiple of 10 so that nearby level counts share a
    # kernel instead of forcing a rebuild for every value.
    rounded_nlevels = 10 * div_ceil(tree.nlevels, 10)

    kernel = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            rounded_nlevels)

    plog = ProcessLogger(logger, "find peer lists")

    query_result, event = kernel(
            queue, tree.nboxes,
            tree.box_centers.data, tree.root_extent,
            tree.box_levels.data, tree.aligned_nboxes,
            tree.box_child_ids.data, tree.box_flags.data,
            wait_for=wait_for)

    plog.done()

    peers = query_result["peers"]
    lookup = PeerListLookup(
            tree=tree,
            peer_list_starts=peers.starts,
            peer_lists=peers.lists).with_queue(None)

    return lookup, event
def __call__(self, queue, tree, wait_for=None):
    """Compute peer lists for all boxes of *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(pl, event)*, where *pl* is an instance of
        :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    from pytools import div_ceil

    # Round up level count--this gets included in the kernel as
    # a stack bound. Rounding avoids too many kernel versions.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    peer_list_finder_kernel = self.get_peer_list_finder_kernel(
        tree.dimensions, tree.coord_dtype, tree.box_id_dtype, max_levels)

    pl_plog = ProcessLogger(logger, "find peer lists")

    # NOTE(review): within this one call, box_centers/box_child_ids are
    # passed via their .data attribute while box_levels/box_flags are passed
    # as-is -- confirm this asymmetry matches the kernel's expected
    # argument types.
    result, evt = peer_list_finder_kernel(queue, tree.nboxes,
            tree.box_centers.data, tree.root_extent,
            tree.box_levels, tree.aligned_nboxes,
            tree.box_child_ids.data, tree.box_flags,
            wait_for=wait_for)

    pl_plog.done()

    # Strip the queue from the result so the lookup can be safely cached /
    # reused with other queues later.
    return PeerListLookup(
            tree=tree,
            peer_list_starts=result["peers"].starts,
            peer_lists=result["peers"].lists).with_queue(None), evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
        wait_for=None):
    """Compute the outer space invader distance for every leaf box of *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
        :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
        for dependency management. The *dtype* of *sqi* is
        *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
        *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
        The entries of *sqi* are indexed by the global box index and are
        as follows:

        * if *i* is not the index of a leaf box, *sqi[i] = 0*.
        * if *i* is the index of a leaf box, *sqi[i]* is the
          outer space invader distance for *i*.
    """
    from pytools import single_valued
    # All coordinate axes of ball_centers must share one dtype, and it must
    # agree with the tree's coordinate dtype.
    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    from pytools import div_ceil
    # Avoid generating too many kernels: the level count enters the kernel,
    # so round it up to a multiple of 10.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    # Build peer lists on demand if the caller did not supply them; chain
    # the resulting event into wait_for so the query waits for it.
    if peer_lists is None:
        peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
        wait_for = [evt]

    # Sanity check: peer list starts are CSR-style, one entry per box plus
    # a terminating entry.
    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError(
            "size of peer lists must match with number of boxes")

    space_invader_query_kernel = self.get_space_invader_query_kernel(
        tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype, max_levels)

    si_plog = ProcessLogger(logger, "space invader query")

    # Output is float32 regardless of coord_dtype (see cast below).
    outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32)
    if not wait_for:
        wait_for = []
    # Also wait for any pending events on the freshly created output array.
    wait_for = wait_for + outer_space_invader_dists.events

    # Launch over one index per query ball (range=slice(len(ball_radii))).
    evt = space_invader_query_kernel(
        *SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
            tree, peer_lists, ball_radii,
            outer_space_invader_dists,
            *tuple(bc for bc in ball_centers)),
        wait_for=wait_for, queue=queue,
        range=slice(len(ball_radii)))

    if tree.coord_dtype != np.dtype(np.float32):
        # The kernel output is always an array of float32 due to limited
        # support for atomic operations with float64 in OpenCL.
        # Here the output is cast to match the coord dtype.
        outer_space_invader_dists.finish()
        outer_space_invader_dists = outer_space_invader_dists.astype(
            tree.coord_dtype)
        # The cast enqueues its own work; report its event to the caller.
        evt, = outer_space_invader_dists.events

    si_plog.done()

    return outer_space_invader_dists, evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
        wait_for=None):
    """Build a lookup from leaf boxes to the balls that overlap them.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
        :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    from pytools import single_valued
    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")

    # First run the forward query: balls -> leaf boxes. Its completion event
    # gates every subsequent step.
    area_query, evt = self.area_query_builder(
            queue, tree, ball_centers, ball_radii, peer_lists, wait_for)
    wait_for = [evt]

    logger.debug("leaves-to-balls lookup: expand starts")

    nkeys = tree.nboxes
    # CSR starts array: one entry per ball plus a terminating entry.
    nballs_p_1 = len(area_query.leaves_near_ball_starts)
    assert nballs_p_1 == len(ball_radii) + 1

    # We invert the area query in two steps:
    #
    # 1. Turn the area query result into (ball number, box number) pairs.
    #    This is done in the "starts expander kernel."
    #
    # 2. Key-value sort the (ball number, box number) pairs by box number.

    starts_expander_knl = self.get_starts_expander_kernel(
        tree.box_id_dtype)
    # expanded_starts[i] will hold the ball number owning entry i of the
    # flat leaves_near_ball_lists array.
    expanded_starts = cl.array.empty(
        queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
    evt = starts_expander_knl(
        expanded_starts,
        area_query.leaves_near_ball_starts.with_queue(queue),
        nballs_p_1)
    wait_for = [evt]

    logger.debug("leaves-to-balls lookup: key-value sort")

    # Sort ball numbers (values) by box number (keys), producing per-box
    # CSR lists of overlapping balls.
    balls_near_box_starts, balls_near_box_lists, evt \
        = self.key_value_sorter(
            queue,
            # keys
            area_query.leaves_near_ball_lists.with_queue(queue),
            # values
            expanded_starts,
            nkeys, starts_dtype=tree.box_id_dtype,
            wait_for=wait_for)

    ltb_plog.done()

    return LeavesToBallsLookup(
        tree=tree,
        balls_near_box_starts=balls_near_box_starts,
        balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
        wait_for=None):
    """Find, for each ball, the leaf boxes it overlaps.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(aq, event)*, where *aq* is an instance of
        :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    from pytools import single_valued
    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    # Balls are identified with the tree's particle id dtype -- presumably
    # because the ball count is comparable to the particle count; the
    # original "?" marker suggests this choice was unverified. TODO confirm.
    ball_id_dtype = tree.particle_id_dtype  # ?

    from pytools import div_ceil
    # Avoid generating too many kernels: the level count enters the kernel,
    # so round it up to a multiple of 10.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    # Build peer lists on demand if the caller did not supply them; chain
    # the resulting event into wait_for so the query waits for it.
    if peer_lists is None:
        peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
        wait_for = [evt]

    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError(
            "size of peer lists must match with number of boxes")

    area_query_kernel = self.get_area_query_kernel(tree.dimensions,
        tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
        peer_lists.peer_list_starts.dtype, max_levels)

    aq_plog = ProcessLogger(logger, "area query")

    # NOTE(review): box_centers/box_child_ids are passed via .data while
    # box_levels/box_flags are passed as-is -- confirm this asymmetry
    # matches the kernel's expected argument types.
    result, evt = area_query_kernel(queue, len(ball_radii),
        tree.box_centers.data, tree.root_extent,
        tree.box_levels, tree.aligned_nboxes,
        tree.box_child_ids.data, tree.box_flags,
        peer_lists.peer_list_starts,
        peer_lists.peer_lists, ball_radii,
        *(tuple(tree.bounding_box[0]) + tuple(bc for bc in ball_centers)),
        wait_for=wait_for)

    aq_plog.done()

    return AreaQueryResult(
            tree=tree,
            leaves_near_ball_starts=result["leaves"].starts,
            leaves_near_ball_lists=result["leaves"].lists).with_queue(
                None), evt
def generate_code_v2(kernel):
    """Generate device (and host) code for *kernel*.

    The kernel is brought to the LINEARIZED state if needed (preprocessing
    and scheduling), type-inferred, checked, and then turned into code.
    Results are memoized in ``code_gen_cache`` keyed on the incoming kernel.

    :returns: a :class:`CodeGenerationResult`
    """

    # {{{ bring kernel to the LINEARIZED state

    from loopy.kernel import KernelState
    if kernel.state == KernelState.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != KernelState.LINEARIZED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # }}}

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # Cache on the kernel as received (pre type inference), so lookups
        # for repeat callers hit before any further processing.
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code")

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    # Flatten kernel arguments into the per-target declaration info that
    # code generation consumes.
    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'"
                    % type(arg))

    # Complex support in the generated code is only enabled when some
    # argument or temporary actually involves a complex dtype.
    allow_complex = False
    for var in kernel.args + list(kernel.temporary_variables.values()):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    # These sets are mutated during code generation and consumed by the
    # preamble machinery below.
    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)

    from loopy.codegen.tools import CodegenOperationCacheManager

    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule),
            codegen_cachemanager=CodegenOperationCacheManager.from_kernel(
                kernel),
            )

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state, schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel,
            codegen_result.implemented_domains, device_code_str)

    # {{{ handle preambles

    # Record every dtype that appears in arguments, temporaries, and (when
    # loops exist) the index dtype, so preamble generators can emit any
    # needed declarations.
    for idi in codegen_state.implemented_data_info:
        seen_dtypes.add(idi.dtype)

    for tv in kernel.temporary_variables.values():
        for idi in tv.decl_info(kernel.target,
                index_dtype=kernel.index_dtype):
            seen_dtypes.add(idi.dtype)

    if kernel.all_inames():
        seen_dtypes.add(kernel.index_dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes,
            codegen_state=codegen_state
            )

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    # For faster unpickling in the common case when implemented_domains
    # isn't needed.
    from loopy.tools import LazilyUnpicklingDict
    codegen_result = codegen_result.copy(
            implemented_domains=LazilyUnpicklingDict(
                codegen_result.implemented_domains))

    codegen_plog.done()

    if CACHING_ENABLED:
        code_gen_cache.store_if_not_present(input_kernel, codegen_result)

    return codegen_result
def get_stored_ids_and_unscaled_projection_matrix(self):
    """Compute which Taylor coefficients need storing and how to recover
    the rest.

    Uses the PDE (via :meth:`get_pde_as_diff_op`) to express
    non-stored coefficients as linear combinations of stored ones.

    :returns: a tuple *(stored_identifiers, op)* where *stored_identifiers*
        is the list of multi-indices that are actually stored and *op* is a
        :class:`CSEMatVecOperator` projecting full coefficient vectors onto
        the stored subset's span.
    """
    from pytools import ProcessLogger
    plog = ProcessLogger(logger, "compute PDE for Taylor coefficients")

    mis = self.get_full_coefficient_identifiers()
    # Map multi-index tuple -> its position in the full coefficient list.
    coeff_ident_enumerate_dict = {tuple(mi): i
        for (i, mi) in enumerate(mis)}

    diff_op = self.get_pde_as_diff_op()
    # This code handles only scalar PDEs (a single equation).
    assert len(diff_op.eqs) == 1
    pde_dict = {k.mi: v for k, v in diff_op.eqs[0].items()}
    for ident in pde_dict.keys():
        if ident not in coeff_ident_enumerate_dict:
            # Order of the expansion is less than the order of the PDE.
            # In that case, the compression matrix is the identity matrix
            # and there's nothing to project
            from_input_coeffs_by_row = [[(i, 1)] for i in range(len(mis))]
            from_output_coeffs_by_row = [[] for _ in range(len(mis))]
            shape = (len(mis), len(mis))
            op = CSEMatVecOperator(from_input_coeffs_by_row,
                from_output_coeffs_by_row, shape)
            return mis, op

    # Calculate the multi-index that appears last in the PDE in
    # reverse degree lexicographic order (degrevlex).
    max_mi_idx = max(coeff_ident_enumerate_dict[ident]
        for ident in pde_dict.keys())
    max_mi = mis[max_mi_idx]
    max_mi_coeff = pde_dict[max_mi]
    # Multiplier that rewrites the PDE as max_mi-term = (other terms).
    max_mi_mult = -1/sym.sympify(max_mi_coeff)

    def is_stored(mi):
        """
        A multi_index mi is stored if mi is smaller than max_mi in at
        least one dimension (i.e. it is not reachable from max_mi by
        adding a nonnegative offset in every dimension).
        """
        return any(mi[d] < max_mi[d] for d in range(self.dim))

    stored_identifiers = []

    from_input_coeffs_by_row = []
    from_output_coeffs_by_row = []
    for i, mi in enumerate(mis):
        # If the multi-index is to be stored, keep the projection matrix
        # entry empty
        if is_stored(mi):
            idx = len(stored_identifiers)
            stored_identifiers.append(mi)
            from_input_coeffs_by_row.append([(idx, 1)])
            from_output_coeffs_by_row.append([])
            continue
        # Offset by which the PDE must be differentiated so that its
        # leading (max_mi) term lands on the current multi-index.
        diff = [mi[d] - max_mi[d] for d in range(self.dim)]

        # eg: u_xx + u_yy + u_zz is represented as
        # [((2, 0, 0), 1), ((0, 2, 0), 1), ((0, 0, 2), 1)]
        assignment = []
        for other_mi, coeff in pde_dict.items():
            j = coeff_ident_enumerate_dict[add_mi(other_mi, diff)]
            if i == j:
                # Skip the u_zz part here.
                continue
            # PDE might not have max_mi_coeff = -1, divide by -max_mi_coeff
            # to get a relation of the form, u_zz = - u_xx - u_yy for
            # Laplace 3D.
            assignment.append((j, coeff*max_mi_mult))
        from_input_coeffs_by_row.append([])
        from_output_coeffs_by_row.append(assignment)

    plog.done()

    logger.debug(
        "number of Taylor coefficients was reduced from {orig} to {red}"
        .format(orig=len(self.get_full_coefficient_identifiers()),
                red=len(stored_identifiers)))

    shape = (len(mis), len(stored_identifiers))
    op = CSEMatVecOperator(from_input_coeffs_by_row,
        from_output_coeffs_by_row, shape)
    return stored_identifiers, op
def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
    """Top-level driver routine for the QBX fast multipole calculation.

    :arg expansion_wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`. The geometry data
        (a :class:`QBXFMMGeometryData` instance) is taken from its
        *geo_data* attribute.
    :arg src_weights: Source 'density/weights/charges'.
        Passed unmodified to *expansion_wrangler*.
    :arg timing_data: Either *None* or a dictionary that collects
        timing data.

    Returns the potentials computed by *expansion_wrangler*.

    See also :func:`boxtree.fmm.drive_fmm`.
    """
    wrangler = expansion_wrangler

    geo_data = wrangler.geo_data
    traversal = geo_data.traversal()
    tree = traversal.tree

    recorder = TimingRecorder()

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    fmm_proc = ProcessLogger(logger, "qbx fmm")

    # Permute the input densities into tree source order.
    src_weights = wrangler.reorder_sources(src_weights)

    # {{{ construct local multipoles

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weights)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ propagate multipoles upward

    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # }}}

    # {{{ direct evaluation from neighbor source boxes ("list 1")

    non_qbx_potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weights)

    recorder.add("eval_direct", timing_future)

    # }}}

    # {{{ translate separated siblings' ("list 2") mpoles to local

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # }}}

    # {{{ evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    non_qbx_potentials = non_qbx_potentials + mpole_result

    # assert that list 3 close has been merged into list 1
    assert traversal.from_sep_close_smaller_starts is None

    # }}}

    # {{{ form locals for separated bigger source boxes ("list 4")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weights)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    # assert that list 4 close has been merged into list 1
    assert traversal.from_sep_close_bigger_starts is None

    # }}}

    # {{{ propagate local_exps downward

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ evaluate locals

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    non_qbx_potentials = non_qbx_potentials + local_result

    # }}}

    # {{{ wrangle qbx expansions

    # QBX local expansions gather contributions from three sources:
    # direct formation, box multipoles, and box locals.
    qbx_expansions, timing_future = \
            wrangler.form_global_qbx_locals(src_weights)

    recorder.add("form_global_qbx_locals", timing_future)

    local_result, timing_future = (
            wrangler.translate_box_multipoles_to_qbx_local(mpole_exps))

    recorder.add("translate_box_multipoles_to_qbx_local", timing_future)

    qbx_expansions = qbx_expansions + local_result

    local_result, timing_future = (
            wrangler.translate_box_local_to_qbx_local(local_exps))

    recorder.add("translate_box_local_to_qbx_local", timing_future)

    qbx_expansions = qbx_expansions + local_result

    qbx_potentials, timing_future = \
            wrangler.eval_qbx_expansions(qbx_expansions)

    recorder.add("eval_qbx_expansions", timing_future)

    # }}}

    # {{{ reorder potentials

    nqbtl = geo_data.non_qbx_box_target_lists()

    all_potentials_in_tree_order = wrangler.full_output_zeros()

    # Scatter the non-QBX potentials from their filtered target numbering
    # back to the unfiltered (tree-order) numbering, then add in the QBX
    # potentials.
    for ap_i, nqp_i in zip(all_potentials_in_tree_order, non_qbx_potentials):
        ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i

    all_potentials_in_tree_order += qbx_potentials

    def reorder_and_finalize_potentials(x):
        # "finalize" gives host FMMs (like FMMlib) a chance to turn the
        # potential back into a CL array.
        return wrangler.finalize_potentials(x[tree.sorted_target_ids])

    from pytools.obj_array import with_object_array_or_scalar
    result = with_object_array_or_scalar(
            reorder_and_finalize_potentials, all_potentials_in_tree_order)

    # }}}

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())
    return result
def as_scalar_pde(pde, vec_idx):
    r"""
    Returns a scalar PDE that is satisfied by the *vec_idx* component
    of *pde*.

    :arg pde: An instance of :class:`LinearPDESystemOperator`
    :arg vec_idx: the index of the vector-valued function that we want as
        a scalar PDE
    :raises AssertionError: if no scalar PDE is found by taking derivatives
        of the system up to order 99.
    """
    from sumpy.tools import nullspace

    # Collect the set of vector components that actually appear in the system.
    indices = set()
    for eq in pde.eqs:
        for deriv_ident in eq.keys():
            indices.add(deriv_ident.vec_idx)

    # this is already a scalar pde
    if len(indices) == 1 and next(iter(indices)) == vec_idx:
        return pde

    from pytools import ProcessLogger
    plog = ProcessLogger(logger, "computing single PDE for multiple PDEs")

    from pytools import (
            generate_nonnegative_integer_tuples_summing_to_at_most
            as gnitstam)

    dim = pde.total_dims

    # slowly increase the order of the derivatives that we take of the
    # system of PDEs. Once we reach the order of the scalar PDE, this
    # loop will break
    for order in range(2, 100):
        mis = sorted(gnitstam(order, dim), key=sum)

        pde_mat = []
        # Map multi-index tuple -> its position in *mis*.
        coeff_ident_enumerate_dict = {
                tuple(mi): i for (i, mi) in enumerate(mis)}
        offset = len(mis)

        # Create a matrix of equations that are derivatives of the
        # original system of PDEs
        for mi in mis:
            for pde_dict in pde.eqs:
                eq = [0]*(len(mis)*(max(indices)+1))
                for ident, coeff in pde_dict.items():
                    c = tuple(add_mi(ident.mi, mi))
                    if c not in coeff_ident_enumerate_dict:
                        # This derivative exceeds the current order cap;
                        # drop the entire differentiated equation.
                        break
                    idx = offset*ident.vec_idx + coeff_ident_enumerate_dict[c]
                    eq[idx] = coeff
                else:
                    pde_mat.append(eq)

        if len(pde_mat) == 0:
            continue

        # Get the nullspace of the matrix and get the rows related to this
        # vec_idx
        n = nullspace(pde_mat)[offset*vec_idx:offset*(vec_idx+1), :]
        indep_row = find_linear_relationship(n)
        if len(indep_row) > 0:
            pde_dict = {}
            # Normalize so that the coefficient of the highest-index
            # multi-index becomes 1.
            mult = indep_row[max(indep_row.keys())]
            for k, v in indep_row.items():
                pde_dict[DerivativeIdentifier(mis[k], 0)] = v / mult
            plog.done()
            return LinearPDESystemOperator(pde.dim, pmap(pde_dict))

    plog.done()

    # No scalar PDE found up to the order cap. A bare "assert False" here
    # would be stripped under "python -O" and make this function silently
    # return None, so raise explicitly (keeping the same exception type).
    raise AssertionError(
            "failed to find a scalar PDE for vector component "
            f"{vec_idx} with derivative orders up to 99")
def drive_fmm(traversal, expansion_wrangler, src_weights, timing_data=None):
    """Top-level driver routine for a fast multipole calculation.

    In part, this is intended as a template for custom FMMs, in the sense that
    you may copy and paste its
    `source code <https://github.com/inducer/boxtree/blob/master/boxtree/fmm.py>`_
    as a starting point.

    Nonetheless, many common applications (such as point-to-point FMMs) can be
    covered by supplying the right *expansion_wrangler* to this routine.

    :arg traversal: A :class:`boxtree.traversal.FMMTraversalInfo` instance.
    :arg expansion_wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`.
    :arg src_weights: Source 'density/weights/charges'.
        Passed unmodified to *expansion_wrangler*.
    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
        timing information for the stages of the algorithm (in the form of
        :class:`TimingResult`), if such information is available.

    Returns the potentials computed by *expansion_wrangler*.
    """
    wrangler = expansion_wrangler

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    # This is the generic driver, so label the process log "fmm" (the
    # "qbx fmm" label belongs to the QBX-specific driver).
    fmm_proc = ProcessLogger(logger, "fmm")
    recorder = TimingRecorder()

    # Permute the input densities into tree source order.
    src_weights = wrangler.reorder_sources(src_weights)

    # {{{ "Step 2.1:" Construct local multipoles

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weights)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ "Step 2.2:" Propagate multipoles upward

    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # mpole_exps is called Phi in [1]

    # }}}

    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")

    potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weights)

    recorder.add("eval_direct", timing_future)

    # these potentials are called alpha in [1]

    # }}}

    # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # local_exps represents both Gamma and Delta in [1]

    # }}}

    # {{{ "Stage 5:" evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    potentials = potentials + mpole_result

    # these potentials are called beta in [1]

    # If list 3 close ("from_sep_close_smaller") was not merged into list 1
    # by the traversal, evaluate it directly here.
    if traversal.from_sep_close_smaller_starts is not None:
        logger.debug("evaluate separated close smaller interactions directly "
                "('list 3 close')")

        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_smaller_starts,
                traversal.from_sep_close_smaller_lists,
                src_weights)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weights)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    # Same as for list 3 close: handle an unmerged list 4 close directly.
    # (Debug log added for consistency with the list 3 close branch.)
    if traversal.from_sep_close_bigger_starts is not None:
        logger.debug("evaluate separated close bigger interactions directly "
                "('list 4 close')")

        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_bigger_starts,
                traversal.from_sep_close_bigger_lists,
                src_weights)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 7:" propagate local_exps downward

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ "Stage 8:" evaluate locals

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    potentials = potentials + local_result

    # }}}

    # Bring the potentials back into user target order and let the wrangler
    # apply any final post-processing.
    result = wrangler.reorder_potentials(potentials)

    result = wrangler.finalize_potentials(result)

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())
    return result
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
        wait_for=None):
    """Compute the outer space invader distance for every leaf box of *tree*.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
        :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
        for dependency management. The *dtype* of *sqi* is
        *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
        *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
        The entries of *sqi* are indexed by the global box index and are
        as follows:

        * if *i* is not the index of a leaf box, *sqi[i] = 0*.
        * if *i* is the index of a leaf box, *sqi[i]* is the
          outer space invader distance for *i*.
    """
    from pytools import single_valued
    # All coordinate axes of ball_centers must share one dtype, and it must
    # agree with the tree's coordinate dtype.
    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    from pytools import div_ceil
    # Avoid generating too many kernels: the level count enters the kernel,
    # so round it up to a multiple of 10.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    # Build peer lists on demand if the caller did not supply them; chain
    # the resulting event into wait_for so the query waits for it.
    if peer_lists is None:
        peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
        wait_for = [evt]

    # Sanity check: peer list starts are CSR-style, one entry per box plus
    # a terminating entry.
    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError("size of peer lists must match with number of boxes")

    space_invader_query_kernel = self.get_space_invader_query_kernel(
        tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
        peer_lists.peer_list_starts.dtype, max_levels)

    si_plog = ProcessLogger(logger, "space invader query")

    # Output is float32 regardless of coord_dtype (see cast below).
    outer_space_invader_dists = cl.array.zeros(queue, tree.nboxes, np.float32)
    if not wait_for:
        wait_for = []
    # Also wait for any pending events on the freshly created output array.
    wait_for = wait_for + outer_space_invader_dists.events

    # Launch over one index per query ball (range=slice(len(ball_radii))).
    evt = space_invader_query_kernel(
        *SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
            tree, peer_lists, ball_radii,
            outer_space_invader_dists,
            *tuple(bc for bc in ball_centers)),
        wait_for=wait_for, queue=queue,
        range=slice(len(ball_radii)))

    if tree.coord_dtype != np.dtype(np.float32):
        # The kernel output is always an array of float32 due to limited
        # support for atomic operations with float64 in OpenCL.
        # Here the output is cast to match the coord dtype.
        outer_space_invader_dists.finish()
        outer_space_invader_dists = outer_space_invader_dists.astype(
            tree.coord_dtype)
        # The cast enqueues its own work; report its event to the caller.
        evt, = outer_space_invader_dists.events

    si_plog.done()

    return outer_space_invader_dists, evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
        wait_for=None):
    """For each leaf box, find the balls that overlap it (the inverse of
    an area query).

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
        :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    # Validate dtypes early for a clear error message.
    from pytools import single_valued
    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")

    # First find, for each ball, the leaves it overlaps; we then invert
    # that ball->leaves mapping below.
    area_query, evt = self.area_query_builder(
        queue, tree, ball_centers, ball_radii, peer_lists, wait_for)
    wait_for = [evt]

    logger.debug("leaves-to-balls lookup: expand starts")

    nkeys = tree.nboxes
    nballs_p_1 = len(area_query.leaves_near_ball_starts)
    # The "starts" array is CSR-style: one entry per ball plus a sentinel.
    assert nballs_p_1 == len(ball_radii) + 1

    # We invert the area query in two steps:
    #
    # 1. Turn the area query result into (ball number, box number) pairs.
    #    This is done in the "starts expander kernel."
    #
    # 2. Key-value sort the (ball number, box number) pairs by box number.

    starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype)
    # One expanded "ball number" entry per (ball, leaf) pair in the
    # area-query result.
    expanded_starts = cl.array.empty(
        queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
    evt = starts_expander_knl(
        expanded_starts,
        area_query.leaves_near_ball_starts.with_queue(queue),
        nballs_p_1)
    wait_for = [evt]

    logger.debug("leaves-to-balls lookup: key-value sort")

    balls_near_box_starts, balls_near_box_lists, evt \
        = self.key_value_sorter(
            queue,
            # keys
            area_query.leaves_near_ball_lists.with_queue(queue),
            # values
            expanded_starts,
            nkeys, starts_dtype=tree.box_id_dtype,
            wait_for=wait_for)

    ltb_plog.done()

    return LeavesToBallsLookup(
        tree=tree,
        balls_near_box_starts=balls_near_box_starts,
        balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
        wait_for=None):
    """For each ball, find the leaf boxes of *tree* that it overlaps.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg tree: a :class:`boxtree.Tree`.
    :arg ball_centers: an object array of coordinate
        :class:`pyopencl.array.Array` instances.
        Their *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg ball_radii: a
        :class:`pyopencl.array.Array`
        of positive numbers.
        Its *dtype* must match *tree*'s
        :attr:`boxtree.Tree.coord_dtype`.
    :arg peer_lists: may either be *None* or an instance of
        :class:`PeerListLookup` associated with `tree`.
    :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
        instances for whose completion this command waits before starting
        execution.
    :returns: a tuple *(aq, event)*, where *aq* is an instance of
        :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
        for dependency management.
    """
    # Validate dtypes early for a clear error message.
    from pytools import single_valued
    if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
        raise TypeError("ball_centers dtype must match tree.coord_dtype")
    if ball_radii.dtype != tree.coord_dtype:
        raise TypeError("ball_radii dtype must match tree.coord_dtype")

    ball_id_dtype = tree.particle_id_dtype  # ?

    from pytools import div_ceil
    # Avoid generating too many kernels: round the level count (a kernel
    # stack bound) up to the next multiple of 10 so nearby trees share a
    # cached kernel.
    max_levels = div_ceil(tree.nlevels, 10) * 10

    # Peer lists are required by the query kernel; build them on demand.
    if peer_lists is None:
        peer_lists, evt = self.peer_list_finder(queue, tree, wait_for=wait_for)
        wait_for = [evt]

    # Sanity-check that the supplied peer lists belong to this tree.
    if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
        raise ValueError("size of peer lists must match with number of boxes")

    area_query_kernel = self.get_area_query_kernel(tree.dimensions,
        tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
        peer_lists.peer_list_starts.dtype, max_levels)

    aq_plog = ProcessLogger(logger, "area query")

    result, evt = area_query_kernel(
        queue, len(ball_radii),
        tree.box_centers.data, tree.root_extent,
        tree.box_levels.data, tree.aligned_nboxes,
        tree.box_child_ids.data, tree.box_flags.data,
        peer_lists.peer_list_starts.data,
        peer_lists.peer_lists.data, ball_radii.data,
        # Flattened trailing arguments: the bounding box's lower corner
        # coordinates, then one coordinate array per axis.
        *(tuple(tree.bounding_box[0])
            + tuple(bc.data for bc in ball_centers)),
        wait_for=wait_for)

    aq_plog.done()

    return AreaQueryResult(
        tree=tree,
        leaves_near_ball_starts=result["leaves"].starts,
        leaves_near_ball_lists=result["leaves"].lists).with_queue(None), evt
def __call__(self, queue, balls_to_leaves_lookup=None, wait_for=None):
    """Build an :class:`ElementsToSourcesLookup` mapping each mesh element
    to the tree sources that lie inside it.

    :arg queue: a :class:`pyopencl.CommandQueue`
    :arg balls_to_leaves_lookup: may either be *None* (in which case a
        coarse lookup is computed via :meth:`compute_short_lists`) or a
        precomputed coarse ball-to-leaves lookup to refine.
    :arg wait_for: may either be *None* or a list of
        :class:`pyopencl.Event` instances to wait for before execution.
    :returns: a tuple of an :class:`ElementsToSourcesLookup` and a
        :class:`pyopencl.Event` for dependency management.
    """
    slk_plog = ProcessLogger(logger, "element-to-source lookup: run area query")

    if balls_to_leaves_lookup is None:
        balls_to_leaves_lookup, evt = \
            self.compute_short_lists(queue, wait_for=wait_for)
        wait_for = [evt]

    # -----------------------------------------------------------------
    # Refine the area query using point-in-simplex test

    logger.debug("element-to-source lookup: refine starts")

    element_lookup_kernel = self.get_simplex_lookup_kernel()

    # Transfer the mesh vertex coordinates to the device, one array per axis.
    vertices_dev = make_obj_array([
        cl.array.to_device(queue, verts)
        for verts in self.discr.mesh.vertices])

    mesh_vertices_kwargs = {
        f"mesh_vertices_{iaxis}": vertices_dev[iaxis]
        for iaxis in range(self.dim)}

    source_points_kwargs = {
        f"source_points_{iaxis}": self.tree.sources[iaxis]
        for iaxis in range(self.dim)}

    # result starts at -1 so that sources not matched to any element are
    # distinguishable after the kernel runs.
    # NOTE(review): only groups[0].vertex_indices is passed — presumably
    # this assumes a single-group mesh; confirm against callers.
    evt, res = element_lookup_kernel(
        queue, dim=self.dim, nboxes=self.tree.nboxes,
        nelements=self.discr.mesh.nelements, nsources=self.tree.nsources,
        result=cl.array.zeros(queue, self.tree.nsources, dtype=np.int32) - 1,
        mesh_vertex_indices=self.discr.mesh.groups[0].vertex_indices,
        box_source_starts=self.tree.box_source_starts,
        box_source_counts_cumul=self.tree.box_source_counts_cumul,
        leaves_near_ball_starts=balls_to_leaves_lookup.leaves_near_ball_starts,
        leaves_near_ball_lists=balls_to_leaves_lookup.leaves_near_ball_lists,
        wait_for=wait_for, **mesh_vertices_kwargs, **source_points_kwargs)

    source_to_element_lookup, = res

    wait_for = [evt]

    # -----------------------------------------------------------------
    # Invert the source-to-element lookup by a key-value sort

    logger.debug("element-to-source lookup: key-value sort")

    sources_in_element_starts, sources_in_element_lists, evt = \
        self.key_value_sorter(
            queue,
            keys=source_to_element_lookup,
            values=cl.array.arange(
                queue, self.tree.nsources, dtype=self.tree.box_id_dtype),
            nkeys=self.discr.mesh.nelements,
            starts_dtype=self.tree.box_id_dtype,
            wait_for=wait_for)

    slk_plog.done()

    return ElementsToSourcesLookup(
        tree=self.tree, discr=self.discr,
        sources_in_element_starts=sources_in_element_starts,
        sources_in_element_lists=sources_in_element_lists), evt
def drive_fmm(expansion_wrangler, src_weight_vecs, timing_data=None,
        traversal=None):
    """Top-level driver routine for the QBX fast multipole calculation.

    :arg expansion_wrangler: An object exhibiting the
        :class:`boxtree.fmm.ExpansionWranglerInterface`. Its *geo_data*
        attribute supplies the geometry (a
        :class:`pytential.qbx.geometry.QBXFMMGeometryData` instance).
    :arg src_weight_vecs: A sequence of source 'density/weights/charges'.
        Passed unmodified to *expansion_wrangler*.
    :arg timing_data: Either *None* or a dictionary that collects
        timing data.
    :arg traversal: Either *None* (in which case the traversal is obtained
        from the wrangler's geometry data) or a traversal object to use.

    Returns the potentials computed by *expansion_wrangler*.

    See also :func:`boxtree.fmm.drive_fmm`.
    """
    wrangler = expansion_wrangler

    geo_data = wrangler.geo_data

    if traversal is None:
        traversal = geo_data.traversal()

    tree = traversal.tree

    recorder = TimingRecorder()

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    fmm_proc = ProcessLogger(logger, "qbx fmm")

    # Sources must be in tree order before any per-box stage runs.
    src_weight_vecs = [wrangler.reorder_sources(weight)
        for weight in src_weight_vecs]

    # {{{ construct local multipoles

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weight_vecs)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ propagate multipoles upward

    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # }}}

    # {{{ direct evaluation from neighbor source boxes ("list 1")

    non_qbx_potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weight_vecs)

    recorder.add("eval_direct", timing_future)

    # }}}

    # {{{ translate separated siblings' ("list 2") mpoles to local

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # }}}

    # {{{ evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    non_qbx_potentials = non_qbx_potentials + mpole_result

    # assert that list 3 close has been merged into list 1
    assert traversal.from_sep_close_smaller_starts is None

    # }}}

    # {{{ form locals for separated bigger source boxes ("list 4")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weight_vecs)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    # assert that list 4 close has been merged into list 1
    assert traversal.from_sep_close_bigger_starts is None

    # }}}

    # {{{ propagate local_exps downward

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ evaluate locals

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    non_qbx_potentials = non_qbx_potentials + local_result

    # }}}

    # {{{ wrangle qbx expansions

    # form_global_qbx_locals and eval_target_specific_qbx_locals are
    # responsible for the same interactions (directly evaluated portion of
    # the potentials via unified List 1). Which one is used depends on the
    # wrangler. If one of them is unused the corresponding output entries
    # will be zero.

    qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weight_vecs)

    recorder.add("form_global_qbx_locals", timing_future)

    local_result, timing_future = (
            wrangler.translate_box_multipoles_to_qbx_local(mpole_exps))

    recorder.add("translate_box_multipoles_to_qbx_local", timing_future)

    qbx_expansions = qbx_expansions + local_result

    local_result, timing_future = (
            wrangler.translate_box_local_to_qbx_local(local_exps))

    recorder.add("translate_box_local_to_qbx_local", timing_future)

    qbx_expansions = qbx_expansions + local_result

    qbx_potentials, timing_future = wrangler.eval_qbx_expansions(qbx_expansions)

    recorder.add("eval_qbx_expansions", timing_future)

    ts_result, timing_future = \
        wrangler.eval_target_specific_qbx_locals(src_weight_vecs)

    qbx_potentials = qbx_potentials + ts_result

    recorder.add("eval_target_specific_qbx_locals", timing_future)

    # }}}

    # {{{ reorder potentials

    nqbtl = geo_data.non_qbx_box_target_lists()

    all_potentials_in_tree_order = wrangler.full_output_zeros()

    # Scatter the non-QBX potentials back to their unfiltered target slots.
    for ap_i, nqp_i in zip(all_potentials_in_tree_order, non_qbx_potentials):
        ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i

    all_potentials_in_tree_order += qbx_potentials

    def reorder_and_finalize_potentials(x):
        # "finalize" gives host FMMs (like FMMlib) a chance to turn the
        # potential back into a CL array.
        return wrangler.finalize_potentials(x[tree.sorted_target_ids])

    from pytools.obj_array import obj_array_vectorize
    result = obj_array_vectorize(
            reorder_and_finalize_potentials, all_potentials_in_tree_order)

    # }}}

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())
    return result
def drive_fmm(traversal, expansion_wrangler, src_weights, timing_data=None):
    """Top-level driver routine for a fast multipole calculation.

    In part, this is intended as a template for custom FMMs, in the sense that
    you may copy and paste its
    `source code <https://github.com/inducer/boxtree/blob/master/boxtree/fmm.py>`_
    as a starting point.

    Nonetheless, many common applications (such as point-to-point FMMs) can be
    covered by supplying the right *expansion_wrangler* to this routine.

    :arg traversal: A :class:`boxtree.traversal.FMMTraversalInfo` instance.
    :arg expansion_wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`.
    :arg src_weights: Source 'density/weights/charges'.
        Passed unmodified to *expansion_wrangler*.
    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
        timing information for the stages of the algorithm (in the form of
        :class:`TimingResult`), if such information is available.

    Returns the potentials computed by *expansion_wrangler*.
    """
    wrangler = expansion_wrangler

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    fmm_proc = ProcessLogger(logger, "fmm")
    recorder = TimingRecorder()

    # Sources must be in tree order before any per-box stage runs.
    src_weights = wrangler.reorder_sources(src_weights)

    # {{{ "Step 2.1:" Construct local multipoles

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weights)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ "Step 2.2:" Propagate multipoles upward

    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # mpole_exps is called Phi in [1]

    # }}}

    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")

    potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weights)

    recorder.add("eval_direct", timing_future)

    # these potentials are called alpha in [1]

    # }}}

    # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # local_exps represents both Gamma and Delta in [1]

    # }}}

    # {{{ "Stage 5:" evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    potentials = potentials + mpole_result

    # these potentials are called beta in [1]

    # "List 3 close": near-field part of list 3, evaluated directly when
    # the traversal keeps it separate.
    if traversal.from_sep_close_smaller_starts is not None:
        logger.debug("evaluate separated close smaller interactions directly "
                "('list 3 close')")
        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_smaller_starts,
                traversal.from_sep_close_smaller_lists,
                src_weights)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weights)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    # "List 4 close": near-field part of list 4, evaluated directly when
    # the traversal keeps it separate.
    if traversal.from_sep_close_bigger_starts is not None:
        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_bigger_starts,
                traversal.from_sep_close_bigger_lists,
                src_weights)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 7:" propagate local_exps downward

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ "Stage 8:" evaluate locals

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    potentials = potentials + local_result

    # }}}

    # Undo the source/target reordering and give the wrangler a chance to
    # post-process (e.g. convert back to a device array).
    result = wrangler.reorder_potentials(potentials)

    result = wrangler.finalize_potentials(result)

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())
    return result
def parse_fortran(source, filename="<floopy code>", free_form=None, strict=None,
        seq_dependencies=None, auto_dependencies=None, target=None):
    """Parse Fortran *source* into a loopy translation unit.

    :arg source: the Fortran source text.
    :arg filename: name used for error reporting and kernel attribution.
    :arg free_form: whether to parse as free-form Fortran (default *True*).
    :arg strict: whether to parse in strict mode (default *True*).
    :arg seq_dependencies: whether to add sequential dependencies between
        statements (default *True*).
    :arg auto_dependencies: deprecated alias for *seq_dependencies*; may not
        be given together with it.
    :arg target: the :mod:`loopy` target to generate code for.
    :returns: a :class:`loopy.TranslationUnit`
    :raises TypeError: if both *seq_dependencies* and *auto_dependencies*
        are supplied.
    :raises LoopyError: if the Fortran parser returns invalid data.
    """
    parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename)

    if seq_dependencies is not None and auto_dependencies is not None:
        raise TypeError(
                "may not specify both seq_dependencies and auto_dependencies")

    if auto_dependencies is not None:
        from warnings import warn
        warn("auto_dependencies is deprecated, use seq_dependencies instead",
                DeprecationWarning, stacklevel=2)
        seq_dependencies = auto_dependencies

    # Apply defaults for unspecified options.
    if seq_dependencies is None:
        seq_dependencies = True
    if free_form is None:
        free_form = True
    if strict is None:
        strict = True

    # Route fparser's log output to the console.
    # NOTE(review): this adds a new StreamHandler on every call, so repeated
    # calls duplicate fparser log lines; consider registering the handler
    # only once at module level.
    import logging
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter("%(name)-12s: %(levelname)-8s %(message)s")
    console.setFormatter(formatter)
    logging.getLogger("fparser").addHandler(console)

    from fparser import api
    tree = api.parse(source, isfree=free_form, isstrict=strict,
            analyze=False, ignore_comments=False)

    if tree is None:
        raise LoopyError("Fortran parser was unhappy with source code "
                "and returned invalid data (Sorry!)")

    from loopy.frontend.fortran.translator import F2LoopyTranslator
    f2loopy = F2LoopyTranslator(filename, target=target)
    f2loopy(tree)

    kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies)

    from loopy.transform.callable import merge
    prog = merge(kernels)
    all_kernels = [clbl.subkernel for clbl in prog.callables_table.values()]

    for knl in all_kernels:
        # BUG FIX: with_kernel returns a new (immutable) translation unit;
        # the original code discarded the return value, making this loop a
        # no-op. Rebind so the assignee-augmented kernels take effect.
        prog = prog.with_kernel(_add_assignees_to_calls(knl, all_kernels))

    if len(all_kernels) == 1:
        # guessing in the case of only one function
        prog = prog.with_entrypoints(all_kernels[0].name)

    from loopy.frontend.fortran.translator import specialize_fortran_division
    prog = specialize_fortran_division(prog)

    parse_plog.done()

    return prog