def ast_matmul(self, F_a):
    """Generate an AST for a PyOP2 kernel performing a matrix-vector multiplication.

    The generated kernel computes C = A*B elementwise over /ndofs*cdim/ dofs,
    where A is laid out flat and C is addressed as C[i/cdim][i%cdim]. Results
    are cached in ``self.asts`` keyed by (ndofs, cdim, kernel name).

    :param F_a: Assembled firedrake.Function object for the RHS"""

    # Each element contributes ndofs*cdim degrees of freedom
    space = F_a.function_space()
    ndofs = sum(space.topological.dofs_per_entity)
    cdim = space.dim
    name = 'mat_vec_mul_kernel_%s' % space.name
    key = (ndofs, cdim, name)
    if key in self.asts:
        # An identical kernel was already generated: reuse it
        return self.asts[key]

    # Craft the AST bottom-up, starting from the innermost statement.
    # C is addressed as a 2D entity, hence the i/cdim, i%cdim index strings.
    row, col = 'i/%d' % cdim, 'i%%%d' % cdim
    product = ast.Prod(ast.Symbol('A', ('i',), ((ndofs*cdim, 'j*%d + k' % cdim),)),
                       ast.Symbol('B', ('j', 'k')))
    innermost = ast.Incr(ast.Symbol('C', (row, col)), product)
    k_loop = ast.c_for('k', cdim, innermost).children[0]
    j_level = [ast.Assign(ast.Symbol('C', (row, col)), '0.0'),
               ast.c_for('j', ndofs, k_loop).children[0]]
    loop_nest = ast.Root([ast.c_for('i', ndofs*cdim, j_level).children[0]])
    arguments = [ast.Decl('double*', 'A'),
                 ast.Decl('double**', 'B'),
                 ast.Decl('double**', 'C')]
    fundecl = ast.FunDecl('void', name, arguments, loop_nest, ['static', 'inline'])

    # Cache the AST for later fast retrieval
    self.asts[key] = fundecl
    return fundecl
def _tabulate_tensor(ir, parameters):
    """Generate code for a single integral (tabulate_tensor()).

    Builds the body of a tabulate_tensor kernel from the intermediate
    representation ``ir``: geometry code (Jacobians, determinants, normals,
    ...) selected by the integral type, quadrature weight tables, basis
    function tables, and the element-tensor computation loop nest.

    :param ir: intermediate representation dict for one integral.
    :param parameters: code generation parameters (format, precision, ...).
    :returns: the root node of the generated (PyOP2-flavoured) AST.
    """

    # Code-generation format strings, fetched once up front.
    p_format = parameters["format"]
    precision = parameters["precision"]

    # NOTE(review): several of these shortcuts (f_comment, f_G, f_const_double,
    # f_float, f_assign, f_A, f_r, f_j, f_k, f_loop, f_int) appear unused in
    # this function body — possibly leftovers from an earlier version.
    f_comment = format["comment"]
    f_G = format["geometry constant"]
    f_const_double = format["assign"]
    f_float = format["float"]
    f_assign = format["assign"]
    f_A = format["element tensor"][p_format]
    f_r = format["free indices"][0]
    f_j = format["first free index"]
    f_k = format["second free index"]
    f_loop = format["generate loop"]
    f_int = format["int"]
    f_weight = format["weight"]

    # Get data.
    opt_par = ir["optimise_parameters"]
    integral_type = ir["integral_type"]
    cell = ir["cell"]
    gdim = cell.geometric_dimension()
    tdim = cell.topological_dimension()
    # NOTE(review): num_facets and geo_consts are read from the IR but not
    # used below — confirm they are needed.
    num_facets = ir["num_facets"]
    num_vertices = ir["num_vertices"]
    integrals = ir["trans_integrals"]
    geo_consts = ir["geo_consts"]
    oriented = ir["needs_oriented"]

    # Create sets of used variables. _generate_element_tensor populates these
    # as a side effect through /sets/.
    used_weights = set()
    used_psi_tables = set()
    used_nzcs = set()
    trans_set = set()
    sets = [used_weights, used_psi_tables, used_nzcs, trans_set]

    affine_tables = {}  # TODO: This is not populated anywhere, remove?
    quadrature_weights = ir["quadrature_weights"]

    # The pyop2 format requires dereferencing constant coefficients since
    # these are passed in as double *
    common = []
    if p_format == "pyop2":
        for n, c in zip(ir["coefficient_names"], ir["coefficient_elements"]):
            if c.family() == 'Real':
                # Second index is always? 0, so we cast to (double (*)[1]).
                common += ['double (*w%(n)s)[1] = (double (*)[1])c%(n)s;\n' % {'n': n[1:]}]

    # Each branch below generates (a) the element-tensor loop nest and
    # (b) the geometry snippets appropriate for that integral type.
    operations = []
    if integral_type == "cell":
        # Update transformer with facets and generate code + set of used geometry terms.
        nest_ir, num_ops = _generate_element_tensor(integrals, sets,
                                                    opt_par, parameters)

        # Set operations equal to num_ops (for printing info on operations).
        operations.append([num_ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented and tdim != gdim:
            # NEED TO THINK ABOUT THIS FOR EXTRUSION
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
        jacobi_code += format["scale factor snippet"][p_format]

        # Generate code for cell volume and circumradius -- note that the
        # former will be incorrect on extruded meshes by a constant factor.
        jacobi_code += "\n\n" + format["generate cell volume"][p_format](tdim, gdim, integral_type)
        jacobi_code += "\n\n" + format["generate circumradius"][p_format](tdim, gdim, integral_type)

    elif integral_type in ("exterior_facet", "exterior_facet_vert"):
        if p_format == 'pyop2':
            # The facet index arrives as a pointer in the pyop2 kernel signature.
            common += ["unsigned int facet = *facet_p;\n"]

        # Generate tensor code for facets + set of used geometry terms.
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented and tdim != gdim:
            # NEED TO THINK ABOUT THIS FOR EXTRUSION
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
        if integral_type == "exterior_facet":
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate facet area"](tdim, gdim)
            if tdim == 3:
                # Min/max facet edge lengths only make sense for 3D cells.
                jacobi_code += "\n\n" + format["generate min facet edge length"](tdim, gdim)
                jacobi_code += "\n\n" + format["generate max facet edge length"](tdim, gdim)

            # Generate code for cell volume and circumradius
            jacobi_code += "\n\n" + format["generate cell volume"][p_format](tdim, gdim, integral_type)
            jacobi_code += "\n\n" + format["generate circumradius"][p_format](tdim, gdim, integral_type)

        elif integral_type == "exterior_facet_vert":
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
            # OTHER THINGS NOT IMPLEMENTED YET
        else:
            raise RuntimeError("Invalid integral_type")

    elif integral_type in ("exterior_facet_top", "exterior_facet_bottom"):
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented:
            # NEED TO THINK ABOUT THIS FOR EXTRUSION
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
        jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type)
        jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
        # THE REST IS NOT IMPLEMENTED YET

    elif integral_type in ("interior_facet", "interior_facet_vert"):
        if p_format == 'pyop2':
            # Interior facets see two cells; set up per-cell facet indices
            # and coordinate pointers.
            common += ["unsigned int facet_0 = facet_p[0];"]
            common += ["unsigned int facet_1 = facet_p[1];"]
            common += ["double **coordinate_dofs_0 = coordinate_dofs;"]
            # Note that the following line is unsafe for isoparametric elements.
            common += ["double **coordinate_dofs_1 = coordinate_dofs + %d;" % num_vertices]

        # Generate tensor code for facets + set of used geometry terms.
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code = ""
        # Geometry is needed for both the '+' and '-' restrictions.
        for _r in ["+", "-"]:
            if p_format == "pyop2":
                jacobi_code += format["compute_jacobian_interior"](cell, r=_r)
            else:
                jacobi_code += format["compute_jacobian"](cell, r=_r)
            jacobi_code += "\n"
            jacobi_code += format["compute_jacobian_inverse"](cell, r=_r)
            if oriented and tdim != gdim:
                # NEED TO THINK ABOUT THIS FOR EXTRUSION
                jacobi_code += format["orientation"][p_format](tdim, gdim, r=_r)
            jacobi_code += "\n"

        if integral_type == "interior_facet":
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type, r="+")
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
            jacobi_code += "\n\n" + format["generate facet area"](tdim, gdim)
            if tdim == 3:
                jacobi_code += "\n\n" + format["generate min facet edge length"](tdim, gdim, r="+")
                jacobi_code += "\n\n" + format["generate max facet edge length"](tdim, gdim, r="+")

            # Generate code for cell volume and circumradius
            jacobi_code += "\n\n" + format["generate cell volume"][p_format](tdim, gdim, integral_type)
            jacobi_code += "\n\n" + format["generate circumradius interior"](tdim, gdim, integral_type)

        elif integral_type == "interior_facet_vert":
            # THE REST IS NOT IMPLEMENTED YET
            jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type, r="+")
            jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
        else:
            raise RuntimeError("Invalid integral_type")

    elif integral_type == "interior_facet_horiz":
        common += ["double **coordinate_dofs_0 = coordinate_dofs;"]
        # Note that the following line is unsafe for isoparametric elements.
        common += ["double **coordinate_dofs_1 = coordinate_dofs + %d;" % num_vertices]

        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code = ""
        for _r in ["+", "-"]:
            jacobi_code += format["compute_jacobian_interior"](cell, r=_r)
            jacobi_code += "\n"
            jacobi_code += format["compute_jacobian_inverse"](cell, r=_r)
            if oriented:
                # NEED TO THINK ABOUT THIS FOR EXTRUSION
                jacobi_code += format["orientation"][p_format](tdim, gdim, r=_r)
            jacobi_code += "\n"

        # TODO: verify that this is correct (we think it is)
        jacobi_code += "\n\n" + format["facet determinant"](cell, p_format, integral_type, r="+")
        jacobi_code += "\n\n" + format["generate normal"](cell, p_format, integral_type)
        # THE REST IS NOT IMPLEMENTED YET

    elif integral_type == "point":
        # Update transformer with vertices and generate code + set of used geometry terms.
        nest_ir, ops = _generate_element_tensor(integrals, sets, opt_par, parameters)

        # Save number of operations (for printing info on operations).
        operations.append([ops])

        # Generate code for basic geometric quantities
        # @@@: Jacobian snippet
        jacobi_code = ""
        jacobi_code += format["compute_jacobian"](cell)
        jacobi_code += "\n"
        jacobi_code += format["compute_jacobian_inverse"](cell)
        if oriented and tdim != gdim:
            jacobi_code += format["orientation"][p_format](tdim, gdim)
        jacobi_code += "\n"
    else:
        error("Unhandled integral type: " + str(integral_type))

    # Embedded manifold, need to pass in cell orientations
    if oriented and tdim != gdim and p_format == 'pyop2':
        if integral_type in ("interior_facet", "interior_facet_vert", "interior_facet_horiz"):
            common += ["const int cell_orientation%s = cell_orientation_[0][0];" % _choose_map('+'),
                       "const int cell_orientation%s = cell_orientation_[1][0];" % _choose_map('-')]
        else:
            common += ["const int cell_orientation = cell_orientation_[0][0];"]

    # After we have generated the element code for all facets we can remove
    # the unused transformations and tabulate the used psi tables and weights.
    common += [remove_unused(jacobi_code, trans_set)]
    jacobi_ir = pyop2.FlatBlock("\n".join(common))

    # @@@: const double W3[3] = {{...}}
    # Emit one static const declaration per quadrature weight table in use.
    pyop2_weights = []
    for weights, points in [quadrature_weights[p] for p in used_weights]:
        n_points = len(points)
        # A single quadrature point is declared as a scalar, not a 1-array.
        w_sym = pyop2.Symbol(f_weight(n_points), () if n_points == 1 else (n_points,))
        pyop2_weights.append(pyop2.Decl("double", w_sym,
                                        pyop2.ArrayInit(weights, precision),
                                        qualifiers=["static", "const"]))

    name_map = ir["name_map"]
    tables = ir["unique_tables"]
    tables.update(affine_tables)  # TODO: This is not populated anywhere, remove?

    # @@@: const double FE0[] = {{...}}
    # NOTE(review): the returned /code/ is unused here; only /decl/ is consumed.
    code, decl = _tabulate_psis(tables, used_psi_tables, name_map,
                                used_nzcs, opt_par, parameters)

    pyop2_basis = []
    for name, data in decl.items():
        rank, _, values = data
        zeroflags = values.get_zeros()
        feo_sym = pyop2.Symbol(name, rank)
        init = pyop2.ArrayInit(values, precision)
        if zeroflags is not None and not zeroflags.all():
            # Some columns are entirely zero: use a sparse initializer that
            # only covers the contiguous band of nonzero columns.
            nz_indices = numpy.logical_not(zeroflags).nonzero()
            # Note: in the following, we take the last entry of /nz_indices/ since we /know/
            # we have been tracking only zero-valued columns
            nz_indices = nz_indices[-1]
            nz_bounds = tuple([(i, 0)] for i in rank[:-1])
            nz_bounds += ([(max(nz_indices) - min(nz_indices) + 1, min(nz_indices))],)
            init = pyop2.SparseArrayInit(values, precision, nz_bounds)
        pyop2_basis.append(pyop2.Decl("double", feo_sym, init, ["static", "const"]))

    # Build the root of the PyOP2' ast
    pyop2_tables = pyop2_weights + [tab for tab in pyop2_basis]
    root = pyop2.Root([jacobi_ir] + pyop2_tables + nest_ir)

    return root
def build_hard_fusion_kernel(base_loop, fuse_loop, fusion_map, loop_chain_index):
    """
    Build AST and :class:`Kernel` for two loops suitable to hard fusion.

    The AST consists of three functions: fusion, base, fuse. base and fuse
    are respectively the ``base_loop`` and the ``fuse_loop`` kernels, whereas
    fusion is the orchestrator that invokes, for each ``base_loop`` iteration,
    base and, if still to be executed, fuse.

    The orchestrator has the following structure: ::

        fusion (buffer, ..., executed):

            base (buffer, ...)
            for i = 0 to arity:
                if not executed[i]:
                    additional pointer staging required by kernel2
                    fuse (sub_buffer, ...)
                    insertion into buffer

    The executed array tracks whether the i-th iteration (out of /arity/)
    adjacent to the main kernel1 iteration has been executed.

    :param base_loop: the loop executed first; its kernel is invoked once.
    :param fuse_loop: the loop fused in; its kernel is guarded by /executed/.
    :param fusion_map: map describing the adjacency between the two
        iteration spaces (``fusion_map.arity`` fuse iterations per base one).
    :param loop_chain_index: index of this loop pair within the loop chain.
    :returns: a 2-tuple of (fused :class:`Kernel`, dict of special argument
        positions mapping index -> ('postponed'|'onlymap', bool)).
    """
    finder = Find((ast.FunDecl, ast.PreprocessNode))

    # Extract headers and the (single) function declaration from each kernel.
    # Deep copies are taken so the original kernel ASTs are left untouched.
    base = base_loop.kernel
    base_ast = dcopy(base._ast)
    base_info = finder.visit(base_ast)
    base_headers = base_info[ast.PreprocessNode]
    base_fundecl = base_info[ast.FunDecl]
    assert len(base_fundecl) == 1
    base_fundecl = base_fundecl[0]

    fuse = fuse_loop.kernel
    fuse_ast = dcopy(fuse._ast)
    fuse_info = finder.visit(fuse_ast)
    fuse_headers = fuse_info[ast.PreprocessNode]
    fuse_fundecl = fuse_info[ast.FunDecl]
    assert len(fuse_fundecl) == 1
    fuse_fundecl = fuse_fundecl[0]

    # Create /fusion/ arguments and signature
    body = ast.Block([])
    fusion_name = '%s_%s' % (base_fundecl.name, fuse_fundecl.name)
    fusion_args = dcopy(base_fundecl.args + fuse_fundecl.args)
    fusion_fundecl = ast.FunDecl(base_fundecl.ret, fusion_name, fusion_args, body)

    # Make sure kernel and variable names are unique
    base_fundecl.name = "%s_base" % base_fundecl.name
    fuse_fundecl.name = "%s_fuse" % fuse_fundecl.name
    for i, decl in enumerate(fusion_args):
        decl.sym.symbol += '_%d' % i

    # Filter out duplicate arguments, and append extra arguments to the fundecl
    binding = WeakFilter().kernel_args([base_loop, fuse_loop], fusion_fundecl)
    fusion_args += [ast.Decl('int*', 'executed'),
                    ast.Decl('int*', 'fused_iters'),
                    ast.Decl('int', 'i')]

    # Which args are actually used in /fuse/, but not in /base/ ? The gather for
    # such arguments is moved to /fusion/, to avoid usless memory LOADs
    base_dats = set(a.data for a in base_loop.args)
    fuse_dats = set(a.data for a in fuse_loop.args)
    unshared = OrderedDict()
    for arg, decl in binding.items():
        if arg.data in fuse_dats - base_dats:
            unshared.setdefault(decl, arg)

    # Track position of Args that need a postponed gather
    # Can't track Args themselves as they change across different parloops
    fargs = {fusion_args.index(i): ('postponed', False) for i in unshared.keys()}
    fargs.update({len(set(binding.values())): ('onlymap', True)})

    # Add maps for arguments that need a postponed gather
    for decl, arg in unshared.items():
        decl_pos = fusion_args.index(decl)
        fusion_args[decl_pos].sym.symbol = arg.c_arg_name()
        if arg._is_indirect:
            fusion_args[decl_pos].sym.rank = ()
            fusion_args.insert(decl_pos + 1, ast.Decl('int*', arg.c_map_name(0, 0)))

    # Append the invocation of /base/; then, proceed with the invocation
    # of the /fuse/ kernels
    base_funcall_syms = [binding[a].sym.symbol for a in base_loop.args]
    body.children.append(ast.FunCall(base_fundecl.name, *base_funcall_syms))

    # One guarded /fuse/ invocation per adjacent iteration.
    for idx in range(fusion_map.arity):

        fused_iter = ast.Assign('i', ast.Symbol('fused_iters', (idx,)))
        fuse_funcall = ast.FunCall(fuse_fundecl.name)
        if_cond = ast.Not(ast.Symbol('executed', ('i',)))
        if_update = ast.Assign(ast.Symbol('executed', ('i',)), 1)
        if_body = ast.Block([fuse_funcall, if_update], open_scope=True)
        if_exec = ast.If(if_cond, [if_body])
        body.children.extend([ast.FlatBlock('\n'), fused_iter, if_exec])

        # Modify the /fuse/ kernel
        # This is to take into account that many arguments are shared with
        # /base/, so they will only staged once for /base/. This requires
        # tweaking the way the arguments are declared and accessed in /fuse/.
        # For example, the shared incremented array (called /buffer/ in
        # the pseudocode in the comment above) now needs to take offsets
        # to be sure the locations that /base/ is supposed to increment are
        # actually accessed. The same concept apply to indirect arguments.
        init = lambda v: '{%s}' % ', '.join([str(j) for j in v])
        for i, fuse_loop_arg in enumerate(fuse_loop.args):
            fuse_kernel_arg = binding[fuse_loop_arg]

            buffer_name = '%s_vec' % fuse_kernel_arg.sym.symbol
            fuse_funcall_sym = ast.Symbol(buffer_name)

            # What kind of temporaries do we need ?
            # NOTE(review): if the access mode is neither INC nor READ,
            # op/lvalue/rvalue/stager/indexer/pointers stay unbound and the
            # branches below may raise NameError — confirm other modes cannot
            # reach this point.
            if fuse_loop_arg.access == INC:
                op, lvalue, rvalue = ast.Incr, fuse_kernel_arg.sym.symbol, buffer_name
                stager = lambda b, l: b.children.extend(l)
                indexer = lambda indices: [(k, j) for j, k in enumerate(indices)]
                pointers = []
            elif fuse_loop_arg.access == READ:
                op, lvalue, rvalue = ast.Assign, buffer_name, fuse_kernel_arg.sym.symbol
                stager = lambda b, l: [b.children.insert(0, j) for j in reversed(l)]
                indexer = lambda indices: [(j, k) for j, k in enumerate(indices)]
                pointers = list(fuse_kernel_arg.pointers)

            # Now gonna handle arguments depending on their type and rank ...

            if fuse_loop_arg._is_global:
                # ... Handle global arguments. These can be dropped in the
                # kernel without any particular fiddling
                fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol)

            elif fuse_kernel_arg in unshared:
                # ... Handle arguments that appear only in /fuse/
                staging = unshared[fuse_kernel_arg].c_vec_init(False).split('\n')
                rvalues = [ast.FlatBlock(j.split('=')[1]) for j in staging]
                lvalues = [ast.Symbol(buffer_name, (j,)) for j in range(len(staging))]
                staging = [ast.Assign(j, k) for j, k in zip(lvalues, rvalues)]

                # Set up the temporary
                buffer_symbol = ast.Symbol(buffer_name, (len(staging),))
                buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol,
                                       qualifiers=fuse_kernel_arg.qual,
                                       pointers=list(pointers))

                # Update the if-then AST body
                stager(if_exec.children[0], staging)
                if_exec.children[0].children.insert(0, buffer_decl)

            elif fuse_loop_arg._is_mat:
                # ... Handle Mats
                # NOTE(review): /fused_inc_arg/ is not defined anywhere in this
                # function — this branch would raise NameError if taken.
                # Presumably a latent bug (or dead code); confirm.
                staging = []
                for b in fused_inc_arg._block_shape:
                    for rc in b:
                        lvalue = ast.Symbol(lvalue, (idx, idx),
                                            ((rc[0], 'j'), (rc[1], 'k')))
                        rvalue = ast.Symbol(rvalue, ('j', 'k'))
                        staging = ItSpace(mode=0).to_for([(0, rc[0]), (0, rc[1])],
                                                         ('j', 'k'),
                                                         [op(lvalue, rvalue)])[:1]

                # Set up the temporary
                buffer_symbol = ast.Symbol(buffer_name, (fuse_kernel_arg.sym.rank,))
                buffer_init = ast.ArrayInit(init([init([0.0])]))
                buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init,
                                       qualifiers=fuse_kernel_arg.qual, pointers=pointers)

                # Update the if-then AST body
                stager(if_exec.children[0], staging)
                if_exec.children[0].children.insert(0, buffer_decl)

            elif fuse_loop_arg._is_indirect:
                cdim = fuse_loop_arg.data.cdim

                if cdim == 1 and fuse_kernel_arg.sym.rank:
                    # [Special case]
                    # ... Handle rank 1 indirect arguments that appear in both
                    # /base/ and /fuse/: just point into the right location
                    rank = (idx,) if fusion_map.arity > 1 else ()
                    fuse_funcall_sym = ast.Symbol(fuse_kernel_arg.sym.symbol, rank)

                else:
                    # ... Handle indirect arguments. At the C level, these arguments
                    # are of pointer type, so simple pointer arithmetic is used
                    # to ensure the kernel accesses are to the correct locations
                    fuse_arity = fuse_loop_arg.map.arity
                    base_arity = fuse_arity*fusion_map.arity
                    size = fuse_arity*cdim

                    # Set the proper storage layout before invoking /fuse/
                    ofs_vals = [[base_arity*j + k for k in range(fuse_arity)]
                                for j in range(cdim)]
                    ofs_vals = [[fuse_arity*j + k for k in flatten(ofs_vals)]
                                for j in range(fusion_map.arity)]
                    ofs_vals = list(flatten(ofs_vals))
                    indices = [ofs_vals[idx*size + j] for j in range(size)]

                    # Stage data in/out of the temporary buffer; the direction
                    # (gather vs scatter) is encoded by op/indexer chosen above.
                    staging = [op(ast.Symbol(lvalue, (j,)), ast.Symbol(rvalue, (k,)))
                               for j, k in indexer(indices)]

                    # Set up the temporary
                    buffer_symbol = ast.Symbol(buffer_name, (size,))
                    if fuse_loop_arg.access == INC:
                        buffer_init = ast.ArrayInit(init([0.0]))
                    else:
                        buffer_init = ast.EmptyStatement()
                        pointers.pop()
                    buffer_decl = ast.Decl(fuse_kernel_arg.typ, buffer_symbol, buffer_init,
                                           qualifiers=fuse_kernel_arg.qual, pointers=pointers)

                    # Update the if-then AST body
                    stager(if_exec.children[0], staging)
                    if_exec.children[0].children.insert(0, buffer_decl)

            else:
                # Nothing special to do for direct arguments
                pass

            # Finally update the /fuse/ funcall
            fuse_funcall.children.append(fuse_funcall_sym)

    # De-duplicate preprocessor headers and assemble the final AST:
    # headers, then the two renamed kernels, then the orchestrator.
    fused_headers = set([str(h) for h in base_headers + fuse_headers])
    fused_ast = ast.Root([ast.PreprocessNode(h) for h in fused_headers] +
                         [base_fundecl, fuse_fundecl, fusion_fundecl])

    return Kernel([base, fuse], fused_ast, loop_chain_index), fargs