def _make_remainder(self, hs, key, callcompute, *args):
    """
    Build the ``remainder<key>`` ElementalFunction.

    The body is made of one rebuilt copy of ``callcompute`` per entry of
    ``hs.omapper.owned``; the second item of each entry is passed to the
    rebuilt Call as its ``dynamic_args_mapper``. Extra positional arguments
    are accepted and ignored.
    """
    assert callcompute.is_Call

    calls = []
    for _, dynamic_args in hs.omapper.owned:
        calls.append(callcompute._rebuild(dynamic_args_mapper=dynamic_args))

    name = 'remainder%d' % key
    return make_efunc(name, calls)
def _make_poke(self, hs, key, msgs):
    """
    Build the ``pokempi<key>`` ElementalFunction, declared with ``retval='int'``.

    The generated body sets ``lflag`` to 0 and ``gflag`` to 1; then, for each
    message in ``msgs``, it iterates over a Dimension named ``i`` issuing an
    ``MPI_Test`` on the send request and one on the receive request, each
    followed by an ``&``-augmented update of ``gflag`` with ``lflag``.
    Finally ``gflag`` is returned. ``hs`` is not used here.
    """
    local_flag = Symbol(name='lflag')
    global_flag = Symbol(name='gflag')

    def mpi_test(field, entry):
        # MPI_Test(&entry.field, &lflag, MPI_STATUS_IGNORE)
        request = Byref(FieldFromComposite(field, entry))
        return Call('MPI_Test',
                    [request, Byref(local_flag), Macro('MPI_STATUS_IGNORE')])

    # Flags initialization
    statements = [Expression(DummyEq(local_flag, 0)),
                  Expression(DummyEq(global_flag, 1))]

    # One Iteration per message, testing the requests of all peers
    for msg in msgs:
        peer = Dimension(name='i')
        entry = IndexedPointer(msg, peer)

        recv_test = mpi_test(msg._C_field_rrecv, entry)
        send_test = mpi_test(msg._C_field_rsend, entry)
        fold = AugmentedExpression(DummyEq(global_flag, local_flag), '&')

        loop = Iteration([send_test, fold, recv_test, fold], peer, msg.npeers - 1)
        statements.append(loop)

    statements.append(Return(global_flag))

    return make_efunc('pokempi%d' % key, List(body=statements), retval='int')
def _make_compute(self, hs, key, msgs, callpoke):
    """
    Build the ``compute<key>`` ElementalFunction from the body of ``hs``.

    Each ExpressionBundle found in ``hs.body`` is replaced by a List made of
    ``callpoke`` followed by the bundle itself; the transformed tree becomes
    the function body, with ``hs.arguments`` as dynamic parameters. Returns
    None if ``hs.body`` is already a Call. ``msgs`` is not used here.
    """
    original = hs.body
    if original.is_Call:
        return None

    # Put a `callpoke` in front of every ExpressionBundle
    subs = {}
    for bundle in FindNodes(ExpressionBundle).visit(original):
        subs[bundle] = List(body=[callpoke, bundle])

    poked = Transformer(subs).visit(original)
    return make_efunc('compute%d' % key, poked, hs.arguments)
def _make_compute(self, hs, key, msgs, callpoke):
    """
    Build the ``compute<key>`` ElementalFunction from the body of ``hs``.

    Returns None when ``hs.body`` is already a Call. Otherwise every
    ExpressionBundle in ``hs.body`` is wrapped into a List in which
    ``callpoke`` comes first, and the resulting tree is promoted to an
    ElementalFunction having ``hs.dimensions`` as dynamic parameters.
    ``msgs`` is not used here.
    """
    if hs.body.is_Call:
        return None

    bundles = FindNodes(ExpressionBundle).visit(hs.body)
    replacements = dict((b, List(body=[callpoke, b])) for b in bundles)
    body = Transformer(replacements).visit(hs.body)

    return make_efunc('compute%s' % key, body, hs.dimensions)
def test_make_efuncs(self, exprs, nfuncs, ntimeiters, nests):
    """
    Test construction of ElementalFunctions.

    ``exprs`` holds one or more strings which are ``eval``-ed against the
    local names defined below (``grid``, ``t``, ``x``, ``y``, ``u``, ``v``);
    ``nfuncs``, ``ntimeiters`` and ``nests`` carry, for each expected
    ElementalFunction, the number of symbols found by ``FindSymbols``, the
    number of time Dimensions among its free symbols, and the names of the
    Dimensions in its loop nest.
    """
    exprs = list(as_tuple(exprs))

    # NOTE: these names must not be renamed, they are looked up by `eval`
    grid = Grid(shape=(10, 10))
    t = grid.stepping_dim  # noqa
    x, y = grid.dimensions  # noqa

    u = Function(name='u', grid=grid)  # noqa
    v = TimeFunction(name='v', grid=grid)  # noqa

    # A list comprehension would need explicit locals/globals mappings to eval
    for i, e in enumerate(list(exprs)):
        exprs[i] = eval(e)

    op = Operator(exprs)

    # One ElementalFunction for each Iteration nest over space dimensions
    efuncs = []
    for n, tree in enumerate(retrieve_iteration_tree(op)):
        space_root = filter_iterations(tree, key=lambda i: i.dim.is_Space,
                                       stop='asap')
        efuncs.append(make_efunc('f%d' % n, space_root))

    assert len(efuncs) == len(nfuncs) == len(ntimeiters) == len(nests)

    for efunc, nf, nt, nest in zip(efuncs, nfuncs, ntimeiters, nests):
        # The bounds of both space dimensions must be among the parameters
        assert all(b in efunc.parameters for b in (x.symbolic_min, x.symbolic_max))
        assert all(b in efunc.parameters for b in (y.symbolic_min, y.symbolic_max))

        # So must all of the symbols found by a default FindSymbols
        functions = FindSymbols().visit(efunc)
        assert len(functions) == nf
        assert all(f in efunc.parameters for f in functions)

        # And the time Dimensions appearing as free symbols
        timeiters = []
        for s in FindSymbols('free-symbols').visit(efunc):
            if s.is_Dimension and s.is_Time:
                timeiters.append(s)
        assert len(timeiters) == nt
        assert all(d in efunc.parameters for d in timeiters)

        # Nothing else is expected (4 == min/max of `x` and `y`)
        assert len(efunc.parameters) == 4 + len(functions) + len(timeiters)

        # Exactly one ArrayCast for each Function
        casts = FindNodes(ArrayCast).visit(efunc)
        assert len(casts) == nf

        # The loop nest structure
        trees = retrieve_iteration_tree(efunc)
        assert len(trees) == 1
        tree = trees[0]
        assert all(it.dim.name == name for it, name in zip(tree, nest))

        assert efunc.make_call()
def _make_remainder(self, hs, key, callcompute, region):
    """
    Build the ``remainder<key>`` ElementalFunction.

    The body is a single Iteration along a Dimension named ``i``, whose
    upper limit is ``region.nregions - 1``. It wraps ``callcompute`` rebuilt
    so that each Dimension ``d`` in ``hs.dimensions`` is mapped to the pair
    of fields ``d.min_name`` and ``d.max_name`` taken from ``region``
    indexed by ``i``.
    """
    assert callcompute.is_Call

    index = Dimension(name='i')
    entry = IndexedPointer(region, index)

    # Dimension -> (lower bound field, upper bound field)
    bounds = {}
    for d in hs.dimensions:
        lower = FieldFromComposite(d.min_name, entry)
        upper = FieldFromComposite(d.max_name, entry)
        bounds[d] = (lower, upper)

    call = callcompute._rebuild(dynamic_args_mapper=bounds)

    # An Iteration generates `<=` by default, hence the -1
    loop = Iteration(call, index, region.nregions - 1)

    return make_efunc('remainder%s' % key, loop)
def _make_poke(self, hs, key, msgs):
    """
    Build the ``pokempi<key>`` ElementalFunction.

    The body starts with a LocalExpression setting ``flag`` to 0, followed by
    one Iteration per message in ``msgs``. Each Iteration runs along a
    Dimension named ``i`` with upper limit ``msg.npeers - 1`` and calls
    ``MPI_Test`` first on the send request and then on the receive request.
    ``hs`` is not used here.
    """
    flag = Symbol(name='flag')

    def poke(msg):
        # Build the Iteration testing all of the requests attached to `msg`
        peer = Dimension(name='i')
        entry = IndexedPointer(msg, peer)

        recv_request = Byref(FieldFromComposite(msg._C_field_rrecv, entry))
        send_request = Byref(FieldFromComposite(msg._C_field_rsend, entry))

        recv_test = Call('MPI_Test',
                         [recv_request, Byref(flag), Macro('MPI_STATUS_IGNORE')])
        send_test = Call('MPI_Test',
                         [send_request, Byref(flag), Macro('MPI_STATUS_IGNORE')])

        return Iteration([send_test, recv_test], peer, msg.npeers - 1)

    statements = [LocalExpression(DummyEq(flag, 0))]
    statements.extend(poke(msg) for msg in msgs)

    return make_efunc('pokempi%s' % key, statements)
def test_make_efuncs(self, exprs, nfuncs, ntimeiters, nests):
    """
    Test construction of ElementalFunctions.

    ``exprs`` holds one or more strings which are ``eval``-ed against the
    local names defined below (``grid``, ``t``, ``x``, ``y``, ``u``, ``v``);
    ``nfuncs``, ``ntimeiters`` and ``nests`` carry, for each expected
    ElementalFunction, the number of symbols found by ``FindSymbols``, the
    number of time Dimensions among its free symbols, and the names of the
    Dimensions in its loop nest.
    """
    exprs = list(as_tuple(exprs))

    # NOTE: these names must not be renamed, they are looked up by `eval`
    grid = Grid(shape=(10, 10))
    t = grid.stepping_dim  # noqa
    x, y = grid.dimensions  # noqa

    u = Function(name='u', grid=grid)  # noqa
    v = TimeFunction(name='v', grid=grid)  # noqa

    # A list comprehension would need explicit locals/globals mappings to eval
    for i, e in enumerate(list(exprs)):
        exprs[i] = eval(e)

    op = Operator(exprs)

    # One ElementalFunction for each Iteration nest over space dimensions
    efuncs = []
    for n, tree in enumerate(retrieve_iteration_tree(op)):
        space_iterations = filter_iterations(tree, key=lambda i: i.dim.is_Space)
        efuncs.append(make_efunc('f%d' % n, space_iterations[0]))

    assert len(efuncs) == len(nfuncs) == len(ntimeiters) == len(nests)

    for efunc, nf, nt, nest in zip(efuncs, nfuncs, ntimeiters, nests):
        # The bounds of both space dimensions must be among the parameters
        assert all(b in efunc.parameters for b in (x.symbolic_min, x.symbolic_max))
        assert all(b in efunc.parameters for b in (y.symbolic_min, y.symbolic_max))

        # So must all of the symbols found by a default FindSymbols
        functions = FindSymbols().visit(efunc)
        assert len(functions) == nf
        assert all(f in efunc.parameters for f in functions)

        # And the time Dimensions appearing as free symbols
        timeiters = []
        for s in FindSymbols('free-symbols').visit(efunc):
            if isinstance(s, Dimension) and s.is_Time:
                timeiters.append(s)
        assert len(timeiters) == nt
        assert all(d in efunc.parameters for d in timeiters)

        # Nothing else is expected (4 == min/max of `x` and `y`)
        assert len(efunc.parameters) == 4 + len(functions) + len(timeiters)

        # The loop nest structure
        trees = retrieve_iteration_tree(efunc)
        assert len(trees) == 1
        tree = trees[0]
        assert all(it.dim.name == name for it, name in zip(tree, nest))

        assert efunc.make_call()
def _make_remainder(self, hs, key, callcompute, region):
    """
    Build the ``remainder<key>`` ElementalFunction.

    The body is a single Iteration along a Dimension named ``i``, whose
    upper limit is ``region.nregions - 1``. It wraps ``callcompute`` rebuilt
    with a dynamic-args mapper covering every item in ``hs.arguments``: a
    Dimension is mapped to the pair of fields ``min_name``/``max_name``, any
    other argument to a 1-tuple with the field carrying its ``name``. All
    fields are taken from ``region`` indexed by ``i``.
    """
    assert callcompute.is_Call

    index = Dimension(name='i')
    entry = IndexedPointer(region, index)

    def fields_of(arg):
        # Two fields (min, max) for a Dimension, a single one otherwise
        if arg.is_Dimension:
            names = (arg.min_name, arg.max_name)
        else:
            names = (arg.name,)
        return tuple(FieldFromComposite(n, entry) for n in names)

    dynamic_args = {arg: fields_of(arg) for arg in hs.arguments}
    call = callcompute._rebuild(dynamic_args_mapper=dynamic_args)

    # An Iteration generates `<=` by default, hence the -1
    loop = Iteration(call, index, region.nregions - 1)

    return make_efunc('remainder%d' % key, loop)
def _make_remainder(self, hs, key, callcompute, *args):
    """
    Build the ``remainder<key>`` ElementalFunction.

    For each Dimension ``d`` in ``hs.omapper``, three candidate ranges are
    derived from the defaults in ``callcompute.dynamic_defaults[d]`` and the
    ``(left, right)`` pair in ``hs.omapper``: one tagged (CORE, CENTER), one
    (OWNED, LEFT) and one (OWNED, RIGHT). The body contains one rebuilt copy
    of ``callcompute`` (with ``incr=False``) for every combination of such
    ranges across Dimensions, except the combination in which all ranges are
    CORE. Extra positional arguments are accepted and ignored.
    """
    assert callcompute.is_Call

    bounds = OrderedDict()
    keys_by_dim = []
    for d, (left, right) in hs.omapper.items():
        lower, upper = callcompute.dynamic_defaults[d]
        regions = OrderedDict([
            ((d, CORE, CENTER), (lower, upper)),
            ((d, OWNED, LEFT), (lower - left, lower)),
            ((d, OWNED, RIGHT), (upper, upper - right)),
        ])
        bounds.update(regions)
        keys_by_dim.append(list(regions))

    calls = []
    for combo in product(*keys_by_dim):
        # Skip the all-CORE combination
        if any(region is not CORE for _, region, _ in combo):
            dynamic_args = {k[0]: bounds[k] for k in combo}
            calls.append(callcompute._rebuild(dynamic_args_mapper=dynamic_args,
                                              incr=False))

    return make_efunc('remainder%s' % key, calls)
def _make_poke(self, hs, key, msgs):
    """
    Build the ``pokempi<key>`` ElementalFunction.

    The body consists of a LocalExpression setting ``flag`` to 0 and then,
    for each message in ``msgs``, an Iteration along a Dimension named ``i``
    (upper limit ``msg.npeers - 1``) issuing an ``MPI_Test`` on the send
    request followed by one on the receive request. ``hs`` is not used here.
    """
    flag = Symbol(name='flag')
    statements = [LocalExpression(DummyEq(flag, 0))]

    for msg in msgs:
        peer = Dimension(name='i')
        entry = IndexedPointer(msg, peer)

        # Pointers to the receive and send requests of the current peer
        requests = [Byref(FieldFromComposite(field, entry))
                    for field in (msg._C_field_rrecv, msg._C_field_rsend)]

        recv_test, send_test = [
            Call('MPI_Test', [req, Byref(flag), Macro('MPI_STATUS_IGNORE')])
            for req in requests
        ]

        statements.append(Iteration([send_test, recv_test], peer, msg.npeers - 1))

    return make_efunc('pokempi%d' % key, statements)
def _make_remainder(self, compute, hs, key):
    """
    Build the ``remainder<key>`` ElementalFunction.

    For each Dimension ``d`` in ``hs.omapper``, three candidate ranges are
    derived from the defaults in ``compute.dynamic_defaults[d]`` and the
    ``(left, right)`` pair in ``hs.omapper``: one tagged (CORE, CENTER), one
    (OWNED, LEFT) and one (OWNED, RIGHT). The body contains one rebuilt copy
    of ``compute`` (with ``incr=False``) for every combination of such ranges
    across Dimensions, except the combination in which all ranges are CORE.
    """
    assert compute.is_Call

    ranges = OrderedDict()
    choices = []
    for d, (left, right) in hs.omapper.items():
        start, stop = compute.dynamic_defaults[d]

        tagged = OrderedDict()
        tagged[(d, CORE, CENTER)] = (start, stop)
        tagged[(d, OWNED, LEFT)] = (start - left, start)
        tagged[(d, OWNED, RIGHT)] = (stop, stop - right)

        ranges.update(tagged)
        choices.append(list(tagged.keys()))

    def only_core(combo):
        return all(region is CORE for _, region, _ in combo)

    calls = [
        compute._rebuild(dynamic_args_mapper={d: ranges[(d, r, s)] for d, r, s in combo},
                         incr=False)
        for combo in product(*choices) if not only_core(combo)
    ]

    return make_efunc('remainder%s' % key, calls)
def _make_compute(self, hs, key, *args):
    """
    Promote ``hs.body`` to the ``compute<key>`` ElementalFunction, using
    ``hs.arguments`` as dynamic parameters.

    Returns None if ``hs.body`` is already a Call. Extra positional
    arguments are accepted and ignored.
    """
    body = hs.body
    if body.is_Call:
        return None
    return make_efunc('compute%d' % key, body, hs.arguments)
def _make_compute(self, hs, key, *args):
    """
    Promote ``hs.body`` to the ``compute<key>`` ElementalFunction, using
    ``hs.dimensions`` as dynamic parameters.

    Returns None if ``hs.body`` is already a Call. Extra positional
    arguments are accepted and ignored.
    """
    already_a_call = hs.body.is_Call
    if not already_a_call:
        name = 'compute%s' % key
        return make_efunc(name, hs.body, hs.dimensions)
    return None
def relax_incr_dimensions(iet, **kwargs):
    """
    Recast Iterations over IncrDimensions as ElementalFunctions; insert
    ElementalCalls to iterate over the "main" and "remainder" regions induced
    by the IncrDimensions.

    ``kwargs`` must carry an ``sregistry`` entry, which is used to generate
    the names (prefix "bf") of the ElementalFunctions. Returns the
    transformed ``iet`` along with a dict exposing the newly created
    ElementalFunctions under the key 'efuncs'.
    """
    sregistry = kwargs['sregistry']

    def is_outermost(it):
        # True if the parent of the Iteration's Dimension isn't an IncrDimension
        return not it.dim.parent.is_Incr

    new_efuncs = []
    subs = {}
    for tree in retrieve_iteration_tree(iet):
        incr_iterations = [it for it in tree if it.dim.is_Incr]
        if not incr_iterations or incr_iterations[0] in subs:
            # Nothing to do, or nest already processed
            continue
        root = incr_iterations[0]

        outer, inner = split(incr_iterations, is_outermost)

        # For each outer Iteration, the (min, max, step) triplets of the
        # "main" and "remainder" regions
        ranges = []
        for it in outer:
            last_full = it.symbolic_max - (it.symbolic_size % it.dim.step)
            main = (it.symbolic_min, last_full, it.dim.step)
            remainder = (last_full + 1, it.symbolic_max, it.symbolic_max - last_full)
            ranges.append((main, remainder))

        # Drop any offsets
        # E.g., `x = x_m + 2 to x_M - 2` --> `x = x_m to x_M`
        rebuilt = []
        for it in outer:
            root_dim = it.dim.root
            limits = (root_dim.symbolic_min, root_dim.symbolic_max, it.step)
            rebuilt.append(it._rebuild(limits=limits))
        outer = rebuilt

        # The ElementalFunction
        name = sregistry.make_name(prefix="bf")
        body = compose_nodes(outer)
        dynamic_parameters = flatten((it.symbolic_bounds, it.step) for it in outer)
        dynamic_parameters.extend([it.step for it in inner
                                   if not is_integer(it.step)])
        efunc = make_efunc(name, body, dynamic_parameters)
        new_efuncs.append(efunc)

        # The ElementalCalls, one per combination of regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for it, (lower, upper, block) in zip(outer, combo):
                dynamic_args[it.symbolic_min] = lower
                dynamic_args[it.symbolic_max] = upper
                dynamic_args[it.step] = block

                for nested in inner:
                    if nested.dim.root is not it.dim.root or is_integer(nested.step):
                        continue
                    # Keep the nested step in the "main" region, shrink it
                    # down to the region size otherwise
                    value = nested.step if block is it.step else block
                    dynamic_args[nested.step] = (value,)
            calls.append(efunc.make_call(dynamic_args))

        subs[root] = List(body=calls)

    iet = Transformer(subs).visit(iet)

    return iet, {'efuncs': new_efuncs}
def make_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as a hierarchy of Iterations over
    BlockDimensions (``self.nlevels`` levels) wrapping the original
    Iterations, promoted to a separate ElementalFunction named
    ``bf<self.nblocked>`` and replaced by a List of Calls to it, one per
    combination of "main" and "remainder" ranges. ``self.nblocked`` is
    incremented after each blocked nest.

    Reads ``self.blockinner``, ``self.blockalways``, ``self.nlevels`` and
    ``self.nblocked``.

    Returns the transformed ``iet`` and a dict with the keys 'dimensions'
    (the BlockDimensions created), 'efuncs' (the ElementalFunctions created)
    and 'args' (the step of each BlockDimension).
    """
    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, self.blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel and i.is_Affine)
        if not self.blockinner:
            # Leave the innermost candidate Iteration unblocked
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not self.blockalways:
            # Heuristically bypass loop blocking if we think `tree`
            # won't be computationally expensive. This will help with code
            # size/readability, JIT time, and auto-tuning time
            if not (tree.root.is_Sequential or iet.is_Callable):
                # E.g., not inside a time-stepping Iteration
                continue
            if any(i.dim.is_Sub and i.dim.local for i in tree):
                # At least an outer Iteration is over a local SubDimension,
                # which suggests the computational cost of this Iteration
                # nest will be negligible w.r.t. the "core" Iteration nest
                # (making use of non-local (Sub)Dimensions only)
                continue
        if not IsPerfectIteration().visit(root):
            # Don't know how to block non-perfect nests
            continue

        # Apply hierarchical loop blocking to `tree`
        level_0 = []  # Outermost level of blocking
        level_i = [[] for i in range(1, self.nlevels)]  # Inner levels of blocking
        intra = []  # Within the smallest block
        for i in iterations:
            # The trailing '%d' is left unformatted; it is filled in below
            # with the blocking level
            template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
            properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())

            # Build Iteration across `level_0` blocks
            d = BlockDimension(i.dim, name=template % 0)
            level_0.append(Iteration([], d, d.symbolic_max, properties=properties))

            # Build Iteration across all `level_i` blocks, `i` in (1, self.nlevels]
            for n, li in enumerate(level_i, 1):
                di = BlockDimension(d, name=template % n)
                li.append(Iteration([], di, limits=(d, d+d.step-1, di.step),
                                    properties=properties))
                # The next level is nested within the one just created
                d = di

            # Build Iteration within the smallest block
            intra.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
        level_i = flatten(level_i)

        # Track all constructed BlockDimensions
        block_dims.extend(i.dim for i in level_0 + level_i)

        # Construct the blocked tree
        blocked = compose_nodes(level_0 + level_i + intra + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable
        dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
        dynamic_parameters.extend([li.step for li in level_i])
        efunc = make_efunc("bf%d" % self.nblocked, blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges: for each blocked Iteration, a
        # (min, max, step) triplet for the part covered by whole blocks and
        # one for what is left over
        ranges = []
        for i, l0 in zip(iterations, level_0):
            maxb = i.symbolic_max - (i.symbolic_size % l0.step)
            ranges.append(((i.symbolic_min, maxb, l0.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for l0, (m, M, b) in zip(level_0, p):
                dynamic_args_mapper[l0.dim] = (m, M)
                dynamic_args_mapper[l0.step] = (b,)
                for li in level_i:
                    if li.dim.root is l0.dim.root:
                        # Inner levels keep their own step only when the
                        # outermost level uses its own step
                        value = li.step if b is l0.step else b
                        dynamic_args_mapper[li.step] = (value,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

        # Next blockable nest, use different (unique) variable/function names
        self.nblocked += 1

    iet = Transformer(mapper).visit(iet)

    # Force-unfold if some folded Iterations haven't been blocked in the end
    iet = unfold_blocked_tree(iet)

    return iet, {'dimensions': block_dims,
                 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as Iterations over BlockDimensions
    wrapping the original Iterations, promoted to a separate
    ElementalFunction and replaced by a List of Calls to it, one per
    combination of "main" and "remainder" ranges.

    The behaviour is driven by the 'blockinner' and 'blockalways' entries of
    ``self.params``.

    Returns the transformed ``iet`` and a dict with the keys 'dimensions'
    (the BlockDimensions created), 'efuncs' (the ElementalFunctions created)
    and 'args' (the step of each BlockDimension).
    """
    params = self.params
    blockinner = bool(params.get('blockinner'))
    blockalways = bool(params.get('blockalways'))

    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    subs = {}
    new_efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the candidate Iterations
        candidates = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            candidates = candidates[:-1]
        if len(candidates) <= 1:
            continue
        root = candidates[0]

        if not blockalways:
            # Heuristics: skip the nests which are unlikely to be
            # computationally expensive, so as to limit code size, JIT time,
            # and auto-tuning time
            if not tree.root.is_Sequential and not iet.is_Callable:
                # E.g., not inside a time-stepping Iteration
                continue
            if any(i.dim.is_Sub and i.dim.local for i in tree):
                # An Iteration over a local SubDimension suggests a
                # negligible cost w.r.t. the "core" Iteration nest
                continue

        if not IsPerfectIteration().visit(root):
            # Non-perfect nests cannot be blocked
            continue

        # `subs` is only updated at the end of the loop body, so this is the
        # same index for all names generated for this nest
        index = len(subs)

        # Apply loop blocking to `tree`
        over_blocks = []
        within_block = []
        for it in candidates:
            bd = BlockDimension(it.dim, name="%s%d_blk" % (it.dim.name, index))
            block_dims.append(bd)

            # Iteration over blocks
            properties = (PARALLEL,)
            if it.is_Affine:
                properties = properties + (AFFINE,)
            over_blocks.append(Iteration([], bd, bd.symbolic_max,
                                         properties=properties))

            # Iteration within a block
            within_block.append(it._rebuild([], limits=(bd, bd+bd.step-1, 1),
                                            offsets=(0, 0)))

        # The blocked tree
        nest = over_blocks + within_block + [candidates[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(nest))

        # Promote to a separate Callable
        dynamic_parameters = flatten((ob.dim, ob.dim.symbolic_size)
                                     for ob in over_blocks)
        efunc = make_efunc("bf%d" % index, blocked, dynamic_parameters)
        new_efuncs.append(efunc)

        # The (min, max, step) triplets of the "main" and "remainder" regions
        ranges = []
        for it, ob in zip(candidates, over_blocks):
            last_full = it.symbolic_max - (it.symbolic_size % ob.dim.step)
            main = (it.symbolic_min, last_full, ob.dim.step)
            remainder = (last_full + 1, it.symbolic_max, it.symbolic_max - last_full)
            ranges.append((main, remainder))

        # One Call to `efunc` per combination of regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for ob, (lower, upper, block) in zip(over_blocks, combo):
                dynamic_args[ob.dim] = (lower, upper)
                dynamic_args[ob.dim.step] = (block,)
            calls.append(List(body=efunc.make_call(dynamic_args)))

        subs[root] = List(body=calls)

    iet = Transformer(subs).visit(iet)

    steps = [bd.step for bd in block_dims]
    return iet, {'dimensions': block_dims, 'efuncs': new_efuncs, 'args': steps}
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as Iterations over BlockDimensions
    wrapping the original Iterations, promoted to a separate
    ElementalFunction and replaced by a List of Calls to it, one per
    combination of "main" and "remainder" ranges.

    The behaviour is driven by the 'blockinner' and 'blockalways' entries of
    ``self.params``.

    Returns the transformed ``iet`` and a dict with the keys 'dimensions'
    (the BlockDimensions created), 'efuncs' (the ElementalFunctions created)
    and 'args' (the step of each BlockDimension).
    """
    params = self.params
    blockinner = bool(params.get('blockinner'))
    blockalways = bool(params.get('blockalways'))

    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    subs = {}
    new_efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the candidate Iterations
        candidates = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            candidates = candidates[:-1]
        if len(candidates) <= 1:
            continue
        root = candidates[0]

        # Heuristic: unless forced to, do not block a tree which isn't
        # embedded in a sequential loop (e.g., a timestepping loop), so as to
        # avoid polluting the generated code (JIT time, readability)
        embedded = tree.root.is_Sequential or iet.is_Callable
        if not (embedded or blockalways):
            continue

        # `subs` is only updated at the end of the loop body, so this is the
        # same index for all names generated for this nest
        index = len(subs)

        # Apply loop blocking to `tree`
        over_blocks = []
        within_block = []
        for it in candidates:
            bd = BlockDimension(it.dim, name="%s%d_blk" % (it.dim.name, index))
            block_dims.append(bd)

            # Iteration over blocks
            over_blocks.append(Iteration([], bd, bd.symbolic_max,
                                         properties=PARALLEL))

            # Iteration within a block
            within_block.append(it._rebuild([], limits=(bd, bd + bd.step - 1, 1),
                                            offsets=(0, 0)))

        # The blocked tree
        nest = over_blocks + within_block + [candidates[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(nest))

        # Promote to a separate Callable
        dynamic_parameters = flatten((ob.dim, ob.dim.symbolic_size)
                                     for ob in over_blocks)
        efunc = make_efunc("bf%d" % index, blocked, dynamic_parameters)
        new_efuncs.append(efunc)

        # The (min, max, step) triplets of the "main" and "remainder" regions
        ranges = []
        for it, ob in zip(candidates, over_blocks):
            last_full = it.symbolic_max - (it.symbolic_size % ob.dim.step)
            main = (it.symbolic_min, last_full, ob.dim.step)
            remainder = (last_full + 1, it.symbolic_max, it.symbolic_max - last_full)
            ranges.append((main, remainder))

        # One Call to `efunc` per combination of regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for ob, (lower, upper, block) in zip(over_blocks, combo):
                dynamic_args[ob.dim] = (lower, upper)
                dynamic_args[ob.dim.step] = (block,)
            calls.append(List(body=efunc.make_call(dynamic_args)))

        subs[root] = List(body=calls)

    iet = Transformer(subs).visit(iet)

    steps = [bd.step for bd in block_dims]
    return iet, {'dimensions': block_dims, 'efuncs': new_efuncs, 'args': steps}
def _make_compute(self, hs, key):
    """
    Promote ``hs.body`` to the ``compute<key>`` ElementalFunction, using
    ``hs.dimensions`` as dynamic parameters.

    Returns None if ``hs.body`` is already a Call.
    """
    body = hs.body
    return None if body.is_Call else make_efunc('compute%s' % key, body,
                                                hs.dimensions)
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as Iterations over BlockDimensions
    wrapping the original Iterations and promoted to an ElementalFunction
    named ``bf<n>``. The Calls to ``bf<n>``, one per combination of "main"
    and "remainder" ranges, are in turn wrapped into a second
    ElementalFunction named ``f<n>``; a Call to the latter replaces the
    original nest.

    The behaviour is driven by the 'blockinner' and 'blockalways' entries of
    ``self.params``.

    Returns the transformed ``iet`` and a dict with the keys 'dimensions'
    (the BlockDimensions created) and 'efuncs', an OrderedDict mapping each
    ``f<n>`` to None and each ``bf<n>`` to a list with the name of the
    corresponding ``f<n>``.
    """
    params = self.params
    blockinner = bool(params.get('blockinner'))
    blockalways = bool(params.get('blockalways'))
    noinline = self._compiler_decoration('noinline', cgen.Comment('noinline?'))

    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    subs = {}
    new_efuncs = OrderedDict()
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the candidate Iterations
        candidates = [i for i in tree if i.is_Parallel]
        iterations = candidates if blockinner else \
            [i for i in candidates if not i.is_Vectorizable]
        if len(iterations) <= 1:
            continue
        root = iterations[0]

        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue

        # Heuristic: unless forced to, do not block a tree which isn't
        # embedded in a sequential loop (e.g., a timestepping loop), so as to
        # avoid polluting the generated code (JIT time, readability)
        if not (tree.root.is_Sequential or blockalways):
            continue

        # `subs` is only updated at the end of the loop body, so this is the
        # same index for all names generated for this nest
        index = len(subs)

        # Apply loop blocking to `tree`
        over_blocks = []
        within_block = []
        for it in iterations:
            bd = BlockDimension(it.dim, name="%s%d_block" % (it.dim.name, index))

            # Iteration over blocks
            over_blocks.append(Iteration([], bd, bd.symbolic_max,
                                         offsets=it.offsets, properties=PARALLEL))

            # Iteration within a block
            within_block.append(it._rebuild([], limits=(bd, bd+bd.step-1, 1),
                                            offsets=(0, 0)))

            # A new BlockDimension has been introduced
            block_dims.append(bd)

        # The blocked tree
        nest = over_blocks + within_block + [iterations[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(nest))

        # Promote to a separate Callable
        dynamic_parameters = flatten((ob.dim, ob.dim.symbolic_size)
                                     for ob in over_blocks)
        efunc0 = make_efunc("bf%d" % index, blocked, dynamic_parameters)

        # The (min, max, step) triplets of the "main" and "remainder" regions
        ranges = []
        for it, ob in zip(iterations, over_blocks):
            last_full = it.symbolic_max - (it.symbolic_size % ob.dim.step)
            main = (it.symbolic_min, last_full, ob.dim.step)
            remainder = (last_full + 1, it.symbolic_max, it.symbolic_max - last_full)
            ranges.append((main, remainder))

        # One Call to `efunc0` per combination of regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for ob, (lower, upper, block) in zip(over_blocks, combo):
                dynamic_args[ob.dim] = (lower, upper)
                dynamic_args[ob.dim.step] = (block,)
            calls.append(List(header=noinline, body=efunc0.make_call(dynamic_args)))

        # Indirect Call to the `efunc0` Calls
        dynamic_parameters = [c.dim.root for c in candidates]
        dynamic_parameters.extend([ob.dim.step for ob in over_blocks])
        efunc1 = make_efunc("f%d" % index, calls, dynamic_parameters)

        # Track everything needed to ultimately transform the input `iet`
        subs[root] = efunc1.make_call()
        new_efuncs[efunc1] = None
        new_efuncs[efunc0] = [efunc1.name]

    iet = Transformer(subs).visit(iet)

    return iet, {'dimensions': block_dims, 'efuncs': new_efuncs}
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as Iterations over BlockDimensions
    wrapping the original Iterations, promoted to a separate
    ElementalFunction and replaced by a List of Calls to it, one per
    combination of "main" and "remainder" ranges.

    The behaviour is driven by the 'blockinner' and 'blockalways' entries of
    ``self.params``.

    Returns the transformed ``iet`` and a dict with the keys 'dimensions'
    (the BlockDimensions created), 'efuncs' (the ElementalFunctions created)
    and 'args' (the step of each BlockDimension).
    """
    params = self.params
    blockinner = bool(params.get('blockinner'))
    blockalways = bool(params.get('blockalways'))

    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    subs = {}
    new_efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the candidate Iterations
        candidates = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            candidates = candidates[:-1]
        if len(candidates) <= 1:
            continue
        root = candidates[0]

        # Heuristic: unless forced to, do not block a tree which isn't
        # embedded in a sequential loop (e.g., a timestepping loop), so as to
        # avoid polluting the generated code (JIT time, readability)
        embedded = tree.root.is_Sequential or iet.is_Callable
        if not (embedded or blockalways):
            continue

        # `subs` is only updated at the end of the loop body, so this is the
        # same index for all names generated for this nest
        index = len(subs)

        # Apply loop blocking to `tree`
        over_blocks = []
        within_block = []
        for it in candidates:
            bd = BlockDimension(it.dim, name="%s%d_blk" % (it.dim.name, index))
            block_dims.append(bd)

            # Iteration over blocks
            properties = (PARALLEL,)
            if it.is_Affine:
                properties = properties + (AFFINE,)
            over_blocks.append(Iteration([], bd, bd.symbolic_max,
                                         properties=properties))

            # Iteration within a block
            within_block.append(it._rebuild([], limits=(bd, bd+bd.step-1, 1),
                                            offsets=(0, 0)))

        # The blocked tree
        nest = over_blocks + within_block + [candidates[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(nest))

        # Promote to a separate Callable
        dynamic_parameters = flatten((ob.dim, ob.dim.symbolic_size)
                                     for ob in over_blocks)
        efunc = make_efunc("bf%d" % index, blocked, dynamic_parameters)
        new_efuncs.append(efunc)

        # The (min, max, step) triplets of the "main" and "remainder" regions
        ranges = []
        for it, ob in zip(candidates, over_blocks):
            last_full = it.symbolic_max - (it.symbolic_size % ob.dim.step)
            main = (it.symbolic_min, last_full, ob.dim.step)
            remainder = (last_full + 1, it.symbolic_max, it.symbolic_max - last_full)
            ranges.append((main, remainder))

        # One Call to `efunc` per combination of regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for ob, (lower, upper, block) in zip(over_blocks, combo):
                dynamic_args[ob.dim] = (lower, upper)
                dynamic_args[ob.dim.step] = (block,)
            calls.append(List(body=efunc.make_call(dynamic_args)))

        subs[root] = List(body=calls)

    iet = Transformer(subs).visit(iet)

    steps = [bd.step for bd in block_dims]
    return iet, {'dimensions': block_dims, 'efuncs': new_efuncs, 'args': steps}