def _make_partree(self, candidates, omp_pragma):
    """
    Attach an OpenMP pragma to the outermost candidate Iteration.

    Parameters
    ----------
    candidates : sequence
        Non-empty sequence of Iterations. The first entry receives the pragma;
        the remaining ones are examined, in order, for inclusion in the
        collapse clause.
    omp_pragma : callable
        Called with the number of collapsed loops; the returned object is
        appended to the pragmas of the rebuilt root.

    Returns
    -------
    tuple
        ``(root, partree, collapsed)``: the original first candidate, its
        rebuilt counterpart carrying the extra pragma and a ``COLLAPSED``
        property, and the list made of ``partree`` followed by the inner
        Iterations that were collapsed.
    """
    assert candidates
    root = candidates[0]

    # Inner Iterations that may share the collapse clause with `root`
    inner_collapsed = []
    if ncores() >= Ompizer.COLLAPSE and IsPerfectIteration().visit(root):
        for pos, inner in enumerate(candidates[1:], 1):
            # OpenMP does not allow a collapsed loop to be initialised from the
            # iteration variable of another loop in the same clause, e.g.:
            #
            #   #pragma omp ... collapse(2)
            #   for (i = ... )
            #     for (j = i ...)
            #       ...
            #
            # so stop at the first lower bound depending on an outer candidate
            lower_bound_symbols = inner.symbolic_min.free_symbols
            if any(outer.dim in lower_bound_symbols for outer in candidates[:pos]):
                break

            # Vectorizable Iterations are kept out of the collapse clause
            if inner.is_Vectorizable:
                break

            inner_collapsed.append(inner)

    # Rebuild `root` with a pragma-for carrying the collapse clause
    ncollapse = 1 + len(inner_collapsed)
    new_pragmas = root.pragmas + (omp_pragma(ncollapse),)
    new_properties = root.properties + (COLLAPSED(ncollapse),)
    partree = root._rebuild(pragmas=new_pragmas, properties=new_properties)

    collapsed = [partree] + inner_collapsed

    return root, partree, collapsed
def _find_collapsable(self, root, candidates):
    """
    Return the Iterations that can be collapsed together with `root`.

    Parameters
    ----------
    root : Iteration
        The outermost Iteration of the nest; nothing is collapsed unless the
        nest rooted here is a perfect one.
    candidates : sequence
        The candidate Iterations, outermost first. ``candidates[0]`` is not
        examined (the scan starts from the second entry).

    Returns
    -------
    list
        The leading run of ``candidates[1:]`` that passed all checks; empty
        if collapsing is disabled by the core-count threshold or the nest is
        not perfect.
    """
    collapsable = []
    if ncores() >= self.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
        for n, i in enumerate(candidates[1:], 1):
            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is
            # forbidden:
            #
            #   #pragma omp ... collapse(2)
            #   for (i = ... )
            #     for (j = i ...)
            #       ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse vectorized Iterations
            if i.is_Vectorized:
                break

            # Would there be enough work per parallel iteration?
            # The estimate is only meaningful if there are Iterations nested
            # below `i`. Without this guard the product would be taken over an
            # empty list (presumably yielding 1 -- confirm against `prod`),
            # which would trip the threshold and prevent the innermost
            # candidate from ever being collapsed.
            nested = candidates[n + 1:]
            if nested:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in nested])
                    if work < self.COLLAPSE_WORK:
                        break
                except TypeError:
                    # Sizes that cannot be turned into an int: the amount of
                    # work is unknown, so the heuristic is skipped
                    pass

            collapsable.append(i)
    return collapsable
def _select_candidates(self, candidates):
    """
    Pick the Iteration to parallelize and the Iterations to collapse with it.

    Every entry of `candidates` is tried as the root of the parallel nest;
    for each, the longest run of subsequent candidates that can be collapsed
    is computed and the pair is given a score. The highest-scoring pair wins.

    Parameters
    ----------
    candidates : sequence
        Non-empty sequence of Iterations, outermost first.

    Returns
    -------
    tuple
        ``(root, collapsable)`` where ``collapsable`` is a list. When
        ``self.ncores`` is below ``self.collapse_ncores`` this is simply
        ``(candidates[0], [])``.
    """
    assert candidates

    # Not enough cores to bother collapsing anything
    if self.ncores < self.collapse_ncores:
        return candidates[0], []

    scores = {}
    for start, root in enumerate(candidates):
        chain = []
        for pos, inner in enumerate(candidates[start+1:], start+1):
            # [root, ..., inner] has to be a perfect Iteration nest
            if not IsPerfectIteration(depth=inner).visit(root):
                break

            # Two loops cannot be collapsed if the iteration variable of one
            # shows up in the initializer of the other, e.g.:
            #
            #   for (i = ... )
            #     for (j = i ...)
            #       ...
            lower_bound_symbols = inner.symbolic_min.free_symbols
            if any(outer.dim in lower_bound_symbols
                   for outer in candidates[start:pos]):
                break

            # SIMD-vectorized Iterations are never collapsed
            if inner.is_Vectorized:
                break

            # Is enough work left for each parallel iteration?
            deeper = candidates[pos+1:]
            if deeper:
                try:
                    work = prod([int(it.dim.symbolic_size) for it in deeper])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    pass

            chain.append(inner)

        # Score: fully-parallel root first, then presence of fully-parallel
        # and relaxed-parallel collapsed Iterations, then position in the
        # nest (the outermost, the better)
        scores[(root, tuple(chain))] = (
            int(root.is_ParallelNoAtomic),
            int(any(it.is_ParallelNoAtomic for it in chain)),
            int(any(it.is_ParallelRelaxed for it in chain)),
            -(start + 1),
        )

    # The winner is the entry with the highest score
    best_root, best_chain = max(scores, key=scores.get)

    return best_root, list(best_chain)
def _make_partree(self, candidates, omp_pragma=None):
    """
    Parallelize the outermost candidate attaching a suitable OpenMP pragma.

    Parameters
    ----------
    candidates : sequence
        Non-empty sequence of Iterations, outermost first. The first entry
        receives the pragma; the others are examined, in order, for inclusion
        in the collapse clause.
    omp_pragma : callable, optional
        Pragma template, called with the number of collapsed loops. If not
        provided, one is picked from ``self.lang`` depending on whether all
        candidates are affine.

    Returns
    -------
    tuple
        ``(root, partree, collapsed)``: the original first candidate, its
        rebuilt counterpart carrying the extra pragma and a ``COLLAPSED``
        property, and the list made of ``partree`` followed by the inner
        Iterations that were collapsed.
    """
    assert candidates
    root = candidates[0]

    # Pick up an omp-pragma template
    # Caller-provided -> stick to it
    # Affine          -> ... schedule(static,1) ...
    # Non-affine      -> ... schedule(static) ...
    if omp_pragma is None:
        if all(i.is_Affine for i in candidates):
            omp_pragma = self.lang['for-static-1']
        else:
            omp_pragma = self.lang['for-static']

    # Get the collapsable Iterations
    collapsable = []
    if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
        for n, i in enumerate(candidates[1:], 1):
            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is
            # forbidden:
            #
            #   #pragma omp ... collapse(2)
            #   for (i = ... )
            #     for (j = i ...)
            #       ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse vectorizable Iterations
            if i.is_Vectorizable:
                break

            # Would there be enough work per parallel iteration?
            # The estimate is only meaningful if there are Iterations nested
            # below `i`. Without this guard the product would be taken over an
            # empty list (presumably yielding 1 -- confirm against `prod`),
            # which would trip the threshold and prevent the innermost
            # candidate from ever being collapsed.
            nested = candidates[n + 1:]
            if nested:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in nested])
                    if work < Ompizer.COLLAPSE_WORK:
                        break
                except TypeError:
                    # Sizes that cannot be turned into an int: the amount of
                    # work is unknown, so the heuristic is skipped
                    pass

            collapsable.append(i)

    # Attach an OpenMP pragma-for with a collapse clause
    ncollapse = 1 + len(collapsable)
    partree = root._rebuild(
        pragmas=root.pragmas + (omp_pragma(ncollapse),),
        properties=root.properties + (COLLAPSED(ncollapse),))

    collapsed = [partree] + collapsable

    return root, partree, collapsed
def fold_blockable_tree(iet, blockinner=True):
    """
    Turn sequences of adjacent, equally-shaped Iteration nests into
    IterationFolds.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be processed.
    blockinner : bool, optional
        If exactly False, the innermost level of each foldable group is left
        out of the folding.

    Returns
    -------
    Node
        The tree obtained by applying the collected replacements to `iet`.
    """
    replacements = {}
    for _, sequence in FindAdjacent(Iteration).visit(iet).items():
        # Collect runs of at least two adjacent Iterations over the same
        # Dimension
        groups = []
        for subsequence in sequence:
            for _, run in groupby(subsequence, lambda it: it.dim):
                members = list(run)
                if len(members) >= 2:
                    groups.append(members)

        for members in groups:
            # Pre-condition: every member must be a perfect Iteration nest
            if not all(IsPerfectIteration().visit(m) for m in members):
                continue

            # Keep the leading trees whose depth matches that of the first one
            all_trees = [retrieve_iteration_tree(m)[0] for m in members]
            reference_depth = len(all_trees[0])
            trees = []
            for tree in all_trees:
                if len(tree) != reference_depth:
                    break
                trees.append(tree)
            if not trees:
                continue

            # Level-by-level tuples (last tree first) must all be foldable
            pairwise_folds = list(zip(*reversed(trees)))
            if not all(is_foldable(level) for level in pairwise_folds):
                continue

            # Possibly leave the innermost level alone
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]

            # Nothing left to fold
            if not pairwise_folds:
                continue

            # TODO: blocking is currently unsupported if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Fold: the first Iteration of each level absorbs the others,
            # which get dropped from the tree
            for level in pairwise_folds:
                head, remainder = level[0], level[1:]
                folds = []
                for other in remainder:
                    shift = tuple(y - x
                                  for x, y in zip(other.offsets, head.offsets))
                    folds.append((shift, other.nodes))
                replacements[head] = IterationFold(folds=folds, **head.args)
                for other in remainder:
                    replacements[other] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    iet = Transformer(replacements, nested=True).visit(iet)

    return iet
def unfold_blocked_tree(iet):
    """
    Expand nested IterationFolds back into separate Iteration nests.

    Examples
    --------
    Given a section of Iteration/Expression tree as below: ::

        for i = 1 to N-1  // folded
          for j = 1 to N-1  // folded
            foo1()

    Assuming a fold with offset 1 in both /i/ and /j/ and body ``foo2()``,
    create: ::

        for i = 1 to N-1
          for j = 1 to N-1
            foo1()
        for i = 2 to N-2
          for j = 2 to N-2
            foo2()
    """
    # First pass: gather, for each Iteration tree, its IterationFolds
    folded_nests = []
    for tree in retrieve_iteration_tree(iet):
        folds = tuple(it for it in tree if it.is_IterationFold)
        if not folds:
            continue
        # Sanity check: the outermost fold must root a perfect nest
        assert IsPerfectIteration().visit(folds[0])
        folded_nests.append(folds)

    # Second pass: unfold each nest and record the replacement for its root
    replacements = {}
    for folds in folded_nests:
        unfolded = list(zip(*[it.unfold() for it in folds]))
        unfolded = optimize_unfolded_tree(unfolded[:-1], unfolded[-1])
        replacements[folds[0]] = List(body=unfolded)

    # Insert the unfolded Iterations in the Iteration/Expression tree
    iet = Transformer(replacements).visit(iet)

    return iet
def _find_collapsable(self, root, candidates):
    """
    Return the Iterations that can be collapsed together with `root`.

    Parameters
    ----------
    root : Iteration
        The outermost Iteration of the nest.
    candidates : sequence
        The candidate Iterations, outermost first. ``candidates[0]`` is not
        examined (the scan starts from the second entry).

    Returns
    -------
    list
        The leading run of ``candidates[1:]`` that passed all checks; empty
        if ``self.ncores`` is below ``self.collapse_ncores``.
    """
    selected = []
    if self.ncores >= self.collapse_ncores:
        for pos, inner in enumerate(candidates[1:], 1):
            # [root, ..., inner] has to be a perfect Iteration nest
            if not IsPerfectIteration(depth=inner).visit(root):
                break

            # Two loops cannot be collapsed if the iteration variable of one
            # shows up in the initializer of the other, e.g.:
            #
            #   for (i = ... )
            #     for (j = i ...)
            #       ...
            lower_bound_symbols = inner.symbolic_min.free_symbols
            if any(outer.dim in lower_bound_symbols
                   for outer in candidates[:pos]):
                break

            # SIMD-vectorized Iterations are never collapsed
            if inner.is_Vectorized:
                break

            # Is enough work left for each parallel iteration?
            deeper = candidates[pos + 1:]
            if deeper:
                try:
                    work = prod([int(it.dim.symbolic_size) for it in deeper])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    pass

            selected.append(inner)
    return selected
def make_simd(self, iet):
    """
    Attach a SIMD pragma and the VECTORIZED property to the innermost
    parallel Iteration of each Iteration tree, when suitable.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be processed.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    replacements = {}
    for tree in retrieve_iteration_tree(iet):
        parallel = [it for it in tree if it.is_ParallelRelaxed]

        # Vectorize the innermost PARALLEL Iteration only if there is at least
        # one outer level of parallelism
        if len(parallel) < 2:
            continue
        *outer, candidate = parallel

        # ParallelRelaxed might not be enough: only fully-parallel Iterations
        # get SIMD-ized
        if not candidate.is_Parallel:
            continue

        # Catch the case in which an Iteration is the vectorizable candidate
        # in tree A but has less priority than the candidate of another tree
        # B. For example:
        #
        #   for (i = ... )  (End of tree A - i is the candidate for tree A)
        #     Expr1
        #     for (j = ...) (End of tree B - j is the candidate for tree B)
        #       Expr2
        #       ...
        if not IsPerfectIteration(depth=outer[-1]).visit(candidate):
            continue

        # Array reductions require support from the backend compiler. For
        # example, this may be possible:
        #
        #   #pragma parallel reduction(a[...])
        #   for (i = ...)
        #     #pragma simd
        #     for (j = ...)
        #       a[j] += ...
        #
        # while this could be unsupported:
        #
        #   #pragma parallel  // compiler doesn't support array reduction
        #   for (i = ...)
        #     #pragma simd
        #     for (j = ...)
        #       #pragma atomic  // cannot nest simd and atomic
        #       a[j] += ...
        needs_atomics = any(it.is_ParallelAtomic for it in outer) and \
            not self._support_array_reduction(self.compiler)
        if needs_atomics:
            exprs = FindNodes(Expression).visit(candidate)
            if any(e.output.is_Indexed for e in exprs if e.is_Increment):
                continue

        # Build the SIMD pragma, with an aligned clause if there are any
        # DiscreteFunction accesses
        indexeds = FindSymbols('indexeds').visit(candidate)
        aligned = {ind.name for ind in indexeds
                   if ind.function.is_DiscreteFunction}
        if aligned:
            template = self.lang['simd-for-aligned']
            names = ','.join(sorted(aligned))
            simd = as_tuple(template(names, self.simd_reg_size))
        else:
            simd = as_tuple(self.lang['simd-for'])

        # Rebuild the candidate with the extra pragma and VECTORIZED property
        properties = list(candidate.properties)
        properties.append(VECTORIZED)
        replacements[candidate] = candidate._rebuild(
            pragmas=candidate.pragmas + simd, properties=properties
        )

    iet = Transformer(replacements).visit(iet)

    return iet, {}
def make_blocking(self, iet):
    """
    Apply hierarchical loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rebuilt as a blocked nest, moved into a separate
    Callable, and replaced in `iet` by a list of calls to that Callable, one
    per combination of iteration ranges. ``self.nblocked`` is incremented for
    every nest that gets blocked.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be processed.

    Returns
    -------
    tuple
        The transformed tree and a dict with keys ``'dimensions'`` (the
        BlockDimensions created), ``'efuncs'`` (the Callables created) and
        ``'args'`` (the step of each BlockDimension).
    """
    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, self.blockinner)

    replacements = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        tilable = filter_iterations(tree, lambda it: it.is_Tilable)
        if not self.blockinner:
            tilable = tilable[:-1]
        if len(tilable) <= 1:
            continue
        root = tilable[0]
        if not IsPerfectIteration().visit(root):
            # Non-perfect Iteration nests are not blocked
            continue

        # Build the Iterations of the hierarchically blocked nest
        outer_level = []  # Outermost level of blocking
        inner_levels = [[] for _ in range(1, self.nlevels)]  # Inner levels
        within_block = []  # Within the smallest block
        for it in tilable:
            template = "%s%d_blk%s" % (it.dim.name, self.nblocked, '%d')
            properties = (PARALLEL,) + ((AFFINE,) if it.is_Affine else ())

            # Iteration across the outermost blocks
            bdim = BlockDimension(it.dim, name=template % 0)
            outer_level.append(
                Iteration([], bdim, bdim.symbolic_max, properties=properties)
            )

            # Iterations across the blocks of each inner level, each one
            # nested in the block of the previous level
            for level, bucket in enumerate(inner_levels, 1):
                sub = BlockDimension(bdim, name=template % level)
                limits = (bdim, bdim + bdim.step - 1, sub.step)
                bucket.append(
                    Iteration([], sub, limits=limits, properties=properties)
                )
                bdim = sub

            # Iteration within the smallest block
            within_block.append(
                it._rebuild([], limits=(bdim, bdim + bdim.step - 1, 1),
                            offsets=(0, 0))
            )
        inner_flat = flatten(inner_levels)

        # Track all constructed BlockDimensions
        block_dims.extend(b.dim for b in outer_level + inner_flat)

        # Construct the blocked tree
        blocked = compose_nodes(
            outer_level + inner_flat + within_block + [tilable[-1].nodes]
        )
        blocked = unfold_blocked_tree(blocked)

        # Promote the blocked tree to a separate Callable
        dynamic_parameters = flatten((b.dim, b.step) for b in outer_level)
        dynamic_parameters.extend([b.step for b in inner_flat])
        efunc = make_efunc("bf%d" % self.nblocked, blocked, dynamic_parameters)
        efuncs.append(efunc)

        # For each blocked Iteration, two ranges: the main one, spanned with
        # the block step, and the remainder one
        ranges = []
        for it, blk in zip(tilable, outer_level):
            maxb = it.symbolic_max - (it.symbolic_size % blk.step)
            main = (it.symbolic_min, maxb, blk.step)
            remainder = (maxb + 1, it.symbolic_max, it.symbolic_max - maxb)
            ranges.append((main, remainder))

        # One Call to the `efunc` per combination of ranges
        calls = []
        for combo in product(*ranges):
            dynamic_args_mapper = {}
            for blk, (lower, upper, step) in zip(outer_level, combo):
                dynamic_args_mapper[blk.dim] = (lower, upper)
                dynamic_args_mapper[blk.step] = (step,)
                for sub in inner_flat:
                    if sub.dim.root is blk.dim.root:
                        # Keep the inner step only when the outer one is the
                        # symbolic block step; otherwise use the range step
                        value = sub.step if step is blk.step else step
                        dynamic_args_mapper[sub.step] = (value,)
            call = efunc.make_call(dynamic_args_mapper)
            calls.append(List(body=call))

        replacements[root] = List(body=calls)

        # Next blockable nest, use different (unique) variable/function names
        self.nblocked += 1

    iet = Transformer(replacements).visit(iet)

    # Force-unfold if some folded Iterations haven't been blocked in the end
    iet = unfold_blocked_tree(iet)

    return iet, {'dimensions': block_dims,
                 'efuncs': efuncs,
                 'args': [d.step for d in block_dims]}
def _make_partree(self, candidates, nthreads=None):
    """
    Parallelize the outermost candidate, producing a ParallelTree.

    Parameters
    ----------
    candidates : sequence
        Non-empty sequence of Iterations, outermost first. The first entry
        receives the pragma; the others are examined, in order, for inclusion
        in the collapse clause.
    nthreads : optional
        Number of threads. If provided (only allowed when all candidates are
        affine), the ``'par-for'`` pragma is used with it; otherwise the
        ``'for'`` pragma is used and the value comes from ``self.nthreads``
        (affine case) or ``self.nthreads_nonaffine`` (non-affine case).

    Returns
    -------
    tuple
        ``(root, partree, collapsed)``: the original first candidate, the
        ParallelTree wrapping its rebuilt counterpart, and the list made of
        ``partree`` followed by the inner Iterations that were collapsed.
    """
    assert candidates
    root = candidates[0]

    # Get the collapsable Iterations
    collapsable = []
    if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
        for n, i in enumerate(candidates[1:], 1):
            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is
            # forbidden:
            #
            #   #pragma omp ... collapse(2)
            #   for (i = ... )
            #     for (j = i ...)
            #       ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse vectorizable Iterations
            if i.is_Vectorizable:
                break

            # Would there be enough work per parallel iteration?
            # The estimate is only meaningful if there are Iterations nested
            # below `i`. Without this guard the product would be taken over an
            # empty list (presumably yielding 1 -- confirm against `prod`),
            # which would trip the threshold and prevent the innermost
            # candidate from ever being collapsed.
            nested = candidates[n + 1:]
            if nested:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in nested])
                    if work < Ompizer.COLLAPSE_WORK:
                        break
                except TypeError:
                    # Sizes that cannot be turned into an int: the amount of
                    # work is unknown, so the heuristic is skipped
                    pass

            collapsable.append(i)
    ncollapse = 1 + len(collapsable)

    # Prepare to build a ParallelTree
    prefix = []
    if all(i.is_Affine for i in candidates):
        if nthreads is None:
            # pragma omp for ... schedule(..., 1)
            nthreads = self.nthreads
            omp_pragma = self.lang['for'](ncollapse, 1)
        else:
            # pragma omp parallel for ... schedule(..., 1)
            omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
    else:
        # pragma omp for ... schedule(..., expr)
        assert nthreads is None
        nthreads = self.nthreads_nonaffine

        # The chunk size is computed by an Expression placed ahead of the
        # parallel loop, from the total number of collapsed iterations
        chunk_size = Symbol(name='chunk_size')
        omp_pragma = self.lang['for'](ncollapse, chunk_size)

        niters = prod([root.symbolic_size] +
                      [j.symbolic_size for j in collapsable])
        value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
        prefix.append(Expression(DummyEq(chunk_size, value, dtype=np.int32)))

    # Create a ParallelTree
    body = root._rebuild(pragmas=root.pragmas + (omp_pragma,),
                         properties=root.properties + (COLLAPSED(ncollapse),))
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    collapsed = [partree] + collapsable

    return root, partree, collapsed