def _make_partree(self, candidates, nthreads=None): """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas.""" assert candidates root = candidates[0] # Get the collapsable Iterations collapsable = self._find_collapsable(root, candidates) ncollapse = 1 + len(collapsable) # Prepare to build a ParallelTree if all(i.is_Affine for i in candidates): bundles = FindNodes(ExpressionBundle).visit(root) sops = sum(i.ops for i in bundles) if sops >= self.dynamic_work: schedule = 'dynamic' else: schedule = 'static' if nthreads is None: # pragma omp for ... schedule(..., 1) nthreads = self.nthreads body = OpenMPIteration(schedule=schedule, ncollapse=ncollapse, **root.args) else: # pragma omp parallel for ... schedule(..., 1) body = OpenMPIteration(schedule=schedule, parallel=True, ncollapse=ncollapse, nthreads=nthreads, **root.args) prefix = [] else: # pragma omp for ... schedule(..., expr) assert nthreads is None nthreads = self.nthreads_nonaffine chunk_size = Symbol(name='chunk_size') body = OpenMPIteration(ncollapse=ncollapse, chunk_size=chunk_size, **root.args) niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable]) value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1)) prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))] # Create a ParallelTree partree = ParallelTree(prefix, body, nthreads=nthreads) collapsed = [partree] + collapsable return root, partree, collapsed
def _make_partree(self, candidates, nthreads=None): """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas.""" assert candidates root = candidates[0] # Get the collapsable Iterations collapsable = self._find_collapsable(root, candidates) ncollapse = 1 + len(collapsable) # Prepare to build a ParallelTree prefix = [] if all(i.is_Affine for i in candidates): if nthreads is None: # pragma omp for ... schedule(..., 1) nthreads = self.nthreads omp_pragma = self.lang['for'](ncollapse, 1) else: # pragma omp parallel for ... schedule(..., 1) omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads) else: # pragma omp for ... schedule(..., expr) assert nthreads is None nthreads = self.nthreads_nonaffine chunk_size = Symbol(name='chunk_size') omp_pragma = self.lang['for'](ncollapse, chunk_size) niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable]) value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1)) prefix.append( Expression(DummyEq(chunk_size, value, dtype=np.int32))) # Create a ParallelTree body = root._rebuild(pragmas=root.pragmas + (omp_pragma, ), properties=root.properties + (COLLAPSED(ncollapse), )) partree = ParallelTree(prefix, body, nthreads=nthreads) collapsed = [partree] + collapsable return root, partree, collapsed
def _make_partree(self, candidates, nthreads=None): """Parallelize `root` attaching a suitable OpenMP pragma.""" assert candidates root = candidates[0] # Get the collapsable Iterations collapsable = [] if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit( root): for n, i in enumerate(candidates[1:], 1): # The OpenMP specification forbids collapsed loops to use iteration # variables in initializer expressions. E.g., the following is forbidden: # # #pragma omp ... collapse(2) # for (i = ... ) # for (j = i ...) # ... # # Here, we make sure this won't happen if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]): break # Also, we do not want to collapse vectorizable Iterations if i.is_Vectorizable: break # Would there be enough work per parallel iteration? try: work = prod( [int(j.dim.symbolic_size) for j in candidates[n + 1:]]) if work < Ompizer.COLLAPSE_WORK: break except TypeError: pass collapsable.append(i) ncollapse = 1 + len(collapsable) # Prepare to build a ParallelTree prefix = [] if all(i.is_Affine for i in candidates): if nthreads is None: # pragma omp for ... schedule(..., 1) nthreads = self.nthreads omp_pragma = self.lang['for'](ncollapse, 1) else: # pragma omp parallel for ... schedule(..., 1) omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads) else: # pragma omp for ... schedule(..., expr) assert nthreads is None nthreads = self.nthreads_nonaffine chunk_size = Symbol(name='chunk_size') omp_pragma = self.lang['for'](ncollapse, chunk_size) niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable]) value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1)) prefix.append( Expression(DummyEq(chunk_size, value, dtype=np.int32))) # Create a ParallelTree body = root._rebuild(pragmas=root.pragmas + (omp_pragma, ), properties=root.properties + (COLLAPSED(ncollapse), )) partree = ParallelTree(prefix, body, nthreads=nthreads) collapsed = [partree] + collapsable return root, partree, collapsed