def _(iet):
    """
    Prepend the device-selection setup code to `iet`.

    If a device ID other than -1 is supplied at runtime, that device is
    selected. Otherwise, when `iet` has an MPI communicator among its
    parameters, the device is chosen as `rank % ngpus`; without MPI no
    fallback selection is emitted.

    Returns the rebuilt `iet` and a dict carrying the `DeviceID` as new
    argument.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)

    # First MPI communicator among the parameters, if any
    objcomm = next((p for p in iet.parameters if isinstance(p, MPICommObject)),
                   None)

    devicetype = as_list(self.lang[self.platform])

    try:
        lang_init = [self.lang['init'](devicetype)]
    except TypeError:
        # Not all target languages need to be explicitly initialized
        lang_init = []

    deviceid = DeviceID()

    if objcomm is None:
        # No MPI: only honor an explicitly provided device ID
        setup = Conditional(
            CondNe(deviceid, -1),
            self.lang['set-device']([deviceid] + devicetype)
        )

        header = c.Comment('Begin of %s setup' % self.lang['name'])
        footer = c.Comment('End of %s setup' % self.lang['name'])
    else:
        rank = Symbol(name='rank')
        ngpus = Symbol(name='ngpus')

        # Fallback path: query the MPI rank and the number of devices ...
        fallback = [
            LocalExpression(DummyEq(rank, 0)),
            Call('MPI_Comm_rank', [objcomm, Byref(rank)]),
            LocalExpression(DummyEq(ngpus, self.lang['num-devices'](devicetype))),
        ]

        explicit = self.lang['set-device']([deviceid] + devicetype)
        # ... then distribute the ranks round-robin over the devices
        fallback.append(self.lang['set-device']([rank % ngpus] + devicetype))

        setup = Conditional(CondNe(deviceid, -1), explicit, List(body=fallback))

        header = c.Comment('Begin of %s+MPI setup' % self.lang['name'])
        footer = c.Comment('End of %s+MPI setup' % self.lang['name'])

    init = List(header=header, body=lang_init + [setup],
                footer=(footer, c.Line()))

    return iet._rebuild(body=(init,) + iet.body), {'args': deviceid}
def test_cse(exprs, expected):
    """
    Test common subexpressions elimination.

    Parameters
    ----------
    exprs : list of str
        Expressions in string form. Each one is `eval`-uated against the
        local symbols defined below, so the strings may only refer to those
        names. The list is overwritten in place with the built equations.
        NOTE(review): presumably supplied by a parametrization decorator that
        is not visible here -- confirm.
    expected : list of str
        The expected right-hand sides, in string form, of the expressions
        produced by `_cse`, in output order.
    """
    grid = Grid((3, 3, 3))
    dims = grid.dimensions

    # The symbols below look unused, but they are looked up by name by the
    # `eval` calls further down -- do not rename or remove them (hence the
    # `noqa` markers)
    tu = TimeFunction(name="tu", grid=grid, space_order=2)  # noqa
    tv = TimeFunction(name="tv", grid=grid, space_order=2)  # noqa
    tw = TimeFunction(name="tw", grid=grid, space_order=2)  # noqa
    tz = TimeFunction(name="tz", grid=grid, space_order=2)  # noqa
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()  # noqa
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()  # noqa
    t0 = Scalar(name='t0')  # noqa
    t1 = Scalar(name='t1')  # noqa
    t2 = Scalar(name='t2')  # noqa

    # List comprehension would need explicit locals/globals mappings to eval
    # Iterate over a copy since `exprs` is updated in place
    for i, e in enumerate(list(exprs)):
        exprs[i] = DummyEq(indexify(eval(e).evaluate))

    # Temporaries are named `r<N>`, with `<N>` drawn from `counter`
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed = _cse(exprs, make)

    # Same number of output expressions, and same right-hand sides (compared
    # in string form) in the same order
    assert len(processed) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(processed, expected))
def common_subexprs_elimination(exprs, make, mode='default'):
    """
    Perform common sub-expressions elimination, or CSE.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    exprs : expr-like or list of expr-like
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values. It is called
        with no arguments, once per extracted sub-expression.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].

    Returns
    -------
    list
        The temporaries introduced by CSE followed by the rewritten input
        expressions, after dropping useless temporaries and topological
        sorting.
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some sort of post-processing
    assert mode == 'default'  # Only supported mode ATM

    processed = list(exprs)
    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True))
                               for k, v in counted if v > 1])
        if not targets:
            break

        # Create temporaries -- only for the most expensive sub-expressions;
        # the cheaper ones are reconsidered in the next round, since `targets`
        # is recomputed from scratch at each iteration
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [e.xreplace(mapper) for e in processed]
        mapped = [e.xreplace(mapper) for e in mapped]
        mapped = [DummyEq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    # Perform topological sorting so that reads-after-writes are honored
    processed = _topological_sort(processed)

    return processed
def _make_partree(self, candidates, nthreads=None):
    """
    Build a ParallelTree out of the `candidates` Iterations.

    The first candidate is the root of the tree and gets turned into an
    OpenMPIteration; the Iterations found collapsable with it are reported
    alongside the tree.

    Returns the root Iteration, the ParallelTree, and the list made of the
    ParallelTree followed by the collapsable Iterations.
    """
    assert candidates
    root = candidates[0]

    # The Iterations that can be collapsed together with `root`
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = len(collapsable) + 1

    if not all(i.is_Affine for i in candidates):
        # pragma omp for ... schedule(..., expr)
        # The chunk size is computed at runtime from the iteration count
        assert nthreads is None
        nthreads = self.nthreads_nonaffine

        chunk_size = Symbol(name='chunk_size')
        body = OpenMPIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                               **root.args)

        sizes = [root.symbolic_size]
        sizes.extend([j.symbolic_size for j in collapsable])
        niters = prod(sizes)
        value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
    else:
        # The scheduling policy depends on the amount of work in the loop nest
        bundles = FindNodes(ExpressionBundle).visit(root)
        sops = sum(i.ops for i in bundles)
        schedule = 'dynamic' if sops >= self.dynamic_work else 'static'

        if nthreads is not None:
            # pragma omp parallel for ...
            body = OpenMPIteration(schedule=schedule, parallel=True,
                                   ncollapse=ncollapse, nthreads=nthreads,
                                   **root.args)
        else:
            # pragma omp for ...
            nthreads = self.nthreads
            body = OpenMPIteration(schedule=schedule, ncollapse=ncollapse,
                                   **root.args)

        prefix = []

    # Put everything together
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    return root, partree, [partree] + collapsable
def _make_partree(self, candidates, nthreads=None):
    """
    Build a ParallelTree out of the `candidates` Iterations.

    The first candidate is the root of the tree; it is rebuilt with an
    additional OpenMP pragma and the COLLAPSED property attached.

    Returns the root Iteration, the ParallelTree, and the list made of the
    ParallelTree followed by the collapsable Iterations.
    """
    assert candidates
    root = candidates[0]

    # The Iterations that can be collapsed together with `root`
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = len(collapsable) + 1

    if not all(i.is_Affine for i in candidates):
        # pragma omp for ... schedule(..., expr)
        # The chunk size is computed at runtime from the iteration count
        assert nthreads is None
        nthreads = self.nthreads_nonaffine

        chunk_size = Symbol(name='chunk_size')
        omp_pragma = self.lang['for'](ncollapse, chunk_size)

        sizes = [root.symbolic_size]
        sizes.extend([j.symbolic_size for j in collapsable])
        niters = prod(sizes)
        value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
    elif nthreads is not None:
        # pragma omp parallel for ... schedule(..., 1)
        omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
        prefix = []
    else:
        # pragma omp for ... schedule(..., 1)
        nthreads = self.nthreads
        omp_pragma = self.lang['for'](ncollapse, 1)
        prefix = []

    # Decorate `root` and put everything together
    pragmas = root.pragmas + (omp_pragma,)
    properties = root.properties + (COLLAPSED(ncollapse),)
    body = root._rebuild(pragmas=pragmas, properties=properties)
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    return root, partree, [partree] + collapsable
def _cse(maybe_exprs, make, mode='default'):
    """
    Main common sub-expressions elimination routine.

    Note: the output is guaranteed to be topologically sorted.

    Parameters
    ----------
    maybe_exprs : expr-like or list of expr-like or Cluster
        One or more expressions to which CSE is applied.
    make : callable
        Build symbols to store temporary, redundant values. It is called
        with no arguments, once per extracted sub-expression.
    mode : str, optional
        The CSE algorithm applied. Accepted: ['default'].

    Returns
    -------
    list
        The temporaries introduced by CSE followed by the rewritten input
        expressions, after dropping useless temporaries.
    """
    # Note: not defaulting to SymPy's CSE() function for three reasons:
    # - it also captures array index access functions (eg, i+1 in A[i+1] and B[i+1]);
    # - it sometimes "captures too much", losing factorization opportunities;
    # - very slow
    # TODO: a second "sympy" mode will be provided, relying on SymPy's CSE() but
    # also ensuring some form of post-processing
    assert mode == 'default'  # Only supported mode ATM

    # Just for flexibility, accept either Clusters or exprs
    if isinstance(maybe_exprs, Cluster):
        cluster = maybe_exprs
        processed = list(cluster.exprs)
        scope = cluster.scope
    else:
        processed = list(maybe_exprs)
        # Build the Scope from the materialized list: `maybe_exprs` may be a
        # one-shot iterable, which `list` has just exhausted
        scope = Scope(processed)

    # Some sub-expressions aren't really "common" -- that's the case of Dimension-
    # independent data dependences. For example:
    #
    # ... = ... a[i] + 1 ...
    # a[i] = ...
    # ... = ... a[i] + 1 ...
    #
    # `a[i] + 1` will be excluded, as there's a flow Dimension-independent data
    # dependence involving `a`
    exclude = {i.source.indexed for i in scope.d_flow.independent()}

    mapped = []
    while True:
        # Detect redundancies
        counted = count(mapped + processed, q_xop).items()
        targets = OrderedDict([(k, estimate_cost(k, True))
                               for k, v in counted if v > 1])

        # Rule out Dimension-independent data dependencies
        targets = OrderedDict([(k, v) for k, v in targets.items()
                               if not k.free_symbols & exclude])

        if not targets:
            break

        # Create temporaries -- only for the most expensive sub-expressions;
        # the cheaper ones are reconsidered in the next round, since `targets`
        # is recomputed from scratch at each iteration
        hit = max(targets.values())
        picked = [k for k, v in targets.items() if v == hit]
        mapper = OrderedDict([(e, make()) for e in picked])

        # Apply replacements
        processed = [uxreplace(e, mapper) for e in processed]
        mapped = [uxreplace(e, mapper) for e in mapped]
        mapped = [DummyEq(v, k) for k, v in reversed(list(mapper.items()))] + mapped

        # Update `exclude` for the same reasons as above -- to rule out CSE across
        # Dimension-independent data dependences
        exclude.update(mapper.values())

    processed = mapped + processed

    # At this point we may have useless temporaries (e.g., r0=r1). Let's drop them
    processed = _compact_temporaries(processed)

    return processed
def _make_partree(self, candidates, nthreads=None):
    """
    Build a ParallelTree out of the `candidates` Iterations.

    The first candidate is the root of the tree; it is rebuilt with an
    additional OpenMP pragma and the COLLAPSED property attached. The
    Iterations nested in it are inspected, outermost first, to determine
    how many of them can be collapsed together with the root.

    Returns the root Iteration, the ParallelTree, and the list made of the
    ParallelTree followed by the collapsable Iterations.
    """
    assert candidates
    root = candidates[0]

    # Determine the collapsable Iterations. Stop at the first one that
    # cannot be collapsed
    collapsable = []
    if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
        for depth, inner in enumerate(candidates[1:], 1):
            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is
            # forbidden:
            #
            # #pragma omp ... collapse(2)
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            lower_symbols = inner.symbolic_min.free_symbols
            if any(outer.dim in lower_symbols for outer in candidates[:depth]):
                break

            # Vectorizable Iterations are not collapsed either
            if inner.is_Vectorizable:
                break

            # Would there be enough work per parallel iteration? If the
            # sizes aren't known numerically, assume there would
            try:
                work = prod([int(j.dim.symbolic_size)
                             for j in candidates[depth + 1:]])
                insufficient = work < Ompizer.COLLAPSE_WORK
            except TypeError:
                insufficient = False
            if insufficient:
                break

            collapsable.append(inner)
    ncollapse = len(collapsable) + 1

    if not all(i.is_Affine for i in candidates):
        # pragma omp for ... schedule(..., expr)
        # The chunk size is computed at runtime from the iteration count
        assert nthreads is None
        nthreads = self.nthreads_nonaffine

        chunk_size = Symbol(name='chunk_size')
        omp_pragma = self.lang['for'](ncollapse, chunk_size)

        sizes = [root.symbolic_size]
        sizes.extend([j.symbolic_size for j in collapsable])
        niters = prod(sizes)
        value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
    elif nthreads is not None:
        # pragma omp parallel for ... schedule(..., 1)
        omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
        prefix = []
    else:
        # pragma omp for ... schedule(..., 1)
        nthreads = self.nthreads
        omp_pragma = self.lang['for'](ncollapse, 1)
        prefix = []

    # Decorate `root` and put everything together
    pragmas = root.pragmas + (omp_pragma,)
    properties = root.properties + (COLLAPSED(ncollapse),)
    body = root._rebuild(pragmas=pragmas, properties=properties)
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    return root, partree, [partree] + collapsable