def mpiize(iet, **kwargs):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.
    """
    mode = kwargs.pop('mode')

    # To produce unique object names
    generators = {'msg': generator(), 'comm': generator(), 'comp': generator()}
    sync_heb = HaloExchangeBuilder('basic', **generators)
    user_heb = HaloExchangeBuilder(mode, **generators)

    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        heb = user_heb if isinstance(hs, OverlappableHaloSpot) else sync_heb
        mapper[hs] = heb.make(hs)
    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = filter_sorted(sync_heb.objs + user_heb.objs)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for i in reversed(tree):
            if i in mapper:
                # Already seen this subtree, skip
                break
            if FindNodes(Call).visit(i):
                mapper.update({n: n._rebuild(properties=set(n.properties) - {PARALLEL})
                               for n in tree[:tree.index(i) + 1]})
                break
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
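# NOTE: everything in this section relies on a `generator` helper to mint
# unique integer suffixes for temporaries and object names. Its definition is
# not shown here; the following is a minimal sketch of the assumed behaviour
# (each call to `generator()` returns a fresh, independent counter yielding
# 0, 1, 2, ...), not necessarily the actual implementation.
from itertools import count

def generator():
    c = count()
    return lambda: next(c)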
def __new__(cls, mode, **generators):
    if mode is True or mode == 'basic':
        obj = object.__new__(BasicHaloExchangeBuilder)
    elif mode == 'diag':
        obj = object.__new__(DiagHaloExchangeBuilder)
    elif mode == 'overlap':
        obj = object.__new__(OverlapHaloExchangeBuilder)
    elif mode == 'overlap2':
        obj = object.__new__(Overlap2HaloExchangeBuilder)
    elif mode == 'full':
        obj = object.__new__(FullHaloExchangeBuilder)
    else:
        assert False, "unexpected value `mode=%s`" % mode

    # Unique name generators
    obj._gen_msgkey = generators.get('msg', generator())
    obj._gen_commkey = generators.get('comm', generator())
    obj._gen_compkey = generators.get('comp', generator())

    obj._cache_halo = OrderedDict()
    obj._cache_dims = OrderedDict()
    obj._objs = OrderedSet()
    obj._regions = OrderedDict()
    obj._msgs = OrderedDict()
    obj._efuncs = []

    return obj
def test_common_subexprs_elimination(tu, tv, tw, ti0, ti1, t0, t1, exprs, expected):
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed = common_subexprs_elimination(EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1),
                                            make)
    assert len(processed) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(processed, expected))
def test_cse(exprs, expected):
    """Test common subexpressions elimination."""
    grid = Grid((3, 3, 3))
    dims = grid.dimensions

    tu = TimeFunction(name="tu", grid=grid, space_order=2)  # noqa
    tv = TimeFunction(name="tv", grid=grid, space_order=2)  # noqa
    tw = TimeFunction(name="tw", grid=grid, space_order=2)  # noqa
    tz = TimeFunction(name="tz", grid=grid, space_order=2)  # noqa
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()  # noqa
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()  # noqa
    t0 = Scalar(name='t0')  # noqa
    t1 = Scalar(name='t1')  # noqa
    t2 = Scalar(name='t2')  # noqa

    # List comprehension would need explicit locals/globals mappings to eval
    for i, e in enumerate(list(exprs)):
        exprs[i] = DummyEq(indexify(eval(e).evaluate))

    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed = _cse(exprs, make)
    assert len(processed) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(processed, expected))
def _do_generate(self, exprs, exclude, cbk_search, cbk_compose=None):
    """
    Carry out the bulk of the work of ``_generate``.
    """
    counter = generator()
    make = lambda: Symbol(name='dummy%d' % counter())

    if cbk_compose is None:
        cbk_compose = lambda *args: None

    mapper = Uxmapper()
    for e in exprs:
        for i in cbk_search(e):
            if not i.is_commutative:
                continue
            terms = cbk_compose(i)

            # Make sure we won't break any data dependencies
            if terms:
                free_symbols = set().union(*[i.free_symbols for i in terms])
            else:
                free_symbols = i.free_symbols
            if {a.function for a in free_symbols} & exclude:
                continue

            mapper.add(i, make, terms)

    return mapper
def optimize(clusters, dse_mode):
    """
    Optimize a topologically-ordered sequence of Clusters by applying the
    following transformations:

        * [cross-cluster] Fusion
        * [intra-cluster] Several flop-reduction passes via the DSE
        * [cross-cluster] Lifting
        * [cross-cluster] Scalarization
        * [cross-cluster] Arrays Elimination
    """
    # To create temporaries
    counter = generator()
    template = lambda: "r%d" % counter()

    # Fusion
    clusters = fuse(clusters)

    from devito.dse import rewrite
    clusters = rewrite(clusters, template, mode=dse_mode)

    # Lifting
    clusters = Lift().process(clusters)

    # Lifting may create fusion opportunities
    clusters = fuse(clusters)

    # Fusion may create opportunities to eliminate Arrays (thus shrinking the
    # working set) if these store identical expressions
    clusters = eliminate_arrays(clusters, template)

    # Fusion may create scalarization opportunities
    clusters = scalarize(clusters, template)

    return ClusterGroup(clusters)
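# Quick demo of the `template` closure built in `optimize` above: successive
# calls yield the unique temporary names "r0", "r1", "r2", ... (this relies on
# the sketched `generator` -- an assumption about its exact semantics).
counter = generator()
template = lambda: "r%d" % counter()
assert [template() for _ in range(3)] == ["r0", "r1", "r2"]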
def _specialize_iet(cls, graph, **kwargs):
    options = kwargs['options']
    platform = kwargs['platform']

    # Flush denormal numbers
    avoid_denormals(graph)

    # Distributed-memory parallelism
    optimize_halospots(graph)
    if options['mpi']:
        mpiize(graph, mode=options['mpi'])

    # Lower IncrDimensions so that blocks of arbitrary shape may be used
    relax_incr_dimensions(graph, counter=generator())

    # SIMD-level parallelism
    ompizer = Ompizer()
    ompizer.make_simd(graph, simd_reg_size=platform.simd_reg_size)

    # Shared-memory parallelism
    ompizer.make_parallel(graph)

    # Misc optimizations
    hoist_prodders(graph)

    # Symbol definitions
    data_manager = DataManager()
    data_manager.place_definitions(graph)
    data_manager.place_casts(graph)

    return graph
def _specialize_clusters(cls, clusters, **kwargs):
    options = kwargs['options']
    platform = kwargs['platform']

    # To create temporaries
    counter = generator()
    template = lambda: "r%d" % counter()

    # Toposort+Fusion (the former to expose more fusion opportunities)
    clusters = Toposort().process(clusters)
    clusters = fuse(clusters)

    # Hoist and optimize Dimension-invariant sub-expressions
    clusters = cire(clusters, template, 'invariants', options, platform)
    clusters = Lift().process(clusters)

    # Reduce flops (potential arithmetic alterations)
    clusters = extract_increments(clusters, template)
    clusters = cire(clusters, template, 'sops', options, platform)
    clusters = factorize(clusters)
    clusters = optimize_pows(clusters)

    # Reduce flops (no arithmetic alterations)
    clusters = cse(clusters, template)

    # The previous passes may have created fusion opportunities, which in
    # turn may enable further optimizations
    clusters = fuse(clusters)
    clusters = eliminate_arrays(clusters, template)

    # Blocking to improve data locality
    clusters = Blocking(options).process(clusters)

    return clusters
def _specialize_clusters(cls, clusters, **kwargs):
    """
    Optimize Clusters for better runtime performance.
    """
    # To create temporaries
    counter = generator()
    template = lambda: "r%d" % counter()

    # Toposort+Fusion (the former to expose more fusion opportunities)
    clusters = Toposort().process(clusters)
    clusters = fuse(clusters)

    # Flop reduction via the DSE
    clusters = rewrite(clusters, template, **kwargs)

    # Lifting
    clusters = Lift().process(clusters)

    # Lifting may create fusion opportunities, which in turn may enable
    # further optimizations
    clusters = fuse(clusters)
    clusters = eliminate_arrays(clusters, template)
    clusters = scalarize(clusters, template)

    return clusters
def _specialize_clusters(cls, clusters, **kwargs):
    # TODO: this is currently identical to CPU64NoopOperator._specialize_clusters,
    # but it will have to change

    # To create temporaries
    counter = generator()
    template = lambda: "r%d" % counter()

    # Toposort+Fusion (the former to expose more fusion opportunities)
    clusters = Toposort().process(clusters)
    clusters = fuse(clusters)

    # Flop reduction via the DSE
    clusters = rewrite(clusters, template, **kwargs)

    # Lifting
    clusters = Lift().process(clusters)

    # Lifting may create fusion opportunities, which in turn may enable
    # further optimizations
    clusters = fuse(clusters)
    clusters = eliminate_arrays(clusters, template)
    clusters = scalarize(clusters, template)

    return clusters
def __new__(cls, mode, **generators):
    obj = object.__new__(mpi_registry[mode])

    # Unique name generators
    obj._gen_msgkey = generators.get('msg', generator())
    obj._gen_commkey = generators.get('comm', generator())
    obj._gen_compkey = generators.get('comp', generator())

    obj._cache_halo = OrderedDict()
    obj._cache_dims = OrderedDict()
    obj._objs = OrderedSet()
    obj._regions = OrderedDict()
    obj._msgs = OrderedDict()
    obj._efuncs = []

    return obj
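# A plausible shape for the `mpi_registry` consulted above -- an assumption,
# reconstructed from the equivalent if/elif dispatch in the earlier
# HaloExchangeBuilder.__new__; `mode is True` would be normalized to 'basic'
# before the lookup.
mpi_registry = {
    'basic': BasicHaloExchangeBuilder,
    'diag': DiagHaloExchangeBuilder,
    'overlap': OverlapHaloExchangeBuilder,
    'overlap2': Overlap2HaloExchangeBuilder,
    'full': FullHaloExchangeBuilder,
}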
def test_yreplace_time_invariants(tu, tv, tw, ti0, ti1, t0, t1, exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make,
                                make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def __init__(self, profile=True, template=None):
    self.profile = profile

    # Used to build globally-unique temporaries
    if template is None:
        counter = generator()
        self.template = lambda: "%s%d" % (AbstractRewriter.tempname, counter())
    else:
        assert callable(template)
        self.template = template
def test_xreplace_constrained_time_varying(tu, tv, tw, ti0, ti1, t0, t1,
                                           exprs, expected):
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = xreplace_constrained(exprs, make,
                                            iq_timevarying(FlowGraph(exprs)),
                                            lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def make_name(self, prefix=None):
    # By default we're creating a new symbol
    if prefix is None:
        prefix = self._symbol_prefix
    try:
        counter = self.counters[prefix]
    except KeyError:
        counter = self.counters.setdefault(prefix, generator())
    return "%s%d" % (prefix, counter())
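# Hypothetical usage of `make_name`, assuming it is a method of a symbol
# factory whose `counters` dict starts empty and whose `_symbol_prefix` is
# 'x': each prefix gets its own lazily-created, independent counter.
#
#   factory.make_name()            # -> 'x0'
#   factory.make_name()            # -> 'x1'
#   factory.make_name(prefix='y')  # -> 'y0'  (independent counter)
#   factory.make_name(prefix='y')  # -> 'y1'
#   factory.make_name()            # -> 'x2'  (the 'x' counter resumes)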
def __init__(self, profile=True, template=None):
    self.profile = profile

    # Used to build globally-unique temporaries
    if template is None:
        counter = generator()
        self.template = lambda: "%s%d" % (AbstractRewriter.tempname, counter())
    else:
        assert callable(template)
        self.template = template

    # Track performance of each cluster
    self.run_summary = []
def _make_iet_passes_mapper(cls, **kwargs):
    options = kwargs['options']
    platform = kwargs['platform']

    ompizer = Ompizer()

    return {
        'denormals': avoid_denormals,
        'optcomms': optimize_halospots,
        'wrapping': loop_wrapping,
        'blocking': partial(relax_incr_dimensions, counter=generator()),
        'openmp': ompizer.make_parallel,
        'mpi': partial(mpiize, mode=options['mpi']),
        'simd': partial(ompizer.make_simd, simd_reg_size=platform.simd_reg_size),
        'prodders': hoist_prodders
    }
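# Hypothetical usage of the mapper returned above, assuming `graph`, `options`
# and `platform` are in scope: once `functools.partial` has bound the extra
# arguments, every pass is a uniform single-argument callable over the IET.
#
#   passes_mapper = cls._make_iet_passes_mapper(options=options,
#                                               platform=platform)
#   passes_mapper['mpi'](graph)       # mpiize(graph, mode=options['mpi'])
#   passes_mapper['blocking'](graph)  # relax_incr_dimensions(graph, counter=...)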
def test_yreplace_time_invariants(exprs, expected):
    grid = Grid((3, 3, 3))
    dims = grid.dimensions

    tu = TimeFunction(name="tu", grid=grid, space_order=4).indexify()
    tv = TimeFunction(name="tv", grid=grid, space_order=4).indexify()
    tw = TimeFunction(name="tw", grid=grid, space_order=4).indexify()
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()
    t0 = Scalar(name='t0').indexify()
    t1 = Scalar(name='t1').indexify()

    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)
    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make,
                                make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)
    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def optimize(clusters, dse_mode):
    """
    Optimize a topologically-ordered sequence of Clusters by applying the
    following transformations:

        * [cross-cluster] Fusion
        * [intra-cluster] Several flop-reduction passes via the DSE
        * [cross-cluster] Lifting
        * [cross-cluster] Scalarization
        * [cross-cluster] Arrays Elimination
    """
    # To create temporaries
    counter = generator()
    template = lambda: "r%d" % counter()

    # Toposort+Fusion (the former to expose more fusion opportunities)
    clusters = Toposort().process(clusters)
    clusters = fuse(clusters)

    # Flop reduction via the DSE
    from devito.dse import rewrite
    clusters = rewrite(clusters, template, mode=dse_mode)

    # Lifting
    clusters = Lift().process(clusters)

    # Lifting may create fusion opportunities
    clusters = fuse(clusters)

    # Fusion may create opportunities to eliminate Arrays (thus shrinking the
    # working set) if these store identical expressions
    clusters = eliminate_arrays(clusters, template)

    # Fusion may create scalarization opportunities
    clusters = scalarize(clusters, template)

    # Determine computational properties (e.g., parallelism) that will be
    # necessary for the later passes
    clusters = analyze(clusters)

    return ClusterGroup(clusters)