Exemplo n.º 1
0
def process(func, state):
    """
    Run ``func`` over every IET stored in ``state._efuncs`` and fold the
    metadata it returns back into ``state``.

    ``func`` receives one IET and must return an ``(iet, metadata)`` pair.
    The ``metadata`` mapper may carry the entries ``dimensions``, ``includes``,
    ``efuncs`` and ``args``; each of them is optional.
    """
    # Build the Call graph starting from 'root'. Every edge is registered as
    # (callee, caller); the graph is later used to propagate signature changes
    # from an efunc to whoever calls it
    dag = DAG(nodes=['root'])
    pending = ['root']
    while pending:
        caller = pending.pop(0)
        found = FindNodes(Call).visit(state._efuncs[caller])
        for callee in filter_ordered([call.name for call in found]):
            if callee not in state._efuncs:
                # Foreign Call (e.g., an MPI call): not part of the graph
                continue
            try:
                dag.add_node(callee)
                pending.append(callee)
            except KeyError:
                # `callee` is already a node of `dag`
                pass
            dag.add_edge(callee, caller)
    assert dag.size == len(state._efuncs)

    # Apply `func` to each node of the Call graph
    for name in dag.topological_sort():
        state._efuncs[name], metadata = func(state._efuncs[name])

        # Dimensions introduced by `func`
        state._dimensions.extend(list(metadata.get('dimensions', [])))

        # #include's required by `func`, without duplicates
        state._includes.extend(list(metadata.get('includes', [])))
        state._includes = filter_ordered(state._includes)

        # ElementalFunctions introduced by `func`
        new_efuncs = OrderedDict((e.name, e) for e in metadata.get('efuncs', []))
        state._efuncs.update(new_efuncs)

        # New arguments must reach the call sites too, since the arguments
        # dropped down to the efunc have just increased
        args = as_tuple(metadata.get('args'))
        if not args:
            continue

        def extend(values):
            # Append only the missing entries: several children may ask for
            # the very same input argument
            return list(values) + [a for a in args if a not in values]

        affected = [name] + dag.all_downstreams(name)
        for n in affected:
            efunc = state._efuncs[n]
            subs = {}
            for call in FindNodes(Call).visit(efunc):
                if call.name in affected:
                    subs[call] = call._rebuild(arguments=extend(call.arguments))
            efunc = Transformer(subs).visit(efunc)
            if efunc.is_Callable:
                efunc = efunc._rebuild(parameters=extend(efunc.parameters))
            state._efuncs[n] = efunc
Exemplo n.º 2
0
def create_profile(name, iet):
    """
    Instrument the Iteration/Expression tree ``iet`` for C-level performance
    profiling by replacing every Section found in it with a TimedList.

    Return the transformed tree together with a Profiler, created from
    ``name``, to which one SectionData record per Section has been added.
    """
    sections = FindNodes(Section).visit(iet)

    profiler = Profiler(name)
    for section in sections:
        # All ExpressionBundles within `section`
        bundles = FindNodes(ExpressionBundle).visit(section)

        # Operation counts: in total, and at each section iteration
        ops = sum(b.ops for b in bundles)
        sops = sum(estimate_cost(e.expr)
                   for e in flatten(b.exprs for b in bundles))

        # Memory traffic: gather the per-bundle entries by key, merge each
        # group, then add up the extents
        by_key = {}
        for b in bundles:
            for key, value in b.traffic.items():
                by_key.setdefault(key, []).append(value)
        merged = [IntervalGroup.generate('merge', *vals) for vals in by_key.values()]
        traffic = sum(m.extent for m in merged)

        # One iteration shape per ExpressionBundle
        itershapes = [b.shape for b in bundles]

        # Grid points written within `section`
        points = 0
        for b in bundles:
            written = set()
            for e in b.exprs:
                if e.is_tensor and e.write.is_TimeFunction:
                    written.add(e.write)
            points += reduce(mul, b.shape) * len(written)

        data = SectionData(ops, sops, points, traffic, itershapes)
        profiler.add(section, data)

    # Introduce the C-level timers
    mapper = {}
    for section in sections:
        mapper[section] = TimedList(gname=name, lname=section.name,
                                    body=section.body)
    iet = Transformer(mapper).visit(iet)

    return iet, profiler
Exemplo n.º 3
0
def test_transformer_add_replace(exprs, block2, block3):
    """Check a Transformer that replaces one expression and adds a line
    in front of another one."""
    replaced = '// Replaced expression'
    added = '// Adding a simple line'

    def prepend_line(node):
        return Block(c.Line(added), node)

    mapper = {
        exprs[0]: Block(c.Line(replaced)),
        exprs[1]: prepend_line(exprs[1]),
    }
    transformer = Transformer(mapper)

    for block in (block2, block3):
        newcode = str(transformer.visit(block).ccode)
        oldcode = str(block.ccode)
        # At least one line more than before
        assert len(newcode.split('\n')) >= len(oldcode.split('\n')) + 1
        assert replaced in newcode
        assert added in newcode
        assert "a[i0] = a[i0] + b[i0] + 5.0F;" not in newcode
Exemplo n.º 4
0
def _hoist_halospots(iet):
    """
    Hoist HaloSpots from inner to outer Iterations where all data dependencies
    would be honored.

    For each function ``f`` in a HaloSpot's ``fmapper``, the Iterations
    enclosing the HaloSpot are scanned in the order returned by ``MapNodes``
    (presumably outermost first -- TODO confirm). ``f`` is moved to the first
    Iteration for which every flow dependence on ``f`` either is not caused
    by a Dimension of the Iterations from that point inwards, or has a write
    that is an increment.

    Return the transformed IET.
    """
    # Precompute scopes to save time
    scopes = {i: Scope([e.expr for e in v]) for i, v in MapNodes().visit(iet).items()}

    # Analysis
    # `hsmapper`: HaloSpot -> what is left of its HaloScheme after hoisting
    # `imapper`: Iteration -> the HaloSchemes to be placed right before it
    hsmapper = {}
    imapper = defaultdict(list)
    for iters, halo_spots in MapNodes(Iteration, HaloSpot, 'groupby').visit(iet).items():
        for hs in halo_spots:
            hsmapper[hs] = hs.halo_scheme

            for f in hs.fmapper:
                for n, i in enumerate(iters):
                    # Dimensions defined by `i` and by all Iterations after it
                    maybe_hoistable = set().union(*[i.dim._defines for i in iters[n:]])
                    d_flow = scopes[i].d_flow.project(f)

                    if all(not (dep.cause & maybe_hoistable) or dep.write.is_increment
                           for dep in d_flow):
                        # Take `f` out of `hs` and attach it to `i`; stop at
                        # the first suitable Iteration
                        hsmapper[hs] = hsmapper[hs].drop(f)
                        imapper[i].append(hs.halo_scheme.project(f))
                        break

    # Post-process analysis
    # Iterations receiving hoisted exchanges get wrapped in a new HaloSpot
    mapper = {i: HaloSpot(HaloScheme.union(hss), i._rebuild())
              for i, hss in imapper.items()}
    # HaloSpots left with a void HaloScheme are replaced by their body; the
    # others are rebuilt with the reduced HaloScheme
    mapper.update({i: i.body if hs.is_void else i._rebuild(halo_scheme=hs)
                   for i, hs in hsmapper.items()})

    # Transform the IET hoisting/dropping HaloSpots as according to the analysis
    iet = Transformer(mapper, nested=True).visit(iet)

    # Clean up: de-nest HaloSpots if necessary
    # A HaloSpot whose body is itself a HaloSpot is merged with it into a
    # single HaloSpot carrying the union of the two HaloSchemes
    mapper = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.body.is_HaloSpot:
            halo_scheme = HaloScheme.union([hs.halo_scheme, hs.body.halo_scheme])
            mapper[hs] = hs._rebuild(halo_scheme=halo_scheme, body=hs.body.body)
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
Exemplo n.º 5
0
def fold_blockable_tree(iet, blockinner=True):
    """
    Turn sequences of adjacent, nested Iterations into IterationFolds.

    If ``blockinner`` is False, the innermost Iteration is left out of the
    folding. Return the transformed Iteration/Expression tree.
    """
    mapper = {}
    for _, sequence in FindAdjacent(Iteration).visit(iet).items():
        # Within each subsequence, group consecutive Iterations by Dimension;
        # only groups of two or more Iterations are of interest
        groups = []
        for subsequence in sequence:
            for _, members in groupby(subsequence, lambda it: it.dim):
                members = list(members)
                if len(members) >= 2:
                    groups.append(members)

        for group in groups:
            # Pre-condition: they all must be perfect iterations
            if not all(IsPerfectIteration().visit(it) for it in group):
                continue

            # Retain the leading run of trees as deep as the first one
            trees = [retrieve_iteration_tree(it)[0] for it in group]
            same_depth = []
            for tree in trees:
                if len(tree) != len(trees[0]):
                    break
                same_depth.append(tree)
            trees = same_depth
            if not trees:
                continue

            # Check foldability, level by level
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(level) for level in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            if not pairwise_folds:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Perform folding: the first Iteration of each level becomes an
            # IterationFold, the remaining ones are dropped
            for level in pairwise_folds:
                root, remainder = level[0], level[1:]
                folds = []
                for it in remainder:
                    shift = tuple(y - x for x, y in zip(it.offsets, root.offsets))
                    folds.append((shift, it.nodes))
                mapper[root] = IterationFold(folds=folds, **root.args)
                for it in remainder:
                    mapper[it] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(mapper, nested=True).visit(iet)
Exemplo n.º 6
0
    def _specialize_iet(self, iet):
        """
        Transform the Iteration/Expression tree to offload the computation of
        one or more loop nests onto YASK. This involves calling the YASK compiler
        to generate YASK code. Such YASK code is then called from within the
        transformed Iteration/Expression tree.

        ``self.context`` and ``self.yk_soln`` are first reset to null
        placeholders and only replaced if exactly one offloadable tree is
        found. Offloading is best-effort: if anything goes wrong while
        building the YASK solution, a message is logged and processing
        continues. Finding more than one offloadable tree is reported
        through ``exit``.

        Return the transformed Iteration/Expression tree.
        """
        log("Specializing a Devito Operator for YASK...")

        self.context = YaskNullContext()
        self.yk_soln = YaskNullKernel()

        offloadable = find_offloadable_trees(iet)
        if len(offloadable) == 0:
            log("No offloadable trees found")
        elif len(offloadable) == 1:
            tree, bundle, grid, dtype = offloadable[0]
            self.context = contexts.fetch(grid, dtype)

            # Create a YASK compiler solution for this Operator
            yc_soln = self.context.make_yc_solution(namespace['jit-yc-soln'])

            transform = sympy2yask(self.context, yc_soln)
            try:
                for i in bundle.exprs:
                    transform(i.expr)

                funcall = make_sharedptr_funcall(namespace['code-soln-run'], ['time'],
                                                 namespace['code-soln-name'])
                funcall = Element(c.Statement(ccode(funcall)))
                iet = Transformer({tree[1]: funcall}).visit(iet)

                # Track /funcall/ as an external function call
                self.func_table[namespace['code-soln-run']] = MetaCall(None, False)

                # JIT-compile the newly-created YASK kernel
                local_grids = [i for i in transform.mapper if i.is_Array]
                self.yk_soln = self.context.make_yk_solution(namespace['jit-yk-soln'],
                                                             yc_soln, local_grids)

                # Print some useful information about the newly constructed solution
                log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                    (yc_soln.get_name(), yc_soln.get_num_grids(),
                     yc_soln.get_num_equations()))

            except Exception:
                # Best-effort offloading: ordinary failures are tolerated, but
                # KeyboardInterrupt/SystemExit must not be swallowed, hence
                # `Exception` rather than a bare `except`
                log("Unable to offload a candidate tree.")
        else:
            exit("Found more than one offloadable trees in a single Operator")

        # Some Iteration/Expression trees are not offloaded to YASK and may
        # require further processing to be executed in YASK, due to the differences
        # in storage layout employed by Devito and YASK
        iet = make_grid_accesses(iet)

        log("Specialization successfully performed!")

        return iet
Exemplo n.º 7
0
def test_nested_transformer(exprs, iters, block2):
    """When created with the kwarg ``nested=True``, a Transformer performs
    nested replacements. This test simultaneously replace an inner expression
    and an Iteration sorrounding it."""
    target_loop = block2.nodes[1]
    target_expr = target_loop.nodes[0].nodes[0]
    mapper = {target_loop: iters[3](target_loop.nodes[0]),
              target_expr: exprs[3]}
    processed = Transformer(mapper, nested=True).visit(block2)
    assert printAST(processed) == """<Iteration i::i::(0, 3, 1)::(0, 0)>
Exemplo n.º 8
0
    def instrument(self, iet):
        """
        Surround every Section in ``iet`` with a Call to ``self._api_resume``
        before it and a Call to ``self._api_pause`` after it, so that data
        collection is resumed and stopped around the Section.

        Return the transformed Iteration/Expression tree.
        """
        mapper = {}
        for section in FindNodes(Section).visit(iet):
            resume = Call(self._api_resume)
            pause = Call(self._api_pause)
            mapper[section] = List(body=[resume, section, pause])

        return Transformer(mapper).visit(iet)
Exemplo n.º 9
0
 def _make_compute(self, hs, key, msgs, callpoke):
     """
     Build the ``compute<key>`` efunc out of the body of the HaloSpot ``hs``,
     placing ``callpoke`` in front of each ExpressionBundle found in it.

     Return None if the body of ``hs`` is a Call. ``msgs`` is not used here.
     """
     if hs.body.is_Call:
         return None

     mapper = {}
     for bundle in FindNodes(ExpressionBundle).visit(hs.body):
         mapper[bundle] = List(body=[callpoke, bundle])
     iet = Transformer(mapper).visit(hs.body)

     return make_efunc('compute%d' % key, iet, hs.arguments)
Exemplo n.º 10
0
    def _make_parallel_tree(self, root, candidates):
        """
        Build the mapper that parallelizes the Iterations within ``root``.

        The returned OrderedDict has a single entry, mapping ``root`` to a
        copy of it carrying one more pragma, obtained from
        ``self._pragma_for``. If ``root`` is ParallelAtomic, the increments
        inside it are additionally wrapped with the ``atomic`` header.
        """
        parallel = self._pragma_for(root, candidates)

        target = root
        if root.is_ParallelAtomic:
            # Introduce the `omp atomic` pragmas
            subs = {}
            for expr in FindNodes(Expression).visit(root):
                if expr.is_increment:
                    subs[expr] = List(header=self.lang['atomic'], body=expr)
            target = Transformer(subs).visit(root)

        # Introduce the `omp for` pragma
        mapper = OrderedDict()
        mapper[root] = target._rebuild(pragmas=root.pragmas + (parallel,))

        return mapper
Exemplo n.º 11
0
    def _optimize_halo_updates(self, iet, state):
        """
        Drop the unnecessary halo exchanges, that is the HaloSpots in ``iet``
        flagged as Redundant.

        Return the transformed tree and an empty metadata mapper. ``state``
        is not used here.
        """
        mapper = {}
        for hs in FindNodes(HaloSpot).visit(iet):
            if hs.is_Redundant:
                mapper[hs] = None

        return Transformer(mapper, nested=True).visit(iet), {}
Exemplo n.º 12
0
    def instrument(self, iet):
        """
        Instrument the Iteration/Expression tree ``iet`` for C-level
        performance profiling: every Section within ``iet`` gets wrapped in a
        TimedList, and a SectionData record is stored in ``self._sections``
        for it.

        Return the transformed Iteration/Expression tree.
        """
        sections = FindNodes(Section).visit(iet)
        for section in sections:
            bundles = FindNodes(ExpressionBundle).visit(section)

            # Operation counts: in total, and at each section iteration
            ops = sum(b.ops for b in bundles)
            sops = sum(estimate_cost(e.expr)
                       for e in flatten(b.exprs for b in bundles))

            # Memory traffic: gather the per-bundle entries by key first
            by_key = {}
            for b in bundles:
                for key, value in b.traffic.items():
                    by_key.setdefault(key, []).append(value)
            traffic = 0
            for values in by_key.values():
                try:
                    traffic += IntervalGroup.generate('union', *values).size
                except ValueError:
                    # Over different iteration spaces
                    traffic += sum(v.size for v in values)

            # One dimension map per ExpressionBundle
            itermaps = [b.ispace.dimension_map for b in bundles]

            # Grid points written within `section`
            points = 0
            for b in bundles:
                written = set()
                for e in b.exprs:
                    if e.is_tensor and e.write.is_TimeFunction:
                        written.add(e.write)
                points += b.size * len(written)

            data = SectionData(ops, sops, points, traffic, itermaps)
            self._sections[section] = data

        # Introduce the C-level timers
        mapper = {}
        for section in sections:
            mapper[section] = TimedList(timer=self.timer, lname=section.name,
                                        body=section)

        return Transformer(mapper).visit(iet)
Exemplo n.º 13
0
def iet_insert_C_decls(iet, func_table):
    """
    Build a new Iteration/Expression tree out of ``iet`` carrying the symbol
    declarations it needs. Declarations are placed as close as possible to
    the first symbol use.

    The ``local`` entries of ``func_table`` are replaced in place with their
    transformed counterpart.

    :param iet: The input Iteration/Expression tree.
    :param func_table: A mapper from callable names to :class:`Callable`s
                       called from within ``iet``.
    """
    # Resolve function calls first: a Call to a local function is replaced
    # by the expressions found in the callee
    scopes = []
    me = MapExpressions()
    for node, enclosing in me.visit(iet).items():
        if not node.is_Call:
            scopes.append((node, enclosing))
            continue
        func = func_table[node.name]
        if func.local:
            scopes.extend(me.visit(func.root, queue=list(enclosing)).items())

    def is_not_parallel(iteration):
        return not iteration.is_Parallel

    # Determine all required declarations
    allocator = Allocator()
    mapper = OrderedDict()
    for node, enclosing in scopes:
        if node.is_scalar:
            # Inline declaration
            mapper[node] = LocalExpression(**node.args)
            continue

        written = node.write
        if written is None or written._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue

        if written._mem_stack:
            # On the stack
            site = filter_iterations(enclosing, key=is_not_parallel,
                                     stop='asap') or [iet]
            allocator.push_stack(site[-1], written)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(written)

    # Introduce declarations on the stack
    for site, decls in allocator.onstack:
        mapper[site] = tuple(Element(d) for d in decls)
    iet = NestedTransformer(mapper).visit(iet)
    for name, meta in list(func_table.items()):
        if meta.local:
            root = Transformer(mapper).visit(meta.root)
            func_table[name] = MetaCall(root, meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
Exemplo n.º 14
0
    def _parallelize_dist(self, iet):
        """
        Emit distributed-memory parallel code by replacing the HaloSpots in
        ``iet`` with MPI routines performing the halo exchanges.

        Return the transformed tree and a metadata mapper with the required
        ``includes``, the generated ``efuncs`` and the new ``args``. If
        ``self.params['mpi']`` is falsy, ``iet`` is returned untouched along
        with an empty mapper.
        """
        mpi_mode = self.params['mpi']
        if not mpi_mode:
            return iet, {}

        # To produce unique object names
        generators = {kind: generator() for kind in ('msg', 'comm', 'comp')}
        sync_heb = HaloExchangeBuilder('basic', **generators)
        user_heb = HaloExchangeBuilder(mpi_mode, **generators)

        # Overlappable HaloSpots go through the user-selected builder, all
        # other ones through the 'basic' builder
        exchanges = {}
        for hs in FindNodes(HaloSpot).visit(iet):
            if hs.is_Overlappable:
                exchanges[hs] = user_heb.make(hs)
            else:
                exchanges[hs] = sync_heb.make(hs)
        efuncs = sync_heb.efuncs + user_heb.efuncs
        objs = sync_heb.objs + user_heb.objs
        iet = Transformer(exchanges, nested=True).visit(iet)

        # Must drop the PARALLEL tag from the Iterations within which halo
        # exchanges are performed
        demoted = {}
        for tree in retrieve_iteration_tree(iet):
            for it in reversed(tree):
                if it in demoted:
                    # Already seen this subtree, skip
                    break
                if not FindNodes(Call).visit(it):
                    continue
                # `it` and all Iterations before it in `tree` lose PARALLEL
                for outer in tree[:tree.index(it) + 1]:
                    properties = set(outer.properties) - {PARALLEL}
                    demoted[outer] = outer._rebuild(properties=properties)
                break
        iet = Transformer(demoted, nested=True).visit(iet)

        return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
Exemplo n.º 15
0
    def make_parallel(self, iet):
        """
        Transform ``iet`` by decorating its parallel :class:`Iteration`s with
        suitable ``#pragma omp ...`` for thread-level parallelism.

        Return a ``(List, metadata)`` pair. If at least one Iteration was
        transformed, the List starts with a mock LocalExpression reading an
        ``NThreads`` object and ``metadata`` is ``{'input': [nt]}``; otherwise
        ``metadata`` is empty.
        """
        # Group sequences of loops that should go within the same parallel region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iteration go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            # If both this tree and the previous one are tagged, the key is
            # that of the most recent group, so the tree joins it; otherwise
            # the key is a fresh one and a new group is started
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        mapper = OrderedDict()
        for group in groups.values():
            private = []
            for root, candidates in group.items():
                mapper.update(self._make_parallel_tree(root, candidates))

                # Track the thread-private and thread-shared variables
                private.extend([
                    i for i in FindSymbols('symbolics').visit(root)
                    if i.is_Array and i._mem_stack
                ])

            # Build the parallel region
            # The `private` clause lists the names, sorted and without
            # duplicates, of the Arrays collected above
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=self.lang['par-region'](private),
                               body=rebuilt)
            # Any mapper value that is still an Iteration is now replaced:
            # Remainder Iterations are dropped, the others are mapped to the
            # parallel region just built
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region
        processed = Transformer(mapper).visit(iet)

        # Hack/workaround to the fact that the OpenMP pragmas are not true
        # IET nodes, so the `nthreads` variables won't be detected as a
        # Callable parameter unless inserted in a mock Expression
        if mapper:
            nt = NThreads()
            eq = LocalExpression(DummyEq(Symbol(name='nt', dtype=np.int32),
                                         nt))
            return List(body=[eq, processed]), {'input': [nt]}
        else:
            return List(body=processed), {}
Exemplo n.º 16
0
    def process(self, iet):
        """
        Lower the SyncSpots found in ``iet``.

        For each SyncSpot, the SyncOps are grouped by type and each group is
        handed to the matching ``self._make_*`` callback, in a fixed order.
        Initialization and finalization code produced by the callbacks is
        then added at the beginning and at the end of the body of ``iet``.

        Return the transformed tree and a metadata mapper with the generated
        ``efuncs``, the required ``includes`` and the new ``args``. If there
        are no SyncSpots, ``iet`` is returned untouched along with an empty
        mapper.
        """
        # The SyncOps are to be processed in the following order
        order = [WaitLock, WithLock, Delete, FetchWait, FetchWaitPrefetch]

        def priority(sync_type):
            return order.index(sync_type)

        callbacks = {
            WaitLock: self._make_waitlock,
            WithLock: self._make_withlock,
            FetchWait: self._make_fetchwait,
            FetchWaitPrefetch: self._make_fetchwaitprefetch,
            Delete: self._make_delete
        }

        sync_spots = FindNodes(SyncSpot).visit(iet)
        if not sync_spots:
            return iet, {}

        # Shared accumulators, handed to every callback
        Pieces = namedtuple('Pieces', 'init finalize funcs threads')
        pieces = Pieces([], [], [], [])

        subs = {}
        for spot in sync_spots:
            by_type = as_mapper(spot.sync_ops, type)
            for sync_type in sorted(by_type, key=priority):
                # Each callback starts from the outcome of the previous one
                current = subs.get(spot, spot)
                subs[spot] = callbacks[sync_type](current, by_type[sync_type],
                                                  pieces, iet)

        iet = Transformer(subs).visit(iet)

        # Add initialization and finalization code
        init = List(body=pieces.init, footer=c.Line())
        finalize = List(header=c.Line(), body=pieces.finalize)
        iet = iet._rebuild(body=(init,) + iet.body + (finalize,))

        # Only the non-integer thread sizes become new arguments
        sizes = [t.size for t in pieces.threads if not is_integer(t.size)]

        return iet, {
            'efuncs': pieces.funcs,
            'includes': ['pthread.h'],
            'args': sizes
        }
Exemplo n.º 17
0
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Turn sequences of adjacent, nested Iterations into IterationFolds.

    If ``exclude_innermost`` is True, the innermost Iteration is left out of
    the folding. Return the transformed Iteration/Expression tree.
    """
    mapper = {}
    for _, sequences in FindAdjacent(Iteration).visit(node).items():
        for adjacent in sequences:
            # Pre-condition: they all must be perfect iterations
            assert len(adjacent) > 1
            if not all(IsPerfectIteration().visit(it) for it in adjacent):
                continue

            # Retain the leading run of trees as deep as the first one
            trees = [retrieve_iteration_tree(it)[0] for it in adjacent]
            same_depth = []
            for tree in trees:
                if len(tree) != len(trees[0]):
                    break
                same_depth.append(tree)
            trees = same_depth
            if not trees:
                continue

            # Check foldability, level by level
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(level) for level in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            # NOTE(review): a single remaining level is skipped, whereas no
            # remaining level falls through to an empty loop -- confirm that
            # `== 1` (rather than `== 0`) is intended
            if len(pairwise_folds) == 1:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Perform folding: the first Iteration of each level becomes an
            # IterationFold, the remaining ones are dropped
            for level in pairwise_folds:
                root, remainder = level[0], level[1:]
                folds = []
                for it in remainder:
                    shift = tuple(y - x for x, y in zip(it.offsets, root.offsets))
                    folds.append((shift, it.nodes))
                mapper[root] = IterationFold(folds=folds, **root.args)
                for it in remainder:
                    mapper[it] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(mapper, nested=True).visit(node)
Exemplo n.º 18
0
    def _simdize(self, iet):
        """
        Strip the VECTOR property off all Vectorizable Iterations in ``iet``.

        No SIMD-ization is performed for devices; dropping the property lets
        later passes perform more aggressive transformations. Return the
        transformed tree and an empty metadata mapper.
        """
        mapper = {
            it: it._rebuild(properties=[p for p in it.properties if p is not VECTOR])
            for it in FindNodes(Iteration).visit(iet)
            if it.is_Vectorizable
        }

        return Transformer(mapper).visit(iet), {}
Exemplo n.º 19
0
    def _make_parallel_tree(self, root, candidates):
        """
        Build the mapper that parallelizes the Iterations within ``root``.

        The returned OrderedDict has a single entry, mapping ``root`` to a
        copy of it carrying one more pragma and the COLLAPSED property. If
        ``root`` is ParallelAtomic, the increments inside it are additionally
        wrapped with the ``atomic`` header.
        """
        ncollapse = self._ncollapse(root, candidates)
        parallel = self.lang['for'](ncollapse)

        pragmas = root.pragmas + (parallel,)
        properties = root.properties + (COLLAPSED(ncollapse),)

        target = root
        if root.is_ParallelAtomic:
            # Introduce the `omp atomic` pragmas
            subs = {}
            for expr in FindNodes(Expression).visit(root):
                if expr.is_Increment:
                    subs[expr] = List(header=self.lang['atomic'], body=expr)
            target = Transformer(subs).visit(root)

        # Introduce the `omp for` pragma
        mapper = OrderedDict()
        mapper[root] = target._rebuild(pragmas=pragmas, properties=properties)

        return mapper
Exemplo n.º 20
0
Arquivo: mpi.py Projeto: ofmla/devito
def mpiize(iet, **kwargs):
    """
    Emit distributed-memory parallel code by replacing the HaloSpots in
    ``iet`` with MPI routines performing the halo exchanges.

    The keyword arguments ``mode``, ``language`` and ``sregistry`` are
    mandatory. Return the transformed tree and a metadata mapper with the
    required ``includes``, the generated ``efuncs`` and the new ``args``.
    """
    mode = kwargs.pop('mode')
    language = kwargs.pop('language')
    sregistry = kwargs.pop('sregistry')

    # To produce unique object names
    generators = {kind: generator() for kind in ('msg', 'comm', 'comp')}

    sync_heb = HaloExchangeBuilder('basic', language, sregistry, **generators)
    user_heb = HaloExchangeBuilder(mode, language, sregistry, **generators)

    # OverlappableHaloSpots go through the builder for `mode`, all other
    # HaloSpots through the 'basic' builder
    exchanges = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if isinstance(hs, OverlappableHaloSpot):
            exchanges[hs] = user_heb.make(hs)
        else:
            exchanges[hs] = sync_heb.make(hs)

    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = filter_sorted(sync_heb.objs + user_heb.objs)
    iet = Transformer(exchanges, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    demoted = {}
    for tree in retrieve_iteration_tree(iet):
        for it in reversed(tree):
            if it in demoted:
                # Already seen this subtree, skip
                break
            if not FindNodes(Call).visit(it):
                continue
            # `it` and all Iterations before it in `tree` lose PARALLEL
            for outer in tree[:tree.index(it) + 1]:
                properties = set(outer.properties) - {PARALLEL}
                demoted[outer] = outer._rebuild(properties=properties)
            break
    iet = Transformer(demoted, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
Exemplo n.º 21
0
    def _dist_parallelize(self, iet):
        """
        Emit distributed-memory parallel code by adding MPI routines that
        perform the halo exchanges.

        Return the transformed tree and a metadata mapper with the required
        ``includes`` and the ``call_trees`` produced by the builder.
        """
        halo_spots = FindNodes(HaloSpot).visit(iet)

        # Build send/recv Callables and Calls
        builder = HaloExchangeBuilder(self.params['mpi'])
        call_trees, calls = builder.make(halo_spots)

        # Add in the `haloupdate` Calls
        processed = Transformer(calls, nested=True).visit(iet)

        return processed, {'includes': ['mpi.h'], 'call_trees': call_trees}
Exemplo n.º 22
0
def mpi_gpu_direct(iet, **kwargs):
    """
    Enable GPU-Direct communication across multiple GPUs by wrapping every
    IsendCall and IrecvCall in ``iet`` in a Block whose header is an
    ``omp target data use_device_ptr`` pragma on the call's first argument.

    Return the transformed tree and an empty metadata mapper.
    """
    mapper = {}
    for call in FindNodes((IsendCall, IrecvCall)).visit(iet):
        buf = call.arguments[0].name
        pragma = c.Pragma('omp target data use_device_ptr(%s)' % buf)
        mapper[call] = Block(header=pragma, body=call)

    return Transformer(mapper).visit(iet), {}
Exemplo n.º 23
0
def iet_analyze(iet):
    """
    Run the HaloSpot analysis passes over ``iet`` and return a new IET whose
    nodes carry the properties found by the analysis.
    """
    # Each pass consumes the outcome of the previous one
    analysis = mark_halospot_useless(iet)
    for analysis_pass in (mark_halospot_hoistable, mark_halospot_overlappable):
        analysis = analysis_pass(analysis)

    # Rebuild every analyzed node, appending the found properties to the
    # ones it already has
    decorated = OrderedDict()
    for node, found in list(analysis.properties.items()):
        rebuild_args = node.args
        current = as_tuple(rebuild_args.pop('properties'))
        decorated[node] = node._rebuild(properties=current + as_tuple(found),
                                        **rebuild_args)

    return Transformer(decorated, nested=True).visit(iet)
Exemplo n.º 24
0
    def instrument(self, iet, timer):
        """
        Enclose the time loop of ``iet`` within resume/pause API calls.

        The first Iteration whose dimension is a time dimension is wrapped in
        a List whose header calls ``self._api_resume`` and whose footer calls
        ``self._api_pause``. ``timer`` is not used. If ``iet`` has no time
        loop, it is returned intact.
        """
        iterations = FindNodes(Iteration).visit(iet)
        time_loop = next((i for i in iterations if i.dim.is_Time), None)
        if time_loop is None:
            # The calls to Advisor's Collection Control API are only for
            # Operators with a time loop
            return iet

        resume = c.Statement('%s()' % self._api_resume)
        pause = c.Statement('%s()' % self._api_pause)
        wrapped = List(header=resume, body=time_loop, footer=pause)

        return Transformer({time_loop: wrapped}).visit(iet)
Exemplo n.º 25
0
 def instrument(self, iet, timer):
     """
     Instrument ``iet`` for C-level performance profiling.

     The body of each Section is wrapped in a TimedList bound to ``timer``
     and labelled with the Section's name, which must be one of
     ``timer.fields``. ``iet`` is returned untouched if it has no Sections.
     """
     sections = FindNodes(Section).visit(iet)
     if not sections:
         return iet

     mapper = {}
     for section in sections:
         name = section.name
         assert name in timer.fields
         timed_body = TimedList(timer=timer, lname=name, body=section.body)
         mapper[section] = section._rebuild(body=timed_body)

     return Transformer(mapper, nested=True).visit(iet)
Exemplo n.º 26
0
    def _generate_mpi(self, iet, **kwargs):
        """
        Replace the HaloSpots in ``iet`` with Calls performing halo updates.

        Nothing happens if ``configuration['mpi']`` is False. Otherwise, the
        C-level halo update routines are recorded in ``self._func_table``,
        the required C structs are appended to ``self._globals``, ``mpi.h``
        is appended to ``self._includes``, and a new IET is returned in which
        each HaloSpot with at least one function is replaced by its halo
        exchange Calls followed by its original body. ``kwargs`` is not used.
        """
        if configuration['mpi'] is False:
            return iet

        halo_spots = FindNodes(HaloSpot).visit(iet)

        # For each MPI-distributed TensorFunction, generate all necessary
        # C-level routines to perform a halo update. A function showing up in
        # more than one HaloSpot retains the routines generated last
        routines = OrderedDict()
        for hs in halo_spots:
            for f, v in hs.fmapper.items():
                routines[f] = [
                    update_halo(f, v.loc_indices),
                    sendrecv(f, v.loc_indices),
                    copy(f, v.loc_indices),
                    copy(f, v.loc_indices, True),
                ]
        callables = flatten(routines.values())

        # Build, for each HaloSpot, the Calls performing the halo update
        halo_calls = {}
        for hs in halo_spots:
            for f, v in hs.fmapper.items():
                parameters = [f]
                parameters.extend(int(i) for i in hs.mask[f].values())
                parameters.append(f.grid.distributor._C_comm)
                parameters.append(f.grid.distributor._C_neighbours.obj)
                parameters.extend(v.loc_indices.values())
                parameters.extend(d.symbolic_size for d in f.dimensions)
                call = Call('halo_exchange_%s' % f.name, parameters)
                halo_calls.setdefault(hs, []).append(call)

        # Sorting is for deterministic code generation. However, in practice,
        # we don't expect `cstructs` to contain more than one element because
        # there should always be one grid per Operator (though we're not really
        # enforcing it)
        cstructs = set()
        for f in flatten(i.fmapper for i in halo_spots):
            cstructs.add(f.grid.distributor._C_neighbours.cdef)
        self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

        self._includes.append('mpi.h')

        self._func_table.update(
            OrderedDict((i.name, MetaCall(i, True)) for i in callables))

        # Add in the halo update calls, ahead of the original HaloSpot body
        mapper = {}
        for hs, calls in halo_calls.items():
            mapper[hs] = List(body=calls + list(hs.body))

        return Transformer(mapper, nested=True).visit(iet)
Exemplo n.º 27
0
    def __init__(self, expressions, **kwargs):
        """
        Build the Operator, then rework its body for debugging purposes.

        After the parent initialization, ``stdio.h`` is added to the includes,
        the limits of all sequential Iterations are replaced with
        ``max(offsets) + 2``, and the first non-sequential Iteration of each
        Iteration tree is wrapped in a List whose header prints a mark.
        """
        super(OperatorDebug, self).__init__(expressions, **kwargs)
        self._includes.append('stdio.h')

        # Minimize the trip count of the sequential loops
        shrunk = {}
        for iteration in set(flatten(retrieve_iteration_tree(self.body))):
            if iteration.is_Sequential:
                limits = max(iteration.offsets) + 2
                shrunk[iteration] = iteration._rebuild(limits=limits)
        self.body = Transformer(shrunk).visit(self.body)

        # Mark entry/exit points of each non-sequential Iteration tree in the body
        def is_not_sequential(iteration):
            return not iteration.is_Sequential

        entries = []
        for tree in retrieve_iteration_tree(self.body):
            candidates = filter_iterations(tree, is_not_sequential, 'any')
            if candidates:
                entries.append(candidates[0])

        marked = {}
        for n, entry in enumerate(entries):
            marked[entry] = List(header=printmark('In nest %d' % n), body=entry)
        self.body = Transformer(marked).visit(self.body)
Exemplo n.º 28
0
    def _process(self, func):
        """
        Apply ``func`` to all tracked ``IETs``, in the topological order of
        the call graph.

        ``func`` takes an IET and returns the processed IET plus a metadata
        dict. The optional metadata entries 'dimensions', 'includes', 'input'
        and 'efuncs' are used to update the tracked state.
        """
        for name in self._call_graph.topological_sort():
            self._efuncs[name], metadata = func(self._efuncs[name])

            # Track any new Dimensions and includes introduced by `func`
            self._dimensions.extend(list(metadata.get('dimensions', [])))
            self._includes.extend(list(metadata.get('includes', [])))

            # If there's a change to the `input` and the `iet` is an efunc, then
            # we must update the call sites as well, as the arguments dropped down
            # to the efunc have just increased
            new_input = as_tuple(metadata.get('input'))
            if new_input:
                def extend(items):
                    # Append only the inputs that are still missing, thus
                    # avoiding redundant updates due to multiple children
                    # wanting to add the same input argument
                    missing = [e for e in new_input if e not in items]
                    return list(items) + missing

                affected = [name] + self._call_graph.all_downstreams(name)
                for n in affected:
                    efunc = self._efuncs[n]

                    # Update the Calls targeting any of the affected efuncs
                    mapper = {}
                    for call in FindNodes(Call).visit(efunc):
                        if call.name in affected:
                            arguments = extend(call.arguments)
                            mapper[call] = call._rebuild(arguments=arguments)
                    efunc = Transformer(mapper).visit(efunc)

                    # Update the signature too, if there is one
                    if efunc.is_Callable:
                        efunc = efunc._rebuild(parameters=extend(efunc.parameters))

                    self._efuncs[n] = efunc
                self._input.extend(list(new_input))

            for efunc, targets in metadata.get('efuncs', {}).items():
                # Update the efuncs
                if efunc.is_Callable:
                    self._efuncs[efunc.name] = efunc
                # Update the call graph; with no targets, attach to 'main'
                self._call_graph.add_node(efunc.name, ignore_existing=True)
                for target in (targets or [None]):
                    self._call_graph.add_edge(efunc.name, target or 'main',
                                              force_add=True)
Exemplo n.º 29
0
def test_create_elemental_functions_simple(simple_function):
    """
    Check the C code generated when ``simple_function`` is transformed in
    'split' mode after tagging Iterations as ELEMENTAL.
    """
    # Pick the last Iteration of each Iteration tree, retag it, and then
    # additionally mark it as ELEMENTAL
    roots = [i[-1] for i in retrieve_iteration_tree(simple_function)]
    retagged = [i._rebuild(properties=tagger(0)) for i in roots]
    mapper = {
        i: j._rebuild(properties=(j.properties + (ELEMENTAL, )))
        for i, j in zip(roots, retagged)
    }
    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    # Emit the transformed nodes followed by the produced elemental functions
    block = List(body=[handle.nodes] + handle.elemental_functions)
    output = str(block.ccode)
    # Make output compiler independent
    output = [
        i for i in output.split('\n')
        if all([j not in i for j in ('#pragma', '/*')])
    ]
    # The comparison is whitespace-exact
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,i_size,j,j_size,k_size);
    }
  }
}
void f_0(const int k_start, const int k_finish,"""
         """ float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i, const int i_size, const int j, const int j_size, const int k_size)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
Exemplo n.º 30
0
def make_grid_accesses(node, yk_grid_objs):
    """
    Build a new Iteration/Expression out of ``node`` in which every
    :class:`types.Indexed` access has been turned into a YASK grid access.

    ``yk_grid_objs`` is looked up by function name to retrieve the object on
    which each grid access is performed.
    """
    def make_grid_gets(expr):
        # Replace the reads from YASK-backed functions with grid "get" calls;
        # the indices are translated recursively
        replacements = {}
        for indexed in retrieve_indexed(expr):
            if not indexed.base.function.from_YASK:
                continue
            indices = ListInitializer([INT(make_grid_gets(j))
                                       for j in indexed.indices])
            replacements[indexed] = make_sharedptr_funcall(
                namespace['code-grid-get'], [indices],
                yk_grid_objs[indexed.base.function.name])
        return expr.xreplace(replacements)

    mapper = {}
    for e in FindNodes(Expression).visit(node):
        if e.is_ForeignExpression:
            continue

        lhs, rhs = e.expr.args

        # RHS translation
        rhs = make_grid_gets(rhs)

        if not e.write.from_YASK:
            # Writing to a scalar temporary
            mapper[e] = e._rebuild(expr=e.expr.func(lhs, rhs))
            continue

        # LHS translation: an increment becomes an "add", any other write
        # becomes a "put"
        indices = ListInitializer([INT(make_grid_gets(idx))
                                   for idx in lhs.indices])
        key = 'code-grid-add' if e.is_Increment else 'code-grid-put'
        handle = make_sharedptr_funcall(namespace[key], [rhs, indices],
                                        yk_grid_objs[e.write.name])
        mapper[e] = ForeignExpression(handle, e.dtype,
                                      is_Increment=e.is_Increment)

    return Transformer(mapper).visit(node)
Exemplo n.º 31
0
    def _insert_declarations(self, nodes):
        """
        Populate the Operator's body with the necessary variable declarations.

        Returns the updated ``nodes``. The roots of the local functions in
        ``self.func_table`` are updated as well.
        """
        # Resolve function calls first: a Call to a local function is replaced
        # by the expressions found within that function
        scopes = []
        expr_mapper = MapExpressions()
        for expr, trail in expr_mapper.visit(nodes).items():
            if not expr.is_Call:
                scopes.append((expr, trail))
                continue
            callee = self.func_table[expr.name]
            if callee.local:
                found = expr_mapper.visit(callee.root, queue=list(trail))
                scopes.extend(found.items())

        def is_not_parallel(iteration):
            return not iteration.is_Parallel

        # Determine all required declarations
        allocator = Allocator()
        mapper = OrderedDict()
        for expr, trail in scopes:
            if expr.is_scalar:
                # Inline declaration
                mapper[expr] = LocalExpression(**expr.args)
                continue

            if expr.write._mem_external:
                # Nothing to do, variable passed as kernel argument
                continue

            if expr.write._mem_stack:
                # On the stack, as established by the DLE
                site = filter_iterations(trail, key=is_not_parallel, stop='asap')
                if not site:
                    site = [nodes]
                allocator.push_stack(site[-1], expr.write)
            else:
                # On the heap, as a tensor that must be globally accessible
                allocator.push_heap(expr.write)

        # Introduce declarations on the stack
        for site, objs in allocator.onstack:
            mapper[site] = tuple(Element(obj) for obj in objs)
        nodes = NestedTransformer(mapper).visit(nodes)
        for name, meta in list(self.func_table.items()):
            if not meta.local:
                continue
            root = Transformer(mapper).visit(meta.root)
            self.func_table[name] = FunMeta(root, meta.local)

        # Introduce declarations on the heap (if any)
        if allocator.onheap:
            decls, allocs, frees = zip(*allocator.onheap)
            nodes = List(header=decls + allocs, body=nodes, footer=frees)

        return nodes