Exemplo n.º 1
0
    def _nontemporal_stores(self, nodes):
        """
        Add compiler-specific pragmas and instructions to generate nontemporal
        stores (ie, non-cached stores).
        """
        pragma = self._backend_compiler_pragma('ntstores')
        fence = self._backend_compiler_pragma('storefence')
        if not pragma or not fence:
            return {}

        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            for i in tree:
                if i.is_Parallel:
                    mapper[i] = List(body=i, footer=fence)
                    break
        processed = Transformer(mapper).visit(nodes)

        mapper = {}
        for tree in retrieve_iteration_tree(processed):
            for i in tree:
                if i.is_Vectorizable:
                    mapper[i] = List(header=pragma, body=i)
        processed = Transformer(mapper).visit(processed)

        return processed, {}
Exemplo n.º 2
0
def test_cache_blocking_structure(blockinner, exp_calls, exp_iters):
    """
    Check the structure of the code generated by the 'blocking' pass: number
    of Calls, depth of the blocked nest and, with OpenMP enabled, the pragma
    placement as well as the guard against null step increments.
    """
    shape = (10, 31, 45)

    # Code structure, no OpenMP
    options = {'blockalways': True, 'blockinner': blockinner}
    _, op = _new_operator1(shape, dle=('blocking', options))
    assert len(FindNodes(Call).visit(op)) == exp_calls
    nests = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(nests) == 1
    assert len(nests[0]) == exp_iters

    # With OpenMP, the pragma must sit on the outermost Iteration
    options = {'openmp': True, 'blockalways': True, 'blockinner': blockinner}
    _, op = _new_operator1(shape, dle=('blocking', options))
    nests = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(nests) == 1
    nest = nests[0]
    assert len(nest.root.pragmas) == 1
    assert 'omp for' in nest.root.pragmas[0].value

    # Moreover, the step increment must be != 0, or some OpenMP implementations
    # (Intel's included) would segfault at scheduling time
    guards = FindNodes(Conditional).visit(op._func_table['bf0'].root)
    assert len(guards) == 1
    conds = guards[0].condition.args
    guarded = nest[:2+blockinner]
    assert len(conds) == len(guarded)
    for cond, iteration in zip(conds, guarded):
        assert cond.lhs == iteration.step
Exemplo n.º 3
0
    def test_multiple_loop_nests(self):
        """
        Run a trivial stencil S together with an "initialization loop" I, which
        comes first, and a "random loop" R, which comes last.

            * I sets ``u`` to 0;
            * S is the equation ``u[t+1,x,y,z] = u[t,x,y,z] + 1``;
            * R increments another field ``v`` by 2 along ``z``, but only
                over the planes ``[x=0, y=2]`` and ``[x=0, y=5]``.

        Only S is expected to be "offloaded" to YASK: I lives outside of the
        time loop, whereas R does not iterate over the space dimensions. The
        test checks both that S is offloaded and that the numerical output
        is correct.
        """
        grid = Grid(shape=(12, 12, 12))
        x, y, z = grid.dimensions
        t = grid.stepping_dim
        u = TimeFunction(name='yu4D', grid=grid, space_order=0)
        v = TimeFunction(name='yv4D', grid=grid, space_order=0)
        v.data[:] = 0.

        init = [Eq(u[0, x, y, z], 0), Eq(u[1, x, y, z], 0)]
        stencil = [Eq(u.forward, u + 1.)]
        planes = [Eq(v[t + 1, 0, j, z], v[t + 1, 0, j, z] + 2.) for j in (2, 5)]

        op = Operator(init + stencil + planes)
        op(yu4D=u, yv4D=v, time=0)

        # Generated code
        assert 'run_solution' in str(op)
        assert len(retrieve_iteration_tree(op)) == 3

        # Numerical output
        assert np.all(u.data[0] == 0.)
        assert np.all(u.data[1] == 1.)
        assert np.all(v.data[0] == 0.)
        for j in (2, 5):
            assert np.all(v.data[1, 0, j] == 2.)
Exemplo n.º 4
0
    def __init__(self, iet):
        """
        Store ``iet`` and precompute the data used by the analysis: the
        Iteration trees (retrieved in 'superset' mode) and one Scope per key
        of the mapper produced by ``MapNodes``.
        """
        self.iet = iet
        self.properties = OrderedDict()

        self.trees = retrieve_iteration_tree(iet, mode='superset')

        # Each Scope is built out of the ``.expr`` of the nodes mapped to a key
        scopes = OrderedDict()
        for key, mapped in MapNodes().visit(iet).items():
            scopes[key] = Scope([node.expr for node in mapped])
        self.scopes = scopes
Exemplo n.º 5
0
    def test_make_efuncs(self, exprs, nfuncs, ntimeiters, nests):
        """
        Test construction of ElementalFunctions.

        Parameters
        ----------
        exprs
            One or more strings, each ``eval``-ed below into an equation; the
            strings may refer to the local names ``t``, ``x``, ``y``, ``u``
            and ``v``.
        nfuncs
            For each expected ElementalFunction, the number of symbols found
            by ``FindSymbols()``.
        ntimeiters
            For each expected ElementalFunction, the number of time Dimensions
            appearing as free symbols.
        nests
            For each expected ElementalFunction, the names of the Dimensions
            of its loop nest, from the outermost to the innermost.
        """
        exprs = list(as_tuple(exprs))

        # The names below look unused (hence the `noqa`), but they must be
        # kept as the `eval`-ed strings refer to them
        grid = Grid(shape=(10, 10))
        t = grid.stepping_dim  # noqa
        x, y = grid.dimensions  # noqa

        u = Function(name='u', grid=grid)  # noqa
        v = TimeFunction(name='v', grid=grid)  # noqa

        # List comprehension would need explicit locals/globals mappings to eval
        for i, e in enumerate(list(exprs)):
            exprs[i] = eval(e)

        op = Operator(exprs)

        # We create one ElementalFunction for each Iteration nest over space dimensions
        efuncs = []
        for n, tree in enumerate(retrieve_iteration_tree(op)):
            # The outermost Iteration over a space dimension roots the efunc
            root = filter_iterations(tree, key=lambda i: i.dim.is_Space)[0]
            efuncs.append(make_efunc('f%d' % n, root))

        # The parametrization must provide one entry per ElementalFunction
        assert len(efuncs) == len(nfuncs) == len(ntimeiters) == len(nests)

        for efunc, nf, nt, nest in zip(efuncs, nfuncs, ntimeiters, nests):
            # Check the `efunc` parameters
            assert all(i in efunc.parameters for i in (x.symbolic_min, x.symbolic_max))
            assert all(i in efunc.parameters for i in (y.symbolic_min, y.symbolic_max))
            functions = FindSymbols().visit(efunc)
            assert len(functions) == nf
            assert all(i in efunc.parameters for i in functions)
            timeiters = [i for i in FindSymbols('free-symbols').visit(efunc)
                         if isinstance(i, Dimension) and i.is_Time]
            assert len(timeiters) == nt
            assert all(i in efunc.parameters for i in timeiters)
            # 4 accounts for the min/max bounds of `x` and `y` checked above
            assert len(efunc.parameters) == 4 + len(functions) + len(timeiters)

            # Check the loop nest structure
            trees = retrieve_iteration_tree(efunc)
            assert len(trees) == 1
            tree = trees[0]
            assert all(i.dim.name == j for i, j in zip(tree, nest))

            # A Call to the efunc must be constructible
            assert efunc.make_call()
Exemplo n.º 6
0
def fold_blockable_tree(node, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Groups of adjacent Iterations are searched in ``node``; within a group, the
    leading nests sharing the same depth are folded level by level, so that the
    first nest becomes a tree of IterationFolds carrying the bodies of the
    others, which are dropped. If ``blockinner`` is False, the innermost level
    is left out of the folding.
    """
    mapper = {}
    for _, groups in FindAdjacent(Iteration).visit(node).items():
        for adjacent in groups:
            # Pre-condition: they all must be perfect iterations
            assert len(adjacent) > 1
            if not all(IsPerfectIteration().visit(it) for it in adjacent):
                continue

            # Only retain consecutive trees having same depth
            candidates = [retrieve_iteration_tree(it)[0] for it in adjacent]
            trees = []
            for tree in candidates:
                if len(tree) != len(candidates[0]):
                    break
                trees.append(tree)
            if not trees:
                continue

            # Check foldability, one nesting level at a time
            pairwise_folds = list(zip(*reversed(trees)))
            if not all(is_foldable(level) for level in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Perform folding
            for level in pairwise_folds:
                root, remainder = level[0], level[1:]
                folds = []
                for other in remainder:
                    # Offsets are expressed relatively to those of `root`
                    shift = tuple(b - a for a, b in zip(other.offsets, root.offsets))
                    folds.append((shift, other.nodes))
                mapper[root] = IterationFold(folds=folds, **root.args)
                for other in remainder:
                    mapper[other] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(mapper, nested=True).visit(node)
Exemplo n.º 7
0
def unfold_blocked_tree(node):
    """
    Unfold nested IterationFolds.

    Examples
    --------

    Take the following section of Iteration/Expression tree: ::

        for i = 1 to N-1  // folded
          for j = 1 to N-1  // folded
            foo1()

    If there is a fold with offset 1 in both /i/ and /j/ whose body is
    ``foo2()``, the output is: ::

        for i = 1 to N-1
          for j = 1 to N-1
            foo1()
        for i = 2 to N-2
          for j = 2 to N-2
            foo2()
    """
    # Search the unfolding candidates
    candidates = []
    for tree in retrieve_iteration_tree(node):
        folded = tuple(it for it in tree if it.is_IterationFold)
        if not folded:
            continue
        # Sanity check
        assert IsPerfectIteration().visit(folded[0])
        candidates.append(folded)

    # Perform unfolding; the outermost IterationFold gets replaced by the
    # whole sequence of unfolded nests
    mapper = {}
    for folded in candidates:
        unfolded = list(zip(*[it.unfold() for it in folded]))
        outer, inner = unfolded[:-1], unfolded[-1]
        mapper[folded[0]] = List(body=optimize_unfolded_tree(outer, inner))

    # Insert the unfolded Iterations in the Iteration/Expression tree
    return Transformer(mapper).visit(node)
Exemplo n.º 8
0
    def _hoist_prodders(self, iet):
        """
        Move Prodders within the outer levels of an Iteration tree.

        A periodic Prodder is moved to the top of the body of the innermost
        Iteration over a BlockDimension or, if there is none, of the outermost
        Iteration of the tree.
        """
        def over_block(i):
            return isinstance(i.dim, BlockDimension)

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            for prodder in FindNodes(Prodder).visit(tree.root):
                if not prodder._periodic:
                    continue

                try:
                    candidate = filter_iterations(tree, over_block)[-1]
                except IndexError:
                    # Fallback: use the outermost Iteration
                    candidate = tree.root

                # A copy of the Prodder goes first in the new body, while the
                # original one is dropped from where it was
                body = (prodder._rebuild(),) + candidate.nodes
                mapper[candidate] = candidate._rebuild(nodes=body)
                mapper[prodder] = None

        iet = Transformer(mapper, nested=True).visit(iet)

        return iet, {}
Exemplo n.º 9
0
def test_cache_blocking_imperfect_nest(blockinner):
    """
    Test that a non-perfect Iteration nest is blocked correctly.

    The structure of the generated code is inspected first; then, the output
    of the blocked Operator is compared to that of a 'noop' Operator.
    """
    grid = Grid(shape=(4, 4, 4), dtype=np.float64)

    u = TimeFunction(name='u', grid=grid, space_order=2)
    v = TimeFunction(name='v', grid=grid, space_order=2)

    eqns = [Eq(u.forward, v.laplace), Eq(v.forward, u.forward.dz)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt=('advanced', {'blockinner': blockinner}))

    # First, check the generated code
    bns, _ = assert_blocking(op1, {'x0_blk0'})
    trees = retrieve_iteration_tree(bns['x0_blk0'])
    assert len(trees) == 2
    first, second = trees
    assert len(first) == len(second)
    # The two trees share the four outermost Iterations, but not the fifth
    for i, j in zip(first[:4], second[:4]):
        assert i is j
    assert first[4] is not second[4]
    for tree in (first, second):
        assert tree.root.dim.is_Incr
    assert op1.parameters[7] is first[0].step
    assert op1.parameters[10] is first[1].step

    # Then, the numerical output. Reference run
    u.data[:] = 0.2
    v.data[:] = 1.5
    op0(time_M=0)

    # Blocked run, on fresh data
    u1 = TimeFunction(name='u1', grid=grid, space_order=2)
    v1 = TimeFunction(name='v1', grid=grid, space_order=2)

    u1.data[:] = 0.2
    v1.data[:] = 1.5
    op1(u=u1, v=v1, time_M=0)

    for expected, found in [(u, u1), (v, v1)]:
        assert np.all(expected.data == found.data)
Exemplo n.º 10
0
    def test_timeparallel_reduction(self):
        """
        Check properties and pragmas of the only loop nest generated for an
        increment of ``f[0]`` by ``u + 1``.
        """
        grid = Grid(shape=(3, 3, 3))
        i = Dimension(name='i')

        f = Function(name='f', shape=(1, ), dimensions=(i, ), grid=grid)
        u = TimeFunction(name='u', grid=grid)

        op = Operator(Inc(f[0], u + 1), opt='noop')

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 1
        tree = trees[0]

        # Sequential root, all other Iterations are only relaxed-parallel
        assert tree.root.is_Sequential
        for iteration in tree[1:]:
            assert iteration.is_ParallelRelaxed
            assert not iteration.is_Parallel

        # The time loop is not in OpenMP canonical form, so it won't be parallelized
        assert not tree.root.pragmas
        pragmas = tree[1].pragmas
        assert len(pragmas) == 1
        expected = ('omp target teams distribute parallel for collapse(3)'
                    ' reduction(+:f[0])')
        assert pragmas[0].value == expected
Exemplo n.º 11
0
    def test_basic(self):
        """
        Check the OpenMP pragmas generated for a single-equation Operator: the
        one on the second Iteration of the only loop nest, and those in the
        header and footer of ``op.body[2]``.
        """
        grid = Grid(shape=(3, 3, 3))

        u = TimeFunction(name='u', grid=grid)

        op = Operator(Eq(u.forward, u + 1))

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 1

        # Parallel pragma
        expected = 'omp target teams distribute parallel for collapse(3)'
        assert trees[0][1].pragmas[0].value == expected

        # Data goes in through the header ...
        expected = ('omp target enter data map(to: u[0:u_vec->size[0]]'
                    '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
        assert op.body[2].header[0].value == expected

        # ... and comes out through the footer
        assert str(op.body[2].footer[0]) == ''
        expected = ('omp target update from(u[0:u_vec->size[0]]'
                    '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
        assert op.body[2].footer[1].contents[0].value == expected
        expected = ('omp target exit data map(release: u[0:u_vec->size[0]]'
                    '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]]) if(devicerm)')
        assert op.body[2].footer[1].contents[1].value == expected
Exemplo n.º 12
0
    def _parallelize_dist(self, iet):
        """
        Add MPI routines performing halo exchanges to emit distributed-memory
        parallel code.

        Returns the transformed tree together with a metadata dict carrying
        the ``mpi.h`` include plus the efuncs and objects created by the halo
        exchange builders.
        """
        # To produce unique object names
        generators = {key: generator() for key in ('msg', 'comm', 'comp')}
        sync_heb = HaloExchangeBuilder('basic', **generators)
        user_heb = HaloExchangeBuilder(self.params['mpi'], **generators)

        # Overlappable HaloSpots go through the user-requested builder, all
        # others through the 'basic' one
        mapper = {}
        for hs in FindNodes(HaloSpot).visit(iet):
            if hs.is_Overlappable:
                mapper[hs] = user_heb.make(hs)
            else:
                mapper[hs] = sync_heb.make(hs)
        efuncs = sync_heb.efuncs + user_heb.efuncs
        objs = sync_heb.objs + user_heb.objs
        iet = Transformer(mapper, nested=True).visit(iet)

        # Must drop the PARALLEL tag from the Iterations within which halo
        # exchanges are performed
        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            for i in reversed(tree):
                if i in mapper:
                    # Already seen this subtree, skip
                    break
                if not FindNodes(Call).visit(i):
                    continue
                # `i` is the innermost Iteration containing a Call; `i` and
                # all of its ancestors lose the tag
                for n in tree[:tree.index(i) + 1]:
                    mapper[n] = n._rebuild(properties=set(n.properties) - {PARALLEL})
                break
        iet = Transformer(mapper, nested=True).visit(iet)

        metadata = {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
        return iet, metadata
Exemplo n.º 13
0
def iet_build(clusters, dtype):
    """
    Build an Iteration/Expression tree (IET) out of an iterable of
    :class:`Cluster`s. The nodes of the returned IET carry properties
    obtained through data dependence analysis.
    """
    # Clusters -> Iteration/Expression tree
    iet = iet_make(clusters, dtype)

    # Data dependency analysis. Properties are attached directly to nodes
    iet = iet_analyze(iet)

    # Substitute derived dimensions (e.g., t -> t0, t + 1 -> t1)
    # This is postponed up to this point to ease /iet_analyze/'s life
    subs = {}
    for tree in retrieve_iteration_tree(iet):
        for uindex in flatten(i.uindices for i in tree):
            subs[uindex.expr] = LoweredDimension(name=uindex.index.name,
                                                 origin=uindex.expr)

    return SubstituteExpression(subs).visit(iet)
Exemplo n.º 14
0
    def make_parallel(self, iet):
        """
        Introduce shared-memory parallelism in ``iet``.

        Returns the transformed tree and a metadata dict whose ``'input'``
        entry holds ``self.nthreads`` if at least one parallel region was
        created, and is empty otherwise.
        """
        mapper = OrderedDict()
        for tree in retrieve_iteration_tree(iet):
            # Get the first omp-parallelizable Iteration in `tree`
            candidates = filter_iterations(tree, key=self.key, stop='asap')
            if not candidates:
                continue
            root = candidates[0]

            # Build the `omp-for` tree
            partree = self._make_parallel_tree(root, candidates)

            # The thread-private variables are the stack-allocated Arrays
            names = sorted({i.name for i in FindSymbols().visit(partree)
                            if i.is_Array and i._mem_stack})
            clause = ('private(%s)' % ','.join(names)) if names else ''

            # Build the `omp-parallel` region
            header = self.lang['par-region'](self.nthreads.name, clause)
            partree = Block(header=header, body=partree)

            # Do not enter the parallel region if the step increment might be 0; this
            # would raise a `Floating point exception (core dumped)` in some OpenMP
            # implementation. Note that using an OpenMP `if` clause won't work
            if isinstance(root.step, Symbol):
                guard = Conditional(CondEq(root.step, 0),
                                    Element(c.Statement('return')))
                partree = List(body=[guard, partree])

            mapper[root] = partree

        iet = Transformer(mapper).visit(iet)

        metadata = {'input': [self.nthreads] if mapper else []}
        return iet, metadata
Exemplo n.º 15
0
    def test_basic(self):
        """
        Check the OpenACC pragmas generated for a single-equation Operator
        built with ``platform='nvidiaX'`` and ``language='openacc'``.
        """
        grid = Grid(shape=(3, 3, 3))

        u = TimeFunction(name='u', grid=grid)

        op = Operator(Eq(u.forward, u + 1),
                      platform='nvidiaX',
                      language='openacc')

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 1

        # Parallel pragma
        expected = 'acc parallel loop collapse(3)'
        assert trees[0][1].pragmas[0].value == expected

        # Data goes in through the header ...
        expected = ('acc enter data copyin(u[0:u_vec->size[0]]'
                    '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
        assert op.body[1].header[0].value == expected

        # ... and comes out through the footer
        contents = op.body[1].footer[0].contents
        expected = ('acc exit data copyout(u[0:u_vec->size[0]]'
                    '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
        assert contents[0].value == expected
        expected = ('acc exit data delete(u[0:u_vec->size[0]]'
                    '[0:u_vec->size[1]][0:u_vec->size[2]][0:u_vec->size[3]])')
        assert op.body[1].footer[0].contents[1].value == expected
Exemplo n.º 16
0
    def test_multiple_eqns(self):
        """
        Check the OpenMP pragmas generated for two coupled equations: a single
        loop nest is expected, with one enter/exit data pragma per function.
        """
        grid = Grid(shape=(3, 3, 3))

        u = TimeFunction(name='u', grid=grid)
        v = TimeFunction(name='v', grid=grid)

        eqns = [Eq(u.forward, u + v + 1), Eq(v.forward, u + v + 4)]
        op = Operator(eqns, dle=('advanced', {'openmp': True}))

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 1

        expected = 'omp target teams distribute parallel for collapse(3)'
        assert trees[0][1].pragmas[0].value == expected

        for i, f in enumerate([u, v]):
            subs = {'n': f.name}

            # Note: the enter data pragmas start at the third header entry
            enter = ('omp target enter data map(to: %(n)s[0:%(n)s_vec->size[0]]'
                     '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
                     subs)
            assert op.body[2].header[2 + i].value == enter

            leave = ('omp target exit data map(from: %(n)s[0:%(n)s_vec->size[0]]'
                     '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])' %
                     subs)
            assert op.body[2].footer[i].value == leave
Exemplo n.º 17
0
    def test_multiple_loop_nests(self):
        """
        Run a trivial stencil S together with an "initialization loop" I, which
        comes first, and a "random loop" R, which comes last.

            * I sets ``u`` to 0;
            * S is the equation ``u[t+1,x,y,z] = u[t,x,y,z] + 1``;
            * R increments another field ``v`` by 2 along ``z``, but only
                over the planes ``[x=0, y=2]`` and ``[x=0, y=5]``.

        Only S is expected to be "offloaded" to YASK: I lives outside of the
        time loop, whereas R does not iterate over the space dimensions. The
        test checks both that S is offloaded and that the numerical output
        is correct.
        """
        grid = Grid(shape=(12, 12, 12))
        x, y, z = grid.dimensions
        t = grid.stepping_dim
        u = TimeFunction(name='yu4D', grid=grid, space_order=0)
        v = TimeFunction(name='yv4D', grid=grid, space_order=0)
        v.data[:] = 0.

        init = [Eq(u.indexed[0, x, y, z], 0), Eq(u.indexed[1, x, y, z], 0)]
        stencil = [Eq(u.forward, u + 1.)]
        planes = [Eq(v.indexed[t + 1, 0, j, z], v.indexed[t + 1, 0, j, z] + 2.)
                  for j in (2, 5)]

        op = Operator(init + stencil + planes)
        op(yu4D=u, yv4D=v, time=0)

        # Generated code
        assert 'run_solution' in str(op)
        assert len(retrieve_iteration_tree(op)) == 3

        # Numerical output
        assert np.all(u.data[0] == 0.)
        assert np.all(u.data[1] == 1.)
        assert np.all(v.data[0] == 0.)
        for j in (2, 5):
            assert np.all(v.data[1, 0, j] == 2.)
Exemplo n.º 18
0
    def test_multiple_subnests_v0(self):
        """
        Check the OpenMP pragmas of the two loop nests found within the
        ``x0_blk0`` block: the outermost Iteration is shared and carries the
        ``omp for``, while each subnest carries a nested ``omp parallel for``.
        """
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions
        t = grid.stepping_dim

        f = Function(name='f', grid=grid)
        u = TimeFunction(name='u', grid=grid, space_order=3)

        eqn = Eq(
            u.forward,
            _R(
                _R(u[t, x, y, z] + u[t, x + 1, y + 1, z + 1]) * 3. * f +
                _R(u[t, x + 2, y + 2, z + 2] + u[t, x + 3, y + 3, z + 3]) *
                3. * f) + 1.)
        options = {
            'openmp': True,
            'cire-mingain': 0,
            'par-nested': 0,
            'par-collapse-ncores': 1,
            'par-dynamic-work': 0
        }
        op = Operator(eqn, opt=('advanced', options))

        bns, _ = assert_blocking(op, {'x0_blk0'})

        trees = retrieve_iteration_tree(bns['x0_blk0'])
        assert len(trees) == 2

        # Shared outermost Iteration
        assert trees[0][0] is trees[1][0]
        assert trees[0][0].pragmas[0].value ==\
            'omp for collapse(2) schedule(dynamic,1)'

        # Nested parallelism, in both subnests
        for tree in trees:
            assert tree[2].pragmas[0].value == ('omp parallel for collapse(2) '
                                                'schedule(dynamic,1) '
                                                'num_threads(nthreads_nested)')
Exemplo n.º 19
0
def test_cache_blocking_structure_subdims():
    """
    Test that:

        * no blocking occurs with local SubDimensions;
        * blocking does occur with non-local SubDimensions.
    """
    grid = Grid(shape=(4, 4, 4))
    x, y, z = grid.dimensions
    xi, yi, zi = grid.interior.dimensions
    t = grid.stepping_dim
    xl = SubDimension.left(name='xl', parent=x, thickness=4)

    f = TimeFunction(name='f', grid=grid)

    assert xl.local

    # Local SubDimension -> no blocking expected
    op = Operator(Eq(f[t + 1, xl, y, z], f[t, xl, y, z] + 1))
    assert len(op._func_table) == 0

    # Non-local SubDimension -> blocking expected
    op = Operator(Eq(f.forward, f + 1, subdomain=grid.interior))
    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 1
    tree = trees[0]
    assert len(tree) == 5

    # The two outermost Iterations run over the blocks ...
    for iteration, parent, root in [(tree[0], xi, x), (tree[1], yi, y)]:
        assert iteration.dim.is_Incr
        assert iteration.dim.parent is parent
        assert iteration.dim.root is root

    # ... the next two run within the blocks ...
    for iteration, outer, root in [(tree[2], tree[0], x), (tree[3], tree[1], y)]:
        assert iteration.dim.is_Incr
        assert iteration.dim.parent is outer.dim
        assert iteration.dim.root is root

    # ... while the innermost one is not blocked
    assert not tree[4].dim.is_Incr
    assert tree[4].dim is zi
    assert tree[4].dim.parent is z
Exemplo n.º 20
0
    def test_multiple_subnests_v1(self):
        """
        Unlike ``test_multiple_subnests_v0``, here the ``cire-rotate=True``
        option is used, which trades some of the inner parallelism for a
        smaller working set.
        """
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions
        t = grid.stepping_dim

        f = Function(name='f', grid=grid)
        u = TimeFunction(name='u', grid=grid, space_order=3)

        eqn = Eq(u.forward, ((u[t, x, y, z] + u[t, x+1, y+1, z+1])*3*f +
                             (u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3*f + 1))
        options = {'openmp': True,
                   'cire-mincost-sops': 1,
                   'cire-rotate': True,
                   'par-nested': 0,
                   'par-collapse-ncores': 1,
                   'par-dynamic-work': 0}
        op = Operator(eqn, opt=('advanced', options))

        trees = retrieve_iteration_tree(op._func_table['bf0'].root)
        assert len(trees) == 2

        # Shared outermost Iteration
        assert trees[0][0] is trees[1][0]
        assert trees[0][0].pragmas[0].value ==\
            'omp for collapse(2) schedule(dynamic,1)'

        # First subnest: the nested pragma is on the fifth Iteration
        assert not trees[0][2].pragmas
        assert not trees[0][3].pragmas
        assert trees[0][4].pragmas[0].value == ('omp parallel for collapse(1) '
                                                'schedule(dynamic,1) '
                                                'num_threads(nthreads_nested)')

        # Second subnest: the nested pragma is on the fourth Iteration
        assert not trees[1][2].pragmas
        assert trees[1][3].pragmas[0].value == ('omp parallel for collapse(1) '
                                                'schedule(dynamic,1) '
                                                'num_threads(nthreads_nested)')
Exemplo n.º 21
0
 def test_consistency_anti_dependences(self, exprs, directions, expected,
                                       visit, ti0, ti1, ti3, tu, tv, tw):
     """
     Test that anti dependences lead to multiple loop nests, as opposed to
     a single loop nest enclosing all of the equations.
     """
     eq1, eq2, eq3 = EVAL(exprs, ti0.base, ti1.base, ti3.base, tu.base,
                          tv.base, tw.base)
     op = Operator([eq1, eq2, eq3], dse='noop', dle='noop')
     trees = retrieve_iteration_tree(op)
     iters = FindNodes(Iteration).visit(op)
     assert len(trees) == len(expected)
     assert len(iters) == len(directions)

     # Shortening 'time' to 't' makes it quicker to write out the test
     # parametrization
     shorten = {'time': 't'}

     def label(iterations):
         return "".join(shorten.get(i.dim.name, i.dim.name) for i in iterations)

     assert [label(tree) for tree in trees] == expected
     assert label(iters) == visit

     # Same goal as above: a compact way of spelling out the directions
     todirection = {'+': Forward, '-': Backward, '*': Any}
     for iteration, symbol in zip(iters, directions):
         assert iteration.direction == todirection[symbol]
Exemplo n.º 22
0
    def _simdize(self, iet):
        """
        Decorate the vectorizable Iterations in ``iet`` with the pragmas that
        enforce SIMD auto-vectorization by the backend compiler.

        Returns the transformed tree and an empty metadata dict.
        """
        ignore_deps = as_tuple(self._backend_compiler_pragma('ignore-deps'))

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            for i in tree:
                if not i.is_Vectorizable:
                    continue

                # The discrete functions accessed within `i`, if any, are
                # listed in the pragma, along with the SIMD register size
                names = [j.name for j in FindSymbols('symbolics').visit(i)
                         if j.is_DiscreteFunction]
                if names:
                    make_pragma = Ompizer.lang['simd-for-aligned']
                    simd = make_pragma(','.join(names), self.platform.simd_reg_size)
                else:
                    simd = Ompizer.lang['simd-for']

                pragmas = i.pragmas + ignore_deps + as_tuple(simd)
                mapper[i] = i._rebuild(pragmas=pragmas)

        return Transformer(mapper).visit(iet), {}
Exemplo n.º 23
0
    def test_multiple_subnests(self):
        """
        Check the OpenMP pragmas of the two loop nests found in ``bf0``: the
        outermost Iteration is shared and carries the ``omp for``, while each
        subnest carries a nested ``omp parallel for``.
        """
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions
        t = grid.stepping_dim

        f = Function(name='f', grid=grid)
        u = TimeFunction(name='u', grid=grid)

        eqn = Eq(u.forward, ((u[t, x, y, z] + u[t, x+1, y+1, z+1])*3*f +
                             (u[t, x+2, y+2, z+2] + u[t, x+3, y+3, z+3])*3*f + 1))
        op = Operator(eqn, dse='aggressive', dle=('advanced', {'openmp': True}))

        trees = retrieve_iteration_tree(op._func_table['bf0'].root)
        assert len(trees) == 2

        # Shared outermost Iteration
        assert trees[0][0] is trees[1][0]
        assert trees[0][0].pragmas[0].value ==\
            'omp for collapse(1) schedule(dynamic,1)'

        # Nested parallelism, in both subnests
        for tree in trees:
            assert tree[2].pragmas[0].value == ('omp parallel for collapse(1) '
                                                'schedule(dynamic,1) '
                                                'num_threads(nthreads_nested)')
Exemplo n.º 24
0
    def _simdize(self, iet):
        """
        Attach to each vectorizable Iteration of ``iet`` the pragmas needed to
        enforce SIMD auto-vectorization by the backend compiler.

        The first returned item is the transformed tree, the second an empty
        metadata dict.
        """
        ignore_deps = as_tuple(self._backend_compiler_pragma('ignore-deps'))

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            for iteration in filter(lambda i: i.is_Vectorizable, tree):
                symbols = FindSymbols('symbolics').visit(iteration)
                aligned = [s for s in symbols if s.is_DiscreteFunction]
                if not aligned:
                    # Plain SIMD pragma
                    simd = as_tuple(Ompizer.lang['simd-for'])
                else:
                    # SIMD pragma with the aligned-functions list
                    build = Ompizer.lang['simd-for-aligned']
                    simd = as_tuple(build(','.join([s.name for s in aligned]),
                                          self.platform.simd_reg_size))
                mapper[iteration] = iteration._rebuild(
                    pragmas=iteration.pragmas + ignore_deps + simd
                )

        processed = Transformer(mapper).visit(iet)

        return processed, {}
Exemplo n.º 25
0
    def test_multiple_subnests(self):
        """
        Check the OpenMP pragmas of the two loop nests found in ``bf0``: the
        outermost Iteration is shared and carries the ``omp for``, while each
        subnest carries a nested ``omp parallel for`` whose number of threads
        is given by ``nhyperthreads()``.
        """
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions
        t = grid.stepping_dim

        f = Function(name='f', grid=grid)
        u = TimeFunction(name='u', grid=grid)

        eqn = Eq(u.forward, (u[t, x, y, z]*u[t, x+1, y+1, z+1]*3*f +
                             u[t, x+2, y+2, z+2]*u[t, x+3, y+3, z+3]*3*f + 1))
        op = Operator(eqn, dse='aggressive', dle=('advanced', {'openmp': True}))

        trees = retrieve_iteration_tree(op._func_table['bf0'].root)
        assert len(trees) == 2

        # Shared outermost Iteration
        assert trees[0][0] is trees[1][0]
        assert trees[0][0].pragmas[0].value ==\
            'omp for collapse(1) schedule(static,1)'

        # Nested parallelism, in both subnests
        for tree in trees:
            assert tree[2].pragmas[0].value ==\
                ('omp parallel for collapse(1) schedule(static,1) num_threads(%d)'
                 % nhyperthreads())
Exemplo n.º 26
0
    def _loop_fission(self, nodes, state):
        """
        Apply loop fission to innermost :class:`Iteration` objects.

        The candidate is the last Iteration of each Iteration tree. It is
        left untouched when:

            * its tree consists of at most one Iteration;
            * its body has fewer than ``self.thresholds['max_fission']``
              expressions;
            * its body contains anything other than expressions;
            * no function appears in its expressions;
            * the functions appearing in its expressions do not all share
              the same last index.

        Otherwise its expressions go through ``promote_scalar_expressions``
        and are regrouped, via ``grouper`` and
        ``self.thresholds['min_fission']``, into new ELEMENTAL Iterations
        which replace the candidate.

        :param nodes: The Iteration/Expression tree to be transformed.
        :param state: Unused by this pass.
        :returns: The transformed tree and an empty dict.
        """

        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            if len(tree) <= 1:
                # Heuristically avoided
                continue

            candidate = tree[-1]
            expressions = [e for e in candidate.nodes if e.is_Expression]

            if len(expressions) < self.thresholds['max_fission']:
                # Heuristically avoided
                continue
            if len(expressions) != len(candidate.nodes):
                # Dangerous for correctness
                continue

            # `set().union(...)` rather than `set.union(...)`: the unbound form
            # raises a TypeError when there are no expressions at all, whereas
            # this one yields an empty set, caught by the guard right below
            functions = list(set().union(*(e.functions for e in expressions)))
            wrapped = [e.expr for e in expressions]

            if not functions or not wrapped:
                # Heuristically avoided
                continue

            # Promote temporaries from scalar to tensors
            handle = functions[0]
            dim = handle.indices[-1]
            size = handle.shape[-1]
            if any(dim != i.indices[-1] for i in functions):
                # Dangerous for correctness
                continue

            wrapped = promote_scalar_expressions(wrapped, (size, ), (dim, ),
                                                 True)

            assert len(wrapped) == len(expressions)
            rebuilt = [
                Expression(s, e.dtype) for s, e in zip(wrapped, expressions)
            ]

            # Group statements
            # TODO: Need a heuristic here to maximize reuse
            args_frozen = candidate.args_frozen
            properties = as_tuple(args_frozen['properties']) + (ELEMENTAL, )
            args_frozen['properties'] = properties
            n = self.thresholds['min_fission']
            fissioned = [
                Iteration(g, **args_frozen) for g in grouper(rebuilt, n)
            ]

            mapper[candidate] = List(body=fissioned)

        processed = Transformer(mapper).visit(nodes)

        return processed, {}
Exemplo n.º 27
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.

        For each blockable tree, the blocked nest is moved into its own
        Callable (named ``bf<k>``) and the outermost blocked Iteration is
        replaced by a List of Calls to it, one per combination of the two
        iteration ranges computed for each blocked Iteration.

        Returns the transformed tree and a dict carrying the new
        BlockDimensions, the new Callables and the BlockDimension steps.
        """
        block_innermost = bool(self.params.get('blockinner'))
        force_blocking = bool(self.params.get('blockalways'))

        # Fold first, so that blocking spans as many Iterations as possible
        iet = fold_blockable_tree(iet, block_innermost)

        replacements = {}
        callables = []
        all_block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Collect the blockable Iterations; at least two are required
            candidates = filter_iterations(tree, lambda it: it.is_Parallel)
            if not block_innermost:
                candidates = candidates[:-1]
            if len(candidates) <= 1:
                continue
            outermost = candidates[0]

            # Heuristic: unless forced, do not pollute the generated code with
            # blocked nests (longer JIT compilation, poorer readability) when
            # the tree isn't embedded in a sequential loop (e.g., time stepping)
            embedded = tree.root.is_Sequential or iet.is_Callable
            if not embedded and not force_blocking:
                continue

            # Number of trees blocked so far; makes the generated names unique
            index = len(replacements)

            over_blocks = []
            within_blocks = []
            for it in candidates:
                bdim = BlockDimension(it.dim,
                                      name="%s%d_blk" % (it.dim.name, index))
                all_block_dims.append(bdim)

                # The Iteration across blocks ...
                if it.is_Affine:
                    properties = (PARALLEL, AFFINE)
                else:
                    properties = (PARALLEL,)
                over_blocks.append(Iteration([], bdim, bdim.symbolic_max,
                                             properties=properties))

                # ... and the Iteration within a block
                within_blocks.append(it._rebuild(
                    [], limits=(bdim, bdim + bdim.step - 1, 1), offsets=(0, 0)
                ))

            # Assemble the blocked nest around the innermost body
            nest = compose_nodes(over_blocks + within_blocks +
                                 [candidates[-1].nodes])
            nest = unfold_blocked_tree(nest)

            # Move the nest into a Callable of its own
            dynamic_parameters = flatten((b.dim, b.dim.symbolic_size)
                                         for b in over_blocks)
            efunc = make_efunc("bf%d" % index, nest, dynamic_parameters)
            callables.append(efunc)

            # Two ranges per blocked Iteration: the span covered by whole
            # blocks, and what is left past it
            ranges = []
            for it, b in zip(candidates, over_blocks):
                step = b.dim.step
                last_full = it.symbolic_max - (it.symbolic_size % step)
                whole = (it.symbolic_min, last_full, step)
                rest = (last_full + 1, it.symbolic_max,
                        it.symbolic_max - last_full)
                ranges.append((whole, rest))

            # One Call per combination of ranges
            calls = []
            for combo in product(*ranges):
                arguments = {}
                for b, (lower, upper, size) in zip(over_blocks, combo):
                    arguments[b.dim] = (lower, upper)
                    arguments[b.dim.step] = (size,)
                calls.append(List(body=efunc.make_call(arguments)))

            replacements[outermost] = List(body=calls)

        iet = Transformer(replacements).visit(iet)

        return iet, {'dimensions': all_block_dims, 'efuncs': callables,
                     'args': [d.step for d in all_block_dims]}
Exemplo n.º 28
0
    def _minimize_remainders(self, iet):
        """
        Pad the Arrays written within vectorizable Iterations and stretch the
        corresponding loop bounds, so as to prevent as many compiler-generated
        remainder loops as possible.

        Only Iteration trees with exactly one vectorizable Iteration are
        considered. If the platform lookup of the SIMD items per register
        raises a KeyError, the pass stops and hands back ``iet`` untransformed.

        Returns the transformed tree and an empty dict.
        """
        # Only the innermost dimension may get padded
        innermost = -1

        replacements = {}
        for tree in retrieve_iteration_tree(iet):
            vectorizable = [it for it in tree if it.is_Vectorizable]
            if len(vectorizable) != 1:
                continue
            root = vectorizable[0]

            # How much padding would each written Array need?
            arrays = [expr.write for expr in FindNodes(Expression).visit(root)
                      if expr.write.is_Array]
            amounts = []
            for array in arrays:
                try:
                    lanes = self.platform.simd_items_per_reg(array.dtype)
                except KeyError:
                    return iet, {}
                amounts.append(lanes - array.shape[-1] % lanes)
            if len(set(amounts)) != 1:
                # Padding must be uniform -- not the case, so giving up
                continue

            # Grow the trailing padding of the innermost dimension
            extra = amounts[0]
            for array in arrays:
                current = array._padding[innermost]
                grown = (current[0], current[1] + extra)
                array.update(padding=array._padding[:innermost] + (grown,))

            # Dynamic trip count adjustment
            endpoint = root.symbolic_max
            if not endpoint.is_Symbol:
                continue
            extents = {f.symbolic_shape[-1] for f in FindSymbols().visit(root)
                       if f.is_Tensor}
            checks = [root.symbolic_max + extra < extent
                      for _ in root.uindices for extent in extents]
            guard = ' && '.join(ccode(check) for check in checks)
            padded_endpoint = endpoint.func('_%s' % endpoint.name)
            selection = '(%s) ? %s : %s' % (guard, ccode(endpoint + extra), endpoint)
            init = cgen.Initializer(cgen.Value("const int", padded_endpoint),
                                    cgen.Line(selection))

            # Swap in the padded upper bound
            new_limits = list(root.limits)
            new_limits[1] = padded_endpoint.func(padded_endpoint.name)
            chain = list(tree)
            chain[chain.index(root)] = root._rebuild(limits=new_limits)

            replacements[tree[0]] = List(header=init, body=compose_nodes(chain))

        processed = Transformer(replacements).visit(iet)

        return processed, {}
Exemplo n.º 29
0
def relax_incr_dimensions(iet, **kwargs):
    """
    Move the Iterations over IncrDimensions into ElementalFunctions, and
    replace them with ElementalCalls spanning the "main" and "remainder"
    regions that the IncrDimensions induce.

    The ``sregistry`` keyword argument is mandatory; it supplies the names
    of the generated ElementalFunctions.

    Returns the transformed tree and a dict with the new ``efuncs``.
    """
    sregistry = kwargs['sregistry']

    replaced = {}
    generated = []
    for tree in retrieve_iteration_tree(iet):
        incr_iterations = [it for it in tree if it.dim.is_Incr]
        if not incr_iterations:
            continue

        head = incr_iterations[0]
        if head in replaced:
            # Already taken care of through a previous tree
            continue

        outer, inner = split(incr_iterations,
                             lambda it: not it.dim.parent.is_Incr)

        # Two ranges per outer Iteration, computed *before* dropping offsets
        ranges = []
        for it in outer:
            last_full = it.symbolic_max - (it.symbolic_size % it.dim.step)
            main = (it.symbolic_min, last_full, it.dim.step)
            remainder = (last_full + 1, it.symbolic_max,
                         it.symbolic_max - last_full)
            ranges.append((main, remainder))

        # Drop any offsets, e.g. `x = x_m + 2 to x_M - 2` --> `x = x_m to x_M`
        relaxed = []
        for it in outer:
            parent = it.dim.root
            relaxed.append(it._rebuild(
                limits=(parent.symbolic_min, parent.symbolic_max, it.step)
            ))
        outer = relaxed

        # Build the ElementalFunction
        name = sregistry.make_name(prefix="bf")
        body = compose_nodes(outer)
        dynamic_parameters = flatten((it.symbolic_bounds, it.step) for it in outer)
        dynamic_parameters.extend([it.step for it in inner
                                   if not is_integer(it.step)])
        efunc = make_efunc(name, body, dynamic_parameters)

        generated.append(efunc)

        # Build one ElementalCall per combination of ranges
        calls = []
        for combo in product(*ranges):
            arguments = {}
            for it, (lower, upper, size) in zip(outer, combo):
                arguments[it.symbolic_min] = lower
                arguments[it.symbolic_max] = upper
                arguments[it.step] = size
                for nested in inner:
                    if nested.dim.root is not it.dim.root:
                        continue
                    if is_integer(nested.step):
                        continue
                    # Keep the nested step unless the range has its own size
                    if size is it.step:
                        value = nested.step
                    else:
                        value = size
                    arguments[nested.step] = (value,)
            calls.append(efunc.make_call(arguments))

        replaced[head] = List(body=calls)

    iet = Transformer(replaced).visit(iet)

    return iet, {'efuncs': generated}
Exemplo n.º 30
0
def test_create_efuncs_complex(complex_function):
    """
    Tag the innermost Iteration of every tree in ``complex_function`` as
    ELEMENTAL, run ``transform(..., mode='split')`` and compare the generated
    C code, stripped of pragma and comment lines, against the expected text.
    """
    # Despite the name, these are the innermost Iterations (last entry of
    # each Iteration tree)
    roots = [i[-1] for i in retrieve_iteration_tree(complex_function)]
    # Give each of them a distinct tag, derived from its position
    retagged = [j._rebuild(properties=tagger(i)) for i, j in enumerate(roots)]
    # Map each original Iteration to its tagged version, further marked as
    # ELEMENTAL
    mapper = {
        i: j._rebuild(properties=(j.properties + (ELEMENTAL, )))
        for i, j in zip(roots, retagged)
    }
    function = Transformer(mapper).visit(complex_function)
    handle = transform(function, mode='split')
    # Emit the main function followed by the generated elemental functions
    # as a single C listing
    block = List(body=[handle.nodes] + handle.efuncs)
    output = str(block.ccode)
    # Make output compiler independent, by dropping any line that contains a
    # pragma or the start of a C comment
    output = [
        i for i in output.split('\n')
        if all([j not in i for j in ('#pragma', '/*')])
    ]
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  for (int i = 0; i <= 3; i += 1)
  {
    f_0((float *)a,(float *)b,i_size,i,4,0);
    for (int j = 0; j <= 5; j += 1)
    {
      f_1((float *)a,(float *)b,(float *)c,(float *)d,i_size,j_size,k_size,i,j,7,0);
    }
    f_2((float *)a,(float *)b,i_size,i,4,0);
  }
}
void f_0(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int sf_M, const int sf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int s = sf_m; s <= sf_M; s += 1)
  {
    b[i] = a[i] + pow(b[i], 2) + 3;
  }
}
void f_1(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i_size, const int j_size, const int k_size,"""
         """ const int i, const int j, const int kf_M, const int kf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__ ((aligned (64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__ ((aligned (64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int k = kf_m; k <= kf_M; k += 1)
  {
    a[i] = a[i]*b[i]*c[i][j]*d[i][j][k];
    a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]);
  }
}
void f_2(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int qf_M, const int qf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int q = qf_m; q <= qf_M; q += 1)
  {
    a[i] = 8.0F*a[i] + 6.0F/b[i];
  }
}""")
Exemplo n.º 31
0
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.

        Each extracted sub-tree is rebuilt with free loop bounds
        (``<dim>_start``/``<dim>_finish`` scalars) and replaced, in the main
        tree, by a :class:`Call` to a Callable named ``f_<tag>``. Only the
        first Callable built for a given name is retained.

        :param nodes: The Iteration/Expression tree to be transformed.
        :param state: Unused by this pass.
        :returns: The transformed tree and a dict whose
                  ``'elemental_functions'`` entry is a view over the
                  generated Callables.
        """
        # NOTE(review): the second argument is presumably the fallback used
        # when the compiler has no 'noinline' decoration -- confirm
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        # Generated Callables, by name, in creation order
        functions = OrderedDict()
        # Sub-tree root --> Call replacing it in the main tree
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            # Everything from the first tagged Iteration inwards
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            defined_args = {}  # Map of argument values defined by loop bounds

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                # Remember the caller-side values to pass for the free bounds
                defined_args[start.name] = bounds[0]
                defined_args[finish.name] = bounds[1]

                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                # Likewise, remember the caller-side start of each of them
                defined_args.update(
                    {uf.name: j.start
                     for uf, j in zip(ufunc, i.uindices)})
                limits = [
                    Scalar(name=start.name, dtype=np.int32),
                    Scalar(name=finish.name, dtype=np.int32), 1
                ]
                # Re-express each unbounded index as the Iteration dimension
                # plus the corresponding new scalar
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))

            # Construct elemental function body, and inspect it
            # (from here on, `free` is the rebuilt sub-tree rather than a list)
            free = NestedTransformer(dict((zip(target, free)))).visit(root)

            # Insert array casts for all non-defined
            f_symbols = FindSymbols('symbolics').visit(free)
            defines = [s.name for s in FindSymbols('defines').visit(free)]
            casts = [
                ArrayCast(f) for f in f_symbols
                if f.is_Tensor and f.name not in defines
            ]
            # `free` now becomes the (casts, sub-tree) pair making up the body
            free = (List(body=casts), free)

            # Pair each parameter of the body with the value passed by the Call
            for i in derive_parameters(free):
                if i.name in defined_args:
                    # Free bound or unbounded index: pass the original value
                    args.append((defined_args[i.name], i))
                elif i.is_Dimension:
                    # Dimensions are passed as plain scalars of the same name
                    d = Scalar(name=i.name, dtype=i.dtype)
                    args.append((d, d))
                else:
                    args.append((i, i))

            # Split the pairs into call-site arguments and callee parameters
            call, params = zip(*args)
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            # (`setdefault` keeps the first Callable created under `name`)
            functions.setdefault(
                name,
                Callable(name, free, 'void', flatten(params), ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Exemplo n.º 32
0
    def _loop_blocking(self, nodes, state):
        """
        Apply loop blocking to PARALLEL Iteration trees.

        Each blockable nest is replaced by a List holding the blocked nest
        followed by the remainder nests, the latter covering every non-empty
        combination of the blocked dimensions.

        Returns the transformed tree and a dict with the BlockDimensions
        created along the way. ``state`` is unused.
        """
        skip_innermost = not self.params.get('blockinner', False)
        always_block = self.params.get('blockalways', False)

        # Fold first, so that blocking spans as many Iterations as possible
        folded = fold_blockable_tree(nodes, skip_innermost)

        replacements = {}
        block_dims = OrderedDict()
        for tree in retrieve_iteration_tree(folded):
            # Collect the blockable Iterations; at least two are required
            candidates = [it for it in tree if it.is_Parallel]
            if skip_innermost:
                candidates = [it for it in candidates if not it.is_Vectorizable]
            if len(candidates) <= 1:
                continue
            root = candidates[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not (tree.root.is_Sequential or always_block):
                # Heuristic: unless forced, do not pollute the generated code
                # with blocked nests (longer JIT compilation, poorer
                # readability) when the tree isn't embedded in a sequential
                # loop (e.g., a timestepping loop)
                continue

            # Number of nests blocked so far; makes tag and names unique
            index = len(replacements)

            # IterationProperty decorating the intra-block Iterations
            tag = tagger(index)

            # Build the individual Iterations, to be composed afterwards
            over_blocks, within_blocks, leftovers = [], [], []
            for it in candidates:
                # Iteration across blocks
                bdim = block_dims.setdefault(
                    it,
                    BlockDimension(it.dim,
                                   name="%s%d_block" % (it.dim.name, index))
                )
                inner_size = it.symbolic_size + (it.offsets[1] - it.offsets[0])
                last_full = it.dim.symbolic_max - (inner_size % bdim.step)
                over_blocks.append(Iteration([], bdim, last_full,
                                             offsets=it.offsets,
                                             properties=PARALLEL))

                # Iteration within a block
                within_blocks.append(it._rebuild(
                    [], limits=(bdim, bdim + bdim.step - 1, 1), offsets=(0, 0),
                    properties=it.properties + (tag, ELEMENTAL)
                ))

                # Unitary-increment Iteration over the 'leftover' region, used
                # by the remainder loops, which run whenever a dimension size
                # is not a multiple of the block size
                leftovers.append(it._rebuild(
                    [], limits=[last_full + 1, it.dim.symbolic_max, 1],
                    offsets=(it.offsets[1], it.offsets[1])
                ))

            body = candidates[-1].nodes

            # The blocked nest
            blocked_tree = compose_nodes(over_blocks + within_blocks + [body])

            # The remainder nests
            dims = [it.dim for it in candidates]
            remainder_properties = (REMAINDER, tag, ELEMENTAL)
            remainder_trees = []
            for size in range(1, len(candidates) + 1):
                for chosen in combinations(dims, size):
                    # First the inter-block Iterations of the other dimensions
                    pieces = [blk._rebuild(properties=blk.properties + (REMAINDER,))
                              for blk, rem in zip(over_blocks, leftovers)
                              if rem.dim not in chosen]
                    # Then, in order, either intra-block or leftover Iteration
                    for blk, rem in zip(within_blocks, leftovers):
                        picked = rem if blk.dim in chosen else blk
                        pieces.append(
                            picked._rebuild(properties=remainder_properties)
                        )
                    pieces.append(body)
                    remainder_trees.append(compose_nodes(pieces))

            # Will replace with blocked loop tree
            replacements[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(replacements).visit(folded)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        return processed, {'dimensions': list(block_dims.values())}
Exemplo n.º 33
0
    def _minimize_remainders(self, iet):
        """
        Pad the Arrays written within vectorizable Iterations and stretch the
        corresponding loop bounds, so as to prevent as many compiler-generated
        remainder loops as possible.

        Only Iteration trees with exactly one vectorizable Iteration are
        considered. If the platform lookup of the SIMD items per register
        raises a KeyError, the pass stops and hands back ``iet`` untransformed.

        Returns the transformed tree and an empty dict.
        """
        # Only the innermost dimension may get padded
        innermost = -1

        replacements = {}
        for tree in retrieve_iteration_tree(iet):
            vectorizable = [it for it in tree if it.is_Vectorizable]
            if len(vectorizable) != 1:
                continue
            root = vectorizable[0]

            # How much padding would each written Array need?
            arrays = [expr.write for expr in FindNodes(Expression).visit(root)
                      if expr.write.is_Array]
            amounts = []
            for array in arrays:
                try:
                    lanes = self.platform.simd_items_per_reg(array.dtype)
                except KeyError:
                    return iet, {}
                amounts.append(lanes - array.shape[-1] % lanes)
            if len(set(amounts)) != 1:
                # Padding must be uniform -- not the case, so giving up
                continue

            # Grow the trailing padding of the innermost dimension
            extra = amounts[0]
            for array in arrays:
                current = array._padding[innermost]
                grown = (current[0], current[1] + extra)
                array.update(padding=array._padding[:innermost] + (grown,))

            # Dynamic trip count adjustment
            endpoint = root.symbolic_max
            if not endpoint.is_Symbol:
                continue
            extents = {f.symbolic_shape[-1] for f in FindSymbols().visit(root)
                       if f.is_Tensor}
            checks = [root.symbolic_max + extra < extent
                      for _ in root.uindices for extent in extents]
            guard = ' && '.join(ccode(check) for check in checks)
            padded_endpoint = endpoint.func('_%s' % endpoint.name)
            selection = '(%s) ? %s : %s' % (guard, ccode(endpoint + extra), endpoint)
            init = cgen.Initializer(cgen.Value("const int", padded_endpoint),
                                    cgen.Line(selection))

            # Swap in the padded upper bound
            new_limits = list(root.limits)
            new_limits[1] = padded_endpoint.func(padded_endpoint.name)
            chain = list(tree)
            chain[chain.index(root)] = root._rebuild(limits=new_limits)

            replacements[tree[0]] = List(header=init, body=compose_nodes(chain))

        processed = Transformer(replacements).visit(iet)

        return processed, {}
Exemplo n.º 34
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.

        For each blockable tree, the blocked nest is moved into its own
        Callable (named ``bf<k>``) and the outermost blocked Iteration is
        replaced by a List of Calls to it, one per combination of the two
        iteration ranges computed for each blocked Iteration.

        Returns the transformed tree and a dict carrying the new
        BlockDimensions, the new Callables and the BlockDimension steps.
        """
        block_innermost = bool(self.params.get('blockinner'))
        force_blocking = bool(self.params.get('blockalways'))

        # Fold first, so that blocking spans as many Iterations as possible
        iet = fold_blockable_tree(iet, block_innermost)

        replacements = {}
        callables = []
        all_block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Collect the blockable Iterations; at least two are required
            candidates = filter_iterations(tree, lambda it: it.is_Parallel)
            if not block_innermost:
                candidates = candidates[:-1]
            if len(candidates) <= 1:
                continue
            outermost = candidates[0]

            if not force_blocking:
                # Heuristics bypassing loop blocking when `tree` doesn't look
                # computationally expensive; this helps with code
                # size/readability, JIT time, and auto-tuning time
                embedded = tree.root.is_Sequential or iet.is_Callable
                if not embedded:
                    # E.g., not inside a time-stepping Iteration
                    continue
                if any(it.dim.is_Sub and it.dim.local for it in tree):
                    # An Iteration over a local SubDimension suggests that
                    # the cost of this nest is negligible w.r.t. the "core"
                    # nest (which uses non-local (Sub)Dimensions only)
                    continue
            if not IsPerfectIteration().visit(outermost):
                # Don't know how to block non-perfect nests
                continue

            # Number of trees blocked so far; makes the generated names unique
            index = len(replacements)

            over_blocks = []
            within_blocks = []
            for it in candidates:
                bdim = BlockDimension(it.dim,
                                      name="%s%d_blk" % (it.dim.name, index))
                all_block_dims.append(bdim)

                # The Iteration across blocks ...
                if it.is_Affine:
                    properties = (PARALLEL, AFFINE)
                else:
                    properties = (PARALLEL,)
                over_blocks.append(Iteration([], bdim, bdim.symbolic_max,
                                             properties=properties))

                # ... and the Iteration within a block
                within_blocks.append(it._rebuild(
                    [], limits=(bdim, bdim + bdim.step - 1, 1), offsets=(0, 0)
                ))

            # Assemble the blocked nest around the innermost body
            nest = compose_nodes(over_blocks + within_blocks +
                                 [candidates[-1].nodes])
            nest = unfold_blocked_tree(nest)

            # Move the nest into a Callable of its own
            dynamic_parameters = flatten((b.dim, b.dim.symbolic_size)
                                         for b in over_blocks)
            efunc = make_efunc("bf%d" % index, nest, dynamic_parameters)
            callables.append(efunc)

            # Two ranges per blocked Iteration: the span covered by whole
            # blocks, and what is left past it
            ranges = []
            for it, b in zip(candidates, over_blocks):
                step = b.dim.step
                last_full = it.symbolic_max - (it.symbolic_size % step)
                whole = (it.symbolic_min, last_full, step)
                rest = (last_full + 1, it.symbolic_max,
                        it.symbolic_max - last_full)
                ranges.append((whole, rest))

            # One Call per combination of ranges
            calls = []
            for combo in product(*ranges):
                arguments = {}
                for b, (lower, upper, size) in zip(over_blocks, combo):
                    arguments[b.dim] = (lower, upper)
                    arguments[b.dim.step] = (size,)
                calls.append(List(body=efunc.make_call(arguments)))

            replacements[outermost] = List(body=calls)

        iet = Transformer(replacements).visit(iet)

        return iet, {'dimensions': all_block_dims, 'efuncs': callables,
                     'args': [d.step for d in all_block_dims]}
Exemplo n.º 35
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.

        For each blockable tree, the blocked nest is moved into its own
        Callable (named ``bf<k>``) and the outermost blocked Iteration is
        replaced by a List of Calls to it, one per combination of the two
        iteration ranges computed for each blocked Iteration.

        Returns the transformed tree and a dict carrying the new
        BlockDimensions, the new Callables and the BlockDimension steps.
        """
        block_innermost = bool(self.params.get('blockinner'))
        force_blocking = bool(self.params.get('blockalways'))

        # Fold first, so that blocking spans as many Iterations as possible
        iet = fold_blockable_tree(iet, block_innermost)

        replacements = {}
        callables = []
        all_block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Collect the blockable Iterations; at least two are required
            candidates = filter_iterations(tree, lambda it: it.is_Parallel)
            if not block_innermost:
                candidates = candidates[:-1]
            if len(candidates) <= 1:
                continue
            outermost = candidates[0]

            # Heuristic: unless forced, do not pollute the generated code with
            # blocked nests (longer JIT compilation, poorer readability) when
            # the tree isn't embedded in a sequential loop (e.g., time stepping)
            embedded = tree.root.is_Sequential or iet.is_Callable
            if not embedded and not force_blocking:
                continue

            # Number of trees blocked so far; makes the generated names unique
            index = len(replacements)

            over_blocks = []
            within_blocks = []
            for it in candidates:
                bdim = BlockDimension(it.dim,
                                      name="%s%d_blk" % (it.dim.name, index))
                all_block_dims.append(bdim)

                # The Iteration across blocks ...
                over_blocks.append(Iteration([], bdim, bdim.symbolic_max,
                                             properties=PARALLEL))

                # ... and the Iteration within a block
                within_blocks.append(it._rebuild(
                    [], limits=(bdim, bdim + bdim.step - 1, 1), offsets=(0, 0)
                ))

            # Assemble the blocked nest around the innermost body
            nest = compose_nodes(over_blocks + within_blocks +
                                 [candidates[-1].nodes])
            nest = unfold_blocked_tree(nest)

            # Move the nest into a Callable of its own
            dynamic_parameters = flatten((b.dim, b.dim.symbolic_size)
                                         for b in over_blocks)
            efunc = make_efunc("bf%d" % index, nest, dynamic_parameters)
            callables.append(efunc)

            # Two ranges per blocked Iteration: the span covered by whole
            # blocks, and what is left past it
            ranges = []
            for it, b in zip(candidates, over_blocks):
                step = b.dim.step
                last_full = it.symbolic_max - (it.symbolic_size % step)
                whole = (it.symbolic_min, last_full, step)
                rest = (last_full + 1, it.symbolic_max,
                        it.symbolic_max - last_full)
                ranges.append((whole, rest))

            # One Call per combination of ranges
            calls = []
            for combo in product(*ranges):
                arguments = {}
                for b, (lower, upper, size) in zip(over_blocks, combo):
                    arguments[b.dim] = (lower, upper)
                    arguments[b.dim.step] = (size, )
                calls.append(List(body=efunc.make_call(arguments)))

            replacements[outermost] = List(body=calls)

        iet = Transformer(replacements).visit(iet)

        return iet, {
            'dimensions': all_block_dims,
            'efuncs': callables,
            'args': [d.step for d in all_block_dims]
        }
Exemplo n.º 36
0
    def make_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.

        Every blockable nest found in ``iet`` is rebuilt as a hierarchy of
        Iterations over blocks (``self.nlevels`` levels), promoted to a separate
        callable via ``make_efunc``, and replaced in ``iet`` by a list of calls
        to that callable -- one call for each combination of "main" and
        "remainder" range of the blocked Iterations.

        Returns the transformed ``iet`` together with a dict carrying the
        created BlockDimensions (``'dimensions'``), the created callables
        (``'efuncs'``) and the step of each BlockDimension (``'args'``).

        ``self.nblocked`` is incremented once per blocked nest; it is used to
        generate unique names for the block dimensions and for the callables.
        """
        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, self.blockinner)

        mapper = {}
        efuncs = []
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            iterations = filter_iterations(tree, lambda i: i.is_Parallel and i.is_Affine)
            if not self.blockinner:
                # The last candidate Iteration is left unblocked
                iterations = iterations[:-1]
            if len(iterations) <= 1:
                # Nothing to block, or blocking a single Iteration is not attempted
                continue
            root = iterations[0]
            if not self.blockalways:
                # Heuristically bypass loop blocking if we think `tree`
                # won't be computationally expensive. This will help with code
                # size/readability, JIT time, and auto-tuning time
                if not (tree.root.is_Sequential or iet.is_Callable):
                    # E.g., not inside a time-stepping Iteration
                    continue
                if any(i.dim.is_Sub and i.dim.local for i in tree):
                    # At least an outer Iteration is over a local SubDimension,
                    # which suggests the computational cost of this Iteration
                    # nest will be negligible w.r.t. the "core" Iteration nest
                    # (making use of non-local (Sub)Dimensions only)
                    continue
            if not IsPerfectIteration().visit(root):
                # Don't know how to block non-perfect nests
                continue

            # Apply hierarchical loop blocking to `tree`
            level_0 = []  # Outermost level of blocking
            level_i = [[] for i in range(1, self.nlevels)]  # Inner levels of blocking
            intra = []  # Within the smallest block
            for i in iterations:
                # The trailing '%d' is left unformatted on purpose: it is filled
                # in below with the blocking level
                template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
                properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())

                # Build Iteration across `level_0` blocks
                d = BlockDimension(i.dim, name=template % 0)
                level_0.append(Iteration([], d, d.symbolic_max, properties=properties))

                # Build Iteration across all `level_i` blocks, `i` in [1, self.nlevels)
                for n, li in enumerate(level_i, 1):
                    di = BlockDimension(d, name=template % n)
                    li.append(Iteration([], di, limits=(d, d+d.step-1, di.step),
                                        properties=properties))
                    # The next (inner) level is built on top of this one
                    d = di

                # Build Iteration within the smallest block; `d` is now the
                # innermost BlockDimension created for `i`
                intra.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
            # From one list per level to a single list, ordered level by level
            level_i = flatten(level_i)

            # Track all constructed BlockDimensions
            block_dims.extend(i.dim for i in level_0 + level_i)

            # Construct the blocked tree
            blocked = compose_nodes(level_0 + level_i + intra + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
            dynamic_parameters.extend([li.step for li in level_i])
            efunc = make_efunc("bf%d" % self.nblocked, blocked, dynamic_parameters)
            efuncs.append(efunc)

            # Compute the iteration ranges. For each blocked Iteration there are
            # two `(min, max, step)` triplets: the "main" range, ending at `maxb`
            # and traversed with the block step, and the "remainder" range,
            # starting at `maxb + 1`, whose step is its own extent
            ranges = []
            for i, l0 in zip(iterations, level_0):
                maxb = i.symbolic_max - (i.symbolic_size % l0.step)
                ranges.append(((i.symbolic_min, maxb, l0.step),
                               (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`, one per combination of main/remainder
            # ranges across the blocked Iterations
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for l0, (m, M, b) in zip(level_0, p):
                    dynamic_args_mapper[l0.dim] = (m, M)
                    dynamic_args_mapper[l0.step] = (b,)
                    for li in level_i:
                        if li.dim.root is l0.dim.root:
                            # In the main range the inner level keeps its own
                            # step; in the remainder range it uses the
                            # remainder step `b` instead
                            value = li.step if b is l0.step else b
                            dynamic_args_mapper[li.step] = (value,)
                call = efunc.make_call(dynamic_args_mapper)
                body.append(List(body=call))

            mapper[root] = List(body=body)

            # Next blockable nest, use different (unique) variable/function names
            self.nblocked += 1

        iet = Transformer(mapper).visit(iet)

        # Force-unfold if some folded Iterations haven't been blocked in the end
        iet = unfold_blocked_tree(iet)

        return iet, {'dimensions': block_dims,
                     'efuncs': efuncs,
                     'args': [i.step for i in block_dims]}
Exemplo n.º 37
0
Arquivo: basic.py Projeto: nw0/devito
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.

        Each extracted sub-tree is rebuilt with free (symbolic) bounds, wrapped
        in a ``static void`` :class:`Callable` named ``f_<tag>``, and replaced
        in ``nodes`` by a :class:`Call` to it.

        Returns the transformed tree and a dict whose ``'elemental_functions'``
        entry holds the created Callables (one per distinct name). The
        ``state`` argument is not used in this method.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        # name -> Callable; ordered so that the output order is reproducible
        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            # The sub-tree spans from `root` down to the innermost Iteration
            target = tree[tree.index(root):]

            # Elemental function arguments, as (call-site value, parameter) pairs
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))
                # The Iteration dimension and its unbounded indices are
                # excluded from the arguments
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Add all definitely-required arguments
            not_required.update({i.output for i in expressions if i.is_scalar})
            for i in fsymbols:
                if i in not_required:
                    continue
                elif i.is_Array:
                    # Arrays are passed with an explicit pointer cast
                    args.append(
                        ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
                elif i.is_TensorFunction:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_Scalar:
                    args.append((i.name, i))

            # Add all maybe-required arguments that turn out to be required
            maybe_required.update(
                set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                # Symbols appearing in the shape of `i` may have to be passed in
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            # Sorted by name for a deterministic argument order
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype))
                         for i in required])

            # Split the pairs into call-site values and callee parameters
            call, params = zip(*args)
            handle = flatten([p.rtargs for p in params])
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable; if `name` is already known, the
            # existing Callable is kept
            functions.setdefault(
                name, Callable(name, free, 'void', handle, ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Exemplo n.º 38
0
    def _specialize_iet(self, iet, **kwargs):
        """
        Specialize ``iet`` for the OPS backend.

        For every non-constant symbol accessed (but not defined) by the affine
        Iteration trees, ``create_ops_dat`` and ``create_ops_fetch`` are used
        to produce the code to be placed respectively before and after the
        computation. Each section of affine trees is then replaced by the call
        returned by ``opsit``, and the whole is enclosed between the OPS
        initialization and finalization calls.

        As side effects, the generated kernels are appended to
        ``self._ops_kernels``, and ``self._headers`` / ``self._includes`` are
        extended. The ``kwargs`` are not used.

        Returns ``iet`` unchanged if it contains no Iteration tree; otherwise,
        a ``List`` wrapping the specialized code.

        Raises ``AssertionError`` if the affine trees do not all have the same
        number of dimensions.
        """
        warning("The OPS backend is still work-in-progress")

        # If there is no iteration tree, then there is no loop to be optimized using OPS.
        iteration_tree = retrieve_iteration_tree(iet, mode='normal')
        if not len(iteration_tree):
            return iet
        time_dimension = iteration_tree[0].dimensions[TimeFunction._time_position]
        time_upper_bound = time_dimension.extreme_max

        ops_init = Call(namespace['ops_init'], [0, 0, 2])
        ops_partition = Call(namespace['ops_partition'], Literal('""'))
        ops_exit = Call(namespace['ops_exit'])

        # Extract all symbols that need to be converted to ops_dat
        dims = []
        to_dat = set()
        for _, trees in find_affine_trees(iet).items():
            dims.append(len(trees[0].dimensions))
            symbols = set(FindSymbols('symbolics').visit(trees[0].root))
            symbols -= set(FindSymbols('defines').visit(trees[0].root))
            to_dat |= symbols

        # `dims[0]` is used below for the block, all kernels and the header, so
        # check upfront that it is representative. Note the `all(...)`: a bare
        # generator expression is always truthy and would never trip the assert
        assert all(d == dims[0] for d in dims), \
            ("The OPS backend currently assumes that all kernels "
             "have the same number of dimensions")

        # Create the OPS block for this problem
        ops_block = OpsBlock('block')
        ops_block_init = Expression(ClusterizedEq(Eq(
            ops_block,
            namespace['ops_decl_block'](
                dims[0],
                Literal('"block"')
            )
        )))

        # To ensure deterministic code generation we order the datasets to
        # be generated (since a set is an unordered collection)
        to_dat = filter_sorted(to_dat)

        name_to_ops_dat = {}
        pre_time_loop = []
        after_time_loop = []
        for f in to_dat:
            if f.is_Constant:
                continue

            pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))
            # To return the result to Devito, it is necessary to copy the data
            # from the dat object back to the CPU memory.
            after_time_loop.extend(create_ops_fetch(f, name_to_ops_dat, time_upper_bound))

        # Generate ops kernels for each offloadable iteration tree
        mapper = {}
        for n, (_, trees) in enumerate(find_affine_trees(iet).items()):
            pre_loop, ops_kernel, ops_par_loop_call = opsit(
                trees, n, name_to_ops_dat, ops_block, dims[0]
            )

            pre_time_loop.extend(pre_loop)
            self._ops_kernels.append(ops_kernel)
            mapper[trees[0].root] = ops_par_loop_call
            # Drop trees: any root without a replacement yet is mapped to None,
            # while the root of the first tree keeps the call set just above
            mapper.update({i.root: mapper.get(i.root) for i in trees})

        iet = Transformer(mapper).visit(iet)

        self._headers.append(namespace['ops_define_dimension'](dims[0]))
        self._includes.append('stdio.h')

        body = [ops_init, ops_block_init, *pre_time_loop,
                ops_partition, iet, *after_time_loop, ops_exit]

        return List(body=body)
Exemplo n.º 39
0
    def _optimize_halospots(self, iet):
        """
        Optimize the HaloSpots in ``iet``.

        The optimization proceeds in four steps:

        * the ``useless`` part of each HaloSpot is dropped;
        * within each Iteration tree, the ``hoistable`` part of the HaloSpots
          is merged into the first HaloSpot of the tree, thus removing
          redundant communications and anticipating communications that will
          be required by later Iterations;
        * hoistable HaloSpots are lifted above the Iteration they are nested
          in, when that Iteration is along one of their dimensions;
        * fully-affine, HaloSpot-free Iteration nests adjacent to a HaloSpot
          are moved within the HaloSpot body.

        Returns the transformed ``iet`` and an empty dict.
        """

        def without_hoistable(spot):
            # What replaces `spot` once its hoistable part has been taken away:
            # just its body if nothing is left to exchange, a slimmer HaloSpot
            # otherwise
            residual = spot.halo_scheme.drop(spot.hoistable)
            if residual.is_void:
                return spot.body
            return spot._rebuild(halo_scheme=residual)

        # Step 1 -- drop `useless` HaloSpots
        pruned = {}
        for spot in FindNodes(HaloSpot).visit(iet):
            pruned[spot] = spot._rebuild(
                halo_scheme=spot.halo_scheme.drop(spot.useless))
        iet = Transformer(pruned, nested=True).visit(iet)

        # Step 2 -- handle `hoistable` HaloSpots: merge them together, to
        # anticipate communications
        merged = {}
        for tree in retrieve_iteration_tree(iet):
            spots = FindNodes(HaloSpot).visit(tree.root)
            if not spots:
                continue
            head, followers = spots[0], spots[1:]
            if head in merged:
                continue
            schemes = [head.halo_scheme]
            schemes.extend([s.halo_scheme.project(s.hoistable) for s in followers])
            try:
                merged[head] = head._rebuild(halo_scheme=HaloScheme.union(schemes))
            except ValueError:
                # HaloSpots have non-matching `loc_indices` and therefore can't be merged
                warning("Found hoistable HaloSpots with disjoint loc_indices, "
                        "skipping optimization")
                continue
            for s in followers:
                merged[s] = without_hoistable(s)
        iet = Transformer(merged, nested=True).visit(iet)

        # Step 3 -- make sure the halo exchanges get performed *before*
        # the first distributed Dimension. Again, we do this to anticipate
        # communications, which hopefully has a pay off in performance
        #
        # <Iteration x>                    <HaloSpot(u)>, in y
        #   <HaloSpot(u)>, in y    ---->   <Iteration x>
        #   <Iteration y>                    <Iteration y>
        lifted = {}
        for i, spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
            candidates = [s for s in spots if s.hoistable]
            if not candidates:
                continue
            if len(candidates) > 1:
                # We should never end up here, but for now we can't prove it formally
                warning(
                    "Found multiple hoistable HaloSpots, skipping optimization"
                )
                continue
            hs = candidates[0]
            if hs in lifted:
                continue
            if i.dim.root not in hs.dimensions:
                continue
            # The HaloSpot loses its hoistable part ...
            lifted[hs] = without_hoistable(hs)
            # ... which is re-created around the enclosing Iteration
            lifted[i] = hs._rebuild(
                halo_scheme=hs.halo_scheme.project(hs.hoistable),
                body=i._rebuild())
        iet = Transformer(lifted, nested=True).visit(iet)

        # Step 4 -- try to move HaloSpot-free Iteration nests within HaloSpot
        # subtrees, to overlap as much computation as possible. The HaloSpot-free
        # Iteration nests must be fully affine, otherwise we wouldn't be able to
        # honour the data dependences along the halo
        #
        # <HaloSpot(u,v)>            <HaloSpot(u,v)>
        #   <A>             ---->      <A>
        # <B>              affine?     <B>
        #
        # Here, <B> doesn't require any halo exchange, but it might still need the
        # output of <A>; thus, if we do computation/communication overlap over <A>
        # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
        # will have to be split as well. For this, <B> must be affine.
        embedded = {}
        for groups in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
            for group in groups:
                anchor = None  # The HaloSpot currently accepting new nodes
                for node in group:
                    if node.is_HaloSpot:
                        anchor = node
                        embedded[anchor] = [anchor.body]
                    elif anchor and all(j.is_Affine
                                        for j in FindNodes(Iteration).visit(node)):
                        embedded[anchor].append(node)
                        # `node` now lives inside `anchor`, drop the original
                        embedded[node] = None
                    else:
                        anchor = None

        replacements = {}
        for k, v in embedded.items():
            if v:
                replacements[k] = k._rebuild(body=List(body=v))
            else:
                replacements[k] = v
        iet = Transformer(replacements).visit(iet)

        return iet, {}
Exemplo n.º 40
0
    def _loop_blocking(self, nodes, state):
        """
        Apply loop blocking to :class:`Iteration` trees.

        Blocking is applied to parallel iteration trees. Heuristically, innermost
        dimensions are not blocked to maximize the trip count of the SIMD loops.

        Different heuristics may be specified by passing the keywords ``blockshape``
        and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate
        a specific block size for each blocked dimension. For example, for the
        :class:`Iteration` tree: ::

            for i
              for j
                for k
                  ...

        one may provide ``blockshape = {i: 4, j: 7}``, in which case the
        two outer loops will be blocked, and the resulting 2-dimensional block
        will have size 4x7. The latter may be set to True to also block innermost
        parallel :class:`Iteration` objects.

        The keyword ``blockalways`` may be set to True to block also the trees
        that are not rooted in a sequential :class:`Iteration`.

        Returns the transformed tree and a dict. The dict is empty if nothing
        was blocked; otherwise it carries one ``BlockingArg`` per blocked
        :class:`Iteration` (``'arguments'``) and the flag ``'blocking'``
        (``'flags'``). The ``state`` argument is not used in this method.
        """
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(nodes, exclude_innermost)

        mapper = {}
        # Blocked Iteration -> Dimension used to iterate over its blocks
        blocked = OrderedDict()
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree[0].is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                name = "%s%d_block" % (i.dim.name, len(mapper))

                # Build Iteration over blocks
                dim = blocked.setdefault(i, Dimension(name=name))
                bsize = dim.symbolic_size
                bstart = i.limits[0]
                binnersize = i.dim.symbolic_extent + (i.offsets[1] -
                                                      i.offsets[0])
                # Last point covered by whole blocks; what follows is left to
                # the remainder loop
                bfinish = i.dim.symbolic_end - (binnersize % bsize) - 1
                inter_block = Iteration([],
                                        dim, [bstart, bfinish, bsize],
                                        offsets=i.offsets,
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                limits = (dim, dim + bsize - 1, 1)
                intra_block = i._rebuild([],
                                         limits=limits,
                                         offsets=(0, 0),
                                         properties=i.properties +
                                         (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover' region.
                # This will be used for remainder loops, executed when any
                # dimension size is not a multiple of the block size.
                remainder = i._rebuild(
                    [],
                    limits=[bfinish + 1, i.dim.symbolic_end, 1],
                    offsets=(i.offsets[1], i.offsets[1]))
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations, one nest for each non-empty subset
            # `c` of the blocked dimensions
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Iterations
                    # NOTE: this rebinds the name of the `nodes` parameter,
                    # which is no longer needed at this point
                    nodes = [
                        b._rebuild(properties=b.properties + (REMAINDER, ))
                        for b, r in zip(inter_blocks, remainders)
                        if r.dim not in c
                    ]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        # All blocked dimensions
        if not blocked:
            return processed, {}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            # Note: the values are the callable itself, not a block size
            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                nitems, nrequired = len(blockshape), len(blocked)
                # Entries are paired positionally; `zip` stops at the shorter
                # of the two sequences
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                # `blockshape` has no length (presumably a single integer):
                # it is applied to the first blocked Iteration only
                blockshape = {list(blocked)[0]: blockshape}
            # Blocked Iterations left without a block size are mapped to None
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return processed, {'arguments': arguments, 'flags': 'blocking'}
Exemplo n.º 41
0
    def test_multiple_loops(self):
        """
        Check the parallelization pragmas and the data-movement pragmas
        generated for an Operator made up of three loop nests.
        """
        grid = Grid(shape=(3, 3, 3))

        f = Function(name='f', grid=grid)
        g = Function(name='g', grid=grid)
        u = TimeFunction(name='u', grid=grid, space_order=2)
        v = TimeFunction(name='v', grid=grid, space_order=2)

        op = Operator([Eq(f, g * 2),
                       Eq(u.forward, u + v * f),
                       Eq(v.forward, u.forward.dx + v * f + 4)],
                      opt='noop')

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 3

        # All loop nests must have been parallelized. The pragma sits on the
        # outermost Iteration of the first nest, and one level below in the
        # other two nests
        parallel_pragma = 'omp target teams distribute parallel for collapse(3)'
        for n, depth in enumerate((0, 1, 1)):
            assert trees[n][depth].pragmas[0].value == parallel_pragma

        region = op.body[1]

        # Check `u` and `v`
        enter_template = (
            'omp target enter data map(to: %(n)s[0:%(n)s_vec->size[0]]'
            '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])'
        )
        update_template = (
            'omp target update from(%(n)s[0:%(n)s_vec->size[0]]'
            '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])'
        )
        exit_template = (
            'omp target exit data map(release: %(n)s[0:%(n)s_vec->size[0]]'
            '[0:%(n)s_vec->size[1]][0:%(n)s_vec->size[2]][0:%(n)s_vec->size[3]])'
        )
        for pos, func in enumerate([u, v], 1):
            names = {'n': func.name}
            assert region.header[pos].value == enter_template % names
            assert region.footer[pos + 1].contents[0].value == update_template % names
            assert region.footer[pos + 1].contents[1].value == exit_template % names

        # Check `f`
        assert region.header[0].value == (
            'omp target enter data map(to: f[0:f_vec->size[0]]'
            '[0:f_vec->size[1]][0:f_vec->size[2]])'
        )
        assert region.footer[1].contents[0].value == (
            'omp target update from(f[0:f_vec->size[0]]'
            '[0:f_vec->size[1]][0:f_vec->size[2]])'
        )
        assert region.footer[1].contents[1].value == (
            'omp target exit data map(release: f[0:f_vec->size[0]]'
            '[0:f_vec->size[1]][0:f_vec->size[2]])'
        )

        # Check `g` -- note that unlike `f`, this one should be `delete` upon
        # exit, not `from`
        assert region.header[3].value == (
            'omp target enter data map(to: g[0:g_vec->size[0]]'
            '[0:g_vec->size[1]][0:g_vec->size[2]])'
        )
        assert region.footer[4].value == (
            'omp target exit data map(delete: g[0:g_vec->size[0]]'
            '[0:g_vec->size[1]][0:g_vec->size[2]])'
            ' if((g_vec->size[0] != 0) && (g_vec->size[1] != 0)'
            ' && (g_vec->size[2] != 0))'
        )
Exemplo n.º 42
0
def create_profile(name, node):
    """
    Create a :class:`Profiler` for the Iteration/Expression tree ``node``.

    The Iteration trees in ``node`` are partitioned into timed sections. Two
    consecutive trees end up in the same section if they are defined over the
    same set of dimensions and if they are adjacent, as in: ::

        for x = 0 to N
          ..
        for x = 1 to N-1
          ..

    where both Iterations have dimension ``x`` and are therefore profiled as
    a single section, though their extent is different. Any other tree gets
    a section of its own. Timing takes place within the time-stepping
    Iterations, if any.

    Returns the transformed tree, in which each section is wrapped in a
    ``TimedList``, and the :class:`Profiler` tracking the sections. If
    ``node`` contains no Iteration tree, it is returned untouched.
    """
    profiler = Profiler(name)

    trees = retrieve_iteration_tree(node)
    if not trees:
        return node, profiler

    adjacents = []
    for found in FindAdjacentIterations().visit(node).values():
        if found:
            adjacents.append(flatten(found))

    def are_adjacent(tree, last):
        # Only the first pair of differing Iterations decides the outcome;
        # if there is none, the result is None (i.e., falsy)
        for i, j in zip(tree, last):
            if i == j:
                continue
            try:
                for a in adjacents:
                    if abs(a.index(j) - a.index(i)) == 1:
                        return True
                return False
            except ValueError:
                # Either Iteration is missing from a group of adjacent ones
                return False

    def region_key(itspace):
        # Trees with the same key are defined over the same dimensions
        return {i.defines for i in itspace}

    # Group Iterations based on timing region
    groups = []
    current = [trees[0]]
    for tree in trees[1:]:
        previous = current[-1]
        if region_key(tree) != region_key(previous) or \
                not are_adjacent(tree, previous):
            # `tree` opens a new timing region
            groups.append(tuple(current))
            current = [tree]
        else:
            current.append(tree)
    groups.append(tuple(current))

    # Create and track C-level timers
    timers = OrderedDict()
    for group in groups:
        # We time at the single timestep level: descend the nests in lockstep
        # and stop at the first level that is not a time Iteration
        for level in zip(*group):
            root = level[0]
            remainder = tuple(it for it in level if it is not root)
            if not root.dim.is_Time:
                break
        if root in timers:
            continue

        # Prepare to transform the Iteration/Expression tree
        body = (root, ) + remainder
        lname = 'section_%d' % len(timers)
        timers[root] = TimedList(gname=name, lname=lname, body=body)
        # The remainder now lives inside the TimedList, drop the originals
        for it in remainder:
            timers[it] = None

        # Estimate computational properties of the profiled section
        expressions = FindNodes(Expression).visit(body)
        ops = estimate_cost([e.expr for e in expressions])
        memory = estimate_memory([e.expr for e in expressions])

        # Keep track of the new profiled section
        profiler.add(lname, group[0], ops, memory)

    # Transform the Iteration/Expression tree introducing the C-level timers
    processed = Transformer(timers).visit(node)

    return processed, profiler